diff --git a/.github/workflows/integration-test-suite.yaml b/.github/workflows/integration-test-suite.yaml index 440c99596..12157da0b 100644 --- a/.github/workflows/integration-test-suite.yaml +++ b/.github/workflows/integration-test-suite.yaml @@ -1,7 +1,7 @@ ################################################################################ ### GitHub Actions curation providing quality assurance for Haskell projects ### -name: 'PHAGE Integration Test CI' +name: 'PHANE Integration Test CI' ################################################################################ ### Actions Configuration @@ -99,7 +99,6 @@ jobs: # Recipient & email information to: crowley@amnh.org cc: wheeler@amnh.org - from: PHAGE ρbit - subject: "PHAGE CI Failure: Integration Test-suite - job ${{ github.job }}" - body: "The continuous integration performed by GitHub Workflows has failed!\n\nInspect job ${{ github.job.name }} numbered ${{ github.job }} at:\n\n https://github.com/${{ github.repository }}actions.\n\n\nVigilantly,\n ~ PHAGE ρbit" - + from: PHANE ρbit + subject: "PHANE CI Failure: Integration Test-suite - job ${{ github.job }}" + body: "The continuous integration performed by GitHub Workflows has failed!\n\nInspect job ${{ github.job.name }} numbered ${{ github.job }} at:\n\n https://github.com/${{ github.repository }}actions.\n\n\nVigilantly,\n ~ PHANE ρbit" diff --git a/.gitignore b/.gitignore index 22db38c52..5d4e00185 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,419 @@ +### +# +# Manually curated ignore definitions +# +## + +.gitignore + +# Temporary files +*.*~ +*#*# +*.DS_Store +*.pdf + +# Testing artifacts +log.err +log.out +*.DS_Store + +# Outputs from testing +*.csv +*.tree +*.eps +*.ps + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: C +# +### + +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: C++ +# +### + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: Haskell +# +### + +dist +dist-* +cabal-dev +*.o +*.hi +*.hie +*.chi +*.chs.h +*.dyn_o +*.dyn_hi +.hpc +.hsenv +.cabal-sandbox/ +cabal.sandbox.config +*.prof +*.aux +*.hp +*.eventlog +.stack-work/ +cabal.project.local +cabal.project.local~ +.HTF/ +.ghc.environment.* + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: Idris +# +### + +# Idris 2 +*.ttc +*.ttm + +# Idris 1 +*.ibc +*.o + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: Python +# +### + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: R +# +### + +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + + +### +# +# GitHub .gitignore +# --- --- --- --- --- +# Template: Tex +# +### + ## Core latex/pdflatex auxiliary files: *.aux *.lof @@ -11,7 +427,6 @@ *.cb *.cb2 .*.lb -*.pdf ## Intermediate documents: *.dvi @@ -22,9 +437,6 @@ # *.eps # *.pdf -## SPINS intermediate artifacts -pan.* - ## Generated if empty string is given at "Please type another file name for output:" .pdf @@ -115,15 +527,23 @@ acs-*.bib *.glsdefs *.lzo *.lzs +*.slg +*.slo +*.sls # uncomment this for glossaries-extra (will ignore makeindex's style files!) # *.ist +# gnuplot +*.gnuplot +*.table + # gnuplottex *-gnuplottex-* # gregoriotex *.gaux +*.glog *.gtex # htlatex @@ -139,8 +559,8 @@ acs-*.bib # knitr *-concordance.tex -# TODO Comment the next line if you want to keep your tikz graphics files -*.tikz +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz *-tikzDictionary # listings @@ -170,6 +590,9 @@ _minted* # morewrites *.mw +# newpax +*.newpax + # nomencl *.nlg *.nlo @@ -189,6 +612,9 @@ _minted* # scrwfile *.wrt +# svg +svg-inkscape/ + # sympy *.sout *.sympy @@ -202,9 +628,6 @@ sympy-plots-for-*.tex/ *.pytxcode pythontex-files-*/ -# MacOS -.DS_Store - # tcolorbox *.listing @@ -216,6 +639,9 @@ pythontex-files-*/ *.md5 *.auxlock +# titletoc +*.ptc + # todonotes *.tdo @@ -282,16 +708,6 @@ TSWLatexianTemp* # Makeindex log files *.lpz +# xwatermark package +*.xwm -# Temporaty files -*.*~ -*#*# - -# Testing artifacts -log.err -log.out - -# Outputs from testing -*.csv -*.tre -*.tree diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..d9476bbe7 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,37 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: 'PhylogeneticGraph (PhyG) ' +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - given-names: Ward + family-names: Wheeler + email: wheeler@amnh.org + affiliation: American Museum of Natural History + orcid: 'https://orcid.org/0000-0001-6977-984X' +repository-code: 'https://github.com/amnh/PhyG' +url: 'https://github.com/amnh/PhyG' +repository-artifact: 'https://github.com/amnh/PhyG-Integration-Tests' +abstract: >- + Phylogenetic Graph (PhyG) is a multi-platform program + designed to produce phylogenetic graphs from input data + and graphs via heuristic searching of general phylogenetic + graph space. The bio-informatics framework libraries of + the broader Phylogenetic Haskell Analytic Network Engine + (PHANE) project are the foundation upon which PhyG is + constructed. PhyG offers vast functionality, including the + optimization of unaligned sequences, and the ability to + implement search strategies such as random addition + sequence, swapping, and tree fusing. Furthermore, PhyG can + generate outputs in the form of implied alignments and + graphical representations of cladograms and graphs. What + sets PhyG apart from other phylogenetic analysis programs, + is the extension to broader classes of input data and + phylogenetic graphs. The phylogenetic graph inputs and + outputs of PhyG include trees, as well as softwired and + hardwired networks. +license: BSD-3-Clause diff --git a/LICENSE b/LICENSE deleted file mode 120000 index 9f12e97e2..000000000 --- a/LICENSE +++ /dev/null @@ -1 +0,0 @@ -doc/LICENSE \ No newline at end of file diff --git a/PhyG.cabal b/PhyG.cabal new file mode 100644 index 000000000..02db8b3be --- /dev/null +++ b/PhyG.cabal @@ -0,0 +1,736 @@ +Cabal-Version: 3.12 +Name: PhyG +Version: 0.1.4 +Stability: Alpha +Build-Type: Simple +Tested-With: + GHC == 9.2.8 + GHC == 9.4.7 + GHC == 9.6.3 + GHC == 9.8.1 + GHC == 9.10.1 + +Author: Ward Wheeler +Copyright: © 2015 Ward Wheeler and The American Museum of Natural History +License: BSD-3-Clause +License-File: doc/LICENSE + +Maintainer: Ward Wheeler +Homepage: https://github.com/AMNH/PhyG#readme +Bug-Reports: https://github.com/AMNH/PhyG/issues + +Synopsis: Performs heuristic search of phylogenetic graph space via scoring abstract input data. +Description: + Phylogenetic Graph (PhyG) is a multi-platform program designed to produce phylogenetic graphs + from input data and graphs via heuristic searching of general phylogenetic graph space. The + bio-informatics framework libraries of the broader Phylogenetic Haskell Analytic Network Engine + (PHANE) project are the foundation upon which PhyG is constructed. PhyG offers vast functionality, + including the optimization of unaligned sequences, and the ability to implement search strategies + such as random addition sequence, swapping, and tree fusing. Furthermore, PhyG can generate + outputs in the form of implied alignments and graphical representations of cladograms and graphs. + What sets PhyG apart from other phylogenetic analysis programs, is the extension to broader + classes of input data and phylogenetic graphs. The phylogenetic graph inputs and outputs of PhyG + include trees, as well as softwired and hardwired networks. + +Data-Files: + doc/Authors.md + doc/Funding.md + doc/LICENSE + +Extra-Doc-Files: + doc/Authors.md + doc/Changelog.md + doc/Funding.md + doc/LICENSE + README.md + +Extra-Source-Files: + doc/Authors.md + doc/Funding.md + doc/LICENSE + ffi/external-direct-optimization/*.h + + +Flag Enforce-Sanity + Description: Ensure that all warnings are handled + Default: False + Manual: True + +Flag Enforce-Timestamp + Description: Disable multi-threading + Default: True + Manual: False + +Flag Forego-Sanity + Description: Disable extensive and helpful compiler warnings + Default: False + Manual: True + +Flag Single-Threaded + Description: Disable multi-threading + Default: False + Manual: True + +Flag Super-Optimization + Description: Apply extremely agressive and extremely time consuming optimization passes + Default: False + Manual: True + +Flag Use-LLVM + Description: Utilize the LLVM code generator backend + Default: False + Manual: True + +Flag Memoize-Via-ConcurrentHashtable + Description: Memoized hash-table uses the "Concurrent Hashtable" package's implementation + Default: True + Manual: True + +Flag Memoize-Via-IORef + Description: Memoized hash-table uses the "IORef" implementation + Default: False + Manual: True + +Flag Memoize-Via-ManualLock + Description: Memoized hash-table uses the "ManualLock" implementation + Default: False + Manual: True + +Flag Memoize-Via-ReadWriteLock + Description: Memoized hash-table uses the "ReadWriteLock" implementation + Default: False + Manual: True + +Flag Memoize-Via-Semaphore + Description: Memoized hash-table uses the "Semaphore" implementation + Default: False + Manual: True + +Flag Memoize-Via-TVar + Description: Memoized hash-table uses the "TVar" implementation + Default: False + Manual: True + + +-- Global deviations from Haskell98 +Common ffi-build-info + +-- If !os(darwin) +-- CC-Options: +-- -rdynamic --std=c11 +-- +-- Else +-- CC-Options: +-- --std=c11 + + CC-Options: + --std=c11 + -- this for LINUX compile + -fPIC + + -- Here we list all directories that contain C & C++ header files that the FFI + -- tools will need to locate when preprocessing the C files. Without listing + -- the directories containing the C header files here, the FFI preprocessor + -- (hsc2hs, c2hs, etc.) will fail to locate the requisite files. Note also, + -- that the parent directory of the nessicary C & C++ header files must be + -- specified. The preprocessor will not recursively look in subdirectories for + -- header files! + Include-Dirs: + ffi/external-direct-optimization + + -- Specify the header files as required source files here. + -- Do not specify them in the c-sources or cxx-sources stanzas. + -- This is required for sdist and install commands to work correctly. + Includes: + alignCharacters.h + alignmentMatrices.h + c_alignment_interface.h + c_code_alloc_setup.h + costMatrix.h + debug_constants.h + dyn_character.h + ukkCheckPoint.h + ukkCommon.h + + C-Sources: + ffi/external-direct-optimization/alignCharacters.c + ffi/external-direct-optimization/alignmentMatrices.c + ffi/external-direct-optimization/c_alignment_interface.c + ffi/external-direct-optimization/c_code_alloc_setup.c + ffi/external-direct-optimization/costMatrix.c + ffi/external-direct-optimization/dyn_character.c + ffi/external-direct-optimization/ukkCheckPoint.c + ffi/external-direct-optimization/ukkCommon.c + +-- This is required as compile option to make static binaries on Linux at least +-- need to haev llvm-(verions) tools installed to link properly +-- enable-executable-static -fpic (no idea why but lower case seems to mater on +-- commandline)--sometimes have to run twice to get link correct + +-- Global build directives +Common build-defaults + + -- If we want to enforce that the compile-time timestamp is accurate, + -- then we must ensure that we recompile the appropriate module(s). + -- This sets the CPP flag so that the modules can query if they should + -- always be recompield and then ask GHC to recompile them. + If flag(Enforce-Timestamp) + CPP-Options: + -DENFORCE_TIMESTAMP + + -- Select the memoized hash-table implmentation to be used! + If flag(Memoize-Via-ReadWriteLock) + CPP-Options: + -DMemoize_Via_ReadWriteLock + + Elif flag(Memoize-Via-IORef) + CPP-Options: + -DMemoize_Via_IORef + + Elif flag(Memoize-Via-ManualLock) + CPP-Options: + -DMemoize_Via_ManualLock + + Elif flag(Memoize-Via-Semaphore) + CPP-Options: + -DMemoize_Via_Semaphore + + Elif flag(Memoize-Via-TVar) + CPP-Options: + -DMemoize_Via_TVar + + -- The flag of the default implementation goes last to give others preference. + Elif flag(Memoize-Via-ConcurrentHashtable) + CPP-Options: + -DMemoize_Via_ConcurrentHashtable + + -- If no flags are set, meaning that the default flag was manually disabled, + -- then proceed with the default implementation anyways! + Else + CPP-Options: + -DMemoize_Via_ConcurrentHashtable + + -- We want these Language Extensions enabled in *all* modules by default. + Default-Extensions: + BangPatterns + DerivingStrategies + ImportQualifiedPost + LambdaCase + NoGeneralizedNewtypeDeriving + RoleAnnotations + UnicodeSyntax + + Default-Language: + GHC2021 + + If flag(Enforce-Sanity) + GHC-Options: + -Werror + + If !flag(Forego-Sanity) + GHC-Options: + -- Sanity check warnings + -- 1. Include all warnings by default + -Weverything + -- 2. Exclude the undesirable warnings + -Wno-all-missed-specialisations + -Wno-implicit-prelude + -Wno-inferred-safe-imports + -Wno-missing-import-lists + -Wno-missing-kind-signatures + -Wno-missing-safe-haskell-mode + -Wno-monomorphism-restriction + -Wno-redundant-bang-patterns + -Wno-type-defaults + -Wno-unsafe + -- Remove these exclusions later + -Wno-missed-specialisations + -Wno-x-partial + + If os(darwin) + Ld-Options: + -Wl,-no_fixup_chains + + +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +-- +-- Preamble of package description and reusable definitions from above has ended +-- and below we list all build targets of the package. Build targets include: +-- +-- * Primary executable(s) for public usage +-- +-- * Exposed sub-libraries for public consumption as program dependancies +-- +-- * Benchmarks for executables and sub-libraries +-- +-- * Test-suites for executables and sub-libraries +-- +-- * Additional executables for non-public, diagnostic purposes +-- +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +-- Phylogenetic Graphs +-- +-- This is the main program of this package, supporting fully featured +-- phylogenetic trees, networks, and forests. +Executable phyg + + Import: + build-defaults, + ffi-build-info, + + Main-Is: + phygraph.hs + + GHC-Options: +-- -feager-blackholing -- may be causing parallel loop errors + -O2 + -rtsopts + -Wall + -- Maybe this is required for good paralellism? + -fno-omit-yields + -- There is/are known compiler bug(s) with this optimization in GHC 9.6.2 (and earlier?). + -- Hence we explicitly disable it and hope that it removes segfaults on some architechtures. + -- -fno-polymorphic-specialisation + +-- Ghc-Prof-Options: + + If flag(Use-LLVM) + GHC-Options: + -fllvm + + -- Setup the Run-Time System (RTS) defaults + -- Note: + -- * Usage of -AL8 was rejected should be >= A + -- * usage of --nonmoving-gc causes large memory blow-ups on cluster + + -- If the the program should only use a single thread, + -- Then use the following RTS options: + If flag(Single-Threaded) + GHC-Options: + "-with-rtsopts=-A64m -AL128M -H1024m -n4m" + + -- Otherwise, enable threading + -- and additionally set the maximum number of threads as the default: + Else + GHC-Options: + -threaded + "-with-rtsopts=-N -A64m -AL128M -H1024m -n4m -qa -qm" + + -- If need for a high performance executable was specified, + -- then at compile-time GHC will allocate additional time and memory + -- to explore and apply the most agrressive optimization possible. + If flag(Super-Optimization) + GHC-Options: + -fexcess-precision + -fexpose-all-unfoldings + -flate-specialise + -fmax-simplifier-iterations=16 + -foptimal-applicative-do + -fspec-constr-count=8 + -fspec-constr-keen + -fspecialize-aggressively + -fstatic-argument-transformation + -fstg-cse + -fstrictness-before=1 + -fstrictness-before=2 + -fstrictness-before=3 + -fstrictness-before=4 + -funbox-small-strict-fields + -funbox-strict-fields + + Build-Depends: + PHANE-alphabet, + PHANE-dynamic-character, + PHANE-dynamic-character-element, + PHANE-evaluation, + PhyG:dynamic-character, + PhyG:tcm, + PHANE-PhyloLib, + base >= 4.10, + bimap, + bv, + bv-little, + bytestring, + containers >= 0.6, + cpuinfo, + deepseq, + directory, + encode-string, + fingertree, + file-embed, + filepath >= 1.4.100, + fingertree, + fgl >=5.8, + foldl, + gitrev, + graphviz >=2999.20, + hashable, + inflist, + logfloat, + MissingH >= 1.4.3, + MonadRandom >= 0.6 && < 1.0, + mmark, + modern-uri, + optparse-applicative, + parallel, + prettyprinter, + process, + random, + random-shuffle, + split, + template-haskell, + text >= 2.0.1 && < 3.0, + text-builder-linear >= 0.1.1.1, + text-short, + th-lift-instances, + time, + unbounded-delays, + unliftio-core >= 0.2 && < 1.0, + vector >= 0.13 && < 1.0, + zlib + -- currently doesn't compile but could be useful + -- lots of good functions + -- Graphalyze, + + -- include the source files needed in src for github library + -- this for local library files + HS-Source-Dirs: + app + src + + Other-Extensions: + CPP + TemplateHaskell + + Autogen-Modules: + PackageInfo_PhyG + Paths_PhyG + + Other-Modules: + CommandLineOptions + CommandLineOptions.Display + CommandLineOptions.Types + Commands.CommandExecution + Commands.CommandUtilities + Commands.ProcessCommands + Commands.Transform + Commands.Verify + Complexity.CodeStrings + Complexity.Constants + Complexity.Graphs + Complexity.Huffman + Complexity.Utilities + Debug.Debug + GraphOptimization.Medians + GraphOptimization.PostOrderSoftWiredFunctions + GraphOptimization.PostOrderSoftWiredFunctionsNew + GraphOptimization.PreOrderFunctions + GraphOptimization.Traversals + Graphs.GraphOperations + Input.BitPack + Input.DataTransformation + Input.FastAC + Input.ReadInputFiles + Input.Reorganize + Input.TNTUtilities + PackageInfo_PhyG + Paths_PhyG + Reconciliation.Adams + Reconciliation.Eun + Reconciliation.ReconcileGraphs + Search.Build + Search.DistanceMethods + Search.DistanceWagner + Search.Fuse + Search.GeneticAlgorithm + Search.NetworkAddDelete + Search.Refinement + Search.Search + Search.Swap + Search.SwapMaster + Search.WagnerBuild + Software.Credits + Software.License + Software.Metadata + Software.Metadata.Embedded + Software.Metadata.TimeStamp + Software.Preamble + Software.SplashImage + Support.Support + System.Timing + Types.DistanceTypes + Types.Types + Utilities.DistanceUtilities + Utilities.Distances + Utilities.LocalGraph + Utilities.LocalSequence + Utilities.TcmHash + Utilities.ThreeWayFunctions + Utilities.Utilities + -- GraphOptimization.PostOrderFunctions + + +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +-- +-- Collection of sub-libraries which both, compose the package's primary program +-- (PCG/PhyG), and are also exposed for external consumption by other programs. +-- +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +-- Library for performing string alignment on dynamic characters. + +-- Provides various metrics for scoring static characters and +-- performing string alignment on dynamic characters. + +Library dynamic-character + + Import: + build-defaults, + ffi-build-info + + HS-Source-Dirs: + lib/dynamic-character/src + + Build-Depends: +-- PHANE-alphabet, + PHANE-dynamic-character, + PHANE-dynamic-character-element, + PhyG:tcm, + PhyG:utility, + base >= 4.11 && < 5.0, +-- bv-little, + containers >= 0.6.2 && < 1.0, + matrices >= 0.5 && < 1.0, + monad-loops >= 0.4 && < 1.0, + primitive >= 0.7.1 && < 1.0, + vector >= 0.12.0.3 && < 1.0, + + Exposed-Modules: +-- Bio.DynamicCharacter +-- Bio.DynamicCharacter.Measure +-- Bio.DynamicCharacter.HandleGaps + DirectOptimization.Pairwise + DirectOptimization.Pairwise.Visualization + DirectOptimization.Pairwise.Swapping + DirectOptimization.Pairwise.Ukkonen + DirectOptimization.PreOrder + + Other-Modules: + DirectOptimization.Pairwise.Direction + DirectOptimization.Pairwise.Huge + DirectOptimization.Pairwise.Internal + DirectOptimization.Pairwise.Slim + DirectOptimization.Pairwise.Slim.FFI + DirectOptimization.Pairwise.Wide + + +-- Library for working with TCMs and SCMs in various representations. + +-- General purpose library for working with transition cost matrices (TCMs) +-- and symbol change matrices (SCMs). Specialization options are provided +-- for the discrete metric (non-additive) and the L1 norm (additive) TCMs & +-- SCMs. Exposes a memoized binding for sparsely indexed, large TCMs. + +Library tcm + + Import: + build-defaults, + ffi-build-info + + HS-Source-Dirs: + lib/tcm/src + + Build-Depends: +-- PHANE-dynamic-character, + PHANE-dynamic-character-element, + PhyG:utility, + binary >= 0.8 && < 1.0, + base >= 4.11 && < 5.0, + containers >= 0.6.2 && < 1.0, + concurrent-extra >= 0.7.0 && < 1.0, + concurrent-hashtable >= 0.1.8 && < 1.0, + deepseq >= 1.4 && < 2.0, + hashable >= 1.3 && < 2.0, + hashtables >= 1.2 && < 2.0, + QuickCheck >= 2.14 && < 3.0, + mono-traversable >= 1.0 && < 2.0, + semigroupoids >= 5.3 && < 7.0, + stm, + vector >= 0.12.0.3 && < 1.0, + vector-binary-instances >= 0.2.5 && < 1.0, + + Exposed-Modules: + Data.Hashable.Memoize + Data.Hashable.Memoize.ViaConcurrentHashtable +-- Data.Hashable.Memoize.ViaConcurrentHashtableOpt + Data.Hashable.Memoize.ViaIORef + Data.Hashable.Memoize.ViaManualLock + Data.Hashable.Memoize.ViaReadWriteLock + Data.Hashable.Memoize.ViaSemaphore + Data.Hashable.Memoize.ViaTVar + Data.MetricRepresentation + Data.TCM + Data.TCM.Dense + Data.TCM.Overlap + + Other-Modules: + Data.TCM.Dense.FFI + Data.TCM.Internal + + +Library utility + + Import: + build-defaults + + HS-Source-Dirs: + lib/utility/src + + Build-Depends: + base >= 4.11 && < 5.0, + binary >= 0.8 && < 1.0, + containers >= 0.6.2 && < 1.0, + deepseq >= 1.4 && < 2.0, + foldl >= 1.4 && < 2.0, + hashable >= 1.3 && < 2.0, + keys >= 3.12 && < 4.0, + lens >= 4.18 && < 6.0, + matrix >= 0.3.6 && < 0.4, + pointed >= 5.0 && < 6.0, + QuickCheck >= 2.14 && < 3.0, + semigroupoids >= 5.3 && < 7.0, + vector >= 0.12.0.3 && < 1.0, + vector-binary-instances >= 0.2 && < 1.0, + vector-instances >= 3.4 && < 3.5, + + If impl(ghc < 9.0) + Build-Depends: + integer-gmp >= 1.0.2 && < 2.0 + + Exposed-Modules: + Data.List.Utility + Data.Matrix.NotStupid + Data.Vector.NonEmpty + + +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +-- +-- Test-suites of the sub-libraries which compose PCG/PhyG +-- +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +Test-Suite test-dynamic-character + + Import: + build-defaults + + Main-Is: + TestSuite.hs + + Type: + exitcode-stdio-1.0 + + HS-Source-Dirs: + lib/dynamic-character/test + + Build-Depends: + PHANE-alphabet, + PHANE-dynamic-character, + PHANE-dynamic-character-element:quickcheck, + PhyG:dynamic-character, + PhyG:tcm, + base >= 4.11 && < 5.0, + bimap >= 0.3 && < 1.0, + containers >= 0.6.2 && < 1.0, + QuickCheck >= 2.14 && < 3.0, + tasty >= 1.4 && < 2.0, + tasty-quickcheck >= 0.10 && < 1.0, + vector >= 0.12.0.3 && < 1.0, + + Other-Modules: + DirectOptimization.Pairwise.Test + Test.Aligners + Test.QuickCheck.Instances.DynamicCharacter + + +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +-- +-- Additional executables which exist for the one of a few select purposes: +-- +-- * Generating data for PCG/PhyG input +-- +-- * Debugging the component sub-libraries of PCG/PhyG and PCG/PhyG itself +-- +-- * Performing correctness verification +-- +-- * Stocastically searching for non-totality counterexamples +-- +-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +--Executable inspect-dynamic-character +-- +-- Import: +-- build-defaults, +-- +-- Scope: +-- private +-- +-- Main-Is: +-- Inspect.hs +-- +-- HS-Source-Dirs: +-- lib/dynamic-character/test +-- +-- Build-Depends: +-- PHANE-alphabet, +-- PhyG:dynamic-character, +-- PhyG:tcm, +-- base >= 4.11 && < 5.0, +-- bimap >= 0.3 && < 1.0, +-- containers >= 0.6.2 && < 1.0, +-- QuickCheck >= 2.14 && < 3.0, +-- tasty >= 1.4 && < 2.0, +-- tasty-quickcheck >= 0.10 && < 1.0, +-- vector >= 0.12.0.3 && < 1.0, +-- +-- Other-Modules: +-- Test.Aligners +-- Test.QuickCheck.Instances.DynamicCharacter + + + +Benchmark bench-memoized-hashtable + + Import: + build-defaults + + Main-Is: + Time.hs + + Type: + exitcode-stdio-1.0 + + GHC-Options: + -threaded + "-with-rtsopts=-N" + + Hs-Source-Dirs: + lib/tcm/bench + + Build-Depends: + PhyG:tcm, + async, + base >= 4.11 && < 5.0, + criterion >= 1.5 && < 2.0, + deepseq >= 1.4 && < 2.0, + mtl >= 2.3 && < 3.0, + random, + vector >= 0.12.0.3 && < 1.0, + + Other-Modules: + Benchmark.Internal diff --git a/README.md b/README.md index 03b93b9c5..fd42db32e 120000 --- a/README.md +++ b/README.md @@ -1 +1 @@ -doc/ReadMe/PHAGE.md \ No newline at end of file +doc/README.md \ No newline at end of file diff --git a/app/CommandLineOptions.hs b/app/CommandLineOptions.hs new file mode 100644 index 000000000..c09468b51 --- /dev/null +++ b/app/CommandLineOptions.hs @@ -0,0 +1,148 @@ +{-# LANGUAGE OverloadedStrings #-} + +{- | +Command line options parser for the Phylogenetic Graph (PhyG) tool. +-} +module CommandLineOptions ( + -- * Data-types + CommandLineOptions (getArgsCLI), + + -- * Parser + parseCommandLineOptions, + + -- * Display information + printInformationDisplay, +) where + +import CommandLineOptions.Display +import CommandLineOptions.Types +import Data.Foldable +import Data.Functor ((<&>)) +import Data.List.NonEmpty (NonEmpty) +import Data.String +import Options.Applicative +import Options.Applicative.NonEmpty (some1) +import Prettyprinter (align, fillSep, hardline, indent, parens, vsep, (<+>)) +import Software.Metadata +import System.Environment + + +{- | +Command to parse the command line options. +-} +parseCommandLineOptions ∷ IO CommandLineOptions +parseCommandLineOptions = parserInformation >>= customExecParser parserPreferences + + +{- | +The preferences for the CLI parser. +-} +parserPreferences ∷ ParserPrefs +parserPreferences = + prefs $ + fold + [ columns 100 + , disambiguate + , showHelpOnEmpty + , showHelpOnError + ] + + +{- | +Information regarding which command line options are valid and how they are +parsed and interpreted. +-} +parserInformation ∷ IO (ParserInfo CommandLineOptions) +parserInformation = + parserDescription >>= \description → + let displayFlag ∷ DisplayInfoBlock → String → String → Parser DisplayInfoBlock + displayFlag val name desc = flag' val $ fold [long name, help desc] + + pCommandLineOptions2 ∷ Parser CommandLineOptions + pCommandLineOptions2 = + let caseInput ∷ Parser (Either (NonEmpty DisplayInfoBlock) FilePath) + caseInput = Right <$> pInputFile + casePrint ∷ Parser (Either (NonEmpty DisplayInfoBlock) FilePath) + casePrint = Left <$> pDisplayInfos + in fmap OptionsCLI $ caseInput <|> casePrint + + pDisplayInfos ∷ Parser (NonEmpty DisplayInfoBlock) + pDisplayInfos = some1 pDisplayFlag + + pDisplayFlag ∷ Parser DisplayInfoBlock + pDisplayFlag = + asum + [ displayFlag DisplayCredits "credits" "Roll project contributions credits" + , displayFlag DisplayLicense "license" "Emit the software license" + , displayFlag DisplaySplash "splash" "Show splash image" + , displayFlag DisplayVersion "version" "Show version information" + ] + + pInputFile ∷ Parser FilePath + pInputFile = + strArgument $ + fold + [ help $ fold ["Filepath to PhyG command script (required)"] + , metavar "FILE" + , action "file" + ] + in pure $ info (helper <*> pCommandLineOptions2) description + + +parserDescription ∷ IO (InfoMod a) +parserDescription = + getProgName <&> \name → + let projectLess = fromString $ nameAbbreviation projectName + projectMore = fromString $ nameExpansion projectName + programLess = fromString $ nameAbbreviation softwareName + programLong = fromString $ nameExpansion softwareName + programTell = fromString name + + headInfo ∷ InfoMod a + headInfo = + let projectPreamble = fillSep ["The", projectMore, parens projectLess, "project presents:"] + programNameBlock = + indent 2 . align $ + vsep + [ programLong <+> parens programLess + , programTell <+> shortVersionInformation + ] + in headerDoc . Just $ vsep [projectPreamble <> hardline, programNameBlock] + + progInfo ∷ InfoMod a + progInfo = + let sentences = + [ programLong + : parens programLess + : words' + "is a multi-platform program designed to produce phylogenetic graphs from input data and graphs via heuristic searching of general phylogenetic graph space." + , words' "The bio-informatics framework libraries of the broader" + <> [projectMore, parens projectLess] + <> words' "project are the foundation upon which" + <> [programLess] + <> words' "is constructed." + , programLess + : words' + "offers vast functionality, including the optimization of unaligned sequences, and the ability to implement search strategies such as random addition sequence, swapping, and tree fusing." + , "Furthermore," + : programLess + : words' "can generate outputs in the form of implied alignments and graphical representations of cladograms and graphs." + , "What" + : "sets" + : programLess + : words' + "apart from other phylogenetic analysis programs, is the extension to broader classes of input data and phylogenetic graphs." + , words' "The phylogenetic graph inputs and outputs of" + <> [programLess] + <> words' "include trees, as well as softwired and hardwired networks." + ] + textStream = align . fillSep $ fold sentences + words' = fmap fromString . words + in progDescDoc $ Just textStream + in fold + [ failureCode 2 + , fullDesc + , headInfo + , noIntersperse + , progInfo + ] diff --git a/app/CommandLineOptions/Display.hs b/app/CommandLineOptions/Display.hs new file mode 100644 index 000000000..9e46f33a1 --- /dev/null +++ b/app/CommandLineOptions/Display.hs @@ -0,0 +1,77 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TemplateHaskell #-} + +{- | +Human readable renderings for textual output of important program information. +-} +module CommandLineOptions.Display ( + printInformationDisplay, +) where + +import CommandLineOptions.Types +import Data.Foldable +import Data.List.NonEmpty (NonEmpty (..), sort) +import Data.Text.Builder.Linear +import Data.Text.IO +import Software.Credits +import Software.License +import Software.Metadata +import Software.SplashImage +import Prelude hiding (putStrLn) + + +{- | +Gather the information blocks specified from the CLI options and nicely render the blocks together. +-} +printInformationDisplay ∷ NonEmpty DisplayInfoBlock → IO () +printInformationDisplay = + let bordering ∷ Builder + bordering = "\n" + + delimiter ∷ Builder + delimiter = "\n\n\n" + + encloseNonSingle ∷ Builder → (NonEmpty Builder → Builder) → NonEmpty Builder → Builder + encloseNonSingle endcap f blocks@(_ :| bs) = + let joined = f blocks + in case bs of + [] → joined + _ → endcap <> joined <> endcap + + getBuilder ∷ DisplayInfoBlock → IO Builder + getBuilder = \case + DisplayVersion → builderVersionInfo + DisplayLicense → pure builderLicenseText + DisplaySplash → pure builderSplashImage + DisplayCredits → pure builderCreditsRoll + + joinBuilders ∷ NonEmpty Builder → Builder + joinBuilders = encloseNonSingle bordering (intercalate' delimiter) + + printBuilder ∷ IO Builder → IO () + printBuilder b = fmap runBuilder b >>= putStrLn + in printBuilder . fmap joinBuilders . traverse getBuilder . sort + + +builderLicenseText ∷ Builder +builderLicenseText = fromText licenseText + + +builderCreditsRoll ∷ Builder +builderCreditsRoll = contributors + + +builderVersionInfo ∷ IO Builder +builderVersionInfo = fullVersionInformation + + +builderSplashImage ∷ Builder +builderSplashImage = fromText $$(splashImage) + + +intercalate' ∷ Builder → NonEmpty Builder → Builder +intercalate' sep = \case + x :| [] → x + x :| xs → x <> foldMap (sep <>) xs diff --git a/app/CommandLineOptions/Types.hs b/app/CommandLineOptions/Types.hs new file mode 100644 index 000000000..508fb0160 --- /dev/null +++ b/app/CommandLineOptions/Types.hs @@ -0,0 +1,149 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE Strict #-} + +{- | +Record which holds all the resulting preferences of the user as indicated by the +supplied command line options. +-} +module CommandLineOptions.Types ( + -- * Primary data-types + CommandLineOptions (..), + DisplayInfoBlock (..), + + -- * Secondary data-types + + -- ** CommandLineOptionsInput + CommandLineOptionsInput (..), + + -- ** DisplayFlags + DisplayFlags (DisplayFlags), + + -- *** Accessor + displaySomeInformation, + + -- ** InputFile + InputFile (), + + -- *** Constructor + missingInputFile, + + -- *** Accessor + maybeInputFile, +) where + +import Control.DeepSeq +import Data.Foldable (fold) +import Data.List.NonEmpty (NonEmpty, nonEmpty) +import Data.String (IsString (..)) +import GHC.Generics +import Text.Read (Read (readPrec)) + + +{- | +Valid command line options +-} +newtype CommandLineOptions = OptionsCLI {getArgsCLI ∷ Either (NonEmpty DisplayInfoBlock) FilePath} + deriving stock (Generic) + + +{- | +Collection of flags gathered from command line interface which has yet to be validated. +-} +data CommandLineOptionsInput = CommandLineOptionsInput + { inputFile ∷ InputFile + , displayFlags ∷ DisplayFlags + } + deriving stock (Generic) + + +{- | +Collection of binary flags indicating requests for software metadata information to be displayed. +-} +data DisplayFlags = DisplayFlags + { printCredits ∷ Bool + , printLicense ∷ Bool + , printSplash ∷ Bool + , printVersion ∷ Bool + } + deriving stock (Generic) + + +{- | +Enumeration of metadata information blocks which can be displayed. + +/Ordering of this type determines the ordering of information displayed!/ +-} +data DisplayInfoBlock + = DisplayVersion + | DisplayLicense + | DisplaySplash + | DisplayCredits + deriving stock (Eq, Ord, Generic) + + +{- | +Possibly missing input file for the software. +-} +newtype InputFile = InputFile {maybeInputFile ∷ Maybe FilePath} + deriving stock (Generic) + deriving newtype (Eq, Ord, Show) + + +instance IsString InputFile where + fromString [] = InputFile Nothing + fromString fp = InputFile $ Just fp + + +instance NFData CommandLineOptions + + +instance NFData CommandLineOptionsInput + + +instance NFData DisplayFlags + + +instance NFData DisplayInfoBlock + + +instance NFData InputFile + + +instance Read InputFile where + readPrec = InputFile . Just <$> readPrec + + +{- | +Accessor for 'DisplayFlags'. +-} +displaySomeInformation ∷ DisplayFlags → Maybe (NonEmpty DisplayInfoBlock) +displaySomeInformation = + let coalesce + ∷ ( Applicative f + , Monoid (f DisplayInfoBlock) + ) + ⇒ (DisplayFlags → Bool) + → DisplayInfoBlock + → DisplayFlags + → f DisplayInfoBlock + coalesce check val obj + | check obj = pure val + | otherwise = mempty + + gatherSpecifiedFlags = fold . (getFlags <*>) . pure + + getFlags = + [ coalesce printVersion DisplayVersion + , coalesce printLicense DisplayLicense + , coalesce printSplash DisplaySplash + , coalesce printCredits DisplayCredits + ] + in nonEmpty . gatherSpecifiedFlags + + +{- | +Represents the default value for when no input file exists. +-} +missingInputFile ∷ InputFile +missingInputFile = InputFile Nothing diff --git a/app/Software/Credits.hs b/app/Software/Credits.hs new file mode 100644 index 000000000..93e6c7d85 --- /dev/null +++ b/app/Software/Credits.hs @@ -0,0 +1,158 @@ +{-# LANGUAGE ImportQualifiedPost #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Strict #-} + +{- | +Compile-time embeddings of project contributions via Template Haskell. +-} +module Software.Credits ( + contributors, +) where + +import Control.Foldl qualified as L +import Control.Monad +import Data.Foldable +import Data.List.NonEmpty (NonEmpty (..)) +import Data.Map ((!)) +import Data.String (IsString) +import Data.Text hiding (filter, intersperse, replicate) +import Data.Text.Builder.Linear +import Instances.TH.Lift () +import Software.Metadata.Embedded (embeddedDataFiles) +import Text.MMark +import Text.MMark.Extension +import Text.URI qualified as URI +import Prelude hiding (readFile) + + +{- | +Rendering of the financial and techncal contributors to the software. +-} +contributors ∷ Builder +contributors = + let joinLists ∷ (Semigroup a, IsString a) ⇒ a → a → a + joinLists x y = x <> "\n\n\n" <> y + in joinLists authorsList fundingList + + +{- | +List of authors who have contributed to the PHANE project. +-} +authorsList ∷ Builder +authorsList = + let drawAuthorLines ∷ [Text] → Builder + drawAuthorLines rawAuthorLines = + let renderedAuthorLines = bulletPrefix 1 '•' <$> rawAuthorLines + renderedHeaderLines = + unlines' + [ " │ Project Contributors: │" + , " ╘═══════════════════════╛" + ] + in seperateLines $ renderedHeaderLines : renderedAuthorLines + + readAuthorLines ∷ Text → [Text] + readAuthorLines = fmap fst . fromMarkdown processMarkdown + + renderAuthorData ∷ Text → Builder + renderAuthorData = drawAuthorLines . readAuthorLines + in renderAuthorData `fromEmbeddedFileData` "Authors.md" + + +{- | +List of funding sources which have contributed to PHANE project. +-} +fundingList ∷ Builder +fundingList = + let drawFunderLines ∷ [(Text, Maybe Text)] → Builder + drawFunderLines rawFundingSources = + let listingPrefix = bulletPrefix 1 '•' + linkingPrefix = bulletPrefix 2 '›' + processFunder (x, y) = unlines' [listingPrefix x, maybe "" linkingPrefix y] + renderedFunderLines = processFunder <$> rawFundingSources + renderedHeaderLines = + unlines' + [ " │ Funding Provided By: │" + , " ╘═══════════════════════╛" + ] + in seperateLines $ renderedHeaderLines : renderedFunderLines + + readFunderLines ∷ Text → [(Text, Maybe Text)] + readFunderLines = fromMarkdown processMarkdown + + renderFunderData ∷ Text → Builder + renderFunderData = drawFunderLines . readFunderLines + in renderFunderData `fromEmbeddedFileData` "Funding.md" + + +fromEmbeddedFileData ∷ (Text → Builder) → FilePath → Builder +fromEmbeddedFileData processor = processor . (embeddedDataFiles !) + + +processMarkdown ∷ MMark → [(Text, Maybe Text)] +processMarkdown = (`runScanner` L.foldMap g f) + where + f ∷ [Block (NonEmpty Inline)] → [(Text, Maybe Text)] + f = foldMap renderItem + + g ∷ Block a → [Block a] + g = getListBlocks + + +getListBlocks ∷ Block a → [Block a] +getListBlocks = fold . foldMap toList . toList . getListMay + + +getListMay ∷ Block a → Maybe (NonEmpty [Block a]) +getListMay (OrderedList _ xs) = Just xs +getListMay (UnorderedList xs) = Just xs +getListMay _ = Nothing + + +fromMarkdown ∷ (Monoid a) ⇒ (MMark → a) → Text → a +fromMarkdown f = foldMap f . parse "" + + +renderItem ∷ Block (NonEmpty Inline) → [(Text, Maybe Text)] +renderItem (CodeBlock _ val) = [(val, Nothing)] +renderItem (Naked val) = foldMap renderInline val +renderItem (Paragraph val) = foldMap renderInline val +renderItem _ = [] + + +renderInline ∷ Inline → [(Text, Maybe Text)] +renderInline (Plain txt) = [(txt, Nothing)] +renderInline (Emphasis val) = [(asPlainText val, Nothing)] +renderInline (Strong val) = [(asPlainText val, Nothing)] +renderInline (Strikeout val) = [(asPlainText val, Nothing)] +renderInline (Subscript val) = [(asPlainText val, Nothing)] +renderInline (Superscript val) = [(asPlainText val, Nothing)] +renderInline (CodeSpan txt) = [(txt, Nothing)] +renderInline (Link val uri _) = [(asPlainText val, Just $ URI.render uri)] +renderInline _ = [] + + +bulletPrefix ∷ Word → Char → Text → Builder +bulletPrefix depth bullet = + let padded = fold $ replicate (fromEnum depth) indentation + prefix = padded <> fromChar bullet <> " " + in (prefix <>) . fromText + + +indentation ∷ Builder +indentation = " " + + +unlines' ∷ (Foldable f) ⇒ f Builder → Builder +unlines' = intercalate' "\n" + + +intercalate' ∷ (Foldable f) ⇒ Builder → f Builder → Builder +intercalate' sep = + let consider [] = mempty + consider [x] = x + consider (x : xs) = x <> foldMap (sep <>) xs + in consider . toList + + +seperateLines ∷ (Foldable f) ⇒ f Builder → Builder +seperateLines = intercalate' "\n\n" diff --git a/app/Software/License.hs b/app/Software/License.hs new file mode 100644 index 000000000..a57575ea1 --- /dev/null +++ b/app/Software/License.hs @@ -0,0 +1,149 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Strict #-} + +{- | +Compile-time embeddings of project contributions via Template Haskell. +-} +module Software.License ( + licenseText, +) where + +import Data.Map ((!)) +import Data.Text (Text) +import Software.Metadata.Embedded (embeddedDataFiles) + + +{- +import Control.Monad (filterM) +import Data.FileEmbed +import Data.Foldable +import Data.List (isPrefixOf, sort) +import Data.List.NonEmpty (init, last, nonEmpty) +import Data.Text (intercalate, lines) +import Data.Text.IO (readFile) +import Instances.TH.Lift () +import Language.Haskell.TH hiding (Inline) +import Language.Haskell.TH.Syntax hiding (Inline) +--import Paths_PhyG (getDataFileName) +import Paths_PhyG +import Prelude hiding (init, last, lines, readFile) +import System.FilePath ((), normalise, splitDirectories, splitFileName, takeFileName) +import System.Directory +-} + +licenseText ∷ Text +licenseText = embeddedDataFiles ! "LICENSE" + +{- +{- | +The text of the software license under which the PHANE tool is distributed. +-} +licenseText :: ExpQ +licenseText = + let filePath = "doc/LICENSE" + gatherData = polishData . readFile . normalise + polishData = fmap (intercalate "\n" . lines) + inlineData = gatherData filePath >>= lift + noting key = putStrLn . ("\n>>> [TEMPLATE HASKELL NOTE]:\n\t" <>) . (key <>) . (":\t" <>) + findLicense = + let predicate path = + let (fPath, fName) = splitFileName path + allDirs = nonEmpty $ splitDirectories fPath + check dirs = last dirs == "doc" && ("PhyG" `isPrefixOf`) `any` init dirs + in fName == "LICENSE" && maybe False check allDirs + + in getFilesFilteredBy $ pure . predicate + + searchPaths = + let pred = \case + '.':_ -> pure False + fName -> doesDirectoryExist fName + in getCurrentDirectory >>= listDirectory >>= filterM pred + + workingIO = do + getDataFileName "LICENSE" >>= noting "getDataFileName" + getBinDir >>= noting "getBinDir" + getLibDir >>= noting "getLibDir" + getDynLibDir >>= noting "getDynLibDir" + getDataDir >>= noting "getDataDir" + getLibexecDir >>= noting "getLibexecDir" + getSysconfDir >>= noting "getSysconfDir" + cwd <- getCurrentDirectory + noting "CurrentDir" cwd + listDirectory cwd >>= noting "listDirectory" . show + searchPaths >>= noting "searchPaths" . show + found <- searchPaths >>= findLicense + noting "findLicense" $ unlines found + inlineData + in addDependentFile filePath *> + (location >>= runIO . noting "location" . show) + *> runIO workingIO +{- +licenseText :: ExpQ +licenseText = + let locateData = runIO $ getDataFileName "LICENSE" + gatherData = polishData . readFile . normalise + polishData = fmap (intercalate "\n" . lines) + in do filePath <- locateData + addDependentFile filePath + runIO $ gatherData filePath >>= lift +-} + +{-# INLINE getFilesFilteredBy #-} +-- | Recursively get all files and subdirectories in the given directory that +-- satisfy the given predicate. Note that the content of subdirectories not +-- matching the filter is ignored. In particular, that means something like +-- @getDirFiltered doesFileExist@ will /not/ recursively return all files. +-- +-- @since 0.2.2.0 +getFilesFilteredBy + :: (FilePath -> IO Bool) -- ^ File filter + -> [FilePath] -- ^ Input paths + -> IO [FilePath] +getFilesFilteredBy predicate = foldMapA (getFilesFilteredBy' predicate) + +foldMapA :: (FilePath -> IO [FilePath]) -> [FilePath] -> IO [FilePath] +foldMapA = (fmap fold .) . traverse + +{-# INLINE getFilesFilteredBy' #-} +getFilesFilteredBy' + :: (FilePath -> IO Bool) -- ^ Filepath filter + -> FilePath + -> IO [FilePath] +getFilesFilteredBy' check path = + let prepend = (path ) + canDecend fp = do + isDir <- doesDirectoryExist fp + perms <- getPermissions fp + let result = isDir && readable perms && searchable perms + let action + | result = "SCAN" + | otherwise = "PASS" + noting fp "permissions" $ show perms + noting fp "Decendable" action + pure result + + noting file key val = putStrLn $ unwords [ "Dir Scan @", file, key <> ":\t", val ] + + consider fp = do + isFile <- doesFileExist fp + if not isFile + then noting fp "Consideration" "SKIP" *> pure False + else do + isGood <- check fp + let action + | isGood = "KEEP" + | otherwise = "SKIP" + noting fp "Consideration" action + pure $ isFile && isGood + + in do all' <- fmap prepend . sort <$> listDirectory path + curr <- filterM consider all' + dirs <- filterM canDecend all' + case dirs of + [] -> pure curr + ds -> do + next <- foldMapA (getFilesFilteredBy' check) ds + pure $ curr <> next +-} diff --git a/app/Software/Metadata.hs b/app/Software/Metadata.hs new file mode 100644 index 000000000..2f22a0f4a --- /dev/null +++ b/app/Software/Metadata.hs @@ -0,0 +1,120 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TemplateHaskell #-} + +{- | +Important project metadata which is programmatically accessible. + +All temporally variable information is generated at compile-time, reflecting the +state of the project when the program was built. +-} +module Software.Metadata ( + -- * Data-type + AbreviatedName (), + + -- ** Accessors + nameAbbreviation, + nameExpansion, + + -- * Metadata Names + projectName, + softwareName, + + -- * Versioning + shortVersionInformation, + fullVersionInformation, + + -- * Time of build + timeOfCompilation, +) where + +import Data.Foldable (toList) +import Data.Functor ((<&>)) +import Data.String +import Data.Version (showVersion) +import Development.GitRev (gitCommitCount, gitHash) +import PackageInfo_PhyG (name, version) +import Software.Metadata.TimeStamp (compilationTimeStamp, renderTimeStampAsLocalTime) + + +{- | +A name which also is an acronym. +-} +data AbreviatedName = AbbreviationOf + { getNameAbreviated ∷ String + , getNameExpansion ∷ String + } + + +{- | +Get the abbreviation or acronym of an 'AbreviatedName'. +-} +nameAbbreviation ∷ (IsString s) ⇒ AbreviatedName → s +nameAbbreviation = fromString . getNameAbreviated + + +{- | +Get the full expansion on an 'AbreviatedName'. +-} +nameExpansion ∷ (IsString s) ⇒ AbreviatedName → s +nameExpansion = fromString . getNameExpansion + + +{- | +Name of the larger software project. +-} +projectName ∷ AbreviatedName +projectName = "PHANE" `AbbreviationOf` "Phylogenetic Haskell Analytic Network Engine" + + +{- | +Name of the larger software project. +-} +softwareName ∷ AbreviatedName +softwareName = name `AbbreviationOf` "Phylogenetic Graph" + + +{- | +Brief description of the software version. +-} +shortVersionInformation ∷ (IsString s, Semigroup s) ⇒ s +shortVersionInformation = "β-version " <> fromString (showVersion version) + + +{- | +Full description of the software version. + +Uses @TemplateHaskell@ to splice in git hash and commit count information +from the compilation environment. +-} +fullVersionInformation ∷ (IsString s, Monoid s) ⇒ IO s +fullVersionInformation = + let intercalate' ∷ (Foldable f, Monoid s) ⇒ s → f s → s + intercalate' sep = + let consider [] = mempty + consider [x] = x + consider (x : xs) = x <> foldMap (sep <>) xs + in consider . toList + + commits = "(" <> fromString $(gitCommitCount) <> " commits)" + hashInfo = "[" <> fromString (take 7 $(gitHash)) <> "]" + lessName = "(" <> nameAbbreviation softwareName <> ")" + longName = nameExpansion softwareName + in timeOfCompilation <&> \time → + intercalate' + " " + [ longName + , lessName + , shortVersionInformation + , hashInfo + , commits + , "on" + , time + ] + + +{- | +The UTC system time at which (this module of) the binary was compiled. +-} +timeOfCompilation ∷ (IsString s) ⇒ IO s +timeOfCompilation = renderTimeStampAsLocalTime $$(compilationTimeStamp) diff --git a/app/Software/Metadata/Embedded.hs b/app/Software/Metadata/Embedded.hs new file mode 100644 index 000000000..6ec3f1f3b --- /dev/null +++ b/app/Software/Metadata/Embedded.hs @@ -0,0 +1,36 @@ +{-# LANGUAGE TemplateHaskell #-} + +{- | +Compile-time embedding of the UTC time at which the program was built. +-} +module Software.Metadata.Embedded ( + embeddedDataFiles, +) where + +import Data.Bifunctor (bimap) +import Data.ByteString (ByteString) +import Data.FileEmbed +import Data.Map (Map) +import Data.Text (Text) +import Data.Text.Encoding (decodeUtf8Lenient) +import GHC.Exts (fromListN) +import System.FilePath (splitDirectories) +import System.FilePath.Posix (joinPath) + + +embeddedDataFiles ∷ Map FilePath Text +embeddedDataFiles = + let -- We ensure that the data files are stored using Posix + -- path separators (/), even on Windows. + correctPath = joinPath . splitDirectories + correctData = decodeUtf8Lenient + alterations = bimap correctPath correctData + in fromListN 3 $ alterations <$> dataFiles' + + +dataFiles' ∷ [(FilePath, ByteString)] +dataFiles' = + [ ("Authors.md", $(embedFile "doc/Authors.md")) + , ("Funding.md", $(embedFile "doc/Funding.md")) + , ("LICENSE", $(embedFile "doc/LICENSE")) + ] diff --git a/app/Software/Metadata/TimeStamp.hs b/app/Software/Metadata/TimeStamp.hs new file mode 100644 index 000000000..3e19bdfec --- /dev/null +++ b/app/Software/Metadata/TimeStamp.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveLift #-} +{-# LANGUAGE Safe #-} +{-# LANGUAGE TemplateHaskellQuotes #-} +{-# OPTIONS_GHC -Wno-implicit-lift #-} + + +{- | +Compile-time embedding of the UTC time at which the program was built. +-} + +#ifdef ENFORCE_TIMESTAMP +{-# OPTIONS_GHC -fforce-recomp #-} +#endif + +module Software.Metadata.TimeStamp ( + compilationTimeStamp, + renderTimeStampAsLocalTime, +) where + +import Data.String (IsString (fromString)) +import Data.Time.Clock (getCurrentTime) +import Data.Time.Format +import Data.Time.LocalTime (getTimeZone, utcToLocalTime) +import Language.Haskell.TH.Syntax + + +{- | +The UTC system time at which (this module of) the binary was compiled. +-} +compilationTimeStamp ∷ Code Q String +compilationTimeStamp = (show <$> runIO getCurrentTime) `bindCode` (\time → [||time||]) + + +renderTimeStampAsLocalTime ∷ (IsString s) ⇒ String → IO s +renderTimeStampAsLocalTime timeStr = + let timeStamp = read timeStr + formatTimeLocal = formatTime defaultTimeLocale "%Y-%m-%d @ %T" + formatTimeZoned = formatTime defaultTimeLocale "%EZ" + formatTimeStamp zone = + let time = utcToLocalTime zone timeStamp + in unwords [formatTimeLocal time, formatTimeZoned zone] + in fromString . formatTimeStamp <$> getTimeZone timeStamp diff --git a/app/Software/Preamble.hs b/app/Software/Preamble.hs new file mode 100644 index 000000000..fe49ce138 --- /dev/null +++ b/app/Software/Preamble.hs @@ -0,0 +1,91 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TemplateHaskell #-} + +{- | +Human readable message describing the program being used. +-} +module Software.Preamble ( + preambleText, +) where + +import Data.Foldable (fold) +import Data.List.NonEmpty (NonEmpty (..)) +import Data.String (IsString (fromString)) +import Data.Text (unpack) +import Data.Text qualified as T +import Data.Text.Builder.Linear +import PackageInfo_PhyG (copyright, homepage, synopsis) +import Software.License +import Software.Metadata +import System.Environment (getProgName) + + +{- | +Text to be displayed at the beginning of each computation. +-} +preambleText ∷ IO Builder +preambleText = + let copywriting = fromString copyright + description = fromString synopsis + homepageURL = fromString homepage + licenseName = getLicenseLine 1 + + above = encloseAbove :| [] + below = encloseBelow :| [] + inner progName = + let makeMainBlock = + flip + (:|) + [ "" + , copywriting + , "The program '" <> fromString progName <> "' comes with ABSOLUTELY NO WARRANTY;" + , "This is free software, and may be redistributed under the " <> licenseName <> "." + , "" + , description + , "For more information, see: " <> homepageURL + ] + in fmap encloseLine . makeMainBlock <$> fullVersionInformation + + surround x = above <> x <> below + + topProgBlock = fmap surround . inner + in getProgName >>= fmap (intercalate' "\n") . topProgBlock + + +enclosing ∷ Char → String → String → String → Builder +enclosing pad strL strR strM = + let widthPreamble ∷ Int + widthPreamble = 100 + widthL = length strL + widthR = length strR + widthM = length strM + padded = replicate (widthPreamble - widthL - widthM - widthR) pad + in fromString $ fold [strL, strM, padded, strR] + + +encloseAbove ∷ Builder +encloseAbove = enclosing '─' "┌" "┐" "" + + +encloseBelow ∷ Builder +encloseBelow = enclosing '─' "└" "┘" "" + + +encloseLine ∷ String → Builder +encloseLine = enclosing ' ' "│ " " │" + + +getLicenseLine ∷ Word → String +getLicenseLine lineNum = + let n = fromEnum lineNum + in case drop (n - 1) . take n $ T.lines licenseText of + [] → mempty + x : _ → unpack x + + +intercalate' ∷ Builder → NonEmpty Builder → Builder +intercalate' sep = \case + x :| [] → x + x :| xs → x <> foldMap (sep <>) xs diff --git a/app/Software/SplashImage.hs b/app/Software/SplashImage.hs new file mode 100644 index 000000000..695387d11 --- /dev/null +++ b/app/Software/SplashImage.hs @@ -0,0 +1,49 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Safe #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TemplateHaskellQuotes #-} + +{- | +Compile-time embeddings of the ASCII art "splash image" via Template Haskell. +-} +module Software.SplashImage ( + splashImage, +) where + +import Data.Text (Text) +import Language.Haskell.TH (Code, Q) +import Prettyprinter (LayoutOptions (..), PageWidth (..), indent, layoutPretty, vsep) +import Prettyprinter.Render.Text (renderStrict) + + +{- | +The ASCII art "splash image" for the software. +-} +splashImage ∷ Code Q Text +splashImage = + [|| + renderStrict . layoutPretty (LayoutOptions Unbounded) . indent 1 $ + vsep + [ " _____ _ ___ _ _ _ _ _ ___ ___ _ _" + , "|_ _| |_ ___ | _ \\ || | /_\\ | \\| | __| | _ \\_ _ ___ (_)___ __| |_" + , " | | | ' \\/ -_) | _/ __ |/ _ \\| .` | _| | _/ '_/ _ \\| / -_) _| _|" + , " |_| |_||_\\___| |_| |_||_/_/ \\_\\_|\\_|___| |_| |_| \\___// \\___\\__|\\__|" + , " ___ _ _ |__/" + , " | _ \\_ _ ___ ___ ___ _ _| |_ ___ (_)" + , " | _/ '_/ -_|_- parseCommandLineOptions + either printInformationDisplay evaluatePhyG opts + + +{- | +Fully evaluate the 'PhyG' computation 'performSearch' by encapsulating effects +within the 'Evaluation' monad in order to correctly control logging, parallelism, +and randomness. +-} +evaluatePhyG ∷ FilePath → IO () +evaluatePhyG path = do + logConfig ← initializeLogging Dump Warn Nothing + firstSeed ← initializeRandomSeed + runEvaluation logConfig firstSeed () $ performSearch firstSeed path + + +{- | +Perform phylogenetic search using the supplied input file. +-} +performSearch ∷ RandomSeed → FilePath → PhyG () +performSearch initialSeed inputFilePath = do + printProgramPreamble + logWith LogInfo $ "\nCommand script file: '" <> inputFilePath <> "'\n" + logWith LogInfo $ "Initial random seed set to " <> show initialSeed <> "\n" + timeCDBegin ← liftIO getCurrentTime + + -- Process commands to get list of actions + commandContents ← liftIO $ readFile inputFilePath + + -- Process run commands to create one list of things to do + commandContents' ← PC.expandRunCommands [] (lines commandContents) + thingsToDo'' ← PC.getCommandList commandContents' + -- let thingsToDo'' = PC.getCommandList commandContents' + -- mapM_ (logWith LogTech . show) thingsToDo' + + -- preprocess commands for non-parsimony optimality criteria + let thingsToDo' = PC.preprocessOptimalityCriteriaScripts thingsToDo'' + + -- Process Read commands (with prealigned and tcm flags first) + -- expand read commands for wildcards + expandedReadCommands ← mapM (RIF.expandReadCommands []) $ filter ((== Read) . fst) thingsToDo' + + -- sort added to sort input read commands for left right consistency + let thingsToDo = PC.moveSetOutgroupFirst $ L.sort (fold expandedReadCommands) <> filter ((/= Read) . fst) thingsToDo' + -- logWith LogDump . show $ fold expandedReadCommands + + -- check commands and options for basic correctness + logWith LogMore "\nChecking command file syntax\n" + let !commandsOK = V.verifyCommands thingsToDo [] [] + + if commandsOK + then logWith LogMore "\tCommands appear to be properly specified--file availability and contents not checked.\n" + else failWithPhase Parsing "Commands not properly specified\n" + + movedPrealignedList ← mapM (PC.movePrealignedTCM . snd) (filter ((== Read) . fst) thingsToDo) + dataGraphList ← mapM RIF.executeReadCommands movedPrealignedList + let (rawData, rawGraphs, terminalsToInclude, terminalsToExclude, renameFilePairs, reBlockPairs) = RIF.extractInputTuple dataGraphList + + if null rawData && null rawGraphs + then failWithPhase Unifying "\n\nNeither data nor graphs entered. Nothing can be done.\n" + else logWith LogInfo $ unwords ["Entered", show (length rawData), "data file(s) and", show (length rawGraphs), "input graphs\n"] + + -- get set partitions character from Set commands early, the isFirst==True puts in first section--only processing a few fields + -- confusing and should be changed + let setCommands = filter ((== Set) . fst) thingsToDo + (_, partitionCharOptimalityGlobalSettings, _) ← + CE.executeCommands emptyGlobalSettings mempty 0 [] mempty mempty mempty mempty mempty setCommands True + + -- Split fasta/fastc sequences into corresponding pieces based on '#' partition character + rawDataSplit ← DT.partitionSequences (ST.fromString (partitionCharacter partitionCharOptimalityGlobalSettings)) rawData + + -- Process Rename Commands + newNamePairList ← liftIO $ CE.executeRenameReblockCommands Rename renameFilePairs thingsToDo + if thereExistsSome newNamePairList + then logWith LogInfo $ unwords ["Renaming", show $ length newNamePairList, "terminals\n"] + else logWith LogInfo "No terminals to be renamed\n" + + let renamedData = fmap (DT.renameData newNamePairList) rawDataSplit + let renamedGraphs = fmap (GFU.relabelGraphLeaves newNamePairList) rawGraphs + + let numInputFiles = length renamedData + + let thingsToDoAfterReadRename = filter ((/= Read) . fst) $ filter ((/= Rename) . fst) thingsToDo + + -- Reconcile Data and Graphs (if input) including ladderization + -- could be sorted, but no real need + -- get taxa to include in analysis + let renderTerminals pref = (<> "\n") . (("Terminals to " <> pref <> ": ") <>) . unwords . fmap Text.unpack + when (thereExistsSome terminalsToInclude) + . logWith LogInfo + $ renderTerminals "include" terminalsToInclude + + when (thereExistsSome terminalsToExclude) + . logWith LogInfo + $ renderTerminals "exclude" terminalsToExclude + + -- Uses names from terminal list if non-null, and remove excluded terminals + let dataLeafNames' = + if thereExistsSome terminalsToInclude + then L.sort $ L.nub terminalsToInclude + else L.sort $ DT.getDataTerminalNames renamedData + let dataLeafNames'' = dataLeafNames' L.\\ terminalsToExclude + logWith LogInfo ("Data were input for " <> show (length dataLeafNames'') <> " terminals\n") + + -- check data for missing data threshold and remove those above + let missingToExclude = DT.checkLeafMissingData (missingThreshold partitionCharOptimalityGlobalSettings) rawData + let dataLeafNames = + if (missingThreshold partitionCharOptimalityGlobalSettings) == 100 + then dataLeafNames'' + else -- get number of occurrences of name in rawData Terminfo lists + dataLeafNames'' L.\\ missingToExclude + + when (thereExistsSome missingToExclude) + . logWith LogInfo + $ unlines + [ "Terminals above missing data threshold and excluded: " + , unwords [show (length dataLeafNames'' - length dataLeafNames), show missingToExclude] + , unwords [show $ length dataLeafNames, "terminals remain to be analyzed"] + ] + + (crossReferenceString, defaultGlobalSettings, naiveData, reconciledData, reconciledGraphs) ← case dataLeafNames of + [] → failWithPhase Unifying "No leaf data to be analyzed--all excluded" + x : xs → + {- + Data processing here-- there are multiple steps not composed so that + large data files can be precessed and intermediate data goes out + of scope and can be freed back to system. + + This created here and passed to command execution later to + remove dependency of renamed data in command execution to + reduce memory footprint keeoing that stuff around. + -} + let dNames = x :| xs + crossReferenceString = CSV.genCsvFile $ CE.getDataListList renamedData dNames + -- Add in missing terminals to raw data where required + reconciledData' = DT.addMissingTerminalsToInput dNames [] <$> renamedData + reconciledGraphs = fmap (GFU.reIndexLeavesEdges dNames . GFU.checkGraphsAndData dNames) renamedGraphs + in do + -- Check for data file with all missing data--as in had no terminals with data in termainals list + reconciledData ← fold <$> traverse DT.removeAllMissingCharacters reconciledData' + + -- Create unique bitvector names for leaf taxa. + let leafBitVectorNames = DT.createBVNames reconciledData + + -- Create naive data + -- basic usable format organized into blocks, + -- but not grouped by types, or packed (bit, sankoff, prealigned etc) + -- Need to check data for equal in character number + naiveData ← DT.createNaiveData partitionCharOptimalityGlobalSettings reconciledData leafBitVectorNames [] + + -- get mix of static/dynamic characters to adjust dynmaicEpsilon + -- doing on naive data so no packing etc + let fractionDynamicData = U.getFractionDynamic naiveData + + -- Set global values before search--should be integrated with executing commands + -- only stuff that is data dependent here (and seed) + let defaultGlobalSettings = + emptyGlobalSettings + { outgroupIndex = 0 + , outGroupName = head dNames + , seed = fromEnum initialSeed + , numDataLeaves = length leafBitVectorNames + , fractionDynamic = fractionDynamicData + , dynamicEpsilon = 1.0 + ((dynamicEpsilon emptyGlobalSettings - 1.0) * fractionDynamicData) + } + + pure (crossReferenceString, defaultGlobalSettings, naiveData, reconciledData, reconciledGraphs) + + -- logWith LogInfo ("Fraction characters that are dynamic: " <> (show $ (fromIntegral lengthDynamicCharacters) / (fromIntegral $ lengthDynamicCharacters + numStaticCharacters))) + + -- Check to see if there are taxa without any observations. Would become total wildcards + let taxaDataSizeList = + filter ((== 0) . snd) + . zip dataLeafNames + . foldl1 (zipWith (+)) + $ fmap (fmap (snd3 . U.filledDataFields (0, 0)) . fst) reconciledData + + case taxaDataSizeList of + [] → logWith LogInfo "All taxa contain data\n" + xs → + failWithPhase + Unifying + $ fold + [ "\nError: There are taxa without any data: " + , L.intercalate ", " $ Text.unpack . fst <$> xs + , "\n" + ] + + -- Ladderizes (resolves) input graphs and ensures that networks are time-consistent + -- chained network nodes should never be introduced later so only checked no + -- checks for children of tree node that are all netowork nodee (causes displayu problem) + -- let noChainNetNodesList = fmap fromJust $ filter (/=Nothing) $ fmap (LG.removeChainedNetworkNodes True) reconciledGraphs + let noSisterNetworkNodes = fmap LG.removeTreeEdgeFromTreeNodeWithAllNetworkChildren reconciledGraphs -- noChainNetNodesList + ladderizedGraphList ← mapM (GO.convertGeneralGraphToPhylogeneticGraph True) noSisterNetworkNodes + + {-To do + -- Remove any not "selected" taxa from both data and graphs (easier to remove from fgl) + let reconciledData' = removeTaxaFromData includeList reconciledData + let reconciledGraphs' = removeTaxaFromGraphs includeList reconciledData + -} + + -- Group Data--all nonadditives to single character, additives with + -- alphabet < 64 recoded to nonadditive binary, additives with same alphabet + -- combined, + naiveDataGrouped ← R.combineDataByType partitionCharOptimalityGlobalSettings naiveData -- R.groupDataByType naiveData + + -- Bit pack non-additive data + naiveDataPacked ← BP.packNonAdditiveData partitionCharOptimalityGlobalSettings naiveDataGrouped + + -- Optimize Data convert + -- prealigned to non-additive or matrix + -- bitPack resulting non-additive + optimizedPrealignedData ← R.optimizePrealignedData partitionCharOptimalityGlobalSettings naiveDataPacked + + -- Execute any 'Block' change commands--make reBlockedNaiveData + newBlockPairList ← liftIO $ CE.executeRenameReblockCommands Reblock reBlockPairs thingsToDo + + reBlockedNaiveData ← R.reBlockData newBlockPairList optimizedPrealignedData -- naiveData + let thingsToDoAfterReblock = filter ((/= Reblock) . fst) $ filter ((/= Rename) . fst) thingsToDoAfterReadRename + + -- Combines data of exact types into single vectors in each block + -- this is final data processing step + optDataNBPL ← R.combineDataByType partitionCharOptimalityGlobalSettings reBlockedNaiveData + + when (thereExistsSome newBlockPairList) $ logWith LogInfo "Reorganizing Block data" + let optimizedData = + if thereExistsSome newBlockPairList + then optDataNBPL + else optimizedPrealignedData + + -- set outgroup needs to be the firt set command (I think due to stricness) + let initialSetCommands = filter ((== Set) . fst) thingsToDoAfterReblock + + let commandsAfterInitialDiagnose = filter ((/= Set) . fst) thingsToDoAfterReblock + + -- Set reporting data for qualitative characters to Naive data (usually but not if huge data set), empty if packed + reportingData <- + if reportNaiveData partitionCharOptimalityGlobalSettings then R.reBlockData newBlockPairList naiveData + else pure emptyProcessedData + + -- This rather awkward syntax makes sure global settings (outgroup, criterion etc) are in place for initial input graph diagnosis + (_, initialGlobalSettings, _) ← + CE.executeCommands + defaultGlobalSettings + (terminalsToExclude, renameFilePairs) + numInputFiles + crossReferenceString + optimizedData + optimizedData + reportingData + [] + [] + initialSetCommands + False + + -- Get CPUTime so far ()data input and processing + dataCPUTime ← liftIO getCPUTime + + -- Diagnose any input graphs + let action = T.multiTraverseFullyLabelGraphReduced initialGlobalSettings optimizedData True True Nothing + actionPar ← getParallelChunkTraverse + + inputGraphList ← + actionPar action (fmap (LG.rerootTree (outgroupIndex initialGlobalSettings)) ladderizedGraphList) + {-PU.seqParMap + PU.myStrategy + (T.multiTraverseFullyLabelGraphReduced initialGlobalSettings optimizedData True True Nothing) + (fmap (LG.rerootTree (outgroupIndex initialGlobalSettings)) ladderizedGraphList) + -} + + -- Get CPUTime for input graphs + afterGraphDiagnoseTCPUTime ← liftIO getCPUTime + let (inputGraphTime, inGraphNumber, minOutCost, maxOutCost) = case inputGraphList of + [] → (0, 0, infinity, infinity) + _ → + ( fromIntegral afterGraphDiagnoseTCPUTime - fromIntegral dataCPUTime + , length inputGraphList + , minimum $ fmap snd5 inputGraphList + , maximum $ fmap snd5 inputGraphList + ) + + let inputProcessingData = emptySearchData{commentString = "Input and data processing", duration = fromIntegral dataCPUTime} + let inputGraphProcessing = + emptySearchData + { minGraphCostOut = minOutCost + , maxGraphCostOut = maxOutCost + , numGraphsOut = inGraphNumber + , commentString = "Input graph processing" + , duration = inputGraphTime + } + + -- Execute Following Commands (searches, reports etc) + (finalGraphList, _, _) ← + CE.executeCommands + (initialGlobalSettings{searchData = [inputGraphProcessing, inputProcessingData]}) + (terminalsToExclude, renameFilePairs) + numInputFiles + crossReferenceString + optimizedData + optimizedData + reportingData + inputGraphList + [] + commandsAfterInitialDiagnose -- (transformString <> commandsAfterInitialDiagnose) + False + + -- print global setting just to check + -- logWith LogInfo (show _finalGlobalSettings) + + -- Add in model and root cost if optimality criterion needs it + -- if (rootComplexity initialGlobalSettings) /= 0.0 then logWith LogInfo ("\tUpdating final graph with any root priors") + -- else logWith LogInfo "" + + {- + This should not be necessary--moved to traversal with root cost adjustment + rediagnose for NCM and PMDL due to packing, in most cases not required, just being sure etc + -} + let rediagnoseWithReportingdata = True + finalGraphList' ← + if rediagnoseWithReportingdata + then -- if optimalityCriterion initialGlobalSettings `elem` [SI, NCM, PMDL] then + T.updateGraphCostsComplexities initialGlobalSettings reportingData optimizedData rediagnoseWithReportingdata finalGraphList + else pure finalGraphList + + -- let finalGraphList' = finalGraphList + + let minCost = if null finalGraphList then 0.0 else minimum $ fmap snd5 finalGraphList' + let maxCost = if null finalGraphList then 0.0 else maximum $ fmap snd5 finalGraphList' + + -- get network numbers for graph complexities (PMDL, SI) + let grabber = length . fth4 . LG.splitVertexList . fst5 <$> finalGraphList' + let pairFunction ∷ ∀ {a}. (a, a) → a + (netWorkVertexList, pairFunction, units) + | optimalityCriterion initialGlobalSettings == Parsimony = (replicate (length finalGraphList') 0, fst, "") + -- PMDL and DI in base 2 + | optimalityCriterion initialGlobalSettings `elem` [PMDL, SI] = (grabber, fst, " bits") + -- \| graphType initialGlobalSettings == SoftWired = (grabber, fst, " bits") + -- NCM and MAPA in base 10 + | otherwise = (grabber, snd, " dits") + + -- final results reporting to stderr + logWith LogInfo $ + unwords + [ "Execution returned" + , show $ length finalGraphList' + , "graph(s) at cost range" + , (show (minCost, maxCost)) <> units + , "\n" + ] + + -- insures model complexity 0 if not PMDL correctly accounted for in traversals + let adjModelComplexity = + if optimalityCriterion initialGlobalSettings == PMDL + then modelComplexity initialGlobalSettings + else 0.0 + + when (optimalityCriterion initialGlobalSettings /= Parsimony) $ + logWith LogInfo $ + unwords + [ "\tModel complexity " <> (show adjModelComplexity) <> units <> "\n" + , "\tRoot complexity " <> (show $ rootComplexity initialGlobalSettings) <> units <> "\n" + , "\tGraph complexities " + <> (show $ fmap pairFunction $ fmap ((graphComplexityList initialGlobalSettings) IL.!!!) netWorkVertexList) + <> units + , "\n\n" + ] + + -- Final Stderr report + timeCPUEnd ← liftIO getCPUTime + timeCDEnd ← liftIO getCurrentTime + + -- logWith LogInfo ("CPU Time " <> (show timeCPUEnd)) + let wallClockDuration = floor (1000000000000 * nominalDiffTimeToSeconds (diffUTCTime timeCDEnd timeCDBegin)) ∷ Integer + -- logWith LogInfo ("Current time " <> (show wallClockDuration)) + let cpuUsage = fromIntegral timeCPUEnd / fromIntegral wallClockDuration ∷ Double + -- logWith LogInfo ("CPU % " <> (show cpuUsage)) + + logWith LogInfo . unlines $ + ("\t" <>) + <$> [ unwords ["Wall-Clock time ", show ((fromIntegral wallClockDuration ∷ Double) / 1000000000000.0), "second(s)"] + , unwords ["CPU time", show ((fromIntegral timeCPUEnd ∷ Double) / 1000000000000.0), "second(s)"] + , unwords ["CPU usage", show (floor (100.0 * cpuUsage) ∷ Integer) <> "%"] + ] + + +thereExistsSome ∷ (Foldable f) ⇒ f a → Bool +thereExistsSome = not . null + + +printProgramPreamble ∷ PhyG () +printProgramPreamble = liftIO preambleText >>= logWith LogDone . runBuilder . (fromString "\n\n" <>) diff --git a/bin/build-docs-tutorials.sh b/bin/build-docs-tutorials.sh new file mode 100755 index 000000000..500c2d43e --- /dev/null +++ b/bin/build-docs-tutorials.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -e + +TUTORIAL_STEM='tutorial' +TUTORIAL_DIR="${TUTORIAL_STEM}s" +TUTORIAL_DOCS_PATTERN="${TUTORIAL_STEM}_*_doc" + +note() { + local above="┍━━" + local below="┕━━" + local prefix="│ " + local pattern="${above}\n%s\n${below}\n" + echo -n -e "${1}" | sed "s/^/$prefix /" | xargs -0 printf ${pattern} +} + +build_tutorial_documetation() { + local DOC_PATH="${1}" + local DIR_INIT=$(pwd) + cd "${DOC_PATH}" + + # Get file LaTeX file + FILE_TEX=$(find . -maxdepth 1 -iname "*.tex" -print -quit) + FILE_BASENAME="${FILE_TEX%.*}" + + # Build PDF from LaTeX + latexmk -f -pdf "${FILE_BASENAME}" > /dev/null 2>&1 + ret=$? + if [ $ret -ne 0 ]; then + printf "\n>>>\tFailed to build PDF for '%s'!\n>>>\tCheck the log file:>>>\t\t%s.log" "${FILE_BASENAME}" "${FILE_BASENAME}" + fi + + # Clean up build artifacts and exit + latexmk -c "${FILE_BASENAME}" > /dev/null 2>&1 + cd "${DIR_INIT}" +} + +# Find all tutorials +TUTORIAL_FOUND=$(find ${TUTORIAL_DIR} -maxdepth 2 -iname "${TUTORIAL_DOCS_PATTERN}" -print | sort) + +# Conert found results to a Bash "array" type +SAVEIFS=${IFS} # Save current IFS (Internal Field Separator) +IFS=$'\n' # Change IFS to newline char +TUTORIAL_FILEPATH=(${TUTORIAL_FOUND}) +IFS=${SAVEIFS} # Restore original IFS + + +printf "Compiling LaTeX documentation for the '${#TUTORIAL_FILEPATH[@]}' tutorials found:\n\n" +for (( i=0; i<${#TUTORIAL_FILEPATH[@]}; i++ )) +do + printf "\t%d: %s\n" "${i}" "${TUTORIAL_FILEPATH[${i}]}" + build_tutorial_documetation "${TUTORIAL_FILEPATH[${i}]}" +done +printf "Documentation PDFs successfully built!\n diff --git a/bin/build-docs-user-manual.sh b/bin/build-docs-user-manual.sh new file mode 100755 index 000000000..ee6dffcf0 --- /dev/null +++ b/bin/build-docs-user-manual.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +TUTORIAL_STEM='PhyG-User-Manual' +TUTORIAL_DIR="doc/${TUTORIAL_STEM}" + +note() { + local above="┍━━" + local below="┕━━" + local prefix="│ " + local pattern="${above}\n%s\n${below}\n" + echo -n -e "${1}" | sed "s/^/$prefix /" | xargs -0 printf ${pattern} +} + +DIR_INIT=$(pwd) +cd "${TUTORIAL_DIR}" + +# Get file LaTeX file +FILE_TEX=$(find . -maxdepth 1 -iname "*.tex" -print -quit) +FILE_BASENAME="${FILE_TEX%.*}" + +printf "FILE_BASENAME:\t%s\n" "${FILE_BASENAME}" + +# Build PDF from LaTeX +latexmk -f -pdf "${FILE_BASENAME}" > /dev/null 2>&1 +ret=$? +if [ $ret -ne 0 ]; then + printf "\n>>>\tFailed to build PDF for '%s'!\n>>>\tCheck the log file:>>>\t\t%s.log" "${FILE_BASENAME}" "${FILE_BASENAME}" + exit 1 +fi + +# Clean up build artifacts and exit +latexmk -c "${FILE_BASENAME}" > /dev/null 2>&1 +cd "${DIR_INIT}" + +printf "User Manual PDF successfully built!\n diff --git a/cabal.project b/cabal.project index c02e7147b..689efaefd 120000 --- a/cabal.project +++ b/cabal.project @@ -1 +1 @@ -cfg/cabal.project.domain \ No newline at end of file +config/cabal.project.development \ No newline at end of file diff --git a/cfg/cabal.project.domain b/cfg/cabal.project.domain deleted file mode 100644 index 75ade70dd..000000000 --- a/cfg/cabal.project.domain +++ /dev/null @@ -1,86 +0,0 @@ ---------------------------------------------------------------------------------- --- Domain of PHAGE packages --- * PHAGE-integration-tests ---------------------------------------------------------------------------------- - -packages: - pkg/PHAGE-integration-tests - pkg/PhyGraph - pkg/PhyloLib - pkg/*/*.cabal - - ---------------------------------------------------------------------------------- --- Build decisions --- * Static or dynamic binaries --- * Compiler versions ---------------------------------------------------------------------------------- - -executable-static: False -with-compiler: ghc-9.2.4 - - ---------------------------------------------------------------------------------- --- Output paths --- --- Place build output(s) in more accessible locations. Binaries should go in `bin` --- and log files of the build/benchmarks/tests should go in `log`. ---------------------------------------------------------------------------------- - --- datadir: ./data --- docdir: ./doc --- htmldir: ./doc/html -builddir: ./doc -symlink-bindir: ./bin -installdir: ./bin -logs-dir: ./log -install-method: copy -overwrite-policy: always - - ---------------------------------------------------------------------------------- --- Build metadata flags --- --- Sensible alterations from the defaults to improve the build experience during --- iterative development. Simultaneously, does not hinder release builds. ---------------------------------------------------------------------------------- - -haddock-html: True -haddock-tests: True -haddock-benchmarks: True -haddock-internal: True -haddock-hyperlink-source: True -jobs: $ncpus -keep-going: True -minimize-conflict-set: True - - ---------------------------------------------------------------------------------- --- Dependency specifications --- --- Always allow a newer version of libraries which are tightly coupled to the GHC --- compiler version. This allows the project to build with newer versions of the --- compiler without having to tediously tinker with dependencies, both direct and --- transitive. --- --- Always allow a newer version of libraries which have had a major release due --- to a security or performance patch. Add a `preference` indicating the desire --- to use a version greater than or equal to the patch. ---------------------------------------------------------------------------------- - -allow-newer: - aeson, - bytestring, - text, - -preferences: - aeson >= 2.0.0, - bytestring >= 0.11.3, - filepath >= 1.4.100.0, - mtl >= 2.3, - text >= 2.0, - -source-repository-package - type: git - location: https://github.com/recursion-ninja/bv-little - tag: 4541d78545933f450af5c241a83e4b07a0e69dca diff --git a/cfg/cabal.project.integration b/cfg/cabal.project.integration deleted file mode 100644 index 02762bc14..000000000 --- a/cfg/cabal.project.integration +++ /dev/null @@ -1,119 +0,0 @@ ---------------------------------------------------------------------------------- --- Domain of PHAGE packages --- * PHAGE-integration-tests ---------------------------------------------------------------------------------- - -packages: - pkg/PHAGE-integration-tests - pkg/PhyGraph - pkg/PhyloLib - pkg/*/*.cabal - -with-compiler: ghc-9.2.4 - - ---------------------------------------------------------------------------------- --- Output paths --- --- Place build output(s) in more accessible locations. Binaries should go in `bin` --- and log files of the build/benchmarks/tests should go in `log`. ---------------------------------------------------------------------------------- - --- datadir: ./data --- docdir: ./doc --- htmldir: ./doc/html -symlink-bindir: ./bin -installdir: ./bin -logs-dir: ./log -install-method: copy -overwrite-policy: always - - ---------------------------------------------------------------------------------- --- Build metadata flags --- --- Sensible alterations from the defaults to improve the build experience during --- iterative development. Simultaneously, does not hinder release builds. ---------------------------------------------------------------------------------- - -haddock-html: True -haddock-tests: True -haddock-benchmarks: True -haddock-internal: True -haddock-hyperlink-source: True -jobs: $ncpus -keep-going: True -minimize-conflict-set: True - - ---------------------------------------------------------------------------------- --- Package flags (useful defaults) --- --- Instruct all transative dependencies to be built *with both* optimization and --- profiling enabled. This reduces rebuilds when debugging while not sacraficing --- undue execution speed from imported code. --- --- Contrastingly, *disable both* optimization and profiling when building the --- current package's codebase. This improves the frequent recompilation speed --- during iterative development. --- --- Finally, enable all warnings and then prune out the undesirable ones. Having --- the large litany of warnings enabled forces pre-emptive compatibility with --- future breaking changes as well as improving overall code quality. ---------------------------------------------------------------------------------- - --- Applies to *all* packages, not just dependencies... -package * - benchmarks: False - documentation: False - tests: False - optimization: 2 - - -package PHAGE-integration-tests - benchmarks: False - documentation: False - tests: True - executable-static: False - optimization: 2 - flags: -SingleThreaded - - -package PhyGraph - benchmarks: False - documentation: False - tests: False - executable-static: False - optimization: 2 - flags: -SingleThreaded - - ---------------------------------------------------------------------------------- --- Dependency specifications --- --- Always allow a newer version of libraries which are tightly coupled to the GHC --- compiler version. This allows the project to build with newer versions of the --- compiler without having to tediously tinker with dependencies, both direct and --- transitive. --- --- Always allow a newer version of libraries which have had a major release due --- to a security or performance patch. Add a `preference` indicating the desire --- to use a version greater than or equal to the patch. ---------------------------------------------------------------------------------- - -allow-newer: - aeson, - bytestring, - text, - -preferences: - aeson >= 2.0.0, - bytestring >= 0.11.3, - filepath >= 1.4.100.0, - mtl >= 2.3, - text >= 2.0, - -source-repository-package - type: git - location: https://github.com/recursion-ninja/bv-little - tag: 4541d78545933f450af5c241a83e4b07a0e69dca diff --git a/cfg/cabal.project.release b/cfg/cabal.project.release deleted file mode 100644 index 3affa00fb..000000000 --- a/cfg/cabal.project.release +++ /dev/null @@ -1,164 +0,0 @@ ---------------------------------------------------------------------------------- --- Domain of PHAGE packages --- * PHAGE-integration-tests ---------------------------------------------------------------------------------- - -packages: - pkg/PHAGE-integration-tests - pkg/PhyGraph - pkg/PhyloLib - pkg/*/*.cabal - - ---------------------------------------------------------------------------------- --- Build decisions --- * Static or dynamic binaries --- * Compiler versions ---------------------------------------------------------------------------------- - -executable-static: True -with-compiler: ghc-9.2.3 - - ---------------------------------------------------------------------------------- --- Output paths --- --- Place build output(s) in more accessible locations. Binaries should go in `bin` --- and log files of the build/benchmarks/tests should go in `log`. ---------------------------------------------------------------------------------- - --- datadir: ./data --- docdir: ./doc --- htmldir: ./doc/html -symlink-bindir: ./bin -installdir: ./bin -logs-dir: ./log --- install-method: copy -overwrite-policy: always - - ---------------------------------------------------------------------------------- --- Build metadata flags --- --- Sensible alterations from the defaults to improve the build experience during --- iterative development. Simultaneously, does not hinder release builds. ---------------------------------------------------------------------------------- - -haddock-html: True -haddock-tests: True -haddock-benchmarks: True -haddock-internal: True -haddock-hyperlink-source: True -jobs: $ncpus -keep-going: True -minimize-conflict-set: True - - ---------------------------------------------------------------------------------- --- Package flags (useful defaults) --- --- Instruct all transative dependencies to be built *with both* optimization and --- profiling enabled. This reduces rebuilds when debugging while not sacraficing --- undue execution speed from imported code. --- --- Contrastingly, *disable both* optimization and profiling when building the --- current package's codebase. This improves the frequent recompilation speed --- during iterative development. --- --- Finally, enable all warnings and then prune out the undesirable ones. Having --- the large litany of warnings enabled forces pre-emptive compatibility with --- future breaking changes as well as improving overall code quality. ---------------------------------------------------------------------------------- - --- Applies to *all* packages, not just dependencies... -package * - benchmarks: False - documentation: False - tests: False - library-profiling: False - library-stripping: True - optimization: 2 - ghc-options: - -- Optimization flags - -O2 - -fexcess-precision - -fexpose-all-unfoldings - -flate-specialise - -fmax-simplifier-iterations=16 - -foptimal-applicative-do - -fspec-constr-count=8 - -fspec-constr-keen - -fspecialize-aggressively - -fstatic-argument-transformation - -package PhyGraph - benchmarks: False - documentation: False - tests: False - library-profiling: False - library-stripping: True - executable-stripping: True - optimization: 2 - ghc-options: - -- Optimization flags - -O2 - -fexcess-precision - -fexpose-all-unfoldings - -flate-specialise - -fmax-simplifier-iterations=16 - -foptimal-applicative-do - -fspec-constr-count=8 - -fspec-constr-keen - -fspecialize-aggressively - -fstatic-argument-transformation - - -feager-blackholing - -threaded - -with-rtsopts="-A64m -AL8 -H1024m -maxN -n4m --nonmoving-gc -qa -qm" - - -- Sanity check warnings: - -- 1. Fail on a warning - -- 2. Include all warnings by default - -- 3. Exclude the undesirable warnings - -Werror - -Weverything - -- Exclusions: - -Wno-all-missed-specialisations - -Wno-implicit-prelude - -Wno-missing-import-lists - -Wno-missing-kind-signatures - -Wno-missing-safe-haskell-mode - -Wno-monomorphism-restriction - -Wno-type-defaults - -Wno-unsafe - - ---------------------------------------------------------------------------------- --- Dependency specifications --- --- Always allow a newer version of libraries which are tightly coupled to the GHC --- compiler version. This allows the project to build with newer versions of the --- compiler without having to tediously tinker with dependencies, both direct and --- transitive. --- --- Always allow a newer version of libraries which have had a major release due --- to a security or performance patch. Add a `preference` indicating the desire --- to use a version greater than or equal to the patch. ---------------------------------------------------------------------------------- - -allow-newer: - aeson, - bytestring, - text, - -preferences: - aeson >= 2.0.0, - bytestring >= 0.11.3, - filepath >= 1.4.100.0, - mtl >= 2.3, - text >= 2.0, - -source-repository-package - type: git - location: https://github.com/recursion-ninja/bv-little - tag: 4541d78545933f450af5c241a83e4b07a0e69dca diff --git a/cfg/.hlint.yaml b/config/.hlint.yaml similarity index 97% rename from cfg/.hlint.yaml rename to config/.hlint.yaml index 53a1e20fd..53cc86480 100644 --- a/cfg/.hlint.yaml +++ b/config/.hlint.yaml @@ -36,6 +36,11 @@ , rhs: fold1 } +- warn: { name: Prefer fold1 to sconcat + , lhs: bit $ fromEnum gapIndex + , rhs: \x -> (x `xor` x) `setBit` fromEnum gapIndex + } + # Custom suggestion hints - suggestion: { name: Strict sum , note: Prefer custom strict sum to lazy sum. Decreases laziness diff --git a/config/cabal.project.development b/config/cabal.project.development new file mode 100644 index 000000000..d886682f7 --- /dev/null +++ b/config/cabal.project.development @@ -0,0 +1,246 @@ +--------------------------------------------------------------------------------- +-- Build metadata flags +-- +-- Sensible alterations from the defaults to improve the build experience during +-- iterative development. Simultaneously, does not hinder release builds. +--------------------------------------------------------------------------------- + +Executable-Static: False +Haddock-HTML: True +Haddock-Tests: True +Haddock-benchmarks: True +Haddock-Internal: True +Haddock-Hyperlink-Source: True +Jobs: $ncpus +--keep-going: True +Minimize-Conflict-Set: True +Semaphore: True + + +--------------------------------------------------------------------------------- +-- Dependency specifications +-- +-- Always allow a newer version of libraries which are tightly coupled to the GHC +-- compiler version. This allows the project to build with newer versions of the +-- compiler without having to tediously tinker with dependencies, both direct and +-- transitive. +-- +-- Always allow a newer version of libraries which have had a major release due +-- to a security or performance patch. Add a `preference` indicating the desire +-- to use a version greater than or equal to the patch. +--------------------------------------------------------------------------------- + +Allow-Newer: + aeson, + base, + base-compat, + binary, + bytestring, + Cabal, + containers, + deepseq, + ghc, + ghc-bignum, + ghc-prim, + hashable, + integer-gmp, + lens, + mtl, + pretty, + primative, + semigroupoids, + template-haskell, + text, + text-builder-linear, + th-abstraction, + time + +-- Required to use the GHC-bound (installed) version +Constraints: + base installed, + ghc installed, + ghc-bignum installed, + ghc-prim installed, + integer-gmp installed, + template-haskell installed + + +-- Reduce the project's dependency footprint via build flags +Constraints: + bifunctors -tagged, + comonad -indexed-traversable, + lens +inlining -test-hunit -test-properties -test-templates +trustworthy, + pointed -comonad -kan-extensions -semigroupoids -semigroups -stm -tagged, + semialign -semigroupoids, + semigroups -binary -bytestring -deepseq -hashable -tagged -template-haskell -text -transformers -unordered-containers, + semigroupoids -comonad, + + +-- Apply performance flags to specific packages +Constraints: + bitvec +simd, + directory +os-string, + hashable -random-initial-seed, + hashtables -bounds-checking -debug -portable +unsafe-tricks, + optparse-applicative +process, + tasty +unix, + text -pure-haskell +simdutf, + vector -BoundsChecks -InternalChecks -UnsafeChecks -Wall, + +Preferences: + aeson >= 2.1.0.0, + bytestring >= 0.12.0.0, + filepath >= 1.4.100.0, + mtl >= 2.3.1, + text >= 2.0.1, + unordered-containers >= 0.2.18.0, + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-alphabet + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character-element + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-evaluation + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-file-formats + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-class + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-transition + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-units + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-PhyloLib + Tag: 6f300d99c3be9e5201f31ac4480bbacb3f547f8b + + +--------------------------------------------------------------------------------- +-- Output paths +-- +-- Place build output(s) in more accessible locations. Binaries should go in `bin` +-- and log files of the build/benchmarks/tests should go in `log`. +--------------------------------------------------------------------------------- + +Builddir: ./doc +Symlink-bindir: ./bin +Installdir: ./bin +Logs-Dir: ./log +Install-method: copy +Overwrite-Policy: always +-- Datadir: ./data +-- Docdir: ./doc +-- HTMLdir: ./doc/html + + +--------------------------------------------------------------------------------- +-- Package flags (useful defaults) +-- +-- Instruct all transative dependencies to be built *with both* optimization and +-- profiling enabled. This reduces rebuilds when debugging while not sacraficing +-- undue execution speed from imported code. +-- +-- Contrastingly, *disable both* optimization and profiling when building the +-- current package's codebase. This improves the frequent recompilation speed +-- during iterative development. +-- +-- Finally, enable all warnings and then prune out the undesirable ones. Having +-- the large litany of warnings enabled forces pre-emptive compatibility with +-- future breaking changes as well as improving overall code quality. +--------------------------------------------------------------------------------- + +Packages: . + +-- Applies to *all* packages, not just dependencies... +Package * + Benchmarks: False + Documentation: True + Tests: False + Library-Profiling: False + Library-Profiling-detail: all-functions + Optimization: 2 + +-- Applies to *only* the specified package! +Package PhyG + Benchmarks: True + Documentation: True + Tests: True + Executable-Profiling: True + Library-Profiling: True + Library-Profiling-detail: all-functions + Profiling-Detail: all-functions + + +--------------------------------------------------------------------------------- +-- GHC-9.8.* comptibility +-- +-- Updated Hackage references for GHC maintainer patched dependency packages +-- +-- Also manual source references for other patched dependency packages +--------------------------------------------------------------------------------- + +With-Compiler: + ghc-9.10.1 + +Active-Repositories: + hackage.haskell.org, + head.hackage.ghc.haskell.org:override + +Repository head.hackage.ghc.haskell.org + url: https://ghc.gitlab.haskell.org/head.hackage/ + secure: True + key-threshold: 3 + root-keys: + 26021a13b401500c8eb2761ca95c61f2d625bfef951b939a8124ed12ecf07329 + 7541f32a4ccca4f97aea3b22f5e593ba2c0267546016b992dfadcd2fe944e55d + f76d08be13e9a61a377a85e2fb63f4c5435d40f8feb3e12eb05905edb8cdea89 + + -- f76d08be13e9a61a377a85e2fb63f4c5435d40f8feb3e12eb05905edb8cdea89 + -- 26021a13b401500c8eb2761ca95c61f2d625bfef951b939a8124ed12ecf07329 + -- 7541f32a4ccca4f97aea3b22f5e593ba2c0267546016b992dfadcd2fe944e55d + + +--Source-Repository-Package +-- Type: git +-- Location: https://github.com/recursion-ninja/linear-builder + +--Source-Repository-Package +-- Type: git +-- Location: https://github.com/andreasabel/microlens/ +-- Subdir: microlens-th +-- Tag: 70a9990 diff --git a/config/cabal.project.ghc-9.8 b/config/cabal.project.ghc-9.8 new file mode 100644 index 000000000..228f01868 --- /dev/null +++ b/config/cabal.project.ghc-9.8 @@ -0,0 +1,202 @@ +--------------------------------------------------------------------------------- +-- Build metadata flags +-- +-- Sensible alterations from the defaults to improve the build experience during +-- iterative development. Simultaneously, does not hinder release builds. +--------------------------------------------------------------------------------- + +Executable-Static: False +Haddock-HTML: True +Haddock-Tests: True +Haddock-benchmarks: True +Haddock-Internal: True +Haddock-Hyperlink-Source: True +Jobs: $ncpus +--keep-going: True +Minimize-Conflict-Set: True + + +--------------------------------------------------------------------------------- +-- Dependency specifications +-- +-- Always allow a newer version of libraries which are tightly coupled to the GHC +-- compiler version. This allows the project to build with newer versions of the +-- compiler without having to tediously tinker with dependencies, both direct and +-- transitive. +-- +-- Always allow a newer version of libraries which have had a major release due +-- to a security or performance patch. Add a `preference` indicating the desire +-- to use a version greater than or equal to the patch. +--------------------------------------------------------------------------------- + +Allow-Newer: + aeson, + base, + base-compat, + binary, + bytestring, + Cabal, + containers, + deepseq, + ghc, + ghc-bignum, + ghc-prim, + hashable, + integer-gmp, + lens, + mtl, + pretty, + primative, + semigroupoids, + template-haskell, + text, + text-builder-linear, + th-abstraction, + time + +Constraints: + base installed, + ghc installed, + ghc-bignum installed, + ghc-prim installed, + integer-gmp installed, + template-haskell installed + +Constraints: + optparse-applicative -process, + tasty -unix + +Preferences: + aeson >= 2.1.0.0, + bytestring >= 0.12.0.0, + filepath >= 1.4.100.0, + mtl >= 2.3.1, + text >= 2.0.1, + unordered-containers >= 0.2.18.0, + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-alphabet + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character-element + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-class + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-transition + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-units + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-PhyloLib + + +--------------------------------------------------------------------------------- +-- Output paths +-- +-- Place build output(s) in more accessible locations. Binaries should go in `bin` +-- and log files of the build/benchmarks/tests should go in `log`. +--------------------------------------------------------------------------------- + +Builddir: ./doc +Symlink-bindir: ./bin +Installdir: ./bin +Logs-Dir: ./log +Install-method: copy +Overwrite-Policy: always +-- Datadir: ./data +-- Docdir: ./doc +-- HTMLdir: ./doc/html + + +--------------------------------------------------------------------------------- +-- Package flags (useful defaults) +-- +-- Instruct all transative dependencies to be built *with both* optimization and +-- profiling enabled. This reduces rebuilds when debugging while not sacraficing +-- undue execution speed from imported code. +-- +-- Contrastingly, *disable both* optimization and profiling when building the +-- current package's codebase. This improves the frequent recompilation speed +-- during iterative development. +-- +-- Finally, enable all warnings and then prune out the undesirable ones. Having +-- the large litany of warnings enabled forces pre-emptive compatibility with +-- future breaking changes as well as improving overall code quality. +--------------------------------------------------------------------------------- + +Packages: . + +-- Applies to *all* packages, not just dependencies... +Package * + Benchmarks: False + Documentation: True + Tests: False + Library-Profiling: False + Library-Profiling-detail: all-functions + Optimization: 2 + +-- Applies to *only* the specified package! +Package PhyG + Benchmarks: True + Documentation: True + Tests: True + Executable-Profiling: True + Library-Profiling: True + Library-Profiling-detail: all-functions + Profiling-Detail: all-functions + + +--------------------------------------------------------------------------------- +-- GHC-9.8.* comptibility +-- +-- Updated Hackage references for GHC maintainer patched dependency packages +-- +-- Also manual source references for other patched dependency packages +--------------------------------------------------------------------------------- + +With-Compiler: + ghc-9.8.1 + +Active-Repositories: + hackage.haskell.org, + head.hackage.ghc.haskell.org:override + +Repository head.hackage.ghc.haskell.org + url: https://ghc.gitlab.haskell.org/head.hackage/ + secure: True + key-threshold: 3 + root-keys: + f76d08be13e9a61a377a85e2fb63f4c5435d40f8feb3e12eb05905edb8cdea89 + 26021a13b401500c8eb2761ca95c61f2d625bfef951b939a8124ed12ecf07329 + 7541f32a4ccca4f97aea3b22f5e593ba2c0267546016b992dfadcd2fe944e55d + + +--Source-Repository-Package +-- Type: git +-- Location: https://github.com/recursion-ninja/linear-builder + +--Source-Repository-Package +-- Type: git +-- Location: https://github.com/andreasabel/microlens/ +-- Subdir: microlens-th +-- Tag: 70a9990 diff --git a/config/cabal.project.release b/config/cabal.project.release new file mode 100644 index 000000000..285e1ab75 --- /dev/null +++ b/config/cabal.project.release @@ -0,0 +1,163 @@ +--------------------------------------------------------------------------------- +-- Build metadata flags +-- +-- Sensible alterations from the defaults to improve the build experience during +-- iterative development. Simultaneously, does not hinder release builds. +--------------------------------------------------------------------------------- + +Executable-Static: True +Haddock-HTML: True +Haddock-Tests: True +Haddock-benchmarks: True +Haddock-Internal: True +Haddock-Hyperlink-Source: True +Jobs: $ncpus +--keep-going: True +Minimize-Conflict-Set: True + + +--------------------------------------------------------------------------------- +-- Dependency specifications +-- +-- Always allow a newer version of libraries which are tightly coupled to the GHC +-- compiler version. This allows the project to build with newer versions of the +-- compiler without having to tediously tinker with dependencies, both direct and +-- transitive. +-- +-- Always allow a newer version of libraries which have had a major release due +-- to a security or performance patch. Add a `preference` indicating the desire +-- to use a version greater than or equal to the patch. +--------------------------------------------------------------------------------- + +Allow-Newer: + aeson, + base, + base-compat, + bytestring, + containers, + ghc-prim, + hashable, + lens, + mtl, + primative, + semigroupoids, + template-haskell, + text, + th-abstraction + +Preferences: + aeson >= 2.1.0.0, + bytestring >= 0.12.0.0, + filepath >= 1.4.100.0, + mtl >= 2.3.1, + text >= 2.0.1, + unordered-containers >= 0.2.18.0, + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-alphabet + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-dynamic-character-element + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-measure-units + +Source-Repository-Package + Type: git + Location: https://github.com/amnh/PHANE + Subdir: pkg/PHANE-PhyloLib + + +--------------------------------------------------------------------------------- +-- Output paths +-- +-- Place build output(s) in more accessible locations. Binaries should go in `bin` +-- and log files of the build/benchmarks/tests should go in `log`. +--------------------------------------------------------------------------------- + +Builddir: ./doc +Symlink-bindir: ./bin +Installdir: ./bin +Logs-Dir: ./log +Install-method: copy +Overwrite-Policy: always +-- Datadir: ./data +-- Docdir: ./doc +-- HTMLdir: ./doc/html + + +--------------------------------------------------------------------------------- +-- Package flags (useful defaults) +-- +-- Instruct all transative dependencies to be built *with both* optimization and +-- profiling enabled. This reduces rebuilds when debugging while not sacraficing +-- undue execution speed from imported code. +-- +-- Contrastingly, *disable both* optimization and profiling when building the +-- current package's codebase. This improves the frequent recompilation speed +-- during iterative development. +-- +-- Finally, enable all warnings and then prune out the undesirable ones. Having +-- the large litany of warnings enabled forces pre-emptive compatibility with +-- future breaking changes as well as improving overall code quality. +--------------------------------------------------------------------------------- + +Packages: . + +-- Applies to *all* packages, not just dependencies... +Package * + Benchmarks: False + Documentation: False + Tests: False + Library-Profiling: False + Executable-Profiling: False + Executable-stripping: True + Library-Profiling: False + Library-Stripping: True + Optimization: 2 + + +-- Applies to *only* the specified package! +Package PhyG + Benchmarks: False + Documentation: False + Tests: False + Executable-Profiling: False + Executable-stripping: True + Library-Profiling: False + Library-Stripping: True + Flags: +Forgo-Sanity +Super-Optimization + + +-- Reduce the project's dependency footprint via build flags +Constraints: + bifunctors -tagged, + comonad -indexed-traversable, + lens +inlining -test-hunit -test-properties -test-templates +trustworthy, + pointed -comonad -kan-extensions -semigroupoids -semigroups -stm -tagged, + semialign -semigroupoids, + semigroups -binary -bytestring -deepseq -hashable -tagged -template-haskell -text -transformers -unordered-containers, + semigroupoids -comonad, + + +-- Apply performance flags to specific packages +Constraints: + bitvec +simd, + directory +os-string, + hashable -random-initial-seed, + hashtables -bounds-checking -debug -portable +unsafe-tricks, + optparse-applicative +process, + tasty +unix, + text -pure-haskell +simdutf, + vector -BoundsChecks -InternalChecks -UnsafeChecks -Wall, diff --git a/doc/Authors.md b/doc/Authors.md new file mode 100644 index 000000000..5db3ad2c3 --- /dev/null +++ b/doc/Authors.md @@ -0,0 +1,18 @@ +# Authors + +The following people have made code contributions to the [Phylogenetic Haskell Analytic Network Engine][PHANE-ref] (PHANE) project and the [Phylogenetic Component][PhyG-ref] (PhyG) tool. + +Names below are sorted alphabetically. + +## Author of the PHANE project and PhyG + + * [Ward Wheeler][PI-ref] + +## Contributors + + * Louise Crowley + * [Alex Washburn](https://recursion.ninja) + +[PHANE-ref]: https://github.com/amnh/PhyGraph +[PhyG-ref]: https://github.com/amnh/PhyGraph/tree/main/pkg/PhyGraph +[PI-ref]: https://www.amnh.org/research/staff-directory/ward-wheeler \ No newline at end of file diff --git a/doc/CHANGELOG.md b/doc/CHANGELOG.md deleted file mode 100644 index bb1cc7400..000000000 --- a/doc/CHANGELOG.md +++ /dev/null @@ -1,24 +0,0 @@ -Phylogenetic Haskell Analytic Graph Engine (PHAGE) -================================================== - -# Changelog - -PHAGE uses [Semantic Versioning (v2.0.0)][1]. -The changelog is available [on GitHub][2]. - - -## Unreleased - - * Major codebase layout rearchitecting - * Added continuous integration via GitHub Actions - * Added integration test-suite - * Added `PHAGE-timing` library for timing IO operations - - -## 0.1.0 - - * Initial "alpha" state of PHAGE domain - - -[1]: https://semver.org/spec/v2.0.0.html -[2]: https://github.com/wardwheeler/PhyGraph/blob/main/doc/CHANGELOG.md diff --git a/doc/Changelog.md b/doc/Changelog.md new file mode 100644 index 000000000..9f45c7fa0 --- /dev/null +++ b/doc/Changelog.md @@ -0,0 +1,59 @@ +Phylogenetic Haskell Analytic Network Engine (PHANE) +==================================================== + +# Changelog + +PHANE uses [Semantic Versioning (v2.0.0)][1]. +The changelog is available [on GitHub][2]. + + +## Unreleased (`v0.2.0`) + +### PhyGraph + + * Major codebase layout rearchitecting + + * Modularized and migrated sub-libraries to [PHANE project][GitHub-PHANE] + + - `PHANE-alphabet` + + - `PHANE-dynamic-character` + + - `PHANE-dynamic-character-element` + + - `PHANE-evaluation` + + - `PHANE-measure-units` + + - `PHANE-PhyloLib` + + * Added command line options + + - `--credits` To list financial and technical contributors + + - `--license` Emits the license under which the software is distributed + + - `--splash` A nice ASCII art title "splash screen" + + - `--version` Outputs the software version, commit hash, and compilation timestamp + + * Added proper logging throughout computation(s) + + * Added support for GHC 9.10 + + * Corrected defect in median extraction from preliminary contexts + + * Corrected deadlocks in memoized hashtable access + + +### PhyloLib + + +## `v0.1.0` + + * Initial "alpha" state of PHANE domain + + +[1]: https://semver.org/spec/v2.0.0.html +[2]: https://github.com/wardwheeler/PhyGraph/blob/main/doc/Changelog.md +[GitHub-PHANE]: https://github.com/AMNH/PHANE#readme diff --git a/doc/Code_of_Conduct.md b/doc/Code_of_Conduct.md new file mode 100644 index 000000000..b7ec83871 --- /dev/null +++ b/doc/Code_of_Conduct.md @@ -0,0 +1,133 @@ +s# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement: +[Ward Wheeler][contact]. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available +at [https://www.contributor-covenant.org/translations][translations]. + +[contact]: mailto:wheeler@amnh.org +[homepage]: https://www.contributor-covenant.org +[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/doc/Contributing.md b/doc/Contributing.md new file mode 100644 index 000000000..8475967b7 --- /dev/null +++ b/doc/Contributing.md @@ -0,0 +1,51 @@ +# How to contribute + +I'm really glad you're reading this, because we need volunteer developers to help this project come to fruition. + +If you haven't already, contact the project author [Ward Wheeler][wheeler] to determine the best tasks for you to contribute to. +We want you working on things you're excited about. + +Here are some important resources: + + * [Documentation](https://github.com/amnh/PhyGraph/tree/master/doc) tells you how to get started with PHANE or PhyG, and + * [GitHub Issues](https://github.com/amnh/PhyGraph/issues) is our project management space. + +## Testing + +We have a handful of existing specific unit, property-based, and integration tests. +Adding one or more of these forms of tests with your contribution would be greating appreciated. + +## Submitting changes + +Please send a [GitHub Pull Request to PCG](https://github.com/amnh/PhyGraph/pull/new/master) with a clear list of what you've done (read more about [pull requests](http://help.github.com/pull-requests/)). +When you send a pull request, we will love you forever if you include [Tasty][tasty] test suites and examples within the Haddock documentation. +We can always use more test coverage. +Please follow our coding conventions (below) and make sure all of your commits are atomic (one feature per commit). + +Always write a clear log message for your commits. +One-line messages are fine for small changes, but bigger changes should look like this: + + $ git commit -m "A brief summary of the commit + > + > A paragraph describing what changed and its impact." + +## Coding conventions + +Start reading our code and you'll get the hang of it. +We optimize for readability: + + * We indent using four spaces + * We seperate top level definitions with *two* blank lines + * We use a variety of tools to maintain code hygeine, there are invokations for these in the `makefile` + * Run `make lint` prior to opening a pull request to run all code hygiene tools + +Thanks, + +[Ward Wheeler][wheeler] + +*[Professor Richard Gilder Graduate School][rggs], [American Museum of Natural History][amnh]* + +[amnh]: https://www.amnh.org/research +[rggs]: https://www.amnh.org/research/richard-gilder-graduate-school +[tasty]: https://hackage.haskell.org/package/tasty +[wheeler]: https://www.amnh.org/research/staff-directory/ward-wheeler diff --git a/doc/Funding.md b/doc/Funding.md new file mode 100644 index 000000000..8f6ec34e4 --- /dev/null +++ b/doc/Funding.md @@ -0,0 +1,7 @@ +### Funding provided by: + + * [American Museum of Natural History](https://www.amnh.org/our-research/computational-sciences) + + * [DARPA SIMPLEX](https://www.darpa.mil/program/simplifying-complexity-in-scientific-discovery) + + * [Kleberg Foundation](http://www.klebergfoundation.org/) diff --git a/doc/LICENSE b/doc/LICENSE index 503dedd61..a64ab5794 100644 --- a/doc/LICENSE +++ b/doc/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2021, Ward Wheeler +Copyright (c) 2020, Ward Wheeler and The American Museum of Natural History All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,17 +13,18 @@ modification, are permitted provided that the following conditions are met: this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its +3. Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS +LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/doc/Notes/ToDo.txt b/doc/Notes/ToDo.txt index dd5150712..f980d2b09 100644 --- a/doc/Notes/ToDo.txt +++ b/doc/Notes/ToDo.txt @@ -1,16 +1,7 @@ - 1) add tnt fasta/c tcm stuff and -? to docs - Source docs - User docs - 2) Create MatrixApproxSmall, MatrixApproxLarge Prealigned chars Add to test.pg -3) Reexamine "executeCommands" to make more stream like - passing various results back to itself till all done - THis allow builds, swaps, transformtion in order - E.g. search tree first then network, search non implied alignment - 4) Set graph complexity for hard-wired ans soft-wired graphs differnetly Hard-wired--as in complexity code now--just edges and vertices Soft-wired--account for the display tree set @@ -24,12 +15,7 @@ 6) Transform Matrix <-> ApproxMatrix -7) Newick--count bits in bitvector of nodes to deermine laeft versus right - so output smaller group "top" left and "right" bottom. - If equal "Lower" number value left (ie earlier in input list) - likley more basal groups earlier in matrix - - parsing very slow +7) Newick parsing very slow 8) select command for taxa/chars/blocks to only use a list of taxa @@ -52,13 +38,6 @@ But basically--anyhting that returns a tree will have to return potentially updated CharInfo wrt hashMap -18) Check all commands before execution for valid syntax - so don't error out at very end after lots of effort - -21) Profile info +RTS -xc -RTS after cabal build --allow-newer --enable profiling - bin +RTS -p -RTS args.... - creates prog.prof - 32) Add affine cost option modify charInfo field for affine cost first arg in generateDenseTransitionCostMatrix @@ -78,11 +57,6 @@ 48) Hartigan 1973 preorder optimization -49) additive to binary for small number states? Bit packing - -50) Bit packing - Ronquist 1998, Goloboff 1999, 2002, Moelannen 1999, White and Holland 2011. - 52) Node clusters--Goloboff 1996,1999 53) Graph comparison hash, @@ -92,8 +66,6 @@ 58) reverse cross ref loci ? -70) Add Alex binary tree rendering for ascii output - 71) latex structure and all that stuff on how it works + haddock functions 72) stylish and hlint source code @@ -102,13 +74,6 @@ 86) Fix alphabet outout in diagnose to be single column -81) Check all command execution for correctness before execution. - except perhaps filenames - commands and options - -82) Need to add prealigned charcater/DO like - recode to matrix? Approx matrix? - 84) Fix diag output wrt sequences (IUPAC ambiguities for DNA) had to replace Alex code in report do to empty list issues @@ -143,50 +108,6 @@ 97) Whidden eta l 2009 2010 for fast SPR tre distance FPT -98) Tree rendering - Alex's code - https://hackage.haskell.org/package/tree-render-text-0.4.0.0/docs/Data-Tree-Render-Text.html - Based on Data.Tree - - import Data.Tree - import Data.Tree.Render.Text - - tree :: Tree String - tree - = Node "Add" - [ Node "Add" - [ Node "0" [] - , Node "Mul" - [ Node "1" [] - , Node "2" [] - ] - ] - , Node "Neg" - [ Node "Max" - [ Node "3" [] - , Node "4" [] - , Node "5" [] - , Node "Var" - [ Node "x" [] - ] - , Node "6" [] - ] - ] - ] - -https://hackage.haskell.org/package/tree-view-0.5.1/docs/Data-Tree-View.html -https://hackage.haskell.org/package/diagrams-contrib-0.1.0.0/docs/Diagrams-TwoD-Layout-Tree.html - -Write LGGraph -> Data.Tree and use https://hackage.haskell.org/package/tree-render-text-0.4.0.0/docs/Data-Tree-Render-Text.html -to render--would have repeated subtrees after network vertex - -101) Dashes in tnt files--option? - Document - -104) label invariven data has-> bv label - -105) Only partition DNA/Amino Acid data - 107) Graphical profiling profiteur phyg.prof (from +RTS -p after --enable-profiling) @@ -201,8 +122,6 @@ to render--would have repeated subtrees after network vertex 116) refactor command checking code at begining of commands -118) Clean up warnings - 121) Verify unique graph output for same length 122) get branch lengths on preorder pass for collapsing later @@ -212,10 +131,6 @@ to render--would have repeated subtrees after network vertex Can be added later if needed for outpu display or lengths or could use average of min and max as a placeholder for now -124) Test tree length on small huge alphabet tree - -129) Prealigned to nonadd/matrix - 130) Return Bridge set GraphAlgorithmBook.pdf https://www.geeksforgeeks.org/bridge-in-a-graph/ @@ -224,21 +139,6 @@ to render--would have repeated subtrees after network vertex 142) At some point keep equally costly traversals so add another vector dimension to traversal trees in Phylogenetic graph -142) For soft-wired - 1) change block graph tuple to Vector of Vector of Decorated graphs to hold - multiple (equally costly) display trees for a given block - when have block sets can reduce by intersection of display trees - for each block could reduce complexity of output a great deal - - - 2) Add resolution set to vertex information - Only updated in softwired graph optimization - Trees and Hardwired can be empty - [VertData] ? - Child indices? - keep trees on post order for diaply tree later - - 143) Check optimization for input forest issues with rootindex need to make more flexible so can just map over components @@ -251,11 +151,6 @@ to render--would have repeated subtrees after network vertex 146) Bridge Tarjan Popcount for size subtree -154) reduce number of resolutions by checking if charctert assignments - (Vector character data) and displayBV data are equal - could have different display trees--would loose that info, but could speed up alot. - Make an option in global settings? Would have same leaf set (bvlabel) but different edges. - 155) Document compress resolutions 156) In=out=1 contract for newick output @@ -269,8 +164,6 @@ to render--would have repeated subtrees after network vertex 161) Incremental pot-order traceback for soft-wired during rerooting -162) remove Maybe from resoltion child type - 166) make forest a final set deleting edges? 173) rename reroot functions to make more sense @@ -278,12 +171,8 @@ to render--would have repeated subtrees after network vertex 174) Abstract rectify graph functions Maybe a tuple with two kinds of graphs and branch WRT node update -175) Move softwired traceback to preorder ? - 177) remove calls to getRoots must be O(n) -178) Add graph reconciliation functions and report - 180) Check size of (number of) resolutions, that its not blowing up -- put limit @@ -299,8 +188,6 @@ to render--would have repeated subtrees after network vertex 187) create incremental postorder pass for use in wagern and swapping add damping if possible -194) Alex Data.Alphabet usefule funciotns for coding and uncoding functions - 195) Hadock 196) prelim/fimal states rendered using alphabet library @@ -310,19 +197,12 @@ to render--would have repeated subtrees after network vertex 207) Incremental optimization with delta for cost at root--need to update other node costs past convergance? Prob yes -209) refactor postorder to be separate functions - 210) preorder to deal with multiple roots 211) Swap preorder on base graph, add back nodes from pruned on original, then preorder -212) rerrot tree make morefficient--rewrite - 213) musl for static binaries -214) Check median left right order for static charcaters--make sure same as dynamic - And for IA assignments as well - 216) static branch lengths wrong 217) check swap delta cost for matrix charcaters @@ -335,24 +215,12 @@ to render--would have repeated subtrees after network vertex 221) Outputs with char changes in positions apolist type -222) abstract steepest and all functions to simplify code - 223) Is sorting edges oni length ofr steepest effective? -225) option to not do branch lengths--don't need in SPR/TBR - -226) SPR root state -- should be union of two descendants - 231) change tree delta calcualtion--to edge calculation wag style--missing good graphs -232) rediagnose preorder pass if chnage final assignment - 233) Check through <= delta in swap and wagner build -234) create alternate spr-tbr for when find new in steepest - -235) check edges for splitting and readding--make sure not missing some edges--especially near root - 236) Check order or branche readdition confirm sequence in closest order 239) IA swapping for network--perhoaps not due to non-uniquwe aspwect of resolutions @@ -378,7 +246,7 @@ to render--would have repeated subtrees after network vertex 250) ia for net swapping and final -252) Check final assingment output for dispolay trees and networks due to "hack" at traversal traceback line 677 +252) Check final assingment output for display trees and networks due to "hack" at traversal traceback line 677 256) For recombination use split on edge as in swapping, check pruned graph root BV @@ -390,32 +258,13 @@ to render--would have repeated subtrees after network vertex 259) set versus transform options set all at beginning, transform change later -260) In fuse--removed identity of components check - Should be tested in large data sets - 261) change fmap blah $ fmap beh to fmap (blah . bleh) 262) turn off IA for non-Trees is a tree thing anyway. can generate IA if desired at end with display trees perhaps change to grandparent not parent if paren in1out1 -263) postorder/preorder softwired - -generalizedGraphPostOrderTraversal -> -postOrderSoftWiredTraversal - post decorate sofwired - getOutDegree1VertexAndGraph - createBlockResolutions - minimalReRootPhyloGraph - updateAndFinalizePostOrderSoftWired -preOrderTreeTraversal - - -264) resolutoin bloick data--change to unlablled edges--much cleaner, then change function to - crete display trees - -266) make reroot for netowek on bridge edges only vector set up - need linear algrithm for bridge edges +266) Tarjan for bridghes no O(n^2) 269) move generat graph functions ie those qwith Gr a b interfaces to LocalGraphs @@ -434,13 +283,6 @@ preOrderTreeTraversal 273) for edges--don't use as split edge if "sister" edge incident on network node -- or sister node is network node -275) Harwired SPR issue--work around by using restricitve TBR - -277) Move all general graph functions to LG - -278) filter if two dwsxcndents are boith net nodes - espexially on blick build graph and input graph - 278) Check order of edges--make sure break/readd in reasonable order for SA 279) Unions after split @@ -457,22 +299,14 @@ preOrderTreeTraversal 284) steepest edge add with randomization (perhaps same with move and delete) -291) Swap steepest--add equal costs and remove default "all" after - 292) Add unions 293) Check static only parallel swap (and optimizatiopn--ie no reroots) -294) For net build -> phylo graph - if naked HTUs (no children) just delete those nodes and contract - edges etc - 295) use splitGraphOnEdge' in swap--save looking for edges later all over 296) output better graph if found in GB -297) in swap etc--if "isTree" don't do all the checking or cyclic etrc as well a bridge edges - 298) transform to tree, to hardwrired, to softwired all doable now to tree with some resoltoin options @@ -489,13 +323,8 @@ preOrderTreeTraversal 303) Check traverse if mix static and dynamic--make sure all charcters ar optimized--the 0 cost thing for choosing best may be issue -304) Add inputs for prealigned slim/wide/huge sequences - 305) Check min max edge cost WRT reordering with gap first versus last -307) Check input prealigned - slim/wide/huge - 308) Test union-like rejoin criterion in swap steepes/rejoin steepest 309) TNT ouput single charcters always? 010101 or can be 0 1 0 1 0 etc? @@ -537,9 +366,6 @@ preOrderTreeTraversal 325) efficiently parallleize bitpacking -330) BV nonadd diagnose screwed up including ambiguoius characters - Looks corret for bitpacked so must be a presentation issue not inherently incorrect - 331) Diagnosis Outputs: HTU reconstructions--suitable for entry into file wrt ambiguies etc Change list -- marked required/possibke @@ -553,8 +379,6 @@ preOrderTreeTraversal 336) add gap, subs, ledgin gap options and make own tcm for fasta and fastc -337) make sure steepst returns equal cost gaphs - 338) all--resolve graphs earlier to reduce memory footprint 339) display trees as pecentages on reconiliation at and @@ -564,27 +388,15 @@ preOrderTreeTraversal 340) Note--if bit packing results in all max states---then check missing value calculation for bits 'missing' codes to determine states present has been evolving -341) make '-' state for non-0additive consistent and sensible (fromSymbols, fromSymbolsWOGap) - -342) tnt scopes allow scope first then type after? not sure of standard - -343) add stupidf graph cost over edges for hardwired cost - 344) Should probably make packedData unboxed -345) reblock globbing with '*' at end didn't work but "??" did. - reblock("nonAddData", "*.tnt??") versus reblock("nonAddData", "*.tnt*") - maybe becasue * more than once and doesn't test? - 347) maybe have a size limit for parallel bitpacking--as in over 100k (tax * chars) or something 348) More fuse progress info 349) CHeck weights and tcms for prealigned/sequence for PMDL -350) Still transfrom add to nonadd--but keep weight (when non-integer) - 353) Output continuous from recoded non-add binary convert back or something @@ -595,8 +407,6 @@ preOrderTreeTraversal 358) Make all "read" "readMaybe" and check to catch failures -359) llvm for OSX - 360) add no changed field to bitpacked and non-add (maybe add too?, matrix?) chars 361) filter weight 0 characters @@ -608,26 +418,14 @@ preOrderTreeTraversal 363) verify local root cost and swap and fuse thing--perhaps not add if in there can be subtracted if double counted (inGS etc) - 364) bit no bits stuff for ML versius PMDL -365) Check heiristic versus postorder graph costs--heuristics too high for dynamic? - do to not rejiggering homologies? - could add emprical epsilon for dynamic characters to evaluate fully - but only needed for dynamic I ouwld think - this percentage could be basd on differnce between dynamic and static costs for those charctaers - get fraction length due to dynamic characters (costs random set pairwsie?) tjhat percent * percent reduction dynamic v static - only need for cases of heuristic cost estimation--swap, fuse with swap, net add delete? - - 366) Move root cost and model cost to final output--otherwise confusing to swap and fuse 367) write collapse graph zero egde use in unique graph/ nubGraph -368) make sure swapSteepest' is not evaluating everyhting at any stage--do a rewrite - 369) integrate collapse for graph compare add 7th field? options to report graph--collapse:True|False @@ -635,17 +433,14 @@ preOrderTreeTraversal 370) select unique--collapse and save but return rseolved (optimizable) graphs - 371) Fix parsing errro newick single ' Uto-Aztecan 372) Boostrap/Jackknife output--collapse < 50% -373) Add fgraph cost comment to dot files - 374) Add select best (1) for block build -375) Check wagner build character in prallel--seems low +375) Check wagner build character in prallel--seems slow 376) Sofwired/hardwired 1) only reroot on bridges @@ -658,18 +453,14 @@ preOrderTreeTraversal 377) Various parallel steepest--do the numbe ron parallel at once then check for better use of parallel -378) Go through and refactor treee/softwored/hardwired traversal post and preorder - 379) NOte for docs--for best tree analsyis use single block 380) heursitic add/delete deltas need to be completely rethought--always negative removed for now -381) extractDisplayTrees seems to be caled multiple times in doftwired for given graph +381) extractDisplayTrees seems to be caled multiple times in softwired for given graph -382) for net add--randomize and limit - 383) remove recursiveRerootList' 384) rationalize paralle calls @@ -678,7 +469,7 @@ preOrderTreeTraversal 386) generalize new reroot code to trees as well -388) Idenitify cause ofdu edges +388) Idenitify cause of dup edges -- bad edge insert? -- error in resolution cache? -- bad resolution of display graph @@ -688,13 +479,9 @@ preOrderTreeTraversal 391) Net "move" not doing enyhting--not sure if bad graph state yeielding infiniites (probabaly) or not finding anyhting better via that mechanism -392) Clean up old reroot code--migrate hardwired? - 393) add conditions for available netowrk add stuff like check for phylogenetic graph -394) Fix net move - 395) Remove cycle check from convertGeneralGraphToPhylogeneticGraph when ok 396) For distance build-- when missing comparison for a block or character--add the average distance value among all non @@ -719,13 +506,133 @@ preOrderTreeTraversal 408) add paralleize bitpacking-- with option to turn off for memory saving -409) redo fuse +414) swap unions + +415) IA seems to be underestimating graph costs--could be that the cost matrix not being tracked in transform? sg2t1.mat in charWag.pg? + +416) Search terminate after so many zero imporvements? (or time) + +417) Static approx costs when not 1:1 (add chars) + +418) graph number control in search/GA + swappinglarge numbers of graphs somtimes + +422) Chase down origin of duplicate edges in HArd and softwired edges + prob in rearrangement in swap of networks + + +424) Make surea affine works for slim + + +425) For PMDL/MAPA + add weight field to read (already there in TNT stuff) so can create integer tcm and weight for MAPA/PMDL + preprocess script file to create new tcm, weight and read for MAP/PMDL + set with options in parens ie gamma:(classes, alpha) + pass field to readInput files for homogeneous tcms to create new TCMs and weights + if tcm specified done ahead + will also allow for parsing PMDL files before reading data + in this case have alphabets + +428) Check presligned hard-wired 3-way in makeIAFinalCharacter + +429) Sectorial based on final states? + +430) Output collapsed graphs for pdf only as option + +434) Check static approx for hugeseqs cost going down initially when it shoudl stay the same + +435) Warning irrelevant options + +436) Check packed costs versus unpacked versus POY + +437) Fix unions stuff + +438) loop end ot Thompson/Search + +439) Check if duplicate edges check is still needed PostOrderSoftWiredFunctions.hs l 272 + +440) Post order traceback issue on unequal numbers of nodes in charcter trees + PostOrderSoftWiredFunctionsNew.hs l 850 + +441) traceBackBlock PostOrderSoftWiredFunctionsNew.hs l 701 + resolution index too large + +442) search timing on cputime + +443) test report writability (folders etc) + +444) Make sure support fully parallelized + +445) check for malfomed networks in support and LG.nodesAndEdgesAfter check on split graph + +446) Check character wag build seems awful compared to distance + +447) Add in hits on saerch output iterations + +448) Randomize Thompsons settings option + +451) TNT newick file--no labels, no lengths., no commas (spaces in stead) + +452) Affine and IP for slim + +453) Make tnt char weight work with negative wqeights-- or perhaps add "\" for negative weights (versus "/")" + +454) Static approx for wide/huge waaay off + +455) Check duynamicEpsilon references + +456) Check "<-" stricness-- move coputaiotns into logic + +457) numThreads <- liftIO $ getNumCapabilities + +458) PHANE-Evaluation + alterenvironment (global) + Control.Monad.MonadReader MondaReder Type classs "local" for local case, ask to get global + reader + gs <- ask (then can send to pure function) + good for abrtacting out inGS + +459) Fix double processing stuff in commandExecution + +460) fix/ disable static approx for wide/huge + +471) split post order (cost only) and preorder to help with strict calls to + multi traverse? Maybe check on profiling to see if its a drag + +472) new random seeds + if atRandom swapParams + {- + NOTE: Optional Syntax + f x y = undefined + z = f x y + z = x `f` y -- Infix notation + Could use infix notation for this randomness call: + then (`permuteList` rejoinEdges') <$> getRandom + -} + then (\randVal -> permuteList randVal rejoinEdges') <$> getRandom + else pure rejoinEdges' + +473) Add Moilannen recombination. Different and intersting method (Moilanen 1999, fig 1.) + +474) Change fuse to recombine, drift to SA + +475) Fix refactor issues in: + TNT parsing + IA errors + length + Diagnosis non-dequence states and differences in IA alignment states + +476) Report diagnois--does it work for : + Multi character states -- it fastc? + MultiCharacter blocks (ie after reblock)? + +477) Parallelize diagnosis over edges -410) redo parallel add/delete/move +478) Check strictness in heuasritics for swap and network edits -411) figure out issue with net move +479) IMporve tree identity testing--check 1k example in phastSimulations -412) Internal setting of dynamic epsilon - (also only for dynmaic/sequence characters) +481) Check parallelize over blocks and characters +482) affine and 3D IP +483) Look into branch lengths at root--suspiciously long \ No newline at end of file diff --git a/doc/Notes/release-notes.txt b/doc/Notes/release-notes.txt new file mode 100644 index 000000000..6bb927f03 --- /dev/null +++ b/doc/Notes/release-notes.txt @@ -0,0 +1,23 @@ +-- Changes in v 0.1.4 + -- Enhancements + Improved TNT ouput with TNT legible trees + Added graph evaluation procedures (Static Approximation, Single Traversal) to randomized search + Implemented Static Approximation for networks + Parallelized output of Implied Alignment fasta/c and TNT + Improved behavior of distance and character builds with missing data + Completely changed parallelization model + Changed bhow random values are seeded + + -- Bugs Fixed + Transform for non 1:1 nucleotide sequences caused seg fault + Simulated Annealing and Drift parameter errors for networks + + -- NCM root cost calcution fixed for dynamic data types + +-- Changes in v 0.1.4 + -- Enhancements + Completely changed parallelization model + Changed how random values are seeded + + -- Bugs Fixed + NCM root cost calcution fixed for dynamic data types \ No newline at end of file diff --git a/doc/PhyGraphCodeDoc/PhyGraphCodeDoc.tex b/doc/PhyG-Code-Reference/PhyGraphCodeDoc.tex similarity index 90% rename from doc/PhyGraphCodeDoc/PhyGraphCodeDoc.tex rename to doc/PhyG-Code-Reference/PhyGraphCodeDoc.tex index 17d484e93..d04896df9 100644 --- a/doc/PhyGraphCodeDoc/PhyGraphCodeDoc.tex +++ b/doc/PhyG-Code-Reference/PhyGraphCodeDoc.tex @@ -124,6 +124,16 @@ and the program exits.} \end{enumerate} + + \section{Order of execution operations} + \begin{enumerate} + \item{Script file is input} + \item{Script file is expanded if script contains \texttt{run} commands} + \item{Script file is reordered to place \texttt{set} commands first, \texttt{read} commands second, other commands are in their input order and follow} + \item{Data are input} + \item{Any graphs are input} + \item{Other commands in order} + \end{enumerate} \section{Data Structures} Most important (at least global) structures and definitions are contained in Module Types.Types. Not all types are discussed here--just @@ -323,6 +333,25 @@ \item{Functions with ``== NonAdd'' etc will need extra cases for any new character type} \end{itemize} + + \section{Simulated Annealing/Drift} + The \texttt{swap()} and \texttt{net/add/delete/move/adddelete} commands contain options for Simulated Annealing + and ``Drifting'' (basically simulated annealing with a single temperature). The options are set in SwapMaster.hs, + Refine.hs, and NetworkAddDelete.hs. For Simulated Annealing, only the number of temperature steps and + number of annealing rounds (specified with \texttt{annealing:N}) are specified. For Drifting, additional parameters + such as the maximum number of changes to graph topology, and probabilities of accepting equal and worse solutions. + + The acceptance functions are in Utilities.hs. + + \section{Adding New Commands} + In addition to their locations (functions) of operation, commands and their options are specified in Verify.hs, CommandExecution.hs, + CommandUtilities.hs, and ProcessCommands.hs. The command options are entered in lists in Verify.hs so that command scripts + can be checked for malformed options before runs begin to avoid command failures later during runs and all the concomitant wasted time. + CommandExecution.hs has the central function for processing commands and their arguments along with helper functions in CommandUtilities.hs. + ProcessCommands.hs contains functions to parse commands and has some error processing. + + In verify, all commands are in lower case--although can be any case for user. + \section{Execution in Parallel} By default the program will execute multi-threaded based on the number processors available. By specifying the options `+RTS -NX -RTS' where `X' is the number of processors offered to the program. These are specified after the program as in (for 4 parallel threads):\\ \\ @@ -335,6 +364,6 @@ \newpage %\bibliography{big-refs-3.bib} - \bibliography{/users/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} - %\bibliography{/home/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} + %\bibliography{/users/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} + \bibliography{/home/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} \end{document} \ No newline at end of file diff --git a/doc/PhyGraphUserManual/AMNHLogo.jpg b/doc/PhyG-User-Manual/AMNHLogo.jpg old mode 100755 new mode 100644 similarity index 100% rename from doc/PhyGraphUserManual/AMNHLogo.jpg rename to doc/PhyG-User-Manual/AMNHLogo.jpg diff --git a/doc/PhyG-User-Manual/AMNHLogo.pdf b/doc/PhyG-User-Manual/AMNHLogo.pdf new file mode 100644 index 000000000..f9e95adaa Binary files /dev/null and b/doc/PhyG-User-Manual/AMNHLogo.pdf differ diff --git a/doc/PhyGraphUserManual/First_run.jpg b/doc/PhyG-User-Manual/First_run.jpg similarity index 100% rename from doc/PhyGraphUserManual/First_run.jpg rename to doc/PhyG-User-Manual/First_run.jpg diff --git a/doc/PhyGraphUserManual/First_script.jpg b/doc/PhyG-User-Manual/First_script.jpg similarity index 100% rename from doc/PhyGraphUserManual/First_script.jpg rename to doc/PhyG-User-Manual/First_script.jpg diff --git a/doc/PhyG-User-Manual/PhyG-User-Manual.tex b/doc/PhyG-User-Manual/PhyG-User-Manual.tex new file mode 100644 index 000000000..7c370526d --- /dev/null +++ b/doc/PhyG-User-Manual/PhyG-User-Manual.tex @@ -0,0 +1,560 @@ +\documentclass[11pt]{book} +\usepackage{longtable} +\usepackage{color} +\usepackage{tabu} +\usepackage{setspace} +\usepackage{pdflscape} +\usepackage{graphicx} +\usepackage {float} +%\usepackage{subfigure} +\usepackage{caption} +\usepackage{subcaption} +\usepackage{natbib} +\usepackage{fullpage} +\bibliographystyle{plain} +%\bibliographystyle{cbe} +\usepackage{algorithmic} +\usepackage[vlined,ruled]{algorithm2e} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage[T1]{fontenc} +\usepackage{url} +\usepackage[dvipsnames]{xcolor} +\usepackage{color, soul} +\usepackage[colorlinks=true, linkcolor=blue, citecolor=DarkOrchid, urlcolor=TealBlue ]{hyperref} +%\usepackage[nottoc,numbib]{tocbibind} +\usepackage{tocloft} +\usepackage[many]{tcolorbox} +\usepackage{marginnote} +\usepackage{lipsum} +%\usepackage[most]{tcolorbox} + +\setlength\itemindent{0.25cm} + +\newcommand{\phyg}{\texttt{PhyG} } +\newcommand{\atsymbol}{@} +\newenvironment{phygdescription}{\subsubsection{Description}}{} +\newenvironment{example}{\subsubsection{Examples} \begin{itemize}}{\end{itemize}} +\newenvironment{argument}{\subsection{Arguments}\begin{itemize}}{\end{itemize}} +%\item [ ] + +\begin{document} + %\firstpage{1} + + \title{PhylogeneticGraph\\User Manual\\Version 0.1} + + \maketitle + + \newpage + + \begin{center} + \includegraphics[width=0.75\textwidth]{AMNHLogo.PDF} + \end{center} + + \vspace*{2.50cm} + + \begin{flushleft} + \textbf {Program and Documentation} \\ Ward C. Wheeler \\ + \vspace*{0.50cm} + \textbf {Program} \\ Alex Washburn \\ + \vspace*{0.50cm} + \textbf{Documentation} \\ Louise M. Crowley + \end{flushleft} + + + \vspace*{2.50cm} + + \begin{flushleft} + \small + {\it Louise M. Crowley, Alex Washburn, Ward C. Wheeler} \\ + + Division of Invertebrate Zoology, American Museum of Natural History, New York, NY, U.S.A.\\ + \smallskip + The American Museum of Natural History\\ + \copyright 2023 by The American Museum of Natural History, \\ + All rights reserved. Published 2023. + + \vspace*{0.25cm} + + Available online at \url{https://github.com/amnh/PhyGraph} + + Comments or queries relating to the documentation should be sent to \href{mailto:wheeler@amnh.org} + {wheeler@amnh.org} or \href{mailto:crowley@amnh.org}{crowley@amnh.org} + \end{flushleft} + + \tableofcontents + +\chapter{What is PhyG?} + +\section{Introduction} + PhylogeneticGraph (\texttt{PhyG}) is a multi-platform program designed to produce phylogenetic + graphs from input data and graphs via heuristic searching of general phylogenetic graph + space. \texttt{PhyG} is the successor of \href{https://github.com/wardwheeler/POY5}{\textbf{POY}} + \citep{POY2,POY3,POY4,Varonetal2010,POY5, Wheeleretal2015}, containing much of its + functionality, including the optimization of \textit{unaligned} sequences, and the ability to implement + search strategies such as random addition sequence, swapping, and tree fusing. As in {\textbf{POY}, + \phyg produces heuristic tree-alignment, and can generate outputs in the form of implied alignments + and graphical representations of cladograms and graphs. What sets \phyg apart from {\textbf{POY}, + and other phylogenetic analysis programs, is the extension to broader classes of input data and + phylogenetic graphs. The phylogenetic graph inputs and outputs of \texttt{PhyG} include trees, as + well as softwired and hardwired networks. + + This is the initial version of documentation for the program. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%QUICKSTART +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Quick Start} + + \subsection{Requirements: software and hardware} + \phyg is an open-source program that can be compiled for macOS and Linux + (see information relating to Windows machines below). Some utility programs + (such as TextEdit for Mac, or Nano for Linux) can help in preparing \phyg scripts + and formatting data files, while others (such as Adobe Acrobat and TreeView + \citep{page1996}) can facilitate viewing the outputted graphs and trees. + + \phyg runs on a variety of computers, including desktops, laptops and cluster computers. + By default, \phyg is a multi-threaded application and will use the available resources of + the computer during the analysis (see Execution in Parallel \ref{subsec:parallel}). + + \subsection{Obtaining and Installing \phyg} + \phyg source code, precompiled binaries, test data, and documentation in pdf format, + as well as tutorials, are available from the \phyg \href{https://github.com/amnh/PhyGraph}{GitHub} + website. + + \subsubsection{Installing from the binaries} + Download the \phyg binary from the \href{https://github.com/amnh/PhyGraph}{GitHub} + website. Binaries are available for MacOS computers with either Intel or M1 processors, + and Linux machines (see information relating to Windows machines below). + The user can go directly to the website, navigate to + \href{https://github.com/amnh/PhyGraph/releases}{Releases} and click on the appropriate link + for the binary. On most systems this will download to either your Desktop or Downloads folder. \\ + + Alternatively, open a \textit{Terminal} window (located in your Applications folder) and type + the following for either the Mac Intel, Mac M1 or Linux binary: + + \begin {quote} + curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/OSX/phyg-Intel?raw=true + \end{quote} + + \noindent or + + \begin {quote} + curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/OSX/phyg-M1?raw=true + \end{quote} + + \noindent or + + \begin {quote} + curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/linux/phyg? + raw=true + \end{quote} + + \noindent The binary should either be moved into your \$PATH or referred to its + absolute when executing a script.\\ + + For those users with Windows machines, a Windows Subsystem for Linux + (WSL) can be installed. This system will allow you to run the Linux binary directly + on your machine, without having to install a virtual machine or dual-boot setup. + The WSL, along with directions for installation, can be found + \href{https://learn.microsoft.com/en-us/windows/wsl/}{here}. + + \subsubsection{Compiling from the source} + For the majority of users, downloading the binaries will suffice. Should the user prefer to + compile \phyg directly from the source, the source code can be downloaded + from the \href{https://github.com/amnh/PhyGraph}{GitHub} website. \phyg is largely + written in Haskell. In order to compile \phyg from the source, the user must install Cabal, + a command-line program for downloading and building software written in Haskell (GHC). + Information on its installation can be found + \href{https://www.schoolofhaskell.com/user/simonmichael/how-to-cabal-install}{here}. + The `README.md' file on the GitHub website provides instructions for compiling an + optimized version of \texttt{PhyG}. + +% \begin{quote} +% %cabal install PhyGraph:phyg --project-file=cfg/cabal.project.release +% \hl{cabal build PhyGraph:phyg --flags=super-optimization} +% \end{quote} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%OVERVIEW OF THE PROGRAM +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Overview of program use} + At present, \phyg is operated solely via command-line in a \textit{Terminal} window + and cannot be executed interactively. Commands are entered via a script file + containing commands that specify input files, output files and formats, graph type + and search parameters. + + \subsection{Executing Scripts} + The program is invoked from the command-line as in: + + \begin{quote} + phyg commandFile + \end{quote} + + \smallskip + + \noindent For example, typing the following in a \textit{Terminal} window will invoke + \phyg to run the script \texttt{mol.pg}, which is located in the Desktop folder + \texttt{phygfiles}: + + \begin{quote} + phyg /Users/Ward/Desktop/phygfiles/mol.pg + \end{quote} + + \bigskip + + \noindent + This is the equivalent of typing the following from any location on your computer: + + \begin{quote} + cd ("/Users/Ward/Desktop/phygfiles") + \end{quote} + + \subsection{Creating and running \phyg scripts} + A script is a simple text file containing a list of commands to be performed. + This script can also include references to other script files + (Figure \ref{firstscript}). + + Scripts can be created using any conventional text editor such as \textit{TextEdit}, + \textit{TextWrangler}, \textit{BBEdit}, or \textit{Nano}. Do not use a word processing + application like \textit{Microsoft Word} or \textit{Apple Pages}, as these programs + can introduce hidden characters in the file, which will be interpreted by \phyg and + can cause unpredictable parsing of the data. Comments that describe the contents + of the file and provide other useful annotations can be included. Comment lines are + prepended with `-{}-' and can span multiple lines, provided each line begins with `-{}-'. + + \begin{figure}[H] + \centering + \includegraphics[width=\textwidth]{First_run.jpg} + \caption{\phyg scripts. The headers in the scripts are comments, leading with `-{}-', + which is ignored by \phyg. This first script ``\textbf{First\_script.pg}'' includes a reference + to the second script ``\textbf{Chel\_files.txt}'', which includes a group of data files to be + read by the program.} + \label{firstscript} + \end{figure} + + \subsection{Execution in Parallel} + \label{subsec:parallel} + \phyg is a multi-threading application and will, by default, use all available physical (not hyperthreaded) + cores. Should the user wish to limit or specify the number of processors used by \phyg this can be achieved + by including the options `\textbf{+RTS -NX -RTS}', where `\textbf{X}' is the number of processing cores + offered to the program, when executing the script. Should the user wish to use a single processor, + this can be specified by typing: + + \begin{quote} + phyg fileName +RTS -N1 + \end{quote} + + Specification of \textbf{X} greater than the number of physical cores can degrade overall performance. + + \medskip + \noindent This will execute the program sequentially. + %possible to include and other options + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%FORMATS +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Input Data Formats} + \phyg can analyze a number of different data types, including qualitative, nucleotide + and other sequences + (aligned and unaligned), in TNT, FASTA, and FASTC formats. Any character names in + input files are (for now) ignored and internal names are created by appending + the character number in its file to the filename as in ``\textbf{fileName:0}''. + Qualitative data, and prealigned data include their indices within input files and + unaligned data are treated as single characters. + + \phyg allows the user to comment out portions of a taxon name + in the imported data file. This is achieved by inserting a dollar sign (`\$') before the region + of text that the user wishes to comment out. As an example, placing a `\$' before the + GenBank information in the taxon name \textbf{Lingula\_anatina\$\_AB178773\_16S} + will comment out this information and the taxon name will be read as + \textbf{Lingula\_anatina} by the program. This can be useful for housekeeping purposes, + when it is desirable to maintain long verbose taxon names (such as catalog or NCBI + accession numbers) associated with the original data files but avoid reporting these + names on the graphs. Moreover, it allows the user to provide a single name for a terminal + in cases where the corresponding data are stored in different files under different terminal + names. + + \subsection{fasta} + Single character sequence input \citep{PearsonandLipman1988} (see Figure + \ref{fasta-c}). + + \subsection{fastc} + FASTC is a file format for multi-character sequence input \citep{WheelerandWashburn2019}. + This format is derived from the FASTA format and the custom alphabet format of + \href{https://github.com/wardwheeler/POY5}{\textbf{POY}} \citep{POY4,POY5}. + Multi-character sequence elements are useful for the analysis of data types such + as developmental, gene synteny, and comparative linguistic data (see Figure + \ref{fasta-c}). In this format, individual sequence elements are separated by a space. + + \begin{figure}[H] + \centering + \includegraphics[width=0.49\textwidth]{fasta.png} + \includegraphics[width=0.49\textwidth]{fastc.png} + \caption{FASTA and FASTC file formats. The file ``\textbf{chel\_cox1aln1.fasta}'' + represents a FASTA file with a greater than sign (`>') preceding taxon names and nucleotide + sequence data following on a new line. The file ``\textbf{woman.fastc}'' is a FASTC + file, with a greater than sign (`>') preceding taxon names and the linguistic data following + on a new line.} + \label{fasta-c} + \end{figure} + + \subsection{\texttt{TNT}} + \label{subsec:TNT} + The TNT \citep{Goloboffetal2008} format is accepted here for specification of + qualitative, measurement, and prealigned molecular sequence data. \phyg can + parse all of the basic character settings including activation/deactivation of + characters, making characters additive/non-additive, applying weight to characters, + step-matrix costs, interleaved data files and continuous characters (see the + argument \texttt{tnt} in Section \ref{subsec:read}). Continuous characters can only be + in the format of integers or floats. Moreover, they must be declared as ``additive'', + otherwise, they will be treated as non-additive character states (see + \href{http://phylo.wikidot.com/tnt-htm}{\textbf{TNT}} documentation). + The characters `-' and `?' can be used to indicate that characters or character + states are missing or inapplicable. Only one set of ccode commands is allowed + per line (e.g. to make everything additive: ccode +.;). Default values of step-matrix + weights are not implemented, all step-matrix values must be individually specified. + Ambiguities or ranges are denoted in square brackets (`\texttt{[ ]}') with a period, + as in [X.Y]. Continuous characters are denoted in square brackets with a dash or + hyphen, as in [X-Y]. + +\section{Input Graph Formats} + Graphs can be input in a number of formats, including Newick, Enhanced Newick (eNewick) + and GraphViz ``dot''. Newick tree file format is the parenthetical representation of trees as + interpreted by Olsen (linked \href{https://evolution.genetics.washington.edu/phylip/newick_doc.html} + {here}). Enhanced Newick \cite{Cardonaetal2008} is an extension of Newick file + format, with the addition of tags (`\#') that indicate hybrid zones or reticulation events + in the phylogenetic network. \href{https://graphviz.org/}{Dot} is a graph description + language and well suited to represent graphs and networks. + %and Forest Enhanced Newick (as defined by Wheeler, 2022 \cite{Wheeler2022}). + %Forest Enhanced Newick (FEN) is a format based on Enhanced Newick (ENewick) for + %forests of components, each of which is represented by an ENewick string. The ENewick + %components are surrounded by `$<$' and '$>$'. As in $<$(A, (B,C)); (D,(E,F));$>$. + %Groups may be shared among ENewick components. + + \begin{figure}%[H] + \centering + \includegraphics[width=\textwidth]{enewick.png} + \caption{The file \textbf{``flu\_net1.tre''} in Enhanced Newick (eNewick) graph format. The + values associated with the taxon names and HTUs are branch lengths. The cost of the + graph(s) can be found in square brackets at the end of each graph. A semi-colon follows + the cost of each graph.} + \label{enewick} + \end{figure} + +\section{Output Graph Formats} +\label{sec:outputgraphs} + Graph outputs can be in either Newick, eNewick, dot, and depending on the + operating system, eps or pdf formats. Newick tree files can be viewed in other + programs such as \href{http://tree.bio.ed.ac.uk/software/figtree/}{FigTree} or + \href{http:/https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/}{Dendroscope}. + eNewick files can only be viewed in a text editor. \phyg can output a dot file, + along with an eps (on macOS) or pdf (on linux) file that can be viewed in a vector + graphics program. The dot file can be viewed (and modified) in \textit{Graphviz}. + Note: in order to output pdf files the application \textit{dot} must be installed from + the \href{https://graphviz.org/download/}{Graphviz} website. Graphviz an open-source + graph visualization software.\\ + + \noindent PDFs can also be generated directly from dot files. In a \textit{Terminal}, + type the following: + + \begin{quote} + dot -Tpdf myDotFile.dot $>$ myDotFile.pdf + \end{quote} + + \noindent Multiple `dot' graphs can be output in a single file. To create pdf and + other formats the commandline would be (these files are named and numbered + automatically): + + \begin{quote} + dot -Tpdf -O myDotFile.dot + \end{quote} + + \noindent In MacOS the `pdf' option does not currently seem to work. In this case, + type the following: + + \begin{quote} + dot -Tps2 myDotFile.dot > myDotFile.ps + \end{quote} + + \noindent `-Tps2' will generate a postscript file that \textit{Preview} can read and + convert to pdf. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%COMMANDS +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\chapter{PhyG Commands} + +\section{\phyg Command Structure} + + \subsection{Brief description} + \phyg interprets and executes scripts coming from an input file. A script is a list of + commands, separated by any number of whitespace characters (spaces, tabs, or + newlines). Each command consists of a name, followed by an argument or list of + arguments separated by commas (`\texttt{,}') and enclosed in parentheses. + Commands and arguments are case insensitive with the exception of filename + specifications, which must match the filename \textit{exactly}, including suffixes + and are always in double quotes (``\textbf{fileName}''). Most commands, with the + exception of \texttt{reblock}, \texttt{rename} and \texttt{transform}, have default values + and are provided at the end of each command section below. If no arguments are + specified, the command is executed under its default settings.\\ + + \noindent \phyg recognizes three types of command arguments: \textit{primitive values}, + \textit{labeled values}, and \textit{lists of arguments}.\\ + + \noindent \textbf{Primitive values} can either be an integer (\texttt{INT}), + a real number (\texttt{FLOAT}), a string (\texttt{STRING}) or a boolean (\texttt{BOOL)} + i.e. True|False. \\ + + \noindent \textbf{Labeled arguments} consist of an argument and an identifier + (the label), separated by the colon (`\texttt{:}') character. Examples of + identifiers include \texttt{w15}, \texttt{nopenalty}, \texttt{hardwired}, and + \texttt{parsimony}. \\ %The majority of these labeled arguments are associated with + %the set command. + + \noindent \textbf{List of arguments} is several arguments enclosed in parenthesis + and separated by commas (`\texttt{,}'). Most arguments are optional, with only a + few requiring specification, e.g. the build method of distance must be specified. + %Some argument labels are obligatory, most are optional \hl{e.g. ?}. + In cases where an argument must be specified, \phyg will report a warning message + in the output of the terminal window. When no arguments are specified, \phyg will + use the default values.\\ + + \noindent The following examples illustrate the structure of valid \phyg commands. + + \begin{quote} + build() + \end{quote} + + \noindent In this simple example, the command \texttt{build} is followed by an open + and closed parenthesis. As no arguments are specified, \phyg will use the defaults, + so this is equivalent to \texttt{build(character, replicates:10)}. + + \begin{quote} + read(nucleotide:"chel-prealigned.fas", tcm:"sg1t4.mat") + \end{quote} + + \noindent In this second example, the command \texttt{read} reads data in the file + (\texttt{STRING}) ``\textbf{chel-prealigned.fas}'', parsing it as nucleotide sequence data. + It also read the data in the file ``\textbf{sg1t4.mat}'', parsing it as a transformation cost matrix, + with indel and substitution costs set to $1$. + + \begin{quote} + swap(drift:3, acceptequal:2.0) + \end{quote} + + \noindent In the third example, the command \texttt{swap} is followed by the list + of arguments that includes \texttt{drift} and \texttt{acceptequal}, enclosed in + parentheses and separated by a comma. Both of these are labeled-value arguments, + ascribed an \texttt{INTEGER} and a \texttt{FLOAT}, respectively. + + \begin{quote} + set(graphfactor:nopenalty) + \end{quote} + + \noindent In this fourth and final example, the command \texttt{set} is followed by the + labeled-value argument \texttt{graphfactor} with the label \texttt{nopenalty}. + + \subsection{Command order and processing} + The commands \texttt{read}, \texttt{rename}, \texttt{reblock}, and \texttt{set} are + executed at the beginning of program execution, irrespective of where they appear + in the command script. All other commands are executed in the order they are specified. + + \subsection{Notation} + Commands are listed alphabetically in the next section. Commands are shown in + \texttt{terminal} typeface. Optional items are enclosed in square brackets (`\texttt{[ ]}'). + Primitive values are shown in \texttt{UPPERCASE}. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%COMMAND REFERENCE +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\section{Command Reference} + \input{PhyG_Allcommands.tex} + +%\chapter{Program Usage} +%\section{Example Script Files} +%Tutorials are available for download on the \phyg \href{https://github.com/amnh/PhyGraph}{GitHub} +%website. These tutorials provide guidance for using the program. Each tutorial contains a \phyg script +%that includes detailed commentaries explaining the rationale behind each step of the analysis. The +%command arguments will differ substantially depending on type, complexity, and size of the data set. +% +%% The following file (titled ``Example Script 1'') reads two input sequence files (net-I.fas and net-II.fas), +%% skips all the lines that begin with double dash (\texttt{--}), reads the graph file net-I-II.dot, sets the +%% outgroup to the taxon named ``zero,'' specifies the graph type for the analysis is a softwired network, +%% and reports a series of files with various information about the data and graphs. +%% +%% \begin{verbatim} +%% -- Example Script 1 +%% read("net-I.fas") +%% --read("net-Ia.fas") +%% --read("net-IIa.fas") +%% read("net-II.fas") +%% --read("net-I.dot") +%% --read("net-I.tre") +%% --read("net-II.tre") +%% --read("net-II.dot") +%% read("net-I-II.dot") +%% set(outgroup:"zero") +%% set(graphtype:softwired) +%% report("net-test.tre", graphs, newick, overwrite) +%% report("net-test.dot", graphs, dot, overwrite) +%% report("net-test-data.csv", data, overwrite) +%% report("net-test-diag.csv", diagnosis, overwrite) +%% report("net-display.dot", displaytrees, dot, overwrite) +%% report("net-display.tre", displaytrees, newick, overwrite) +%% \end{verbatim} +% +%\section{Faster and Slower} +%Multiple options affect both the quality of results (better ot worse optimality score) and +%the overall execution time. In general, the more time consuming options also have larger +%memory footprints. + +%Faster mutlitraverser:False (longer graphs can turn to True at end to rediagnose) +%Static Approx--but also longer then rediagnose + +%Memory--graphssteaperst +% "all' opiont on fuse and swap +% +%\subsection{Evaluation of Graphs} +% \texttt{Multitraverse}\\ +% \texttt{CompressResolutions}\\ +% \texttt{SoftwiredMethod}\\ +% +%\subsection{Search Options} +% \texttt{steepest}\\ +% \texttt{Limiting number of network edges}\\ +% \texttt{Limiting number of graphs} +% +%\section{Parallel Evaluation} +% \texttt{+RTS -NX -RTS} +% +%\section{Memory Use} +% +%The amount of memory used during program execution can be reported by adding the +%runtime option ``-s'' as in \texttt{+RTS -s -RTS} to the command line (runtime options can +%be specified together as in \texttt{+RTS -NX -s -RTS}). This will output several fields of +%data with the ``in use'' field specifying the maximum amount of memory requested from +%the OS. RTS options are described in detail at \url{https://downloads.haskell.org/ghc/latest/docs/users_guide/runtime_control.html}. +% + % Optimize memory consumption--keep low number of graphs in initial searches, later keep +% a larger number to get others +% \hl{(see email from WW 04-21-22)} +% \hl{where should I discuss this?} + + +\section*{Acknowledgments} + The authors would like to thank DARPA SIMPLEX N66001-15-C-4039, the Robert J. + Kleberg Jr. and Helen C. Kleberg foundation grant ``Mechanistic Analyses of Pancreatic + Cancer Evolution'', and the American Museum of Natural History for financial support. + + %\newpage + %\bibliography{big-refs-3.bib} + %\bibliography{/Users/louise/DropboxAMNH/big-refs-3.bib} + %\bibliography{/home/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} + \bibliography{/users/ward/Dropbox/work_stuff/manus/big-refs-3.bib} + \end{document} \ No newline at end of file diff --git a/doc/PhyG-User-Manual/PhyG_Allcommands.tex b/doc/PhyG-User-Manual/PhyG_Allcommands.tex new file mode 100644 index 000000000..460229b03 --- /dev/null +++ b/doc/PhyG-User-Manual/PhyG_Allcommands.tex @@ -0,0 +1,1920 @@ +%-------------------------------------------------------------------------------------------------------------------------------- +%build +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Build} + \subsubsection{Syntax} + \texttt{build([argument list])} + + \begin{phygdescription} + + {Builds initial graphs. The arguments of \texttt{build} specify the number of graphs + to be generated, and whether the build is based on \textit{distance} or \textit{character} + methods. The distance options \texttt{rdWag} and \texttt{wpgma} have a time complexity + of $O(n^2)$, while \texttt{dWag} and \texttt{nj} have a complexity of $O(n^3)$. Distance + methods are considerably faster (lower constant factor), but approximate, compared to + character-based methods. Refinement, in the form of branch swapping (\texttt{none}, + \texttt{otu}, \texttt{spr}, and \texttt{tbr}) can be specified within the command for distance + builds. Refinement for character-based Wagner builds occurs after the \texttt{build} + process through \texttt{swap} and other refinement commands. Given the large time + complexity, distance refinement is usually not worth the effort \citep{Wheeler2021}. + \texttt{build} does not replace graphs previously + stored in memory.} + + \end{phygdescription} + + \subsubsection{Arguments} + + \begin{description} + + \item[block] Performs independent builds for each `block' of data. If this option + is not specified, the builds are performed combining all the data. Builds are performed + according to the other options, i.e. \texttt{character} or \texttt{distance}. The resulting + tree or \texttt{graph} is reconciled using the \texttt{eun} or \texttt{cun} commands. The + reconciled graph is resolved into display trees via the \texttt{displayTrees}, \texttt{first}, + and \texttt{atRandom} options. This option is especially useful for softwired network + search. Associated arguments of \texttt{block} include: + + \begin{description} + + \item[cun] Reconciles \textbf {block} trees into a Cluster-Union-Network + \citep{Baroni2005} before resolution into display trees via \texttt{displayTrees} + and its associated arguments. + + \item[displayTrees[:INT]] When the option \texttt{block} is specified, this variable + returns $n$ display trees specified by this optional argument. If the number of + display trees is not specified, up to $2^{63}$ may be returned. + + \begin{description} + + \item[atRandom] When the option \texttt{block} is specified, this variable + returns the number of display trees as specified by the integer value in + \texttt{displayTrees[:INT]}, where trees are produced by resolving network + nodes uniformly at random. Compare with \texttt{first}. + + \item[first:INT] When the option \texttt{block} is specified, this variable specifies + that the first number of displays tree resolutions, as specified by the integer + value, are chosen for each input graph. Compare with \texttt{atRandom}. + + \end{description} + + \item[eun] Reconciles block trees into a Edge-Union-Network \citep{MiyagiandWheeler2019, + Wheeler2022} before resolution into display trees via \texttt{displayTrees} and its associated + arguments. + + \item[graph] When the option \texttt{block} is specified, this variable returns the + reconciled graph as specified by \texttt{eun} or \texttt{cun}. The graph may be + altered to ensure that it is a `phylogenetic' graph sensu \cite{Moretetal2005}. + + \end{description} + + \item [character] Performs random addition sequence Wagner \citep{Farris1970} builds + ($O(n^2)$) for tree construction. If the graphtype is specified as softwired or hardwired, + the resulting trees are rediagnosed as softwired graphs. This is + the default method for tree construction. + + \item [distance] Causes a pairwise distance matrix to be calculated ($O(n^2)$) and used + as a basis for distance tree construction. Specifies how the builds are refined (\texttt{none}, + \texttt{otu}, \texttt{spr}, \texttt{tbr}), as well as how the tree is constructed (\texttt{dWag}, + \texttt{nj}, \texttt{rdWag}, \texttt{wpgma}). Distance trees are subsequently rediagnosed + as character trees and returned for further analysis. Associated arguments of \texttt{distance} + include: + + \begin{description} + + \item[dWag] Performs distance Wagner build as in \citep{Farris1972} choosing the + `best' taxon to + add at each step, yielding a single tree. This method has a time complexity of $O(n^3)$. + + \item[nj] Performs Neighbor-Joining distance build \citep{Saitou1987}, yielding a single + tree. This method has a time complexity of $O(n^3)$. + + \item[none] No refinement (\texttt{otu}, \texttt{spr}, \texttt{tbr}) is performed after + distance builds. \texttt{none} is the default refinement method. + + \item[otu] Specifies that OTU refinement \citep{Wheeler2021} is performed + after distance builds. + + \item[rdWag] Performs random addition sequence distance Wagner builds + \citep{Farris1972,Wheeler2021}, yielding multiple trees determined by the + argument \texttt{replicates}. This method has a time complexity of $O(m \times n^2)$ + for $m$ replicates and $n$ taxa. + + \begin{description} + + \item[best:INT] Applies only to \texttt{rdWag}. Specifies the number of trees + retained after \texttt{rdWag} builds, selecting the best trees in terms of distance + cost. The options can be used to reduce the number of trees retained. + + \end{description} + + \item[spr] Specifies that SPR refinement \citep{Dayhoff1969} is performed + after distance builds. + + \item[tbr] Specifies that TBR refinement \citep{Farris1988, swofford1990a} + is performed after distance builds. + + \item[wpgma] Performs Weighted Pair Group Method with Arithmetic Mean + distance build \citep{SokalandMichener1958}, yielding a single tree. This method + has a time complexity of $O(n^2)$. + + \end{description} + + \item [replicates:[INT]] Applies to \texttt{rdWag} and \texttt{character}. Specifies the + number of random addition sequences to be performed, as indicated by the integer + value. By default 10 random addition sequences are performed. + + \item[return:[INT]] Applies to \texttt{rdWag} and \texttt{character}. This limits the + number of Wagner trees returned for further analysis, to the value as specified + by the integer. By default all graphs that are built are returned, unless limited + by \texttt{best} in distance analysis. Limiting the number of returned trees + (as opposed to simply generating that number) can result in a larger memory + footprint. + + \end{description} + + \subsubsection{Defaults} + \texttt{build(character, replicates:10)} By default, \phyg will build 10 graphs using + a random addition of sequences for each of them. + + \begin{example} + + \item{\texttt{build(replicates:100)} \\ + Builds 100 graphs using a random addition sequence (the default) for each of them.} + + \item{\texttt{build(character, block, graph, cun, displaytrees:5, atrandom)}\\ + Builds 10 (the default) random addition sequence character Wagner builds, for each + block of data. The graph is reconciled into a Cluster-Union-Network, before resolution + into 5 display trees. The trees are produced by resolving the network nodes + uniformly at random.} + + \item{\texttt{build(distance, dWag, nj, wpgma)} \\ + Builds a single `best' distance Wagner build, a Neighbor-Joining tree, and a + WPGMA tree. As the option \texttt{block} is not specified, the distance trees + are built using all the data. A total of three trees is returned.} + + \item{\texttt{build(distance, dWag, replicates:1000, best:10)}\\ + Builds 1000 distance Wagner builds and returns 10 of the lowest cost distance trees. + The best trees are chosen arbitrarily, but consistently--the first 10 with the lowest cost.} + + \item{\texttt{build(distance, rdwag, block, eun, displaytrees:3)}\\ + Builds 10 random addition sequence Wagner builds for each `block' of data. The graph + is reconciled into a Edge-Union-Network, before resolution into the 3 display trees. + The trees are produced by resolving the network nodes.} + + \item{\texttt{build(distance, block, rdWag, wpgma, replicates:100, best:10, otu)}\\ + Builds 100 random addition sequence distance Wagner builds and a wpgma tree. + OTU swapping is performed on the 10 of the lowest cost random addition + sequence Wagner trees. These distance searches, and subsequent refinements are + performed on each block of data.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%fuse +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Fuse} + \subsubsection{Syntax} + \texttt{fuse([argument list])} + + \begin{phygdescription} + + {Performs Tree Fusing \citep{goloboff1999, moilanen1999, moilanen2001} on the graphs + in memory. \texttt{fuse} operates on a collection of graphs performing reciprocal graph + recombination between pairs of graphs. Non-identical subgraphs with identical leaf sets + are exchanged between graphs and the results evaluated. This process of exchange + and evaluation continues until no new graphs are found. This can be used in concert + with other options to perform a Genetic Algorithm refinement \citep{Holland1975}. The + behavior of \texttt{fuse} can be modified by the use of options specifying SPR and + TBR-like rearrangement of the combination process.} + + \end{phygdescription} + + \subsubsection{Arguments} + + \begin{description} + + \item [all] During branch swapping-type operations, all rearrangements are tried before + choosing a new graph. + + \item [best] Specifies the method for tree selection, which in this case returns the best + graphs found during fuse operations. + + \item [keep:INT] Limits the number of returned graphs to the integer value specified. + + \item [none] No branch swapping is performed during the fuse. This is the default. + + \item [noReciprocal] Turns off \texttt{reciprocal} (see below). + + \item [once] Performs a single round of fusing on input graphs and returns the resulting + graphs. Alternatively (and by default) fusing continues recursively until no new graphs + are found. + + \item [pairs:INT] Limits the number of graphs to be fused to the number of pairs as + specified by the integer value (as oppose to $\binom{m}{2}$ for $m$ graphs). + + \begin{description} + + \item [atRandom] Chooses graphs to fuse uniformly at random when \texttt{pairs} is + specified. + + \end{description} + + \item [reciprocal] By default, fuse takes a subgraph of one graph in a pair and replaces the + corresponding subgraph in the other. This argument results in the exchange and evaluation + of graphs in both directions---roughly doubling both the time and memory footprint. + + \item [spr[:INT]] Causes the exchanged subgraphs to be tried at multiple positions (up to + $n$ edges away from their initial positions, where $n$ equals the integer value). + + \item [steepest] During branch swapping-type operations, if a better graph is found, swapping + shifts greedily to that graph. This is the default if swapping is specified. + + \item [tbr[:INT]] Causes the exchanged subgraphs to be tried at multiple positions (up to + $n$ edges away from their initial positions) where $n$ equals the integer value. TBR-style + rerooting of the exchanged components occurs. + + \item [unique] Specifies the method for tree selection, which in this case returns all unique + graphs found during fuse operations. + + \end{description} + + \subsubsection{Defaults} + \texttt{fuse(best, none, reciprocal)} By default, \phyg keeps all the best graphs found, + and continues fusing until no new graphs are found. No branch swapping style rearrangements + are performed. The exchange and evaluation of graphs occurs in a reciprocal manner. + + \begin{example} + + \item{\texttt{fuse(best, once)}\\Fuses input graphs and returns best graphs after a single + round of fusing. No branch swapping style rearrangements are performed and the exchange + and evaluation of graphs occurs in a reciprocal manner (the default).} + + \item{\texttt{fuse(tbr, keep:10)} \\Fuses input graphs and performs TBR-style replacement + and rerooting of pruned components returning up to 10 best cost graphs. The exchange + and evaluation of graphs occurs in a reciprocal manner (the default).} + + \item{\texttt{fuse(spr:3, pairs:5, unique)} \\Fuses input graphs and performs SPR-style swapping, + with the exchange of subgraphs being tried at multiple positions up to 3 edges away from their + initial position. The number of graph pairs to be fused is limited to 5. All unique graphs found + during this operation are returned. The exchange and evaluation of graphs occurs in a reciprocal + manner (the default)}. + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%read +%-------------------------------------------------------------------------------------------------------------------------------- + +\subsection{Read} +\label{subsec:read} + \subsubsection{Syntax} + \texttt{read(argument list)} + + \begin{phygdescription} + {Imports file-based information, including data files and graph files. \texttt{read} + commands must contain an input file (\texttt{STRING}). Supported data formats + include FASTA, FASTC and TNT files, and graph formats include Dot, eNewick, %Fenewick (not yet implemented) + and Newick. Filenames must be included in quotes. Filenames must include the + appropriate suffix (e.g. .fas, .ss, .mat). The exclusion of these suffixes will result + in an error. The filename must match exactly, including capitalization. \phyg will + attempt to recognize the type of input and parse appropriately. Otherwise, the + type of file can be explicitly indicated, using one of arguments below. The argument + is followed by a colon (`:') and the data file name(s), enclosed in quotes, and + separated by commas. It is possible to import more than one data file on the + same input line of the command script, but only of the same data type. Reading + in files of different data types, e.g. amino acid and nucleotide in the same command, + will result in an error. Prepending the file type prevents any ambiguity when the file + is parsed (e.g. \texttt{read(nucleotide:"Chel.fas")}). If the data type is not specified, + it is important to verify that the data was interpreted properly (using the command + \texttt{report("STRING", data)} or by checking the output display in the + \textit{Terminal} window). \texttt{read} can also use wildcard expressions + (`*' and `?'), which can be useful when reading in multiple files of the + same type. For example, \texttt{read(preaminoacid:"*.fas*")} imports all files of the + FASTA format in the current directory (in this case this will include files that end + in both .fas and .fasta). This files will be interpreted as prealigned amino acid + sequences. Terminal names should not have spaces in the imported + data file, otherwise the names can be incorrectly interpreted by the program.} + \end{phygdescription} + + \subsubsection{Arguments} + + \begin{description} + + \item [aminoacid:STRING] Specifies that the file contents are parsed as IUPAC + coded amino acid data in FASTA \citep{PearsonandLipman1988} format. Sequence + gaps are removed. + + \begin{tcolorbox}[enhanced,fit to height=3cm, + colback=JungleGreen!40!black!2!white,colframe=JungleGreen!70!black,title=Note, + drop fuzzy shadow] + \phyg recognizes the characters `\textbf{x}' as representing any IUPAC character in + amino acid data, and `\textbf{n}' as representing any nucleotide base in nucleotide + sequence data files. A question mark character (`\textbf{?}') represents either an + `\textbf{x}' or a gap character `\textbf{-}' in amino acid data or an `\textbf{n}' or a + `\textbf{-}' nucleotide sequence data. + \end{tcolorbox} + + \item [block:STRING] Specifies that the string contains block information. Each line + contains the new block name followed by names of input files to be assigned to that + data block. Blocks are initially named as the input file name with ``\#0'' appended. + In the example below, data from files ``b'' and ``c'' will be assigned to block ``a''. + There can be no spaces in file or block names. This argument is only intended for + use with softwired networks. Characters in the same block have the same display + tree in a softwired network \citep{WheelerandWashburn2023}. + + \begin{quote} + \texttt{"a" "b\#0" "c\#0"} + \end{quote} + + \item [dot:STRING] Specifies that the file contains a graph in `dot' format for use with + graph rendering software such as \href{https://en.wikipedia.org/wiki/Graphviz}{GraphViz}. + + \item [enewick:STRING] Specifies that the file contains Enhanced Newick format graph(s) + as specified here \citep{Cardonaetal2008}. + + \item [exclude:STRING] Specifies that the file contains the names of terminal taxa to be + excluded from an analysis. Taxa appear in the form of a list, with a single taxon per + line. Thus, taxa not included in the list and present in input files, will be included in + analysis. Compare with \texttt{include}. + + \item [fasta:STRING] Ensures that file contents are parsed in FASTA \citep{PearsonandLipman1988} + format. This is used for single character sequences such as binary streams, IUPAC + nucleotide and amino acid sequence data. Sequence gaps are removed. + + \item [fastc:STRING] Ensures that file contents are parsed in FASTC \citep{WheelerandWashburn2019} + format. This is used for multi-character sequences such as gene synteny, developmental, + or linguistic data. Sequence gaps are removed. + +% \item [fenewick:STRING] Specifies that the file contains Forest Enhanced Newick format graph(s) +% specified \href{https://www.github.com/wardwheeler/euncon}{here} \citep{Wheeler2022}. +% Not yet implemented. + +% \item [gapopening:] Not yet implemented. + +% \item [hugeseq:] Only internal---used for testing purposes. + + \item [include:STRING] Specifies the names of terminal taxa to be included in the analysis. + Taxa appear in the form of a list, with a single taxon per line. It is possible to specify + terminals that have no data. This may be done in order to diagnose a large graph on + partial data. If there are no data for a leaf taxon, a warning will be printed to \texttt{stderr}. + Taxa not included in this list, but present in the inputted data files, will be excluded from + the analysis. Compare with \texttt{exclude}. + + \item [newick:STRING] Specifies that the file contains Newick format graph(s) as specified + \href{https://evolution.genetics.washington.edu/phylip/newick_doc.html}{here}. + + \item [nucleotide:STRING] Ensures that file contents are parsed as IUPAC coded nucleotide + data in FASTA \citep{PearsonandLipman1988} format. Sequence gaps are removed. + + \begin{tcolorbox}[enhanced,fit to height=3.5cm, + colback=JungleGreen!40!black!2!white,colframe=JungleGreen!70!black,title=Note, + drop fuzzy shadow] + Sequences can be divided into smaller fragments using an assigned character (default + `\textbf{\#}'). This character can be chosen by the user (unlike in POY, where the pound sign + (`\textbf{\#}') was the only character used to partition datasets). Each fragment is treated as + an individual character. When partitioning the data in this way, the number of partitions must + be the same across homologous sequences. The character should be set with the command + \texttt{set:partitioncharacter} (see Section \ref{subsec:set}). + \end{tcolorbox} + + \item [preaminoacid:STRING] Specifies that the file contents are parsed as IUPAC coded + amino acid data in prealigned FASTA \citep{PearsonandLipman1988} format. Gap characters + (``-'') in the sequences are maintained and alignment correspondences are not re-examined. + Prealigned amino acid sequence data \textit{must} be of the same length. + + \item [prefasta:STRING] Specifies that the sequences are prealigned in a FASTA format, + leaving gap characters (``-'') in the sequences and alignment correspondences are not + re-examined. This option exists to ensure proper parsing (and in case auto-format detection + is incorrect). Prefasta files can include any single character element such as nucleotide + sequence data, binary data or IUPAC amino acid sequences. Prealigned FASTA files + \textit{must} be of the same length. + + \item [prefastc:STRING] Specifies that the sequences are prealigned in a FASTC format, + leaving gap characters (``-'') in the sequences and alignment correspondences are not + re-examined. This option exists to ensure proper parsing (and in case auto-format detection + is incorrect). Prealigned FASTC files \textit{must} be of the same length. + +% \item [prehugeseq:] Only internal---used for testing purposes. + + \item [prenucleotide:STRING] Ensures that file contents are parsed as IUPAC coded + nucleotide data in FASTA \citep{PearsonandLipman1988} format, leaving gap characters + (``-'') in the sequences and alignment correspondences are not re-examined. Prealigned + nucleotide sequence data \textit{must} be of the same length. + + \item [rename:STRING] Replaces the name(s) of specified terminals in the file. This + command allows for substituting taxon names and can help merge multiple datasets without + modifying the original data file. The file contains a series of lines, each of which contains at + least two strings---these strings equate to synonyms separated by spaces. The first string + (input taxon name) will replace the second and all subsequent strings (taxon names) on + that line. The \texttt{rename} function can also be specified as a command, see \texttt{rename} + (Section \ref{subsec:Rename}) for more detail and examples. + + \item[STRING] Reads the file specified in the path included in the string argument. A path + can be absolute or relative to the current working directory. The file type is recognized + automatically, but as mentioned previously, this should be confirmed. + + \item [tcm:STRING] This refers to a file containing a custom-alphabet matrix that + specifies varying costs among alphabet elements in a sequence. The elements in + the alphabet can be letters, digits, or both. + The \texttt{tcm} contains two parts: the first line of the file contains the alphabet elements + separated by a space and the transformation cost matrix, which follows below. The dash + character representing an insertion/deletion or indel character is not specified on the first + line of the file, but added to the alphabet automatically. The second part is the \texttt{tcm}, + which is a square matrix with $n + 1$ elements ($n$ is the size of the alphabet). + The positions from left to right and top to bottom in this matrix correspond to the sequence + of the elements as they are listed in the alphabet. An extra rightmost column and lowermost + row correspond to indel (gap) costs to and from alphabet elements. At present, this matrix + must be symmetrical, but not necessarily metric. Non-metric tcms can yield unexpected + results. Transformation costs must be integers. If real values are desired, a character can + be weighted with a floating point value factor. \\ + + For a sequence with four elements alpha, beta, gamma and delta and an indel cost of 5 + for all insertion deletion transformations, a valid custom alphabet file is provided below: + \\ + \begin{equation*} + %\nolabel + \begin{array}{lllll} + alpha & beta & gamma & delta & \\ + 0 & 2 & 1 & 2 & 5 \\ + 2 & 0 & 2 & 1 & 5 \\ + 1 & 2 & 0 & 2 & 5 \\ + 2 & 1 & 2 & 0 & 5 \\ + 5 & 5 & 5 & 5 & 0 + \end{array} + \end{equation*} + \\ + + In this example, the cost of transformation of \texttt{alpha} into \texttt{beta} is \texttt{2}, + and cost of a deletion or insertion of any of the four elements is \texttt{5}. + + \item [tnt:STRING] Ensures that file contents are parsed in TNT \citep{Goloboffetal2008} + format. Not all TNT data commands are currently supported. To ensure that the file is + correctly parsed, the file must begin with \texttt{xread}, followed by an optional comment + in single quotes (`this is a comment'), followed by the number of characters and taxa. The + data follow on a new line. Taxon names are followed by character state data. Data can be in + multiple blocks (interleaved) or in sequential format. These interleaved blocks can consist + of a series of single character states without spaces between them, or multiple (or single) + character states (e.g. \textbf{alpha}) with space between the individual codings. Blocks + must be of all one type (i.e. single character codings without spaces, or multi-character + separated by spaces). The data block \textit{must} be followed by a single semicolon + (`;') on its own line.\\ + + The character settings (i.e. \texttt{ccode} commands) follow the data block, beginning + on a new line. These character settings always terminate with a semi-colon (`\texttt{;}'). + These settings include: activate (`\texttt{[}') or deactivate (`\texttt{]}'); make additive/ordered + (`\texttt{+}') or non-additive/unordered (`\texttt{-}'); apply step-matrix costs (`\texttt{(}') with + scopes (e.g. \texttt{cc + 10 12;} and \texttt{cc (.; costs 0 = 0/1 1 0/2 2 0/3 3 0/4 4 1/2 1 + 1/3 2 1/4 3 2/3 1 2/4 2 3/4 1;}) including abbreviated scopes (`\texttt{cc -.;}'). There may + be multiple character setting statements in a single line. Character settings must be + followed by \texttt{proc/;} on its own line. \texttt{PhyG} will not process + any file contents that follow \texttt{proc/;}.\\ + + Additive/ordered character states must be numbers (integer or floating point). Ranges + for continuous characters are specified with a dash within square brackets (e.g. + \texttt{[1-2.1]}). Character state polymorphism are specified in square brackets without + spaces for single character states (e.g. \texttt{[03]}), and with spaces for multi-character + states.\\ + + Dashes in multi-character states (e.g. \texttt{Blue-ish}) are treated as part of the character + state specification. + %If the user wishes that dashes be treated as missing data, the file must be edited to + %reflect this by replacing the dashes that are to be treated as missing data with question + %marks (`?'). + \\ + + Example file: + \begin{quote} + \texttt{xread\\ + `An example TNT file' 8 5\\ + A 000\\ + B a14\\ + C b22\\ + D ?33\\ + E d[01]4\\} + + \texttt{A Blue-ish -\\ + B Green-ish OneFish\\ + C Rather-Red TwoFish\\ + D Almost-Cyan RedFish\\ + E Orange-definitely BlueFish\\} + + \texttt{A 5.2 - ?\\ + B 5.3 0.3 1.1\\ + C 3.2 0.1 1.1\\ + D 5.2 1.1 0.1\\ + E 5.1 1.1 0.1\\ + ;\\ + cc .;\\ + cc + 2;\\ + proc/;\\} + \end{quote} + + \end{description} + + \subsubsection{Defaults} + \texttt{read("fileName")} reads data within the data file ``fileName'' and attempts to + recognize the file type and parse accordingly. The assumed file type is printed to + \texttt{stderr} for verification. + + \begin{example} + + \item{\texttt{read(nucleotide:"/Users/UserName/Desktop/phyg/metazoa.fas", + tcm:"sg1t4.mat")}\\ Reads the file ``\textbf{metazoa.fas}'' located in the path + \textbf{/Users/UserName/Desktop/phyg/}, parsing it as nucleotide sequence + data. The information in the transformation cost matrix \textbf{sg1t4.mat} + is applied to this imported sequence data.} + + \item{\texttt{read(prefasta:"myDnaSequenceFile.fas")}\\ Reads sequence data from + ``\textbf{myDnaSequenceFile.fas}'' as prealigned data.} + + \item{\texttt{read(include:"IncludeTaxa.txt")}\\ Reads a list of taxa in the file + ``\textbf{IncludeTaxa.txt}'' to be included in the analysis. All other taxa not included + in this list, but present in the inputted data files, will be excluded from the analysis.} + + \item{\texttt{read(rename:"RenameFile.txt")}\\ Reads the file ``\textbf{RenameFile.txt}'' + that contains a list synonyms, where the name of the item listed first will be substituted + for all the subsequently listed names. } + + \item{\texttt{read(newick:"squamata\_run1.tre", "squamata\_run2.tre")}\\ Reads the + Newick graph files ``\textbf{squamata\_run1.tre}'' and ``\textbf{squamata\_run2.tre}.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%reblock +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Reblock} +\label{subsec:reblock} + \subsubsection{Syntax} + %\texttt{reblock("newBlockName", "inputFile0", "inputFile1",...)} + \texttt{reblock(STRING list)} + + \begin{phygdescription} + {Assigns input data to `blocks' that will follow the same display tree when optimized + as `softwired' networks. By default, each input data file is assigned its own block with + the name of the input file. The command \texttt{read(block)} (see Section \ref{subsec:read}) + is used to reassign these data to new, combined blocks. Spaces are not allowed in + block names and will produce `unrecognized block name' errors.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item [STRING list] The first argument is the block to be created, the remainder + are the input data to be assigned to that block. Blocks are initially named as the input + file name with `:0' appended. Blocks are reported using the \texttt{report(data)} command. + + \end{description} + + \subsubsection{Defaults} + None. + + \begin{example} + + \item{\texttt{reblock("a","b\#0","c\#0")}\\ Assigns input data from file ``\textbf{b}'' and + ``\textbf{c}'' to block ``\textbf{a}'', provided each of these files contain a single block + of data.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%refine +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Refine} +\label{subsec:refine} + \subsubsection{Syntax} + \texttt{refine([argument list])} + + \begin{phygdescription} + {Performs Genetic Algorithm \citep{Holland1975} for any graph type. In addition, + it performs edit operations (addition, deletion, and move) on network edges that + only applies to softwired and hardwired graphs.} + \end{phygdescription} + + \subsubsection{Arguments} + + \begin{description} + + \item[acceptEqual[:FLOAT]] Specifies that equal cost graphs are accepted with the + probability as set by the FLOAT value. This argument can be applied to \texttt{drift} + and \texttt{annealing}. + + \item[acceptWorse[:FLOAT]] The acceptance of candidate graphs is determined by the + probability $1/ (wf + c_c - c_b)$, where $c_c$ is the cost of the candidate graph, $c_b$ + is the cost of the current best graph, and $wf$ is the values as specified by the float + (default 1.0). This argument can be applied to \texttt{drift} and \texttt{annealing}. + + \item[all] Turns off all preference strategies to make network edits, by simply trying + all possible edits to the graph. This is a memory intensive refinement. + The refinement examines the entire rearrangement neighborhood of the current graph + before retaining the best (lowest cost) solutions. + + \item[annealing[:INT]] Specifies the number of rounds (as specified by the integer + value) of simulated annealing optimization \citep{Metropolisetal1953, + Kirkpatricketal1983, Cerny1985}. This is performed in concert with \texttt{netAdd}, + \texttt{netDel}, and \texttt{netMove}. The acceptance of candidate graphs is + determined by the probability\\ $e ^ {- (c_c - c_b)/ (c_b * (k_{max} -k)/ k_{max})}$, + where $c_c$ is the cost of the candidate graph, $c_b$ is the cost of the current + best graph, $k$ is the step number, and $k_{max}$ is the maximum number of + steps (set by the \texttt{steps:INT}, default 10). + + \begin{description} + + \item[steps:INT] Specifies the number of temperature steps performed + during simulated annealing (as specified by the \texttt{annealing}) option. + The default is 10. + + \end{description} + + \item[atRandom] Network edit neighborhoods are traversed in a randomized order (compare + with \texttt{inorder}). This will result in different trajectories of the network edit space being + explored. + + \item[drift[:INT]] Specifies the number of rounds of the `drifting' form of simulated + annealing optimization \citep{goloboff1999} that are performed. This is done in concert + with \texttt{netAdd}, \texttt{netDel}, and \texttt{netMove}. The acceptance of candidate + graphs is determined by the probability $1/ (wf + c_c - c_b)$, where $c_c$ is the cost + of the candidate graph, $c_b$ is the cost of the current best graph, and $wf$ is the + \texttt{acceptWorse} (set by the \texttt{acceptWorse:FLOAT}, default 1.0) option. Equal + cost graphs are accepted with probability set by the \texttt{acceptEqual} option. + \texttt{Drift} differs from \texttt{annealing} in that there are no cooling steps to modify + acceptance probabilities. The maximum number of graph changes is set by + \texttt{maxChanges}. + + \item[ga] Synonym of GeneticAlgorithm. + + \item[geneticAlgorithm] Performs Genetic Algorithm \citep{Holland1975} refinement in + concert with the following options. + + \begin{description} + +% \item[elitist] Not yet implemented. + + \item[generations:[INT]] Specifies the number of generations (sequential iterations) for + \texttt{ga}. The default is 10. + + \item[maxnetedges:INT] Specifies the maximum number of network edges in the graphs + to be produced. The number of network edges in the graphs produced + by these edit operations are limited by the number as specified by the integer value. + Note: when deciding this value, the user should be aware that the addition of network + edges exponentially increases the time taken to optimize the graph + \cite{WheelerandWashburn2023}. This argument is also used in conjunction with the + network edit operations \texttt{netadd} and \texttt{netadddel}. + + \item[popsize:[INT]] Specifies the population size for \texttt{ga}. The default is + 20. + + \item[recombinations:[INT]] Specifies the number of recombination (fusing) events for + \texttt{ga}. The default is 100. + +% \item[severity:[FLOAT]] Specifies the severity of selection against +% sub-optimal graph solutions events for \texttt{ga}. The higher the value, the less severe +% the penalty. The default is 1.0. Not yet implemented. + + \item[stop:INT] Causes the \texttt{ga} to terminate after the number of + generations as specified by the integer value without improvement in graph + cost. By default, this procedure will only terminate when the number of specified + generations has been completed. + + \end{description} + + \item[inorder] Contra \texttt{atRandom}, network edit neighborhoods are always traversed + in the same undefined, but consistent order. + + \item[keep:INT] Limits the number of returned graphs to that as specified by the + integer value. + + \item[maxnetedges:INT] Specifies the maximum number of network edges in the graphs + to be produced. This argument is used in conjunction with the network edit operations + \texttt{netadd} and \texttt{netadddel}. The number of network edges in the graphs produced + by these edit operations are limited by the number as specified by the integer value. + Note: when deciding this value, the user should be aware that the addition of network + edges exponentially increases the time taken to optimize the graph + \cite{WheelerandWashburn2023}. + + \item[netadd] Adds network edges to existing input graphs at all possible positions + until no better cost graph is found. + + \item[netadddelete] Consecutively adds and then deletes network edges from + input graphs until certain conditions are met. + + \begin{description} + + \item[rounds:INT] Specifies the number of combine network add and delete + edit operations. + + \end{description} + + \item[netdel] Synonym of \texttt{netdelete}. + + \item[netdelete] Deletes network edges from input graphs one at a time until no better + cost graph is found. + + \item[netmove] Moves existing network edges in input graphs one at a time to new + positions until no better cost graph is found. + + % \item[returnmutated] Only internal---used for testing purposes. + + \item[steepest] Specifies that refinement follows a greedy path, abandoning the + neighborhood of the current graph when a better (lower cost) graph is found. + + \end{description} + + \subsubsection{Defaults} + \texttt{refine(ga, generations:10, popsize:20, recombinations:100)} By default, + \phyg will perform Genetic Algorithm refinement, with its associated default options, + if no other arguments are specified.} + + \begin{example} + + \item{\texttt{refine(netadddel, rounds:3, maxnetedges:5)}\\ + Consecutively adds and then deletes network edges from input graphs until + either no improvement (graph cost) is found in a round or until the number of + rounds of addition and deletions (as indicated by \texttt{rounds:INT}) is reached. + The maximum number of edges is still specified by \texttt{maxnetedges:INT} + within each round.} + + \item{\texttt{refine(netmove, atrandom, steepest)} \\ Moves existing network + edges in input graphs one at a time to new positions until there are no more + improvements in the cost of the graph. This edit operation follows a greedy + path, abandoning the neighborhood of the current graph when a better graph + of lower cost is found. During this operation, the network edit neighborhoods + are traversed at random. } + + %\item {\texttt{refine(netmove, atrandom, steepest)}\\ + + %add ga and drift examples. + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%rename +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Rename} + \label{subsec:Rename} + \subsubsection{Syntax} + \texttt{rename(STRING list)} + + \begin{phygdescription} + {Replaces the name(s) of specified terminals in the file. This command can be useful + when combining data from different sources, such as GenBank, or in revising names + to reflect taxonomic changes. It also allows for merging multiple datasets without + modifying the original data file. The command arguments are (minimally) two + strings---these strings equate to synonyms separated by spaces. The first string + will replace the second and all subsequent strings (taxon names) on that line. In the + example given in Figure \ref{renamefile} the taxon Hydrus\_granulatus will be renamed + as Acrochordus\_granulatus, the taxa Gloydius\_boehmei and Gloydius\_mogoi will be + renamed as Gloydius\_halys and the taxa Crotalus\_mutus, Scytale\_catenatus and + Coluber\_crotalinus will be renamed as Lachesis\_muta. Irrespective of where this + command appears in the script file, \phyg will execute this command prior to importing + the data files. Compare with the argument \texttt{rename} of the command \texttt{read} + (Section \ref{subsec:read}). + + \begin{figure}[H] + \centering + \includegraphics[width=0.8\textwidth]{Rename_file.jpg} + \caption{Renaming text file containing the lists of terminal taxa to be renamed.} + \label{renamefile} + \end{figure} + } + \end{phygdescription} + + \subsubsection{Arguments} + + \begin{description} + + \item [STRING list] The first argument is the ``new name'' for the remainder of the + following arguments. + + \end{description} + + \subsubsection{Defaults} + None. + + \begin{example} + + \item{\texttt{rename("a","b","c")}\\ Renames the terminal names ``b'' and ``c'' to + the terminal name ``a''.} + + \end{example} + + +%-------------------------------------------------------------------------------------------------------------------------------- +%report +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Report} +\label{subsec:report} + \subsubsection{Syntax} + \texttt{report([STRING, argument list])} + + \begin{phygdescription} + {Outputs the results of the current analysis to a file or to \texttt{stderr}. To redirect the + output to a file, the file name (in quotes), followed by a comma, must be included in + the argument list of report. All arguments for \texttt{report} are optional. This command + allows the user to output information concerning the characters and terminals, + diagnosis, export static homology data, implied alignments, trees, graphs, dot files, + as well as other miscellaneous arguments. By default, new information printed to + a file is appended to that file. The option \texttt{overwrite} overrides the default and + rewrites the file. Many of the report options are output in csv format, which can + subsequently be imported into spreadsheet programs like \textit{Excel} or + \textit{Numbers} for easy viewing.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item[append] When reporting data or graphs to a file, this information is + appended to the end of the file. By default, files are appended to the report, + rather than overwritten. Compare with the \texttt{overwrite} argument. + + \item[ascii] Reports ASCII character representations of the reported graphs. + This information will be directed to a file, in csv format, if a file name (in quotes), + followed by a comma, is included in the argument list of report. If no file name is + specified, this information will be printed to the stderr. This file can be viewed + in any text editor. + + \item[branchlengths] When reporting graphs, \phyg will report the branch + lengths of these graphs. In Newick and eNewick files, branch lengths + follow the terminal names, separated by a colon. In eps and pdf files, + branch lengths appear above the branches. In ASCII and dot files, branch + lengths follow the terminal labels. This is the default. Compare with + \texttt{nobranchlengths}. + + \item[color] For GraphViz or \texttt{dot} output, \phyg will write the edges of + the graph in color. These colors are based on the value of the edge and are + attributed to RBG colors. There are 11 colors in total, ranging from violet (the + lowest number) to red (the highest number). These colors are discrete and not + continuous. + + \item[collapse] Specifies that zero length branches are collapsed. If a dotpdf + graph is specified, the branches are collapsed by default. If ASCII, Newick, + eNewick and dot are specified, the zero length branches are not collapsed by + default. Compare with the \texttt{nocollapse} argument. + + \item[crossrefs] Reports whether data are present or absent for each terminal + in each of the imported data files. The argument will report a table with terminals + represented in rows, and the data files in columns. A plus sign (`+') indicates that + data for a given terminal is present in the corresponding file; a minus sign (`--') + indicates that it is absent. This provides a comprehensive visual overview of the + completeness of the data. It will highlight missing data, as well as inconsistencies + in the spelling of taxon names in different data files (see Figure \ref{crossrefs}). + This information will be directed to a file, in csv format, if a file name (in quotes), + followed by a comma, is included in the argument list of report. If no file name is + specified, this information will be printed to the stderr. + + \begin{figure} + \centering + \includegraphics[width=0.55\textwidth]{crossrefs1.png} + \caption{The figure shows a crossrefs files, which has been imported into + \textit{Excel}.} + \label{crossrefs} + \end{figure} + + \item[data] Outputs a summary of the input data and terminals. Information relating + to the input data (number of terminals, number of input files, number of character + blocks and the total number of characters) is summariazed. It also provides + information relating to the terminal taxa included in the analysis, including the + names of the taxa, a list of the excluded taxa (if any), and whether any terminals + were renamed. In this file you will also see information relating to ``Index'', ``Block'', + ``Name'', ``Type'', ``Activity'', ``Weight'', ``Prealigned'', ``Alphabet'', and ``TCM''. + ``Index'' reports the character number in the overall dataset; ``Name'' reports the + name of the character (by default based on its source data file); ``Type'' is the type + of character (e.g. Non-Additive, Matrix, Nucleotide Sequence), ``Activity'' reports + whether the character is active (included in the analysis) or not (excluded), + ``Weight'' is the weight of the character, ``Prealigned'' denotes whether a + sequence character (e.g. amino acids) is to treated as prealigned or not, + ``Alphabet'' the elements of a sequence character, ``TCM'' is the transition cost + matrix specifying costs among sequence elements and ``gap'' or insertion-deletion. + This information will be directed to a file, in csv format, if a file name (in quotes), + followed by a comma, is included in the argument list of report. If no file name is + specified, this information will be printed to the stderr. + + \item[diagnosis] Outputs graph diagnosis information such as vertex, states + and edge statistics. This information will be directed to a file, in csv format, if a + file name (in quotes), followed by a comma, is included in the argument list of + report. If no file name is specified, this information will be printed to the stderr. + + \item[displaytrees] Reports graph information for softwired networks. The + `display' trees are output for each data block. This information will be directed + to a file, if a file name (in quotes), followed by a comma, is included in the + argument list of report. If no file name is specified, this information + will be printed to the stderr. + + \item[dot] Outputs a graph in dot format. The dot file can be viewed (and + modified) in \textit{Graphviz} (see also \texttt{dotpdf}). In order to output pdf + files the application \textit{dot} must be installed from the + \href{https://graphviz.org/download/}{Graphviz} website. Graphviz is + open-source graph visualization software. \texttt{dot} is the default graph + representation---a dot file will only be reported if no other graph type is specified. + + \item[dotpdf] Outputs two files---a graph file in dot format, along with either + an eps (on MacOS) or pdf (on Linux) (see also \texttt{dot}). The eps and pdf + files can be read in \textit{Adobe Illustrator}, \textit{Apple Preview} or any + vectorial image edition program. By default, when \texttt{dotpdf} is specified, + edges are collapsed (contracted) if they have a minimum weight (length) of 0. + + \item[graphs] Outputs a graph in the format specified by the other arguments + in the command. These are either Newick, eNewick, ASCII, dot, and dotpdf, + which will output a graph in eps (on MacOS) or pdf (on Linux) format. + + \item[htulabels] Labels the HTUs in the output files. This is the default. Compare + with \texttt{nohtulabels}. + + \item[ia] Synonym of \texttt{impliedalignment}. + + \item[impliedalignment] Outputs the implied alignments of the specified + set of characters in FASTA or FASTC (depending on sequence type) format. + This argument is synonymous with the argument \texttt{ia}. By default, + an implied alignment is reported for each block of input data. + + \begin{description} + + \item[concatenate] This optional argument can be used with \texttt{ia} + | \texttt{impliedalignment}. Instead of outputting an implied alignment + for each block of input data, it will report a file with all the implied + alignments concatenated into a single file. + + \item[includemissing] Outputs a FASTA or FASTC (depending on sequence type), + including taxa that are missing data for that particular block of data in the implied + alignment. + In this case, \phyg will print question mark characters (`?') for the missing taxon in + this block of the implied alignment. This option interacts nicely with + \texttt{concatenate} in creating prealigned FASTA (implied alignments) + with all the terminals included for all the data. By default, taxa with + missing data are not included. + + \end{description} + + \item[metadata] Outputs edge and vertex metadata in csv format. + + \item[newick] Outputs graphs in the eNewick or Newick format, with the terminals + separated with commas, and graphs separated with semicolons. Branch + lengths follow terminals, separated by a colon. If newick is not specified, + a \texttt{dot} file will be reported by default. + + \item[nobranchlengths] When reporting graphs, \phyg will by default + report the branch lengths of these graphs. In Newick and eNewick files, + branch lengths follow the terminal names, separated by a colon. In eps + and pdf files, branch lengths appear above the branches. In ASCII and + dot files, branch lengths follow the terminal labels. The argument + \texttt{nobranchlengths} will override this default and branch lengths + of graphs are not reported. + + \item[nocollapse] Specifies that zero length branches are not collapsed. + If ASCII, Newick, eNewick and dot graphs are specified, the zero length + branches are not collapsed by default. In contrast, if a dotpdf graph is + specified, the zero-length branches are collapsed. Note: by specifying + a dotpdf file, this will by default also output a dot file---in both these files + zero length branches are collapsed. Compare with the \texttt{collapse} + argument. + + \item[nohtulabels] Labels of the HTUs are not included in the output files. + This option can not be applied to eNewick graph. Compare with the argument + \texttt{htulabels}. + + \item[overwrite] By default, when reporting data or graphs to a file, the + information is appended to the end of the file (see \texttt{append}). The + option \texttt{overwrite} overrides this default and rewrites the file rather + than appending to the existing information. + + \item[pairdist] Outputs a taxon pairwise distance matrix in csv format. + This information will be directed to a file, if a file name (in quotes), followed + by a comma, is included in the argument list of report. If no file name is + specified, this information will be printed to the stderr. + + \item[parameterestimation] Outputs a csv file related to the element + frequencies and character change frequencies for both block and character + type data. + + \item[reconcile] Outputs a single `reconciled' graph from all graphs in + memory. The methods include consensus, supertree, and other supergraph + methods as described in \cite{Wheeler2012, Wheeler2022}. When \texttt{reconcile} + is specified as a command option a series of other options may be specified + to tailor the desired outputs: + + \begin{description} + + \item [Compare:] Specifies how group comparisons are to be made. + + \begin{description} + \item[combinable] Group comparison are made by identical match + [(A,(B,C))$\neq$(A,B,C)]. This is the default. + + \item[identity] Group comparison are made by combinable components sensu + \cite{Nelson1979} [(A,(B,C)) consistent with (A,B,C)]. This option can be used + to specify `semi-strict' consensus \citep{Bremer1990}. + + \end{description} + + \item [Connect:BOOL] Specifies that the output graph be connected + (single component), potentially creating a root node and new edges labeled + with ``0.0''. The default value is TRUE. This option will connect any forest elements + into a single graph. + + \item [Method:] Specifies the reconciliation method when more than a single + graph is returned. + + \begin{description} + + \item[Adams] Specifies that returned graphs should be reconciled using the + Adams II consensus \citep{Adams1972} method. + + \item[cun] Graphs are reconciled using the Cluster Union Network + \citep{Baroni2005} method. This argument can be used in conjunction with + \texttt{threshold}. + + \item[eun] Graphs are reconciled using the Edge-Union-Network method of + \citep{MiyagiandWheeler2019}. This argument can be used in conjunction with + \texttt{threshold}. This is the default. + + \item[majority] Specifies that values between 0 and 100 of either vertices or + edges will be retained. If all inputs are graphs with the same leaf set this will + be the Majority-Rule Consensus \citep{MargushandMcMorris1981}. This + argument is used in conjunction with \texttt{threshold}. + + \item[strict] This option requires all vertices are present to be included in the final + graph. If all inputs are graphs with the same leaf set this will be the Strict Consensus + \citep{Schuhandpolhemus1980}. + + \item [Threshold:INT] Specifies the threshold frequency (between 0 and 100) + of vertex or edge occurrence in input graphs to be included in the output graph. + This value affects the behavior of `eun', `cun', and `majority'. The + default value is $0$. + + \end{description} + + \item [EdgeLabel:BOOL] Specifies the output graph have edges + labeled with their frequency in input graphs. The default value is TRUE. + + \item [VertexLabel:BOOL] Specifies the output graph have vertices + labeled with their subtree leaf set. The default value is FALSE. + + \end{description} + + \item[search] Outputs search statistics in csv format. See Section + \ref{subsec:search} for details about the randomized series of graph + optimization methods included in each iteration of a timed search. + + \item [STRING] Specifies the name of the file to which all types of report + outputs, designated by additional arguments, are printed. If no additional + arguments are specified, a graph in dot format, along with an eps or pdf + file (depending on the operating system) will be reported to a file named + `\textbf{defaultGraph}'. By default, files are appended to the report, rather + than overwritten (see \texttt{overwrite}). + + \item[support] Outputs support graphs (see Section \ref{subsec:support}) + that have been previously calculated by the command \texttt{support}. + Resampling graphs \citep{Farrisetal1996} are independent of the input graphs + while Goodman-Bremer graphs \citep{Goodmanetal1982, bremer1994} are + based on current graphs. Graphs can be output in multiple formats with the + use of the options \texttt{ascii}, \texttt{newick}, \texttt{enewick}, \texttt{dot}, + and depending on your operating system, \texttt{dotpdf} for pdf (Linux) or + eps (MacOS). See Section \ref{sec:outputgraphs} for information relating to + viewing and installation requirements. Should you fail to choose one of + these reporting formats, \phyg will issue a warning in the output display + of the \textit{Terminal} window, and output a `dot' file that can be processed + later. + + \item[tnt] Outputs information in TNT \citep{Goloboffetal2008} format (see + Section \ref{subsec:TNT}) using \texttt{impliedAlignment} for unaligned + sequences. This information will be directed to a file, in csv format, if a file + name (in quotes), followed by a comma, is included in the argument list of + report. If no file name is specified, this information will be printed to the stderr. + + \end{description} + + \subsubsection{Defaults} + \texttt{report(append, branchlengths, collapse, dot, htulabels)} + The default graph representation is \texttt{dot}. A dot file will only be + reported if no other graph type is indicated. Branch lengths and HTU + labels are printed in the output files. Zero length branches are collapsed. + When writing to a file, the information is appended to the end of the file. + + \begin{example} + + \item{\texttt{report("outFile.tre", newick, overwrite)}\\ Outputs graphs in newick + format to the file ``\textbf{outFile.tre}'', overwriting any existing information. + In addition, zero length branches are collapses and branch lengths and HTU + labels are printed in the output files (the default).} + + \item{\texttt{report("outFile\_cr.csv", crossrefs)}\\ Outputs the cross reference file + ``\textbf{outFile\_cr.csv}'' with data pertaining to the presence and absence of + taxa in the input files. This information is appended to the end of the file (the + default), if existing information exists.} + + \item{\texttt{report("outFile", dot, reconcile, method:eun, threshold:51)}\\ Outputs a + graph which has been reconciled using the Edge-Union-Network method with a + minimum edge frequency of 51\%. The graph is outputted in dot format to the + file ``\textbf{outFile}', appending to any existing information in the file.} + + \item{\texttt{report("outFile\_supports", support, dotpdf)}\\ Outputs a graph file in + dot format, along with either an eps (on MacOS) or pdf (on Linux) (see also dot). + Support values appear above branches in the eps or pdf graphs, and are \hl{...}. + Support values are generated in the script using the command \texttt{support} + will appear above the branch in the eps or pdf files.} + + \end{example} + +%--------------------------------------------- +%run +%--------------------------------------------- +\subsection{Run} + \subsubsection{Syntax} + \texttt{run(STRING)} + + \begin{phygdescription} + {Executes \phyg script file(s) containing commands. The filename must be + included in quotes. Executing scripts using \texttt{run} can be useful to specify + common actions such as inputting file(s) and graph construction.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item[STRING] The only argument is the name of the script-containing file + containing the commands to be executed. + + \end{description} + + \subsubsection{Defaults} + There are no default settings of \texttt{run}. + + \begin{example} + + \item{\texttt{run("readFiles.pg")}\\ Executes ``\textbf{readFiles.pg}'', which + may contain multiple input files to be \texttt{read}.} + + \item{\texttt{run("searchCommands.pg")}\\ Executes ``\textbf{searchCommands.pg}'', + which may contain commands defining a common search strategy (e.g. + \texttt{build}).} + + \end{example} + +%--------------------------------------------- +%search +%--------------------------------------------- +\subsection{Search} +\label{subsec:search} + \subsubsection{Syntax} + \texttt{search([argument list])} + + \begin{phygdescription} + {This command implements a randomized search strategy, performing a timed + series of graph optimization methods including building, swapping, + recombination (fusing), simulated annealing and drifting, network edge edits + (addition, deletion and moving), and Genetic Algorithm. The parameters and + their order of implementation are randomized. The arguments + associated with this command specify the duration and number of independent + instances of the \texttt{search}. Successive rounds of \texttt{search} gather any + solutions from previous sequential or parallel rounds, as well as any input graphs. + Since search methods may vary in how long they take, individual iterations may + take longer than the specified duration. By default, search strategies are chosen + uniformly at random, and if \texttt{Thompson} is specified, Thompson sampling + \cite{Thompson1933,WheelerThompson} is used to modify the probabilities + of search strategies over the search duration. + + When performing a \texttt{search} it is important to set the maximum amount of time, + such that the program has a reasonable amount of time to complete the search. + Therefore, it is important to have some idea as to the length of time it would take + to do a single round of searching. Performing a simple search that includes + build, swap, and network edits (if applicable) and calculating the amount of time + for a single graph within this search provides some approximation as to the amount + of time necessary to perform a thorough search. Obviously, this estimate is data and + optimality criterion dependent. With this information, the user can then estimate the + amount of time necessary to perform a thorough search (perhaps 10 times the + amount of time it took to perform this simple search). It is recommended to perform + a \texttt{swap(joinAll)} (see Section \ref{subsec:swap}) after the \texttt{search} has + completed. The user should also allow some time for the program to collate and + write the results to files.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item[days:INT] Adds the number of days (as specified by the integer) to the + maximum total execution time for the search. + + \item[hours:INT] Adds the number of hours (as specified by the integer) to the + maximum total execution time for the search. + + \item[instances:INT] Specifies the number of (potentially parallel) simultaneous + searches (instances), as indicated by the integer value. The overall search will + terminate only when all instances have completed. + + \item[maxNetEdges:INT] Limits the maximum number of network edges to that + specified by the integer value. This should only to be used if \texttt{graphtype} + has been \texttt{set} to \texttt{hardwired} or \texttt{softwired}. + + \item[minutes:INT] Adds the number of minutes (as specified by the integer) to + the maximum total execution time for the search. + +% \item[simple] Only internal---used for testing purposes. + +% \item[seconds:INT] Adds the number of seconds (as specified by the integer) to the +% maximum total execution time for the search. Only internal---used for testing purposes. + + \item[stop:INT] Causes a search instance to terminate after the number of iterations + without graph optimality improvement. This argument is an adjunct to the specified + time. The search iteration will continue until the maximum execution time for the search + has ended or until the cost of the best graph fails to improve after the specified integer + (whichever comes first). The overall search will terminate only when all instances have + completed. + + \item[Thompson:INT] Specifies that the randomized choice of search option (e.g. + Wagner Build, SPR, Genetic Algorithm) employs Thompson sampling \citep{Thompson1933}. + This form of sampling uses machine learning techniques to guide the randomized option + selection of the search \citep{WheelerThompson}. This sampling method is a heuristic for + timed search decision making. The integer value specifies the memory value for the Thompson + sampling. The lower the value (down to 0), the shorter the memory allowing for more rapid + adjustment of search strategy probabilities. This parameter is applied to both the \texttt{linear} + and \texttt{exponential} arguments of \texttt{Thompson}. + + \begin{description} + + \item[exponential] Specifies that the Thompson memory is an exponential function of + $m$ (the \texttt{INT} value specified by \texttt{Thompson} above). Updating of search + type, $\theta^k$, probability for iteration $n$ is $ \left(1 - \frac{1}{2^m} \theta^k_{n-1}\right) + + \left(\frac{1}{2^m} \theta^k_n \right)$. Thompson success is a function of whether a + search was successful in reducing the graph cost and how long that search took to + complete. + + \item[linear] Specifies that the Thompson memory is a linear function of $m$ (the + \texttt{INT} value specified by \texttt{Thompson} above). Updating of search type, $\theta^k$, + probability for iteration $n$ is $\frac{m}{m+1} \theta^k_{n-1} + \frac{1}{m+1} \theta^k_n$. + Thompson success is a function of whether a search was successful in reducing the graph + cost and how long that search took to complete. + + \end{description} + + \end{description} + + \subsubsection{Defaults} + \texttt{search(instances:1, seconds:30)} Under the default parameters, \phyg will perform 1 + instance of a search for at most 30 seconds. + + \begin{example} + + \item{\texttt{search(hours:10, instances:2)}\\ Performs 2 searches (instances) for 10 hours + each. These searches will be performed simultaneously if the computer capacity allows.} + + \item{\texttt{search(hours:10, minutes:30)}\\ Performs a single search instance (the default) + for 10 hours and 30 minutes.} + + \item{\texttt{search(days:1, thompson:2, linear)}\\ Employs Thompson sampling to guide the + randomized option selection of the search. The Thompson memory value is a linear function + of the parameter 2. This search is performed for 1 day.} + + \end{example} + +%--------------------------------------------------------------------------------------------------------------------------------- +%select +%--------------------------------------------------------------------------------------------------------------------------------- +\subsection{Select} + \subsubsection{Syntax} + \texttt{select([argument list])} + + \begin{phygdescription} + {Specifies the method for choosing and number of graphs to be saved at any point + during the analysis. When multiple graphs are present, the \texttt{select} command + will specify which graphs are retained for subsequent analysis or reporting.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item[all] Specifies all the graphs are kept. + + \item[atRandom:[INT]] Randomly selects the graphs irrespective of cost. The + number of chosen graphs can be specified with an integer value. This can + be a useful tool for randomized searches and subsequent analyses where + randomized sampling is desired. + + \item[best:[INT]] Selects the number of best or lowest cost graphs. The number + of returned graphs can be specified by the integer value. If no integer value is + specified, all of the graphs of best optimality value will be returned. If the number + of optimal graphs exceeds the value of best, the subset of optimal graphs are + chosen in an undefined but consistent order. + + \item[threshold:FLOAT] Keeps all the best (shortest) graphs and all the unique + graphs up to the fraction longer than shortest graph. This fraction is specified by + the \texttt{FLOAT}. + + \item[unique:[INT]] Selects only topologically unique graphs, irrespective of their + cost. The program will keep as many graphs as specified by the integer value. + + \end{description} + + \subsubsection{Defaults} + \texttt{select(best)} Keeps all unique graphs of best optimality value. + + \begin{example} + + \item{\texttt{select(atrandom:10)}\\ Keeps up to 10 graphs, selecting the graphs + at random.} + + \item{\texttt{select(best:10)}\\ Keeps up to 10 graphs of best optimality value.} + + \item{\texttt{select(threshold:0.5)}\\ Keeps all the graphs of best optimality value + and the unique graphs that are up to 50\% longer than these graphs.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%set +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Set} +\label{subsec:set} + \subsubsection{Syntax} + \texttt{set(argument string)} + + \begin{phygdescription} + {Changes the settings of \texttt{PhyG}. This command performs an array of functions + from specifying the seed of the random number generator, to selecting a terminal for + rooting output graphs, to specifying graph type, final assignment, and optimality + criterion. All \texttt{set} commands are executed at the start of a run, irrespective of + where they appear in the script. The command \texttt{transform} is used to modify + global settings during a run (see \texttt{transform} Section \ref{subsec:transform}). + \texttt{set} arguments have to be indicated on separate lines of the script, otherwise + \phyg will return an error.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + +% \item[bc2] Related to Information Theory. Not implemented yet. + +% \item[bc4] Related to Information Theory. Not implemented yet. + +% \item[bc5] Related to Information Theory. Not implemented yet. + +% \item[bc8] Related to Information Theory. Not implemented yet. + +% \item[bc64] Related to Information Theory. Not implemented yet. + +% \item[bcgt64] Related to Information Theory. Not implemented yet. + +% \item[compressResolutions:True|False] +% Determines whether softwired graph resolutions are ``compressed'' if multiple +% vertex assignments in alternate display trees are equal in subtree leaf set, only +% the first lowest cost resolution is retained. This option can significantly reduce +% the time to evaluate softwired graphs, but can increase the optimality score of +% the graph. This can be used in combination with \texttt{transform(compressResolutions:True)} +% or \texttt{transform(softwiredMethod:\\Exhaustive)} +% at a later stage to improve graph score. Not implemented yet. + + \item[criterion:] Sets the optimality criterion for graph search. + + \begin{description} + + \item[ncm] Graph costs are determined by the No-Common-Mechanism + likelihood model of \citep{TuffleyandSteel1997}. This is exact for trees, but + for networks should be treated as a form of character weighting. + + \item[parsimony] Graph costs are determined by the parsimony criterion, + that is the minimum sum of transformation events multiplied by their weights. + This is the default. + + \item[pmdl] Graph costs are determined by the Phylogenetic Minimum Description Length criterion, + of \citep{WheelerandVaron2025}. This command option is used in conjunction with + \texttt{modelComplexity}. Any network costs are specified by the criterion and other network + penalty options are ignored. Costs are reported in bits. + + \item[si] Graph costs are determined by the Self-Information (a form of maximum likelihood) criterion + of \citep{WheelerandVaron2025}. This command option ignores. + \texttt{modelComplexity}. Any network costs are specified by the criterion and other network + penalty options are ignored. Costs are reported in bits. + + + \end{description} + +% \item[defparstrat] Only internal---used for testing purposes. + + \item[dynamicepsilon:[FLOAT]] Alters the performance of the heuristic graph cost + calculations to force a full graph evaluation more frequently. This will slow down the + search but has the potential to identify better solutions. The float indicates the additional + cost factor in evaluating the graphs. + + \item[finalAssignment:] Sets the method of determining the `final' sequence states. + + \begin{description} + + \item[DirectOptimization] DirectOptimization uses the DO method to assign + the final states. This is more time consuming than \texttt{ImpliedAlignment}. + DO has an additional factor of potentially $O(n^2)$ in sequence length compared + to the constant factor for IA due to additional graph traversals. This is the default. + + \item[DO] Synonym of \texttt{DirectOptimization}. + + \item[IA] Synonym of \texttt{ImpliedAlignment}. + + \item[ImpliedAlignment] Uses implied alignments to assign the final states. This + final assignment procedure has a lower time complexity than DO. + + \end{description} + + \item[graphFactor:] Sets the network penalty for a softwired network. When conducting + a network analysis, a penalty can be specified, so that ‘softwired’ phylogenetic networks + can compete equally with phylogenetic trees on a parsimony optimality basis. A network + penalty takes into account the change in cost as edges are added to the graph. Hardwired + graphs are automatically set to \texttt{NoPenalty}. + + \begin{description} + + \item[nopenalty] No penalty is added. + + \item[w15] Employs the parsimony network penalty of \cite{Wheeler2015}. + + \item[w23] Sets the parsimony network penalty of \cite{WheelerandWashburn2023}, + which is more severe (the penalty is applied to all the blocks) but has a lower time + complexity than W15. + + \end{description} + + \item[graphsSteepest:[INT]] Sets the maximum number of graphs to be evaluated + simultaneously during `steepest' descent in swapping and network addition and + deletion operation. The number is the lower of the number of parallel threads + and $n$ (default 10). Setting this number to higher values can improve the use of + parallel resources, but at the cost of additional memory footprint. Accordingly, + reduction in this value can reduce memory consumption. + + \item[graphType:] Sets the phylogenetic graph type. The program allows for the input, + analysis of and output of a broad class of phylogenetic graphs. These include trees, + and both `softwired’ and `hardwired’ networks. + + \begin{description} + + \item[tree] Sets the graph type to tree. This is the default. + + \item[hardwired] Sets the graph type to hardwired. A `hardwired' network is + one where characters can have multiple parents. + + \item[softwired] Sets the graph type to softwired. A `softwired' network is + a summary of a set of individual `display' trees that have been generated by + removing edges from the network. Individual characters have single parents. + + \end{description} + +% \item[jointhreshold] Only internal---used for testing purposes. + +% \item[lazyparstrat] Only internal---used for testing purposes. + + \item[missingThreshold:[INT]] Sets the maximum percentage of missing data in a + terminal. The default setting is 100, leaving all terminals in the data set. Terminal taxa + with more than the missing fraction are deleted. The \texttt{missingThreshold} value is + determined by the number of input blocks (files). Hence, a taxon with all missing values + in a TNT file will be counted as not missing for this purpose. + + \item[modelComplexity] The model complexity for use with the PMDL optimality criterion + of \citep{WheelerandVaron2025}. + These values can be generated using \texttt{phyloComplexity} + \href{https://github.com/amnh/PhyAlgInfo}{phyloAlgInfo} + + \item[multiTraverse:[BOOL]] Controls the multi-traverse character optimization + option. If \texttt{True}, character trees are traversed from each edge as in + \citep{VaronandWheeler2012,VaronandWheeler2013, POY4, POY5}, but + individually for each dynamic character. This is the default and yields better + (lower) optimality scores at the cost of added execution time. This is the default. + If \texttt{False}, only outgroup-based traversals are performed as in + \citep{Wheeler1996, POY2, POY3}. + + \item[outgroup:STRING] Specifies the terminal to root the output trees. + This name must appear in double quotes. If the outgroup is not set, the + default outgroup is the taxon whose name is lexically first after any renaming + of taxa, and/or if taxa were specified by using the arguments \texttt{include} + or \texttt{exclude}. Only a single taxon can be set as the outgroup of the analysis. + + \item[partitioncharacter:STRING] Sets the character that is used to partition the + data. Sequences can be divided into smaller fragments using an assigned character. + This single character is chosen by the user (unlike in POY, where the pound sign + (`\textbf{\#}') was the only option to partition datasets). Each fragment is treated as an + individual character. When partitioning the data in this way, the number of partitions + must be the same across homologous sequences. The default is `\textbf{\#}'. + +% \item[reportnaive] Only internal---used for testing purposes. + + \item[rootCost:] Sets the root cost for a graph. + + \begin{description} + + \item[noRootCost] No cost is ascribed to the root. This is the default for parsimony searches. + + \item[w15] Sets a cost at $\frac{1}{2}$ the cost of `inserting' the root character + assignments. The W15 root cost is based on the same rationale as the parsimony + network penalty of \cite{Wheeler2015}. + + \end{description} + + \item[seed:INT] Sets the seed for the random number generator using the integer + value. If unspecified, \phyg uses the system time as the seed. By setting this value, + we are guaranteed to reproduce a given search trajectory each time the script is run. + This is the case even when the operations are randomized, as is the case when using + the argument \texttt{atrandom}. + + \item[softwiredMethod:] Sets the algorithm for softwired graphs to the + `Naive' method of diagnosing all display trees, as in \cite{Wheeler2015} or + the `Resolution Cache' method of \cite{WheelerandWashburn2023}. + + \begin{description} + + \item[Naive] Determines the score of a softwired network by evaluating all display + trees (up to $2^m$ for $m$ network nodes) and summing the costs of the minimum + cost display tree for each block of data. This can be extremely time consuming for + graphs with large number of network nodes. + + \item[ResolutionCache] Uses the method of \citep{WheelerandWashburn2023} to + determine softwired network costs in greatly reduced time. This is the default. + + \end{description} + +% \item[strictparstrat] Only internal---used for testing purposes. + + \item[useia] Uses implied alignments to do as many of the calculations as possible. + This method is faster, but approximate. + + \item[usenetaddheuristic] Uses heuristic graph cost calculations when adding network + edges to the graph. + + \end{description} + + \subsubsection{Defaults} + \texttt{set(criterion:parsimony)}\\ + \texttt{set(FinalAssignment:DirectOptimization)} \\ + \texttt{set(graphFactor:w15)} \\ + \texttt{set(graphtype:tree)}\\ + \texttt{set(outgroup:STRING)} The default outgroup is the taxon whose name is + lexically first after renaming of taxa and/or if taxon were specified for inclusion or + exclusion. + +% \texttt{CompressResolutions} +% is set to \texttt{True}, +% and \texttt{PMDL} if PMDL is set as the optimality criterion. The default rootCost +% is \texttt{noRootCost} if parsimony is the optimality criterion and \texttt{PMDL} if PMDL is set +% as the optimality criterion. + + \begin{example} + + \item{\texttt{set(criterion:parsimony)}\\Sets the graph search optimality criterion to + parsimony.} + + \item{\texttt{set(outgroup:"Lingula\_anatina")}\\This command selects the terminal + "Lingula\_anatina" and sets it as the outgroup for the analysis.} + + \item{\texttt{set(partitioncharacter:@)}\\Sets the partition character in the input dataset + to the at sign `@'.} + +% \item{\texttt{set(compressResolutions:False)}\\Turns off softwired graph resolution compression.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%support +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Support} +\label{subsec:support} + \subsubsection{Syntax} + \texttt{support([argument list])} + + \begin{phygdescription} + {Generates graph supports via resampling \citep{Farrisetal1996} and Goodman-Bremer + \citep{Goodmanetal1982, bremer1994}. Currently, Jackknifing, Bootstrapping and + Goodman-Bremer supports are the resampling methods that are implemented. If + the criterion is set to \texttt{ncm} (see Section \ref{subsec:set}), the support values + for Goodman-Bremer are log likelihood ratios.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + +% \item[atRandom] When testing support for network edges, it randomizes the evaluation. +% Only internal---used for testing purposes. + + \item[bootstrap] Calculates Bootstrap support. The user can specify the number of + iterations, using \texttt{replicates}. The edges are labeled with the bootstrap + frequencies when output via the command \texttt{report(support)}. Default replicate + search is 100 random addition sequence distance-based Wagner builds and TBR + branch swapping on the best distance tree. If the graph type is softwired or hardwired, + \texttt{netadd}, \texttt{maxnetedges:5}, \texttt{atrandom} are added. + + \item[buildonly] Performs very rapid, but not extensive graph searches for each + resampling replicate. + + \item[gb] Synonym of GoodmanBremer. + + \item[goodmanBremer] Specifies that Goodman-Bremer support is + calculated for input graphs. + + \begin{description} + + \item[gbsample:[INT]] Specifies the number of alternate graphs to be examined + (i.e. limited). The graphs are chosen uniformly at random. This is used to reduce + execution time of the Goodman-Bremer support at a cost of potentially increased + overestimates (higher upper bound values). + + \item[spr] Traverses the SPR neighborhood to determine an upper-bound on + the NP-hard values. + + \item[tbr] Traverses the TBR neighborhood + as optionally specified (TBR as default) to determine an upper bound on the NP-hard + values (this is the method used in POY v2; \citealp{POY2} \textit{et seq.}). + + \end{description} + + \item[jackknife:[FLOAT]] Specifies that Jackknife resampling is performed with $n$ + acceptance probability (default 0.6231 or $1 - e^{-1}$). When reported (via + \texttt{report(support)}), edges are labeled with the jackknife frequencies. Default + replicate search is 100 random addition sequence distance-based Wagner builds + and TBR branch swapping on the best distance tree. If the graph type is softwired + or hardwired, \texttt{netadd}, \texttt{maxnetedges:5}, \texttt{atrandom} are added. + + \item[replicates:[INT]] Specifies the number of resampling replicates, as indicated by the + integer value, that are performed in resampling support for Jackknife and bootstrap methods. + The default is 100. + + \end{description} + + \subsubsection{Defaults} + \texttt{support(goodmanBremer:TBR)} Calculates Goodman Bremer support values, + via traversing the TBR neighborhood. + + \begin{example} + + \item{\texttt{support(jackknife:0.50, replicates:1000)}\\Performs 1000 replicates of + delete 50\% jackknife resampling.} + + \item{\texttt{support(gb, SPR, gbSample:10000)}\\Produces Goodman-Bremer + support based on $10,000$ samples of the SPR neighborhood.} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%swap +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Swap} +\label{subsec:swap} + \subsubsection{Syntax} + \texttt{swap([argument list])} + + \begin{phygdescription} + {Performs branch-swapping rearrangement on graphs. This command implements a + group of algorithms referred to as branch swapping, that proceed by clipping + parts of the given tree and attaching them in different positions. These algorithms + include `NNI' \citep{CaminandSokal1965, Robinson1971}, `SPR' \citep{Dayhoff1969}, + and `TBR' \citep{Farris1988, swofford1990a} refinement. Default swapping trajectories + are randomized.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + + \item[all] Turns off all preference strategies to break and join graphs, simply trying all + possible rearrangements. The refinement examines the entire rearrangement neighborhood + of the current graph before retaining the best (lowest cost) solutions. The user is advised to + only use this option on small datasets as it can be computationally intensive (and + time consuming). + + \item[alternate] Specifies that alternating rounds of \texttt{spr} \citep{Dayhoff1969} + and \texttt{tbr} \citep{Farris1988, swofford1990a} refinement are performed. + + \item[annealing[:INT]] Specifies the number of rounds of simulated annealing + \citep{Metropolisetal1953,Kirkpatricketal1983,Cerny1985} optimization to be performed, + as indicated by the integer value (the default is 1). The acceptance of candidate graphs + is determined by the probability $e ^ {- (c_c - c_b)/ (c_b * (k_{max} -k)/ k_{max})}$, + where $c_c$ is the cost of the candidate graph, $c_b$ is the cost of the current best + graph, $k$ is the step number, and $k_{max}$ is the maximum number of steps (set + by the argument \texttt{steps}, with a default of 10). + + \begin{description} + + \item[steps:[INT]] Specifies the number of temperature steps to be performed during + simulated annealing (as specified by the \texttt{annealing}) option. The default is 10. + + \end{description} + + \item[atRandom] Swap neighborhoods are traversed in a randomized order (compare + with \texttt{inorder}). This will result in different trajectories of the swap space being + explored. This is the default. + + \item[drift[:INT]] Specifies the number of rounds of the `drifting' form of simulated + annealing \citep{goloboff1999} optimization that are performed, as indicated by the integer + value (default 1). The acceptance of candidate graphs is determined by the options + \texttt{acceptEqual} and \texttt{acceptWorse}. \texttt{drift} differs from \texttt{annealing} + in that there are no cooling steps to modify acceptance probabilities. The maximum + number of graph changes is set by \texttt{maxChanges}. + + \begin{description} + + \item[acceptEqual[:FLOAT]] Specifies that equal cost graphs are accepted with the + probability as set by the FLOAT value. + + \item[acceptWorse[:FLOAT]] The acceptance of candidate graphs is determined by the + probability $1/ (wf + c_c - c_b)$, where $c_c$ is the cost of the candidate graph, $c_b$ + is the cost of the current best graph, and $wf$ is the values as specified by the float + (default 1.0). + + \item[maxChanges:INT] Specifies that drifting graph changes are limited to the + value as specified by the integer (default 15). + + \end{description} + + \item[ia] Specifies that Implied Alignment \citep{Wheeler2003} assignment are used for + branch swapping as opposed to full Direct Optimization for dynamic characters when the + graph type is \texttt{tree}. + + \item[inOrder] Contra \texttt{atRandom}, swap neighborhoods are always traversed + in the same undefined, but consistent order. + + \item[joinAll] Turns off all preference strategies to make a join, simply trying all possible + join positions for each pair of clades generated after a break. + +% \item[joinPruned] Only internal---used for testing purposes. + +% \item[joinAlternate] Only internal---used for testing purposes. + + \item[keep:INT] Specifies the number of equally costly graphs to be retained, as + determined by the integer value. + + \item[nni] Specifies that NNI refinement \citep{CaminandSokal1965, Robinson1971} + is performed. + + \item[replicates:[INT]] Sets the number of randomized swap replicate trajectories. + Default 1. + +% \item[returnMutated] Only internal---used for testing purposes. + + \item[spr:[INT]] Specifies that SPR refinement \citep{Dayhoff1969} is performed. If an + integer value is specified, the re-addition of pruned graphs will be within $INT$ + edges of its original placement. + + \item[steepest] Specifies that refinement follows a greedy path, abandoning the + neighborhood of the current graph when a better (lower cost) graph is found. + + \item[tbr:[INT]] Specifies that TBR refinement \citep{Farris1988, swofford1990a} + is performed. If an integer value is specified, the re-addition of pruned graphs + will be within $2 * INT$ edges of its original placement. + + \end{description} + + \subsubsection{Defaults} + \texttt{swap(alternate, keep:10, steepest)} + + \begin{example} + + \item{\texttt{swap(tbr, all, keep:10)}\\Performs tbr branch swapping on each current graph + returning up to 10 best graphs after examining all graphs in + the rearrangement neighborhood.} + + \item{\texttt{swap(annealing:10, steps:10)}\\Performs alternating rounds of \texttt{spr} and + \texttt{tbr} branch swapping (the default) on each current graph, specifying 10 rounds of + simulated annealing, with 10 temperature steps being performed for each + simulated annealing round (the default number of steps).} + + \end{example} + +%-------------------------------------------------------------------------------------------------------------------------------- +%transform +%-------------------------------------------------------------------------------------------------------------------------------- +\subsection{Transform} +\label{subsec:transform} + \subsubsection{Syntax} + \texttt{transform(argument list)} + + \begin{phygdescription} + {Transforms the properties of the imported characters, graphs, or general values + from one value or type to another. It + modifies the global settings during program execution (as opposed to \texttt{set}, + which operates at the start of the analysis). This includes changes of graph type + (e.g. tree to softwired Network) and data types (e.g. dynamic to static + approximation), among other operations.} + \end{phygdescription} + + \subsubsection{Arguments} + \begin{description} + +% \item[compressResolutions:BOOL] Not implemented yet. +% Determines whether softwired graph resolutions are ``compressed'' if multiple vertex +% assignments in alternate display trees are equal in subtree leaf set, only the first lowest +% cost resolution is retained. This option can significantly reduce the time to evaluate +% softwired graphs, but can increase the optimality score of the graph. Can be used in +% combination with \texttt{transform(compressResolutions:True)} or +% \texttt{transform(softwiredMethod:Exhaustive)} at a later stage to improve graph score. + + \item[displayTrees:[INT]] Specifies the number output of display trees for each graph. + When \texttt{toTree} is specified. The number of returned display trees is limited by this + integer value, or by the default value (10). Used in concert with \texttt{toTree}. + + \begin{description} + + \item[atRandom] Returns the number of display trees as specified by the integer in + \texttt{displayTrees}, where trees are produced by + resolving network nodes uniformly at random. Compare with the \texttt{first} option, + which takes the `first' number of display trees resolved in an arbitrary, but consistent, + order. + + \item[first:INT] Specifies that the first number of displays tree resolutions, as + specified by the integer value, are chosen for each input graph. + + \end{description} + + \item[dynamic] Reverts data type to the default `dynamic' for all dynamic homology + \citep{Wheeler2001} character types (e.g. DNA sequences). After this command, + graph optimization proceeds in the default manner with sequence characters treated + in their non-aligned (`dynamic') condition. + + \item[dynamicEpsilon] Sets the level at which heuristic graph costs are verified by full + (and time consuming) traversal. Candidate graphs with costs within \texttt{dynamicEpsilon} + of the current best graph are verified. + + \item[graphFactor] Sets the network penalty for a softwired network. When conducting + a network analysis, a penalty can be ascribed, so that ‘softwired’ phylogenetic networks + can compete equally with phylogenetic trees on a parsimony optimality basis. A network + penalty takes into account the change in cost as edges are added to the graph. Hardwired + graphs are automatically set to \texttt{NoPenalty}. + + \begin{description} + + \item[nopenalty] No penalty is ascribed. + + \item[w15] Employs the parsimony network penalty of \cite{Wheeler2015}. + + \item[w23] Sets the parsimony network penalty of \cite{WheelerandWashburn2023}, + which is more severe + but has a lower time complexity than W15. + + \end{description} + + \item[graphSteepest[:INT]] Changes the number or graphs simultaneously evaluated + in a variety of procedures (e.g. swap, fuse, netmove) from the default (the larger of the + number of parallel threads and INT). This option can be used to exploit or limit use + of parallel thread numbers. Higher numbers will consume more memory and lower, less. + The default value is 10. + +% \item[jointhreshold] Only internal---used for testing purposes. + + \item[multiTraverse:BOOL] Controls the multi-traverse character optimization option. + If \texttt{True}, character trees are traversed from each edge as in + \citep{VaronandWheeler2012,VaronandWheeler2013, POY4, POY5}, but individually + for each dynamic character. This is the default and yields better (lower) optimality + scores at the cost of added execution time. This is the default. + If \texttt{False}, only outgroup-based traversals are performed as in + \citep{Wheeler1996, POY2, POY3}. + +% \item[name] Not implemented yet. + + \item[outgroup:STRING] Specifies the terminal to root the output trees. + This name must appear in double quotes. If the outgroup is not set, the + default outgroup is the taxon whose name is lexically first after any renaming + of taxa, and/or if taxa were specified by using the arguments \texttt{include} + or \texttt{exclude}. Only a single taxon can be set as the outgroup of the analysis. + + \item[softwiredMethod:] Sets the algorithm for softwired graphs to the + `Naive' method of diagnosing all display trees, as in \cite{Wheeler2015} or + the `Resolution Cache' method of \cite{WheelerandWashburn2023}. + + \begin{description} + + \item[Naive] Determines the score of a softwired network by evaluating all + display trees (up to $2^m$ for $m$ network nodes) and summing the costs + of the minimum cost display tree for each block of data. This can be extremely + time consuming for graphs with large number of network nodes. + + \item[ResolutionCache] Uses the method of \citep{WheelerandWashburn2023} + to determine softwired network costs in greatly reduced time. This is the default. + + \end{description} + + \item[staticApprox] Converts non-aligned (`dynamic') sequence characters to their + implied alignment \citep{Wheeler2003, WashburnandWheeler2020} condition. + + \item[toHardwired] Converts exiting graphs to hardwired network graphs and graphfactor + to `NoNetPenalty'. + + \item[toSoftwired] Converts exiting graphs to softwired network graphs. + + \item[toTree] Converts exiting graphs to trees. For both Softwired and Hardwired graphs + this proceeds via graph resolution of network nodes into ``display'' trees. Since there are up to + $2^n$ display trees for a graph with $n$ network nodes, this number can be quite large. + The number of display trees produced for each graph is controlled via the options + \texttt{displayTrees:n}, \texttt{atRandom}, and \texttt{first}. + +% \item[type] Not implemented yet. + + \item[usenetaddheuristic:Bool] Employ a heuristic cost procedure for network addition + that has lower time complexity, but is approximate as opposed to evaluating each network + solution via full graph traversal. + +% \item[weight] Not implemented yet. + + \end{description} + + \subsubsection{Defaults} + None. + + \begin{example} + + \item{\texttt{transform(toSoftwired)}\\Converts each current graph to a softwired network graph.} + + \item{\texttt{transform(staticApprox)}\\Changes data to all static characters via Implied Alignment + for further analysis.} + + \end{example} + diff --git a/doc/PhyGraphUserManual/Rename_file.jpg b/doc/PhyG-User-Manual/Rename_file.jpg similarity index 100% rename from doc/PhyGraphUserManual/Rename_file.jpg rename to doc/PhyG-User-Manual/Rename_file.jpg diff --git a/doc/PhyG-User-Manual/crossrefs1.png b/doc/PhyG-User-Manual/crossrefs1.png new file mode 100644 index 000000000..1836a172f Binary files /dev/null and b/doc/PhyG-User-Manual/crossrefs1.png differ diff --git a/doc/PhyG-User-Manual/enewick.png b/doc/PhyG-User-Manual/enewick.png new file mode 100644 index 000000000..38d1703db Binary files /dev/null and b/doc/PhyG-User-Manual/enewick.png differ diff --git a/doc/PhyG-User-Manual/fasta.png b/doc/PhyG-User-Manual/fasta.png new file mode 100644 index 000000000..1a39609ed Binary files /dev/null and b/doc/PhyG-User-Manual/fasta.png differ diff --git a/doc/PhyG-User-Manual/fastc.png b/doc/PhyG-User-Manual/fastc.png new file mode 100644 index 000000000..d2dc925a3 Binary files /dev/null and b/doc/PhyG-User-Manual/fastc.png differ diff --git a/doc/PhyGraphUserManual/PhyG_Allcommands.tex b/doc/PhyGraphUserManual/PhyG_Allcommands.tex deleted file mode 100644 index 88240a0fe..000000000 --- a/doc/PhyGraphUserManual/PhyG_Allcommands.tex +++ /dev/null @@ -1,1064 +0,0 @@ -%--------------------------------------------- -%build -%--------------------------------------------- -\subsection{Build} - \subsubsection{Syntax} - \texttt{build(arg0, arg1:option, ...)}\\ - %\texttt{build([argument list])} \hl{is this a simpler way to write it?} - - \begin{phygdescription} - {Builds initial graphs. The arguments of \texttt{build} specify the number of graphs - to be generated, and whether the build is based on \textit{distance} or \textit{character} - methods. Most options, with the exception of \texttt{rdWag}, an $O(n^3)$ distance based - option, are $O(n^2)$. Distance methods are considerably faster (lower constant - factor), but approximate in terms of character-based methods. Refinement, in the form - of branch swapping (\texttt{none}, \texttt{otu}, \texttt{spr}, and \texttt{tbr}) can be - specified within the command for distance builds. Refinement for character-based - Wagner builds occurs after the \texttt{build} process through \texttt{swap} and other - refinement commands. Given the large time burden, distance refinement is usually - not time effective \citep{Wheeler2021}. \phyg does not replace trees previously - stored in memory.} - \end{phygdescription} - - \subsubsection{Arguments} - - \begin{description} - - \item[block] Performs independent builds for each ``block'' of data. If this option - is not specified, the builds are performed combining all the data. Builds are performed - according to the other options (e.g. \texttt{character, distance}). The resulting tree - or \texttt{graph} is reconciled using the \texttt{eun} or \texttt{cun} commands. The - reconciled graph is resolved into display trees via the \texttt{displayTrees}, \texttt{first}, - and \texttt{atRandom} options. This option is especially useful for soft-wired network search. - Associated arguments of \texttt{block} include: - - \begin{description} - \item[atRandom] When the option \texttt{block} is specified, this variable returns $n$ - display trees as specified by \texttt{displayTrees[:n]}, where trees are produced by - resolving network nodes uniformly at random. Compare with the \texttt{first} option, - which takes the ``first'' $n$ display trees resolved in arbitrary, but consistent, order. - - \item[cun] Reconciles \textbf {block} trees into a Cluster-Union-Network \citep{Baroni2005} - before resolution into display trees via the \texttt{displayTrees} or \texttt{atRandom} - options. - - \item[displayTrees[:n]] When the option \texttt{block} is specified, this variable - returns $n$ display trees specified by this optional argument. If the number of - display trees is not specified, up to $2^{63}$ may be returned. - - \item[eun] Reconciles block trees into a Edge-Union-Network \citep{MiyagiandWheeler2019, - Wheeler2022} before resolution into display trees via the \texttt{displayTrees} or - \texttt{atRandom} - options. - - \item[first] When the option \texttt{block} is specified, this variable specifies to - choose the first returns $n$ $n$ display trees resolved in arbitrary, but consistent, - order. Compare with \texttt{atRandom}. - - \item[graph] When the option \texttt{block} is specified, this variable returns the - reconciled graph as specified by \texttt{eun} or \texttt{cun}. The graph may be - altered to ensure that it is a ``phylogenetic'' graph sensu \cite{Moretetal2005}. - \end{description} - - \item [character] Performs random addition sequence Wagner \citep{Farris1970} builds - ($O(n^2)$) for tree construction. If the graphtype is specified as softwired or hardwired - the resulting trees are rediagnosed as soft-wired graphs. This is - the default method for tree construction. - - \item [distance] Causes a pairwise distance matrix to be calculated ($O(n^2)$) and used - as a basis for distance tree construction. Specifies how the builds are refined (\texttt{none}, - \texttt{otu}, \texttt{spr}, \texttt{tbr}), as well as how the tree is constructed (\texttt{dWag}, - \texttt{nj}, \texttt{rdWag}, \texttt{wpgma}). Associated arguments of \texttt{distance} include: - - \begin{description} - \item[best:n] Applies only to \texttt{rdWag}. Specifies the number of trees retained - after \texttt{rdWag} builds, selecting the best trees in terms of distance cost. The - options can be used to reduce the number of trees retained for refinement or returned - for further analysis. - - \item[dWag] Performs distance Wagner build as in \citep{Farris1972} choosing the - `best' taxon to - add at each step, yielding a single tree. This method has a time complexity of $O(n^3)$. - - \item[nj] Performs Neighbor-Joining distance build \citep{Saitou1987}, yielding a single - tree. This method has a time complexity of $O(n^3)$. - - \item[none] No refinement (\texttt{otu}, \texttt{spr}, \texttt{tbr})) is performed after - distance builds. \texttt{none} is the default refinement method. - - \item[otu] Specifies that \texttt{otu} refinement \citep{Wheeler2021} is performed - after distance - builds. - - \item[rdWag] Performs randoms addition sequence distance Wagner builds, - yielding multiple trees determined by the argument \texttt{replicates:n}. This - method has a time complexity of $O(m \times n^2)$. - - \item[spr] Specifies that \texttt{spr} refinement \citep{Dayhoff1969} is performed - after distance builds. - - \item[tbr] Specifies that \texttt{tbr} refinement \citep{Farris1988, swofford1990a} - is performed after distance builds. - - \item[wpgma] Performs Weighted Pair Group Method with Arithmetic Mean - distance build \citep{SokalandMichener1958}, yielding a single tree. This method - has a time complexity of $O(n^2)$. - \end{description} - - \item [replicates:n] Applies to \texttt{rdWag} and \texttt{character}. Specifies the number of - random addition sequences performed. - - \end{description} - - \subsubsection{Defaults} - \texttt{build(character, replicates:10)} - - \begin{example} - - \item{\texttt{build(replicates:100)} \\ - Builds 100 random addition sequence character Wagner builds.} - - \item{\texttt{build(character, block, graph, cun, displaytrees:5, atrandom)}\\ - Builds 10 (the default) random addition sequence character Wagner builds, for each - block of data. The graph is reconciled into a Cluster-Union-Network, before resolution - into the 5 display trees. The trees are produced by resolving the network nodes - uniformly atRandom.} - - \item{\texttt{build(distance, rdWag, nj, wpgma)} \\ - Builds a single `best' addition sequence distance Wagner build, a Neighbor-Joining tree, - and a WPGMA tree. As the option \texttt{block} is not specified, the distance trees are - built using all the data.} - - \item{\texttt{build(distance, dWag, replicates:1000, best:10)}\\ - Builds 1000 distance Wagner builds and returns 10 of the lowest cost distance trees.} - - \item{\texttt{build(distance, rdwag, block, eun, displaytrees:3)}\\ - Builds 10 random addition sequence Wagner builds for each `block' of data. The graph - is reconciled into a Edge-Union-Network, before resolution into the 3 display trees. - The trees are produced by resolving the network nodes.} - - \item{\texttt{build(distance, block, rdWag, replicates:100, wpgma, best:10, otu)}\\ - Builds 100 random addition sequence distance Wagner builds, a WPGMA tree, - performs OTU swapping on the WPGMA and 10 of the lowest cost random addition - sequence Wagner trees. This distance search is performed on the `blocks' of data, - as opposed to all of the data.} - - \end{example} - -%--------------------------------------------- -%fuse -%--------------------------------------------- -\subsection{Fuse} - \subsubsection{Syntax} - \texttt{fuse(option, option, ...)} - - \begin{phygdescription} - {Performs Tree Fusing \citep{goloboff1999, moilanen1999, moilanen2001}. \texttt{fuse} - operates on a collection of graphs performing reciprocal graph recombination between - pairs of graphs. Non-identical subgraphs with identical leaf sets are exchanged between - graphs and the results evaluated. This process of exchange and evaluation continues - until no new graphs are found. - This can be used in concert with other options to perform a Genetic Algorithm refinement - \citep{Holland1975}. The behavior of \texttt{fuse} can be modified by the use of options - specifying SPR and TBR-like rearrangement of the combination process.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - %\item[all] \hl{details} - - %\item[atRandom] \hl{details} - - \item [best] Specifies the method for tree selection, which in this case returns the best graphs - found during fuse operations. - - \item [keep:n] Limits the number of returned graphs to $n$. - - \item [nni] Causes the exchanged subgraphs to be tested at their initial positions as well as the - two adjacent edges. - - \item [once] Performs a single round of fusing on input graphs and returns the resulting graphs. - Alternatively (and by default) fusing continues recursively until no new graphs are found. - - %\item[pairs[:n]] \hl{details} - - \item [spr[:n]] Causes the exchanged subgraphs to be tried at multiple positions (up to optional - $n$ edges aware from their initial positions). - - %\item[steepest] \hl{details} - - \item [tbr[:n]] Causes the exchanged subgraphs to be tried at multiple positions (up to optional - $n$ edges aware from their initial positions) and with TBR-style rerooting of the exchanged - components. - - \item [unique] Specifies the method for tree selection, which in this case returns all unique - graphs found during fuse operations. - \end{description} - - \subsubsection{Defaults} - \texttt{fuse()} Keeps all the best graphs found, continues fusing until no new graphs are found. - No branch swapping style rearrangements are performed. - - \begin{example} - - \item{\texttt{fuse(best, once)}\\Fuses input graphs and returns best graphs after a single round of - fusing.} - - \item{\texttt{fuse(tbr, keep:10)} \\Fuses input graphs and preforms TBR-style replacement and - rerooting of pruned components returning up to 10 best cost graphs.} - - \end{example} - -%--------------------------------------------- -%read -%--------------------------------------------- -\subsection{Read} - \label{subsec:Read} - \subsubsection{Syntax} - \texttt{read(option:"filename", option:"filename", ...)} - - \begin{phygdescription} - {Imports file-based information, including data files, tree and graph files. \texttt{read} - commands must contain an input file. Supported data formats include FASTA, FASTC - and TNT files, and graph formats include Dot, Enewick, Fenewick, and Newick. - Filenames must be included in quotes and, if multiple filenames are specified, - separated by commas. Filenames must include the appropriate suffix (e.g. .fas, - .ss, .mat). The exclusion of these suffixes will result in an error. The filename must - match exactly, including capitalization. \phyg will attempt to recognize the type of input - and parse appropriately. Otherwise, the type of file can be indicated, using one of the - options below. The options prepend the filename with a colon (`:') and will modify the - processing of the input file. Prepending the file type prevents any ambiguity when the - file is parsedThis command can also use wildcard expressions (`*', `?'), - which can be useful when reading in multiple files of the same type.} - \end{phygdescription} - - \subsubsection{Arguments} - - \begin{description} - \item [aminoacid:] Specifies that the file contents are parsed as IUPAC coded amino - acid data in fasta \citep{PearsonandLipman1988} format. - - \item [block:] Specifies that the file contains block %\hl{ref?} - information. Each line contains - the new block name followed by names of input files to be assigned to that data block. - Blocks are initially named as the input file name with ``:0'' appended. In the examples, - data from files ``b'' and ``c'' will be assigned to block ``a''. There can be no spaces in - file or block names. - - \begin{quote} - \texttt{"a" "b:0" "c:0"} - \end{quote} - - \item [dot:] Specifies that the file contains a graph in `dot' format for use with graph - rendering software such as \href{https://en.wikipedia.org/wiki/Graphviz}{GraphViz}. - - \item [enewick:] Specifies that the file contains Enhanced Newick format graph(s) as - specified here \citep{Cardonaetal2008}. - - \item [exclude:] Specifies that the file contains the names of terminal taxa to be - excluded from an analysis. Taxa appear in the form of a list, with a single taxon per - line. Thus, taxa not included in the list and present in input files, will be included in - analysis. Compare with \texttt{include}. - - \item [fasta:] Ensures that file contents are parsed in fasta \citep{PearsonandLipman1988} - format. This is used for single character sequences such as binary streams, IUPAC - nucleotide and amino acid sequence data. - - \item [fastc:] Ensures that file contents are parsed in fastc \citep{WheelerandWashburn2019} - format. This is used for multi-character sequences such as gene synteny, developmental, - or linguistic data. - - \item [fenewick:] Specifies that the file contains Forest Enhanced Newick format graph(s) - specified \href{https://www.github.com/wardwheeler/euncon}{here} \citep{Wheeler2022}. - - \item [include:] Specifies the names of terminal taxa to be included in the analysis. - Taxa appear in the form of a list, with a single taxon per line. It is possible to specify - terminals that have no data. This may be done in order to diagnose a large tree on - partial data. If there are no data for a leaf taxon, a warning will be printed to \texttt{stderr}. - Taxa not include in this list, but present in the inputted data files, will be excluded from - the analysis. Compare with \texttt{exclude}. - - \item [newick:] Specifies that the file contains Newick format graph(s) as specified - \href{https://evolution.genetics.washington.edu/phylip/newick_doc.html}{here}. - - \item [nucleotide:] Ensures that file contents are parsed as IUPAC coded nucleotide data - in fasta \citep{PearsonandLipman1988} format. - - \item [prefasta:] Specifies that the sequences are prealigned in a fasta format, leaving - gap characters (``-'') in the sequences and alignment correspondences are not re-examined. - This option exists to ensure proper parsing (and in case auto-format detection is incorrect). - Prealigned fasta files \textbf{must} be of the same length. - - \item [prefastc:] Specifies that the sequences are prealigned in a fastc format, leaving gap - characters (``-'') in the sequences and alignment correspondences are not re-examined. - This option exists to ensure proper parsing (and in case auto-format detection is incorrect). - Prealigned fastc files \textbf{must} be of the same length. - - \item [rename:] Replaces the name(s) of specified terminals in the file. This command allows - for substituting taxon names and can help merge multiple datasets without modifying the - original data file. The file contains a series of lines, each of which contains at least two strings. - The first string (input taxon name) will replace the second and all subsequent strings (taxon - names) on that line. In the example given in Figure \ref{renamefile} the taxon Hydrus\_granulatus - will be renamed as Acrochordus\_granulatus, the taxa Gloydius\_boehmei and Gloydius\_mogoi - will be renamed as Gloydius\_halys and the taxa Crotalus\_mutus, Scytale\_catenatus and - Coluber\_crotalinus will be renamed as Lachesis\_muta. - - \begin{figure} - \centering - \includegraphics[width=0.8\textwidth]{Rename_file.jpg} - \caption{Renaming text file containing the lists of terminal taxa to be renamed.} - \label{renamefile} - \end{figure} - - The \texttt{rename} function can also be specified as a command, see \texttt{rename} - (Section \ref{subsec:Rename}). - - \item [tcm:] This refers to a file containing a custom-alphabet matrix that specifies varying - costs among alphabet elements in a sequence. The elements in the alphabet can be letters, - digits, or both. \\ - The \texttt{tcm} contains two parts: the first line of the file contains the alphabet elements - separated by a space and the transformation cost matrix, which follows below. The dash - character representing an insertion/deletion or indel character is not specified on the first - line of the file, but added to the alphabet automatically. The second part is the \texttt{tcm}, - which is a square matrix with $n + 1$ elements ($n$ is the size of the alphabet). - The positions from left to right and top to bottom in this matrix correspond to the sequence - of the elements as they are listed in the alphabet. An extra rightmost column and lowermost - row correspond to indel (gap) costs to and from alphabet elements. At present, this matrix - must be symmetrical, but not necessarily metric. Non-metric tcm's can yield unexpected - results. Transformation costs must be integers. If real values are desired, a character can - be weighted with a floating point value factor. \\ - For a sequence with four elements alpha, beta, gamma and delta and an indel cost of 4 - for all insertion deletion transformations, a valid custom alphabet file is provided below: - \\ - \begin{equation*} - %\nolabel - \begin{array}{lllll} - alpha & beta & gamma & delta & \\ - 0 & 2 & 1 & 2 & 5 \\ - 2 & 0 & 2 & 1 & 5 \\ - 1 & 2 & 0 & 2 & 5 \\ - 2 & 1 & 2 & 0 & 5 \\ - 5 & 5 & 5 & 5 & 0 - \end{array} - \end{equation*} - \\ - In this example, the cost of transformation of \texttt{alpha} into \texttt{beta} is \texttt{2}, - and cost of a deletion or insertion of any of the four elements costs \texttt{5}. - - \item [tnt:] Ensures that file contents are parsed in TNT \citep{Goloboffetal2008} format. - Not all TNT data commands are currently supported. To ensure that the file is correctly - parsed, the file must begin with \texttt{xread}, followed by an optional comment in single - quotes ('this is a comment'), followed by the number of characters and taxa. The data - follow on a new line. Taxon names are followed by state data. Data may be in multiple - blocks (interleaved) or in sequential format. These interleaved blocks may consist of a - series of single character states without spaces between them, or multiple (or single) - character states (e.g. \texttt{alpha}) with space between the individual codings. Blocks - must be of all one type (ie. single character codings without spaces, or multi-character - separated by spaces). The data block \textit{must} be followed by a single semicolon - (``;'') on its own line.\\ - - The character settings (i.e. \texttt{ccode} commands) follow the data block, beginning - on a new line. These character settings always terminate with a semi-colon (\texttt{;}). - These settings include: activate (\texttt{[}) or deactivate (\texttt{]}); make additive/ordered - (\texttt{+}) or non-additive/unordered (\texttt{-}); apply step-matrix costs (\texttt{(}) with - scopes (e.g. \texttt{cc + 10 12;} and \texttt{cc (.; costs 0 = 0/1 1 0/2 2 0/3 3 0/4 4 1/2 1 - 1/3 2 1/4 3 2/3 1 2/4 2 3/4 1;}) including abbreviated scopes (\texttt{cc -.;}). %\hl{what about ) make additive or non-additive again?} - There may - be multiple character setting statements in a single line. Character settings must be - followed by \texttt{proc/;} on its own line. \texttt{PhyG} will not process - any file contents that follow \texttt{proc/;}. - - Additive/ordered character states must be numbers (integer or floating point). Ranges - for continuous characters are specified with a dash within square brackets (e.g. - \texttt{[1-2.1]}). Character state polymorphism are specified in square brackets without - spaces for single character states (e.g. \texttt{[03]}), and with spaces for multi-character - states. %(e.g. \hl{\texttt{[???}}). - - Dashes in multi-character states (e.g. \texttt{Blue-ish}) - are treated as part of the character state specification. If the user wishes that dashes - be treated as missing data (`?'), the file must be edited to reflect this by replacing the - dashes that are to be treated as missing data with question - marks (`?'). - - Example file: - \begin{quote} - \texttt{xread\\ - `An example TNT file' 8 5\\ - A 000\\ - B a14\\ - C b22\\ - D ?33\\ - E d[01]4\\} - - \texttt{A Blue-ish -\\ - B Green-ish OneFish\\ - C Rather-Red TwoFish\\ - D Almost-Cyan RedFish\\ - E Orange-definitely BlueFish\\} - - \texttt{A 5.2 - ?\\ - B 5.3 0.3 1.1\\ - C 3.2 0.1 1.1\\ - D 5.2 1.1 0.1\\ - E 5.1 1.1 0.1\\ - ;\\ - cc .;\\ - cc + 2;\\ - proc/;\\} - \end{quote} - \end{description} - - \subsubsection{Defaults} - \texttt{read("fileName")} reads data from fileName and attempts to recognize the - file type and parse accordingly. The assumed file type is printed to \texttt{stderr} for - verification. - - \begin{example} - - \item{\texttt{read(prefasta:"myDnaSequenceFile.fas")}\\ Reads sequence data from - ``myDnaSequenceFile.fas'' as prealigned data.} - - \item{\texttt{read(rename:"myRenameFile")}\\ Reads a list of taxa and names to be - assigned.} - - \end{example} - -%--------------------------------------------- -%reblock -%--------------------------------------------- -\subsection{Reblock} - \subsubsection{Syntax} - \texttt{reblock("newBlockName", "inputFile0", "inputFile1",...)} - - \begin{phygdescription} - {Assigns input data to ``blocks'' that will follow the same display tree when optimized - as ``soft-wired'' networks. By default, each input data file is assigned its own block with - the name of the input file. The \texttt{block} command is used to reassign these data to - new, combined blocks. Spaces are not allowed in block names and will produce - \texttt{unrecognized block name} errors.} - \end{phygdescription} - - \subsubsection{Arguments} - The first argument is the block to be created, the remainder are the input data to - be assigned to that block. Blocks are initially named as the input file name with - ``:0'' appended. Blocks are reported in \texttt{report(data)} command. - - \subsubsection{Defaults} - None. - - \begin{example} - - \item{\texttt{reblock("a","b\#0","c\#0")}\\ Assigns input data from file ``b'' and ``c'' - to block ``a''. } - - \end{example} - -%--------------------------------------------- -%rename -%--------------------------------------------- -\subsection{Rename} - \label{subsec:Rename} - \subsubsection{Syntax} - \texttt{rename("newName", "oldName1", "oldName2",...)} - - \begin{phygdescription} - {Replaces the name(s) of specified terminals. - The command consists of a terminal identifier followed by a comma and then by either - a string containing a pair (or pairs) of strings containing the names of items being renamed. - The first string (name) is assigned to taxa with the strings (names) that follow. This can be - useful when combining data from different sources, such as GenBank, or in revising names - to reflect taxonomic changes. Irrespective of where this command appears in the script file, - \phyg will execute this command prior to importing the data files. Compare with the - argument \texttt{rename} of the command \texttt{read} (Section \ref{subsec:Read}). - In the example below, the taxa ``b'' and ``c'' will be renamed to ``a''. - - \begin{quote} - rename("a","b","c") - \end{quote}} - \end{phygdescription} - - \subsubsection{Arguments} - Taxon names to assign and be assigned. - - \subsubsection{Defaults} - None. - - \begin{example} - - \item{\texttt{rename("a","b","c")}\\ Renames ``b'' and ``c'' to ``a''. } - - \end{example} - -%--------------------------------------------- -%refine -%--------------------------------------------- -\subsection{Refine} - \subsubsection{Syntax} - \texttt{refine(option, option,...)} - - \begin{phygdescription} - {Performs edit operations (addition, deletion, move) on network edges, and therefore - only applies to soft-wired and hard-wired graphs.} - \end{phygdescription} - - \subsubsection{Arguments} - - \begin{description} - \item[annealing[:n]] Specifies that $n$ (default 1) rounds of simulated annealing optimization - \citep{Metropolisetal1953, Kirkpatricketal1983, Cerny1985} are performed in concert with - \texttt{netAdd}, \texttt{netDel}, and \texttt{netMove}. The acceptance of candidate graphs is - determined by the probability $e ^ {- (c_c - c_b)/ (c_b * (k_{max} -k)/ k_{max})}$, where $c_c$ - is the cost of the candidate graph, $c_b$ is the cost of the current best graph, $k$ is the step - number, and $k_{max}$ is the maximum number of steps (set by the \texttt{steps:m}, default 10) - option. - - \begin{description} - - \item[netadd] Adds network edges to existing input graphs at all possible positions until no - better cost graph is found. - - \item[netdel] Deletes network edges from input graphs one at a time until no better cost - graph is found. - - \item[netmove] Moves existing network edges in input graphs one at a time to new positions - until no better cost graph - is found. - - \item[steps:n] Specifies that $n$ (default 10) temperature steps are performed during - simulated - annealing (as specified by the \texttt{annealing}) option. - - \end{description} - - - \item[drift[:n]] Specifies that $n$ (default 1) rounds of the ``drifting'' form of simulated annealing - \citep{goloboff1999} optimization are performed in concert with \texttt{netAdd}, \texttt{netDel}, - and \texttt{netMove}. The acceptance of candidate graphs is determined by the probability - $1/ (wf + c_c - c_b)$, where $c_c$ is the cost of the candidate graph, $c_b$ is the cost of the - current best graph, and $wf$ is the \texttt{acceptWorse} (set by the \texttt{acceptWorse:m}, - default 1.0) option. Equal cost graphs are accepted with probability set by the \texttt{acceptEqual} - option. \texttt{Drift} differs from \texttt{annealing} in that there are no cooling steps to modify - acceptance probabilities. The maximum number of graph changes is set by \texttt{maxChanges} - - - \item[geneticAlgorithm or ga] Performs Genetic Algorithm \citep{Holland1975} refinement in - concert with the options \texttt{generations}, \texttt{popsize}, \texttt{severity}, and - \texttt{recombinations}. - - \begin{description} - - \item[generations:[n]] Specifies the number of generations (sequential iterations) for - \texttt{geneticAlgorithm}. The default is $n=10$. - - \item[popsize:[n]] Specifies the population size for \texttt{geneticAlgorithm}. The default is - $n=20$. - - \item[recombinations:[n]] Specifies the number of recombination (fusing) events for - \texttt{geneticAlgorithm}. The default is $n=100$. - - \item[severity:[n]] Specifies the severity of selection against sub-optimal graph solutions - events for \texttt{geneticAlgorithm}. The higher the value, the less severe the penalty. The - default is $n=1.0$. - - \end{description} - - \item[keep:n] Limits the number of returned graphs to $n$. - - \end{description} - - %add defaults and examples -%--------------------------------------------- -%report -%--------------------------------------------- -\subsection{Report} - \subsubsection{Syntax} - \texttt{report("filename", arg0, arg1,...)} - - \begin{phygdescription} - {Outputs the results of the current analysis to a file or to \texttt{stderr}. To redirect the - output to a file, the file name (in quotes), followed by a comma, must be included in - the argument list of report. All arguments for \texttt{report} are optional. This command - allows the user to output information concerning the characters and terminals, - diagnosis, export static homology data, implied alignments, trees, graphs, dot files, - as well as other miscellaneous arguments. By default, new information printed to - a file if appended to the file. The option \texttt{overwrite} overrides the default and - rewrites the file. Many of the report options can be output in csv format, which can - subsequently be imported into spreadsheet programs.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - - \item[crossrefs] Reports a table with terminals represented in rows, and the data files in - columns. A plus sign (``+'') indicates that data for a given terminal is present in the - corresponding file; a minus sign (``--'') indicates that it is absent. It is highly recommended - that the user use this report option to examine the data, having imported them into \phyg. - This argument is a useful tool for visual representation of missing data, as well as highlight - inconsistencies in the spelling of taxon names in different data files. The reported file is - in csv format. - - \item[data] Outputs a summary of the input data. More specifically, \phyg will report - aspects of the input data. - %\hl{"Index","Block","Name","Type","Activity","Weight","Prealigned","Alphabet","TCM"} - - \item[diagnosis] Outputs graph diagnosis information such as vertex and states and edge - statistics in csv format. - - \item[displaytrees] Outputs graph information for soft-wired networks. The ``display'' trees - are output for each data block. - - \item[graph] Outputs a graph in format specified by the other arguments in the command. - These are \texttt{dot} for - GraphViz graph format, \texttt{dotpdf} for pdf (or ps for OSX), \texttt{newick} for Newick, - ENewick, or ForestEnewick depending on the graph type, \texttt{ascii} for an ascii rendering. - In order to output pdf files (via \texttt{dotpdf}) ``dot'' must be installed from - \url{https://graphviz.org/download/}. PhyG will not error, but output a ``dot'' file that - can be processed later. - - \item[pairdist] Outputs a taxon pairwise distance matrix in csv format. - - \item[reconcile] Outputs a single ``reconciled'' graph from all graphs in memory. The - methods include consensus, supertree, and other supergraph methods as described in - \cite{Wheeler2012, Wheeler2022}. When \texttt{reconcile} is specified as a command - option a series of other options may be specified to tailor the desired outputs: - \begin{description} - \item {Method:eun$\mid$cun$\mid$majority$\mid$strict$\mid$Adams\\Default:eun\\ - This commands specifies the type of output graph. EUN is the Edge-Union-Network - \citep{MiyagiandWheeler2019}, CUN the Cluster Union Network \citep{Baroni2005}, - majority (with fraction specified by `threshold') specifies that a values between 0 and - 100 of either vertices or edges will be retained. If all inputs are trees with the same leaf - set this will be the Majority-Rule Consensus \citep{MargushandMcMorris1981}. - Strict requires all vertices be present to be included in the final graph. If all inputs are - trees with the same leaf set this will be the Strict Consensus \citep{Schuhandpolhemus1980}. - Adams denotes the Adams II consensus \citep{Adams1972}.} - - \item{Compare:Combinable$\mid$identity\\Default:combinable\\Species how group - comparisons are to be made. Either by identical match [(A, (B,C))$\neq$(A,B,C)], - combinable sensu \cite{Nelson1979} [(A, (B,C)) consistent with (A,B,C)]. This option - can be used to specify ``semi-strict'' consensus \citep{Bremer1990}.} - - \item{Threshold:(0-100)\\Default:0\\Threshold must be an integer between 0 and 100 - and specifies the frequency of vertex or edge occurrence in input graphs to be included - in the output graph. Affects the behavior of `eun' and` majority.'} - - \item{Connect:True$\mid$False\\Default:True\\Specifies the output graph be connected - (single component), potentially creating a root node and new edges labeled with ``0.0''.} - - \item{EdgeLabel:True$\mid$False\\Default:True\\Specifies the output graph have edges - labeled with their frequency in input graphs.} - - \item{VertexLabel:True$\mid$False\\Default:False\\Specifies the output graph have vertices - labeled with their subtree leaf set.} - - \item{OutFormat:Dot$\mid$FENewick\\Default:Dot\\Specifies the output graph format - as either Graphviz `dot' or FEN.} - \end{description} - - \item[support] Outputs support graphs. Resampling graphs are independent of the - input graphs while Goodman-Bremer are based on current graphs. Multiple formats - can be output via additional options including \texttt{dot} for GraphViz graph format, - \texttt{dotpdf} for pdf (or ps for OSX), \texttt{newick} for Newick, ENewick, or - ForestEnewick depending on the graph type, \texttt{ascii} for an ascii rendering. - In order to output pdf files (via \texttt{dotpdf}) ``dot'' must be installed from - \url{https://graphviz.org/download/}. \phyg will not error, but output a ``dot'' file that - can be processed later. - - \item[search] Outputs search statistics in csv format. - - \end{description} - - \subsubsection{Defaults} - \texttt{report()} prints input data and output graph diagnosis to stderr. Default graph - representation is \texttt{dot}. - - \begin{example} - \item{\texttt{report("outFile", newick, overwrite)}\\ Outputs graphs in newick format to - ``outFile'', overwriting any existing information.} - - \item{\texttt{report("outFile", crossrefs)}\\ Outputs presence/absence for taxa in input files. - A `+' is output if taxa are present in an input data file, and `-' if. File is in csv format. This - can be useful in checking for missing sequence or other data and expected renaming.} - - \item{\texttt{report("outFile", dot, reconcile, method:eun, threshold:51)}\\ Outputs reconciled - graph using the Edge-Union-Network method with a minimum edge frequency of 51\% in - dot format to ``outFile'', appending to any existing information in ``outFile''.} - \end{example} - -%--------------------------------------------- -%run -%--------------------------------------------- -\subsection{Run} - \subsubsection{Syntax} - \texttt{run("filename")} - - \begin{phygdescription} - {Used to execute a \phyg script file containing commands. The script filename must be - included in quotes. - Executing scripts using \texttt{run} can be useful to specify common actions such as file inputs - and graph construction. } - \end{phygdescription} - - \subsubsection{Arguments} - The only argument is the filename containing commands to be executed. - - \subsubsection{Defaults} - There are no default settings of \texttt{run}. - - \begin{example} - \item{\texttt{run("readFiles.pg")}\\ Executes "readFiles.pg", which may contain multiple input - files to be \texttt{read}} - - \item{\texttt{run("searchCommands.pg")}\\ Executes "searchCommands.pg", which may - contain commands defining a common search strategy (e.g. \texttt{build}).} - \end{example} - -%--------------------------------------------- -%search -%--------------------------------------------- -\subsection{Search} - \subsubsection{Syntax} - \texttt{search(arg0:option, arg1:option, ...)} - - \begin{phygdescription} - {This command implements a default search strategy, performing a timed randomized - series of graph optimization methods including building, swapping, recombination (fusing), - simulated annealing and drifting, network edge addition/deletion/moving, and Genetic - Algorithm. The parameters and order for this search are randomized. The arguments - specify the number of rounds of search (\texttt{instances}, default 1), and duration - (\texttt{days}, \texttt{hours}, \texttt{minutes}, and \texttt{seconds}; - default 30 seconds). Successive rounds of search (specified by \texttt{iterations}) gather - any solutions from previous sequential or parallel rounds as well as any input graphs. - Since search methods may vary in how long they take, individual iterations may take - longer that the specified duration. - - When performing a \texttt{search} it is important to set the amount of time, such that the - program has a reasonable amount of time to perform a search. Therefore, it is important - to have some idea as to the length of time it would take to do a single round of searching.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[keep:n] Keeps up to $n$ graphs. - - \item[days:n] Adds $n$ 24 hour days to search time. - - \item[hours:n] Adds $n$ hours to search time. - - %\item[instances:n] \hl{Performs $n$ successive rounds of search? How does instances - %differ from iterations?} - - \item[minutes:n] Adds $n$ minutes to search time. - - \item[seconds:n] Adds $n$ seconds to search time. - - \item[iterations:n] Specifies number of (potentially parallel) search iterations.\\ - - %\hl{would it not be simpler to do a max\_time:FLOAT:FLOAT:FLOAT, or min\_time - %FLOAT:FLOAT:FLOAT like we had in poy? How useful are seconds?} - \end{description} - - \subsubsection{Defaults} - \texttt{search()} Performs 1 iteration of 30 seconds keeping up to 10 graph}. - - \begin{example} - \item{\texttt{search(hours:10, iterations:2)}\\ Performs 2 search iterations (in parallel if the - program is executed in parallel) for 10 hours each.} - - \item{\texttt{search(hours:10, minutes:30)}\\ Performs a single search iteration for 10 - hours and 30 minutes.} - \end{example} - -%--------------------------------------------- -%select -%--------------------------------------------- -\subsection{Select} - \subsubsection{Syntax} - \texttt{set(arg0:option, arg1:option, ...)} - - \begin{phygdescription} - {Specifies the method and number of graphs to be saved at any point. When multiple - graphs are present, the \texttt{select} command will specify which of the graphs to keep - for further analysis or reporting.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[all] Keeps all graphs. - - \item[best:n] Selects and keeps the graphs with the best optimality value. - - \item[atRandom] Keeps graphs chosen at random. - - \item[unique:n] Keeps up to \texttt{n} unique graphs, which may not necessarily be the - best graphs. - \end{description} - - \subsubsection{Defaults} - \texttt{select()} Keeps all unique graphs of best optimality value. - - \begin{example} - \item{\texttt{select(random:10)}\\ Keeps up to 10 graphs, selected at random.} - - \item{\texttt{select(best:10)}\\ Keeps up to 10 graphs of best optimality value.} - \end{example} - -%--------------------------------------------- -%set -%--------------------------------------------- -\subsection{Set} - \subsubsection{Syntax} - \texttt{set(arg0:option, arg1:option, ...)} - - \begin{phygdescription} - {Changes the settings of \phyg. This command performs an array of functions - from specifying the seed of the random number generator, to selecting a terminal for - rooting output trees, to specifying graph type, final assignment, and optimality - criterion. All \texttt{set} commands are executed at the start of a run, irrespective of - where they appear in the script. The command \texttt{transform} is used to modify - global settings during a run (see \texttt{transform} Section \ref{subsec:Transform}).} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[compressResolutions:True|False] %\hl{change to BOOL} - Determines whether soft-wired graph - resolutions are ``compressed'' if multiple vertex assignments in alternate display - trees are equal. - - \item[criterion:parsimony|pmdl] Sets the optimality criterion for graph search to be - method. Currently, parsimony and PMDL \citep{WheelerandVaron2022} are supported. - - \item[finalAssignment:DirectOptimization|DO|ImpliedAlignment|IA] Sets the method - of determining the ``final'' sequence states. DirectOptimization (DO) uses soft-wire the DO - method to assign the final states, which is more time consuming than \texttt{ImpliedAlignment}. - DO has an additional factor of potentially $O(n^2)$ in sequence length compared - to the constant factor for IA due to additional graph traversals. - - \item[graphFactor:nopenalty|W15|PMDL] Sets the network penalty for a soft-wired network\\ - (\texttt{|W15}) or \texttt{PMDL} (for criterion = PMDL). W15 employs the - parsimony network penalty of \cite{Wheeler2015}. - - \item[graphType:tree|hardwired|softwired] Sets the phylogenetic graph type to tree, - hard-wired network, or soft-wired network. Forest are allowed by the network options. - - %\item[modelcomplexity] \hl{details} - - \item[outgroup:STRING] Specifies the terminal to root the output trees. - This name must appear in quotes. - - \item[partitioncharacter] Sets the character that is used to partition data. %\hl{The separator can be something else} - - \item[rootCost:noRootCost|W15|PMDL] Sets the root cost for a graph. W15 sets a - cost at $\frac{1}{2}$ the cost of `inserting' the root character assignments. - The W15 root cost is based on the same rationale as the parsimony network penalty of - \cite{Wheeler2015}. - - \item[seed:INTEGER] Sets the seed for the random number generator using the integer - value. If unspecified, \phyg uses the system time as the seed. - \end{description} - - \subsubsection{Defaults} - The default outgroup is the taxon whose name is - lexically first after renaming and taxon inclusion/exclusion. For this reason, it is best to specify - an outgroup explicitly. The default optimality criterion is \texttt{parsimony}, \texttt{CompressResolutions} - is set to \texttt{True}, \texttt{FinalAssignment} is set to \texttt{DirectOptimization}, and the default - graph type is \texttt{tree}. The default graphFactor is \texttt{W15} if parsimony is the optimality - criterion and \texttt{PMDL} if PMDL is set as the optimality criterion. The default rootCost - is \texttt{noRootCost} if parsimony is the optimality criterion and \texttt{PMDL} if PMDL is set - as the optimality criterion. - - \begin{example} - \item{\texttt{set(optimality:parsimony)}\\Sets the graph search optimality criterion to parsimony.} - - \item{\texttt{set(compressResolutions:False)}\\Turns off soft-wired graph resolution compression.} - \end{example} - -%--------------------------------------------- -%support -%--------------------------------------------- -\subsection{Support} - \subsubsection{Syntax} - \texttt{swap(arg0, arg1:option, ...)} - - \begin{phygdescription} - {Generates graphs supports via resampling \citep{Farrisetal1996} and Goodman-Bremer - \citep{Goodmanetal1982, bremer1994}. Currently, Jackknifing and Bootstrapping - are the resampling methods that are implemented.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[buildonly] Performs very rapid, but not extensive graph searches for each - resampling replicate. - - \item[bootstrap] Calculates Bootstrap support. The user can specify the - number of iterations, using \texttt{replicates:n} (see below). - When reported (via \texttt{report(support)}) edges are labeled with the bootstrap frequencies. - - \item[goodmanbremer|gb[:spr|tbr]] Specifies that Goodman-Bremer support is - calculated for input graphs. The method traverses the SPR or TBR neighborhood - as optionally specified (TBR as default) to determine an upper bound on the NP-hard - values (this is the method used in POY v2; \citealp{POY2} \textit{et seq.}). The number - of alternate graphs to be examined can be specified (\textit{ie.} limited) by \texttt{gbsample:[n]}. - When \texttt{gbsample:[n]} is specified, graphs are chosen uniformly at random. - - \item[jackknife:[n]] Specifies that Jackknife resampling is performed with $n$ acceptance - probability (default 0.6231 or $1 - e^{-1}$). When reported (via \texttt{report(support)}) - edges are labeled with the jackknife frequencies. - - \item[replicates:n] Specifies that $n$ resampling replicates (default 100) are performed - in resampling support for Jackknife and bootstrap methods. - \end{description} - \subsubsection{Defaults} - \texttt{support(goodmanBremer:TBR)} - - - \begin{example} - \item{\texttt{support(jackknife:0.50, replicates:1000)}\\Performs 1000 replicates of - delete 50\% jackknife resampling.} - - \item{\texttt{support(gb:SPR, gbSample:10000)}\\Produces Goodman-Bremer - support based on $10,000$ samples of the SPR neighborhood.} - \end{example} - -%--------------------------------------------- -%swap -%--------------------------------------------- -\subsection{Swap} - \subsubsection{Syntax} - \texttt{swap(arg0, arg1:option, ...)} - - \begin{phygdescription} - {Performs branch-swapping rearrangement on graphs. This command implements a - group of algorithms referred to as branch swapping, that proceed by clipping - parts of the given tree and attaching them in different positions. These algorithms - include ``NNI'' \citep{CaminandSokal1965, Robinson1971}, ``SPR'' \citep{Dayhoff1969}, - and ``TBR'' \citep{Farris1988, swofford1990a} refinement.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[all] Turns off all preference strategies to make a join, simply trying all possible - join positions for each pair of clades generated after a break, in a randomized order. - The refinement examines the entire rearrangement neighborhood of the current graph - before retaining the best (lowest cost) solutions. - - \item[annealing[:n]] Specifies that $n$ rounds of simulated annealing \citep{Metropolisetal1953, - Kirkpatricketal1983, Cerny1985} optimization are performed (default 1). The acceptance - of candidate graphs is determined by the probability $e ^ {- (c_c - c_b)/ (c_b * (k_{max} -k)/ k_{max})}$, - where $c_c$ is the cost of the candidate graph, $c_b$ is the cost of the current best graph, $k$ - is the step number, and $k_{max}$ is the maximum number of steps (set by the \texttt{steps:m}, - default 10) option. - - \item[drift[:n]] Specifies that $n$ rounds of the ``drifting'' form of simulated annealing - \citep{goloboff1999} optimization are performed (default 1) . The acceptance of candidate - graphs is determined by the probability $1/ (wf + c_c - c_b)$, where $c_c$ is the cost - of the candidate graph, $c_b$ is the cost of the current best graph, and $wf$ is the - \texttt{acceptWorse} (set by the \texttt{acceptWorse:m}, default 1.0) option. Equal - cost graphs are accepted with probability set by the \texttt{acceptEqual} option. - \texttt{drift} differs from \texttt{annealing} in that there are no cooling steps to modify - acceptance probabilities. The maximum number of graphs changes is set by - \texttt{maxChanges}. - - \begin{description} - - \item[acceptEqual] - - \item[acceptWorse] - - \item[maxChanges:n] Specifies that drifting graph changes are limited to $n$ (default 15). - - \end{description} - - \item[ia] Specifies that Implied Alignment \citep{Wheeler2003} assignment are used for - branch swapping as opposed to full Direct Optimization for dynamic charters when the - graph type is \texttt{Tree}. - - \item[keep:n] Specifies that up to $n$ equally costly graphs are retained. - - \item[nni] Specifies that NNI refinement \citep{CaminandSokal1965, Robinson1971} is performed. - - \item[spr:[n]] Specifies that SPR refinement \citep{Dayhoff1969} is performed. If the optional - argument $n$ is specified, readdition of pruned graphs will be within $2 * N$ edges of its original - placement. - - \item[steps:n] Specifies that $n$ (default 10) temperature steps are performed during simulated - annealing (as specified by the \texttt{annealing}) option. - - \item[steepest] Specifies that refinement follows a greedy path, abandoning the neighborhood - of the current graph when a better (lower cost) graph is found. - - \item[tbr:[n]] Specifies that TBR refinement \citep{Farris1988, swofford1990a} is performed. If the - optional argument $n$ is specified, readdition of pruned graphs will be within $2 * N$ edges of its - original placement. - \end{description} - - \subsubsection{Defaults} - \texttt{swap(spr, keep:1, steepest)} - - \begin{example} - \item{\texttt{swap()}\\Performs spr branch swapping on each current graph returning the single - best rearrangement found for each graph employing steepest descent.} - - \item{\texttt{swap(tbr, all, keep:10)}\\Performs tbr branch swapping on each current graph - returning up to 10 best rearrangements found for each graph after examining all graphs in - the rearrangement neighborhood.} - \end{example} - -%--------------------------------------------- -%transform -%--------------------------------------------- -\subsection{Transform} - \label{subsec:Transform} - \subsubsection{Syntax} - \texttt{transform(arg0, arg1,...)} - - \begin{phygdescription} - {\texttt{Transform} modifies global setting during program execution (as opposed to the \texttt{set} - command that operates at the inauguration of calculations). The command allows for changing - graph (e.g. Tree to Softwired Network) and data types (between dynamic and static approximation) - among other operations.} - \end{phygdescription} - - \subsubsection{Arguments} - \begin{description} - \item[atRandom] In concert with \texttt{displayTrees:n} specifies that displays trees are chosen - uniformly at random for each input graph. - - \item[displayTrees:[n]] When this option is specified, returns $n$ display trees for each graph - determined by the optional argument If the number of display trees is not - specified, 10 are returned. Used in concert with \texttt{toTree}. - - \item[dynamic] Reverts data type to the default ``dynamic'' for all dynamic homology - \citep{Wheeler2001} character types (e.g. DNA sequences). After this command, - graph optimization proceeds in the default manner with sequence characters treated - in their non-aligned (``dynamic'') condition. - - \item[first] In concert with \texttt{displayTrees:n} specifies that the first $n$ displays tree - resolutions are chosen for each input graph.. - - \item[staticApprox] Converts non-aligned (``dynamic'') sequence characters to their Implied - Alignment \citep{Wheeler2003, WashburnandWheeler2020} condition. - - \item[toHardwired] Converts exiting graphs to hardwired network graphs. - - \item[toSoftwired] Converts exiting graphs to softwired network graphs. - - \item[toTree] Converts exiting graphs to trees. For both Softwired and Hardwired graphs - this proceeds via graph resolution of network nodes into ``display'' trees. Since there are up to - $2^n$ display trees for a graph with $n$ network nodes, this number can be quite large. - The number of display trees produced for each graph is controlled via the options - \texttt{displayTrees:n}, \texttt{atRandom}, and \texttt{first}. - \end{description} - - \subsubsection{Defaults} - None. - - \begin{example} - \item{\texttt{transform(toSoftwired)}\\Converts each current graph to a softwired network graph.} - - \item{\texttt{transform(staticApprox)}\\Changes data to all static characters via Implied Alignment - for further analysis.} - - \end{example} - diff --git a/doc/PhyGraphUserManual/PhyGraphUserManual.tex b/doc/PhyGraphUserManual/PhyGraphUserManual.tex deleted file mode 100644 index 149e394a5..000000000 --- a/doc/PhyGraphUserManual/PhyGraphUserManual.tex +++ /dev/null @@ -1,435 +0,0 @@ -\documentclass[11pt]{book} -\usepackage{longtable} -\usepackage{color} -\usepackage{tabu} -\usepackage{setspace} -\usepackage{pdflscape} -\usepackage{graphicx} -\usepackage {float} -%\usepackage{subfigure} -\usepackage{caption} -\usepackage{subcaption} -\usepackage{natbib} -\usepackage{fullpage} -\bibliographystyle{plain} -%\bibliographystyle{cbe} -\usepackage{algorithmic} -\usepackage[vlined,ruled]{algorithm2e} -\usepackage{amsmath} -\usepackage{amsfonts} -\usepackage{amssymb} -\usepackage[T1]{fontenc} -\usepackage{url} - -\usepackage[dvipsnames]{xcolor} -\usepackage{color, soul} -\usepackage[colorlinks=true, linkcolor=blue, citecolor=DarkOrchid, urlcolor=TealBlue ]{hyperref} -%\usepackage[nottoc,numbib]{tocbibind} -\usepackage{tocloft} - -\setlength\itemindent{1cm} - -\newcommand{\phyg}{\texttt{PhyG} } -\newenvironment{phygdescription}{\subsubsection{Description}}{} -\newenvironment{example}{\subsubsection{Examples} \begin{itemize}}{\end{itemize}} -\newenvironment{argument}{\subsection{Arguments}\begin{itemize}}{\end{itemize}} -%\item [ ] - -% We define a command environment for new command definitions, and store -% whatever command we are dealing with in the @commandname macro. Inside a -% command we can specify the defaults, the syntax, and the arguments to be used. - \newenvironment{command}[2]{ - \def\tmpa{} - \def\tmpb{#2} - \def\@commandname{#1} - \subsection{#1}\index{general}{#1} - \ifx\tmpa\tmpb - \label{comm:#1} - \else - \label{comm:#2} - \fi} -{} - -% Syntax definition. We use the name of the command as stored in @commandname -\newcommand{\syntax}{\subsubsection{Syntax} \@commandname} -\newcommand{\atsymbol}{@} - -\begin{document} - %\firstpage{1} - - \title{PhylogeneticGraph\\User Manual\\Version 0.1} - - \maketitle - - \newpage - - \begin{center} - \includegraphics[width=\textwidth]{AMNHLogo.jpg} - \end{center} - - \vspace*{5.50cm} - \begin{flushleft} - \textbf {Program and Documentation} \\ Ward C. Wheeler \\ - \vspace*{0.50cm} - \textbf {Program} \\ Alex Washburn \\ - \vspace*{0.50cm} - \textbf{Documentation} \\ Louise M. Crowley - \end{flushleft} - - - \vspace*{5.50cm} - - \begin{flushleft} - \small - {\it Louise M. Crowley, Alex Washburn, Ward C. Wheeler} \\ - - Division of Invertebrate Zoology, American Museum of Natural History, New York, NY, U.S.A.\\ - \smallskip - The American Museum of Natural History\\ - \copyright 2022 by The American Museum of Natural History, \\ - All rights reserved. Published 2022. - - % \vspace*{0.25cm} - % \emph{W. C. Wheeler.} 2022. \texttt{PHYG} 1..0. New York, - % American Museum of Natural History. Documentation by W.C. Wheeler and L. M. Crowley. - % - \vspace*{0.25cm} - - Available online at \url{https://github.com/amnh/PhyGraph} - - Comments or queries relating to the documentation should be sent to \href{mailto:wheeler@amnh.org} - {wheeler@amnh.org} or \href{mailto:crowley@amnh.org}{crowley@amnh.org} - \end{flushleft} - - \tableofcontents - -\chapter{What is PhyG?} - - \section{Introduction} - PhylogeneticGraph (\phyg) is a multi-platform program designed to produce phylogenetic - graphs from input data and graphs via heuristic searching of general phylogenetic graph - space. \texttt{PhyG} is the successor of \href{https://github.com/wardwheeler/POY5}{\textbf{POY}} - \citep{POY2,POY3,POY4,Varonetal2010,POY5, Wheeleretal2015}, containing much of its - functionality, including the optimization of \textit{unaligned} sequences, and the ability to implement - search strategies such as random addition sequence, swapping, and tree fusing. As in {\textbf{POY}, - \phyg can generate outputs in the form of implied alignments and graphical representations of - cladograms and graphs. What sets \phyg apart from {\textbf{POY}, and other phylogenetic - analysis programs, is the extension to broader classes of input data and phylogenetic graphs. - The phylogenetic graph inputs and outputs of \texttt{PhyG} include trees, as well as other forms - including forests and both soft- and hard-wired networks. - - This is the initial version of documentation for the program. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%QUICKSTART -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Quick Start} - - \subsection{Requirements: software and hardware} - \phyg is an open-source program that can be compiled for Mac OSX and Linux. Some - utility programs (such as TextEdit for Mac, or Nano for Linux) can help in preparing - \phyg scripts and formatting data files, while others (such as Adobe Acrobat and TreeView - \citep{page1996}) can facilitate viewing the outputted graphs and trees. - - \phyg runs on a variety of computers, including desktops, laptops and cluster computers. - By default, \phyg is a multi-threaded application and will use the available resources of - the computer during the analysis (see Execution in Parallel \ref{subsec:parallel}). - - \subsection{Obtaining and Installing \phyg} - \phyg source code, precompiled binaries, test data, and documentation in pdf format, - %as well as tutorials - are available from the \phyg \href{https://github.com/amnh/PhyGraph}{GitHub} website. - - \subsubsection{Installing from the binaries} - Download the \phyg binary from the \href{https://github.com/amnh/PhyGraph}{GitHub} - website. Binaries are available for Mac OSX computers with either Intel or M1 processors, - and Linux machines. The user can go directly to the website and click on the appropriate link - for the binary. On most systems this will download to either your Desktop or Downloads folder. - Alternatively, open a \textit{Terminal} window (located in your Applications folder) and type - the following for either the Mac Intel, Mac M1 or Linux binary: - - \begin {quote} - curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/OSX/phyg-Intel?raw=true - \end{quote} - - \noindent or - - \begin {quote} - curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/OSX/phyg-M1?raw=true - \end{quote} - - \noindent or - - \begin {quote} - curl -LJ --output phyg https://github.com/amnh/PhyGraph/blob/main/bin/linux/phyg? - raw=true - \end{quote} - - \noindent The binary should either be moved into your \$PATH or referred to its - absolute when executing a script. - - \subsubsection{Compiling from the source} - For the majority of users, downloading the binaries will suffice. Should the user prefer to - compile \phyg directly from the source, the source code can be downloaded - from the \href{https://github.com/amnh/PhyGraph}{GitHub} website. \phyg is largely - written in Haskell. In order to compile \phyg from the source, the user must install Cabal, - a command-line program for downloading and building software written in Haskell (ghc). - Information on its installation can be found - \href{https://www.schoolofhaskell.com/user/simonmichael/how-to-cabal-install}{here}. - To install an optimized version of \phyg, run the following in a \textit{Terminal}: - - \begin{quote} - cabal install PhyGraph:phyg --project-file=cfg/cabal.project.release - \end{quote} - - %\hl{Explain this a bit better?} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%OVERVIEW OF THE PROGRAM -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Overview of program use} - - At present, \phyg is operated solely via command-line in a \textit{Terminal} window. - Commands are entered via a script file containing commands that specify input files, - output files and formats, graph type and search parameters. - - \subsection{Executing Scripts} - The program is invoked from the command-line as in: - - \begin{quote} - phyg commandFile - \end{quote} - - \noindent For example, typing\\ - - \begin{quote} - phyg Users/Ward/Desktop/phygfiles/mol.pg - \end{quote} - - \noindent in a \textit{Terminal} window will invoke \phyg to run the script \texttt{mol.pg}, - which is located in the Desktop of the home directory \texttt{/Users/Ward/Desktop/phygfiles}. - This is the equivalent of including\\ - - \begin{quote} - cd ("Users/Ward/Desktop/phygfiles") - \end{quote} - - \subsection{Creating and running \phyg scripts} - A script is a simple text file containing a list of commands to be performed. Running - analyses using scripts allows for the entire analysis to proceed from the beginning to - the end with one click of a button and may produce faster results as \phyg automatically - optimizes the workflow of the analysis by taking into account the functional relationships - among various tasks and efficiently distributing the jobs and resources (such as memory - and multiple processors). This script can also include references to other script files - (Figure \ref{firstscript}). - - Scripts can be created using any conventional text editor such as TextEdit, TextWrangler, - BBEdit, or Nano. Comments that describe the contents of the file and provide other useful - annotations can be included. Comments are prepended with `-{}-' and can span multiple - lines, provided each line begins with `-{}-'. - - \begin{figure} - \centering - \includegraphics[width=\textwidth]{First_run.jpg} - \caption{\phyg scripts. The headers in the scripts are comments, leading with `-{}-', which is - ignored by \phyg. This first script "First\_script.pg" includes a reference to the second script - "Chel\_files.txt", which includes a group of data files to be read by the program.} - \label{firstscript} - \end{figure} - -% Optimize memory consumption--keep low number of graphs in initial searches, later keep a larger -% number to get others -% \hl{(see email from WW 04-21-22)} -% \hl{where should I discuss this?} - - \subsection{Execution in Parallel} - \label{subsec:parallel} - \phyg is a multi-threading application and will, by default, spawn as many jobs as there - are cores in the machine and can be more efficient. Should the user wish to limit or - specify the number of processors used by \phyg this can be achieved by including the options - `\textbf{+RTS -NX -RTS}', where `\textbf{X}' is the number of processors offered to the - program, when executing the script. These options are specified after the program as in: - - \begin{quote} - phyg fileName +RTS -NX -RTS - \end{quote} - - %possible to include and other options - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%FORMATS -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - \section{Input Data Formats} - Any character names in input files are (for now) ignored and internal names are created - by appending the character number in its file to the filename as in "fileName:0". - %No idea what thing means - Qualitative data, and prealigned data include their index in their input files and unaligned - data are treated as a single character. - - \subsection{fasta} - Single character sequence input \citep{PearsonandLipman1988}. - - \subsection{fastc} - Multicharacter sequence input. \citep{WheelerandWashburn2019}. - - \subsection{\texttt{TNT}} - The TNT \citep{Goloboffetal2008} format is accepted here for specification of qualitative, - measurement, and prealigned molecular sequence data. \phyg does not parse all the - diversity of options that can be specified in \texttt{TNT} input files.\\ - - %\hl{[Notes to be fleshed out...]}\\ -% The majority of character scopes of TNT files are supported in \phyg. These include interleaved files, -% cc code and costs yes, but one set of commands per line.\\ -% Costs A$>$B A$/$B syntax no spaces. \\ -% Ambiguities not allowed. \\ -% Must specify all transformations manually. -% ')' sets to non-additive, if want additive then need to reset to additive after\\ -% character state designations single characters\\ -% continuous and other multicharacter states--can be read, ambiguities or ranges (unlike \texttt{TNT} -% for continuous) are -% in squarebrakets with period as in [X.Y]\\ -% - ans ? are always missing/inapplicable. \\ -% If DNA are not coded ACGT- => 01234 but left as letters, then to include gaps as a 5th state, just -% include another character, such the letter 'O' that is not an IUPAC -% ambiguity code for DNA (or amino acids for that matter) as an additional state. This can be used for -% matrix/Sankoff matrices as well.\\ -% Amino acid sequences would be processed in the same way. Including gaps as information requires -% an extra state (such as letter 'O').\\ -% multi-character state designations (letters, numbers, etc) must be in their own ``block'' with spaces -% between them.\\ -% Continuous characters must be numbers only (float fine) and declared as ``additive'' by cccode -% command, otherwise the number will be treated as non-additive character states. \\ -% nonAdd polymorphisms are [X.Y]--unless additive '-' for range\\ -% can't have '.' in multichar state (or single for that matter)\\ -% The inherent ordering of DNA and amino acid codes is alphabetical (e.g. A, C, G, T and `-'). -% - \section{Input Graph Formats} - Graphs can be input in the graphviz \href{https://graphviz.org/}{``dot''} format Newick (as - interpreted by Gary Olsen, linked \href{https://evolution.genetics.washington.edu/phylip/newick_doc.html} - {here}), Enhanced Newick \cite{Cardonaetal2008}, and Forest Enhanced Newick (defined by - \citealp{Wheeler2022}) formats. - Forest Enhanced Newick (FEN) is a format based on Enhanced Newick (ENewick) for - forests of components, each of which is represented by an ENewick string. The ENewick - components are surrounded by `$<$' and '$>$'. As in $<$(A, (B,C)); (D,(E,F));$>$. - Groups may be shared among ENewick components. - - \section{Output Graph Formats} - Graph outputs can be in either Graphviz `dot' or FEN formats. Dot files can be visualized in a variety of ways - using Graphviz (e.g. dot, neanto, twopi) into pdf, jpg and a large variety of other formats. FEN outputs of - single trees (ie forest with a single component) are rendered as enewick. Newick files can be visualized in a - large number of programs (e.g. \href{http://tree.bio.ed.ac.uk/software/figtree/}{FigTree}; - \href{http:/https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/} - {Dendroscope}). - When FEN/Enewick files are output, leaf vertices are modified if they have indegree $>$ 1, creating a new node as parent to that leaf - and redirecting the leaf's in-edges to that new node with a single edge connecting the new node to the leaf. Example dot command line: - - \begin{verbatim} - dot -Tpdf myDotFile.dot $>$ myDotFile.pdf - \end{verbatim} - - Multiple ``dot'' graphs can be output in a single file. To create pdf and other formats the - commandline would be (these files are named and numbered automatically): - - \begin{verbatim} - dot -Tpdf -O myDotFile.dot - \end{verbatim} - - For some reason on OSX the `pdf' option does not seem to work. You can use `-Tps2' and that will generate - a postscript file ($>$ blah.ps) that Preview can read and convert to pdf. - %what's -Tps2? - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%COMMANDS -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\chapter{PhyG Commands} - -\section{\phyg Command Structure} - - \subsection{Brief description} - \phyg interprets and executes scripts coming from an input file. A script is a list of - commands, separated by any number of whitespace characters (spaces, tabs, or - newlines). Each command consists of a name followed by a list of arguments - separated by commas and enclosed in parentheses. Commands and arguments - are case insensitive with the exception of filename specifications, which are always - in double quotes (\texttt{"fileName"}). File options require a valid filename %\hl{what - does this mean?} - Arguments may be preceded or followed by - options separated by a colon \texttt{`:'}, with no space before the option. Command - arguments are order independent and can be entered from a file as stdin - (\texttt{$<$ fileName}). - - \begin{quote} - command(argument, option:argument, option[:optional argument]...) - \end{quote} - - \noindent Most of the arguments are optional, and there are only a few \phyg options - that require specification. There are defaults for all options except input graphs. - Parameters are given with options in a range `a to b' (a-b) with any value in the interval, - or alternates `a or b' (a|b). - %\hl{reword this, confusing} - - Commands are standard commands that can affect the behavior of another command - when included in its list of arguments. Therefore, certain commands can function as - arguments of other commands. For example, the command \texttt{swap} specifies - the method of branch swapping. This command is used to conduct a local search on - a set of trees. %In addition, \texttt{swap} functions as an argument for \texttt{build} - %specifying \hl{...}. - - For input graphs, wildcards are allowed (i.e. `*' and `?').\\ - - %\hl{Do you want a section on Notation within Command Structure?} - - \subsection{Command order and processing} - The commands \texttt{read}, \texttt{rename}, \texttt{reblock}, and \texttt{set} are executed at - the beginning of program execution, irrespective of where they appear in the command script. - All other commands are executed in the order they are specified. - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%COMMAND REFERENCE -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\section{Command Reference} - \input{PhyG_Allcommands.tex} - -\section{Example Script Files} - The following file (titled ``Example Script 1'')reads two input sequence files (net-I.fas and net-II.fas), - skips all the lines that begin with double dash (\texttt{--}), reads the graph file net-I-II.dot, sets the - outgroup to the taxon named ``zero,'' specifies the graph type for the analysis is a soft-wired network, - and reports a series of files with various information about the data and graphs. - - \begin{verbatim} - -- Example Script 1 - read("net-I.fas") - --read("net-Ia.fas") - --read("net-IIa.fas") - read("net-II.fas") - --read("net-I.dot") - --read("net-I.tre") - --read("net-II.tre") - --read("net-II.dot") - read("net-I-II.dot") - set(outgroup:"zero") - set(graphtype:softwired) - report("net-test.tre", graphs, newick, overwrite) - report("net-test.dot", graphs, dot, overwrite) - report("net-test-data.csv", data, overwrite) - report("net-test-diag.csv", diagnosis, overwrite) - report("net-display.dot", displaytrees, dot, overwrite) - report("net-display.tre", displaytrees, newick, overwrite) - \end{verbatim} - -\section*{Acknowledgments} - The authors would like to thank DARPA SIMPLEX N66001-15-C-4039, the Robert J. Kleberg Jr. and Helen C. - Kleberg foundation grant ``Mechanistic Analyses of Pancreatic Cancer Evolution'', and the American Museum - of Natural History for financial support. - - %\newpage - %\bibliography{big-refs-3.bib} - %\bibliography{/Users/louise/DropboxAMNH/big-refs-3.bib} - \bibliography{/home/ward/Dropbox/Work_stuff/manus/big-refs-3.bib} - %\bibliography{/users/ward/Dropbox/work_stuff/manus/big-refs-3.bib} - \end{document} \ No newline at end of file diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000..f0b48c710 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,93 @@ +# Phylogenetic Graph (PhyG) + +Performs heuristic search of phylogenetic graph space via scoring abstract input data. + +[![CI Status ][GitHub-actions-img]][GitHub-actions-ref] +[![Maintained ][GitHub-support-img]][GitHub-support-ref] + +[![Release ][GitHub-release-img]][GitHub-release-ref] +[![Release Date ][GitHub-tagdate-img]][GitHub-release-ref] +[![New Commits ][GitHub-commits-img]][GitHub-commits-ref] +[![Code Size ][GitHub-codelen-img]][GitHub-codelen-ref] + +[![Author ][GitHub-authors-img]][GitHub-authors-ref] +[![BSD3 License ][GitHub-license-img]][GitHub-license-ref] +[![Haskell Programming Language][GitHub-prolang-img]][GitHub-prolang-ref] +[![Contributor Covenant][GitHub-conduct-img]][GitHub-conduct-ref] + +The `PhyG` package exposes the `phyg` executable and the `PhyG` sub-libraries. + +## `phyg` + +Phylogenetic Graph (PhyG) is a multi-platform program designed to produce phylogenetic graphs from input data and graphs via heuristic searching of general phylogenetic graph space. +The bio-informatics framework libraries of the broader [Phylogenetic Haskell Analytic Network Engine (PHANE) project][GitHub-PHANE-Readme] are the foundation upon which PhyG is constructed. +PhyG offers vast functionality, including the optimization of unaligned sequences, and the ability to implement search strategies such as random addition sequence, swapping, and tree fusing. +Furthermore, PhyG can generate outputs in the form of implied alignments and graphical representations of cladograms and graphs. +What sets PhyG apart from other phylogenetic analysis programs, is the extension to broader classes of input data and phylogenetic graphs. +The phylogenetic graph inputs and outputs of PhyG include trees, as well as softwired and hardwired networks. + + +### [Funding provided by][GitHub-Funding]: + + * [American Museum of Natural History][Funding-0] + + * [DARPA SIMPLEX][Funding-1] + + * [Kleberg Foundation][Funding-2] + + +### [Installation instructions][GitHub-Install] + +``` +which ghcup || curl --proto '=https' --tlsv1.2 -sSf https://get-ghcup.haskell.org | sh +which ghc || ghcup install ghc latest +which cabal || ghcup install cabal latest +cabal update +cabal install +``` + +### [Publications][GitHub-PHANE-Papers] + +### Synchronizing Documentation + +If you are on the `documentation-authoring` branch and need the newest code for building `phyg`, *do not panic!* +Instead do the following: + +``` +git checkout main +git pull +git checkout documentation-authoring +git merge main +``` + +That's it! You've done it and are certainly a hero who just saved the day. + +[Funding-0]: https://www.amnh.org/our-research/computational-sciences +[Funding-1]: https://www.darpa.mil/program/simplifying-complexity-in-scientific-discovery +[Funding-2]: http://www.klebergfoundation.org/ + +[GitHub-actions-img]: https://github.com/amnh/PhyG/actions/workflows/integration-test-suite.yaml/badge.svg?branch=master +[GitHub-actions-ref]: https://github.com/AMNH/PhyG/actions +[GitHub-authors-img]: https://img.shields.io/badge/author-Ward%20Wheeler-blue.svg?color=134EA2 +[GitHub-authors-ref]: https://github.com/AMNH/PhyG/tree/master/doc/AUTHORS.md +[GitHub-codelen-img]: https://img.shields.io/github/languages/code-size/AMNH/PhyG.svg?style=popout&color=yellowgreen +[GitHub-codelen-ref]: https://github.com/AMNH/PhyG/archive/master.zip +[GitHub-commits-img]: https://img.shields.io/github/commits-since/AMNH/PhyG/v0.1.2.svg?style=popout&color=yellowgreen +[GitHub-commits-ref]: https://github.com/AMNH/PhyG/commits/master +[GitHub-conduct-img]: https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg +[GitHub-conduct-ref]: https://github.com/AMNH/PhyG/blob/master/doc/Code_Of_Conduct.md +[GitHub-license-img]: https://img.shields.io/badge/license-BSD3-blue.svg?color=134EA2 +[GitHub-license-ref]: https://github.com/AMNH/PhyG/blob/master/doc/LICENSE +[GitHub-prolang-img]: https://img.shields.io/badge/language-Haskell-blue.svg +[GitHub-prolang-ref]: https://www.haskell.org +[GitHub-release-img]: https://img.shields.io/github/release-pre/AMNH/PhyG.svg?style=popout&color=orange +[GitHub-release-ref]: https://github.com/AMNH/PhyG/releases/latest +[GitHub-tagdate-img]: https://img.shields.io/github/release-date-pre/AMNH/PhyG.svg?style=popout&color=orange +[GitHub-support-img]: https://img.shields.io/maintenance/yes/2023.svg?style=popout +[GitHub-support-ref]: https://github.com/AMNH/PhyG/graphs/contributors + +[GitHub-Funding]: https://github.com/AMNH/PhyG/blob/master/doc/Funding.md +[GitHub-Install]: https://github.com/AMNH/PhyG/blob/master/doc/tutorials/Installation.md + +[GitHub-PHANE-Readme]: https://github.com/AMNH/PHANE#readme +[GitHub-PHANE-Papers]: https://github.com/AMNH/PHANE/blob/master/doc/Publications.md diff --git a/doc/ReadMe/Integration-Tests.md b/doc/ReadMe/Integration-Tests.md deleted file mode 100644 index 392bfbe1a..000000000 --- a/doc/ReadMe/Integration-Tests.md +++ /dev/null @@ -1,4 +0,0 @@ -PHAGE Integration Tests -======================= - -A composition of integration tests for the PHAGE domain. \ No newline at end of file diff --git a/doc/ReadMe/PHAGE.md b/doc/ReadMe/PHAGE.md deleted file mode 100644 index 3080a8b1d..000000000 --- a/doc/ReadMe/PHAGE.md +++ /dev/null @@ -1,11 +0,0 @@ -# PhyGraph -Progam to perform phylogenetic searches on general graphs with diverse data types - - -## Installation - -To install an optimized, "batteries-included" version of `phyg`, run the following: - -``` -cabal install PhyGraph:phyg --project-file=cfg/cabal.project.release -``` diff --git a/doc/ReadMe/PhyGraph.md b/doc/ReadMe/PhyGraph.md deleted file mode 100644 index 3345d263b..000000000 --- a/doc/ReadMe/PhyGraph.md +++ /dev/null @@ -1,3 +0,0 @@ -# PhyGraph - -The `PhyGraph` package exposes the `phyg` executable and the `PhyGraph` library. \ No newline at end of file diff --git a/doc/ReadMe/PhyloLib.md b/doc/ReadMe/PhyloLib.md deleted file mode 100644 index 09f4940e9..000000000 --- a/doc/ReadMe/PhyloLib.md +++ /dev/null @@ -1,2 +0,0 @@ -# PhyloLibs -Library of modules useful for phylogenetic computing diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/COPYRIGHT b/ffi/external-direct-optimization/COPYRIGHT similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/COPYRIGHT rename to ffi/external-direct-optimization/COPYRIGHT diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/Changelog.md b/ffi/external-direct-optimization/Changelog.md similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/Changelog.md rename to ffi/external-direct-optimization/Changelog.md diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/README b/ffi/external-direct-optimization/README similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/README rename to ffi/external-direct-optimization/README diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/alignCharacters.c b/ffi/external-direct-optimization/alignCharacters.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/alignCharacters.c rename to ffi/external-direct-optimization/alignCharacters.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/alignCharacters.h b/ffi/external-direct-optimization/alignCharacters.h old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/alignCharacters.h rename to ffi/external-direct-optimization/alignCharacters.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/alignmentMatrices.c b/ffi/external-direct-optimization/alignmentMatrices.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/alignmentMatrices.c rename to ffi/external-direct-optimization/alignmentMatrices.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/alignmentMatrices.h b/ffi/external-direct-optimization/alignmentMatrices.h old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/alignmentMatrices.h rename to ffi/external-direct-optimization/alignmentMatrices.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/c_alignment_interface.c b/ffi/external-direct-optimization/c_alignment_interface.c old mode 100755 new mode 100644 similarity index 99% rename from pkg/PhyGraph/ffi/external-direct-optimization/c_alignment_interface.c rename to ffi/external-direct-optimization/c_alignment_interface.c index 9024289cc..cbefe6851 --- a/pkg/PhyGraph/ffi/external-direct-optimization/c_alignment_interface.c +++ b/ffi/external-direct-optimization/c_alignment_interface.c @@ -167,7 +167,7 @@ size_t cAlign2D memcpy( lesserInput, retLesserChar->array_head, allocationLen * sizeof(elem_t)); memcpy( longerInput, retLongerChar->array_head, allocationLen * sizeof(elem_t)); memcpy(outputMedian, medianChar->array_head, allocationLen * sizeof(elem_t)); - *outputLength = medianChar->len - 1; // Subtract 1 for the prepended gap + *outputLength = medianChar->len == 0 ? 0 : medianChar->len - 1; // Subtract 1 for the prepended gap // Free temporary buffers dyn_char_free( retLongerChar ); diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/c_alignment_interface.h b/ffi/external-direct-optimization/c_alignment_interface.h old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/c_alignment_interface.h rename to ffi/external-direct-optimization/c_alignment_interface.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/c_code_alloc_setup.c b/ffi/external-direct-optimization/c_code_alloc_setup.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/c_code_alloc_setup.c rename to ffi/external-direct-optimization/c_code_alloc_setup.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/c_code_alloc_setup.h b/ffi/external-direct-optimization/c_code_alloc_setup.h old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/c_code_alloc_setup.h rename to ffi/external-direct-optimization/c_code_alloc_setup.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/costMatrix.c b/ffi/external-direct-optimization/costMatrix.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/costMatrix.c rename to ffi/external-direct-optimization/costMatrix.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/costMatrix.h b/ffi/external-direct-optimization/costMatrix.h old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/costMatrix.h rename to ffi/external-direct-optimization/costMatrix.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/debug_constants.h b/ffi/external-direct-optimization/debug_constants.h similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/debug_constants.h rename to ffi/external-direct-optimization/debug_constants.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/dyn_character.c b/ffi/external-direct-optimization/dyn_character.c similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/dyn_character.c rename to ffi/external-direct-optimization/dyn_character.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/dyn_character.h b/ffi/external-direct-optimization/dyn_character.h similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/dyn_character.h rename to ffi/external-direct-optimization/dyn_character.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/character_test.c b/ffi/external-direct-optimization/test-suite/C_source/character_test.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/character_test.c rename to ffi/external-direct-optimization/test-suite/C_source/character_test.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/makefile b/ffi/external-direct-optimization/test-suite/C_source/makefile old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/makefile rename to ffi/external-direct-optimization/test-suite/C_source/makefile diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_initialization.c b/ffi/external-direct-optimization/test-suite/C_source/test_initialization.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_initialization.c rename to ffi/external-direct-optimization/test-suite/C_source/test_initialization.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface b/ffi/external-direct-optimization/test-suite/C_source/test_interface old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface rename to ffi/external-direct-optimization/test-suite/C_source/test_interface diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface.c b/ffi/external-direct-optimization/test-suite/C_source/test_interface.c similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface.c rename to ffi/external-direct-optimization/test-suite/C_source/test_interface.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface_3d_for_python.c b/ffi/external-direct-optimization/test-suite/C_source/test_interface_3d_for_python.c old mode 100755 new mode 100644 similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/C_source/test_interface_3d_for_python.c rename to ffi/external-direct-optimization/test-suite/C_source/test_interface_3d_for_python.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/generateConsistentRandomIntSeqs_python2.py b/ffi/external-direct-optimization/test-suite/python_2/generateConsistentRandomIntSeqs_python2.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/generateConsistentRandomIntSeqs_python2.py rename to ffi/external-direct-optimization/test-suite/python_2/generateConsistentRandomIntSeqs_python2.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_C_interface_python2.py b/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_C_interface_python2.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_C_interface_python2.py rename to ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_C_interface_python2.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_poy_python2.py b/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_poy_python2.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_poy_python2.py rename to ffi/external-direct-optimization/test-suite/python_2/run_3_rand_seqs_poy_python2.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/generateConsistentRandomIntSeqs_python3.py b/ffi/external-direct-optimization/test-suite/python_3/generateConsistentRandomIntSeqs_python3.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/generateConsistentRandomIntSeqs_python3.py rename to ffi/external-direct-optimization/test-suite/python_3/generateConsistentRandomIntSeqs_python3.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_C_interface_python3.py b/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_C_interface_python3.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_C_interface_python3.py rename to ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_C_interface_python3.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_poy_python3.py b/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_poy_python3.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_poy_python3.py rename to ffi/external-direct-optimization/test-suite/python_3/run_3_rand_seqs_poy_python3.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/variance_python3.py b/ffi/external-direct-optimization/test-suite/python_3/variance_python3.py similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/test-suite/python_3/variance_python3.py rename to ffi/external-direct-optimization/test-suite/python_3/variance_python3.py diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCheckPoint.c b/ffi/external-direct-optimization/ukkCheckPoint.c similarity index 99% rename from pkg/PhyGraph/ffi/external-direct-optimization/ukkCheckPoint.c rename to ffi/external-direct-optimization/ukkCheckPoint.c index 8b34d5f43..4bd5286da 100644 --- a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCheckPoint.c +++ b/ffi/external-direct-optimization/ukkCheckPoint.c @@ -82,10 +82,7 @@ CPType *CP(int ab, int ac, int d, int s) return getPtr(&myCPAllocInfo, ab, ac, d, s); } - -void printTraceBack(); - - +void printTraceBack( characters_t *inputs, characters_t *outputs ); int best( int ab, int ac, int d, int wantState ); diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCheckPoint.h b/ffi/external-direct-optimization/ukkCheckPoint.h similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/ukkCheckPoint.h rename to ffi/external-direct-optimization/ukkCheckPoint.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon.c b/ffi/external-direct-optimization/ukkCommon.c similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon.c rename to ffi/external-direct-optimization/ukkCommon.c diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon.h b/ffi/external-direct-optimization/ukkCommon.h similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon.h rename to ffi/external-direct-optimization/ukkCommon.h diff --git a/pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon_for_python.c b/ffi/external-direct-optimization/ukkCommon_for_python.c similarity index 100% rename from pkg/PhyGraph/ffi/external-direct-optimization/ukkCommon_for_python.c rename to ffi/external-direct-optimization/ukkCommon_for_python.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/3d tcm notes.md b/ffi/memoized-tcm/3d tcm notes.md similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/3d tcm notes.md rename to ffi/memoized-tcm/3d tcm notes.md diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper.c b/ffi/memoized-tcm/costMatrixWrapper.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper.c rename to ffi/memoized-tcm/costMatrixWrapper.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper.h b/ffi/memoized-tcm/costMatrixWrapper.h similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper.h rename to ffi/memoized-tcm/costMatrixWrapper.h diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_2d.c b/ffi/memoized-tcm/costMatrixWrapper_2d.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_2d.c rename to ffi/memoized-tcm/costMatrixWrapper_2d.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_2d.h b/ffi/memoized-tcm/costMatrixWrapper_2d.h similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_2d.h rename to ffi/memoized-tcm/costMatrixWrapper_2d.h diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_3d.c b/ffi/memoized-tcm/costMatrixWrapper_3d.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_3d.c rename to ffi/memoized-tcm/costMatrixWrapper_3d.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_3d.h b/ffi/memoized-tcm/costMatrixWrapper_3d.h similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrixWrapper_3d.h rename to ffi/memoized-tcm/costMatrixWrapper_3d.h diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrix_2d.cpp b/ffi/memoized-tcm/costMatrix_2d.cpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrix_2d.cpp rename to ffi/memoized-tcm/costMatrix_2d.cpp diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrix_2d.hpp b/ffi/memoized-tcm/costMatrix_2d.hpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrix_2d.hpp rename to ffi/memoized-tcm/costMatrix_2d.hpp diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrix_3d.cpp b/ffi/memoized-tcm/costMatrix_3d.cpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrix_3d.cpp rename to ffi/memoized-tcm/costMatrix_3d.cpp diff --git a/pkg/PhyGraph/ffi/memoized-tcm/costMatrix_3d.hpp b/ffi/memoized-tcm/costMatrix_3d.hpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/costMatrix_3d.hpp rename to ffi/memoized-tcm/costMatrix_3d.hpp diff --git a/pkg/PhyGraph/ffi/memoized-tcm/dynamicCharacterOperations.c b/ffi/memoized-tcm/dynamicCharacterOperations.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/dynamicCharacterOperations.c rename to ffi/memoized-tcm/dynamicCharacterOperations.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/dynamicCharacterOperations.h b/ffi/memoized-tcm/dynamicCharacterOperations.h similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/dynamicCharacterOperations.h rename to ffi/memoized-tcm/dynamicCharacterOperations.h diff --git a/pkg/PhyGraph/ffi/memoized-tcm/makefile b/ffi/memoized-tcm/makefile similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/makefile rename to ffi/memoized-tcm/makefile diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/bitArrayExampleC.c b/ffi/memoized-tcm/test-suite/bitArrayExampleC.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/bitArrayExampleC.c rename to ffi/memoized-tcm/test-suite/bitArrayExampleC.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/bitArrayExampleC.h b/ffi/memoized-tcm/test-suite/bitArrayExampleC.h similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/bitArrayExampleC.h rename to ffi/memoized-tcm/test-suite/bitArrayExampleC.h diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/makefile b/ffi/memoized-tcm/test-suite/makefile similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/makefile rename to ffi/memoized-tcm/test-suite/makefile diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/printMatrix.py b/ffi/memoized-tcm/test-suite/printMatrix.py similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/printMatrix.py rename to ffi/memoized-tcm/test-suite/printMatrix.py diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_c_interface.c b/ffi/memoized-tcm/test-suite/test_c_interface.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_c_interface.c rename to ffi/memoized-tcm/test-suite/test_c_interface.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_c_interface_longer.c b/ffi/memoized-tcm/test-suite/test_c_interface_longer.c similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_c_interface_longer.c rename to ffi/memoized-tcm/test-suite/test_c_interface_longer.c diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp b/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp rename to ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp diff --git a/pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp b/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp similarity index 100% rename from pkg/PhyGraph/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp rename to ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise.hs new file mode 100644 index 000000000..16dc5cdde --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise.hs @@ -0,0 +1,36 @@ +----------------------------------------------------------------------------- + +----------------------------------------------------------------------------- + +{- | +Module : DirectOptimization.Pairwise +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Pairwise direct optimization alignment functions using a variety of techniques. +-} +module DirectOptimization.Pairwise ( + -- * Slim characters + SlimDynamicCharacter, + SlimState, + slimPairwiseDO, + + -- * Wide characters + WideDynamicCharacter, + WideState, + widePairwiseDO, + + -- * Huge characters + HugeDynamicCharacter, + HugeState, + hugePairwiseDO, +) where + +import DirectOptimization.Pairwise.Huge +import DirectOptimization.Pairwise.Slim +import DirectOptimization.Pairwise.Wide + diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs new file mode 100644 index 000000000..3f780a173 --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs @@ -0,0 +1,211 @@ +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Direction +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Direction ( + Direction (..), + + -- * Querying + minimumCostDirection, + + -- * Rendering + boldDirection, +) where + +import Data.Int +import Data.Vector.Generic qualified as GV +import Data.Vector.Generic.Mutable qualified as MGV +import Data.Vector.Primitive qualified as PV +import Data.Vector.Unboxed qualified as UV +import Data.Word + + +{- | +Which direction to align the character at a given matrix point. + +It should be noted that the ordering of the three arrow types are important, +as it guarantees that the derived 'Ord' instance will have the following +property: + +DiagArrow < LeftArrow < UpArrow + +This means: + + - DiagArrow has highest precedence when one or more costs are equal + + - LeftArrow has second highest precedence when one or more costs are equal + + - UpArrow has lowest precedence when one or more costs are equal + +Using this 'Ord' instance, we can resolve ambiguous transformations in a +deterministic way. Without loss of generality in determining the ordering, +we choose the same biasing as the C code called from the FFI for consistency. +-} +data Direction = DiagArrow | LeftArrow | UpArrow +--data Direction = UpArrow | LeftArrow | DiagArrow + deriving stock (Eq, Ord) + + +newtype instance UV.MVector s Direction = MV_Direction (PV.MVector s Word8) + + +newtype instance UV.Vector Direction = V_Direction (PV.Vector Word8) + + +instance UV.Unbox Direction + + +instance MGV.MVector UV.MVector Direction where + {-# INLINE basicLength #-} + basicLength (MV_Direction v) = MGV.basicLength v + + + {-# INLINE basicUnsafeSlice #-} + basicUnsafeSlice i n (MV_Direction v) = MV_Direction $ MGV.basicUnsafeSlice i n v + + + {-# INLINE basicOverlaps #-} + basicOverlaps (MV_Direction v1) (MV_Direction v2) = MGV.basicOverlaps v1 v2 + + + {-# INLINE basicUnsafeNew #-} + basicUnsafeNew n = MV_Direction <$> MGV.basicUnsafeNew n + + + {-# INLINE basicInitialize #-} + basicInitialize (MV_Direction v) = MGV.basicInitialize v + + + {-# INLINE basicUnsafeReplicate #-} + basicUnsafeReplicate n x = MV_Direction <$> MGV.basicUnsafeReplicate n (fromDirection x) + + + {-# INLINE basicUnsafeRead #-} + basicUnsafeRead (MV_Direction v) i = toDirection <$> MGV.basicUnsafeRead v i + + + {-# INLINE basicUnsafeWrite #-} + basicUnsafeWrite (MV_Direction v) i x = MGV.basicUnsafeWrite v i (fromDirection x) + + + {-# INLINE basicClear #-} + basicClear (MV_Direction v) = MGV.basicClear v + + + {-# INLINE basicSet #-} + basicSet (MV_Direction v) x = MGV.basicSet v (fromDirection x) + + + {-# INLINE basicUnsafeCopy #-} + basicUnsafeCopy (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeCopy v1 v2 + + + basicUnsafeMove (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeMove v1 v2 + + + {-# INLINE basicUnsafeGrow #-} + basicUnsafeGrow (MV_Direction v) n = MV_Direction <$> MGV.basicUnsafeGrow v n + + +instance GV.Vector UV.Vector Direction where + {-# INLINE basicUnsafeFreeze #-} + basicUnsafeFreeze (MV_Direction v) = V_Direction <$> GV.basicUnsafeFreeze v + + + {-# INLINE basicUnsafeThaw #-} + basicUnsafeThaw (V_Direction v) = MV_Direction <$> GV.basicUnsafeThaw v + + + {-# INLINE basicLength #-} + basicLength (V_Direction v) = GV.basicLength v + + + {-# INLINE basicUnsafeSlice #-} + basicUnsafeSlice i n (V_Direction v) = V_Direction $ GV.basicUnsafeSlice i n v + + + {-# INLINE basicUnsafeIndexM #-} + basicUnsafeIndexM (V_Direction v) i = toDirection <$> GV.basicUnsafeIndexM v i + + + basicUnsafeCopy (MV_Direction mv) (V_Direction v) = GV.basicUnsafeCopy mv v + + + {-# INLINE elemseq #-} + elemseq _ = seq + + +instance Show Direction where + show DiagArrow = "↖" + show LeftArrow = "←" + show UpArrow = "↑" + + +{-# INLINEABLE boldDirection #-} +boldDirection ∷ Char → Char +boldDirection '↖' = '⇖' +boldDirection '←' = '⇐' +boldDirection '↑' = '⇑' +boldDirection d = d + + +{- | +Given the cost of deletion, alignment, and insertion (respectively), selects +the least costly direction. In the case of one or more equal costs, the +direction arrows are returned in the following descending order of priority: + + [ DiagArrow, LeftArrow, UpArrow ] +-} +{-# INLINEABLE minimumCostDirection #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int → Int → Int → (# Int, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int8 → Int8 → Int8 → (# Int8, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int16 → Int16 → Int16 → (# Int16, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int32 → Int32 → Int32 → (# Int32, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int64 → Int64 → Int64 → (# Int64, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word → Word → Word → (# Word, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word8 → Word8 → Word8 → (# Word8, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word16 → Word16 → Word16 → (# Word16, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word32 → Word32 → Word32 → (# Word32, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word64 → Word64 → Word64 → (# Word64, Direction #) #-} +minimumCostDirection + ∷ (Ord e) + ⇒ e + → e + → e + → (# e, Direction #) +minimumCostDirection delCost alnCost insCost + | alnCost <= delCost = + if alnCost <= insCost + then (# alnCost, DiagArrow #) + else (# insCost, UpArrow #) + | delCost <= insCost = (# delCost, LeftArrow #) + | otherwise = (# insCost, UpArrow #) + + +{-# INLINE fromDirection #-} +fromDirection ∷ Direction → Word8 +fromDirection DiagArrow = 0 +fromDirection LeftArrow = 1 +fromDirection UpArrow = 2 + + +{-# INLINE toDirection #-} +toDirection ∷ Word8 → Direction +toDirection 0 = DiagArrow +toDirection 1 = LeftArrow +toDirection _ = UpArrow diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs new file mode 100644 index 000000000..4dcd6bd8b --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs @@ -0,0 +1,18 @@ +module DirectOptimization.Pairwise.Huge ( + HugeDynamicCharacter, + HugeState, + hugePairwiseDO, +) where + +import Bio.DynamicCharacter +import DirectOptimization.Pairwise.Ukkonen +import Bio.DynamicCharacter.Element + + +hugePairwiseDO + ∷ Word + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) +hugePairwiseDO = ukkonenDO diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs new file mode 100644 index 000000000..f0492bc4f --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs @@ -0,0 +1,294 @@ +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE Strict #-} + +{- | +Module : DirectOptimization.Pairwise.Internal +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Internal ( + -- * Alignment types + Direction (..), + TCMλ, + + -- * Alignment feneric functions + directOptimization, + directOptimizationFromDirectionMatrix, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Bio.DynamicCharacter.HandleGaps +import Bio.DynamicCharacter.Measure +import Control.Applicative +import Control.Monad (join) +import Control.Monad.Loops (whileM_) +import Data.Bits +import Data.Matrix.Class (Matrix, dim, unsafeIndex) +import Data.Matrix.Unboxed qualified as UM +import Data.STRef +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!), (!?)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import DirectOptimization.Pairwise.Direction + + +{- | +A generalized function representation: the "overlap" between dynamic character +elements, supplying the corresponding median and cost to align the two +characters. +-} +type TCMλ e = e → e → (e, Word) + + +{-# SCC directOptimization #-} +{-# INLINEABLE directOptimization #-} +{-# SPECIALIZE directOptimization ∷ + (SV.Vector SlimState → SV.Vector SlimState → (Word, SlimDynamicCharacter)) + → (SlimState → SlimState → (SlimState, Word)) + → SlimDynamicCharacter + → SlimDynamicCharacter + → (Word, SlimDynamicCharacter) + #-} +{-# SPECIALIZE directOptimization ∷ + (UV.Vector WideState → UV.Vector WideState → (Word, WideDynamicCharacter)) + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE directOptimization ∷ + (V.Vector HugeState → V.Vector HugeState → (Word, HugeDynamicCharacter)) + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} +directOptimization + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ (v e → v e → (Word, OpenDynamicCharacter v e)) + -- ^ Alignment function + → TCMλ e + -- ^ Metric for computing state distance and median state + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) +directOptimization alignmentFunction overlapλ = handleMissing generateAlignmentResult + where + generateAlignmentResult lhs rhs = + let -- Build a 'gap' state now that we know that we can access a non-empty sequence. + gap = let tmp = extractMediansGapped rhs ! 0 in (tmp `xor` tmp) `setBit` 0 + -- Remove gaps from the inputs and measure the results to determine + -- which ungapped character is longer and which is shorter. + -- Always pass the shorter character into alignment functions first! + ~(swapped, gapsLesser, gapsLonger, lesser, longer) = measureCharactersWithoutGaps lhs rhs + lesserMeds = extractMediansGapped lesser + longerMeds = extractMediansGapped longer + ~(alignmentCost, ungappedAlignment) = + if GV.length lesserMeds == 0 + then -- Neither character was Missing, but one or both are empty when gaps are removed + alignmentWithAllGaps overlapλ longerMeds + else -- Both have some non-gap elements, perform string alignment + alignmentFunction lesserMeds longerMeds + regappedAlignment = insertGaps gap gapsLesser gapsLonger ungappedAlignment + transformation = if swapped then transposeCharacter else id + alignmentContext = transformation regappedAlignment + in (alignmentCost, forceDynamicCharacter alignmentContext) + + +{-# SCC directOptimizationFromDirectionMatrix #-} +{-# INLINEABLE directOptimizationFromDirectionMatrix #-} +{-# SPECIALIZE directOptimizationFromDirectionMatrix ∷ + ( WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, UM.Matrix Direction) + ) + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE directOptimizationFromDirectionMatrix ∷ + ( HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, UM.Matrix Direction) + ) + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} +directOptimizationFromDirectionMatrix + ∷ ( FiniteBits e + , Matrix m t Direction + , Ord (v e) + , Vector v e + ) + ⇒ (e → TCMλ e → v e → v e → (Word, m t Direction)) + -- ^ Alignment matrix generator function + → TCMλ e + -- ^ Metric for computing state distance and median state + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) +directOptimizationFromDirectionMatrix matrixGenerator overlapλ = + handleMissing $ directOptimization alignmentFunction overlapλ + where + alignmentFunction lhs rhs = + let gap = let tmp = rhs ! 0 in (tmp `xor` tmp) `setBit` 0 + (cost, traversalMatrix) = matrixGenerator gap overlapλ lhs rhs + in (cost, traceback gap overlapλ traversalMatrix lhs rhs) + + +{-# SCC traceback #-} +{-# INLINEABLE traceback #-} +{-# SPECIALIZE traceback ∷ + WideState + → (WideState → WideState → (WideState, Word)) + → UM.Matrix Direction + → UV.Vector WideState + → UV.Vector WideState + → WideDynamicCharacter + #-} +{-# SPECIALIZE traceback ∷ + HugeState + → (HugeState → HugeState → (HugeState, Word)) + → UM.Matrix Direction + → V.Vector HugeState + → V.Vector HugeState + → HugeDynamicCharacter + #-} +traceback + ∷ ( Bits e + , Matrix m t Direction + , Vector v e + ) + ⇒ e + → TCMλ e + → m t Direction + → v e + -- ^ Shorter dynamic character related to the "left column" + → v e + -- ^ Longer dynamic character related to the "top row" + → OpenDynamicCharacter v e + -- ^ Resulting dynamic character alignment context +traceback gap overlapλ directionMatrix lesser longer = forceDynamicCharacter alignment + where + f x y = fst $ overlapλ x y + getDirection = curry $ unsafeIndex directionMatrix + -- The maximum size the alignment could be + bufferLength = toEnum $ GV.length lesser + GV.length longer + + -- Construct the aligned dynamic character by using a buffer of it's maximum + -- possible length. Computet the length will performing the traceback. Then + -- after the traceback is performed, copy from the buffer to create a dynamic + -- character of the correct size. + -- + -- NOTE: The buffer creation, copying, and freeing are all handled by the + -- 'unsafeCharacterBuiltByBufferedST' function. + alignment = unsafeCharacterBuiltByBufferedST bufferLength $ \a → do + let (m, n) = dim directionMatrix + iR ← newSTRef $ m - 1 + jR ← newSTRef $ n - 1 + kR ← newSTRef $ fromEnum bufferLength + + -- Set up convenience methods for accessing character elements + let getElement char ref = (char !) <$> (modifySTRef ref pred *> readSTRef ref) + let getLesserElement = getElement lesser iR + let getLongerElement = getElement longer jR + + -- Set up convenience methods for setting alignment elements on traceback + let setElementAt = modifySTRef kR pred *> readSTRef kR + let delete re = setElementAt >>= \k → setDelete a k (f gap re) re + let align le re = setElementAt >>= \k → setAlign a k le (f le re) re + let insert le = setElementAt >>= \k → setInsert a k le (f le gap) + + -- Determine when to break the alignment loop + let continueAlignment = + let notAtOrigin i j = i /= 0 || j /= 0 + in liftA2 notAtOrigin (readSTRef iR) $ readSTRef jR + + -- Perform traceback + whileM_ continueAlignment $ do + arrow ← liftA2 getDirection (readSTRef iR) $ readSTRef jR + case arrow of + LeftArrow → getLongerElement >>= delete + DiagArrow → join $ liftA2 align getLesserElement getLongerElement + UpArrow → getLesserElement >>= insert + + -- Return the actual alignment length + k ← readSTRef kR + pure $ bufferLength - toEnum k + + +{-# SCC alignmentWithAllGaps #-} +{-# INLINEABLE alignmentWithAllGaps #-} +alignmentWithAllGaps + ∷ ( Bits e + , Vector v e + ) + ⇒ TCMλ e + → v e + → (Word, OpenDynamicCharacter v e) +alignmentWithAllGaps overlapλ character = + case character !? 0 of + -- Neither character was Missing, but both are empty when gaps are removed + Nothing → (0, (GV.empty, GV.empty, GV.empty)) + -- Neither character was Missing, but one of them is empty when gaps are removed + Just e → + let len = GV.length character + nil = e `xor` e + gap = nil `setBit` 0 + zed = GV.replicate len nil + med = GV.generate len $ fst . overlapλ gap . (character !) + in (0, (zed, med, character)) + + +{-# SCC handleMissing #-} +handleMissing + ∷ ( Bits e + , Vector v e + ) + ⇒ (OpenDynamicCharacter v e → OpenDynamicCharacter v e → (Word, OpenDynamicCharacter v e)) + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) +handleMissing f lhs rhs = + let implicitlyAlignWithMissing strIsLeft (_, med, _) = + let sym = med GV.! 0 + zed = sym `xor` sym + nil = GV.replicate (GV.length med) zed + str + | strIsLeft = (med, med, nil) + | otherwise = (nil, med, med) + in (0, str) + in case (isMissing lhs, isMissing rhs) of + -- Case 1: *Both* children are missing + -- Without loss of generality, we return the "left" child, as both children are identical. + (True, True) → (0, lhs) + -- Case 2: The *left* child is missing + -- Implicitly align the non-missing string with an equal length string of '?' missing symbols + (True, False) → implicitlyAlignWithMissing False rhs + -- Case 2: The *right* child is missing + -- Implicitly align the non-missing string with an equal length string of '?' missing symbols + (False, True) → implicitlyAlignWithMissing True lhs + -- Case 4: *Niether* child is missing + -- Proceed with string alignment + (False, False) → f lhs rhs diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs new file mode 100644 index 000000000..8c4f7cf6f --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs @@ -0,0 +1,17 @@ +module DirectOptimization.Pairwise.Slim ( + SlimDynamicCharacter, + SlimState, + slimPairwiseDO, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (SlimState) +import DirectOptimization.Pairwise.Slim.FFI + + +slimPairwiseDO + ∷ DenseTransitionCostMatrix + → SlimDynamicCharacter + → SlimDynamicCharacter + → (Word, SlimDynamicCharacter) +slimPairwiseDO = smallAlphabetPairwiseDO diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc similarity index 71% rename from pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc rename to lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc index e6cba1580..2a9ba5060 100644 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc @@ -16,12 +16,13 @@ -- ----------------------------------------------------------------------------- -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE ForeignFunctionInterface #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE StrictData #-} +{-# Language BangPatterns #-} +{-# Language DeriveGeneric #-} +{-# Language FlexibleContexts #-} +{-# Language ForeignFunctionInterface #-} +{-# Language ImportQualifiedPost #-} +{-# Language Strict #-} +{-# Language StrictData #-} module DirectOptimization.Pairwise.Slim.FFI ( DenseTransitionCostMatrix @@ -29,17 +30,21 @@ module DirectOptimization.Pairwise.Slim.FFI -- , foreignThreeWayDO ) where -import Bio.DynamicCharacter +import Bio.DynamicCharacter (SlimDynamicCharacter) +import Bio.DynamicCharacter.Element (SlimState) import Data.Coerce +import Data.Functor (($>)) import Data.TCM.Dense -import Data.Vector.Storable (Vector) -import qualified Data.Vector.Storable as V +import Data.Vector.Generic (basicUnsafeSlice) +import Data.Vector.Storable (Vector) +import Data.Vector.Storable qualified as V import DirectOptimization.Pairwise.Internal import Foreign import Foreign.C.Types import GHC.ForeignPtr import Prelude hiding (sequence, tail) import System.IO.Unsafe (unsafePerformIO) +import System.IO (hPutStrLn, stderr) #include "c_alignment_interface.h" #include "c_code_alloc_setup.h" @@ -78,9 +83,9 @@ instance Enum UnionContext where foreign import ccall unsafe "c_alignment_interface.h cAlign2D" align2dFn_c - :: Ptr CUInt -- ^ character1, input & output (lesser) - -> Ptr CUInt -- ^ character2, input & output (longer) - -> Ptr CUInt -- ^ gapped median output + :: Ptr SlimState -- ^ character1, input & output (lesser) + -> Ptr SlimState -- ^ character2, input & output (longer) + -> Ptr SlimState -- ^ gapped median output -> Ptr CSize -- ^ length median output -> CSize -- ^ size of each buffer -> CSize -- ^ length of character1 @@ -169,58 +174,65 @@ algn2d -> SlimDynamicCharacter -- ^ First dynamic character -> SlimDynamicCharacter -- ^ Second dynamic character -> (Word, SlimDynamicCharacter) -- ^ The cost of the alignment -algn2d computeUnion computeMedians denseTCMs = directOptimization f $ lookupPairwise denseTCMs +algn2d computeUnion computeMedians denseTCMs = directOptimization useC $ lookupPairwise denseTCMs where - f :: Vector CUInt -> Vector CUInt -> (Word, SlimDynamicCharacter) - f lesser longer = {-# SCC f #-} unsafePerformIO . V.unsafeWith lesser $ \lesserPtr -> V.unsafeWith longer $ \longerPtr -> do + useC :: Vector SlimState -> Vector SlimState -> (Word, SlimDynamicCharacter) + useC lesser longer = {-# SCC useC #-} unsafePerformIO . V.unsafeWith lesser $ \lesserPtr -> V.unsafeWith longer $ \longerPtr -> do let lesserLength = V.length lesser let longerLength = V.length longer -- Add two because the C code needs stupid gap prepended to each character. -- Forgetting to do this will eventually corrupt the heap memory let bufferLength = lesserLength + longerLength + 2 - lesserBuffer <- allocCharacterBuffer bufferLength lesserLength lesserPtr - medianBuffer <- allocCharacterBuffer bufferLength 0 nullPtr - longerBuffer <- allocCharacterBuffer bufferLength longerLength longerPtr + lesserVector <- initializeCharacterBuffer bufferLength lesserLength lesserPtr + medianVector <- initializeCharacterBuffer bufferLength 0 nullPtr + longerVector <- initializeCharacterBuffer bufferLength longerLength longerPtr resultLength <- malloc :: IO (Ptr CSize) strategy <- getAlignmentStrategy <$> peek costStruct let medianOpt = coerceEnum computeMedians - let !cost = case strategy of + cost <- case strategy of Affine -> {-# SCC affine_undefined #-} - undefined -- align2dAffineFn_c lesserBuffer longerBuffer medianBuffer resultLength (ics bufferLength) (ics lesserLength) (ics longerLength) costStruct medianOpt + hPutStrLn stderr "Control Flow Diversion:\tAffine alignment not implemented" $> 0 + -- align2dAffineFn_c lesserBuffer longerBuffer medianBuffer resultLength (ics bufferLength) (ics lesserLength) (ics longerLength) costStruct medianOpt _ -> {-# SCC align2dFn_c #-} - align2dFn_c - lesserBuffer - longerBuffer - medianBuffer - resultLength - (ics bufferLength) - (ics lesserLength) - (ics longerLength) - costStruct - neverComputeOnlyGapped - medianOpt - (coerceEnum computeUnion) - - alignedLength <- {-# SCC alignedLength #-} coerce <$> peek resultLength - let g = buildResult bufferLength (csi alignedLength) + V.unsafeWith lesserVector $ \lesserBuffer -> + V.unsafeWith medianVector $ \medianBuffer -> + V.unsafeWith longerVector $ \longerBuffer -> pure $ align2dFn_c + lesserBuffer + longerBuffer + medianBuffer + resultLength + (ics bufferLength) + (ics lesserLength) + (ics longerLength) + costStruct + neverComputeOnlyGapped + medianOpt + (coerceEnum computeUnion) + + alignedLength <- getAlignedLength resultLength + let g = finalizeCharacterBuffer bufferLength alignedLength -- -- NOTE: Extremely important implementation detail! -- -- The C FFI swaps the results somewhere, we swap back here: - alignedLesser <- {-# SCC alignedLesser #-} g longerBuffer - alignedMedian <- {-# SCC alignedMedian #-} g medianBuffer - alignedLonger <- {-# SCC alignedLonger #-} g lesserBuffer + let alignedLesser = {-# SCC alignedLesser #-} g longerVector + let alignedMedian = {-# SCC alignedMedian #-} g medianVector + let alignedLonger = {-# SCC alignedLonger #-} g lesserVector let alignmentCost = fromIntegral cost let alignmentContext = (alignedLesser, alignedMedian, alignedLonger) pure $ {-# SCC ffi_result #-} (alignmentCost, alignmentContext) where - costStruct = costMatrix2D denseTCMs + costStruct = costMatrix2D denseTCMs neverComputeOnlyGapped = 0 + {-# SCC ics #-} ics :: Int -> CSize ics = coerce . (toEnum :: Int -> Word64) + {-# SCC csi #-} csi :: CSize -> Int - csi = (fromEnum :: Word64 -> Int) . coerce + csi = (fromEnum :: Word64 -> Int) . coerce + + {- -- | @@ -309,9 +321,52 @@ algn3d char1 char2 char3 mismatchCost openningGapCost indelCost denseTCMs = hand {- Generic helper functions -} +-- | +-- Use in conjunction with 'finalizeCharacterBuffer' to marshall data across FFI. +-- +-- /Should/ minimize number of, and maximize speed of copying operations. +{-# SCC initializeCharacterBuffer #-} +initializeCharacterBuffer :: Int -> Int -> Ptr SlimState -> IO (Vector SlimState) +initializeCharacterBuffer maxSize elemCount elements = + let e = min maxSize elemCount + off = maxSize - e + {- + Bind the vector creation within the monadic "do-block" scope to ensure + that no sharing of "vec" occurs between calls. + + This operation is inherently unsafe, modifying the data of the underling + Ptr contained within the vector. However, this permits faster marshalling + across the FFI + -} + in do let vec = V.replicate maxSize 0 + V.unsafeWith vec $ \ptr -> + moveArray (advancePtr ptr off) elements e $> vec + + +-- | +-- Use in conjunction with 'finalizeCharacterBuffer' to marshall data across FFI. +-- +-- /Should/ minimize number of, and maximize speed of copying operations. +{-# SCC finalizeCharacterBuffer #-} +finalizeCharacterBuffer :: Int -> Int -> Vector SlimState -> Vector SlimState +finalizeCharacterBuffer bufferLength alignedLength = + let e = min bufferLength alignedLength + off = bufferLength - e + in basicUnsafeSlice off e + + +-- | +-- Read and free the length of the resulting alignment. +getAlignedLength :: Ptr CSize -> IO Int +getAlignedLength lenRef = + let f = coerce :: CSize -> Word64 + in (fromEnum . f <$> peek lenRef) <* free lenRef + +{- -- | -- Allocates space for an align_io struct to be sent to C. -allocCharacterBuffer :: Int -> Int -> Ptr CUInt -> IO (Ptr CUInt) +{-# SCC allocCharacterBuffer #-} +allocCharacterBuffer :: Int -> Int -> Ptr SlimState -> IO (Ptr SlimState) allocCharacterBuffer maxSize elemCount elements = do let e = min maxSize elemCount buffer <- mallocArray maxSize @@ -321,17 +376,22 @@ allocCharacterBuffer maxSize elemCount elements = do pure buffer -buildResult :: Int -> Int -> Ptr CUInt -> IO (Vector CUInt) -buildResult bufferLength alignedLength alignedBuffer = do +-- | +-- Read and free the length of the resulting alignment. +{-# SCC getAlignedLength #-} +getAlignedLength :: Ptr CSize -> IO Int +getAlignedLength lenRef = + let f = coerce :: CSize -> Word64 + in (fromEnum . f <$> peek lenRef) <* free lenRef + + +buildResult :: Int -> Int -> Ptr SlimState -> IO (Vector SlimState) +buildResult bufferLength alignedLength alignedBuffer = let e = min bufferLength alignedLength - let off = bufferLength - e - let ref = advancePtr alignedBuffer off - vector <- mallocArray alignedLength - copyArray vector ref e - free alignedBuffer - fPtr <- newConcForeignPtr vector (free vector) - let res = V.unsafeFromForeignPtr0 fPtr e :: Vector CUInt - pure res + off = bufferLength - e + ref = advancePtr alignedBuffer off + in (V.fromListN e <$> peekArray e ref) <* free alignedBuffer +-} -- | @@ -345,4 +405,3 @@ buildResult bufferLength alignedLength alignedBuffer = do {-# SPECIALISE coerceEnum :: CUInt -> Word #-} coerceEnum :: (Enum a, Enum b) => a -> b coerceEnum = toEnum . fromEnum - diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs new file mode 100644 index 000000000..f43740c0c --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs @@ -0,0 +1,161 @@ +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Swapping +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Swapping ( + Direction (), + swappingDO, + buildDirectionMatrix, + minimumCostDirection, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Control.Monad.ST +import Data.Bits +import Data.Foldable +import Data.Matrix.Unboxed (Matrix, unsafeFreeze) +import Data.Matrix.Unboxed.Mutable qualified as M +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import Data.Vector.Unboxed.Mutable qualified as MUV +import DirectOptimization.Pairwise.Direction +import DirectOptimization.Pairwise.Internal + + +{- | +Performs a naive direct optimization. +Takes in two characters to run DO on and an overlap function +Returns an assignment character, the cost of that assignment, the assignment +character with gaps included, the aligned version of the first input character, +and the aligned version of the second input character. The process for this +algorithm is to generate a traversal matrix, then perform a traceback. +-} +{-# SCC swappingDO #-} +{-# INLINEABLE swappingDO #-} +{-# SPECIALIZE swappingDO ∷ + (WideState → WideState → (WideState, Word)) → WideDynamicCharacter → WideDynamicCharacter → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE swappingDO ∷ + (HugeState → HugeState → (HugeState, Word)) → HugeDynamicCharacter → HugeDynamicCharacter → (Word, HugeDynamicCharacter) + #-} +swappingDO + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ TCMλ e + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) +swappingDO = directOptimizationFromDirectionMatrix buildDirectionMatrix + + +{-# SCC buildDirectionMatrix #-} +{-# INLINEABLE buildDirectionMatrix #-} +{-# SPECIALIZE buildDirectionMatrix ∷ + WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, Matrix Direction) + #-} +{-# SPECIALIZE buildDirectionMatrix ∷ + HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, Matrix Direction) + #-} +buildDirectionMatrix + ∷ (Vector v e) + ⇒ e + -- ^ Gap state + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character related to the "left column" + → v e + -- ^ Longer dynamic character related to the "top row" + → (Word, Matrix Direction) +buildDirectionMatrix gap tcmλ lesserLeft longerTop = fullMatrix + where + costλ x y = snd $ tcmλ x y + rows = GV.length lesserLeft + 1 + cols = GV.length longerTop + 1 + + fullMatrix = runST $ do + mDir ← M.new (rows, cols) + vOne ← MUV.new cols + vTwo ← MUV.new cols + + let write v p@(~(_, j)) c d = MUV.unsafeWrite v j c *> M.unsafeWrite mDir p d + + write vOne (0, 0) 0 DiagArrow + + -- Special case the first row + -- We need to ensure that there are only Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and upward values are "out of bounds." + for_ [1 .. cols - 1] $ \j → + let topElement = longerTop ! (j - 1) + firstCellCost = costλ gap topElement + in do + firstPrevCost ← MUV.unsafeRead vOne (j - 1) + write vOne (0, j) (firstCellCost + firstPrevCost) LeftArrow + + for_ [1 .. rows - 1] $ \i → + let (prev, curr) + | odd i = (vOne, vTwo) + | otherwise = (vTwo, vOne) + leftElement = lesserLeft ! (i - 1) + -- Special case the first cell of each row + -- We need to ensure that there are only Up Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and leftward values are "out of bounds." + firstCellCost = costλ leftElement gap + in do + firstPrevCost ← MUV.unsafeRead prev 0 + write curr (i, 0) (firstCellCost + firstPrevCost) UpArrow + -- Finish special case for first cell of each row + -- Begin processing all other cells in the curr vector + for_ [1 .. cols - 1] $ \j → + let topElement = longerTop ! (j - 1) + deleteCost = costλ gap topElement + alignCost = costλ leftElement topElement + insertCost = costλ leftElement gap + in do + diagCost ← MUV.unsafeRead prev $ j - 1 + topCost ← MUV.unsafeRead prev j + leftCost ← MUV.unsafeRead curr $ j - 1 + let (# c, d #) = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + (insertCost + topCost) + write curr (i, j) c d + + let v + | odd rows = vOne + | otherwise = vTwo + + c ← MUV.unsafeRead v (cols - 1) + m ← unsafeFreeze mDir + pure (c, m) diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs new file mode 100644 index 000000000..e885c2dd9 --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs @@ -0,0 +1,878 @@ +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Ukkonen +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Ukkonen ( + Direction (), + ukkonenDO, + createUkkonenMethodMatrix, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Bio.DynamicCharacter.Measure +import Control.Monad (unless, when) +import Control.Monad.Loops (iterateUntilM, whileM_) +import Control.Monad.Primitive +import Control.Monad.ST +import Data.Bits +import Data.Foldable +import Data.Matrix.Unboxed (Matrix, unsafeFreeze) +import Data.Matrix.Unboxed.Mutable (MMatrix) +import Data.Matrix.Unboxed.Mutable qualified as M +import Data.STRef +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import DirectOptimization.Pairwise.Internal +import DirectOptimization.Pairwise.Swapping + + +{- | +Performs a naive direct optimization. +Takes in two characters to run DO on and an overlap function +Returns an assignment character, the cost of that assignment, the assignment +character with gaps included, the aligned version of the first input character, +and the aligned version of the second input character. The process for this +algorithm is to generate a traversal matrix, then perform a traceback. +-} +{-# SCC ukkonenDO #-} +{-# INLINEABLE ukkonenDO #-} +{-# SPECIALIZE ukkonenDO ∷ + Word + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE ukkonenDO ∷ + Word + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} +ukkonenDO + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ Word + -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → OpenDynamicCharacter v e + -- ^ /1st/ dynamic character + → OpenDynamicCharacter v e + -- ^ /2nd/ dynamic character + → (Word, OpenDynamicCharacter v e) +ukkonenDO coefficient tcmλ char1 char2 + | noGainFromUkkonenMethod = buildFullMatrix + | otherwise = buildBandMatrix + where + buildFullMatrix = swappingDO tcmλ char1 char2 + buildBandMatrix = directOptimizationFromDirectionMatrix ukkonenBandλ tcmλ char1 char2 + + ukkonenBandλ = createUkkonenMethodMatrix coefficient inputGapAmbiguities + + ~(_, _, _, x, y) = measureCharactersWithoutGaps char1 char2 + + lesser = extractMediansGapped x + longer = extractMediansGapped y + + -- /O(1)/ + -- + -- If the longer character is 50% larger than the shorter character, then + -- there is no point in using the barriers. Rather, we fill the full matrix + -- immediately. + -- + -- Additionally, if the shorter sequence is of length 4 or less, then the + -- initial barrier will be set adjacent to or beyond the lower left and + -- upper right corners. + -- + -- Also, a threshold coefficient is computed as the minimal indel cost from + -- any symbol in the alphabet to gap. However, if the indel cost for any + -- symbol is zero, the algorithm will hang, and a naive approach must be taken. + -- + -- Lastly, if the sum of the gaps in both strings is equal to or exceeds the + -- length of the longer string, then the threshold criteria will never be met + -- by definition. + -- + -- Do not perform Ukkonen's algorithm if and only if: + -- + -- > longerLen >= 1.5 * lesserLen + -- OR + -- > lesserLen <= 4 + -- OR + -- > coefficient == 0 + -- + noGainFromUkkonenMethod = + lesserLen <= 4 + || 2 * longerLen >= 3 * lesserLen + || coefficient == 0 + || inputGapAmbiguities >= longerLen + where + longerLen = vLength longer + lesserLen = vLength lesser + + -- /O(n + m)/ + -- + -- NOTE: There will be no *unambiguous* gap elements in the dynamic characters! + -- However, there may be abiguous elements which contain gap as a possibility. + -- + -- If one or more of the character elements contained a gap, diagonal + -- directions in the matrix have an "indel" cost. 'gapsPresentInInputs' is + -- necessary in order to decrement the threshold value to account for this. + -- This was not described in Ukkonen's original paper, as the inputs were assumed + -- not to contain any gaps. + inputGapAmbiguities = char1Gaps + char2Gaps + where + char1Gaps = countGaps lesser + char2Gaps = countGaps longer + countGaps = vLength . GV.filter maybeGap + maybeGap = (`testBit` 0) -- Zero is the gap bit! + vLength = toEnum . GV.length + + +{- | +/O( (n - m + 1 ) * log(n - m + 1) )/, /n/ >= /m/ + +Generates an /optimal/, partially-filled-in matrix using Ukkonen's string +edit distance algorithm. + +Note that the threshold value is lowered more than described in Ukkonen's +paper. This is to handle input elements that contain a gap. In Ukkonen's +original description of the algorithm, there was a subtle assumption that +input did not contain any gap symbols. +-} +{-# SCC createUkkonenMethodMatrix #-} +{-# INLINEABLE createUkkonenMethodMatrix #-} +{-# SPECIALIZE createUkkonenMethodMatrix ∷ + Word + → Word + → WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, Matrix Direction) + #-} +{-# SPECIALIZE createUkkonenMethodMatrix ∷ + Word + → Word + → HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, Matrix Direction) + #-} +createUkkonenMethodMatrix + ∷ (Vector v e) + ⇒ Word + -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap + → Word + -- ^ Number of abiguous elements in both inputs which contained gap as a possible state + → e + -- ^ Gap State + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → (Word, Matrix Direction) +createUkkonenMethodMatrix minimumIndelCost inputGapAmbiguities gap tcmλ lesserLeft longerTop = finalMatrix + where + -- General values that need to be in scope for the recursive computations. + lesserLen = GV.length lesserLeft + longerLen = GV.length longerTop + + -- We start the offset at four rather than at one so that the first doubling + -- isn't trivially small. + startOffset = 2 + + -- /O(1)/ + -- + -- Necessary to compute the width of a row in the barrier-constrained matrix. + quasiDiagonalWidth = toEnum $ differenceInLength + 1 + where + differenceInLength = longerLen - lesserLen + + extra = (inputGapAmbiguities +) + + finalMatrix = runST $ do + (mCost, mDir) ← buildInitialBandedMatrix gap tcmλ lesserLeft longerTop $ extra startOffset + let getAlignmentCost = M.unsafeRead mCost (lesserLen, longerLen) + offsetRef ← newSTRef startOffset + + let needToResizeBand = do + offset ← readSTRef offsetRef + -- If the filled row width exceeds the actual row length, + -- Then clearly we are done as we have filled the entire matrix. + if quasiDiagonalWidth + extra offset > toEnum longerLen + then pure False + else + let partialWidth = quasiDiagonalWidth + offset + -- Value that the alignment cost must be less than + threshold -- The threshold value must be non-negative + | partialWidth <= inputGapAmbiguities = 0 + | otherwise = minimumIndelCost * (partialWidth - inputGapAmbiguities) + in (threshold <=) <$> getAlignmentCost + + whileM_ needToResizeBand $ do + previousOffset ← readSTRef offsetRef + let currentOffset = previousOffset `shiftL` 1 -- Multiply by 2 + writeSTRef offsetRef currentOffset + expandBandedMatrix + gap + tcmλ + lesserLeft + longerTop + mCost + mDir + (extra previousOffset) + (extra currentOffset) + + c ← getAlignmentCost + m ← unsafeFreeze mDir + pure (c, m) + + +{-# SCC buildInitialBandedMatrix #-} +buildInitialBandedMatrix + ∷ (Vector v e) + ⇒ e + -- ^ Gap + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → Word + → ST s (MMatrix s Word, MMatrix s Direction) +buildInitialBandedMatrix gap tcmλ lesserLeft longerTop o = fullMatrix + where + (offset, costλ, rows, cols, width, quasiDiagonalWidth) = ukkonenConstants tcmλ lesserLeft longerTop o + + fullMatrix = do + --------------------------------------- + -- Allocate required space -- + --------------------------------------- + + mCost ← M.new (rows, cols) + mDir ← M.new (rows, cols) + + --------------------------------------- + -- Define some generalized functions -- + --------------------------------------- + let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = + edgeCellDefinitions gap costλ longerTop mCost mDir + + -- Define how to compute values to an entire row of the Ukkonen matrix. + let writeRow i = + -- Precompute some values that will be used for the whole row + let start = max 0 $ i - offset + stop = min (cols - 1) $ i - offset + width - 1 + leftElement = lesserLeft ! (i - 1) + insertCost = costλ leftElement gap + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the left end of the matrix for the first + -- several rows of the matrix. How many rows, though? + -- There are exactly 'offset' number of cells left of first matrix column + -- in the first row. The number of cells to the left of the matrix + -- decreases by one in each subsequent row. The means that the first + -- row, and then the next 'offset' number of rows require special handling + -- of the boundary. The last row index requiring special handling is index + -- 'offset'. Subsequent rows have the band begin at least one cell away + -- from the matrix boundary. + firstCell + | i <= offset = leftColumn + | otherwise = leftBoundary + + lastCell + | i <= cols - quasiDiagonalWidth - offset = rightBoundary + | otherwise = rightColumn + in do + -- Write to the first cell of the Ukkonen band + firstCell leftElement insertCost i start + -- Write to the all the intermediary cells of the Ukkonen band + for_ [start + 1 .. stop - 1] $ \j → + internalCell leftElement insertCost i j + -- Write to the last cell of the Ukkonen band + lastCell leftElement insertCost i stop + + --------------------------------------- + -- Compute all values of the matrix -- + --------------------------------------- + + -- Write to the origin to seed the first row. + write (0, 0) (# 0, DiagArrow #) + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the end of the matrix for the first & last + -- rows of the matrix. We subtract the 'offest' from the 'width' because + -- there are exactly 'offset' number of cells left of first matrix column. + -- Hence the top row's width is 'width' minus 'offset'. The last cell index + -- in the top row of the band is 'width' minus 'offset' minus 1. + let topRowWidth = width - offset + let topRowWrite !j !cost = write (0, j) (# cost, LeftArrow #) + + -- Write the first row to seed subsequent rows. + for_ [1 .. min (cols - 1) (topRowWidth - 1)] $ \j → + let topElement = longerTop ! (j - 1) + firstCellCost = costλ gap topElement + in do + firstPrevCost ← readCost 0 $ j - 1 + topRowWrite j $ firstCellCost + firstPrevCost + + -- Loop through the remaining rows. + for_ [1 .. rows - 1] writeRow + + -- Return the matricies for possible expansion + pure (mCost, mDir) + + +{-# SCC expandBandedMatrix #-} + + +{- | +Given a partially computed alignment matrix, +will expand the computed region to the new specified offset. + + +Dimensions: 13 ⨉ 17 + ⊗ ┃ ⁎ α1 α2 α3 α4 α5 α6 α7 α8 α9 α0 α1 α2 α3 α4 α5 α6 +━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ⁎ ┃ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← +α1 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α2 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α3 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0↖ +α4 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0← 0↖ 0← 0↖ 0← 0← 0← 0← 0← 0← 0← 0← +α5 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← +α6 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α7 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α8 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0← 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α9 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α0 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α1 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α2 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ + + ┌───────────────w───────────────┐ + │ ┏━━━━━━━co━━━━━━━┪ + ┢━━━━━qd━━━━━━┓┠─po─┐┌────Δo────┨ + ⊗ ┃ ┃0 1 2 3 4┃┃5 6││7 8 9 10┃11 12 13 14 15 16 +━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + 0 ┃ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 1 ┃ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 2 ┃ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 3 ┃ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 4 ┃ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 5 ┃ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ + 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ + 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ + 0 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ + 1 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ + 2 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ + + +w : Width +qd : Quasi-diagonal +co : Current Offset +po : Previous Offset +Δo : Difference in Offset + +Note: +w = qd + co +co = po + Δo + +And often: +co = 2*po = 2*Δo + +██ : The core band + * Previously computed, sections may need to be recomputed + +▓▓ : The previous extension + * Previously computed, sections may need to be recomputed + +▒▒ : The new extension + * Needs to be computed +-} +expandBandedMatrix + ∷ (Vector v e) + ⇒ e + -- ^ Gap state + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → MMatrix s Word + → MMatrix s Direction + → Word + → Word + → ST s () +expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir po co = updatedBand + where + (offset, costλ, rows, cols, width, qd) = ukkonenConstants tcmλ lesserLeft longerTop co + prevOffset = fromEnum po + + updatedBand = do + --------------------------------------- + -- Allocate mutable state variables -- + --------------------------------------- + + tailStart ← newSTRef cols + + t0' ← newSTRef (-1) + t1' ← newSTRef $ qd + fromEnum po + + --------------------------------------- + -- Define some generalized functions -- + --------------------------------------- + let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = + edgeCellDefinitions gap costλ longerTop mCost mDir + + let computeCell !leftElement !insertCost !i !j = + {-# SCC recomputeCell #-} + let !topElement = longerTop ! (j - 1) + !deleteCost = costλ gap topElement + !alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + leftCost ← readCost i $ j - 1 + oldCost ← readCost i j + let !e@(# c, _ #) = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) e + pure (c == oldCost, j + 1) + -- pure (c /= oldCost, j+1) + + let recomputeRange leftElement insertCost i x y = do + lastDiff ← newSTRef 0 + for_ [x .. y] $ \j → do + (same, _) ← computeCell leftElement insertCost i j + unless same $ writeSTRef lastDiff j + readSTRef lastDiff + + -- Define how to compute values to an entire row of the Ukkonen matrix. + let extendRow i = + -- Precopmute some values that will be used for the whole row + let start0 = max 0 $ i - offset + start3 = min cols $ i + width - offset - prevOffset - 1 + goUpTo = max 0 (i - prevOffset) - 1 + stop = min (cols - 1) $ i + width - offset - 1 + leftElement = lesserLeft ! (i - 1) + insertCost = costλ leftElement gap + firstCell + | i <= offset = leftColumn + | otherwise = leftBoundary + + lastCell + | i <= cols - qd - offset = rightBoundary + | otherwise = rightColumn + + b0 = start0 + e0 = goUpTo + b1 = start3 + e1 = stop + + continueRecomputing (same, j) = same || j >= stop + computeCell' ~(_, j) = computeCell leftElement insertCost i j + internalCell' j = internalCell leftElement insertCost i j + recomputeUntilSame j = snd <$> iterateUntilM continueRecomputing computeCell' (False, j) + in do + -- First, we fill in 0 or more cells of the left region of + -- the expanded band. This is the region [b0, e0] computed + -- above. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- b0 e0 + -- ┏━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + + -- Conditionally write to the first cell of the Ukkonen band + if i > prevOffset + then firstCell leftElement insertCost i b0 + else pure () + + for_ [b0 + 1 .. e0] internalCell' + + -- Next, we assign to s0 the value t0 from the previous row. + -- The cell t0 is up to where the values were recomputed in + -- the previous row. + -- We recompute the cells in the range [e0 + 1, s0]. + -- We assign to t0 the last cell in the range [s1, s2] which + -- was updated for the next row. + -- We remember cells t0 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- e0 s0 + -- ┏━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + -- + s0 ← (\x → min (x + 1) e1) <$> readSTRef t0' + writeSTRef t0' (-1) + + when (s0 > e0 && toEnum i > po) $ + recomputeRange leftElement insertCost i (e0 + 1) s0 >>= writeSTRef t0' + t0 ← readSTRef t0' + + -- If s0 = t0, we recompute the cell (s0 + 1). + -- If the cost is the same, we stop here and remember the cell + -- before we stopped. + -- If the cost is not the same, we update cell (s0 + 1) and + -- move on to (s0 + 2). + -- This procedure continues until (s0 + n) has the same cost + -- as before, or *until we reach b1.* + -- We remember the cell (s0 + n - 1) as t0 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s0 t0 + -- ╔═════╗ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + if s0 == t0 && s0 > 0 + then recomputeUntilSame (s0 + 1) >>= writeSTRef t0' . pred + else + if s0 <= e0 && e0 > 0 + then recomputeUntilSame (e0 + 1) >>= writeSTRef t0' . pred + else pure () + + -- Next, we assign to s1 the value t1 from the previous row. + -- We also assign s2 the value t2 from the previous row. + -- The range [t1, t2] is where the values were recomputed in + -- the previous row. + -- We recompute the cells in the range [s1, s2]. + -- We assign to t2 the last cell in the range [s1, s2] which + -- was updated for the next row. + -- We remember cells s1 as t1 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s1 s2 + -- ┏━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + -- NOPE, Try again + -- + -- Next, we assign to s1 the value t1 from the previous row. + -- If s1 is less than t0, we assign to s1 the value t0 + 1. + -- This ensures that we do not start "behind" where we have + -- previously computed. + -- Then if s1 is greater than e1, we assign to s1 the + -- value e1. This ensures one cell is always written to. + -- We recompute the cells in the range [s1, b1 - 1]. + -- If any cell in the range was updated, we assign to s1 to t1. + -- We remember cell t1 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s1 b1 + -- ┏━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + s1 ← do + a ← readSTRef t0' + b ← readSTRef t1' + pure . min e1 $ max a b + + t1 ← recomputeRange leftElement insertCost i s1 $ b1 - 1 + + -- If no cells were updated, a zero value is returned. + -- In this case, the "last" updated cell for the next row is b1. + writeSTRef t1' $ if t1 == 0 then b1 else s1 + + -- Lastly, we fill in 0 or more cells of the left region of + -- the expanded band. This is the region [b1, e1] computed + -- above. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- b1 e1 + -- ┏━━━━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + for_ [b1 .. e1 - 1] internalCell' + + -- Conditionally write to the last cell of the Ukkonen band + if i < rows - fromEnum po + then lastCell leftElement insertCost i stop + else pure () + + --------------------------------------- + -- Compute all values of the matrix -- + --------------------------------------- + + -- We start computation in the top row at the index equal to the + -- quasi-diagonal width plus the previous offset. This is because the last + -- cell of the top row which we computed was at the previous index since + -- we previous computed an number of cells from the quasi-diagonal equal to + -- the previous offset. + let topRowStart = qd + prevOffset + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the end of the matrix for the first & last + -- rows of the matrix. We subtract the 'offset' from the 'width' because + -- there are exactly 'offset' number of cells left of first matrix column. + -- Hence the top row's width is 'width' minus 'offset'. The last cell index + -- in the top row of the band is 'width' minus 'offset' minus 1. + let topRowWidth = width - offset + + -- Of course, we must be cetrain that we don't extend past the last column + -- of the matrix. To prevent this, we take the minimum of the top row width + -- and the number of columns. So the last index we will compute in the top + -- row is the minimum of the two options minus one due to zero-indexing. + let topRowCease = pred $ min cols topRowWidth + + -- Write out the left arrow value on the top row. + let topRowWrite !j !cost = write (0, j) (# cost, LeftArrow #) + + -- Extend the first row to seed subsequent rows. + for_ [topRowStart .. topRowCease] $ \j → + let !topElement = longerTop ! (j - 1) + !firstCellCost = costλ gap topElement + in do + !firstPrevCost ← readCost 0 $ j - 1 + topRowWrite j $ firstCellCost + firstPrevCost + + writeSTRef tailStart topRowStart + + -- Loop through the remaining rows. + for_ [1 .. rows - 1] extendRow + + +edgeCellDefinitions + ∷ ( PrimMonad m + , Vector v e + ) + ⇒ e + -- ^ Gap state + → (e → e → Word) + -- ^ Distance between states + → v e + -- ^ Longer dynamic character + → MMatrix (PrimState m) Word + → MMatrix (PrimState m) Direction + → ( Int → Int → m Word + , (Int, Int) → (# Word, Direction #) → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + ) +edgeCellDefinitions gap costλ longerTop mCost mDir = + (readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) + where + -- Read the cost of a cell + readCost = curry $ M.unsafeRead mCost + + -- Write to a single cell of the current vector and directional matrix simultaneously + write !p (# !c, !d #) = M.unsafeWrite mCost p c *> M.unsafeWrite mDir p d + + -- Write to an internal cell (not on a boundary) of the matrix. + internalCell !leftElement !insertCost !i !j = + {-# SCC internalCell_expanding #-} + let !topElement = longerTop ! (j - 1) + !deleteCost = costλ gap topElement + !alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + leftCost ← readCost i $ j - 1 + let v = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) v + + -- Define how to compute the first cell of the first "offset" rows. + -- We need to ensure that there are only Up Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and leftward values are "out of bounds." + leftColumn _leftElement !insertCost !i !j = + {-# SCC leftColumn #-} + do + firstPrevCost ← readCost (i - 1) j + write (i, j) (# insertCost + firstPrevCost, UpArrow #) + + -- Define how to compute the first cell of the remaining rows. + -- We need to ensure that there are no Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 2, + -- since the leftward values are "out of bounds." + -- Define how to compute the first cell of the remaining rows. + -- We need to ensure that there are no Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 2, + -- since the leftward values are "out of bounds." + leftBoundary !leftElement !insertCost !i !j = + {-# SCC leftBoundary #-} + let topElement = longerTop ! (j - 1) + alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + let v = + minimumCostDirection + maxBound + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) v + + -- Define how to compute the last cell of the first "rows - offset" rows. + -- We need to ensure that there are only Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and upward values are "out of bounds." + rightBoundary !leftElement _insertCost !i !j = + {-# SCC rightBoundary #-} + let topElement = longerTop ! (j - 1) + deleteCost = costλ gap topElement + alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + leftCost ← readCost i $ j - 1 + let v = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + maxBound + write (i, j) v + + rightColumn = {-# SCC rightColumn #-} internalCell + + +{- | +Produces a set of reusable values and functions which are "constant" between +different incarnations of the Ukkonen algorithms. +-} +ukkonenConstants + ∷ (Vector v e) + ⇒ TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → Word + -- ^ Current offset from quasi-diagonal + → (Int, e → e → Word, Int, Int, Int, Int) +ukkonenConstants tcmλ lesserLeft longerTop co = + (offset, costλ, rows, cols, width, quasiDiagonalWidth) + where + offset = clampOffset co + costλ x = snd . tcmλ x + longerLen = GV.length longerTop + lesserLen = GV.length lesserLeft + rows = GV.length lesserLeft + 1 + cols = GV.length longerTop + 1 + width = quasiDiagonalWidth + (offset `shiftL` 1) -- Multiply by 2 + quasiDiagonalWidth = differenceInLength + 1 + where + differenceInLength = longerLen - lesserLen + + -- Note: "offset" cannot cause "width + quasiDiagonalWidth" to exceed "2 * cols" + clampOffset o = + let o' = fromEnum o in min o' $ cols - quasiDiagonalWidth diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs new file mode 100644 index 000000000..084b3a77c --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs @@ -0,0 +1,382 @@ +----------------------------------------------------------------------------- +----------------------------------------------------------------------------- +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} +{-# OPTIONS_GHC -Wno-incomplete-patterns #-} +{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} + +{- | +Module : DirectOptimization.Pairwise.Visualization +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Visualization ( + Direction (), + + -- * Operational + directOptimizationDiffDirectionMatricies, + + -- * Rendering + renderMatrix, + renderDirectionMatrix, + diffDirectionMatrix, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Measure +import Data.Bits +import Data.Foldable (fold) +import Data.Matrix.Class (Matrix, dim, toLists, unsafeIndex) +import Data.Set (Set, fromDistinctAscList, member) +import Data.Vector.Generic (Vector, basicLength, toList) +import DirectOptimization.Pairwise.Direction + + +directOptimizationDiffDirectionMatricies + ∷ ( FiniteBits e + , Matrix m t Direction + , Ord (v e) + , Vector v e + ) + ⇒ (v e → v e → (Word, m t Direction)) + → (v e → v e → (Word, m t Direction)) + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → String +directOptimizationDiffDirectionMatricies matrixGenerator1 matrixGenerator2 lhs rhs = + let -- Remove gaps from the inputs and measure the results to determine + -- which ungapped character is longer and which is shorter. + -- Always pass the shorter character into alignment functions first! + ~(_, _, _, lesser, longer) = measureCharactersWithoutGaps lhs rhs + lesserMeds = extractMediansGapped lesser + longerMeds = extractMediansGapped longer + in case basicLength lesserMeds of + -- Neither character was Missing, but one or both are empty when gaps are removed + 0 → "One character was all gaps" + -- Both have some non-gap elements, perform string alignment + _ → + let dm1 = snd $ matrixGenerator1 lesserMeds longerMeds + dm2 = snd $ matrixGenerator2 lesserMeds longerMeds + in diffDirectionMatrix lesserMeds longerMeds dm1 dm2 + + +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. + +Useful for debugging purposes. +-} +renderMatrix + ∷ ( Matrix m t x + , Vector v e + , Show x + ) + ⇒ v e + -- ^ Shorter vector of elements + → v e + -- ^ Longer vector of elements + → m t x + -- ^ Matrix of cells + → String +renderMatrix lesser longer mtx = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows + ] + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = + getMatrixConstants lesser longer [mtx] + {- + toShownIntegers = fmap (show . showBitsValue) . otoList + + showBitsValue :: FiniteBits b => b -> Word + showBitsValue b = go (finiteBitSize b) 0 + where + go 0 v = v + go i v = let i' = i-1 + v' | b `testBit` i' = v + bit i' + | otherwise = v + in go i' v' + -} + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith renderRow ("*" : lesserTokens) matrixTokens + where + renderRow e cs = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = concatMap (pad maxColumnWidth) cs + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. + +Useful for debugging purposes. +-} +renderDirectionMatrix + ∷ ( Matrix m t Direction + , Vector v e + ) + ⇒ v e + → v e + → m t Direction + → String +renderDirectionMatrix lesser longer mtx = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows + ] + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = + getMatrixConstants lesser longer [mtx] + + tracebackCells = getTracebackIndices mtx + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith3 renderRow [0 ..] ("*" : lesserTokens) matrixTokens + where + renderRow i e cs = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = fold $ zipWith (renderCell i) [0 ..] cs + + renderCell i j = fmap f . pad maxColumnWidth + where + f + | (i, j) `elem` tracebackCells = boldDirection + | otherwise = id + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. + +Useful for debugging purposes. +-} +diffDirectionMatrix + ∷ ( Matrix m t Direction + , Vector v e + ) + ⇒ v e + → v e + → m t Direction + → m t Direction + → String +diffDirectionMatrix lesser longer mtx1 mtx2 = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows + ] + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [tokMtx1, tokMtx2]) = + getMatrixConstants lesser longer [mtx1, mtx2] + + tracebackCells1 = getTracebackIndices mtx1 + tracebackCells2 = getTracebackIndices mtx2 + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith3 renderRow ("*" : lesserTokens) strMtx1 strMtx2 + where + renderRow e xs ys = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = fold $ zipWith renderCell xs ys + + renderCell x y + | x' == y' = x + | otherwise = replicate (max (length x) (length y)) ' ' + where + x' = boldDirection <$> x + y' = boldDirection <$> y + + strMtx1 = tok2Str tracebackCells1 tokMtx1 + strMtx2 = tok2Str tracebackCells2 tokMtx2 + + tok2Str s = zipWith f [0 ..] + where + f i = zipWith (g i) [0 ..] + g i j = fmap h . pad maxColumnWidth + where + h + | (i, j) `member` s = boldDirection + | otherwise = id + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Get the indices of the traceback route. +-} +getTracebackIndices + ∷ (Matrix m t Direction) + ⇒ m t Direction + → Set (Int, Int) +getTracebackIndices mtx = fromDistinctAscList $ go (# m - 1, n - 1 #) + where + getDirection = curry $ unsafeIndex mtx + (m, n) = dim mtx + go (# i, j #) + | i < 0 || j < 0 = [] + | (i, j) == (0, 0) = [(0, 0)] + | otherwise = + (i, j) : case getDirection i j of + LeftArrow → go (# i, j - 1 #) + DiagArrow → go (# i - 1, j - 1 #) + UpArrow → go (# i - 1, j #) + + +characterVectorToIndices ∷ (Vector v e) ⇒ v e → [String] +characterVectorToIndices = + let numbers = tail $ pure <$> cycle ['0' .. '9'] + in zipWith const numbers . toList + + +tokenizeMatrix ∷ (Matrix m t x, Show x) ⇒ m t x → [[String]] +tokenizeMatrix = fmap (fmap show) . toLists + + +maxLengthOfGrid ∷ (Foldable g, Foldable r, Foldable f, Functor g, Functor r) ⇒ g (r (f a)) → Int +maxLengthOfGrid = maximum . fmap maxLengthOfRow + + +maxLengthOfRow ∷ (Foldable r, Foldable f, Functor r) ⇒ r (f a) → Int +maxLengthOfRow = maximum . fmap length + + +getMatrixConstants + ∷ ( Matrix m t x + , Show x + , Vector v e + ) + ⇒ v e + → v e + → [m t x] + → (Int, Int, [String], [String], Int, Int, [[[String]]]) +getMatrixConstants lesser longer matrices = + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, matrixTokens) + where + colCount = basicLength longer + 1 + rowCount = basicLength lesser + 1 + lesserTokens = characterVectorToIndices lesser + longerTokens = characterVectorToIndices longer + maxPrefixWidth = maxLengthOfRow lesserTokens + maxHeaderWidth = maxLengthOfRow longerTokens + + matrixTokens = tokenizeMatrix <$> matrices + maxColumnWidth = + maximum $ + maxHeaderWidth : (maxLengthOfGrid <$> matrixTokens) diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs new file mode 100644 index 000000000..3da18b415 --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs @@ -0,0 +1,18 @@ +module DirectOptimization.Pairwise.Wide ( + WideDynamicCharacter, + WideState, + widePairwiseDO, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (WideState) +import DirectOptimization.Pairwise.Ukkonen + + +widePairwiseDO + ∷ Word + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) +widePairwiseDO = ukkonenDO diff --git a/lib/dynamic-character/src/DirectOptimization/PreOrder.hs b/lib/dynamic-character/src/DirectOptimization/PreOrder.hs new file mode 100644 index 000000000..8779026fd --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/PreOrder.hs @@ -0,0 +1,102 @@ +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# OPTIONS_GHC -foptimal-applicative-do #-} + +{- HLINT ignore "Redundant pure" -} + +module DirectOptimization.PreOrder ( + preOrderLogic, +) where + +import Bio.DynamicCharacter +import Control.Monad +import Control.Monad.ST +import Data.Bits +import Data.STRef +import Data.Vector.Generic (Vector) + + +{- | +Faithful translation of Algorithm 8 (Non-root Node Alignment) from the +"Efficient Implied Alignment" paper found at: + +-} +{-# INLINEABLE preOrderLogic #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → SlimDynamicCharacter → SlimDynamicCharacter → SlimDynamicCharacter → SlimDynamicCharacter + #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → WideDynamicCharacter → WideDynamicCharacter → WideDynamicCharacter → WideDynamicCharacter + #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → HugeDynamicCharacter → HugeDynamicCharacter → HugeDynamicCharacter → HugeDynamicCharacter + #-} +preOrderLogic + ∷ ∀ a v + . ( FiniteBits a + , Vector v a + ) + ⇒ Bool + → OpenDynamicCharacter v a + -- ^ Parent Final Alignment + → OpenDynamicCharacter v a + -- ^ Parent Preliminary Context + → OpenDynamicCharacter v a + -- ^ Child Preliminary Context + → OpenDynamicCharacter v a + -- ^ Child Final Alignment +preOrderLogic isLeftChild pAlignment pContext cContext = + let pAlignment' = forceDynamicCharacter pAlignment + pContext' = forceDynamicCharacter pContext + cContext' = forceDynamicCharacter cContext + + caLen = characterLength pAlignment' + ccLen = fromEnum $ characterLength cContext' + range = [0 .. fromEnum caLen - 1] + + -- Constructor definition supplied to 'unsafeCharacterBuiltByST'. + builder ∷ ∀ s. TempOpenDynamicCharacter (ST s) v a → ST s () + builder char = pokeTempChar char *> fillTempChar char + + -- We set an arbitrary element from the parent alignment context + -- to the first element of the temporary character. This ensures + -- that at least one element can be querried for the determining + -- "nil" and "gap" values. + pokeTempChar ∷ ∀ s. TempOpenDynamicCharacter (ST s) v a → ST s () + pokeTempChar char = setFrom pAlignment' char 0 0 + + -- Select character building function + fillTempChar ∷ ∀ s. TempOpenDynamicCharacter (ST s) v a → ST s () + fillTempChar + | isMissing cContext' = missingλ -- Missing case is all gaps + | otherwise = alignmentλ -- Standard pre-order logic + + -- Construct a missing character value of appropriate length. + missingλ ∷ ∀ s. TempOpenDynamicCharacter (ST s) v a → ST s () + missingλ char = + forM_ range (char `setGapped`) + + -- Construct alignment derived from parent pre-order and self post-order. + alignmentλ ∷ ∀ s. TempOpenDynamicCharacter (ST s) v a → ST s () + alignmentλ char = do + j' ← newSTRef 0 + k' ← newSTRef 0 + forM_ range $ \i → do + k ← readSTRef k' + if k > ccLen || pAlignment' `isGapped` i + then char `setGapped` i + else do + j ← readSTRef j' + modifySTRef j' succ + -- Remember that 'Delete' leaves 'voids' in the 'left' character. + if pAlignment' `isAlign` i + || (not isLeftChild && pAlignment' `isDelete` i && pContext' `isDelete` j) + || (isLeftChild && pAlignment' `isInsert` i && pContext' `isInsert` j) + then modifySTRef k' succ *> setFrom cContext' char k i + else char `setGapped` i + pure () -- For ApplicativeDo + pure () -- For ApplicativeDo + in forceDynamicCharacter $ unsafeCharacterBuiltByST caLen builder diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs new file mode 100644 index 000000000..69c2f5d8b --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs @@ -0,0 +1,26 @@ +module DirectOptimization.DOHuge where + +import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Huge +import Data.BitVector.LittleEndian (BitVector, dimension) +import Data.Bits +import Data.Foldable +import Data.MetricRepresentation +import Data.Vector (Vector, head) +import Prelude hiding (head) + + +wrapperHugeDO + :: Vector BitVector + -> Vector BitVector + -> MetricRepresentation BitVector + -> (Vector BitVector, Int) +wrapperHugeDO lhs rhs tcmMemo = (medians, fromEnum cost) + where + (cost, (medians, _, _)) = hugePairwiseDO (minInDelCost tcmMemo) gapState (retreivePairwiseTCM tcmMemo) (lhs, lhs, lhs) (rhs, rhs, rhs) + + gapState = bit . fromEnum $ n - 1 + n = case length lhs of + 0 -> case length rhs of + 0 -> 64 + _ -> dimension $ head rhs + _ -> dimension $ head lhs diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs new file mode 100644 index 000000000..7eed74a3a --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs @@ -0,0 +1,70 @@ +module DirectOptimization.DOSlim where + +import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Slim +import Data.BitVector.LittleEndian (BitVector) +import qualified Data.BitVector.LittleEndian as BV +import Data.Bits +import Data.TCM.Dense +import Data.Vector (Vector, (!)) +import qualified Data.Vector as V +import qualified Data.Vector.Storable as SV +import Foreign.C.Types (CUInt) + + +wrapperSlimDO :: Vector BitVector -> Vector BitVector -> DenseTransitionCostMatrix -> (Vector BitVector, Int) +-- wrapperPCG_DO_FFI lhs rhs tcm | trace (show tcm) False= undefined +wrapperSlimDO lhs rhs tcmDense = (resultMedians, fromEnum resultCost) + where + (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC + + lhsDC = bvs2SlimDC lhs + rhsDC = bvs2SlimDC rhs + + +{-} +tcmDense = generateDenseTransitionCostMatrix 0 5 getCost +getCost i j = + let x = SM.getFullVects tcm + in toEnum $ (x ! fromEnum i) ! fromEnum j + -} + +{- +wrapperSlimDO :: Vector BitVector -> Vector BitVector -> Vector (Vector Int) -> (Vector BitVector, Int) +-- wrapperPCG_DO_FFI lhs rhs tcm | trace (show tcm) False= undefined +wrapperSlimDO lhs rhs tcm = (resultMedians, fromEnum resultCost) + where + (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC + + lhsDC = bvs2SlimDC lhs + rhsDC = bvs2SlimDC rhs + tcmDense = generateDenseTransitionCostMatrix 0 5 getCost + getCost i j = + let x = SM.getFullVects tcm + in toEnum $ (x ! fromEnum i) ! fromEnum j + +-} +-- specializedAlphabetToDNA :: Alphabet String +-- specializedAlphabetToDNA = fromSymbols $ show <$> (0 :: Word) :| [1 .. 4] + +bvs2SlimDC :: V.Vector BitVector -> SlimDynamicCharacter +bvs2SlimDC v = (x, x, x) + where + x = SV.generate (V.length v) $ \i -> bv2w (v ! i) + + bv2w :: BitVector -> CUInt + bv2w bv = + let f i a + | bv `testBit` i = a `setBit` i + | otherwise = a + in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] + + +slimDC2BVs :: Int -> SlimDynamicCharacter -> V.Vector BitVector +slimDC2BVs n (x, _, _) = V.generate (SV.length x) $ \i -> w2bv (x SV.! i) + where + w2bv :: CUInt -> BitVector + w2bv w = + let f i a + | w `testBit` i = a `setBit` i + | otherwise = a + in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1] diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs new file mode 100644 index 000000000..3c818bfdc --- /dev/null +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs @@ -0,0 +1,62 @@ +module DirectOptimization.DOWide where + +import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Wide +-- import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Ukkonen2 +import Control.Arrow (first) +import Data.BitVector.LittleEndian (BitVector) +import qualified Data.BitVector.LittleEndian as BV +import Data.Bits +import Data.MetricRepresentation +import Data.Vector (Vector, (!)) +import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as UV +import Data.Word + + +wrapperWideDO + :: Vector BitVector + -> Vector BitVector + -> MetricRepresentation BitVector -- Word64 + -> (Vector BitVector, Int) +wrapperWideDO lhs rhs metric = (wideDC2BVs (fromIntegral n) resultMedians, fromEnum resultCost) + where + (resultCost, resultMedians) = widePairwiseDO (minInDelCost metric) gap tcm lhsDC rhsDC + -- (resultCost, resultMedians) = ukkonenDO (minInDelCost metric) gap tcm lhsDC rhsDC + + gap = bit . fromEnum $ n - 1 + + tcm :: Word64 -> Word64 -> (Word64, Word) + tcm x y = first BV.toUnsignedNumber (retreivePairwiseTCM metric) (BV.fromNumber 64 x) (BV.fromNumber 64 y) + + n = case length lhs of + 0 -> case length rhs of + 0 -> 64 + _ -> BV.dimension $ V.head rhs + _ -> BV.dimension $ V.head lhs + + lhsDC = bvs2WideDC lhs + rhsDC = bvs2WideDC rhs + + +bvs2WideDC :: V.Vector BitVector -> WideDynamicCharacter +bvs2WideDC v = (x, x, x) + where + x = UV.generate (V.length v) $ \i -> bv2w (v ! i) + + bv2w :: BitVector -> Word64 + bv2w bv = + let f i a + | bv `testBit` i = a `setBit` i + | otherwise = a + in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] + + +wideDC2BVs :: Int -> WideDynamicCharacter -> V.Vector BitVector +wideDC2BVs n (x, _, _) = V.generate (UV.length x) $ \i -> w2bv (x UV.! i) + where + w2bv :: Word64 -> BitVector + w2bv w = + let f i a + | w `testBit` i = a `setBit` i + | otherwise = a + in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1] diff --git a/pkg/PhyGraph/lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs b/lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs similarity index 99% rename from pkg/PhyGraph/lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs rename to lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs index cb45afd8a..1324695b7 100644 --- a/pkg/PhyGraph/lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs +++ b/lib/dynamic-character/test/DirectOptimization/Pairwise/Test.hs @@ -508,7 +508,7 @@ checkAlignmentResults (testLabel, metric) (DNA lhs :×: DNA rhs) = case values of [] -> z x:xs -> counterexample errMsg $ - foldr (\e a -> (e==x) .&&. a) z xs + foldr (\e a -> (e == x) .&&. a) z xs where z = property True p = intercalate "\n" . fmap (" " <>) . lines @@ -520,7 +520,7 @@ checkAlignmentResults (testLabel, metric) (DNA lhs :×: DNA rhs) = [] -> Nothing [_] -> Nothing (x,_):_ -> - let matchModeVal = (==x) . snd + let matchModeVal = (== x) . snd (same, diff) = partition matchModeVal $ toList input in Just $ unlines [ "" diff --git a/pkg/PhyGraph/lib/dynamic-character/test/Inspect.hs b/lib/dynamic-character/test/Inspect.hs similarity index 100% rename from pkg/PhyGraph/lib/dynamic-character/test/Inspect.hs rename to lib/dynamic-character/test/Inspect.hs diff --git a/pkg/PhyGraph/lib/dynamic-character/test/Test/Aligners.hs b/lib/dynamic-character/test/Test/Aligners.hs similarity index 90% rename from pkg/PhyGraph/lib/dynamic-character/test/Test/Aligners.hs rename to lib/dynamic-character/test/Test/Aligners.hs index 3619d7401..578ae8a04 100644 --- a/pkg/PhyGraph/lib/dynamic-character/test/Test/Aligners.hs +++ b/lib/dynamic-character/test/Test/Aligners.hs @@ -26,10 +26,10 @@ import Data.MetricRepresentation import Data.TCM (fromList) import Data.TCM.Dense import Data.Word +import Foreign.C.Types (CUInt) import DirectOptimization.Pairwise import DirectOptimization.Pairwise.Swapping import DirectOptimization.Pairwise.Ukkonen -import Foreign.C.Types (CUInt(..)) import Test.QuickCheck.Instances.DynamicCharacter @@ -65,23 +65,28 @@ metricRepresentationToDenseTCM = translateSlimStateTCM :: MetricRepresentation Word32 -> SlimState -> SlimState -> (SlimState, Word) translateSlimStateTCM m = - let tcm = retreivePairwiseTCM m - c2w = coerce :: SlimState -> Word32 - w2c = coerce :: Word32 -> SlimState - in \x y -> first w2c $ tcm (c2w x) (c2w y) + let tcm = retreivePairwiseTCM m + c2w = (toEnum . fromEnum) :: SlimState -> Word32 + w2c = (toEnum . fromEnum) :: Word32 -> SlimState + in \x y -> first w2c $ tcm (c2w x) (c2w y) metricChoices :: [(String, MetricRepresentation Word32)] metricChoices = [ ("Discrete Metric", discreteMetric) , ("1st Linear Norm", linearNorm len) +{- , ("Sub-InDel (1:2)", subInDel 1 2 ) , ("Sub-InDel (2:1)", subInDel 2 1 ) +-} ] where len = toEnum $ finiteBitSize nucleotideGap +-- Comment out the monadic code +{- subInDel :: Word -> Word -> MetricRepresentation Word32 subInDel x g = metricRepresentation . snd . fromList $ let indices = [ 0 .. length nucleotideAlphabet - 1 ] in [ if i == j then 0 else if i == 0 || j == 0 then g else x | i <- indices, j <- indices ] +-} diff --git a/pkg/PhyGraph/lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs b/lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs similarity index 99% rename from pkg/PhyGraph/lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs rename to lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs index 531284755..922608a3a 100644 --- a/pkg/PhyGraph/lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs +++ b/lib/dynamic-character/test/Test/QuickCheck/Instances/DynamicCharacter.hs @@ -28,6 +28,7 @@ module Test.QuickCheck.Instances.DynamicCharacter ) where import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element.QuickCheck import Control.Applicative import Data.Alphabet import Data.Alphabet.Codec @@ -45,7 +46,6 @@ import qualified Data.Vector as V import Data.Vector.Generic (Vector) import qualified Data.Vector.Generic as GV import qualified Data.Vector.Storable as SV -import Foreign.C.Types (CUInt(..)) import Test.Tasty.QuickCheck hiding ((.&.)) @@ -404,7 +404,7 @@ sizeOfDNA (DNA (_,mc,_)) = GV.length mc nucleotideAlphabet :: Alphabet String -nucleotideAlphabet = fromSymbols $ pure <$> "ACGT" +nucleotideAlphabet = fromSymbols . NE.fromList $ pure <$> "-ACGT" nucleotideGap :: Nucleotide diff --git a/lib/dynamic-character/test/TestSuite.hs b/lib/dynamic-character/test/TestSuite.hs new file mode 100644 index 000000000..1065ec47b --- /dev/null +++ b/lib/dynamic-character/test/TestSuite.hs @@ -0,0 +1,26 @@ +{-# LANGUAGE ImportQualifiedPost #-} + +{- | +Test-suite for the dynamic characters. +-} +module Main ( + main, +) where + +import DirectOptimization.Pairwise.Test qualified as Pairwise +import Test.Tasty + + +{- | +Entry point for the test-suite of the "dynamic-character" library. +-} +main ∷ IO () +main = defaultMain testSuite + + +testSuite ∷ TestTree +testSuite = + testGroup + "Dynamic Character Test-Suite" + [ Pairwise.testSuite + ] diff --git a/pkg/PhyGraph/lib/exportable/src/Bio/Character/Exportable.hs b/lib/exportable/src/Bio/Character/Exportable.hs similarity index 100% rename from pkg/PhyGraph/lib/exportable/src/Bio/Character/Exportable.hs rename to lib/exportable/src/Bio/Character/Exportable.hs diff --git a/pkg/PhyGraph/lib/exportable/src/Bio/Character/Exportable/Class.hs b/lib/exportable/src/Bio/Character/Exportable/Class.hs similarity index 100% rename from pkg/PhyGraph/lib/exportable/src/Bio/Character/Exportable/Class.hs rename to lib/exportable/src/Bio/Character/Exportable/Class.hs diff --git a/pkg/PhyGraph/lib/tcm-memo/Main.hs b/lib/tcm-memo/Main.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/Main.hs rename to lib/tcm-memo/Main.hs diff --git a/pkg/PhyGraph/lib/tcm-memo/README.md b/lib/tcm-memo/README.md similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/README.md rename to lib/tcm-memo/README.md diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/3d tcm notes.md b/lib/tcm-memo/ffi/memoized-tcm/3d tcm notes.md similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/3d tcm notes.md rename to lib/tcm-memo/ffi/memoized-tcm/3d tcm notes.md diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.c b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.c rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.h b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.h similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.h rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper.h diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.c b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.c rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.h b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.h similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.h rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_2d.h diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.c b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.c rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.h b/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.h similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.h rename to lib/tcm-memo/ffi/memoized-tcm/costMatrixWrapper_3d.h diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.cpp b/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.cpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.cpp rename to lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.cpp diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.hpp b/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.hpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.hpp rename to lib/tcm-memo/ffi/memoized-tcm/costMatrix_2d.hpp diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.cpp b/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.cpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.cpp rename to lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.cpp diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.hpp b/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.hpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.hpp rename to lib/tcm-memo/ffi/memoized-tcm/costMatrix_3d.hpp diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.c b/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.c rename to lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.h b/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.h similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.h rename to lib/tcm-memo/ffi/memoized-tcm/dynamicCharacterOperations.h diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/makefile b/lib/tcm-memo/ffi/memoized-tcm/makefile similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/makefile rename to lib/tcm-memo/ffi/memoized-tcm/makefile diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.c b/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.c rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.h b/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.h similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.h rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/bitArrayExampleC.h diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/makefile b/lib/tcm-memo/ffi/memoized-tcm/test-suite/makefile similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/makefile rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/makefile diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/printMatrix.py b/lib/tcm-memo/ffi/memoized-tcm/test-suite/printMatrix.py similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/printMatrix.py rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/printMatrix.py diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface.c b/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface.c rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface_longer.c b/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface_longer.c similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface_longer.c rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/test_c_interface_longer.c diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp b/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_2d.cpp diff --git a/pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp b/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp rename to lib/tcm-memo/ffi/memoized-tcm/test-suite/test_cost_matrix_3d.cpp diff --git a/pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized.hs b/lib/tcm-memo/src/Data/TCM/Memoized.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized.hs rename to lib/tcm-memo/src/Data/TCM/Memoized.hs diff --git a/pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized/FFI.hsc b/lib/tcm-memo/src/Data/TCM/Memoized/FFI.hsc similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized/FFI.hsc rename to lib/tcm-memo/src/Data/TCM/Memoized/FFI.hsc diff --git a/pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized/Types.hsc b/lib/tcm-memo/src/Data/TCM/Memoized/Types.hsc similarity index 100% rename from pkg/PhyGraph/lib/tcm-memo/src/Data/TCM/Memoized/Types.hsc rename to lib/tcm-memo/src/Data/TCM/Memoized/Types.hsc diff --git a/lib/tcm/bench/Benchmark/Internal.hs b/lib/tcm/bench/Benchmark/Internal.hs new file mode 100644 index 000000000..326b5d9ec --- /dev/null +++ b/lib/tcm/bench/Benchmark/Internal.hs @@ -0,0 +1,114 @@ +module Benchmark.Internal ( + makeMemoizedHashTableIO, + measureMemoizedHashTable, + measureMemoizedHashTableParallel, +) where + +import Control.Concurrent.Async (mapConcurrently) +import Control.Exception +import Control.DeepSeq +import Control.Monad.State.Strict +import Criterion.Main +import Data.Bits ((.&.)) +import Data.Bifunctor (first) +import Data.TCM.Overlap (overlap3) +import System.Random (StdGen, genWord32R, mkStdGen) +import Data.Vector (Vector) +import Data.Vector qualified as V +import Data.Word +import GHC.IO (evaluate) + +type HashKey = Word32 + + +type HashVal = (Word8, Word) + + +maxKey ∷ HashKey +maxKey = pred $ 2 ^ 24 + + +randomKey ∷ State StdGen HashKey +randomKey = state $ first succ . genWord32R (maxKey - 1) + + +measureKey ∷ HashKey → HashVal +measureKey key = + let x = α 0xFF0000 + y = α 0x00FF00 + z = α 0x0000FF + α = fromIntegral . (key .&.) + σ :: Word -> Word -> Word + σ p q = max p q - min p q + in overlap3 8 σ x y z + + +seed :: Int +seed = 1029110096060097500 + + +createHashTable ∷ Word → Word → ((HashKey → HashVal) → HashKey → HashVal) → (Vector HashKey, HashKey → HashVal) +createHashTable valueCount queryCount memoizer = + let memo = memoizer measureKey + rgen = mkStdGen seed + note = V.replicateM (fromEnum queryCount) (randomKey) + load = V.replicateM (fromEnum valueCount) (memo <$> randomKey) + (keys, vals) = liftA2 (,) note load `evalState` rgen + in (keys, force vals `seq` memo) + + +createHashTableFour ∷ Word → Word → ((HashKey → HashVal) → HashKey → HashVal) → (Vector HashKey, Vector HashKey, Vector HashKey, Vector HashKey, HashKey → HashVal) +createHashTableFour valueCount queryCount memoizer = + let memo = memoizer measureKey + rgen = mkStdGen seed + key1 = V.replicateM (fromEnum queryCount) (randomKey) + key2 = V.replicateM (fromEnum queryCount) (randomKey) + key3 = V.replicateM (fromEnum queryCount) (randomKey) + key4 = V.replicateM (fromEnum queryCount) (randomKey) + load = V.replicateM (fromEnum valueCount) (memo <$> randomKey) + comp = (,,,,) <$> key1 <*> key2 <*> key3 <*> key4 <*> load + vals :: Vector HashVal -> HashKey -> HashVal + vals x = force x `seq` memo + in vals <$> comp `evalState` rgen + + +{- | +Abstract utility function for measuring the run time of the supplied +file parser on the specified input file. +-} +measureMemoizedHashTable + ∷ Word + → Word + → ((HashKey → HashVal) → HashKey → HashVal) + → String + → Benchmark +measureMemoizedHashTable values queries memoizer name = + let (keys, memo) = createHashTable values queries memoizer + in bench name $ nf (fmap memo) keys + + +makeMemoizedHashTableIO + ∷ Word → Word → ((HashKey → HashVal) → IO (HashKey → HashVal)) → IO (Vector HashKey, HashKey → HashVal) +makeMemoizedHashTableIO valueCount queryCount memoizer = + let rgen = mkStdGen seed + note = V.replicateM (fromEnum queryCount) (randomKey) + in do memo <- memoizer measureKey + let load = V.replicateM (fromEnum valueCount) (memo <$> randomKey) + let (keys, vals) = liftA2 (,) note load `evalState` rgen + pure (keys, force vals `seq` memo) + + +{- | +Abstract utility function for measuring the run time of the supplied +file parser on the specified input file. +-} +measureMemoizedHashTableParallel + ∷ Word + → Word + → ((HashKey → HashVal) → HashKey → HashVal) + → String + → Benchmark +measureMemoizedHashTableParallel values queries memoizer name = + let (keys1, keys2, keys3, keys4, memo) = createHashTableFour values queries memoizer + evaluator = mapConcurrently $ \xs -> onException (evaluate . force $ memo <$> xs) (evaluate 0) + in bench name $ nfAppIO evaluator [ keys1, keys2, keys3, keys4 ] diff --git a/lib/tcm/bench/Time.hs b/lib/tcm/bench/Time.hs new file mode 100644 index 000000000..eaaea99af --- /dev/null +++ b/lib/tcm/bench/Time.hs @@ -0,0 +1,59 @@ +{- | +Benchmarks for different memoized hashtable implmentations. +-} +module Main (main) where + + +import Benchmark.Internal +import Criterion.Main +import Data.Hashable.Memoize.ViaConcurrentHashtable qualified as Conc +--import Data.Hashable.Memoize.ViaConcurrentHashtableOpt qualified as OptQ +import Data.Hashable.Memoize.ViaIORef qualified as IORf +import Data.Hashable.Memoize.ViaManualLock qualified as Lock +import Data.Hashable.Memoize.ViaReadWriteLock qualified as RWLk +import Data.Hashable.Memoize.ViaSemaphore qualified as Semp +import Data.Hashable.Memoize.ViaTVar qualified as TVar +import Data.Word + + +-- | +-- Entry point for the run time performance benchmark suite /all/ the file parsers. +main :: IO () +main = defaultMain + [ benchAsSequential + , benchAsParallel + ] + + + +benchAsSequential :: Benchmark +benchAsSequential = + let basicMeasure :: ((Word32 -> (Word8, Word)) -> Word32 -> (Word8, Word)) -> String -> Benchmark + basicMeasure = measureMemoizedHashTable (2 ^ 10) (2 ^ 20) + basicMeasureIO :: ((Word32 -> (Word8, Word)) -> IO (Word32 -> (Word8, Word))) -> String -> Benchmark + basicMeasureIO f name = bench name $ + perRunEnv (makeMemoizedHashTableIO (2 ^ 10) (2 ^ 20) f) $ \(keys, memo) -> + pure $ memo <$> keys + in bgroup "Sequential" + [ basicMeasure Lock.memoize "Custom locking definition" +-- , basicMeasure IORf.memoize "Manual access through - IORef" +-- , basicMeasure TVar.memoize "Manual access through - TVar" +-- , basicMeasure Semp.memoize "Manual access through - Semaphore" + , basicMeasureIO Conc.memoize "Package `concurrent-hashtables` Hash-table" +-- , basicMeasure OptQ.memoize "Package `concurrent-hashtables-opt` Hash-table" +-- , basicMeasure RWLk.memoize "Package `concurrent-extra` Read/Write Lock" + ] + + +benchAsParallel :: Benchmark +benchAsParallel = + let basicMeasure :: ((Word32 -> (Word8, Word)) -> Word32 -> (Word8, Word)) -> String -> Benchmark + basicMeasure = measureMemoizedHashTableParallel (2 ^ 10) (2 ^ 20) + in bgroup "Parallel" + [ basicMeasure RWLk.memoize "Package `concurrent-extra` Read/Write Lock" + -- , basicMeasure Conc.memoize "Package `concurrent-hashtables` Hash-table" +-- , basicMeasure TVar.memoize "Manual access through - TVar" +-- , basicMeasure IORf.memoize "Manual access through - IORef" + , basicMeasure Lock.memoize "Custom locking definition" + , basicMeasure Semp.memoize "Manual access through - Semaphore" + ] diff --git a/lib/tcm/src/Data/Hashable/Memoize.hs b/lib/tcm/src/Data/Hashable/Memoize.hs new file mode 100644 index 000000000..675494f74 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize.hs @@ -0,0 +1,119 @@ +{-# LANGUAGE CPP #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize ( + memoize, + memoize2, + memoize3, +) where + +import Control.DeepSeq (NFData) +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Hashable (Hashable) +#if defined (Memoize_Via_ConcurrentHashtable) +import Data.Hashable.Memoize.ViaConcurrentHashtable qualified as Memo (memoize) +#elif defined (Memoize_Via_IORef) +import Data.Hashable.Memoize.ViaIORef qualified as Memo (memoize) +#elif defined (Memoize_Via_ManualLock) +import Data.Hashable.Memoize.ViaManualLock qualified as Memo (memoize) +#elif defined (Memoize_Via_ReadWriteLock) +import Data.Hashable.Memoize.ViaReadWriteLock qualified as Memo (memoize) +#elif defined (Memoize_Via_Semaphore) +import Data.Hashable.Memoize.ViaSemaphore qualified as Memo (memoize) +#elif defined (Memoize_Via_TVar) +import Data.Hashable.Memoize.ViaTVar qualified as Memo (memoize) +#endif + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b m. (Hashable a, MonadIO m, NFData b) ⇒ (a → b) → m (a → b) +memoize = Memo.memoize + + +{- | +A memoizing combinator similar to 'memoize' except that it that acts on a +function of two inputs rather than one. +-} +{-# NOINLINE memoize2 #-} +memoize2 ∷ (Hashable a, Hashable b, MonadIO m, NFData c) ⇒ (a → b → c) → m (a → b → c) +memoize2 f = + let f' = memoize (uncurry f) + in curry <$> f' + + +{- | +A memoizing combinator similar to 'memoize' except that it that acts on a +function of two inputs rather than one. +-} +{-# NOINLINE memoize3 #-} +memoize3 + ∷ ( Hashable a + , Hashable b + , Hashable c + , MonadIO m + , NFData d + ) + ⇒ (a → b → c → d) + → m (a → b → c → d) +memoize3 f = + let curry3 ∷ ((a, b, c) → t) → a → b → c → t + curry3 g x y z = g (x, y, z) + + uncurry3 ∷ (t1 → t2 → t3 → t4) → (t1, t2, t3) → t4 + uncurry3 g (x, y, z) = g x y z + + f' = memoize (uncurry3 f) + in curry3 <$> f' + +{- +-- These are included for haddock generation +fib 0 = 0 +fib 1 = 1 +fib x = fib (x-1) + fib (x-2) + +fibM :: Integer -> Integer +fibM = f + where + f 0 = 0 + f 1 = 1 + f x = g (x-1) + g (x-2) + g = memoize f +-} diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtable.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtable.hs new file mode 100644 index 000000000..42d93061a --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtable.hs @@ -0,0 +1,57 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaConcurrentHashtable ( + memoize, +) where + +import Control.DeepSeq +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Functor (($>)) +import Data.HashTable +import Data.Hashable +import Data.IORef +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +The initalization size for a new memoized TCM. +-} +initializationSize ∷ Int +initializationSize = 2 ^ (10 ∷ Word) -- Made this smaller to reduce startup time. +--initializationSize = 2 ^ (16 ∷ Word) + + +{- | +/O(1)/ +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b m. (Hashable a, MonadIO m, NFData b) ⇒ (a → b) → m (a → b) +memoize f = liftIO $ do + + !htRef ← (newWithDefaults initializationSize ∷ IO (HashTable a b)) >>= newIORef + pure $ \k → unsafeDupablePerformIO $ do + ht ← readIORef htRef + result ← ht `lookup` k + -- Here we check if the memoized value exists + case result of + -- If the value exists return it + Just v → pure v + -- If the value doesn't exist: + Nothing → + -- Perform the expensive calculation to determine the value + -- associated with the key, fully evaluated. + let v = force $ f k + in -- we want to perform the following modification atomically. + -- Insert the key-value pair into the HashTable + -- Then we return the value associated with the key + insert ht k v $> v diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtableOpt.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtableOpt.hs new file mode 100644 index 000000000..053c81075 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaConcurrentHashtableOpt.hs @@ -0,0 +1,61 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaConcurrentHashtableOpt ( + memoize, +) where + +import Control.DeepSeq +import PHANE.Memoized +import Data.Hashable +import Data.IORef +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + !htRef ← (newWithDefaults initialSize ∷ IO (HashTable a b)) >>= newIORef + pure $ \k → unsafePerformIO $ readIORef htRef >>= \ht -> query ht k f diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaIORef.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaIORef.hs new file mode 100644 index 000000000..e43f8e533 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaIORef.hs @@ -0,0 +1,80 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaIORef ( + memoize, +) where + +import Control.DeepSeq +import Data.HashTable.IO +import Data.Hashable +import Data.IORef +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + + !htRef ← (newSized initialSize ∷ IO (BasicHashTable a b)) >>= newIORef + pure $ \k → unsafeDupablePerformIO $ do + ht ← readIORef htRef + result ← ht `lookup` k + -- Here we check if the memoized value exists + case result of + -- If the value exists return it + Just v → pure v + -- If the value doesn't exist: + Nothing → + -- Perform the expensive calculation to determine the value + -- associated with the key, fully evaluated. + let v = force $ f k + in -- we want to perform the following modification atomically. + do + insert ht k v -- Insert the key-value pair into the HashTable + writeIORef htRef ht -- Place the updated hashtable back in the IO-Ref + -- After performing the update side effects, + -- we return the value associated with the key + pure v diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaManualLock.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaManualLock.hs new file mode 100644 index 000000000..cdc395729 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaManualLock.hs @@ -0,0 +1,194 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE RoleAnnotations #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StrictData #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaManualLock ( + memoize, +) where + +import Control.Concurrent.STM +import Control.DeepSeq +import Control.Exception (bracket, bracket_) +import Data.Functor (($>)) +import Data.HashTable.IO +import Data.Hashable +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + -- let initialSize = 2 ^ (3 :: Word) + + -- Create a TVar which holds the HashTable + !tabRef ← newHashTableAccess initialSize + -- This is the returned closure of a memozized f + -- The closure captures the "mutable" reference to the hashtable above + -- through the TVar. + -- + -- Once the mutable hashtable reference is escaped from the IO monad, + -- this creates a new memoized reference to f. + -- The technique should be safe for all pure functions, probably, I think. + pure $ \k → unsafeDupablePerformIO $ do + readHashTableAccess tabRef k + >>= {-# SCC memoize_Lock_CASE #-} + \case + Just v → {-# SCC memoize_Lock_GET #-} pure v + Nothing → + {-# SCC memoize_Lock_PUT #-} + let v = force $ f k + in -- in (takeHashTableAccess tabRef >>= giveHashTableAccess tabRef k v) $> v + updateHashTableAccess tabRef k v $> v + + +-- in (takeHashTableAccess tabRef >>= giveHashTableAccess_old tabRef k v) $> v + +{- +-=-=-=-=-=-=-=- + T O D O +-=-=-=-=-=-=-=- +Consider another implementation which only locks on *resize,* +permitting truly concurrent reads and writes in all but an infintesimal number of cases. +-} +data HashTableAccess k v + = Access + {-# UNPACK #-} Bool + -- ^ Can Read the hash table? + {-# UNPACK #-} (BasicHashTable k v) + -- ^ Hash table reference + + +data Lockbox k v = Lockbox + { readQueue ∷ TQueue () + , lockToken ∷ TMVar () + , memoTable ∷ TVar (HashTableAccess k v) + } + + +type role HashTableAccess representational representational + + +type role Lockbox representational representational + + +{-# NOINLINE newHashTableAccess #-} +newHashTableAccess ∷ Int → IO (Lockbox k v) +newHashTableAccess size = do + token ← newTMVarIO () + table ← newTVarIO =<< (Access True <$> (newSized size ∷ IO (BasicHashTable a b))) + queue ← newTQueueIO + pure $ + Lockbox + { readQueue = queue + , lockToken = token + , memoTable = table + } + + +forbidReadAccess ∷ HashTableAccess k v → HashTableAccess k v +forbidReadAccess (Access _ table) = table `seq` Access False table + + +permitReadAccess ∷ HashTableAccess k v → HashTableAccess k v +permitReadAccess (Access _ table) = table `seq` Access True table + + +{- +forbidWriteAccess :: HashTableAccess k v -> HashTableAccess k v +forbidWriteAccess (Access r _ table) = table `seq` Access r False table + +permitWriteAccess :: HashTableAccess k v -> HashTableAccess k v +permitWriteAccess (Access r _ table) = table `seq` Access r True table +-} + +{-# NOINLINE readHashTableAccess #-} +readHashTableAccess ∷ (Hashable k) ⇒ Lockbox k v → k → IO (Maybe v) +readHashTableAccess lockbox@(Lockbox queue _ _) key = + {-# SCC readHashTableAccess #-} + let request = getReadableTable lockbox + release ∷ b → IO () + release = const . atomically $ readTQueue queue + in bracket request release $ (`lookup` key) + + +{-# NOINLINE updateHashTableAccess #-} +updateHashTableAccess ∷ (Hashable k) ⇒ Lockbox k v → k → v → IO () +updateHashTableAccess lockbox key val = + {-# SCC updateHashTableAccess #-} + bracket_ (markWriteableTable lockbox) (freeWriteableTable lockbox) $ do + tab ← gainWriteableTable lockbox + insert tab key val + + +{-# NOINLINE getReadableTable #-} +getReadableTable ∷ Lockbox k v → IO (BasicHashTable k v) +getReadableTable (Lockbox queue _ table) = atomically $ do + Access readable memo ← readTVar table + check readable + writeTQueue queue () + pure memo + + +{-# NOINLINE markWriteableTable #-} +markWriteableTable ∷ Lockbox k v → IO () +markWriteableTable (Lockbox _ token table) = atomically $ do + takeTMVar token + modifyTVar' table forbidReadAccess + + +{-# NOINLINE gainWriteableTable #-} +gainWriteableTable ∷ Lockbox k v → IO (BasicHashTable k v) +gainWriteableTable (Lockbox queue _ table) = atomically $ do + check =<< isEmptyTQueue queue + Access _ memo ← readTVar table + pure memo + + +{-# NOINLINE freeWriteableTable #-} +freeWriteableTable ∷ Lockbox k v → IO () +freeWriteableTable (Lockbox _ token table) = atomically $ do + modifyTVar' table permitReadAccess + putTMVar token () diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaReadWriteLock.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaReadWriteLock.hs new file mode 100644 index 000000000..e580807c9 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaReadWriteLock.hs @@ -0,0 +1,83 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE RoleAnnotations #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StrictData #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaReadWriteLock ( + memoize, +) where + +import Control.Concurrent.ReadWriteVar qualified as RWLock +import Control.DeepSeq +import Data.Functor (($>)) +import Data.HashTable.IO +import Data.Hashable +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + + -- Create a RWVar which holds the HashTable + tableRef ← RWLock.new =<< (newSized initialSize ∷ IO (BasicHashTable a b)) + + -- This is the returned closure of a memozized f + -- The closure captures the "mutable" reference to the hashtable above + -- through the TVar. + -- + -- Once the mutable hashtable reference is escaped from the IO monad, + -- this creates a new memoized reference to f. + -- The technique should be safe for all pure functions, probably, I think. + pure $ \k → unsafePerformIO $ do + result ← RWLock.with tableRef (`lookup` k) + case result of + Just v → {-# SCC memoize_Lock_GET #-} pure v + Nothing → + {-# SCC memoize_Lock_PUT #-} + let v = force $ f k + in RWLock.modify tableRef $ \t → insert t k v $> (t, v) diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaSemaphore.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaSemaphore.hs new file mode 100644 index 000000000..a08bbc225 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaSemaphore.hs @@ -0,0 +1,154 @@ +{-# LANGUAGE RoleAnnotations #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StrictData #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaSemaphore ( + memoize, +) where + +import Control.Concurrent.QSemN +import Control.DeepSeq +import Control.Exception (bracket_, mask_) +import Data.Functor (($>)) +import Data.HashTable.IO +import Data.Hashable +import Data.IORef +import Data.Int (Int16, Int8) +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + + -- Create a TVar which holds the HashTable + queueSem ← newQSemN quota + tableRef ← newIORef =<< (newSized initialSize ∷ IO (BasicHashTable a b)) + + let accessUsing ∷ Int → IO c → IO c + accessUsing quanta op = + bracket_ (waitQSemN queueSem quanta) (signalQSemN queueSem quanta) $ mask_ op + -- This is the returned closure of a memozized f + -- The closure captures the "mutable" reference to the hashtable above + -- through the TVar. + -- + -- Once the mutable hashtable reference is escaped from the IO monad, + -- this creates a new memoized reference to f. + -- The technique should be safe for all pure functions, probably, I think. + pure $ \k → unsafePerformIO $ do + result ← accessUsing quantaRead $ readIORef tableRef >>= (`lookup` k) + case result of + Just v → {-# SCC memoize_Lock_GET #-} pure v + Nothing → + {-# SCC memoize_Lock_PUT #-} + let v = force $ f k + in -- in (takeHashTableAccess tabRef >>= giveHashTableAccess tabRef k v) $> v + accessUsing quantaWrite $ readIORef tableRef >>= \t → (insert t k v $> v) + + +-- in (takeHashTableAccess tabRef >>= giveHashTableAccess_old tabRef k v) $> v + +{- +{- +-=-=-=-=-=-=-=- + T O D O +-=-=-=-=-=-=-=- +Consider another implementation which only locks on *resize,* +permitting truly concurrent reads and writes in all but an infintesimal number of cases. +-} +data Lockbox k v = Lockbox + {-# UNPACK #-} QSemN -- ^ The "Read Queue" of the nlock box + {-# UNPACK #-} (IORef (BasicHashTable k v)) -- ^ The Hashtable reference + +type role Lockbox representational representational +-} + +-- Implement a Read/Write lock. + +quota ∷ Int +quota = fromIntegral (maxBound ∷ Int16) + fromIntegral (minBound ∷ Int8) + + +quantaRead ∷ Int +quantaRead = 1 + + +quantaWrite ∷ Int +quantaWrite = quota + + +{- +operation :: Int -> QSemN -> IO a -> IO a +operation quanta semaphore = + bracket_ + (waitQSemN semaphore quanta) + (signalQSemN semaphore quanta) + . mask_ + +safeRead :: QSemN -> IO a -> IO a +safeRead = operation quantaRead + +safeWrite :: QSemN -> IO a -> IO a +safeWrite = operation quantaWrite + +{-# NOINLINE initializeSafeHashTable #-} +initializeSafeHashTable :: Int -> IO (Lockbox k v) +initializeSafeHashTable size = do + queue <- newQSemN quota + table <- newIORef =<< (newSized size :: IO (BasicHashTable a b)) + pure $ Lockbox queue table + +{-# NOINLINE safelyReadValue #-} +safelyReadValue :: Hashable k => Lockbox k v -> k -> IO (Maybe v) +safelyReadValue lockbox@(Lockbox queue table) key = {-# SCC readHashTableAccess #-} safeRead queue $ do + tab <- {-# SCC readHashTableAccess_Atomic_Block #-} readTVarIO table + tab `lookup` key + +{-# NOINLINE safelyWriteValue #-} +safelyWriteValue :: Hashable k => Lockbox k v -> k -> v -> IO () +safelyWriteValue lockbox@(Lockbox queue table) key val = {-# SCC updateHashTableAccess #-} safeWrite queue $ do + tab <- {-# SCC readHashTableAccess_Atomic_Block #-} readTIORef table + atomicModifyIORef' $ insert tab key val +-} diff --git a/lib/tcm/src/Data/Hashable/Memoize/ViaTVar.hs b/lib/tcm/src/Data/Hashable/Memoize/ViaTVar.hs new file mode 100644 index 000000000..dd168aca1 --- /dev/null +++ b/lib/tcm/src/Data/Hashable/Memoize/ViaTVar.hs @@ -0,0 +1,101 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE RoleAnnotations #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StrictData #-} + +-- {-# OPTIONS_GHC -fno-full-laziness #-} + +{- | +Exposes memoization combinators. Assumes that the supplied functions are +side effect free. If this assumption is violated, undefined and unexpected +behavior may result. +-} +module Data.Hashable.Memoize.ViaTVar ( + memoize, +) where + +import Control.Concurrent.STM +import Control.DeepSeq +import Data.HashTable.IO +import Data.Hashable +import GHC.Conc (unsafeIOToSTM) +import System.IO +import System.IO.Unsafe +import Prelude hiding (lookup) + + +type HashTableRef k v = TVar (BasicHashTable k v) + + +initialize ∷ Int → IO (HashTableRef k v) +initialize size = newSized size >>= newTVarIO + + +{- | +/O(1)/ + +Takes a function with a hashable and equatable first argument and returns a +memoized function with constant time access for already computed values. + +__Note:__ This does /not/ memoize recursively defined functions. + +To memoize a recursively defined function, you must redefine the function's +recursive calls to internally call a memoized definition in a mutually recursive +manner. + +=== Example: Does /not/ memoize the recursive definitions + +> fib 0 = 0 +> fib 1 = 1 +> fib x = fib (x-1) + fib (x-2) + +>>> let memo = memoize fib in memo 10000 + + +=== Example: Does memoize the recursive definitions + +> fibM = f +> where +> f 0 = 0 +> f 1 = 1 +> f x = g (x-1) + g (x-2) +> g = memoize f + +>>> fibM 10000 +-} +{-# NOINLINE memoize #-} +memoize ∷ ∀ a b. (Hashable a, NFData b) ⇒ (a → b) → a → b +memoize f = unsafePerformIO $ do + let initialSize = 2 ^ (16 ∷ Word) + + -- Create a TVar which holds the ST state and the HashTable + (!htRef ∷ HashTableRef a b) ← initialize initialSize + -- This is the returned closure of a memozized f + -- The closure captures the "mutable" reference to the hashtable above + -- through the TVar. + -- + -- Once the mutable hashtable reference is escaped from the IO monad, + -- this creates a new memoized reference to f. + -- The technique should be safe for all pure functions, probably, I think. + pure $ \k → unsafeDupablePerformIO $ do + -- Read the TVar, we use IO since it is the outer monad + -- and the documentation says that this doesn't perform a complete transaction, + -- it just reads the current value from the TVar + -- ht <- readTVarIO htRef + -- We use the HashTable to try and lookup the memoized value + result ← readTVarIO htRef >>= (`lookup` k) + -- Here we check if the memoized value exists + case result of + -- If the value exists return it + Just v → pure v + -- If the value doesn't exist: + Nothing → + -- Perform the expensive calculation to determine the value + -- associated with the key, fully evaluated. + let v = force $ f k + in -- we want to perform the following modification atomically. + atomically $ do + table ← readTVar htRef + unsafeIOToSTM $ insert table k v + pure v diff --git a/pkg/PhyGraph/lib/tcm/src/Data/MetricRepresentation.hs b/lib/tcm/src/Data/MetricRepresentation.hs similarity index 82% rename from pkg/PhyGraph/lib/tcm/src/Data/MetricRepresentation.hs rename to lib/tcm/src/Data/MetricRepresentation.hs index 6e2633e45..079b26147 100644 --- a/pkg/PhyGraph/lib/tcm/src/Data/MetricRepresentation.hs +++ b/lib/tcm/src/Data/MetricRepresentation.hs @@ -10,12 +10,12 @@ -- ----------------------------------------------------------------------------- -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE StrictData #-} -{-# LANGUAGE UnboxedSums #-} +{-# Language BangPatterns #-} +{-# Language DeriveAnyClass #-} +{-# Language DeriveGeneric #-} +{-# Language DerivingStrategies #-} +{-# Language StrictData #-} +{-# Language UnboxedSums #-} module Data.MetricRepresentation ( -- * Smart Constructors @@ -34,6 +34,7 @@ module Data.MetricRepresentation ) where import Control.DeepSeq +import Control.Monad.IO.Class (MonadIO) import Data.Binary import Data.Bits import Data.Hashable @@ -64,6 +65,9 @@ data MetricRepresentation a deriving anyclass (NFData) +type role MetricRepresentation representational + + instance Eq (MetricRepresentation a) where (==) DiscreteMetric DiscreteMetric = True @@ -78,54 +82,69 @@ instance Show (MetricRepresentation a) where show ExplicitLayout {} = "General-Metric" --- | --- Nullary constructor for the . +{- | +__Time:__ \( \mathcal{O}\left( 1 \right) \) + +Nullary constructor for the . +-} discreteMetric :: MetricRepresentation a discreteMetric = DiscreteMetric --- | --- Nullary constructor for the . +{- | +__Time:__ \( \mathcal{O}\left( 1 \right) \) + +Nullary constructor for the . +-} linearNorm :: Word -> MetricRepresentation a linearNorm = LinearNorm --- | --- General constructor for an arbitrary metric. --- --- Performs memoization so repeated value queries are not recomputed. +{- | +__Time:__ \( \mathcal{O}\left( a \right) \), where \( a \) is the alphabet length. + +General constructor for an arbitrary metric. + +Performs memoization so repeated value queries are not recomputed. +-} metricRepresentation :: ( FiniteBits a , Hashable a + , MonadIO m , NFData a ) => TCM - -> MetricRepresentation a -metricRepresentation tcm = - let scm = makeSCM tcm - in ExplicitLayout tcm minInDel maxInDel - (memoize2 (overlap2 bitWidth scm)) - (memoize3 (overlap3 bitWidth scm)) - where - -- /O(2*(a - 1))/ - -- - -- This was taken from Ukkonen's original 1985 paper wherein the coefficient - -- delta @(Δ)@ was defined by the minimum transition cost from any symbol in - -- the alphabet @(Σ)@ to the gap symbol @'-'@. - -- - -- If there is any transition to a gap from a non-gap for which the cost is - -- zero, then this coefficient will be zero. This leaves us with no way to - -- determine if optimality is preserved, and the Ukkonen algorithm will hang. - -- Consequently, we do not perform Ukkonen's algorithm if the coefficient is - -- zero. - minInDel = fromIntegral . minimum $ inDelCost min <$> nonGapElements - maxInDel = fromIntegral . maximum $ inDelCost max <$> nonGapElements - bitWidth = toEnum alphabetSize - alphabetSize = size tcm - gapIndex = 0 - nonGapElements = [ 1 .. alphabetSize - 1 ] - inDelCost f i = f (tcm ! (i, gapIndex)) - (tcm ! (gapIndex, i)) + -> m (MetricRepresentation a) +metricRepresentation tcm = + let {- | + /O(2*(a - 1))/ + + This was taken from Ukkonen's original 1985 paper wherein the coefficient + delta @(Δ)@ was defined by the minimum transition cost from any symbol in + the alphabet @(Σ)@ to the gap symbol @'-'@. + + If there is any transition to a gap from a non-gap for which the cost is + zero, then this coefficient will be zero. This leaves us with no way to + determine if optimality is preserved, and the Ukkonen algorithm will hang. + Consequently, we do not perform Ukkonen's algorithm if the coefficient is + zero. + -} + minInDel = fromIntegral . minimum $ inDelCost min <$> nonGapElements + maxInDel = fromIntegral . maximum $ inDelCost max <$> nonGapElements + bitWidth = toEnum alphabetSize + alphabetSize = size tcm + gapIndex = 0 + nonGapElements = [ 1 .. alphabetSize - 1 ] + + inDelCost :: (Word32 -> Word32 -> t) -> Int -> t + inDelCost f i = f (tcm ! (i, gapIndex)) + (tcm ! (gapIndex, i)) + + in do let scm = makeSCM tcm + memo2D <- memoize2 $ overlap2 bitWidth scm + memo3D <- memoize3 $ overlap3 bitWidth scm + + pure $ ExplicitLayout tcm minInDel maxInDel memo2D memo3D -- | diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM.hs b/lib/tcm/src/Data/TCM.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM.hs rename to lib/tcm/src/Data/TCM.hs diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Dense.hs b/lib/tcm/src/Data/TCM/Dense.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM/Dense.hs rename to lib/tcm/src/Data/TCM/Dense.hs diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Dense/FFI.hsc b/lib/tcm/src/Data/TCM/Dense/FFI.hsc similarity index 97% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM/Dense/FFI.hsc rename to lib/tcm/src/Data/TCM/Dense/FFI.hsc index 4385092b1..8710ce62e 100644 --- a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Dense/FFI.hsc +++ b/lib/tcm/src/Data/TCM/Dense/FFI.hsc @@ -37,6 +37,7 @@ module Data.TCM.Dense.FFI , getAlignmentStrategy ) where +import Bio.DynamicCharacter.Element (SlimState) import Control.DeepSeq import Data.Foldable import Foreign @@ -88,10 +89,10 @@ data CostMatrix2d -} , isMetric :: CInt -- if tcm is metric , allElems :: CInt -- total number of elements - , bestCost :: Ptr CUInt {- The transformation cost matrix, including ambiguities, + , bestCost :: Ptr SlimState {- The transformation cost matrix, including ambiguities, storing the **best** cost for each ambiguity pair -} - , medians :: Ptr CUInt {- The matrix of possible medians between elements in the + , medians :: Ptr SlimState {- The matrix of possible medians between elements in the alphabet. The best possible medians according to the cost matrix. -} @@ -121,8 +122,8 @@ data CostMatrix3d , include_ambiguities3D :: CInt , gapOpenCost3D :: CInt , allElems3D :: CInt - , bestCost3D :: Ptr CUInt - , medians3D :: Ptr CUInt + , bestCost3D :: Ptr SlimState + , medians3D :: Ptr SlimState } deriving stock (Eq, Generic) deriving anyclass (NFData) @@ -369,9 +370,9 @@ generateDenseTransitionCostMatrix affineCost alphabetSize costFunction = -- _NOTE: /Only considers the first 8 bits of the elements!/_ lookupPairwise :: DenseTransitionCostMatrix - -> CUInt - -> CUInt - -> (CUInt, Word) + -> SlimState + -> SlimState + -> (SlimState, Word) lookupPairwise m e1 e2 = unsafePerformIO $ do cm2d <- peek $ costMatrix2D m let dim = fromEnum $ alphSize cm2d @@ -390,10 +391,10 @@ lookupPairwise m e1 e2 = unsafePerformIO $ do -- _NOTE: /Only considers the first 8 bits of the elements!/_ lookupThreeway :: DenseTransitionCostMatrix - -> CUInt - -> CUInt - -> CUInt - -> (CUInt, Word) + -> SlimState + -> SlimState + -> SlimState + -> (SlimState, Word) lookupThreeway dtcm e1 e2 e3 = unsafePerformIO $ do cm3d <- peek $ costMatrix3D dtcm let dim = fromEnum $ alphSize3D cm3d diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Internal.hs b/lib/tcm/src/Data/TCM/Internal.hs similarity index 94% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM/Internal.hs rename to lib/tcm/src/Data/TCM/Internal.hs index ed071e7ce..15c3e72bc 100644 --- a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Internal.hs +++ b/lib/tcm/src/Data/TCM/Internal.hs @@ -10,12 +10,14 @@ -- ----------------------------------------------------------------------------- -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveDataTypeable #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} +{-# Language DeriveAnyClass #-} +{-# Language DeriveDataTypeable #-} +{-# Language DeriveGeneric #-} +{-# Language DerivingStrategies #-} +{-# Language Strict #-} +{-# Language TypeFamilies #-} + +{-# OPTIONS_GHC -Wno-missed-specialisations #-} module Data.TCM.Internal ( TCM(..) @@ -30,22 +32,22 @@ module Data.TCM.Internal , fromRows ) where -import Control.Arrow ((***)) -import Control.DeepSeq -import Data.Binary -import Data.Data -import Data.Foldable -import Data.List (transpose) -import Data.List.Utility (equalityOf, occurrences) -import Data.Map (delete, findMax, keys) -import qualified Data.Map as Map (fromList) -import Data.MonoTraversable -import Data.Ratio -import Data.Vector.Binary () -import Data.Vector.Unboxed (Vector) -import qualified Data.Vector.Unboxed as V -import GHC.Generics -import Test.QuickCheck hiding (generate) +import Control.Arrow ((***)) +import Control.DeepSeq +import Data.Binary +import Data.Data +import Data.Foldable +import Data.List (transpose) +import Data.List.Utility (equalityOf, occurrences) +import Data.Map (delete, findMax, keys) +import Data.Map qualified as Map (fromList) +import Data.MonoTraversable +import Data.Ratio +import Data.Vector.Binary () +import Data.Vector.Unboxed (Vector) +import Data.Vector.Unboxed qualified as V +import GHC.Generics +import Test.QuickCheck hiding (generate) -- | @@ -67,7 +69,7 @@ import Test.QuickCheck hiding (generate) -- constructors will result in a runtime exception. data TCM = TCM {-# UNPACK #-} Int !(Vector Word32) - deriving stock (Data, Eq, Generic, Typeable) + deriving stock (Data, Eq, Generic) deriving anyclass (Binary, NFData) @@ -143,7 +145,7 @@ data TCMStructure | UltraMetric | Additive | NonAdditive - deriving stock (Data, Eq, Generic, Show, Typeable) + deriving stock (Data, Eq, Generic, Show) deriving anyclass (NFData) @@ -159,7 +161,7 @@ data TCMDiagnosis , tcmStructure :: TCMStructure -- ^ The most restrictive present in the -- 'factoredTcm'. } - deriving stock (Data, Eq, Generic, Show, Typeable) + deriving stock (Data, Eq, Generic, Show) deriving anyclass (NFData) @@ -258,7 +260,7 @@ instance Show TCM where show tcm = headerLine <> matrixLines where - renderRow i = (" "<>) . unwords $ renderValue <$> [ tcm ! (i,j) | j <- rangeValues ] + renderRow i = (" " <>) . unwords $ renderValue <$> [ tcm ! (i,j) | j <- rangeValues ] matrixLines = unlines $ renderRow <$> rangeValues rangeValues = [0 .. size tcm - 1] headerLine = '\n' : unwords [ "TCM:", show $ size tcm, "x", show $ size tcm, "\n"] @@ -320,7 +322,10 @@ fromList xs len = length xs dimension = floor $ sqrt (fromIntegral len :: Double) notSquareList = square dimension /= len - square x = x*x + + square :: Num a => a -> a + square x = x * x + notSquareErrorMsg = fold [ "fromList: The number of element (" , show len ,") is not a square number. " diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Memoized.hs b/lib/tcm/src/Data/TCM/Memoized.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM/Memoized.hs rename to lib/tcm/src/Data/TCM/Memoized.hs diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Memoized/FFI.hsc b/lib/tcm/src/Data/TCM/Memoized/FFI.hsc similarity index 100% rename from pkg/PhyGraph/lib/tcm/src/Data/TCM/Memoized/FFI.hsc rename to lib/tcm/src/Data/TCM/Memoized/FFI.hsc diff --git a/lib/tcm/src/Data/TCM/Overlap.hs b/lib/tcm/src/Data/TCM/Overlap.hs new file mode 100644 index 000000000..f6d71a13f --- /dev/null +++ b/lib/tcm/src/Data/TCM/Overlap.hs @@ -0,0 +1,167 @@ +{-# LANGUAGE Safe #-} +{-# LANGUAGE Strict #-} + +module Data.TCM.Overlap ( + overlap, + overlap2, + overlap3, +) where + +import Data.Bits +import Data.Foldable +import Data.List.NonEmpty (NonEmpty (..)) +import Data.Semigroup +import Data.Semigroup.Foldable +import Data.Word +import Foreign.C.Types (CUInt) + + +data Bounds b = Bounds + { _lBound ∷ {-# UNPACK #-} Word + , _uBound ∷ {-# UNPACK #-} Word + , _bValue ∷ b + } + + +{- | +Takes one or more elements of 'FiniteBits' and a symbol change cost function +and returns a tuple of a new character, along with the cost of obtaining that +character. The return character may be (or is even likely to be) ambiguous. +Will attempt to intersect the two characters, but will union them if that is +not possible, based on the symbol change cost function. + +To clarify, the return character is an intersection of all possible least-cost +combinations, so for instance, if @ char1 == A,T @ and @ char2 == G,C @, and +the two (non-overlapping) least cost pairs are A,C and T,G, then the return +value is A,C,G,T. +-} +{-# INLINEABLE overlap #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f CUInt → (CUInt, Word) #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f Word → (Word, Word) #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f Word8 → (Word8, Word) #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f Word16 → (Word16, Word) #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f Word32 → (Word32, Word) #-} +{-# SPECIALIZE overlap ∷ (Foldable1 f, Functor f) ⇒ Word → (Word → Word → Word) → f Word64 → (Word64, Word) #-} +{-# SPECIALIZE overlap ∷ (FiniteBits b) ⇒ Word → (Word → Word → Word) → NonEmpty b → (b, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty CUInt → (CUInt, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty Word → (Word, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty Word8 → (Word8, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty Word16 → (Word16, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty Word32 → (Word32, Word) #-} +{-# SPECIALIZE overlap ∷ Word → (Word → Word → Word) → NonEmpty Word64 → (Word64, Word) #-} +overlap + ∷ ( FiniteBits b + , Foldable1 f + , Functor f + ) + ⇒ Word + -- ^ Alphabet size + → (Word → Word → Word) + -- ^ Symbol change matrix (SCM) to determine cost + → f b + -- ^ List of elements for of which to find the k-median and cost + → (b, Word) + -- ^ K-median and cost +overlap size sigma xs = go size maxBound zero + where + withBounds = getBitBounds <$> xs + wlog = getFirst $ foldMap1 First xs + zero = wlog `xor` wlog + + go 0 theCost bits = (bits, theCost) + go i oldCost bits = + let i' = i - 1 + newCost = foldl' (+) 0 $ getDistance i' <$> withBounds + (minCost, bits') = case oldCost `compare` newCost of + LT → (oldCost, bits) + EQ → (oldCost, bits `setBit` fromEnum i') + GT → (newCost, zero `setBit` fromEnum i') + in go i' minCost bits' + + getDistance ∷ (FiniteBits b) ⇒ Word → Bounds b → Word + getDistance i (Bounds lo hi b) = go' (hi + 1) (maxBound ∷ Word) + where + go' ∷ Word → Word → Word + go' j a | j <= lo = a + go' j a = + let j' = j - 1 + a' + | b `testBit` fromEnum j' = min a $ sigma i j' + | otherwise = a + in go' j' a' + + +{- | +Calculate the median between /two/ states. +-} +{-# INLINEABLE overlap2 #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → CUInt → CUInt → (CUInt, Word) #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → Word → Word → (Word, Word) #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → Word8 → Word8 → (Word8, Word) #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → Word16 → Word16 → (Word16, Word) #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → Word32 → Word32 → (Word32, Word) #-} +{-# SPECIALIZE overlap2 ∷ Word → (Word → Word → Word) → Word64 → Word64 → (Word64, Word) #-} +overlap2 + ∷ (FiniteBits b) + ⇒ Word + → (Word → Word → Word) + → b + → b + → (b, Word) +overlap2 size sigma char1 char2 = overlap size sigma $ char1 :| [char2] + + +{- | +Calculate the median between /three/ states. +-} +{-# INLINE overlap3 #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → CUInt → CUInt → CUInt → (CUInt, Word) #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → Word → Word → Word → (Word, Word) #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → Word8 → Word8 → Word8 → (Word8, Word) #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → Word16 → Word16 → Word16 → (Word16, Word) #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → Word32 → Word32 → Word32 → (Word32, Word) #-} +{-# SPECIALIZE overlap3 ∷ Word → (Word → Word → Word) → Word64 → Word64 → Word64 → (Word64, Word) #-} +overlap3 + ∷ (FiniteBits b) + ⇒ Word + → (Word → Word → Word) + → b + → b + → b + → (b, Word) +overlap3 size sigma char1 char2 char3 = overlap size sigma $ char1 :| [char2, char3] + + +{- | +Gets the lowest set bit and the highest set bit in the collection. +-} +{-# INLINEABLE getBitBounds #-} +{-# SPECIALIZE getBitBounds ∷ CUInt → Bounds CUInt #-} +{-# SPECIALIZE getBitBounds ∷ Word → Bounds Word #-} +{-# SPECIALIZE getBitBounds ∷ Word8 → Bounds Word8 #-} +{-# SPECIALIZE getBitBounds ∷ Word16 → Bounds Word16 #-} +{-# SPECIALIZE getBitBounds ∷ Word32 → Bounds Word32 #-} +{-# SPECIALIZE getBitBounds ∷ Word64 → Bounds Word64 #-} +{-# SPECIALIZE getBitBounds ∷ (FiniteBits b) ⇒ b → Bounds b #-} +{-# SPECIALIZE getBitBounds ∷ CUInt → Bounds CUInt #-} +{-# SPECIALIZE getBitBounds ∷ Word → Bounds Word #-} +{-# SPECIALIZE getBitBounds ∷ Word8 → Bounds Word8 #-} +{-# SPECIALIZE getBitBounds ∷ Word16 → Bounds Word16 #-} +{-# SPECIALIZE getBitBounds ∷ Word32 → Bounds Word32 #-} +{-# SPECIALIZE getBitBounds ∷ Word64 → Bounds Word64 #-} +getBitBounds + ∷ (FiniteBits b) + ⇒ b + → Bounds b +getBitBounds b = + let bitZero = (b `xor` b) `setBit` 0 + bigEndian = countLeadingZeros bitZero > 0 -- Check the endianness + (f, g) + | bigEndian = (countTrailingZeros, countLeadingZeros) + | otherwise = (countLeadingZeros, countTrailingZeros) + + lZeroes = f b + uZeroes = g b + lower = toEnum lZeroes + upper = toEnum . max 0 $ finiteBitSize b - uZeroes - 1 + in Bounds lower upper b diff --git a/pkg/PhyGraph/lib/tcm/test/Data/TCM/Test.hs b/lib/tcm/test/Data/TCM/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm/test/Data/TCM/Test.hs rename to lib/tcm/test/Data/TCM/Test.hs diff --git a/pkg/PhyGraph/lib/tcm/test/TestSuite.hs b/lib/tcm/test/TestSuite.hs similarity index 100% rename from pkg/PhyGraph/lib/tcm/test/TestSuite.hs rename to lib/tcm/test/TestSuite.hs diff --git a/pkg/PhyGraph/lib/utility/bench/Benchmarks.hs b/lib/utility/bench/Benchmarks.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/bench/Benchmarks.hs rename to lib/utility/bench/Benchmarks.hs diff --git a/pkg/PhyGraph/lib/utility/bench/Data/MutualExclusionSet/Bench.hs b/lib/utility/bench/Data/MutualExclusionSet/Bench.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/bench/Data/MutualExclusionSet/Bench.hs rename to lib/utility/bench/Data/MutualExclusionSet/Bench.hs diff --git a/pkg/PhyGraph/lib/utility/bench/Numeric/Extended/Natural/Bench.hs b/lib/utility/bench/Numeric/Extended/Natural/Bench.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/bench/Numeric/Extended/Natural/Bench.hs rename to lib/utility/bench/Numeric/Extended/Natural/Bench.hs diff --git a/pkg/PhyGraph/lib/utility/blah.blah b/lib/utility/blah.blah similarity index 100% rename from pkg/PhyGraph/lib/utility/blah.blah rename to lib/utility/blah.blah diff --git a/pkg/PhyGraph/lib/utility/src/Data/List/Utility.hs b/lib/utility/src/Data/List/Utility.hs similarity index 80% rename from pkg/PhyGraph/lib/utility/src/Data/List/Utility.hs rename to lib/utility/src/Data/List/Utility.hs index 9c1f5a845..9160cb39c 100644 --- a/pkg/PhyGraph/lib/utility/src/Data/List/Utility.hs +++ b/lib/utility/src/Data/List/Utility.hs @@ -1,61 +1,74 @@ -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE FunctionalDependencies #-} -{-# LANGUAGE MultiParamTypeClasses #-} ------------------------------------------------------------------------------ --- | --- Module : Data.List.Utility --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Functions for finding occurrences of elements in a list. --- ------------------------------------------------------------------------------ - -module Data.List.Utility where - -import Control.Lens (Lens', lens) -import Data.Foldable -import Data.Key (Zip(..)) -import Data.List (sort, sortBy) -import Data.List.NonEmpty (NonEmpty(..), nonEmpty) -import qualified Data.List.NonEmpty as NE -import Data.Map (assocs, empty, insertWith) -import Data.Maybe (catMaybes) -import Data.Ord (comparing) -import Data.Semigroup.Foldable -import Data.Set (insert, intersection) -import Prelude hiding (zipWith) - - --- | --- \( \mathcal{O} \left( n * k \right) \) --- --- Takes two nested, linear-dimensional structures and transposes their dimensions. --- It's like performing a matrix transpose operation, but more general. --- --- ==Example== --- --- >>> transpose [] --- [[]] --- --- >>> transpose [[1]] --- [[1]] --- --- >>> transpose [ [ 1, 2 ], [ 3, 4 ] ] --- [[1,3],[2,4]] --- --- >>> transpose [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 7, 8, 9] ] --- [[1,4,7],[2,5,8],[3,6,9]] --- --- >>> transpose [ [ 1, 2, 3, 0, 0 ], [ 4, 5, 6, 0 ], [ 7, 8, 9] ] --- [[1,4,7],[2,5,8],[3,6,9]] +{-# Language BangPatterns #-} +{-# Language FlexibleInstances #-} +{-# Language FunctionalDependencies #-} +{-# Language MultiParamTypeClasses #-} +{-# Language Safe #-} +{-# Language ScopedTypeVariables #-} + +{- | +Functions for finding occurrences of elements in a list. +-} + +module Data.List.Utility + ( catMaybes1 + , chunksOf + , duplicates + , equalityOf + , foldZipWith + , foldZipWith3 + , invariantTransformation + , isSingleton + , maximaBy + , minimaBy + , mostCommon + , occurrences + , pairwiseSequence + , prepend + , subsetOf + , transitivePropertyHolds + , transpose + ) where + +import Control.Lens (Lens', lens) +import Data.Foldable +import Data.Key (Zip(..)) +import Data.List (sort, sortBy) +import Data.List.NonEmpty (NonEmpty(..), nonEmpty) +import Data.List.NonEmpty qualified as NE +import Data.Map (Map, assocs, empty, insertWith) +import Data.Maybe (catMaybes) +import Data.Ord (comparing) +import Data.Semigroup.Foldable +import Data.Set (insert, intersection) +import Prelude hiding (zipWith) + + +{- | +\( \mathcal{O} \left( n * k \right) \) + +Takes two nested, linear-dimensional structures and transposes their dimensions. +It's like performing a matrix transpose operation, but more general. + +==Example== + +>>> transpose [] +[[]] + +>>> transpose [[1]] +[[1]] + +>>> transpose [ [ 1, 2 ], [ 3, 4 ] ] +[[1,3],[2,4]] + +>>> transpose [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 7, 8, 9] ] +[[1,4,7],[2,5,8],[3,6,9]] + +>>> transpose [ [ 1, 2, 3, 0, 0 ], [ 4, 5, 6, 0 ], [ 7, 8, 9] ] +[[1,4,7],[2,5,8],[3,6,9]] +-} transpose - :: ( Applicative f + :: forall a f t. + ( Applicative f , Applicative t , Semigroup (t a) , Traversable t @@ -65,33 +78,36 @@ transpose transpose value = case toList value of [] -> sequenceA value - x:xs -> transpose' $ x:|xs + x:xs -> transpose' $ x :| xs where - transpose' (e:|[]) = pure <$> e - transpose' (e:|(x:xs)) = (cons <$> e) `zap` transpose' (x:|xs) + transpose' :: NonEmpty (f a) -> f (t a) + transpose' (e :| []) = pure <$> e + transpose' (e :| (x:xs)) = (cons <$> e) `zap` transpose' (x :| xs) + cons :: a -> t a -> t a cons = (<>) . pure --- | --- \( \mathcal{O} \left( n * k \right) \) where \( k \) is the cost to convert the structure to a list in weak head --- normal form. --- --- Determines whether a 'Foldable' structure contains a single element. --- --- ==_Example== --- --- >>> isSingleton [] --- False --- --- >>> isSingleton [ () ] --- True --- --- >>> isSingleton [ (), () ] --- False +{- | +\( \mathcal{O} \left( n * k \right) \) where \( k \) is the cost to convert the structure to a list in weak head normal form. + +Determines whether a 'Foldable' structure contains a single element. + +==_Example== + +>>> isSingleton [] +False + +>>> isSingleton [ () ] +True + +>>> isSingleton [ (), () ] +False +-} isSingleton :: Foldable t => t a -> Bool isSingleton = f . toList where + f :: [a] -> Bool f [_] = True f _ = False @@ -129,9 +145,9 @@ prepend p ne = -- Nothing catMaybes1 :: Foldable1 f => f (Maybe a) -> Maybe (NonEmpty a) catMaybes1 v = - let x:|xs = toNonEmpty v - as = catMaybes xs - in maybe (nonEmpty as) (Just . (:|as)) x + let x :| xs = toNonEmpty v + as = catMaybes xs + in maybe (nonEmpty as) (Just . (:| as)) x -- | @@ -152,10 +168,11 @@ catMaybes1 v = duplicates :: (Foldable t, Ord a) => t a -> [a] duplicates = duplicates' . sort . toList where + duplicates' :: Ord a => [a] -> [a] duplicates' [] = [] duplicates' [_] = [] duplicates' (x:y:ys) = if x == y - then (x:) . duplicates $ dropWhile (==y) ys + then (x:) . duplicates $ dropWhile (== y) ys else duplicates (y:ys) @@ -202,10 +219,15 @@ occurrences = collateOccuranceMap . buildOccuranceMap where buildOccuranceMap = foldr occurrence empty where + occurrence :: (Ord k, Enum a, Num a) => k -> Map k a -> Map k a occurrence e = insertWith (const succ) e 1 + + collateOccuranceMap :: Ord v => Map k v -> [(k, v)] collateOccuranceMap = sortBy comparator . assocs where + comparator :: Ord v => (k, v) -> (k, v) -> Ordering comparator x y = descending $ comparing snd x y + descending LT = GT descending GT = LT descending x = x @@ -226,6 +248,7 @@ occurrences = collateOccuranceMap . buildOccuranceMap chunksOf :: Foldable t => Int -> t a -> [[a]] chunksOf n = chunksOf' . toList where + chunksOf' :: [a] -> [[a]] chunksOf' xs = case splitAt n xs of (y,[]) -> [y] diff --git a/pkg/PhyGraph/lib/utility/src/Data/Matrix/NotStupid.hs b/lib/utility/src/Data/Matrix/NotStupid.hs similarity index 89% rename from pkg/PhyGraph/lib/utility/src/Data/Matrix/NotStupid.hs rename to lib/utility/src/Data/Matrix/NotStupid.hs index 010c2c20d..b2cbea66a 100644 --- a/pkg/PhyGraph/lib/utility/src/Data/Matrix/NotStupid.hs +++ b/lib/utility/src/Data/Matrix/NotStupid.hs @@ -89,13 +89,13 @@ module Data.Matrix.NotStupid , Stupid.detLU ) where -import Control.Arrow ((***)) -import Data.Foldable -import Data.Key -import Data.Matrix (Matrix, (<->), (<|>)) -import qualified Data.Matrix as Stupid -import Data.Maybe (catMaybes) -import Data.Vector (Vector) +import Control.Arrow ((***)) +import Data.Foldable +import Data.Key +import Data.Matrix (Matrix, (<->), (<|>)) +import Data.Matrix qualified as Stupid +import Data.Maybe (catMaybes) +import Data.Vector (Vector) type instance Key Matrix = (Int, Int) @@ -145,7 +145,7 @@ getElem :: Int -- ^ Row getElem i j mtx = case errorMessage of Just err -> error err - Nothing -> Stupid.getElem (i+1) (j+1) mtx + Nothing -> Stupid.getElem (i + 1) (j + 1) mtx where m = Stupid.nrows mtx n = Stupid.ncols mtx @@ -168,20 +168,20 @@ unsafeGet :: Int -- ^ Row -> Int -- ^ Column -> Matrix a -- ^ Matrix -> a -unsafeGet i j = Stupid.unsafeGet (i+1) (j+1) +unsafeGet i j = Stupid.unsafeGet (i + 1) (j + 1) -- | -- Variant of 'getElem' that returns Maybe instead of an error. safeGet :: Int -> Int -> Matrix a -> Maybe a -safeGet i j = Stupid.safeGet (i+1) (j+1) +safeGet i j = Stupid.safeGet (i + 1) (j + 1) -- | -- /O(1)/. -- Get a row of a matrix as a vector. {-# INLINE getRow #-} getRow :: Int -> Matrix a -> Vector a -getRow i = Stupid.getRow (i+1) +getRow i = Stupid.getRow (i + 1) -- | @@ -189,7 +189,7 @@ getRow i = Stupid.getRow (i+1) -- Get a column of a matrix as a vector. {-# INLINE getCol #-} getCol :: Int -> Matrix a -> Vector a -getCol j = Stupid.getCol (j+1) +getCol j = Stupid.getCol (j + 1) -- | @@ -199,7 +199,7 @@ setElem :: a -- ^ New value. -> (Int,Int) -- ^ Position to replace. -> Matrix a -- ^ Original matrix. -> Matrix a -- ^ Matrix with the given position replaced with the given value. -setElem e (i,j) = Stupid.setElem e (i+1,j+1) +setElem e (i,j) = Stupid.setElem e (i + 1,j + 1) -- | @@ -209,7 +209,7 @@ unsafeSet :: a -- ^ New value. -> (Int,Int) -- ^ Position to replace. -> Matrix a -- ^ Original matrix. -> Matrix a -- ^ Matrix with the given position replaced with the given value. -unsafeSet e (i,j) = Stupid.unsafeSet e (i+1,j+1) +unsafeSet e (i,j) = Stupid.unsafeSet e (i + 1,j + 1) -- | @@ -219,12 +219,12 @@ unsafeSet e (i,j) = Stupid.unsafeSet e (i+1,j+1) -- -- > ( 1 2 3 ) ( 1 2 3 ) -- > ( 4 5 6 ) ( 5 6 7 ) --- > mapRow (\_ x -> x + 1) 2 ( 7 8 9 ) = ( 7 8 9 ) +-- > mapRow (\_ x -> x + 1) 2 ( 7 8 9 ) = ( 7 8 9 ) -- mapRow :: (Int -> a -> a) -- ^ Function takes the current column as additional argument. -> Int -- ^ Row to map. -> Matrix a -> Matrix a -mapRow f r = Stupid.mapCol (f <$> (+1)) (r+1) +mapRow f r = Stupid.mapCol (f <$> ( + 1)) (r + 1) -- | @@ -234,12 +234,12 @@ mapRow f r = Stupid.mapCol (f <$> (+1)) (r+1) -- -- > ( 1 2 3 ) ( 1 3 3 ) -- > ( 4 5 6 ) ( 4 6 6 ) --- > mapCol (\_ x -> x + 1) 2 ( 7 8 9 ) = ( 7 9 9 ) +-- > mapCol (\_ x -> x + 1) 2 ( 7 8 9 ) = ( 7 9 9 ) -- mapCol :: (Int -> a -> a) -- ^ Function takes the current row as additional argument. -> Int -- ^ Column to map. -> Matrix a -> Matrix a -mapCol f c = Stupid.mapCol (f <$> (+1)) (c+1) +mapCol f c = Stupid.mapCol (f <$> ( + 1)) (c + 1) -- | @@ -257,7 +257,7 @@ submatrix :: Int -- ^ Starting row -> Int -- ^ Ending column -> Matrix a -> Matrix a -submatrix r1 r2 c1 c2 = Stupid.submatrix (r1+1) (r2+2) (c1+1) (c2+1) +submatrix r1 r2 c1 c2 = Stupid.submatrix (r1 + 1) (r2 + 2) (c1 + 1) (c2 + 1) -- | @@ -272,7 +272,7 @@ minorMatrix :: Int -- ^ Row @r@ to remove. -> Int -- ^ Column @c@ to remove. -> Matrix a -- ^ Original matrix. -> Matrix a -- ^ Matrix with row @r@ and column @c@ removed. -minorMatrix r c = Stupid.minorMatrix (r+1) (c+1) +minorMatrix r c = Stupid.minorMatrix (r + 1) (c + 1) -- | @@ -302,7 +302,7 @@ splitBlocks :: Int -- ^ Row of the splitting element. -> Matrix a -- ^ Matrix to split. -> (Matrix a,Matrix a ,Matrix a,Matrix a) -- ^ (TL,TR,BL,BR) -splitBlocks i j = Stupid.splitBlocks (i+1) (j+1) +splitBlocks i j = Stupid.splitBlocks (i + 1) (j + 1) -- | @@ -313,7 +313,7 @@ splitBlocks i j = Stupid.splitBlocks (i+1) (j+1) -- > ( 4 5 6 ) ( 8 10 12 ) -- > scaleRow 2 2 ( 7 8 9 ) = ( 7 8 9 ) scaleRow :: Num a => a -> Int -> Matrix a -> Matrix a -scaleRow e r = Stupid.scaleRow e (r+1) +scaleRow e r = Stupid.scaleRow e (r + 1) -- | @@ -324,7 +324,7 @@ scaleRow e r = Stupid.scaleRow e (r+1) -- > ( 4 5 6 ) ( 6 9 12 ) -- > combineRows 2 2 1 ( 7 8 9 ) = ( 7 8 9 ) combineRows :: Num a => Int -> a -> Int -> Matrix a -> Matrix a -combineRows r1 l r2 = Stupid.combineRows (r1+1) l (r2+1) +combineRows r1 l r2 = Stupid.combineRows (r1 + 1) l (r2 + 1) -- | @@ -338,7 +338,7 @@ switchRows :: Int -- ^ Row 1. -> Int -- ^ Row 2. -> Matrix a -- ^ Original matrix. -> Matrix a -- ^ Matrix with rows 1 and 2 switched. -switchRows r1 r2 = Stupid.switchRows (r1+1) (r2+1) +switchRows r1 r2 = Stupid.switchRows (r1 + 1) (r2 + 1) -- | @@ -352,5 +352,5 @@ switchCols :: Int -- ^ Col 1. -> Int -- ^ Col 2. -> Matrix a -- ^ Original matrix. -> Matrix a -- ^ Matrix with cols 1 and 2 switched. -switchCols c1 c2 = Stupid.switchCols (c1+1) (c2+1) +switchCols c1 c2 = Stupid.switchCols (c1 + 1) (c2 + 1) diff --git a/pkg/PhyGraph/lib/utility/src/Data/MutualExclusionSet.hs b/lib/utility/src/Data/MutualExclusionSet.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/src/Data/MutualExclusionSet.hs rename to lib/utility/src/Data/MutualExclusionSet.hs diff --git a/pkg/PhyGraph/lib/utility/src/Data/MutualExclusionSet/Internal.hs b/lib/utility/src/Data/MutualExclusionSet/Internal.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/src/Data/MutualExclusionSet/Internal.hs rename to lib/utility/src/Data/MutualExclusionSet/Internal.hs diff --git a/pkg/PhyGraph/lib/utility/src/Data/Vector/Custom.hs b/lib/utility/src/Data/Vector/Custom.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/src/Data/Vector/Custom.hs rename to lib/utility/src/Data/Vector/Custom.hs diff --git a/pkg/PhyGraph/lib/utility/src/Data/Vector/Memo.hs b/lib/utility/src/Data/Vector/Memo.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/src/Data/Vector/Memo.hs rename to lib/utility/src/Data/Vector/Memo.hs diff --git a/pkg/PhyGraph/lib/utility/src/Data/Vector/NonEmpty.hs b/lib/utility/src/Data/Vector/NonEmpty.hs similarity index 80% rename from pkg/PhyGraph/lib/utility/src/Data/Vector/NonEmpty.hs rename to lib/utility/src/Data/Vector/NonEmpty.hs index 59466407f..b27cdea8c 100644 --- a/pkg/PhyGraph/lib/utility/src/Data/Vector/NonEmpty.hs +++ b/lib/utility/src/Data/Vector/NonEmpty.hs @@ -10,16 +10,17 @@ -- ----------------------------------------------------------------------------- -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE DeriveDataTypeable #-} -{-# LANGUAGE DeriveFunctor #-} -{-# LANGUAGE DeriveTraversable #-} -{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DeriveDataTypeable #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DeriveTraversable #-} +{-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE RankNTypes #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeApplications #-} -{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE RoleAnnotations #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeFamilies #-} {-# OPTIONS_GHC -fno-warn-unused-imports #-} @@ -42,28 +43,28 @@ module Data.Vector.NonEmpty ) where -import Control.DeepSeq hiding (force) -import qualified Control.Foldl as L -import Data.Binary -import Data.Coerce -import Data.Data -import Data.Foldable -import Data.Functor.Alt -import Data.Functor.Bind -import Data.Functor.Classes -import Data.Functor.Extend -import Data.Hashable -import Data.Key -import Data.List.NonEmpty (NonEmpty((:|))) -import qualified Data.List.NonEmpty as NE -import Data.Pointed -import Data.Semigroup.Foldable -import Data.Semigroup.Traversable -import qualified Data.Vector as V -import Data.Vector.Binary () -import Data.Vector.Instances () -import Test.QuickCheck hiding (generate) -import Text.Read +import Control.DeepSeq hiding (force) +import Control.Foldl qualified as L +import Data.Binary +import Data.Coerce +import Data.Data +import Data.Foldable +import Data.Functor.Alt +import Data.Functor.Bind +import Data.Functor.Classes +import Data.Functor.Extend +import Data.Hashable +import Data.Key +import Data.List.NonEmpty (NonEmpty((:|))) +import Data.List.NonEmpty qualified as NE +import Data.Pointed +import Data.Semigroup.Foldable +import Data.Semigroup.Traversable +import Data.Vector qualified as V +import Data.Vector.Binary () +import Data.Vector.Instances () +import Test.QuickCheck hiding (generate) +import Text.Read -- | @@ -93,6 +94,8 @@ newtype Vector a = NEV { unwrap :: V.Vector a } , ZipWithKey ) +type role Vector representational + -- | -- Generation biases towards medium length. @@ -201,7 +204,7 @@ unfoldr f = NEV . uncurry V.fromListN . go 0 -- go :: Int -> b -> (Int, [a]) go n b = let (v, mb) = f b - in (v:) <$> maybe (n, []) (go (n+1)) mb + in (v:) <$> maybe (n, []) (go (n + 1)) mb -- | diff --git a/pkg/PhyGraph/lib/utility/test/Control/Parallel/Test.hs b/lib/utility/test/Control/Parallel/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Control/Parallel/Test.hs rename to lib/utility/test/Control/Parallel/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Data/BitMatrix/Test.hs b/lib/utility/test/Data/BitMatrix/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Data/BitMatrix/Test.hs rename to lib/utility/test/Data/BitMatrix/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Data/List/Test.hs b/lib/utility/test/Data/List/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Data/List/Test.hs rename to lib/utility/test/Data/List/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Data/MutualExclusionSet/Test.hs b/lib/utility/test/Data/MutualExclusionSet/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Data/MutualExclusionSet/Test.hs rename to lib/utility/test/Data/MutualExclusionSet/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Numeric/Cost/Test.hs b/lib/utility/test/Numeric/Cost/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Numeric/Cost/Test.hs rename to lib/utility/test/Numeric/Cost/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Numeric/Extended/Natural/Test.hs b/lib/utility/test/Numeric/Extended/Natural/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Numeric/Extended/Natural/Test.hs rename to lib/utility/test/Numeric/Extended/Natural/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Numeric/Extended/Real/Test.hs b/lib/utility/test/Numeric/Extended/Real/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Numeric/Extended/Real/Test.hs rename to lib/utility/test/Numeric/Extended/Real/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/Numeric/NonNegativeAverage/Test.hs b/lib/utility/test/Numeric/NonNegativeAverage/Test.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/Numeric/NonNegativeAverage/Test.hs rename to lib/utility/test/Numeric/NonNegativeAverage/Test.hs diff --git a/pkg/PhyGraph/lib/utility/test/TestSuite.hs b/lib/utility/test/TestSuite.hs similarity index 100% rename from pkg/PhyGraph/lib/utility/test/TestSuite.hs rename to lib/utility/test/TestSuite.hs diff --git a/pkg/PHAGE-timing/src/System/Timing.hs b/pkg/PHAGE-timing/src/System/Timing.hs deleted file mode 100644 index 46ab57e30..000000000 --- a/pkg/PHAGE-timing/src/System/Timing.hs +++ /dev/null @@ -1,108 +0,0 @@ -{-# Language BangPatterns #-} -{-# Language DerivingStrategies #-} -{-# Language Safe #-} -{-# Language Strict #-} - -module System.Timing - ( CPUTime() - , fromPicoseconds - , fromMilliseconds - , fromMicroseconds - , fromSeconds - , toPicoseconds - , toMicroseconds - , toMilliseconds - , toSeconds - , timeDifference - , timeOp - ) where - - -import Control.DeepSeq (NFData(rnf), force) -import Control.Monad.IO.Class (MonadIO(liftIO)) -import Data.Foldable (fold) -import Numeric.Natural (Natural) -import System.CPUTime (getCPUTime) - - --- | CPU time with picosecond resolution -newtype CPUTime = CPUTime Natural - deriving stock (Eq, Ord) - - -instance NFData CPUTime where - - rnf (CPUTime !_) = () - - -instance Show CPUTime where - - show (CPUTime x) - | x < nSecond = let (q,_) = x `quotRem` 1 in fold [show q, ".", "???" , "ps" ] - | x < μSecond = let (q,r) = x `quotRem` nSecond in fold [show q, ".", zeroPad 3 (r `div` 1 ), "ns" ] - | x < mSecond = let (q,r) = x `quotRem` μSecond in fold [show q, ".", zeroPad 3 (r `div` nSecond ), "μs" ] - | x < second = let (q,r) = x `quotRem` mSecond in fold [show q, ".", zeroPad 3 (r `div` μSecond ), "ms" ] - | x < minute = let (q,r) = x `quotRem` second in fold [show q, ".", zeroPad 3 (r `div` mSecond ), "s " ] - | x < hour = let (q,r) = x `quotRem` minute in fold [show q, "m", zeroPad 2 (r `div` second ), "sec"] - | x < day = let (q,r) = x `quotRem` hour in fold [show q, "h", zeroPad 2 (r `div` minute ), "min"] - | otherwise = let (q,r) = x `quotRem` day in fold [show q, "d", zeroPad 2 (r `div` hour ), "hrs"] - where - nSecond :: Natural - nSecond = 1000 - μSecond = 1000 * nSecond - mSecond = 1000 * μSecond - second = 1000 * mSecond - minute = 60 * second - hour = 60 * minute - day = 24 * hour - - -zeroPad :: Int -> Natural -> String -zeroPad k i = replicate (k - length shown) '0' <> shown - where - shown = show i - - -timeOp :: (MonadIO m, NFData a) => m a -> m (CPUTime, a) -timeOp ioa = do - !t1 <- liftIO getCPUTime - !a <- force <$> ioa - !t2 <- liftIO getCPUTime - let t = CPUTime . fromIntegral $ t2 - t1 - pure (t, a) - - -timeDifference :: CPUTime -> CPUTime -> CPUTime -timeDifference (CPUTime a) (CPUTime b) = CPUTime $ max a b - min a b - - -fromPicoseconds :: Natural -> CPUTime -fromPicoseconds = CPUTime - - -fromMicroseconds :: Natural -> CPUTime -fromMicroseconds = CPUTime . (* 1000000) - - -fromMilliseconds :: Natural -> CPUTime -fromMilliseconds = CPUTime . (* 1000000000) - - -fromSeconds :: Natural -> CPUTime -fromSeconds = CPUTime . (* 1000000000000) - - -toPicoseconds :: CPUTime -> Natural -toPicoseconds (CPUTime x) = x - - -toMicroseconds :: CPUTime -> Natural -toMicroseconds (CPUTime x) = x `div` 1000000 - - -toMilliseconds :: CPUTime -> Natural -toMilliseconds (CPUTime x) = x `div` 1000000000 - - -toSeconds :: CPUTime -> Natural -toSeconds (CPUTime x) = x `div` 1000000000000 diff --git a/pkg/PhyGraph/CHANGELOG.md b/pkg/PhyGraph/CHANGELOG.md deleted file mode 120000 index df460b98c..000000000 --- a/pkg/PhyGraph/CHANGELOG.md +++ /dev/null @@ -1 +0,0 @@ -../../doc/CHANGELOG.md \ No newline at end of file diff --git a/pkg/PhyGraph/Commands/CommandExecution.hs b/pkg/PhyGraph/Commands/CommandExecution.hs deleted file mode 100644 index 73cfd633c..000000000 --- a/pkg/PhyGraph/Commands/CommandExecution.hs +++ /dev/null @@ -1,1512 +0,0 @@ -{- | -Module : CommandExecution.hs -Description : Module to coordinate command execution -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# LANGUAGE BangPatterns #-} - -module Commands.CommandExecution - ( executeCommands - , executeRenameReblockCommands - , getDataListList - ) where - -import Data.Foldable -import qualified Data.CSV as CSV -import qualified Data.List as L -import Data.Maybe -import Text.Read -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Debug.Trace -import GeneralUtilities -import GraphFormatUtilities -import System.IO -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import qualified Data.Char as C -import qualified Search.Build as B -import qualified Reconciliation.ReconcileGraphs as R -import qualified Search.Refinement as REF -import qualified Search.Search as S -import qualified Commands.Transform as TRANS -import System.Timing -import qualified Support.Support as SUP -import Data.Char -import qualified Data.List.Split as SL -import Graphs.GraphOperations as GO -import qualified GraphOptimization.Traversals as TRAV -import System.Info -import System.Process -import System.Directory -import qualified SymMatrix as S -import Data.Alphabet -import Data.Bits -import qualified Commands.Verify as VER -import qualified Input.Reorganize as IR -import qualified Data.InfList as IL - - - - --- | executeCommands reads input files and returns raw data --- need to close files after read -executeCommands :: GlobalSettings -> Int -> String -> ProcessedData -> ProcessedData -> [PhylogeneticGraph] -> [[VertexCost]] -> [Int] -> [PhylogeneticGraph] -> [Command] -> IO ([PhylogeneticGraph], GlobalSettings, [Int], [PhylogeneticGraph]) -executeCommands globalSettings numInputFiles crossReferenceString origProcessedData processedData curGraphs pairwiseDist seedList supportGraphList commandList = do - if null commandList then return (curGraphs, globalSettings, seedList, supportGraphList) - else do - let (firstOption, firstArgs) = head commandList - - -- skip "Read" and "Rename "commands already processed - if firstOption == Read then error ("Read command should already have been processed: " ++ show (firstOption, firstArgs)) - else if firstOption == Rename then error ("Rename command should already have been processed: " ++ show (firstOption, firstArgs)) - else if firstOption == Reblock then error ("Reblock command should already have been processed: " ++ show (firstOption, firstArgs)) - else if firstOption == Run then error ("Run command should already have been processed: " ++ show (firstOption, firstArgs)) - - -- other commands - else if firstOption == Build then do - (elapsedSeconds, newGraphList) <- timeOp $ pure $ B.buildGraph firstArgs globalSettings processedData pairwiseDist (head seedList) - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData (curGraphs ++ newGraphList) pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Refine then do - (elapsedSeconds, newGraphList) <- timeOp $ pure $ REF.refineGraph firstArgs globalSettings processedData (head seedList) curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData newGraphList pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Fuse then do - (elapsedSeconds, newGraphList) <- timeOp $ pure $ REF.fuseGraphs firstArgs globalSettings processedData (head seedList) curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData newGraphList pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Report then do - - let doDotPDF = any (=="dotpdf") $ fmap (fmap toLower . fst) firstArgs - let collapse' = any (=="collapse") $ fmap (fmap toLower . fst) firstArgs - let noCollapse' = any (=="nocollapse") $ fmap (fmap toLower . fst) firstArgs - - -- set default collapse for dotPDF to True, False otherwise - let collapse = if collapse' then True - else if noCollapse' then False - else if doDotPDF then True - else False - - let curGraphs' = if (not collapse) then curGraphs - else fmap U.collapseGraph curGraphs - - -- use 'temp' updated graphs s don't repeatedly add model and root complexityies - -- reporting collapsed - -- reverse sorting graphs by cost - let graphsWithUpdatedCosts = reverse (L.sortOn snd6 $ fmap (TRAV.updateGraphCostsComplexities globalSettings) curGraphs') - reportStuff@(reportString, outFile, writeMode) = reportCommand globalSettings firstArgs numInputFiles crossReferenceString processedData graphsWithUpdatedCosts supportGraphList pairwiseDist - - if null reportString then do - executeCommands globalSettings numInputFiles crossReferenceString origProcessedData processedData curGraphs pairwiseDist seedList supportGraphList (tail commandList) - else do - hPutStrLn stderr ("Report writing to " ++ outFile) - - if doDotPDF then do - let reportString' = changeDotPreamble "digraph {" "digraph G {\n\trankdir = LR;\tnode [ shape = rect];\n" reportString - printGraphVizDot reportString' outFile - executeCommands globalSettings numInputFiles crossReferenceString origProcessedData processedData curGraphs pairwiseDist seedList supportGraphList (tail commandList) - - else do - if outFile == "stderr" then hPutStr stderr reportString - else if outFile == "stdout" then putStr reportString - else if writeMode == "overwrite" then writeFile outFile reportString - else if writeMode == "append" then appendFile outFile reportString - else error ("Error 'read' command not properly formatted" ++ show reportStuff) - executeCommands globalSettings numInputFiles crossReferenceString origProcessedData processedData curGraphs pairwiseDist seedList supportGraphList (tail commandList) - - else if firstOption == Search then do - (elapsedSeconds, output) <- timeOp $ S.search firstArgs globalSettings processedData pairwiseDist (head seedList) curGraphs - --in pure result - -- (newGraphList, serchInfoList) <- S.search firstArgs globalSettings origProcessedData processedData pairwiseDist (head seedList) curGraphs - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs (fst output) (fromIntegral $ toMilliseconds elapsedSeconds) (concat $ fmap (L.intercalate "\n") $ snd output) - let newSearchData = searchInfo : (searchData globalSettings) - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData (fst output) pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Select then do - (elapsedSeconds, newGraphList) <- timeOp $ pure $ GO.selectPhylogeneticGraph firstArgs (head seedList) VER.selectArgList curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData newGraphList pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Set then - -- if set changes graph aspects--may nned to reoptimize - let (newGlobalSettings, newProcessedData, seedList') = setCommand firstArgs globalSettings processedData seedList - newGraphList = if not (requireReoptimization globalSettings newGlobalSettings) then curGraphs - else trace ("Reoptimizing gaphs") fmap (TRAV.multiTraverseFullyLabelGraph newGlobalSettings newProcessedData True True Nothing) (fmap fst6 curGraphs) - - searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList 0 "No Comment" - newSearchData = searchInfo : (searchData newGlobalSettings) - in - - executeCommands (newGlobalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData newGraphList pairwiseDist seedList' supportGraphList (tail commandList) - - else if firstOption == Swap then do - (elapsedSeconds, newGraphList) <- timeOp $ pure $ REF.swapMaster firstArgs globalSettings processedData (head seedList) curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData newGraphList pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else if firstOption == Support then do - (elapsedSeconds, newSupportGraphList) <- timeOp $ pure $ SUP.supportGraph firstArgs globalSettings processedData (head seedList) curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newSupportGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (globalSettings {searchData = newSearchData}) numInputFiles crossReferenceString origProcessedData processedData curGraphs pairwiseDist (tail seedList) (supportGraphList ++ newSupportGraphList) (tail commandList) - - else if firstOption == Transform then do - (elapsedSeconds, (newGS, newOrigData, newProcessedData, newGraphs)) <- timeOp $ pure $ TRANS.transform firstArgs globalSettings origProcessedData processedData (head seedList) curGraphs - - let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphs (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" - let newSearchData = searchInfo : (searchData globalSettings) - - executeCommands (newGS {searchData = newSearchData}) numInputFiles crossReferenceString newOrigData newProcessedData newGraphs pairwiseDist (tail seedList) supportGraphList (tail commandList) - - else error ("Command " ++ (show firstOption) ++ " not recognized/implemented") - --- | makeSearchRecord take sbefore and after data of a commend and returns SearchData record -makeSearchRecord :: Instruction -> [Argument] -> [PhylogeneticGraph] -> [PhylogeneticGraph] -> Int -> String -> SearchData -makeSearchRecord firstOption firstArgs curGraphs newGraphList elapsedTime comment = - SearchData { instruction = firstOption - , arguments = firstArgs - , minGraphCostIn = if null curGraphs then infinity - else minimum $ fmap snd6 curGraphs - , maxGraphCostIn = if null curGraphs then infinity - else maximum $ fmap snd6 curGraphs - , numGraphsIn = length curGraphs - , minGraphCostOut = if null newGraphList then infinity - else minimum $ fmap snd6 newGraphList - , maxGraphCostOut = if null newGraphList then infinity - else maximum $ fmap snd6 newGraphList - , numGraphsOut = length newGraphList - , commentString = comment - , duration = elapsedTime - } - - --- | setCommand takes arguments to change globalSettings and multiple data aspects (e.g. 'blocks') --- needs to be abtracted--too long -setCommand :: [Argument] -> GlobalSettings -> ProcessedData -> [Int] -> (GlobalSettings, ProcessedData, [Int]) -setCommand argList globalSettings processedData inSeedList = - let commandList = fmap (fmap C.toLower) $ filter (/= "") $ fmap fst argList - optionList = fmap (fmap C.toLower) $ filter (/= "") $ fmap snd argList - checkCommandList = checkCommandArgs "set" commandList VER.setArgList - leafNameVect = fst3 processedData - - in - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'set': " ++ show argList) - - -- this could be changed later - else if length commandList > 1 || length optionList > 1 then errorWithoutStackTrace ("Set option error: can only have one set argument for each command: " ++ (show (commandList,optionList))) - - -- early extraction of partition character and bc2-gt64 follows from null inputs - -- this due to not having all info required for all global settings, so optoin resitrcted and repeated - else if (null inSeedList) then - if head commandList == "partitioncharacter" then - let localPartitionChar = head optionList - in - if length localPartitionChar /= 1 then errorWithoutStackTrace ("Error in 'set' command. Partitioncharacter '" ++ (show localPartitionChar) ++ "' must be a single character") - else - trace ("PartitionCharacter set to '" ++ (head optionList) ++ "'") - (globalSettings {partitionCharacter = localPartitionChar}, processedData, inSeedList) - - else if head commandList == "bc2" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if length commandList /= length optionList then errorWithoutStackTrace ("Set option error: number of values and options do not match: " ++ (show (commandList,optionList))) - else if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " ++ (head optionList)) - else - if (bc2 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 2 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc2 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc2 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc4" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " ++ (head optionList)) - else - if (bc4 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 4 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc4 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc4 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc5" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " ++ (head optionList)) - else - if (bc5 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 5 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc5 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc5 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc8" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " ++ (head optionList)) - else - if (bc8 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 8 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc8 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc8 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc64" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " ++ (head optionList)) - else - if (bc64 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 64 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bcgt64" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " ++ (head optionList)) - else - if (bcgt64 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost > 64 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bcgt64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bcgt64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - -- partition character to reset - else - -- trace ("PartitionCharacter set to '" ++ (partitionCharacter globalSettings) ++ "'") - (globalSettings, processedData, inSeedList) - -- regular command stuff not initial at start - else - if head commandList == "bc2" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if length commandList /= length optionList then errorWithoutStackTrace ("Set option error: number of values and options do not match: " ++ (show (commandList,optionList))) - else if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " ++ (head optionList)) - else - if (bc2 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 2 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc2 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc2 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc4" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " ++ (head optionList)) - else - if (bc4 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 4 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc4 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc4 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc5" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " ++ (head optionList)) - else - if (bc5 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 5 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc5 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc5 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc8" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " ++ (head optionList)) - else - if (bc8 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 8 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc8 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc8 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bc64" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " ++ (head optionList)) - else - if (bc64 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost 64 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bc64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bc64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - - else if head commandList == "bcgt64" then - let noChangeString = takeWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - noChangeValue = readMaybe noChangeString :: Maybe Double - changeString = tail $ dropWhile (/= ',') $ filter (`notElem` ['(', ')']) $ head optionList - changeValue = readMaybe changeString :: Maybe Double - in - if (null . head) optionList then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no values found ") - else if (',' `notElem` (head optionList)) then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no comma found ") - else if isNothing noChangeValue then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " ++ (head optionList)) - else if isNothing changeValue then errorWithoutStackTrace ("Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " ++ (head optionList)) - else - if (bcgt64 globalSettings) /= (fromJust noChangeValue, fromJust changeValue) then - trace ("bit cost > 64 state set to " ++ (show (fromJust noChangeValue, fromJust changeValue))) - (globalSettings {bcgt64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - else (globalSettings {bcgt64 = (fromJust noChangeValue, fromJust changeValue)}, processedData, inSeedList) - - -- modified criterion causes changes in graphfactor and root cost - else if head commandList == "criterion" then - let localCriterion - | (head optionList == "parsimony") = Parsimony - | (head optionList == "pmdl") = PMDL - | (head optionList == "ml") = Likelihood - | otherwise = errorWithoutStackTrace ("Error in 'set' command. Criterion '" ++ (head optionList) ++ "' is not 'parsimony', 'ml', or 'pmdl'") - - -- create lazy list of graph complexity indexed by number of network nodes--need leaf number for base tree complexity - lGraphComplexityList = if localCriterion == Parsimony then IL.repeat (0.0, 0.0) - else if localCriterion `elem` [PMDL, Likelihood] then U.calculateGraphComplexity processedData - else errorWithoutStackTrace ("Optimality criterion not recognized: " ++ (show localCriterion)) - - lRootComplexity = if localCriterion == Parsimony then 0.0 - else if localCriterion `elem` [PMDL, Likelihood] then U.calculateW15RootCost processedData - else error ("Optimality criterion not recognized: " ++ (show localCriterion)) - - lGraphFactor = if localCriterion `elem` [PMDL, Likelihood] then PMDLGraph - else graphFactor globalSettings - in - trace ("Optimality criterion set to " ++ (show localCriterion) ++ " Tree Complexity = " ++ (show $ fst $ IL.head lGraphComplexityList) ++ " bits") - (globalSettings {optimalityCriterion = localCriterion, graphComplexityList = lGraphComplexityList, rootComplexity = lRootComplexity, graphFactor = lGraphFactor}, processedData, inSeedList) - - else if head commandList == "compressresolutions" then - let localCriterion - | (head optionList == "true") = True - | (head optionList == "false") = False - | otherwise = errorWithoutStackTrace ("Error in 'set' command. CompressResolutions '" ++ (head optionList) ++ "' is not 'true' or 'false'") - in - trace ("CompressResolutions set to " ++ head optionList) - (globalSettings {compressResolutions = localCriterion}, processedData, inSeedList) - - -- this not intended to be for users - else if head commandList == "dynamicepsilon" then - let localValue = readMaybe (head optionList) :: Maybe Double - in - if localValue == Nothing then error ("Set option 'dynamicEpsilon' must be set to an double value >= 0.0 (e.g. dynamicepsilon:0.02): " ++ (head optionList)) - else if (fromJust localValue) < 0.0 then errorWithoutStackTrace ("Set option 'dynamicEpsilon' must be set to an double value >= 0.0 (e.g. dynamicepsilon:0.02): " ++ (head optionList)) - else - trace ("Dynammic Epsilon factor set to " ++ head optionList) - (globalSettings {dynamicEpsilon = 1.0 + (fromJust localValue)}, processedData, inSeedList) - - else if head commandList == "finalassignment" then - let localMethod - | ((head optionList == "do") || (head optionList == "directoptimization")) = DirectOptimization - | ((head optionList == "ia") || (head optionList == "impliedalignment")) = ImpliedAlignment - | otherwise = errorWithoutStackTrace ("Error in 'set' command. FinalAssignment '" ++ (head optionList) ++ "' is not 'DirectOptimization (DO)' or 'ImpliedAlignment (IA)'") - in - if (graphType globalSettings) == Tree then - trace ("FinalAssignment set to " ++ head optionList) - (globalSettings {finalAssignment = localMethod}, processedData, inSeedList) - else if localMethod == DirectOptimization then - (globalSettings {finalAssignment = localMethod}, processedData, inSeedList) - else - trace ("FinalAssignment set to DO (ignoring IA option) for non-Tree graphs") - (globalSettings {finalAssignment = DirectOptimization}, processedData, inSeedList) - - else if head commandList == "graphfactor" then - let localMethod - | (head optionList == "nopenalty") = NoNetworkPenalty - | (head optionList == "w15") = Wheeler2015Network - | (head optionList == "w23") = Wheeler2023Network - | (head optionList == "pmdl") = PMDLGraph - | otherwise = errorWithoutStackTrace ("Error in 'set' command. GraphFactor '" ++ (head optionList) ++ "' is not 'NoPenalty', 'W15', 'W23', or 'PMDL'") - in - trace ("GraphFactor set to " ++ (show localMethod)) - (globalSettings {graphFactor = localMethod}, processedData, inSeedList) - - else if head commandList == "graphtype" then - let localGraphType - | (head optionList == "tree") = Tree - | (head optionList == "softwired") = SoftWired - | (head optionList == "hardwired") = HardWired - | otherwise = errorWithoutStackTrace ("Error in 'set' command. Graphtype '" ++ (head optionList) ++ "' is not 'tree', 'hardwired', or 'softwired'") - in - if localGraphType /= Tree then - trace ("Graphtype set to " ++ (head optionList) ++ " and final assignment to DO") - (globalSettings {graphType = localGraphType, finalAssignment = DirectOptimization}, processedData, inSeedList) - else - trace ("Graphtype set to " ++ head optionList) - (globalSettings {graphType = localGraphType}, processedData, inSeedList) - - else if head commandList == "modelcomplexity" then - let localValue = readMaybe (head optionList) :: Maybe Double - in - if localValue == Nothing then error ("Set option 'modelComplexity' must be set to a double value (e.g. modelComplexity:123.456): " ++ (head optionList)) - else - trace ("Model Complexity set to " ++ head optionList) - (globalSettings {modelComplexity = (fromJust localValue)}, processedData, inSeedList) - - else if head commandList == "outgroup" then - let outTaxonName = T.pack $ filter (/= '"') $ head $ filter (/= "") $ fmap snd argList - outTaxonIndex = V.elemIndex outTaxonName leafNameVect - - in - if isNothing outTaxonIndex then errorWithoutStackTrace ("Error in 'set' command. Out-taxon " ++ T.unpack outTaxonName ++ " not found in input leaf list" ++ show (fmap (T.unpack) leafNameVect)) - else trace ("Outgroup set to " ++ T.unpack outTaxonName) (globalSettings {outgroupIndex = fromJust outTaxonIndex, outGroupName = outTaxonName}, processedData, inSeedList) - - else if head commandList == "partitioncharacter" then - let localPartitionChar = head optionList - in - if length localPartitionChar /= 1 then errorWithoutStackTrace ("Error in 'set' command. Partitioncharacter '" ++ (show localPartitionChar) ++ "' must be a single character") - else - if (localPartitionChar /= partitionCharacter globalSettings) then - trace ("PartitionCharacter set to '" ++ (head optionList) ++ "'") - (globalSettings {partitionCharacter = localPartitionChar}, processedData, inSeedList) - else - (globalSettings, processedData, inSeedList) - - else if head commandList == "rootcost" then - let localMethod - | (head optionList == "norootcost") = NoRootCost - | (head optionList == "w15") = Wheeler2015Root - | (head optionList == "pmdl") = PMDLRoot - | (head optionList == "ml") = MLRoot - | otherwise = errorWithoutStackTrace ("Error in 'set' command. RootCost '" ++ (head optionList) ++ "' is not 'NoRootCost', 'W15', or 'PMDL'") - - lRootComplexity = if localMethod == NoRootCost then 0.0 - else if localMethod `elem` [Wheeler2015Root, PMDLRoot, MLRoot] then U.calculateW15RootCost processedData - else error ("Root cost method not recognized: " ++ (show localMethod)) - in - trace ("RootCost set to " ++ (show localMethod) ++ " " ++ (show lRootComplexity) ++ " bits") - (globalSettings {rootCost = localMethod, rootComplexity = lRootComplexity}, processedData, inSeedList) - - else if head commandList == "seed" then - let localValue = readMaybe (head optionList) :: Maybe Int - in - if localValue == Nothing then error ("Set option 'seed' must be set to an integer value (e.g. seed:123): " ++ (head optionList)) - else - trace ("Random Seed set to " ++ head optionList) - (globalSettings {seed = (fromJust localValue)}, processedData, randomIntList (fromJust localValue)) - - else trace ("Warning--unrecognized/missing 'set' option in " ++ show argList) (globalSettings, processedData, inSeedList) - - - --- | reportCommand takes report options, current data and graphs and returns a --- (potentially large) String to print and the channel to print it to --- and write mode overwrite/append -reportCommand :: GlobalSettings -> [Argument] -> Int -> String -> ProcessedData -> [PhylogeneticGraph] -> [PhylogeneticGraph] -> [[VertexCost]] -> (String, String, String) -reportCommand globalSettings argList numInputFiles crossReferenceString processedData curGraphs supportGraphs pairwiseDistanceMatrix = - let argListWithoutReconcileCommands = filter ((`notElem` VER.reconcileArgList) .fst) argList - --check for balances double quotes and only one pair - outFileNameList = filter (/= "") $ fmap snd argListWithoutReconcileCommands --argList - commandList = fmap (fmap C.toLower) $ filter (/= "") $ fmap fst argListWithoutReconcileCommands - -- reconcileList = filter (/= "") $ fmap fst argList - in - if length outFileNameList > 1 then errorWithoutStackTrace ("Report can only have one file name: " ++ (show outFileNameList) ++ " " ++ (show argList)) - else - let checkCommandList = checkCommandArgs "report" commandList VER.reportArgList - outfileName = if null outFileNameList then "stderr" - else tail $ init $ head outFileNameList - writeMode = if "overwrite" `elem` commandList then "overwrite" - else "append" - - in - -- error too harsh, lose everything else - --if (null $ filter (/= "overwrite") $ filter (/= "append") commandList) then errorWithoutStackTrace ("Error: Missing 'report' option in " ++ show commandList) - --else - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in report: " ++ show argList) - else - -- This for reconciled data - if "crossrefs" `elem` commandList then - let dataString = crossReferenceString - in - (dataString, outfileName, writeMode) - - else if "data" `elem` commandList then - let dataString = phyloDataToString 0 $ thd3 processedData - baseData = ("There were " ++ (show numInputFiles) ++ " input data files with " ++ show (length $ thd3 processedData) ++ " blocks and " ++ (show ((length dataString) - 1)) ++ " total characters\n") - charInfoFields = ["Index", "Block", "Name", "Type", "Activity", "Weight", "Prealigned", "Alphabet", "TCM"] - in - (baseData ++ CSV.genCsvFile (charInfoFields : dataString), outfileName, writeMode) - - else if "diagnosis" `elem` commandList then - let dataString = CSV.genCsvFile $ concatMap (getGraphDiagnosis processedData) (zip curGraphs [0.. ((length curGraphs) - 1)]) - in - if null curGraphs then - trace ("No graphs to diagnose") - ("No graphs to diagnose", outfileName, writeMode) - else - trace ("Diagnosing " ++ (show $ length curGraphs) ++ " graphs at minimum cost " ++ (show $ minimum $ fmap snd6 curGraphs)) - (dataString, outfileName, writeMode) - - else if "displaytrees" `elem` commandList then - -- need to specify -O option for multiple graphs - let inputDisplayVVList = fmap fth6 curGraphs - costList = fmap snd6 curGraphs - displayCostListList = fmap GO.getDisplayTreeCostList curGraphs - displayInfoString = ("DisplayTree costs : " ++ (show (fmap sum $ fmap fst displayCostListList, displayCostListList))) - treeIndexStringList = fmap ((++ "\n") . ("Canonical Tree " ++)) (fmap show [0..(length inputDisplayVVList - 1)]) - canonicalGraphPairList = zip treeIndexStringList inputDisplayVVList - blockStringList = concatMap (++ "\n") (fmap (outputBlockTrees commandList costList (outgroupIndex globalSettings)) canonicalGraphPairList) - -- graphString = outputGraphString commandList (outgroupIndex globalSettings) (fmap thd6 curGraphs) (fmap snd6 curGraphs) - in - if null curGraphs || (graphType globalSettings) /= SoftWired then - trace ("No soft-wired graphs to report display trees") - ("No soft-wired graphs to report display trees", outfileName, writeMode) - else - (displayInfoString ++ "\n" ++ blockStringList, outfileName, writeMode) - - - else if "graphs" `elem` commandList then - --else if (not .null) (L.intersect ["graphs", "newick", "dot", "dotpdf"] commandList) then - let - graphString = outputGraphString commandList (outgroupIndex globalSettings) (fmap thd6 curGraphs) (fmap snd6 curGraphs) - in - if null curGraphs then - trace ("No graphs to report") - ("No graphs to report", outfileName, writeMode) - else - trace ("Reporting " ++ (show $ length curGraphs) ++ " graph(s) at minimum cost " ++ (show $ minimum $ fmap snd6 curGraphs)) - (graphString, outfileName, writeMode) - - else if "ia" `elem` commandList || "impliedalignment" `elem` commandList then - if null curGraphs then - trace ("No graphs to create implied alignments") - ("No impliedAlgnments to report", outfileName, writeMode) - else - let includeMissing = any (=="includemissing") commandList - concatSeqs = any (=="concatenate") commandList - iaContentList = zipWith (getImpliedAlignmentString globalSettings (includeMissing || concatSeqs) concatSeqs processedData) curGraphs [0.. (length curGraphs - 1)] - in - trace ("\tWarning--prealigned sequence data with non-additive type costs (all change values equal) have been recoded to non-additve characters and will not appear in implied alignment output.") - (concat iaContentList, outfileName, writeMode) - - else if "pairdist" `elem` commandList then - let nameData = L.intercalate "," (V.toList $ fmap T.unpack $ fst3 processedData) ++ "\n" - dataString = CSV.genCsvFile $ fmap (fmap show) pairwiseDistanceMatrix - in - (nameData ++ dataString, outfileName, writeMode) - - else if "reconcile" `elem` commandList then - let (reconcileString, _) = R.makeReconcileGraph VER.reconcileArgList argList (fmap fst6 curGraphs) - in - if null curGraphs then - trace ("No graphs to reconcile") - ([], outfileName, writeMode) - else - (reconcileString, outfileName, writeMode) - - else if "search" `elem` commandList then - let dataString = fmap showSearchFields $ reverse $ searchData globalSettings - baseData = ("SearchData\n") - charInfoFields = ["Command", "Arguments", "Min cost in", "Max cost in", "Num graphs in", "Min cost out", "Max cost out", "Num graphs out", "Duration (secs)", "Comment"] - in - (baseData ++ CSV.genCsvFile (charInfoFields : dataString), outfileName, writeMode) - - else if "support" `elem` commandList then - let graphString = outputGraphStringSimple commandList (outgroupIndex globalSettings) (fmap fst6 supportGraphs) (fmap snd6 supportGraphs) - in - -- trace ("Rep Sup: " ++ (LG.prettify $ fst6 $ head supportGraphs)) ( - if null supportGraphs then - trace ("\tNo support graphs to report") - ([], outfileName, writeMode) - else - trace ("Reporting " ++ (show $ length curGraphs) ++ " support graph(s)") - (graphString, outfileName, writeMode) - -- ) - - else if "tnt" `elem` commandList then - if null curGraphs then - trace ("No graphs to create implied alignments for TNT output") - ("No impliedAlgnments for TNT to report", outfileName, writeMode) - else - let tntContentList = zipWith (getTNTString globalSettings processedData) curGraphs [0.. (length curGraphs - 1)] - in - (concat tntContentList, outfileName, writeMode) - - else - trace ("\nWarning--unrecognized/missing report option in " ++ (show commandList) ++ " defaulting to 'graphs'") ( - let graphString = outputGraphString commandList (outgroupIndex globalSettings) (fmap thd6 curGraphs) (fmap snd6 curGraphs) - in - if null curGraphs then - trace ("No graphs to report") - ("No graphs to report", outfileName, writeMode) - else - trace ("Reporting " ++ (show $ length curGraphs) ++ " graph(s) at minimum cost " ++ (show $ minimum $ fmap snd6 curGraphs) ++"\n") - (graphString, outfileName, writeMode) - ) - --- changeDotPreamble takes an input string to search for and a new one to add in its place --- searches through dot file (can have multipl graphs) replacing teh search string each time. -changeDotPreamble :: String -> String -> String -> String -changeDotPreamble findString newString inDotString = - if null inDotString then [] - else - changePreamble' findString newString [] (lines inDotString) - --- changeDotPreamble' internal process for changeDotPreamble -changePreamble' :: String -> String -> [String] -> [String] -> String -changePreamble' findString newString accumList inLineList = - if null inLineList then unlines $ reverse accumList - else - -- trace ("CP':" ++ (head inLineList) ++ " " ++ findString ++ " " ++ newString) ( - let firstLine = head inLineList - in - if firstLine == findString then changePreamble' findString newString (newString : accumList) (tail inLineList) - else changePreamble' findString newString (firstLine : accumList) (tail inLineList) - -- ) - ---printGraph graphviz simple dot file of graph ---execute with "dot -Tps test.dot -o test.ps" ---need to add output to argument filename and call ---graphviz via System.Process.runprocess ---also, reorder GenForest so smalles (num leaves) is either first or ---last so can print small to large all the way so easier to read -printGraphVizDot :: String -> String -> IO () -printGraphVizDot graphDotString dotFile = - if null graphDotString then error "No graph to report" - else do - myHandle <- openFile dotFile WriteMode - if os /= "darwin" then hPutStrLn stderr ("\tOutputting graphviz to " ++ dotFile ++ ".pdf.") - else hPutStrLn stderr ("\tOutputting graphviz to " ++ dotFile ++ ".eps.") - let outputType = if os == "darwin" then "-Teps" - else "-Tpdf" - --hPutStrLn myHandle "digraph G {" - --hPutStrLn myHandle "\trankdir = LR;" - --hPutStrLn myHandle "\tnode [ shape = rect];" - --hPutStr myHandle $ (unlines . tail . lines) graphDotString - hPutStr myHandle graphDotString - -- hPutStrLn myHandle "}" - hClose myHandle - pCode <- findExecutable "dot" --system "dot" --check for Graphviz - {- - hPutStrLn stderr - (if isJust pCode then --pCode /= Nothing then - "executed dot " ++ outputType ++ dotFile ++ " -O " else - "Graphviz call failed (not installed or found). Dot file still created. Dot can be obtained from https://graphviz.org/download") - -} - if isJust pCode then do - _ <- createProcess (proc "dot" [outputType, dotFile, "-O"]) - hPutStrLn stderr ("\tExecuted dot " ++ outputType ++ " " ++ dotFile ++ " -O ") - else - hPutStrLn stderr "\tGraphviz call failed (not installed or found). Dot file still created. Dot can be obtained from https://graphviz.org/download" - - --- | showSearchFields cretes a String list for SearchData Fields -showSearchFields :: SearchData -> [String] -showSearchFields sD = - [show $ instruction sD, concat $ fmap showArg $ arguments sD, show $ minGraphCostIn sD, show $ maxGraphCostIn sD, show $ numGraphsIn sD, show $ minGraphCostOut sD, show $ maxGraphCostOut sD, show $ numGraphsOut sD, - show $ ((fromIntegral $ duration sD) / 1000 :: Double), commentString sD] - where showArg a = "(" ++ (fst a) ++ "," ++ (snd a) ++ ")" - --- | requireReoptimization checks if set command in globalk settings requires reoptimization of graphs due to change in --- graph type, optimality criterion etc. -requireReoptimization :: GlobalSettings -> GlobalSettings -> Bool -requireReoptimization gsOld gsNew = - if graphType gsOld /= graphType gsNew then True - else if optimalityCriterion gsOld /= optimalityCriterion gsNew then True - else if finalAssignment gsOld /= finalAssignment gsNew then True - else if graphFactor gsOld /= graphFactor gsNew then True - else if rootCost gsOld /= rootCost gsNew then True - else False - --- | outputBlockTrees takes a PhyloGeneticTree and outputs BlockTrees -outputBlockTrees :: [String] -> [VertexCost] -> Int -> (String , V.Vector [DecoratedGraph]) -> String -outputBlockTrees commandList costList lOutgroupIndex (labelString, graphLV) = - let blockIndexStringList = fmap ((++ "\n") . ("Block " ++)) (fmap show [0..((V.length graphLV) - 1)]) - blockStrings = concatMap (++ "\n") (fmap (makeBlockGraphStrings commandList costList lOutgroupIndex ) $ zip blockIndexStringList (V.toList graphLV)) - in - labelString ++ blockStrings - --- | makeBlockGraphStrings makes individual block display trees--potentially multiple -makeBlockGraphStrings :: [String] -> [VertexCost] -> Int -> (String ,[DecoratedGraph]) -> String -makeBlockGraphStrings commandList costList lOutgroupIndex (labelString, graphL) = - let diplayIndexString =("Display Tree(s): " ++ show (length graphL) ++ "\n") - displayString = (++ "\n") $ outputDisplayString commandList costList lOutgroupIndex graphL - in - labelString ++ diplayIndexString ++ displayString - --- | outputDisplayString is a wrapper around graph output functions--but without cost list -outputDisplayString :: [String] -> [VertexCost] -> Int -> [DecoratedGraph] -> String -outputDisplayString commandList costList lOutgroupIndex graphList - | "dot" `elem` commandList = makeDotList costList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - | "newick" `elem` commandList = GO.makeNewickList (not (any (=="nobranchlengths") commandList)) (not (any (=="nohtulabels") commandList)) lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) (replicate (length graphList) 0.0) - | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - | otherwise = -- "dot" as default - makeDotList costList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - --- | outputGraphString is a wrapper around graph output functions -outputGraphString :: [String] -> Int -> [DecoratedGraph] -> [VertexCost] -> String -outputGraphString commandList lOutgroupIndex graphList costList - | "dot" `elem` commandList = makeDotList costList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - | "newick" `elem` commandList = GO.makeNewickList (not (any (=="nobranchlengths") commandList)) (not (any (=="nohtulabels") commandList)) lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) costList - | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - | otherwise = -- "dot" as default - makeDotList costList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) - --- | outputGraphStringSimple is a wrapper around graph output functions -outputGraphStringSimple :: [String] -> Int -> [SimpleGraph] -> [VertexCost] -> String -outputGraphStringSimple commandList lOutgroupIndex graphList costList - | "dot" `elem` commandList = makeDotList costList lOutgroupIndex graphList - | "newick" `elem` commandList = GO.makeNewickList True True lOutgroupIndex graphList costList - | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex graphList - | otherwise = -- "dot" as default - makeDotList costList lOutgroupIndex graphList - - --- | makeDotList takes a list of fgl trees and outputs a single String cointaining the graphs in Dot format --- need to specify -O option for multiple graph(outgroupIndex globalSettings)s -makeDotList :: [VertexCost] -> Int -> [SimpleGraph] -> String -makeDotList costList rootIndex graphList = - let graphStringList = fmap fgl2DotString $ fmap (LG.rerootTree rootIndex) graphList - costStringList = fmap ("\n//" ++) $ fmap show costList - in - L.intercalate "\n" (zipWith (++) graphStringList costStringList) - --- | makeAsciiList takes a list of fgl trees and outputs a single String cointaining the graphs in ascii format -makeAsciiList :: Int -> [SimpleGraph] -> String -makeAsciiList rootIndex graphList = - concatMap LG.prettify (fmap (LG.rerootTree rootIndex) graphList) - -{- Older version wiht more data dependenncy --- | getDataListList returns a list of lists of Strings for data output as csv --- for row is source file names, suubsequent rows by taxon with +/- for present absent taxon in --- input file -getDataListList :: [RawData] -> ProcessedData -> [[String]] -getDataListList inDataList processedData = - if null inDataList then [] - else - let fileNames = " " : fmap (takeWhile (/= ':')) (fmap T.unpack $ fmap name $ fmap head $ fmap snd inDataList) - fullTaxList = V.toList $ fst3 processedData - presenceAbsenceList = fmap (isThere inDataList) fullTaxList - fullMatrix = zipWith (:) (fmap T.unpack fullTaxList) presenceAbsenceList - in - --trace (show fileNames) - fileNames : fullMatrix --} - --- | getDataListList returns a list of lists of Strings for data output as csv --- for row is source file names, subsequent rows by taxon with +/- for present absent taxon in --- input file --- different from getDataListList in removeal or processed data requiremenrt replaced with taxan name list -getDataListList :: [RawData] -> [T.Text] -> [[String]] -getDataListList inDataList fullTaxList = - if null inDataList then [] - else - let fileNames = " " : fmap (takeWhile (/= ':')) (fmap T.unpack $ fmap name $ fmap head $ fmap snd inDataList) - presenceAbsenceList = fmap (isThere inDataList) fullTaxList - fullMatrix = zipWith (:) (fmap T.unpack fullTaxList) presenceAbsenceList - in - --trace (show fileNames) - fileNames : fullMatrix - --- | isThere takes a list of Rawdata and reurns a String of + - -isThere :: [RawData] -> T.Text -> [String] -isThere inData inName = - if null inData then [] - else - let firstTaxList = fmap fst $ fst $ head inData - in - if inName `elem` firstTaxList then "+" : isThere (tail inData) inName - else "-" : isThere (tail inData) inName - --- | phyloDataToString converts RawData type to String --- for additive chars--multiply states by weight is < 1 when outputtting due to conversion on input -phyloDataToString :: Int -> V.Vector BlockData -> [[String]] -phyloDataToString charIndexStart inDataVect = - if V.null inDataVect then [] - else - let (blockName, _, charInfoVect) = V.head inDataVect - charStrings = zipWith (:) (replicate (V.length charInfoVect) (T.unpack blockName)) (getCharInfoStrings <$> V.toList charInfoVect) - charNumberString = fmap show [charIndexStart..(charIndexStart + length charStrings - 1)] - fullMatrix = zipWith (:) charNumberString charStrings - in - fullMatrix ++ phyloDataToString (charIndexStart + length charStrings) (V.tail inDataVect) - --- | getCharInfoStrings takes charInfo and returns list of Strings of fields -getCharInfoStrings :: CharInfo -> [String] -getCharInfoStrings inChar = - let activityString = if (activity inChar) then "active" - else "inactive" - prealignedString = if (prealigned inChar) then "prealigned" - else "unaligned" - in - [T.unpack $ name inChar, show $ charType inChar, activityString, show $ weight inChar, prealignedString] <> (fmap ST.toString . toList $ alphabet inChar) <> [show $ costMatrix inChar] - --- | executeRenameReblockCommands takes all the "Rename commands" pairs and --- creates a list of pairs of new name and list of old names to be converted --- as Text -executeRenameReblockCommands :: Instruction -> [(T.Text, T.Text)] -> [Command] -> IO [(T.Text, T.Text)] -executeRenameReblockCommands thisInStruction curPairs commandList = - if null commandList then return curPairs - else do - let (firstOption, firstArgs) = head commandList - - -- skip "Read" and "Rename "commands already processed - if (firstOption /= thisInStruction) then executeRenameReblockCommands thisInStruction curPairs (tail commandList) - else - let newName = T.filter C.isPrint $ T.filter (/= '"') $ T.pack $ snd $ head firstArgs - newNameList = replicate (length $ tail firstArgs) newName - oldNameList = (fmap (T.filter (/= '"') . T.pack) (fmap snd $ tail firstArgs)) - newPairs = zip newNameList oldNameList - in - executeRenameReblockCommands thisInStruction (curPairs ++ newPairs) (tail commandList) - --- | getGraphDiagnosis creates basic for CSV of graph vertex and node information --- nodes first then vertices -getGraphDiagnosis :: ProcessedData -> (PhylogeneticGraph, Int) -> [[String]] -getGraphDiagnosis inData (inGraph, graphIndex) = - let decGraph = thd6 inGraph - in - if LG.isEmpty decGraph then [] - else - let vertexList = LG.labNodes decGraph - edgeList = LG.labEdges decGraph - topHeaderList = ["Graph Index", "Vertex Index", "Vertex Name", "Vertex Type", "Child Vertices", "Parent Vertices", "Data Block", "Character Name", "Character Type", "Preliminary State", "Final State", "Local Cost"] - vertexInfoList = concatMap (getVertexCharInfo (thd3 inData) (fst6 inGraph) (six6 inGraph)) vertexList - edgeHeaderList = [[" "],[" ", "Edge Head Vertex", "Edge Tail Vertex", "Edge Type", "Minimum Length", "Maximum Length", "MidRange Length"]] - edgeInfoList = fmap getEdgeInfo edgeList - in - [topHeaderList, [show graphIndex]] ++ vertexInfoList ++ edgeHeaderList ++ edgeInfoList - --- | getVertexCharInfo returns a list of list of Strings of vertex infomation --- one list for each character at the vertex -getVertexCharInfo :: V.Vector BlockData -> SimpleGraph -> V.Vector (V.Vector CharInfo) -> LG.LNode VertexInfo -> [[String]] -getVertexCharInfo blockDataVect inGraph charInfoVectVect inVert = - let leafParents = LG.parents inGraph (fst inVert) - parentNodes - | nodeType (snd inVert) == RootNode = "None" - | nodeType (snd inVert) == LeafNode = show leafParents - | otherwise = show $ parents (snd inVert) - childNodes = if nodeType (snd inVert) == LeafNode then "None" else show $ children (snd inVert) - basicInfoList = [" ", show $ fst inVert, T.unpack $ vertName (snd inVert), show $ nodeType (snd inVert), childNodes, parentNodes, " ", " ", " ", " ", " ", show $ vertexCost (snd inVert)] - blockCharVect = V.zip3 (V.map fst3 blockDataVect) (vertData (snd inVert)) charInfoVectVect - blockInfoList = concat $ V.toList $ V.map getBlockList blockCharVect - in - basicInfoList : blockInfoList - --- | getBlockList takes a pair of Vector of chardata and vector of charInfo and returns Strings -getBlockList :: (NameText, V.Vector CharacterData, V.Vector CharInfo) -> [[String]] -getBlockList (blockName, blockDataVect, charInfoVect) = - let firstLine = [" ", " ", " ", " ", " ", " ", T.unpack blockName] - charlines = V.toList $ V.map makeCharLine (V.zip blockDataVect charInfoVect) - in - firstLine : charlines - --- | makeCharLine takes character data --- will be less legible for optimized data--so should use a diagnosis --- based on "naive" data for human legible output --- need to add back-converting to observed states using alphabet in charInfo --- nothing here for packed since not "entered" -makeCharLine :: (CharacterData, CharInfo) -> [String] -makeCharLine (blockDatum, charInfo) = - let localType = charType charInfo - localAlphabet = fmap ST.toString $ alphabet charInfo - isPrealigned = if prealigned charInfo == True then "Prealigned " - else "" - enhancedCharType = if localType `elem` sequenceCharacterTypes then (isPrealigned ++ (show localType)) - else if localType `elem` exactCharacterTypes then (show localType) - else error ("Character Type :" ++ (show localType) ++ "unrecogniized or not implemented") - - (stringPrelim, stringFinal) = if localType == Add then (show $ snd3 $ rangePrelim blockDatum, show $ rangeFinal blockDatum) - else if localType == NonAdd then (concat $ V.map (U.bitVectToCharState localAlphabet) $ snd3 $ stateBVPrelim blockDatum, concat $ V.map (U.bitVectToCharState localAlphabet) $ stateBVFinal blockDatum) - else if localType `elem` packedNonAddTypes then (UV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ packedNonAddPrelim blockDatum, UV.foldMap (U.bitVectToCharState localAlphabet) $ packedNonAddFinal blockDatum) - else if localType == Matrix then (show $ matrixStatesPrelim blockDatum, show $ fmap (fmap fst3) $ matrixStatesFinal blockDatum) - else if localType `elem` sequenceCharacterTypes - then case localType of - x | x `elem` [SlimSeq, NucSeq ] -> (SV.foldMap (U.bitVectToCharState localAlphabet) $ slimPrelim blockDatum, SV.foldMap (U.bitVectToCharState localAlphabet) $ slimFinal blockDatum) - x | x `elem` [WideSeq, AminoSeq] -> (UV.foldMap (U.bitVectToCharState localAlphabet) $ widePrelim blockDatum, UV.foldMap (U.bitVectToCharState localAlphabet) $ wideFinal blockDatum) - x | x `elem` [HugeSeq] -> ( foldMap (U.bitVectToCharState localAlphabet) $ hugePrelim blockDatum, foldMap (U.bitVectToCharState localAlphabet) $ hugeFinal blockDatum) - x | x `elem` [AlignedSlim] -> (SV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedSlimPrelim blockDatum, SV.foldMap (U.bitVectToCharState localAlphabet) $ alignedSlimFinal blockDatum) - x | x `elem` [AlignedWide] -> (UV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedWidePrelim blockDatum, UV.foldMap (U.bitVectToCharState localAlphabet) $ alignedWideFinal blockDatum) - x | x `elem` [AlignedHuge] -> ( foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedHugePrelim blockDatum, foldMap (U.bitVectToCharState localAlphabet) $ alignedHugeFinal blockDatum) - - _ -> error ("Un-implemented data type " ++ show localType) - else error ("Un-implemented data type " ++ show localType) - in - [" ", " ", " ", " ", " ", " ", " ", T.unpack $ name charInfo, enhancedCharType, stringPrelim, stringFinal, show $ localCost blockDatum] - - --- | getEdgeInfo returns a list of Strings of edge infomation -getEdgeInfo :: LG.LEdge EdgeInfo -> [String] -getEdgeInfo inEdge = - [" ", show $ fst3 inEdge, show $ snd3 inEdge, show $ edgeType (thd3 inEdge), show $ minLength (thd3 inEdge), show $ maxLength (thd3 inEdge), show $ midRangeLength (thd3 inEdge)] - --- | TNT report functions - --- | getTNTStrings returns as a single String the implied alignments of all sequence characters --- softwired use display trees, hardWired transform to softwired then proceed with display trees --- key to keep cost matrices and weights -getTNTString :: GlobalSettings -> ProcessedData -> PhylogeneticGraph -> Int -> String -getTNTString inGS inData inGraph graphNumber = - if LG.isEmpty (fst6 inGraph) then error "No graphs for create TNT data for in getTNTString" - else - let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) - leafNameList = fmap (++ "\t") $ fmap T.unpack $ fmap (vertName . snd) leafList - headerString = "xread\n'TNT data for Graph " ++ (show graphNumber) ++ " generated by PhylogeneticGraph (PhyG)\n\tSource characters:\n" - finalString = "proc/;\n\n" - numTaxa = V.length $ fst3 inData - charInfoVV = six6 inGraph - - -- get character information in 3-tuples and list of lengths--to match lengths - ccCodeInfo = getCharacterInfo charInfoVV - - in - - if graphType inGS == Tree then - let leafDataList = V.fromList $ fmap (vertData . snd) leafList - - -- get character strings - taxonCharacterStringList = V.toList $ fmap (++ "\n") $ fmap (getTaxonCharString charInfoVV) leafDataList - nameCharStringList = concat $ zipWith (++) leafNameList taxonCharacterStringList - - -- length information for cc code extents - charLengthList = concat $ V.toList $ V.zipWith getBlockLength (V.head leafDataList) charInfoVV - - -- Block/Character names for use in comment to show sources of new characters - charNameList = concat $ V.toList $ fmap getBlockNames charInfoVV - - nameLengthPairList = zip charNameList charLengthList - nameLengthString = concat $ pairListToStringList nameLengthPairList 0 - - -- merge lengths and cc codes - ccCodeString = mergeCharInfoCharLength ccCodeInfo charLengthList 0 - in - -- trace ("GTNTS:" ++ (show charLengthList)) - headerString ++ nameLengthString ++ "'\n" ++ (show $ sum charLengthList) ++ " " ++ (show numTaxa) ++ "\n" - ++ nameCharStringList ++ ";\n" ++ ccCodeString ++ finalString - - - -- for softwired networks--use display trees - else if graphType inGS == SoftWired then - - -- get display trees for each data block-- takes first of potentially multiple - let middleStuffString = createDisplayTreeTNT inGS inData inGraph - in - headerString ++ middleStuffString ++ finalString - - -- for hard-wired networks--transfoirm to softwired and use display trees - else if graphType inGS == HardWired then - let newGS = inGS {graphType = SoftWired} - - pruneEdges = False - warnPruneEdges = False - startVertex = Nothing - - newGraph = TRAV.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex (fst6 newGraph) - - middleStuffString = createDisplayTreeTNT inGS inData newGraph - - in - trace ("There is no implied alignment for hard-wired graphs--at least not yet. Ggenerating TNT text via softwired transformation") - --headerString ++ nameLengthString ++ "'\n" ++ (show $ sum charLengthList) ++ " " ++ (show numTaxa) ++ "\n" - -- ++ nameCharStringList ++ ";\n" ++ ccCodeString ++ finalString - headerString ++ middleStuffString ++ finalString - - else - trace ("TNT not yet implemented for graphtype " ++ show (graphType inGS)) - ("There is no implied alignment for "++ (show (graphType inGS))) - --- | createDisplayTreeTNT take a softwired graph and creates TNT data string -createDisplayTreeTNT :: GlobalSettings -> ProcessedData -> PhylogeneticGraph -> String -createDisplayTreeTNT inGS inData inGraph = - let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) - leafNameList = fmap (++ "\t") $ fmap T.unpack $ fmap (vertName . snd) leafList - charInfoVV = six6 inGraph - numTaxa = V.length $ fst3 inData - ccCodeInfo = getCharacterInfo charInfoVV - blockDisplayList = fmap GO.convertDecoratedToSimpleGraph $ fmap head $ fth6 inGraph - - -- create seprate processed data for each block - blockProcessedDataList = fmap (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) - - -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to creeate IAs - decoratedBlockTreeList = V.zipWith (TRAV.multiTraverseFullyLabelGraph' (inGS {graphType = Tree}) False False Nothing) blockProcessedDataList blockDisplayList - - -- create leaf data by merging display graph block data (each one a phylogentic graph) - (leafDataList, mergedCharInfoVV) = mergeDataBlocks (V.toList decoratedBlockTreeList) [] [] - - -- get character strings - taxonCharacterStringList = V.toList $ fmap (++ "\n") $ fmap (getTaxonCharString mergedCharInfoVV) leafDataList - nameCharStringList = concat $ zipWith (++) leafNameList taxonCharacterStringList - - -- length information for cc code extents - charLengthList = concat $ V.toList $ V.zipWith getBlockLength (V.head leafDataList) mergedCharInfoVV - - -- Block/Character names for use in comment to show sources of new characters - charNameList = concat $ V.toList $ fmap getBlockNames charInfoVV - - nameLengthPairList = zip charNameList charLengthList - nameLengthString = concat $ pairListToStringList nameLengthPairList 0 - - -- merge lengths and cc codes - ccCodeString = mergeCharInfoCharLength ccCodeInfo charLengthList 0 - in - nameLengthString ++ "'\n" ++ (show $ sum charLengthList) ++ " " ++ (show numTaxa) ++ "\n" - ++ nameCharStringList ++ ";\n" ++ ccCodeString - - - - --- | pairListToStringList takes alist of (String, Int) and a starting index and returns scope of charcter for leading comment -pairListToStringList :: [(String, Int)] -> Int -> [String] -pairListToStringList pairList startIndex = - if null pairList then [] - else - let (a, b) = head pairList - in - ("\t\t" ++ (show startIndex) ++ "-" ++ (show $ b + startIndex - 1) ++ " : " ++ a ++ "\n") : pairListToStringList (tail pairList) (startIndex + b) - --- | mergeDataBlocks takes a list of Phylogenetic Graphs (Trees) and merges the data blocks (each graph should have only 1) --- and merges the charInfo Vectors returning data and charInfo -mergeDataBlocks :: [PhylogeneticGraph] -> [[(V.Vector CharacterData)]] -> [V.Vector CharInfo] -> (V.Vector (V.Vector (V.Vector CharacterData)), V.Vector (V.Vector CharInfo)) -mergeDataBlocks inGraphList curDataList curInfoList = - if null inGraphList then (V.fromList $ fmap V.fromList $ fmap reverse curDataList, V.fromList $ reverse curInfoList) - else - let firstGraph = head inGraphList - firstTree = thd6 firstGraph - firstCharInfo = V.head $ six6 firstGraph - leafList = snd4 $ LG.splitVertexList firstTree - - -- since each graph has a single block--take head to get vector of characters - leafCharacterList = V.toList $ fmap V.head $ fmap (vertData . snd) (V.fromList leafList) - - -- zip data for each taxon - newDataList = if null curDataList then fmap (:[]) $ leafCharacterList - else zipWith (:) leafCharacterList curDataList - in - mergeDataBlocks (tail inGraphList) newDataList (firstCharInfo : curInfoList) - --- | getTaxonCharString returns the total character string for a taxon --- length and zipping for missing data -getTaxonCharString :: V.Vector (V.Vector CharInfo) -> VertexBlockData -> String -getTaxonCharString charInfoVV charDataVV = - let lengthBlock = maximum $ V.zipWith U.getCharacterLength (V.head charDataVV) (V.head charInfoVV) - in - concat $ V.zipWith (getBlockString lengthBlock) charInfoVV charDataVV - --- | getBlockString returns the String for a character block --- returns all '?' if missing -getBlockString :: Int -> V.Vector CharInfo -> V.Vector CharacterData -> String -getBlockString lengthBlock charInfoV charDataV = - -- this to deal with missing characters - -- trace ("GBS: " ++ (show $ V.length charDataV)) ( - if V.null charDataV then L.replicate lengthBlock '?' - else concat $ V.zipWith getCharacterString charDataV charInfoV - -- ) - - - --- | mergeCharInfoCharLength merges cc code char info and char lengths for scope -mergeCharInfoCharLength :: [(String, String, String)] -> [Int] -> Int -> String -mergeCharInfoCharLength codeList lengthList charIndex = - if null codeList then [] - else - let (ccCodeString, costsString, weightString) = head codeList - charLength = head lengthList - startScope = show charIndex - endScope = show (charIndex + charLength - 1) - scope = startScope ++ "." ++ endScope - weightString' = if null weightString then [] - else "cc /" ++ weightString ++ scope ++ ";\n" - costsString' = if null costsString then [] - else "costs " ++ scope ++ " = " ++ costsString ++ ";\n" - ccCodeString' = "cc " ++ ccCodeString ++ " " ++ scope ++ ";\n" - in - (ccCodeString' ++ weightString' ++ costsString') ++ (mergeCharInfoCharLength (tail codeList) (tail lengthList) (charIndex + charLength)) - - - --- | getCharacterInfo takes charInfo vect vect and reiurns triples of ccCode, costs, and weight values --- for each character -getCharacterInfo :: V.Vector (V.Vector CharInfo) -> [(String, String, String)] -getCharacterInfo inCharInfoVV = - concat $ V.toList $ fmap getBlockInfo inCharInfoVV - --- | getBlockInfo gets character code info for a block -getBlockInfo :: V.Vector CharInfo -> [(String, String, String)] -getBlockInfo inCharInfoV = V.toList $ fmap getCharCodeInfo inCharInfoV - --- | getCharCodeInfo extracts 3-tuple of cc code, costs and weight as strings --- from charInfo -getCharCodeInfo :: CharInfo -> (String, String, String) -getCharCodeInfo inCharInfo = - let inCharType = charType inCharInfo - charWeightString = if weight inCharInfo == 1 then "" - else show $ weight inCharInfo - inAlph = alphabet inCharInfo - inMatrix = costMatrix inCharInfo - (costMatrixType, _) = IR.getRecodingType inMatrix - matrixString = if costMatrixType == "nonAdd" then "" - else makeMatrixString inAlph inMatrix - in - let codeTriple = case inCharType of - x | x `elem` [Add ] -> ("+", "", charWeightString) - x | x `elem` [NonAdd ] -> ("-", "", charWeightString) - x | x `elem` packedNonAddTypes -> ("-", "", charWeightString) - x | x `elem` [Matrix ] -> ("(", matrixString, charWeightString) - x | x `elem` sequenceCharacterTypes -> if costMatrixType == "nonAdd" then ("-", "", charWeightString) - else ("(", matrixString, charWeightString) - _ -> error ("Un-implemented data type " ++ show inCharType) - in - codeTriple - --- | makeMatrixString takes alphabet and input cost matrix and creates TNT --- matrix cost line --- could be lesser but might not be symmetrical -makeMatrixString :: Alphabet ST.ShortText -> S.Matrix Int -> String -makeMatrixString inAlphabet inMatrix = - let elementList = fmap ST.toString $ toList inAlphabet - - -- get element pairs (Strings) - elementPairList = filter notDiag $ getListPairs elementList - - -- index pairs for accessing matrix - elementIndexPairList = filter notDiag $ getListPairs [0 .. (length elementList - 1)] - elementPairCosts = fmap (inMatrix S.!) elementIndexPairList - - -- make strings of form state_i / state_j cost ... - costString = makeCostString elementPairList elementPairCosts - - in - costString - where notDiag (a,b) = if a < b then True else False - --- | makeCostString takes list of state pairs and list of costs and creates tnt cost string -makeCostString :: [(String, String)] -> [Int] -> String -makeCostString namePairList costList = - if null namePairList then [] - else - let (a,b) = head namePairList - c = head costList - in - (a ++ "/" ++ b ++ " " ++ (show c) ++ " ") ++ (makeCostString (tail namePairList) (tail costList)) - --- | getBlockLength returns a list of the lengths of all characters in a blocks -getBlockLength :: V.Vector CharacterData -> V.Vector CharInfo -> [Int] -getBlockLength inCharDataV inCharInfoV = - -- trace ("GBL:" ++ (show $ V.zipWith U.getCharacterLength inCharDataV inCharInfoV)) - V.toList $ V.zipWith U.getCharacterLength inCharDataV inCharInfoV - --- | getBlockNames returns a list of the lengths of all characters in a blocks -getBlockNames :: V.Vector CharInfo -> [String] -getBlockNames inCharInfoV = - -- trace ("GBL:" ++ (show $ V.zipWith U.getCharacterLength inCharDataV inCharInfoV)) - V.toList $ fmap T.unpack $ fmap name inCharInfoV - - --- | getCharacterString returns a string of character states --- need to add splace between (large alphabets etc) --- local alphabet for charactes where that is input. MAytrix and additive are integers -getCharacterString :: CharacterData -> CharInfo -> String -getCharacterString inCharData inCharInfo = - let inCharType = charType inCharInfo - localAlphabet = if inCharType /= NonAdd then fmap ST.toString $ alphabet inCharInfo - else fmap ST.toString discreteAlphabet - in - let charString = case inCharType of - x | x `elem` [NonAdd ] -> foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ stateBVPrelim inCharData - x | x `elem` packedNonAddTypes -> UV.foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ packedNonAddPrelim inCharData - x | x `elem` [Add ] -> foldMap U.additivStateToString $ snd3 $ rangePrelim inCharData - x | x `elem` [Matrix ] -> foldMap U.matrixStateToString $ matrixStatesPrelim inCharData - x | x `elem` [SlimSeq, NucSeq ] -> SV.foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ slimAlignment inCharData - x | x `elem` [WideSeq, AminoSeq] -> UV.foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ wideAlignment inCharData - x | x `elem` [HugeSeq] -> foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ hugeAlignment inCharData - x | x `elem` [AlignedSlim] -> SV.foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ alignedSlimPrelim inCharData - x | x `elem` [AlignedWide] -> UV.foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ alignedWidePrelim inCharData - x | x `elem` [AlignedHuge] -> foldMap (bitVectToCharStringTNT localAlphabet) $ snd3 $ alignedHugePrelim inCharData - _ -> error ("Un-implemented data type " ++ show inCharType) - in - charString - --- | bitVectToCharStringTNT wraps '[]' around ambiguous states and removes commas between states -bitVectToCharStringTNT :: Bits b => Alphabet String -> b -> String -bitVectToCharStringTNT localAlphabet bitValue = - let stateString = U.bitVectToCharState localAlphabet bitValue - in - if length stateString > 1 then "[" ++ (filter (/=',') stateString) ++ "]" - else stateString - --- | Implied Alignment report functions - --- | getImpliedAlignmentString returns as a single String the implied alignments of all sequence characters --- softwired use display trees, hardWired transform to softwired then proceed with display trees -getImpliedAlignmentString :: GlobalSettings -> Bool -> Bool -> ProcessedData -> PhylogeneticGraph -> Int -> String -getImpliedAlignmentString inGS includeMissing concatSeqs inData inGraph graphNumber = - if LG.isEmpty (fst6 inGraph) then error "No graphs for create IAs for in getImpliedAlignmentStrings" - else - let headerString = "Implied Alignments for Graph " ++ (show graphNumber) ++ "\n" - in - if graphType inGS == Tree then - if not concatSeqs then headerString ++ (getTreeIAString includeMissing inGraph) - else headerString ++ (U.concatFastas $ getTreeIAString includeMissing inGraph) - - -- for softwired networks--use display trees - else if graphType inGS == SoftWired then - -- get display trees for each data block-- takes first of potentially multiple - let blockDisplayList = fmap GO.convertDecoratedToSimpleGraph $ fmap head $ fth6 inGraph - - -- create seprate processed data for each block - blockProcessedDataList = fmap (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) - - -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to creeate IAs - decoratedBlockTreeList = V.zipWith (TRAV.multiTraverseFullyLabelGraph' (inGS {graphType = Tree}) False False Nothing) blockProcessedDataList blockDisplayList - - -- extract IA strings as if mutiple graphs - diplayIAStringList = fmap (getTreeIAString includeMissing) $ V.toList decoratedBlockTreeList - - in - if not concatSeqs then headerString ++ (concat diplayIAStringList) - else headerString ++ (U.concatFastas $ concat diplayIAStringList) - - -- There is no IA for Hardwired at least as of yet - -- so convert to softwired and use display trees - else if graphType inGS == HardWired then - let newGS = inGS {graphType = SoftWired} - - pruneEdges = False - warnPruneEdges = False - startVertex = Nothing - - newGraph = TRAV.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex (fst6 newGraph) - - blockDisplayList = fmap GO.convertDecoratedToSimpleGraph $ fmap head $ fth6 newGraph - - -- create seprate processed data for each block - blockProcessedDataList = fmap (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) - - -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to creeate IAs - decoratedBlockTreeList = V.zipWith (TRAV.multiTraverseFullyLabelGraph' (inGS {graphType = Tree}) False False Nothing) blockProcessedDataList blockDisplayList - - -- extract IA strings as if mutiple graphs - diplayIAStringList = fmap (getTreeIAString includeMissing) $ V.toList decoratedBlockTreeList - in - trace ("There is no implied alignment for hard-wired graphs--at least not yet. Transfroming to softwired and generate an implied alignment that way") ( - if not concatSeqs then headerString ++ (concat diplayIAStringList) - else headerString ++ (U.concatFastas $ concat diplayIAStringList) - ) - - else - trace ("IA not yet implemented for graphtype " ++ show (graphType inGS)) - ("There is no implied alignment for " ++ (show (graphType inGS))) - --- | getTreeIAString takes a Tree Decorated Graph and returns Implied AlignmentString -getTreeIAString :: Bool -> PhylogeneticGraph -> String -getTreeIAString includeMissing inGraph = - let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) - leafNameList = fmap (vertName . snd) leafList - leafDataList = V.fromList $ fmap (vertData . snd) leafList - charInfoVV = six6 inGraph - characterStringList = makeFullIAStrings includeMissing charInfoVV leafNameList leafDataList - in - concat characterStringList - --- | makeBlockData cretes new single block processed data -makeBlockData :: V.Vector NameText-> V.Vector NameBV -> BlockData -> ProcessedData -makeBlockData a b c = (a, b, V.singleton c) - --- | makeFullIAStrings goes block by block, creating fasta strings for each -makeFullIAStrings :: Bool -> V.Vector (V.Vector CharInfo) -> [NameText] -> V.Vector VertexBlockData -> [String] -makeFullIAStrings includeMissing charInfoVV leafNameList leafDataList = - let numBlocks = V.length charInfoVV - in - concat $ fmap (makeBlockIAStrings includeMissing leafNameList leafDataList charInfoVV) (V.fromList [0.. numBlocks - 1]) - --- | makeBlockIAStrings extracts data for a block (via index) and calls function to make iaStrings for each character -makeBlockIAStrings :: Bool -> [NameText] -> V.Vector (V.Vector (V.Vector CharacterData)) -> V.Vector (V.Vector CharInfo) -> Int -> [String] -makeBlockIAStrings includeMissing leafNameList leafDataList charInfoVV blockIndex = - let thisBlockCharInfo = charInfoVV V.! blockIndex - numChars = V.length thisBlockCharInfo - thisBlockCharData = fmap (V.! blockIndex) leafDataList - blockCharacterStringList = V.zipWith (makeBlockCharacterString includeMissing leafNameList thisBlockCharData) thisBlockCharInfo (V.fromList [0 .. (numChars - 1)]) - in - V.toList blockCharacterStringList - --- | isAllGaps checks wether a sequence is all gap charcaters '-' -isAllGaps :: String -> Bool -isAllGaps inSeq = - if null inSeq then True - else - if length (filter (`notElem` ['-', '\n']) inSeq) == 0 then True - else False - --- | makeBlockCharacterString creates implied alignmennt string for sequnce charactes and null if not -makeBlockCharacterString :: Bool -> [NameText] -> V.Vector (V.Vector CharacterData) -> CharInfo -> Int -> String -makeBlockCharacterString includeMissing leafNameList leafDataVV thisCharInfo charIndex = - -- check if sequence type character - let thisCharType = charType thisCharInfo - thisCharName = name thisCharInfo - in - if thisCharType `notElem` sequenceCharacterTypes then [] - else - let -- thisCharData = fmap (V.! charIndex) leafDataVV - thisCharData = getTaxDataOrMissing leafDataVV charIndex 0 [] - nameDataPairList = zip leafNameList thisCharData - fastaString = pairList2Fasta includeMissing thisCharInfo nameDataPairList - in - -- trace ("MBCS: " ++ (show $ length leafNameList) ++ " " ++ (show $ V.length thisCharData) ++ "\n" ++ (show leafDataVV)) - "\nSequence character " ++ (T.unpack thisCharName) ++ "\n" ++ fastaString ++ "\n" - -{- --- | getCharacterDataOrMissing takes a vector of vector of charcter data and returns list --- of taxa for a given sequnce character. If there are no data for that character for a taxon -getCharacterDataOrMissing :: V.Vector (V.Vector CharacterData) -> Int -> [[CharacterData]] -> [[CharacterData]] -getCharacterDataOrMissing leafDataVV charIndex newCharList = - if charIndex == V.length leafDataVV then reverse newCharList - else - let firstCharData = getTaxDataOrMissing leafDataVV charIndex 0 [] - in - getCharacterDataOrMissing leafDataVV (charIndex + 1) (firstCharData : newCharList) --} - --- | getTaxDataOrMissing gets teh index character if data not null, empty character if not -getTaxDataOrMissing :: V.Vector (V.Vector CharacterData) -> Int -> Int -> [CharacterData] -> [CharacterData] -getTaxDataOrMissing charDataV charIndex taxonIndex newTaxList = - if taxonIndex == V.length charDataV then reverse newTaxList - else if V.null (charDataV V.! taxonIndex) then getTaxDataOrMissing charDataV charIndex (taxonIndex + 1) (emptyCharacter : newTaxList) - else getTaxDataOrMissing charDataV charIndex (taxonIndex + 1) (((charDataV V.! taxonIndex) V.! charIndex) : newTaxList) - --- | pairList2Fasta takes a character type and list of pairs of taxon names (as T.Text) --- and character data and returns fasta formated string -pairList2Fasta :: Bool -> CharInfo -> [(NameText, CharacterData)] -> String -pairList2Fasta includeMissing inCharInfo nameDataPairList = - if null nameDataPairList then [] - else - let (firstName, blockDatum) = head nameDataPairList - inCharType = charType inCharInfo - localAlphabet = fmap ST.toString $ alphabet inCharInfo - sequenceString = case inCharType of - x | x `elem` [SlimSeq, NucSeq ] -> SV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ slimAlignment blockDatum - x | x `elem` [WideSeq, AminoSeq] -> UV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ wideAlignment blockDatum - x | x `elem` [HugeSeq] -> foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ hugeAlignment blockDatum - x | x `elem` [AlignedSlim] -> SV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedSlimPrelim blockDatum - x | x `elem` [AlignedWide] -> UV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedWidePrelim blockDatum - x | x `elem` [AlignedHuge] -> foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ alignedHugePrelim blockDatum - _ -> error ("Un-implemented data type " ++ show inCharType) - - sequenceChunks = fmap (++ "\n") $ SL.chunksOf 50 sequenceString - - in - if (not includeMissing) && (isAllGaps sequenceString) then pairList2Fasta includeMissing inCharInfo (tail nameDataPairList) - else if blockDatum == emptyCharacter then pairList2Fasta includeMissing inCharInfo (tail nameDataPairList) - else (concat $ (('>' : (T.unpack firstName)) ++ "\n") : sequenceChunks) ++ (pairList2Fasta includeMissing inCharInfo (tail nameDataPairList)) - - diff --git a/pkg/PhyGraph/Commands/ProcessCommands.hs b/pkg/PhyGraph/Commands/ProcessCommands.hs deleted file mode 100644 index 09c94cf8f..000000000 --- a/pkg/PhyGraph/Commands/ProcessCommands.hs +++ /dev/null @@ -1,301 +0,0 @@ -{- | -Module : ProcessCommands.hs -Description : Module tpo process command -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - -{- To add commands: - 1) add new command in Types.hs - 2) add string name of command to allowedCommandList - 3) add instruction processing to getInstruction - 4) add argument processing to parseCommandArg - 5) add argument processing function - with meaningful errors - 6) Add amchinery of command in general code --} - - -module Commands.ProcessCommands where - -import Data.Char -import Data.Foldable -import qualified Data.List as L -import Data.List.Split -import Data.Maybe -import GeneralUtilities -import qualified Input.ReadInputFiles as RIF -import Types.Types -import qualified Commands.Verify as V ---import Debug.Trace - --- | expandRunCommands takes raw coomands and if a "run" command is found it reads that file --- and adds those commands in place --- ensures one command per line -expandRunCommands :: [String] -> [String] -> IO [String] -expandRunCommands curLines inLines = - --trace ("EXP " ++ (show curLines) ++ show inLines) ( - if null inLines then return $ reverse curLines - else - let firstLineRead = removeComments [filter (/= ' ') $ head inLines] - (firstLine, restLine) = if null firstLineRead then ([],[]) - else splitCommandLine $ head firstLineRead - - leftParens = length $ filter (=='(') firstLine - rightParens = length $ filter (==')') firstLine - in - --trace ("FL " ++ firstLine) ( - -- only deal with run lines - if (leftParens /= rightParens) then errorWithoutStackTrace ("Command line with unbalances parens '()': " ++ firstLine) - else if null firstLine then expandRunCommands curLines (tail inLines) - else if take 3 (fmap toLower firstLine) /= "run" then expandRunCommands (firstLine : curLines) (restLine : tail inLines) - else do -- is a "run command" - let (_, runFileList) = head $ parseCommand firstLine - let runFileNames = fmap (checkFileNames . snd) runFileList - fileListContents <- mapM readFile runFileNames - let newLines = concatMap lines fileListContents - expandRunCommands (newLines ++ curLines) (restLine : tail inLines) - --) - --) - - --- | splitCommandLine takes a line with potentially multiple commands and splits --- between the first command and all others. -splitCommandLine :: String -> (String, String) -splitCommandLine inLine = - if null inLine then ([],[]) - else - let leftParens = length $ filter (=='(') inLine - rightParens = length $ filter (==')') inLine - firstPart = takeWhile (/= '(') inLine - parenPart = getBalancedParenPart "" (dropWhile (/= '(') inLine) 0 0 - firstCommand = firstPart ++ parenPart - restPart = drop (length firstCommand) inLine - in - if (leftParens /= rightParens) then errorWithoutStackTrace ("Command line with unbalances parens '()': " ++ inLine) - else (firstCommand, restPart) - --- | checkFileNames checks if first and last element of String are double quotes and removes them -checkFileNames :: String -> String -checkFileNames inName - | null inName = errorWithoutStackTrace "Error: Null file name" - | head inName /= '"' = errorWithoutStackTrace ("Error: File name must be in double quotes (b): " ++ inName) - | last inName /= '"' = errorWithoutStackTrace ("Error: File name must be in double quotes (e): " ++ inName) - | otherwise = init $ tail inName - - --- | getCommandList takes a String from a file and returns a list of commands and their arguments --- these are syntactically verified, but any input files are not checked --- commands in lines one to a line -getCommandList :: [String] -> [Command] -getCommandList rawContents = - if null rawContents then errorWithoutStackTrace "Error: Empty command file" - else - let rawList = removeComments $ fmap (filter (/= ' ')) rawContents - -- expand for read wildcards here--cretge a new, potentially longer list - processedCommands = concatMap parseCommand rawList - in - --trace (show rawList) - processedCommands - - --- | removeComments deletes anyhting on line (including line) --- after double dash "--" -removeComments :: [String] -> [String] -removeComments inLineList = - if null inLineList then [] - else - let firstLine = head inLineList - firstTwo = take 2 firstLine - in - -- Comment line - if (firstTwo == "--") || null firstLine then removeComments $ tail inLineList else ( - -- Remove commments from line to end - let nonComment = filter isPrint $ head $ splitOn "--" firstLine - in - nonComment : removeComments (tail inLineList)) - - - --- | getInstruction returns the command type from an input String --- all operations on lower case -getInstruction :: String -> [String] -> Instruction -getInstruction inString possibleCommands - | null inString = error "Empty command String" - | fmap toLower inString == "build" = Build - | fmap toLower inString == "fuse" = Fuse - | fmap toLower inString == "read" = Read - | fmap toLower inString == "reblock" = Reblock - | fmap toLower inString == "refine" = Refine - | fmap toLower inString == "rename" = Rename - | fmap toLower inString == "report" = Report - | fmap toLower inString == "run" = Run - | fmap toLower inString == "search" = Search - | fmap toLower inString == "select" = Select - | fmap toLower inString == "set" = Set - | fmap toLower inString == "support" = Support - | fmap toLower inString == "swap" = Swap - | fmap toLower inString == "transform" = Transform - | otherwise = - let errorMatch = snd $ getBestMatch (maxBound :: Int ,"no suggestion") possibleCommands inString - in errorWithoutStackTrace $ fold - ["\nError: Unrecognized command. By \'", inString, "\' did you mean \'", errorMatch, "\'?\n"] - - --- | parseCommand takes a command file line and processes the String into a command and its arguemnts --- assumes single command per line -parseCommand :: String -> [Command] -parseCommand inLine = - if null inLine then [] - else - let (firstString, restString) = getSubCommand inLine False - instructionString = takeWhile (/= '(') firstString --inLine - -- this doesn not allow recursive multi-option arguments - -- NEED TO FIX - -- make in to a more sophisticated split outside of parens - argList = argumentSplitter inLine $ init $ tail $ dropWhile (/= '(') $ filter (/= ' ') firstString - localInstruction = getInstruction instructionString V.allowedCommandList - processedArg = parseCommandArg firstString localInstruction argList - in - (localInstruction, processedArg) : parseCommand restString - - --- | getSubCommand takes a string ans extracts the first occurrence of the --- structure bleh(...), and splits the string on that, th esub command can contain --- parens and commas -getSubCommand :: String -> Bool -> (String, String) -getSubCommand inString hasComma = - if null inString then ([],[]) - else - let firstPart = takeWhile (/= '(') inString - secondPart = dropWhile (/= '(') inString - parenPart = getBalancedParenPart "" secondPart 0 0 - incrCounter = if hasComma then 1 else 0 - remainderPart = drop (length (firstPart ++ parenPart) + incrCounter) inString -- to remove ',' - in - (firstPart ++ parenPart, remainderPart) - - --- | getBalancedParenPart stakes a string starting with '(' and takes all --- characters until (and including) the balancing ')' --- call with getBalancedParenPart "" inString 0 0 -getBalancedParenPart :: String -> String -> Int -> Int -> String -getBalancedParenPart curString inString countLeft countRight = - if null inString then reverse curString - else - let firstChar = head inString - in - if firstChar == '(' then getBalancedParenPart (firstChar : curString) (tail inString) (countLeft + 1) countRight - else if firstChar == ')' then - if countLeft == countRight + 1 then reverse (firstChar : curString) - else getBalancedParenPart (firstChar : curString) (tail inString) countLeft (countRight + 1) - else getBalancedParenPart (firstChar : curString) (tail inString) countLeft countRight - - --- | argumentSplitter takes argument string and returns individual strings of arguments --- which can include null, single, multiple or sub-command arguments --- these are each pairs of an option string (could be null) and a subarguments String (also could be null) -argumentSplitter :: String -> String -> [(String, String)] -argumentSplitter commandLineString inString - | null inString = [] - | (freeOfSimpleErrors inString) == False = errorWithoutStackTrace ("Error in command specification format") - | otherwise = - let commaIndex = if (L.elemIndex ',' inString) == Nothing then (maxBound :: Int) else fromJust (L.elemIndex ',' inString) - semiIndex = if (L.elemIndex ':' inString) == Nothing then (maxBound :: Int) else fromJust (L.elemIndex ':' inString) - leftParenIndex = if (L.elemIndex '(' inString) == Nothing then (maxBound :: Int) else fromJust (L.elemIndex '(' inString) - firstDivider = minimum [commaIndex, semiIndex, leftParenIndex] - in - -- simple no argument arg - if firstDivider == (maxBound :: Int) then - if head inString == '"' then [([], inString)] - else [(inString, [])] - else if commaIndex == firstDivider then - -- no arg - if null (take firstDivider inString) then errorWithoutStackTrace ("Error in command '" ++ commandLineString ++ "' perhaps due to extraneous commas (',')") - else if head (take firstDivider inString) == '"' then ([], (take firstDivider inString)) : argumentSplitter commandLineString (drop (firstDivider + 1) inString) - else ((take firstDivider inString), []) : argumentSplitter commandLineString (drop (firstDivider + 1) inString) - else if semiIndex == firstDivider then - -- has arg after ':' - if inString !! (semiIndex + 1) == '(' then - (take firstDivider inString,(takeWhile (/= ')') (drop (firstDivider + 1) inString) ++ ")")) : argumentSplitter commandLineString (drop 2 $ dropWhile (/= ')') inString) - else - let nextStuff = dropWhile (/= ',') inString - remainder = if null nextStuff then [] else tail nextStuff - in - (take firstDivider inString, takeWhile (/= ',') (drop (firstDivider + 1) inString)) : argumentSplitter commandLineString remainder - else -- arg is sub-commnd - let (subCommand, remainderString) = getSubCommand inString True - in - (subCommand, []) : argumentSplitter commandLineString remainderString - - --- | freeOfSimpleErrors take command string and checks for simple for atting errors --- lack of separators ',' between args --- add as new errors are found -freeOfSimpleErrors :: String -> Bool -freeOfSimpleErrors commandString - | null commandString = errorWithoutStackTrace ("\n\nError in command string--empty") - | isSequentialSubsequence ['"','"'] commandString = errorWithoutStackTrace ("\n\nCommand format error: " ++ commandString ++ "\n\tPossibly missing comma ',' between arguments or extra double quotes'\"'.") - | isSequentialSubsequence [',',')'] commandString = errorWithoutStackTrace ("\n\nCommand format error: " ++ commandString ++ "\n\tPossibly terminal comma ',' after or within arguments.") - | isSequentialSubsequence [',','('] commandString = errorWithoutStackTrace ("\n\nCommand format error: " ++ commandString ++ "\n\tPossibly comma ',' before '('.") - | isSequentialSubsequence ['(',','] commandString = errorWithoutStackTrace ("\n\nCommand format error: " ++ commandString ++ "\n\tPossibly starting comma ',' before arguments.") - | otherwise = - let beforeDoubleQuotes = dropWhile (/= '"') commandString - in - if null beforeDoubleQuotes then True - -- likely more conditions to develop - else True - - --- | parseCommandArg takes an Instruction and arg list of Strings and returns list --- of parsed srguments for that instruction -parseCommandArg :: String -> Instruction -> [(String, String)] -> [Argument] -parseCommandArg fullCommand localInstruction argList - | localInstruction == Read = if not $ null argList then RIF.getReadArgs fullCommand argList - else errorWithoutStackTrace ("\n\n'Read' command error '" ++ fullCommand ++ "': Need to specify at least one filename in double quotes") - | otherwise = argList - - --- | movePrealignedTCM move prealigned and tcm commands to front of argument list -movePrealignedTCM :: [Argument] -> [Argument] -movePrealignedTCM inArgList = - if null inArgList then [] - else - let firstPart = filter ((== "prealigned").fst) inArgList - secondPart = filter ((== "tcm").fst) inArgList - restPart = filter ((/= "tcm").fst) $ filter ((/= "prealigned").fst) inArgList - in - if length secondPart > 1 then errorWithoutStackTrace ("\n\n'Read' command error '" ++ show inArgList ++ "': can only specify a single tcm file") - else firstPart ++ secondPart ++ restPart - - diff --git a/pkg/PhyGraph/Commands/Transform.hs b/pkg/PhyGraph/Commands/Transform.hs deleted file mode 100644 index 611f21e70..000000000 --- a/pkg/PhyGraph/Commands/Transform.hs +++ /dev/null @@ -1,467 +0,0 @@ -{- | -Module : Transform.hs -Description : Module to coordinate transform command execution -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Commands.Transform - ( transform - ) where - -import Types.Types -import Data.Alphabet -import qualified GraphOptimization.Traversals as T -import qualified Utilities.Utilities as U -import qualified ParallelUtilities as PU -import Control.Parallel.Strategies -import Data.Maybe -import Text.Read -import Data.Char -import qualified Graphs.GraphOperations as GO -import GeneralUtilities -import qualified Data.List as L -import qualified Utilities.LocalGraph as LG -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Debug.Trace -import Foreign.C.Types (CUInt) -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.Vector.Generic as GV -import Data.Word -import Data.Bits -import qualified Input.Reorganize as R -import qualified Input.DataTransformation as TRANS -import qualified Input.BitPack as BP -import qualified Commands.Verify as VER -import qualified Data.Text.Lazy as TL -import qualified Data.Char as C - - - - --- | transform changes aspects of data sande settings during execution --- as opposed to Set with all happens at begginign of program execution -transform :: [Argument] -> GlobalSettings -> ProcessedData -> ProcessedData -> Int -> [PhylogeneticGraph] -> (GlobalSettings, ProcessedData, ProcessedData, [PhylogeneticGraph]) -transform inArgs inGS origData inData rSeed inGraphList = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "transform" fstArgList VER.transformArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'transform': " ++ show inArgs) - else - let displayBlock = filter ((=="displaytrees").fst) lcArgList - numDisplayTrees - | length displayBlock > 1 = - errorWithoutStackTrace ("Multiple displayTree number specifications in tansform--can have only one: " ++ show inArgs) - | null displayBlock = Just 10 - | null (snd $ head displayBlock) = Just 10 - | otherwise = readMaybe (snd $ head displayBlock) :: Maybe Int - - toTree = any ((=="totree").fst) lcArgList - toSoftWired = any ((=="tosoftwired").fst) lcArgList - toHardWired = any ((=="tohardwired").fst) lcArgList - toStaticApprox = any ((=="staticapprox").fst) lcArgList - toDynamic = any ((=="dynamic").fst) lcArgList - atRandom = any ((=="atrandom").fst) lcArgList - chooseFirst = any ((=="first").fst) lcArgList - reWeight = any ((=="weight").fst) lcArgList - changeEpsilon = any ((=="dynamicepsilon").fst) lcArgList - reRoot = any ((=="outgroup").fst) lcArgList - - reweightBlock = filter ((=="weight").fst) lcArgList - weightValue - | length reweightBlock > 1 = - errorWithoutStackTrace ("Multiple weight specifications in tansform--can have only one: " ++ show inArgs) - | null reweightBlock = Just 1.0 - | null (snd $ head reweightBlock) = Just 1 - | otherwise = readMaybe (snd $ head reweightBlock) :: Maybe Double - - - changeEpsilonBlock = filter ((=="dynamicepsilon").fst) lcArgList - epsilonValue - | length changeEpsilonBlock > 1 = - errorWithoutStackTrace ("Multiple dynamicEpsilon specifications in tansform--can have only one: " ++ show inArgs) - | null changeEpsilonBlock = Just $ dynamicEpsilon inGS - | null (snd $ head changeEpsilonBlock) = Just $ dynamicEpsilon inGS - | otherwise = readMaybe (snd $ head changeEpsilonBlock) :: Maybe Double - - reRootBlock = filter ((=="outgroup").fst) lcArgList - outgroupValue - | length reRootBlock > 1 = - errorWithoutStackTrace ("Multiple outgroup specifications in tansform--can have only one: " ++ show inArgs) - | null reRootBlock = Just $ outGroupName inGS - | null (snd $ head reRootBlock) = Just $ outGroupName inGS - | otherwise = readMaybe (snd $ head reRootBlock) :: Maybe TL.Text - - - nameList = fmap TL.pack $ fmap (filter (/= '"')) $ fmap snd $ filter ((=="name").fst) lcArgList - charTypeList = fmap snd $ filter ((=="type").fst) lcArgList - - in - if (length $ filter (== True) [toTree, toSoftWired, toHardWired]) > 1 then - errorWithoutStackTrace ("Multiple graph transform commands--can only have one : " ++ (show inArgs)) - else if toStaticApprox && toDynamic then - errorWithoutStackTrace ("Multiple staticApprox/Dynamic transform commands--can only have one : " ++ (show inArgs)) - else if atRandom && chooseFirst then - errorWithoutStackTrace ("Multiple display tree choice commands in transform (first, atRandom)--can only have one : " ++ (show inArgs)) - else if (toTree || toSoftWired || toHardWired) && (toDynamic || toDynamic) then - errorWithoutStackTrace ("Multiple transform operations in transform (e.g. toTree, staticApprox)--can only have one at a time: " ++ (show inArgs)) - else - let pruneEdges = False - warnPruneEdges = False - startVertex = Nothing - in - -- transform nets to tree - if toTree then - -- already Tree return - if (graphType inGS == Tree) then (inGS, origData, inData, inGraphList) - else - let newGS = inGS {graphType = Tree} - - -- generate and return display trees-- displayTreNUm / graph - displayGraphList = if chooseFirst then fmap (take (fromJust numDisplayTrees) . LG.generateDisplayTrees) (fmap fst6 inGraphList) - else fmap (LG.generateDisplayTreesRandom rSeed (fromJust numDisplayTrees)) (fmap fst6 inGraphList) - - -- prob not required - displayGraphs = fmap GO.ladderizeGraph $ fmap GO.renameSimpleGraphNodes (concat displayGraphList) - - -- reoptimize as Trees - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex) displayGraphs -- `using` PU.myParListChunkRDS - in - (newGS, origData, inData, newPhylogeneticGraphList) - - -- transform to softwired - else if toSoftWired then - if (graphType inGS == SoftWired) then (inGS, origData, inData, inGraphList) - else - let newGS = inGS {graphType = SoftWired} - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex) (fmap fst6 inGraphList) -- `using` PU.myParListChunkRDS - in - (newGS, origData, inData, newPhylogeneticGraphList) - - -- transform to hardwired - else if toHardWired then - if (graphType inGS == HardWired) then (inGS, origData, inData, inGraphList) - else - let newGS = inGS {graphType = HardWired} - - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex) (fmap fst6 inGraphList) -- `using` PU.myParListChunkRDS - in - (newGS, origData, inData, newPhylogeneticGraphList) - - -- roll back to dynamic data from static approx - else if toDynamic then - let newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS origData pruneEdges warnPruneEdges startVertex) (fmap fst6 inGraphList) -- `using` PU.myParListChunkRDS - in - trace ("Transforming data to dynamic: " ++ (show $ minimum $ fmap snd6 inGraphList) ++ " -> " ++ (show $ minimum $ fmap snd6 newPhylogeneticGraphList)) - (inGS, origData, origData, newPhylogeneticGraphList) - - -- transform to static approx--using first Tree - else if toStaticApprox then - let newData = makeStaticApprox inGS inData (head $ L.sortOn snd6 inGraphList) - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS newData pruneEdges warnPruneEdges startVertex) (fmap fst6 inGraphList) -- `using` PU.myParListChunkRDS - - in - trace ("Transforming data to staticApprox: " ++ (show $ minimum $ fmap snd6 inGraphList) ++ " -> " ++ (show $ minimum $ fmap snd6 newPhylogeneticGraphList)) - (inGS, origData, newData, newPhylogeneticGraphList) - - -- change weight values in charInfo and reoptimize - -- reweights both origData and inData so weighting doens't get undone by static approc to and from transfomrations - else if reWeight then - let newOrigData = reWeightData (fromJust weightValue) charTypeList nameList origData - newData = reWeightData (fromJust weightValue) charTypeList nameList inData - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS newData pruneEdges warnPruneEdges startVertex) (fmap fst6 inGraphList) -- `using` PU.myParListChunkRDS - in - if isNothing weightValue then errorWithoutStackTrace ("Reweight value is not specified correcty. Must be a double (e.g. 1.2): " ++ (show (snd $ head reweightBlock))) - else - trace ("Reweighting types " ++ (show charTypeList) ++ " and/or characters " ++ (L.intercalate ", " $ fmap TL.unpack nameList) ++ " to " ++ (show $ fromJust weightValue) - ++ "\n\tReoptimizing graphs") - (inGS, newOrigData, newData, newPhylogeneticGraphList) - - -- changes dynamicEpsilon error check factor - else if changeEpsilon then - if isNothing epsilonValue then errorWithoutStackTrace ("DynamicEpsilon value is not specified correcty. Must be a double (e.g. 0.02): " ++ (show (snd $ head changeEpsilonBlock))) - else - trace ("Changing dynamicEpsilon factor to " ++ (show $ fromJust epsilonValue)) - (inGS {dynamicEpsilon = fromJust epsilonValue}, origData, inData, inGraphList) - - else if reRoot then - if isNothing outgroupValue then errorWithoutStackTrace ("Outgroup is not specified correctly. Must be a string (e.g. \"Name\"): " ++ (snd $ head reRootBlock)) - else - let newOutgroupName = TL.filter (/= '"') $ fromJust outgroupValue - newOutgroupIndex = V.elemIndex newOutgroupName (fst3 origData) - newPhylogeneticGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS origData pruneEdges warnPruneEdges startVertex) (fmap (LG.rerootTree (fromJust newOutgroupIndex)) $ fmap fst6 inGraphList) - in - if isNothing newOutgroupIndex then errorWithoutStackTrace ("Outgoup name not found: " ++ (snd $ head reRootBlock)) - else - trace ("Changing outgroup to " ++ (TL.unpack newOutgroupName)) - (inGS {outgroupIndex = fromJust newOutgroupIndex, outGroupName = newOutgroupName}, origData, inData, newPhylogeneticGraphList) - - - else error ("Transform type not implemented/recognized" ++ (show inArgs)) - --- | reWeightData sets weights to new values based on -reWeightData :: Double -> [String] -> [NameText] -> ProcessedData -> ProcessedData -reWeightData weightValue charTypeStringList charNameList (inName, inNameBV, inBlockDataV) = - let charTypeList = concatMap stringToType charTypeStringList - newBlockData = fmap (reweightBlockData weightValue charTypeList charNameList) inBlockDataV - in - (inName, inNameBV, newBlockData) - --- | stringToType takes String and returns typelist -stringToType :: String -> [CharType] -stringToType inString = - if null inString then [] - else - let inVal = fmap C.toLower inString - typeList = if inVal == "all" then exactCharacterTypes ++ sequenceCharacterTypes - else if inVal == "prealigned" then prealignedCharacterTypes - else if inVal `elem` ["nonexact", "dynamic"] then nonExactCharacterTypes - else if inVal == "nonadditive" then [NonAdd, Packed2, Packed4, Packed5, Packed8, Packed64] - else if inVal == "additive" then [Add] - else if inVal == "matrix" then [Matrix] - else if inVal == "sequence" then sequenceCharacterTypes - else if inVal == "packed" then packedNonAddTypes - else if inVal == "packed2" then [Packed2] - else if inVal == "packed4" then [Packed4] - else if inVal == "packed5" then [Packed5] - else if inVal == "packed8" then [Packed8] - else if inVal == "packed64" then [Packed64] - else if inVal `elem` ["static", "exact", "qualitative"] then exactCharacterTypes - - else errorWithoutStackTrace ("Error in transform : Unrecognized character type '" ++ inString ++ "'") - in - typeList - --- | reweightBlockData applies new weight to catagories of data -reweightBlockData :: Double -> [CharType] -> [NameText] -> BlockData -> BlockData -reweightBlockData weightValue charTypeList charNameList (blockName, blockData, charInfoV) = - let newCharacterInfoV = fmap (reweightCharacterData weightValue charTypeList charNameList) charInfoV - in - (blockName, blockData, newCharacterInfoV) - --- | reweightCharacterData changes weight in charInfo based on type or name -reweightCharacterData :: Double -> [CharType] -> [NameText] -> CharInfo -> CharInfo -reweightCharacterData weightValue charTypeList charNameList charInfo = - let wildCardMatchCharName = filter (== True) $ fmap (textMatchWildcards (name charInfo)) charNameList - in - -- trace ("RWC Wildcards: " ++ (show $ fmap (textMatchWildcards (name charInfo)) charNameList)) ( - if null wildCardMatchCharName && (charType charInfo) `notElem` charTypeList then - -- trace ("RWC not : " ++ (show $ name charInfo) ++ " of " ++ (show charNameList) ++ " " ++ (show $ charType charInfo) ++ " of " ++ (show charTypeList)) - charInfo - else - -- trace ("RWC: " ++ (show $ name charInfo) ++ " " ++ (show $ charType charInfo)) - charInfo {weight = weightValue} - -- ) - --- | makeStaticApprox takes ProcessedData and returns static approx (implied alignment recoded) ProcessedData --- if Tree take SA fields and recode appropriatrely given cost regeme of character --- if Softwired--use display trees for SA --- if hardWired--convert to softwired and use display trees for SA --- since for heuristic searcing--uses additive weight for sequences and simple cost matrices, otherwise --- matrix characters -makeStaticApprox :: GlobalSettings -> ProcessedData -> PhylogeneticGraph -> ProcessedData -makeStaticApprox inGS inData inGraph = - if LG.isEmpty (fst6 inGraph) then error "Empty graph in makeStaticApprox" - - -- tree type - else if graphType inGS == Tree then - let decGraph = thd6 inGraph - (nameV, nameBVV, blockDataV) = inData - - -- do each block in turn pulling and transforming data from inGraph - newBlockDataV = PU.seqParMap rdeepseq (pullGraphBlockDataAndTransform decGraph inData) [0..(length blockDataV - 1)] -- `using` PU.myParListChunkRDS - - -- convert prealigned to non-additive if all 1's tcm - - -- remove constants from new prealigned - newProcessedData = R.removeConstantCharactersPrealigned (nameV, nameBVV, V.fromList newBlockDataV) - - -- bit pack any new non-additive characters - newProcessedData' = BP.packNonAdditiveData inGS newProcessedData - in - -- trace ("MSA:" ++ (show (fmap (V.length . thd3) blockDataV, fmap (V.length . thd3) newBlockDataV))) - newProcessedData' - - else error ("Static Approx not yet implemented for graph type :" ++ (show $ graphType inGS)) - - --- | pullGraphBlockDataAndTransform takes a DecoratedGrpah and block index and pulls --- the character data of the block and transforms the leaf data by using implied alignment --- feilds for dynamic characters -pullGraphBlockDataAndTransform :: DecoratedGraph -> ProcessedData -> Int -> BlockData -pullGraphBlockDataAndTransform inDecGraph (_, _, blockCharInfoV) blockIndex = - let (_, leafVerts, _, _) = LG.splitVertexList inDecGraph - (_, leafLabelList) = unzip leafVerts - leafBlockData = fmap (V.! blockIndex) (fmap vertData leafLabelList) - - -- new recoded data-- need to filter out constant chars after recoding - -- nedd character legnth for missing values - charLengthV = V.zipWith U.getMaxCharacterLength (thd3 $ blockCharInfoV V.! blockIndex) (V.fromList $ fmap V.toList leafBlockData) - - (transformedLeafBlockData, transformedBlockInfo) = unzip $ fmap (transformData (thd3 $ blockCharInfoV V.! blockIndex) charLengthV) leafBlockData - in - -- trace ("PGDT: " ++ show charLengthV) - (fst3 $ blockCharInfoV V.! blockIndex, V.fromList transformedLeafBlockData, head transformedBlockInfo) - - --- | transformData takes original Character info and character data and transforms to static if dynamic noting chracter type -transformData :: V.Vector CharInfo -> V.Vector Int -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -transformData inCharInfoV inCharLengthV inCharDataV = - if V.null inCharInfoV then - (V.empty, V.empty) - else - let (outCharDataV, outCharInfoV) = V.unzip $ V.zipWith3 transformCharacter inCharDataV inCharInfoV inCharLengthV - in - (outCharDataV, outCharInfoV) - --- transformCharacter takes a single characer info and character and returns IA if dynamic as is if not --- checks if all gaps with the GV.filter. If all gaps--it means the sequence char was missing and --- implied alignment produced all gaps. The passing of character length is not necessary when changed missing seq to empty --- character--but leaving in case change back to [] -transformCharacter :: CharacterData -> CharInfo -> Int -> (CharacterData, CharInfo) -transformCharacter inCharData inCharInfo charLength = - let inCharType = charType inCharInfo - inCostMatrix = costMatrix inCharInfo - alphSize = length $ alphabet inCharInfo - - -- determine if matrix is all same costs => nonadditive - -- all same except fort single indel costs => non add with gap binary chars - -- not either => matrix char - (inCostMatrixType, gapCost) = R.getRecodingType inCostMatrix - - in - -- trace ("TC:" ++ (show alphSize) ++ " " ++ (show $ alphabet inCharInfo)) ( - --trace ("TC:" ++ (show charLength) ++ " " ++ (show (GV.length $ snd3 $ slimAlignment inCharData, GV.length $ snd3 $ wideAlignment inCharData, GV.length $ snd3 $ hugeAlignment inCharData))) ( - if inCharType `elem` exactCharacterTypes then (inCharData, inCharInfo) - - else if inCharType `elem` prealignedCharacterTypes then (inCharData, inCharInfo) - - else - -- trace ("TC: " ++ inCostMatrixType) ( - -- different types--vector wrangling - -- missing data fields set if no implied alignment ie missing data - if inCharType `elem` [SlimSeq, NucSeq] then - let gapChar = setBit (0 :: CUInt) gapIndex - impliedAlignChar = if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ slimAlignment inCharData) then slimAlignment inCharData - else - let missingElement = SV.replicate charLength $ TRANS.setMissingBits (0 :: CUInt) 0 alphSize - in - (missingElement, missingElement, missingElement) - - newPrelimBV = R.convert2BV 32 impliedAlignChar - newPrelimBVGaps = addGaps2BV gapCost newPrelimBV - in - -- trace ("TC-Slim:" ++ (show $ GV.length $ snd3 $ slimAlignment inCharData) ++ " " ++ (show $ snd3 $ impliedAlignChar)) ( - - if inCostMatrixType == "nonAdd" then - (inCharData {stateBVPrelim = newPrelimBV}, inCharInfo {charType = NonAdd}) - - else if inCostMatrixType == "nonAddGap" then - (inCharData {stateBVPrelim = newPrelimBVGaps}, inCharInfo {charType = NonAdd}) - - else -- matrix recoding - (inCharData {alignedSlimPrelim = impliedAlignChar}, inCharInfo {charType = AlignedSlim}) - -- ) - - else if inCharType `elem` [WideSeq, AminoSeq] then - let gapChar = setBit (0 :: Word64) gapIndex - impliedAlignChar = if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ wideAlignment inCharData) then wideAlignment inCharData - else - let missingElement = UV.replicate charLength $ TRANS.setMissingBits (0 :: Word64) 0 alphSize - in (missingElement, missingElement, missingElement) - - newPrelimBV = R.convert2BV 64 impliedAlignChar - newPrelimBVGaps = addGaps2BV gapCost newPrelimBV - in - if inCostMatrixType == "nonAdd" then - (inCharData {stateBVPrelim = newPrelimBV}, inCharInfo {charType = NonAdd}) - - else if inCostMatrixType == "nonAddGap" then - (inCharData {stateBVPrelim = newPrelimBVGaps}, inCharInfo {charType = NonAdd}) - - else -- matrix recoding - (inCharData {alignedWidePrelim = impliedAlignChar}, inCharInfo {charType = AlignedWide}) - - else if inCharType == HugeSeq then - let gapChar = setBit (BV.fromBits $ replicate alphSize False) gapIndex - impliedAlignChar = if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ hugeAlignment inCharData) then hugeAlignment inCharData - else - let missingElement = V.replicate charLength $ (BV.fromBits $ replicate alphSize True) - in (missingElement, missingElement, missingElement) - - newPrelimBV = impliedAlignChar - newPrelimBVGaps = addGaps2BV gapCost newPrelimBV - in - if inCostMatrixType == "nonAdd" then - (inCharData {stateBVPrelim = newPrelimBV}, inCharInfo {charType = NonAdd}) - - else if inCostMatrixType == "nonAddGap" then - (inCharData {stateBVPrelim = newPrelimBVGaps}, inCharInfo {charType = NonAdd}) - - else -- matrix recoding - (inCharData {alignedHugePrelim = impliedAlignChar}, inCharInfo {charType = AlignedHuge}) - - else - error ("Unrecognized character type in transformCharacter: " ++ (show inCharType)) - -- ) - --- | addGaps2BV adds gap characters 0 = nonGap, 1 = Gap to Vector --- of states to non-additive charcaters for static approx. gapCost - 1 characters are added --- sems wasteful, but comctant filtered out and recoded later when non-add/add charsa re optimized and bitpacked --- since this only for leaves assume inM good for all -addGaps2BV :: Int -> (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -> (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -addGaps2BV gapCost (_, inM, _) = - -- trace ("AG2BV: " ++ (show inM)) ( - let gapChar = BV.fromNumber (BV.dimension $ V.head inM) (1 :: Int) - noGap = L.replicate (gapCost - 1) $ BV.fromNumber (BV.dimension $ V.head inM) (1 :: Int) - hasGap = L.replicate (gapCost - 1) $ BV.fromNumber (BV.dimension $ V.head inM) (2 :: Int) - gapCharV = createGapChars inM gapChar [] noGap hasGap - outM = inM V.++ gapCharV - in - (outM, outM, outM) - -- ) - --- | createGapChars takes a vector of bitvector coded states and checks if first states == 1 (= gap) --- if so a number based on gap cost are created.. Will create n * original klength so need to --- filter out constant characters later -createGapChars :: V.Vector BV.BitVector -> BV.BitVector -> [BV.BitVector] -> [BV.BitVector] -> [BV.BitVector] -> V.Vector BV.BitVector -createGapChars origBVV gapCharacter newCharL noGapL hasGapL = - if V.null origBVV then V.fromList newCharL - else - if V.head origBVV == gapCharacter then createGapChars (V.tail origBVV) gapCharacter (hasGapL ++ newCharL) noGapL hasGapL - else createGapChars (V.tail origBVV) gapCharacter (noGapL ++ newCharL) noGapL hasGapL - diff --git a/pkg/PhyGraph/Commands/Verify.hs b/pkg/PhyGraph/Commands/Verify.hs deleted file mode 100644 index e1282f3b2..000000000 --- a/pkg/PhyGraph/Commands/Verify.hs +++ /dev/null @@ -1,284 +0,0 @@ -{- | -Module : Verify.hs -Description : Module to verify (more or less) input commands -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - - -module Commands.Verify - ( verifyCommands - , allowedCommandList - , buildArgList - , fuseArgList - , geneticAlgorithmArgList - , netEdgeArgList - , readArgList - , reconcileArgList - , refineArgList - , reportArgList - , searchArgList - , setArgList - , selectArgList - , supportArgList - , swapArgList - , transformArgList - ) where - -import Types.Types -import GeneralUtilities -import qualified Data.List as L -import qualified Data.Char as C -import Text.Read - --- import Debug.Trace - --- | allowedCommandList is the permitted command string list -allowedCommandList :: [String] -allowedCommandList = ["build", "fuse", "read", "reblock", "refine", "rename", "report", "run", "search", "select", "set", "support", "swap"] - --- list of valid instructions -validInstructionList :: [Instruction] -validInstructionList = [Build, Fuse, Read, Reblock, Refine, Rename, Report, Run, Select, Set, Swap, Search, Support, Transform] - --- | buildArgList is the list of valid build arguments -buildArgList :: [String] -buildArgList = ["replicates", "nj", "wpgma", "dwag", "rdwag", "distance", "character", "best","none","otu","spr","tbr", "block","cun", "eun", "atrandom", "first", "displaytrees", "graph"] - --- | fuseArgList arguments -fuseArgList :: [String] -fuseArgList = ["spr","tbr", "keep", "steepest", "all", "nni", "best", "unique", "once", "atrandom", "pairs"] - --- | geneticAlgorithm arguments -geneticAlgorithmArgList :: [String] -geneticAlgorithmArgList = ["popsize", "generations", "elitist", "severity", "recombinations","geneticalgorithm", "ga", "maxnetedges"] - --- | netEdgeArgList arguments for network edge add/delete operations -netEdgeArgList :: [String] -netEdgeArgList = ["keep", "steepest", "all", "netadd", "netdel", "netdelete", "netadddel", "netadddelete", "netmove", "annealing", "steps", "returnmutated", "drift", "acceptequal", "acceptworse", "maxchanges","steepest","atrandom", "maxnetedges", "rounds"] - --- | Read arg list allowable modifiers in read -readArgList :: [String] -readArgList = ["tcm", "nucleotide", "aminoacid", "fasta", "fastc", "tnt", "csv", - "dot", "newick" , "enewick", "fenewick", "include", "exclude", "rename", "block", "prefasta", - "prefastc", "preaminoacid", "prenucleotide"] -- "prealigned", - --- should be moved to a single file for import --- | reconcileCommandList list of allowable commands -reconcileArgList :: [String] -reconcileArgList = ["method", "compare", "threshold", "outformat", "connect", "edgelabel", "vertexlabel"] -- "outfile" - --- | reconcileOptionsList list of allowable command options of method, compare, threshhold, and outformat -reconcileOptionsList :: [String] -reconcileOptionsList = ["eun", "cun", "strict", "majority", "adams", "dot" ,"dotpdf", "fen", "newick", "true", "false", "combinable", "identity"] - - --- | refinement arguments -refineArgList :: [String] -refineArgList = fuseArgList ++ netEdgeArgList ++ geneticAlgorithmArgList - --- | reportArgList contains valid 'report' arguments -reportArgList :: [String] -reportArgList = ["all", "data", "search", "graphs", "overwrite", "append", "dot", "dotpdf", "newick", "ascii", "crossrefs", "pairdist", "diagnosis","displaytrees", "reconcile", "support", "ia", "impliedalignment", "tnt", "includemissing", "concatenate", "htulabels", "branchlengths", "nohtulabels", "nobranchlengths", "collapse", "nocollapse"] ++ reconcileArgList - --- | search arguments -searchArgList :: [String] -searchArgList = ["days", "hours", "minutes", "seconds", "instances"] - --- | buildArgList is the list of valid build arguments -selectArgList :: [String] -selectArgList = ["best", "all", "unique", "atrandom"] - --- | setArgLIst contains valid 'set' arguments -setArgList :: [String] -setArgList = ["outgroup", "criterion", "graphtype", "compressresolutions", "finalassignment", "graphfactor", "rootcost", "seed","partitioncharacter", "modelcomplexity", - "bc2", "bc4", "bc5", "bc8", "bc64", "bcgt64", "dynamicepsilon"] - --- | refinement arguments -supportArgList :: [String] -supportArgList = ["bootstrap", "jackknife", "goodmanbremer", "gb", "gbsample", "replicates", "buildonly", "atrandom"] -- "bootstrap", - --- | buildArgList is the list of valid build arguments -swapArgList :: [String] -swapArgList = ["spr","tbr", "alternate", "keep", "steepest", "all", "nni", "ia", "annealing", "maxtemp", "mintemp", "steps", "returnmutated", "drift", "acceptequal", "acceptworse", "maxchanges"] - --- | transform arguments -transformArgList :: [String] -transformArgList = ["totree", "tosoftwired", "tohardwired", "staticapprox", "dynamic", "atrandom", "first", "displaytrees", "weight", "name", "type", "dynamicepsilon", "outgroup"] - - --- | verifyCommands takes a command list and tests whether the commands --- and arguments are permissible before program execution--prevents late failure --- after alot of processing time. --- bit does not check for files existance, write/read ability, or contents for format or --- anyhhting else for that matter --- does check if files are both read from and written to -verifyCommands :: [Command] -> [String] -> [String] -> Bool -verifyCommands inCommandList inFilesToRead inFilesToWrite = - if null inCommandList then True - else - let firstCommand = head inCommandList - commandInstruction = fst firstCommand - inArgs = snd firstCommand - - -- check valid commandInstructions - -- this is done earlier but might get oved so putting here just in case - checkInstruction = commandInstruction `elem` validInstructionList - - in - if not checkInstruction then errorWithoutStackTrace ("Invalid command was specified : " ++ (show commandInstruction)) - else - -- check each command for valid arguments - -- make lower-case arguments - let fstArgList = filter (/= []) $ fmap (fmap C.toLower . fst) inArgs - sndArgList = filter (/= []) $ fmap (fmap C.toLower . snd) inArgs - fileNameList = fmap (filter (/= '"')) $ filter (/= []) $ fmap snd inArgs - - -- Read - (checkOptions, filesToReadFrom, filesToWriteTo) = - - -- Build - if commandInstruction == Build then - (checkCommandArgs "build" fstArgList buildArgList, [""], [""]) - - -- Fuse - else if commandInstruction == Fuse then - (checkCommandArgs "fuse" fstArgList fuseArgList, [""], [""]) - - else if commandInstruction == Read then - let fileArgs = concat $ filter (/= []) $ fmap snd inArgs - numDoubleQuotes = length $ filter (== '"') fileArgs - has02DoubleQuotes = (numDoubleQuotes == 2) || (numDoubleQuotes == 0) - in - if not has02DoubleQuotes then errorWithoutStackTrace ("Unbalanced quotation marks in 'read' file argument: " ++ fileArgs) - else (checkCommandArgs "read" fstArgList readArgList, fileNameList, [""]) - - -- Reblock -- no arguments--but reads string and blocknames - else if commandInstruction == Reblock then - let fileArgs = concat $ filter (/= []) $ fmap snd inArgs - numDoubleQuotes = length $ filter (== '"') fileArgs - (numDouble, numUnbalanced) = divMod numDoubleQuotes 2 - in - -- trace (show (fstArgList, sndArgList,fileNameList)) ( - if numDouble < 2 then errorWithoutStackTrace ("Need at least two fields in 'rebock' command, new block name and old block(s) in double quotes: " ++ fileArgs) - else if numUnbalanced /= 0 then errorWithoutStackTrace ("Unbalanced quotation marks in 'reblock' command: " ++ fileArgs) - else (True,[""], [""]) - -- ) - - -- Reconcile -- part of report - - -- Refine - else if commandInstruction == Refine then - (checkCommandArgs "refine" fstArgList refineArgList,[""], [""]) - - -- Rename -- -- no arguments--but reads string and taxon names - else if commandInstruction == Rename then - let fileArgs = concat $ filter (/= []) $ fmap snd inArgs - numDoubleQuotes = length $ filter (== '"') fileArgs - (numDouble, numUnbalanced) = divMod numDoubleQuotes 2 - in - -- trace (show (fstArgList, sndArgList,fileNameList)) ( - if numDouble < 2 then errorWithoutStackTrace ("Need at least two fields in 'rename' command, new taxon name and old taxon name(s) in double quotes: " ++ fileArgs) - else if numUnbalanced /= 0 then errorWithoutStackTrace ("Unbalanced quotation marks in 'rename' command: " ++ fileArgs) - else (True,[""], [""]) - -- ) - - -- Report - else if commandInstruction == Report then - let fileArgs = concat $ filter (/= []) $ fmap snd inArgs - numDoubleQuotes = length $ filter (== '"') fileArgs - has02DoubleQuotes = (numDoubleQuotes == 2) || (numDoubleQuotes == 0) - in - if not has02DoubleQuotes then errorWithoutStackTrace ("Unbalanced quotation marks in report file argument: " ++ fileArgs) - else (checkCommandArgs "report" fstArgList reportArgList, [""], fileNameList) - - -- Run -- processed out before this into command list - - -- Search - else if commandInstruction == Search then - if "reconcile" `notElem` fstArgList then (checkCommandArgs "report" fstArgList searchArgList, [""], [""]) - else - let reconcilePairList = zip fstArgList sndArgList - nonThresholdreconcileModPairList = filter ((/= "threshold"). fst) $ reconcilePairList - thresholdreconcileModPairList = filter ((== "threshold"). fst) $ reconcilePairList - checkReconcile1 = checkCommandArgs "reconcile" (fmap fst nonThresholdreconcileModPairList) reconcileArgList - checkReconcile2 = checkCommandArgs "reconcile" (fmap fst thresholdreconcileModPairList) reconcileArgList - checkReconcile3 = checkCommandArgs "reconcile modifier (method, compare, outformat, connect, edgelabel, vertexlabel)" - (fmap snd nonThresholdreconcileModPairList) reconcileOptionsList - checkReconcile4 = L.foldl1' (&&) $ True : (fmap isInt (filter (/= []) (fmap snd thresholdreconcileModPairList))) - checkReconcile = checkReconcile1 && checkReconcile2 && checkReconcile3 && checkReconcile4 - in - if checkReconcile then (checkCommandArgs "report" fstArgList searchArgList,[""], [""]) - else (False, [], []) - -- Select - else if commandInstruction == Select then - (checkCommandArgs "select" fstArgList selectArgList, [""], [""]) - - -- Set - else if commandInstruction == Set then - (checkCommandArgs "set" fstArgList setArgList, [""], [""]) - - -- Support - else if commandInstruction == Support then - (checkCommandArgs "support" fstArgList supportArgList, [""], [""]) - - -- Swap - else if commandInstruction == Swap then - (checkCommandArgs "swap" fstArgList swapArgList, [""], [""]) - - -- Transform - else if commandInstruction == Transform then - (checkCommandArgs "transform" fstArgList transformArgList, [""], [""]) - - else errorWithoutStackTrace ("Unrecognized command was specified : " ++ (show commandInstruction)) - - - - in - if checkOptions then - let allFilesToReadFrom = filter (/= "") $ filesToReadFrom ++ inFilesToRead - allFilesToWriteTo = filter (/= "") $ filesToWriteTo ++ inFilesToWrite - readAndWriteFileList = L.intersect allFilesToReadFrom allFilesToWriteTo - in - -- trace (show (allFilesToReadFrom, allFilesToWriteTo)) ( - if (not .null) readAndWriteFileList then - errorWithoutStackTrace ("Error--Both reading from and writing to files (could cause errors and/or loss of data): " ++ (show readAndWriteFileList)) - else verifyCommands (tail inCommandList) allFilesToReadFrom allFilesToWriteTo - -- ) - - else - -- Won't get to here--will error at earlier stages - False - - where isInt a = if (readMaybe a :: Maybe Int) /= Nothing then True else False - diff --git a/pkg/PhyGraph/Debug/Debug.hs b/pkg/PhyGraph/Debug/Debug.hs deleted file mode 100644 index 3c1fee9d5..000000000 --- a/pkg/PhyGraph/Debug/Debug.hs +++ /dev/null @@ -1,90 +0,0 @@ -{- | -Module : Debug.hs -Description : Module with Debug version of functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - - -module Debug.Debug where - -import Types.Types --- import Data.List -import qualified Data.Vector as V - -debugZip :: [a] -> [b] -> [(a,b)] -debugZip la lb - | not isDebug = zip la lb - | length la /= length lb = error ("Zip arguments not equal in length: " ++ show (length la, length lb)) - | null la = error ("First list null in debugZip " ++ show (length la, length lb)) - | null lb = error ("Second list null in debugZip " ++ show (length la, length lb)) - | otherwise = zip la lb - - -debugZip3 :: [a] -> [b] -> [c] -> [(a,b,c)] -debugZip3 la lb lc - | not isDebug = zip3 la lb lc - | (length la /= length lb) || (length la /= length lc) || (length lb /= length lc) = error ("Zip3 arguments not equal in length: " ++ show (length la, length lb, length lc)) - | null la = error ("First list null in debugZip3 " ++ show (length la, length lb, length lc)) - | null lb = error ("Second list null in debugZip3 " ++ show (length la, length lb, length lc)) - | null lc = error ("Third list null in debugZip3 " ++ show (length la, length lb, length lc)) - | otherwise = zip3 la lb lc - -debugVectorZip :: V.Vector a -> V.Vector b -> V.Vector (a,b) -debugVectorZip la lb - | not isDebug = V.zip la lb - | V.length la /= V.length lb = error ("Zip arguments not equal in length: " ++ show (V.length la, V.length lb)) - | V.null la = error ("First vector null in debugZip " ++ show (V.length la, V.length lb)) - | V.null lb = error ("Second vector null in debugZip " ++ show (V.length la, V.length lb)) - | otherwise = V.zip la lb - - -debugVectorZip3 :: V.Vector a -> V.Vector b -> V.Vector c -> V.Vector (a,b,c) -debugVectorZip3 la lb lc - | not isDebug = V.zip3 la lb lc - | (V.length la /= V.length lb) || (V.length la /= V.length lc) || (V.length lb /= V.length lc) = error ("Zip3 arguments not equal in length: " ++ show (V.length la, V.length lb, V.length lc)) - | V.null la = error ("First vector null in debugZip3 " ++ show (V.length la, V.length lb, V.length lc)) - | V.null lb = error ("Second vector null in debugZip3 " ++ show (V.length la, V.length lb, V.length lc)) - | V.null lc = error ("Third vector null in debugZip3 " ++ show (V.length la, V.length lb, V.length lc)) - | otherwise = V.zip3 la lb lc - -debugVectorZip4 :: V.Vector a -> V.Vector b -> V.Vector c -> V.Vector d -> V.Vector (a,b,c,d) -debugVectorZip4 la lb lc ld - | not isDebug = V.zip4 la lb lc ld - | (V.length la /= V.length lb) || (V.length la /= V.length lc) || (V.length la /= V.length ld) || (V.length lb /= V.length lc) || (V.length lb /= V.length ld) || (V.length lc /= V.length ld) = error ("Zip3 arguments not equal in length: " ++ show (V.length la, V.length lb, V.length lc, V.length ld)) - | V.null la = error ("First vector null in debugZip4 " ++ show (V.length la, V.length lb, V.length lc, V.length ld)) - | V.null lb = error ("Second vector null in debugZip4 " ++ show (V.length la, V.length lb, V.length lc, V.length ld)) - | V.null lc = error ("Third vector null in debugZip4 " ++ show (V.length la, V.length lb, V.length lc, V.length ld)) - | V.null ld = error ("Fourth vector null in debugZip4 " ++ show (V.length la, V.length lb, V.length lc, V.length ld)) - | otherwise = V.zip4 la lb lc ld - diff --git a/pkg/PhyGraph/GraphOptimization/Medians.hs b/pkg/PhyGraph/GraphOptimization/Medians.hs deleted file mode 100644 index 615337df3..000000000 --- a/pkg/PhyGraph/GraphOptimization/Medians.hs +++ /dev/null @@ -1,933 +0,0 @@ -{- | -Module : Medians.hs -Description : Module specifying data type medians -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-- -TODO: - - Parallelize median2Vect ---} - -module GraphOptimization.Medians ( median2 - , median2Single - , median2NonExact - -- , median2SingleNonExact - , median2StaticIA - , makeIAPrelimCharacter - , makeIAFinalCharacter - -- , createUngappedMedianSequence - , intervalAdd - , interUnion - , getNewRange - , addMatrix - , getDOMedian - , getPreAligned2Median - , getDOMedianCharInfo - , pairwiseDO - , makeDynamicCharacterFromSingleVector - , makeEdgeData - , createEdgeUnionOverBlocks - , get2WaySlim - , get2WayWideHuge - , getFinal3WaySlim - , getFinal3WayWideHuge - , generalSequenceDiff - , union2Single - ) where - -import Bio.DynamicCharacter -import Data.Alphabet -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import Data.Foldable -import qualified Data.MetricRepresentation as MR -import qualified Data.TCM.Dense as TCMD -import qualified Data.Vector as V -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Storable as SV -import Data.Word -import DirectOptimization.Pairwise -import Foreign.C.Types (CUInt) -import GeneralUtilities -import qualified Input.BitPack as BP -import qualified SymMatrix as S -import Types.Types -import qualified Utilities.LocalGraph as LG -import Data.Maybe - --- import Debug.Trace - --- | makeDynamicCharacterFromSingleVector takes a single vector (usually a 'final' state) --- and returns a dynamic character that canbe used with other functions -makeDynamicCharacterFromSingleVector :: (GV.Vector v a) => v a -> (v a, v a, v a) -makeDynamicCharacterFromSingleVector dc = unsafeCharacterBuiltByST (toEnum $ GV.length dc) $ \dc' -> GV.imapM_ (\k v -> setAlign dc' k v v v) dc - --- | median2 takes the vectors of characters and applies median2Single to each --- character --- for parallel fmap over all then parallelized by type and sequences --- used for distances and post-order assignments -median2 :: V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector (CharacterData, VertexCost) -median2 = V.zipWith3 (median2Single False) - - --- | median2NonExact takes the vectors of characters and applies median2NonExact to each --- character for parallel fmap over all then parallelized by type and sequences --- this only reoptimized the nonexact characters (sequence characters for now, perhpas otehrs later) --- and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. -median2NonExact :: V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector (CharacterData, VertexCost) -median2NonExact = V.zipWith3 median2SingleNonExact - --- | median2StaticIA takes the vectors of characters and applies median2SingleStaticIA to each --- character for parallel fmap over all then parallelized by type and sequences --- this reoptimized only IA fields for the nonexact characters (sequence characters for now, perhpas others later) --- and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. -median2StaticIA :: V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector (CharacterData, VertexCost) -median2StaticIA = V.zipWith3 (median2Single True) - --- | median2Single takes character data and returns median character and cost --- median2single assumes that the character vectors in the various states are the same length --- that is--all leaves (hence other vertices later) have the same number of each type of character --- used for post-order assignments --- this is from preliminary states --- staticIA for dynm,aic assumes all same length --- PMDL costs are claculate by type--additive by conversion to non-additive --but if states> 129 worn't do it so warning in docs --- bp2,4,5,8,64, nonadd are by weights vis set command, matrix, sequence are set by tcm with non-zero diagnonal -median2Single :: Bool -> CharacterData -> CharacterData -> CharInfo -> (CharacterData, VertexCost) -median2Single staticIA firstVertChar secondVertChar inCharInfo = - let thisType = charType inCharInfo - thisWeight = weight inCharInfo - thisMatrix = costMatrix inCharInfo - thisSlimTCM = slimTCM inCharInfo - thisWideTCM = wideTCM inCharInfo - thisHugeTCM = hugeTCM inCharInfo - thisActive = activity inCharInfo - thisNoChangeCost = noChangeCost inCharInfo - thisChangeCost = changeCost inCharInfo - in - if not thisActive then (firstVertChar, 0) - else if thisType == Add then - let newCharVect = intervalAdd thisWeight firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType == NonAdd then - let newCharVect = interUnion thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType `elem` packedNonAddTypes then - --assumes all weight 1 - let newCharVect = BP.median2Packed thisType thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType == Matrix then - let newCharVect = addMatrix thisWeight thisMatrix firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType `elem` prealignedCharacterTypes then - let newCharVect = getPreAligned2Median inCharInfo emptyCharacter firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType `elem` nonExactCharacterTypes then - let newCharVect = if staticIA then makeIAPrelimCharacter inCharInfo emptyCharacter firstVertChar secondVertChar - else getDOMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType firstVertChar secondVertChar - in - -- trace ("M2S: " ++ (show $ localCost newCharVect)) - (newCharVect, localCost newCharVect) - - else error ("Character type " ++ show thisType ++ " unrecognized/not implemented") - --- | median2SingleNonExact takes character data and returns median character and cost --- median2single assumes that the character vectors in the various states are the same length --- that is--all leaves (hencee other vertices later) have the same number of each type of character --- this only reoptimized the nonexact characters (sequence characters for now, perhpas otehrs later) --- and skips optimization placing a dummy value exact (Add, NonAdd, Matrix) for the others. -median2SingleNonExact :: CharacterData -> CharacterData -> CharInfo -> (CharacterData, VertexCost) -median2SingleNonExact firstVertChar secondVertChar inCharInfo = - let thisType = charType inCharInfo - thisWeight = weight inCharInfo - thisMatrix = costMatrix inCharInfo - thisSlimTCM = slimTCM inCharInfo - thisWideTCM = wideTCM inCharInfo - thisHugeTCM = hugeTCM inCharInfo - thisActive = activity inCharInfo - dummyStaticCharacter = emptyCharacter - in - if (not thisActive) || (thisType `elem` exactCharacterTypes) then (dummyStaticCharacter, 0) - else if thisType `elem` prealignedCharacterTypes then - let newCharVect = getPreAligned2Median inCharInfo dummyStaticCharacter firstVertChar secondVertChar - in - -- trace ("M2S:" ++ (show $ localCost newCharVect) ++ (show (firstVertChar, secondVertChar))) - -- trace ("M2SNEP: " ++ (show thisType)) - (newCharVect, localCost newCharVect) - - else if thisType `elem` nonExactCharacterTypes then - let newCharVect = getDOMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType firstVertChar secondVertChar - in - --trace ("M2SNE: " ++ (show thisType) ++ (show $ localCost newCharVect)) - (newCharVect, localCost newCharVect) - - else error ("Character type " ++ show thisType ++ " unrecognized/not implemented") - - -{- --- | median2SingleStaticIA takes character data and returns median character and cost for Static and IA fields of dynamic --- median2SingleStaticIA assumes that the character vectors in the various states are the same length --- that is--all leaves (hence other vertices later) have the same number of each type of character --- used for post-order assignments --- this is from preliminary states -median2SingleStaticIA :: CharacterData -> CharacterData -> CharInfo -> (CharacterData, VertexCost) -median2SingleStaticIA firstVertChar secondVertChar inCharInfo = - let thisType = charType inCharInfo - thisWeight = weight inCharInfo - thisMatrix = costMatrix inCharInfo - thisActive = activity inCharInfo - in - if not thisActive then (firstVertChar, 0) - else if thisType == Add then - let newCharVect = intervalAdd thisWeight firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType == NonAdd then - let newCharVect = interUnion thisWeight firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType `elem` packedNonAddTypes then - --assumes all weight 1 - let newCharVect = BP.median2Packed thisType firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType == Matrix then - let newCharVect = addMatrix thisWeight thisMatrix firstVertChar secondVertChar - in - --trace (show $ alphabet inCharInfo) - (newCharVect, localCost newCharVect) - - else if thisType `elem` prealignedCharacterTypes then - let newCharVect = getPreAligned2Median inCharInfo emptyCharacter firstVertChar secondVertChar - in - (newCharVect, localCost newCharVect) - - else if thisType `elem` nonExactCharacterTypes then - let newCharVect = makeIAPrelimCharacter inCharInfo emptyCharacter firstVertChar secondVertChar - in - --trace (show $ alphabet inCharInfo) - (newCharVect, localCost newCharVect) - - else error ("Character type " ++ show thisType ++ " unrecognized/not implemented") --} - --- | localOr wrapper for BV.or for vector elements -localOr :: BV.BitVector -> BV.BitVector -> BV.BitVector -localOr lBV rBV = lBV .|. rBV - -{- --- | localAnd wrapper for BV.and for vector elements -localAnd :: BV.BitVector -> BV.BitVector -> BV.BitVector -localAnd lBV rBV = lBV .&. rBV - --- | localAndOr takes the intesection vect and union vect elements --- and return intersection is /= 0 otherwise union -localAndOr ::BV.BitVector -> BV.BitVector -> BV.BitVector -localAndOr interBV unionBV = if BV.isZeroVector interBV then unionBV else interBV --} - --- | interUnionBV takes two bitvectors and returns new state, nochange number (1 or 0), change number (0 or 1) -interUnionBV :: BV.BitVector -> BV.BitVector -> (BV.BitVector, Int, Int) -interUnionBV leftBV rightBV = - if BV.isZeroVector (leftBV .&. rightBV) then (leftBV .|. rightBV, 0, 1) - else (leftBV .&. rightBV, 1, 0) - - --- | interUnion takes two non-additive chars and creates newCharcter as 2-median --- in post-order pass to create preliminary states assignment --- assumes a single weight for all --- performs two passes though chars to get cost of assignments --- snd3 $ rangePrelim left/rightChar due to triple in prelim --- could have done noChnageCoast/Chaneg cost with length subtraction but very small issue in real use since --- only for nonadd characters with > 64 states. -interUnion :: Double -> (Double, Double) -> CharacterData -> CharacterData -> CharacterData -interUnion thisWeight (lNoChangeCost, lChangeCost) leftChar rightChar = - let (newStateVect, noChangeCostVect, changeCostVect) = V.unzip3 $ V.zipWith interUnionBV (snd3 $ stateBVPrelim leftChar) (snd3 $ stateBVPrelim rightChar) - newCost = thisWeight * ((lNoChangeCost * (fromIntegral $ V.sum noChangeCostVect)) + (lChangeCost * (fromIntegral $ V.sum changeCostVect))) - newCharacter = emptyCharacter { stateBVPrelim = (snd3 $ stateBVPrelim leftChar, newStateVect, snd3 $ stateBVPrelim rightChar) - , localCost = newCost - , globalCost = newCost + globalCost leftChar + globalCost rightChar - } - in - --trace ("NonAdditive: " ++ (show numUnions) ++ " " ++ (show newCost) ++ "\t" ++ (show $ stateBVPrelim leftChar) ++ "\t" ++ (show $ stateBVPrelim rightChar) ++ "\t" - -- ++ (show intersectVect) ++ "\t" ++ (show unionVect) ++ "\t" ++ (show newStateVect)) - newCharacter - --- | localUnion takes two non-additive chars and creates newCharcter as 2-union/or --- assumes a single weight for all --- performs single --- bsaed on final states -localUnion :: CharacterData -> CharacterData -> CharacterData -localUnion leftChar rightChar = - let unionVect = V.zipWith localOr (stateBVFinal leftChar) (stateBVFinal rightChar) - newCharacter = emptyCharacter { stateBVPrelim = (unionVect, unionVect, unionVect) - , stateBVFinal = unionVect - } - in - --trace ("NonAdditive: " ++ (show numUnions) ++ " " ++ (show newCost) ++ "\t" ++ (show $ stateBVPrelim leftChar) ++ "\t" ++ (show $ stateBVPrelim rightChar) ++ "\t" - -- ++ (show intersectVect) ++ "\t" ++ (show unionVect) ++ "\t" ++ (show newStateVect)) - newCharacter - --- | getNewRange takes min and max range of two additive charcaters and returns --- a triple of (newMin, newMax, Cost) -getNewRange :: Int-> Int -> Int -> Int -> (Int, Int, Int) -getNewRange lMin lMax rMin rMax = - -- subset - if (rMin >= lMin) && (rMax <= lMax) then (rMin, rMax, 0) - else if (lMin >= rMin) && (lMax <= rMax) then (lMin, lMax, 0) - -- overlaps - else if (rMin >= lMin) && (rMax >= lMax) && (rMin <= lMax) then (rMin, lMax,0) - else if (lMin >= rMin) && (lMax >= rMax) && (lMin <= rMax) then (lMin, rMax,0) - -- newInterval - else if lMax <= rMin then (lMax, rMin, rMin - lMax) - else if rMax <= lMin then (rMax, lMin, lMin - rMax) - else error ("This can't happen " ++ (show (lMin, lMax, rMin, rMax))) - --- | intervalAdd takes two additive chars and creates newCharcter as 2-median --- in post-order pass to create preliminary states assignment --- assumes a single weight for all --- snd3 $ rangePrelim left/rightChar due to triple in prelim -intervalAdd :: Double -> CharacterData -> CharacterData -> CharacterData -intervalAdd thisWeight leftChar rightChar = - let newRangeCosts = V.zipWith4 getNewRange (V.map fst $ snd3 $ rangePrelim leftChar) (V.map snd $ snd3 $ rangePrelim leftChar) (V.map fst $ snd3 $ rangePrelim rightChar) (V.map snd $ snd3 $ rangePrelim rightChar) - newMinRange = V.map fst3 newRangeCosts - newMaxRange = V.map snd3 newRangeCosts - newCost = thisWeight * fromIntegral (V.sum $ V.map thd3 newRangeCosts) - newCharcater = emptyCharacter { rangePrelim = (snd3 $ rangePrelim leftChar, V.zip newMinRange newMaxRange, snd3 $ rangePrelim rightChar) - , localCost = newCost - , globalCost = newCost + globalCost leftChar + globalCost rightChar - } - in - newCharcater - --- | getUnionRange takes min and max range of two additive charcaters and returns --- a pair of (newMin, newMax) -getUnionRange :: Int -> Int -> Int -> Int -> (Int, Int) -getUnionRange lMin lMax rMin rMax = - (min lMin rMin, max lMax rMax) - --- | intervalUnion takes two additive chars and creates newCharcter as 2-union --- min of all lower, max of all higher --- final states used and assigned to obthe prelim and final for use in swap delta -intervalUnion :: CharacterData -> CharacterData -> CharacterData -intervalUnion leftChar rightChar = - let newRangeCosts = V.zipWith4 getUnionRange (V.map fst $ rangeFinal leftChar) (V.map snd $ rangeFinal leftChar) (V.map fst $ rangeFinal rightChar) (V.map snd $ rangeFinal rightChar) - newMinRange = V.map fst newRangeCosts - newMaxRange = V.map snd newRangeCosts - newCharcater = emptyCharacter { rangePrelim = (V.zip newMinRange newMaxRange, V.zip newMinRange newMaxRange, V.zip newMinRange newMaxRange) - , rangeFinal = V.zip newMinRange newMaxRange - } - in - --trace ("Additive: " ++ (show newCost) ++ "\t" ++ (show $ rangeFinal leftChar) ++ "\t" ++ (show $ rangeFinal rightChar) - -- ++ (show newRangeCosts)) - newCharcater - --- | getMinCostStates takes cost matrix and vector of states (cost, _, _) and retuns a list of (totalCost, best child state) -getMinCostStates :: S.Matrix Int -> V.Vector MatrixTriple -> Int -> Int -> Int -> [(Int, ChildStateIndex)]-> Int -> [(Int, ChildStateIndex)] -getMinCostStates thisMatrix childVect bestCost numStates childState currentBestStates stateIndex = - --trace (show thisMatrix ++ "\n" ++ (show childVect) ++ "\n" ++ show (numStates, childState, stateIndex)) ( - if V.null childVect then reverse (filter ((== bestCost).fst) currentBestStates) - else - let (childCost, _, _) = V.head childVect - childStateCost = if childCost /= (maxBound :: Int) then childCost + (thisMatrix S.! (childState, stateIndex)) - else (maxBound :: Int) - in - if childStateCost > bestCost then getMinCostStates thisMatrix (V.tail childVect) bestCost numStates (childState + 1) currentBestStates stateIndex - else if childStateCost == bestCost then getMinCostStates thisMatrix (V.tail childVect) bestCost numStates (childState + 1) ((childStateCost, childState) : currentBestStates) stateIndex - else getMinCostStates thisMatrix (V.tail childVect) childStateCost numStates (childState + 1) [(childStateCost, childState)] stateIndex - -- ) - - --- | getNewVector takes the vector of states and costs from the child nodes and the --- cost matrix and calculates a new vector n^2 in states -getNewVector :: S.Matrix Int -> Int -> (V.Vector MatrixTriple, V.Vector MatrixTriple) -> V.Vector MatrixTriple -getNewVector thisMatrix numStates (lChild, rChild) = - let newStates = [0..(numStates -1)] - leftPairs = fmap (getMinCostStates thisMatrix lChild (maxBound :: Int) numStates 0 []) newStates - rightPairs = fmap (getMinCostStates thisMatrix rChild (maxBound :: Int) numStates 0 []) newStates - stateCosts = zipWith (+) (fmap (fst . head) leftPairs) (fmap (fst . head) rightPairs) - newStateTripleList = zip3 stateCosts (fmap (fmap snd) leftPairs) (fmap (fmap snd) rightPairs) - in - V.fromList newStateTripleList - --- | addMatrix thisWeight thisMatrix firstVertChar secondVertChar matrix character --- assumes each character has same cost matrix --- Need to add approximation ala DO tcm lookup later --- Local and global costs are based on current not necessaril;y optimal minimum cost states -addMatrix :: Double -> S.Matrix Int -> CharacterData -> CharacterData -> CharacterData -addMatrix thisWeight thisMatrix firstVertChar secondVertChar = - if null thisMatrix then error "Null cost matrix in addMatrix" - else - let numStates = length thisMatrix - initialMatrixVector = getNewVector thisMatrix numStates <$> V.zip (matrixStatesPrelim firstVertChar) (matrixStatesPrelim secondVertChar) - initialCostVector = fmap (V.minimum . fmap fst3) initialMatrixVector - newCost = thisWeight * fromIntegral (V.sum initialCostVector) - newCharacter = emptyCharacter { matrixStatesPrelim = initialMatrixVector - , localCost = newCost - globalCost firstVertChar - globalCost secondVertChar - , globalCost = newCost - } - in - --trace ("Matrix: " ++ (show newCost) ++ "\n\t" ++ (show $ matrixStatesPrelim firstVertChar) ++ "\n\t" ++ (show $ matrixStatesPrelim secondVertChar) ++ - -- "\n\t" ++ (show initialMatrixVector) ++ "\n\t" ++ (show initialCostVector)) - newCharacter - --- | getUnionVector takes the vector of states and costs from two nodes --- and sets the states with min cost in the two vertices and maxBound in other states -getUnionVector :: S.Matrix Int -> Int -> (V.Vector MatrixTriple, V.Vector MatrixTriple) -> V.Vector MatrixTriple -getUnionVector thisMatrix numStates (lChild, rChild) = - let newStates = [0..(numStates -1)] - leftPairs = fmap (getMinCostStates thisMatrix lChild (maxBound :: Int) numStates 0 []) newStates - rightPairs = fmap (getMinCostStates thisMatrix rChild (maxBound :: Int) numStates 0 []) newStates - stateCosts = zipWith (+) (fmap (fst . head) leftPairs) (fmap (fst . head) rightPairs) - minStateCost = minimum stateCosts - stateCosts' = fmap (minOrMax minStateCost) stateCosts - newStateTripleList = zip3 stateCosts' (fmap (fmap snd) leftPairs) (fmap (fmap snd) rightPairs) - in - V.fromList newStateTripleList - where minOrMax minVal curVal = if curVal == minVal then minVal - else maxBound :: Int - --- | unionMatrix thisMatrix firstVertChar secondVertChar matrix character --- assumes each character has same cost matrix --- Need to add approximation ala DO tcm lookup later --- Local and global costs are based on current not necessaril;y optimal minimum cost states -unionMatrix :: S.Matrix Int -> CharacterData -> CharacterData -> CharacterData -unionMatrix thisMatrix firstVertChar secondVertChar = - if null thisMatrix then error "Null cost matrix in addMatrix" - else - let numStates = length thisMatrix - initialMatrixVector = getUnionVector thisMatrix numStates <$> V.zip (matrixStatesFinal firstVertChar) (matrixStatesFinal secondVertChar) - newCharacter = emptyCharacter { matrixStatesPrelim = initialMatrixVector - , matrixStatesFinal = initialMatrixVector - } - in - --trace ("Matrix: " ++ (show newCost) ++ "\n\t" ++ (show $ matrixStatesPrelim firstVertChar) ++ "\n\t" ++ (show $ matrixStatesPrelim secondVertChar) ++ - -- "\n\t" ++ (show initialMatrixVector) ++ "\n\t" ++ (show initialCostVector)) - newCharacter - --- | pairwiseDO is a wrapper around slim/wise/hugeParwiseDO to allow direct call and return of --- DO medians and cost. This is used in final state assignment -pairwiseDO :: CharInfo - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter, Double) -pairwiseDO charInfo (slim1, wide1, huge1) (slim2, wide2, huge2) = - let thisType = charType charInfo - in - if thisType `elem` [SlimSeq, NucSeq] then - let (cost, r) = slimPairwiseDO (slimTCM charInfo) slim1 slim2 - in - --trace ("pDO:" ++ (show (GV.length $ fst3 slim1)) ++ " " ++ (show (GV.length $ fst3 slim2))) - (r, mempty, mempty, weight charInfo * fromIntegral cost) - - else if thisType `elem` [WideSeq, AminoSeq] then - let coefficient = MR.minInDelCost (wideTCM charInfo) - (cost, r) = widePairwiseDO coefficient (MR.retreivePairwiseTCM $ wideTCM charInfo) wide1 wide2 - in - (mempty, r, mempty, weight charInfo * fromIntegral cost) - - else if thisType == HugeSeq then - let coefficient = MR.minInDelCost (hugeTCM charInfo) - (cost, r) = hugePairwiseDO coefficient (MR.retreivePairwiseTCM $ hugeTCM charInfo) huge1 huge2 - in - (mempty, mempty, r, weight charInfo * fromIntegral cost) - - else error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch" ] - - - - --- | getDOMedianCharInfo is a wrapper around getDOMedian with CharInfo-based interface -getDOMedianCharInfo :: CharInfo -> CharacterData -> CharacterData -> CharacterData -getDOMedianCharInfo charInfo = getDOMedian (weight charInfo) (costMatrix charInfo) (slimTCM charInfo) (wideTCM charInfo) (hugeTCM charInfo) (charType charInfo) - --- | getDOMedian calls appropriate pairwise DO to create sequence median after some type wrangling --- works on preliminary states -getDOMedian - :: Double - -> S.Matrix Int - -> TCMD.DenseTransitionCostMatrix - -> MR.MetricRepresentation Word64 - -> MR.MetricRepresentation BV.BitVector - -> CharType - -> CharacterData - -> CharacterData - -> CharacterData -getDOMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType leftChar rightChar - | null thisMatrix = error "Null cost matrix in getDOMedian" - | thisType `elem` [SlimSeq, NucSeq] = newSlimCharacterData - | thisType `elem` [WideSeq, AminoSeq] = newWideCharacterData - | thisType == HugeSeq = newHugeCharacterData - | otherwise = error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch" ] - where - blankCharacterData = emptyCharacter - - newSlimCharacterData = - let newCost = thisWeight * fromIntegral cost - subtreeCost = sum [ newCost, globalCost leftChar, globalCost rightChar] - (cost, r) = slimPairwiseDO - thisSlimTCM (slimGapped leftChar) (slimGapped rightChar) - in blankCharacterData - { slimPrelim = extractMedians r - , slimGapped = r - , localCostVect = V.singleton $ fromIntegral cost - , localCost = newCost - , globalCost = subtreeCost - } - - newWideCharacterData = - let newCost = thisWeight * fromIntegral cost - coefficient = MR.minInDelCost thisWideTCM - subtreeCost = sum [ newCost, globalCost leftChar, globalCost rightChar] - (cost, r) = widePairwiseDO - coefficient - (MR.retreivePairwiseTCM thisWideTCM) - (wideGapped leftChar) (wideGapped rightChar) - in blankCharacterData - { widePrelim = extractMedians r - , wideGapped = r - , localCostVect = V.singleton $ fromIntegral cost - , localCost = newCost - , globalCost = subtreeCost - } - - newHugeCharacterData = - let newCost = thisWeight * fromIntegral cost - coefficient = MR.minInDelCost thisHugeTCM - subtreeCost = newCost + globalCost leftChar + globalCost rightChar - (cost, r) = hugePairwiseDO - coefficient - (MR.retreivePairwiseTCM thisHugeTCM) - (hugeGapped leftChar) (hugeGapped rightChar) - in blankCharacterData - { hugePrelim = extractMedians r - , hugeGapped = r - , localCostVect = V.singleton $ fromIntegral cost - , localCost = newCost - , globalCost = subtreeCost - } - --- | getPrealignedUnion calls appropriate pairwise function to create sequence union of final states --- for prealigned states -getPrealignedUnion :: CharType - -> CharacterData - -> CharacterData - -> CharacterData -getPrealignedUnion thisType leftChar rightChar = - let blankCharacterData = emptyCharacter - in - if thisType == AlignedSlim then - let finalUnion = GV.zipWith (.|.) (alignedSlimFinal leftChar) (alignedSlimFinal rightChar) - prelimState = (finalUnion, finalUnion, finalUnion) - in - blankCharacterData { alignedSlimPrelim = prelimState - , alignedSlimFinal = finalUnion - } - else if thisType == AlignedWide then - let finalUnion = GV.zipWith (.|.) (alignedWideFinal leftChar) (alignedWideFinal rightChar) - prelimState = (finalUnion, finalUnion, finalUnion) - in - blankCharacterData { alignedWidePrelim = prelimState - , alignedWideFinal = finalUnion - } - - else if thisType == AlignedHuge then - let finalUnion = GV.zipWith (.|.) (alignedHugeFinal leftChar) (alignedHugeFinal rightChar) - prelimState = (finalUnion, finalUnion, finalUnion) - in - blankCharacterData { alignedHugePrelim = prelimState - , alignedHugeFinal = finalUnion - } - - else error ("Unrecognised character type '" ++ (show thisType) ++ "' in getPrealignedUnion") - - --- | getDynamicUnion calls appropriate pairwise function to create sequence median after some type wrangling --- if using IA--takes IAFInal for each node, creates union of IAFinals states --- if DO then calculated DO medians and takes union of left and right states --- gaps need to be fitered if DO used later (as in Wagner), or as in SPR/TBR rearragement --- sets final and IA states for Swap delta heuristics -getDynamicUnion :: Bool - -> Bool - -> CharType - -> CharacterData - -> CharacterData - -> TCMD.DenseTransitionCostMatrix - -> MR.MetricRepresentation Word64 - -> MR.MetricRepresentation BV.BitVector - -> CharacterData -getDynamicUnion doIA filterGaps thisType leftChar rightChar thisSlimTCM thisWideTCM thisHugeTCM - | thisType `elem` [SlimSeq, NucSeq] = newSlimCharacterData - | thisType `elem` [WideSeq, AminoSeq] = newWideCharacterData - | thisType == HugeSeq = newHugeCharacterData - | otherwise = error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch" ] - where - blankCharacterData = emptyCharacter - - newSlimCharacterData = - let r = if doIA then GV.zipWith (.|.) (slimIAFinal leftChar) (slimIAFinal rightChar) - else - let (_, (lG, _, rG)) = slimPairwiseDO thisSlimTCM (makeDynamicCharacterFromSingleVector $ slimFinal leftChar) (makeDynamicCharacterFromSingleVector $ slimFinal rightChar) - in - GV.zipWith (.|.) lG rG - - r' = if filterGaps then GV.filter (/= (bit gapIndex)) r - else r - - in blankCharacterData { slimPrelim = r' - , slimGapped = (r', r',r') - , slimFinal = r' - , slimIAPrelim = (r, r, r) - , slimIAFinal = r - } - - newWideCharacterData = - let r = if doIA then GV.zipWith (.|.) (wideIAFinal leftChar) (wideIAFinal rightChar) - else - let coefficient = MR.minInDelCost thisWideTCM - (_, (lG, _, rG)) = widePairwiseDO coefficient (MR.retreivePairwiseTCM thisWideTCM) (makeDynamicCharacterFromSingleVector $ wideFinal leftChar) (makeDynamicCharacterFromSingleVector $ wideFinal rightChar) - in - GV.zipWith (.|.) lG rG - -- r = GV.zipWith (.|.) (wideIAFinal leftChar) (wideIAFinal rightChar) - - r' = if filterGaps then GV.filter (/= (bit gapIndex)) r - else r - - in blankCharacterData { widePrelim = r' - , wideGapped = (r', r',r') - , wideFinal = r' - , wideIAPrelim = (r, r, r) - , wideIAFinal = r - } - - newHugeCharacterData = - let r = if doIA then GV.zipWith (.|.) (hugeIAFinal leftChar) (hugeIAFinal rightChar) - else - let coefficient = MR.minInDelCost thisHugeTCM - (_, (lG, _, rG)) = hugePairwiseDO coefficient (MR.retreivePairwiseTCM thisHugeTCM) (makeDynamicCharacterFromSingleVector $ hugeFinal leftChar) (makeDynamicCharacterFromSingleVector $ hugeFinal rightChar) - in - GV.zipWith (.|.) lG rG - - -- r = GV.zipWith (.|.) (hugeIAFinal leftChar) (hugeIAFinal rightChar) - - r' = if filterGaps then GV.filter (/= (bit gapIndex)) r - else r - - in blankCharacterData { hugePrelim = r' - , hugeGapped = (r', r',r') - , hugeFinal = r' - , hugeIAPrelim = (r, r, r) - , hugeIAFinal = r - } - --- | union2 takes the vectors of characters and applies union2Single to each character --- used for edge states in buikd and rearrangement -union2 :: Bool -> Bool -> V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector CharacterData -union2 doIA filterGaps = V.zipWith3 (union2Single doIA filterGaps) - --- | union2Single takes character data and returns union character data --- union2Single assumes that the character vectors in the various states are the same length --- that is--all leaves (hence other vertices later) have the same number of each type of character --- used IAFinal states for dynamic characters --- used in heurstic graph build and rearrangement -union2Single :: Bool -> Bool -> CharacterData -> CharacterData -> CharInfo -> CharacterData -union2Single doIA filterGaps firstVertChar secondVertChar inCharInfo = - let thisType = charType inCharInfo - thisMatrix = costMatrix inCharInfo - thisActive = activity inCharInfo - thisSlimTCM = slimTCM inCharInfo - thisWideTCM = wideTCM inCharInfo - thisHugeTCM = hugeTCM inCharInfo - in - if not thisActive then firstVertChar - else if thisType == Add then - intervalUnion firstVertChar secondVertChar - - else if thisType == NonAdd then - localUnion firstVertChar secondVertChar - - else if thisType `elem` packedNonAddTypes then - BP.unionPacked firstVertChar secondVertChar - - else if thisType == Matrix then - unionMatrix thisMatrix firstVertChar secondVertChar - - else if thisType `elem` prealignedCharacterTypes then - getPrealignedUnion thisType firstVertChar secondVertChar - - else if thisType `elem` nonExactCharacterTypes then - getDynamicUnion doIA filterGaps thisType firstVertChar secondVertChar thisSlimTCM thisWideTCM thisHugeTCM - - else error ("Character type " ++ show thisType ++ " unrecognized/not implemented") - --- | makeEdgeData takes and edge and makes the VertData for the edge from the union of the two vertices -makeEdgeData :: Bool -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> LG.LEdge b -> VertexBlockData -makeEdgeData doIA inGraph charInfoVV (eNode, vNode, _) = - let eNodeVertData = vertData $ fromJust $ LG.lab inGraph eNode - vNodeVertData = vertData $ fromJust $ LG.lab inGraph vNode - in - createEdgeUnionOverBlocks doIA (not doIA) eNodeVertData vNodeVertData charInfoVV [] - --- | createEdgeUnionOverBlocks creates the union of the final states characters on an edge --- The function takes data in blocks and block vector of char info and --- extracts the triple for each block and creates new block data --- this is used for delta's in edge invastion in Wagner and SPR/TBR --- filter gaps for using with DO (flterGaps = True) or IA (filterGaps = False) -createEdgeUnionOverBlocks :: Bool - -> Bool - -> VertexBlockData - -> VertexBlockData - -> V.Vector (V.Vector CharInfo) - -> [V.Vector CharacterData] - -> V.Vector (V.Vector CharacterData) -createEdgeUnionOverBlocks doIA filterGaps leftBlockData rightBlockData blockCharInfoVect curBlockData = - if V.null leftBlockData then - --trace ("Blocks: " ++ (show $ length curBlockData) ++ " Chars B0: " ++ (show $ V.map snd $ head curBlockData)) - V.fromList $ reverse curBlockData - else - let leftBlockLength = length $ V.head leftBlockData - rightBlockLength = length $ V.head rightBlockData - -- firstBlock = V.zip3 (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) - - -- missing data cases first or zip defaults to zero length - firstBlockMedian - | (leftBlockLength == 0) = V.head rightBlockData - | (rightBlockLength == 0) = V.head leftBlockData - | otherwise = union2 doIA filterGaps (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) - in - createEdgeUnionOverBlocks doIA filterGaps (V.tail leftBlockData) (V.tail rightBlockData) (V.tail blockCharInfoVect) (firstBlockMedian : curBlockData) - --- | getPreAligned2Median takes prealigned character types (AlignedSlim, AlignedWide, AlignedHuge) and returns 2-median and cost --- uses IA-type functions for slim/wide/huge -getPreAligned2Median :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> CharacterData -getPreAligned2Median charInfo nodeChar leftChar rightChar = - let characterType = charType charInfo - in - if characterType == AlignedSlim then - let (prelimChar, cost) = get2WaySlim (slimTCM charInfo) (extractMediansGapped $ alignedSlimPrelim leftChar) (extractMediansGapped $ alignedSlimPrelim rightChar) - in - -- trace ("GPA2M: " ++ (show $ GV.length prelimChar)) - nodeChar { alignedSlimPrelim = (extractMediansGapped $ alignedSlimPrelim leftChar, prelimChar, extractMediansGapped $ alignedSlimPrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral cost) - , globalCost = sum [ (weight charInfo) * (fromIntegral cost), globalCost leftChar, globalCost rightChar] - } - - else if characterType == AlignedWide then - let (prelimChar, cost) = get2WayWideHuge (wideTCM charInfo) (extractMediansGapped $ alignedWidePrelim leftChar) (extractMediansGapped $ alignedWidePrelim rightChar) - in - nodeChar { alignedWidePrelim = (extractMediansGapped $ alignedWidePrelim leftChar, prelimChar, extractMediansGapped $ alignedWidePrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral cost) - , globalCost = sum [ (weight charInfo) * (fromIntegral cost), globalCost leftChar, globalCost rightChar] - } - - else if characterType == AlignedHuge then - let (prelimChar, cost) = get2WayWideHuge (hugeTCM charInfo) (extractMediansGapped $ alignedHugePrelim leftChar) (extractMediansGapped $ alignedHugePrelim rightChar) - in - nodeChar { alignedHugePrelim = (extractMediansGapped $ alignedHugePrelim leftChar, prelimChar, extractMediansGapped $ alignedHugePrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral cost) - , globalCost = sum [ (weight charInfo) * (fromIntegral cost), globalCost leftChar, globalCost rightChar] - } - - else error ("Unrecognized character type " ++ show characterType) - - - --- | makeIAPrelimCharacter takes two characters and performs 2-way assignment --- based on character type and nodeChar--only IA fields are modified -makeIAPrelimCharacter :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> CharacterData -makeIAPrelimCharacter charInfo nodeChar leftChar rightChar = - let characterType = charType charInfo - in - - if characterType `elem` [SlimSeq, NucSeq] then - let (prelimChar, cost) = get2WaySlim (slimTCM charInfo) (extractMediansGapped $ slimIAPrelim leftChar) (extractMediansGapped $ slimIAPrelim rightChar) - in - -- trace ("MPC: " ++ (show prelimChar) ++ "\nleft: " ++ (show $ extractMediansGapped $ slimIAPrelim leftChar) ++ "\nright: " ++ (show $ extractMediansGapped $ slimIAPrelim rightChar)) - nodeChar {slimIAPrelim = (extractMediansGapped $ slimIAPrelim leftChar - , prelimChar, extractMediansGapped $ slimIAPrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral cost) - , globalCost = sum [ (weight charInfo) * (fromIntegral cost), globalCost leftChar, globalCost rightChar] - } - else if characterType `elem` [WideSeq, AminoSeq] then - let (prelimChar, minCost) = get2WayWideHuge (wideTCM charInfo) (extractMediansGapped $ wideIAPrelim leftChar) (extractMediansGapped $ wideIAPrelim rightChar) - in - nodeChar {wideIAPrelim = (extractMediansGapped $ wideIAPrelim leftChar - , prelimChar, extractMediansGapped $ wideIAPrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral minCost) - , globalCost = sum [ (weight charInfo) * (fromIntegral minCost), globalCost leftChar, globalCost rightChar] - } - else if characterType == HugeSeq then - let (prelimChar, minCost) = get2WayWideHuge (hugeTCM charInfo) (extractMediansGapped $ hugeIAPrelim leftChar) (extractMediansGapped $ hugeIAPrelim rightChar) - in - nodeChar {hugeIAPrelim = (extractMediansGapped $ hugeIAPrelim leftChar - , prelimChar, extractMediansGapped $ hugeIAPrelim rightChar) - , localCost = (weight charInfo) * (fromIntegral minCost) - , globalCost = sum [ (weight charInfo) * (fromIntegral minCost), globalCost leftChar, globalCost rightChar] - } - else error ("Unrecognized character type " ++ show characterType) - - --- | makeIAFinalCharacterStaticIA takes two characters and performs 2-way assignment --- based on character type and nodeChar--only IA fields are modified --- this pulls from current node for left and right states -makeIAFinalCharacter :: AssignmentMethod -> CharInfo -> CharacterData -> CharacterData-> CharacterData -makeIAFinalCharacter finalMethod charInfo nodeChar parentChar = - let characterType = charType charInfo - in - if characterType `elem` [SlimSeq, NucSeq] then - let finalIAChar = getFinal3WaySlim (slimTCM charInfo) (slimIAFinal parentChar) (extractMediansLeftGapped $ slimIAPrelim nodeChar) (extractMediansRightGapped $ slimIAPrelim nodeChar) - finalChar = if finalMethod == ImpliedAlignment then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar - else slimFinal nodeChar - in - nodeChar { slimIAFinal = finalIAChar - , slimFinal = finalChar - } - else if characterType `elem` [WideSeq, AminoSeq] then - let finalIAChar = getFinal3WayWideHuge (wideTCM charInfo) (wideIAFinal parentChar) (extractMediansLeftGapped $ wideIAPrelim nodeChar) (extractMediansRightGapped $ wideIAPrelim nodeChar) - finalChar = if finalMethod == ImpliedAlignment then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar - else wideFinal nodeChar - in - nodeChar { wideIAFinal = finalIAChar - , wideFinal = finalChar - } - else if characterType == HugeSeq then - let finalIAChar = getFinal3WayWideHuge (hugeTCM charInfo) (hugeIAFinal parentChar) (extractMediansLeftGapped $ hugeIAPrelim nodeChar) (extractMediansRightGapped $ hugeIAPrelim nodeChar) - finalChar = if finalMethod == ImpliedAlignment then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar - else hugeFinal nodeChar - in - nodeChar { hugeIAFinal = finalIAChar - , hugeFinal = finalChar - } - else error ("Unrecognized character type " ++ show characterType) - - - - --- | get2WaySlim takes two slim vectors an produces a preliminary median -get2WayGeneric :: (FiniteBits e, GV.Vector v e) => (e -> e -> (e, Word)) -> v e -> v e -> (v e, Word) -get2WayGeneric tcm descendantLeftPrelim descendantRightPrelim = - let len = GV.length descendantLeftPrelim - vt = V.generate len $ \i -> tcm (descendantLeftPrelim GV.! i) (descendantRightPrelim GV.! i) -- :: V.Vector (CUInt, Word) - gen v = let med i = fst $ v V.! i in GV.generate len med - add = V.foldl' (\x e -> x + snd e) 0 - in (,) <$> gen <*> add $ vt - - --- | get2WaySlim takes two slim vectors an produces a preliminary median -get2WaySlim :: TCMD.DenseTransitionCostMatrix -> SV.Vector CUInt -> SV.Vector CUInt -> (SV.Vector CUInt, Word) -get2WaySlim lSlimTCM = get2WayGeneric (TCMD.lookupPairwise lSlimTCM) - - --- | get2WayWideHuge like get2WaySlim but for wide and huge characters -get2WayWideHuge :: (FiniteBits a, GV.Vector v a) => MR.MetricRepresentation a -> v a -> v a -> (v a, Word) -get2WayWideHuge whTCM = get2WayGeneric (MR.retreivePairwiseTCM whTCM) - --- | getFinal3Way takes parent final assignment (including indel characters) and descendent --- preliminary gapped assingment from postorder and creates a gapped final assignment based on --- minimum cost median for the three inputs. THis is done to preserve the ImpliedAlignment --- information to create a final assingment with out an additional DO call to keep the --- creation linear in sequence length. Since gaps remain--they must be filtered when output or --- used as true final sequence assignments using M.createUngappedMedianSequence -getFinal3WaySlim :: TCMD.DenseTransitionCostMatrix -> SV.Vector CUInt -> SV.Vector CUInt -> SV.Vector CUInt -> SV.Vector CUInt -getFinal3WaySlim lSlimTCM parentFinal descendantLeftPrelim descendantRightPrelim = - let newFinal = SV.zipWith3 (local3WaySlim lSlimTCM) parentFinal descendantLeftPrelim descendantRightPrelim - in - newFinal - --- | getFinal3WayWideHuge like getFinal3WaySlim but for wide and huge characters -getFinal3WayWideHuge :: (FiniteBits a, GV.Vector v a) => MR.MetricRepresentation a -> v a -> v a -> v a -> v a -getFinal3WayWideHuge whTCM parentFinal descendantLeftPrelim descendantRightPrelim = - let newFinal = GV.zipWith3 (local3WayWideHuge whTCM) parentFinal descendantLeftPrelim descendantRightPrelim - in - newFinal - --- | local3WayWideHuge takes tripples for wide and huge sequence types and returns median -local3WayWideHuge :: (FiniteBits a) => MR.MetricRepresentation a -> a -> a -> a -> a -local3WayWideHuge lWideTCM b c d = - let -- b' = if b == zeroBits then gap else b - -- c' = if c == zeroBits then gap else c - -- d' = if d == zeroBits then gap else d - (median, _) = MR.retreiveThreewayTCM lWideTCM b c d - in - -- trace ((show b) ++ " " ++ (show c) ++ " " ++ (show d) ++ " => " ++ (show median)) - median - --- | local3WaySlim takes triple of CUInt and retuns median -local3WaySlim :: TCMD.DenseTransitionCostMatrix -> CUInt -> CUInt -> CUInt -> CUInt -local3WaySlim lSlimTCM b c d = - -- trace ("L3WS: " ++ (show (b,c,d))) ( - let -- b' = if b == zeroBits then gap else b - -- c' = if c == zeroBits then gap else c - -- d' = if d == zeroBits then gap else d - in - -- trace ("L3WS: " ++ (show (b',c',d'))) ( - let (median, _) = TCMD.lookupThreeway lSlimTCM b c d - in - -- trace ("3way: " ++ (show b) ++ " " ++ (show c) ++ " " ++ (show d) ++ " => " ++ (show (median, cost))) - median - -- ) - - -- | generalSequenceDiff takes two sequence elemental bit types and retuns min and max integer --- cost differences using matrix values --- if value has no bits on--it is set to 0th bit on for GAP -generalSequenceDiff :: (Show a, FiniteBits a) => S.Matrix Int -> Int -> a -> a -> (Int, Int) -generalSequenceDiff thisMatrix numStates uState vState = - -- trace ("GSD: " ++ (show (numStates, uState, vState))) ( - let uState' = if (popCount uState == 0) then bit gapIndex else uState - vState' = if (popCount vState == 0) then bit gapIndex else vState - uStateList = fmap snd $ filter ((== True).fst) $ zip (fmap (testBit uState') [0.. numStates - 1]) [0.. numStates - 1] - vStateList = fmap snd $ filter ((== True).fst) $ zip (fmap (testBit vState') [0.. numStates - 1]) [0.. numStates - 1] - uvCombinations = cartProd uStateList vStateList - costOfPairs = fmap (thisMatrix S.!) uvCombinations - in - -- trace ("GSD: " ++ (show uStateList) ++ " " ++ (show vStateList) ++ " min " ++ (show $ minimum costOfPairs) ++ " max " ++ (show $ maximum costOfPairs)) - (minimum costOfPairs, maximum costOfPairs) - -- ) - - - - diff --git a/pkg/PhyGraph/GraphOptimization/PostOrderFunctions.hs b/pkg/PhyGraph/GraphOptimization/PostOrderFunctions.hs deleted file mode 100644 index 3e605b9ea..000000000 --- a/pkg/PhyGraph/GraphOptimization/PostOrderFunctions.hs +++ /dev/null @@ -1,480 +0,0 @@ -{- | -Module : PostOrderFunctions.hs -Description : Module specifying post-order graph functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -ToDo: - Add parallel optimization overblocks and characters? --} - - -module GraphOptimization.PostOrderFunctions ( rerootPhylogeneticGraph - , rerootPhylogeneticGraph' - , rerootPhylogeneticNetwork - , rerootPhylogeneticNetwork' - , createVertexDataOverBlocks - , createVertexDataOverBlocksStaticIA - , updateDisplayTreesAndCost - ) where - -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified Graphs.GraphOperations as GO -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import Control.Parallel.Strategies -import qualified ParallelUtilities as PU -import qualified GraphOptimization.PostOrderSoftWiredFunctions as POSW -import Debug.Trace - --- | updateDisplayTreesAndCost takes a softwired graph and updates --- display trees and graph cost based on resolutions at root -updateDisplayTreesAndCost :: PhylogeneticGraph -> PhylogeneticGraph -updateDisplayTreesAndCost inGraph = - if LG.isEmpty $ thd6 inGraph then emptyPhylogeneticGraph - else - -- True for check popCount at root fort valid resolution (all leaves in graph) - let (_, outgroupRootLabel) = head $ LG.getRoots (thd6 inGraph) - (displayGraphVL, lDisplayCost) = POSW.extractDisplayTrees Nothing True (vertexResolutionData outgroupRootLabel) - in - --trace ("UDTC: " ++ show lDisplayCost) - (fst6 inGraph, lDisplayCost, thd6 inGraph, displayGraphVL, fft6 inGraph, six6 inGraph) - --- | reOptimizeNodes takes a decorated graph and a list of nodes and reoptimizes (relabels) --- them based on children in input graph --- simple recursive since each node depends on children --- remove check for debugging after it works --- check for out-degree 1, doesn't matter for trees however. -reOptimizeNodes :: GlobalSettings -> V.Vector (V.Vector CharInfo) -> DecoratedGraph -> [LG.LNode VertexInfo] -> DecoratedGraph -reOptimizeNodes inGS charInfoVectVect inGraph oldNodeList = - -- trace ("RON:" ++ (show $ fmap fst oldNodeList)) ( - if null oldNodeList then inGraph - else - -- make sure that nodes are optimized in correct order so that nodes are only reoptimized using updated children - -- this should really not have to happen--order should be determined a priori - let curNode@(curNodeIndex, curNodeLabel) = head oldNodeList - nodeChildren = LG.descendants inGraph curNodeIndex -- should be 1 or 2, not zero since all leaves already in graph - foundCurChildern = filter (`elem` nodeChildren) $ fmap fst (tail oldNodeList) - in - if LG.isLeaf inGraph curNodeIndex then trace ("Should not be a leaf in reoptimize nodes: " ++ (show curNodeIndex) ++ " children " ++ (show nodeChildren) ++ "\nGraph:\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) inGraph - - -- if node in multiple times due to network--put off optimizatin till last time - else if curNodeIndex `elem` (fmap fst $ tail oldNodeList) then reOptimizeNodes inGS charInfoVectVect inGraph (tail oldNodeList) - - else if not $ null foundCurChildern then - -- trace ("Current node " ++ (show curNodeIndex) ++ " has children " ++ (show nodeChildren) ++ " in optimize list (optimization order error)" ++ (show $ fmap fst $ tail oldNodeList)) - reOptimizeNodes inGS charInfoVectVect inGraph (tail oldNodeList ++ [curNode]) - - - -- somehow root before others -- remove if not needed after debug - else if LG.isRoot inGraph curNodeIndex && length oldNodeList > 1 then - error ("Root first :" ++ (show $ fmap fst oldNodeList) ++ "RC " ++ show (LG.descendants inGraph curNodeIndex)) -- ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) - --reOptimizeNodes inGS localGraphType charInfoVectVect inGraph ((tail oldNodeList) ++ [curNode]) - - else - -- trace ("RON: " ++ (show curNodeIndex) ++ " children " ++ (show nodeChildren)) ( - let leftChild = head nodeChildren - rightChild = last nodeChildren - -- leftChildLabel = fromJust $ LG.lab inGraph leftChild - -- rightChildLabel = fromJust $ LG.lab inGraph rightChild - - -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - (leftChildLabel, rightChildLabel) = U.leftRightChildLabelBV (fromJust $ LG.lab inGraph leftChild, fromJust $ LG.lab inGraph rightChild) - newVertexData = createVertexDataOverBlocksNonExact (vertData leftChildLabel) (vertData rightChildLabel) charInfoVectVect [] - -- newVertexData = createVertexDataOverBlocks (vertData leftChildLabel) (vertData rightChildLabel) charInfoVectVect [] - in - {- - --debug remove when not needed--checking to see if node should not be re optimized - if (sort nodeChildren) == (sort $ V.toList $ children curnodeLabel) then - trace ("Children for vertex unchanged " ++ (show curNodeIndex) - reOptimizeNodes localGraphType charInfoVectVect inGraph (tail oldNodeList) - else - -} - if (graphType inGS) == Tree || (graphType inGS) == HardWired then - let newCost = if length nodeChildren < 2 then 0 - else V.sum $ V.map V.sum $ V.map (V.map snd) newVertexData - newVertexLabel = VertexInfo { index = curNodeIndex - -- this bit labelling incorect for outdegree = 1, need to prepend bits - , bvLabel = if length nodeChildren < 2 then bvLabel leftChildLabel - else bvLabel leftChildLabel .|. bvLabel rightChildLabel - , parents = V.fromList $ LG.parents inGraph curNodeIndex - , children = V.fromList nodeChildren - , nodeType = nodeType curNodeLabel - , vertName = vertName curNodeLabel - , vertexResolutionData = mempty - , vertData = if length nodeChildren < 2 then vertData leftChildLabel - else V.map (V.map fst) newVertexData - , vertexCost = newCost - , subGraphCost = if length nodeChildren < 2 then subGraphCost leftChildLabel - else subGraphCost leftChildLabel + subGraphCost rightChildLabel + newCost - } - -- this to add back edges deleted with nodes (undocumented but sensible in fgl) - replacementEdges = LG.inn inGraph curNodeIndex ++ LG.out inGraph curNodeIndex - newGraph = LG.insEdges replacementEdges $ LG.insNode (curNodeIndex, newVertexLabel) $ LG.delNode curNodeIndex inGraph - in - --trace ("New vertexCost " ++ show newCost) -- ++ " lcn " ++ (show (vertData leftChildLabel, vertData rightChildLabel, vertData curnodeLabel))) - reOptimizeNodes inGS charInfoVectVect newGraph (tail oldNodeList) - - else if (graphType inGS) == SoftWired then - -- trace ("Reoptimizing " ++ (show curNodeIndex)) ( - -- single child of node (can certinly happen with soft-wired networks - if length nodeChildren == 1 then - --trace ("Out=1\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) ( - let (_,_, newVertexLabel, _, _) = POSW.getOutDegree1VertexAndGraph curNodeIndex leftChildLabel inGraph nodeChildren inGraph - - -- this to add back edges deleted with nodes (undocumented but sensible in fgl) - replacementEdges = LG.inn inGraph curNodeIndex ++ LG.out inGraph curNodeIndex - newGraph = LG.insEdges replacementEdges $ LG.insNode (curNodeIndex, newVertexLabel) $ LG.delNode curNodeIndex inGraph - in - reOptimizeNodes inGS charInfoVectVect newGraph (tail oldNodeList) - -- ) - - -- two children - else - -- trace ("Out=2 " ++ (show $ length nodeChildren)) ( - let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - -- larger bitvector is Right, smaller or equal Left - ((leftChild', leftChildLabel'), (rightChild', rightChildLabel')) = U.leftRightChildLabelBVNode ((leftChild, fromJust $ LG.lab inGraph leftChild), (rightChild, fromJust $ LG.lab inGraph rightChild)) - - -- create resolution caches for blocks - leftChildNodeType = nodeType leftChildLabel' - rightChildNodeType = nodeType rightChildLabel' - resolutionBlockVL = V.zipWith3 (POSW.createBlockResolutions (compressResolutions inGS) curNodeIndex leftChild' rightChild' leftChildNodeType rightChildNodeType (nodeType curNodeLabel)) (vertexResolutionData leftChildLabel') (vertexResolutionData rightChildLabel') charInfoVectVect - - -- create canonical Decorated Graph vertex - -- 0 cost becasue can't know cosrt until hit root and get best valid resolutions - newVertexLabel = VertexInfo { index = curNodeIndex - , bvLabel = bvLabel leftChildLabel' .|. bvLabel rightChildLabel' - , parents = V.fromList $ LG.parents inGraph curNodeIndex - , children = V.fromList nodeChildren - , nodeType = nodeType curNodeLabel - , vertName = T.pack $ "HTU" ++ show curNodeIndex - , vertData = mempty --empty because of resolution data - , vertexResolutionData = resolutionBlockVL - , vertexCost = 0.0 -- newCost - , subGraphCost = 0.0 -- (subGraphCost leftChildLabel) + (subGraphCost rightChildLabel) + newCost - } - - {- - leftEdgeType = if leftChildNodeType == NetworkNode then NetworkEdge - else if leftChildNodeType == LeafNode then PendantEdge - else TreeEdge - rightEdgeType = if rightChildNodeType == NetworkNode then NetworkEdge - else if rightChildNodeType == LeafNode then PendantEdge - else TreeEdge - - edgeLable = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - -} - - replacementEdges = LG.inn inGraph curNodeIndex ++ LG.out inGraph curNodeIndex - newGraph = LG.insEdges replacementEdges $ LG.insNode (curNodeIndex, newVertexLabel) $ LG.delNode curNodeIndex inGraph - in - -- trace ("Resolution Data: \n" ++ "left child\n" ++ (show $ vertexResolutionData leftChildLabel) ++ "\nright child\n" ++ (show $ vertexResolutionData rightChildLabel) - -- ++ "\nCur Node\n" ++ (show $ vertexResolutionData newVertexLabel)) - reOptimizeNodes inGS charInfoVectVect newGraph (tail oldNodeList) - - -- ) -- ) - - else errorWithoutStackTrace ("Graph type unrecognized/not yet implemented: " ++ show (graphType inGS)) - -- ) - - --- | createVertexDataOverBlocks is a partial application of generalCreateVertexDataOverBlocks with full (all charcater) median calculation -createVertexDataOverBlocks :: VertexBlockData - -> VertexBlockData - -> V.Vector (V.Vector CharInfo) - -> [V.Vector (CharacterData, VertexCost)] - -> V.Vector (V.Vector (CharacterData, VertexCost)) -createVertexDataOverBlocks = generalCreateVertexDataOverBlocks M.median2 - --- | createVertexDataOverBlocksNonExact is a partial application of generalCreateVertexDataOverBlocks with partial (non-exact charcater) median calculation -createVertexDataOverBlocksNonExact :: VertexBlockData - -> VertexBlockData - -> V.Vector (V.Vector CharInfo) - -> [V.Vector (CharacterData, VertexCost)] - -> V.Vector (V.Vector (CharacterData, VertexCost)) -createVertexDataOverBlocksNonExact = generalCreateVertexDataOverBlocks M.median2NonExact - --- | createVertexDataOverBlocksStaticIA is an application of generalCreateVertexDataOverBlocks with exact charcater median calculation --- and IA claculation for dynmaic characters--not full optimizations -createVertexDataOverBlocksStaticIA :: VertexBlockData - -> VertexBlockData - -> V.Vector (V.Vector CharInfo) - -> [V.Vector (CharacterData, VertexCost)] - -> V.Vector (V.Vector (CharacterData, VertexCost)) -createVertexDataOverBlocksStaticIA = generalCreateVertexDataOverBlocks M.median2StaticIA - - --- | generalCreateVertexDataOverBlocks is a genreal version for optimizing all (Add, NonAdd, Matrix) --- and only non-exact (basically sequence) characters based on the median function passed --- The function takes data in blocks and block vector of char info and --- extracts the triple for each block and creates new block data for parent node (usually) --- not checking if vectors are equal in length -generalCreateVertexDataOverBlocks :: (V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector (CharacterData, VertexCost)) - -> VertexBlockData - -> VertexBlockData - -> V.Vector (V.Vector CharInfo) - -> [V.Vector (CharacterData, VertexCost)] - -> V.Vector (V.Vector (CharacterData, VertexCost)) -generalCreateVertexDataOverBlocks medianFunction leftBlockData rightBlockData blockCharInfoVect curBlockData = - if V.null leftBlockData then - --trace ("Blocks: " ++ (show $ length curBlockData) ++ " Chars B0: " ++ (show $ V.map snd $ head curBlockData)) - V.fromList $ reverse curBlockData - else - let leftBlockLength = length $ V.head leftBlockData - rightBlockLength = length $ V.head rightBlockData - -- firstBlock = V.zip3 (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) - - -- missing data cases first or zip defaults to zero length - firstBlockMedian - | (leftBlockLength == 0) = V.zip (V.head rightBlockData) (V.replicate rightBlockLength 0) - | (rightBlockLength == 0) = V.zip (V.head leftBlockData) (V.replicate leftBlockLength 0) - | otherwise = medianFunction (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) - in - generalCreateVertexDataOverBlocks medianFunction (V.tail leftBlockData) (V.tail rightBlockData) (V.tail blockCharInfoVect) (firstBlockMedian : curBlockData) - - - --- | rerootPhylogeneticNetwork take a vertex index and reroots phylogenetic network --- roots as for tree, but when a netwrok edge is chosen as root edge, its 'sister' network edge --- is deleted (leaving two in=out=1 nodes), th ereooting takes place, --- and the edge is re-added in both directions. Nodes may change between --- tree and network. This is updated -rerootPhylogeneticNetwork :: GlobalSettings -> Int -> Int -> PhylogeneticGraph -> PhylogeneticGraph -rerootPhylogeneticNetwork inGS originalRootIndex rerootIndex inGraph@(inSimple, _, inDecGraph, _, _, _) = - if LG.isEmpty inSimple then inGraph - else - let newRootParents = LG.parents inSimple rerootIndex - newRootLabel = LG.lab inDecGraph rerootIndex - parentNodeLabel = LG.lab inDecGraph $ head newRootParents - in - - if isNothing newRootLabel then error ("New root has no label: " ++ show rerootIndex) - - else if isNothing parentNodeLabel then error ("Parent of new root has no label: " ++ show rerootIndex) - - -- OK to reroot - else - --if length newRootParents > 1 then trace ("Root on network edge") inGraph - --else - let isNetworkNode = nodeType (fromJust newRootLabel) == NetworkNode - parentIsNetworkNode = nodeType (fromJust parentNodeLabel) == NetworkNode - in - -- if (not isNetworkNode) && (not parentIsNetworkNode) then - rerootPhylogeneticGraph inGS isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inGraph - -- else inGraph - - - --- | rerootPhylogeneticNetwork' flipped version of rerootPhylogeneticNetwork -rerootPhylogeneticNetwork' :: GlobalSettings -> PhylogeneticGraph -> Int -> Int -> PhylogeneticGraph -rerootPhylogeneticNetwork' inGS inGraph originalRootIndex rerootIndex = rerootPhylogeneticNetwork inGS originalRootIndex rerootIndex inGraph - --- | rerootPhylogeneticGraph' flipped version of rerootPhylogeneticGraph -rerootPhylogeneticGraph' :: GlobalSettings -> Bool -> Bool -> PhylogeneticGraph -> Int -> Int -> PhylogeneticGraph -rerootPhylogeneticGraph' inGS isNetworkNode parentIsNetworkNode inGraph originalRootIndex rerootIndex = rerootPhylogeneticGraph inGS isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inGraph - --- | rerootGraph takes a phylogenetic graph and reroots based on a vertex index (usually leaf outgroup) --- if input is a forest then only roots the component that contains the vertex wil be rerooted --- unclear how will effect network edges--will need to verify that does not create cycles --- multi-rooted components (as opposed to forests) are unaffected with trace warning thrown --- after checking for existing root and multiroots, should be O(n) where 'n is the length --- of the path between the old and new root --- the PhyloGenetic graph is (minimally) reoptimized along the spine of edegs that are redirected --- in order that the root costr be correct. The block display forest (components are always trees--for soft-wired --- graphs only) is also rerooted, and --- Character foci set to the new root edge --- NB--Uses calls to rerootGraph since traversals are for different graphs so wouldn't save --- much time by consolidating--also since labels are all different--can't re-use alot of info --- from graph to graph. --- NNB only deals with post-order states -rerootPhylogeneticGraph :: GlobalSettings -> Bool -> Int -> Bool -> Int -> PhylogeneticGraph -> PhylogeneticGraph -rerootPhylogeneticGraph inGS isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inPhyGraph@(inSimple, _, inDecGraph, blockGraphVV, _, charInfoVectVect) = - if LG.isEmpty inSimple then inPhyGraph - -- else if inCost == 0 then error ("Input graph with cost zero--likely non decorated input graph in rerootPhylogeneticGraph\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inDecGraph)) - else - let -- decorated graph Boolean to specify that non-exact characters need to be reoptimized if affected - -- could just update with needges? from simple graph rerooting - (newDecGraph, touchedNodes) = if (graphType inGS) == Tree then (LG.rerootTree rerootIndex inDecGraph, []) - else if (graphType inGS) == SoftWired then rectifyGraphDecorated isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inDecGraph - else if (graphType inGS) == HardWired then - if (not . LG.cyclic) inSimple then - let (newDecGraph'', touchedNodes') = rectifyGraphDecorated isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inDecGraph - in - if (not . LG.cyclic) newDecGraph'' then (newDecGraph'', touchedNodes') - else (LG.empty, []) - else (LG.empty, []) - else error ("Error--Graph type unimplemented: " ++ (show (graphType inGS))) - - newSimpleGraph = GO.convertDecoratedToSimpleGraph newDecGraph - - - -- reoptimize nodes here - -- nodes on spine from new root to old root that needs to be reoptimized - -- THIS IS WRONG FOR SOFTWIRED--extra nodes for rectifying graph - (nodesToOptimize, _) = if (graphType inGS) == Tree then LG.pathToRoot inDecGraph (rerootIndex, fromJust $ LG.lab inDecGraph rerootIndex) - else if (graphType inGS) == SoftWired then (touchedNodes, []) - else if (graphType inGS) == HardWired then (touchedNodes, []) - else error ("Error--Graph type unimplemented: " ++ (show (graphType inGS))) - - -- this only reoptimizes non-exact characters since rerooting doesn't affect 'exact" character optimization' - newDecGraph' = reOptimizeNodes inGS charInfoVectVect newDecGraph nodesToOptimize -- (L.nub $ nodesToOptimize ++ touchedNodes) - -- newDecGraph' = reOptimizeNodes inGS charInfoVectVect newDecGraph nodesToOptimize (L.nub $ nodesToOptimize ++ touchedNodes) - - -- sum of root costs on Decorated graph - newGraphCost = sum $ fmap subGraphCost $ fmap snd $ LG.getRoots newDecGraph' - - -- rerooted display forests--don't care about costs--I hope (hence Bool False) - newblockGraphVV = if V.null blockGraphVV then mempty - --else fmap (fmap (GO.rerootTree rerootIndex)) blockGraphVV - else fmap (fmap (rectifyGraphDecorated' isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex)) blockGraphVV - - in - --trace ("rerootPhylogeneticGraph:\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inDecGraph) ++ "\nNew\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph newDecGraph) ( - - -- this for forbiden condition where rerooting a graph creates parent and child nodes boyth network - if (null touchedNodes) && ((graphType inGS) /= Tree) then emptyPhylogeneticGraph - else if newSimpleGraph == LG.empty then emptyPhylogeneticGraph - - -- Same root, so no need to redo - -- else if (length nodesToOptimize == 1) then inPhyGraph - else - - {- - trace ("New RootIndex :" ++ (show (rerootIndex, isNetworkNode, parentIsNetworkNode)) ++ " To optimize:" ++ (show $ fmap fst nodesToOptimize) ++ "\nOG:\n" - ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inDecGraph) ++ "\nRRG:" ++ ((LG.prettify $ GO.convertDecoratedToSimpleGraph newDecGraph))) ( - -} - -- (newSimpleGraph, newGraphCost, newDecGraph', newDecoratedGraphVect, V.replicate (length charInfoVectVect) (V.singleton newDecGraph'), charInfoVectVect) - if ((graphType inGS) == Tree || (graphType inGS) == HardWired) then (newSimpleGraph, newGraphCost, newDecGraph', newblockGraphVV, (snd $ POSW.divideDecoratedGraphByBlockAndCharacterTree newDecGraph'), charInfoVectVect) - else - -- get root resolutions and cost - let (displayGraphVL, lDisplayCost) = POSW.extractDisplayTrees (Just originalRootIndex) True (vertexResolutionData $ fromJust $ LG.lab newDecGraph' originalRootIndex) - in - (newSimpleGraph, lDisplayCost, newDecGraph', displayGraphVL, mempty, charInfoVectVect) - -- ) - - --- | rectifyGraphDecorated' wrapper around rectifyGraphDecorated for graph only -rectifyGraphDecorated' :: Bool -> Int -> Bool -> Int -> DecoratedGraph -> DecoratedGraph -rectifyGraphDecorated' isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inGraph = fst $ rectifyGraphDecorated isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inGraph - - --- | rectifyGraphDecorated 'fixes' (flips) edges where a network edge has be chosen as a reroot edge For Decorated Graph--thee should be able to be combined --- this can be abstracted if dummy graph set to some input edge label -rectifyGraphDecorated :: Bool -> Int -> Bool -> Int -> DecoratedGraph -> (DecoratedGraph, [LG.LNode VertexInfo]) -rectifyGraphDecorated isNetworkNode originalRootIndex parentIsNetworkNode rerootIndex inGraph = - if LG.isEmpty inGraph then (LG.empty, []) - else - -- sanity check of phylogenetic graph - if isNetworkNode || parentIsNetworkNode then - -- trace ("Graph with parent and child nodes network vertices--skipping reroot") - (LG.empty, []) - -- can't reroot on network node--can cause alot of problems - else - -- get nodes and edged on path from old to new root - let (nodePathToRoot, edgePathToRoot') = LG.pathToRoot inGraph (rerootIndex, fromJust $ LG.lab inGraph rerootIndex) - edgePathToRoot = fmap LG.toEdge edgePathToRoot' - origRootEdges = fmap LG.toEdge $ LG.out inGraph originalRootIndex - origVirtualRootEdge = if ((snd $ head origRootEdges) `elem` (fmap fst nodePathToRoot)) then GO.makeDummyLabEdge dummyEdge (snd $ head origRootEdges, snd $ last origRootEdges) - else GO.makeDummyLabEdge dummyEdge (snd $ last origRootEdges, snd $ head origRootEdges) - - -- arbitrarily chooses one of multiple parent vertices is network edge - -- dummy third field for new root edges - parentNewRoot = LG.parents inGraph rerootIndex - newRootEdge = (head parentNewRoot ,rerootIndex) - otherEdgesFromParentNewRoot = fmap LG.toEdge $ filter ((/= rerootIndex) . snd3) $ LG.out inGraph (head parentNewRoot) - newRootChildEdges = fmap (GO.makeDummyLabEdge dummyEdge) [(originalRootIndex, head parentNewRoot), (originalRootIndex, rerootIndex)] - - --make new graph--init here to not flip original edge to root - flippedEdgesToOldRoot = fmap (GO.makeDummyLabEdge dummyEdge) $ fmap LG.flipEdge $ edgePathToRoot L.\\ (newRootEdge : origRootEdges) - - --nodesTouchedFlippedEdgesIndices = (fmap fst3 flippedEdgesToOldRoot) `L.union` (fmap snd3 flippedEdgesToOldRoot) - --nodesTouchedFlippedLabels = fmap fromJust $ fmap (LG.lab inGraph) nodesTouchedFlippedEdgesIndices - - edgesToDelete = (newRootEdge : origRootEdges) ++ edgePathToRoot - - -- check if network edghe to be creted by rerooting is deleted - edgesToAddBack = fmap (GO.makeDummyLabEdge dummyEdge) $ filter (`elem` edgesToDelete) otherEdgesFromParentNewRoot - - edgesToInsert = ((origVirtualRootEdge : newRootChildEdges) ++ flippedEdgesToOldRoot ++ edgesToAddBack) L.\\ (fmap LG.flipLEdge edgesToAddBack) - - newGraph = LG.insEdges edgesToInsert $ LG.delEdges edgesToDelete inGraph - - -- check for HTU with outdegree 0 due to rerooting issues--could have nested networks - hasNetLeaf = True `elem` (fmap (LG.isNetworkLeaf newGraph) (LG.nodes newGraph)) - - -- get touched nodes - newRootNodeIndexList = [head parentNewRoot, rerootIndex] - newRootNodeLabelIndex = fmap (fromJust . LG.lab inGraph) newRootNodeIndexList - newRootNodeList = filter ((not . LG.isLeaf newGraph) . fst) $ zip newRootNodeIndexList newRootNodeLabelIndex - - nodesToReoptimize = nodePathToRoot `L.union` newRootNodeList - - - in - if length origRootEdges /= 2 then error ("Root does not have two children in rectifyGraphDecorated: " ++ (show origRootEdges)) - else if hasNetLeaf then - --trace ("Graph with HTU network vertex--skipping reroot") - (LG.empty, []) - - {- - else if LG.cyclic newGraph then - trace ("Cycle") - (LG.empty, []) - -} - else - {- - trace ("Original root edges:" ++ (show origRootEdges) - ++ " Insertions:" ++ (show (LG.toEdge origVirtualRootEdge, fmap LG.toEdge newRootChildEdges, fmap LG.toEdge flippedEdgesToOldRoot, fmap LG.toEdge edgesToAddBack)) - ++ "\nDeletions:" ++ (show ((newRootEdge, origRootEdges,edgePathToRoot)))) - -} - {- - if (length $ LG.getIsolatedNodes newGraph) > 0 || (length $ LG.getRoots newGraph) > 1 then - trace ("Isolated nodes: " ++ (show $ fmap fst $ LG.getIsolatedNodes newGraph) ++ " roots " ++ (show $ fmap fst $ LG.getRoots newGraph)) (newGraph, nodesToReoptimize) - else - -} - (newGraph, nodesToReoptimize) - diff --git a/pkg/PhyGraph/GraphOptimization/PostOrderSoftWiredFunctions.hs b/pkg/PhyGraph/GraphOptimization/PostOrderSoftWiredFunctions.hs deleted file mode 100644 index 6f1d70f09..000000000 --- a/pkg/PhyGraph/GraphOptimization/PostOrderSoftWiredFunctions.hs +++ /dev/null @@ -1,1441 +0,0 @@ -{- | -Module : PostOrderSoftWiredFunctions.hs -Description : Module specifying post-order softwiired graph functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -ToDo: - Add parallel optimization overblocks and characters? --} - - -module GraphOptimization.PostOrderSoftWiredFunctions ( updateAndFinalizePostOrderSoftWired - , postOrderSoftWiredTraversal - , postDecorateSoftWired' - , postDecorateSoftWired - , assignPostOrderToDisplayTree - , softWiredPostOrderTraceBack - , makeLeafGraphSoftWired - , getOutDegree1VertexAndGraph - , getOutDegree1VertexSoftWired - , getOutDegree2VertexSoftWired - , extractDisplayTrees - , createBlockResolutions - , makeCharacterGraph - , getAllResolutionList - , getBestResolutionListPair - , getDisplayBasedRerootSoftWired - , divideDecoratedGraphByBlockAndCharacterTree - ) where - -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified Graphs.GraphOperations as GO -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import Control.Parallel.Strategies -import qualified ParallelUtilities as PU --- import Debug.Debug -import Debug.Trace - - --- | getDisplayBasedRerootSoftWired takes a graph and generates reroot costs for each character of each block --- based on rerooting the display tree for that block. --- Written for soft-wired, but could be modified for tree (data split on vertdata not resolution data) --- this is a differnt approach from that of "tree" wher the decorated, canonical tree is rerooted and each character and block --- cost determined from that single rerooting --- this should help avoid the issue of rerooting complex, reticulate graphs and maintaining --- all the condition (cycles, time consistency etc) that occur. --- done correcly this should be able to be used for trees (all display trees same = cononical graph) as --- well as softwired, but not for hardwired where reticulations are maintained. - --- this can be modified for Tree data structres--basically by starting with vertdata initialiiy without --- resolutoin data trace back--shouls be more efficeinet in many was than existing code - --- Input didpay trees are for reproting only and do not contain actual character data so must be "pulled" --- from concinical Decorated graph (thd field) --- the list :[] stuff due to potential list of diaplsy trees not uemployed here -getDisplayBasedRerootSoftWired :: GraphType -> LG.Node -> PhylogeneticGraph -> PhylogeneticGraph -getDisplayBasedRerootSoftWired inGraphType rootIndex inPhyloGraph@(a,b,decGraph,_,_,f) = - if LG.isEmpty (fst6 inPhyloGraph) then inPhyloGraph - else - let -- update with pass to retrieve vert data from resolution data - -- Trfee allready has data in vertData field - (inSimpleGraph, _, inDecGraph, inBlockGraphV', inBlockCharGraphVV', charInfoVV) = if inGraphType == Tree then - let (displayTrees, charTrees) = divideDecoratedGraphByBlockAndCharacterTree decGraph - in - (a, b, decGraph, displayTrees, charTrees, f) - else updateAndFinalizePostOrderSoftWired (Just rootIndex) rootIndex inPhyloGraph - - -- purge double edges from display and charcater graphs - -- this should not be happening--issue with postorder network resolutions data - (inBlockGraphV, inBlockCharGraphVV) = if inGraphType == Tree then (inBlockGraphV', inBlockCharGraphVV') - else (fmap (fmap LG.removeDuplicateEdges) inBlockGraphV', fmap (fmap LG.removeDuplicateEdges) inBlockCharGraphVV') - - -- reroot block character trees - (newBlockDisplayTreeVect, newBlockCharGraphVV, blockCostV) = unzip3 (zipWith3 (rerootBlockCharTrees rootIndex) (V.toList $ fmap head inBlockGraphV) (V.toList inBlockCharGraphVV) (V.toList charInfoVV) `using` PU.myParListChunkRDS) -- not sure if should be parallelized `using` PU.myParListChunkRDS - - -- This is slower than myParListChunkRDS - -- (newBlockDisplayTreeVect, newBlockCharGraphVV, blockCostV) = unzip3 (PU.seqParMap rdeepseq (rerootBlockCharTrees' rootIndex) $ zip3 (V.toList $ fmap head inBlockGraphV) (V.toList inBlockCharGraphVV) (V.toList charInfoVV)) - - newCononicalGraph = backPortBlockTreeNodesToCanonicalGraph inDecGraph (V.fromList newBlockDisplayTreeVect) - in - -- trace ("GDBRS:" ++ "Dec graph:" ++ (LG.prettyIndices decGraph) ++ "\n" ++ (concatMap (++ "\nNew: ") $ fmap show $ fmap LG.getDuplicateEdges $ V.toList newBlockDisplayTreeVect)) - (inSimpleGraph, sum blockCostV, newCononicalGraph, V.fromList $ fmap (:[]) newBlockDisplayTreeVect, V.fromList newBlockCharGraphVV, charInfoVV) - - --- | rerootBlockCharTrees' wrapper for fmap/seqparmap -rerootBlockCharTrees' :: LG.Node -> (DecoratedGraph, V.Vector DecoratedGraph, V.Vector CharInfo) -> (DecoratedGraph, V.Vector DecoratedGraph, VertexCost) -rerootBlockCharTrees' rootIndex (blockDisplayTree, charTreeVect, charInfoVect) = rerootBlockCharTrees rootIndex blockDisplayTree charTreeVect charInfoVect - --- | rerootBlockCharTrees reroots all character trees (via fmap) in block returns best block char trees and costs --- with best character tree node assignment back ported to display tree -rerootBlockCharTrees ::LG.Node -> DecoratedGraph -> V.Vector DecoratedGraph -> V.Vector CharInfo -> (DecoratedGraph, V.Vector DecoratedGraph, VertexCost) -rerootBlockCharTrees rootIndex blockDisplayTree charTreeVect charInfoVect = - if V.null charTreeVect then error "Empty tree vector in rerootBlockCharTrees" - else - let -- next edges (to vertex in list) to perform rerroting - -- progresses recursivey over adjacent edges to minimize node reoptimization - -- since initially all same graph can get initial reroot nodes from display tree - childrenOfRoot = LG.descendants blockDisplayTree rootIndex - grandChildrenOfRoot = concatMap (LG.descendants blockDisplayTree) childrenOfRoot - - (rerootedCharTreeVect, rerootedCostVect) = unzip (zipWith (getCharTreeBestRoot rootIndex grandChildrenOfRoot) (V.toList charTreeVect) (V.toList charInfoVect) `using` PU.myParListChunkRDS) - - -- this is a little slower than myParListChunkRDS - -- (rerootedCharTreeVect, rerootedCostVect) = unzip (PU.seqParMap rdeepseq (getCharTreeBestRoot' rootIndex grandChildrenOfRoot) (zip (V.toList charTreeVect) (V.toList charInfoVect))) - - updateBlockDisplayTree = backPortCharTreeNodesToBlockTree blockDisplayTree (V.fromList rerootedCharTreeVect) - in - (updateBlockDisplayTree, V.fromList rerootedCharTreeVect, sum rerootedCostVect) - --- | getCharTreeBestRoot' wrapper for use with fmap/seqParMap -getCharTreeBestRoot' :: LG.Node -> [LG.Node] -> (DecoratedGraph, CharInfo) -> (DecoratedGraph, VertexCost) -getCharTreeBestRoot' rootIndex nodesToRoot (inCharacterGraph, charInfo) = getCharTreeBestRoot rootIndex nodesToRoot inCharacterGraph charInfo - --- | getCharTreeBestRoot takes the root index, a character tree (from a block) and its character info ---- and prerforms the rerootings of that character tree to get the best reroot cost and preliminary assignments -getCharTreeBestRoot :: LG.Node -> [LG.Node] -> DecoratedGraph -> CharInfo -> (DecoratedGraph, VertexCost) -getCharTreeBestRoot rootIndex nodesToRoot inCharacterGraph charInfo = - -- if prealigned should be rerooted? - let (bestRootCharGraph, bestRootCost) = if (charType charInfo `notElem` sequenceCharacterTypes) then (inCharacterGraph, (subGraphCost . snd) $ LG.labelNode inCharacterGraph rootIndex) - else rerootCharacterTree rootIndex nodesToRoot charInfo inCharacterGraph - in - (bestRootCharGraph, bestRootCost) - --- | rerootCharacterTree wrapper around rerootCharacterTree' with cleaner interface for "best" results -rerootCharacterTree :: LG.Node -> [LG.Node] -> CharInfo -> DecoratedGraph -> (DecoratedGraph, VertexCost) -rerootCharacterTree rootIndex nodesToRoot charInfo inCharacterGraph = - rerootCharacterTree' rootIndex nodesToRoot charInfo ((subGraphCost . snd) $ LG.labelNode inCharacterGraph rootIndex) inCharacterGraph inCharacterGraph - --- | rerootCharacterTree' takes a character tree and root index and returns best rooted character tree and cost --- this is recursive taking best cost to save on memory over an fmap and minimum --- since does reroot stuff over character trees--that component is less efficient --- root index always same--just edges conenct to change with rerooting --- graph is prgressively rerooted to be efficient -rerootCharacterTree' ::LG.Node -> [LG.Node] -> CharInfo -> VertexCost -> DecoratedGraph -> DecoratedGraph -> (DecoratedGraph, VertexCost) -rerootCharacterTree' rootIndex nodesToRoot charInfo bestCost bestGraph inGraph = - if null nodesToRoot then (bestGraph, bestCost) - else - let firstRerootIndex = head nodesToRoot - nextReroots = (LG.descendants inGraph firstRerootIndex) ++ tail nodesToRoot - newGraph = rerootAndDiagnoseTree rootIndex firstRerootIndex charInfo inGraph - newGraphCost = ((subGraphCost . snd) $ LG.labelNode newGraph rootIndex) - (bestGraph', bestCost') = if newGraphCost < bestCost then (newGraph, newGraphCost) - else (bestGraph, bestCost) - in - -- if LG.isEmpty newGraph then rerootCharacterTree' rootIndex (tail nodesToRoot) charInfo bestCost bestGraph inGraph - -- trace ("RRCT:" ++ (show (rootIndex, firstRerootIndex, bestCost, newGraphCost))) - --else - rerootCharacterTree' rootIndex nextReroots charInfo bestCost' bestGraph' newGraph - --- | rerootAndDiagnoseTree takes tree and reroots and reoptimizes nodes -rerootAndDiagnoseTree :: LG.Node -> LG.Node -> CharInfo -> DecoratedGraph -> DecoratedGraph -rerootAndDiagnoseTree rootIndex newRerootIndex charInfo inGraph = - let reRootGraph = LG.rerootDisplayTree rootIndex newRerootIndex inGraph - (nodesToOptimize, _) = LG.pathToRoot inGraph (LG.labelNode inGraph newRerootIndex) - reOptimizedGraph = reOptimizeCharacterNodes charInfo reRootGraph nodesToOptimize - in - if LG.isEmpty reRootGraph then inGraph - else reOptimizedGraph - - --- | reOptimizeCharacterNodes takes a decorated graph and a list of nodes and reoptimizes (relabels) --- them based on children in input graph --- simple recursive since each node depends on children --- check for out-degree 1 since can be resolved form diplay trees -reOptimizeCharacterNodes :: CharInfo -> DecoratedGraph -> [LG.LNode VertexInfo] -> DecoratedGraph -reOptimizeCharacterNodes charInfo inGraph oldNodeList = - -- trace ("RON:" ++ (show $ fmap fst oldNodeList)) ( - if null oldNodeList then inGraph - else - let curNode@(curNodeIndex, curNodeLabel) = head oldNodeList - nodeChildren = LG.descendants inGraph curNodeIndex -- should be 1 or 2, not zero since all leaves already in graph - foundCurChildern = filter (`elem` nodeChildren) $ fmap fst (tail oldNodeList) - in - {-These are checks that were in for network code--should be unncesesary for charactaer trees - -- make sure that nodes are optimized in correct order so that nodes are only reoptimized using updated children - -- this should really not have to happen--order should be determined a priori - -} - --if LG.isLeaf inGraph curNodeIndex then trace ("Should not be a leaf in reoptimize nodes: " ++ (show curNodeIndex) ++ " children " ++ (show nodeChildren) ++ "\nGraph:\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) inGraph - --else - - if not $ null foundCurChildern then - -- trace ("Current node " ++ (show curNodeIndex) ++ " has children " ++ (show nodeChildren) ++ " in optimize list (optimization order error)" ++ (show $ fmap fst $ tail oldNodeList)) - reOptimizeCharacterNodes charInfo inGraph (tail oldNodeList ++ [curNode]) - - -- somehow root before others -- remove if not needed after debug - --else if LG.isRoot inGraph curNodeIndex && length oldNodeList > 1 then - -- error ("Root first :" ++ (show $ fmap fst oldNodeList) ++ "RC " ++ show (LG.descendants inGraph curNodeIndex)) -- ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) - --reOptimizeNodes localGraphType charInfoVectVect inGraph ((tail oldNodeList) ++ [curNode]) - else if length nodeChildren > 2 then error ("Node has >2 children: " ++ (show nodeChildren)) - else - - -- trace ("RON: " ++ (show curNodeIndex) ++ " children " ++ (show nodeChildren)) ( - let leftChild = head nodeChildren - rightChild = last nodeChildren - - -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - (leftChildLabel, rightChildLabel) = U.leftRightChildLabelBV (fromJust $ LG.lab inGraph leftChild, fromJust $ LG.lab inGraph rightChild) - (newVertexData, newVertexCost) = M.median2Single False ((V.head . V.head . vertData) leftChildLabel) ((V.head . V.head . vertData) rightChildLabel) charInfo - - in - - let (newCost, newBVLabel, newVertData, newSubGraphCost) = if length nodeChildren < 2 then (0, bvLabel leftChildLabel, vertData leftChildLabel, subGraphCost leftChildLabel) - else (newVertexCost, bvLabel leftChildLabel .|. bvLabel rightChildLabel, V.singleton (V.singleton newVertexData), subGraphCost leftChildLabel + subGraphCost rightChildLabel + newCost) - newVertexLabel = VertexInfo { index = curNodeIndex - -- this bit labelling incorect for outdegree = 1, need to prepend bits - , bvLabel = newBVLabel - , parents = V.fromList $ LG.parents inGraph curNodeIndex - , children = V.fromList nodeChildren - , nodeType = nodeType curNodeLabel - , vertName = vertName curNodeLabel - , vertexResolutionData = mempty - , vertData = newVertData - , vertexCost = newCost - , subGraphCost = newSubGraphCost - } - - -- this to add back edges deleted with nodes (undocumented but sensible in fgl) - replacementEdges = LG.inn inGraph curNodeIndex ++ LG.out inGraph curNodeIndex - newGraph = LG.insEdges replacementEdges $ LG.insNode (curNodeIndex, newVertexLabel) $ LG.delNode curNodeIndex inGraph - in - --trace ("New vertexCost " ++ show newCost) -- ++ " lcn " ++ (show (vertData leftChildLabel, vertData rightChildLabel, vertData curnodeLabel))) - reOptimizeCharacterNodes charInfo newGraph (tail oldNodeList) - - --- | backPortCharTreeNodesToBlockTree assigned nodes states (labels) of character trees to block doisplay Tree --- updates vertData, vertexCost, and subGraphCost for each . Subgraph cost queationable since relieds on rooting -backPortCharTreeNodesToBlockTree :: DecoratedGraph -> V.Vector DecoratedGraph -> DecoratedGraph -backPortCharTreeNodesToBlockTree blockDisplayTree rerootedCharTreeVect = - let blockDisplayNodes = LG.labNodes blockDisplayTree - blockDisplayEdges = LG.labEdges blockDisplayTree - - -- vector (characters) of vector (nodes) of labels - charTreeLabelsVV = fmap V.fromList $ fmap (fmap snd) $ fmap LG.labNodes rerootedCharTreeVect - - -- for each node index extract (head head) vertdata, vertexCost and subgraphcost - (vertDataVV, vertCostVV, subGraphCostVV) = V.unzip3 $ fmap (extractTripleVect charTreeLabelsVV) (V.fromList [0..(length blockDisplayNodes - 1)]) - - -- update labels for block data nodes - updatedDisplayNodes = V.zipWith4 updateNodes (V.fromList blockDisplayNodes) vertDataVV vertCostVV subGraphCostVV - - in - LG.mkGraph (V.toList updatedDisplayNodes) blockDisplayEdges - --- | extractTripleVect takes a vector of vector character tree labels and a node index and --- retuns a triple of data (vertData, VertCost, and subgraphCost) from a given node index in all labels -extractTripleVect :: V.Vector (V.Vector VertexInfo) -> Int -> (V.Vector CharacterData, V.Vector VertexCost, V.Vector VertexCost) -extractTripleVect inLabelVV charIndex = - let nodeLabelV = fmap (V.! charIndex) inLabelVV - vertDataV = fmap vertData nodeLabelV - vertCostV = fmap vertexCost nodeLabelV - subGraphCostV = fmap subGraphCost nodeLabelV - in - (fmap (V.head . V.head) vertDataV, vertCostV, subGraphCostV) - - --- | updateNodes takes vectors of labelled nodes and updates vertData, VerTCost, and subgraphCost fields -updateNodes ::LG.LNode VertexInfo -> V.Vector CharacterData -> V.Vector VertexCost -> V.Vector VertexCost -> LG.LNode VertexInfo -updateNodes (inIndex, inLabel) charDataV vertexCostV subGraphCostV = - let newVertCost = V.sum vertexCostV - newSubGraphCost = V.sum subGraphCostV - newLabel = inLabel {vertData = V.singleton charDataV, vertexCost = newVertCost, subGraphCost = newSubGraphCost} - in - (inIndex, newLabel) - --- | backPortBlockTreeNodesToCanonicalGraph takes block display trees (updated presumably) and ports the block tree node --- labels to the cononical Graph --- very similar to backPortCharTreeNodesToBlockTree but character vector is no singleton -backPortBlockTreeNodesToCanonicalGraph :: DecoratedGraph -> V.Vector DecoratedGraph -> DecoratedGraph -backPortBlockTreeNodesToCanonicalGraph inCanonicalGraph blockTreeVect = - let canonicalDisplayNodes = LG.labNodes inCanonicalGraph - canonicalDisplayEdges = LG.labEdges inCanonicalGraph - - -- vector (characters) of vector (nodes) of labels - blockTreeLabelsVV = fmap V.fromList $ fmap (fmap snd) $ fmap LG.labNodes blockTreeVect - - -- for each node index extract (head head) vertdata, vertexCost and subgraphcost - (vertDataVV, vertCostVV, subGraphCostVV) = V.unzip3 $ fmap (extractTripleVectBlock blockTreeLabelsVV) (V.fromList [0..(length canonicalDisplayNodes - 1)]) - - -- update labels for canonical data nodes - updatedCanonicalNodes = V.zipWith4 updateNodesBlock (V.fromList canonicalDisplayNodes) vertDataVV vertCostVV subGraphCostVV - - in - LG.mkGraph (V.toList updatedCanonicalNodes) canonicalDisplayEdges - --- | updateNodesBlock takes vectors of labelled nodes and updates vertData, VerTCost, and subgraphCost fields -updateNodesBlock ::LG.LNode VertexInfo -> V.Vector (V.Vector CharacterData) -> V.Vector VertexCost -> V.Vector VertexCost -> LG.LNode VertexInfo -updateNodesBlock (inIndex, inLabel) charDataVV vertexCostV subGraphCostV = - let newVertCost = V.sum vertexCostV - newSubGraphCost = V.sum subGraphCostV - newLabel = inLabel {vertData = charDataVV, vertexCost = newVertCost, subGraphCost = newSubGraphCost} - in - (inIndex, newLabel) - --- | extractTripleVectBlock takes a vector of vector block tree labels and a node index and --- retuns a triple of data (vertData, VertCost, and subgraphCost) from a given node index in all labels -extractTripleVectBlock :: V.Vector (V.Vector VertexInfo) -> Int -> (V.Vector (V.Vector CharacterData), V.Vector VertexCost, V.Vector VertexCost) -extractTripleVectBlock inLabelVV charIndex = - let nodeLabelV = fmap (V.! charIndex) inLabelVV - vertDataV = fmap vertData nodeLabelV - vertCostV = fmap vertexCost nodeLabelV - subGraphCostV = fmap subGraphCost nodeLabelV - in - (fmap V.head vertDataV, vertCostV, subGraphCostV) - --- | divideDecoratedGraphByBlockAndCharacterTree takes a DecoratedGraph with (potentially) multiple blocks --- and (potentially) multiple character per block and creates a Vector of Vector of Decorated Graphs --- over blocks and characters with the same graph, but only a single block and character for each graph --- this to be used to create the "best" cost over alternate graph traversals --- vertexCost and subGraphCost will be taken from characterData localcost/localcostVect and globalCost -divideDecoratedGraphByBlockAndCharacterTree :: DecoratedGraph -> (V.Vector [DecoratedGraph], V.Vector (V.Vector DecoratedGraph)) -divideDecoratedGraphByBlockAndCharacterTree inGraph = - if LG.isEmpty inGraph then (V.empty, V.empty) - else - let numBlocks = V.length $ vertData $ snd $ head $ LG.labNodes inGraph - blockGraphList = fmap (pullBlock inGraph) [0.. (numBlocks - 1)] - characterGraphList = fmap makeCharacterGraph blockGraphList - in - -- trace ("DDGBCT: Blocks " ++ (show numBlocks) ++ " Characters " ++ (show $ fmap length $ vertData $ snd $ head $ LG.labNodes inGraph) ++ "\n" ++ (show characterGraphList)) - (V.fromList (fmap (:[]) blockGraphList), V.fromList characterGraphList) - --- | pullBlocks take a DecoratedGraph and creates a newDecorated graph with --- only data from the input block index -pullBlock :: DecoratedGraph -> Int -> DecoratedGraph -pullBlock inGraph blockIndex = - if LG.isEmpty inGraph then LG.empty - else - let (inNodeIndexList, inNodeLabelList) = unzip $ LG.labNodes inGraph - blockNodeLabelList = fmap (makeBlockNodeLabels blockIndex) inNodeLabelList - in - LG.mkGraph (zip inNodeIndexList blockNodeLabelList) (LG.labEdges inGraph) - --- | makeBlockNodeLabels takes a block index and an orginal nodel label --- and creates a new list of a singleton block from the input block index -makeBlockNodeLabels :: Int -> VertexInfo -> VertexInfo -makeBlockNodeLabels blockIndex inVertexInfo = - let newVertexData = vertData inVertexInfo V.! blockIndex - newVertexCost = V.sum $ fmap localCost newVertexData - newsubGraphCost = V.sum $ fmap globalCost newVertexData - in - -- trace ("MBD " ++ (show $ length newVertexData) ++ " from " ++ (show $ length (vertData inVertexInfo))) - inVertexInfo { vertData = V.singleton newVertexData - , vertexCost = newVertexCost - , subGraphCost = newsubGraphCost - } - --- | updateAndFinalizePostOrderSoftWired performs the pre-order traceback on the resolutions of a softwired graph to create the correct vertex states, --- ports the post order assignments to the display trees, and creates the character trees from the block trees -updateAndFinalizePostOrderSoftWired :: Maybe Int -> Int -> PhylogeneticGraph -> PhylogeneticGraph -updateAndFinalizePostOrderSoftWired startVertex rootIndex inGraph = - if LG.isEmpty $ thd6 inGraph then inGraph - else - let -- this step is important--not repetetive since pull out teh resolution data for each block - outgroupRootLabel = fromJust $ LG.lab (thd6 inGraph) rootIndex - (displayGraphVL, lDisplayCost) = extractDisplayTrees startVertex True (vertexResolutionData outgroupRootLabel) - - -- traceback on resolutions - newGraph = softWiredPostOrderTraceBack rootIndex (thd6 inGraph) - - -- propagate updated post-order assignments to display trees, which updates display tree and cost ealier - displayGraphVL' = V.zipWith (assignPostOrderToDisplayTree (fmap (vertData . snd) (LG.labNodes newGraph) )) displayGraphVL (V.fromList [0..(V.length displayGraphVL - 1)]) - - -- create new, fully updated post-order graph - finalPreOrderGraph = (fst6 inGraph, lDisplayCost, newGraph, displayGraphVL', divideDecoratedGraphByBlockAndCharacterSoftWired displayGraphVL', six6 inGraph) - in - -- trace ("UAFPS: after extraction: " ++ (show $ fmap LG.getDuplicateEdges $ fmap head $ V.toList displayGraphVL) ++ " after assignment: " ++ (show $ fmap LG.getDuplicateEdges $ fmap head $ V.toList displayGraphVL')) - -- trace ("UFPOSW: " ++ (show $ fmap length displayGraphVL) ++ " " ++ (show $ fmap length displayGraphVL') ++ " " ++ (show $ fmap V.length $ fft6 finalPreOrderGraph)) - finalPreOrderGraph - --- | divideDecoratedGraphByBlockAndCharacterSoftWired takes a Vector of a list of DecoratedGraph --- continaing a list of decorated trees that are the display trees for that block --- with (potentially) multiple blocks --- and (potentially) multiple character per block and creates a Vector of Vector of Decorated Graphs --- over blocks and characters with the block diplay graph, but only a single block and character for each graph --- this to be used to create the "best" cost over alternate graph traversals --- vertexCost and subGraphCost will be taken from characterData localcost/localcostVect and globalCost --- for this assignment purpose for pre-order a single (head) member of list is used to create the --- character graphs -divideDecoratedGraphByBlockAndCharacterSoftWired :: V.Vector [DecoratedGraph] -> V.Vector (V.Vector DecoratedGraph) -divideDecoratedGraphByBlockAndCharacterSoftWired inGraphVL = - if V.null inGraphVL then mempty - else - let blockGraphList = fmap head inGraphVL - characterGraphList = fmap makeCharacterGraph blockGraphList - in - characterGraphList - --- | postOrderSoftWiredTraversal performs postorder traversal on Soft-wired graph --- staticIA is ignored--but kept for functional polymorphism --- ur-root = ntaxa is an invariant -postOrderSoftWiredTraversal :: GlobalSettings -> ProcessedData -> DecoratedGraph -> Bool -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -postOrderSoftWiredTraversal inGS inData@(_, _, blockDataVect) leafGraph _ startVertex inSimpleGraph = - if LG.isEmpty inSimpleGraph then emptyPhylogeneticGraph - else - -- Assumes root is Number of Leaves - let rootIndex = if startVertex == Nothing then V.length $ fst3 inData - else fromJust startVertex - blockCharInfo = V.map thd3 blockDataVect - newSoftWired = postDecorateSoftWired inGS inSimpleGraph leafGraph blockCharInfo rootIndex rootIndex - in - --trace ("It Begins at " ++ show rootIndex) ( - -- trace ("POSWT:\n" ++ (LG.prettify inSimpleGraph) ++ "\nVertices:\n" ++ (show $ LG.labNodes $ thd6 newSoftWired)) ( - if (startVertex == Nothing) && (not $ LG.isRoot inSimpleGraph rootIndex) then - let localRootList = fst <$> LG.getRoots inSimpleGraph - localRootEdges = concatMap (LG.out inSimpleGraph) localRootList - currentRootEdges = LG.out inSimpleGraph rootIndex - in - error ("Index " ++ show rootIndex ++ " with edges " ++ show currentRootEdges ++ " not root in graph:" ++ show localRootList ++ " edges:" ++ show localRootEdges ++ "\n" ++ LG.prettify inSimpleGraph) - else - -- trace ("POSW:" ++ (show $ fmap V.length $ fft6 newSoftWired)) - newSoftWired - -- ) - --- | postDecorateSoftWired' wrapper for postDecorateSoftWired with args in differnt order for mapping -postDecorateSoftWired' :: GlobalSettings -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> LG.Node -> LG.Node -> SimpleGraph -> PhylogeneticGraph -postDecorateSoftWired' inGS curDecGraph blockCharInfo rootIndex curNode simpleGraph = postDecorateSoftWired inGS simpleGraph curDecGraph blockCharInfo rootIndex curNode - --- | postDecorateSoftWired begins at start index (usually root, but could be a subtree) and moves preorder till children are labelled --- and then recurses to root postorder labelling vertices and edges as it goes --- this for a single root -postDecorateSoftWired :: GlobalSettings -> SimpleGraph -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> LG.Node -> LG.Node -> PhylogeneticGraph -postDecorateSoftWired inGS simpleGraph curDecGraph blockCharInfo rootIndex curNode = - -- if node in current decortated graph then nothing to do and return - if LG.gelem curNode curDecGraph then - let nodeLabel = LG.lab curDecGraph curNode - in - if isNothing nodeLabel then error ("Null label for node " ++ show curNode) - else (simpleGraph, subGraphCost (fromJust nodeLabel), curDecGraph, mempty, mempty, blockCharInfo) - - else - -- get postodre assignmens of children - -- checks for single child of node - -- result is single graph afer left and right child traversals - -- trace ("PDSW making node " ++ show curNode ++ " in\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph curDecGraph)) ( - let nodeChildren = LG.descendants simpleGraph curNode -- should be 1 or 2, not zero since all leaves already in graph - leftChild = head nodeChildren - rightChild = last nodeChildren - leftChildTree = postDecorateSoftWired inGS simpleGraph curDecGraph blockCharInfo rootIndex leftChild - rightLeftChildTree = if length nodeChildren == 2 then postDecorateSoftWired inGS simpleGraph (thd6 leftChildTree) blockCharInfo rootIndex rightChild - else leftChildTree - in - -- Checks on children - if length nodeChildren > 2 then error ("Graph not dichotomous in postDecorateSoftWired node " ++ show curNode ++ "\n" ++ LG.prettify simpleGraph) - else if null nodeChildren then error ("Leaf not in graph in postDecorateSoftWired node " ++ show curNode ++ "\n" ++ LG.prettify simpleGraph) - - else - -- make node from child block resolutions - -- child resolutin made ealeri in post roder pass - let newSubTree = thd6 rightLeftChildTree - in - - -- single child of node (can certinly happen with soft-wired networks - if length nodeChildren == 1 then - -- trace ("Outdegree 1: " ++ (show curNode) ++ " " ++ (show $ GO.getNodeType simpleGraph curNode) ++ " Child: " ++ (show nodeChildren)) ( - let (newGraph, _, _, _, _) = getOutDegree1VertexAndGraph curNode (fromJust $ LG.lab newSubTree leftChild) simpleGraph nodeChildren newSubTree - in - (simpleGraph, 0, newGraph, mempty, mempty, blockCharInfo) - - -- check for error condition - else if length nodeChildren > 2 then error ("Node has >2 children: " ++ (show nodeChildren)) - - -- 2 children - else - -- trace ("Outdegree 2: " ++ (show curNode) ++ " " ++ (show $ GO.getNodeType simpleGraph curNode) ++ " Children: " ++ (show nodeChildren)) ( - -- need to create new resolutions and add to existing sets - let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - -- larger bitvector is Right, smaller or equal Left - ((leftChild', leftChildLabel), (rightChild', rightChildLabel)) = U.leftRightChildLabelBVNode ((leftChild, fromJust $ LG.lab newSubTree leftChild), (rightChild, fromJust $ LG.lab newSubTree rightChild)) - - -- create resolution caches for blocks - leftChildNodeType = nodeType leftChildLabel - rightChildNodeType = nodeType rightChildLabel - resolutionBlockVL = V.zipWith3 (createBlockResolutions (compressResolutions inGS) curNode leftChild' rightChild' leftChildNodeType rightChildNodeType (GO.getNodeType simpleGraph curNode)) (vertexResolutionData leftChildLabel) (vertexResolutionData rightChildLabel) blockCharInfo - - -- create canonical Decorated Graph vertex - -- 0 cost becasue can't know cosrt until hit root and get best valid resolutions - newVertexLabel = VertexInfo { index = curNode - , bvLabel = bvLabel leftChildLabel .|. bvLabel rightChildLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = GO.getNodeType simpleGraph curNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty --empty because of resolution data - , vertexResolutionData = resolutionBlockVL - , vertexCost = 0.0 --newCost - , subGraphCost = 0.0 -- (subGraphCost leftChildLabel) + (subGraphCost rightChildLabel) + newCost - } - - leftEdgeType - | leftChildNodeType == NetworkNode = NetworkEdge - | leftChildNodeType == LeafNode = PendantEdge - | otherwise = TreeEdge - rightEdgeType - | rightChildNodeType == NetworkNode = NetworkEdge - | rightChildNodeType == LeafNode = PendantEdge - | otherwise = TreeEdge - - edgeLable = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - - leftEdge = (curNode, leftChild', edgeLable {edgeType = leftEdgeType}) - rightEdge = (curNode, rightChild', edgeLable {edgeType = rightEdgeType}) - newGraph = LG.insEdges [leftEdge, rightEdge] $ LG.insNode (curNode, newVertexLabel) newSubTree - - (displayGraphVL, lDisplayCost) = if curNode == rootIndex then - let (displayG, displayCost') = extractDisplayTrees (Just curNode) True resolutionBlockVL - in - (displayG, displayCost') - -- (fmap (fmap LG.removeDuplicateEdges) displayG, displayCost') - else (mempty, 0.0) - - in - -- trace ("PDWS: " ++ (show $ fmap LG.toEdge [leftEdge, rightEdge]) ++ " has edges " ++ (show $ fmap (LG.hasEdge newSubTree) $ fmap LG.toEdge [leftEdge, rightEdge]) ++ " dupes: " ++ (show $ fmap LG.getDuplicateEdges $ V.head $ fst (extractDisplayTrees (Just curNode) True resolutionBlockVL)) ++ " Resolutions " ++ (show $ fmap (fmap U.hasResolutionDuplicateEdges) resolutionBlockVL)) - - (simpleGraph, lDisplayCost, newGraph, displayGraphVL, mempty, blockCharInfo) - --- | assignPostOrderToDisplayTree takes the post-order (preliminary) block data from canonical Decorated graph --- to Block display graphs (through list of more than one for a block) --- input is canonical Decorated Graph and a pair containing the display tree and its Block index --- this could be integrated into preorder traceback to remove the extra pass --- do this here is a bit simpler -assignPostOrderToDisplayTree :: [VertexBlockData] -> [DecoratedGraph] -> Int -> [DecoratedGraph] -assignPostOrderToDisplayTree canonicalVertexBlockData displayTreeList displayTreeIndex = - if null canonicalVertexBlockData then [] - else - let - updatedDispayTreeList = fmap (assignVertexBlockData canonicalVertexBlockData displayTreeIndex) displayTreeList - in - -- trace ("Index: " ++ (show displayTreeIndex) ++ " Blocks : " ++ (show $ V.length $ head canonicalVertexBlockData) ++ " Nodes: " ++ (show $ length canonicalVertexBlockData)) - updatedDispayTreeList - --- | assignVertexBlockData assigns the block data of input index and assigns to all nodes of a tree -assignVertexBlockData :: [VertexBlockData] -> Int -> DecoratedGraph -> DecoratedGraph -assignVertexBlockData nodeDataList blockIndex inGraph = - if null nodeDataList || LG.isEmpty inGraph then LG.empty - else - -- trace ("In Index: " ++ (show blockIndex) ++ " BL: " ++ (show $ V.length $ head nodeDataList) ++ " lengths " ++ (show $ fmap V.length nodeDataList)) ( - let blockIndexDataList = fmap (V.! blockIndex) nodeDataList - (displayNodeIndexList, displayNodeLabelList) = L.unzip $ LG.labNodes inGraph - updatedNodeLabelList = zipWith updateVertData displayNodeLabelList blockIndexDataList - in - LG.mkGraph (zip displayNodeIndexList updatedNodeLabelList) (LG.labEdges inGraph) - -- ) - where updateVertData inVertexInfo newVertData = inVertexInfo {vertData = V.singleton newVertData} - --- | softWiredPostOrderTraceBack takes resolution data and assigns correct resolution median to preliminary --- data ssignments. Proceeds via typical pre-order pass over display tree -softWiredPostOrderTraceBack :: Int -> DecoratedGraph -> DecoratedGraph -softWiredPostOrderTraceBack rootIndex inGraph = - if LG.isEmpty inGraph then LG.empty - else - -- get edges to remake graph after nodes are updated with preliminary states - let rootLabel = fromJust $ LG.lab inGraph rootIndex - inEdgeList = LG.labEdges inGraph - - -- root first--choose best resolutions--returns a list and takes head of that list of equal cost/traceback preliminary - -- assignments. Later could look at multiple. - (rootVertData, rootSubGraphCost, rootResolutionCost, leftRightIndexVect) = getResolutionDataAndIndices rootLabel (V.singleton (Just (-1))) - newRootLabel = rootLabel { vertData = rootVertData - , vertexCost = rootResolutionCost - , subGraphCost = rootSubGraphCost - } - newRootNode = (rootIndex, newRootLabel) - - -- get child/children of root - rootChildren = LG.labDescendants inGraph newRootNode - - -- left / right to match post-order - rootChildrenBV = fmap (bvLabel . snd) rootChildren - rootChildrenIsLeft - | length rootChildrenBV == 1 = [True] - | head rootChildrenBV > (rootChildrenBV !! 1) = [False, True] - | otherwise = [True, False] - - rootChildrenTuples = zip3 rootChildren (replicate (length rootChildren) leftRightIndexVect) rootChildrenIsLeft - - -- recurse to children with resolution index from parent - softWiredUpdatedNodes = softWiredPrelimTraceback inGraph rootChildrenTuples [newRootNode] - - in - LG.mkGraph softWiredUpdatedNodes inEdgeList - --- | softWiredPrelimTraceback takes a list of nodes to update (with left right index info) based --- on resolution data, recurses with left right indices pre-order to leaves, keeping list of updated nodes -softWiredPrelimTraceback :: DecoratedGraph - -> [(LG.LNode VertexInfo, V.Vector (Maybe Int, Maybe Int), Bool)] - -> [LG.LNode VertexInfo] - -> [LG.LNode VertexInfo] -softWiredPrelimTraceback inGraph nodesToUpdate updatedNodes = - if null nodesToUpdate then updatedNodes - else - let (firstNode, firstLeftRight, isLeft) = head nodesToUpdate - - -- ensure consistent left/right from post-order - resolutionIndexVect = if isLeft then fmap fst firstLeftRight - else fmap snd firstLeftRight - - -- get resolution info - (newBlockData, newSubGraphCost, newVertexCost, childLeftRightIndexVect) = getResolutionDataAndIndices (snd firstNode) resolutionIndexVect - - -- make new node - newNodeLabel = (snd firstNode) { vertData = newBlockData - , vertexCost = newVertexCost - , subGraphCost = newSubGraphCost - } - - newFirstNode = (fst firstNode, newNodeLabel) - in - - -- not really necessary, but avoids the graph query operations - if nodeType (snd firstNode) == LeafNode then - softWiredPrelimTraceback inGraph (tail nodesToUpdate) (newFirstNode : updatedNodes) - - -- checks if network node (and its children) has been visited already - else if (nodeType (snd firstNode) == NetworkNode) && isJust (L.find ((== fst firstNode). fst) updatedNodes) then - softWiredPrelimTraceback inGraph (tail nodesToUpdate) updatedNodes - - else - let -- get children - firstChildren = LG.labDescendants inGraph firstNode - firstChildrenBV = fmap (bvLabel . snd) firstChildren - firstChildrenIsLeft - | length firstChildrenBV == 1 = [True] -- seems dumb but may assume lenght == 2 later - | head firstChildrenBV > (firstChildrenBV !! 1) = [False, True] - | otherwise = [True, False] - - childrenTuples = zip3 firstChildren (replicate (length firstChildren) childLeftRightIndexVect) firstChildrenIsLeft - - - in - {- - if V.head resolutionIndexVect == Nothing then error ("'Nothing' index in softWiredPrelimTraceback " ++ (show resolutionIndexVect) - ++ " Node " ++ (show $ fst firstNode) ++ " isLeft?: " ++ (show isLeft) ++ " " ++ (show firstLeftRight)) - else - -} - softWiredPrelimTraceback inGraph (childrenTuples ++ tail nodesToUpdate) (newFirstNode : updatedNodes) - - --- | getResolutionDataAndIndices takes a vertex label (VertexInfo) and returns the resolution data corresponding to --- the index taken from its child resolution (data, subgraph cost, local resolution cost, left/right pairs). --- Index = (-1) denotes that it is a root label and in that case --- the best (lowest cost) resolutions are returned -getResolutionDataAndIndices :: VertexInfo -> V.Vector (Maybe Int) -> (VertexBlockData, VertexCost, VertexCost, V.Vector (Maybe Int, Maybe Int)) -getResolutionDataAndIndices nodeLabel parentResolutionIndexVect = - - -- should not happen - --mtrace ("NL " ++ (show $ index nodeLabel) ++ " PRIL: " ++ " length " ++ (show $ V.length parentResolutionIndexVect) ++ " " ++ show parentResolutionIndexVect) ( - if nodeType nodeLabel == LeafNode then - let leafVertData = fmap (displayData . V.head) (vertexResolutionData nodeLabel) - in - -- trace ("GRDI: Leaf " ++ (show $ fmap V.length (vertexResolutionData nodeLabel))) - (leafVertData, 0, 0, V.singleton (Just 0, Just 0)) - - -- root node--take lowest cost - else if V.head parentResolutionIndexVect == Just (-1) then - let rootBlockResolutionPair = getBestBlockResolution <$> vertexResolutionData nodeLabel - (charDataVV, subGraphCostV, resCostV, leftRightIndexVect) = V.unzip4 rootBlockResolutionPair - in - -- trace ("GRDI: Root " ++ (show $ fmap show parentResolutionIndexVect) ++ " " ++ (show $ fmap V.length (vertexResolutionData nodeLabel))) - (charDataVV, V.sum subGraphCostV, V.sum resCostV, leftRightIndexVect) - - -- non-root node--return the index resolution information - else - -- trace ("GRD Length:" ++ (show $ V.length $ vertexResolutionData nodeLabel) ++ " " ++ (show $ fmap fromJust parentResolutionIndexVect) ++ "\n" ++ (show $ vertexResolutionData nodeLabel)) ( - let parentIndexVect = fmap fromJust parentResolutionIndexVect - - -- get resolution data from node label - resolutionData = vertexResolutionData nodeLabel - - -- get the correct (via index) resolution data for each block - -- complex for network node since keeps left right sort of array, but only first element maters--this hack keeps things ok for - -- tree-like traceback assignment - -- not sure if first or last would be better--hopefully not matter, or arbitrary equal solutions - resolutionsByBlockV = if nodeType nodeLabel == NetworkNode then - -- trace ("-" ++ (show (V.length resolutionData, V.length parentIndexVect, V.head parentIndexVect)) ++ " " ++ (show $ parentIndexVect)) - -- V.zipWith (V.!) resolutionData (V.replicate (V.length parentIndexVect) (V.head parentIndexVect)) - V.zipWith (V.!) resolutionData (V.replicate (V.length parentIndexVect) 0) - else - -- trace ("+" ++ (show (V.length resolutionData, V.length parentIndexVect)) ++ " " ++ (show $ parentIndexVect)) - V.zipWith (V.!) resolutionData parentIndexVect - - -- get other resolution info - charDataVV = fmap displayData resolutionsByBlockV - lSubGraphCost = V.sum $ fmap displayCost resolutionsByBlockV - localResolutionCost = V.sum $ fmap resolutionCost resolutionsByBlockV - - -- only takes first left right pair--although others in there - -- uses first for preliminary asiignment--but could be more - -- if euqla cost display trees, may have multiple possible preliminary states - leftRightIndexVect = fmap (head . childResolutions) resolutionsByBlockV - in - {- - if V.null resolutionData then - error ("Null resolution data in getResolutionDataAndIndices at node with label:" ++ (show nodeLabel)) - else - -} - -- trace ("GRDI: " ++ (show $ nodeType nodeLabel) ++ " " ++ (show $ fmap V.length resolutionData) ++ " " ++ (show parentIndexVect)) - (charDataVV, lSubGraphCost, localResolutionCost, leftRightIndexVect) - -- ) - --- | getBestBlockResolution takes vertexResolutionData and returns the best (lowest cost) resolution and associated data --- for a single block of characters. A single left right index pair is returns for child resolution sources. Could be multiple --- from resolution compression (equal leaf set, equal median) and avaialble for potenitally implementd in future, multiple --- preliminary assignments. Only valid for root node with all leaves in graph -getBestBlockResolution :: ResolutionBlockData -> (V.Vector CharacterData, VertexCost, VertexCost, (Maybe Int, Maybe Int)) -getBestBlockResolution inResBlockData = - if V.null inResBlockData then (mempty, 0.0, 0.0, (Nothing, Nothing)) - else - let -- makes sure all leaves in resolution - -- displayPopList = fmap (complement . displayBVLabel) inResBlockData - - -- this for non full graph root--uses highest number of bits on--make sure all taxa in - -- should be fine for a subgraph that has a single edge a base--may not be correct - -- for a sunbgaph that has conections outside of its graph root. - displayPopList' = fmap (popCount . displayBVLabel) inResBlockData - maxPop = maximum displayPopList' - - - - -- subgraph cost - displayCostList = fmap displayCost inResBlockData - - -- resolution local cost - resolutionCostList = fmap resolutionCost inResBlockData - - -- takes only first resolution index pair - childResolutionList = fmap (head . childResolutions) inResBlockData - - -- resolution medians - displayDataList = fmap displayData inResBlockData - - {- - -- take only those will all leaves in, then minimum cost - quintVect = V.zip5 displayPopList displayCostList resolutionCostList childResolutionList displayDataList - validVect = V.filter (BV.isZeroVector . fst5) quintVect - validMinCost = V.minimum $ fmap snd5 validVect - -} - - -- these for "best" this will largest leaf set - quintVect' = V.zip5 displayPopList' displayCostList resolutionCostList childResolutionList displayDataList - validVect' = V.filter ((== maxPop) . fst5) quintVect' - validMinCost' = V.minimum $ fmap snd5 validVect' - - -- ONly takes first of potentially multiple soliutions to begin traceback - (_, displayCostV, resCostV, childIndexPairV, displayMedianV) = V.unzip5 $ V.filter ((== validMinCost') . snd5) validVect' - in - if null validVect' then error "Null valid quad in getBestBlockResolution--perhaps not root node or forest component" - else (V.head displayMedianV, V.head displayCostV, V.head resCostV, V.head childIndexPairV) - --- | makeLeafGraphSoftWired takes input data and creates a 'graph' of leaves with Vertex information --- but with zero edges. This 'graph' can be reused as a starting structure for graph construction --- to avoid remaking of leaf vertices --- includes leave resolution data -makeLeafGraphSoftWired :: ProcessedData -> DecoratedGraph -makeLeafGraphSoftWired (nameVect, bvNameVect, blocDataVect) = - if V.null nameVect then error "Empty ProcessedData in makeLeafGraph" - else - let leafVertexList = V.toList $ V.map (makeLeafVertexSoftWired nameVect bvNameVect blocDataVect) (V.fromList [0.. V.length nameVect - 1]) - in - LG.mkGraph leafVertexList [] - --- | makeLeafVertexSoftWired makes a single unconnected vertex for a leaf in a Soft-wired graph -makeLeafVertexSoftWired :: V.Vector NameText -> V.Vector NameBV -> V.Vector BlockData -> Int -> LG.LNode VertexInfo -makeLeafVertexSoftWired nameVect bvNameVect inData localIndex = - --trace ("Making leaf " ++ (show localIndex) ++ " Data " ++ (show $ length inData) ++ " " ++ (show $ fmap length $ fmap snd3 inData)) ( - let centralData = V.map snd3 inData - thisData = V.map (V.! localIndex) centralData - thisBVLabel = bvNameVect V.! localIndex - thisResolutionData = makeLeafResolutionBlockData thisBVLabel ([(localIndex, minimalVertex)],[]) thisData - minimalVertex = VertexInfo { index = localIndex - , bvLabel = thisBVLabel - , parents = V.empty - , children = V.empty - , nodeType = LeafNode - , vertName = nameVect V.! localIndex - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - newVertex = VertexInfo { index = localIndex - , bvLabel = thisBVLabel - , parents = V.empty - , children = V.empty - , nodeType = LeafNode - , vertName = nameVect V.! localIndex - , vertData = mempty - , vertexResolutionData = thisResolutionData - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - in - --trace ("RD" ++ show $ thisResolutionData) - (localIndex, newVertex) - -- ) - - --- | makeLeafResolutionBlockData creates leaf resolution data from leav BVLabel, leave node, and data. --- The return type is a vertor over character blocks each containing a list of potential resolutions (display trees) for that block --- the resolutoins include subtree (display) for that resolution the bv l;abel for the node given that resolution, and the character data --- in the block (Vector CharacterData) also given that resolution --- thiis is repeaatted for each bloick in VertexBlockData -makeLeafResolutionBlockData :: NameBV -> ([LG.LNode VertexInfo], [LG.LEdge EdgeInfo]) -> VertexBlockData -> V.Vector ResolutionBlockData -makeLeafResolutionBlockData inBV inSubGraph inVertData = - let defaultResolutionData = ResolutionData { displaySubGraph = inSubGraph - , displayBVLabel = inBV - , displayData = mempty - , childResolutions = [(Just 0, Just 0)] - , resolutionCost = 0.0 - , displayCost = 0.0 - } - - blockIndexList = [0..(V.length inVertData - 1)] - blockDataList = fmap (inVertData V.!) blockIndexList - resolutionDataList = modifyDisplayData defaultResolutionData blockDataList [] - resolutionData = V.fromList $ fmap (V.fromList . (:[])) resolutionDataList - in - resolutionData - --- | modifyDisplayData modifies displatData filed in ResolutionData --- stas list doesn't change number of V.fromList calls -modifyDisplayData :: ResolutionData -> [V.Vector CharacterData] -> [ResolutionData] -> [ResolutionData] -modifyDisplayData resolutionTemplate characterDataVList curResolutionList = - if null characterDataVList then reverse curResolutionList - else - let curBlockData = head characterDataVList - in - modifyDisplayData resolutionTemplate (tail characterDataVList) ((resolutionTemplate {displayData = curBlockData}) : curResolutionList) - --- | getOutDegree1VertexAndGraph makes parent node fomr single child for soft-wired resolutions -getOutDegree1VertexAndGraph :: (Show a, Show b) - => LG.Node - -> VertexInfo - -> LG.Gr a b - -> [LG.Node] - -> DecoratedGraph - -> (DecoratedGraph, Bool, VertexInfo, VertexCost, V.Vector [DecoratedGraph]) -getOutDegree1VertexAndGraph curNode childLabel simpleGraph nodeChildren subTree = - - -- trace ("In out=1: " ++ (show curNode)) ( - let childResolutionData = vertexResolutionData childLabel - - curNodeResolutionData = addNodeAndEdgeToResolutionData newDisplayNode newLEdge childResolutionData - - newEdgeLabel = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - newMinVertex = VertexInfo { index = curNode - , bvLabel = bvLabel childLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = GO.getNodeType simpleGraph curNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - - newVertex = VertexInfo { index = curNode - , bvLabel = bvLabel childLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = GO.getNodeType simpleGraph curNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = curNodeResolutionData - , vertexCost = 0.0 - , subGraphCost = subGraphCost childLabel - } - - newLEdge = (curNode, index childLabel, newEdgeLabel) - newLNode = (curNode, newVertex) - newDisplayNode = (curNode, newMinVertex) - newGraph = LG.insEdge newLEdge $ LG.insNode newLNode subTree - - (displayGraphVL, lDisplayCost) = if nodeType newVertex == RootNode then extractDisplayTrees Nothing True (vertexResolutionData childLabel) - else (mempty, 0.0) - - - in - --trace ("NV1: " ++ show newVertex) - --trace ("GOD1VG: " ++ (show $ LG.toEdge newLEdge) ++ " has edges " ++ (show $ LG.hasEdge subTree $ LG.toEdge newLEdge) ++ "Resolutions " ++ (show $ fmap (fmap U.hasResolutionDuplicateEdges) curNodeResolutionData)) - (newGraph, nodeType newVertex == RootNode, newVertex, lDisplayCost, displayGraphVL) - -- (newGraph, False, newVertex, 0.0, mempty) - -- ) - --- | getOutDegree1VertexSoftWired returns new vertex only from single child for soft-wired resolutions -getOutDegree1VertexSoftWired :: (Show a, Show b) - => LG.Node - -> VertexInfo - -> LG.Gr a b - -> [LG.Node] - -> VertexInfo -getOutDegree1VertexSoftWired curNode childLabel simpleGraph nodeChildren = - - -- trace ("In out=1: " ++ (show curNode)) ( - let childResolutionData = vertexResolutionData childLabel - - - newEdgeLabel = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - newMinVertex = VertexInfo { index = curNode - , bvLabel = bvLabel childLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = NetworkNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - - newDisplayNode = (curNode, newMinVertex) - newLEdge = (curNode, index childLabel, newEdgeLabel) - - curNodeResolutionData = addNodeAndEdgeToResolutionData newDisplayNode newLEdge childResolutionData - - - newVertexLabel = VertexInfo { index = curNode - , bvLabel = bvLabel childLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = NetworkNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = curNodeResolutionData - , vertexCost = 0.0 - , subGraphCost = subGraphCost childLabel - } - - - in - newVertexLabel - --- | getOutDegree2VertexSoftWired returns new vertex only from two child nodes for soft-wired resolutions -getOutDegree2VertexSoftWired :: GlobalSettings - -> V.Vector (V.Vector CharInfo) - -> LG.Node - -> LG.LNode VertexInfo - -> LG.LNode VertexInfo - -> DecoratedGraph - -> VertexInfo -getOutDegree2VertexSoftWired inGS charInfoVectVect curNodeIndex leftChild@(leftChildIndex, _) rightChild@(rightChildIndex, _) inGraph = - - let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - -- larger bitvector is Right, smaller or equal Left - ((leftChild', leftChildLabel'), (rightChild', rightChildLabel')) = U.leftRightChildLabelBVNode (leftChild, rightChild) - - -- create resolution caches for blocks - leftChildNodeType = nodeType leftChildLabel' - rightChildNodeType = nodeType rightChildLabel' - resolutionBlockVL = V.zipWith3 (createBlockResolutions (compressResolutions inGS) curNodeIndex leftChild' rightChild' leftChildNodeType rightChildNodeType TreeNode) (vertexResolutionData leftChildLabel') (vertexResolutionData rightChildLabel') charInfoVectVect - - -- create canonical Decorated Graph vertex - -- 0 cost becasue can't know cosrt until hit root and get best valid resolutions - newVertexLabel = VertexInfo { index = curNodeIndex - , bvLabel = bvLabel leftChildLabel' .|. bvLabel rightChildLabel' - , parents = V.fromList $ LG.parents inGraph curNodeIndex - , children = V.fromList [leftChildIndex, rightChildIndex] - , nodeType = TreeNode - , vertName = T.pack $ "HTU" ++ show curNodeIndex - , vertData = mempty --empty because of resolution data - , vertexResolutionData = resolutionBlockVL - , vertexCost = 0.0 --newCost - , subGraphCost = 0.0 -- (subGraphCost leftChildLabel) + (subGraphCost rightChildLabel) + newCost - } - in - newVertexLabel - --- | extractDisplayTrees takes resolutions and pulls out best cost (head for now) need to change type for multiple best --- option for filter based on pop-count for root cost and complete display tree check -extractDisplayTrees :: Maybe Int -> Bool -> V.Vector ResolutionBlockData -> (V.Vector [DecoratedGraph], VertexCost) -extractDisplayTrees startVertex checkPopCount inRBDV = - if V.null inRBDV then (V.empty, 0.0) - else - let (bestBlockDisplayResolutionList, costVect) = V.unzip $ fmap (getBestResolutionList startVertex checkPopCount) inRBDV - in - (bestBlockDisplayResolutionList, V.sum costVect) - --- | createBlockResolutions takes left and right child resolution data for a block (same display tree) --- and generates node resolution data -createBlockResolutions :: Bool - -> LG.Node - -> Int - -> Int - -> NodeType - -> NodeType - -> NodeType - -> ResolutionBlockData - -> ResolutionBlockData - -> V.Vector CharInfo - -> ResolutionBlockData -createBlockResolutions - compress - curNode - leftIndex - rightIndex - leftChildNodeType - rightChildNodeType - curNodeNodeType - leftChild - rightChild - charInfoV - | null leftChild && null rightChild = mempty - | null leftChild = rightChild - | null rightChild = leftChild - | otherwise = - -- trace ("CBR:" ++ (show (leftIndex, leftChildNodeType, rightIndex, rightChildNodeType)) ++ (show $fmap BV.toBits $ fmap displayBVLabel leftChild) ++ " and " ++ (show $fmap BV.toBits $ fmap displayBVLabel rightChild)) ( - let childResolutionPairs = cartProd (V.toList leftChild) (V.toList rightChild) - -- need to keep these indices correct (hence reverse in checkLeafOverlap) for traceback and compress - childResolutionIndices = cartProd [0.. (length leftChild - 1)] [0.. (length rightChild - 1)] - validPairs = checkLeafOverlap (zip childResolutionPairs childResolutionIndices) [] - - -- either parallel seems about the same - -- newResolutionList = fmap (createNewResolution curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV) validPairs `using` PU.myParListChunkRDS - newResolutionList = PU.seqParMap rdeepseq (createNewResolution curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV) validPairs - - --need to add in node and edge to left and right - edgeLable = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - newMinVertex = VertexInfo { index = curNode - , bvLabel = BV.fromBits [False] - , parents = mempty - , children = mempty - , nodeType = curNodeNodeType - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - - newNode = (curNode, newMinVertex) - - addLeft = if leftChildNodeType == NetworkNode then - let newEdge = (curNode, rightIndex, edgeLable) - newRightChildBlockResolutionData = addNodeEdgeToResolutionList newNode newEdge 0 [] rightChild - in - -- trace ("ANEL:" ++ (show $ (curNode, rightIndex))) - newRightChildBlockResolutionData - else -- trace ("ANEL-Nothing") - mempty - - addRight = if rightChildNodeType == NetworkNode then - let newEdge = (curNode, leftIndex, edgeLable) - newLeftChildBlockResolutionData = addNodeEdgeToResolutionList newNode newEdge 0 [] leftChild - in - -- trace ("ANER:" ++ (show $ (curNode, leftIndex))) - newLeftChildBlockResolutionData - else -- trace ("ANER-Nothing") - mempty - - - in - -- trace ("CBR:" ++ (show $ leftIndex == rightIndex)) ( - -- trace ("=> " ++ (show $fmap BV.toBits $ fmap displayBVLabel totalResolutions) ) - -- compress to unique resolutions--can loose display trees, but speed up post-order a great deal - -- trace ("CBR Num res left: " ++ (show $ V.length leftChild) ++ " Num res right: " ++ (show $ V.length rightChild) ++ " =>NRL " ++ (show $ length newResolutionList) ++ " addleft " ++ (show $ length addLeft) ++ " addright " ++ (show $ length addRight)) ( - if compress then V.fromList (nubResolutions newResolutionList []) V.++ (addLeft V.++ addRight) - else V.fromList newResolutionList V.++ (addLeft V.++ addRight) - -- ) - -- ) - --- | createNewResolution takes a pair of resolutions and creates the median resolution --- need to watch let/right (based on BV) for preorder stuff -createNewResolution :: LG.Node -> Int -> Int -> NodeType -> NodeType -> V.Vector CharInfo -> ((ResolutionData, ResolutionData),(Int, Int)) -> ResolutionData -createNewResolution curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV ((leftRes, rightRes), (leftResIndex, rightResIndex)) = - let -- make bvLabel for resolution - resBV = displayBVLabel leftRes .|. displayBVLabel rightRes - - -- Make resolution Display tree infomation - leftEdgeType - | leftChildNodeType == NetworkNode = NetworkEdge - | leftChildNodeType == LeafNode = PendantEdge - | otherwise = TreeEdge - rightEdgeType - | rightChildNodeType == NetworkNode = NetworkEdge - | rightChildNodeType == LeafNode = PendantEdge - | otherwise = TreeEdge - - edgeLable = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - - leftEdge = (curNode, leftIndex, edgeLable {edgeType = leftEdgeType}) - rightEdge = (curNode, rightIndex, edgeLable {edgeType = rightEdgeType}) - leftChildTree = displaySubGraph leftRes - rightChildTree = displaySubGraph rightRes - - -- Data fields empty for display tree data--not needed and muptiple copies of everything - newNodeLabel = VertexInfo { index = curNode - , bvLabel = resBV - , parents = V.empty - , children = V.fromList [leftIndex, rightIndex] - , nodeType = TreeNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - - newNode = (curNode, newNodeLabel) - - -- this check for redundant edges in resoluton cash from combinations - -- resolutionEdgeList = leftEdge : (rightEdge: (snd leftChildTree ++ snd rightChildTree)) - existingEdges = snd leftChildTree ++ snd rightChildTree - resolutionEdgeList = if (leftEdge `notElem` existingEdges) && (rightEdge `notElem` existingEdges) then leftEdge : (rightEdge : existingEdges) - else if (leftEdge `elem` existingEdges) && (rightEdge `elem` existingEdges) then existingEdges - else if (leftEdge `notElem` existingEdges) then leftEdge : existingEdges - else rightEdge : existingEdges - - resolutionNodeList = newNode : (fst leftChildTree ++ fst rightChildTree) - - -- Make the data and cost for the resolution - leftBlockLength = V.length $ displayData leftRes - rightBlockLength = V.length $ displayData rightRes - resolutionMedianCostV - | (leftBlockLength == 0) = V.zip (displayData rightRes) (V.replicate rightBlockLength 0) - | (rightBlockLength == 0) = V.zip (displayData leftRes) (V.replicate leftBlockLength 0) - | otherwise = M.median2 (displayData leftRes) (displayData rightRes) charInfoV - (resolutionMedianV, resolutionCostV) = V.unzip resolutionMedianCostV - thisResolutionCost = V.sum resolutionCostV - displaySubTreeCost = displayCost leftRes + displayCost rightRes + thisResolutionCost - - in - ResolutionData { displaySubGraph = (resolutionNodeList, resolutionEdgeList) - , displayBVLabel = resBV - , displayData = resolutionMedianV - , childResolutions = [(Just leftResIndex, Just rightResIndex)] - , resolutionCost = thisResolutionCost - , displayCost = displaySubTreeCost - } - --- | nubResolutions takes a list of resolulutions and returns 'nub' based on leaf set (bvLabel) and Charinfo vector -nubResolutions :: [ResolutionData] -> [ResolutionData] -> [ResolutionData] -nubResolutions inData curData - | null inData = reverse curData - | null curData = nubResolutions (tail inData) [head inData] - | otherwise = - let firstData = head inData - (isUnique, matchIndex) = hasResolutionMatch (displayBVLabel firstData) (displayData firstData) curData 0 - in - if isUnique then nubResolutions (tail inData) (firstData : curData) - else - -- need to update the childResolutoin field of the match - let firstPart = L.take matchIndex curData - lastPart = L.drop (matchIndex + 1) curData - matchChildResolutionData = childResolutions (curData !! matchIndex) - firstDataChildResolutionData = childResolutions firstData - newMatchResolution = (curData !! matchIndex) {childResolutions = firstDataChildResolutionData ++ matchChildResolutionData} - in - nubResolutions (tail inData) (firstPart ++ (newMatchResolution : lastPart)) - --- | hasResolutionMatch checks for match of bvlabel and Vect charinfo with list -hasResolutionMatch :: NameBV -> V.Vector CharacterData -> [ResolutionData] -> Int -> (Bool, Int) -hasResolutionMatch inBV inCD rDList curIndex = - if null rDList then (True, curIndex) - else - let existingBV = displayBVLabel $ head rDList - existingCharData = displayData $ head rDList - in - if (existingBV /= inBV) || (existingCharData /= inCD) then hasResolutionMatch inBV inCD (tail rDList) (curIndex + 1) else (False, curIndex) - --- | checkLeafOverlap takes a left right resolution pair list and checks if --- there is leaf overlap via comparing displayBVLabel if & = 0 then no --- overlap, and adds to resulting list--reverses order--sholdn't matter -checkLeafOverlap :: [((ResolutionData, ResolutionData), (Int, Int))] -> [((ResolutionData, ResolutionData), (Int, Int))] -> [((ResolutionData, ResolutionData), (Int, Int))] -checkLeafOverlap inPairList curPairList = - if null inPairList then reverse curPairList - else - let inPair@((leftRes, rightRes), (_, _)) = head inPairList - leftBV = displayBVLabel leftRes - rightBV = displayBVLabel rightRes - in - if BV.isZeroVector (leftBV .&. rightBV) then checkLeafOverlap (tail inPairList) (inPair : curPairList) - else checkLeafOverlap (tail inPairList) curPairList - - - - --- | addNodeAndEdgeToResolutionData adds new node and edge to resolution data in outdegree = 1 nodes --- striaght copy would not add this node or edge to subtree in resolutions -addNodeAndEdgeToResolutionData :: LG.LNode VertexInfo -> LG.LEdge EdgeInfo -> V.Vector ResolutionBlockData -> V.Vector ResolutionBlockData -addNodeAndEdgeToResolutionData newNode newEdge = fmap (addNodeEdgeToResolutionList newNode newEdge 0 []) - --- | addNodeEdgeToResolutionList adds new node and edge to single subGraph in ResolutionData --- adds resolutoin pairs to be equal to the child straight one-for-one correpondance -addNodeEdgeToResolutionList :: LG.LNode VertexInfo -> LG.LEdge EdgeInfo -> Int -> [ResolutionData] -> V.Vector ResolutionData -> V.Vector ResolutionData -addNodeEdgeToResolutionList newNode newEdge resolutionIndex curData inData = - if null inData then V.fromList $ reverse curData - else - let firstInData = V.head inData - (inNodeList, inEdgeList) = displaySubGraph firstInData - - -- childResolutionIndexPairList = childResolutions firstInData - newNodeList = newNode : inNodeList - - -- this check for redundant edges in resoluton cash from combinations - newEdgeList = if (newEdge `notElem` inEdgeList) then newEdge : inEdgeList - else inEdgeList - newFirstData = firstInData { displaySubGraph = (newNodeList, newEdgeList) - -- this apir in case of left/right issues later - -- not sure this is correct--LWys first for children of out = 1 node - , childResolutions = [(Just 0, Just 0)] -- [(Just resolutionIndex, Just resolutionIndex)] - } - in - addNodeEdgeToResolutionList newNode newEdge (resolutionIndex + 1) (newFirstData : curData) (V.tail inData) - - --- | getAllResolutionList takes ResolutionBlockData and retuns a list of the all valid (ie all leaves in subtree) display trees --- for that block- and costs -getAllResolutionList :: ResolutionBlockData -> [(DecoratedGraph, VertexCost)] -getAllResolutionList inRDList = - --trace ("GBRL: " ++ (show inRDList)) ( - if null inRDList then error "Null resolution list" - else - let displayTreeList = fmap displaySubGraph inRDList - displayCostList = fmap displayCost inRDList - displayPopList = fmap (complement . displayBVLabel) inRDList - in - let displayBVList = V.zip3 displayTreeList displayCostList displayPopList - validDisplayList = V.filter (BV.isZeroVector . thd3) displayBVList - (displayList, costList, _) = V.unzip3 validDisplayList - in - --trace ("Valid display list number:" ++ (show $ length validDisplayList)) ( - if V.null validDisplayList then error ("Null validDisplayList in getAllResolutionList" ++ show inRDList) - else - let lDisplayTreeList = fmap LG.mkGraphPair (V.toList displayList) - -- displayTreeList' = fmap (updateRootCost validMinCost) displayTreeList - in - zip lDisplayTreeList (V.toList costList) - - --- | getBestResolutionList takes ResolutionBlockData and retuns a list of the best valid (ie all leaves in subtree) display trees --- for that block-- if checkPopCount is True--otherwise all display trees of any cost and contitution --- startVertex for a component-- to allow for not every leaf being in componnet but still getting softwired cost -getBestResolutionList :: Maybe Int -> Bool -> ResolutionBlockData -> ([DecoratedGraph], VertexCost) -getBestResolutionList startVertex checkPopCount inRDList = - --trace ("GBRL: " ++ (show inRDList)) ( - if null inRDList then error "Null resolution list" - else - let displayTreeList = fmap displaySubGraph inRDList - displayCostList = fmap displayCost inRDList - displayPopList = fmap (complement . displayBVLabel) inRDList - in - if not checkPopCount then - let minCost = minimum displayCostList - displayCostPairList = V.zip displayTreeList displayCostList - (bestDisplayList, _) = V.unzip $ V.filter ((== minCost) . snd) displayCostPairList - in - (fmap LG.mkGraphPair (V.toList bestDisplayList), minCost) - else - let minPopCount = minimum $ fmap popCount displayPopList --max since complemented above - displayBVList = V.zip3 displayTreeList displayCostList displayPopList - - -- must have all leaves if startvzertex == Nothing, component maximum otherwise - -- this for getting cost of component of a softwired network - validDisplayList = if startVertex == Nothing then V.filter (BV.isZeroVector . thd3) displayBVList - else V.filter ((== minPopCount) . (popCount . thd3)) displayBVList - validMinCost = V.minimum $ fmap snd3 validDisplayList - (bestDisplayList, _, _) = V.unzip3 $ V.filter ((== validMinCost) . snd3) validDisplayList - in - --trace ("Valid display list number:" ++ (show $ length validDisplayList)) ( - if (startVertex == Nothing) && (V.null validDisplayList) then error ("Null root validDisplayList in getBestResolutionList" ++ (show (startVertex,inRDList)) ++ " This can be caused if the graphType not set correctly.") - else - let lDisplayTreeList = fmap LG.mkGraphPair (V.toList bestDisplayList) - - -- update root cost of display trees for use later (e.g. net penalties, outputting display forrests) - lDisplayTreeList' = fmap (updateRootCost validMinCost) lDisplayTreeList - in - (lDisplayTreeList', validMinCost) - -- ) - - -- ) - --- | getBestResolutionListPair takes ResolutionBlockData and retuns a list of the best valid (ie all leaves in subtree) display trees --- for that block-- if checkPopCount is True--otherwise all display trees of any cost and contitution --- startVertex for a component-- to allow for not every leaf being in componnet but still getting softwired cost --- returns list of pairs -getBestResolutionListPair :: Maybe Int -> Bool -> ResolutionBlockData -> [(DecoratedGraph, VertexCost)] -getBestResolutionListPair startVertex checkPopCount inRDList = - --trace ("GBRL: " ++ (show inRDList)) ( - if null inRDList then error "Null resolution list" - else - let displayTreeList = fmap displaySubGraph inRDList - displayCostList = fmap displayCost inRDList - displayPopList = fmap (complement . displayBVLabel) inRDList - in - if not checkPopCount then - let minCost = minimum displayCostList - displayCostPairList = V.zip displayTreeList displayCostList - (bestDisplayList, minCostList) = V.unzip $ V.filter ((== minCost) . snd) displayCostPairList - in - zip (fmap LG.mkGraphPair (V.toList bestDisplayList)) (V.toList minCostList) - else - let minPopCount = minimum $ fmap popCount displayPopList --max since complemented above - displayBVList = V.zip3 displayTreeList displayCostList displayPopList - - -- must have all leaves if startvzertex == Nothing, component maximum otherwise - -- this for getting cost of component of a softwired network - validDisplayList = if startVertex == Nothing then V.filter (BV.isZeroVector . thd3) displayBVList - else V.filter ((== minPopCount) . (popCount . thd3)) displayBVList - validMinCost = V.minimum $ fmap snd3 validDisplayList - (bestDisplayList, minCostList, _) = V.unzip3 $ V.filter ((== validMinCost) . snd3) validDisplayList - in - --trace ("Valid display list number:" ++ (show $ length validDisplayList)) ( - if (startVertex == Nothing) && (V.null validDisplayList) then error ("Null root validDisplayList in getBestResolutionListPair" ++ (show (startVertex,inRDList)) ++ " This can be caused if the graphType not set correctly.") - else - let lDisplayTreeList = fmap LG.mkGraphPair (V.toList bestDisplayList) - - -- update root cost of display trees for use later (e.g. net penalties, outputting display forrests) - lDisplayTreeList' = fmap (updateRootCost validMinCost) lDisplayTreeList - in - zip lDisplayTreeList' (V.toList minCostList) - -- ) - - -- ) - --- | updateRootCost updates the subGraphCost of the root node(s) with input value --- new node is created, so original is deleted, new added, and original edges added back --- since deleted when node is --- assumes its a tree wiht a single root -updateRootCost :: VertexCost -> DecoratedGraph -> DecoratedGraph -updateRootCost newRootCost inGraph = - let (rootIndex, rootLabel) = head $ LG.getRoots inGraph - rootEdges = LG.out inGraph rootIndex - newRootLabel = rootLabel {subGraphCost = newRootCost} - in - -- trace ("DCC: " ++ (show newRootCost)) - LG.insEdges rootEdges $ LG.insNode (rootIndex, newRootLabel) $ LG.delNode rootIndex inGraph - --- | makeCharacterGraph takes a blockGraph and creates a vector of character graphs --- each with a single block and single character --- updating costs -makeCharacterGraph :: DecoratedGraph -> V.Vector DecoratedGraph -makeCharacterGraph inBlockGraph = - if LG.isEmpty inBlockGraph then V.empty - else - let numCharacters = V.length $ V.head $ vertData $ snd $ head $ LG.labNodes inBlockGraph - characterGraphList = if numCharacters > 0 then fmap (pullCharacter False inBlockGraph) [0.. (numCharacters - 1)] - -- missing data - else [pullCharacter True inBlockGraph 0] - in - if V.length (vertData $ snd $ head $ LG.labNodes inBlockGraph) /= 1 then error ("Number of blocks /= 1 in makeCharacterGraph :" ++ (show $ V.length (vertData $ snd $ head $ LG.labNodes inBlockGraph))) - else - -- trace ("Chars: " ++ show numCharacters) - V.fromList characterGraphList - --- | pullCharacter takes a DecoratedGraph with a single block and --- creates a new DecoratedGraph with a single character from the input index -pullCharacter :: Bool -> DecoratedGraph -> Int -> DecoratedGraph -pullCharacter isMissing inBlockGraph characterIndex = - if LG.isEmpty inBlockGraph then LG.empty - else - let (inNodeIndexList, inNodeLabelList) = unzip $ LG.labNodes inBlockGraph - characterLabelList = fmap (makeCharacterLabels isMissing characterIndex) inNodeLabelList - in - LG.mkGraph (zip inNodeIndexList characterLabelList) (LG.labEdges inBlockGraph) - --- | makeCharacterLabels pulls the index character label form the singleton block (via head) --- and creates a singleton character label, updating costs to that of the character --- NB the case of missing data is answered here by an "empty charcter" --- could be better to have V.empty --- isMIssingChar seems to be extraneous--not sure whey it was there. -makeCharacterLabels :: Bool -> Int -> VertexInfo -> VertexInfo -makeCharacterLabels isMissing characterIndex inVertexInfo = - -- trace ("MCl in:" ++ (show inVertexInfo) ++ " " ++ (show characterIndex)) ( - let -- isMissingChar = (V.length $ (vertData inVertexInfo) V.! characterIndex) == 0 - newVertexData = V.head (vertData inVertexInfo) V.! characterIndex - (newVertexCost, newSubGraphCost) = if isMissing then (0, 0) - --if isMissing || isMissingChar then (0, 0) - else (localCost newVertexData, globalCost newVertexData) - -- newVertexCost = localCost newVertexData - -- newSubGraphCost = globalCost newVertexData - in - -- trace ("MCL " ++ (show $ V.length $ vertData inVertexInfo) ++ " " ++ (show $ fmap V.length $ vertData inVertexInfo) ) ( - -- trace ("MCL: " ++ (show isMissing) ++ " CI: " ++ (show characterIndex) ++ " " ++ (show $ V.length $ (vertData inVertexInfo) V.! characterIndex)) - inVertexInfo { vertData = if not isMissing then V.singleton $ V.singleton newVertexData - -- if not isMissing && not isMissingChar then V.singleton $ V.singleton newVertexData - else V.singleton $ V.singleton emptyCharacter --V.empty - , vertexCost = newVertexCost - , subGraphCost = newSubGraphCost - } - - - -- ) ) diff --git a/pkg/PhyGraph/GraphOptimization/PreOrderFunctions.hs b/pkg/PhyGraph/GraphOptimization/PreOrderFunctions.hs deleted file mode 100644 index 65eefac7f..000000000 --- a/pkg/PhyGraph/GraphOptimization/PreOrderFunctions.hs +++ /dev/null @@ -1,1588 +0,0 @@ -{- | -Module : PostOrderFunctions.hs -Description : Module specifying pre-order graph functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -ToDo: - Add parallel optimization overblocks and characters? --} - - -module GraphOptimization.PreOrderFunctions ( createFinalAssignmentOverBlocks - , preOrderTreeTraversal - , getBlockCostPairsFinal - , setFinalToPreliminaryStates - , setPreliminaryToFinalStates - , zero2Gap - ) where - -import Bio.DynamicCharacter -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import qualified Data.Map as MAP -import Data.Maybe -import qualified Data.Vector as V -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Unboxed as UV -import qualified DirectOptimization.PreOrder as DOP -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified Graphs.GraphOperations as GO -import qualified Input.BitPack as BP -import qualified SymMatrix as S -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.ThreeWayFunctions as TW -import qualified Utilities.Utilities as U -import Data.Alphabet -import Debug.Trace -import Control.Parallel.Strategies -import qualified ParallelUtilities as PU - - --- | preOrderTreeTraversal takes a preliminarily labelled PhylogeneticGraph --- and returns a full labels with 'final' assignments based on character decorated graphs --- created postorder (5th of 6 fields). --- the preorder states are creted by traversing the traversal DecoratedGraphs in the 5th filed of PhylogeneticGraphs --- these are by block and character, Exact charcaters are vectors of standard characters and each seqeunce (non-exact) --- has its own traversal graph. These should be treees here (could be forests later) and should all have same root (number taxa) --- but worth checking to make sure. --- these were creted by "splitting" them after preorder - --- For sequence charcaters (slim/wide/huge) final states are created either by DirectOptimization or ImpliedAlignment --- if DO--then does a median between parent andchgaps wherre needed--then doing a 3way state assignmeent filteringgaps out --- if IA--then a separate post and preorder pass are donne on the slim/wide/huge/AI fields to crete full IA assignments --- that are then filtered of gaps and assigned to th efinal fields - --- The final states are propagated back to the second field --- DecoratedGraph of the full Phylogenetic graph--which does NOT have the character based preliminary assignments --- ie postorder--since those are traversal specific --- the character specific decorated graphs have appropriate post and pre-order assignments --- the traversal begins at the root (for a tree) and proceeds to leaves. -preOrderTreeTraversal :: GlobalSettings -> AssignmentMethod -> Bool -> Bool -> Bool -> Int -> Bool -> PhylogeneticGraph -> PhylogeneticGraph -preOrderTreeTraversal inGS finalMethod staticIA calculateBranchLengths hasNonExact rootIndex useMap inPGraph@(inSimple, inCost, inDecorated, blockDisplayV, blockCharacterDecoratedVV, inCharInfoVV) = - --trace ("PreO: " ++ (show finalMethod) ++ " " ++ (show $ fmap (fmap charType) inCharInfoVV)) ( - -- trace ("PR-OT pre: " ++ (show $ fmap V.length blockCharacterDecoratedVV)) ( - if LG.isEmpty (thd6 inPGraph) then error "Empty tree in preOrderTreeTraversal" - else - -- trace ("In PreOrder\n" ++ "Simple:\n" ++ (LG.prettify inSimple) ++ "Decorated:\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inDecorated) ++ "\n" ++ (GFU.showGraph inDecorated)) ( - -- mapped recursive call over blkocks, later characters - let -- preOrderBlockVect = fmap doBlockTraversal $ Debug.debugVectorZip inCharInfoVV blockCharacterDecoratedVV - preOrderBlockVect = V.fromList (PU.seqParMap rdeepseq (doBlockTraversal' inGS finalMethod staticIA rootIndex) (zip (V.toList inCharInfoVV) (V.toList blockCharacterDecoratedVV))) -- `using` PU.myParListChunkRDS) - - -- if final non-exact states determined by IA then perform passes and assignments of final and final IA fields - -- always do IA pass if Tree--but only assign to final if finalMethod == ImpliedAlignment - preOrderBlockVect' = if hasNonExact && (graphType inGS) == Tree then V.zipWith (makeIAAssignments finalMethod rootIndex) preOrderBlockVect inCharInfoVV - else preOrderBlockVect - - fullyDecoratedGraph = assignPreorderStatesAndEdges inGS finalMethod calculateBranchLengths rootIndex preOrderBlockVect' useMap inCharInfoVV inDecorated - in - if null preOrderBlockVect then error ("Empty preOrderBlockVect in preOrderTreeTraversal at root index rootIndex: " ++ (show rootIndex) ++ " This can be caused if the graphType not set correctly: " ++ (show $ graphType inGS)) - else - {- - let blockPost = GO.showDecGraphs blockCharacterDecoratedVV - blockPre = GO.showDecGraphs preOrderBlockVect - in - trace ("BlockPost:\n" ++ blockPost ++ "BlockPre:\n" ++ blockPre ++ "After Preorder\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph fullyDecoratedGraph)) - -} - (inSimple, inCost, fullyDecoratedGraph, blockDisplayV, preOrderBlockVect, inCharInfoVV) - -- ) - --- | makeIAAssignments takes the vector of vector of character trees and (if) slim/wide/huge --- does an additional post and pre order pass to assign IA fileds and final fields in slim/wide/huge -makeIAAssignments :: AssignmentMethod -> Int -> V.Vector DecoratedGraph -> V.Vector CharInfo -> V.Vector DecoratedGraph -makeIAAssignments finalMethod rootIndex = V.zipWith (makeCharacterIA finalMethod rootIndex) - --- | makeCharacterIA takes an individual character postorder tree and if non-exact (non-prealigned) perform post and preorder IA passes --- and assignment to final field in slim/wide/huge -makeCharacterIA :: AssignmentMethod -> Int -> DecoratedGraph -> CharInfo -> DecoratedGraph -makeCharacterIA finalMethod rootIndex inGraph charInfo = - if charType charInfo `notElem` nonExactCharacterTypes then inGraph - else - let postOrderIATree = postOrderIA inGraph charInfo [(rootIndex, fromJust $ LG.lab inGraph rootIndex)] - preOrderIATree = preOrderIA postOrderIATree rootIndex finalMethod charInfo $ zip [(rootIndex, fromJust $ LG.lab postOrderIATree rootIndex)] [(rootIndex, fromJust $ LG.lab postOrderIATree rootIndex)] - in - preOrderIATree - --- | postOrderIA performs a post-order IA pass assigning leaf preliminary states --- from the "alignment" fields and setting HTU preliminary by calling the apropriate 2-way --- matrix --- should eb OK for any root--or partial graph as in for breanch swapping -postOrderIA :: DecoratedGraph -> CharInfo -> [LG.LNode VertexInfo] -> DecoratedGraph -postOrderIA inGraph charInfo inNodeList = - if null inNodeList then inGraph - else - let inNode@(nodeIndex, nodeLabel) = head inNodeList - (inNodeEdges, outNodeEdges) = LG.getInOutEdges inGraph nodeIndex - inCharacter = V.head $ V.head $ vertData nodeLabel - nodeType' = GO.getNodeType inGraph nodeIndex - in - - -- trace ("POIA Node: " ++ (show nodeIndex) ++ " " ++ (show $ nodeType nodeLabel) ++ " " ++ (show $ fmap fst inNodeList)) ( - -- checking sanity of data - if V.null $ vertData nodeLabel then error "Null vertData in postOrderIA" - else if V.null $ V.head $ vertData nodeLabel then - -- missing data for taxon - error "Null vertData data in postOrderIA" - - -- leaf take assignment from alignment field - else if nodeType' == LeafNode then -- nodeType nodeLabel == LeafNode then - - -- trace ("PostOLeaf: " ++ (show nodeIndex) ++ " " ++ (show $ slimFinal $ V.head $ V.head $ vertData nodeLabel)) - postOrderIA inGraph charInfo (tail inNodeList) - -- postOrderIA newGraph charInfo (tail inNodeList) - - -- HTU take create assignment from children - else - let childNodes = LG.labDescendants inGraph inNode - childTree = postOrderIA inGraph charInfo childNodes - in - --trace ("Children: " ++ (show $ fmap fst childNodes)) ( - - if length childNodes > 2 then error ("Too many children in postOrderIA: " ++ show (length childNodes)) - - -- in 1 out 1 vertex - else if length childNodes == 1 then - let childIndex = fst $ head childNodes - childLabel = fromJust $ LG.lab childTree childIndex - childCharacter = V.head $ V.head $ vertData childLabel - in - -- sanity checks - if isNothing (LG.lab childTree (fst $ head childNodes)) then error ("No label for node: " ++ show (fst $ head childNodes)) - else if V.null $ vertData childLabel then error "Null vertData in postOrderIA" - else if V.null $ V.head $ vertData childLabel then error "Null head vertData data in postOrderIA" - else - let newLabel = nodeLabel {vertData = V.singleton (V.singleton childCharacter), nodeType = nodeType'} - newGraph = LG.insEdges (inNodeEdges ++ outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex childTree - in - -- trace ("PostO1Child: " ++ (show nodeIndex) ++ " " ++ (show $ slimFinal childCharacter)) - postOrderIA newGraph charInfo (tail inNodeList) - - -- two children - else - let childIndices = fmap fst childNodes - childlabels = fmap (fromJust . LG.lab childTree) childIndices - childCharacters = fmap vertData childlabels - leftChar = V.head $ V.head $ head childCharacters - rightChar = V.head $ V.head $ last childCharacters - newCharacter = M.makeIAPrelimCharacter charInfo inCharacter leftChar rightChar - newLabel = nodeLabel {vertData = V.singleton (V.singleton newCharacter), nodeType = nodeType'} - newGraph = LG.insEdges (inNodeEdges ++ outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex childTree - in - -- trace ("PostO2hildren: " ++ (show nodeIndex) ++ " " ++ (show $ slimFinal newCharacter) ++ " " ++ (show $ nodeType newLabel)) -- ++ " From: " ++ (show childlabels)) - postOrderIA newGraph charInfo (tail inNodeList) - -- ) - -- ) - - - - --- | preOrderIA performs a pre-order IA pass assigning via the apropriate 3-way matrix --- the "final" fields are also set by filtering out gaps and 0. -preOrderIA :: DecoratedGraph -> Int -> AssignmentMethod -> CharInfo -> [(LG.LNode VertexInfo, LG.LNode VertexInfo)] -> DecoratedGraph -preOrderIA inGraph rootIndex finalMethod charInfo inNodePairList = - if null inNodePairList then inGraph - else - let (inNode@(nodeIndex, nodeLabel), (_, parentNodeLabel)) = head inNodePairList - (inNodeEdges, outNodeEdges) = LG.getInOutEdges inGraph nodeIndex - characterType = charType charInfo - inCharacter = V.head $ V.head $ vertData nodeLabel - inCharacter' = inCharacter - parentCharacter = V.head $ V.head $ vertData parentNodeLabel - childNodes = LG.labDescendants inGraph inNode - in - --trace ("PreIA Node:" ++ (show nodeIndex) ++ " " ++ (show $ nodeType nodeLabel) ++ " " ++ (show (fmap fst $ fmap fst inNodePairList,fmap fst $ fmap snd inNodePairList))) ( - -- checking sanity of data - if V.null $ vertData nodeLabel then error "Null vertData in preOrderIA" - else if V.null $ V.head $ vertData nodeLabel then error "Null vertData data in preOrderIA" - else if length childNodes > 2 then error ("Too many children in preOrderIA: " ++ show (length childNodes)) - - -- leaf done in post-order - else if nodeType nodeLabel == LeafNode then preOrderIA inGraph rootIndex finalMethod charInfo (tail inNodePairList) - - else if nodeType nodeLabel == RootNode || nodeIndex == rootIndex then - let newCharacter - | characterType `elem` [SlimSeq, NucSeq] = - inCharacter { slimIAFinal = extractMediansGapped $ slimIAPrelim inCharacter' - -- , slimFinal = extractMedians $ slimIAPrelim inCharacter' - } - | characterType `elem` [WideSeq, AminoSeq] = - inCharacter { wideIAFinal = extractMediansGapped $ wideIAPrelim inCharacter' - -- , wideFinal = extractMedians $ wideIAPrelim inCharacter' - } - | characterType == HugeSeq = - inCharacter { hugeIAFinal = extractMediansGapped $ hugeIAPrelim inCharacter' - -- , hugeFinal = extractMedians $ hugeIAPrelim inCharacter' - } - | otherwise = error ("Unrecognized character type " ++ show characterType) - newLabel = nodeLabel {vertData = V.singleton (V.singleton newCharacter)} - newGraph = LG.insEdges (inNodeEdges ++ outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph - parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) - in - -- trace ("PreIARoot: " ++ (show nodeIndex) ++ " IAFinal: " ++ (show $ slimIAFinal newCharacter) ++ " Final: " ++ (show $ slimFinal newCharacter)) - preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList ++ zip childNodes parentNodeList) - - -- single child, take parent final assignments, but keep postorder assignments - else if length childNodes == 1 then - let newCharacter - | characterType `elem` [SlimSeq, NucSeq] = - inCharacter { slimFinal = slimFinal parentCharacter - , slimIAFinal = slimIAFinal parentCharacter - } - | characterType `elem` [WideSeq, AminoSeq] = - inCharacter { wideFinal = wideFinal parentCharacter - , wideIAFinal = wideIAFinal parentCharacter - } - | characterType == HugeSeq = - inCharacter { hugeFinal = hugeFinal parentCharacter - , hugeIAFinal = hugeIAFinal parentCharacter - } - | otherwise = error ("Unrecognized character type " ++ show characterType) - - newLabel = nodeLabel {vertData = V.singleton (V.singleton newCharacter)} - newGraph = LG.insEdges (inNodeEdges ++ outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph - parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) - in - -- trace ("PreIANet: " ++ (show nodeIndex) ++ " IAFinal: " ++ (show $ slimIAFinal newCharacter) ++ " Final: " ++ (show $ slimFinal newCharacter)) - preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList ++ zip childNodes parentNodeList) - - -- 2 children, make 3-way - else - let {- - childLabels = fmap snd childNodes - leftChar = V.head $ V.head $ vertData $ head childLabels - rightChar = V.head $ V.head $ vertData $ last childLabels - -} - finalCharacter = M.makeIAFinalCharacter finalMethod charInfo inCharacter parentCharacter -- leftChar rightChar - - newLabel = nodeLabel {vertData = V.singleton (V.singleton finalCharacter)} - newGraph = LG.insEdges (inNodeEdges ++ outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph - parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) - in - -- trace ("PreIATree: " ++ (show nodeIndex) ++ " IAFinal: " ++ (show $ slimIAFinal finalCharacter) ++ " Final: " ++ (show $ slimFinal finalCharacter)) - preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList ++ zip childNodes parentNodeList) - - -- ) - --- | doBlockTraversal' is a wrapper around doBlockTraversal fro seqParMap -doBlockTraversal' :: GlobalSettings -> AssignmentMethod -> Bool -> Int -> (V.Vector CharInfo, V.Vector DecoratedGraph) -> V.Vector DecoratedGraph -doBlockTraversal' inGS finalMethod staticIA rootIndex (inCharInfoV, traversalDecoratedVect) = - doBlockTraversal inGS finalMethod staticIA rootIndex inCharInfoV traversalDecoratedVect - --- | doBlockTraversal takes a block of postorder decorated character trees character info --- could be moved up preOrderTreeTraversal, but like this for legibility -doBlockTraversal :: GlobalSettings -> AssignmentMethod -> Bool -> Int -> V.Vector CharInfo -> V.Vector DecoratedGraph -> V.Vector DecoratedGraph -doBlockTraversal inGS finalMethod staticIA rootIndex inCharInfoV traversalDecoratedVect = - --trace ("BlockT:" ++ (show $ fmap charType inCharInfoV)) - V.zipWith (doCharacterTraversal inGS finalMethod staticIA rootIndex) inCharInfoV traversalDecoratedVect - --- | doCharacterTraversal performs preorder traversal on single character tree --- with single charInfo --- this so each character can be independently "rooted" for optimal traversals. -doCharacterTraversal :: GlobalSettings -> AssignmentMethod -> Bool -> Int -> CharInfo -> DecoratedGraph -> DecoratedGraph -doCharacterTraversal inGS finalMethod staticIA rootIndex inCharInfo inGraph = - -- find root--index should = number of leaves - --trace ("charT:" ++ (show $ charType inCharInfo)) ( - let -- this is a hack--remve after fixed - --inGraph = LG.removeDuplicateEdges inGraph' - - isolateNodeList = LG.getIsolatedNodes inGraph - -- (_, leafVertexList, _, _) = LG.splitVertexList inGraph - inEdgeList = LG.labEdges inGraph - in - -- remove these two lines if working - -- if rootIndex /= length leafVertexList then error ("Root index not = number leaves in doCharacterTraversal" ++ show (rootIndex, length leafVertexList)) - -- else - -- root vertex, repeat of label info to avoid problem with zero length zip later, second info ignored for root - -- since root cannot have 2nd parent - let rootLabel = fromJust $ LG.lab inGraph rootIndex - nothingVertData = U.copyToNothing (vertData rootLabel) - rootFinalVertData = createFinalAssignmentOverBlocks inGS finalMethod staticIA RootNode (vertData rootLabel) (vertData rootLabel) nothingVertData inCharInfo True False False - rootChildren =LG.labDescendants inGraph (rootIndex, rootLabel) - - -- left / right to match post-order - rootChildrenBV = fmap (bvLabel . snd) rootChildren - rootChildrenIsLeft - | length rootChildrenBV == 1 = [True] - | head rootChildrenBV > (rootChildrenBV !! 1) = [False, True] - | otherwise = [True, False] - newRootNode = (rootIndex, rootLabel {vertData = rootFinalVertData}) - rootChildrenPairs = zip3 rootChildren (replicate (length rootChildren) newRootNode) rootChildrenIsLeft - upDatedNodes = makeFinalAndChildren inGS finalMethod staticIA inGraph rootChildrenPairs [newRootNode] inCharInfo - - -- update isolated nodes with final == preliminary as with root nodes (and leaves, but without postorder logic) - updatedIsolateNodes = fmap (updateIsolatedNode inGS finalMethod staticIA inCharInfo) isolateNodeList - in - -- hope this is the most efficient way since all nodes have been remade - -- trace (U.prettyPrintVertexInfo $ snd newRootNode) - LG.mkGraph (upDatedNodes ++ updatedIsolateNodes) inEdgeList - --) - --- | updateIsolatedNode updates the final states of an isolated node as if it were a root with final=preliminary --- states without preorder logic as in regular leaves --- NB IA length won't match if compared since not in graph -updateIsolatedNode :: GlobalSettings -> AssignmentMethod -> Bool -> CharInfo -> LG.LNode VertexInfo -> LG.LNode VertexInfo -updateIsolatedNode inGS finalMethod staticIA inCharInfo (inNodeIndex, inNodeLabel) = - -- root so final = preliminary - let nothingVertData = U.copyToNothing (vertData inNodeLabel) - newVertData = createFinalAssignmentOverBlocks inGS finalMethod staticIA RootNode (vertData inNodeLabel) (vertData inNodeLabel) nothingVertData inCharInfo True False False - in - (inNodeIndex, inNodeLabel {vertData = newVertData}) - --- | makeFinalAndChildren takes a graph, list of pairs of (labelled nodes,parent node) to make final assignment and a list of updated nodes --- the input nodes are relabelled by preorder functions and added to the list of processed nodes and recursed to other nodes first, then --- their children -- thi sis important for preorder of hardwired graphs since can have 2 parents a single child. --- nodes are retuned in reverse order at they are made--need to check if this will affect graph identity or indexing in fgl -makeFinalAndChildren :: GlobalSettings - -> AssignmentMethod - -> Bool - -> DecoratedGraph - -> [(LG.LNode VertexInfo, LG.LNode VertexInfo, Bool)] - -> [LG.LNode VertexInfo] - -> CharInfo - -> [LG.LNode VertexInfo] -makeFinalAndChildren inGS finalMethod staticIA inGraph nodesToUpdate updatedNodes inCharInfo = - --trace ("mFAC:" ++ (show $ charType inCharInfo)) ( - if null nodesToUpdate then updatedNodes - else - let (firstNode, firstParent, isLeft) = head nodesToUpdate - - -- get current node data - firstLabel = snd firstNode - firstNodeType' = GO.getNodeType inGraph $ fst firstNode -- nodeType firstLabel - firstNodeType = if firstNodeType' /= NetworkNode then firstNodeType' - else - -- not issue if hardwired I don't think - if graphType inGS /= HardWired then trace ("NetNode:" ++ (show $ LG.getInOutDeg inGraph firstNode) ++ " DuplicateEdges (?): " ++ (show $ LG.getDuplicateEdges inGraph)) NetworkNode - else NetworkNode - firstVertData = vertData firstLabel - - -- get node parent data--check if more than one - firstParents = LG.labParents inGraph $ fst firstNode - - -- if single parent then as usual--else take head of two so no confusion as to whichn is which - -- this is holdover from no indegree 2 nodes--could be simplified and return structures changed - firstParentVertData = if (length firstParents == 1) then vertData $ snd firstParent - else vertData $ snd $ head firstParents - - secondParentData = if (length firstParents == 1) then U.copyToNothing firstParentVertData - else U.copyToJust $ vertData $ snd $ last firstParents - - -- child data - firstChildren = LG.labDescendants inGraph firstNode - - -- booleans for further pass - isIn1Out1 = (length firstChildren == 1) && (length firstParents == 1) -- softwired can happen, need to pass "grandparent" node to skip in 1 out 1 - isIn2Out1 = (length firstChildren == 1) && (length firstParents == 2) -- harwired can happen, need to pass both parents - - - -- this OK with one or two children - firstChildrenBV = fmap (bvLabel . snd) firstChildren - firstChildrenIsLeft - | length firstChildrenBV == 1 = [True] - | head firstChildrenBV > (firstChildrenBV !! 1) = [False, True] - | otherwise = [True, False] - firstFinalVertData = createFinalAssignmentOverBlocks inGS finalMethod staticIA firstNodeType firstVertData firstParentVertData secondParentData inCharInfo isLeft isIn1Out1 isIn2Out1 - newFirstNode = (fst firstNode, firstLabel {vertData = firstFinalVertData}) - - -- check children if indegree == 2 then don't add to nodes to do if in there already - - childrenTriple = zip3 firstChildren (replicate (length firstChildren) newFirstNode) firstChildrenIsLeft - childrenTriple' = if (graphType inGS) == HardWired then filter (indeg2NotInNodeList inGraph (tail nodesToUpdate)) childrenTriple - else childrenTriple - - in - -- trace (U.prettyPrintVertexInfo $ snd newFirstNode) - -- makeFinalAndChildren inGS finalMethod staticIA inGraph (childrenPairs ++ tail nodesToUpdate) (newFirstNode : updatedNodes) inCharInfo - -- childrenPair after nodess to do for hardWired to ensure both parent done before child - makeFinalAndChildren inGS finalMethod staticIA inGraph ((tail nodesToUpdate) ++ childrenTriple') (newFirstNode : updatedNodes) inCharInfo - --) - --- | indeg2NotInNodeList checcks a node agains a list by index (fst) if node is indegree 2 and --- already in the list of n odes "todo" filter out as will already be optimized in appropriate pre-order -indeg2NotInNodeList :: LG.Gr a b -> [(LG.LNode a, LG.LNode a, Bool)] -> (LG.LNode a, LG.LNode a, Bool) -> Bool -indeg2NotInNodeList inGraph checkNodeList (childNode@(childIndex, _), _, _) = - if LG.isEmpty inGraph then error "Empty graph in indeg2NotInNodeList" - else - if LG.indeg inGraph childNode < 2 then True - else if childIndex `elem` (fmap (fst . fst3) checkNodeList) then False - else True - - - --- | assignPreorderStatesAndEdges takes a postorder decorated graph (should be but not required) and propagates --- preorder character states from individual character trees. Exact characters (Add, nonAdd, matrix) postorder --- states should be based on the outgroup rooted tree. --- root should be median of finals of two descendets--for non-exact based on final 'alignments' field with gaps filtered --- postorder assignment and preorder will be out of whack--could change to update with correponding postorder --- but that would not allow use of base decorated graph for incremental optimization (which relies on postorder assignments) in other areas --- optyion code ikn there to set root final to outgropu final--but makes thigs scewey in matrix character and some pre-order assumptions -assignPreorderStatesAndEdges :: GlobalSettings -> AssignmentMethod -> Bool -> Int -> V.Vector (V.Vector DecoratedGraph) -> Bool -> V.Vector (V.Vector CharInfo) -> DecoratedGraph -> DecoratedGraph -assignPreorderStatesAndEdges inGS finalMethd calculateBranchEdges rootIndex preOrderBlockTreeVV useMap inCharInfoVV inGraph = - --trace ("aPSAE:" ++ (show $ fmap (fmap charType) inCharInfoVV)) ( - if LG.isEmpty inGraph then error "Empty graph in assignPreorderStatesAndEdges" - else - -- trace ("In assign") ( - let postOrderNodes = LG.labNodes inGraph - postOrderEdgeList = LG.labEdges inGraph - - -- update node labels - newNodeList = fmap (updateNodeWithPreorder preOrderBlockTreeVV inCharInfoVV) postOrderNodes - - -- create a vector of vector of pair of nodes and edges for display x charcater trees - blockTreePairVV = fmap (fmap LG.makeNodeEdgePairVect) preOrderBlockTreeVV - - -- update edge labels--for softwired need to account for not all edges in all block/display trees - -- map for case where tree does not contain all leaves as in swap procedures - -- needs to be updated for softwired as well - nodeMap = MAP.fromList $ zip (fmap fst newNodeList) newNodeList - newEdgeList = if (graphType inGS == Tree || graphType inGS == HardWired) then - if useMap then fmap (updateEdgeInfoTreeMap finalMethd inCharInfoVV nodeMap) postOrderEdgeList - else fmap (updateEdgeInfoTree finalMethd inCharInfoVV (V.fromList newNodeList)) postOrderEdgeList - else - fmap (updateEdgeInfoSoftWired finalMethd inCharInfoVV blockTreePairVV rootIndex) postOrderEdgeList - in - -- make new graph - -- LG.mkGraph newNodeList' newEdgeList - if calculateBranchEdges then LG.mkGraph newNodeList newEdgeList - else LG.mkGraph newNodeList postOrderEdgeList - --) - --- | updateNodeWithPreorder takes the preorder decorated graphs (by block and character) and updates the --- the preorder fields only using character info. This leaves post and preorder assignment out of sync. --- but that so can use incremental optimization on base decorated graph in other areas. -updateNodeWithPreorder :: V.Vector (V.Vector DecoratedGraph) -> V.Vector (V.Vector CharInfo) -> LG.LNode VertexInfo -> LG.LNode VertexInfo -updateNodeWithPreorder preOrderBlockTreeVV inCharInfoVV postOrderNode = - let nodeLabel = snd postOrderNode - nodeVertData = vertData nodeLabel - newNodeVertData = V.zipWith3 (updateVertexBlock (fst postOrderNode)) preOrderBlockTreeVV nodeVertData inCharInfoVV - in - (fst postOrderNode, nodeLabel {vertData = newNodeVertData}) - --- | updateVertexBlock takes a block of vertex data and updates preorder states of charactes via fmap -updateVertexBlock :: Int -> V.Vector DecoratedGraph -> V.Vector CharacterData -> V.Vector CharInfo -> V.Vector CharacterData -updateVertexBlock nodeIndex = V.zipWith3 (updatePreorderCharacter nodeIndex) - --- | updatePreorderCharacter updates the pre-order fields of character data for a vertex from a traversal --- since there is single character optimized for each character decorated graph-- it is always teh 0th 0th character --- exact are vectors so take care of multiple there. --- need to care for issues of missing data -updatePreorderCharacter :: Int -> DecoratedGraph -> CharacterData -> CharInfo -> CharacterData -updatePreorderCharacter nodeIndex preOrderTree postOrderCharacter charInfo = - --trace ("N:" ++ (show nodeIndex) ++ " B:" ++ (show blockIndex) ++ " C:" ++ (show characterIndex) ++ "\n" ++ (show $ vertData $ fromJust $ LG.lab preOrderTree nodeIndex)) ( - let maybePreOrderNodeLabel = LG.lab preOrderTree nodeIndex - preOrderVertData = vertData $ fromJust maybePreOrderNodeLabel - preOrderCharacterData - | V.null preOrderVertData = emptyCharacter - | V.null $ V.head preOrderVertData = emptyCharacter - | otherwise = V.head $ V.head preOrderVertData -- (preOrderVertData V.! 0) V.! 0 - - in - -- this can heppen in naked parent node of prunned subGraph in branch swapping - if isNothing maybePreOrderNodeLabel then emptyCharacter - -- error ("Nothing node label in updatePreorderCharacter node: " ++ show nodeIndex) - else - updateCharacter postOrderCharacter preOrderCharacterData (charType charInfo) - --) - --- | updateCharacter takes a postorder character and updates the preorder (final) fields with preorder data and character type --- only updating preorder assignment--except for root, that is needed to draw final state for branch lengths -updateCharacter :: CharacterData -> CharacterData -> CharType -> CharacterData -updateCharacter postOrderCharacter preOrderCharacter localCharType - | localCharType == Add = - postOrderCharacter { rangeFinal = rangeFinal preOrderCharacter } - - | localCharType == NonAdd = - postOrderCharacter { stateBVFinal = stateBVFinal preOrderCharacter } - - | localCharType `elem` packedNonAddTypes = - postOrderCharacter { packedNonAddFinal = packedNonAddFinal preOrderCharacter } - - | localCharType == Matrix = - postOrderCharacter { matrixStatesFinal = matrixStatesFinal preOrderCharacter } - - | localCharType == AlignedSlim = - postOrderCharacter { alignedSlimPrelim = alignedSlimPrelim preOrderCharacter - , alignedSlimFinal = alignedSlimFinal preOrderCharacter - } - - | localCharType == AlignedWide = - postOrderCharacter { alignedWidePrelim = alignedWidePrelim preOrderCharacter - , alignedWideFinal = alignedWideFinal preOrderCharacter - } - - | localCharType == AlignedHuge = - postOrderCharacter { alignedHugePrelim = alignedHugePrelim preOrderCharacter - , alignedHugeFinal = alignedHugeFinal preOrderCharacter - } - - | localCharType == SlimSeq || localCharType == NucSeq = - postOrderCharacter { slimAlignment = slimAlignment preOrderCharacter - , slimFinal = slimFinal preOrderCharacter - , slimIAFinal = slimIAFinal preOrderCharacter - } - - | localCharType == WideSeq || localCharType == AminoSeq = - postOrderCharacter { wideAlignment = wideAlignment preOrderCharacter - , wideFinal = wideFinal preOrderCharacter - , wideIAFinal = wideIAFinal preOrderCharacter - } - - | localCharType == HugeSeq = - postOrderCharacter { hugeAlignment = hugeAlignment preOrderCharacter - , hugeFinal = hugeFinal preOrderCharacter - , hugeIAFinal = hugeIAFinal preOrderCharacter - } - - | otherwise = error ("Character type unimplemented : " ++ show localCharType) - --- | updateEdgeInfoSoftWired gets edge weights via block trees as opposed to canonical graph --- this because not all edges present in all block/display trees -updateEdgeInfoSoftWired :: AssignmentMethod - -> V.Vector (V.Vector CharInfo) - -> V.Vector (V.Vector (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo))) - -> Int - -> LG.LEdge EdgeInfo - -> LG.LEdge EdgeInfo -updateEdgeInfoSoftWired finalMethod inCharInfoVV blockTreePairVV rootIndex (uNode, vNode, edgeLabel) = - if V.null blockTreePairVV then error "Empty node-edge pair vector in updateEdgeInfoSoftWired" - else - let (minWList, maxWList) = V.unzip $ V.zipWith (getEdgeBlockWeightSoftWired finalMethod uNode vNode rootIndex) inCharInfoVV blockTreePairVV - localEdgeType = edgeType edgeLabel - newEdgeLabel = EdgeInfo { minLength = V.sum minWList - , maxLength = V.sum maxWList - , midRangeLength = (V.sum minWList + V.sum maxWList) / 2.0 - , edgeType = localEdgeType - } - in - (uNode, vNode, newEdgeLabel) - - --- | getEdgeBlockWeightSoftWired takes a block of charcter trees and maps character distances if edge exists in block tree -getEdgeBlockWeightSoftWired :: AssignmentMethod - -> Int - -> Int - -> Int - -> V.Vector CharInfo - -> V.Vector (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo)) - -> (VertexCost, VertexCost) -getEdgeBlockWeightSoftWired finalMethod uNode vNode rootIndex inCharInfoV blockTreePairV = - let (minWList, maxWList) = V.unzip $ V.zipWith (getEdgeCharacterWeightSoftWired finalMethod uNode vNode rootIndex) inCharInfoV blockTreePairV - in (V.sum minWList, V.sum maxWList) - --- | getEdgeCharacterWeightSoftWired gets the edge weight for an individual character --- matches edge in either direction --- need examine edge root as two edges from rootIndex -getEdgeCharacterWeightSoftWired :: AssignmentMethod - -> Int - -> Int - -> Int - -> CharInfo - -> (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo)) - -> (VertexCost, VertexCost) -getEdgeCharacterWeightSoftWired finalMethod uNode vNode rootIndex inCharInfo (nodeVect, edgeVect) = - let foundVertexPair = getEdgeVerts uNode vNode rootIndex nodeVect edgeVect - (uLabel, vLabel) = fromJust foundVertexPair - uCharacter = V.head $ V.head $ vertData uLabel - vCharacter = V.head $ V.head $ vertData vLabel - in - -- if edge not present and not around root then return no costs - if foundVertexPair == Nothing then (0,0) - else getCharacterDistFinal finalMethod uCharacter vCharacter inCharInfo - - --- | getEdgeVerts returns vertex labels if edge in vect or if a virtual edge including root -getEdgeVerts :: Int -> Int -> Int -> V.Vector (LG.LNode VertexInfo) -> V.Vector (LG.LEdge EdgeInfo) -> Maybe (VertexInfo, VertexInfo) -getEdgeVerts uNode vNode rootIndex nodeVect edgeVect = - -- trace ("GEV:" ++ (show (uNode, vNode, rootIndex) ++ " nodes " ++ (show $ fmap fst nodeVect) ++ " edges " ++ (show $ fmap LG.toEdge edgeVect))) ( - - --hack or display edge check I'm not sure--not all edges are in all display trees - if (uNode >= V.length nodeVect) || (vNode >= V.length nodeVect) then Nothing - - else if edgeInVect (uNode, vNode) edgeVect then Just (snd $ nodeVect V.! uNode, snd $ nodeVect V.! vNode) - else if (edgeInVect (rootIndex, uNode) edgeVect) && (edgeInVect (rootIndex, vNode) edgeVect) then Just (snd $ nodeVect V.! uNode, snd $ nodeVect V.! vNode) - else Nothing - -- ) - --- | edgeInVect takes an edges and returns True if in Vector, False otherwise -edgeInVect :: (Int , Int) -> V.Vector (LG.LEdge EdgeInfo) -> Bool -edgeInVect (u, v) edgeVect = - if V.null edgeVect then False - else - let (a, b, _) = V.head edgeVect - in - if (u, v) == (a, b) then True - else if (v, u) == (a, b) then True - else edgeInVect (u, v) (V.tail edgeVect) - --- | updateEdgeInfoTree takes a Decorated graph--fully labelled post and preorder and and edge and --- gets edge info--basically lengths --- this for a tree in that all edges are present in all character/block trees -updateEdgeInfoTree :: AssignmentMethod -> V.Vector (V.Vector CharInfo) -> V.Vector (LG.LNode VertexInfo) -> LG.LEdge EdgeInfo -> LG.LEdge EdgeInfo -updateEdgeInfoTree finalMethod inCharInfoVV nodeVector (uNode, vNode, edgeLabel) = - if V.null nodeVector then error "Empty node list in updateEdgeInfo" - else - let (minW, maxW) = getEdgeWeight finalMethod inCharInfoVV nodeVector (uNode, vNode) - midW = (minW + maxW) / 2.0 - localEdgeType = edgeType edgeLabel - newEdgeLabel = EdgeInfo { minLength = minW - , maxLength = maxW - , midRangeLength = midW - , edgeType = localEdgeType - } - in - (uNode, vNode, newEdgeLabel) - - --- | updateEdgeInfoTreeMap takes a Decorated graph--fully labelled post and preorder and and edge and --- gets edge info--basically lengths --- this for a tree in that all edges are present in all character/block trees --- uses MAP as opposed to index -updateEdgeInfoTreeMap :: AssignmentMethod -> V.Vector (V.Vector CharInfo) -> MAP.Map Int (LG.LNode VertexInfo) -> LG.LEdge EdgeInfo -> LG.LEdge EdgeInfo -updateEdgeInfoTreeMap finalMethod inCharInfoVV nodeMap (uNode, vNode, edgeLabel) = - if MAP.null nodeMap then error "Empty node MAP in updateEdgeInfo" - else - let (minW, maxW) = getEdgeWeightMap finalMethod inCharInfoVV nodeMap (uNode, vNode) - midW = (minW + maxW) / 2.0 - localEdgeType = edgeType edgeLabel - newEdgeLabel = EdgeInfo { minLength = minW - , maxLength = maxW - , midRangeLength = midW - , edgeType = localEdgeType - } - in - (uNode, vNode, newEdgeLabel) - --- | getEdgeWeight takes a preorder decorated decorated graph and an edge and gets the weight information for that edge --- basically a min/max distance between the two --- the indexing depends on the graph having all leaves in the graph which may not happen --- during graph swapping -getEdgeWeight :: AssignmentMethod -> V.Vector (V.Vector CharInfo) -> V.Vector (LG.LNode VertexInfo) -> (Int, Int) -> (VertexCost, VertexCost) -getEdgeWeight finalMethod inCharInfoVV nodeVector (uNode, vNode) = - if V.null nodeVector then error "Empty node list in getEdgeWeight" - else - -- trace ("GEW: " ++ (show $ fmap fst nodeVector) ++ " " ++ (show (uNode, vNode))) ( - let uNodeInfo = vertData $ snd $ nodeVector V.! uNode - vNodeInfo = vertData $ snd $ nodeVector V.! vNode - blockCostPairs = V.zipWith3 (getBlockCostPairsFinal finalMethod) uNodeInfo vNodeInfo inCharInfoVV - minCost = sum $ fmap fst blockCostPairs - maxCost = sum $ fmap snd blockCostPairs - in - - (minCost, maxCost) - -- ) - --- | getEdgeWeightMap takes a preorder decorated decorated graph and an edge and gets the weight information for that edge --- basically a min/max distance between the two --- in this case based on map or vertices rather than direct indexing. --- the indexing depends on the graph having all leaves in the graph which may not happen --- during graph swapping -getEdgeWeightMap :: AssignmentMethod -> V.Vector (V.Vector CharInfo) -> MAP.Map Int (LG.LNode VertexInfo) -> (Int, Int) -> (VertexCost, VertexCost) -getEdgeWeightMap finalMethod inCharInfoVV nodeMap (uNode, vNode) = - if MAP.null nodeMap then error "Empty node map in getEdgeWeight" - else - -- trace ("GEWM: " ++ (show $ MAP.toList nodeMap) ++ " " ++ (show (uNode, vNode))) ( - let uNodeInfo = vertData $ snd $ nodeMap MAP.! uNode - vNodeInfo = vertData $ snd $ nodeMap MAP.! vNode - blockCostPairs = V.zipWith3 (getBlockCostPairsFinal finalMethod) uNodeInfo vNodeInfo inCharInfoVV - minCost = sum $ fmap fst blockCostPairs - maxCost = sum $ fmap snd blockCostPairs - in - - (minCost, maxCost) - -- ) - --- | getBlockCostPairsFinal takes a block of two nodes and character infomation and returns the min and max block branch costs -getBlockCostPairsFinal :: AssignmentMethod -> V.Vector CharacterData -> V.Vector CharacterData -> V.Vector CharInfo -> (VertexCost, VertexCost) -getBlockCostPairsFinal finalMethod uNodeCharDataV vNodeCharDataV charInfoV = - let characterCostPairs = V.zipWith3 (getCharacterDistFinal finalMethod) uNodeCharDataV vNodeCharDataV charInfoV - minCost = sum $ fmap fst characterCostPairs - maxCost = sum $ fmap snd characterCostPairs - in - (minCost, maxCost) - --- | getCharacterDistFinal takes a pair of characters and character type, returning the minimum and maximum character distances --- for sequence characters this is based on slim/wide/hugeAlignment field, hence all should be O(n) in num characters/sequence length -getCharacterDistFinal :: AssignmentMethod -> CharacterData -> CharacterData -> CharInfo -> (VertexCost, VertexCost) -getCharacterDistFinal finalMethod uCharacter vCharacter charInfo = - let thisWeight = weight charInfo - thisMatrix = costMatrix charInfo - thisCharType = charType charInfo - lChangeCost = changeCost charInfo - lNoChangeCost = noChangeCost charInfo - in - -- no nded to do nochange/change--all recoded in that case - if thisCharType == Add then - let --minCost = localCost (M.intervalAdd thisWeight uCharacter vCharacter) - (minDiffV, maxDiffV) = V.unzip $ V.zipWith maxMinIntervalDiff (rangeFinal uCharacter) (rangeFinal vCharacter) - - minCost = thisWeight * (fromIntegral $ V.sum minDiffV) - maxCost = thisWeight * (fromIntegral $ V.sum maxDiffV) - in - (minCost, maxCost) - - -- assumes noChangeCost < changeCost for PMDL/ML - else if thisCharType == NonAdd then - let -- minCost = localCost (M.interUnion thisWeight uCharacter vCharacter) - minDiff = length $ V.filter (==False) $ V.zipWith hasBVIntersection (stateBVFinal uCharacter) (stateBVFinal vCharacter) - maxDiff = length $ V.filter (==False) $ V.zipWith equalAndSingleState (stateBVFinal uCharacter) (stateBVFinal vCharacter) - maxCost = thisWeight * fromIntegral maxDiff - minCost = thisWeight * fromIntegral minDiff - minNoChange = (length (stateBVFinal uCharacter)) - minDiff - maxNoChange = (length (stateBVFinal uCharacter)) - maxDiff - minCost' = thisWeight * ((lNoChangeCost * (fromIntegral minNoChange)) + (lChangeCost * (fromIntegral minDiff))) - maxCost' = thisWeight * ((lNoChangeCost * (fromIntegral maxNoChange)) + (lChangeCost * (fromIntegral maxDiff))) - - in - if lNoChangeCost == 0.0 then (minCost, maxCost) - else (minCost', maxCost') - - else if thisCharType `elem` packedNonAddTypes then - let -- minCost = localCost (BP.median2Packed thisCharType uCharacter vCharacter) - (minDiffV, maxDiffV) = UV.unzip $ UV.zipWith (BP.minMaxCharDiff thisCharType (lNoChangeCost, lChangeCost)) (packedNonAddFinal uCharacter) (packedNonAddFinal vCharacter) - maxCost = thisWeight * (UV.sum maxDiffV) - minCost = thisWeight * (UV.sum minDiffV) - in - (minCost, maxCost) - - else if thisCharType == Matrix then - let minMaxListList= V.zipWith (minMaxMatrixDiff thisMatrix) (fmap getLowestCostMatrixStates (matrixStatesFinal uCharacter)) (fmap getLowestCostMatrixStates (matrixStatesFinal vCharacter)) - minDiff = V.sum $ fmap fst minMaxListList - maxDiff = V.sum $ fmap snd minMaxListList - minCost = thisWeight * fromIntegral minDiff - maxCost = thisWeight * fromIntegral maxDiff - in - (minCost, maxCost) - - else if thisCharType `elem` prealignedCharacterTypes then - let - (minDiff, maxDiff) = unzip $ zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList $ alignedSlimFinal uCharacter) (GV.toList $ alignedSlimFinal vCharacter) - minCost = thisWeight * fromIntegral (sum minDiff) - maxCost = thisWeight * fromIntegral (sum maxDiff) - in - (minCost, maxCost) - - else if thisCharType == SlimSeq || thisCharType == NucSeq then - let minMaxDiffList = if finalMethod == DirectOptimization then - let uFinal = M.makeDynamicCharacterFromSingleVector (slimFinal uCharacter) - vFinal = M.makeDynamicCharacterFromSingleVector (slimFinal vCharacter) - newEdgeCharacter = M.getDOMedianCharInfo charInfo (uCharacter {slimGapped = uFinal}) (vCharacter {slimGapped = vFinal}) - (newU, _, newV) = slimGapped newEdgeCharacter - in - --trace ("GCD:\n" ++ (show (slimFinal uCharacter, newU)) ++ "\n" ++ (show (slimFinal vCharacter, newV)) ++ "\nDO Cost:" ++ (show doCOST)) - zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) - else zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList $ slimIAFinal uCharacter) (GV.toList $ slimIAFinal vCharacter) - (minDiff, maxDiff) = unzip minMaxDiffList - minCost = thisWeight * fromIntegral (sum minDiff) - maxCost = thisWeight * fromIntegral (sum maxDiff) - in - --trace ("MMDL: " ++ (show minCost) ++ " " ++ (show maxCost)) - (minCost, maxCost) - - - else if thisCharType == WideSeq || thisCharType == AminoSeq then - let minMaxDiffList = if finalMethod == DirectOptimization then - let uFinal = M.makeDynamicCharacterFromSingleVector (wideFinal uCharacter) - vFinal = M.makeDynamicCharacterFromSingleVector (wideFinal vCharacter) - newEdgeCharacter = M.getDOMedianCharInfo charInfo (uCharacter {wideGapped = uFinal}) (vCharacter {wideGapped = vFinal}) - (newU, _, newV) = wideGapped newEdgeCharacter - in - --trace ("GCD:\n" ++ (show m) ++ "\n" ++ (show (uFinal, newU)) ++ "\n" ++ (show (vFinal, newV))) - zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) - else GV.toList $ GV.zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (wideIAFinal uCharacter) (wideIAFinal vCharacter) - (minDiff, maxDiff) = unzip minMaxDiffList - minCost = thisWeight * fromIntegral (sum minDiff) - maxCost = thisWeight * fromIntegral (sum maxDiff) - in - (minCost, maxCost) - - else if thisCharType == HugeSeq then - let minMaxDiffList = if finalMethod == DirectOptimization then - let uFinal = M.makeDynamicCharacterFromSingleVector (hugeFinal uCharacter) - vFinal = M.makeDynamicCharacterFromSingleVector (hugeFinal vCharacter) - newEdgeCharacter = M.getDOMedianCharInfo charInfo (uCharacter {hugeGapped = uFinal}) (vCharacter {hugeGapped = vFinal}) - (newU, _, newV) = hugeGapped newEdgeCharacter - in - -- trace ("GCD:\n" ++ (show (uFinal, newU)) ++ "\n" ++ (show (vFinal, newV))) - zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) - else GV.toList $ GV.zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (hugeIAFinal uCharacter) (hugeIAFinal vCharacter) - (minDiff, maxDiff) = unzip minMaxDiffList - minCost = thisWeight * fromIntegral (sum minDiff) - maxCost = thisWeight * fromIntegral (sum maxDiff) - in - (minCost, maxCost) - - else error ("Character type not recognized/unimplemented : " ++ show thisCharType) - where hasBVIntersection a b = (not . BV.isZeroVector) (a .&. b) - equalAndSingleState a b = if (a == b) && (popCount a == 1) then True else False - --- | zero2Gap converts a '0' or no bits set to gap (indel) value -zero2Gap :: (FiniteBits a) => a -> a -zero2Gap inVal = if popCount inVal == 0 then bit gapIndex - else inVal - -{- --- | zero2GapWide converts a '0' or no bits set to gap (indel) value -zero2GapWide :: Word64 -> Word64 -> Word64 -zero2GapWide gapChar inVal = if popCount inVal == 0 then bit gapIndex - else inVal - --- | zero2GapBV converts a '0' or no bits set to gap (indel) value -zero2GapBV :: BV.BitVector -> BV.BitVector -> BV.BitVector -zero2GapBV gapChar inVal = if popCount inVal == 0 then bit gapIndex - else inVal --} - --- | maxIntervalDiff takes two ranges and gets the maximum difference between the two based on differences --- in upp and lower ranges. -maxMinIntervalDiff :: (Int, Int)-> (Int, Int) -> (Int, Int) -maxMinIntervalDiff (a,b) (x,y) = - let upper = max b y - min b y - lower = max a x - min a x - in - (min upper lower, max upper lower) - --- | getLowestCostMatrixStates takes a Vector Triple for matrix charxcter and returns lowest cost states as vector --- of Ints -getLowestCostMatrixStates :: V.Vector MatrixTriple -> V.Vector Int -getLowestCostMatrixStates tripleVect = - if V.null tripleVect then V.empty - else - let minCost = minimum $ fmap fst3 tripleVect - stateCostPairList = V.zip (V.fromList [0..(V.length tripleVect - 1)]) (fmap fst3 tripleVect) - (minStateVect, _) = V.unzip $ V.filter ((== minCost) . snd) stateCostPairList - in - minStateVect - - --- | minMaxMatrixDiff takes twovetors of states and calculates the minimum and maximum state differnce cost --- between the two -minMaxMatrixDiff :: S.Matrix Int -> V.Vector Int -> V.Vector Int -> (Int, Int) -minMaxMatrixDiff localCostMatrix uStatesV vStatesV = - let statePairs = (V.toList uStatesV, V.toList vStatesV) - cartesianPairs = cartProdPair statePairs - costList = fmap (localCostMatrix S.!) cartesianPairs - in - {-THis ti check for errors - if (not . null) costList then (minimum costList, maximum costList) - else (-1, -1) - -} - -- trace ("MMD: " ++ (show (statePairs,cartesianPairs))) - (minimum costList, maximum costList) - - --- | createFinalAssignment takes vertex data (child or current vertex) and creates the final --- assignment from parent (if not root or leaf) and 'child' ie current vertex --- if root or leaf preliminary is assigned to final - -- need to watch zipping for missing sequence data --- this creates the IA during preorder from which final assignments are contructed --- via addition post and preorder passes on IA fields. -createFinalAssignmentOverBlocks :: GlobalSettings - -> AssignmentMethod - -> Bool - -> NodeType - -> VertexBlockData - -> VertexBlockData - -> VertexBlockDataMaybe -- second parent if indegree 2 node - -> CharInfo - -> Bool - -> Bool - -> Bool - -> VertexBlockData -createFinalAssignmentOverBlocks inGS finalMethod staticIA childType childBlockData parentBlockData parent2BlockDataM charInfo isLeft isInOutDegree1 isIn2Out1 = - -- if root or leaf final assignment <- preliminary asssignment - V.zipWith3 (assignFinal inGS finalMethod staticIA childType isLeft charInfo isInOutDegree1 isIn2Out1) childBlockData parentBlockData parent2BlockDataM - - --- | assignFinal takes a vertex type and single block of zip3 of child info, parent info, and character type --- to create pre-order assignments -assignFinal :: GlobalSettings - -> AssignmentMethod - -> Bool - -> NodeType - -> Bool - -> CharInfo - -> Bool - -> Bool - -> V.Vector CharacterData - -> V.Vector CharacterData - -> V.Vector (Maybe CharacterData) - -> V.Vector CharacterData -assignFinal inGS finalMethod staticIA childType isLeft charInfo isOutDegree1 isIn2Out1 = V.zipWith3 (setFinal inGS finalMethod staticIA childType isLeft charInfo isOutDegree1 isIn2Out1) - --- | setFinal takes a vertex type and single character of zip3 of child info, parent info, and character type --- to create pre-order assignments - -- | setFinalHTU takes a single character and its parent and sets the final state to prelim based --- on character info. --- non exact charcaters are vectors of characters of same type --- this does the same things for sequence types, but also --- performs preorder logic for exact characters --- staticIA flage is for IA and static only optimization used in IA heuriastics for DO --- no IA for networks--at least for now.Bool -> -setFinal :: GlobalSettings -> AssignmentMethod -> Bool -> NodeType -> Bool -> CharInfo -> Bool -> Bool -> CharacterData -> CharacterData -> Maybe CharacterData -> CharacterData -setFinal inGS finalMethod staticIA childType isLeft charInfo isIn1Out1 isIn2Out1 childChar parentChar parent2CharM = - let localCharType = charType charInfo - symbolCount = toEnum $ length $ costMatrix charInfo :: Int - isTree = (graphType inGS) == Tree - in - -- Three cases, Root, leaf, HTU - -- trace ("set final:" ++ (show (finalMethod, staticIA)) ++ " " ++ (show childType) ++ " " ++ (show isLeft) ++ " " ++ (show isIn1Out1) ++ " " ++ (show isIn2Out1)) ( - if childType == RootNode then - - if localCharType == Add then - childChar {rangeFinal = snd3 $ rangePrelim childChar} - - else if localCharType == NonAdd then - childChar {stateBVFinal = snd3 $ stateBVPrelim childChar} - - else if localCharType `elem` packedNonAddTypes then - childChar {packedNonAddFinal = snd3 $ packedNonAddPrelim childChar} - - else if localCharType == Matrix then - childChar {matrixStatesFinal = setMinCostStatesMatrix (fromEnum symbolCount) (matrixStatesPrelim childChar)} - - else if localCharType == AlignedSlim then - childChar {alignedSlimFinal = snd3 $ alignedSlimPrelim childChar} - - else if localCharType == AlignedWide then - childChar {alignedWideFinal = snd3 $ alignedWidePrelim childChar} - - else if localCharType == AlignedHuge then - childChar {alignedHugeFinal = snd3 $ alignedHugePrelim childChar} - - -- need to set both final and alignment for sequence characters - else if (localCharType == SlimSeq) || (localCharType == NucSeq) then - let finalAssignment' = extractMedians $ slimGapped childChar - in - -- trace ("TNFinal-Root: " ++ (show finalAssignment') ++ " " ++ (show $ slimGapped childChar)) ( - if staticIA then childChar {slimIAFinal = extractMediansGapped $ slimIAPrelim childChar} - else childChar { slimFinal = finalAssignment' - , slimAlignment = if isTree then slimGapped childChar - else mempty - } - -- ) - - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then - let finalAssignment' = extractMedians $ wideGapped childChar - in - if staticIA then childChar {wideIAFinal = extractMediansGapped $ wideIAPrelim childChar} - else childChar { wideFinal = finalAssignment' - , wideAlignment = if isTree then wideGapped childChar - else mempty - } - - else if localCharType == HugeSeq then - let finalAssignment' = extractMedians $ hugeGapped childChar - in - if staticIA then childChar {hugeIAFinal = extractMediansGapped $ hugeIAPrelim childChar} - else childChar { hugeFinal = finalAssignment' - , hugeAlignment = if isTree then hugeGapped childChar - else mempty - } - - else error ("Unrecognized/implemented character type: " ++ show localCharType) - - else if childType == LeafNode then - -- since leaf no neeed to precess final alignment fields for sequence characters - if localCharType == Add then - childChar {rangeFinal = snd3 $ rangePrelim childChar} - - else if localCharType == NonAdd then - childChar {stateBVFinal = snd3 $ stateBVPrelim childChar} - - else if localCharType `elem` packedNonAddTypes then - childChar {packedNonAddFinal = snd3 $ packedNonAddPrelim childChar} - - else if localCharType == Matrix then - childChar {matrixStatesFinal = setMinCostStatesMatrix (fromEnum symbolCount) (matrixStatesPrelim childChar)} - - else if localCharType == AlignedSlim then - childChar {alignedSlimFinal = snd3 $ alignedSlimPrelim childChar} - - else if localCharType == AlignedWide then - childChar {alignedWideFinal = snd3 $ alignedWidePrelim childChar} - - else if localCharType == AlignedHuge then - childChar {alignedHugeFinal = snd3 $ alignedHugePrelim childChar} - - -- need to set both final and alignment for sequence characters - else if (localCharType == SlimSeq) || (localCharType == NucSeq) then - -- trace ("TNFinal-Leaf:" ++ (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) ( - let finalAlignment = DOP.preOrderLogic isLeft (slimAlignment parentChar) (slimGapped parentChar) (slimGapped childChar) - -- finalAssignment' = extractMedians finalAlignment - in - if staticIA then childChar {slimIAFinal = extractMediansGapped $ slimIAPrelim childChar} - else childChar { slimFinal = extractMedians $ slimGapped childChar -- finalAssignment' - , slimAlignment = if isTree then finalAlignment - else mempty - , slimIAPrelim = if isTree then finalAlignment - else mempty - , slimIAFinal = if isTree then extractMediansGapped $ finalAlignment - else mempty - } - -- ) - - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then - let finalAlignment = DOP.preOrderLogic isLeft (wideAlignment parentChar) (wideGapped parentChar) (wideGapped childChar) - -- finalAssignment' = extractMedians finalAlignment - in - if staticIA then childChar {wideIAFinal = extractMediansGapped $ wideIAPrelim childChar} - else childChar { wideFinal = extractMedians $ wideGapped childChar -- finalAssignment' - , wideAlignment = if isTree then finalAlignment - else mempty - , wideIAPrelim = if isTree then finalAlignment - else mempty - , wideIAFinal = if isTree then extractMediansGapped $ finalAlignment - else mempty - } - - else if localCharType == HugeSeq then - let finalAlignment = DOP.preOrderLogic isLeft (hugeAlignment parentChar) (hugeGapped parentChar) (hugeGapped childChar) - -- finalAssignment' = extractMedians finalAlignment - in - if staticIA then childChar {hugeIAFinal = extractMediansGapped $ hugeIAPrelim childChar} - else childChar { hugeFinal = extractMedians $ hugeGapped childChar -- finalAssignment' - , hugeAlignment = if isTree then finalAlignment - else mempty - , hugeIAPrelim = if isTree then finalAlignment - else mempty - , hugeIAFinal = if isTree then extractMediansGapped $ finalAlignment - else mempty - } - - else error ("Unrecognized/implemented character type: " ++ show localCharType) - - else if childType == TreeNode && not isIn1Out1 then - - if localCharType == Add then - -- add logic for pre-order - let finalAssignment' = additivePreorder (rangePrelim childChar) (rangeFinal parentChar) - in - childChar {rangeFinal = finalAssignment'} - - else if localCharType == NonAdd then - -- add logic for pre-order - let finalAssignment' = nonAdditivePreorder (stateBVPrelim childChar) (stateBVFinal parentChar) - in - childChar {stateBVFinal = finalAssignment'} - - else if localCharType `elem` packedNonAddTypes then - let finalAssignment' = BP.packedPreorder localCharType (packedNonAddPrelim childChar) (packedNonAddFinal parentChar) - in - childChar {packedNonAddFinal = finalAssignment'} - - else if localCharType == Matrix then - -- add logic for pre-order - let finalAssignment' = matrixPreorder isLeft (matrixStatesPrelim childChar) (matrixStatesFinal parentChar) - in - childChar {matrixStatesFinal = finalAssignment'} - - else if localCharType == AlignedSlim then - let alignedFinal = M.getFinal3WaySlim (slimTCM charInfo) (alignedSlimFinal parentChar) (fst3 $ alignedSlimPrelim childChar) (thd3 $ alignedSlimPrelim childChar) - in - childChar {alignedSlimFinal = alignedFinal} - - else if localCharType == AlignedWide then - let alignedFinal = M.getFinal3WayWideHuge (wideTCM charInfo) (alignedWideFinal parentChar) (fst3 $ alignedWidePrelim childChar) (thd3 $ alignedWidePrelim childChar) - in - childChar {alignedWideFinal = alignedFinal} - - else if localCharType == AlignedHuge then - let alignedFinal = M.getFinal3WayWideHuge (hugeTCM charInfo) (alignedHugeFinal parentChar) (fst3 $ alignedHugePrelim childChar) (thd3 $ alignedHugePrelim childChar) - in - childChar {alignedHugeFinal = alignedFinal} - - -- need to set both final and alignment for sequence characters - else if (localCharType == SlimSeq) || (localCharType == NucSeq) then - -- trace ("TNFinal-Tree:" ++ (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) ( - let finalGapped = DOP.preOrderLogic isLeft (slimAlignment parentChar) (slimGapped parentChar) (slimGapped childChar) - finalAssignmentDO = if finalMethod == DirectOptimization then - let parentFinalDC = M.makeDynamicCharacterFromSingleVector (slimFinal parentChar) - parentFinal = (parentFinalDC, mempty, mempty) - -- parentGapped = (slimGapped parentChar, mempty, mempty) - childGapped = (slimGapped childChar, mempty, mempty) - finalAssignmentDOGapped = fst3 $ getDOFinal charInfo parentFinal childGapped - in - extractMedians finalAssignmentDOGapped - -- really could/should be mempty since overwritten by IA later - else extractMedians finalGapped - in - if staticIA then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar - else childChar { slimFinal = GV.filter (/= 0) finalAssignmentDO - , slimAlignment = if isTree then finalGapped - else mempty - } - -- ) - - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then - let finalGapped = DOP.preOrderLogic isLeft (wideAlignment parentChar) (wideGapped parentChar) (wideGapped childChar) - finalAssignmentDO = if finalMethod == DirectOptimization then - let parentFinalDC = M.makeDynamicCharacterFromSingleVector (wideFinal parentChar) - parentFinal = (mempty, parentFinalDC, mempty) - --parentGapped = (mempty, wideGapped parentChar, mempty) - childGapped = (mempty, wideGapped childChar, mempty) - finalAssignmentDOGapped = snd3 $ getDOFinal charInfo parentFinal childGapped - in - extractMedians finalAssignmentDOGapped - else extractMedians finalGapped - in - if staticIA then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar - else childChar { wideFinal = GV.filter (/= 0) finalAssignmentDO - , wideAlignment = if isTree then finalGapped - else mempty - } - - else if localCharType == HugeSeq then - let finalGapped = DOP.preOrderLogic isLeft (hugeAlignment parentChar) (hugeGapped parentChar) (hugeGapped childChar) - finalAssignmentDO = if finalMethod == DirectOptimization then - let parentFinalDC = M.makeDynamicCharacterFromSingleVector (hugeFinal parentChar) - parentFinal = (mempty, mempty, parentFinalDC) - -- parentGapped = (mempty, mempty, hugeGapped parentChar) - childGapped = (mempty, mempty, hugeGapped childChar) - finalAssignmentDOGapped = thd3 $ getDOFinal charInfo parentFinal childGapped - in - extractMedians finalAssignmentDOGapped - else extractMedians finalGapped - in - if staticIA then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar - else childChar { hugeFinal = GV.filter (not . BV.isZeroVector) finalAssignmentDO - , hugeAlignment = if isTree then finalGapped - else mempty - } - - else error ("Unrecognized/implemented character type: " ++ show localCharType) - - -- display tree indegree=outdegree=1 - -- since display trees here--indegree should be one as well - -- this doens't work--need to redo pass logic--perhaps by doing "grandparent" - else if isIn1Out1 then - -- trace ("InOut1 preorder") ( - if localCharType == Add then - childChar {rangeFinal = rangeFinal parentChar} - - else if localCharType == NonAdd then - childChar {stateBVFinal = stateBVFinal parentChar} - - else if localCharType `elem` packedNonAddTypes then - childChar {packedNonAddFinal = packedNonAddFinal parentChar} - - else if localCharType == Matrix then - childChar {matrixStatesFinal = matrixStatesFinal parentChar} - - else if localCharType == AlignedSlim then - childChar {alignedSlimFinal = alignedSlimFinal parentChar} - - else if localCharType == AlignedWide then - childChar {alignedWideFinal = alignedWideFinal parentChar} - - else if localCharType == AlignedHuge then - childChar {alignedHugeFinal = alignedHugeFinal parentChar} - - -- need to set both final and alignment for sequence characters - else if (localCharType == SlimSeq) || (localCharType == NucSeq) then -- parentChar - -- trace ("TNFinal-1/1:" ++ (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) ( - if staticIA then childChar { slimIAFinal = slimIAFinal parentChar} - else childChar { slimFinal = slimFinal parentChar - , slimAlignment = if isTree then slimAlignment parentChar -- finalGappedO -- slimAlignment parentChar -- finalGappedO-- slimAlignment parentChar - else mempty - , slimGapped = slimGapped parentChar -- slimGapped' -- slimGapped parentChar -- finalGappedO --slimGapped parentChar - -- , slimIAPrelim = slimIAPrelim parentChar - , slimIAFinal = if isTree then slimFinal parentChar - else mempty - } - -- ) - - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then -- parentChar - -- trace ("TNFinal-1/1:" ++ (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) ( - if staticIA then childChar { wideIAFinal = wideIAFinal parentChar} - else childChar { wideFinal = wideFinal parentChar - , wideAlignment = if isTree then wideAlignment parentChar -- finalGappedO -- wideAlignment parentChar -- finalGappedO-- wideAlignment parentChar - else mempty - , wideGapped = wideGapped parentChar -- wideGapped' -- wideGapped parentChar -- finalGappedO --wideGapped parentChar - -- , wideIAPrelim = wideIAPrelim parentChar - , wideIAFinal = if isTree then wideFinal parentChar - else mempty - } - -- ) - - else if (localCharType == HugeSeq) then -- parentChar - -- trace ("TNFinal-1/1:" ++ (show (isLeft, (hugeAlignment parentChar), (hugeGapped parentChar) ,(hugeGapped childChar)))) ( - if staticIA then childChar { hugeIAFinal = hugeIAFinal parentChar} - else childChar { hugeFinal = hugeFinal parentChar - , hugeAlignment = if isTree then hugeAlignment parentChar -- finalGappedO -- hugeAlignment parentChar -- finalGappedO-- hugeAlignment parentChar - else mempty - , hugeGapped = hugeGapped parentChar -- hugeGapped' -- hugeGapped parentChar -- finalGappedO --hugeGapped parentChar - -- , hugeIAPrelim = hugeIAPrelim parentChar - , hugeIAFinal = if isTree then hugeFinal parentChar - else mempty - } - -- ) - - {- - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then - if staticIA then childChar { wideIAFinal = wideIAFinal parentChar} - else childChar { wideFinal = if isTree then wideFinal parentChar - else mempty - , wideAlignment = DOP.preOrderLogic isLeft (wideAlignment parentChar) (wideGapped parentChar) (wideGapped childChar) - -- , wideIAPrelim = wideIAPrelim parentChar - , wideIAFinal = wideFinal parentChar - } - - else if localCharType == HugeSeq then - if staticIA then childChar { hugeIAFinal = hugeIAFinal parentChar} - else childChar { hugeFinal = if isTree then hugeFinal parentChar - else mempty - , hugeAlignment = DOP.preOrderLogic isLeft (hugeAlignment parentChar) (hugeGapped parentChar) (hugeGapped childChar) - -- , hugeIAPrelim = hugeIAPrelim parentChar - , hugeIAFinal = if isTree then hugeFinal parentChar - else mempty - } - -} - else error ("Unrecognized/implemented character type: " ++ show localCharType) - -- ) - - --for Hardwired graphs - else if isIn2Out1 then - if (parent2CharM == Nothing) then error ("Nothing parent2char in setFinal") - else TW.threeMedianFinal charInfo parentChar (fromJust parent2CharM) childChar - - else error ("Node type should not be here (pre-order on tree node only): " ++ show childType) - -- ) - --- | getDOFinal takes parent final, and node gapped (including its parent gapped) and performs a DO median --- to get the final state. This takes place in several steps --- 1) align (DOMedian) parent final with node gapped (ie node preliminary) --- 2) propagate new gaps in aligned node preliminary to child gapped in node triple (snd and thd) --- creating a 3-way alignment with parent final and child preliminary --- 3) apply appropriate get3way for the structure --- The final is then returned--with gaps to be filtered afterwards --- getDOFinal :: (FiniteBits a, GV.Vector v a) => v a -> (v a, v a, v a) -> CharInfo -> v a -getDOFinal :: CharInfo - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) -getDOFinal charInfo parentFinal nodeGapped = - let (a,b,c,_) = M.pairwiseDO charInfo parentFinal nodeGapped - parentNodeChar = (a,b,c) - - -- put "new" gaps into 2nd and thd gapped fields of appropriate seqeunce type - gappedFinal = makeGappedLeftRight charInfo parentNodeChar nodeGapped - in - gappedFinal - - --- | makeGappedLeftRight takes an alignment parent character and original node character and inserts "new" gaps into nodeCharcater --- makeGappedLeftRight :: CharacterData -> CharacterData -> CharInfo -> CharacterData --- makeGappedLeftRight gappedLeftRight nodeChar charInfo = -makeGappedLeftRight :: CharInfo - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) - -> (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) -makeGappedLeftRight charInfo gappedLeftRight nodeChar = - let localCharType = charType charInfo - in - if localCharType `elem` [SlimSeq, NucSeq] then - let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (fst3 gappedLeftRight) (fst3 nodeChar) - newFinalGapped = M.getFinal3WaySlim (slimTCM charInfo) parentGapped leftChildGapped rightChildGapped - in - (M.makeDynamicCharacterFromSingleVector newFinalGapped, mempty, mempty) - - else if localCharType `elem` [AminoSeq, WideSeq] then - let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (snd3 gappedLeftRight) (snd3 nodeChar) - newFinalGapped = M.getFinal3WayWideHuge (wideTCM charInfo) parentGapped leftChildGapped rightChildGapped - in - (mempty, M.makeDynamicCharacterFromSingleVector newFinalGapped, mempty) - - else if localCharType == HugeSeq then - let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (thd3 gappedLeftRight) (thd3 nodeChar) - newFinalGapped = M.getFinal3WayWideHuge (hugeTCM charInfo) parentGapped leftChildGapped rightChildGapped - in - (mempty, mempty, M.makeDynamicCharacterFromSingleVector newFinalGapped) - - else error ("Unrecognized character type: " ++ show localCharType) - - - --- | additivePreorder assignment takes preliminary triple of child (= current vertex) and --- final states of parent to create preorder final states of child -additivePreorder :: (V.Vector (Int, Int), V.Vector (Int, Int), V.Vector (Int, Int)) -> V.Vector (Int, Int) -> V.Vector (Int, Int) -additivePreorder (leftChild, nodePrelim, rightChild) parentFinal = - if null nodePrelim then mempty - else - V.zipWith4 makeAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal - --- | nonAdditivePreorder assignment takes preliminary triple of child (= current vertex) and --- final states of parent to create preorder final states of child -nonAdditivePreorder :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -> V.Vector BV.BitVector -> V.Vector BV.BitVector -nonAdditivePreorder (leftChild, nodePrelim, rightChild) parentFinal = - if null nodePrelim then mempty - else - V.zipWith4 makeNonAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal - - --- | matrixPreorder assigment akes preliminary matrix states of child (= current vertex) and --- final states of parent to create preorder final states of child --- th eboolean says whether the node is a 'left' node or right based on bitvetor label -matrixPreorder :: Bool -> V.Vector (V.Vector MatrixTriple) -> V.Vector (V.Vector MatrixTriple) -> V.Vector (V.Vector MatrixTriple) -matrixPreorder isLeft nodePrelim parentFinal = - if null nodePrelim then mempty - else - V.zipWith (makeMatrixCharacterFinal isLeft) nodePrelim parentFinal - - --- | makeAdditiveCharacterFinal takes vertex preliminary, and child preliminary states with well as parent final state --- and constructs final state assignment -makeAdditiveCharacterFinal :: (Int, Int) -> (Int, Int) -> (Int, Int) -> (Int, Int) -> (Int, Int) -makeAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal = - -- From Wheeler (20012) after Goloboff (1993) - let interNodeParent = intervalIntersection nodePrelim parentFinal - in - -- trace (show inData) ( - -- Rule 1 - if interNodeParent == Just parentFinal then - -- trace ("R1 " ++ show parentFinal) - parentFinal - -- Rule 2 - else if isJust ((leftChild `intervalUnion` rightChild) `intervalIntersection` parentFinal) then - let xFactor = ((leftChild `intervalUnion` rightChild) `intervalUnion` nodePrelim) `intervalIntersection` parentFinal - in - if isNothing xFactor then error ("I don't think this should happen in makeAdditiveCharacterFinal" ++ show (nodePrelim, leftChild, rightChild, parentFinal)) - else - if isJust (fromJust xFactor `intervalIntersection` nodePrelim) then - -- trace ("R2a " ++ show (fromJust xFactor)) - fromJust xFactor - else - -- trace ("Rb " ++ show (lciClosest (fromJust xFactor) nodePrelim)) - lciClosest (fromJust xFactor) nodePrelim - - -- Rule 3 - else - let unionLR = leftChild `intervalUnion` rightChild - closestPtoA = stateFirstClosestToSecond nodePrelim parentFinal - closestULRtoA = stateFirstClosestToSecond unionLR parentFinal - in - -- trace ("R3 " ++ show (min closestPtoA closestULRtoA, max closestPtoA closestULRtoA)) - (min closestPtoA closestULRtoA, max closestPtoA closestULRtoA) - -- ) - --- | stateFirstClosestToSecond takes teh states of the first interval and finds the state wiht smallest distance --- to either state in the second - -- assumes a <= b, x<= y -stateFirstClosestToSecond :: (Int, Int) -> (Int, Int) -> Int -stateFirstClosestToSecond (a,b) (x,y) = - let distASecond - | x > b = x - a - | y < a = a - y - | otherwise = error ("I don't think this should happen in makeAdditiveCharacterFinal" ++ show (a,b,x,y)) - distBSecond - | x > b = x - b - | y < a = b - y - | otherwise = error ("I don't think this should happen in makeAdditiveCharacterFinal" ++ show (a,b,x,y)) - in - if distASecond <= distBSecond then a - else b - --- | lciClosest returns the "largest closed interval" between the first interval --- and the closest state in the second interval - -- assumes a <= b, x<= y -lciClosest :: (Int, Int) -> (Int, Int) -> (Int, Int) -lciClosest (a,b) (x,y) - | x > b = (a,x) - | y < a = (y,b) - | otherwise = error ("I don't think this should happen in lciClosest" ++ show (a,b,x,y)) - - -- | intervalIntersection is bit-analogue intersection for additive character operations - -- takes two intervals and returnas range intersection - -- Nothing signifies an empty intersection - -- assumes a <= b, x<= y -intervalIntersection :: (Int, Int) -> (Int, Int) -> Maybe (Int, Int) -intervalIntersection (a,b) (x,y) = - let newPair = (max a x, min b y) - in - if max a x > min b y then Nothing - else Just newPair - - --- | intervalUnion is bit-analogue union for additive character operations --- takes two intervals and returnas union - -- assumes a <= b, x<= y -intervalUnion :: (Int, Int) -> (Int, Int) -> (Int, Int) -intervalUnion (a,b) (x,y) = (min a x, max b y) - - --- | makeNonAdditiveCharacterFinal takes vertex preliminary, and child preliminary states with well as parent final state --- and constructs final state assignment -makeNonAdditiveCharacterFinal :: BV.BitVector -> BV.BitVector-> BV.BitVector-> BV.BitVector -> BV.BitVector -makeNonAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal = - -- From Wheeler (2012) after Fitch (1971) - -- trace (show inData) ( - if BV.isZeroVector ((complement nodePrelim) .&. parentFinal) then - --trace ("R1 " ++ show parentFinal) - parentFinal - else if (BV.isZeroVector (leftChild .&. rightChild)) then - --trace ("R2 " ++ show (nodePrelim .|. parentFinal)) - nodePrelim .|. parentFinal - else - -- trace ("R3 " ++ show (nodePrelim .|. (leftChild .&. parentFinal) .|. (rightChild .&. parentFinal))) - nodePrelim .|. (parentFinal .&. (leftChild .|. rightChild)) - -- ) - --- | makeMatrixCharacterFinal vertex preliminary and parent final state --- and constructs final state assignment --- really just tracks the states on a traceback and sets the cost to maxBound :: Int for states not in the traceback --- path --- Bool for left right node -makeMatrixCharacterFinal :: Bool -> V.Vector MatrixTriple -> V.Vector MatrixTriple -> V.Vector MatrixTriple -makeMatrixCharacterFinal isLeft nodePrelim parentFinal = - let numStates = length nodePrelim - stateIndexList = V.fromList [0..(numStates - 1)] - (stateCostList, stateLeftChildList, stateRightChildList) = V.unzip3 parentFinal - (_, prelimStateLeftChildList, prelimStateRightChildList) = V.unzip3 nodePrelim - allThree = if isLeft then V.zip3 stateCostList stateLeftChildList stateIndexList - else V.zip3 stateCostList stateRightChildList stateIndexList - bestParentThree = V.filter ((/= (maxBound :: StateCost)). fst3) allThree - bestPrelimStates = L.sort $ L.nub $ concatMap snd3 bestParentThree - allFour = V.zipWith3 (setCostsAndStates bestPrelimStates) prelimStateLeftChildList prelimStateRightChildList stateIndexList - finalBestTriple = V.filter ((/= (maxBound :: StateCost)).fst3) allFour - in - finalBestTriple - --- | setCostsAndStates takes a list of states that are in teh set of 'best' and a four-tuple --- of a matrix triple annd a fourth field of the state index --- if the state is in the list of `best' indices it is kept and not if it isn't -setCostsAndStates :: [Int] -> [ChildStateIndex] -> [ChildStateIndex] -> Int -> (StateCost, [ChildStateIndex], [ChildStateIndex]) -setCostsAndStates bestPrelimStates leftChildState rightChildStates stateIndex = - if stateIndex `elem` bestPrelimStates then (stateIndex, leftChildState, rightChildStates) - else (maxBound :: StateCost, leftChildState, rightChildStates) - - --- | setMinCostStatesMatrix sets the cost of non-minimal cost states to maxBounnd :: StateCost (Int) -setMinCostStatesMatrix :: Int -> V.Vector (V.Vector MatrixTriple) -> V.Vector (V.Vector MatrixTriple) -setMinCostStatesMatrix numStates inStateVect = - let outStates = V.filter ((/= (maxBound :: StateCost)).fst3) <$> fmap (nonMinCostStatesToMaxCost (V.fromList [0.. (numStates - 1)])) inStateVect - in - outStates - --- | nonMinCostStatesToMaxCost takes an individual pair of minimum state cost and matrix character triple --- returns a new character with the states cost either the minium value or maxBound if not --- this only applied at root or leaf--other vertices minimum costs may not be part of the --- miniumm cost assignment, but may be useful heuristically -nonMinCostStatesToMaxCost :: V.Vector StateCost -> V.Vector MatrixTriple -> V.Vector MatrixTriple -nonMinCostStatesToMaxCost stateIndexList tripleVect = - let minStateCost = V.minimum $ fmap fst3 tripleVect - result = V.zipWith (modifyStateCost minStateCost) tripleVect stateIndexList - in - -- trace ((show stateIndexList) ++ " " ++ (show $ V.zip tripleVect stateIndexList)) - result - where - modifyStateCost d (a,b,c) e = if a == d then (e,b,c) - else (maxBound :: StateCost ,b,c) - --- | setFinalToPreliminaryStates takes VertexBlockData and sets the final values to Preliminary -setFinalToPreliminaryStates :: VertexBlockData -> VertexBlockData -setFinalToPreliminaryStates inVertBlockData = - if V.null inVertBlockData then mempty - else - fmap setBlockFinalToPrelim inVertBlockData - --- | setBlockFinalToPrelim sets characters in a block final values to Preliminary -setBlockFinalToPrelim :: V.Vector CharacterData -> V.Vector CharacterData -setBlockFinalToPrelim inCharVect = - if V.null inCharVect then mempty - else fmap setFinalToPrelimCharacterData inCharVect - --- | setFinalFinalToPrelimCharacterData takes a single chartcater and sets final values to Preliminary -setFinalToPrelimCharacterData :: CharacterData -> CharacterData -setFinalToPrelimCharacterData inChar = - inChar { stateBVFinal = snd3 $ stateBVPrelim inChar - , rangeFinal = snd3 $ rangePrelim inChar - , matrixStatesFinal = matrixStatesPrelim inChar - , slimAlignment = slimGapped inChar - , slimFinal = slimPrelim inChar - , slimIAFinal = snd3 $ slimIAPrelim inChar - - , wideAlignment = wideGapped inChar - , wideFinal = widePrelim inChar - , wideIAFinal = snd3 $ wideIAPrelim inChar - - , hugeAlignment = hugeGapped inChar - , hugeFinal = hugePrelim inChar - , hugeIAFinal = snd3 $ hugeIAPrelim inChar - - , alignedSlimFinal = snd3 $ alignedSlimPrelim inChar - , alignedWideFinal = snd3 $ alignedWidePrelim inChar - , alignedHugeFinal = snd3 $ alignedHugePrelim inChar - - , packedNonAddFinal = snd3 $ packedNonAddPrelim inChar - } - --- | setPreliminaryToFinalStates takes VertexBlockData and sets the Preliminary states to final values -setPreliminaryToFinalStates :: VertexBlockData -> VertexBlockData -setPreliminaryToFinalStates inVertBlockData = - if V.null inVertBlockData then mempty - else - fmap setBlockPrelimToFinal inVertBlockData - --- | setBlockPrelimToFinal sets characters in a block preliminary data to final -setBlockPrelimToFinal :: V.Vector CharacterData -> V.Vector CharacterData -setBlockPrelimToFinal inCharVect = - if V.null inCharVect then mempty - else fmap setPrelimToFinalCharacterData inCharVect - --- | setPrelimToFinalCharacterData takes a single chartcater and sets preliminary values to final -setPrelimToFinalCharacterData :: CharacterData -> CharacterData -setPrelimToFinalCharacterData inChar = - inChar { stateBVPrelim = (stateBVFinal inChar, stateBVFinal inChar, stateBVFinal inChar) - , rangePrelim = (rangeFinal inChar, rangeFinal inChar, rangeFinal inChar) - , matrixStatesPrelim = matrixStatesFinal inChar - -- , slimAlignment = slimGapped inChar - , slimGapped = (slimFinal inChar, slimFinal inChar, slimFinal inChar) - , slimIAPrelim = (slimIAFinal inChar,slimIAFinal inChar, slimIAFinal inChar) - - -- , wideAlignment = wideGapped inChar - , wideGapped = (wideFinal inChar, wideFinal inChar, wideFinal inChar) - , wideIAPrelim = (wideIAFinal inChar, wideIAFinal inChar, wideIAFinal inChar) - - -- , hugeAlignment = hugeGapped inChar - , hugeGapped = (hugeFinal inChar, hugeFinal inChar, hugeFinal inChar) - , hugeIAPrelim = (hugeIAFinal inChar, hugeIAFinal inChar, hugeIAFinal inChar) - - , alignedSlimPrelim = (alignedSlimFinal inChar, alignedSlimFinal inChar, alignedSlimFinal inChar) - , alignedWidePrelim = (alignedWideFinal inChar, alignedWideFinal inChar, alignedWideFinal inChar) - , alignedHugePrelim = (alignedHugeFinal inChar, alignedHugeFinal inChar, alignedHugeFinal inChar) - - , packedNonAddPrelim = (packedNonAddFinal inChar, packedNonAddFinal inChar, packedNonAddFinal inChar) - } diff --git a/pkg/PhyGraph/GraphOptimization/Traversals.hs b/pkg/PhyGraph/GraphOptimization/Traversals.hs deleted file mode 100644 index 9a362b680..000000000 --- a/pkg/PhyGraph/GraphOptimization/Traversals.hs +++ /dev/null @@ -1,725 +0,0 @@ -{- | -Module : Traversals.hs -Description : Module specifying graph traversal functions for PhyGraph -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module GraphOptimization.Traversals ( postOrderTreeTraversal - , multiTraverseFullyLabelTree - , multiTraverseFullyLabelGraph - , multiTraverseFullyLabelGraph' - , multiTraverseFullyLabelSoftWired - , multiTraverseFullyLabelHardWired - , checkUnusedEdgesPruneInfty - , makeLeafGraph - , makeSimpleLeafGraph - , postDecorateTree - , postDecorateTree' - , updatePhylogeneticGraphCost - , updateGraphCostsComplexities - , getW15NetPenalty - , getW23NetPenalty - , getW15RootCost - , generalizedGraphPostOrderTraversal - ) where - -import qualified Data.List as L -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphFormatUtilities as GFU -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified GraphOptimization.PostOrderFunctions as PO -import qualified GraphOptimization.PreOrderFunctions as PRE -import qualified Graphs.GraphOperations as GO -import Data.Bits -import Data.Maybe -import qualified Data.Text.Lazy as T -import Debug.Trace -import Utilities.Utilities as U -import qualified GraphOptimization.PostOrderSoftWiredFunctions as POSW -import Control.Parallel.Strategies -import qualified ParallelUtilities as PU - --- | multiTraverseFullyLabelGraph is a wrapper around multi-traversal functions for Tree, --- Soft-wired network graph, and Hard-wired network graph --- can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest -multiTraverseFullyLabelGraph :: GlobalSettings -> ProcessedData -> Bool -> Bool -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph - | LG.isEmpty inGraph = emptyPhylogeneticGraph - | graphType inGS == Tree = - -- test for Tree - let (_, _, _, networkVertexList) = LG.splitVertexList inGraph - in - if null networkVertexList then - let leafGraph = makeLeafGraph inData - in multiTraverseFullyLabelTree inGS inData leafGraph startVertex inGraph - else errorWithoutStackTrace ("Input graph is not a tree/forest, but graph type has been specified (perhaps by default) as Tree. Modify input graph or use 'set()' command to specify network type\n\tNetwork vertices: " ++ (show $ fmap fst networkVertexList) ++ "\n" ++ (LG.prettify inGraph)) - | graphType inGS == SoftWired = - let leafGraph = POSW.makeLeafGraphSoftWired inData - in multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inGraph - | graphType inGS == HardWired = - let leafGraph = makeLeafGraph inData - in multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex inGraph - | otherwise = errorWithoutStackTrace ("Unknown graph type specified: " ++ show (graphType inGS)) - - --- |multiTraverseFullyLabelGraph' maps to multiTraverseFullyLabelGraph with differnet order of arguments used by report IA and tnt output -multiTraverseFullyLabelGraph' :: GlobalSettings -> Bool -> Bool -> Maybe Int -> ProcessedData -> SimpleGraph -> PhylogeneticGraph -multiTraverseFullyLabelGraph' inGS pruneEdges warnPruneEdges startVertex inData inGraph = multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph - - -multiTraverseFullyLabelHardWired :: GlobalSettings -> ProcessedData -> DecoratedGraph -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex inSimpleGraph = multiTraverseFullyLabelTree inGS inData leafGraph startVertex inSimpleGraph - --- | multiTraverseFullyLabelSoftWired fully labels a softwired network component forest --- including traversal rootings-- does not reroot on network edges --- allows indegree=outdegree=1 vertices --- pruneEdges and warnPruneEdges specify if unused edges (ie not in diuaplytrees) are pruned from --- canonical tree or if an infinity cost is returned and if a trace warning is thrown if so. --- in general--input trees should use "pruneEdges" during search--not --- can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest --- first Bool for calcualting breanch edger weights -multiTraverseFullyLabelSoftWired :: GlobalSettings -> ProcessedData -> Bool -> Bool -> DecoratedGraph -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inSimpleGraph = - if LG.isEmpty inSimpleGraph then emptyPhylogeneticGraph - else - let sequenceChars = U.getNumberSequenceCharacters (thd3 inData) - (postOrderGraph, localStartVertex) = generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph False startVertex inSimpleGraph - fullyOptimizedGraph = PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False True (sequenceChars > 0) localStartVertex False postOrderGraph - in - --trace ("MTFLS:\n" ++ (show $ thd6 postOrderGraph)) - checkUnusedEdgesPruneInfty inGS inData pruneEdges warnPruneEdges leafGraph $ updatePhylogeneticGraphCost fullyOptimizedGraph (snd6 fullyOptimizedGraph) - --- | multiTraverseFullyLabelTree performs potorder on default root and other traversal foci, taking the minimum --- traversal cost for all nonexact charcters--the initial rooting is used for exact characters --- operates with Tree functions --- need to add forest functionality--in principle just split into components and optimize them independently --- but get into root index issues the way this is written now. --- can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest -multiTraverseFullyLabelTree :: GlobalSettings -> ProcessedData -> DecoratedGraph -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -multiTraverseFullyLabelTree inGS inData leafGraph startVertex inSimpleGraph = - if LG.isEmpty inSimpleGraph then emptyPhylogeneticGraph - else - let sequenceChars = U.getNumberSequenceCharacters (thd3 inData) - -- False for staticIA - (postOrderGraph, localStartVertex) = generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph False startVertex inSimpleGraph - in - PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False True (sequenceChars > 0) localStartVertex False postOrderGraph - - --- | generalizedGraphPostOrderTraversal performs the postorder pass --- on a graph (tree, softWired, or hardWired) to determine the "preliminary" character states --- include penalty factor cost but not root cost which may or may not be wanted depending on context --- if full graph--yes, if a component yes or no. --- hence returnde das pair -generalizedGraphPostOrderTraversal :: GlobalSettings -> Int -> ProcessedData -> DecoratedGraph -> Bool -> Maybe Int -> SimpleGraph -> (PhylogeneticGraph, Int) -generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph staticIA startVertex inSimpleGraph = - - -- select postOrder function based on graph type - let postOrderFunction = if (graphType inGS) == Tree then postOrderTreeTraversal - else if (graphType inGS) == SoftWired then POSW.postOrderSoftWiredTraversal - else if (graphType inGS) == HardWired then postOrderTreeTraversal - else error ("Graph type not implemented: " ++ (show $ graphType inGS)) - - -- first traversal on outgroup roo - outgroupRooted = postOrderFunction inGS inData leafGraph staticIA startVertex inSimpleGraph - - -- start at start vertex--for components or ur-root for full graph - startVertexList = if startVertex == Nothing then fmap fst $ LG.getRoots $ thd6 outgroupRooted - else [fromJust startVertex] - - -- next edges (to vertex in list) to perform rerroting - -- progresses recursivey over adjacent edges to minimize node reoptimization - -- childrenOfRoot = concatMap (LG.descendants (thd6 outgroupRooted)) startVertexList - -- grandChildrenOfRoot = concatMap (LG.descendants (thd6 outgroupRooted)) childrenOfRoot - - -- create list of multi-traversals with original rooting first - -- subsequent rerooting do not reoptimize exact characters (add nonadd etc) - -- they are taken from the fully labelled first input decorated graph later when output graph created - -- it is important that the first graph be the ourgroup rooted graph (outgroupRootedPhyloGraph) so this - -- will have the preorder assignments for the outgroup rooted graph as 3rd field. This can be used for incremental - -- optimization to get O(log n) initial postorder assingment when mutsating graph. - -- hardwired reroot cause much pain - -- the head startvertex list for reoptimizing spit trees ni swapping - recursiveRerootList = if (graphType inGS == HardWired) then [outgroupRooted] - else if (graphType inGS == SoftWired) then [POSW.getDisplayBasedRerootSoftWired SoftWired (head startVertexList) outgroupRooted] - -- need to test which is better - -- else if (graphType inGS == SoftWired) then outgroupRooted : minimalReRootPhyloGraph inGS outgroupRooted (head startVertexList) grandChildrenOfRoot - else if (graphType inGS == Tree) then [POSW.getDisplayBasedRerootSoftWired Tree (head startVertexList) outgroupRooted] - --else if (graphType inGS == Tree) then outgroupRooted : minimalReRootPhyloGraph inGS outgroupRooted (head startVertexList) grandChildrenOfRoot - else error ("Graph type not implemented: " ++ (show $ graphType inGS)) - - - -- remove if tree reroot code pans out - finalizedPostOrderGraphList = L.sortOn snd6 recursiveRerootList - - -- create optimal final graph with best costs and best traversal (rerooting) forest for each character - -- traversal for exact characters (and costs) are the first of each least since exact only optimizaed for that - -- traversal graph. The result has approprotate post-order assignments for traversals, preorder "final" assignments - -- are propagated to the Decorated graph field after the preorder pass. - -- doesn't have to be sorted, but should minimize assignments - graphWithBestAssignments = head recursiveRerootList -- L.foldl1' setBetterGraphAssignment recursiveRerootList' - - {- root and model complexities moved to output - -- same root cost if same data and number of roots - localRootCost = rootComplexity inGS - {-if (rootCost inGS) == NoRootCost then 0.0 - else if (rootCost inGS) == Wheeler2015Root then getW15RootCost inGS outgroupRooted - else error ("Root cost type " ++ (show $ rootCost inGS) ++ " is not yet implemented") - -} - -} - - in - -- trace ("GPOT length: " ++ (show $ fmap snd6 recursiveRerootList) ++ " " ++ (show $ graphType inGS)) ( - -- trace ("TRAV:" ++ (show startVertex) ++ " " ++ (show sequenceChars) ++ " " ++ (show (snd6 outgroupRooted, fmap snd6 finalizedPostOrderGraphList, snd6 graphWithBestAssignments)) - -- ++ "\nTraversal root costs: " ++ (show (getTraversalCosts outgroupRooted, fmap getTraversalCosts recursiveRerootList', getTraversalCosts graphWithBestAssignments))) ( - - -- only static characters - if sequenceChars == 0 then - let penaltyFactor = if (graphType inGS == Tree) then 0.0 - --it is its own penalty due to counting all changes in in2 out 1 nodes - else if (graphType inGS == HardWired) then 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then 0.0 - else if (graphFactor inGS) == Wheeler2015Network then getW15NetPenalty startVertex outgroupRooted - else if (graphFactor inGS) == Wheeler2023Network then getW23NetPenalty startVertex outgroupRooted - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - - staticOnlyGraph = if (graphType inGS) == SoftWired then POSW.updateAndFinalizePostOrderSoftWired startVertex (head startVertexList) outgroupRooted - else outgroupRooted - -- staticOnlyGraph = head recursiveRerootList' - staticOnlyGraph' = if startVertex == Nothing then updatePhylogeneticGraphCost staticOnlyGraph (penaltyFactor + (snd6 staticOnlyGraph)) - else updatePhylogeneticGraphCost staticOnlyGraph (penaltyFactor + (snd6 staticOnlyGraph)) - in - -- trace ("Only static: " ++ (snd6 staticOnlyGraph')) - (staticOnlyGraph', head startVertexList) - - -- single seuquence (prealigned, dynamic) only (ie no static) - else if sequenceChars == 1 && (U.getNumberExactCharacters (thd3 inData) == 0) then - let penaltyFactorList = if (graphType inGS == Tree) then replicate (length finalizedPostOrderGraphList) 0.0 - else if (graphType inGS == HardWired) then replicate (length finalizedPostOrderGraphList) 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then replicate (length finalizedPostOrderGraphList) 0.0 - else if (graphFactor inGS) == Wheeler2015Network then fmap (getW15NetPenalty startVertex) finalizedPostOrderGraphList - else if (graphFactor inGS) == Wheeler2023Network then fmap (getW23NetPenalty startVertex) finalizedPostOrderGraphList - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - newCostList = zipWith (+) penaltyFactorList (fmap snd6 finalizedPostOrderGraphList) - - - finalizedPostOrderGraph = head $ L.sortOn snd6 $ zipWith updatePhylogeneticGraphCost finalizedPostOrderGraphList newCostList - in - -- trace ("GPOT-1: " ++ (show (snd6 finalizedPostOrderGraph))) - (finalizedPostOrderGraph, head startVertexList) - - -- multiple dynamic characters--checks for best root for each character - -- important to have outgroup rooted graph first for fold so don't use sorted recursive list - else - let penaltyFactor = if (graphType inGS == Tree) then 0.0 - else if (graphType inGS == HardWired) then 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then 0.0 - else if (graphFactor inGS) == Wheeler2015Network then getW15NetPenalty startVertex graphWithBestAssignments - else if (graphFactor inGS) == Wheeler2023Network then getW23NetPenalty startVertex graphWithBestAssignments - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - - graphWithBestAssignments' = updatePhylogeneticGraphCost graphWithBestAssignments (penaltyFactor + (snd6 graphWithBestAssignments)) - in - -- trace ("GPOT-2: " ++ (show (penaltyFactor + (snd6 graphWithBestAssignments)))) - (graphWithBestAssignments', head startVertexList) - - -- ) - --- | updateGraphCostsComplexities adds root and model complexities if appropriate to graphs -updateGraphCostsComplexities :: GlobalSettings -> PhylogeneticGraph -> PhylogeneticGraph -updateGraphCostsComplexities inGS inGraph = - if optimalityCriterion inGS == Parsimony then inGraph - else if optimalityCriterion inGS == Likelihood then - -- trace ("\tFinalizing graph cost with root priors") - updatePhylogeneticGraphCost inGraph ((rootComplexity inGS) + (snd6 inGraph)) - else if optimalityCriterion inGS == PMDL then - -- trace ("\tFinalizing graph cost with model and root complexities") - updatePhylogeneticGraphCost inGraph ((rootComplexity inGS) + (modelComplexity inGS) + (snd6 inGraph)) - else error ("Optimality criterion not recognized/implemented: " ++ (show $ optimalityCriterion inGS)) - --- | updatePhylogeneticGraphCost takes a PhylgeneticGrtaph and Double and replaces the cost (snd of 6 fields) --- and returns Phylogenetic graph -updatePhylogeneticGraphCost :: PhylogeneticGraph -> VertexCost -> PhylogeneticGraph -updatePhylogeneticGraphCost (a, _, b, c, d, e) newCost = (a, newCost, b, c, d, e) - --- | getW15RootCost creates a root cost as the 'insertion' of character data. For sequence data averaged over --- leaf taxa -getW15RootCost :: GlobalSettings -> PhylogeneticGraph -> VertexCost -getW15RootCost inGS inGraph = - if LG.isEmpty $ thd6 inGraph then 0.0 - else - let (rootList, _, _, _) = LG.splitVertexList $ fst6 inGraph - numRoots = length rootList - - in - (fromIntegral numRoots) * (rootComplexity inGS) - --- | getW15NetPenalty takes a Phylogenetic tree and returns the network penalty of Wheeler (2015) --- modified to take the union of all edges of trees of minimal length --- currently modified -- not exactlty W15 -getW15NetPenalty :: Maybe Int -> PhylogeneticGraph -> VertexCost -getW15NetPenalty startVertex inGraph = - if LG.isEmpty $ thd6 inGraph then 0.0 - else - let (bestTreeList, _) = extractLowestCostDisplayTree startVertex inGraph - bestTreesEdgeList = L.nubBy undirectedEdgeEquality $ concat $ fmap LG.edges bestTreeList - rootIndex = if startVertex == Nothing then fst $ head $ LG.getRoots (fst6 inGraph) - else fromJust startVertex - blockPenaltyList = PU.seqParMap rdeepseq (getBlockW2015 bestTreesEdgeList rootIndex) (fth6 inGraph) - - -- leaf list for normalization - (_, leafList, _, _) = LG.splitVertexList (fst6 inGraph) - numLeaves = length leafList - numTreeEdges = 4.0 * (fromIntegral numLeaves) - 4.0 - divisor = numTreeEdges - in - -- trace ("W15:" ++ (show ((sum $ blockPenaltyList) / divisor )) ++ " from " ++ (show (numTreeEdges, numExtraEdges, divisor, sum blockPenaltyList))) ( - (sum $ blockPenaltyList) / divisor - -- ) - --- | getW23NetPenalty takes a Phylogenetic tree and returns the network penalty of Wheeler (2023) --- basic idea is new edge improvement must be better than average existing edge cost --- penalty for each added edge (unlike W15 which was on a block by block basis) --- num extra edges/2 since actually add 2 new edges when one network edge -getW23NetPenalty :: Maybe Int -> PhylogeneticGraph -> VertexCost -getW23NetPenalty startVertex inGraph = - if LG.isEmpty $ thd6 inGraph then 0.0 - else - let (bestTreeList, _) = extractLowestCostDisplayTree startVertex inGraph - bestTreesEdgeList = L.nubBy undirectedEdgeEquality $ concat $ fmap LG.edges bestTreeList - rootIndex = if startVertex == Nothing then fst $ head $ LG.getRoots (fst6 inGraph) - else fromJust startVertex - blockPenaltyList = PU.seqParMap rdeepseq (getBlockW2015 bestTreesEdgeList rootIndex) (fth6 inGraph) - - -- leaf list for normalization - (_, leafList, _, _) = LG.splitVertexList (fst6 inGraph) - numLeaves = length leafList - numTreeEdges = 2.0 * (fromIntegral numLeaves) - 2.0 - numExtraEdges = ((fromIntegral $ length bestTreesEdgeList) - numTreeEdges) / 2.0 - divisor = numTreeEdges - numExtraEdges - in - -- trace ("W23:" ++ (show ((numExtraEdges * (snd6 inGraph)) / (2.0 * numTreeEdges))) ++ " from " ++ (show (numTreeEdges, numExtraEdges))) ( - if divisor == 0.0 then infinity - -- else (sum blockPenaltyList) / divisor - -- else (numExtraEdges * (sum blockPenaltyList)) / divisor - else (numExtraEdges * (snd6 inGraph)) / (2.0 * numTreeEdges) - -- ) - - --- | getBlockW2015 takes the list of trees for a block, gets the root cost and determines the individual --- penalty cost of that block -getBlockW2015 :: [LG.Edge] -> Int -> [DecoratedGraph] -> VertexCost -getBlockW2015 treeEdgeList rootIndex blockTreeList = - if null treeEdgeList || null blockTreeList then 0.0 - else - let blockTreeEdgeList = L.nubBy undirectedEdgeEquality $ concatMap LG.edges blockTreeList - numExtraEdges = length $ undirectedEdgeMinus blockTreeEdgeList treeEdgeList - blockCost = subGraphCost $ fromJust $ LG.lab (head blockTreeList) rootIndex - in - -- trace ("GBW: " ++ (show (numExtraEdges, blockCost, blockTreeEdgeList)) ++ "\n" ++ (show $ fmap (subGraphCost . snd) $ LG.labNodes (head blockTreeList))) - blockCost * (fromIntegral numExtraEdges) - --- | checkUnusedEdgesPruneInfty checks if a softwired phylogenetic graph has --- "unused" edges sensu Wheeler 2015--that an edge in the canonical graph is --- not present in any of the block display trees (that are heurstically optimal) --- the options specify if the cost returned is Infinity (really max bound Double) --- with no pruning of edges or the cost is left unchanged and unused edges are --- pruned from the canonical graph --- this is unDirected due to rerooting heuristic in post/preorder optimization --- inifinity defined in Types.hs -checkUnusedEdgesPruneInfty :: GlobalSettings -> ProcessedData -> Bool -> Bool -> DecoratedGraph-> PhylogeneticGraph -> PhylogeneticGraph -checkUnusedEdgesPruneInfty inGS inData pruneEdges warnPruneEdges leafGraph inGraph@(inSimple, _, inCanonical, blockTreeV, charTreeVV, charInfoVV) = - let simpleEdgeList = LG.edges inSimple - displayEdgeSet = L.nubBy undirectedEdgeEquality $ concat $ concat $ fmap (fmap LG.edges) blockTreeV - unusedEdges = undirectedEdgeMinus simpleEdgeList displayEdgeSet - in - -- no unused edges all OK - if null unusedEdges then inGraph - - -- unused edges--do not prune return "infinite cost" - else if not pruneEdges then - -- trace ("Unused edge->Infinity") - (inSimple, infinity, inCanonical, blockTreeV, charTreeVV, charInfoVV) - - -- unused but pruned--need to prune nodes and reoptimize to get final assignments correct - else - let newSimpleGraph = LG.delEdges unusedEdges inSimple - contractedSimple = GO.contractIn1Out1EdgesRename newSimpleGraph - in - if warnPruneEdges then - trace ("Pruning " ++ (show $ length unusedEdges) ++ " unused edges and reoptimizing graph") - multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph Nothing contractedSimple - - else multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph Nothing contractedSimple - --- | undirectedEdgeEquality checks edgse for equality irrespective of direction -undirectedEdgeEquality :: LG.Edge -> LG.Edge -> Bool -undirectedEdgeEquality (a,b) (c,d) = if a == c && b == d then True - else if a == d && b == c then True - else False - --- | undirectedEdgeMinus subtracts edges in the second list from those in the first using --- undirected matching -undirectedEdgeMinus :: [LG.Edge] -> [LG.Edge] -> [LG.Edge] -undirectedEdgeMinus firstList secondList = - if null firstList then [] - else - let firstEdge@(a,b) = head firstList - in - if firstEdge `L.elem` secondList then undirectedEdgeMinus (tail firstList) secondList - else if (b,a) `L.elem` secondList then undirectedEdgeMinus (tail firstList) secondList - else firstEdge : undirectedEdgeMinus (tail firstList) secondList - --- | extractLowestCostDisplayTree takes a phylogenetic graph and takes all valid (complete) resolutions --- (display trees) and their costs --- and determines the total cost (over all blocks) of each display tree --- the lowest cost display tree(s) as list are returned with cost --- this is used in Wheeler (2015) network penalty -extractLowestCostDisplayTree :: Maybe Int -> PhylogeneticGraph -> ([DecoratedGraph], VertexCost) -extractLowestCostDisplayTree startVertex inGraph = - if LG.isEmpty $ thd6 inGraph then error "Empty graph in extractLowestCostDisplayTree" - else - let -- get componen t or global root label - rootLabel = if startVertex == Nothing then snd $ head $ LG.getRoots (thd6 inGraph) - else fromJust $ LG.lab (thd6 inGraph) (fromJust startVertex) - - -- get resolution data for start/rpoot vertex - blockResolutionLL = V.toList $ fmap POSW.getAllResolutionList (vertexResolutionData rootLabel) - --blockResolutionLL = V.toList $ fmap (PO.getBestResolutionListPair startVertex False) (vertexResolutionData rootLabel) - displayTreeBlockList = L.transpose blockResolutionLL - displayTreePairList = L.foldl1' sumTreeCostLists displayTreeBlockList - minimumCost = minimum $ fmap snd displayTreePairList - (bestDisplayTreeList, _) = unzip $ filter ((== minimumCost) . snd) displayTreePairList - in - -- trace ("FC: " ++ (show $ fmap snd displayTreePairList)) - (bestDisplayTreeList, minimumCost) - --- | sumTreeCostLists takes two lists of (Graph, Cost) pairs and sums the costs and keeps the trees the same --- does not check that graphs are the same after debug -sumTreeCostLists :: (Eq a, Eq b) => [(LG.Gr a b, VertexCost)] -> [(LG.Gr a b, VertexCost)] -> [(LG.Gr a b, VertexCost)] -sumTreeCostLists firstList secondList = - if null firstList || null secondList then error "Empty list in sumTreeCostLists" - else - let (firstGraphList, firstCostList) = unzip firstList - (secondGraphList, secondCostList) = unzip secondList - newCostList = zipWith (+) firstCostList secondCostList - - -- remove once working - checkList = filter (== False) $ zipWith LG.equal firstGraphList secondGraphList - in - if null checkList then error ("Graph lists not same : " ++ (show checkList)) - else - -- trace ("Graphs match ") - zip firstGraphList newCostList - --- | setBetterGraphAssignment takes two phylogenetic graphs and returns the lower cost optimization of each character, --- with traversal focus etc to get best overall graph --- since this is meant to work with graphs that have or do not have reoptimized exact (=static-Add/NonAdd/MAtrix) characters --- the criterion is lower cost character is taken, unless the cost is zero, then non-zero is taken --- this function is expected to be used in a fold over a list of graphs --- the basic comparison is over the costs of the root(s) cost for each of the character decorated (traversal) graphs - --- May change --- assumes that a single decorated graph comes in for each Phylogenetic graph from the fully and reroot optimize (V.singleton (V.singleton DecGraph)) --- and goes through the block-character-cost data and reassigns based on that creating a unique (although there could be more than one) decorated --- graph for each character in each block. --- postorder assignments in traversal set of block character trees are NOT propagated back to first decorated graph. --- the third field of phylogenetic Graph is set to the 3rd fieled of the first of two inputs--so if startiong fold with outgroup --- rooted graph--that is what stays which can be used as a preorder graph for incremental optimization --- when perfoming that sort of operation --- The traversal graphs --- are used for the pre-order final assignments which will be propagated back to set those of the 3rd field decorated graph - --- this will have to be modified for solf-wired since incoming blocks will not all be the same underlying gaph --- unclear how hardwired will be affected -setBetterGraphAssignment :: PhylogeneticGraph -> PhylogeneticGraph -> PhylogeneticGraph -setBetterGraphAssignment firstGraph@(fSimple, _, fDecGraph, fBlockDisplay, fTraversal, fCharInfo) secondGraph@(_, _, sDecGraph, _, sTraversal, _) = - -- trace ("SBGA:" ++ (show $ (length fTraversal, length sTraversal))) ( - if LG.isEmpty fDecGraph then secondGraph - else if LG.isEmpty sDecGraph then firstGraph - else - -- trace ("setBetter (" ++ (show fCost) ++ "," ++ (show sCost) ++ ")" ++ " CharInfo blocks:" ++ (show $ length fCharInfo) ++ " characters: " ++ (show $ fmap length fCharInfo) ++ " " - -- ++ (show $ fmap (fmap name) fCharInfo)) ( - let (mergedBlockVect, costVector) = V.unzip $ V.zipWith makeBetterBlock fTraversal sTraversal - in - --trace ("setBetter (" ++ (show fCost) ++ "," ++ (show sCost) ++ ") ->" ++ (show $ V.sum costVector) ++ " nt:" ++ (show $ length fTraversal) - -- ++ "length blocks " ++ (show $ fmap length fTraversal)) - (fSimple, V.sum costVector, fDecGraph, fBlockDisplay, mergedBlockVect, fCharInfo) - -- ) - --- | makeBetterBlocktakes two verctors of traversals. Each vector contains a decorated graph (=traversla graph) for each --- character. This can be a single sequence or series of exact characters --- the function applies a character cost comparison to get the better -makeBetterBlock :: V.Vector DecoratedGraph -> V.Vector DecoratedGraph -> (V.Vector DecoratedGraph, VertexCost) -makeBetterBlock firstBlockGraphVect secondBlockGraphVect = - let (mergedCharacterVect, costVector) = V.unzip $ V.zipWith chooseBetterCharacter firstBlockGraphVect secondBlockGraphVect - in - -- trace ("MBB: " ++ (show $ (length firstBlockGraphVect, length firstBlockGraphVect))) - (mergedCharacterVect, V.sum costVector) - --- | chooseBetterCharacter takes a pair of character decorated graphs and chooses teh "better" one as in lower cost, or non-zero cost --- if one is zer (for exact characters) and returns the better character and cost --- graph can have multiple roots -chooseBetterCharacter :: DecoratedGraph -> DecoratedGraph -> (DecoratedGraph, VertexCost) -chooseBetterCharacter firstGraph secondGraph - | LG.isEmpty firstGraph = error "Empty first graph in chooseBetterCharacter" - | LG.isEmpty secondGraph = error "Empty second graph in chooseBetterCharacter" - | otherwise = - let firstGraphCost = sum $ fmap (subGraphCost . snd) (LG.getRoots firstGraph) - secondGraphCost = sum $ fmap (subGraphCost . snd) (LG.getRoots secondGraph) - in - -- trace ("Costs " ++ show (firstGraphCost, secondGraphCost)) ( - if firstGraphCost == 0 then (secondGraph, secondGraphCost) - else if secondGraphCost == 0 then (firstGraph, firstGraphCost) - else if secondGraphCost < firstGraphCost then (secondGraph, secondGraphCost) - else (firstGraph, firstGraphCost) - -- ) - --- | minimalReRootPhyloGraph takes an inialtial post-order labelled phylogenetic graph --- and "intelligently" reroots by traversing through adjacent edges, hopefully --- reoptimizing the minimum number of vertices each time (2) but could be more depending --- on graph topology --- NB--only deals with post-order assignments -minimalReRootPhyloGraph :: GlobalSettings -> PhylogeneticGraph -> Int -> [LG.Node] -> [PhylogeneticGraph] -minimalReRootPhyloGraph inGS inGraph originalRoot nodesToRoot = - -- trace ("MRR: " ++ (show nodesToRoot) ++ " " ++ (show $ fmap (LG.descendants (thd6 inGraph)) nodesToRoot)) ( - if null nodesToRoot then [] - else - let firstRerootIndex = head nodesToRoot - nextReroots = LG.descendants (thd6 inGraph) firstRerootIndex ++ tail nodesToRoot - newGraph - | (graphType inGS) == Tree = PO.rerootPhylogeneticGraph' inGS False False inGraph originalRoot firstRerootIndex - | (graphType inGS) == SoftWired = PO.rerootPhylogeneticNetwork' inGS inGraph originalRoot firstRerootIndex - | (graphType inGS) == HardWired = PO.rerootPhylogeneticNetwork' inGS inGraph originalRoot firstRerootIndex - | otherwise = errorWithoutStackTrace ("Graph type not implemented/recognized: " ++ show (graphType inGS)) - in - -- trace ("NRR: " ++ " " ++ (show (LG.descendants (thd6 inGraph) firstRerootIndex)) ) ( -- ++ " -> " ++ (show nextReroots) ++ "\n" ++ (LG.prettify $ fst6 inGraph) ++ "\n" ++ (LG.prettify $ fst6 newGraph)) ( - -- trace ("MRR: " ++ (show $ snd6 inGraph) ++ " -> " ++ (show $ snd6 newGraph)) ( - if fst6 newGraph == LG.empty then minimalReRootPhyloGraph inGS inGraph originalRoot nextReroots - else newGraph : minimalReRootPhyloGraph inGS newGraph originalRoot nextReroots - -- ) ) - - --- | makeLeafGraph takes input data and creates a 'graph' of leaves with Vertex informnation --- but with zero edges. This 'graph' can be reused as a starting structure for graph construction --- to avoid remaking of leaf vertices -makeLeafGraph :: ProcessedData -> DecoratedGraph -makeLeafGraph (nameVect, bvNameVect, blocDataVect) = - if V.null nameVect then error "Empty ProcessedData in makeLeafGraph" - else - let leafVertexList = V.toList $ V.map (makeLeafVertex nameVect bvNameVect blocDataVect) (V.fromList [0.. V.length nameVect - 1]) - in - LG.mkGraph leafVertexList [] - - --- | makeSimpleLeafGraph takes input data and creates a 'graph' of leaves with Vertex informnation --- but with zero edges. This 'graph' can be reused as a starting structure for graph construction --- to avoid remaking of leaf vertices -makeSimpleLeafGraph :: ProcessedData -> SimpleGraph -makeSimpleLeafGraph (nameVect, _, _) = - if V.null nameVect then error "Empty ProcessedData in makeSimpleLeafGraph" - else - let leafVertexList = V.toList $ V.map (makeSimpleLeafVertex nameVect) (V.fromList [0.. V.length nameVect - 1]) - in - LG.mkGraph leafVertexList [] - where makeSimpleLeafVertex a b = (b, a V.! b) - - --- | makeLeafVertex makes a single unconnected vertex for a leaf -makeLeafVertex :: V.Vector NameText -> V.Vector NameBV -> V.Vector BlockData -> Int -> LG.LNode VertexInfo -makeLeafVertex nameVect bvNameVect inData localIndex = - -- trace ("Making leaf " ++ (show localIndex) ++ " Data " ++ (show $ length inData) ++ " " ++ (show $ fmap length $ fmap snd3 inData)) ( - let centralData = V.map snd3 inData - thisData = V.map (V.! localIndex) centralData - newVertex = VertexInfo { index = localIndex - , bvLabel = bvNameVect V.! localIndex - , parents = V.empty - , children = V.empty - , nodeType = LeafNode - , vertName = nameVect V.! localIndex - , vertData = thisData - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - in - -- trace (show (length thisData) ++ (show $ fmap length thisData)) - (localIndex, newVertex) - -- ) - --- | postOrderTreeTraversal takes a 'simple' graph and generates 'preliminary' assignments --- vi post-order traversal, yields cost as well --- for a binary tree only --- depending on optimality criterion--will calculate root cost -postOrderTreeTraversal :: GlobalSettings -> ProcessedData -> DecoratedGraph -> Bool -> Maybe Int -> SimpleGraph -> PhylogeneticGraph -postOrderTreeTraversal _ (_, _, blockDataVect) leafGraph staticIA startVertex inGraph = - if LG.isEmpty inGraph then emptyPhylogeneticGraph - else - -- Assumes root is Number of Leaves - let rootIndex = if startVertex == Nothing then fst $ head $ LG.getRoots inGraph - else fromJust startVertex - blockCharInfo = V.map thd3 blockDataVect - newTree = postDecorateTree staticIA inGraph leafGraph blockCharInfo rootIndex rootIndex - in - -- trace ("It Begins at " ++ (show $ fmap fst $ LG.getRoots inGraph) ++ "\n" ++ show inGraph) ( - if (startVertex == Nothing) && (not $ LG.isRoot inGraph rootIndex) then - let localRootList = fst <$> LG.getRoots inGraph - localRootEdges = concatMap (LG.out inGraph) localRootList - currentRootEdges = LG.out inGraph rootIndex - in - error ("Index " ++ show rootIndex ++ " with edges " ++ show currentRootEdges ++ " not root in graph:" ++ show localRootList ++ " edges:" ++ show localRootEdges ++ "\n" ++ GFU.showGraph inGraph) - else newTree - --) - - --- | postDecorateTree' is wrapper for postDecorateTree to alow for mapping -postDecorateTree' :: Bool -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> LG.Node -> LG.Node -> SimpleGraph -> PhylogeneticGraph -postDecorateTree' staticIA curDecGraph blockCharInfo rootIndex curNode simpleGraph = postDecorateTree staticIA simpleGraph curDecGraph blockCharInfo rootIndex curNode - --- | postDecorateTree begins at start index (usually root, but could be a subtree) and moves preorder till children are labelled and then returns postorder --- labelling vertices and edges as it goes back to root --- this for a tree so single root -postDecorateTree :: Bool -> SimpleGraph -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> LG.Node -> LG.Node -> PhylogeneticGraph -postDecorateTree staticIA simpleGraph curDecGraph blockCharInfo rootIndex curNode = - -- if node in there (leaf) nothing to do and return - if LG.gelem curNode curDecGraph then - let nodeLabel = LG.lab curDecGraph curNode - in - if isNothing nodeLabel then error ("Null label for node " ++ show curNode) - else - -- checks for node already in graph--either leaf or pre-optimized node in Hardwired - -- trace ("In graph :" ++ (show curNode) ++ " " ++ (show nodeLabel)) - (simpleGraph, subGraphCost (fromJust nodeLabel), curDecGraph, mempty, mempty, blockCharInfo) - - -- Need to make node - else - - -- check if children in graph - let nodeChildren = LG.descendants simpleGraph curNode -- should be 1 or 2, not zero since all leaves already in graph - leftChild = head nodeChildren - rightChild = last nodeChildren - leftChildTree = postDecorateTree staticIA simpleGraph curDecGraph blockCharInfo rootIndex leftChild - rightLeftChildTree = if length nodeChildren == 2 then postDecorateTree staticIA simpleGraph (thd6 leftChildTree) blockCharInfo rootIndex rightChild - else leftChildTree - newSubTree = thd6 rightLeftChildTree - (leftChildLabel, rightChildLabel) = U.leftRightChildLabelBV (fromJust $ LG.lab newSubTree leftChild, fromJust $ LG.lab newSubTree rightChild) - - in - - if length nodeChildren > 2 then error ("Graph not dichotomous in postDecorateTree node " ++ show curNode ++ "\n" ++ LG.prettify simpleGraph) - else if null nodeChildren then error ("Leaf not in graph in postDecorateTree node " ++ show curNode ++ "\n" ++ LG.prettify simpleGraph) - - -- out-degree 1 should not happen with Tree but will with HardWired graph - else if length nodeChildren == 1 then - -- make node from single child and single new edge to child - -- takes characters in blocks--but for tree really all same block - let childVertexData = vertData leftChildLabel - newVertex = VertexInfo { index = curNode - -- same as child--could and perhaps should prepend 1 to make distinct - , bvLabel = bvLabel leftChildLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = GO.getNodeType simpleGraph curNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = childVertexData - -- this not used for Hardwired or Tree - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = subGraphCost leftChildLabel - } - newEdgesLabel = EdgeInfo { minLength = 0.0 - , maxLength = 0.0 - , midRangeLength = 0.0 - , edgeType = TreeEdge - } - newEdges = LG.toEdge <$> LG.out simpleGraph curNode - newLEdges = fmap (LG.toLEdge' newEdgesLabel) newEdges - newGraph = LG.insEdges newLEdges $ LG.insNode (curNode, newVertex) newSubTree - - (newDisplayVect, newCharTreeVV) = POSW.divideDecoratedGraphByBlockAndCharacterTree newGraph - - in - -- th curnode == roiot index for pruned subtrees - -- trace ("New vertex:" ++ (show newVertex) ++ " at cost " ++ (show newCost)) ( - -- Do we need to PO.divideDecoratedGraphByBlockAndCharacterTree if not root? probbaly not - - --if nodeType newVertex == RootNode then (simpleGraph, subGraphCost newVertex, newGraph, mempty, PO.divideDecoratedGraphByBlockAndCharacterTree newGraph, blockCharInfo) - if nodeType newVertex == RootNode || curNode == rootIndex then (simpleGraph, subGraphCost newVertex, newGraph, newDisplayVect, newCharTreeVV, blockCharInfo) - else (simpleGraph, subGraphCost newVertex, newGraph, mempty, mempty, blockCharInfo) - - -- make node from 2 children - else - -- make node from children and new edges to children - -- takes characters in blocks--but for tree really all same block - let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance - -- larger bitvector is Right, smaller or equal Left - - newCharData = if staticIA then PO.createVertexDataOverBlocksStaticIA (vertData leftChildLabel) (vertData rightChildLabel) blockCharInfo [] - else PO.createVertexDataOverBlocks (vertData leftChildLabel) (vertData rightChildLabel) blockCharInfo [] - newCost = V.sum $ V.map V.sum $ V.map (V.map snd) newCharData - newVertex = VertexInfo { index = curNode - , bvLabel = bvLabel leftChildLabel .|. bvLabel rightChildLabel - , parents = V.fromList $ LG.parents simpleGraph curNode - , children = V.fromList nodeChildren - , nodeType = GO.getNodeType simpleGraph curNode - , vertName = T.pack $ "HTU" ++ show curNode - , vertData = V.map (V.map fst) newCharData - , vertexResolutionData = mempty - , vertexCost = newCost - , subGraphCost = subGraphCost leftChildLabel + subGraphCost rightChildLabel + newCost - } - newEdgesLabel = EdgeInfo { minLength = newCost / 2.0 - , maxLength = newCost / 2.0 - , midRangeLength = newCost / 2.0 - , edgeType = TreeEdge - } - newEdges = LG.toEdge <$> LG.out simpleGraph curNode - newLEdges = fmap (LG.toLEdge' newEdgesLabel) newEdges - newGraph = LG.insEdges newLEdges $ LG.insNode (curNode, newVertex) newSubTree - - (newDisplayVect, newCharTreeVV) = POSW.divideDecoratedGraphByBlockAndCharacterTree newGraph - - in - -- th curnode == roiot index for pruned subtrees - -- trace ("New vertex:" ++ (show newVertex) ++ " at cost " ++ (show newCost)) ( - -- Do we need to PO.divideDecoratedGraphByBlockAndCharacterTree if not root? probbaly not - - --if nodeType newVertex == RootNode then (simpleGraph, subGraphCost newVertex, newGraph, mempty, PO.divideDecoratedGraphByBlockAndCharacterTree newGraph, blockCharInfo) - if nodeType newVertex == RootNode || curNode == rootIndex then (simpleGraph, subGraphCost newVertex, newGraph, newDisplayVect, newCharTreeVV, blockCharInfo) - else (simpleGraph, subGraphCost newVertex, newGraph, mempty, mempty, blockCharInfo) - - -- ) -- ) - - diff --git a/pkg/PhyGraph/Graphs/GraphOperations.hs b/pkg/PhyGraph/Graphs/GraphOperations.hs deleted file mode 100644 index 2355672c9..000000000 --- a/pkg/PhyGraph/Graphs/GraphOperations.hs +++ /dev/null @@ -1,848 +0,0 @@ -{- | -Module : GraphOperations.hs -Description : Module specifying general graph functions--with types specific to Types.hs - graph functions that a re general are in LocalGraph.hs -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Graphs.GraphOperations ( ladderizeGraph - , convertDecoratedToSimpleGraph - , convertToSimpleEdge - , graphCostFromNodes - , dichotomizeRoot - , showDecGraphs - , sortEdgeListByLength - , selectPhylogeneticGraph - , getUniqueGraphs - , copyIAFinalToPrelim - , copyIAPrelimToFinal - , makeIAFinalFromPrelim - , makeIAPrelimFromFinal - , topologicalEqual - , getTopoUniqPhylogeneticGraph - , getBVUniqPhylogeneticGraph - , makeDummyLabEdge - , contractIn1Out1EdgesRename - , renameSimpleGraphNodes - , renameSimpleGraphNodesString - , hasNetNodeAncestorViolation - , convertGeneralGraphToPhylogeneticGraph - , selectGraphStochastic - , makeNewickList - , makeGraphTimeConsistent - , isNovelGraph - , getNodeType - , getDisplayTreeCostList - ) where - -import Bio.DynamicCharacter -import Control.Parallel.Strategies -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.Char as C -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import qualified Data.Vector.Generic as GV -import Debug.Trace -import GeneralUtilities -import qualified GraphFormatUtilities as GFU -import qualified GraphOptimization.Medians as M -import qualified ParallelUtilities as PU -import Text.Read -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U - --- | makeNewickList takes a list of fgl trees and outputs a single String cointaining the graphs in Newick format -makeNewickList :: Bool -> Bool -> Int -> [SimpleGraph] -> [VertexCost] -> String -makeNewickList writeEdgeWeight writeNodeLabel' rootIndex graphList costList = - let allTrees = L.foldl' (&&) True (fmap LG.isTree graphList) - - -- check for network HTU label requirement - writeNodeLabel = if allTrees then writeNodeLabel' - else if writeNodeLabel' then writeNodeLabel' - else - trace ("HTU labels are required for ENewick Output") - True - - graphString = GFU.fglList2ForestEnhancedNewickString (fmap (LG.rerootTree rootIndex) graphList) writeEdgeWeight writeNodeLabel - newickStringList = fmap init $ filter (not . null) $ lines graphString - costStringList = fmap (('[' :) . (++ "];\n")) (fmap show costList) - graphStringCost = concat $ zipWith (++) newickStringList costStringList - in - graphStringCost - --- | convertGeneralGraphToPhylogeneticGraph inputs a SimpleGraph and converts it to a Phylogenetic graph by: --- 1) transitive reduction -- removes anc <-> desc netork edges --- 2) ladderizes -- all vertices are (in degree, outdegree) (0,1|2) (1,2) (2,1) (1,0) --- by adding extra HTIs and edges --- 3) checks time consistency and removes edges stepwise from --- those that violate the most before/after splits of network edges --- arbitrary but deterministic --- 4) contracts out any remaning indegree 1 outdegree 1 nodes and renames HTUs in order --- these tests can be screwed up by imporperly formated graphs comming in (self edges, chained network edge etc) -convertGeneralGraphToPhylogeneticGraph :: String -> SimpleGraph -> SimpleGraph -convertGeneralGraphToPhylogeneticGraph failCorrect inGraph = - if LG.isEmpty inGraph then LG.empty - else - let -- remove single "tail" edge from root with single child, replace child node with root - noTailGraph = LG.contractRootOut1Edge inGraph - - -- remove indeg 1 out deg 1 edges - noIn1Out1Graph = contractIn1Out1EdgesRename noTailGraph - - -- transitive reduction - -- only wanted to EUN and CUN--but they do it - -- reducedGraph = LG.transitiveReduceGraph noIn1Out1Graph - - -- laderization of indegree and outdegree edges - ladderGraph = ladderizeGraph noIn1Out1Graph -- reducedGraph - - -- time consistency (after those removed by transitrive reduction) - timeConsistentGraph = makeGraphTimeConsistent failCorrect ladderGraph - - -- removes ancestor descendent edges transitiveReduceGraph should do this - -- but that looks at all nods not just vertex - noParentChainGraph = removeParentsInChain failCorrect timeConsistentGraph - - -- remove sister-sister edge. where two network nodes have same parents - noSisterSisterGraph = removeSisterSisterEdges failCorrect noParentChainGraph - - in - if LG.isEmpty timeConsistentGraph then LG.empty - - else if LG.isEmpty noParentChainGraph then LG.empty - - else if LG.isEmpty noSisterSisterGraph then LG.empty - - -- trace ("CGP orig:\n" ++ (LG.prettify inGraph) ++ "\nNew:" ++ (LG.prettify timeConsistentGraph)) - -- cycle check to make sure--can be removed when things working - -- else if LG.cyclic noSisterSisterGraph then error ("Cycle in graph : \n" ++ (LG.prettify noSisterSisterGraph)) - - -- this final need to ladderize or recontract? - else noSisterSisterGraph - --- | removeParentsInChain checks the parents of each netowrk node are not anc/desc of each other -removeParentsInChain :: String -> SimpleGraph -> SimpleGraph -removeParentsInChain failCorrect inGraph = - if LG.isEmpty inGraph then LG.empty - else - let (_, _, _, netVertexList) = LG.splitVertexList inGraph - parentNetVertList = fmap (LG.labParents inGraph) $ fmap fst netVertexList - - -- get list of nodes that are transitively equal in age - concurrentList = LG.mergeConcurrentNodeLists parentNetVertList [] - concurrentPairList = concatMap getListPairs concurrentList - - -- get pairs that violate concurrency - violatingConcurrentPairs = concatMap (LG.concurrentViolatePair inGraph) concurrentPairList - - -- get netowrk nodes with violations - parentNodeViolateList = concatMap pairToList violatingConcurrentPairs - childNodeViolateList = concatMap (LG.descendants inGraph) parentNodeViolateList - netNodeViolateList = filter (LG.isNetworkNode inGraph) childNodeViolateList - - netEdgesThatViolate = fmap LG.toEdge $ LG.inn inGraph $ head netNodeViolateList - - in - if null violatingConcurrentPairs then inGraph - else if null netNodeViolateList then error ("Should be neNode that violate") - else if null netEdgesThatViolate then error "Should be violating in edges" - else if failCorrect == "fail" then LG.empty - else - let edgeDeletedGraph = LG.delEdge (head netEdgesThatViolate) inGraph - newGraph = contractIn1Out1EdgesRename edgeDeletedGraph - in - -- trace ("PIC") - removeParentsInChain failCorrect newGraph - where pairToList (a,b) = [fst a, fst b] - --- | removeSisterSisterEdges takes a graph and recursively removes a single edge fomr where two network --- edges have the same two parents -removeSisterSisterEdges :: String -> SimpleGraph -> SimpleGraph -removeSisterSisterEdges failCorrect inGraph = - if LG.isEmpty inGraph then LG.empty - else - let sisterSisterEdges = LG.getSisterSisterEdgeList inGraph - -- newGraph = LG.delEdge (head sisterSisterEdges) inGraph - newGraph = LG.delEdges sisterSisterEdges inGraph - newGraph' = contractIn1Out1EdgesRename newGraph - in - if null sisterSisterEdges then inGraph - else if failCorrect == "fail" then LG.empty - else - -- trace ("Sister") - -- removeSisterSisterEdges - newGraph' - --- | makeGraphTimeConsistent takes laderized, transitive reduced graph and deletes --- network edges in an arbitrary but deterministic sequence to produce a phylogentic graphs suitable --- for swapping etc --- looks for violation of time between netork edges based on "before" and "after" --- tests of nodes that should be potentially same age --- removes second edge of second pair of two network edges in each case adn remakes graph -makeGraphTimeConsistent :: String -> SimpleGraph -> SimpleGraph -makeGraphTimeConsistent failOut inGraph = - if LG.isEmpty inGraph then LG.empty - else if LG.isTree inGraph then inGraph - else - let coevalNodeConstraintList = LG.coevalNodePairs inGraph - coevalNodeConstraintList' = PU.seqParMap rdeepseq (LG.addBeforeAfterToPair inGraph) coevalNodeConstraintList -- `using` PU.myParListChunkRDS - coevalPairsToCompareList = getListPairs coevalNodeConstraintList' - timeOffendingEdgeList = LG.getEdgesToRemoveForTime inGraph coevalPairsToCompareList - newGraph = LG.delEdges timeOffendingEdgeList inGraph - in - -- trace ("MGTC:" ++ (show timeOffendingEdgeList)) - if (failOut == "fail") && ((not . null) timeOffendingEdgeList) then LG.empty - else contractIn1Out1EdgesRename newGraph - --- | contractIn1Out1EdgesRename contracts in degree and outdegree edges and renames HTUs in index order --- does one at a time and makes a graph and recurses -contractIn1Out1EdgesRename :: SimpleGraph -> SimpleGraph -contractIn1Out1EdgesRename inGraph = - if LG.isEmpty inGraph then LG.empty - else - let newGraph = LG.contractIn1Out1Edges inGraph - in - renameSimpleGraphNodes newGraph - - --- | renameSimpleGraphNodes takes nodes and renames HTU nodes based on index -renameSimpleGraphNodes :: SimpleGraph -> SimpleGraph -renameSimpleGraphNodes inGraph = - if LG.isEmpty inGraph then LG.empty - else - let inNodes = LG.labNodes inGraph - nodeLabels = fmap (makeSimpleLabel inGraph) inNodes - newNodes = zip (fmap fst inNodes) nodeLabels - newEdges = LG.labEdges inGraph - in - --newGraph - -- trace ("C11: " ++ (show $ LG.getIsolatedNodes newGraph) ++ " => " ++ (show newNodes) ++ " " ++ (show $ fmap LG.toEdge newEdges)) - LG.mkGraph newNodes newEdges - where makeSimpleLabel g (a, b) = if (not $ LG.isLeaf g a) then T.pack $ "HTU" ++ show a - else b - --- | renameSimpleGraphNodesString takes nodes and renames HTU nodes based on index -renameSimpleGraphNodesString :: LG.Gr String String -> LG.Gr String String -renameSimpleGraphNodesString inGraph = - if LG.isEmpty inGraph then LG.empty - else - let inNodes = LG.labNodes inGraph - nodeLabels = fmap (makeSimpleLabel inGraph) inNodes - newNodes = zip (fmap fst inNodes) nodeLabels - newEdges = LG.labEdges inGraph - in - --newGraph - -- trace ("C11: " ++ (show $ LG.getIsolatedNodes newGraph) ++ " => " ++ (show newNodes) ++ " " ++ (show $ fmap LG.toEdge newEdges)) - LG.mkGraph newNodes newEdges - where makeSimpleLabel g (a, b) = if (not $ LG.isLeaf g a) then "HTU" ++ show a - else b - --- | sortEdgeListByLength sorts edge list by length (midRange), highest to lowest -sortEdgeListByLength :: [LG.LEdge EdgeInfo] -> [LG.LEdge EdgeInfo] -sortEdgeListByLength inEdgeList = - if null inEdgeList then [] - else - reverse $ L.sortOn (midRangeLength . thd3) inEdgeList - --- | ladderizeGraph is a wrapper around ladderizeGraph' to allow for mapping with --- local nodelist -ladderizeGraph :: SimpleGraph -> SimpleGraph -ladderizeGraph inGraph = ladderizeGraph' inGraph (LG.nodes inGraph) - --- | ladderize takes an input graph and ensures/creates nodes --- such that all vertices are (indegree, outdegree) (0,>0), (1,2) (2,1) (1,0) ---ladderizeGraph' :: SimpleGraph -> [LG.Node] -> SimpleGraph -ladderizeGraph' :: SimpleGraph -> [LG.Node] -> SimpleGraph -ladderizeGraph' inGraph nodeList - | LG.isEmpty inGraph = LG.empty - | null nodeList = inGraph - | otherwise = - let -- these are roots, network, tree, leaf nodes - okNodeDegrees = [(0,2),(1,2),(2,1),(1,0)] - firstNode = head nodeList - (inEdgeList, outEdgeList) = LG.getInOutEdges inGraph firstNode - inOutPairLength = (length inEdgeList, length outEdgeList) - in - -- trace ("node " ++ (show firstNode) ++ " " ++ (show inOutPairLength)) ( - -- node ok to keep - if inOutPairLength `elem` okNodeDegrees then ladderizeGraph' inGraph (tail nodeList) - -- node edges need modification - else - let newGraph = resolveNode inGraph firstNode (inEdgeList, outEdgeList) inOutPairLength - in - ladderizeGraph' newGraph (LG.nodes newGraph) - -- ) - --- | resolveNode takes a graph and node and inbound edgelist and outbound edge list --- and converts node to one of (indeg, outdeg) (0,1),(0,2),(1,2),(2,1),(1,0) --- this only resolves a single nodes edges at a time and then returns new graph --- when more hase to be done--that will occur on lultiple passes through nodes. --- perhaps not the most efficient, but only done once per input graph --- contracts indegree 1 outdegree 1 nodes -resolveNode :: SimpleGraph -> LG.Node -> ([LG.LEdge Double], [LG.LEdge Double]) -> (Int, Int) -> SimpleGraph -resolveNode inGraph curNode inOutPair@(inEdgeList, outEdgeList) (inNum, outNum) = - if LG.isEmpty inGraph then LG.empty - else - --trace ("Resolveing " ++ show (inNum, outNum)) ( - let numNodes = length $ LG.nodes inGraph - in - -- isolated node -- throw error - if inNum == 0 && outNum == 0 then error ("ResolveNode error: Isolated vertex " ++ show curNode ++ " in graph\n" ++ LG.prettify inGraph ) - - -- indegree 1 outdegree 1 node to contract - else if inNum == 1 && outNum == 1 then - let newEdge = (fst3 $ head inEdgeList, snd3 $ head outEdgeList, 0.0 :: Double) - newGraph = LG.insEdge newEdge $ LG.delNode curNode $ LG.delLEdges (inEdgeList ++ outEdgeList) inGraph - in - newGraph - - -- leaf leaf with too many parents - else if (inNum > 1) && (outNum == 0) || (inNum > 2) && (outNum == 1) || (inNum > 1) && (outNum == 2) then - let first2Edges = take 2 inEdgeList - newNode = (numNodes , T.pack $ ("HTU" ++ (show numNodes))) - newEdge1 = (fst3 $ head first2Edges, numNodes, 0.0 :: Double) - newEdge2 = (fst3 $ last first2Edges, numNodes, 0.0 :: Double) - newEdge3 = (numNodes, curNode, 0.0 :: Double) - newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph - in - newGraph - - else if (inNum < 2 || outNum > 2) then - let first2Edges = take 2 outEdgeList - newNode = (numNodes , T.pack $ ("HTU" ++ (show numNodes))) - newEdge1 = (numNodes, snd3 $ head first2Edges, 0.0 :: Double) - newEdge2 = (numNodes, snd3 $ last first2Edges, 0.0 :: Double) - newEdge3 = (curNode, numNodes, 0.0 :: Double) - newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph - in - newGraph - - -- node with too parents and too many children - -- converts to tree node--biased in that direction - else if (inNum > 2) && (outNum > 2) then - let first2Edges = take 2 inEdgeList - newNode = (numNodes , T.pack $ ("HTU" ++ (show numNodes))) - newEdge1 = (fst3 $ head first2Edges, numNodes, 0.0 :: Double) - newEdge2 = (fst3 $ last first2Edges, numNodes, 0.0 :: Double) - newEdge3 = (numNodes, curNode, 0.0 :: Double) - newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph - in - newGraph - - - -- root or simple network indegree node - else if (inNum == 0 || outNum > 2) then - let first2Edges = take 2 outEdgeList - newNode = (numNodes , T.pack $ ("HTU" ++ (show numNodes))) - newEdge1 = (numNodes, snd3 $ head first2Edges, 0.0 :: Double) - newEdge2 = (numNodes, snd3 $ last first2Edges, 0.0 :: Double) - newEdge3 = (curNode, numNodes, 0.0 :: Double) - newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph - in - newGraph - - - else error ("This can't happen in resolveNode in/out edge lists don't need to be resolved " ++ show inOutPair ++ "\n" ++ LG.prettify inGraph) - -- ) - --- | convertDecoratedToSimpleGraph -convertDecoratedToSimpleGraph :: DecoratedGraph -> SimpleGraph -convertDecoratedToSimpleGraph inDec = - if LG.isEmpty inDec then LG.empty - else - let decNodeList = LG.labNodes inDec - newNodeLabels = fmap vertName $ fmap snd decNodeList - simpleNodes = zip (fmap fst decNodeList) newNodeLabels - - {- - decEdgeList = LG.labEdges inDec - sourceList = fmap fst3 decEdgeList - sinkList = fmap snd3 decEdgeList - newEdgeLables = replicate (length sourceList) 0.0 -- fmap midRangeLength $ fmap thd3 decEdgeList - simpleEdgeList = zip3 sourceList sinkList newEdgeLables - -} - simpleEdgeList = fmap convertToSimpleEdge $ LG.labEdges inDec - in - LG.mkGraph simpleNodes simpleEdgeList - - --- | convertToSimpleEdge takes a lables edge and relabels with 0.0 -convertToSimpleEdge :: LG.LEdge EdgeInfo -> LG.LEdge Double -convertToSimpleEdge (a, b, c) = (a, b, minLength c) - --- | graphCostFromNodes takes a Decorated graph and returns its cost by summing up the local costs --- of its nodes -graphCostFromNodes :: DecoratedGraph -> Double -graphCostFromNodes inGraph = - if LG.isEmpty inGraph then 0.0 - else - sum $ fmap vertexCost $ fmap snd $ LG.labNodes inGraph - --- | dichotomizeRoot takes greaph and dichotimizes not dichotomous roots in graph -dichotomizeRoot :: Int -> SimpleGraph -> SimpleGraph -dichotomizeRoot lOutgroupIndex inGraph = - if LG.isEmpty inGraph then LG.empty - else - let rootList = LG.getRoots inGraph - currentRoot = fst $ head rootList - rootEdgeList = LG.out inGraph $ currentRoot - in - -- not a tree error - if (length rootList /= 1) then error ("Graph input to dichotomizeRoot is not a tree--not single root:" ++ (show rootList)) - - -- nothing to do - else if (length rootEdgeList < 3) then inGraph - else - let numVertices = length $ LG.nodes inGraph - newNode = (numVertices, T.pack $ show numVertices) - edgesToDelete = filter ((/=lOutgroupIndex) . snd3) rootEdgeList - newEdgeDestinations = fmap snd3 edgesToDelete - newEdgeStarts = replicate (length newEdgeDestinations) numVertices - newEdgeLabels = replicate (length newEdgeDestinations) 0.0 - -- nub for case where root edge in "wrong" direction - -- doesn't filter edges to delete properly - newEdgesNewNode = L.nub $ zip3 newEdgeStarts newEdgeDestinations newEdgeLabels - newRootEdge = (currentRoot, numVertices, 0.0) - in - LG.delLEdges edgesToDelete $ LG.insEdges (newRootEdge : newEdgesNewNode) $ LG.insNode newNode inGraph - - --- | showBlockGraphs takes a vector of vector of DecoratedGraphs and converte and prettifies outputting a String -showDecGraphs :: V.Vector (V.Vector DecoratedGraph) -> String -showDecGraphs inDecVV = - if V.null inDecVV then [] - else - concat $ fmap concat $ V.toList $ fmap V.toList $ fmap (fmap LG.prettify) $ fmap (fmap convertDecoratedToSimpleGraph) inDecVV - --- | selectPhylogeneticGraph takes a series OF arguments and an input list ot PhylogeneticGraphs --- and returns or filters that list based on options. --- uses selectListCostPairs in GeneralUtilities -selectPhylogeneticGraph :: [Argument] -> Int -> [String] -> [PhylogeneticGraph] -> [PhylogeneticGraph] -selectPhylogeneticGraph inArgs rSeed selectArgList curGraphs = - if null curGraphs then [] - else - let fstArgList = fmap (fmap C.toLower . fst) inArgs - sndArgList = fmap (fmap C.toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "select" fstArgList selectArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'select': " ++ show inArgs) - else if length inArgs > 1 then errorWithoutStackTrace ("Can only have a single select type per command: " ++ show inArgs) - else - let doBest = not $ not (any ((=="best").fst) lcArgList) - doAll = not $ not (any ((=="all").fst) lcArgList) - doRandom = not $ not (any ((=="atrandom").fst) lcArgList) - doUnique = not $ not (any ((=="unique").fst) lcArgList) - numberToKeep - | null lcArgList = Just (maxBound :: Int) - | null $ snd $ head lcArgList = Just (maxBound :: Int) - | otherwise = readMaybe (snd $ head lcArgList) :: Maybe Int - in - if doAll then curGraphs - else if isNothing numberToKeep then errorWithoutStackTrace ("Number to keep specification not an integer: " ++ show (snd $ head lcArgList)) - else - let -- minimum graph cost - minGraphCost = minimum $ fmap snd6 curGraphs - - -- collapse zero-length branchs for unique - curGraphsCollapsed = fmap U.collapseGraph curGraphs - - -- keep only unique graphs based on non-zero edges--in sorted by cost - uniqueGraphList = L.sortOn snd6 $ getUniqueGraphs'' (zip curGraphs curGraphsCollapsed)-- curGraphs -- True curGraphs -- getBVUniqPhylogeneticGraph True curGraphs -- getTopoUniqPhylogeneticGraph True curGraphs - - -- this to avaoid alot of unncesesary graph comparisons for 'best' graphs - bestCostGraphs = filter ((== minGraphCost).snd6) curGraphs - uniqueBestGraphs = getUniqueGraphs'' (zip bestCostGraphs (fmap U.collapseGraph bestCostGraphs)) - - in - if doUnique then take (fromJust numberToKeep) uniqueGraphList - else if doBest then - -- trace ("SPG: " ++ (show (minGraphCost, length uniqueGraphList, fmap snd6 uniqueGraphList))) - take (fromJust numberToKeep) uniqueBestGraphs - else if doRandom then - let randList = head $ shuffleInt rSeed 1 [0..(length curGraphs - 1)] - (_, shuffledGraphs) = unzip $ L.sortOn fst $ zip randList curGraphs - in - take (fromJust numberToKeep) shuffledGraphs - -- default is all best and unique - else - uniqueBestGraphs - --- | getUniqueGraphs takes each pair of non-zero edges and conpares them--if equal not added to list --- maybe chnge to nub LG.pretify graphList? -getUniqueGraphs :: Bool -> [PhylogeneticGraph] -> [PhylogeneticGraph] -getUniqueGraphs removeZeroEdges inGraphList = - if null inGraphList then [] - else - let inGraphEdgeList = if removeZeroEdges then fmap (filter ((> 0.0) . minLength . thd3)) $ fmap LG.labEdges $ fmap thd6 inGraphList - else fmap LG.labEdges $ fmap thd6 inGraphList - in - getUniqueGraphs' (zip inGraphEdgeList inGraphList) [] - - --- | getUniqueGraphs Using fgl == --- basically a nub --- need to add a collapse function for compare as well --- takes pairs of (noCollapsed, collapsed) phylogenetic graphs, --- make strings based on collapsed and returns not collpased -getUniqueGraphs'' :: [(PhylogeneticGraph, PhylogeneticGraph)] -> [PhylogeneticGraph] -getUniqueGraphs'' inList = nubGraph [] inList - --- | isNovelGraph checks if a graph is in list of existing graphs --- uses colapsed representation -isNovelGraph :: [PhylogeneticGraph] -> PhylogeneticGraph -> Bool -isNovelGraph graphList testGraph = - if null graphList then True - else - let collapsedInGraph = (LG.prettyIndices . fst6 . U.collapseGraph) testGraph - collapseGraphList = fmap (LG.prettyIndices . fst6 . U.collapseGraph) graphList - matchList = filter (== collapsedInGraph) collapseGraphList - in - -- trace ("IsNovel: " ++ (show $ null matchList)) - null matchList - --- | keeps and returns unique graphs based on Eq of Topological Simple Graph --- String prettyIndices w/0 HTU names and branch lengths --- arbitrarily rooted on 0 for oonsistency ---reversed to keep original order in case sorted on length -nubGraph :: [(PhylogeneticGraph, PhylogeneticGraph, String)] -> [(PhylogeneticGraph, PhylogeneticGraph)] -> [PhylogeneticGraph] -nubGraph curList inList = - if null inList then reverse $ fmap fst3 curList - else - let (firstGraphNC, firstGraphC) = head inList - firstString = LG.prettyIndices $ thd6 firstGraphNC - isMatch = filter (== firstString) (fmap thd3 curList) - in - -- trace ("NG: " ++ (show $ null isMatch) ++ " " ++ firstString) ( - if null curList then nubGraph [(firstGraphNC, firstGraphC, firstString)] (tail inList) - else if null isMatch then nubGraph ((firstGraphNC, firstGraphC, firstString) : curList) (tail inList) - else nubGraph curList (tail inList) - -- ) - --- | getUniqueGraphs takes each pair of non-zero edges and compares them--if equal not added to list -getUniqueGraphs' :: [([LG.LEdge EdgeInfo], PhylogeneticGraph)] -> [([LG.LEdge EdgeInfo], PhylogeneticGraph)] -> [PhylogeneticGraph] -getUniqueGraphs' inGraphPairList currentUniquePairs = - if null inGraphPairList then fmap snd currentUniquePairs - else - let firstPair@(firstEdges, _) = head inGraphPairList - in - if null currentUniquePairs then getUniqueGraphs' (tail inGraphPairList) [firstPair] - else - let equalList = filter (== True) $ fmap ((== firstEdges) . fst) currentUniquePairs - in - if null equalList then getUniqueGraphs' (tail inGraphPairList) (firstPair : currentUniquePairs) - else getUniqueGraphs' (tail inGraphPairList) currentUniquePairs - --- | getNodeType returns node type for Node -getNodeType :: (Show a, Show b) => LG.Gr a b -> LG.Node -> NodeType -getNodeType inGraph inNode = - if not $ LG.gelem inNode inGraph then error ("Node " ++ (show inNode) ++ " not in graph\n" ++ (GFU.showGraph inGraph)) - else if LG.isLeaf inGraph inNode then LeafNode - else if LG.isTreeNode inGraph inNode then TreeNode - else if LG.isNetworkNode inGraph inNode then NetworkNode - else if LG.isRoot inGraph inNode then RootNode - else error ("Node type " ++ (show inNode) ++ " not Leaf, Tree, Network, or Root in graph\n" ++ (GFU.showGraph inGraph)) - --- | copyIAFinalToPrelim takes a Decorated graph and copies --- the IA final fields to preliminary IA states--this for IA only optimization --- inswapping and other operations. Thi sis done becasue the "preliminary" IA states --- are only known after full post/pre traversals -copyIAFinalToPrelim :: DecoratedGraph -> DecoratedGraph -copyIAFinalToPrelim inGraph = - if LG.isEmpty inGraph then error "Empty input graph in copyIAFinalToPrelim" - else - let nodes = LG.labNodes inGraph - edges = LG.labEdges inGraph - newNodes = fmap makeIAPrelimFromFinal nodes - in - LG.mkGraph newNodes edges - --- | makeIAPrelimFromFinal updates the label of a node for IA states --- setting preliminary to final -makeIAPrelimFromFinal :: LG.LNode VertexInfo -> LG.LNode VertexInfo -makeIAPrelimFromFinal (inIndex, label) = - let labData = vertData label - newLabData = fmap (fmap f) labData - in - (inIndex, label {vertData = newLabData}) - where f c = if (GV.null $ slimIAFinal c) && (GV.null $ wideIAFinal c) && (GV.null $ hugeIAFinal c) then c - else if (not $ GV.null $ slimIAFinal c) then c {slimIAPrelim = M.makeDynamicCharacterFromSingleVector $ slimIAFinal c} - else if (not $ GV.null $ wideIAFinal c) then c {wideIAPrelim = M.makeDynamicCharacterFromSingleVector $ wideIAFinal c} - else c {hugeIAPrelim = M.makeDynamicCharacterFromSingleVector $ hugeIAFinal c} - - --- | copyIAPrelimToFinal takes a Decorated graph and copies --- the IA prelim fields to final IA states--this for IA only optimization --- inswapping and other operations. THis is fdone for root and leaf vertices -copyIAPrelimToFinal :: DecoratedGraph -> DecoratedGraph -copyIAPrelimToFinal inGraph = - if LG.isEmpty inGraph then error "Empty input graph in copyIAFinalToPrelim" - else - let nodes = LG.labNodes inGraph - edges = LG.labEdges inGraph - newNodes = fmap makeIAFinalFromPrelim nodes - in - LG.mkGraph newNodes edges - --- | makeIAFinalFomPrelim updates the label of a node for IA states --- setting final to preliminary -makeIAFinalFromPrelim:: LG.LNode VertexInfo -> LG.LNode VertexInfo -makeIAFinalFromPrelim (inIndex, label) = - let labData = vertData label - newLabData = fmap (fmap f) labData - in - (inIndex, label {vertData = newLabData}) - where f c = let newSlimIAFinal = extractMediansGapped $ slimIAPrelim c - newWideIAFinal = extractMediansGapped $ wideIAPrelim c - newHugeIAFinal = extractMediansGapped $ hugeIAPrelim c - in - if (GV.null $ snd3 $ slimIAPrelim c) && (GV.null $ snd3 $ wideIAPrelim c) && (GV.null $ snd3 $ hugeIAPrelim c) then c - else if (not $ GV.null $ snd3 $ slimIAPrelim c) then c {slimIAFinal = newSlimIAFinal} - else if (not $ GV.null $ snd3 $ wideIAPrelim c) then c {wideIAFinal = newWideIAFinal} - else c {hugeIAFinal = newHugeIAFinal} - - --- | getTopoUniqPhylogeneticGraph takes a list of phylogenetic graphs and returns --- list of topologically unique graphs--operatres on simple graph field --- noZeroEdges flag passed to remove zero weight edges -getTopoUniqPhylogeneticGraph :: Bool -> [PhylogeneticGraph] -> [PhylogeneticGraph] -getTopoUniqPhylogeneticGraph nonZeroEdges inPhyloGraphList = - if null inPhyloGraphList then [] - else - let uniqueBoolList = createUniqueBoolList nonZeroEdges (fmap fst6 inPhyloGraphList) [] - boolPair = zip inPhyloGraphList uniqueBoolList - in - fmap fst $ filter ((== True) . snd) boolPair - --- | createUniqueBoolList creates a list of Bool if graphs are unique--first occurrence is True, others False -createUniqueBoolList :: Bool -> [SimpleGraph] -> [(SimpleGraph,Bool)] -> [Bool] -createUniqueBoolList nonZeroEdges inGraphList boolAccum = - if null inGraphList then reverse $ fmap snd boolAccum - else - let firstGraph = head inGraphList - in - if null boolAccum then createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph,True) : boolAccum) - else - let checkList = filter (== True) $ fmap (topologicalEqual nonZeroEdges firstGraph) (fmap fst boolAccum) - in - if null checkList then createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph,True) : boolAccum) - else createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph, False) : boolAccum) - - - --- | topologicalEqual takes two simple graphs and returns True if graphs have same nodes and edges --- option to exclude zero weight edges -topologicalEqual :: Bool -> SimpleGraph -> SimpleGraph -> Bool -topologicalEqual nonZeroEdges g1 g2 = - if LG.isEmpty g1 && LG.isEmpty g2 then True - else if LG.isEmpty g1 || LG.isEmpty g2 then False - else - let nodesG1 = LG.labNodes g1 - nodesG2 = LG.labNodes g2 - edgesG1 = if nonZeroEdges then fmap LG.toEdge $ filter ((> 0). thd3) $ LG.labEdges g1 - else LG.edges g1 - edgesG2 = if nonZeroEdges then fmap LG.toEdge $ filter ((> 0). thd3) $ LG.labEdges g2 - else LG.edges g2 - in - if nodesG1 == nodesG2 && edgesG1 == edgesG2 then True - else False - --- | getEdgeMinLengthToNode takes a labelled node and returns the min length of --- the edge leading to the node -getEdgeMinLengthToNode ::[LG.LEdge EdgeInfo] -> LG.LNode a -> Double -getEdgeMinLengthToNode edgeList (node, _)= - let foundEdge = L.find ((== node) . snd3) edgeList - in - -- root node will be nor be in in edge set and need so set > 0 - if foundEdge == Nothing then 1.0 -- error ("Edge not found in getEdgeMinLengthToNode: node " ++ (show node) ++ " edge list " ++ (show edgeList)) - else minLength $ thd3 $ fromJust foundEdge - --- | getBVUniqPhylogeneticGraph takes a list of phylogenetic graphs and returns --- list of topologically unique graphs based on their node bitvector assignments --- operatres on Decorated graph field --- noZeroEdges flag passed to remove zero weight edges -getBVUniqPhylogeneticGraph :: Bool -> [PhylogeneticGraph] -> [PhylogeneticGraph] -getBVUniqPhylogeneticGraph nonZeroEdges inPhyloGraphList = - if null inPhyloGraphList then [] - else - let bvGraphList = fmap (getBVNodeList nonZeroEdges) $ fmap thd6 inPhyloGraphList - uniqueBoolList = createBVUniqueBoolList bvGraphList [] - boolPair = zip inPhyloGraphList uniqueBoolList - in - fmap fst $ filter ((== True) . snd) boolPair - - --- | getBVNodeList takes a DecoratedGraph and returns sorted list (by BV) of nodes --- removes node with zero edge weight to them if specified -getBVNodeList :: Bool -> DecoratedGraph -> [BV.BitVector] -getBVNodeList nonZeroEdges inGraph = - if LG.isEmpty inGraph then [] - else - let nodeList = LG.labNodes inGraph - edgeList = LG.labEdges inGraph - minLengthList = fmap (getEdgeMinLengthToNode edgeList) nodeList - nodePairList = filter ((> 0) . snd) $ zip nodeList minLengthList - bvNodeList = if nonZeroEdges then L.sort $ fmap bvLabel $ fmap snd $ fmap fst nodePairList - else L.sort $ fmap bvLabel $ fmap snd nodeList - in - bvNodeList - --- | createBVUniqueBoolList creates a list of Bool if graphs are unique by bitvecector node list --- first occurrence is True, others False --- assumes edges filterd b=y lenght already -createBVUniqueBoolList :: [[BV.BitVector]] -> [([BV.BitVector],Bool)] -> [Bool] -createBVUniqueBoolList inBVGraphListList boolAccum = - if null inBVGraphListList then reverse $ fmap snd boolAccum - else - let firstGraphList = head inBVGraphListList - in - if null boolAccum then createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList,True) : boolAccum) - else - let checkList = filter (== True) $ fmap (== firstGraphList) (fmap fst boolAccum) - in - if null checkList then createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList,True) : boolAccum) - else createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList, False) : boolAccum) - --- | makeDummyLabEdge takes an unlabelled edge and adds a dummy label -makeDummyLabEdge :: EdgeInfo -> LG.Edge -> LG.LEdge EdgeInfo -makeDummyLabEdge edgeLab (u,v) = (u,v,edgeLab) - --- | netNodeAncestorViolation checks whether one of the edge into a netowrk node (in 2) --- is cinnected to an ancestor (via the other parent) of the node --- this is a form of time violation since the parents of a network node must be --- at least possibly coeval --- this uses the bit vector label of nodes. If the other child of either parent node --- of a network node has non-zero intersection between the BV label of the network node --- and that other child of parent then they conecting edge is from and ancestral node hence a time violation --- O(n) n netork nodes in Graph, but checks all nodes to see if network -hasNetNodeAncestorViolation :: LG.Gr VertexInfo b -> Bool -hasNetNodeAncestorViolation inGraph = - if LG.isEmpty inGraph then error "Empty graph in hasNetNodeAncestorViolation" - else - let (_, _, _, netWorkNodeList) = LG.splitVertexList inGraph - hasAncViolationList = filter (== True) $ fmap (nodeAncViolation inGraph) netWorkNodeList - in - -- trace ("HNV: " ++ (show $ (not . null) hasAncViolationList)) - (not . null) hasAncViolationList - --- | nodeAncViolation checks a single node fo ancestrpo connection--he ceviolation --- should be O(1). Return True if violation -nodeAncViolation :: LG.Gr VertexInfo b -> LG.LNode VertexInfo -> Bool -nodeAncViolation inGraph inNode = - let parentList = LG.labParents inGraph (fst inNode) - in - if length parentList /= 2 then error ("Parent number should be 2: " ++ (show $ fst inNode) ++ " <- " ++ (show $ fmap fst parentList)) - else - let sisterNodes = concatMap (LG.sisterLabNodes inGraph) parentList - sisterBVData = fmap (bvLabel . snd) sisterNodes - inNodeBVData = bvLabel $ snd inNode - sisterBVIntersections = fmap (.&. inNodeBVData) sisterBVData - isAncInNode = filter (== inNodeBVData) sisterBVIntersections - in - (not . null) isAncInNode - --- | selectGraphStochastic takes a list of graphs and retuns a list of graphs chosen at Random --- using an exponential distribution based on graph cost difference divided by an input factor --- if factor is 0 then stringth graphs cost --- mprob acceptance = -exp [(cost - minCost)/ factor] --- returns n graphs by random criterion without replacment -selectGraphStochastic :: Int -> Int -> Double -> [PhylogeneticGraph] -> [PhylogeneticGraph] -selectGraphStochastic rSeed number factor inGraphList = - if null inGraphList then inGraphList - else if number >= length inGraphList then inGraphList - else - let randList' = randomIntList rSeed - randList = fmap abs (tail randList') - newSeed = head randList' - minCost = minimum $ fmap snd6 inGraphList - deltaList = fmap ((-) minCost) $ fmap snd6 inGraphList - probAcceptList = fmap (getProb factor) deltaList - - -- multiplier for resolution 1000, 100 prob be ok - randMultiplier = 1000 - randMultiplier' = fromIntegral randMultiplier - intAcceptList = fmap floor $ fmap (* randMultiplier') probAcceptList - (_, intRandValList) = unzip $ zipWith divMod randList (replicate (length inGraphList) randMultiplier) - acceptList = zipWith (<) intRandValList intAcceptList - - -- zip graphs with Bools - (returnGraphList, _) = unzip $ filter ((== True) .snd) $ zip inGraphList acceptList - - -- takes some random remainder to fiill out length of list - numLucky = number - (length returnGraphList) - luckyList = if numLucky > 0 then takeRandom newSeed numLucky (fmap fst $ filter ((== False) .snd) $ zip inGraphList acceptList) - else [] - - - in - trace ("SGS " ++ (show intAcceptList) ++ " " ++ (show intRandValList) ++ " -> " ++ (show acceptList)) - -- so no more than specified - take number $ returnGraphList ++ luckyList - - where getProb a b = exp ((-1) * b / a) - --- | getDisplayTreeCostList returns a list of teh "block" costs of display trees --- in a piar with any graph 'penalty' cost -getDisplayTreeCostList :: PhylogeneticGraph -> ([VertexCost], VertexCost) -getDisplayTreeCostList inGraph = - if LG.isEmpty $ thd6 inGraph then ([], 0.0) - else - let rootIndex = fst $ head $ LG.getRoots $ fst6 inGraph - displayTreeCharVect = fft6 inGraph - displayTreeCostVect = fmap (getBlockCost rootIndex) displayTreeCharVect - nonGraphCost = V.sum displayTreeCostVect - in - (V.toList displayTreeCostVect, (snd6 inGraph) - nonGraphCost) - --- | getBlockCost returns the cost, summed over characters, of a character block -getBlockCost :: LG.Node -> V.Vector DecoratedGraph -> VertexCost -getBlockCost rootIndex charGraphVect = - if V.null charGraphVect then 0.0 - else - V.sum $ fmap (getCharacterCost rootIndex) charGraphVect - --- | getCharacterCost returns charcter cost as root of character tree -getCharacterCost :: LG.Node -> DecoratedGraph -> VertexCost -getCharacterCost rootIndex inGraph = - if LG.isEmpty inGraph then 0.0 - else - let rootLabel = LG.lab inGraph rootIndex - in - if isNothing rootLabel then error ("Root index without label: " ++ (show rootIndex)) - else subGraphCost $ fromJust rootLabel - diff --git a/pkg/PhyGraph/Input/BitPack.hs b/pkg/PhyGraph/Input/BitPack.hs deleted file mode 100644 index b11ee6564..000000000 --- a/pkg/PhyGraph/Input/BitPack.hs +++ /dev/null @@ -1,1681 +0,0 @@ -{- | -Module : BitPack.hs -Description : Module with functionality to transform NonAdditive data to bit packed - Word64 structures -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - -module Input.BitPack - ( packNonAdditiveData - , median2Packed - , packedPreorder - , threeWayPacked - , threeWayPacked' - , unionPacked - , minMaxCharDiff - ) where - -import Control.Parallel.Strategies -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import qualified Data.List.Split as SL -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import qualified Data.Vector.Unboxed as UV -import qualified Data.Vector.Generic as GV -import Data.Word -import Debug.Trace -import GeneralUtilities -import qualified ParallelUtilities as PU -import Types.Types -import qualified Utilities.Utilities as U - - -{- -This module contains structures and functions for bit-packing operations -the basic idea is to transform the Bit-vector representation of non-additive -characters (whihc have effectively unlimited number of states) to more efficient -operations based on Word64 encodings. Additionally, this should reduce -memory footprint when large number of non-additive characters are -input (such as genomic SNP data). - -Several new types are cretged and manipulated based on the n umber of states -in the character--Packed2, Packed4, Packed5, Packed8, Packed64. These types hold -as subsets of bits (1 per state) multiple (other than for Packed64) original -non-additive characters. - -The optimizations are based on using bi-wise operations specific -to each packed type to create preliminary (post-order with cost) and final -(pre-order) states. Ideally the functions should contain no logic branches -or recursion (= loops) so that operations are solely bit-based. - -Methods are similar too those of Lamport 1975, Ronquist 1998, Moelannen 1999, Goloboff 2002, -and White and Holland 2011, but differ in detail due to -the programming language (Haskell) and need to maintain data structures -useful for network analysis. - -The basic character data structure is a vector of Word64, the original vector of bit-vectors -is split into a series of new characters depending on the number of states (in non-missing cells). - -A single Word64 can then hold 32 2-state. 16 4-state, 12 5-state (awkward but useful for DNA), -4 8-state, 1 64-state, and a bit-vector for >64 states. - -Character weights are all = 1 in static charcaters. This is ensured by organizeBlockData -in Input.Reorganize.hs basically--characters are multiplied by weight (if integer--otherwise not recoded) -So can check and only recode characters with weight of 1. --} - - -{- -Functions for median2 calculations of packed types -These are used in post-order graph traversals and pairwise -distance functions among others. --} - - -{- -Masks for various operations and state numbers --} - --- | mask2A first Mask for 2 state 64 bit --- 32 x (01) --- 6148914691236517205 -mask2A :: Word64 -mask2A = 0x5555555555555555 - --- | mask2B second Mask for 2 state 64 bit --- 32 x (10) --- 12297829382473034410 -mask2B :: Word64 -mask2B = 0xAAAAAAAAAAAAAAAA --- | mask4A first mask for 4 state 64 bits --- 8608480567731124087 --- 16 X (0111) -mask4A :: Word64 -mask4A = 0x7777777777777777 - --- | mask4B second mask for 4 states 64 bits --- 9838263505978427528 --- 16 X (1000) -mask4B :: Word64 -mask4B = 0x8888888888888888 - --- | mask4C mak for 4 states an 64 bits --- 4919131752989213764 --- 16 x (0100) -mask4C :: Word64 -mask4C = 0x4444444444444444 - --- | mask4D mak for 4 states an 64 bits --- 2459565876494606882 --- 16 x (0010) -mask4D :: Word64 -mask4D = 0x2222222222222222 - --- | mask4E mak for 4 states an 64 bits --- 1229782938247303441 --- 16 x (0001) -mask4E :: Word64 -mask4E = 0X1111111111111111 - -{- -5 state masks top 4 bits OFF may require to mask out top 4 bits for states and cost -top 4 OFF rest ON 0xFFFFFFFFFFFFFFF - 1152921504606846975 -top 4 ON rest OFF 0xF000000000000000 - 17293822569102704640 --} - --- | mask5A first mask for 5 states 64 bits -- need to check what state of top 4 bits--should be OFF I think --- 12 x (01111) --- 557865244164603375 -mask5A :: Word64 -mask5A = 0x7BDEF7BDEF7BDEF - --- | mask5B second mask for 5 states 64 bits -- need to check what state of top 4 bits-these are OFF --- 12 x (10000) --- 595056260442243600 (top 4 OFF) v 17888878829544948240 (top 4 ON) --- 0x842108421084210 (top 4 OFF) v F842108421084210 (top 4 ON) -mask5B :: Word64 -mask5B = 0x842108421084210 --- | mask5C mask 5 states 64 bits --- 12 x (01000) --- 297528130221121800 -mask5C :: Word64 -mask5C = 0x421084210842108 - --- | mask5D mask 5 states 64 bits --- 12 x (00100) --- 148764065110560900 -mask5D :: Word64 -mask5D = 0x210842108421084 - --- | mask5E mask 5 states 64 bits --- 12 x (00010) --- 74382032555280450 -mask5E :: Word64 -mask5E = 0x108421084210842 - --- | mask5F mask 5 states 64 bits --- 12 x (00001) --- 37191016277640225 -mask5F :: Word64 -mask5F = 0x84210842108421 - --- | mask8A first mask for 8 states 64 bits --- 8 x (01111111) --- 9187201950435737471 -mask8A :: Word64 -mask8A = 0x7F7F7F7F7F7F7F7F - --- | mask8B second mask for 8 states 64 bits --- 8 x (10000000) --- 9259542123273814144 -mask8B :: Word64 -mask8B = 0x8080808080808080 - --- | mask8C mask for 8 states 64 bits --- 8 x (01000000) --- 4629771061636907072 -mask8C :: Word64 -mask8C = 0x4040404040404040 - --- | mask8D mask for 8 states 64 bits --- 8 x (00100000) --- 2314885530818453536 -mask8D :: Word64 -mask8D = 0x2020202020202020 - --- | mask8E mask for 8 states 64 bits --- 8 x (00010000) --- 1157442765409226768 -mask8E :: Word64 -mask8E = 0x1010101010101010 - --- | mask8F mask for 8 states 64 bits --- 8 x (00001000) --- 578721382704613384 -mask8F :: Word64 -mask8F = 0x808080808080808 - --- | mask8G mask for 8 states 64 bits --- 8 x (00000100) --- 289360691352306692 -mask8G :: Word64 -mask8G = 0x404040404040404 - --- | mask8H mask for 8 states 64 bits --- 8 x (00000010) --- 144680345676153346 -mask8H :: Word64 -mask8H = 0x202020202020202 - --- | mask8I mask for 8 states 64 bits --- 8 x (00000001) --- 72340172838076673 -mask8I :: Word64 -mask8I = 0x101010101010101 - --- | mask2scN 11(32x00) mask to reveal states of Nth subcharacter in 64Bit Word -mask2sc0 :: Word64 -mask2sc0 = 0x3 - -mask2sc1 :: Word64 -mask2sc1 = shiftL mask2sc0 (2 * 1) - -mask2sc2 :: Word64 -mask2sc2 = shiftL mask2sc0 (2 * 2) - -mask2sc3 :: Word64 -mask2sc3 = shiftL mask2sc0 (2 * 3) - -mask2sc4 :: Word64 -mask2sc4 = shiftL mask2sc0 (2 * 4) - -mask2sc5 :: Word64 -mask2sc5 = shiftL mask2sc0 (2 * 5) - -mask2sc6 :: Word64 -mask2sc6 = shiftL mask2sc0 (2 * 6) - -mask2sc7 :: Word64 -mask2sc7 = shiftL mask2sc0 (2 * 7) - -mask2sc8 :: Word64 -mask2sc8 = shiftL mask2sc0 (2 * 8) - -mask2sc9 :: Word64 -mask2sc9 = shiftL mask2sc0 (2 * 9) - -mask2sc10 :: Word64 -mask2sc10 = shiftL mask2sc0 (2 * 10) - -mask2sc11 :: Word64 -mask2sc11 = shiftL mask2sc0 (2 * 11) - -mask2sc12 :: Word64 -mask2sc12 = shiftL mask2sc0 (2 * 12) - -mask2sc13 :: Word64 -mask2sc13 = shiftL mask2sc0 (2 * 13) - -mask2sc14 :: Word64 -mask2sc14 = shiftL mask2sc0 (2 * 14) - -mask2sc15 :: Word64 -mask2sc15 = shiftL mask2sc0 (2 * 15) - -mask2sc16 :: Word64 -mask2sc16 = shiftL mask2sc0 (2 * 16) - -mask2sc17 :: Word64 -mask2sc17 = shiftL mask2sc0 (2 * 17) - -mask2sc18 :: Word64 -mask2sc18 = shiftL mask2sc0 (2 * 18) - -mask2sc19 :: Word64 -mask2sc19 = shiftL mask2sc0 (2 * 19) - -mask2sc20 :: Word64 -mask2sc20 = shiftL mask2sc0 (2 * 20) - -mask2sc21 :: Word64 -mask2sc21 = shiftL mask2sc0 (2 * 21) - -mask2sc22 :: Word64 -mask2sc22 = shiftL mask2sc0 (2 * 22) - -mask2sc23 :: Word64 -mask2sc23 = shiftL mask2sc0 (2 * 23) - -mask2sc24 :: Word64 -mask2sc24 = shiftL mask2sc0 (2 * 24) - -mask2sc25 :: Word64 -mask2sc25 = shiftL mask2sc0 (2 * 25) - -mask2sc26 :: Word64 -mask2sc26 = shiftL mask2sc0 (2 * 26) - -mask2sc27 :: Word64 -mask2sc27 = shiftL mask2sc0 (2 * 27) - -mask2sc28 :: Word64 -mask2sc28 = shiftL mask2sc0 (2 * 28) - -mask2sc29 :: Word64 -mask2sc29 = shiftL mask2sc0 (2 * 29) - -mask2sc30 :: Word64 -mask2sc30 = shiftL mask2sc0 (2 * 30) - -mask2sc31 :: Word64 -mask2sc31 = shiftL mask2sc0 (2 * 31) - --- | mask4scN 1111(16x0000) mask to reveal states of Nth subcharacter in 64Bit Word -mask4sc0 :: Word64 -mask4sc0 = 0xF - -mask4sc1 :: Word64 -mask4sc1 = shiftL mask4sc0 (4 * 1) - -mask4sc2 :: Word64 -mask4sc2 = shiftL mask4sc0 (4 * 2) - -mask4sc3 :: Word64 -mask4sc3 = shiftL mask4sc0 (4 * 3) - -mask4sc4 :: Word64 -mask4sc4 = shiftL mask4sc0 (4 * 4) - -mask4sc5 :: Word64 -mask4sc5 = shiftL mask4sc0 (4 * 5) - -mask4sc6 :: Word64 -mask4sc6 = shiftL mask4sc0 (4 * 6) - -mask4sc7 :: Word64 -mask4sc7 = shiftL mask4sc0 (4 * 7) - -mask4sc8 :: Word64 -mask4sc8 = shiftL mask4sc0 (4 * 8) - -mask4sc9 :: Word64 -mask4sc9 = shiftL mask4sc0 (4 * 9) - -mask4sc10 :: Word64 -mask4sc10 = shiftL mask4sc0 (4 * 10) - -mask4sc11 :: Word64 -mask4sc11 = shiftL mask4sc0 (4 * 11) - -mask4sc12 :: Word64 -mask4sc12 = shiftL mask4sc0 (4 * 12) - -mask4sc13 :: Word64 -mask4sc13 = shiftL mask4sc0 (4 * 13) - -mask4sc14 :: Word64 -mask4sc14 = shiftL mask4sc0 (4 * 14) - -mask4sc15 :: Word64 -mask4sc15 = shiftL mask4sc0 (4 * 15) - --- | mask5scN 11111(12x00000) mask to reveal states of Nth subcharacter in 64Bit Word -mask5sc0 :: Word64 -mask5sc0 = 0x1F - -mask5sc1 :: Word64 -mask5sc1 = shiftL mask5sc0 (5 * 1) - -mask5sc2 :: Word64 -mask5sc2 = shiftL mask5sc0 (5 * 2) - -mask5sc3 :: Word64 -mask5sc3 = shiftL mask5sc0 (5 * 3) - -mask5sc4 :: Word64 -mask5sc4 = shiftL mask5sc0 (5 * 4) - -mask5sc5 :: Word64 -mask5sc5 = shiftL mask5sc0 (5 * 5) - -mask5sc6 :: Word64 -mask5sc6 = shiftL mask5sc0 (5 * 6) - -mask5sc7 :: Word64 -mask5sc7 = shiftL mask5sc0 (5 * 7) - -mask5sc8 :: Word64 -mask5sc8 = shiftL mask5sc0 (5 * 8) - -mask5sc9 :: Word64 -mask5sc9 = shiftL mask5sc0 (5 * 9) - -mask5sc10 :: Word64 -mask5sc10 = shiftL mask5sc0 (5 * 10) - -mask5sc11 :: Word64 -mask5sc11 = shiftL mask5sc0 (5 * 11) - --- | mask8scN 11111111(7x00000000) mask to reveal states of Nth subcharacter in 64Bit Word -mask8sc0 :: Word64 -mask8sc0 = 0xFF - -mask8sc1 :: Word64 -mask8sc1 = shiftL mask8sc0 (8 * 1) - -mask8sc2 :: Word64 -mask8sc2 = shiftL mask8sc0 (8 * 2) - -mask8sc3 :: Word64 -mask8sc3 = shiftL mask8sc0 (8 * 3) - -mask8sc4 :: Word64 -mask8sc4 = shiftL mask8sc0 (8 * 4) - -mask8sc5 :: Word64 -mask8sc5 = shiftL mask8sc0 (8 * 5) - -mask8sc6 :: Word64 -mask8sc6 = shiftL mask8sc0 (8 * 6) - -mask8sc7 :: Word64 -mask8sc7 = shiftL mask8sc0 (8 * 7) - -{-- -Lists of sub-character masks for operations over packed characters --} - -packed2SubCharList :: [Word64] -packed2SubCharList = [mask2sc0, mask2sc1, mask2sc2, mask2sc3, mask2sc4, mask2sc5, mask2sc6, mask2sc7, mask2sc8, mask2sc9, - mask2sc10, mask2sc11, mask2sc12, mask2sc13, mask2sc14, mask2sc15, mask2sc16, mask2sc17, mask2sc18, mask2sc19, - mask2sc20, mask2sc21, mask2sc22, mask2sc23, mask2sc24, mask2sc25, mask2sc26, mask2sc27, mask2sc28, mask2sc29, - mask2sc30, mask2sc31] - -packed4SubCharList :: [Word64] -packed4SubCharList = [mask4sc0, mask4sc1, mask4sc2, mask4sc3, mask4sc4, mask4sc5, mask4sc6, mask4sc7, mask4sc8, mask4sc9, - mask4sc10, mask4sc11, mask4sc12, mask4sc13, mask4sc14, mask4sc15] - -packed5SubCharList :: [Word64] -packed5SubCharList = [mask5sc0, mask5sc1, mask5sc2, mask5sc3, mask5sc4, mask5sc5, mask5sc6, mask5sc7, mask5sc8, mask5sc9, - mask5sc10, mask5sc11] - -packed8SubCharList :: [Word64] -packed8SubCharList = [mask8sc0, mask8sc1, mask8sc2, mask8sc3, mask8sc4, mask8sc5, mask8sc6, mask8sc7] - -{- -Packed character minimum and maximum length functions --} - --- | mainMxCharDiff get the approximate minimum and maximum difference in number of states --- uses masking with &/== -minMaxCharDiff :: CharType -> (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxCharDiff inCharType bitCosts a b = - let (minVal, maxVal) = if inCharType == Packed2 then minMaxPacked2 bitCosts a b - else if inCharType == Packed4 then minMaxPacked4 bitCosts a b - else if inCharType == Packed5 then minMaxPacked5 bitCosts a b - else if inCharType == Packed8 then minMaxPacked8 bitCosts a b - else if inCharType == Packed64 then minMaxPacked64 bitCosts a b - else error ("Character type " ++ show inCharType ++ " unrecognized/not implemented") - in - (minVal, maxVal) - - --- | minMaxPacked2 minium and maximum cost 32x2 bit nonadditive character --- the popcount for equality A/C -> A/C is identical but could be A->C so max 1 --- basically unrolled to make faster -minMaxPacked2 :: (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxPacked2 (lNoChangeCost, lChangeCost) a b = - let a0 = a .&. mask2sc0 - b0 = b .&. mask2sc0 - max0 = if a0 == (0 :: Word64) then 0 - else if (a0 .&. b0) == (0 :: Word64) then 1 - else if (a0 == b0) then 0 else 1 - - a1 = a .&. mask2sc1 - b1 = b .&. mask2sc1 - max1 = if a1 == (0 :: Word64) then 0 - else if (a1 .&. b1) == (0 :: Word64) then 1 - else if (a1 == b1) then 0 else 1 - - a2 = a .&. mask2sc2 - b2 = b .&. mask2sc2 - max2 = if a2 == (0 :: Word64) then 0 - else if (a2 .&. b2) == (0 :: Word64) then 1 - else if (a2 == b2) then 0 else 1 - - a3 = a .&. mask2sc3 - b3 = b .&. mask2sc3 - max3 = if a3 == (0 :: Word64) then 0 - else if (a3 .&. b3) == (0 :: Word64) then 1 - else if (a3 == b3) then 0 else 1 - - a4 = a .&. mask2sc4 - b4 = b .&. mask2sc4 - max4 = if a4 == (0 :: Word64) then 0 - else if (a4 .&. b4) == (0 :: Word64) then 1 - else if (a4 == b4) then 0 else 1 - - a5 = a .&. mask2sc5 - b5 = b .&. mask2sc5 - max5 = if a5 == (0 :: Word64) then 0 - else if (a5 .&. b5) == (0 :: Word64) then 1 - else if (a5 == b5) then 0 else 1 - - a6 = a .&. mask2sc6 - b6 = b .&. mask2sc6 - max6 = if a6 == (0 :: Word64) then 0 - else if (a6 .&. b6) == (0 :: Word64) then 1 - else if (a6 == b6) then 0 else 1 - - a7 = a .&. mask2sc7 - b7 = b .&. mask2sc7 - max7 = if a7 == (0 :: Word64) then 0 - else if (a7 .&. b7) == (0 :: Word64) then 1 - else if (a7 == b7) then 0 else 1 - - a8 = a .&. mask2sc8 - b8 = b .&. mask2sc8 - max8 = if a8 == (0 :: Word64) then 0 - else if (a8 .&. b8) == (0 :: Word64) then 1 - else if (a8 == b8) then 0 else 1 - - a9 = a .&. mask2sc9 - b9 = b .&. mask2sc9 - max9 = if a9 == (0 :: Word64) then 0 - else if (a9 .&. b9) == (0 :: Word64) then 1 - else if (a9 == b9) then 0 else 1 - - a10 = a .&. mask2sc10 - b10 = b .&. mask2sc10 - max10 = if a10 == (0 :: Word64) then 0 - else if (a10 .&. b10) == (0 :: Word64) then 1 - else if (a10 == b10) then 0 else 1 - - a11 = a .&. mask2sc11 - b11 = b .&. mask2sc11 - max11 = if a11 == (0 :: Word64) then 0 - else if (a11 .&. b11) == (0 :: Word64) then 1 - else if (a11 == b11) then 0 else 1 - - a12 = a .&. mask2sc12 - b12 = b .&. mask2sc12 - max12 = if a12 == (0 :: Word64) then 0 - else if (a12 .&. b12) == (0 :: Word64) then 1 - else if (a12 == b12) then 0 else 1 - - a13 = a .&. mask2sc13 - b13 = b .&. mask2sc13 - max13 = if a13 == (0 :: Word64) then 0 - else if (a13 .&. b13) == (0 :: Word64) then 1 - else if (a13 == b13) then 0 else 1 - - a14 = a .&. mask2sc14 - b14 = b .&. mask2sc14 - max14 = if a14 == (0 :: Word64) then 0 - else if (a14 .&. b14) == (0 :: Word64) then 1 - else if (a14 == b14) then 0 else 1 - - a15 = a .&. mask2sc15 - b15 = b .&. mask2sc15 - max15 = if a15 == (0 :: Word64) then 0 - else if (a15 .&. b15) == (0 :: Word64) then 1 - else if (a15 == b15) then 0 else 1 - - a16 = a .&. mask2sc16 - b16 = b .&. mask2sc16 - max16 = if a16 == (0 :: Word64) then 0 - else if (a16 .&. b16) == (0 :: Word64) then 1 - else if (a16 == b16) then 0 else 1 - - a17 = a .&. mask2sc17 - b17 = b .&. mask2sc17 - max17 = if a17 == (0 :: Word64) then 0 - else if (a17 .&. b17) == (0 :: Word64) then 1 - else if (a17 == b17) then 0 else 1 - - a18 = a .&. mask2sc18 - b18 = b .&. mask2sc18 - max18 = if a18 == (0 :: Word64) then 0 - else if (a18 .&. b18) == (0 :: Word64) then 1 - else if (a1 == b18) then 0 else 1 - - a19 = a .&. mask2sc19 - b19 = b .&. mask2sc19 - max19 = if a19 == (0 :: Word64) then 0 - else if (a19 .&. b19) == (0 :: Word64) then 1 - else if (a19 == b19) then 0 else 1 - - a20 = a .&. mask2sc20 - b20 = b .&. mask2sc20 - max20 = if a20 == (0 :: Word64) then 0 - else if (a20 .&. b20) == (0 :: Word64) then 1 - else if (a20 == b20) then 0 else 1 - - a21 = a .&. mask2sc21 - b21 = b .&. mask2sc21 - max21 = if a21 == (0 :: Word64) then 0 - else if (a21 .&. b21) == (0 :: Word64) then 1 - else if (a21 == b21) then 0 else 1 - - a22 = a .&. mask2sc22 - b22 = b .&. mask2sc22 - max22 = if a22 == (0 :: Word64) then 0 - else if (a22 .&. b22) == (0 :: Word64) then 1 - else if (a22 == b22) then 0 else 1 - - a23 = a .&. mask2sc23 - b23 = b .&. mask2sc23 - max23 = if a23 == (0 :: Word64) then 0 - else if (a23 .&. b23) == (0 :: Word64) then 1 - else if (a23 == b23) then 0 else 1 - - a24 = a .&. mask2sc24 - b24 = b .&. mask2sc24 - max24 = if a24 == (0 :: Word64) then 0 - else if (a24 .&. b24) == (0 :: Word64) then 1 - else if (a24 == b24) then 0 else 1 - - a25 = a .&. mask2sc25 - b25 = b .&. mask2sc25 - max25 = if a25 == (0 :: Word64) then 0 - else if (a25 .&. b25) == (0 :: Word64) then 1 - else if (a25 == b25) then 0 else 1 - - a26 = a .&. mask2sc26 - b26 = b .&. mask2sc26 - max26 = if a26 == (0 :: Word64) then 0 - else if (a26 .&. b26) == (0 :: Word64) then 1 - else if (a26 == b26) then 0 else 1 - - a27 = a .&. mask2sc27 - b27 = b .&. mask2sc27 - max27 = if a27 == (0 :: Word64) then 0 - else if (a27 .&. b27) == (0 :: Word64) then 1 - else if (a27 == b27) then 0 else 1 - - a28 = a .&. mask2sc28 - b28 = b .&. mask2sc28 - max28 = if a28 == (0 :: Word64) then 0 - else if (a28 .&. b28) == (0 :: Word64) then 1 - else if (a28 == b28) then 0 else 1 - - a29 = a .&. mask2sc29 - b29 = b .&. mask2sc29 - max29 = if a29 == (0 :: Word64) then 0 - else if (a29 .&. b29) == (0 :: Word64) then 1 - else if (a29 == b29) then 0 else 1 - - a30 = a .&. mask2sc30 - b30 = b .&. mask2sc30 - max30 = if a30 == (0 :: Word64) then 0 - else if (a30 .&. b30) == (0 :: Word64) then 1 - else if (a30 == b30) then 0 else 1 - - a31 = a .&. mask2sc31 - b31 = b .&. mask2sc31 - max31 = if a31 == (0 :: Word64) then 0 - else if (a31 .&. b31) == (0 :: Word64) then 1 - else if (a31 == b31) then 0 else 1 - - -- sum up values - (_, minNumNoChange, minNumChange) = andOR2 a b - maxVal = sum [max0, max1, max2, max3, max4, max5, max6, max7, max8, max9 - , max10, max11, max12, max13, max14, max15, max16, max17, max18, max19 - , max20, max21, max22, max23, max24, max25, max26, max27, max28, max29 - , max30, max31] - - in - -- trace ("MM2:" ++ "\t" ++ (showBits a0) ++ " " ++ (showBits b0) ++ "->" ++ (showBits $ a0 .&. b0) ++ "=>" ++ (show max0) ++ "\n\t" ++ (showBits a10) ++ " " ++ (showBits b10) ++ "->" ++ (showBits $ a10 .&. b10) ++ "=>" ++ (show max10)) - if lNoChangeCost == 0.0 then (fromIntegral minNumChange, fromIntegral maxVal) - else ((lNoChangeCost * (fromIntegral minNumNoChange)) + (lChangeCost * (fromIntegral minNumChange)), (lNoChangeCost * (fromIntegral $ (32 :: Int) - maxVal)) + (lChangeCost * (fromIntegral maxVal))) - --- | minMaxPacked4 minium and maximum cost 16x4 bit nonadditive character --- could add popcount == 1 for equality A/C -> A/C is identical but could be A->C so max 1 --- basically unrolled to make faster --- any diffference between states gets 1 for max -minMaxPacked4 :: (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxPacked4 (lNoChangeCost, lChangeCost) a b = - let a0 = a .&. mask4sc0 - b0 = b .&. mask4sc0 - max0 = if a0 == (0 :: Word64) then 0 - else if (a0 .&. b0) == (0 :: Word64) then 1 - else if (a0 == b0) then 0 else 1 - - a1 = a .&. mask4sc1 - b1 = b .&. mask4sc1 - max1 = if a1 == (0 :: Word64) then 0 - else if (a1 .&. b1) == (0 :: Word64) then 1 - else if (a1 == b1) then 0 else 1 - - a2 = a .&. mask4sc2 - b2 = b .&. mask4sc2 - max2 = if a2 == (0 :: Word64) then 0 - else if (a2 .&. b2) == (0 :: Word64) then 1 - else if (a2 == b2) then 0 else 1 - - a3 = a .&. mask4sc3 - b3 = b .&. mask4sc3 - max3 = if a3 == (0 :: Word64) then 0 - else if (a3 .&. b3) == (0 :: Word64) then 1 - else if (a3 == b3) then 0 else 1 - - a4 = a .&. mask4sc4 - b4 = b .&. mask4sc4 - max4 = if a4 == (0 :: Word64) then 0 - else if (a4 .&. b4) == (0 :: Word64) then 1 - else if (a4 == b4) then 0 else 1 - - a5 = a .&. mask4sc5 - b5 = b .&. mask4sc5 - max5 = if a5 == (0 :: Word64) then 0 - else if (a5 .&. b5) == (0 :: Word64) then 1 - else if (a5 == b5) then 0 else 1 - - a6 = a .&. mask4sc6 - b6 = b .&. mask4sc6 - max6 = if a6 == (0 :: Word64) then 0 - else if (a6 .&. b6) == (0 :: Word64) then 1 - else if (a6 == b6) then 0 else 1 - - a7 = a .&. mask4sc7 - b7 = b .&. mask4sc7 - max7 = if a7 == (0 :: Word64) then 0 - else if (a7 .&. b7) == (0 :: Word64) then 1 - else if (a7 == b7) then 0 else 1 - - a8 = a .&. mask4sc8 - b8 = b .&. mask4sc8 - max8 = if a8 == (0 :: Word64) then 0 - else if (a8 .&. b8) == (0 :: Word64) then 1 - else if (a8 == b8) then 0 else 1 - - a9 = a .&. mask4sc9 - b9 = b .&. mask4sc9 - max9 = if a9 == (0 :: Word64) then 0 - else if (a9 .&. b9) == (0 :: Word64) then 1 - else if (a9 == b9) then 0 else 1 - - a10 = a .&. mask4sc10 - b10 = b .&. mask4sc10 - max10 = if a10 == (0 :: Word64) then 0 - else if (a10 .&. b10) == (0 :: Word64) then 1 - else if (a10 == b10) then 0 else 1 - - a11 = a .&. mask4sc11 - b11 = b .&. mask4sc11 - max11 = if a11 == (0 :: Word64) then 0 - else if (a11 .&. b11) == (0 :: Word64) then 1 - else if (a11 == b11) then 0 else 1 - - a12 = a .&. mask4sc12 - b12 = b .&. mask4sc12 - max12 = if a12 == (0 :: Word64) then 0 - else if (a12 .&. b12) == (0 :: Word64) then 1 - else if (a12 == b12) then 0 else 1 - - a13 = a .&. mask4sc13 - b13 = b .&. mask4sc13 - max13 = if a13 == (0 :: Word64) then 0 - else if (a13 .&. b13) == (0 :: Word64) then 1 - else if (a13 == b13) then 0 else 1 - - a14 = a .&. mask4sc14 - b14 = b .&. mask4sc14 - max14 = if a14 == (0 :: Word64) then 0 - else if (a14 .&. b14) == (0 :: Word64) then 1 - else if (a14 == b14) then 0 else 1 - - a15 = a .&. mask4sc15 - b15 = b .&. mask4sc15 - max15 = if a15 == (0 :: Word64) then 0 - else if (a15 .&. b15) == (0 :: Word64) then 1 - else if (a15 == b15) then 0 else 1 - - -- sum up values - (_, minNumNoChange, minNumChange) = andOR4 a b - maxVal = sum [max0, max1, max2, max3, max4, max5, max6, max7, max8, max9 - , max10, max11, max12, max13, max14, max15] - - in - -- trace ("MM2:" ++ "\t" ++ (showBits a0) ++ " " ++ (showBits b0) ++ "->" ++ (showBits $ a0 .&. b0) ++ "=>" ++ (show max0) ++ "\n\t" ++ (showBits a10) ++ " " ++ (showBits b10) ++ "->" ++ (showBits $ a10 .&. b10) ++ "=>" ++ (show max10)) - if lNoChangeCost == 0.0 then (fromIntegral minNumChange, fromIntegral maxVal) - else ((lNoChangeCost * (fromIntegral minNumNoChange)) + (lChangeCost * (fromIntegral minNumChange)), (lNoChangeCost * (fromIntegral $ (16 :: Int)- maxVal)) + (lChangeCost * (fromIntegral maxVal))) - --- | minMaxPacked5 minium and maximum cost 12x5 bit nonadditive character --- the popcount for equality A/C -> A/C is identical but could be A->C so max 1 --- basically unrolled to make faster -minMaxPacked5 :: (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxPacked5 (lNoChangeCost, lChangeCost) a b = - let a0 = a .&. mask8sc0 - b0 = b .&. mask5sc0 - max0 = if a0 == (0 :: Word64) then 0 - else if (a0 .&. b0) == (0 :: Word64) then 1 - else if (a0 == b0) then 0 else 1 - - a1 = a .&. mask5sc1 - b1 = b .&. mask5sc1 - max1 = if a1 == (0 :: Word64) then 0 - else if (a1 .&. b1) == (0 :: Word64) then 1 - else if (a1 == b1) then 0 else 1 - - a2 = a .&. mask5sc2 - b2 = b .&. mask5sc2 - max2 = if a2 == (0 :: Word64) then 0 - else if (a2 .&. b2) == (0 :: Word64) then 1 - else if (a2 == b2) then 0 else 1 - - a3 = a .&. mask5sc3 - b3 = b .&. mask5sc3 - max3 = if a3 == (0 :: Word64) then 0 - else if (a3 .&. b3) == (0 :: Word64) then 1 - else if (a3 == b3) then 0 else 1 - - a4 = a .&. mask5sc4 - b4 = b .&. mask5sc4 - max4 = if a4 == (0 :: Word64) then 0 - else if (a4 .&. b4) == (0 :: Word64) then 1 - else if (a4 == b4) then 0 else 1 - - a5 = a .&. mask5sc5 - b5 = b .&. mask5sc5 - max5 = if a5 == (0 :: Word64) then 0 - else if (a5 .&. b5) == (0 :: Word64) then 1 - else if (a5 == b5) then 0 else 1 - - a6 = a .&. mask5sc6 - b6 = b .&. mask5sc6 - max6 = if a6 == (0 :: Word64) then 0 - else if (a6 .&. b6) == (0 :: Word64) then 1 - else if (a6 == b6) then 0 else 1 - - a7 = a .&. mask5sc7 - b7 = b .&. mask5sc7 - max7 = if a7 == (0 :: Word64) then 0 - else if (a7 .&. b7) == (0 :: Word64) then 1 - else if (a7 == b7) then 0 else 1 - - a8 = a .&. mask5sc8 - b8 = b .&. mask5sc8 - max8 = if a8 == (0 :: Word64) then 0 - else if (a8 .&. b8) == (0 :: Word64) then 1 - else if (a8 == b8) then 0 else 1 - - a9 = a .&. mask5sc9 - b9 = b .&. mask5sc9 - max9 = if a9 == (0 :: Word64) then 0 - else if (a9 .&. b9) == (0 :: Word64) then 1 - else if (a9 == b9) then 0 else 1 - - a10 = a .&. mask5sc10 - b10 = b .&. mask5sc10 - max10 = if a10 == (0 :: Word64) then 0 - else if (a10 .&. b10) == (0 :: Word64) then 1 - else if (a10 == b10) then 0 else 1 - - a11 = a .&. mask5sc11 - b11 = b .&. mask5sc11 - max11 = if a11 == (0 :: Word64) then 0 - else if (a11 .&. b11) == (0 :: Word64) then 1 - else if (a11 == b11) then 0 else 1 - - - -- sum up values - (_, minNumNoChange, minNumChange) = andOR5 a b - maxVal = sum [max0, max1, max2, max3, max4, max5, max6, max7, max8, max9 - , max10, max11] - - in - -- trace ("MM2:" ++ "\t" ++ (showBits a0) ++ " " ++ (showBits b0) ++ "->" ++ (showBits $ a0 .&. b0) ++ "=>" ++ (show max0) ++ "\n\t" ++ (showBits a10) ++ " " ++ (showBits b10) ++ "->" ++ (showBits $ a10 .&. b10) ++ "=>" ++ (show max10)) - if lNoChangeCost == 0.0 then (fromIntegral minNumChange, fromIntegral maxVal) - else ((lNoChangeCost * (fromIntegral minNumNoChange)) + (lChangeCost * (fromIntegral minNumChange)), (lNoChangeCost * (fromIntegral $ (12 :: Int) - maxVal)) + (lChangeCost * (fromIntegral maxVal))) - --- | minMaxPacked8 minium and maximum cost 12x5 bit nonadditive character --- the popcount for equality A/C -> A/C is identical but could be A->C so max 1 --- basically unrolled to make faster -minMaxPacked8 :: (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxPacked8 (lNoChangeCost, lChangeCost) a b = - let a0 = a .&. mask8sc0 - b0 = b .&. mask8sc0 - max0 = if a0 == (0 :: Word64) then 0 - else if (a0 .&. b0) == (0 :: Word64) then 1 - else if (a0 == b0) then 0 else 1 - - a1 = a .&. mask8sc1 - b1 = b .&. mask8sc1 - max1 = if a1 == (0 :: Word64) then 0 - else if (a1 .&. b1) == (0 :: Word64) then 1 - else if (a1 == b1) then 0 else 1 - - a2 = a .&. mask8sc2 - b2 = b .&. mask8sc2 - max2 = if a2 == (0 :: Word64) then 0 - else if (a2 .&. b2) == (0 :: Word64) then 1 - else if (a2 == b2) then 0 else 1 - - a3 = a .&. mask8sc3 - b3 = b .&. mask8sc3 - max3 = if a3 == (0 :: Word64) then 0 - else if (a3 .&. b3) == (0 :: Word64) then 1 - else if (a3 == b3) then 0 else 1 - - a4 = a .&. mask8sc4 - b4 = b .&. mask8sc4 - max4 = if a4 == (0 :: Word64) then 0 - else if (a4 .&. b4) == (0 :: Word64) then 1 - else if (a4 == b4) then 0 else 1 - - a5 = a .&. mask8sc5 - b5 = b .&. mask8sc5 - max5 = if a5 == (0 :: Word64) then 0 - else if (a5 .&. b5) == (0 :: Word64) then 1 - else if (a5 == b5) then 0 else 1 - - a6 = a .&. mask8sc6 - b6 = b .&. mask8sc6 - max6 = if a6 == (0 :: Word64) then 0 - else if (a6 .&. b6) == (0 :: Word64) then 1 - else if (a6 == b6) then 0 else 1 - - a7 = a .&. mask8sc7 - b7 = b .&. mask8sc7 - max7 = if a7 == (0 :: Word64) then 0 - else if (a7 .&. b7) == (0 :: Word64) then 1 - else if (a7 == b7) then 0 else 1 - - - -- sum up values - (_, minNumNoChange, minNumChange) = andOR8 a b - maxVal = sum [max0, max1, max2, max3, max4, max5, max6, max7] - - in - -- trace ("MM2:" ++ "\t" ++ (showBits a0) ++ " " ++ (showBits b0) ++ "->" ++ (showBits $ a0 .&. b0) ++ "=>" ++ (show max0) ++ "\n\t" ++ (showBits a10) ++ " " ++ (showBits b10) ++ "->" ++ (showBits $ a10 .&. b10) ++ "=>" ++ (show max10)) - if lNoChangeCost == 0.0 then (fromIntegral minNumChange, fromIntegral maxVal) - else ((lNoChangeCost * (fromIntegral minNumNoChange)) + (lChangeCost * (fromIntegral minNumChange)), (lNoChangeCost * (fromIntegral $ (7 :: Int)- maxVal)) + (lChangeCost * (fromIntegral maxVal))) - - - --- | minMaxPacked64 minium and maximum cost 64 bit nonadditive character --- the popcount for equality A/C -> A/C is identical but could be A->C so max 1 --- operattion over each sub-character -minMaxPacked64 :: (Double, Double) -> Word64 -> Word64 -> (Double, Double) -minMaxPacked64 (lNoChangeCost, lChangeCost) a b = - let maxVal = if (a == b) && (popCount a == 1) then (0 :: Int) else (1 :: Int) - minVal = if (a .&. b) == (0 :: Word64) then (1 :: Int) else (0 :: Int) - in - if lNoChangeCost == 0.0 then (fromIntegral minVal, fromIntegral maxVal) - else ((lNoChangeCost * (fromIntegral maxVal)) + (lChangeCost * (fromIntegral minVal)), (lNoChangeCost * (fromIntegral $ minVal)) + (lChangeCost * (fromIntegral maxVal))) - --- | median2Packed takes two characters of packedNonAddTypes --- and retuns new character data based on 2-median and cost -median2Packed :: CharType -> Double -> (Double, Double) -> CharacterData -> CharacterData -> CharacterData -median2Packed inCharType inCharWeight (thisNoChangeCost, thisChangeCost) leftChar rightChar = - let (newStateVect, numNoChange, numChange) = if inCharType == Packed2 then median2Word64 andOR2 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) - else if inCharType == Packed4 then median2Word64 andOR4 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) - else if inCharType == Packed5 then median2Word64 andOR5 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) - else if inCharType == Packed8 then median2Word64 andOR8 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) - else if inCharType == Packed64 then median2Word64 andOR64 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) - else error ("Character type " ++ show inCharType ++ " unrecognized/not implemented") - - -- this for PMDL/ML costs - newCost = if thisNoChangeCost == 0.0 then inCharWeight * (fromIntegral numChange) - else inCharWeight * ((thisChangeCost * (fromIntegral numChange)) + (thisNoChangeCost * (fromIntegral numNoChange))) - - newCharacter = emptyCharacter { packedNonAddPrelim = (snd3 $ packedNonAddPrelim leftChar, newStateVect, snd3 $ packedNonAddPrelim rightChar) - , localCost = newCost - , globalCost =newCost + globalCost leftChar + globalCost rightChar - } - in - -- trace ("M2P: " ++ (showBitsV $ (snd3 . packedNonAddPrelim) leftChar) ++ " " ++ (showBitsV $ (snd3 . packedNonAddPrelim) rightChar) ++ " -> " ++ (showBitsV $ (snd3 . packedNonAddPrelim) newCharacter) ++ " at cost " ++ (show newCost)) - newCharacter - --- | unionPacked returns character that is the union (== OR) for bit packed characters --- of the final fields as preliminary and final -unionPacked :: CharacterData -> CharacterData -> CharacterData -unionPacked charL charR = - let newVect = UV.zipWith (.|.) (packedNonAddFinal charL) (packedNonAddFinal charR) - in - emptyCharacter { packedNonAddPrelim = (newVect, newVect, newVect) - , packedNonAddFinal = newVect - } - - -{- -andOrN functions derived from White and Holland 2011 --} - -{- --- | andOR2 Packed2 modified from Goloboff 2002 --- this is incomplete-- Goloboff uses look up table for on bits to length -andOR2 :: Word64 -> Word64 -> (Word64, Int) -andOR2 x y = - let x1 = x .&. y - c1 = xor mask2B (shiftR ((x1 .&. mask2B) .|. (x1 .&. mask2A)) 1) - c2 = c1 .|. (shiftL c1 1) - newState = x1 .|. c2 - numChanges = lookUpLegnth((c1 .|. (shiftR c1 31)) .&. 0xFFFFFFFF) - in - (newState, numChanges) - -} - - --- For all biut packed characters--post order median 2 --- for now--for somwe reason either mis-diagnosed or mis-coded (from White and Holland 2011) --- the "on" bit in u is reflective of number of intersections not unions. --- hence subtracting the number of unions from numbers of characters --- determined by leading OFF bits since packing will likely have ragged edges --- no not always the pack-able number - --- | median2Word64 driver function for median of two PackedN states -median2Word64 :: (Word64 -> Word64 -> (Word64, Int, Int)) -> UV.Vector Word64 -> UV.Vector Word64 -> (UV.Vector Word64, Int, Int) -median2Word64 andOrFun leftVect rightVect = - let (stateVect, noChangeVect,changeVect) = UV.unzip3 $ UV.zipWith andOrFun leftVect rightVect - in - (stateVect, UV.sum noChangeVect, UV.sum changeVect) - --- | andOR2 and or function for Packed2 encoding -andOR2 :: Word64 -> Word64 -> (Word64, Int, Int) -andOR2 x y = - let u = shiftR ((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B) 1 - z = (x .&. y) .|. ((x .|. y) .&. ((u + mask2A) `xor` mask2B)) - - -- get number of characters by checking states (may not be full) - numEmptyBits = countLeadingZeros x --- could be y just as well - - -- shift divide by 2 states - numNonCharacters = shiftR numEmptyBits 1 - numChars = 32 - numNonCharacters - in - {- - trace ("AO2 numChars:" ++ (show numChars) ++ " x & y:" ++ (showBits $ x .&. y) ++ "\nx .&. y .&. mask2A:" ++ (showBits $ (x .&. y .&. mask2A)) ++ "\n((x .&. y .&. mask2A) + mask2A):" ++ (showBits $ ((x .&. y .&. mask2A) + mask2A)) - ++ "\n:(((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)): " ++ (showBits $ (((x .&. y .&. mask2A) + mask2A) .|. (x .&. y))) ++ "\n:((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B):" - ++ (showBits $ ((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B)) ++ "\nu: " ++ (showBits u) - ++"\npc: " ++ (show $ popCount u) ++ " x:" ++ (showBits x) ++ " y:" ++ (showBits y) ++ " => u:" ++ (showBits u) ++ " z:" ++ (showBits z)) -- ++ " mask2A:" ++ (showBits mask2A) ++ " mask2B:" ++ (showBits mask2B)) - -} - (z, popCount u, numChars - (popCount u)) - --- | andOR4 and or function for Packed4 encoding -andOR4 :: Word64 -> Word64 -> (Word64, Int, Int) -andOR4 x y = - let u = shiftR ((((x .&. y .&. mask4A) + mask4A) .|. (x .&. y)) .&. mask4B) 3 - z = (x .&. y) .|. ((x .|. y) .&. ((u + mask4A) `xor` mask4B)) - - -- get number of characters by checking states (may not be full) - numEmptyBits = countLeadingZeros x --- could be y just as well - - -- shift divide by 4 states - numNonCharacters = shiftR numEmptyBits 2 - numChars = 16 - numNonCharacters - in - (z, popCount u, numChars - (popCount u)) - --- | andOR5 and or function for Packed5 encoding --- potential issue with top 4 bits--not sure on mask5B whether top 4 should be on or OFF. --- can always mask top 4 with AND 0000111... (0xFFFFFFFFFFFFFFF or 1152921504606846975) --- to remove bits for counting --- and calcualted state -andOR5:: Word64 -> Word64 -> (Word64, Int, Int) -andOR5 x y = - let u = shiftR ((((x .&. y .&. mask5A) + mask5A) .|. (x .&. y)) .&. mask5B) 4 - z = (x .&. y) .|. ((x .|. y) .&. ((u + mask5A) `xor` mask5B)) - - -- get number of characters by checking states (may not be full) - numEmptyBits = countLeadingZeros x --- could be y just as well - - -- since top 4 bits always off (can't put anyhting in there) need to subtract those zeros - -- to get character number. Cant shift to get / 5 so integer divide - (numNonCharacters, _) = divMod (numEmptyBits - 4) 5 - numChars = 12 - numNonCharacters - in - -- trace ("AO5 numChars:" ++ (show numChars) ++ " x & y:" ++ (showBits $ x .&. y) ++ " u:" ++ (showBits u) ++ " z:" ++ (showBits z) ++ " leading 0:" ++ (show numEmptyBits) ++ " non-chars:" ++ (show numNonCharacters) ++ " popCount u:" ++ (show $ popCount u)) - (z, popCount u, numChars - (popCount u)) - --- | andOR8 and or function for Packed8 encoding -andOR8 :: Word64 -> Word64 -> (Word64, Int, Int) -andOR8 x y = - let u = shiftR ((((x .&. y .&. mask8A) + mask8A) .|. (x .&. y)) .&. mask8B) 7 - z = (x .&. y) .|. ((x .|. y) .&. ((u + mask8A) `xor` mask8B)) - - -- get number of characters by checking states (may not be full) - numEmptyBits = countLeadingZeros x --- could be y just as well - - -- shift divide by 8 states - numNonCharacters = shiftR numEmptyBits 3 - numChars = 8 - numNonCharacters - in - (z, popCount u, numChars - (popCount u)) - --- | andOR64 and or function for Packed64 encoding -andOR64 :: Word64 -> Word64 -> (Word64, Int, Int) -andOR64 x y = - if (x .&. y) /= zeroBits then (x .&. y, 1, 0) - else (x .|. y, 0, 1) - -{- -Functions for median3 calculations of packed types -These are used in pre-order graph traversals and final state assignment -among others. --} - - --- | packePreorder takes character type, current node (and children preliminary assignments) --- and parent final assignment and creates final assignment for current node --- a bit clumsy since uses Goloboff modifications and have to do some of the preOrder pass --- in Goloboff but not done here -packedPreorder :: CharType -> (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) -> UV.Vector Word64 -> UV.Vector Word64 -packedPreorder inCharType (leftPrelim, childPrelim, rightPrelim) parentFinal = - let newStateVect = if inCharType == Packed2 then UV.zipWith4 preOrder2 leftPrelim childPrelim rightPrelim parentFinal - else if inCharType == Packed4 then UV.zipWith4 preOrder4 leftPrelim childPrelim rightPrelim parentFinal - else if inCharType == Packed5 then UV.zipWith4 preOrder5 leftPrelim childPrelim rightPrelim parentFinal - else if inCharType == Packed8 then UV.zipWith4 preOrder8 leftPrelim childPrelim rightPrelim parentFinal - else if inCharType == Packed64 then UV.zipWith4 preOrder64 leftPrelim childPrelim rightPrelim parentFinal - else error ("Character type " ++ show inCharType ++ " unrecognized/not implemented") - in - newStateVect - - --- | preOrder2 performs bitpacked Fitch preorder based on Goloboff 2002 --- less efficient than it could be due to not using Goloboff for post-order --- assignment so have to calculate some post-order values that would --- already exist otherwise. Given that pre-order should be much less frequent than --- pre-order shouldn't be that bad -preOrder2 :: Word64 -> Word64 -> Word64 -> Word64 -> Word64 -preOrder2 leftPrelim childPrelim rightPrelim parentFinal = - -- post-order stuff to get "temp" state used to calculate final - let t = leftPrelim .&. rightPrelim - - -- preOrder values - x2 = parentFinal .&. (complement childPrelim) - c3 = (mask2A .&. x2) .|. (shiftR (mask2B .&. x2) 1) - c4 = c3 .|. (shiftL c3 1) - - finalState = (parentFinal .&. (complement c4)) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) - in - -- trace ("PO2: " ++ " in " ++ (show (showBits leftPrelim, showBits childPrelim, showBits rightPrelim, showBits parentFinal)) ++ "->" ++ (show $ showBits finalState)) - finalState - --- | preOrder4 from preOrder2 but for 4 states -preOrder4 :: Word64 -> Word64 -> Word64 -> Word64 -> Word64 -preOrder4 leftPrelim childPrelim rightPrelim parentFinal = - -- post-order stuff to get "temp" state used to calculate final - let x1 = leftPrelim .&. rightPrelim - y1 = leftPrelim .|. rightPrelim - c1 = xor mask4E ((mask4E .&. x1) .|. (shiftR (mask4D .&. x1) 1) .|. (shiftR (mask4C .&. x1) 2) .|. (shiftR (mask4B .&. x1) 3)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) - t = c2 .|. y1 - - -- preOrder values - x2 = parentFinal .&. (complement childPrelim) - c3 = (mask4E .&. x2) .|. (shiftR (mask4D .&. x2) 1) .|. (shiftR (mask4C .&. x2) 2) .|. (shiftR (mask4B .&. x2) 3) - c4 = c3 .|. (shiftL c3 1) .|. (shiftL c3 2) .|. (shiftL c3 3) - - finalState = (parentFinal .&. (complement c4)) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) - in - finalState - --- | preOrder5 from preOrder2 but for 5 states -preOrder5 :: Word64 -> Word64 -> Word64 -> Word64 -> Word64 -preOrder5 leftPrelim childPrelim rightPrelim parentFinal = - -- post-order stuff to get "temp" state used to calculate final - let x1 = leftPrelim .&. rightPrelim - y1 = leftPrelim .|. rightPrelim - c1 = xor mask5F ((mask5F .&. x1) .|. (shiftR (mask5E .&. x1) 1) .|. (shiftR (mask5D .&. x1) 2) .|. (shiftR (mask5C .&. x1) 3) .|. (shiftR (mask5B .&. x1) 4)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) .|. (shiftL c1 4) - t = c2 .|. y1 - - -- preOrder values - x2 = parentFinal .&. (complement childPrelim) - c3 = (mask5F .&. x2) .|. (shiftR (mask5E .&. x2) 1) .|. (shiftR (mask5D .&. x2) 2) .|. (shiftR (mask5C .&. x2) 3) .|. (shiftR (mask5B .&. x2) 4) - c4 = c3 .|. (shiftL c3 1) .|. (shiftL c3 2) .|. (shiftL c3 3) .|. (shiftL c3 4) - - finalState = (parentFinal .&. (complement c4)) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) - in - finalState - --- | preOrder8 from preOrder2 but for 8 states -preOrder8 :: Word64 -> Word64 -> Word64 -> Word64 -> Word64 -preOrder8 leftPrelim childPrelim rightPrelim parentFinal = - -- post-order stuff to get "temp" state used to calculate final - let x1 = leftPrelim .&. rightPrelim - y1 = leftPrelim .|. rightPrelim - c1 = xor mask8I ((mask8I .&. x1) .|. (shiftR (mask8H .&. x1) 1) .|. (shiftR (mask8G .&. x1) 2) .|. (shiftR (mask8F .&. x1) 3) .|. (shiftR (mask8E .&. x1) 4) .|. (shiftR (mask8D .&. x1) 5) .|. (shiftR (mask8C .&. x1) 6) .|. (shiftR (mask8B .&. x1) 7)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) .|. (shiftL c1 4) .|. (shiftL c1 5) .|. (shiftL c1 6) .|. (shiftL c1 7) - t = c2 .|. y1 - - -- preOrder values - x2 = parentFinal .&. (complement childPrelim) - c3 = (mask8I .&. x2) .|. (shiftR (mask8H .&. x2) 1) .|. (shiftR (mask8G .&. x2) 2) .|. (shiftR (mask8F .&. x2) 3) .|. (shiftR (mask8E .&. x2) 4) .|. (shiftR (mask8D .&. x2) 5) .|. (shiftR (mask8C .&. x2) 6) .|. (shiftR (mask8B .&. x2) 7) - c4 = c1 .|. (shiftL c3 1) .|. (shiftL c3 2) .|. (shiftL c3 3) .|. (shiftL c3 4) .|. (shiftL c3 5) .|. (shiftL c3 6) .|. (shiftL c3 7) - - finalState = (parentFinal .&. (complement c4)) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) - in - finalState - - --- | preOrder64 performs simple Fitch preorder ("up-pass") on Word64 -preOrder64 :: Word64 -> Word64 -> Word64 -> Word64 -> Word64 -preOrder64 leftPrelim childPrelim rightPrelim parentFinal = - let a = parentFinal .&. (complement childPrelim) - b = leftPrelim .&. rightPrelim - c = parentFinal .|. childPrelim - d = childPrelim .|. (parentFinal .&. (leftPrelim .|. rightPrelim)) - in - if a == (zeroBits:: Word64) then parentFinal - else if b == (zeroBits:: Word64) then c - else d - -{- -Functions for hard-wired 3-way optimization - basically - C & P1 & P2 -> if not 0 - else (C & P1) | (C & P2) | (P1 & P2) -> if not 0 - else C | P1 | P2 - bit operations based on Goloboff (2002) for trichotomous trees --} - -- | threeWayPacked median 3 for hard-wired networks - -- this is based on Goloboff (2002) for trichotomous trees -threeWayPacked :: CharType -> UV.Vector Word64 -> UV.Vector Word64 -> UV.Vector Word64 -> UV.Vector Word64 -threeWayPacked inCharType parent1 parent2 curNode = - let newStateVect = if inCharType == Packed2 then UV.zipWith3 threeWay2 parent1 parent2 curNode - else if inCharType == Packed4 then UV.zipWith3 threeWay4 parent1 parent2 curNode - else if inCharType == Packed5 then UV.zipWith3 threeWay5 parent1 parent2 curNode - else if inCharType == Packed8 then UV.zipWith3 threeWay8 parent1 parent2 curNode - else if inCharType == Packed64 then UV.zipWith3 threeWay64 parent1 parent2 curNode - else error ("Character type " ++ show inCharType ++ " unrecognized/not implemented") - in - newStateVect - --- | threeWayPacked' median 3 for hard-wired networks --- this uses lists of masks so likely slower than Goloboff --- this approach could also be used for min/max to be simpler but alos likelu slower since previous is --- manually unrolled -threeWayPacked' :: CharType -> UV.Vector Word64 -> UV.Vector Word64 -> UV.Vector Word64 -> UV.Vector Word64 -threeWayPacked' inCharType parent1 parent2 curNode = - let newStateVect = if inCharType == Packed2 then UV.zipWith3 (threeWayNWord64 packed2SubCharList) parent1 parent2 curNode - else if inCharType == Packed4 then UV.zipWith3 (threeWayNWord64 packed4SubCharList) parent1 parent2 curNode - else if inCharType == Packed5 then UV.zipWith3 (threeWayNWord64 packed5SubCharList) parent1 parent2 curNode - else if inCharType == Packed8 then UV.zipWith3 (threeWayNWord64 packed8SubCharList) parent1 parent2 curNode - else if inCharType == Packed64 then UV.zipWith3 threeWay64 parent1 parent2 curNode - else error ("Character type " ++ show inCharType ++ " unrecognized/not implemented") - in - newStateVect - - --- | threeWayNWord64 3-way hardwired optimization for Packed N Word64 --- non-additive character--maps over sub-characters with appropriate masks --- lists of subcharacters with all ovther bits OFF are created via masks --- then zipped over threeway function and ORed to create 32 bit final state --- this is an alternate approach to the three node optimization of Golobiff below. --- both should yield same result-- this is polymoprhic and simple--but not parallel ---as in Goloboff so likely slower -threeWayNWord64 :: [Word64] -> Word64 -> Word64 -> Word64 -> Word64 -threeWayNWord64 packedSubCharList p1 p2 cN = - let p1SubCharList = fmap (p1 .&.) packedSubCharList - p2SubCharList = fmap (p2 .&.) packedSubCharList - cNSubCharList = fmap (cN .&.) packedSubCharList - threeWayList = zipWith3 threeWay64 p1SubCharList p2SubCharList cNSubCharList - in - L.foldl1' (.|.) threeWayList - - --- | threeWay2 3-way hardwired optimization for Packed2 Word64 --- but used on subCharacters -threeWay2 :: Word64 -> Word64 -> Word64 -> Word64 -threeWay2 p1 p2 cN = - let x = p1 .&. p2 .&. cN - y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) - z = p1 .|. p2 .|. cN - c1 = xor mask2B ((mask2B .&. x) .|. (shiftR (mask2A .&. x) 1)) - d1 = xor mask2B ((mask2B .&. y) .|. (shiftR (mask2A .&. y) 1)) - c2 = c1 .|. (shiftL c1 1) - d2 = d1 .|. (shiftL d1 1) - newState = x .|. (y .&. c2) .|. (z .&. d2) - in - newState - --- | threeWay4 3-way hardwired optimization for Packed4 Word64 -threeWay4 :: Word64 -> Word64 -> Word64 -> Word64 -threeWay4 p1 p2 cN = - let x = p1 .&. p2 .&. cN - y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) - z = p1 .|. p2 .|. cN - c1 = xor mask4B ((mask4B .&. x) .|. (shiftR (mask4C .&. x) 1) .|. (shiftR (mask4D .&. x) 2) .|. (shiftR (mask4E .&. x) 3)) - d1 = xor mask4B ((mask4B .&. y) .|. (shiftR (mask4C .&. y) 1) .|. (shiftR (mask4D .&. y) 2) .|. (shiftR (mask4E .&. y) 3)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) - d2 = d1 .|. (shiftL d1 1) .|. (shiftL d1 2) .|. (shiftL d1 3) - newState = x .|. (y .&. c2) .|. (z .&. d2) - in - newState - --- | threeWay5 3-way hardwired optimization for Packed5 Word64 -threeWay5 :: Word64 -> Word64 -> Word64 -> Word64 -threeWay5 p1 p2 cN = - let x = p1 .&. p2 .&. cN - y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) - z = p1 .|. p2 .|. cN - c1 = xor mask5B ((mask5B .&. x) .|. (shiftR (mask5C .&. x) 1) .|. (shiftR (mask5D .&. x) 2) .|. (shiftR (mask5E .&. x) 3) .|. (shiftR (mask5F .&. x) 4)) - d1 = xor mask5B ((mask5B .&. y) .|. (shiftR (mask5C .&. y) 1) .|. (shiftR (mask5D .&. y) 2) .|. (shiftR (mask5E .&. y) 3) .|. (shiftR (mask5F .&. y) 4)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) .|. (shiftL c1 4) - d2 = d1 .|. (shiftL d1 1) .|. (shiftL d1 2) .|. (shiftL d1 3) .|. (shiftL d1 4) - newState = x .|. (y .&. c2) .|. (z .&. d2) - in - newState - --- | threeWay8 3-way hardwired optimization for Packed8 Word64 -threeWay8 :: Word64 -> Word64 -> Word64 -> Word64 -threeWay8 p1 p2 cN = - let x = p1 .&. p2 .&. cN - y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) - z = p1 .|. p2 .|. cN - c1 = xor mask8B ((mask8B .&. x) .|. (shiftR (mask8C .&. x) 1) .|. (shiftR (mask8D .&. x) 2) .|. (shiftR (mask8E .&. x) 3) .|. (shiftR (mask8F .&. x) 4) .|. (shiftR (mask8G .&. x) 5) .|. (shiftR (mask8H .&. x) 6) .|. (shiftR (mask8I .&. x) 7)) - d1 = xor mask8B ((mask8B .&. y) .|. (shiftR (mask8C .&. y) 1) .|. (shiftR (mask8D .&. y) 2) .|. (shiftR (mask8E .&. y) 3) .|. (shiftR (mask8F .&. y) 4) .|. (shiftR (mask8G .&. y) 5) .|. (shiftR (mask8H .&. y) 6) .|. (shiftR (mask8I .&. y) 7)) - c2 = c1 .|. (shiftL c1 1) .|. (shiftL c1 2) .|. (shiftL c1 3) .|. (shiftL c1 4) .|. (shiftL c1 5) .|. (shiftL c1 6) .|. (shiftL c1 7) - d2 = d1 .|. (shiftL d1 1) .|. (shiftL d1 2) .|. (shiftL d1 3) .|. (shiftL d1 4) .|. (shiftL d1 5) .|. (shiftL d1 6) .|. (shiftL d1 7) - newState = x .|. (y .&. c2) .|. (z .&. d2) - in - newState - --- | threeWay64 3-way hardwired optimization for straight Word64 -threeWay64 :: Word64 -> Word64 -> Word64 -> Word64 -threeWay64 p1 p2 cN = - let x = p1 .&. p2 .&. cN - y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) - z = p1 .|. p2 .|. cN - in - if x /= (zeroBits :: Word64) then x - else if y /= (zeroBits :: Word64) then y - else z - - -{- -Functions to encode ("pack") non-additive characters into new Word64 characters -based on their number of states --} - --- | packData takes input data and creates a variety of bit-packed data types --- to increase efficiency and reduce footprint of non-additive characters --- that are encoded as bitvectors -packNonAdditiveData :: GlobalSettings -> ProcessedData -> ProcessedData -packNonAdditiveData inGS (nameVect, bvNameVect, blockDataVect) = - -- need to check if this blowws out memory on big data sets (e.g. genomic) - let -- newBlockDataList = fmap (recodeNonAddCharacters inGS) (V.toList blockDataVect) -- parallel could be an option to save memory etc - newBlockDataList = PU.seqParMap rdeepseq (recodeNonAddCharacters inGS) (V.toList blockDataVect) -- could be an option to save memory etc - in - (nameVect, bvNameVect, V.fromList newBlockDataList) - --- | recodeNonAddCharacters takes block data, goes through characters --- and recodes NonAdditive. --- Concat and list for charInfoV because new charcaters can be created --- and newCharInfo then as well, could be multiple per input 'charcater' -recodeNonAddCharacters :: GlobalSettings -> BlockData -> BlockData -recodeNonAddCharacters inGS (nameBlock, charDataVV, charInfoV) = - let numChars = V.length charInfoV - - -- create vector of single characters with vector of taxon data of sngle character each - singleCharVectList = V.toList $ fmap (U.getSingleCharacter charDataVV) (V.fromList [0.. numChars - 1]) - - -- bit pack the nonadd - (recodedSingleVecList, newCharInfoLL) = unzip $ zipWith (packNonAdd inGS) singleCharVectList (V.toList charInfoV) - - -- recreate BlockData, tacxon dominant structure - newTaxVectByCharVect = V.fromList $ fmap V.fromList $ L.transpose $ concat recodedSingleVecList - - in - -- trace ("RNAC: " ++ (show (length recodedSingleVecList, fmap length recodedSingleVecList)) ++ " -> " ++ (show $ fmap length newTaxVectByCharVect) ++ " " ++ (show $ length $ V.fromList $ concat newCharInfoLL)) - (nameBlock, newTaxVectByCharVect, V.fromList $ concat newCharInfoLL) - --- | packNonAdd takes (vector of taxa) by character data and list of character information --- and returns bit packed and recoded non-additive characters and charInfo --- input int is character index in block --- the weight is skipping because of the weight replication in reorganize --- if characters have non integer weight then they were not reorganized and left --- as single BV--here as well. Should be very few (if any) of them. -packNonAdd ::GlobalSettings -> V.Vector CharacterData -> CharInfo -> ([[CharacterData]], [CharInfo]) -packNonAdd inGS inCharDataV charInfo = - -- trace ("PNA in weight: " ++ (show $ weight charInfo)) ( - if (charType charInfo /= NonAdd) then ([V.toList inCharDataV],[charInfo]) - else - -- recode non-additive characters - let leafNonAddV = V.toList $ fmap (snd3 . stateBVPrelim) inCharDataV - numNonAdd = (length . head) leafNonAddV - - -- split characters into groups by states number 2,4,5,8,64, >64 (excluding missing) - stateNumDataPairList = PU.seqParMap rdeepseq (getStateNumber leafNonAddV) [0.. numNonAdd - 1] - - -- sort characters by states number (2, 4, 5, 8, 64, >64 -> 128) - (state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL) = binStateNumber stateNumDataPairList ([],[],[],[],[],[]) - - -- make new characters based on state size - -- (newStateCharListList, newCharInfoList) = unzip $ (zipWith (makeStateNCharacter inGS charInfo) [2,4,5,8,64,128] [state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL] `using` PU.myParListChunkRDS) - (newStateCharListList, newCharInfoList) = unzip $ (PU.seqParMap rdeepseq (makeStateNCharacterTuple inGS charInfo) (zip [2,4,5,8,64,128] [state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL])) - - - in - -- trace ("PNA out weights : " ++ (show $ fmap weight $ concat newCharInfoList)) -- (show $ fmap fst stateNumDataPairList) ) -- ++ "\n" ++ (show (newStateCharListList, newCharInfoList) )) - (newStateCharListList, concat newCharInfoList) - -- ) - --- | makeStateNCharacterTuple is a wrapper for makeStateNCharacter to allow for parMap use -makeStateNCharacterTuple :: GlobalSettings -> CharInfo -> (Int, [[BV.BitVector]]) -> ([CharacterData], [CharInfo]) -makeStateNCharacterTuple inGS charInfo (stateNumber, charDataLL) = makeStateNCharacter inGS charInfo stateNumber charDataLL - --- | makeStateNCharacter takes a list of characters each of which is a list of taxon character values and --- creates a new character of all characters for give taxon and packs (64/ state number) characters into a 64 bit Word64 --- via chuncksOf--or if 64, not packing, if 128 stays bitvector --- check for non-sequential states (A,T) or (0,2) etc --- return is list of taxa x single new (packed) character -makeStateNCharacter :: GlobalSettings -> CharInfo -> Int -> [[BV.BitVector]] -> ([CharacterData], [CharInfo]) -makeStateNCharacter inGS charInfo stateNumber charDataLL = - let (recodeList, newCharInfo) = if stateNumber > 64 then recodeBV2BV inGS charInfo charDataLL - else if stateNumber == 64 then recodeBV2Word64Single inGS charInfo charDataLL - else recodeBV2Word64 inGS charInfo stateNumber charDataLL - in - (recodeList, newCharInfo) - --- | recodeBV2BV take a list of BV.bitvector non-add characters and creates a list (taxa) --- BV non-additive characters of type NonAdd. --- this results in a single character and charInfo in list so can be concatenated --- and removed if empty -recodeBV2BV :: GlobalSettings -> CharInfo -> [[BV.BitVector]] -> ([CharacterData], [CharInfo]) -recodeBV2BV inGS charInfo charTaxBVLL = - if null charTaxBVLL then ([],[]) - else - let -- convert to taxon by characgter data list - newStateList = makeNewCharacterData charTaxBVLL - - -- rename with thype - newCharName = T.append (name charInfo) $ T.pack "LargeState" - - -- create new characters for each taxon - newCharDataList = fmap (makeNewData emptyCharacter) newStateList - in - (newCharDataList, [charInfo {name = newCharName, charType = NonAdd, noChangeCost = (fst . bcgt64) inGS, changeCost = (snd . bcgt64) inGS}]) - where makeNewData a b = a {stateBVPrelim = (b,b,b), stateBVFinal = b} - - --- | recodeBV2Word64Single take a list of BV.bitvector non-add characters and creates a list (taxa) --- of Word64 unpacked non-additive characters of type Packed64. --- this results in a single character and charInfo in list so can be concatenated --- and removed if empty --- Aassumes a leaf only sets snd3 -recodeBV2Word64Single :: GlobalSettings -> CharInfo -> [[BV.BitVector]] -> ([CharacterData], [CharInfo]) -recodeBV2Word64Single inGS charInfo charTaxBVLL = - if null charTaxBVLL then ([],[]) - else - let newCharName = T.append (name charInfo) $ T.pack "64State" - - -- convert BV to Word64 - taxWord64BLL = fmap (fmap BV.toUnsignedNumber) charTaxBVLL - - -- convert to taxon by character data lisyt - newStateList = makeNewCharacterData taxWord64BLL - - -- make new character data - newCharDataList = fmap (makeNewData emptyCharacter) newStateList - in - (newCharDataList, [charInfo {name = newCharName, charType = Packed64, noChangeCost = (fst . bc64) inGS, changeCost = (snd . bc64) inGS}]) - where makeNewData a b = a {packedNonAddPrelim = (b,b,b), packedNonAddFinal = b} - --- | makeNewCharacterData takes a list of characters, each of which is a list of taxon states --- of type a (bitvector or Word64) and returns a list of taxa each of which is a vector --- of type a charactyer data --- generic verctor so can have Unboxed V and Boxed V -makeNewCharacterData :: (GV.Vector v a) => [[a]] -> [v a] -makeNewCharacterData charByTaxSingleCharData = - let taxonByCharL = L.transpose charByTaxSingleCharData - taxonByCharV = fmap GV.fromList taxonByCharL - in - taxonByCharV - --- | recodeBV2Word64 take a list of BV.bitvector non-add characters and the states number of creates --- Word64 representaions where subcharcaters are created and shifted to proper positions and ORd --- to create packed reresentation--new character types Packed2, Packed4, Packed5, and Packed8. --- this results in a single character and charInfo in list so can be concatenated --- and removed if empty -recodeBV2Word64 ::GlobalSettings -> CharInfo -> Int -> [[BV.BitVector]] -> ([CharacterData], [CharInfo]) -recodeBV2Word64 inGS charInfo stateNumber charTaxBVLL = - -- trace ("Enter RBV2W64 In: " ++ (show stateNumber) ++ " " ++ (show (length charTaxBVLL, fmap length charTaxBVLL))) ( - if null charTaxBVLL then ([],[]) - else - let newCharType = if stateNumber == 2 then Packed2 - else if stateNumber == 4 then Packed4 - else if stateNumber == 5 then Packed5 - else if stateNumber == 8 then Packed8 - else error ("State number " ++ (show stateNumber) ++ " not to be packed in recodeBV2Word64") - - newCharName = T.append (name charInfo) $ T.pack ((show stateNumber) ++ "State") - - -- get number of characters that can be packed into Word64 for that state number - numCanPack = fst $ divMod 64 stateNumber - - -- convert to taxon by character data list - taxCharBVLL = L.transpose charTaxBVLL - - -- get state index list for all characters (could be non sequential 0|2; A|T etc) - stateIndexLL = PU.seqParMap rdeepseq getStateIndexList charTaxBVLL - - -- convert data each taxon into packedWord64 - packedDataL = PU.seqParMap rdeepseq (packIntoWord64 stateNumber numCanPack stateIndexLL) taxCharBVLL - - -- get noChange and Change cost for char type - (lNoChangeCost, lChangeCost) = if stateNumber == 2 then bc2 inGS - else if stateNumber == 4 then bc4 inGS - else if stateNumber == 5 then bc5 inGS - else if stateNumber == 8 then bc8 inGS - else error ("Change/NoChange costs for state number " ++ (show stateNumber) ++ " not to be set in recodeBV2Word64") - - in - -- trace ("RBV2W64 Out: " ++ (show $ fmap (snd3 . packedNonAddPrelim) packedDataL)) - -- trace ("RBV2W64: " ++ (show (lNoChangeCost, lChangeCost))) - (packedDataL, [charInfo {name = newCharName, charType = newCharType, noChangeCost = lNoChangeCost, changeCost = lChangeCost}]) - -- ) - - --- | packIntoWord64 takes a list of bitvectors for a taxon, the state number and number that can be packed into --- a Word64 and performs appropriate bit settting and shifting to create Word64 --- paralle looked to bag out here -packIntoWord64 :: Int -> Int -> [[Int]] -> [BV.BitVector] -> CharacterData -packIntoWord64 stateNumber numToPack stateCharacterIndexL inBVList = - -- get packable chunk of bv and correcsponding state indices - let packBVList = SL.chunksOf numToPack inBVList - packIndexLL = SL.chunksOf numToPack stateCharacterIndexL - - -- pack each chunk - packedWordVect = UV.fromList $ zipWith (makeWord64FromChunk stateNumber) packIndexLL packBVList - - in - -- trace ("PIW64 chunks/values: " ++ (show $ V.length packedWordVect)) - emptyCharacter { packedNonAddPrelim = (packedWordVect, packedWordVect, packedWordVect) - , packedNonAddFinal = packedWordVect - } - - --- | makeWord64FromChunk takes a list (= chunk) of bitvectors and creates bit subcharacter (Word64) --- with adjacent bits for each BV in chunk. It then bit shifts the appropriate number of bits for each member --- of the chunk and finally ORs all (64/stateNumber) Word64s to make the final packed representation -makeWord64FromChunk :: Int -> [[Int]] -> [BV.BitVector] -> Word64 -makeWord64FromChunk stateNumber stateIndexLL bvList = - if null bvList then (0 :: Word64) - else - let subCharacterList = zipWith3 (makeSubCharacter stateNumber) stateIndexLL bvList [0..(length bvList - 1)] - in - -- trace ("MW64FC: " ++ (show subCharacterList) ++ " " ++ (show $ L.foldl1' (.|.) subCharacterList)) - L.foldl1' (.|.) subCharacterList - --- | makeSubCharacter makes sub-character (ie only those bits for states) from single bitvector and shifts appropriate number of bits --- to make Word64 with sub character bits set and all other bits OFF and in correct bit positions for that sub-character -makeSubCharacter :: Int -> [Int] -> BV.BitVector -> Int -> Word64 -makeSubCharacter stateNumber stateIndexList inBV subCharacterIndex = - -- trace ("Making sub character:" ++ (show stateNumber ++ " " ++ (show stateIndexList) ++ " " ++ (show subCharacterIndex) ++ (show inBV))) ( - let -- get bit of state indices - bitStates = fmap (testBit inBV) stateIndexList - - -- get index of states when only minimally bit encoded (0101, 0001 -> 11, 01) - newBitStates = setOnBits (zeroBits :: Word64) bitStates 0 - subCharacter = shiftL newBitStates (subCharacterIndex * stateNumber) - in - -- trace ("MSC: " ++ (show subCharacterIndex) ++ " " ++ (show bitStates) ++ " " ++ (show newBitStates) ++ " " ++ (show subCharacter)) ( - -- cna remove this check when working - if length stateIndexList `notElem` [((fst $ divMod 2 stateNumber) + 1) .. stateNumber] then error ("State number of index list do not match: " ++ (show (stateNumber, length stateIndexList, stateIndexList))) - else - subCharacter - -- ) - -- ) - --- | setOnBits recursively sets On bits in a list of Bool -setOnBits :: Word64 -> [Bool] -> Int -> Word64 -setOnBits baseVal onList bitIndex = - if null onList then baseVal - else - let newVal = if (head onList == True) then setBit baseVal bitIndex - else baseVal - in - setOnBits newVal (tail onList) (bitIndex + 1) - --- | getStateIndexList takes list of list bit vectors and for each taxon for a given bv character --- and returns a list of --- bit indices of states in the bv this because states can be non-seqeuntial (0|3) --- used to have a list of all states used (ON) in a character in all taxa -getStateIndexList :: [BV.BitVector] -> [Int] -getStateIndexList taxBVL = - if null taxBVL then [] - else - let inBV = L.foldl1' (.|.) taxBVL - onList = fmap (testBit inBV) [0.. (finiteBitSize inBV) - 1] - onIndexPair = zip onList [0.. (finiteBitSize inBV) - 1] - indexList = fmap snd $ filter ((== True) .fst) onIndexPair - - in - -- trace ("GSIL: " ++ (show indexList)) - indexList - - --- | binStateNumber takes a list of pairs of char states number and data column as list of bitvectors and --- into list for 2,4,5,8,64,>64 -binStateNumber :: [(Int, [BV.BitVector])] - -> ([[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]]) - -> ([[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]],[[BV.BitVector]]) -binStateNumber inPairList (cur2, cur4, cur5, cur8, cur64, cur128) = - if null inPairList then - --dont' really need to reverse here but seems hygenic - trace ("Recoding NonAdditive Characters : " ++ (show (length cur2, length cur4, length cur5, length cur8, length cur64, length cur128))) - (L.reverse cur2, L.reverse cur4, L.reverse cur5, L.reverse cur8, L.reverse cur64, L.reverse cur128) - else - let (stateNum, stateData) = head inPairList - in - -- skip--constant - if stateNum < 2 then binStateNumber (tail inPairList) (cur2, cur4, cur5, cur8, cur64, cur128) - else if stateNum == 2 then binStateNumber (tail inPairList) (stateData : cur2, cur4, cur5, cur8, cur64, cur128) - else if stateNum <= 4 then binStateNumber (tail inPairList) (cur2, stateData : cur4, cur5, cur8, cur64, cur128) - else if stateNum <= 5 then binStateNumber (tail inPairList) (cur2, cur4, stateData : cur5, cur8, cur64, cur128) - else if stateNum <= 8 then binStateNumber (tail inPairList) (cur2, cur4, cur5, stateData : cur8, cur64, cur128) - else if stateNum <= 64 then binStateNumber (tail inPairList) (cur2, cur4, cur5, cur8, stateData : cur64, cur128) - else binStateNumber (tail inPairList) (cur2, cur4, cur5, cur8, cur64, stateData : cur128) - --- | getStateNumber returns the number of uniqe (non missing) states for a 'column' --- of nonadd bitvector values --- the charState values are in ranges for 2,4,5,8,64 and bigger numbers --- missingVal not takeen from alphabet size since that was not updated in reorganize. --- So take OR of all on bits--may be non-sequential--ie 0 2 7 so need to watch that. --- returns pair of stateNUmber class (2,4,5,8,64, >64 as 128) and list of states --- for efficient glueing back together later - -getStateNumber :: [V.Vector BV.BitVector] -> Int -> (Int, [BV.BitVector]) -getStateNumber characterDataVV characterIndex = - -- trace ("GSN:" ++ (show characterIndex) ++ " " ++ (show $ fmap V.length characterDataVV) ++ "\n\t" ++ (show $ fmap (V.! characterIndex) characterDataVV)) ( - if null characterDataVV then (0, []) - else - let thisCharV = fmap (V.! characterIndex) characterDataVV - missingVal = L.foldl1' (.|.) thisCharV - nonMissingStates = filter (/= missingVal) thisCharV - nonMissingBV = L.foldl1' (.|.) nonMissingStates - numStates = popCount nonMissingBV - - -- this turns off non-missing bits - thisCharL = (fmap (.&. nonMissingBV) thisCharV) - in - -- trace ("GSN:" ++ (show nonMissingStates) ++ "\nNMBV " ++ (show nonMissingBV) ++ " MBV " ++ (show missingVal) ++ " -> " ++ (show numStates) ) ( - if null nonMissingStates then (1, []) - else if numStates == 1 then (1, []) - else if numStates == 2 then (2, thisCharL) - else if numStates <= 4 then (4, thisCharL) - else if numStates == 5 then (5, thisCharL) - else if numStates <= 8 then (8, thisCharL) - else if numStates <= 64 then (64, thisCharL) - else (128, thisCharL) - - -- ) -- ) diff --git a/pkg/PhyGraph/Input/DataTransformation.hs b/pkg/PhyGraph/Input/DataTransformation.hs deleted file mode 100644 index f51975134..000000000 --- a/pkg/PhyGraph/Input/DataTransformation.hs +++ /dev/null @@ -1,967 +0,0 @@ -{- | -Module : DataTransformation.hs -Description : Module with functionality to transform phylogenetic data -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - -module Input.DataTransformation - ( renameData - , getDataTerminalNames - , addMissingTerminalsToInput - , checkDuplicatedTerminals - , createNaiveData - , createBVNames - , partitionSequences - , missingAligned - , setMissingBits - ) where - -import Data.Alphabet -import Data.Alphabet.Codec -import Data.Alphabet.IUPAC -import Data.Bifunctor -import Data.Bimap (Bimap) -import qualified Data.Bimap as BM -import Data.Foldable -import qualified Data.List as L -import Data.List.NonEmpty (NonEmpty) -import qualified Data.List.NonEmpty as NE -import Data.Maybe -import Data.String -import qualified Data.Text.Lazy as T -import Types.Types ---import qualified Data.BitVector as BV -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV - -import Data.Bits --- import qualified Data.Hashable as H -import qualified Data.Text.Short as ST -import Data.Word -import Debug.Trace -import Foreign.C.Types -import GeneralUtilities -import Numeric.Natural -import Text.Read -import qualified Utilities.Utilities as U --- import Debug.Trace - ---Todo-- add stuff for proper input of prealign seeunces--need charactert types set before this - --- | partitionSequences takes a character to split sequnces, usually '#'' as in POY, but can be changed --- and divides the seqeunces into corresponding partitions. Replicate character info appending --- a number to character name --- assumes that input rawdata are a single character (as in form a single file) for sequence data -partitionSequences :: ST.ShortText -> [RawData] -> [RawData] -partitionSequences partChar inDataList = - if null inDataList then [] - else - let firstRawData@(taxDataList, charInfoList) = head inDataList - in - -- for raw seqeunce data this will always be a single character - if (length charInfoList > 1) || (charType (head charInfoList) `notElem` sequenceCharacterTypes) then firstRawData : partitionSequences partChar (tail inDataList) else ( - let (leafNameList, leafDataList) = unzip taxDataList - partitionCharList = fmap (U.splitSequence partChar) leafDataList - partitionCharListByPartition = makePartitionList partitionCharList - firstPartNumber = length $ head partitionCharList - allSame = filter (== firstPartNumber) $ length <$> tail partitionCharList - pairPartitions = zip (fmap T.unpack leafNameList) (fmap length partitionCharList) - in - - -- check partition numbers consistent + 1 because of tail - if (length allSame + 1) /= length partitionCharList then errorWithoutStackTrace ("Number of sequence partitions not consistent in " ++ T.unpack (name $ head charInfoList) ++ " " ++ (show pairPartitions) ++ "\n\tThis may be due to occurence of the partition character '" ++ (show partChar) ++ "' in sequence data. This value can be changed with the 'set' command.") - - -- if single partition then nothing to do - else if firstPartNumber == 1 then firstRawData : partitionSequences partChar (tail inDataList) - - -- split data - else - trace ("\nPartitioning " ++ T.unpack (name $ head charInfoList) ++ " into " ++ show firstPartNumber ++ " segments\n") ( - - -- make new structures to create RawData list - let leafNameListList = replicate firstPartNumber leafNameList - - -- these filtered from terminal partitions - leafDataListList = fmap (fmap (filter (/= partChar))) partitionCharListByPartition - - -- create TermData - newTermDataList = joinLists leafNameListList leafDataListList - - -- filter out taxa with empty data-- so can be reconciled proerly later - newTermDataList' = fmap removeTaxaWithNoData newTermDataList - - -- create final [RawData] - charInfoListList = replicate firstPartNumber charInfoList - newRawDataList = zip newTermDataList' charInfoListList - in - --trace (" NCI " ++ (show $ newTermDataList')) - newRawDataList ++ partitionSequences partChar (tail inDataList) - - --firstRawData : (partitionSequences partChar (tail inDataList)) - )) - --- | removeTaxaWithNoData takes a single TermData list and removes taxa with empty data --- these can be created from paritioning sequences where there are no data in a --- partitition. This allows for data reconciliation/renaming later. -removeTaxaWithNoData :: [TermData] -> [TermData] -removeTaxaWithNoData inTermData = - if null inTermData then [] - else - let newData = filter ((not . null).snd) inTermData - in - --trace ((show $ length inTermData) ++ " -> " ++ (show $ length newData)) - newData - --- | joinLists takes two lists of lists (of same length) and zips the --- heads of each, then continues till all joined -joinLists :: [[a]] -> [[b]] -> [[(a,b)]] -joinLists listA listB - | length listA /= length listB = error ("Input lists not equal " ++ show (length listA, length listB)) - | null listA = [] - | otherwise = - let firstList = zip (head listA) (head listB) - in - firstList : joinLists (tail listA) (tail listB) - --- | makePartitionList take list by taxon and retuns list by partition -makePartitionList :: [[[ST.ShortText]]] -> [[[ST.ShortText]]] -makePartitionList inListList = - if null $ head inListList then [] - else - let firstParList = fmap head inListList - in - firstParList : makePartitionList (fmap tail inListList) - --- | renameData takes a list of rename Text pairs (new name, oldName) --- and replaces the old name with the new -renameData :: [(T.Text, T.Text)] -> RawData -> RawData -renameData newNamePairList inData = - if null newNamePairList then inData - else - let terminalData = fst inData - in - if null terminalData then inData - else - let newTerminalData = fmap (relabelterminalData newNamePairList) terminalData - in - (newTerminalData, snd inData) - --- | relabelterminalData takes a list of Text pairs and the terminals with the --- second name in the pairs is changed to the first -relabelterminalData :: [(T.Text, T.Text)] -> TermData -> TermData -relabelterminalData namePairList terminalData@(leafName, leafData) = - if null namePairList then terminalData - else - let foundName = find ((== leafName) .snd) namePairList - in - if isNothing foundName then - --trace ("Not renaming " ++ (T.unpack leafName)) -- ++ " " ++ show namePairList) - terminalData - else - --trace ("Renaming " ++ (T.unpack leafName) ++ " to " ++ (T.unpack $ fst $ fromJust foundName)) - (fst $ fromJust foundName, leafData) - --- | getDataTerminalNames takes all input data and gets full terminal list --- and adds missing data for terminals not in input files -getDataTerminalNames :: [RawData] -> [T.Text] -getDataTerminalNames inDataList = - if null inDataList then [] - else - L.sort $ L.nub $ fst <$> concatMap fst inDataList - --- | addMissingTerminalsToInput dataLeafNames renamedData -addMissingTerminalsToInput :: [T.Text] -> [TermData]-> RawData -> RawData -addMissingTerminalsToInput dataLeafNames curTermData inData@(termDataList, charInfoList) = - if null dataLeafNames then (reverse curTermData, charInfoList) - else - let firstLeafName = head dataLeafNames - foundLeaf = find ((== firstLeafName) .fst) termDataList - in - if isJust foundLeaf then addMissingTerminalsToInput (tail dataLeafNames) (fromJust foundLeaf : curTermData) inData - else addMissingTerminalsToInput (tail dataLeafNames) ((firstLeafName, []) : curTermData) inData - --- | checkDuplicatedTerminals takes list TermData and checks for repeated terminal names -checkDuplicatedTerminals :: [TermData] -> (Bool, [T.Text]) -checkDuplicatedTerminals inData = - if null inData then (False, []) - else - let nameList = L.group $ L.sort $ fmap fst inData - dupList = filter ((>1).length) nameList - in - if null dupList then (False, []) - else (True, fmap head dupList) - --- | joinSortFileData takes list of list of short text and merges line by line to join leaf states --- and sorts the result -joinSortFileData :: [[ST.ShortText]] -> [String] -joinSortFileData inFileLists = - if null (head inFileLists) then [] - else - let -- changed sort order (now by data file input, more or less) to reduce time complexity - -- should still be label invariant - --firstLeaf = L.sort $ ST.toString $ ST.concat $ fmap head inFileLists - firstLeaf = ST.toString $ ST.concat $ fmap head inFileLists - --firstLeaf = show $ L.sort $ fmap head inFileLists - in - firstLeaf : joinSortFileData (fmap tail inFileLists) - - --- | createBVNames takes input data, sorts the raw data, hashes, sorts those to create --- unique, label invariant (but data related so arbitrary but consistent) --- Assumes the rawData come in sorted by the data reconciliation process --- These used for vertex labels, caching, left/right DO issues -createBVNames :: [RawData] -> [(T.Text, BV.BitVector)] -createBVNames inDataList = - let rawDataList = fmap fst inDataList - textNameList = fst <$> head rawDataList - textNameList' = fst <$> last rawDataList - - fileLeafCharList = fmap (fmap snd) rawDataList - fileLeafList = fmap (fmap ST.concat) fileLeafCharList - leafList = reverse $ joinSortFileData fileLeafList - - -- hash not guaranteed to be stable over OS or library version - -- leafHash = fmap H.hash leafList - leafHash = leafList - leafHashPair = L.sortOn fst $ zip leafHash [0..(length textNameList - 1)] -- textNameList - (_, leafReoderedList) = unzip leafHashPair - -- leafOrder = sortOn fst $ zip leafReoderedList [0..((length textNameList) - 1)] - -- (nameList, intList) = unzip leafOrder - - --bv1 = BV.bitVec (length textNameList) (1 :: Integer) - boolList = replicate (length textNameList - 1) False - bv1 = BV.fromBits $ True : boolList - bvList = fmap (shiftL bv1) leafReoderedList -- [0..((length textNameList) - 1)] - in - if textNameList /= textNameList' then error "Taxa are not properly ordered in createBVNames" - else - -- trace (show $ fmap BV.toBits bvList) - zip textNameList bvList - --- | createNaiveData takes input RawData and transforms to "Naive" data. --- these data are organized into blocks (set to input filenames initially) --- and are bitvector coded, but are not organized by character type, packed ot --- optimized in any other way (prealigned-> nonadd, Sankoff. 2 state sankoff to binary, --- constant characters skipped etc) --- these processes take place later --- these data can be input to any data optimization commands and are useful --- for data output as they haven't been reordered or transformed in any way. --- the RawData is a list since it is organized by input file --- the list accumulator is to avoid Vector snoc/cons O(n) -createNaiveData :: [RawData] -> [(T.Text, BV.BitVector)] -> [BlockData] -> ProcessedData -createNaiveData inDataList leafBitVectorNames curBlockData = - -- trace ("CND: ") ( - if null inDataList - then --trace ("Naive data with " ++ (show $ length curBlockData) ++ " blocks and " ++ (show $ fmap length $ fmap V.head $ fmap snd3 curBlockData) ++ " characters") - ( V.fromList $ fmap fst leafBitVectorNames - , V.fromList $ fmap snd leafBitVectorNames - , V.fromList $ reverse curBlockData - ) - else - let (firstData, firstCharInfo) = head inDataList - in - -- empty file should have been caught earlier, but avoids some head/tail errors - if null firstCharInfo then trace "Empty CharInfo" createNaiveData (tail inDataList) leafBitVectorNames curBlockData - else - -- process data as come in--each of these should be from a single file - -- and initially assigned to a single, unique block - let thisBlockName = name $ head firstCharInfo - thisBlockCharInfo = V.fromList firstCharInfo - maxCharacterLength = maximum $ fmap length $ (fmap snd firstData) - recodedCharacters = recodeRawData (fmap fst firstData) (fmap snd firstData) firstCharInfo maxCharacterLength [] - --thisBlockGuts = V.zip (V.fromList $ fmap snd leafBitVectorNames) recodedCharacters - previousBlockName = if not $ null curBlockData then fst3 $ head curBlockData - else T.empty - thisBlockName' = if T.takeWhile (/= '#') previousBlockName /= T.takeWhile (/= '#') thisBlockName then thisBlockName - else - let oldSuffix = T.dropWhile (/= '#') previousBlockName - indexSuffix = if T.null oldSuffix then T.pack "#0" - else - let oldIndex = readMaybe (T.unpack $ T.tail oldSuffix) :: Maybe Int - newIndex = 1 + fromJust oldIndex - in - if isNothing oldIndex then error "Bad suffix in createNaiveData" - else T.pack ("#" ++ show newIndex) - in - T.append (T.takeWhile (/= '#') thisBlockName) indexSuffix - - thisBlockCharInfo'' = V.zipWith (resetAddNonAddAlphabets recodedCharacters) thisBlockCharInfo (V.fromList [0.. (V.length thisBlockCharInfo - 1)]) - - -- create "orginal" character info for later use in outputs after character recoding and transformation etc. - thisBlockCharInfo' = fmap setOrigCharInfo thisBlockCharInfo'' - - - recodedCharacters' = fmap (recodeNonAddMissingBlock thisBlockCharInfo') recodedCharacters - - thisBlockData = (thisBlockName', recodedCharacters', thisBlockCharInfo') - - (prealignedDataEqualLength, nameMinPairList, nameNonMinPairList) = checkPrealignedEqualLength (fmap fst leafBitVectorNames) thisBlockData - - in - -- trace ("CND:" ++ (show $ fmap length $ (fmap snd firstData))) ( - if not prealignedDataEqualLength then errorWithoutStackTrace ("Error on input of prealigned sequence characters in file " ++ (takeWhile (/='#') $ T.unpack thisBlockName') ++ "--not equal length [(Taxon, Length)]: \nMinimum length taxa: " ++ (show nameMinPairList) ++ "\nNon Minimum length taxa: " ++ (show nameNonMinPairList) ) - -- trace ("CND:" ++ (show $ fmap snd firstData)) ( - else - trace ("Recoding input block: " ++ T.unpack thisBlockName') - createNaiveData (tail inDataList) leafBitVectorNames (thisBlockData : curBlockData) - -- ) - --- | setOrigCharInfo takes fields from charInfo and sets the initial original charcter infomatin field --- as a singleton Vector -setOrigCharInfo :: CharInfo -> CharInfo -setOrigCharInfo inCharInfo = - let origData = (name inCharInfo, charType inCharInfo, alphabet inCharInfo) - in - inCharInfo {origInfo = V.singleton origData} - --- | recodeAddNonAddMissing takes Block data and recodes missing for additive and non-additive characters -recodeNonAddMissingBlock :: V.Vector CharInfo -> V.Vector CharacterData -> V.Vector CharacterData -recodeNonAddMissingBlock blockCharInfo singleTaxonBlockData = - V.zipWith recodeNonAddMissingCharacter blockCharInfo singleTaxonBlockData - --- | recodeAddNonAddMissingCharacter recodes additive and non -additive missing data -recodeNonAddMissingCharacter :: CharInfo -> CharacterData -> CharacterData -recodeNonAddMissingCharacter charInfo inCharData = - let inCharType = charType charInfo - in - if inCharType /= NonAdd then inCharData - else - let nonAddState = (V.head . snd3 . stateBVPrelim) inCharData - newState = if BV.isZeroVector nonAddState then V.singleton (complement nonAddState) - else V.singleton nonAddState - in - if BV.isZeroVector nonAddState then - inCharData { stateBVPrelim = (newState, newState, newState) - , stateBVFinal = newState} - else - inCharData - - --- | getAddNonAddAlphabets takes recoded character data and resets the alphabet --- field in charInfo to reflect observed states. This is used tpo properly sety missing and --- bit packing values -resetAddNonAddAlphabets :: V.Vector (V.Vector CharacterData) -> CharInfo -> Int -> CharInfo -resetAddNonAddAlphabets taxonByCharData charInfo charIndex = - let inCharType = charType charInfo - in - if inCharType `notElem` [Add, NonAdd] then charInfo - else - if inCharType == NonAdd then - let -- get actual states - inCharV = fmap (V.head . snd3 . stateBVPrelim) $ fmap V.head taxonByCharData - missingVal = BV.fromBits $ L.replicate (fromEnum $ BV.dimension (V.head . snd3 . stateBVPrelim . V.head $ V.head taxonByCharData)) True - --missingVal = V.foldl1' (.|.) inCharV - nonMissingBV = V.foldl1' (.|.) $ V.filter (/= missingVal) inCharV - - -- max in case of all missing character - numStates = max 1 (popCount nonMissingBV) - - -- numBits = BV.dimension $ (V.head . snd3 . stateBVPrelim) $ (V.head taxonByCharData) V.! charIndex - foundSymbols = fmap ST.fromString $ fmap show [0.. numStates - 1] - stateAlphabet = fromSymbolsWOGap foundSymbols -- fromSymbolsWOGap foundSymbols - in - -- trace ("RNA: " ++ (show stateAlphabet)) - charInfo {alphabet = stateAlphabet} - - else if inCharType == Add then - let (minRangeL, maxRangeL) = V.unzip $ fmap (V.head . snd3 . rangePrelim ) $ fmap (V.! charIndex) taxonByCharData - - minRange = if minimum minRangeL < (maxBound :: Int) then minimum minRangeL - else 0 - - maxRange = if maximum maxRangeL > (minBound :: Int) then maximum maxRangeL - else 0 - - foundSymbols = fmap ST.fromString $ fmap show [minRange.. maxRange] - stateAlphabet = fromSymbolsWOGap foundSymbols -- fromSymbolsWOGap foundSymbols - in - if maxRange < minRange then error ("Error in processing of additive character states " ++ (show (minRange, maxRange))) - else - -- trace ("RA: " ++ (show (minimum minRangeL, maximum maxRangeL)) ++ " -> " ++ (show (minRange, maxRange)) ++ " " ++ (show foundSymbols)) -- ++ " -> " ++ (show stateAlphabet)) - charInfo {alphabet = stateAlphabet} - - - - else error ("Unrecognized character type in resetAddNonAddAlphabets: " ++ (show inCharType)) - --- | checkPrealignedEqualLength checks prealigned type for equal length --- at this stage (called before reblocking) there should only be a single charcter per block --- but more general--if not great if > 1 character with naming -checkPrealignedEqualLength :: [T.Text] -> (NameText, V.Vector (V.Vector CharacterData), V.Vector CharInfo) -> (Bool, [(T.Text, Int)], [(T.Text, Int)]) -checkPrealignedEqualLength nameTextList (_, taxByCharacterDataVV, charInfoV) = - let numCharsIndexList = [0 .. (V.length charInfoV) - 1] - sameLengthPairList = zipWith (verifyPrealignedCharacterLength nameTextList taxByCharacterDataVV) (V.toList charInfoV) numCharsIndexList - badOnes = filter ((== False) . fst3) sameLengthPairList - in - if null badOnes then (True, [],[]) - else - (False, concat $ fmap snd3 badOnes, concat $ fmap thd3 badOnes) - --- | verifyPrealignedCharacterLength takes an index for character and examines theat character--if prealigned checks for --- equal length if prealigned then "True" -verifyPrealignedCharacterLength :: [T.Text] -> V.Vector (V.Vector CharacterData) -> CharInfo -> Int -> (Bool, [(T.Text, Int)],[(T.Text, Int)]) -verifyPrealignedCharacterLength nameTextList taxByCharacterDataVV charInfo charIndex = - let inCharType = charType charInfo - inCharV = fmap (V.! charIndex) taxByCharacterDataVV - in - if inCharType `notElem` prealignedCharacterTypes then (True, [],[]) - else - let prealigedDataLengthList = V.toList $ fmap (U.getCharacterLength' charInfo) inCharV - {- - if inCharType == AlignedSlim then V.toList $ fmap SV.length $ fmap (snd3 . alignedSlimPrelim) inCharV - else if inCharType == AlignedWide then V.toList $ fmap UV.length $ fmap (snd3 . alignedWidePrelim) inCharV - else if inCharType == AlignedHuge then V.toList $ fmap V.length $ fmap (snd3 . alignedHugePrelim) inCharV - else error ("Character type " ++ show inCharType ++ " unrecongized/not implemented") - -} - - nameLengthPairList = zip nameTextList prealigedDataLengthList - lMinLength = minimum prealigedDataLengthList - haveMinLength = filter ((== lMinLength) .snd) nameLengthPairList - notMinMinLength = filter ((/= lMinLength) .snd) nameLengthPairList - in - -- all min length then all same length - -- trace ("VPCL:" ++ (show $ (haveMinLength, notMinMinLength))) ( - if null notMinMinLength then (True, [], []) - else (False, haveMinLength, notMinMinLength) - -- ) - - --- | recodeRawData takes the ShortText representation of character states/ranges etc --- and recodes the appropriate fields in CharacterData (from Types) --- the list accumulator is to avoid Vector cons/snoc O(n) --- differentiates between seqeunce type and others with char info -recodeRawData :: [NameText] -> [[ST.ShortText]] -> [CharInfo] -> Int -> [[CharacterData]] -> V.Vector (V.Vector CharacterData) -recodeRawData inTaxNames inData inCharInfo maxCharLength curCharData = - -- trace ("RRD: ") ( - if null inTaxNames then V.fromList $ reverse $ fmap V.fromList curCharData - else - let firstData = head inData - firstDataRecoded = createLeafCharacter inCharInfo firstData maxCharLength - in - -- trace ("RRD:" ++ (show firstData)) - --trace ("Recoding " ++ (T.unpack $ head inTaxNames) ++ " as " ++ (show $ charType $ head inCharInfo) ++ "\n\t" ++ show firstDataRecoded) - --trace ((show $ length inData) ++ " " ++ (show $ length firstData) ++ " " ++ (show $ length inCharInfo) - recodeRawData (tail inTaxNames) (tail inData) inCharInfo maxCharLength (firstDataRecoded : curCharData) - -- ) - - --- | missingNonAdditive is non-additive missing character value, all 1's based on alphabet size -missingNonAdditive :: CharInfo -> CharacterData -missingNonAdditive inCharInfo = - let missingChar = V.singleton (BV.fromBits $ replicate (length $ alphabet inCharInfo) True) - in - emptyCharacter { stateBVPrelim = (missingChar, missingChar, missingChar) - , stateBVFinal = missingChar - } - - - --- | missingAdditive is additive missing character value, all 1's based on alphabet size -missingAdditive :: CharInfo -> CharacterData -missingAdditive inCharInfo = - let minState' = readMaybe (ST.toString . head . toList $ alphabet inCharInfo) :: Maybe Int - maxState' = readMaybe (ST.toString . last . toList $ alphabet inCharInfo) :: Maybe Int - minState = if isNothing minState' then 0 - else fromJust minState' - maxState = if isNothing maxState' then 0 - else fromJust maxState' - - missingRange = V.zip - (V.singleton minState) - (V.singleton maxState) - - in - emptyCharacter { rangePrelim = (missingRange, missingRange, missingRange) - , rangeFinal = missingRange - } - - --- | missingMatrix is additive missing character value, all 1's based on alphabet size --- setrting stateBVPrelim/Final for approx DO-like costs (lookup) -missingMatrix :: CharInfo -> CharacterData -missingMatrix inCharInfo = - let numStates = length $ alphabet inCharInfo - missingState = (0 :: StateCost , [] ,[]) - in - emptyCharacter { matrixStatesPrelim = V.singleton (V.replicate numStates missingState) - , matrixStatesFinal= V.singleton (V.replicate numStates missingState)} - - --- | getMissingValue takes the character type and returns the appropriate missing data value -getMissingValue :: [CharInfo] -> Int -> [CharacterData] -getMissingValue inChar maxCharLength - | null inChar = [] - | charType (head inChar) `elem` nonExactCharacterTypes = [emptyCharacter] -- [] - | charType (head inChar) `elem` prealignedCharacterTypes = [missingAligned (head inChar) maxCharLength] - | charType (head inChar) == NonAdd = missingNonAdditive (head inChar) : getMissingValue (tail inChar) maxCharLength - | charType (head inChar) == Add = missingAdditive (head inChar) : getMissingValue (tail inChar) maxCharLength - | charType (head inChar) == Matrix = missingMatrix (head inChar) : getMissingValue (tail inChar) maxCharLength - | otherwise = error ("Datatype " ++ show (charType $ head inChar) ++ " not recognized") - - --- | missingAligned creates missing data (all bits on) for prealigned data --- important n ot to sett all bits--then run of in to seg fault land -missingAligned :: CharInfo -> Int -> CharacterData -missingAligned inChar charLength = - let inCharType = charType inChar - alphSize = length $ alphabet inChar - missingElementSlim = -- CUInt type - SV.replicate charLength $ setMissingBits (0 :: CUInt) 0 alphSize - missingElementWide = -- Word64 type - UV.replicate charLength $ setMissingBits (0 :: Word64) 0 alphSize - missingElementHuge = -- bit vector type - V.replicate charLength $ (BV.fromBits $ replicate alphSize True) - in - -- trace ("MA: " ++ (show charLength) ++ (show (SV.head missingElementSlim, UV.head missingElementWide, V.head missingElementHuge))) ( - if inCharType == AlignedSlim then - emptyCharacter {alignedSlimPrelim = (missingElementSlim, missingElementSlim, missingElementSlim)} - - else if inCharType == AlignedWide then - emptyCharacter {alignedWidePrelim = (missingElementWide, missingElementWide, missingElementWide)} - - else if inCharType == AlignedHuge then - emptyCharacter {alignedHugePrelim = (missingElementHuge, missingElementHuge, missingElementHuge)} - - else error ("Datatype " ++ (show inCharType) ++ " not recognized") - -- ) - --- | setMissingBits sets the first bits by index to '1' rest left as is (0 on input) -setMissingBits :: (Show a, FiniteBits a) => a -> Int -> Int -> a -setMissingBits inVal curIndex alphSize = - if curIndex == alphSize then - -- trace ("SMB:" ++ (show (curIndex, alphSize, inVal))) - inVal - else - -- trace ("SMB:" ++ (show (curIndex, alphSize, inVal, setBit inVal curIndex))) - setMissingBits (setBit inVal curIndex) (curIndex + 1) alphSize - - --- | getStateBitVectorList takes the alphabet of a character ([ShorText]) --- and returns bitvectors (with of size alphabet) for each state in order of states in alphabet -getStateBitVectorList :: Alphabet ST.ShortText -> V.Vector (ST.ShortText, BV.BitVector) -getStateBitVectorList localStates = - if null localStates then error "Character with empty alphabet in getStateBitVectorList" - else - let stateCount = toEnum $ length localStates - stateIndexList = [0 .. stateCount - 1] - genNum = (2^) :: Word -> Natural - bvList = fmap (BV.fromNumber stateCount . genNum) stateIndexList - in V.fromList $ zip (toList localStates) bvList - - -iupacToBVPairs - :: (IsString s, Ord s) - => Alphabet s - -> Bimap (NonEmpty s) (NonEmpty s) - -> V.Vector (s, BV.BitVector) -iupacToBVPairs inputAlphabet iupac = V.fromList $ bimap NE.head encoder <$> BM.toAscList iupac - where - constructor = flip BV.fromNumber (0 :: Int) - encoder = encodeState inputAlphabet constructor - --- | nucleotideBVPairs for recoding DNA sequences --- this done to insure not recalculating everything for each base -nucleotideBVPairs :: V.Vector (ST.ShortText, BV.BitVector) -nucleotideBVPairs = iupacToBVPairs baseAlphabet iupacToDna - where - baseAlphabet = fromSymbols $ ST.fromString <$> ["A","C","G","T"] - -{- --- | getAminoAcidSequenceCodes returns the character sgtructure for an Amino Acid sequence type -getAminoAcidSequenceCodes :: Alphabet ST.ShortText -> V.Vector (ST.ShortText, BV.BitVector) -getAminoAcidSequenceCodes localAlphabet = - let stateBVList = getStateBitVectorList localAlphabet - pairB = (ST.singleton 'B', snd (stateBVList V.! 2) .|. snd (stateBVList V.! 11)) -- B = D or N - pairZ = (ST.singleton 'Z', snd (stateBVList V.! 3) .|. snd (stateBVList V.! 13))-- E or Q - pairX = (ST.singleton 'X', foldr1 (.|.) $ V.toList $ V.map snd (V.init stateBVList)) --All AA not '-' - pairQuest = (ST.singleton '?', foldr1 (.|.) $ V.toList $ V.map snd stateBVList) -- all including -'-' Not IUPAC - ambigPairVect = V.fromList [pairB, pairZ, pairX, pairQuest] - totalStateList = stateBVList V.++ ambigPairVect - - in - --trace (show $ fmap BV.showBin $ fmap snd $ totalStateList) - totalStateList --} - --- | aminoAcidBVPairs for recoding protein sequences --- this done to insure not recalculating everything for each residue --- B, Z, X, ? for ambiguities -aminoAcidBVPairs :: V.Vector (ST.ShortText, BV.BitVector) -aminoAcidBVPairs = iupacToBVPairs acidAlphabet iupacToAminoAcid - where - acidAlphabet = fromSymbols $ fromString <$> - ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y", "-"] - - --- | getBVCode take a Vector of (ShortText, BV) and returns bitvector code for --- ShortText state -getBVCode :: V.Vector (ST.ShortText, BV.BitVector) -> ST.ShortText -> BV.BitVector -getBVCode bvCodeVect inState = - let newCode = V.find ((== inState).fst) bvCodeVect - in - maybe (error ("State " ++ ST.toString inState ++ " not found in bitvect code " ++ show bvCodeVect)) snd newCode - - -getNucleotideSequenceChar :: Bool -> [ST.ShortText] -> [CharacterData] -getNucleotideSequenceChar isPrealigned stateList = - let sequenceVect - | null stateList = mempty - | otherwise = SV.fromList $ BV.toUnsignedNumber . getBVCode nucleotideBVPairs <$> stateList - newSequenceChar = if not isPrealigned then - emptyCharacter { slimPrelim = sequenceVect - , slimGapped = (sequenceVect, sequenceVect, sequenceVect) - } - else - emptyCharacter { alignedSlimPrelim = (sequenceVect, sequenceVect, sequenceVect) - } - in [newSequenceChar] - - -getAminoAcidSequenceChar :: Bool -> [ST.ShortText] -> [CharacterData] -getAminoAcidSequenceChar isPrealigned stateList = - let sequenceVect - | null stateList = mempty - | otherwise = UV.fromList $ BV.toUnsignedNumber . getBVCode aminoAcidBVPairs <$> stateList - newSequenceChar = if not isPrealigned then - emptyCharacter { widePrelim = sequenceVect - , wideGapped = (sequenceVect, sequenceVect, sequenceVect) - } - else - emptyCharacter { alignedWidePrelim = (sequenceVect, sequenceVect, sequenceVect) - } - in [newSequenceChar] - - --- | getGeneralBVCode take a Vector of (ShortText, BV) and returns bitvector code for --- ShortText state. These states can be ambiguous as in general sequences --- so states need to be parsed first --- the AND to all states makes ambiguityoes only observed states -getGeneralBVCode :: V.Vector (ST.ShortText, BV.BitVector) -> ST.ShortText -> (CUInt, Word64, BV.BitVector) -getGeneralBVCode bvCodeVect inState = - let inStateString = ST.toString inState - in - --if '[' `notElem` inStateString then --single state - if (head inStateString /= '[') && (last inStateString /= ']') then --single state - let newCode = V.find ((== inState).fst) bvCodeVect - allBVStates = V.foldl1' (.|.) (fmap snd bvCodeVect) - bvDimension = fromEnum $ BV.dimension $ snd $ V.head bvCodeVect - in - if isNothing newCode then - - --B is Aspartic Acid or Asparagine if '-' =0 then states 3 and 12. - if inState == ST.fromString "B" then - let x = BV.fromBits $ (replicate 3 False) ++ [True] ++ (replicate 8 False) ++ [True] ++ (replicate (bvDimension - 13) False) - in (BV.toUnsignedNumber x, BV.toUnsignedNumber x, x) - - -- any amino acid but not '-' - else if inState == ST.fromString "X" then - let x = allBVStates .&. (BV.fromBits (False : (replicate (bvDimension - 1) True))) - in (BV.toUnsignedNumber x, BV.toUnsignedNumber x, x) - - -- any state including '-' - else if inState == ST.fromString "?" then - let x = allBVStates .&. (BV.fromBits (replicate bvDimension True)) - in (BV.toUnsignedNumber x, BV.toUnsignedNumber x, x) - else error ("State " ++ ST.toString inState ++ " not found in bitvect code " ++ show bvCodeVect) - else let x = snd $ fromJust newCode - in (BV.toUnsignedNumber x, BV.toUnsignedNumber x, x) - else - let statesStringList = words $ tail $ init inStateString - stateList = fmap ST.fromString statesStringList - maybeBVList = fmap getBV stateList - stateBVList = fmap (snd . fromJust) maybeBVList - ambiguousBVState = foldr1 (.|.) stateBVList - in - if Nothing `elem` maybeBVList then error ("Ambiguity group " ++ inStateString ++ " contained states not found in bitvect code " ++ show bvCodeVect) - else (BV.toUnsignedNumber ambiguousBVState, BV.toUnsignedNumber ambiguousBVState, ambiguousBVState) - where getBV s = V.find ((== s).fst) bvCodeVect - --- | getGeneralSequenceChar encode general (ie not nucleotide or amino acid) sequences --- as bitvectors. Main difference with getSequenceChar is in dealing wioth ambiguities --- they need to be parsed and "or-ed" differently --- need to have all three preliminary fields populatied for some reason--prob shouldn't need that -getGeneralSequenceChar :: CharInfo -> [ST.ShortText] -> [CharacterData] -getGeneralSequenceChar inCharInfo stateList = - let cType = charType inCharInfo - -- isAligned = prealigned inCharInfo - stateBVPairVect = getStateBitVectorList $ alphabet inCharInfo - (slimVec, wideVec, hugeVec) = - if not $ null stateList - then (\(x,y,z) -> (SV.fromList $ toList x, UV.fromList $ toList y, z)) . V.unzip3 . V.fromList $ fmap (getGeneralBVCode stateBVPairVect) stateList - else (mempty, mempty, mempty) - newSequenceChar = emptyCharacter { slimPrelim = if cType `elem` [SlimSeq, NucSeq ] then slimVec else mempty - , slimGapped = if cType `elem` [SlimSeq, NucSeq ] then (slimVec, slimVec, slimVec) else (mempty, mempty, mempty) - , slimFinal = if cType `elem` [SlimSeq, NucSeq ] then slimVec else mempty - , widePrelim = if cType `elem` [WideSeq, AminoSeq] then wideVec else mempty - , wideGapped = if cType `elem` [WideSeq, AminoSeq] then (wideVec, wideVec, wideVec) else (mempty, mempty, mempty) - , wideFinal = if cType `elem` [WideSeq, AminoSeq] then wideVec else mempty - , hugePrelim = if cType == HugeSeq then hugeVec else mempty - , hugeGapped = if cType == HugeSeq then (hugeVec, hugeVec, hugeVec) else (mempty, mempty, mempty) - , hugeFinal = if cType == HugeSeq then hugeVec else mempty - , alignedSlimPrelim = if cType `elem` [AlignedSlim] then (slimVec, slimVec, slimVec) else (mempty, mempty, mempty) - , alignedSlimFinal = if cType `elem` [AlignedSlim] then slimVec else mempty - , alignedWidePrelim = if cType `elem` [AlignedWide] then (wideVec, wideVec, wideVec) else (mempty, mempty, mempty) - , alignedWideFinal = if cType `elem` [AlignedWide] then wideVec else mempty - , alignedHugePrelim = if cType `elem` [AlignedHuge] then (hugeVec, hugeVec, hugeVec) else (mempty, mempty, mempty) - , alignedHugeFinal = if cType `elem` [AlignedHuge] then hugeVec else mempty - } - in - -- trace ("GGSC" ++ (show stateList) ++ "\n" ++ (show newSequenceChar )) - [newSequenceChar] - - -{-Not used - --- | getSingleStateBV takes a single state and returns its bitvector --- based on alphabet size--does not check if ambiguous--assumes single state -getSingleStateBV :: [ST.ShortText] -> ST.ShortText -> BV.BitVector -getSingleStateBV localAlphabet localState = - let stateIndex = L.elemIndex localState localAlphabet - --bv1 = BV.bitVec (length localAlphabet) (1 :: Integer) - --bvState = bv1 BV.<<.(BV.bitVec (length localAlphabet)) (fromJust stateIndex) - bv1 = BV.fromBits (True : replicate (length localAlphabet - 1) False) - bvState = shiftL bv1 (fromJust stateIndex) - in - if isNothing stateIndex then - if localState `elem` fmap ST.fromString ["?","-"] then BV.fromBits $ replicate (length localAlphabet) True - else error ("getSingleStateBV: State " ++ ST.toString localState ++ " Not found in alphabet " ++ show localAlphabet) - else bvState --} - - --- | getStateBitVector takes teh alphabet of a character ([ShorText]) --- and returns then bitvectorfor that state in order of states in alphabet -getStateBitVector :: Alphabet ST.ShortText -> ST.ShortText -> BV.BitVector -getStateBitVector localAlphabet = encodeState localAlphabet constructor . (:[]) - where - constructor = flip BV.fromNumber (0 :: Int) - --- getMinMaxStates takes list of strings and determines the minimum and maximum integer values -getMinMaxStates :: [String] -> (Int, Int) -> (Int, Int) -getMinMaxStates inStateStringList (curMin, curMax) = - if null inStateStringList then (curMin, curMax) - else - let firstString = head inStateStringList - in - -- missing data - if firstString == "-" || firstString == "?" then getMinMaxStates (tail inStateStringList) (curMin, curMax) - -- single state - else if '[' `notElem` firstString then - let onlyInt = readMaybe firstString :: Maybe Int - in - if isNothing onlyInt then error ("State not an integer in getIntRange: " ++ firstString) - else - let minVal = if fromJust onlyInt < curMin then fromJust onlyInt - else curMin - maxVal = if fromJust onlyInt > curMax then fromJust onlyInt - else curMax - in - getMinMaxStates (tail inStateStringList) (minVal, maxVal) - - -- range of states - else - let statesStringList = words $ tail $ init firstString - stateInts = fmap readMaybe statesStringList :: [Maybe Int] - in - if Nothing `elem` stateInts then error ("Non-integer in range " ++ firstString) - else - let localMin = minimum $ fmap fromJust stateInts - localMax = maximum $ fmap fromJust stateInts - minVal = if localMin < curMin then localMin - else curMin - maxVal = if localMax > curMax then localMax - else curMax - in - getMinMaxStates (tail inStateStringList) (minVal, maxVal) - - - --- getIntRange takes the local states and returns the Integer range of an additive character --- in principle allows for > 2 states -getIntRange :: ST.ShortText -> Alphabet ST.ShortText -> (Int, Int) -getIntRange localState totalAlphabet = - let stateString = ST.toString localState - in - --single state - if (stateString == "?") || (stateString == "-") - then getMinMaxStates (ST.toString <$> toList totalAlphabet) (maxBound :: Int, minBound :: Int) - - else if '[' `notElem` stateString then - let onlyInt = readMaybe stateString :: Maybe Int - in - if isNothing onlyInt then error ("State not an integer in getIntRange: " ++ ST.toString localState) - else (fromJust onlyInt, fromJust onlyInt) - --Range of states - else - let hasDash = ST.any (== '-') localState - statesStringList = if hasDash then fmap ST.toString $ fmap (ST.filter (`notElem` ['[',']'])) $ ST.split (== '-') localState - else fmap (: []) $ ST.toString $ ST.filter (`notElem` ['[',']']) localState - --words $ tail $ init stateString - stateInts = fmap readMaybe statesStringList :: [Maybe Int] - in - -- trace ("GIR:" ++ (show localState) ++ " -> " ++ (show (minimum $ fmap fromJust stateInts, maximum $ fmap fromJust stateInts))) ( - if Nothing `elem` stateInts then error ("Non-integer in range " ++ ST.toString localState) - else (minimum $ fmap fromJust stateInts, maximum $ fmap fromJust stateInts) - -- ) - --- | getTripleList -getTripleList :: MatrixTriple -> MatrixTriple -> [ST.ShortText] -> [ST.ShortText]-> [MatrixTriple] -getTripleList hasState notHasState localAlphabet stateList = - if null localAlphabet then [] - else - let firstAlphState = head localAlphabet - in - if firstAlphState `elem` stateList then - -- trace ("State " ++ show firstAlphState ++ " in " ++ show localAlphabet) - hasState : getTripleList hasState notHasState (tail localAlphabet) stateList - else notHasState : getTripleList hasState notHasState (tail localAlphabet) stateList - --- | getInitialMatrixVector gets matrix vector -getInitialMatrixVector :: Alphabet ST.ShortText -> ST.ShortText -> V.Vector MatrixTriple -getInitialMatrixVector alphabet' localState = - let hasState = (0 :: StateCost , [] ,[]) - notHasState = (maxBound :: StateCost , [] ,[]) - localAlphabet = toList alphabet' - in - let stateString = ST.toString localState - in - --single state - if '[' `notElem` stateString then - -- trace ("GIMV: " ++ (show $ V.fromList $ getTripleList hasState notHasState localAlphabet [localState])) - V.fromList $ getTripleList hasState notHasState localAlphabet [localState] - -- polylorphic/ambiguous - else - let statesStringList = words $ tail $ init stateString - stateList = fmap ST.fromString statesStringList - in - V.fromList $ getTripleList hasState notHasState localAlphabet stateList - --- | getQualitativeCharacters processes non-sequence characters (non-additive, additive, sankoff/matrix) --- and recodes returning list of encoded characters --- reverses order due to prepending --- matrix stateBVPrelim/Final for approx matrix costs --- adddded in code for ambiguities for non-additive--somehow got lost--alphabet [robbaly wrong now as well -getQualitativeCharacters :: [CharInfo] -> [ST.ShortText] -> [CharacterData] -> [CharacterData] -getQualitativeCharacters inCharInfoList inStateList curCharList = - if null inCharInfoList then reverse curCharList - else - let firstCharInfo = head inCharInfoList - firstState = head inStateList - firstCharType = charType firstCharInfo - totalAlphabet = alphabet firstCharInfo - in - --single state - if firstCharType == NonAdd then - let stateBV = if ST.length firstState == 1 then getStateBitVector (alphabet firstCharInfo) firstState - else - let ambiguousStateST = ST.filter (`notElem` ['[', ']']) firstState - ambiguousStateString = ST.toString ambiguousStateST - stateSTList = fmap ST.singleton ambiguousStateString - stateBVList = fmap (getStateBitVector (alphabet firstCharInfo)) stateSTList - in - -- trace ("GQC: " ++ (show ambiguousStateString) ++ " " ++ (show stateSTList) ++ " " ++ (show stateBVList)) - L.foldl1' (.|.) stateBVList - newCharacter = emptyCharacter { stateBVPrelim = (V.singleton stateBV, V.singleton stateBV, V.singleton stateBV) } - in - getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) - - else if firstCharType == Add then - if firstState == ST.fromString "-1" then - getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (missingAdditive firstCharInfo : curCharList) - else - let (minRange, maxRange) = getIntRange firstState totalAlphabet - newCharacter = emptyCharacter { rangePrelim = (V.singleton (minRange, maxRange), V.singleton (minRange, maxRange), V.singleton (minRange, maxRange)) } - in - if ST.length firstState > 1 then - -- trace ("GQC: " ++ show firstState) - getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) - else getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) - - else if firstCharType == Matrix then - if firstState `elem` fmap ST.fromString ["?","-"] then - getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (missingMatrix firstCharInfo : curCharList) - else - let initialMatrixVector = getInitialMatrixVector (alphabet firstCharInfo) firstState - newCharacter = emptyCharacter { matrixStatesPrelim = V.singleton initialMatrixVector } - in - -- trace (show initialMatrixVector) ( - --trace ((show $ alphabet firstCharInfo) ++ " " ++ (ST.toString firstState)) ( - --trace ("GQC " ++ (T.unpack $ name firstCharInfo) ++ (show $ alphabet firstCharInfo) ++ " " ++ (show $ costMatrix firstCharInfo)) ( - if null (costMatrix firstCharInfo) then errorWithoutStackTrace ("\n\nMatrix character input error: No cost matrix has been specified for character " ++ T.unpack (name firstCharInfo) ++ " perhaps file " ++ (takeWhile (/= '#') $ T.unpack (name firstCharInfo)) ++ " character " ++ ((tail $ dropWhile (/= '#') $ T.unpack (name firstCharInfo)) )) - else getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) - -- ) - - else error ("Character type " ++ show firstCharType ++ " not recongnized/implemented") - - - --- | createLeafCharacter takes rawData and Charinfo and returns CharacterData type --- need to add in missing data as well -createLeafCharacter :: [CharInfo] -> [ST.ShortText] -> Int -> [CharacterData] -createLeafCharacter inCharInfoList rawDataList maxCharLength - | null inCharInfoList = - error "Null data in charInfoList createLeafCharacter" - | null rawDataList = -- missing data - getMissingValue inCharInfoList maxCharLength - | otherwise = let localCharType = charType $ head inCharInfoList - localAlphabet = alphabet $ head inCharInfoList - isNucleotideData = isAlphabetDna localAlphabet - isAminoAcidData = isAlphabetAminoAcid localAlphabet - in - -- trace ("CLC: " ++ (show localCharType)) ( - if localCharType `elem` sequenceCharacterTypes then - --in if length inCharInfoList == 1 then -- should this be `elem` sequenceCharacterTypes - case localCharType of - NucSeq -> getNucleotideSequenceChar False rawDataList - AminoSeq -> getAminoAcidSequenceChar False rawDataList - -- ambiguities different, and alphabet varies with character (potentially) - AlignedSlim -> if isNucleotideData then - getNucleotideSequenceChar True rawDataList - else getGeneralSequenceChar (head inCharInfoList) rawDataList - AlignedWide -> if isAminoAcidData then - getAminoAcidSequenceChar True rawDataList - else getGeneralSequenceChar (head inCharInfoList) rawDataList - AlignedHuge -> getGeneralSequenceChar (head inCharInfoList) rawDataList - SlimSeq -> getGeneralSequenceChar (head inCharInfoList) rawDataList - WideSeq -> getGeneralSequenceChar (head inCharInfoList) rawDataList - HugeSeq -> getGeneralSequenceChar (head inCharInfoList) rawDataList - _ -> getQualitativeCharacters inCharInfoList rawDataList [] - else if length inCharInfoList /= length rawDataList then - error "Mismatch in number of characters and character info" - else getQualitativeCharacters inCharInfoList rawDataList [] - -- ) - - diff --git a/pkg/PhyGraph/Input/FastAC.hs b/pkg/PhyGraph/Input/FastAC.hs deleted file mode 100644 index ded68a244..000000000 --- a/pkg/PhyGraph/Input/FastAC.hs +++ /dev/null @@ -1,500 +0,0 @@ -{- | -Module : FastAC.hs -Description : Module proving fasta/c sequence import functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Input.FastAC - ( getFastA - , getFastAText - , getFastaCharInfo - , getFastC - , getFastCText - , getFastcCharInfo - , genDiscreteDenseOfDimension - ) where - -import Control.DeepSeq -import Data.Alphabet -import Data.Bits -import qualified Data.Char as C -import Data.Hashable -import qualified Data.List as L -import Data.MetricRepresentation -import qualified Data.MetricRepresentation as MR -import Data.TCM (TCMDiagnosis (..), - TCMStructure (..)) -import qualified Data.TCM as TCM -import qualified Data.TCM.Dense as TCMD -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import qualified Input.DataTransformation as DT -import qualified SymMatrix as S -import Types.Types - - --- | getAlphabet takse a list of short-text lists and returns alphabet as list of short-text --- although with multicharacter alphabets that contain '[' or ']' this would be a problem, --- its only used for single character alphabets in fasta formats. --- '#' for partitions in fasta sequences -getAlphabet :: [String] -> [ST.ShortText] -> [ST.ShortText] -getAlphabet curList inList = - let notAlphElement = fmap ST.fromString ["?", "[", "]", "#"] - in - if null inList then filter (`notElem` notAlphElement) $ fmap ST.fromString $ L.sort curList `L.union` ["-"] - else - let firstChars = fmap (:[]) $ L.nub $ ST.toString $ head inList - in getAlphabet (firstChars `L.union` curList) (tail inList) - - --- | generateDefaultMatrix takes an alphabet and generates cost matrix (assuming '-' --- in already) -generateDefaultMatrix :: Alphabet ST.ShortText -> Int -> Int -> Int -> [[Int]] -generateDefaultMatrix inAlph rowCount indelCost substitutionCost = - if null inAlph then [] - else if rowCount == length inAlph then [] - else - let firstPart = if rowCount < ((length inAlph) - 1) then replicate rowCount substitutionCost - else replicate rowCount indelCost - thirdPart = if rowCount < ((length inAlph) - 1) then (replicate ((length inAlph) - rowCount - 1 - 1) substitutionCost) ++ [indelCost] - else [] - in - (firstPart ++ [0] ++ thirdPart) : generateDefaultMatrix inAlph (rowCount + 1) indelCost substitutionCost - --- | getFastaCharInfo get alphabet , names etc from processed fasta data --- this doesn't separate ambiguities from elements--processed later --- need to read in TCM or default -getFastaCharInfo :: [TermData] -> String -> String -> Bool -> ([ST.ShortText], [[Int]], Double) -> CharInfo -getFastaCharInfo inData dataName dataType isPrealigned localTCM = - if null inData then error "Empty inData in getFastaCharInfo" - else - let nucleotideAlphabet = fmap ST.fromString ["A","C","G","T","U","R","Y","S","W","K","M","B","D","H","V","N","?","-"] - lAminoAcidAlphabet = fmap ST.fromString ["A","B","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","X","Y","Z", "-","?"] - --onlyInNucleotides = [ST.fromString "U"] - --onlyInAminoAcids = fmap ST.fromString ["E","F","I","L","P","Q","X","Z"] - sequenceData = getAlphabet [] $ foldMap snd inData - - seqType - | dataType == "nucleotide" = trace ("File " ++ dataName ++ " is nucleotide data.") NucSeq - | dataType == "aminoacid" = trace ("File " ++ dataName ++ " is aminoacid data.") AminoSeq - | dataType == "custom_alphabet" = trace ("File " ++ dataName ++ " is large alphabet data.") HugeSeq - | (sequenceData `L.intersect` nucleotideAlphabet == sequenceData) = trace ("Assuming file " ++ dataName - ++ " is nucleotide data. Specify `aminoacid' filetype if this is incorrect.") NucSeq - | (sequenceData `L.intersect` lAminoAcidAlphabet == sequenceData) = trace ("Assuming file " ++ dataName - ++ " is amino acid data. Specify `nucleotide' filetype if this is incorrect.") AminoSeq - | length sequenceData <= 8 = trace ("File " ++ dataName ++ " is small alphabet data.") SlimSeq - | length sequenceData <= 64 = trace ("File " ++ dataName ++ " is wide alphabet data.") WideSeq - | otherwise = trace ("File " ++ dataName ++ " is large alphabet data.") HugeSeq - - seqAlphabet = fromSymbols seqSymbols - - seqSymbols = - let toSymbols = fmap ST.fromString - in case seqType of - NucSeq -> toSymbols ["A","C","G","T","-"] - AminoSeq -> toSymbols ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y", "-"] - _ -> sequenceData - - localCostMatrix = if null $ fst3 localTCM then - let (indelCost, substitutionCost) = if null $ snd3 localTCM then (1,1) - else ((head . head . snd3) localTCM, (last . head . snd3) localTCM) - in - S.fromLists $ generateDefaultMatrix seqAlphabet 0 indelCost substitutionCost - else S.fromLists $ snd3 localTCM - - tcmDense = TCMD.generateDenseTransitionCostMatrix 0 (fromIntegral $ V.length localCostMatrix) (S.getCost localCostMatrix) - -- not sure of this - tcmNaught = genDiscreteDenseOfDimension (length sequenceData) - localDenseCostMatrix = if seqType `elem` [NucSeq, SlimSeq] then tcmDense - else tcmNaught - - (wideWeightFactor, localWideTCM) - | seqType `elem` [WideSeq, AminoSeq] = getTCMMemo (thisAlphabet, localCostMatrix) - | otherwise = metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] -- this 2x2 so if some Show instances are called don't get error - - (hugeWeightFactor, localHugeTCM) - | seqType == HugeSeq = getTCMMemo (thisAlphabet, localCostMatrix) - | otherwise = metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] -- this 2x2 so if some Show instances are called don't get error - - tcmWeightFactor = thd3 localTCM - thisAlphabet = - case fst3 localTCM of - a | null a -> seqAlphabet - a -> fromSymbols a - - alignedSeqType = if not isPrealigned then seqType - else - if seqType `elem` [NucSeq, SlimSeq] then AlignedSlim - else if seqType `elem` [WideSeq, AminoSeq] then AlignedWide - else if seqType == HugeSeq then AlignedHuge - else error "Unrecognozed data type in getFastaCharInfo" - - defaultSeqCharInfo = emptyCharInfo { - charType = alignedSeqType - , activity = True - , weight = tcmWeightFactor * - if seqType == HugeSeq - then fromRational hugeWeightFactor - else if seqType `elem` [WideSeq, AminoSeq] - then fromRational wideWeightFactor - else 1 - , costMatrix = localCostMatrix - , slimTCM = localDenseCostMatrix - , wideTCM = localWideTCM - , hugeTCM = localHugeTCM - , name = T.pack (filter (/= ' ') dataName <> "#0") - , alphabet = thisAlphabet - , prealigned = isPrealigned - , origInfo = V.singleton (T.pack (filter (/= ' ') dataName <> "#0"), alignedSeqType, thisAlphabet) - } - in - -- trace ("FASTCINFO:" ++ (show $ charType defaultSeqCharInfo)) ( - if (null . fst3) localTCM && (null . snd3) localTCM then - trace ("Warning: no tcm file specified for use with fasta file : " ++ dataName ++ ". Using default, all 1 diagonal 0 cost matrix.") - defaultSeqCharInfo - else - trace ("Processing TCM data for file : " ++ dataName) - defaultSeqCharInfo - -- ) - --- | getTCMMemo creates the memoized tcm for large alphabet sequences -getTCMMemo - :: ( FiniteBits b - , Hashable b - , NFData b - ) - => (a, S.Matrix Int) - -> (Rational, MR.MetricRepresentation b) -getTCMMemo (_inAlphabet, inMatrix) = - let (coefficient, tcm) = TCM.fromRows $ S.getFullVects inMatrix - metric = case tcmStructure $ TCM.diagnoseTcm tcm of - NonAdditive -> discreteMetric - Additive -> linearNorm . toEnum $ TCM.size tcm - _ -> metricRepresentation tcm - in (coefficient, metric) - - --- | getSequenceAphabet take a list of ShortText with inform ation and accumulatiors --- For both nonadditive and additve looks for [] to denote ambiguity and splits states --- if splits on spaces if there are spaces (within []) (ala fastc or multicharacter states) --- else if no spaces --- if non-additive then each symbol is split out as an alphabet element -- as in TNT --- if is additive splits on '-' to denote range --- rescales (integerizes later) additive characters with decimal places to an integer type rep --- for additive charcaters if states are not nummerical then throuws an error -getSequenceAphabet :: [ST.ShortText] -> [ST.ShortText] -> [ST.ShortText] -getSequenceAphabet newAlph inStates = - if null inStates then - -- removes indel gap from alphabet if present and then (re) adds at end - -- (filter (/= (ST.singleton '-')) $ sort $ nub newAlph) ++ [ST.singleton '-'] - L.sort (L.nub newAlph) ++ [ST.singleton '-'] - else - let firstState = ST.toString $ head inStates - in - if head firstState /= '[' then - if firstState `elem` ["?","-"] then getSequenceAphabet newAlph (tail inStates) - else getSequenceAphabet (head inStates : newAlph) (tail inStates) - else -- ambiguity - let newAmbigStates = fmap ST.fromString $ words $ filter (`notElem` ['[',']']) firstState - in - getSequenceAphabet (newAmbigStates ++ newAlph) (tail inStates) - - --- | getFastcCharInfo get alphabet , names etc from processed fasta data --- this doesn't separate ambiguities from elements--processed later --- need to read in TCM or default - ---Not correct with default alphabet and matrix now after tcm recodeing added to loow for decmials I htink. -getFastcCharInfo :: [TermData] -> String -> Bool -> ([ST.ShortText], [[Int]], Double) -> CharInfo -getFastcCharInfo inData dataName isPrealigned localTCM = - if null inData then error "Empty inData in getFastcCharInfo" - else - --if null $ fst localTCM then errorWithoutStackTrace ("Must specify a tcm file with fastc data for fie : " ++ dataName) - let thisAlphabet = fromSymbols symbolsFound - - symbolsFound - | not $ null $ fst3 localTCM = fst3 localTCM - | otherwise = getSequenceAphabet [] $ concatMap snd inData - - inMatrix - | not $ null $ fst3 localTCM = S.fromLists $ snd3 localTCM - | otherwise = let (indelCost, substitutionCost) = if null $ snd3 localTCM then (1,1) - else ((head . head . snd3) localTCM, (last . head . snd3) localTCM) - in - S.fromLists $ generateDefaultMatrix thisAlphabet 0 indelCost substitutionCost - - tcmWeightFactor = thd3 localTCM - tcmDense = TCMD.generateDenseTransitionCostMatrix 0 (fromIntegral $ V.length inMatrix) (S.getCost inMatrix) - - -- not sure of this - tcmNaught = genDiscreteDenseOfDimension (length thisAlphabet) - localDenseCostMatrix = if length thisAlphabet < 9 then tcmDense - else tcmNaught - seqType = - case length thisAlphabet of - n | n <= 8 -> SlimSeq - n | n <= 64 -> WideSeq - _ -> HugeSeq - - (wideWeightFactor, localWideTCM) - | seqType `elem` [WideSeq, AminoSeq] = getTCMMemo (thisAlphabet, inMatrix) - | otherwise = metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] -- this 2x2 so if some Show instances are called don't get error - - (hugeWeightFactor, localHugeTCM) - | seqType == HugeSeq = getTCMMemo (thisAlphabet, inMatrix) - | otherwise = metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] -- this 2x2 so if some Show instances are called don't get error - alignedSeqType = if not isPrealigned then seqType - else - if seqType `elem` [NucSeq, SlimSeq] then AlignedSlim - else if seqType `elem` [WideSeq, AminoSeq] then AlignedWide - else if seqType == HugeSeq then AlignedHuge - else error "Unrecognozed data type in getFastaCharInfo" - - defaultSeqCharInfo = emptyCharInfo { - charType = alignedSeqType - , activity = True - , weight = tcmWeightFactor * - if seqType == HugeSeq - then fromRational hugeWeightFactor - else if seqType `elem` [WideSeq, AminoSeq] - then fromRational wideWeightFactor - else 1 - , costMatrix = inMatrix - , slimTCM = localDenseCostMatrix - , wideTCM = localWideTCM - , hugeTCM = localHugeTCM - , name = T.pack (filter (/= ' ') dataName ++ "#0") - , alphabet = thisAlphabet - , prealigned = isPrealigned - , origInfo = V.singleton (T.pack (filter (/= ' ') dataName ++ "#0"), alignedSeqType, thisAlphabet) - } - in - --trace ("FCI " ++ (show $ length thisAlphabet) ++ " alpha size" ++ show thisAlphabet) ( - if (null . fst3) localTCM && (null . snd3) localTCM then - trace ("Warning: no tcm file specified for use with fasta file : " ++ dataName ++ ". Using default, all 1 diagonal 0 cost matrix.") - defaultSeqCharInfo - else - trace ("Processing TCM data for file : " ++ dataName) - defaultSeqCharInfo - --) - --- | getSequenceAphabet takes a list of ShortText and returns the alp[habet and adds '-' if not present - --- | getFastA processes fasta file --- assumes single character alphabet --- deletes '-' (unless "prealigned"), and spaces -getFastA :: String -> String -> Bool-> [TermData] -getFastA fileContents' fileName isPreligned = - if null fileContents' then errorWithoutStackTrace "\n\n'Read' command error: empty file" - else - -- removes ';' comments - let fileContents = unlines $ filter (not.null) $ takeWhile (/= ';') <$> lines fileContents' - in - if head fileContents /= '>' then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" - else - let terminalSplits = T.split (=='>') $ T.pack fileContents - pairData = getRawDataPairsFastA isPreligned (tail terminalSplits) - (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData - in - -- tail because initial split will an empty text - if hasDupTerminals then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has duplicate terminals: " ++ show dupList) - else pairData - --- | getFastAText processes fasta file --- assumes single character alphabet --- deletes '-' (unless "prealigned"), and spaces -getFastAText :: T.Text -> String -> Bool-> [TermData] -getFastAText fileContents' fileName isPreligned = - if T.null fileContents' then errorWithoutStackTrace "\n\n'Read' command error: empty file" - else - -- removes ';' comments - let fileContents = T.unlines $ filter (not . T.null) $ T.takeWhile (/= ';') <$> T.lines fileContents' - in - if T.head fileContents /= '>' then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" - else - let terminalSplits = T.split (=='>') fileContents - pairData = getRawDataPairsFastA isPreligned (tail terminalSplits) - (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData - in - -- tail because initial split will an empty text - if hasDupTerminals then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has duplicate terminals: " ++ show dupList) - else pairData - - - --- | getRawDataPairsFastA takes splits of Text and returns terminalName, Data pairs--minimal error checking -getRawDataPairsFastA :: Bool -> [T.Text] -> [TermData] -getRawDataPairsFastA isPreligned inTextList = - if null inTextList then [] - else - let firstText = head inTextList - firstName = T.strip $ T.filter (/= '"') $ T.filter C.isPrint $ T.takeWhile (/= '$') $ T.takeWhile (/= ';') $ head $ T.lines firstText - firstData = T.strip $ T.filter C.isPrint $ T.filter (/= ' ') $ T.toUpper $ T.concat $ tail $ T.lines firstText - firstDataNoGaps = T.filter (/= '-') firstData - firtDataSTList = fmap (ST.fromText . T.toStrict) (T.chunksOf 1 firstData) - firstDataNoGapsSTList = fmap (ST.fromText . T.toStrict) (T.chunksOf 1 firstDataNoGaps) - in - --trace (T.unpack firstName ++ "\n" ++ T.unpack firstData) ( - -- trace ("FA " ++ (show firtDataSTList)) ( - if isPreligned then - -- trace ("GRDPF: " ++ (show isPreligned)) - (firstName, firtDataSTList) : getRawDataPairsFastA isPreligned (tail inTextList) - else (firstName, firstDataNoGapsSTList) : getRawDataPairsFastA isPreligned (tail inTextList) - -- ) - --- | getFastC processes fasta file --- assumes spaces between alphabet elements --- deletes '-' (unless "prealigned") --- NEED TO ADD AMBIGUITY -getFastC :: String -> String -> Bool -> [TermData] -getFastC fileContents' fileName isPreligned = - if null fileContents' then errorWithoutStackTrace "\n\n'Read' command error: empty file" - else - let fileContentLines = filter (not.null) $ stripString <$> lines fileContents' - in - if null fileContentLines then errorWithoutStackTrace ("File " ++ show fileName ++ " is having problems reading as 'fastc'. If this is a 'fasta' file, " - ++ "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'") - -- ';' comments if in terminal name are removed by getRawDataPairsFastC--otherwise leaves in there--unless its first character of line - -- because of latexIPA encodings using ';'(and '$') - else - let fileContents = unlines $ filter ((/=';').head) fileContentLines - in - if null fileContents then errorWithoutStackTrace ("File " ++ show fileName ++ " is having problems reading as 'fastc'. If this is a 'fasta' file, " - ++ "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'") - else if head fileContents /= '>' then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" - else - let terminalSplits = T.split (=='>') $ T.pack fileContents - pairData = recodeFASTCAmbiguities fileName $ getRawDataPairsFastC isPreligned (tail terminalSplits) - (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData - in - -- tail because initial split will an empty text - if hasDupTerminals then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has duplicate terminals: " ++ show dupList) - else pairData - --- | getFastCText processes fasta file --- assumes spaces between alphabet elements --- deletes '-' (unless "prealigned") --- NEED TO ADD AMBIGUITY -getFastCText :: T.Text -> String -> Bool -> [TermData] -getFastCText fileContents' fileName isPreligned = - if T.null fileContents' then errorWithoutStackTrace "\n\n'Read' command error: empty file" - else - let fileContentLines = filter (not . T.null) $ fmap T.strip (T.lines fileContents') - in - if null fileContentLines then errorWithoutStackTrace ("File " ++ show fileName ++ " is having problems reading as 'fastc'. If this is a 'fasta' file, " - ++ "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'") - -- ';' comments if in terminal name are removed by getRawDataPairsFastC--otherwise leaves in there--unless its first character of line - -- because of latexIPA encodings using ';'(and '$') - else - let fileContents = T.unlines $ filter ((/=';') . T.head) fileContentLines - in - if T.null fileContents then errorWithoutStackTrace ("File " ++ show fileName ++ " is having problems reading as 'fastc'. If this is a 'fasta' file, " - ++ "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'") - else if T.head fileContents /= '>' then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" - else - let terminalSplits = T.split (=='>') fileContents - pairData = recodeFASTCAmbiguities fileName $ getRawDataPairsFastC isPreligned (tail terminalSplits) - (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData - in - -- tail because initial split will an empty text - if hasDupTerminals then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has duplicate terminals: " ++ show dupList) - else pairData - --- | recodeFASTCAmbiguities take list of TermData and scans for ambiguous groups staring with '['' and ending with '] -recodeFASTCAmbiguities :: String -> [TermData] -> [TermData] -recodeFASTCAmbiguities fileName inData = - if null inData then [] - else - let (firstName, firstData) = head inData - newData = concatAmbig fileName firstData - in - (firstName, newData) : recodeFASTCAmbiguities fileName (tail inData) - --- | concatAmbig takes a list of ShortText and concatanates ambiguyous states '['X Y Z...']' into a --- single Short Tex for later processing -concatAmbig :: String -> [ST.ShortText] -> [ST.ShortText] -concatAmbig fileName inList = - if null inList then [] - else - let firstGroup = ST.toString $ head inList - in - -- not ambiguity group - -- trace (firstGroup ++ show inList) ( - if null firstGroup then concatAmbig fileName (tail inList) - else if head firstGroup /= '[' then head inList : concatAmbig fileName (tail inList) - else - let ambiguityGroup = head inList : getRestAmbiguityGroup fileName (tail inList) - in - --trace (show ambiguityGroup) - ST.concat ambiguityGroup : concatAmbig fileName (drop (length ambiguityGroup) inList) - --) - --- | getRestAmbiguityGroup takes a list of ShorText and keeps added them until one is found with ']' -getRestAmbiguityGroup :: String -> [ST.ShortText] -> [ST.ShortText] -getRestAmbiguityGroup fileName inList = - if null inList then errorWithoutStackTrace ("\n\n'Read' command error: fastc file " ++ fileName ++ " with unterminated ambiguity specification ']'") - else - let firstGroup = ST.toString $ head inList - in - if ']' `notElem` firstGroup then ST.cons ' ' (head inList) : getRestAmbiguityGroup fileName (tail inList) - else [ST.cons ' ' $ head inList] - --- | getRawDataPairsFastC takes splits of Text and returns terminalName, Data pairs--minimal error checking --- this splits on spaces in sequences -getRawDataPairsFastC :: Bool -> [T.Text] -> [TermData] -getRawDataPairsFastC isPreligned inTextList = - if null inTextList then [] - else - let firstText = head inTextList - firstName = T.strip $ T.filter (/= '"') $ T.filter C.isPrint $ T.takeWhile (/= '$') $ T.takeWhile (/= ';') $ head $ T.lines firstText - firstData = T.split (== ' ') $ T.concat $ tail $ T.lines firstText - firstDataNoGaps = filter (/= T.pack "-") firstData - in - --trace (show firstData) ( - -- trace (T.unpack firstName ++ "\n" ++ (T.unpack $ T.intercalate (T.pack " ") firstData)) ( - if isPreligned then (firstName, fmap (ST.fromText . T.toStrict) firstData) : getRawDataPairsFastC isPreligned (tail inTextList) - else (firstName, fmap (ST.fromText . T.toStrict) firstDataNoGaps) : getRawDataPairsFastC isPreligned (tail inTextList) - --- | add to tnt -genDiscreteDenseOfDimension - :: Enum i - => i - -> TCMD.DenseTransitionCostMatrix -genDiscreteDenseOfDimension d = - let n = toEnum $ fromEnum d - r = [0 .. n - 1] - m = [ [ if i==j then 0 else 1 | j <- r] | i <- r] - in TCMD.generateDenseTransitionCostMatrix n n . S.getCost $ V.fromList <$> V.fromList m diff --git a/pkg/PhyGraph/Input/ReadInputFiles.hs b/pkg/PhyGraph/Input/ReadInputFiles.hs deleted file mode 100644 index 5b0add61f..000000000 --- a/pkg/PhyGraph/Input/ReadInputFiles.hs +++ /dev/null @@ -1,462 +0,0 @@ -{- | -Module : ReadInputFiles.hs -Description : Module to read input files for phylogenetic analysis -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -Functions to peform the input file reading for PhyG --} - -module Input.ReadInputFiles -( executeReadCommands - , getReadArgs - , extractInputTuple - , expandReadCommands -) where - -import qualified Commands.Verify as V -import Data.Char -import qualified Data.Char as C -import Data.Foldable -import qualified Data.Graph.Inductive.Basic as B -import qualified Data.List as L -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import Debug.Trace -import qualified GeneralUtilities as GU -import qualified GraphFormatUtilities as GFU -import qualified Input.FastAC as FAC -import qualified Input.TNTUtilities as TNT -import System.IO -import qualified System.Path.Glob as SPG -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import Data.Maybe -import Text.Read -import qualified Data.Text.Lazy.IO as TIO - - - --- | expandReadCommands expands read commands to multiple satisfying wild cards --- read command can have multiple file names -expandReadCommands :: [Command] -> Command -> IO [Command] -expandReadCommands _newReadList inCommand@(commandType, argList') = - let argList = filter ((`notElem` ["tcm"]) . fst) argList' - tcmArgList = filter ((`elem` ["tcm"]) . fst) argList' - fileNames = fmap snd $ filter ((/="tcm") . fst) $ filter ((/= "") . snd) argList' - modifierList = fmap fst argList - in - -- trace ("ERC: " ++ (show fileNames)) ( - if commandType /= Read then error ("Incorrect command type in expandReadCommands: " ++ show inCommand) - else do - globbedFileNames <- mapM SPG.glob fileNames - if all null globbedFileNames then errorWithoutStackTrace ("File(s) not found in 'read' command (could be due to incorrect filename or missing closing double quote '\"''): " ++ show fileNames) - else - let newArgPairs = makeNewArgs <$> zip modifierList globbedFileNames - commandList = replicate (length newArgPairs) commandType - tcmArgListL = replicate (length newArgPairs) tcmArgList - in - return $ zip commandList (zipWith (++) newArgPairs tcmArgListL) - -- ) - --- | makeNewArgs takes an argument modifier (first in pair) and replicates is and zips with --- globbed file name list to create a list of arguments -makeNewArgs :: (String, [String]) -> [(String, String)] -makeNewArgs (modifier, fileNameList) = - if null fileNameList then error ("Null filename list in makeNewArgs: " ++ (show $ (modifier, fileNameList))) - else let modList = replicate (length fileNameList) modifier - in zip modList fileNameList - - -{- -Added strictness on read so could close files after reading to - allow for large number (1000's) of files to be input. - this may not be the way to go, especially if files are big and - closing them is not an issue --} - --- | extractInputTuple takes the list of pairs from mapped executeReadCommands --- and returns ([RawData], [SimpleGraph]) -extractInputTuple :: [([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)])] - -> ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) -extractInputTuple dataGraphList = - let (inDataList, inGraphList, inTerminalsList, inExcludeList, inRenamePairs, inReBlockPairs) = L.unzip6 dataGraphList - rawData = L.sort $ concat inDataList - rawGraphs = concat inGraphList - rawTerminals = concat inTerminalsList - excludeTerminals = concat inExcludeList - renamePairs = concat inRenamePairs - reBlockPairs = concat inReBlockPairs - in (rawData, rawGraphs, rawTerminals, excludeTerminals, renamePairs, reBlockPairs) - - --- | executeReadCommands wrapper for executeReadCommands' with out all the empty list imput -executeReadCommands :: [Argument] - -> IO ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) -executeReadCommands = executeReadCommands' [] [] [] [] [] [] False ([],[],1.0) - --- | executeReadCommands' reads input files and returns raw data, input graphs, and terminal taxa to include --- assumes that "prealigned" and "tcm:File" are the first arguments if they are specified --- so that they can apply to all the files in the command without order depence -executeReadCommands' :: [RawData] - -> [SimpleGraph] - -> [NameText] - -> [NameText] - -> [(NameText, NameText)] - -> [(NameText, NameText)] - -> Bool - -> ([ST.ShortText], [[Int]], Double) - -> [Argument] - -> IO ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) -executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs _ tcmPair argList = do - if null argList then return (curData, curGraphs, curTerminals, curExcludeList, curRenamePairs, curReBlockPairs) - else do - -- hPutStrLn stderr ("ERC: " ++ (show argList) ++ " " ++ (show tcmPair)) - let isPrealigned'= False --removed this option and m,asde specific to sequence types - -- | isPrealigned = True - -- | "prealigned" `elem` fmap fst argList = True - -- | otherwise = False - let (firstOption', firstFile) = head argList - let firstOption = fmap C.toLower firstOption' - --Check for prealigned - -- if firstOption == "prealigned" then - -- executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- else do - do - fileHandle <- if (',' `notElem` firstFile) then openFile firstFile ReadMode - else return (stdin :: Handle) - canBeReadFrom <- hIsReadable fileHandle - if not canBeReadFrom then errorWithoutStackTrace ("\n\n'Read' error: file " ++ firstFile ++ " cannot be read") - else - if not $ null firstOption then hPutStrLn stderr ("Reading " ++ firstFile ++ " with option " ++ firstOption) - else hPutStrLn stderr ("Reading " ++ firstFile ++ " with no options") - - -- this is awkward but need to use dot utilities - if firstOption == "dot" then do - dotGraph <- LG.hGetDotLocal fileHandle - let inputDot = GFU.relabelFGL $ LG.dotToGraph dotGraph - let hasLoops = B.hasLoop inputDot - if hasLoops then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has loops/self-edges") - else hPutStr stderr "" - let hasCycles = GFU.cyclic inputDot - if hasCycles then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has at least one cycle") - else executeReadCommands' curData (inputDot : curGraphs) curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- not "dot" files - else do - -- hPutStrLn stderr ("FC1: " ++ firstFile) - fileContents <- if (',' `notElem` firstFile) then TIO.hGetContents fileHandle - else return (T.pack firstFile) - - {--Strict version--don't think necessary--but could getvinot open fikle number issues? - -- destroys lazyness but allows closing right away - -- this so don't have huge numbers of open files for large data sets - fileContents <- hGetContents' fileHandle - hClose fileHandle - -} - - if T.null fileContents then errorWithoutStackTrace ("Error: Input file " ++ firstFile ++ " is empty") - else - -- try to figure out file type based on first and following characters - if firstOption == "tcm" then - let newTCMPair = processTCMContents (',' `elem` firstFile) (T.unpack fileContents) firstFile - in - executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' newTCMPair (tail argList) - else if null firstOption then - let firstChar = T.head $ T.dropWhile (== ' ') fileContents - in - if (toLower firstChar == '/') || (toLower firstChar == 'd') || (toLower firstChar == 'g')then do - hPutStrLn stderr ("\tTrying to parse " ++ firstFile ++ " as dot") - -- destroys lazyness but allows closing right away - -- this so don't have huge numbers of open files for large data sets - fileHandle2 <- openFile firstFile ReadMode - dotGraph <- LG.hGetDotLocal fileHandle2 - hClose fileHandle2 - let inputDot = GFU.relabelFGL $ LG.dotToGraph dotGraph - let hasLoops = B.hasLoop inputDot - if hasLoops then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has loops/self-edges") - else hPutStr stderr "" - let hasCycles = GFU.cyclic inputDot - if hasCycles then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has at least one cycle") - else executeReadCommands' curData (inputDot : curGraphs) curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else if (toLower firstChar == '<') || (toLower firstChar == '(') then - let thisGraphList = getFENewickGraphText fileContents - hasCycles = filter (== True) $ fmap GFU.cyclic thisGraphList - hasLoops = filter (== True) $ fmap B.hasLoop thisGraphList - in - if not $ null hasLoops then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has loops/self-edges") - else if not $ null hasCycles then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has at least one cycle") - else executeReadCommands' curData (thisGraphList ++ curGraphs) curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - - else if toLower firstChar == 'x' then - let tntData = TNT.getTNTDataText fileContents firstFile - in - trace ("\tTrying to parse " ++ firstFile ++ " as TNT") - executeReadCommands' (tntData : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else - let fileContents' = T.unlines $ filter (not . T.null) $ T.takeWhile (/= ';') <$> T.lines fileContents - in - if T.null (T.dropWhile (== ' ') fileContents') then errorWithoutStackTrace ("Null file '" ++ firstFile ++ "' input") - else if T.head (T.dropWhile (== ' ') fileContents') == '>' then - let secondLine = (T.lines fileContents') !! 1 - hasSpaces = T.find (== ' ') secondLine - -- numWords = length $ words secondLine - in - -- spaces between alphabet elements suggest fastc - if isJust hasSpaces then - let fastcData = FAC.getFastCText fileContents firstFile isPrealigned' - fastcCharInfo = FAC.getFastcCharInfo fastcData firstFile isPrealigned' tcmPair - in - trace ("\tTrying to parse " ++ firstFile ++ " as fastc--if it should be fasta specify 'fasta:' on input.") - executeReadCommands' ((fastcData, [fastcCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else - let fastaData = FAC.getFastAText fileContents firstFile isPrealigned' - fastaCharInfo = FAC.getFastaCharInfo fastaData firstFile firstOption isPrealigned' tcmPair - in - trace ("\tTrying to parse " ++ firstFile ++ " as fasta") - executeReadCommands' ((fastaData, [fastaCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - - else errorWithoutStackTrace ("Cannot determine file type for " ++ firstFile ++ " need to prepend type") - -- fasta - else if firstOption `elem` ["fasta", "nucleotide", "aminoacid"] then - let fastaData = FAC.getFastAText fileContents firstFile isPrealigned' - fastaCharInfo = FAC.getFastaCharInfo fastaData firstFile firstOption isPrealigned' tcmPair - in - executeReadCommands' ((fastaData, [fastaCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- fastc - else if firstOption `elem` ["fastc"] then - let fastcData = FAC.getFastCText fileContents firstFile isPrealigned' - fastcCharInfo = FAC.getFastcCharInfo fastcData firstFile isPrealigned' tcmPair - in - executeReadCommands' ((fastcData, [fastcCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - - --prealigned fasta - else if firstOption `elem` ["prefasta", "prenucleotide", "preaminoacid"] then - let fastaData = FAC.getFastAText fileContents firstFile True - fastaCharInfo = FAC.getFastaCharInfo fastaData firstFile firstOption True tcmPair - in - -- trace ("POSTREAD:" ++ (show fastaCharInfo) ++ "\n" ++ (show fastaData)) - executeReadCommands' ((fastaData, [fastaCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - - -- prealigned fastc - else if firstOption `elem` ["prefastc"] then - let fastcData = FAC.getFastCText fileContents firstFile True - fastcCharInfo = FAC.getFastcCharInfo fastcData firstFile True tcmPair - in - executeReadCommands' ((fastcData, [fastcCharInfo]) : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- tnt - -- tnt - else if firstOption == "tnt" then - let tntData = TNT.getTNTDataText fileContents firstFile - in - executeReadCommands' (tntData : curData) curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- else if firstOption == "prealigned" then executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - -- FENEwick - else if firstOption `elem` ["newick" , "enewick", "fenewick"] then - let thisGraphList = getFENewickGraphText fileContents - hasLoops = filter (== True) $ fmap B.hasLoop thisGraphList - hasCycles = filter (== True) $ fmap GFU.cyclic thisGraphList - in - if not $ null hasLoops then errorWithoutStackTrace ("Input graphin " ++ firstFile ++ " has loops/self-edges") - else if not $ null hasCycles then errorWithoutStackTrace ("Input graph in " ++ firstFile ++ " has at least one cycle") - else executeReadCommands' curData (thisGraphList ++ curGraphs) curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - - -- reading terminals list to include--must be "new" names if taxa are renamed - else if firstOption == "include" then - let terminalsList = fmap ((T.pack . filter (/= '"')) . filter C.isPrint) (words $ unlines $ U.stripComments $ fmap T.unpack $ T.lines fileContents) - in - executeReadCommands' curData curGraphs (terminalsList ++ curTerminals) curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else if firstOption == "exclude" then - let excludeList = fmap ((T.pack . filter (/= '"')) . filter C.isPrint) (words $ unlines $ U.stripComments $ fmap T.unpack $ T.lines fileContents) - in - executeReadCommands' curData curGraphs curTerminals (excludeList ++ curExcludeList) curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else if firstOption == "rename" then - let renameLines = U.stripComments $ fmap T.unpack $ T.lines fileContents - namePairs = concatMap (makeNamePairs firstFile) renameLines - in - executeReadCommands' curData curGraphs curTerminals curExcludeList (namePairs ++ curRenamePairs) curReBlockPairs isPrealigned' tcmPair (tail argList) - else if firstOption == "block" then - let renameLines = U.stripComments $ fmap T.unpack $ T.lines fileContents - blockPairs = concatMap (makeNamePairs firstFile) renameLines - in - executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs (blockPairs ++ curReBlockPairs) isPrealigned' tcmPair (tail argList) - --else if firstOption == "prealigned" then - -- executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) - else errorWithoutStackTrace ("\n\n'Read' command error: option " ++ firstOption ++ " not recognized/implemented") - --- | makeNamePairs takes lines of rename or reblock file and returns pairs of names --- for renaming or reblocking -makeNamePairs :: String -> String -> [(T.Text, T.Text)] -makeNamePairs inFileName inLine = - if null inLine then [] - else - let lineWords = fmap T.strip $ T.words $ T.pack $ filter (/= '"') $ filter C.isPrint inLine - in - if length lineWords < 2 then errorWithoutStackTrace ("Rename file " ++ inFileName ++ " line needs at least two Strings to rename the second as the first: " ++ inLine) - else - let targetNameList = replicate (length $ tail lineWords) (head lineWords) - renamePairList = zip targetNameList (tail lineWords) - in - renamePairList - - - --- | getReadArgs processes arguments ofr the 'read' command --- should allow mulitple files and gracefully error check --- also allows tcm file specification (limit 1 tcm per command?) --- as fasta, fastc, tnt, tcm, prealigned -getReadArgs :: String -> [(String, String)] -> [Argument] -getReadArgs fullCommand argList = - if null argList then [] - else - -- trace ("GRA: " ++ fullCommand ++ " " ++ (show argList)) ( - let (firstPart, secondPart) = head argList - in - -- command in wrong place like prealigned or rename after file name - if (not . null) firstPart && null secondPart then - errorWithoutStackTrace ("\n\n'Read' command error: possibly incorrect placement of option specification '" ++ firstPart - ++ "' should be before filename as in '" ++ firstPart ++ ":filename'") - - -- plain file name with no modifier - else if null firstPart then - if (head secondPart == '"') || (last secondPart == '"') then (firstPart, init $ tail secondPart) : getReadArgs fullCommand (tail argList) - else errorWithoutStackTrace ("\n\n'Read' command error: '" ++ secondPart ++"' : Need to specify filename in double quotes") - - -- Change to allowed modifiers - else if fmap toLower firstPart `notElem` V.readArgList then - errorWithoutStackTrace ("\n\n'Read' command error: " ++ fullCommand ++ " contains unrecognized option '" ++ firstPart ++ "'") - - else if null secondPart && (firstPart == "prealigned") then - (firstPart, []) : getReadArgs fullCommand (tail argList) - - else if null (tail secondPart) then argList - - else (firstPart, init $ tail secondPart) : getReadArgs fullCommand (tail argList) - -- ) - -{- Not used when migrated to Text input from String - --- | getFENewickGraph takes graph contents and returns local graph format --- could be mulitple input graphs -getFENewickGraph :: String -> [LG.Gr T.Text Double] -getFENewickGraph fileString = - getFENewickGraphText (T.pack fileString) --} - --- | getFENewickGraphText takes graph contents and returns local graph format --- could be mulitple input graphs -getFENewickGraphText :: T.Text -> [LG.Gr T.Text Double] -getFENewickGraphText fileString = - -- trace (fileString) - LG.getFENLocal (T.filter (/= '\n') $ T.strip fileString) - --- | processTCMContents --- functionality added to integerize tcm and add weight factor to allow for decimal tcm values -processTCMContents :: Bool -> String -> String -> ([ST.ShortText], [[Int]], Double) -processTCMContents indelGap inContents fileName = - if null inContents then errorWithoutStackTrace ("\n\n'Read' 'tcm' command error: empty tcmfile `" ++ fileName) - else - if indelGap then - let indelString = L.takeWhile (/= ',') inContents - substString = tail $ L.dropWhile (/= ',') inContents - indelMaybe = readMaybe indelString :: Maybe Int - substMaybe = readMaybe substString :: Maybe Int - in - if isNothing indelMaybe then errorWithoutStackTrace ("Specification of indel cost must be an Integer (Indel cost, Substitution cost): " ++ indelString) - else if isNothing substMaybe then errorWithoutStackTrace ("Specification of substitution cost must be an Integer (Indel cost, Substitution cost): " ++ substString) - else ([], [[fromJust indelMaybe, fromJust substMaybe],[]], 1.0) - --First line is alphabet - else - let tcmLines = lines inContents - localAlphabet = fmap ST.pack $ words $ head tcmLines - -- to account for '-' - numElements = 1 + length localAlphabet - costMatrixStrings = words <$> tail tcmLines - -- localCostMatrix = filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) costMatrixStrings - (scaleFactor, localCostMatrix) = getCostMatrixAndScaleFactor fileName costMatrixStrings - numLines = length localCostMatrix - lengthRowList = fmap length localCostMatrix - rowsCorrectLength = foldl' (&&) True $ fmap (== numElements) lengthRowList - in - if (any (== ST.singleton '-') localAlphabet) then errorWithoutStackTrace "\n\n'Read' 'tcm' file format error: '-' (InDel/Gap) should not be specifid as an alphabet element. It is added in automatically and assumed to be the last row and column of tcm matrix" - else if numLines /= numElements then errorWithoutStackTrace ("\n\n'Read' 'tcm' file format error: incorrect number of lines in matrix from " - ++ fileName ++ " this could be due to a missing element symbol line at the beginning of the file (e.g. A C G T, '-' is assumed) or there is a mismatch in the dimensions of the tcm (including '-') " ++ show numElements ++ " elements are implied and there are " ++ show numLines) - else if not rowsCorrectLength then errorWithoutStackTrace ("\n\n'Read' 'tcm' file format error: incorrect lines length(s) in matrix from " - ++ fileName ++ " there should be (including '-') " ++ show numElements) - else - --trace (show (scaleFactor, localCostMatrix)) - -- trace (show localAlphabet ++ "\n" ++ show localCostMatrix) - (localAlphabet ++ [ST.singleton '-'], localCostMatrix, scaleFactor) - - --- | getCostMatrixAndScaleFactor takes [[String]] and returns cost matrix as --- [[Int]] but if there are decimal values, a scalerFactor is determined and the --- cost matrix are integerized by multiplication by 1/scaleFactor --- scaleFactor will alway be a factor of 10 to allow for easier --- compilation of tcm charcaters later (weights the same...) -getCostMatrixAndScaleFactor :: String -> [[String]] -> (Double, [[Int]]) -getCostMatrixAndScaleFactor fileName inStringListList = - if null inStringListList then error "Empty list in inStringListList" - else - let maxDecimalPlaces = maximum $ getDecimals <$> concat inStringListList - scaleFactor = if maxDecimalPlaces == 0 then 1.0 - else 0.1 ** fromIntegral maxDecimalPlaces - in - if maxDecimalPlaces == 0 then (scaleFactor, filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) inStringListList) - else - let newStringListList = fmap (fmap (integerizeString maxDecimalPlaces)) inStringListList - in (scaleFactor, filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) newStringListList) - - --- | getDecimals returns the number of decimal places in a string rep of a number -getDecimals :: String -> Int -getDecimals inString = - if null inString then 0 - else - let decimalPart = dropWhile (/= '.') inString - in - if null decimalPart then 0 - else length decimalPart - 1 - --- | integerizeString integerizes a String by removing the '.' and adding the number of '0's to pad out --- adds maxDecimalPlaces - the nymber in the String -integerizeString :: Int -> String -> String -integerizeString maxDecimalPlaces inString = - if null inString then error "Null string in integerizeString" - else - let decimalPart = dropWhile (/= '.') inString - in - if inString == "0" then inString - else if null decimalPart then inString ++ replicate maxDecimalPlaces '0' - else filter (/= '.') inString ++ replicate (maxDecimalPlaces - (length decimalPart - 1)) '0' diff --git a/pkg/PhyGraph/Input/Reorganize.hs b/pkg/PhyGraph/Input/Reorganize.hs deleted file mode 100644 index 55e80d383..000000000 --- a/pkg/PhyGraph/Input/Reorganize.hs +++ /dev/null @@ -1,1076 +0,0 @@ -{- | -Module : DataTransformation.hs -Description : Module with functionality to transform phylogenetic data -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - - -module Input.Reorganize - ( combineDataByType - , reBlockData - , removeConstantCharactersPrealigned - , removeConstantCharsPrealigned - , optimizePrealignedData - , convert2BV - , getRecodingType - ) where - -import Control.Parallel.Strategies -import Data.Alphabet -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import qualified Data.Vector as V -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Data.Word -import Debug.Trace -import Foreign.C.Types (CUInt) -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified Input.BitPack as BP -import qualified ParallelUtilities as PU -import qualified SymMatrix as S -import Text.Read -import Types.Types -import qualified Utilities.Utilities as U -import qualified Data.MetricRepresentation as MR - - --- | optimizePrealignedData convert - -- prealigned to non-additive or matrix - -- here - -- bitPack new non-additive - -- packNonAdditive -optimizePrealignedData :: GlobalSettings -> ProcessedData -> ProcessedData -optimizePrealignedData inGS inData@(_, _, blockDataVect) = - let -- remove constant characters from prealigned - inData' = removeConstantCharactersPrealigned inData - - -- convert prealigned to nonadditive if all 1 tcms - inData'' = convertPrealignedToNonAdditive inData' - - -- bit packing for non-additivecharacters - inData''' = BP.packNonAdditiveData inGS inData'' - - in - - if U.getNumberPrealignedCharacters blockDataVect == 0 then - -- trace ("Not Bitpacking...") - inData - else - --trace ("Bitpacking...") - inData''' - - - --- | convertPrealignedToNonAdditive converts prealigned data to non-additive --- if homogeneous TCM (all 1's non-diagnoal) -convertPrealignedToNonAdditive :: ProcessedData -> ProcessedData -convertPrealignedToNonAdditive (nameVect, bvNameVect, blockDataVect) = (nameVect, bvNameVect, fmap convertPrealignedToNonAdditiveBlock blockDataVect) - --- | convertPrealignedToNonAdditiveBlock takes a character block and convertes prealigned to non-add if tcms all 1's --- this is done taxon by taxon and character by character since can convert with only local infomation -convertPrealignedToNonAdditiveBlock :: BlockData -> BlockData -convertPrealignedToNonAdditiveBlock (nameBlock, charDataVV, charInfoV) = - let codingTypeV = fmap fst $ fmap getRecodingType (fmap costMatrix charInfoV) - (newCharDataVV, newCharInfoVV) = V.unzip $ fmap (convertTaxonPrealignedToNonAdd charInfoV codingTypeV) charDataVV - in - (nameBlock, newCharDataVV, V.head newCharInfoVV) - --- | convertTaxonPrealignedToNonAdd takes a vector of character info and vector of charcter data for a taxon --- and recodes prealigned as non-additve if tcm's all 1s -convertTaxonPrealignedToNonAdd :: V.Vector CharInfo -> V.Vector String -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -convertTaxonPrealignedToNonAdd charInfoV codingTypeV charDataV = - V.unzip $ V.zipWith3 convertTaxonPrealignedToNonAddCharacter charInfoV codingTypeV charDataV - --- | convertTaxonPrealignedToNonAddCharacter takes a taxon character and char info and cost matrix type --- and transforms to non-additive if all tcms are 1's -convertTaxonPrealignedToNonAddCharacter :: CharInfo -> String -> CharacterData -> (CharacterData, CharInfo) -convertTaxonPrealignedToNonAddCharacter charInfo matrixType charData = - -- trace ("CTP: " ++ (show $ charType charInfo) ++ " " ++ (show $ matrixType)) ( - if charType charInfo `notElem` prealignedCharacterTypes then (charData, charInfo) - else if matrixType /= "nonAdd" then (charData, charInfo) - else - -- this coefficient do to reducion by integer factors in matrix representation - -- need to be multipled or 2:2 goes to 1:1 and weigh/cost incorrect (non for slim--ffi etc) - let matrixCoefficient = if (charType charInfo) == AlignedSlim then 1 - else if (charType charInfo) == AlignedWide then MR.minInDelCost (wideTCM charInfo) - else if (charType charInfo) == AlignedHuge then MR.minInDelCost (hugeTCM charInfo) - else error ("Unrecognized--unimplemented character type in convertTaxonPrealignedToNonAddCharacter: " ++ (show $ charType charInfo)) - - charWeight = weight charInfo - - newStateBV = if charType charInfo == AlignedSlim then - convert2BVTriple 32 $ (snd3 . alignedSlimPrelim) charData - else if charType charInfo == AlignedWide then - convert2BVTriple 64 $ (snd3 . alignedWidePrelim) charData - else if charType charInfo == AlignedHuge then - (snd3 $ alignedHugePrelim charData, snd3 $ alignedHugePrelim charData, snd3 $ alignedHugePrelim charData) - else error ("Unrecognized character type in convertTaxonPrealignedToNonAddCharacter: " ++ (show $ charType charInfo)) - in - (emptyCharacter {stateBVPrelim = newStateBV}, charInfo {charType = NonAdd, weight = charWeight * (fromIntegral $ fromEnum matrixCoefficient)}) - -- ) - - --- | convert2BVTriple takes CUInt or Word64 and converts to Triple Vector of bitvectors -convert2BVTriple :: (Integral a, GV.Vector v a) => Word -> v a -> (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -convert2BVTriple size inM = - let inMList = GV.toList inM - inMBV = fmap (BV.fromNumber size) inMList - - in - (V.fromList inMBV, V.fromList inMBV, V.fromList inMBV) - --- | convert2BV takes CUInt or Word64 and converts to Vector of bitvectors --- this for leaves so assume M only one needed really -convert2BV :: (Integral a, GV.Vector v a) => Word -> (v a, v a, v a) -> (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -convert2BV size (_, inM, _) = - let inMList = GV.toList inM - inMBV = fmap (BV.fromNumber size) inMList - - in - (V.fromList inMBV, V.fromList inMBV, V.fromList inMBV) - --- | getRecodingType takes a cost matrix and detemines if it can be recodes as non-additive, --- non-additive with gap chars, or matrix --- assumes indel costs are in last row and column -getRecodingType :: S.Matrix Int -> (String, Int) -getRecodingType inMatrix = - if S.null inMatrix then error "Null matrix in getRecodingType" - else - if (not . S.isSymmetric) inMatrix then ("matrix", 0) - else - let matrixLL = S.toFullLists inMatrix - lastRow = L.last matrixLL - numUniqueCosts = length $ L.group $ L.sort $ (filter (/= 0) $ concat matrixLL) - - in - -- trace ("GRT: " ++ (show numUniqueCosts)) ( - -- all same except for 0 - if numUniqueCosts == 1 then ("nonAdd", 0) - - else ("matrix", head lastRow) - {- - -- all same except for gaps - else if numUniqueCosts == 2 then - trace ("NAG: " ++ (show $ length $ L.group $ L.sort $ filter (/= 0) lastRow)) ( - if (length $ L.group $ filter (/= 0) lastRow) == 1 then ("nonAddGap", head lastRow) - - -- some no gaps different - else ("matrix", head lastRow) - ) - -- to many types for nonadd coding - else ("matrix", head lastRow) - - -} - -- ) - --- | reBlockData takes original block assignments--each input file is a block-- --- and combines, creates new, deletes empty blocks from user input --- reblock pair names may contain wildcards -reBlockData :: [(NameText, NameText)] -> ProcessedData -> ProcessedData -reBlockData reBlockPairs inData@(leafNames, leafBVs, blockDataV) = - -- trace ("RBD:" ++ (show $ fmap fst3 blockDataV )) ( - if null reBlockPairs then trace "Character Blocks as input files" inData - else - let -- those block to be reassigned--nub in case repeated names - toBeReblockedNames = fmap (T.filter (/= '"')) $ L.nub $ fmap snd reBlockPairs - unChangedBlocks = V.filter ((`notElemWildcards` toBeReblockedNames).fst3) blockDataV - blocksToChange = V.filter ((`elemWildcards` toBeReblockedNames).fst3) blockDataV - newBlocks = makeNewBlocks reBlockPairs blocksToChange [] - reblockedBlocks = unChangedBlocks V.++ V.fromList newBlocks - in - trace ("\nReblocking: " ++ show toBeReblockedNames ++ " leaving unchanged: " ++ show (fmap fst3 unChangedBlocks) - ++ "\n\tNew blocks: " ++ show (fmap fst3 reblockedBlocks) ++ "\n") - (leafNames, leafBVs, reblockedBlocks) - -- ) - --- | makeNewBlocks takes lists of reblock pairs and existing relevant blocks and creates new blocks returned as a list -makeNewBlocks :: [(NameText, NameText)] -> V.Vector BlockData -> [BlockData] -> [BlockData] -makeNewBlocks reBlockPairs inBlockV curBlockList - | null reBlockPairs = curBlockList - | V.null inBlockV && null curBlockList = - errorWithoutStackTrace ("Reblock pair names do not have a match for any input block--perhaps missing '#0/N'? Blocks: " ++ (show $ fmap snd reBlockPairs)) - | V.null inBlockV = curBlockList - | otherwise = - let firstBlock = V.head inBlockV - firstName = fst3 firstBlock - newPairList = fst <$> filter (textMatchWildcards firstName.snd) reBlockPairs - in - if null newPairList then errorWithoutStackTrace ("Reblock pair names do not have a match for any input block--perhaps missing '#0'? Specified pairs: " ++ show reBlockPairs - ++ " input block name: " ++ T.unpack firstName) - else if length newPairList > 1 then errorWithoutStackTrace ("Multiple reblock destinations for single input block" ++ show newPairList) - else - let newBlockName = head newPairList - existingBlock = filter ((==newBlockName).fst3) curBlockList - in - -- new block to be created - if null existingBlock then - --trace("NBlocks:" ++ (show $ fmap fst3 curBlockList)) - makeNewBlocks reBlockPairs (V.tail inBlockV) ((newBlockName, snd3 firstBlock, thd3 firstBlock) : curBlockList) - - -- existing block to be added to - else if length existingBlock > 1 then error ("Error: Block to be added to more than one block-should not happen: " ++ show reBlockPairs) - else - -- need to add character vectors to vertex vectors and add to CharInfo - -- could be multiple 'characteres' if non-exact data in inpuit file (Add, NonAdd, MAtrix etc) - let blockToAddTo = head existingBlock - newCharData = V.zipWith (V.++) (snd3 blockToAddTo) (snd3 firstBlock) - newCharInfo = thd3 blockToAddTo V.++ thd3 firstBlock - in - --trace("EBlocks:" ++ (show $ fmap fst3 curBlockList)) - makeNewBlocks reBlockPairs (V.tail inBlockV) ((newBlockName, newCharData, newCharInfo) : filter ((/=newBlockName).fst3) curBlockList) - - --- | combineDataByType combines data of same type for (exact types) into --- same vectors in character so have one non-add, one add, one of each packed type, --- can have multiple matrix (due to cost matrix differneces) --- simialr result to groupDataByType, but does not assume single characters. -combineDataByType :: GlobalSettings -> ProcessedData -> ProcessedData -combineDataByType inGS inData@(taxNames, taxBVNames, _) = - --recode add to non-add before combine-- takes care wor integer weighting - let (_, _, blockDataV') = recodeAddToNonAddCharacters inGS maxAddStatesToRecode inData - recodedData = fmap combineData blockDataV' - in - (taxNames, taxBVNames, recodedData) - --- | combineData creates for a block) lists of each data type and concats then creating new data and new char info -combineData :: BlockData -> BlockData -combineData (blockName, blockDataVV, charInfoV) = - let (newBlockDataLV, newCharInfoLV) = unzip (PU.seqParMap rdeepseq (combineBlockData charInfoV) (V.toList blockDataVV)) -- `using` PU.myParListChunkRDS) - in - (blockName, V.fromList newBlockDataLV, head newCharInfoLV) - --- | combineBlockData takes a vector of char info and vector or charcater data for a taxon and --- combined exact data types into single characters --- additive characters should have already been converted (if less that maxAddStatesToRecode) to binary --- non-additive characters with integer weights could be repeated before combining-- but this has been disabled --- due to memory issues in large data sets --- non-integer additive and non-additive are grouped together by weight so they can be combined and bit packed --- all other types are grouped by weight for efficiency of optimization and reductionn of multiplies --- zero weight characters are filtered out -combineBlockData :: V.Vector CharInfo -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -combineBlockData inCharInfoV inCharDataV = - let pairCharsInfo = V.zip inCharInfoV inCharDataV - - -- characters to not be reorganized-- nbasically the sequence characters - sequenceCharacters = V.toList $ V.filter ((> 0) . weight . fst) $ V.filter ((`elem` sequenceCharacterTypes) . charType . fst) pairCharsInfo - - -- matrix characters are more complex--can only join if same matrix - matrixCharsPair = V.filter ((> 0) . weight . fst) $ V.filter ((== Matrix) . charType . fst) pairCharsInfo - (newMatrixCharL, newMatrixCharInfoL) = if (not . null) matrixCharsPair then unzip $ organizeMatrixCharsByMatrix (V.toList matrixCharsPair) - else ([],[]) - - -- non-additive characters - -- multiple characters by weight, if only 1 weight then all together - nonAddChars = V.filter ((> 0) . weight . fst) $ V.filter ((== NonAdd) . charType . fst) pairCharsInfo - (newNonAddCharInfo, newNonAddChar) = unzip $ V.toList $ groupCharactersByWeight nonAddChars - - -- additive characters - -- multiple characters by weight, if only 1 weight then all together - - addChars = V.filter ((> 0) . weight . fst) $ V.filter ((== Add) . charType . fst) pairCharsInfo - (newAddCharInfo, newAddChar) = unzip $ V.toList $ groupCharactersByWeight addChars - - -- Packed2 characters - packed2Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed2) . charType . fst) pairCharsInfo - (newPacked2CharInfo, newPacked2Char) = unzip $ V.toList $ groupCharactersByWeight packed2Chars - - -- Packed4 characters - packed4Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed4) . charType . fst) pairCharsInfo - (newPacked4CharInfo, newPacked4Char) = unzip $ V.toList $ groupCharactersByWeight packed4Chars - - - -- Packed5 characters - packed5Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed5) . charType . fst) pairCharsInfo - (newPacked5CharInfo, newPacked5Char) = unzip $ V.toList $ groupCharactersByWeight packed5Chars - - - -- Packed8 characters - packed8Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed8) . charType . fst) pairCharsInfo - (newPacked8CharInfo, newPacked8Char) = unzip $ V.toList $ groupCharactersByWeight packed8Chars - - -- Packed64 characters - packed64Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed64) . charType . fst) pairCharsInfo - (newPacked64CharInfo, newPacked64Char) = unzip $ V.toList $ groupCharactersByWeight packed64Chars - - -- Add together all new characters, seqeunce characters and char info - -- newCharList = newNonAddCharL ++ (V.toList nonAddCharsWeightNotInt) ++ newAddCharL ++ (V.toList addCharsWeightNot1) ++ newPacked2CharL ++ newPacked4CharL ++ newPacked5CharL ++ newPacked8CharL ++ newPacked64CharL ++ newMatrixCharL ++ (fmap snd sequenceCharacters) - -- newCharInfoList = newNonAddCharInfoL ++ (V.toList nonAddCharsWeightNotIntInfo) ++ newAddCharInfoL ++ (V.toList addCharsWeightNot1Info) ++ newPacked2CharInfoL ++ newPacked4CharInfoL ++ newPacked5CharInfoL ++ newPacked8CharInfoL ++ newPacked64CharInfoL ++ newMatrixCharInfoL ++ (fmap fst sequenceCharacters) - - newCharList = newNonAddChar ++ newAddChar ++ newPacked2Char ++ newPacked4Char ++ newPacked5Char ++ newPacked8Char ++ newPacked64Char ++ newMatrixCharL ++ (fmap snd sequenceCharacters) - newCharInfoList = newNonAddCharInfo ++ newAddCharInfo ++ newPacked2CharInfo ++ newPacked4CharInfo ++ newPacked5CharInfo ++ newPacked8CharInfo ++ newPacked64CharInfo ++ newMatrixCharInfoL ++ (fmap fst sequenceCharacters) - - in - (V.fromList newCharList, V.fromList newCharInfoList) - --- | groupCharactersByWeight takes a list of characters and returns a list of lists of charcter with same weight --- checked as Double. --- NB--Does not check for character type---assuming this is desired it must be assured before input -groupCharactersByWeight :: V.Vector (CharInfo, CharacterData) -> V.Vector (CharInfo, CharacterData) -groupCharactersByWeight inCharsPairList = - if V.null inCharsPairList then V.empty - else - let weightList = L.nub $ V.toList $ fmap weight (fmap fst inCharsPairList) - charListByWeight = fmap (getSameWeightChars inCharsPairList) $ V.fromList weightList - in - V.concatMap mergeCharacters charListByWeight - --- | mergeCharacters merges the data fieed of characters based on type --- NB--Does not check all chars are same type or weight--will silently combine mempty values --- with whatever weight. --- returns list with single member so can concat later -mergeCharacters :: V.Vector (CharInfo, CharacterData) -> V.Vector (CharInfo, CharacterData) -mergeCharacters inCharsPairList = - if V.null inCharsPairList then V.empty - else - let thisCharType = (charType . fst . V.head) inCharsPairList - - -- non-add data - dataFieldNonAdd = V.concatMap (snd3 . stateBVPrelim . snd) inCharsPairList - newNonAddChar = ((snd . V.head) inCharsPairList) {stateBVPrelim = (dataFieldNonAdd, dataFieldNonAdd, dataFieldNonAdd), stateBVFinal = dataFieldNonAdd} - - -- add data - dataFieldAdd = V.concatMap (snd3 . rangePrelim . snd) inCharsPairList - newAddChar = ((snd . V.head) inCharsPairList) {rangePrelim = (dataFieldAdd, dataFieldAdd, dataFieldAdd), rangeFinal = dataFieldAdd} - - -- packed data - dataFieldPacked = UV.concat $ V.toList $ fmap (snd3 . packedNonAddPrelim . snd) inCharsPairList - newPackedChar = ((snd . V.head) inCharsPairList) {packedNonAddPrelim = (dataFieldPacked, dataFieldPacked, dataFieldPacked), packedNonAddFinal = dataFieldPacked} - - -- add info of merging to character info - newOrigInfo = V.concatMap (origInfo . fst) inCharsPairList - newCharInfo = ((fst . V.head) inCharsPairList) {origInfo = newOrigInfo} - - in - if thisCharType == NonAdd then V.singleton (newCharInfo, newNonAddChar) - else if thisCharType == Add then V.singleton (newCharInfo, newAddChar) - else if thisCharType `elem` packedNonAddTypes then V.singleton (newCharInfo, newPackedChar) - else error ("Error in mergeCharacters: Character type " ++ show thisCharType ++ " unrecognized/not implemented") - - - --- | getSameWeightChars returns character pairs with same matrix as testMatrix -getSameWeightChars :: V.Vector (CharInfo, CharacterData) -> Double -> V.Vector (CharInfo, CharacterData) -getSameWeightChars inCharsPairList testWeight = - if V.null inCharsPairList then V.empty - else - let inWeightList = fmap weight (fmap fst inCharsPairList) - weightPairPair = V.zip inWeightList inCharsPairList - matchList = V.filter ((== testWeight) . weight . fst . snd) weightPairPair - in - fmap snd matchList - -{- Currently unused--employed to reduce chrater umbers by replicating characters with integer weight - can cause memory issues with large data sets --- | replicateCharPairByWeight replicates characters by integer weight -replicateCharPairByWeight :: (CharInfo, CharacterData) -> [(CharInfo, CharacterData)] -replicateCharPairByWeight firstPair = - let charIntWeight = doubleAsInt $ (weight . fst) firstPair - in - if isNothing charIntWeight then error ("Character weight not an integer in replicateCharPair: " ++ (show $ (weight . fst) firstPair)) - else - (replicate (fromJust charIntWeight) firstPair) --} - --- | organizeMatrixCharsByMatrix combines matrix charcters if they have the same cost matrix -organizeMatrixCharsByMatrix :: [(CharInfo, CharacterData)] -> [(CharacterData, CharInfo)] -organizeMatrixCharsByMatrix inCharsPairList = - if null inCharsPairList then [] - else - let costMatrixList = L.nub $ fmap costMatrix (fmap fst inCharsPairList) - charMatrixLL = fmap (getSameMatrixChars inCharsPairList) costMatrixList - newMatrixPairs = fmap combineMatrixCharsByMatrix charMatrixLL - in - newMatrixPairs - --- | combineMatrixCharsByMatrix combines matrix characters--assumes cost matrices are the same -combineMatrixCharsByMatrix :: [(CharInfo, CharacterData)] -> (CharacterData, CharInfo) -combineMatrixCharsByMatrix inCharList = - let newMatrixcharData = V.concat $ fmap matrixStatesPrelim $ fmap snd inCharList - newMatrixChar = ((snd . head) inCharList) { matrixStatesPrelim = newMatrixcharData - , matrixStatesFinal = newMatrixcharData - } - newMatrixCharInfo = ((fst . head) inCharList) {origInfo = V.concat $ fmap (origInfo . fst) inCharList} - in - (newMatrixChar, newMatrixCharInfo) - --- | getSameMatrixChars returns character pairs with same matrix as testMatrix -getSameMatrixChars :: [(CharInfo, CharacterData)] -> S.Matrix Int -> [(CharInfo, CharacterData)] -getSameMatrixChars inCharsPairList testMatrix = - let inMatrixList = fmap costMatrix (fmap fst inCharsPairList) - matrixPairPair = zip inMatrixList inCharsPairList - matchList = filter ((== testMatrix) . costMatrix . fst . snd) matrixPairPair - in - fmap snd matchList - --- | removeConstantCharactersPrealigned takes processed data and removes constant characters --- from prealignedCharacterTypes -removeConstantCharactersPrealigned :: ProcessedData -> ProcessedData -removeConstantCharactersPrealigned (nameVect, bvNameVect, blockDataVect) = - let newBlockData = V.fromList (PU.seqParMap rdeepseq removeConstantBlockPrealigned (V.toList blockDataVect)) -- `using` PU.myParListChunkRDS) - in - (nameVect, bvNameVect, newBlockData) - --- | removeConstantBlockPrealigned takes block data and removes constant characters -removeConstantBlockPrealigned :: BlockData -> BlockData -removeConstantBlockPrealigned inBlockData@(blockName, taxVectByCharVect, charInfoV) = - -- check for null data--really reallyu shouldn't happen - if V.null taxVectByCharVect then trace ("Warning: Null block data in removeConstantBlockPrealigned") inBlockData - - -- check for prealigned data in block - else if U.getNumberPrealignedCharacters (V.singleton inBlockData) == 0 then inBlockData - else - let numChars = V.length $ V.head taxVectByCharVect - - -- create vector of single characters with vector of taxon data of sngle character each - -- like a standard matrix with a single character - singleCharVect = fmap (U.getSingleCharacter taxVectByCharVect) (V.fromList [0.. numChars - 1]) - - -- actually remove constants form chaarcter list - singleCharVect' = V.zipWith removeConstantCharsPrealigned singleCharVect charInfoV - - -- recreate the taxa vext by character vect block data expects - -- should filter out length zero characters - newTaxVectByCharVect = U.glueBackTaxChar singleCharVect' - in - (blockName, newTaxVectByCharVect, charInfoV) - --- | removeConstantCharsPrealigned takes a single 'character' and if proper type removes if all values are the same --- could be done if character has max lenght of 0 as well. --- packed types already filtered when created -removeConstantCharsPrealigned :: V.Vector CharacterData -> CharInfo -> V.Vector CharacterData -removeConstantCharsPrealigned singleChar charInfo = - let inCharType = charType charInfo - in - - -- dynamic characters don't do this - if inCharType `notElem` prealignedCharacterTypes then singleChar - else - let variableVect = getVariableChars inCharType singleChar - in - variableVect - --- | getVariableChars checks identity of states in a vector positin in all taxa --- and returns True if variable, False if constant --- bit packed and non-exact should not get in here -getVariableChars :: CharType -> V.Vector CharacterData -> V.Vector CharacterData -getVariableChars inCharType singleChar = - let nonAddV = fmap snd3 $ fmap stateBVPrelim singleChar - addV = fmap snd3 $ fmap rangePrelim singleChar - matrixV = fmap matrixStatesPrelim singleChar - alSlimV = fmap snd3 $ fmap alignedSlimPrelim singleChar - alWideV = fmap snd3 $ fmap alignedWidePrelim singleChar - alHugeV = fmap snd3 $ fmap alignedHugePrelim singleChar - - -- get identity vect - boolVar = if inCharType == NonAdd then getVarVectBits inCharType nonAddV [] - else if inCharType == Add then getVarVectAdd addV [] - else if inCharType == Matrix then getVarVectMatrix matrixV [] - else if inCharType == AlignedSlim then getVarVectBits inCharType alSlimV [] - else if inCharType == AlignedWide then getVarVectBits inCharType alWideV [] - else if inCharType == AlignedHuge then getVarVectBits inCharType alHugeV [] - else error ("Char type unrecognized in getVariableChars: " ++ show inCharType) - - -- get Variable characters by type - nonAddVariable = fmap (filterConstantsV (V.fromList boolVar)) nonAddV - addVariable = fmap (filterConstantsV (V.fromList boolVar)) addV - matrixVariable = fmap (filterConstantsV (V.fromList boolVar)) matrixV - alSlimVariable = fmap (filterConstantsSV (V.fromList boolVar)) alSlimV - alWideVariable = fmap (filterConstantsUV (V.fromList boolVar)) alWideV - alHugeVariable = fmap (filterConstantsV (V.fromList boolVar)) alHugeV - - -- assign to propoer character fields - outCharVect = V.zipWith (assignNewField inCharType) singleChar (V.zip6 nonAddVariable addVariable matrixVariable alSlimVariable alWideVariable alHugeVariable) - - in - -- trace ("GVC:" ++ (show $ length boolVar) ++ " -> " ++ (show $ length $ filter (== False) boolVar)) - outCharVect - --- | getVarVectAdd takes a vector of a vector additive ranges and returns False if range overlap --- True if not (short circuits) --- based on range overlap -getVarVectAdd :: V.Vector (V.Vector (Int, Int)) -> [Bool] -> [Bool] -getVarVectAdd stateVV curBoolList = - if V.null (V.head stateVV) then L.reverse curBoolList - - else - let firstChar = fmap V.head stateVV - isVariable = checkIsVariableAdditive (V.head firstChar) (V.tail firstChar) - - in - getVarVectAdd (fmap V.tail stateVV) (isVariable : curBoolList) - - --- | getVarVectMatrix takes a generic vector and returns False if values are same --- True if not (short circuits) --- based on simple identity not max cost zero -getVarVectMatrix :: V.Vector (V.Vector (V.Vector MatrixTriple)) -> [Bool] -> [Bool] -getVarVectMatrix stateVV curBoolList = - if V.null (V.head stateVV) then L.reverse curBoolList - - else - let firstChar = fmap V.head stateVV - isVariable = checkIsVariableMatrix (getMatrixStateList $ V.head firstChar) (V.tail firstChar) - - in - getVarVectMatrix (fmap V.tail stateVV) (isVariable : curBoolList) - - --- | getVarVectBits takes a generic vector and returns False if values are same --- True if not (short circuits) --- based on simple identity not max cost zero -getVarVectBits :: (FiniteBits a, Eq a, GV.Vector v a) => CharType -> V.Vector (v a) -> [Bool] -> [Bool] -getVarVectBits inCharType stateVV curBoolList = - if GV.null (V.head stateVV) then L.reverse curBoolList - - else - let firstChar = fmap GV.head stateVV - isVariable = if inCharType == NonAdd then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) - else if inCharType == AlignedSlim then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) - else if inCharType == AlignedWide then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) - else if inCharType == AlignedHuge then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) - else error ("Char type unrecognized in getVariableChars: " ++ show inCharType) - - in - getVarVectBits inCharType (fmap GV.tail stateVV) (isVariable : curBoolList) - -{- --- | checkIsVariableIdentity takes a generic vector and sees if all elements are identical -checkIsVariableIdentity :: (Eq a, GV.Vector v a) => a -> v a -> Bool -checkIsVariableIdentity firstElement inVect = - if GV.null inVect then False - else - if firstElement /= GV.head inVect then True - else checkIsVariableIdentity firstElement (GV.tail inVect) --} - --- | checkIsVariableAdditive checks if additive charcter is variable --- by taking new ranges and range costs of first element with all others --- if summed cost > 0 then variable -checkIsVariableAdditive :: (Int, Int) -> V.Vector (Int, Int) -> Bool -checkIsVariableAdditive (ir1, ir2) rangeList = - if V.null rangeList then False - else - let (nr1, nr2) = V.head rangeList - (newMin, newMax, newCost) = M.getNewRange ir1 ir2 nr1 nr2 - in - if newCost > 0 then True - else checkIsVariableAdditive (newMin, newMax) (V.tail rangeList) - --- | getMatrixStateList returns minimum matrix characters states as integers -getMatrixStateList :: V.Vector MatrixTriple -> [Int] -getMatrixStateList inState = - let statePairList = zip (fmap fst3 $ V.toList inState) [0..(V.length inState - 1)] - minCost = minimum $ fmap fst3 $ V.toList inState - stateList = fmap snd $ filter ((== minCost) .fst) statePairList - in - stateList - --- | checkIsVariableMatrix checks if matrix charcter is variable --- by taking min cost states of first element and --- checks for overlap--if empty list intersect then variable -checkIsVariableMatrix :: [Int] -> V.Vector (V.Vector MatrixTriple) -> Bool -checkIsVariableMatrix inStateList restStatesV = - if V.null restStatesV then False - else - let nextStateList = getMatrixStateList $ V.head restStatesV - - newStateList = L.intersect inStateList nextStateList - in - if null newStateList then True - else checkIsVariableMatrix newStateList (V.tail restStatesV) - --- | checkIsVariableBit takes a generic vector and checks for --- state overlap via bit AND (.&.) -checkIsVariableBit :: (FiniteBits a, GV.Vector v a) => a -> v a -> Bool -checkIsVariableBit firstElement restVect = - if GV.null restVect then False - else - let newState = firstElement .&. (GV.head restVect) - in - if popCount newState == 0 then True - else checkIsVariableBit newState (GV.tail restVect) - --- | these need to be abstracted but had problems with the bool list -> Generic vector, and SV pair - --- | filerConstantsV takes the charcter data and filters out teh constants --- uses filter to keep O(n) ---filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a -filterConstantsV :: V.Vector Bool -> V.Vector a -> V.Vector a -filterConstantsV inVarBoolV charVect = - let pairVect = V.zip charVect inVarBoolV - variableCharV = V.map fst $ V.filter ((== True) . snd) pairVect - in - variableCharV - - --- | filerConstantsSV takes the charcter data and filters out teh constants --- uses filter to keep O(n) ---filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a -filterConstantsSV :: (SV.Storable a) => V.Vector Bool -> SV.Vector a -> SV.Vector a -filterConstantsSV inVarBoolV charVect = - let varVect = filterConstantsV inVarBoolV (V.fromList $ SV.toList charVect) - in - SV.fromList $ V.toList varVect - --- | filerConstantsUV takes the charcter data and filters out teh constants --- uses filter to keep O(n) ---filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a -filterConstantsUV :: (UV.Unbox a) => V.Vector Bool -> UV.Vector a -> UV.Vector a -filterConstantsUV inVarBoolV charVect = - let varVect = filterConstantsV inVarBoolV (V.fromList $ UV.toList charVect) - in - UV.fromList $ V.toList varVect - --- | assignNewField takes character type and a 6-tuple of charcter fields and assigns the appropriate --- to the correct field --- neither bit packed nor nno-exact should het here -assignNewField :: CharType - -> CharacterData - -> (V.Vector BV.BitVector, V.Vector (Int, Int), V.Vector (V.Vector MatrixTriple), SV.Vector CUInt, UV.Vector Word64, V.Vector BV.BitVector) - -> CharacterData -assignNewField inCharType charData (nonAddData, addData, matrixData, alignedSlimData, alignedWideData, alignedHugeData) = - if inCharType == NonAdd then charData {stateBVPrelim = (nonAddData, nonAddData, nonAddData)} - else if inCharType == Add then charData {rangePrelim = (addData, addData, addData)} - else if inCharType == Matrix then charData {matrixStatesPrelim = matrixData} - else if inCharType == AlignedSlim then charData {alignedSlimPrelim = (alignedSlimData, alignedSlimData, alignedSlimData)} - else if inCharType == AlignedWide then charData {alignedWidePrelim = (alignedWideData, alignedWideData, alignedWideData)} - else if inCharType == AlignedHuge then charData {alignedHugePrelim = (alignedHugeData, alignedHugeData, alignedHugeData)} - else error ("Char type unrecognized in assignNewField: " ++ show inCharType) - --- | recodeAddToNonAddCharacters takes an max states number and processsed data --- and recodes additive characters with max state < input max (0..input max - 1) --- as a series of binary non-additive characters -recodeAddToNonAddCharacters :: GlobalSettings -> Int -> ProcessedData -> ProcessedData -recodeAddToNonAddCharacters inGS maxStateToRecode (nameVect, nameBVVect, blockDataVect) = - let newBlockDataVect = fmap (convertAddToNonAddBlock inGS maxStateToRecode) blockDataVect - in - (nameVect, nameBVVect, newBlockDataVect) - --- | convertAddToNonAddBlock converts additive charcters to no-additive in a block -convertAddToNonAddBlock :: GlobalSettings -> Int -> BlockData -> BlockData -convertAddToNonAddBlock inGS maxStateToRecode (blockName, taxByCharDataVV, charInfoV) = - let (newTaxByCharDataVV, newCharInfoVV) = V.unzip $ fmap (recodeTaxonData inGS maxStateToRecode charInfoV) taxByCharDataVV - in - -- trace ("CNAB: " ++ (show (V.length $ V.head newTaxByCharDataVV, V.length $ V.head newCharInfoVV))) - (blockName, newTaxByCharDataVV, V.head newCharInfoVV) - --- | recodeTaxonData recodes Add as nonAdd for each taxon in turn -recodeTaxonData :: GlobalSettings -> Int -> V.Vector CharInfo -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -recodeTaxonData inGS maxStateToRecode charInfoV taxonCharacterDataV = - let (newCharDataVV, newCharInfoVV) = unzip $ zipWith (recodeAddToNonAddCharacter inGS maxStateToRecode) (V.toList taxonCharacterDataV) (V.toList charInfoV) - in - -- trace ("RTD: " ++ (show (V.length $ V.concat newCharDataVV, V.length $ V.concat newCharInfoVV))) - (V.concat newCharDataVV, V.concat newCharInfoVV) - --- |recodeAddToNonAddCharacter takes a single character for single taxon and recodes if non-additive with --- fewer than maxStateToRecode states. --- assumes states in linear order --- replicatee charinfo for multiple new characters after recoding -recodeAddToNonAddCharacter :: GlobalSettings -> Int -> CharacterData -> CharInfo -> (V.Vector CharacterData, V.Vector CharInfo) -recodeAddToNonAddCharacter inGS maxStateToRecode inCharData inCharInfo = - let inCharType = charType inCharInfo - numStates = 1 + (L.maximum $ fmap makeInt $ alphabet inCharInfo) -- min 2 (1 + (L.last $ L.sort $ fmap makeInt $ alphabetSymbols $ alphabet inCharInfo)) - -- numStates = 1 + (L.last $ L.sort $ fmap makeInt $ alphabetSymbols $ alphabet inCharInfo) - origName = name inCharInfo - in - -- if a single state recodes to a single uninfomative binary - -- removed || ((not . doubleIsInt . weight) inCharInfo) to allow for recodding (leaving weight) for non-integer weights - if (inCharType /= Add) then (V.singleton inCharData, V.singleton inCharInfo) - - -- the limit on recoded states is removed for PMDL/ML since otherwise bit costs will be incorrect - else if (numStates > maxStateToRecode) && ((optimalityCriterion inGS) `notElem` [PMDL, Likelihood]) then (V.singleton inCharData, V.singleton inCharInfo) - else if numStates < 2 then (V.empty, V.empty) - else - -- create numStates - 1 no-additve chaaracters (V.singleton inCharData, V.singleton inCharInfo) - -- bits ON-- [0.. snd range] - let minRangeIndex = fst $ V.head $ snd3 $ rangePrelim inCharData - maxRangeIndex = snd $ V.head $ snd3 $ rangePrelim inCharData - inCharOrigData = origInfo inCharInfo - newCharInfo = inCharInfo { name = (T.pack $ (T.unpack origName) ++ "RecodedToNonAdd") - , charType = NonAdd - , alphabet = fromSymbols $ fmap ST.fromString $ fmap show [(0 :: Int), (1 :: Int)] - , origInfo = inCharOrigData - } - - -- create new characters and new character info - newCharList = fmap (makeNewNonAddChar minRangeIndex maxRangeIndex) [0..numStates - 2] - - newCharInfoList = replicate (numStates - 1) newCharInfo - - - in - -- trace ("RTNA: Numstates " ++ (show numStates) ++ " " ++ (show $ (snd3 . rangePrelim) inCharData) ++ " -> " ++ (show $ fmap (snd3 . stateBVPrelim) newCharList)) - -- (show (length newCharList, V.length $ V.replicate (numStates - 1) newCharInfo)) ++ "\n" ++ (show newCharList) ++ "\n" ++ (show $ charType newCharInfo)) - (V.fromList newCharList, V.fromList newCharInfoList) - where makeInt a = let newA = readMaybe (ST.toString a) :: Maybe Int - in - if isNothing newA then error ("State '" ++ (ST.toString a) ++ "' not recoding to Int") - else fromJust newA - --- | makeNewNonAddCharacter takes a stateIndex and charcatear number --- and makes a non-additive character with 0 or 1 coding --- based on stateIndex versus state number --- if stateIndex > charNumber then 1 else 0 (coded as bit 0 for 0, bit 1 for 1) -makeNewNonAddChar :: Int -> Int -> Int -> CharacterData -makeNewNonAddChar minStateIndex maxStateIndex charIndex = - let bvMinState = if minStateIndex <= charIndex then BV.fromBits [True, False] - else BV.fromBits [False, True] - - bvMaxState = if maxStateIndex <= charIndex then BV.fromBits [True, False] - else BV.fromBits [False, True] - - bvState = bvMinState .|. bvMaxState - in - emptyCharacter { stateBVPrelim = (V.singleton bvState, V.singleton bvState, V.singleton bvState) - , stateBVFinal = V.singleton bvState - } - -{- There is an error in this replaced by combineData -import qualified Data.Bifunctor as BF - --- | groupDataByType takes naive data (ProcessedData) and returns PrcessedData --- with characters reorganized (within blocks) - -- all non-additive (with same weight) merged to a single vector character - -- all additive with same alphabet (ie numberical) recoded to single vector - -- all matrix characters with same costmatrix recoded to single character - -- removes innactive characters -groupDataByType :: ProcessedData -> ProcessedData -groupDataByType inData = - let -- before reorganizing--convert additive with <= maxAddStatesToRecode in TYpes.hs states to non-additive ala Farris (1970) - (nameVect, nameBVVect, blockDataVect) = recodeAddToNonAddCharacters maxAddStatesToRecode inData - - -- reorganize data into same type - organizedBlockData = fmap organizeBlockData' (V.toList blockDataVect) `using` PU.myParListChunkRDS - in - --trace ("Before Taxa:" ++ (show $ length nameBVVect) ++ " Blocks:" ++ (show $ length blockDataVect) ++ " Characters:" ++ (show $ fmap length $ fmap thd3 blockDataVect) - -- ++ "\nAfter Taxa:" ++ (show $ length nameBVVect) ++ " Blocks:" ++ (show $ length organizedBlockData) ++ " Characters:" ++ (show $ fmap length $ fmap thd3 organizedBlockData)) - (nameVect, nameBVVect, V.fromList organizedBlockData) - --- | organizeBlockData' special cases and divides characters so that exact characters --- are reorgnized into single characters by type and cost matrix, while non-exact sequence --- characters are unchanged. Characters are reorganized with exact first in block then non-exact -organizeBlockData' :: BlockData -> BlockData -organizeBlockData' localBlockData = - let numExactChars = U.getNumberExactCharacters (V.singleton localBlockData) - numNonExactChars = U.getNumberSequenceCharacters (V.singleton localBlockData) - in - -- if no exact--nothing to combine - if numExactChars == 0 then localBlockData - - -- if only non-exact--split and recombine - else if numNonExactChars == 0 then organizeBlockData [] [] [] [] localBlockData - - -- if both nonexact and exact--pull out non-exact and recombine exact - else if (numExactChars > 0) && (numNonExactChars > 0) then - let (exactCharacters, nonSequenceCharacters) = U.splitBlockCharacters (snd3 localBlockData) (thd3 localBlockData) 0 [] [] - newExactCharacters = organizeBlockData [] [] [] [] exactCharacters - newCharData = V.zipWith (V.++) (snd3 newExactCharacters) (snd3 nonSequenceCharacters) - newCharInfo = thd3 newExactCharacters V.++ thd3 nonSequenceCharacters - in - (fst3 localBlockData, newCharData, newCharInfo) - else error "This shouldn't happen in organizeBlockData'" - --- | organizeBlockData takes a BlockData element and organizes its character by character type --- to single add, non-add, matrix, non-exact characters (and those with non-integer weights) are left as is due to their need for --- individual traversal graphs --- second element of tuple is a vector over taxa (leaves on input) with --- a character vector for each leaf/taxon-- basically a matrix with taxon rows and character columns --- the character info vector is same size as one for each leaf --- the first 4 args are accumulators for the character types. Matrix type is list of list since can have multiple --- matrices. All non-Exact are in same pile. --- characters with weight > 1 are recoded as multiples of same character, if weight non-integer geoes into the "unchanged" pile --- when bit packed later (if non-additive) will have only log 64 operations impact --- the pairs for some data types are to keep track of things that vary--like matrices and non-exact character information --- there should be no bit-packed data created yet -organizeBlockData :: [([CharacterData], CharInfo)] - -> [([CharacterData], CharInfo)] - -> [([[CharacterData]], CharInfo)] - -> [([CharacterData], CharInfo)] - -> BlockData - -> BlockData -organizeBlockData nonAddCharList addCharList matrixCharListList unchangedCharList (blockName, characterDataVectVect, charInfoVect) = - -- Bit of a cop out but not managing the missing data in blocks thing for multiple non-exact in block - -- need to add multiple non-exact in block - if null charInfoVect then - -- concatenate all new characters, reverse (for good measure), and convert to vectors - -- with unrecoded non-Exact characters and new CharInfo vector (reversed) - -- need to make sure the character info is in the order of return types--nonAdd, Add, Matrix etc - {-Will need a function to add all this stuff back together - (blockNamne, newCharacterVector, newCharInfoVect) - -} - --trace ("New Char lengths :" ++ (show (length nonAddCharList, length addCharList, length matrixCharListList, length unchangedCharList))) ( - let (newCharDataVectVect, newCharInfoVect) = makeNewCharacterData nonAddCharList addCharList matrixCharListList unchangedCharList - in - (blockName, newCharDataVectVect, newCharInfoVect) - -- ) - else - -- proceed character by character increasing accumulators and consuming character data vector and character infoVect - -- maybe only accumulate for matrix and non additives? - let firstCharacter = V.head charInfoVect - fCharType = charType firstCharacter - fCharWeight = weight firstCharacter - intWeight = doubleAsInt fCharWeight - fCharMatrix = costMatrix firstCharacter - fCharActivity = activity firstCharacter - fAlphabet = alphabet firstCharacter - firstCharacterTaxa = fmap U.safeVectorHead characterDataVectVect - in - -- trace ("FCT: " ++ (show $ length firstCharacterTaxa) ++ " " ++ (show characterDataVectVect)) ( - --trace ("CVDD: " ++ (show (length characterDataVectVect, fmap length characterDataVectVect))) ( - - -- remove inactive characters - if not fCharActivity || (length fAlphabet < 2) || fCharWeight == (0 :: Double) then - --trace ("Innactive/Weight 0") - organizeBlockData nonAddCharList addCharList matrixCharListList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - else - (-- trace ("OBD: " ++ (show intWeight)) ( - if isNothing intWeight then - -- add to unchanged pile - let currentUnchangedCharacter = (V.toList firstCharacterTaxa, firstCharacter) - - in - -- trace ("Unchanged character:" ++ (show $ length $ fst currentUnchangedCharacter) ++ " Name:" ++ (T.unpack $ name firstCharacter) ++ " " ++ (show (charType firstCharacter)) - -- ++ " " ++ (show $ fst currentUnchangedCharacter)) - -- trace ("Character Weight non-integer:" ++ show fCharWeight) - organizeBlockData nonAddCharList addCharList matrixCharListList (currentUnchangedCharacter : unchangedCharList) (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - - -- issue with the line "firstCharacterTaxa = fmap V.head characterDataVectVect" since missing character will be empoty and throw an error on V.head - else if fCharType `notElem` exactCharacterTypes - then errorWithoutStackTrace "Blocks with more than one Non-Exact Character not yet implemented" - - -- non-additive characters - else if fCharType == NonAdd then - let replicateNumber = fromJust intWeight - currentNonAdditiveCharacter = (V.toList $ fmap V.head characterDataVectVect, firstCharacter) - in - -- trace ("Non-Additive") ( - if replicateNumber == 1 then organizeBlockData (currentNonAdditiveCharacter : nonAddCharList) addCharList matrixCharListList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - else organizeBlockData (replicate replicateNumber currentNonAdditiveCharacter ++ nonAddCharList) addCharList matrixCharListList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - -- ) - - -- additive characters - else if fCharType == Add then - let replicateNumber = fromJust intWeight - currentAdditiveCharacter = (V.toList $ fmap V.head characterDataVectVect, firstCharacter) - in - -- trace ("Additive") ( - if replicateNumber == 1 then organizeBlockData nonAddCharList (currentAdditiveCharacter : addCharList) matrixCharListList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - else organizeBlockData nonAddCharList (replicate replicateNumber currentAdditiveCharacter ++ addCharList) matrixCharListList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - -- ) - - -- matrix characters--more complex since need to check for matrix identity - else if fCharType == Matrix then - let replicateNumber = fromJust intWeight - currentMatrixCharacter = (V.toList $ fmap V.head characterDataVectVect, firstCharacter) - newMatrixCharacterList = addMatrixCharacter matrixCharListList fCharMatrix currentMatrixCharacter replicateNumber - in - -- trace ("Matrix") ( - organizeBlockData nonAddCharList addCharList newMatrixCharacterList unchangedCharList (blockName, V.map V.tail characterDataVectVect, V.tail charInfoVect) - -- ) - - -- error in type--there should be no bit-packed data created yet. - else error ("Unrecognized/not implemented charcter type: " ++ show fCharType)) -- ) - - - --- | makeNewCharacterData takes nonAddCharList addCharList matrixCharListList unchangedCharList and synthesises them into new character data --- with a single character for the exact types (nonAdd, Add, Matrix) and mulitple characters for the "unchanged" which includes --- non-exact characters and those with non-integer weights --- and character Information vector --- these only update preliminary of their type--meant to happen before decoration processes --- emptyCharacter defined in Types -makeNewCharacterData :: [([CharacterData], CharInfo)] - -> [([CharacterData], CharInfo)] - -> [([[CharacterData]], CharInfo)] - -> [([CharacterData], CharInfo)] - -> (V.Vector (V.Vector CharacterData), V.Vector CharInfo) -makeNewCharacterData nonAddCharList addCharList matrixCharListList unchangedCharList = - let - -- Non-Additive Characters - nonAddCharacter = combineNonAdditveCharacters nonAddCharList emptyCharacter [] - - -- keep track or original data in origInfo field - origNonAddData = V.fromList $ zip3 (fmap name $ fmap snd nonAddCharList) (fmap charType $ fmap snd nonAddCharList) (fmap alphabet $ fmap snd nonAddCharList) - - nonAddCharInfo = V.singleton $ (snd $ head nonAddCharList) {name = T.pack "CombinedNonAdditiveCharacters", origInfo = origNonAddData} - - -- Additive Characters - addCharacter = combineAdditveCharacters addCharList emptyCharacter [] - - -- keep track or original data in origInfo field - origAddData = V.fromList $ zip3 (fmap name $ fmap snd addCharList) (fmap charType $ fmap snd addCharList) (fmap alphabet $ fmap snd addCharList) - - addCharInfo = V.singleton $ (snd $ head addCharList) {name = T.pack "CombinedAdditiveCharacters", origInfo = origAddData} - - -- Matrix Characters - (matrixCharacters, matrixCharInfoList) = mergeMatrixCharacters matrixCharListList emptyCharacter - - -- Unchanged characters - (unchangedCharacters, unchangeCharacterInfoList) = unzip unchangedCharList - - -- buildList incrementally - newCharacterList' = [nonAddCharacter | not (null nonAddCharacter)] - newCharacterList'' = if null addCharacter then newCharacterList' - else addCharacter : newCharacterList' - newCharacterList''' = newCharacterList'' ++ matrixCharacters ++ unchangedCharacters - - newChararacterInfoList' = [nonAddCharInfo | not (null nonAddCharacter)] - newChararacterInfoList'' = if null addCharacter then newChararacterInfoList' - else addCharInfo : newChararacterInfoList' - newChararacterInfoList''' = newChararacterInfoList'' ++ (fmap V.singleton matrixCharInfoList) ++ (fmap V.singleton unchangeCharacterInfoList) - - in - {- - trace ("Recoded Non-Additive: " ++ (show $ length nonAddCharList) ++ "->" ++ (show (length nonAddCharacter, fmap length $ fmap stateBVPrelim nonAddCharacter)) - ++ " Additive: " ++ (show $ length addCharList) ++ "->" ++ (show (length addCharacter, fmap length $ fmap rangePrelim addCharacter)) - ++ " Matrix " ++ (show $length matrixCharListList) ++ "->" ++ (show $ length matrixCharacters) - ++ " total list: " ++ (show (length newCharacterList''', fmap length newCharacterList''')) ++ " CI " ++ (show $ length newChararacterInfoList''')) - -} - (V.fromList $ V.fromList <$> L.transpose newCharacterList''', V.concat newChararacterInfoList''') - - --- | combineMatrixCharacters cretes a series of lists of characters each of which has a different cost matrix --- each character "type" (based on matrix) can have 1 or more characters -mergeMatrixCharacters :: [([[CharacterData]], CharInfo)] -> CharacterData -> ([[CharacterData]], [CharInfo]) -mergeMatrixCharacters inMatrixCharListList charTemplate = - -- should probably reverse the characters to maintian similar ordering to input - let (charDataList, charInfoList) = unzip inMatrixCharListList - combinedMatrixCharList = fmap (combineMatrixCharacters charTemplate []) charDataList - in - (combinedMatrixCharList, charInfoList) - --- | combineMatrixCharacters takes all matrix characters with same cost matrix and combines into --- a single character with vector of original characters -combineMatrixCharacters :: CharacterData -> [[V.Vector MatrixTriple]] -> [[CharacterData]] -> [CharacterData] -combineMatrixCharacters charTemplate currentTripleList inMatrixCharDataList = - if null inMatrixCharDataList then - -- create character vector for preliminary states concatenating by taxon - let taxRowCharList = L.transpose currentTripleList - newCharacterData = fmap (makeMatrixCharacterList charTemplate) taxRowCharList - in - newCharacterData - else - -- first Character - let charDataList = head inMatrixCharDataList - prelimTripleList = fmap (V.head . matrixStatesPrelim) charDataList - in - combineMatrixCharacters charTemplate (prelimTripleList : currentTripleList) (tail inMatrixCharDataList) - --- | makeMatrixCharacterList takes a taxon list of matrix characters --- and converts to single vector and makes new character for the taxon -makeMatrixCharacterList :: CharacterData -> [V.Vector MatrixTriple] -> CharacterData -makeMatrixCharacterList charTemplate tripleList = charTemplate {matrixStatesPrelim = V.fromList tripleList} - --- | combineNonAdditveCharacters takes a list of character data with singleton non-additive characters and puts --- them together in a single character for each taxon -combineNonAdditveCharacters :: [([CharacterData], CharInfo)] -> CharacterData -> [[BV.BitVector]] -> [CharacterData] -combineNonAdditveCharacters nonAddCharList charTemplate currentBVList = - if null nonAddCharList then - -- create character vector for preliminary states concatenating by taxon - -- single created and redone twice with prepend no need to reverse (that there really is anyway) - let taxRowCharList = L.transpose currentBVList - newCharacterData = fmap (makeNonAddCharacterList charTemplate) taxRowCharList - in - newCharacterData - else - -- first Character - let (charDataList, _) = head nonAddCharList - prelimBVList = fmap ((V.head . snd3) . stateBVPrelim) charDataList - in - combineNonAdditveCharacters (tail nonAddCharList) charTemplate (prelimBVList : currentBVList) - --- | combineAdditveCharacters takes a list of character data with singleton non-additive characters and puts --- them together in a single character for each taxon -combineAdditveCharacters :: [([CharacterData], CharInfo)] -> CharacterData -> [[(Int, Int)]] -> [CharacterData] -combineAdditveCharacters addCharList charTemplate currentRangeList = - if null addCharList then - -- create character vector for preliminary states concatenating by taxon - -- single created and redone twice with prepend no need to reverse (that there really is anyway) - let taxRowCharList = L.transpose currentRangeList - newCharacterData = fmap (makeAddCharacterList charTemplate) taxRowCharList - in - newCharacterData - else - -- first Character - let (charDataList, _) = head addCharList - prelimRangeList = fmap ((V.head . snd3) . rangePrelim) charDataList - in - combineAdditveCharacters (tail addCharList) charTemplate (prelimRangeList : currentRangeList) - --- | makeNonAddCharacterList takes a taxon list of characters --- convertes chars to single vector and makes new character for the taxon --- assumes a leaf so all fields same -makeNonAddCharacterList :: CharacterData -> [BV.BitVector] -> CharacterData -makeNonAddCharacterList charTemplate bvList = charTemplate {stateBVPrelim = (V.fromList bvList, V.fromList bvList,V.fromList bvList)} - --- | makeAddCharacterList takes a taxon list of characters --- to single vector and makes new character for the taxon --- assums a leaf so so all fields same -makeAddCharacterList :: CharacterData -> [(Int, Int)] -> CharacterData -makeAddCharacterList charTemplate rangeList = charTemplate {rangePrelim = (V.fromList rangeList, V.fromList rangeList, V.fromList rangeList)} - --- | addMatrixCharacter adds a matrix character to the appropriate (by cost matrix) list of matrix characters --- replicates character by integer weight -addMatrixCharacter :: [([[CharacterData]], CharInfo)] -> S.Matrix Int -> ([CharacterData], CharInfo)-> Int -> [([[CharacterData]], CharInfo)] -addMatrixCharacter inMatrixCharacterList currentCostMatrix currentMatrixCharacter replicateNumber = - if null inMatrixCharacterList then - -- didn't find a match --so need to add new type to list of matrix character types - if replicateNumber == 1 then - [([fst currentMatrixCharacter], snd currentMatrixCharacter)] - - else - [BF.first (replicate replicateNumber) currentMatrixCharacter] - - else - let firstList@(firstMatrixCharList, localCharInfo) = head inMatrixCharacterList - firstMatrix = costMatrix localCharInfo - in - - -- matrices match--so correct matrix character type - if firstMatrix == currentCostMatrix then - if replicateNumber == 1 then - (fst currentMatrixCharacter : firstMatrixCharList, localCharInfo) : tail inMatrixCharacterList - - else - (replicate replicateNumber (fst currentMatrixCharacter) ++ firstMatrixCharList, localCharInfo) : tail inMatrixCharacterList - - -- matrices don't match so recurse to next one - else firstList : addMatrixCharacter (tail inMatrixCharacterList) currentCostMatrix currentMatrixCharacter replicateNumber --} diff --git a/pkg/PhyGraph/Input/TNTUtilities.hs b/pkg/PhyGraph/Input/TNTUtilities.hs deleted file mode 100644 index 0d62dc870..000000000 --- a/pkg/PhyGraph/Input/TNTUtilities.hs +++ /dev/null @@ -1,718 +0,0 @@ -{- | -Module : TNTUtilities.hs -Description : Module to read tnt input files for phylogenetic analysis -Copyright : (c) 2021-2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -Functions to input TNT file reading for PhyG - This is far from complete wrt TNT functionality - Deals with scope and add/nonadd/sankloff - ccode, ccosts - Ambiguities in "dense" tnt rows (single character states, no spaces) - Ambiguiities and multi-character states (e.g. CYTB, 1.23) - Will limit continuous character reps to 9 sig digits - this to allow 2x32 bit representations ina single Word64 later - -One Big thing-- - For multicharacter states-- - the parsing assumes there is more than one multicharacter character. - Can easily add a char from single-state (with spaces) - But the parsing relies on counting hte number of "words" in a line of data - if 2--single character states - if > 2 --multicharacter states. - A singl multi-character states would be parsed as single charcter states - NEED TO FIX --} - -module Input.TNTUtilities (getTNTData - , getTNTDataText - ) where - -import Data.Alphabet -import Data.Char -import qualified Data.Char as C -import Data.Foldable -import qualified Data.List as L -import Data.Maybe -import Data.MetricRepresentation -import qualified Data.Set as Set -import qualified Data.TCM as TCM -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import Debug.Trace ---import qualified GeneralUtilities as GU -import qualified Input.DataTransformation as DT -import qualified Input.FastAC as FAC -import qualified SymMatrix as SM -import qualified Data.Vector as V -import Text.Read -import Types.Types - - --- getTNTData take file contents and returns raw data and char info form TNT file -getTNTData :: String -> String -> RawData -getTNTData inString fileName = - if null inString then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--empty file") - else - getTNTDataText (T.pack inString) fileName - --- getTNTDataText take file contents and returns raw data and char info form TNT file -getTNTDataText :: T.Text -> String -> RawData -getTNTDataText inString fileName = - if T.null inString then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--empty file") - else - let inString' = T.unlines $ filter (not . T.null) $ fmap T.strip (T.lines inString) - inText = T.strip inString' - in - if toLower (T.head inText) /= 'x' then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--must begin with 'xread'") - else - -- look for quoted message - let singleQuotes = T.count (T.pack "'") inText - quotedMessage - | singleQuotes == 0 = T.pack "No TNT title message" - | singleQuotes > 2 = errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--too many single quotes in title") - | otherwise = T.split (== '\'') inText !! 1 - (firstNum, secondNum, remainderText) = removeNCharNTax $ T.split (== '\'') inText !! 2 - numCharM = readMaybe (T.unpack firstNum) :: Maybe Int - numTaxM = readMaybe (T.unpack secondNum) :: Maybe Int - restFile = filter ((>0). T.length) $ T.lines remainderText - - numChar = fromJust numCharM - numTax = fromJust numTaxM - in - -- trace (show quotedMessage ++ " " ++ (show remainderText) ++ "\n" ++ show restFile) ( - if T.null inText then errorWithoutStackTrace ("n\nTNT input file " ++ fileName ++ " processing error--empty TNT contents") - else if null restFile then errorWithoutStackTrace ("n\nTNT input file " ++ fileName ++ " processing error--empty TNT contents after first line") - else if isNothing numCharM then errorWithoutStackTrace ("n\nTNT input file " ++ fileName ++ " processing error--number of characters:" ++ show (T.unpack firstNum)) - else if isNothing numTaxM then errorWithoutStackTrace ("n\nTNT input file " ++ fileName ++ " processing error--number of taxa:" ++ show (T.unpack secondNum)) - else - trace ("\nTNT file file " ++ fileName ++ " message : " ++ T.unpack quotedMessage ++ " with " ++ show numTax ++ " taxa and " ++ show numChar ++ " characters") ( - let semiColonLineNumber = L.findIndex ((== ';').T.head) restFile -- (== T.pack ";") restFile - in - if isNothing semiColonLineNumber then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--can't find ';' to end data block" ++ show restFile) - else - let dataBlock = filter ((>0).T.length) $ fmap (T.filter printOrSpace) $ take (fromJust semiColonLineNumber) restFile - --dataBlock = filter ((>0).T.length) $ fmap (T.filter C.isPrint) $ take (fromJust semiColonLineNumber) restFile - charInfoBlock = filter (/= T.pack ";") $ filter ((>0).T.length) $ tail $ drop (fromJust semiColonLineNumber) restFile - numDataLines = length dataBlock - -- (interleaveNumber, interleaveRemainder) = numDataLines `quotRem` numTax - (_, interleaveRemainder) = numDataLines `quotRem` numTax - in - -- trace (show dataBlock ++ "\n" ++ show (interleaveNumber, interleaveRemainder, numDataLines, numTax)) ( - if interleaveRemainder /= 0 then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--number of taxa mis-specified or interleaved format error ") - else - let sortedData = glueInterleave fileName dataBlock numTax numChar [] - charNumberList = fmap (length . snd) sortedData - nameLengthList = zip (fmap fst sortedData) charNumberList - incorrectLengthList = filter ((/= numChar).snd) nameLengthList - (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals sortedData - renamedDefaultCharInfo = renameTNTChars fileName 0 (replicate numChar defaultTNTCharInfo) - charInfoData = getTNTCharInfo fileName numChar renamedDefaultCharInfo charInfoBlock - checkInfo = length charInfoData == numChar - in - -- trace ("Sorted data:" ++ show sortedData) ( - --trace ("Alph2 " ++ (show $ fmap alphabet charInfoData)) ( - if not checkInfo then error ("Character information number not equal to input character number: " ++ show numChar ++ " v " ++ show (length charInfoData)) - else if not $ null incorrectLengthList then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has terminals with incorrect or varying numbers of characters (should be " - ++ show numChar ++ "):" ++ show incorrectLengthList) - else if hasDupTerminals then errorWithoutStackTrace ("\tInput file " ++ fileName ++ " has duplicate terminals: " ++ show dupList) - else - -- check non-Additive alphabet to be numbers - -- integerize and reweight additive chars (including in ambiguities) - let curNames = fmap ((T.filter (/= '"') . T.filter C.isPrint) . fst) sortedData - curData = fmap snd sortedData - (curData',charInfoData') = checkAndRecodeCharacterAlphabets fileName curData charInfoData [] [] - in - -- trace (show (curNames, curData')) - (zip curNames curData',charInfoData') - ) -- ) ) - where printOrSpace a = (C.isPrint a || C.isSpace a) && (a /= '\r') - --- | removeNCharNTax removes teh first two "words" of nchar and ntax, but leaves text with line feeds so can use --- lines later -removeNCharNTax :: T.Text -> (T.Text, T.Text, T.Text) -removeNCharNTax inText = - let noLeadingSpaces = T.dropWhile (not . C.isDigit) inText - nCharacters = T.takeWhile C.isDigit noLeadingSpaces - remainder = T.dropWhile C.isDigit noLeadingSpaces - noLeadingSpaces' = T.dropWhile (not . C.isDigit) remainder - nTaxa = T.takeWhile C.isDigit noLeadingSpaces' - remainder' = T.dropWhile C.isDigit noLeadingSpaces' - in - (nCharacters, nTaxa, remainder') - --- | glueInterleave takes interleves lines and puts them together with name error checking based on number of taxa --- needs to be more robust on detecting multichar blocks--now if only a single multicahr in a block would think ---its a regular block -glueInterleave :: String -> [T.Text] -> Int -> Int -> [(T.Text, [String])] -> [TermData] -glueInterleave fileName lineList numTax numChars curData - | null lineList = - -- recheck num taxa - -- check chars after process due to ambiguities - if length curData /= numTax then error ("Error in glueInterleave: final taxon number error: " ++ show numTax ++ " vs. " ++ show (length curData)) - else - let nameList = fmap T.strip $ fmap fst curData - charShortTextList = fmap (fmap ST.fromString . snd) curData - in - --trace ((show $ length curData) ++ " " ++ (show $ length $ snd $ head curData)) - zip nameList charShortTextList - | length lineList < numTax = error "Error in glueInterleave: line number error" - | otherwise = - let thisDataBlock = T.words <$> take numTax lineList - blockNames = fmap head thisDataBlock - -- if two words then regular TNT without space, otherwise spaces between states - blockStrings = if length (head thisDataBlock) == 2 then fmap (((collectAmbiguities fileName . fmap (:[])) . T.unpack) . last) thisDataBlock - else fmap ((collectMultiCharAmbiguities fileName . fmap T.unpack) . tail) thisDataBlock - canonicalNames = if not (null curData) then fmap fst curData - else blockNames - canonicalStrings = fmap snd curData - in - -- trace ("GIL: " ++ show blockNames ++ "\n" ++ show canonicalNames ++ "\n" ++ show blockStrings ++ "\n" ++ show canonicalStrings) ( - --check for taxon order - --trace ("Words:" ++ (show $ length $ head thisDataBlock)) ( - if blockNames /= canonicalNames then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ "processing error--interleaved taxon order error or mispecified number of taxa") - else - let newChars = if not (null curData) then zipWith (++) canonicalStrings blockStrings - else blockStrings - in - glueInterleave fileName (drop numTax lineList) numTax numChars (zip canonicalNames newChars) - -- ) - --- | collectMultiCharAmbiguities take a list of Strings and collects TNT ambiguities [X Y] into single Strings --- this only for multicharacter TNT characters as opposed to collectAmbiguities -collectMultiCharAmbiguities :: String -> [String] -> [String] -collectMultiCharAmbiguities fileName inStringList = - if null inStringList then [] - else - let firstString = head inStringList - in - -- might be better to check for head and last [] for better error processing - if ('[' `elem` firstString) && (']' `elem` firstString) then - if '-' `elem` firstString then - let firstPart = takeWhile (/= '-') firstString - secondPart = tail $ dropWhile (/= '-') firstString - in - concat [firstPart, " ", secondPart] : collectMultiCharAmbiguities fileName (tail inStringList) - else errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error: ambiguity format problem no ambiguity or range '-' in : " ++ firstString) - else if ']' `elem` firstString then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error: ambiguity format problem naked ']' in : " ++ firstString) - else if '[' `elem` firstString then - let firstStringList = takeWhile (']' `notElem`) inStringList - ambiguityStringList = firstStringList ++ [inStringList !! max 0 (length firstStringList)] - in - --trace (show firstStringList ++ show ambiguityStringList ++ show (head $ drop (length firstStringList) inStringList)) - unwords ambiguityStringList : collectMultiCharAmbiguities fileName (drop (length ambiguityStringList) inStringList) - else firstString : collectMultiCharAmbiguities fileName (tail inStringList) - --) - --- | collectAmbiguities take a list of Strings and collects TNT ambiguities [XY] --- into single Strings --- this only for single 'block' TNT characters where states are a single character -collectAmbiguities :: String -> [String] -> [String] -collectAmbiguities fileName inStringList = - if null inStringList then [] - else - let firstString = head inStringList - in - --trace ("CA " ++ firstString) ( - if firstString == "]" then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error: ambiguity format problem naked ']' in : " ++ firstString) - else if firstString == "[" then - let ambiguityStringList = takeWhile (/="]") inStringList ++ ["]"] - in - --trace ("CA:" ++ (concat ambiguityStringList)) -- ++ " " ++ concat (drop (length $ concat ambiguityStringList) inStringList)) - (concat ambiguityStringList) : collectAmbiguities fileName (drop (length $ concat ambiguityStringList) inStringList) - else firstString : collectAmbiguities fileName (tail inStringList) - -- ) - - --- | defaultTNTCharInfo default values for TNT characters -defaultTNTCharInfo :: CharInfo -defaultTNTCharInfo = emptyCharInfo { charType = NonAdd - , activity = True - , weight = 1.0 - , costMatrix = SM.empty - , name = T.empty - , alphabet = fromSymbolsWOGap [ST.fromString "0"] -- fromSymbols [] - , prealigned = True - , slimTCM = FAC.genDiscreteDenseOfDimension (0 :: Word) - , wideTCM = snd $ metricRepresentation <$> TCM.fromRows [[0::Word]] - , hugeTCM = snd $ metricRepresentation <$> TCM.fromRows [[0::Word]] - , origInfo = V.singleton (T.empty, NonAdd, fromSymbolsWOGap []) -- fromSymbols []) - } - --- | renameTNTChars creates a unique name for each character from fileNamer:Number -renameTNTChars :: String -> Int -> [CharInfo] -> [CharInfo] -renameTNTChars fileName charIndex inCharInfo = - if null inCharInfo then [] - else - let newName = T.pack $ filter (/= ' ') fileName ++ "#" ++ show charIndex - firstCharInfo = head inCharInfo - localCharInfo = firstCharInfo {name = newName} - in - localCharInfo : renameTNTChars fileName (charIndex + 1) (tail inCharInfo) - - --- | getTNTCharInfo numChar charInfoBlock --- bit of but needs to update as it goes along -getTNTCharInfo :: String -> Int -> [CharInfo] -> [T.Text] -> [CharInfo] -getTNTCharInfo fileName charNumber curCharInfo inLines = - if null inLines then curCharInfo - else - let firstLine' = T.strip $ head inLines - multipleCommandsInLine = fmap ((T.reverse . T.cons ';') . T.reverse) (filter ((>0).T.length) $ T.strip <$> T.splitOn (T.pack ";") firstLine') - firstLine = head multipleCommandsInLine - - in - -- trace (show multipleCommandsInLine) ( - if T.null firstLine then getTNTCharInfo fileName charNumber curCharInfo (tail inLines) - -- hit 'proc /;' line at end - else if T.head firstLine == 'p' then curCharInfo - else if T.last firstLine /= ';' then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " processing error--ccode/costs lines must end with semicolon ';': " ++ T.unpack firstLine) - else -- have a valid line - let wordList = T.words $ T.init firstLine - command2 = T.toLower $ T.take 2 $ head wordList - localCharInfo = if command2 == T.pack "cc" then getCCodes fileName charNumber (tail wordList) curCharInfo - else curCharInfo - localCharInfo' = if command2 == T.pack "co" then getCosts fileName charNumber (tail wordList) localCharInfo - else localCharInfo - - in - if (command2 /= T.pack "cc") && (command2 /= T.pack "co") then - trace ("\n\nWarning: TNT input file " ++ fileName ++ " unrecognized/not implemented command ignored : " ++ T.unpack firstLine) - getTNTCharInfo fileName charNumber curCharInfo (tail multipleCommandsInLine ++ tail inLines) - else getTNTCharInfo fileName charNumber localCharInfo' (tail multipleCommandsInLine ++ tail inLines) - -- ) - --- | ccodeChars are the TNT ccode control characters -ccodeChars :: [Char] -ccodeChars = ['+', '-', '[', ']', '(', ')', '/'] - --- | getCCodes takes a line from TNT and modifies characters according to cc-code option --- assumes single command (ccodeChars) per line --- could sort and num so only hit each char once--but would be n^2 then. --- the singleton stuff for compount things like "+." -getCCodes :: String -> Int -> [T.Text] -> [CharInfo] -> [CharInfo] -getCCodes fileName charNumber commandWordList curCharInfo = - if null curCharInfo then [] - else - let charStatus = if T.length (head commandWordList) == 1 then head commandWordList - else T.singleton $ T.head $ head commandWordList - scopeList = if T.length (head commandWordList) == 1 then tail commandWordList - --not a weight--weight gets added to scope without special case - else if (T.head $ head commandWordList) /= '/' then - T.tail (head commandWordList) : tail commandWordList - --a weight '/'--weight gets added to scope without special case - else - (T.unwords $ tail $ T.words $ T.tail (head commandWordList)) : tail commandWordList - charIndices = L.nub $ L.sort $ concatMap (scopeToIndex fileName charNumber) scopeList - updatedCharInfo = getNewCharInfo fileName curCharInfo charStatus (head commandWordList) charIndices 0 [] - in - --trace (show charStatus ++ " " ++ (show scopeList) ++ " " ++ show charIndices) - --if T.length charStatus > 1 then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " character status processing error: option not recognized/implemented " ++ (T.unpack charStatus)) - --else - updatedCharInfo - - --- | getCosts takes a line from TNT and modifies characters according to cc-code option --- command format : costs A.B = X/Y Z U>V Q; --- assumes X/Y and U>V have no sapces (= one word) -getCosts :: String -> Int -> [T.Text] -> [CharInfo] -> [CharInfo] -getCosts fileName charNumber commandWordList curCharInfo = - --trace ("Costs " ++ show commandWordList) ( - if null curCharInfo then [] - else - let scopeList = takeWhile (/= T.pack "=") commandWordList - charIndices = L.nub $ L.sort $ concatMap (scopeToIndex fileName charNumber) scopeList - (localAlphabet, localMatrix) = processCostsLine fileName $ tail $ dropWhile (/= T.pack "=") commandWordList - updatedCharInfo = newCharInfoMatrix curCharInfo localAlphabet localMatrix charIndices 0 [] - in - --trace ("Alph " ++ (show $ fmap alphabet updatedCharInfo)) - updatedCharInfo - --) - --- | processCostsLine takes the transformation commands of TNT and creates a TCM matrix from that --- does not check for alphabet size or order so sorts states to get matrix --- TNT states (alphabet elements) must be single characters -processCostsLine :: String -> [T.Text] -> ([ST.ShortText],[[Int]]) -processCostsLine fileName wordList = - if null wordList then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " costs processing error: 'costs' command without transfomation costs specified") - else - let localAlphabet = L.sort $ L.nub $ concatMap getAlphabetElements wordList - transCosts = getTransformationCosts fileName localAlphabet wordList - localMatrix = makeMatrix fileName localAlphabet transCosts - in - -- trace ("TNT" ++ show localAlphabet ++ " " ++ show transCosts ++ "\n\t" ++ show localMatrix) - (localAlphabet, localMatrix) - --- | makeMatrix takes triples and makes a square matrix with 0 diagonals if not specified -makeMatrix :: String -> [ST.ShortText]-> [(Int, Int, Int)] -> [[Int]] -makeMatrix fileName localAlphabet transCosts - | null transCosts = [] - | length transCosts < (length localAlphabet * length localAlphabet) - length localAlphabet = - errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " costs processing error: 'costs' command not all pairwise (non-diagnonal) transformation costs specified") - | length transCosts > (length localAlphabet * length localAlphabet) = - errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " costs processing error: 'costs' command too many pairwise transformation costs specified") - | otherwise = - let initialMatrix = replicate (length localAlphabet) $ replicate (length localAlphabet) 0 - newMatrix = SM.toFullLists $ SM.updateMatrix (SM.fromLists initialMatrix) transCosts - in - -- check for uninitialized, non-diagnonal cells--maybe metricity as warning - newMatrix - --- | getTransformationCosts get state pairs and their costs --- retuns as (i,j,k) = i->j costs k --- need to make this gerneral to letter states -getTransformationCosts :: String -> [ST.ShortText]-> [T.Text] -> [(Int, Int, Int)] -getTransformationCosts fileName localAlphabet wordList - | null wordList = [] - | length wordList == 1 = errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " ccode processing error: 'costs' command imporperly formated (transformation and costs in pairs)") - | otherwise = - let -- wordlist must be >= 2 due to above tests - transText = head wordList - costText = wordList !! 1 - transCost = readMaybe (T.unpack costText) :: Maybe Int - directedOperator = T.find (== '>') transText - symetricalOperator = T.find (== '/') transText - in - if isNothing directedOperator && isNothing symetricalOperator then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " ccode processing error: 'costs' command requires '/' or '>': " ++ T.unpack transText) - else - let fromStateText = if isNothing symetricalOperator then T.takeWhile (/= '>') transText - else T.takeWhile (/= '/') transText - toStateText = if isNothing symetricalOperator then T.tail $ T.dropWhile (/= '>') transText - else T.tail $ T.dropWhile (/= '/') transText - fromState = readMaybe (T.unpack fromStateText) :: Maybe Int - toState = readMaybe (T.unpack toStateText) :: Maybe Int - in - if isNothing transCost then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " ccode processing error: 'costs' command transformation " ++ T.unpack costText ++ " does not appear to be an integer.") - else if isJust fromState && isJust toState then - -- states are numerical - let newTripleList = if isNothing symetricalOperator then [(fromJust fromState, fromJust toState, fromJust transCost)] - else [(fromJust fromState, fromJust toState, fromJust transCost), (fromJust toState, fromJust fromState, fromJust transCost)] - in - newTripleList ++ getTransformationCosts fileName localAlphabet (drop 2 wordList) - else - -- states are characters (or multicharacters) - let fromStateIndex = L.elemIndex (ST.fromText $ T.toStrict fromStateText) localAlphabet - toStateIndex = L.elemIndex (ST.fromText $ T.toStrict toStateText) localAlphabet - newTripleList = if isNothing symetricalOperator then [(fromJust fromStateIndex, fromJust toStateIndex, fromJust transCost)] - else [(fromJust fromStateIndex, fromJust toStateIndex, fromJust transCost), (fromJust toStateIndex, fromJust fromStateIndex, fromJust transCost)] - in - if isNothing fromStateIndex then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " ccode processing error: 'costs' command " ++ show (T.unwords wordList) ++ " transformation state " ++ T.unpack fromStateText ++ " was not found in charcater alphabet " ++ show localAlphabet) - else if isNothing toStateIndex then errorWithoutStackTrace ("\n\nTNT input file " ++ fileName ++ " ccode processing error: 'costs' command " ++ show (T.unwords wordList) ++ " transformation state " ++ T.unpack toStateText ++ " was not found in charcater alphabet " ++ show localAlphabet) - else newTripleList ++ getTransformationCosts fileName localAlphabet (drop 2 wordList) - - --- | getAlphabetElements takes Text and returns non '/' '>' elements --- this is for a single "block" as in A1>G2 or C>T -getAlphabetElements :: T.Text -> [ST.ShortText] -getAlphabetElements inText = - if T.null inText then [] - else - let hasForwardSlash = T.find (== '/') inText - hasGreaterThan = T.find (== '>') inText - in - if isNothing hasForwardSlash && isNothing hasGreaterThan then [] - else - let firstSymbol = T.takeWhile (`notElem` ['/','>']) inText - secondSymbol = T.tail $ T.dropWhile (`notElem` ['/','>']) inText - in - [ST.fromText $ T.toStrict firstSymbol, ST.fromText $ T.toStrict secondSymbol] - {- - let symbolList = T.filter (/= '/') $ T.filter (/= '>') inText - in - fmap ST.singleton $ T.unpack symbolList - -} - - --- | scopeToIndex takes the number of characters and converts to a list of indices -scopeToIndex :: String -> Int -> T.Text -> [Int] -scopeToIndex fileName numChars scopeText = - if T.null scopeText then [] - else -- stop will include '.' if present` - let (start, stop) = T.breakOn (T.pack ".") scopeText - in - --trace (show (start, stop)) ( - --single integer index - if not (T.null start) && (T.head start `elem` ccodeChars) then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: ccode '" ++ T.unpack scopeText ++ "' incorrect format. Scope must follow operator ") - else if start == scopeText then - let scopeSingleton = readMaybe (T.unpack scopeText) :: Maybe Int - in - if isNothing scopeSingleton then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: ccode '" ++ T.unpack scopeText ++ "' contains non-integer") - else if fromJust scopeSingleton < numChars then [fromJust scopeSingleton] - else errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: scope greater than char number " ++ show (fromJust scopeSingleton) ++ " > " ++ show (numChars - 1)) - else - let startIndex = if T.null start then Just 0 - else readMaybe (T.unpack start) :: Maybe Int - stopIndex = if stop == T.pack "." then Just (numChars - 1) - else readMaybe (T.unpack $ T.tail stop) :: Maybe Int - in - if isNothing startIndex || isNothing stopIndex then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: ccode '" ++ T.unpack scopeText ++ "' contains non-integer") - else if fromJust startIndex >= numChars then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: scope greater than char number " ++ show (fromJust startIndex) ++ " > " ++ show (numChars - 1)) - else if fromJust stopIndex >= numChars then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: scope greater than char number " ++ show (fromJust stopIndex) ++ " > " ++ show (numChars - 1)) - else [(max 0 (fromJust startIndex))..(min (fromJust stopIndex) numChars)] - --) - --- | getNewCharInfo updates the a specific list character element --- if that char is not in index list it is unaffected and added back to create the new list --- in a single pass. --- if nothing to do (and nothing done so curCharLIst == []) then return original charInfo --- othewise return the reverse since new values are prepended -getNewCharInfo :: String -> [CharInfo] -> T.Text -> T.Text -> [Int] -> Int -> [CharInfo] -> [CharInfo] -getNewCharInfo fileName inCharList newStatus newStatusFull indexList charIndex curCharList = - --trace (show charIndex ++ " " ++ show indexList ++ " " ++ (show $ length inCharList)) ( - if null inCharList then reverse curCharList - else if null indexList then - if null curCharList then inCharList - else - reverse curCharList ++ inCharList - else - let firstIndex = head indexList - firstCharInfo = head inCharList - in - if charIndex /= firstIndex then getNewCharInfo fileName (tail inCharList) newStatus newStatusFull indexList (charIndex + 1) (firstCharInfo : curCharList) - else - let updatedCharInfo - | newStatus == T.pack "-" = firstCharInfo {charType = NonAdd} - | newStatus == T.pack "+" = firstCharInfo {charType = Add} - | newStatus == T.pack "[" = firstCharInfo {activity = True} - | newStatus == T.pack "]" = firstCharInfo {activity = False} - | newStatus == T.pack "(" = firstCharInfo {charType = Matrix} - | newStatus == T.pack ")" = firstCharInfo {charType = NonAdd} - -- | newStatus == T.pack "/" = firstCharInfo {weight = 1.0} - | T.head newStatus == '/' = - let newWeight = readMaybe (tail $ T.unpack newStatusFull) :: Maybe Double - in - if isNothing newWeight then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: weight " ++ tail (T.unpack newStatusFull) ++ " not a double") - else firstCharInfo {weight = fromJust newWeight} - | otherwise = - trace ("Warning: TNT file " ++ fileName ++ " ccodes command " ++ T.unpack newStatus ++ " is unrecognized/not implemented--skipping") - firstCharInfo - in - getNewCharInfo fileName (tail inCharList) newStatus newStatusFull (tail indexList) (charIndex + 1) (updatedCharInfo : curCharList) - - --- | newCharInfoMatrix updates alphabet and tcm matrix for characters in indexList --- if that character is not in index list it is unaffected and added back to create the new list --- in a single pass. --- if nothing to do (and nothing done so curCharLIst == []) then return original charInfo --- othewise retiurn the reverse since new values are prepended -newCharInfoMatrix :: [CharInfo] -> [ST.ShortText] -> [[Int]] -> [Int] -> Int -> [CharInfo] -> [CharInfo] -newCharInfoMatrix inCharList localAlphabet localMatrix indexList charIndex curCharList = - --trace (show charIndex ++ " " ++ show indexList ++ " " ++ (show $ length inCharList)) ( - if null inCharList then reverse curCharList - else if null indexList then - if null curCharList then inCharList - else - reverse curCharList ++ inCharList - else - let firstIndex = head indexList - firstCharInfo = head inCharList - in - if charIndex /= firstIndex then newCharInfoMatrix (tail inCharList) localAlphabet localMatrix indexList (charIndex + 1) (firstCharInfo : curCharList) - else - --let updatedCharInfo = firstCharInfo {alphabet = fromSymbols localAlphabet, costMatrix = SM.fromLists localMatrix} - let updatedCharInfo = firstCharInfo {alphabet = fromSymbolsWOGap localAlphabet, costMatrix = SM.fromLists localMatrix} - in - -- trace ("TNT2" ++ (show $ alphabet updatedCharInfo)) - newCharInfoMatrix (tail inCharList) localAlphabet localMatrix (tail indexList) (charIndex + 1) (updatedCharInfo : curCharList) - --- | reconcileAlphabetAndCostMatrix trakes the original charcater alphabet created from the cost matrix and compares to the --- observed states. If observed states are a subset of the inferred, the inferred are used to replace the original --- this could happen if a matrix is specified for arange of characters, some of which do not exhibit all the states --- otherwsie an error is thrown since states don't agree with m,artrix specification --- this could happen for a DNA character (ACGT-) with a martix specified of numerical values (01234) -reconcileAlphabetAndCostMatrix - :: ( Ord s - , Show s - ) - => String - -> String - -> Alphabet s -> Alphabet s -> Alphabet s -reconcileAlphabetAndCostMatrix fileName charName observedAlphabet inferredAlphabet - | observedAlphabet `isAlphabetSubsetOf` inferredAlphabet = inferredAlphabet - | otherwise = errorWithoutStackTrace $ fold - [ "Error: TNT file " - , fileName - , " character number " - , (tail $ dropWhile (/= '#') charName), " Observed " - , show observedAlphabet - , " is incompatible with matrix specification states " - , show inferredAlphabet - ] - --- checks is observed alphabet is sub set of inferred (ie that from cost matrix string) --- this can happen if a general DNA cost is sspecified for many characters, but some --- characters may have only a few of the states. -isAlphabetSubsetOf :: (Show s, Ord s) => Alphabet s -> Alphabet s -> Bool -isAlphabetSubsetOf specialAlphabet queryAlphabet = - let querySet = alphabetSymbols queryAlphabet - specialSet = alphabetSymbols specialAlphabet - in not $ querySet `Set.isSubsetOf` specialSet - -{- --- this seems to produce backwards logic -isAlphabetSubsetOf' :: Ord s => Alphabet s -> Alphabet s -> Bool -isAlphabetSubsetOf' specialAlphabet queryAlphabet = - let querySet = alphabetSymbols queryAlphabet - specialSet = alphabetSymbols specialAlphabet - in trace ("RACM: " ++ (show $ querySet `Set.isSubsetOf` specialSet)) querySet `Set.isSubsetOf` specialSet --} - --- | checkAndRecodeCharacterAlphabets take RawData and checks the data with char info. --- verifies that states (including in ambiguity) are Numerical for additive, and checks alphabets and cost matrices --- and assigns correct alphabet to all characters -checkAndRecodeCharacterAlphabets :: String -> [[ST.ShortText]] -> [CharInfo] -> [[ST.ShortText]] -> [CharInfo] -> ([[ST.ShortText]], [CharInfo]) -checkAndRecodeCharacterAlphabets fileName inData inCharInfo newData newCharInfo - | null inCharInfo && null newCharInfo = error "Empty inCharInfo on input in checkAndRecodeCharacterAlphabets" - | null inData = error "Empty inData in checkAndRecodeCharacterAlphabets" - | null inCharInfo = - --trace (show $ L.transpose newData) - -- (reverse $ fmap reverse newData, reverse newCharInfo) - (L.transpose $ reverse newData, reverse newCharInfo) - | otherwise = - let firstColumn = fmap head inData - firstCharInfo = head inCharInfo - originalAlphabet = alphabet firstCharInfo - thisName = name firstCharInfo - (firstAlphabet, newWeight, newColumn) = getAlphabetFromSTList fileName firstColumn firstCharInfo - updatedCharInfo = if (Matrix == charType firstCharInfo) && (firstAlphabet /= originalAlphabet) then - firstCharInfo {alphabet = reconcileAlphabetAndCostMatrix fileName (T.unpack thisName) firstAlphabet originalAlphabet} - else firstCharInfo {alphabet = firstAlphabet, weight = newWeight} - in - -- checkAndRecodeCharacterAlphabets fileName (fmap tail inData) (tail inCharInfo) (prependColumn newColumn newData []) (updatedCharInfo : newCharInfo) - checkAndRecodeCharacterAlphabets fileName (fmap tail inData) (tail inCharInfo) (newColumn : newData) (updatedCharInfo : newCharInfo) - - --- | getAlphabetFromSTList take a list of ST.ShortText and returns list of unique alphabet elements, --- recodes decimat AB.XYZ to ABXYZ and reweights by that factor 1/1000 for .XYZ 1/10 for .X etc --- checks if char is additive for numerical alphabet -getAlphabetFromSTList :: String -> [ST.ShortText] -> CharInfo -> (Alphabet ST.ShortText, Double, [ST.ShortText]) -getAlphabetFromSTList fileName inStates inCharInfo = - if null inStates then error "Empty column data in getAlphabetFromSTList" - else - let thisType = charType inCharInfo - thisWeight = weight inCharInfo - mostDecimals = if thisType == Add then maximum $ fmap getDecimals inStates - else 0 - (thisAlphabet', newColumn) = getAlphWithAmbiguity fileName inStates thisType mostDecimals [] [] - newWeight = if mostDecimals > 0 then thisWeight / (10.0 ** fromIntegral mostDecimals) - else thisWeight - - thisAlphabet = if null thisAlphabet' then [ST.fromString "0"] - else thisAlphabet' - in - --trace (show (thisAlphabet, newWeight, newColumn, mostDecimals)) - -- (fromSymbols thisAlphabet, newWeight, newColumn) - -- trace ("getAlph weight: " ++ (show thisWeight)) - (fromSymbolsWOGap thisAlphabet, newWeight, newColumn) - - --- | getDecimals tkase a state ShortText and return number decimals--if ambiguous then the most of range -getDecimals :: ST.ShortText -> Int -getDecimals inChar = - if ST.null inChar then 0 - else - let inCharText = ST.toString inChar - in - -- trace (inCharText) ( - if '.' `notElem` inCharText then 0 - else if head inCharText /= '[' then - -- no ambiguity - length (dropWhile (/= '.') inCharText) - 1 - else - -- has ambiguity (range) - let rangeStringList = words $ filter (`notElem` ['[',']']) inCharText - fstChar = if '.' `elem` head rangeStringList then dropWhile (/= '.') (head rangeStringList) - else ['.'] - sndChar = if '.' `elem` last rangeStringList then dropWhile (/= '.') (last rangeStringList) - else ['.'] - in - -- trace (fstChar ++ " " ++ sndChar) - max (length fstChar) (length sndChar) - 1 - --) - - --- | getAlphWithAmbiguity take a list of ShortText with information and accumulatiors --- For both nonadditive and additive. Searches for [] to denote ambiguity and splits states --- if splits on spaces if there are spaces (within []) (ala fastc or multicharacter states) --- else if no spaces --- if non-additive then each symbol is split out as an alphabet element -- as in TNT --- if is additive splits on '-' to denote range --- rescales (integerizes later) additive characters with decimal places to an integer type rep --- for additive characters if states are not nummerical then throws an error -getAlphWithAmbiguity :: String -> [ST.ShortText] -> CharType -> Int -> [ST.ShortText] -> [ST.ShortText] -> ([ST.ShortText], [ST.ShortText]) -getAlphWithAmbiguity fileName inStates thisType mostDecimals newAlph newStates = - if null inStates then (L.sort $ L.nub newAlph, reverse newStates) - else - let firstState = ST.toString $ head inStates - in - if thisType /= Add then - if head firstState /= '[' then - if firstState `elem` ["?", "-"] then getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals newAlph (head inStates : newStates) - else getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (head inStates : newAlph) (head inStates : newStates) - else -- ambiguity - let newAmbigStates = fmap ST.fromString $ words $ filter (`notElem` ['[',']']) firstState - in - getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (newAmbigStates ++ newAlph) (head inStates : newStates) - - else -- additive character - let scaleFactor = 1.0 / (10.0 ** fromIntegral mostDecimals) - in - if head firstState /= '[' then - if mostDecimals == 0 then getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (head inStates : newAlph) (head inStates : newStates) - else - let stateNumber = readMaybe firstState :: Maybe Double - newStateNumber = takeWhile (/='.') $ show (fromJust stateNumber / scaleFactor) - in - if isNothing stateNumber then - if firstState `elem` ["?", "-"] then getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (ST.fromString "-1" : newAlph) (ST.fromString "-1" : newStates) - else errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: Additive character not a number (Int/Float) " ++ firstState) - else getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (ST.fromString newStateNumber : newAlph) (ST.fromString newStateNumber : newStates) - else - -- trace ("GAlphAmb: " ++ (show firstState)) ( - let hasDecimal = any (== '.') firstState - gutsList = if hasDecimal then words $ filter (`notElem` ['[',']']) firstState - else fmap (:[]) $ filter (`notElem` ['[',']']) firstState - newStateNumberList = fmap readMaybe gutsList :: [Maybe Double] - newStateNumberStringList = fmap (((takeWhile (/='.') . show) . (/ scaleFactor)) . fromJust) newStateNumberList - in - if Nothing `elem` newStateNumberList then errorWithoutStackTrace ("\n\nTNT file " ++ fileName ++ " ccode processing error: Additive character not a number (Int/Float) " ++ firstState) - else - let newAmbigState = if hasDecimal then ST.fromString $ '[' : unwords newStateNumberStringList ++ "]" - else ST.fromString $ '[' : (concat newStateNumberStringList) ++ "]" - in - getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (fmap ST.fromString newStateNumberStringList ++ newAlph) (newAmbigState : newStates) - -- ) - - - diff --git a/pkg/PhyGraph/LICENSE b/pkg/PhyGraph/LICENSE deleted file mode 120000 index abdaf5ea7..000000000 --- a/pkg/PhyGraph/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../doc/LICENSE \ No newline at end of file diff --git a/pkg/PhyGraph/PhyGraph.cabal b/pkg/PhyGraph/PhyGraph.cabal deleted file mode 100644 index 81521eec5..000000000 --- a/pkg/PhyGraph/PhyGraph.cabal +++ /dev/null @@ -1,591 +0,0 @@ -Cabal-Version: 3.0 -Name: PhyGraph -Version: 0.1.0 -Stability: Alpha -Build-Type: Simple -Tested-With: - GHC == 9.2.2 - -Author: Ward Wheeler -Copyright: (c) 2015-2022 Ward Wheeler -License: BSD-3-Clause -License-File: LICENSE - -Maintainer: Ward Wheeler -Homepage: https://github.com/wardwheeler/PhyGraph#readme -Bug-Reports: https://github.com/wardwheeler/PhyGraph/issues - -Extra-Doc-Files: - CHANGELOG.md - README.md - -Extra-Source-Files: - ffi/external-direct-optimization/*.h - - --- Global deviations from Haskell98 -common ffi-build-info - - cc-options: - -rdynamic --std=c11 - - -- Here we list all directories that contain C & C++ header files that the FFI - -- tools will need to locate when preprocessing the C files. Without listing - -- the directories containing the C header files here, the FFI preprocessor - -- (hsc2hs, c2hs, etc.) will fail to locate the requisite files. Note also, - -- that the parent directory of the nessicary C & C++ header files must be - -- specified. The preprocessor will not recursively look in subdirectories for - -- header files! - include-dirs: - ffi/external-direct-optimization - - -- Specify the header files as required source files here. - -- Do not specify them in the c-sources or cxx-sources stanzas. - -- This is required for sdist and install commands to work correctly. - Includes: - alignCharacters.h - alignmentMatrices.h - c_alignment_interface.h - c_code_alloc_setup.h - costMatrix.h - debug_constants.h - dyn_character.h - ukkCheckPoint.h - ukkCommon.h - - c-sources: - ffi/external-direct-optimization/alignCharacters.c - ffi/external-direct-optimization/alignmentMatrices.c - ffi/external-direct-optimization/c_alignment_interface.c - ffi/external-direct-optimization/c_code_alloc_setup.c - ffi/external-direct-optimization/costMatrix.c - ffi/external-direct-optimization/dyn_character.c - ffi/external-direct-optimization/ukkCheckPoint.c - ffi/external-direct-optimization/ukkCommon.c - - --- Global deviations from Haskell98 -common language-specs - - -- Always use MonadFail(fail), not Monad(fail) - other-extensions: - DerivingStrategies - - --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --- --- Preamble of package description and reusable definitions from above has ended --- and below we list all build targets of the package. Build targets include: --- --- * Primary executable(s) for public usage --- --- * Exposed sub-libraries for public consumption as program dependancies --- --- * Benchmarks for executables and sub-libraries --- --- * Test-suites for executables and sub-libraries --- --- * Additional executables for non-public, diagnostic purposes --- --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - - --- Phylogenetic Graphs --- --- This is the main program of this package, supporting fully featured --- phylogenetic trees, networks, and forests. -executable phyg - - import: - ffi-build-info, - language-specs - - main-is: - phygraph.hs - - if !os(darwin) - ghc-options: - -Wall - -Wincomplete-patterns - -threaded - -feager-blackholing - -with-rtsopts=-N - -O2 - -rtsopts --- -static --- -fllvm - else - ghc-options: - -Wall - -Wincomplete-patterns - -threaded - -feager-blackholing - -with-rtsopts=-N - -O2 - -rtsopts --- -static - -- problems getting llvm to function in OSX (can't find opt) - -- -fllvm - - --- if !os(darwin) --- cc-options: --- -static --- else --- cc-options: - - if !os(darwin) - ld-options: --- -static - -pthread --- else --- ld-options: - - - build-depends: - alphabet, - dynamic-character, - PHAGE-timing, - PhyloLib, - tcm, - base >=4.10, - containers >=0.6, - graphviz >=2999.20, - fgl, - text, - parallel, - deepseq, - bv, - -- exceptions, - array, - split, - text-short, - random, - hashable, - time, - vector, - sort, - -- unboxing-vector, - MissingH >= 1.4.3, - bv-little, - -- bits, - -- hashmap, - bimap, - logfloat, - random-shuffle, - -- currently doesn't compile but could be useful,, - -- lots of good functions, - -- Graphalyze, - async, - process, - directory, - inflist, - - - - default-extensions: BangPatterns - - default-language: Haskell2010 - - -- include the source files needed in src for github library - -- this for local library files - hs-source-dirs: . - - other-modules: - Commands.ProcessCommands - Commands.CommandExecution - Commands.Verify - Debug.Debug - Graphs.GraphOperations - GraphOptimization.Medians - GraphOptimization.PostOrderFunctions - GraphOptimization.PostOrderSoftWiredFunctions - GraphOptimization.PreOrderFunctions - GraphOptimization.Traversals - Input.BitPack - Input.DataTransformation - Input.FastAC - Input.ReadInputFiles - Input.Reorganize - Input.TNTUtilities - Search.Build - Search.DistanceMethods - Search.DistanceWagner - Search.Fuse - Search.GeneticAlgorithm - Search.NetworkAddDelete - Search.Refinement - Search.Search - Search.Swap - Search.SwapMaster - Search.WagnerBuild - Support.Support - Commands.Transform - Types.DistanceTypes - Types.Types - Utilities.Distances - Utilities.DistanceUtilities - Utilities.LocalGraph - Utilities.LocalSequence - Utilities.TcmHash - Utilities.ThreeWayFunctions - Utilities.Utilities - Reconciliation.Adams - Reconciliation.Eun - Reconciliation.ReconcileGraphs - - --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --- --- Collection of sub-libraries which both, compose the package's primary program --- (PCG/PhyG), and are also exposed for external consumption by other programs. --- --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - - -library alphabet - - import: - language-specs - - default-language: - Haskell2010 - - hs-source-dirs: - lib/alphabet/src - - build-depends: --- serialize, - utility, - base >= 4.11 && < 5.0, - bimap >= 0.3 && < 1.0, - binary >= 0.8 && < 1.0, - containers >= 0.6.2 && < 1.0, - deepseq >= 1.4 && < 2.0, - keys >= 3.12 && < 4.0, - mtl >= 2.2.2 && < 3.0, - QuickCheck >= 2.14 && < 3.0, - semigroupoids >= 5.3 && < 5.4, - text-short >= 0.1.3 && < 1.0, - transformers >= 0.5.6 && < 1.0, - vector >= 0.12.0.3 && < 0.13, - - -- Apparently this is needed to compile with profiling, which is really stupid. - -- It should only be in the modules which use TemplateHaskell, not all modules. - -- I consider this a temporary hack to get things to compile with profiling. - other-extensions: TemplateHaskell - - exposed-modules: - Data.Alphabet - Data.Alphabet.Codec - Data.Alphabet.IUPAC - Data.Alphabet.Special - - other-modules: - Data.Alphabet.Internal - - --- Library for performing string alignment on dynamic characters. - --- Provides various metrics for scoring static characters and --- performing string alignment on dynamic characters. -library dynamic-character - - import: - ffi-build-info, - language-specs - - default-language: - Haskell2010 - - hs-source-dirs: - lib/dynamic-character/src - - build-depends: - alphabet, - tcm, - utility, - base >= 4.11 && < 5.0, - bv-little, - containers >= 0.6.2 && < 1.0, - matrices >= 0.5 && < 1.0, - monad-loops >= 0.4 && < 1.0, - primitive >= 0.7.1 && < 1.0, - vector >= 0.12.0.3 && < 0.13, - - exposed-modules: - Bio.DynamicCharacter - Bio.DynamicCharacter.Measure - Bio.DynamicCharacter.HandleGaps - DirectOptimization.Pairwise - DirectOptimization.Pairwise.Visualization - DirectOptimization.Pairwise.Swapping - DirectOptimization.Pairwise.Ukkonen - DirectOptimization.PreOrder - - other-modules: - DirectOptimization.Pairwise.Direction - DirectOptimization.Pairwise.Huge - DirectOptimization.Pairwise.Internal - DirectOptimization.Pairwise.Slim - DirectOptimization.Pairwise.Slim.FFI - DirectOptimization.Pairwise.Wide - - --- Library for working with TCMs and SCMs in various representations. - --- General purpose library for working with transition cost matrices (TCMs) --- and symbol change matrices (SCMs). Specialization options are provided --- for the discrete metric (non-additive) and the L1 norm (additive) TCMs & --- SCMs. Exposes a memoized binding for sparsely indexed, large TCMs. - -library tcm - - import: - ffi-build-info, - language-specs - - default-language: - Haskell2010 - - hs-source-dirs: - lib/tcm/src - - build-depends: - utility, - binary >= 0.8 && < 1.0, - base >= 4.11 && < 5.0, - containers >= 0.6.2 && < 1.0, - deepseq >= 1.4 && < 2.0, - hashable >= 1.3 && < 2.0, - hashtables >= 1.2 && < 2.0, - QuickCheck >= 2.14 && < 3.0, - mono-traversable >= 1.0 && < 2.0, - semigroupoids >= 5.3 && < 5.4, - vector >= 0.12.0.3 && < 0.13, - vector-binary-instances >= 0.2.5 && < 1.0, - - exposed-modules: - Data.Hashable.Memoize - Data.MetricRepresentation - Data.TCM - Data.TCM.Dense - Data.TCM.Overlap - - other-modules: - Data.TCM.Dense.FFI - Data.TCM.Internal - - -- Includes: (Omitted) - -- costMatrix_2d.hpp - -- costMatrix_3d.hpp - -- costMatrixWrapper_2d.h - -- costMatrixWrapper_3d.h - -- costMatrixWrapper.h - -- dynamicCharacterOperations.h - - -- Include-Dirs: (Omitted) - -- ffi/memoized-tcm - - - - --- A binding to a C++ hashtable for thread-safe memoization. - --- This package is designed to provide a thread safe binding to a "pure" --- memoization of two-way and three-way Sankoff character cost and median --- computations. - ---library tcm-memo --- --- import: --- ffi-buildinfo, ----- language-specs, --- --- default-language: --- Haskell2010 --- --- hs-source-dirs: --- lib/tcm-memo/src --- --- cc-options: --std=c11 --- --- cxx-options: --std=c++14 --- --- -- This library is required for linking to the C++ standard template library. --- if os(darwin) --- extra-libraries: c++ --- else --- extra-libraries: stdc++ --- --- hs-source-dirs: lib/tcm-memo/ffi --- --- c-sources: --- ffi/memoized-tcm/costMatrixWrapper.c --- ffi/memoized-tcm/dynamicCharacterOperations.c --- --- cxx-sources: --- ffi/memoized-tcm/costMatrix_2d.cpp --- ffi/memoized-tcm/costMatrix_3d.cpp --- --- -- Here we list all directories that contain C & C++ header files that the FFI --- -- tools will need to locate when preprocessing the C files. Without listing --- -- the directories containing the C header files here, the FFI preprocessor --- -- (hsc2hs, c2hs, etc.) will fail to locate the requisite files. Note also, --- -- that the parent directory of the nessicary C & C++ header files must be --- -- specified. The preprocessor will not recursively look in subdirectories for --- -- header files! --- include-dirs: --- ffi/memoized-tcm --- --- build-depends: --- exportable, --- base >= 4.11 && < 5.0, --- deepseq >= 1.4 && < 2.0, --- QuickCheck >= 2.14 && < 3.0, --- --- exposed-modules: --- Data.TCM.Memoized --- Data.TCM.Memoized.Types --- Data.TCM.Memoized.FFI --- --- other-modules: --- -- Data.TCM.Memoized.FFI - - --- Collection of utility functions and data structures - --- Defines custom data structures for special use cases, more abstract functions --- that base provides, and re-exported correcting to deficient libraries. - -library utility - - import: - language-specs - - default-language: - Haskell2010 - - hs-source-dirs: - lib/utility/src - - build-depends: - base >= 4.11 && < 5.0, - binary >= 0.8 && < 1.0, - bv-little >= 1.0.1 && < 2.0, - bv-little:instances >= 1.0.1 && < 2.0, - bytestring >= 0.10.10 && < 1.0, - containers >= 0.6.2 && < 1.0, - deepseq >= 1.4 && < 2.0, - foldl >= 1.4 && < 2.0, - hashable >= 1.3 && < 2.0, - keys >= 3.12 && < 4.0, - lens >= 4.18 && < 6.0, - matrix >= 0.3.6 && < 0.4, - mono-traversable >= 1.0 && < 2.0, - parallel >= 3.2 && < 4.0, - pointed >= 5.0 && < 6.0, - QuickCheck >= 2.14 && < 3.0, - semigroupoids >= 5.3 && < 5.4, - tasty-hunit >= 0.10 && < 1.0, - text-short >= 0.1.3 && < 1.0, - vector >= 0.12.0.3 && < 0.13, - vector-binary-instances >= 0.2 && < 1.0, - vector-instances >= 3.4 && < 3.5, - - if impl(ghc < 9.0) - build-depends: - integer-gmp >= 1.0.2 && < 2.0 - - - exposed-modules: - Data.List.Utility - Data.Matrix.NotStupid - Data.Vector.NonEmpty - - --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --- --- Test-suites of the sub-libraries which compose PCG/PhyG --- --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - -test-suite test-dynamic-character - - import: - language-specs - - default-language: - Haskell2010 - - main-is: - TestSuite.hs - - type: - exitcode-stdio-1.0 - - hs-source-dirs: - lib/dynamic-character/test - - build-depends: - alphabet, - dynamic-character, - tcm, - base >= 4.11 && < 5.0, - bimap >= 0.3 && < 1.0, - containers >= 0.6.2 && < 1.0, - QuickCheck >= 2.14 && < 3.0, - tasty >= 1.4 && < 2.0, - tasty-quickcheck >= 0.10 && < 1.0, - vector >= 0.12.0.3 && < 0.13, - - other-modules: - DirectOptimization.Pairwise.Test - Test.Aligners - Test.QuickCheck.Instances.DynamicCharacter - - --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --- --- Additional executables which exist for the one of a few select purposes: --- --- * Generating data for PCG/PhyG input --- --- * Debugging the component sub-libraries of PCG/PhyG and PCG/PhyG itself --- --- * Performing correctness verification --- --- * Stocastically searching for non-totality counterexamples --- --- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - - -executable inspect-dynamic-character - - import: - language-specs - - default-language: - Haskell2010 - - main-is: - Inspect.hs - - hs-source-dirs: - lib/dynamic-character/test - - build-depends: - alphabet, - dynamic-character, - tcm, - base >= 4.11 && < 5.0, - bimap >= 0.3 && < 1.0, - containers >= 0.6.2 && < 1.0, - QuickCheck >= 2.14 && < 3.0, - tasty >= 1.4 && < 2.0, - tasty-quickcheck >= 0.10 && < 1.0, - vector >= 0.12.0.3 && < 0.13, - - other-modules: - Test.Aligners - Test.QuickCheck.Instances.DynamicCharacter diff --git a/pkg/PhyGraph/README.md b/pkg/PhyGraph/README.md deleted file mode 120000 index 701f84e3d..000000000 --- a/pkg/PhyGraph/README.md +++ /dev/null @@ -1 +0,0 @@ -../../doc/ReadMe/PhyGraph.md \ No newline at end of file diff --git a/pkg/PhyGraph/Reconciliation/Adams.hs b/pkg/PhyGraph/Reconciliation/Adams.hs deleted file mode 100644 index e843c4ebd..000000000 --- a/pkg/PhyGraph/Reconciliation/Adams.hs +++ /dev/null @@ -1,583 +0,0 @@ -{- | -Module : adams.hs -Description : functions to create Adams II consensus trees (unlabelled internal veritices) -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - - --} - - - - -module Reconciliation.Adams (makeAdamsII) where - -import Control.Parallel.Strategies -import qualified Data.Graph.Inductive.Graph as G -import qualified Data.Graph.Inductive.PatriciaTree as P -import qualified Data.List as L -import Data.Maybe -import qualified Data.Set as Set -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import qualified GraphFormatUtilities as PhyP -import qualified ParallelUtilities as PU --- import Debug.Trace - -data VertexType = Root | Internal | Leaf | Network | Tree - deriving (Read, Show, Eq) --NFData ? - -type Vertex = (String, [Int], [Int], VertexType) -- name, child vertices, parent vertices, type (a bit redundant) -type Edge = (Int, Int, Maybe Double) -- terminal vertices (by numbers) and potential length -type PhyloGraphVect = (V.Vector Vertex, V.Vector Edge) -type GenPhyNetNode = (String, [String], [String]) --Make list for both so unresolved network (node name, [descendant name], [ancestor name]) -type GenPhyNet = [GenPhyNetNode] - --- | null PhyloGraphVect -nullGraphVect :: PhyloGraphVect -nullGraphVect = (V.empty, V.empty) - --- | getAdamsIIPair inputs 2 PhyloGraphVects and returns AdamsII consensus -getAdamsIIPair :: PhyloGraphVect -> PhyloGraphVect -> PhyloGraphVect -getAdamsIIPair inGraphVectA inGraphVectB = - let inGraphVectList = [inGraphVectA, inGraphVectB] - (sameLeafSet, leafSets) = getAndCheckLeafSets inGraphVectList - curVertexSets = map fst inGraphVectList - rootPairList = map (findRoot 0) inGraphVectList - --rootIndexList = map fst rootPairList - rootVertexList = map snd rootPairList - rootSplits = map getSecond rootVertexList - rootSplitLeafListList = getSplitLeafListList rootSplits inGraphVectList - rootLUBPre = leastUpperBound rootSplitLeafListList - rootLUB = map L.sort [x | x <- rootLUBPre, not (null x)] --need map sort $ - - --create nodes based on LUBs - leavesPlaced = concat [x | x <- rootLUB, length x < 3] - rootNode = ("root", map lub2TreeRep rootLUB, []) - vertexLeafSetList = map (map getLeafSetFromNodeName . V.toList) curVertexSets - potentialVertexSets = map (map getSecond . V.toList) curVertexSets - in - if not sameLeafSet then errorWithoutStackTrace("Leaf sets of input graphs do not match" ++ show leafSets) - else - --return processed when have all nodes - let allAdamsNodes = makeAdamsNodes [rootNode] "root" rootLUB leavesPlaced (zip potentialVertexSets vertexLeafSetList) --curVertexSets vertexLeafSetList - in - genPhyNet2PhyloGraphVect allAdamsNodes - --- | mkGraphPair take LNode list lEdge List pair and reutns fgl graph -mkGraphPair :: ([G.LNode a], [G.LEdge b]) -> P.Gr a b -mkGraphPair (nodeList, edgeList) = G.mkGraph nodeList edgeList - - --- | makeAdamsII takes a list of fgl graphs, convertes them to PhyloGraphVect --- makes the Adamns consensus and then converts back to fgl for return to EUN code -makeAdamsII :: [G.LNode String] -> [P.Gr String Double] -> P.Gr String Double -makeAdamsII leafNodeList inFGList - | null leafNodeList = error "Null leaf node list in makeAdamsII" - | null inFGList = G.empty - | otherwise = - let inGraphNodes = fmap G.labNodes inFGList - inGraphEdges = fmap G.labEdges inFGList - inGraphNonLeafNodes = fmap (drop $ length leafNodeList) inGraphNodes - newNodeListList = fmap (leafNodeList ++ ) inGraphNonLeafNodes - inFGList' = mkGraphPair <$> zip newNodeListList inGraphEdges - allTreesList = PU.seqParMap rdeepseq isTree inFGList' -- `using` PU.myParListChunkRDS - allTrees = L.foldl1' (&&) allTreesList - in - if not allTrees then errorWithoutStackTrace("Input graphs are not all trees in makeAdamsII: " ++ show allTreesList) - else if not (leafSetConstant [] inFGList) then errorWithoutStackTrace"Input leaf sets not constant in makeAdamsII" - else - let inPGVList = fmap fgl2PGV inFGList' -- paralle problem with NFData seqParMap myStrategy fgl2PGV inFGList - adamsPGV = L.foldl1' getAdamsIIPair inPGVList - in - -- trace ("\nAdams: " ++ show adamsPGV) - pgv2FGL adamsPGV - --- | fgl2PGVEdge takes an fgl Labeled edge (e,u,label) --- and returns a PGV edge with no brnach length (e,u,Nothing) -fgl2PGVEdge :: G.LEdge b -> Edge -fgl2PGVEdge (e, u, _) = (e, u, Nothing) - --- | fgl2PGVNode takes a tripple of an fgl labelled node (Int, a), its --- child verteces and parent versitices and returns the PGV Vertex including its type -fgl2PGVNode :: (Show b) => P.Gr T.Text b -> (G.LNode String, [Int], [Int]) -> Vertex -fgl2PGVNode inGraph ((index, inLabel), childList, parentList) = - --if null label then error "Null node label in fgl2PGVNode" - --else - let guts = T.init $ PhyP.component2Newick inGraph False False (index, T.pack inLabel) - label = if PhyP.checkIfLeaf guts then T.unpack $ T.tail $ T.init guts else T.unpack guts - in - if null parentList then (label, childList, parentList, Root) - else if null childList then (label, childList, parentList, Leaf) - else if length parentList == 1 then (label, childList, parentList, Tree) - else if length parentList > 1 then (label, childList, parentList, Network) - else error "This can't happen in fgl2PGVNode" - --- | fgl2PGV takes an fgl (functional graph) and convertes to PhyloGraphVect --- to use local (and old) Adams consensus functions --- retuns "Nothing" for edge labels ( no need for branch lengths) -fgl2PGV :: P.Gr String Double -> PhyloGraphVect -fgl2PGV inGraph = - if G.isEmpty inGraph then nullGraphVect - else - let fglNodeList = G.labNodes inGraph - fglNodeParentList = fmap (G.pre inGraph . fst) fglNodeList - fglNodeChildList = fmap (G.suc inGraph . fst) fglNodeList - fglNodeInfoList = zip3 fglNodeList fglNodeChildList fglNodeParentList - fglEdgeList = G.labEdges inGraph - pgvNodeList = fmap (fgl2PGVNode (PhyP.stringGraph2TextGraph inGraph)) fglNodeInfoList - pgvEdgeList = fmap fgl2PGVEdge fglEdgeList - in - (V.fromList pgvNodeList, V.fromList pgvEdgeList) - --- | vertex2Node take alist of vertices and returns a list of fgl Labelled nodes -vertex2Node :: Int -> V.Vector Vertex -> [G.LNode String] -vertex2Node counter inVertexVect = - if V.null inVertexVect then [] - else - let (label, _, _, _) = V.head inVertexVect - in - (counter, label) : vertex2Node (counter + 1) (V.tail inVertexVect) - --- | edge2FGLEdge take vertex of Int Int Maybe Double and returns --- fgl node with type Double -edge2FGLEdge :: (Int, Int, Maybe Double) -> (Int, Int, Double) -edge2FGLEdge (e, u, _) = (e,u, 1.0 :: Double) - --- | pgv2FGL take a PhyloGraphVect and converts to an fgl graph -pgv2FGL :: PhyloGraphVect -> P.Gr String Double -pgv2FGL (inVertexVect, inEdgeVect) = - let fglNodes = vertex2Node 0 inVertexVect - fglEdges = V.map edge2FGLEdge inEdgeVect - in - G.mkGraph fglNodes (V.toList fglEdges) - - --- | getLeafList returns sorted leaf complement of graph fgl -getLeafLabelListFGL :: (Ord a) => P.Gr a b -> [a] -getLeafLabelListFGL inGraph = - if G.isEmpty inGraph then error "Empty graph in getLeafLabelListFGL" - else - let degOutList = G.outdeg inGraph <$> G.nodes inGraph - newNodePair = zip degOutList (G.labNodes inGraph) - leafPairList = filter ((==0).fst ) newNodePair - (_, leafList) = unzip leafPairList - in - L.sort $ snd <$> leafList - --- | leafSetConstant takes a series of fgl graphs and checks if leaf sets are the same for --- all of them -leafSetConstant :: (Ord a) => [a] -> [P.Gr a b] -> Bool -leafSetConstant leafList inFGLList - | null inFGLList = True - | null leafList = - -- first graph - let firstGraph = head inFGLList - firstLeaves = getLeafLabelListFGL firstGraph - in - leafSetConstant firstLeaves (tail inFGLList) - | otherwise = - let thisGraph = head inFGLList - theseLeaves = getLeafLabelListFGL thisGraph - in - theseLeaves == leafList && leafSetConstant leafList (tail inFGLList) - --- | isTree takes fgl graph and checks is conected, no self edges, single root (includes connected), no indegree --- > 1 nodes, leaf labels appear only once -isTree :: (Ord a) => P.Gr a b -> Bool -isTree inGraph = - not (G.isEmpty inGraph) && ( - let nodeIndegrees = G.indeg inGraph <$> G.nodes inGraph - maxIndegree = maximum nodeIndegrees - rootNodes = filter (==0) nodeIndegrees - leafLabels = getLeafLabelListFGL inGraph - uniqueLeafLabels = L.nub leafLabels - eList = fst <$> G.edges inGraph - uList = snd <$> G.edges inGraph - selfList = filter (== True) $ zipWith (==) eList uList - in - not ((((length rootNodes /= 1) || (maxIndegree > 1)) || (length leafLabels /= length uniqueLeafLabels)) || not (null selfList))) - --- | getRootNamesFromGenPhyNet extracts non-leaf-non-root --- names from vertices in order found -getRootNamesFromGenPhyNet :: GenPhyNet -> [String] -getRootNamesFromGenPhyNet inNet = - if null inNet then [] - else - let (name, desc, anc) = head inNet - vertType = getVertType (length desc) (length anc) - in - if vertType == Root then - name : getRootNamesFromGenPhyNet (tail inNet) - else - getRootNamesFromGenPhyNet (tail inNet) - --- | getNonLeafNonRootNamesFromGenPhyNet extracts non-leaf-non-root --- names from vertices in order found -getNonLeafNonRootNamesFromGenPhyNet :: GenPhyNet -> [String] -getNonLeafNonRootNamesFromGenPhyNet inNet = - if null inNet then [] - else - let (name, desc, anc) = head inNet - vertType = getVertType (length desc) (length anc) - in - if (vertType /= Leaf) && (vertType /= Root) then - name : getNonLeafNonRootNamesFromGenPhyNet (tail inNet) - else - getNonLeafNonRootNamesFromGenPhyNet (tail inNet) - --- | getLeafNamesFromGenPhyNet extracts leaf names from vertices in order found -getLeafNamesFromGenPhyNet :: GenPhyNet -> [String] -getLeafNamesFromGenPhyNet inNet = - if null inNet then [] - else - let (name, desc, anc) = head inNet - in - if getVertType (length desc) (length anc) == Leaf then - name : getLeafNamesFromGenPhyNet (tail inNet) - else - getLeafNamesFromGenPhyNet (tail inNet) - --- | getVertType takes list of desc and anc to determine type of vertex -getVertType :: Int -> Int -> VertexType -getVertType nDesc nAnc - | (nDesc == 0) && (nAnc == 0) = error "Isolated node" - | nAnc == 0 = Root - | nDesc == 0 = Leaf - | nAnc == 1 = Tree - | nAnc > 2 = Network - | otherwise = error ("Screwey node: indegree " ++ show nDesc ++ " outdegree " ++ show nAnc) - --- | getVertNum takes a list of vertex names and teh complete list and --- returns a list of the indices (integers) of the names -getVertNum :: [String] -> [String] -> [Int] -getVertNum nameList vertexNameList = - if null vertexNameList then [] - else - let firstVertName = head vertexNameList - vertNum = L.elemIndex firstVertName nameList - in - if isNothing vertNum then error ("Error in vertex name index: " ++ show firstVertName ++ " in " ++ show nameList) - else - fromJust vertNum : getVertNum nameList (tail vertexNameList) - --- | oldIndex2New takes a PhyloGraphVect and creates a list of reorder based ordered name list -oldIndex2New :: V.Vector Vertex -> [String] -> [(Int, Vertex)] -oldIndex2New inVertexVect nameList = - if V.null inVertexVect then [] - else - let curVert = V.head inVertexVect - (vertName, _, _, _) = curVert - vertNum = L.elemIndex vertName nameList - in - (fromJust vertNum, curVert) : oldIndex2New (V.tail inVertexVect) nameList - --- | genForestToPhyloGraph converts GenForest to PhyloGraph (so can use legacy --- ENewick etc parsers --- takes flattened vector of GenPhyNetNodes and builds vertices (leaf, internal, --- and root) and edges. Vertices and edges are added to input null --- PhyloGraphVect --- EDGES SEEM TO BE INCORRECT IN PLACES -genForestToPhyloGraphVect :: V.Vector GenPhyNetNode -> PhyloGraphVect -> [String] -> PhyloGraphVect -genForestToPhyloGraphVect inGen inPhyVect nameList = - if V.null inGen then inPhyVect - else - let (inVertexName, inVertexDescNameList, inVertexAncNameList) = V.head inGen - (curVertVect, curEdgeVect) = inPhyVect - descNumList = getVertNum nameList inVertexDescNameList - ancNumList = getVertNum nameList inVertexAncNameList - vertType = getVertType (length descNumList) (length ancNumList) - newEdgeVect = V.zip3 (V.fromList ancNumList) (V.replicate (length ancNumList) - (head $ getVertNum nameList [inVertexName])) - (V.replicate (length ancNumList) Nothing) --edge from anc to current, no weight info - in - genForestToPhyloGraphVect (V.tail inGen) - (V.snoc curVertVect (inVertexName, descNumList, ancNumList, vertType), - curEdgeVect V.++ newEdgeVect) nameList - --- | getNamesFromGenPhyNet extracts names from vertices in order found --- leaves are first, then internal, root last -getNamesFromGenPhyNet :: GenPhyNet -> [String] -getNamesFromGenPhyNet inNet = - L.sort (getLeafNamesFromGenPhyNet inNet) ++ getNonLeafNonRootNamesFromGenPhyNet inNet - ++ getRootNamesFromGenPhyNet inNet - --- | getShortestList takes list and length and list of lists and return --- shortest list -getShortestList :: ([a], [b]) -> Int -> [([a],[b])] -> ([a],[b]) -getShortestList bestList lengthBestList inListList = - if null inListList then bestList - else - let curList = head inListList - lengthCurList = length $ fst curList - in - if lengthCurList < lengthBestList then getShortestList curList lengthCurList (tail inListList) - else getShortestList bestList lengthBestList (tail inListList) - --- | getSplitList take an LUB, list of placed taxa, and vector of tree vertices --- and returns a list of splits for each input tree (from tree vertices) --- also filters out placed taxa --- CLEANUP--many more ioperations than needed--should be passed as better --- structure -getSplitList :: [String] -> [String] -> ([[Int]], [[String]]) -> [[String]] -getSplitList curLUB placedTaxa (potentialVerts, vertLeafSet) = - if null curLUB then error "Null LUB in getSplitList" - else - let vertList = [(x, y) | (x, y) <- zip vertLeafSet potentialVerts, L.intersect x curLUB == curLUB] - smallestVert = snd $ getShortestList (head vertList) (length $ fst $ head vertList) (tail vertList) - vectVertLeafSet = V.fromList vertLeafSet --adds factor of "n", could pass another variable? - rawLUBs = map (vectVertLeafSet V.!) smallestVert - newLUBs = map (L.\\ placedTaxa) rawLUBs - in - newLUBs - --- | replaceChar take set of charcters to be replaced by a char in a String -replaceChar :: String -> Char -> Char -> Char -replaceChar inSet2Replace replaceChar2 inChar = - if inChar `elem` inSet2Replace then replaceChar2 - else inChar - --- | getVertsFromIndexList takes a list of vertex vector indices and returns a list of --- vertices -getVertsFromIndexList :: [Int] -> PhyloGraphVect -> [Vertex] -getVertsFromIndexList indexList inGraphVect = - if null indexList then [] - else - let (vertVect, _) = inGraphVect - in - (vertVect V.! head indexList) : getVertsFromIndexList (tail indexList) inGraphVect - --- | ggenPhyNet2PhyloGraphVect takes as input GenPhyNet and return --- PhyloGraphVect with root as last node -genPhyNet2PhyloGraphVect :: GenPhyNet -> PhyloGraphVect -genPhyNet2PhyloGraphVect inGenPhyNet = - if null inGenPhyNet then error "Null GenPhyNet in genPhyNet2PhyloGraphVect" - else - let nameList = getNamesFromGenPhyNet inGenPhyNet - (vertVect, edgeVect) = genForestToPhyloGraphVect (V.fromList inGenPhyNet) nullGraphVect nameList - newVertVect = vertVect V.// oldIndex2New vertVect nameList - in - (newVertVect, edgeVect) - --- | makeAdamsNodes takes root Adams node, rootLUB, vertex sets of input --- trees and placed leaf set and constructs each Adams node in turn. -makeAdamsNodes :: GenPhyNet -> String -> [[String]] -> [String] -> [([[Int]], [[String]])] -> GenPhyNet -makeAdamsNodes inAdamsTree parentName inLUBList placedTaxa bothLeafLists = --inTreeVertexLists vertexLeafSetList = - if null inLUBList then inAdamsTree - else - let curLUB = head inLUBList - in - if length curLUB == 1 then --make nodes since done - let newNode = (head curLUB, [], [parentName]) - in - makeAdamsNodes (newNode : inAdamsTree) parentName (tail inLUBList) - (head curLUB : placedTaxa) bothLeafLists --inTreeVertexLists vertexLeafSetList - else if length curLUB == 2 then - let leftChild = lub2TreeRep [head curLUB] - rightChild = lub2TreeRep [last curLUB] - newNode1 = (lub2TreeRep curLUB, [leftChild, rightChild], [parentName]) - newNode2 = (leftChild, [], [lub2TreeRep curLUB]) - newNode3 = (rightChild, [], [lub2TreeRep curLUB]) - newGenPhyNet = newNode2 : (newNode3 : (newNode1 : inAdamsTree)) - newPlacedTaxa = lub2TreeRep curLUB : (leftChild : (rightChild : placedTaxa)) - in - makeAdamsNodes newGenPhyNet parentName (tail inLUBList) - newPlacedTaxa bothLeafLists --inTreeVertexLists vertexLeafSetList - else --core case with LUB creation and taxon placementg - let splitListList = map (getSplitList curLUB placedTaxa) bothLeafLists --(zip inTreeVertexLists vertexLeafSetList) - newLUBpre = leastUpperBound splitListList - newLUB = map L.sort [x | x <- newLUBpre, not (null x)] --had "map sort $" was this "sort" necessary? for List intersection? - newNode = (lub2TreeRep curLUB, map lub2TreeRep newLUB, [parentName]) - in - --trace ("New LUBs " ++ show newLUB ++ " newNode " ++ show newNode) - makeAdamsNodes (newNode : inAdamsTree) (lub2TreeRep curLUB) (tail inLUBList) - placedTaxa bothLeafLists ++ --inTreeVertexLists vertexLeafSetList) ++ - makeAdamsNodes [] (lub2TreeRep curLUB) newLUB placedTaxa bothLeafLists --inTreeVertexLists vertexLeafSetList) - --- | getLeafSetFromNodeName takes String name of node and returns sorted list of leaf --- names--ASSUMES node names are not given in input and are assigned as trees --- are parsed -getLeafSetFromNodeName :: Vertex -> [String] -getLeafSetFromNodeName inVertex = - let (nodeName, _, _, _) = inVertex - in - if null nodeName then error "Null node name in getLeafSetFromNodeName" - else - let rawList = map (replaceChar ['(', ')', ','] ' ') nodeName - in - L.sort $ words rawList --this sort required - --- | lub2TreeRep takes list of names and makes into unresolved subtree --- in parens -lub2TreeRep :: [String] -> String -lub2TreeRep inStringList - | null inStringList = error "Null input in lub2TreeRep" - | length inStringList == 1 = head inStringList - | otherwise = - let inside = init $ concatMap (++ ",") inStringList - in - ( '(' : inside ) ++ ")" - --- | getDecendantLeafList iputs a vertex and returns leaf set (as list of --- leaf names as strings) descdended from --- that vertex, if a leaf, returns that leaf -getDecendantLeafList :: [Vertex] -> PhyloGraphVect -> [String] -getDecendantLeafList inVertexList inGraphVect = - if null inVertexList then [] - else - let (curVertName, descList, _, vertType) = head inVertexList - descVertList = getVertsFromIndexList descList inGraphVect - in - if vertType == Leaf then - curVertName : getDecendantLeafList (tail inVertexList) inGraphVect - else - getDecendantLeafList [head descVertList] inGraphVect - ++ getDecendantLeafList (tail descVertList) inGraphVect - ++ getDecendantLeafList (tail inVertexList) inGraphVect - --- | getSplitLeafList takes a node and returns a list of list of descendent leaves -getSplitLeafList :: [Int] -> PhyloGraphVect -> [[String]] -getSplitLeafList descList inGraphVect = - if null descList then [] - else - let curDesc = head descList - (vertexVect, _) = inGraphVect - curLeaves = getDecendantLeafList [vertexVect V.! curDesc] inGraphVect - in curLeaves : getSplitLeafList (tail descList) inGraphVect - --- | getSplitLeafListList takes list of descenndents for PhyloGraphVect and --- returns a list of descendant list for each split of each tree -getSplitLeafListList :: [[Int]] -> [PhyloGraphVect] -> [[[String]]] -getSplitLeafListList descListList inGraphVectList - | null descListList = [] - | null inGraphVectList = error "Diff numbers of descdent lists and graphs" - | otherwise = - let curIntList = head descListList - curGraphVectList = head inGraphVectList - in - getSplitLeafList curIntList curGraphVectList : - getSplitLeafListList (tail descListList) (tail inGraphVectList) - --- | lub2 takes two lists of lists of names and generates the pairswise set of --- intersections -lub2 :: [[String]] -> [[String]] -> [[String]] -lub2 s1 s2 - | null s1 = [] - | null s2 = [] - | otherwise = - let intersectFirst = L.intersect (head s1) (head s2) : lub2 [head s1] (tail s2) - in - intersectFirst ++ lub2 (tail s1) s2 - --- | leastUpperBound takes list of list vertex leaf descendants (as Strings) --- and returns LUB of Adams II (1972) consensus -leastUpperBound :: [[[String]]] -> [[String]] -leastUpperBound inVertexListList - | length inVertexListList < 2 = - error "Too few name lists in leastUpperBound" - | length inVertexListList == 2 = - let x = head inVertexListList - y = last inVertexListList - in - lub2 x y - | otherwise = - let x = head inVertexListList - y = head $ tail inVertexListList - z = tail $ tail inVertexListList - t = lub2 x y - in - leastUpperBound (t : z) - --- | get second retriueves 2nd element of 4 -getSecond :: (a, b, c, d) -> b -getSecond inTuple = - let (_, b2, _, _) = inTuple - in - b2 - --- | leafSetFromVertexVect takes vector of veritces and returns set of leaf --- names -leafSetFromVertexVect :: Set.Set String -> V.Vector Vertex -> Set.Set String -leafSetFromVertexVect inSet inVerts = - if V.null inVerts then inSet - else - let (curName, _, _, curType) = V.head inVerts - in - if curType == Leaf then - leafSetFromVertexVect (Set.insert curName inSet) (V.tail inVerts) - else - leafSetFromVertexVect inSet (V.tail inVerts) - --- | getLeafSet tgake a list pf PhyloGraphVect and returns a pair with --- True if the leaf sets are identical, and a list of the sets -getLeafSet :: PhyloGraphVect -> Set.Set String -getLeafSet inGraphVect = - let (inVerts, _) = inGraphVect - in - leafSetFromVertexVect Set.empty inVerts - - --- | setEqual checks for set equality by difference between union and --- intersection is empty -setEqual :: Ord a => Set.Set a -> Set.Set a-> Bool -setEqual firstSet secondSet = - let combinedElem = Set.union firstSet secondSet - sameElem = Set.intersection firstSet secondSet - in - Set.empty == Set.difference combinedElem sameElem - - --- | getAndCheckLeafSets take graphs and checks that leaf sets are identical -getAndCheckLeafSets :: [PhyloGraphVect] -> (Bool, [Set.Set String]) -getAndCheckLeafSets inGraphs = - if null inGraphs then error "Empty graph list in getAndCheckLeafSets" - else - let leafSetList = map getLeafSet inGraphs - firstSet = head leafSetList - setDiffList = map (setEqual firstSet) (tail leafSetList) - allEmpty = all (True ==) setDiffList - in - (allEmpty, leafSetList) - --- | findRoot take PhyloGraphVect and return root index and Vertex -findRoot :: Int -> PhyloGraphVect -> (Int, Vertex) -findRoot index inGraph = - let (vertexVect, _) = inGraph - in - if index < V.length vertexVect then - let (_, _, _, vertexType) = vertexVect V.! index - in - if vertexType == Root then - (index, vertexVect V.! index) - else - findRoot (index + 1) inGraph - else error "Index exceeeds vertex number in findRoot" - diff --git a/pkg/PhyGraph/Reconciliation/Eun.hs b/pkg/PhyGraph/Reconciliation/Eun.hs deleted file mode 100644 index bd6d95c6e..000000000 --- a/pkg/PhyGraph/Reconciliation/Eun.hs +++ /dev/null @@ -1,885 +0,0 @@ - {- | -Module : Eun.hs -Description : Module to calculate various graph reconciliation methods Wheeler (2021) - input graphviz dot files and newick -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# LANGUAGE ScopedTypeVariables #-} - -module Reconciliation.Eun ( reconcile - , makeProcessedGraph - , addGraphLabels) - where - -import Control.Parallel.Strategies -import qualified Data.Bits as B -import qualified Data.BitVector as BV -import qualified Data.Graph.Inductive.Graph as G -import qualified Data.Graph.Inductive.PatriciaTree as P -import qualified Data.Graph.Inductive.Query.BFS as BFS -import Data.GraphViz as GV -import Data.GraphViz.Printing -import qualified Data.List as L -import qualified Data.Map.Strict as Map -import Data.Maybe -import qualified Data.Set as S -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import qualified GraphFormatUtilities as PhyP -import qualified Graphs.GraphOperations as GO -import ParallelUtilities as PU -import qualified Reconciliation.Adams as A -import Types.Types -import qualified Utilities.LocalGraph as LG ---import Debug.Trace - -{- --- | turnOnOutZeroBit turns on the bit 'nleaves" signifying that --- the node is outdegree 1 --- this so outdegree one nodes and their child have differnet bit sets -turnOnOutZeroBit :: BV.BV -> Int -> BV.BV -turnOnOutZeroBit inBitVect nLeaves = BV.or [B.bit nLeaves, inBitVect] --} - -{- --- | turnOffOutZeroBit turns off the bit 'nleaves" signifying that --- the node is outdegree /= 1 --- this so outdegree one nodes and their child have differnet bit sets -turnOffOutZeroBit :: BV.BV -> Int -> BV.BV -turnOffOutZeroBit inBitVect nLeaves = BV.extract (nLeaves - 1) 0 inBitVect --} - --- | setOutDegreeOneBits assumes outdegree of vertex = 1, takes the number of leaves, bitvector --- repersentation (rawBV) of vertex, a L.sorted list of outdegree=1 vertices and the vertex index --- and creates a bitvector to prepend of length the number of outdgree=1 vertices where --- the correpsonding vertex index position in ilist is 'On' and the remainder are 'Off' --- this insures a unique lablelling for all outdegree=1 vertices -setOutDegreeOneBits :: BV.BV -> [Int] -> Int -> BV.BV -setOutDegreeOneBits inBitVect out1VertexList vertexIndex = - if null out1VertexList then error "Empty outdegree=1 vertex list in setOutDegreeOneBits" - else - let vertListIndex = L.elemIndex vertexIndex out1VertexList - vertexPosition = fromJust vertListIndex - boolList = (replicate vertexPosition False) ++ [True] ++ (replicate ((length out1VertexList) - vertexPosition - 1) False) - prependBitVect = BV.fromBits boolList - in - if vertListIndex == Nothing then error ("Vertex " ++ (show vertexIndex) ++ " not found in list " ++ show out1VertexList) - else BV.append prependBitVect inBitVect - --- | getRoot takes a graph and list of nodes and returns vertex with indegree 0 --- so assumes a connected graph--with a single root--not a forest --- this does noit include unconnected leaves -getRoots :: P.Gr a b -> [G.Node] -> [Int] -getRoots inGraph nodeList = - if null nodeList then [] - else - let firstNode = head nodeList - in - if (G.indeg inGraph firstNode == 0) && (G.outdeg inGraph firstNode > 0) then firstNode : getRoots inGraph (tail nodeList) - else getRoots inGraph (tail nodeList) - - --- | getUnConnectedLeaves takes a graph and list of nodes and returns vertex with indegree 0 --- and outdegree == 0 -getUnConnectedLeaves :: P.Gr a b -> [G.Node] -> [Int] -getUnConnectedLeaves inGraph nodeList = - if null nodeList then [] - else - let firstNode = head nodeList - in - if (G.indeg inGraph firstNode == 0) && (G.outdeg inGraph firstNode == 0) then firstNode : getUnConnectedLeaves inGraph (tail nodeList) - else getUnConnectedLeaves inGraph (tail nodeList) - - - --- | getUnconmnectedNOdes takes a graph and list of nodes and returns vertex with indegree 0 --- and outdegeee 0 -getUnConnectedNodes :: P.Gr String String -> Int -> [G.Node] -> [G.LNode BV.BV] -getUnConnectedNodes inGraph nLeaves nodeList = - if null nodeList then [] - else - let firstNode = head nodeList - newNode = (firstNode, B.bit firstNode) - in - if G.deg inGraph firstNode == 0 then - newNode : getUnConnectedNodes inGraph nLeaves (tail nodeList) - else getUnConnectedNodes inGraph nLeaves (tail nodeList) - - --- | makeNodeFromChildren gets bit vectors as union of children in a post order traversal from leaves --- the prepending of a single 'On' bit if there is only once child (setOutDegreeOneBit) --- is modified to allow for multiple outdegree 1 vertices as parent of single vertex -makeNodeFromChildren :: P.Gr String String -> Int -> V.Vector (G.LNode BV.BV) -> [Int] -> Int -> [G.LNode BV.BV] -makeNodeFromChildren inGraph nLeaves leafNodes out1VertexList myVertex = - if myVertex < nLeaves then [leafNodes V.! myVertex] - else - let myChildren = G.suc inGraph myVertex - myChildrenNodes = fmap (makeNodeFromChildren inGraph nLeaves leafNodes out1VertexList) myChildren -- `using` PU.myParListChunkRDS - - rawBV = BV.or $ fmap (snd . head) myChildrenNodes - myBV = if (length myChildren /= 1) then rawBV - else setOutDegreeOneBits rawBV out1VertexList myVertex - in - (myVertex, myBV) : concat myChildrenNodes - --- | getNodesFromARoot follows nodes connected to a root. --- can be fmapped over roots to hit all--should be ok if multiple hits on nodes --- since all labeled by BV.BVs need to fuse them if multiple roots to make sure nodes are consistent --- and only one per root--should be ok for multikple jhists of nodes since BVs are from childre --- just wasted work. Should L.nub after to maeksure only unique (by BV) nodes in list at end -getNodesFromARoot :: P.Gr String String -> Int -> [G.LNode BV.BV] -> Int -> [G.LNode BV.BV] -getNodesFromARoot inGraph nLeaves leafNodes rootVertex = - if G.isEmpty inGraph then error "Input graph is empty in getLabelledNodes" - else - let rootChildVerts = G.suc inGraph rootVertex - - -- get outdree = 1 node list for creting prepended bit vectors - out1VertexList = L.sort $ filter ((==1).G.outdeg inGraph) $ G.nodes inGraph - - -- recurse to children since assume only leaves can be labbeled with BV.BVs - -- fmap becasue could be > 2 (as in at root) - rootChildNewNodes = fmap (makeNodeFromChildren inGraph nLeaves (V.fromList leafNodes) out1VertexList) rootChildVerts -- `using` PU.myParListChunkRDS - - -- check if outdegree = 1 - rawBV = BV.or $ fmap (snd . head) rootChildNewNodes - rootBV = if (length rootChildVerts /= 1) then rawBV - else setOutDegreeOneBits rawBV out1VertexList rootVertex - in - (rootVertex, rootBV) : concat rootChildNewNodes - --- | getLabelledNodes labels nodes with bit vectors union of subtree leaves via post order traversal --- adds nodes to reDoneNodes as they are preocessed --- reorder NOdes is n^2 should be figured out how to keep them in order more efficeintly -getLabelledNodes :: P.Gr String String -> Int -> [G.LNode BV.BV] -> [G.LNode BV.BV] -getLabelledNodes inGraph nLeaves leafNodes = - -- trace ("getLabbeled graph with " ++ (show $ G.noNodes inGraph) ++ " nodes in " ++ (showGraph inGraph)) ( - if G.isEmpty inGraph then error "Input graph is empty in getLabelledNodes" - else - let rootVertexList = getRoots inGraph (G.nodes inGraph) - htuList = L.nub $ concatMap (getNodesFromARoot inGraph nLeaves leafNodes) rootVertexList - in - -- this for adding in missing data - let unConnectedNodeList = getUnConnectedNodes inGraph nLeaves (G.nodes inGraph) - in - reorderLNodes (htuList ++ unConnectedNodeList) 0 - - --- | findLNode takes an index and looks for node with that as vertex and retuirns that node -findLNode :: Int -> [G.LNode BV.BV] -> G.LNode BV.BV -findLNode vertex lNodeList = - if null lNodeList then error ("Node " ++ show vertex ++ " not found") - else - let (a,b) = head lNodeList - in - if a == vertex then (a,b) - else findLNode vertex (tail lNodeList) - --- | reorderLNodes takes a list of nodes and reorders and order based on node vertex number --- n^2 ugh -reorderLNodes :: [G.LNode BV.BV] -> Int -> [G.LNode BV.BV] -reorderLNodes inNodeList inIndex - | null inNodeList = [] - | inIndex == length inNodeList = [] - | otherwise = - let newNode = findLNode inIndex inNodeList - in - newNode : reorderLNodes inNodeList (inIndex + 1) - -- ) - --- | relabelEdgs creates (BV.BV, BV.BV) labnels for an edges -relabelEdge :: V.Vector (G.LNode BV.BV) -> G.LEdge String -> G.LEdge (BV.BV, BV.BV) -relabelEdge allNodesVect inLEdge = - let (e,u,_) = inLEdge - eNodeBV = snd (allNodesVect V.! e) - uNodeBV = snd (allNodesVect V.! u) - in - (e,u,(eNodeBV,uNodeBV)) - --- | changeLabelEdge labels edges by descendent vertex label --- assumes leaves first then vertices labeled in order --- offset for numLeaves assumes only labeling non-leaves so smaller set --- assumes that if an "urroot" has been added via `union` then it is last vertex --- this for condition where root has been added to majority consensus tree -changeLabelEdge :: Int -> V.Vector Double -> [G.LEdge b] -> [G.LEdge Double] -changeLabelEdge numLeaves freqVect edgeList = - if null edgeList then [] - else - let (e,u,_) =head edgeList - newLabel - | u < numLeaves = 1 - | (u - numLeaves) >= V.length freqVect = 1 - | otherwise = freqVect V.! (u - numLeaves) - in - --trace (show (e,u) ++ " " ++ (show (u - numLeaves)) ++ " " ++ show freqVect) -- ++ " " ++ show newLabel) - (e,u, newLabel) : changeLabelEdge numLeaves freqVect (tail edgeList) - --- | addEdgeFrequenciesToGraph takes a greaph and edge frequencies and relables edges --- with node frequencies of discedendet node -addEdgeFrequenciesToGraph :: P.Gr a b -> Int -> [Double] -> P.Gr a Double -addEdgeFrequenciesToGraph inGraph numLeaves freqList = - let inNodes = G.labNodes inGraph - inEdges = G.labEdges inGraph - newEdges = changeLabelEdge numLeaves (V.fromList freqList) inEdges - in - --trace (show inEdges) - G.mkGraph inNodes newEdges - --- | getLeafNumber take Graph and gets nu,ber of leaves (outdegree = 0) -getLeafNumber :: P.Gr BV.BV (BV.BV, BV.BV) -> Int -getLeafNumber inGraph = - let degOutList = G.outdeg inGraph <$> G.nodes inGraph - in length $ filter (==0) degOutList - -{- --- | findStrLabel checks Attributes (list f Attribute) from Graphvz to extract the String label of node --- returns Maybe Text -findStrLabel :: Attributes -> Maybe T.Text -findStrLabel = getFirst . foldMap getStrLabel - - --- | getStrLabel takes an Attribute and reurns Text if StrLabel found, mempty otherwise -getStrLabel :: Attribute -> First T.Text -getStrLabel (Label (StrLabel txt)) = First . Just $ txt -getStrLabel _ = mempty - - --- | getLeafString takes a pairs (node vertex number, graphViz Attributes) --- and returns String name of leaf of Stringified nude number if unlabbeled -getLeafString :: (Int, Attributes) -> String -getLeafString (nodeIndex, nodeLabel) = - let maybeTextLabel = findStrLabel nodeLabel - in - maybe (show nodeIndex) T.unpack maybeTextLabel - --- | getLeafList returns leaf complement of graph from DOT file -getLeafList :: P.Gr Attributes Attributes -> [G.LNode String] -getLeafList inGraph = - if G.isEmpty inGraph then [] - else - let degOutList = G.outdeg inGraph <$> G.nodes inGraph - newNodePair = zip degOutList (G.labNodes inGraph) - leafPairList = filter ((==0).fst ) newNodePair - (_, leafList) = unzip leafPairList - (nodeVerts, _) = unzip leafList - newLabels = fmap getLeafString leafList - leafList' = zip nodeVerts newLabels - in - leafList' --} - --- | getLeafListNewick returns leaf complement of graph from newick file --- difference from above is in the leaf label type -getLeafListNewick :: P.Gr a b -> [G.LNode a] -getLeafListNewick inGraph = - if G.isEmpty inGraph then [] - else - let degOutList = G.outdeg inGraph <$> G.nodes inGraph - newNodePair = zip degOutList (G.labNodes inGraph) - leafPairList = filter ((==0).fst ) newNodePair - (_, leafList) = unzip leafPairList - (nodeVerts, _) = unzip leafList - -- only different line - newLabels = fmap snd leafList - leafList' = zip nodeVerts newLabels - in - leafList' - -{- --- | checkNodesSequential takes a list of nodes and returns booolean --- True if nodes are input with sequential numerical indices --- False if not--screws up reindexing later which assumes they are successive -checkNodesSequential :: G.Node -> [G.Node] -> Bool -checkNodesSequential prevNode inNodeList - | null inNodeList = True - | (head inNodeList - prevNode) /= 1 = trace ("Index or indices missing between " ++ (show $ head inNodeList) ++ " and " ++ (show prevNode)) False - | otherwise = checkNodesSequential (head inNodeList) (tail inNodeList) --} - --- | reAnnotateGraphs takes parsed graph input and reformats for EUN -reAnnotateGraphs :: P.Gr String String -> P.Gr BV.BV (BV.BV, BV.BV) -reAnnotateGraphs inGraph = - -- trace ("Reannotating " ++ (showGraph inGraph)) ( - if G.isEmpty inGraph then error "Input graph is empty in reAnnotateGraphs" - else - let degOutList = G.outdeg inGraph <$> G.nodes inGraph - nLeaves = length $ filter (==0) degOutList - leafVerts = [0..(nLeaves - 1)] - leafIntegers = fmap B.bit leafVerts - leafBitVects = leafIntegers -- fmap (BV.bitVec nLeaves) leafIntegers - leafNodes = Prelude.zip leafVerts leafBitVects - allNodes = getLabelledNodes inGraph nLeaves leafNodes - allEdges = fmap (relabelEdge (V.fromList allNodes)) (G.labEdges inGraph) - in - -- assign HTU BV via postorder pass. - G.mkGraph allNodes allEdges - --- | checkBVs looks at BV.BV of node and retuns FALSE if found True if not -checkBVs :: BV.BV -> [G.LNode BV.BV] -> Bool -checkBVs inBV nodeList = - null nodeList || ( - let (_, bv) = head nodeList - in - inBV /= bv && checkBVs inBV (tail nodeList)) - --- | checkBVs looks at BV.BV of node and retuns FALSE if found True if not -checkEdgeBVs :: (BV.BV, BV.BV) -> [G.LEdge (BV.BV, BV.BV)] -> Bool -checkEdgeBVs (inABV, inBBV) edgeList = - null edgeList || ( - let (_, _, (aBV,bBV)) = head edgeList - in - not ((inABV == aBV) && (inBBV == bBV)) && checkEdgeBVs (inABV, inBBV) (tail edgeList)) - --- | addAndReIndexUniqueNodes takes an inital list of nodes and adds new nodes reindexed --- check identity by BV.BV --- index bigger than size becasue starting after number of leaves -addAndReIndexUniqueNodes :: Int -> [G.LNode BV.BV] -> [G.LNode BV.BV] -> [G.LNode BV.BV] -addAndReIndexUniqueNodes newIndex nodesToExamine uniqueReIndexedNodes = - if null nodesToExamine then uniqueReIndexedNodes - else - let (_, inBV) = head nodesToExamine - isUnique = checkBVs inBV uniqueReIndexedNodes - in - if isUnique then - let newNode = (newIndex, inBV) - in - addAndReIndexUniqueNodes (newIndex + 1) (tail nodesToExamine) (newNode : uniqueReIndexedNodes) - else addAndReIndexUniqueNodes newIndex (tail nodesToExamine) uniqueReIndexedNodes - - --- | getNodeIndex takes a BV.BV and returns a node with the same BV.BV -getNodeIndex :: BV.BV -> [G.LNode BV.BV] -> Int -getNodeIndex inBV nodeList = - if null nodeList then error ("Node with BV " ++ show inBV ++ " not found in getNodeIndex") - else - let (inIndex, bv) = head nodeList - in - if bv == inBV then inIndex - else getNodeIndex inBV (tail nodeList) - --- | addAndReIndexEdges takes list of indexed nodes and BV, a list of edges to examine and a list of edges to keep --- checks for uniqueness of edges by BV.BVs on (e,u) and reindexes the edge nodes based on the node set with bit vectors --- keep method either 'unique" or "all" to keep lists of unique or all edges -addAndReIndexEdges :: String -> [G.LNode BV.BV] -> [G.LEdge (BV.BV,BV.BV)] -> [G.LEdge (BV.BV,BV.BV)] -> [G.LEdge (BV.BV,BV.BV)] -addAndReIndexEdges keepMethod indexedNodes edgesToExamine uniqueReIndexedEdges = - if null edgesToExamine then uniqueReIndexedEdges - else - let (_, _, (eBV, uBV)) = head edgesToExamine - isUnique = checkEdgeBVs (eBV, uBV) uniqueReIndexedEdges - in - if (keepMethod == "all") || isUnique then - -- Find nodes with BVs of edge - let eNode = getNodeIndex eBV indexedNodes - uNode = getNodeIndex uBV indexedNodes - newEdge = (eNode, uNode, (eBV, uBV)) - in - addAndReIndexEdges keepMethod indexedNodes (tail edgesToExamine) (newEdge : uniqueReIndexedEdges) - else addAndReIndexEdges keepMethod indexedNodes (tail edgesToExamine) uniqueReIndexedEdges - --- | testEdge nodeList fullEdgeList) counter --- chnage to input graph and delete edge from graph as opposed to making new graphs each time. --- should be much faster using P.delLEdge (since only one edge to delete) -testEdge :: (Eq b) => P.Gr a b -> G.LEdge b -> [G.LEdge b] -testEdge fullGraph candidateEdge@(e,u,_) = - let newGraph = G.delLEdge candidateEdge fullGraph - bfsNodes = BFS.bfs e newGraph - foundU = L.find (== u) bfsNodes - in - [candidateEdge | isNothing foundU] - --- | makeEUN take list of nodes and edges, deletes each edge (e,u) in turn makes graph, --- checks for path between nodes e and u, if there is delete edge otherwise keep edge in list for new graph -makeEUN :: (Eq b, NFData b) => [G.LNode a] -> [G.LEdge b] -> P.Gr a b -> P.Gr a b -makeEUN nodeList fullEdgeList fullGraph = - let -- counterList = [0..(length fullEdgeList - 1)] - -- requiredEdges = concat $ fmap (testEdge nodeList fullEdgeList) counterList - requiredEdges = PU.seqParMap rdeepseq (testEdge fullGraph) fullEdgeList -- `using` PU.myParListChunkRDS - newGraph = G.mkGraph nodeList (concat requiredEdges) - in - newGraph - --- | getLeafLabelMatches tyakes the total list and looks for elements in the smaller local leaf set --- retuns int index of the match or (-1) if not found so that leaf can be added in orginal order -getLeafLabelMatches ::[G.LNode String] -> G.LNode String -> (Int, Int) -getLeafLabelMatches localLeafList totNode = - if null localLeafList then (-1, fst totNode) - else - let (inIndex, leafString) = head localLeafList - in - if snd totNode == leafString then (inIndex, fst totNode) - else getLeafLabelMatches (tail localLeafList) totNode - --- | reIndexEdge takes an (Int, Int) map, labelled edge, and returns a new labelled edge with new e,u vertices -reIndexLEdge :: Map.Map Int Int -> G.LEdge a -> G.LEdge String -reIndexLEdge vertexMap inEdge = - if Map.null vertexMap then error "Null vertex map" - else - let (e,u,_) = inEdge - newE = Map.lookup e vertexMap - newU = Map.lookup u vertexMap - in - if isNothing newE then error ("Error looking up vertex " ++ show e ++ " in " ++ show (e,u)) - else if isNothing newU then error ("Error looking up vertex " ++ show u ++ " in " ++ show (e,u)) - else (fromJust newE, fromJust newU, "") - --- | reIndexAndAddLeaves takes rawGraphs and total input leaf sets and reindexes node, and edges, and adds --- in leaves (with out edges) so that later processing can get bit vectors correct and match from --- graph to graph. --- new node set in teh total leaf set form all graphs plus teh local HTUs renumbered up based on added leaves --- the map contains leaf mappings based on label of leaf, the HTUs extend that map with stright integers. --- edges are re-indexed based on that map -reIndexAndAddLeavesEdges :: [G.LNode String] -> ([G.LNode String], P.Gr a b) -> P.Gr String String -reIndexAndAddLeavesEdges totallLeafSet (inputLeafList, inGraph) = - if G.isEmpty inGraph then G.empty - else - -- reindex nodes and edges and add in new nodes (total leaf set + local HTUs) - -- create a map between inputLeafSet and totalLeafSet which is the canonical enumeration - -- then add in local HTU nodes and for map as well - -- trace ("Original graph: " ++ (showGraph inGraph)) ( - let correspondanceList = PU.seqParMap rdeepseq (getLeafLabelMatches inputLeafList) totallLeafSet -- `using` PU.myParListChunkRDS - matchList = filter ((/=(-1)).fst) correspondanceList - --remove order dependancey - -- htuList = [(length inputLeafList)..(length inputLeafList + htuNumber - 1)] - htuList = fmap fst (G.labNodes inGraph) L.\\ fmap fst inputLeafList - htuNumber = length (G.labNodes inGraph) - length inputLeafList - newHTUNumbers = [(length totallLeafSet)..(length totallLeafSet + htuNumber - 1)] - htuMatchList = zip htuList newHTUNumbers - vertexMap = Map.fromList (matchList ++ htuMatchList) - reIndexedEdgeList = fmap (reIndexLEdge vertexMap) (G.labEdges inGraph) - - newNodeNumbers = [0..(length totallLeafSet + htuNumber - 1)] - attributeList = replicate (length totallLeafSet + htuNumber) "" -- origAttribute - newNodeList = zip newNodeNumbers attributeList - in - G.mkGraph newNodeList reIndexedEdgeList - --- | relabelNode takes nofde list and labels leaves with label and HTUs with String of HexCode of BV label -relabelNodes :: [G.LNode BV.BV] -> [G.LNode String] -> [G.LNode String] -relabelNodes inNodes leafLabelledNodes - | null inNodes = [] - | not $ null leafLabelledNodes = head leafLabelledNodes : relabelNodes (tail inNodes) (tail leafLabelledNodes) - | otherwise = - let (vertex, _) = head inNodes - in - (vertex, "HTU" ++ show vertex) : relabelNodes (tail inNodes) [] - --- | addGraphLabels take Graph and changes to add nodes labelled wiyth String, edges as well -addGraphLabels :: P.Gr BV.BV (BV.BV, BV.BV) -> [G.LNode String] -> P.Gr String String -addGraphLabels inGraph totallLeafSet - | G.isEmpty inGraph = error "Empty graph in addGraphLabels" - | null totallLeafSet = error "Empty leaf set in addGraphLabels" - | otherwise = - let newNodes = relabelNodes (G.labNodes inGraph) totallLeafSet - -- newNodes = totallLeafSet ++ newHTUList - (eList, uList) = unzip (G.edges inGraph) - - newEdges = zip3 eList uList (replicate (length eList) "") - in - -- trace ("Relabelled EUN : " ++ (showGraph $ G.mkGraph newNodes newEdges) ++ " from " ++ (show totallLeafSet)) - G.mkGraph newNodes newEdges - --- | intermediateNodeExists takes two node bitvetors and the full bitvector list --- and checks to see if there is an intermediate node between first two that would --- remove need for the edge between the first two. --- This should reduce time complexity of vertex-based reconciliation to O(n^3) from O(n^4) -intermediateNodeExists :: BV.BV -> BV.BV -> [BV.BV] -> Bool -intermediateNodeExists aBV cBV fullNodeBVList = - not (null fullNodeBVList) && - let bBV = head fullNodeBVList - leftIntersection = BV.and [aBV, bBV] - rightIntersection = BV.and [bBV, cBV] - in - if (bBV == aBV) || (bBV == cBV) then intermediateNodeExists aBV cBV (tail fullNodeBVList) - else ((leftIntersection == bBV) && (rightIntersection == cBV)) || intermediateNodeExists aBV cBV (tail fullNodeBVList) - --- | getIntersectionEdges takes a node A and cretes directed edges to each other edge in [B] --- with rulkesLEdge --- if A intesect B = empty then no edge --- else if A intesect B = B then create edge A->B --- else if A intesect B = A then create edge B->A --- else --can't happen --- Added in check for intermediate (by bitvector) node that shold obviate need for --- breadth first search for vertex-based reconciliation --- if A > B and B > C and A intersect B = B and B intersect C = C --- then edge A->C is redundant and is not added to edge set -getIntersectionEdges ::[BV.BV] -> [G.LNode BV.BV] -> G.LNode BV.BV -> [G.LEdge (BV.BV,BV.BV)] -getIntersectionEdges fullNodeBVList bNodeList aNode = - if null bNodeList then [] - else - let (aIndex, aBV) = aNode - (bIndex, bBV) = head bNodeList - intersection = BV.and [aBV, bBV] - in - -- only do the directed 1/2 so no L.nub issues later - if (bBV >= aBV) || (intersection == 0) then getIntersectionEdges fullNodeBVList (tail bNodeList) aNode - else if intersection == bBV then - if intermediateNodeExists aBV bBV fullNodeBVList then getIntersectionEdges fullNodeBVList (tail bNodeList) aNode - else (aIndex, bIndex, (aBV, bBV)) : getIntersectionEdges fullNodeBVList (tail bNodeList) aNode - else getIntersectionEdges fullNodeBVList (tail bNodeList) aNode - --- | combinable tales a list of bitvecotrs and a single bitvector --- and checks each of the first to see if combinable --- if A and B == A,B, or 0 then True else False --- if True return [bitvector] else [] if not -combinable :: String -> [BV.BV] -> BV.BV -> [BV.BV] -combinable comparison bvList bvIn - | comparison == "identity" = - if null bvList then [] - else [bvIn | bvIn `elem` bvList] - | comparison == "combinable" = -- combinable sensu Nelson 1979 - if null bvList then [bvIn] - else - let intersectList = PU.seqParMap rdeepseq (checkBitVectors bvIn) bvList -- `using` PU.myParListChunkRDS - isCombinable = L.foldl' (&&) True intersectList - in - [bvIn | isCombinable] - | otherwise = errorWithoutStackTrace("Comparison method " ++ comparison ++ " unrecongnized (combinable/identity)") - where checkBitVectors a b - = let c = BV.and [a, b] in - c == a || c == b || c == 0 - --- | getGraphCompatibleList takes a list of graphs (list of node Bitvectors) --- and retuns a list of each graph a bitvector node is compatible with --- this isued later for majority rule consensus --- each bit vector node will have a list of length 1..number of graphs -getGraphCompatibleList :: String -> [[BV.BV]] -> BV.BV-> [BV.BV] -getGraphCompatibleList comparison inBVListList bvToCheck = - if null inBVListList then error "Null list of list of bitvectors in getGraphCompatibleList" - else - let compatibleList = concat $ fmap (flip (combinable comparison) bvToCheck) inBVListList - in - -- trace (show $ length compatibleList) - compatibleList - --- | getCompatibleList takes a list of graph node bitvectors as lists --- retuns a list of lists of bitvectors where the length of the list of the individual bitvectors --- is the number of graphs it is compatible with -getCompatibleList :: String -> [[BV.BV]] -> [[BV.BV]] -getCompatibleList comparison inBVListList = - if null inBVListList then error "Null list of list of bitvectors in getCompatibleList" - else - let uniqueBVList = L.nub $ concat inBVListList - bvCompatibleListList = fmap (getGraphCompatibleList comparison inBVListList) uniqueBVList - in - filter (not . null) bvCompatibleListList - --- | getThresholdNodes takes a threshold and keeps those unique objects present in the threshold percent or --- higher. L.sorted by frequency (low to high) --- urRoot added to make sure there will be a single connected graph -getThresholdNodes :: String -> Int -> Int -> [[G.LNode BV.BV]] -> ([G.LNode BV.BV], [Double]) -getThresholdNodes comparison thresholdInt numLeaves objectListList - | thresholdInt < 0 || thresholdInt > 100 = errorWithoutStackTrace"Threshold must be in range [0,100]" - | null objectListList = error "Empty list of object lists in getThresholdObjects" - | otherwise = - let numGraphs = fromIntegral $ length objectListList - indexList = [numLeaves..(numLeaves + length objectGroupList - 1)] - objectGroupList - | comparison == "combinable" = getCompatibleList comparison (fmap (fmap snd) objectListList) - | comparison == "identity" = L.group $ L.sort (snd <$> concat objectListList) - | otherwise = errorWithoutStackTrace("Comparison method " ++ comparison ++ " unrecognized (combinable/identity)") - uniqueList = zip indexList (fmap head objectGroupList) - frequencyList = PU.seqParMap rdeepseq (((/ numGraphs) . fromIntegral) . length) objectGroupList -- `using` PU.myParListChunkRDS - fullPairList = zip uniqueList frequencyList - threshold = (fromIntegral thresholdInt / 100.0) :: Double - in - --trace ("There are " ++ (show $ length objectListList) ++ " to filter: " ++ (show uniqueList) ++ "\n" ++ (show objectGroupList) ++ " " ++ (show frequencyList)) - (fst <$> filter ((>= threshold). snd) fullPairList, snd <$> fullPairList) - --- | getThresholdEdges takes a threshold and number of graphs and keeps those unique edges present in the threshold percent or --- higher. L.sorted by frequency (low to high) --- modified from getThresholdNodes due to type change in edges --- used and number from numleaves so can use BV -getThresholdEdges :: (Show a, Ord a) => Int -> Int -> [a] -> ([a], [Double]) -getThresholdEdges thresholdInt numGraphsInput objectList - | thresholdInt < 0 || thresholdInt > 100 = errorWithoutStackTrace"Threshold must be in range [0,100]" - | null objectList = error "Empty list of object lists in getThresholdEdges" - | otherwise = - let threshold = (fromIntegral thresholdInt / 100.0) :: Double - numGraphs = fromIntegral numGraphsInput - objectGroupList = L.group $ L.sort objectList - uniqueList = fmap head objectGroupList - frequencyList = PU.seqParMap rdeepseq (((/ numGraphs) . fromIntegral) . length) objectGroupList -- `using` PU.myParListChunkRDS - fullPairList = zip uniqueList frequencyList - in - --trace ("There are " ++ (show numGraphsIn) ++ " to filter: " ++ (show uniqueList) ++ "\n" ++ (show $ fmap length objectGroupList) ++ " " ++ (show frequencyList)) - (fst <$> filter ((>= threshold). snd) fullPairList, snd <$> fullPairList) - - --- | getPostOrderVerts takes a vertex and traverses postorder to root places all visirted nodes in a set of found --- vertices. Keeps placing new nodes in recursion list until a root is hit. If a node is already in found set --- it is not added to list of nodes to recurse --- returns set of visited nodes -getPostOrderVerts :: P.Gr BV.BV (BV.BV, BV.BV) -> S.Set G.Node -> [G.Node] -> S.Set G.Node -getPostOrderVerts inGraph foundVertSet inVertexList = - if null inVertexList then foundVertSet - else - let firstVertex = head inVertexList - in - if S.member firstVertex foundVertSet then getPostOrderVerts inGraph foundVertSet (tail inVertexList) - else - let newFoundSet = S.insert firstVertex foundVertSet - parentVerts = G.pre inGraph firstVertex - in - getPostOrderVerts inGraph newFoundSet (inVertexList ++ parentVerts) - --- | verticesByPostorder takes a graph and a leaf set and an initially empty found vertex set --- as the postorder pass takes place form each leaf, each visited vertex is placed in foundVertSet --- when roots are hit, it recurses back untill all paths are traced to a root. --- final final rgaph is created and retuyrned from foundVertSet and input list --- could have edges unconnected to leaves if consistent edge leading to a subtree with inconsistent configuration --- so are filtered out by making sure each vertex in an edge is in the vertex list -verticesByPostorder :: P.Gr BV.BV (BV.BV, BV.BV) -> [G.LNode BV.BV] -> S.Set G.Node -> P.Gr BV.BV (BV.BV, BV.BV) -verticesByPostorder inGraph leafNodes foundVertSet - | G.isEmpty inGraph = error "Empty graph in verticesByPostorder" - | null leafNodes = - let vertexIndexList = S.toList foundVertSet - vertexLabelList = fmap (fromJust . G.lab inGraph) vertexIndexList - vertexList = zip vertexIndexList vertexLabelList - edgeList = PU.seqParMap rdeepseq (verifyEdge vertexIndexList) (G.labEdges inGraph) -- `using` PU.myParListChunkRDS - in G.mkGraph vertexList (concat edgeList) - | otherwise = - let firstLeaf = fst $ head leafNodes - firstVertices = getPostOrderVerts inGraph foundVertSet [firstLeaf] - in - verticesByPostorder inGraph (tail leafNodes) (S.union foundVertSet firstVertices) - --- | verifyEdge takes a vertex index list and an edge and checks to see if --- the subtyending vertices are in the vertex list nad returns teh edge as asingleton list --- if yes--else empty list (for mapping purposes) -verifyEdge :: [G.Node] -> G.LEdge (BV.BV, BV.BV) -> [G.LEdge (BV.BV, BV.BV)] -verifyEdge vertIndexList inEdge@(e,u,_) - | e `notElem` vertIndexList = [] - | u `notElem` vertIndexList = [] - | otherwise = [inEdge] - -{- --- | sortInputArgs takes a list of arguments (Strings) nd retuns a pair of lists --- of strings that are newick or graphviz dotFile filenames for later parsing -sortInputArgs :: [String] -> [String] -> ([T.Text],[T.Text],[String],[String],[String]) -> ([T.Text],[T.Text],[String],[String],[String]) -sortInputArgs inContents inArgs (curFEN, curNewick, curDot, curNewFiles, curFENFILES) = - if null inArgs then (curFEN, curNewick, curDot, curNewFiles, curFENFILES) - else - let firstFileName = head inArgs - firstContents = filter (not . isSpace) $ head inContents - in - if head firstContents == '(' then -- Newick/EnhancedNewick - sortInputArgs (tail inContents) (tail inArgs) (curFEN, T.pack firstContents : curNewick, curDot, firstFileName : curNewFiles, curFENFILES) - else if head firstContents == '<' then -- ForestEnhancedNewick - sortInputArgs (tail inContents) (tail inArgs) (T.pack firstContents : curFEN, curNewick, curDot, curNewFiles, firstFileName : curFENFILES) - else if (head firstContents == 's') || (head firstContents == 'g') || (head firstContents == 'd') then --Dot - sortInputArgs (tail inContents) (tail inArgs) (curFEN, curNewick, firstFileName : curDot, curNewFiles, curFENFILES) - else errorWithoutStackTrace("Input file " ++ firstFileName ++ " does not appear to be Newick, Enhanced Newick, Forest Enhanced Newick or dot format ") - --- | nodeText2String takes a node with text label and returns a node with String label -nodeText2String :: G.LNode T.Text -> G.LNode String -nodeText2String (inIndex, label) = (inIndex, T.unpack label) - --- | fglTextA2TextString converts the graph types from Text A to Text String -fglTextB2Text :: P.Gr b Double -> P.Gr b T.Text -fglTextB2Text inGraph = - if G.isEmpty inGraph then G.empty - else - let labNodes = G.labNodes inGraph - labEdges = G.labEdges inGraph - (eList, uList, labelList) = unzip3 labEdges - --- newLabels = fmap toShortest labelList - newLabels = fmap (T.pack . show) labelList - newEdges = zip3 eList uList newLabels - in - G.mkGraph labNodes newEdges --} - --- | addUrRootAndEdges creates a single root and adds edges to existing roots --- and unconnected leaves -addUrRootAndEdges :: P.Gr String Double -> P.Gr String Double -addUrRootAndEdges inGraph = - let origLabVerts = G.labNodes inGraph - origLabEdges = G.labEdges inGraph - origRootList = getRoots inGraph (fst <$> origLabVerts) - unconnectedLeafList = getUnConnectedLeaves inGraph (fst <$> origLabVerts) - in - -- all ok--no unconnected vertices - if (length origRootList == 1) && (null unconnectedLeafList) then inGraph - - -- add edges to unconencted leaves - else if length origRootList == 1 then - let newEdgeList = zip3 (replicate (length unconnectedLeafList) (head origRootList)) unconnectedLeafList (replicate (length unconnectedLeafList) 0.0) - in - G.mkGraph origLabVerts (origLabEdges ++ newEdgeList) - - -- add UR root, edges to existing roots, and edges to unconnected leaves - else - let unRootedVertices = origRootList ++ unconnectedLeafList - numOrigVerts = length origLabVerts - newRoot = (numOrigVerts, "HTU" ++ show numOrigVerts) - newEdgeList = zip3 (replicate (length unRootedVertices) numOrigVerts) unRootedVertices (replicate (length unRootedVertices) 0.0) - in - G.mkGraph (origLabVerts ++ [newRoot]) (origLabEdges ++ newEdgeList) - --- | changeVertexEdgeLabels keeps or removes vertex and edge labels -changeVertexEdgeLabels :: (Show b) => Bool -> Bool -> P.Gr String b -> P.Gr String String -changeVertexEdgeLabels keepVertexLabel keepEdgeLabel inGraph = - let inLabNodes = G.labNodes inGraph - degOutList = G.outdeg inGraph <$> G.nodes inGraph - nodeOutList = zip degOutList inLabNodes - leafNodeList = snd <$> filter ((==0).fst) nodeOutList - nonLeafNodeList = snd <$> filter ((>0).fst) nodeOutList - newNonLeafNodes = if keepVertexLabel then nonLeafNodeList - else zip (fmap fst nonLeafNodeList) (replicate (length nonLeafNodeList) "") - inLabEdges = G.labEdges inGraph - inEdges = fmap G.toEdge inLabEdges - newEdges = if keepEdgeLabel then fmap showLabel inLabEdges - else fmap (`G.toLEdge` "") inEdges - in - -- trace ("CVEL " ++ (show (keepVertexLabel, keepEdgeLabel ))) - G.mkGraph (leafNodeList ++ newNonLeafNodes) newEdges - where showLabel (e,u,l) = (e,u,show l) - --- | reconcile is the overall function to drive all methods -reconcile :: (String, String, Int, Bool, Bool, Bool, String, [P.Gr String String]) -> (String, P.Gr String String) -reconcile (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat, inputGraphList) = - - let -- Reformat graphs with appropriate annotations, BV.BVs, etc - processedGraphs = PU.seqParMap rdeepseq reAnnotateGraphs inputGraphList -- `using` PU.myParListChunkRDS - - -- Create lists of reindexed unique nodes and edges, identity by BV.BVs - -- The drops to not reexamine leaves repeatedly - -- Assumes leaves are first in list - numLeaves = getLeafNumber (head processedGraphs) - leafNodes = take numLeaves (G.labNodes $ head processedGraphs) - firstNodes = G.labNodes $ head processedGraphs - numFirstNodes = length firstNodes - unionNodes = L.sort $ leafNodes ++ addAndReIndexUniqueNodes numFirstNodes (concatMap (drop numLeaves) (G.labNodes <$> tail processedGraphs)) (drop numLeaves firstNodes) - -- unionEdges = addAndReIndexEdges "unique" unionNodes (concatMap G.labEdges (tail processedGraphs)) (G.labEdges $ head processedGraphs) - - totallLeafString = L.foldl' L.union [] (fmap (fmap snd) (fmap getLeafListNewick inputGraphList)) - totallLeafSet = zip [0..(length totallLeafString - 1)] totallLeafString - -- - -- Create Adams II consensus - -- - adamsII = A.makeAdamsII totallLeafSet (fmap PhyP.relabelFGLEdgesDouble inputGraphList) - -- adamsIIInfo = "There are " ++ show (length $ G.nodes adamsII) ++ " nodes present in Adams II consensus" - adamsII' = changeVertexEdgeLabels vertexLabel False adamsII - adamsIIOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams adamsII' - adamsIIOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph $ PhyP.relabelFGLEdgesDouble adamsII'] False False - - -- - -- Create thresholdMajority rule Consensus and dot string - -- vertex-based CUN-> Majority rule ->Strict - -- - (thresholdNodes', nodeFreqs) = getThresholdNodes compareMethod threshold numLeaves (fmap (drop numLeaves . G.labNodes) processedGraphs) - thresholdNodes = leafNodes ++ thresholdNodes' - thresholdEdgesList = PU.seqParMap rdeepseq (getIntersectionEdges (fmap snd thresholdNodes) thresholdNodes) thresholdNodes -- `using` PU.myParListChunkRDS - thresholdEdges = L.nub $ concat thresholdEdgesList - -- numPossibleEdges = ((length thresholdNodes * length thresholdNodes) - length thresholdNodes) `div` 2 - thresholdConsensusGraph = G.mkGraph thresholdNodes thresholdEdges -- O(n^3) - - -- thresholdConInfo = "There are " ++ show (length thresholdNodes) ++ " nodes present in >= " ++ (show threshold ++ "%") ++ " of input graphs and " ++ show numPossibleEdges ++ " candidate edges" - -- ++ " yielding a final graph with " ++ show (length (G.labNodes thresholdConsensusGraph)) ++ " nodes and " ++ show (length (G.labEdges thresholdConsensusGraph)) ++ " edges" - - -- add back labels for vertices and "GV.quickParams" for G.Gr String Double or whatever - labelledTresholdConsensusGraph' = addGraphLabels thresholdConsensusGraph totallLeafSet - labelledTresholdConsensusGraph'' = addEdgeFrequenciesToGraph labelledTresholdConsensusGraph' (length leafNodes) nodeFreqs - - -- Add urRoot and edges to existing roots if there are unconnected components and connnectComponets is True - labelledTresholdConsensusGraph = if not connectComponents then labelledTresholdConsensusGraph'' - else addUrRootAndEdges labelledTresholdConsensusGraph'' - gvRelabelledConsensusGraph = GO.renameSimpleGraphNodesString $ LG.reindexGraph $ changeVertexEdgeLabels vertexLabel edgeLabel labelledTresholdConsensusGraph - thresholdConsensusOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams gvRelabelledConsensusGraph - thresholdConsensusOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph labelledTresholdConsensusGraph] edgeLabel True - - -- - -- Create threshold EUN and dot string, orignial EUN is threshold = 0 - -- - allEdges = addAndReIndexEdges "all" unionNodes (concatMap G.labEdges (tail processedGraphs)) (G.labEdges $ head processedGraphs) - (thresholdEUNEdges, edgeFreqs) = getThresholdEdges threshold (length processedGraphs) allEdges - thresholdEUNGraph' = makeEUN unionNodes thresholdEUNEdges (G.mkGraph unionNodes thresholdEUNEdges) - - -- Remove unnconnected HTU nodes via postorder pass from leaves - thresholdEUNGraph = verticesByPostorder thresholdEUNGraph' leafNodes S.empty - -- thresholdEUNInfo = "\nThreshold EUN deleted " ++ show (length unionEdges - length (G.labEdges thresholdEUNGraph) ) ++ " of " ++ show (length unionEdges) ++ " total edges" - -- ++ " for a final graph with " ++ show (length (G.labNodes thresholdEUNGraph)) ++ " nodes and " ++ show (length (G.labEdges thresholdEUNGraph)) ++ " edges" - - -- add back labels for vertices and "GV.quickParams" for G.Gr String Double or whatever - thresholdLabelledEUNGraph' = addGraphLabels thresholdEUNGraph totallLeafSet - thresholdLabelledEUNGraph'' = addEdgeFrequenciesToGraph thresholdLabelledEUNGraph' (length leafNodes) edgeFreqs - - -- Add urRoot and edges to existing roots if there are unconnected components and connnectComponets is True - thresholdLabelledEUNGraph = if not connectComponents then thresholdLabelledEUNGraph'' - else addUrRootAndEdges thresholdLabelledEUNGraph'' - - -- Create EUN Dot String - gvRelabelledEUNGraph = GO.renameSimpleGraphNodesString $ LG.reindexGraph $ changeVertexEdgeLabels vertexLabel edgeLabel thresholdLabelledEUNGraph - thresholdEUNOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams gvRelabelledEUNGraph -- eunGraph - thresholdEUNOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph thresholdLabelledEUNGraph] edgeLabel True - - in - - if localMethod == "eun" then - if outputFormat == "dot" then (thresholdEUNOutDotString, gvRelabelledEUNGraph) - else if outputFormat == "fenewick" then (thresholdEUNOutFENString, gvRelabelledEUNGraph) - else errorWithoutStackTrace("Output graph format " ++ outputFormat ++ " is not implemented") - - else if localMethod == "adams" then - if outputFormat == "dot" then (adamsIIOutDotString, adamsII') - else if outputFormat == "fenewick" then (adamsIIOutFENString, adamsII') - else errorWithoutStackTrace("Output graph format " ++ outputFormat ++ " is not implemented") - - else if (localMethod == "majority") || (localMethod == "cun") || (localMethod == "strict") then - if outputFormat == "dot" then (thresholdConsensusOutDotString, gvRelabelledConsensusGraph) - else if outputFormat == "fenewick" then (thresholdConsensusOutFENString, gvRelabelledConsensusGraph) - else errorWithoutStackTrace("Output graph format " ++ outputFormat ++ " is not implemented") - - else errorWithoutStackTrace("Graph combination method " ++ localMethod ++ " is not implemented") - - --- | makeProcessedGraph takes a set of graphs and a leaf set and adds teh missing leafws to teh graphs and reindexes --- the nodes and edges of the input graphs consistenly --- String as oposed to Text due tyo reuse of code in Eun.c -makeProcessedGraph :: [LG.LNode T.Text] -> SimpleGraph -> SimpleGraph -makeProcessedGraph leafTextList inGraph = - if null leafTextList then error "Null leaf list in makeFullLeafSetGraph" - else if LG.isEmpty inGraph then error "Empty graph in makeFullLeafSetGraph" - else - let (_, graphleafTextList, _, _) = LG.splitVertexList inGraph - leafStringList = fmap nodeToString leafTextList - graphLeafStringList = fmap nodeToString graphleafTextList - reIndexedGraph = reIndexAndAddLeavesEdges leafStringList (graphLeafStringList, inGraph) - textNodes = fmap nodeToText $ LG.labNodes reIndexedGraph - doubleEdges = fmap edgeToDouble $ LG.labEdges reIndexedGraph - - in - LG.mkGraph textNodes doubleEdges - where nodeToString (a,b) = (a, T.unpack b) - nodeToText (a,b) = (a, T.pack b) - edgeToDouble (a,b,c) = (a,b, read c :: Double) diff --git a/pkg/PhyGraph/Reconciliation/ReconcileGraphs.hs b/pkg/PhyGraph/Reconciliation/ReconcileGraphs.hs deleted file mode 100644 index 9a1aeb785..000000000 --- a/pkg/PhyGraph/Reconciliation/ReconcileGraphs.hs +++ /dev/null @@ -1,241 +0,0 @@ -{- | -Module : ReconcileGraphs.hs -Description : Module to call graph reconciliation functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Reconciliation.ReconcileGraphs ( makeReconcileGraph - ) where - -import qualified Data.List as L -import qualified Data.Text.Lazy as T -import GeneralUtilities -import qualified GraphFormatUtilities as GFU -import qualified Reconciliation.Eun as E -import Types.Types -import qualified Utilities.LocalGraph as LG --- import Debug.Trace - --- | makeReconcileGraph is a wrapper around eun.hs functions to return String of reconciled graph -makeReconcileGraph :: [String] -> [(String, String)] -> [SimpleGraph] -> (String, SimpleGraph) -makeReconcileGraph validCommandList commandPairList inGraphList = - if null inGraphList then ("Error: No input graphs to reconcile", LG.empty) - else - let -- convert SimpleGraph to String String from Text Double - stringGraphs = fmap (GFU.modifyVertexEdgeLabels True True) $ fmap GFU.textGraph2StringGraph inGraphList - - -- parse arguements - commandList = fmap mergePair $ filter (('"' `notElem`).snd) commandPairList - (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat) = processReconcileArgs validCommandList commandList - - -- call EUN/reconcile functions - (reconcileString, reconcileGraph) = E.reconcile (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat,stringGraphs) - - -- convert eun format graph back to SimpleGraph - reconcileSimpleGraph = GFU.stringGraph2TextGraphDouble reconcileGraph - in - -- trace ("MRG :" ++ (show (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat)) ++ "\n" ++ reconcileString) - (reconcileString, reconcileSimpleGraph) - where mergePair (a,b) = if a /= [] && b /= [] then a ++ (':' : b) - else a ++ b - - --- | processReconcileArgs takes a list of strings and returns values of commands for proram execution --- including defaults --- checks commands for misspellings -processReconcileArgs :: [String] -> [String] -> (String, String, Int, Bool, Bool, Bool, String) -processReconcileArgs validCommandList inList' = - let inList = inList' L.\\ ["overwrite", "append", "reconcile"] - in - if null inList then - let -- default values - localMethod = "eun" - compareMethod = "combinable" - threshold = 0 - connectComponents = True - edgeLabel = True - vertexLabel = True - outputFormat = "dot" - in - (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat) - - else - -- trace ("Rec args: " ++ (show inList)) ( - let inTextList = fmap T.pack inList - inTextListLC = fmap T.toLower inTextList - commandList = filter (T.any (== ':')) inTextListLC - stringCommands = fmap (T.unpack . T.takeWhile (/= ':')) commandList - (editCostList, matchList) = unzip $ fmap (getBestMatch (maxBound :: Int ,"no suggestion") validCommandList) stringCommands - commandMatch = zip3 editCostList stringCommands matchList - notMatchedList = filter ((>0).fst3) commandMatch - localMethod = getMethod inTextListLC - compareMethod = getCompareMethod inTextListLC - connect = getConnect inTextListLC - edgeLabel = getEdgeLabel inTextListLC - vertexLabel = getVertexLabel inTextListLC - threshold - | localMethod == "cun" = 0 - | localMethod == "strict" = 100 - | otherwise = getThreshold inTextListLC - outFormat = getOutputFormat inTextListLC - in - if null notMatchedList then - (localMethod, compareMethod, threshold, connect, edgeLabel, vertexLabel, outFormat) - else errorWithoutStackTrace("\n\nError(s) in reconcile command specification (case insensitive):\n" ++ getCommandErrorString notMatchedList) - -- ) - --- | getMethod returns method value or dedfault otherwise --- assumes in lower case -getMethod :: [T.Text] -> String -getMethod inTextList = - -- default - if null inTextList then "eun" - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getMethod (tail inTextList) - else if firstCommand == T.pack "method" then - let option = T.unpack firstOption - in - if option == "eun" then "eun" - else if option == "cun" then "cun" - else if option == "majority" then "majority" - else if option == "strict" then "strict" - else if option == "adams" then "adams" - else errorWithoutStackTrace("Reconcile option \'" ++ option ++ "\' not recognized (eun|cun|majority|strict)") - else getMethod (tail inTextList) - --- | getCompareMethod returns compareMethod value or default otherwise --- assumes in lower case -getCompareMethod :: [T.Text] -> String -getCompareMethod inTextList = - -- default - if null inTextList then "combinable" - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getCompareMethod (tail inTextList) - else if firstCommand == T.pack "compare" then - let option = T.unpack firstOption - in - if option == "combinable" then "combinable" - else if option == "identity" then "identity" - else errorWithoutStackTrace("Compare option \'" ++ option ++ "\' not recognized (combinable|identity)") - else getCompareMethod (tail inTextList) - --- | getConect returns connect value or default otherwise (True|False) --- assumes in lower case -getConnect :: [T.Text] -> Bool -getConnect inTextList = - -- default - if null inTextList then False - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getConnect (tail inTextList) - else if firstCommand == T.pack "connect" then - let option = T.unpack firstOption - in - (option == "true") || (option /= "false" && errorWithoutStackTrace("Connect option \'" ++ option ++ "\' not recognized (True|False)")) - else getConnect (tail inTextList) - --- | getEdgeLabel returns edgeLabel value or default otherwise (True|False) --- assumes in lower case -getEdgeLabel :: [T.Text] -> Bool -getEdgeLabel inTextList = - -- default - if null inTextList then True - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getEdgeLabel (tail inTextList) - else if firstCommand == T.pack "edgelabel" then - let option = T.unpack firstOption - in - (option == "true") || (option /= "false" && errorWithoutStackTrace("EdgeLAbel option \'" ++ option ++ "\' not recognized (True|False)")) - else getEdgeLabel (tail inTextList) - --- | getVertexLabel returns edgeLabel value or default otherwise (True|False) --- assumes in lower case -getVertexLabel :: [T.Text] -> Bool -getVertexLabel inTextList = - -- default - if null inTextList then False - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getVertexLabel (tail inTextList) - else if firstCommand == T.pack "vertexlabel" then - let option = T.unpack firstOption - in - (option == "true") || (option /= "false" && errorWithoutStackTrace("VertexLabel option \'" ++ option ++ "\' not recognized (True|False)")) - else getVertexLabel (tail inTextList) - - --- | getThreshold returns threshold value or default otherwise --- assumes in lower case -getThreshold :: [T.Text] -> Int -getThreshold inTextList = - -- default - if null inTextList then 0 :: Int - else - let firstCommand = T.takeWhile (/= ':') $ head inTextList - firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList - in - if (T.find (== ':') (head inTextList) == Nothing) then getThreshold (tail inTextList) - else if firstCommand == T.pack "threshold" then read (T.unpack firstOption) :: Int - else getThreshold (tail inTextList) - --- | getOutputFormat returns output file format or default otherwise --- assumes in lower case -getOutputFormat :: [T.Text] -> String -getOutputFormat inTextList = - -- default - if null inTextList then "dot" - else - --removed prefix for output graph format - let firstOption = head inTextList - outFormat = T.unpack firstOption - in - if outFormat == "dot" then "dot" - else if outFormat == "fenewick" then "fenewick" - else if outFormat == "newick" then "fenewick" - else getOutputFormat (tail inTextList) - - --- | diff --git a/pkg/PhyGraph/Search/Build.hs b/pkg/PhyGraph/Search/Build.hs deleted file mode 100644 index c17908b2a..000000000 --- a/pkg/PhyGraph/Search/Build.hs +++ /dev/null @@ -1,350 +0,0 @@ -{- | -Module : Build.hs -Description : Module specifying graph building functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -Distance builds (Wagner, NJ, WPGMA are imported from Wag2020. -https://www.github.com/wardwheeler/wag2020 --} - -module Search.Build ( buildGraph - ) where - -import qualified Commands.Verify as VER -import Control.Parallel.Strategies -import Data.Char -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as TL -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import qualified Reconciliation.ReconcileGraphs as R -import qualified Search.DistanceMethods as DM -import qualified Search.DistanceWagner as DW -import qualified Search.WagnerBuild as WB -import qualified SymMatrix as M -import Text.Read -import Types.Types -import qualified Utilities.Distances as DD -import qualified Utilities.DistanceUtilities as DU -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import Debug.Trace - --- | buildGraph wraps around build tree--build trees and adds network edges after build if network --- with appropriate options --- transforms graph type to Tree for builds then back to initial graph type -buildGraph :: [Argument] -> GlobalSettings -> ProcessedData -> [[VertexCost]] -> Int-> [PhylogeneticGraph] -buildGraph inArgs inGS inData pairwiseDistances rSeed = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "build" fstArgList VER.buildArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'build': " ++ show inArgs) - else - let buildBlock = filter ((=="block").fst) lcArgList - displayBlock = filter ((=="displaytrees").fst) lcArgList - numDisplayTrees - | length displayBlock > 1 = - errorWithoutStackTrace ("Multiple displayTree number specifications in command--can have only one: " ++ show inArgs) - | null displayBlock = Just 10 - | null (snd $ head displayBlock) = Just 10 - | otherwise = readMaybe (snd $ head displayBlock) :: Maybe Int - - doEUN' = any ((=="eun").fst) lcArgList - doCUN' = any ((=="cun").fst) lcArgList - doEUN = if not doEUN' && not doCUN' then True - else doEUN' - returnTrees' = any ((=="displaytrees").fst) lcArgList - returnGraph' = any ((=="graph").fst) lcArgList - returnRandomDisplayTrees' = any ((=="atrandom").fst) lcArgList - returnFirst' = any ((=="first").fst) lcArgList - buildDistance = any ((=="distance").fst) lcArgList - - -- temprary change (if needed) to buyild tree structures - inputGraphType = graphType inGS - treeGS = inGS {graphType = Tree} - - -- really only trees now--but maybe later if can ensure phylogenetic graph from recocnile - (returnGraph, returnTrees) = if (graphType inGS) == Tree then (False, True) - else - if returnGraph' || returnTrees' then (returnGraph', returnTrees') - else (False, True) - - -- default to return reandom and overrides if both specified - (returnRandomDisplayTrees, _) = if returnRandomDisplayTrees' || returnFirst' then (returnRandomDisplayTrees', returnFirst') - else (True, False) - - -- initial build of trees from combined data--or by blocks - firstGraphs = if null buildBlock then - let simpleTreeOnly = False - in - buildTree simpleTreeOnly inArgs treeGS inData pairwiseDistances rSeed - else -- removing taxa with missing data for block - trace ("Block building initial graph(s)") ( - let simpleTreeOnly = True - processedDataList = U.getProcessDataByBlock True inData - distanceMatrixList = if buildDistance then PU.seqParMap rdeepseq DD.getPairwiseDistances processedDataList -- `using` PU.myParListChunkRDS - else replicate (length processedDataList) [] - - blockTrees = concat (PU.seqParMap rdeepseq (buildTree' simpleTreeOnly inArgs treeGS rSeed) (zip distanceMatrixList processedDataList)) -- `using` PU.myParListChunkRDS) - -- blockTrees = concat (PU.myChunkParMapRDS (buildTree' simpleTreeOnly inArgs treeGS inputGraphType seed) (zip distanceMatrixList processedDataList)) - - -- reconcile trees and return graph and/or display trees (limited by numDisplayTrees) already re-optimized with full data set - returnGraphs = reconcileBlockTrees rSeed blockTrees (fromJust numDisplayTrees) returnTrees returnGraph returnRandomDisplayTrees doEUN - in - -- trace (concatMap LG.prettify returnGraphs) - PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData True True Nothing) returnGraphs -- `using` PU.myParListChunkRDS - ) - costString = if (not . null) firstGraphs then ("\tBlock build yielded " ++ (show $ length firstGraphs) ++ " graphs at cost range " ++ (show (minimum $ fmap snd6 firstGraphs, maximum $ fmap snd6 firstGraphs))) - else "\t\tBlock build returned 0 graphs" - in - if isNothing numDisplayTrees then errorWithoutStackTrace ("DisplayTrees specification in build not an integer: " ++ show (snd $ head displayBlock)) - - -- trace ("BG:" ++ (show (graphType inGS, graphType treeGS)) ++ " bb " ++ (show buildBlock)) ( - else if inputGraphType == Tree || (not . null) buildBlock then - -- trace ("BB: " ++ (concat $ fmap LG.prettify $ fmap fst6 firstGraphs)) ( - if null buildBlock then firstGraphs - else trace (costString) firstGraphs - -- ) - else - trace ("\tRediagnosing as " ++ (show (graphType inGS))) - PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (fmap fst6 firstGraphs) -- `using` PU.myParListChunkRDS - -- ) - --- | reconcileBlockTrees takes a lists of trees (with potentially varying leave complement) and reconciled them --- as per the arguments producing a set of displayTrees (ordered or resolved random), and/or the reconciled graph --- all outputs are re-optimzed and ready to go -reconcileBlockTrees :: Int -> [PhylogeneticGraph] -> Int -> Bool -> Bool -> Bool -> Bool -> [SimpleGraph] -reconcileBlockTrees rSeed blockTrees numDisplayTrees returnTrees returnGraph returnRandomDisplayTrees doEUN = - --trace ("Reconcile producing " ++ (show numDisplayTrees)) ( - let -- numLeaves = V.length $ fst3 inData - -- fullLeafSet = zip [0..(numLeaves - 1)] (V.toList $ fst3 inData) - simpleGraphList = fmap fst6 blockTrees - -- fullLeafGraphList = fmap (E.makeProcessedGraph fullLeafSet) simpleGraphList - reconcileArgList = if doEUN then [("eun", []), ("vertexLabel:true", []), ("connect:True", [])] - else [("cun", []), ("vertexLabel:true", []), ("connect:True", [])] - - -- create reconciled graph--NB may NOT be phylogenetic graph--time violations etc. - reconciledGraphInitial = snd $ R.makeReconcileGraph VER.reconcileArgList reconcileArgList simpleGraphList - - -- ladderize, time consistent-ized, removed chained network edges, removed treenodes with all network edge children - reconciledGraph' = GO.convertGeneralGraphToPhylogeneticGraph "correct" reconciledGraphInitial - noChainedGraph = LG.removeChainedNetworkNodes reconciledGraph' - noTreeNdesWithAllNetChildern = LG.removeTreeEdgeFromTreeNodeWithAllNetworkChildren $ fromJust noChainedGraph - reconciledGraph = GO.contractIn1Out1EdgesRename noTreeNdesWithAllNetChildern - - - displayGraphs' = if not returnRandomDisplayTrees then take numDisplayTrees $ LG.generateDisplayTrees reconciledGraph - else LG.generateDisplayTreesRandom rSeed numDisplayTrees reconciledGraph - displayGraphs = fmap (GO.convertGeneralGraphToPhylogeneticGraph "correct") displayGraphs' - -- displayGraphs = fmap GO.ladderizeGraph $ fmap GO.renameSimpleGraphNodes displayGraphs' - in - if returnGraph && not returnTrees then - if isNothing noChainedGraph then error "Reconciled Graph generated chained network nodes that cannot be resolved. Perhaps try 'displayTrees' option" - else [reconciledGraph] - else if not returnGraph && returnTrees then - displayGraphs - else - if isNothing noChainedGraph then - trace ("Reconciled Graph generated chained network nodes that cannot be resolved. ONly retunring display trees") - displayGraphs - else reconciledGraph : displayGraphs - -- ) - - --- | buildTree' wraps build tree and changes order of arguments for mapping -buildTree' :: Bool-> [Argument] -> GlobalSettings -> Int -> ([[VertexCost]], ProcessedData) -> [PhylogeneticGraph] -buildTree' simpleTreeOnly inArgs inGS rSeed (pairwiseDistances, inData) = - buildTree simpleTreeOnly inArgs inGS inData pairwiseDistances rSeed - --- | buildTree takes build options and returns contructed graphList --- simpleTreeOnly (for block build) returns a single best tree to reduce edges in --- reconcile step -buildTree :: Bool -> [Argument] -> GlobalSettings ->ProcessedData -> [[VertexCost]] -> Int-> [PhylogeneticGraph] -buildTree simpleTreeOnly inArgs inGS inData@(nameTextVect, _, _) pairwiseDistances rSeed = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "build" fstArgList VER.buildArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'build': " ++ show inArgs) - else - - let buildDistance = any ((=="distance").fst) lcArgList - buildCharacter = any ((=="character").fst) lcArgList - repPairList = filter ((=="replicates").fst) lcArgList - numReplicates - | length repPairList > 1 = - errorWithoutStackTrace ("Multiple replicate number specifications in command--can have only one: " ++ show inArgs) - | null repPairList = Just 10 - | otherwise = readMaybe (snd $ head repPairList) :: Maybe Int - keepPairList = filter ((=="best").fst) lcArgList - numToSave - | length keepPairList > 1 = - errorWithoutStackTrace ("Multiple best number specifications in command--can have only one: " ++ show inArgs) - | null keepPairList = numReplicates - | otherwise = readMaybe (snd $ head keepPairList) :: Maybe Int - - in - if buildDistance && buildCharacter then - errorWithoutStackTrace ("Cannot specify both 'character' and 'distance' builds in same build command" ++ show inArgs) - else if isNothing numReplicates then errorWithoutStackTrace ("Replicates specification not an integer: " ++ show (snd $ head repPairList)) - else if isNothing numToSave then errorWithoutStackTrace ("Best specification not an integer: " ++ show (snd $ head keepPairList)) - else if buildDistance then - -- distance build - -- do all options in line and add together for return tree list - let doDWag = any ((=="dwag").fst) lcArgList - doRDwag = any ((=="rdwag").fst) lcArgList - doNJ = any ((=="nj").fst) lcArgList - doWPGMA = any ((=="wpgma").fst) lcArgList - doOTU = any ((=="otu").fst) lcArgList - doSPR = any ((=="spr").fst) lcArgList - doTBR = any ((=="tbr").fst) lcArgList - outgroupElem = outgroupIndex inGS - nameStringVect = fmap TL.unpack nameTextVect - distMatrix = M.fromLists pairwiseDistances - in - trace ("\tBuilding Distance Wagner") ( - let refinement - | doTBR = "tbr" - | doSPR = "spr" - | doOTU = "otu" - | otherwise = "none" - treeList = [distanceWagner simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement | doDWag] - treeList' = if doRDwag then treeList ++ randomizedDistanceWagner simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem (fromJust numReplicates) rSeed (fromJust numToSave) refinement - else treeList - treeList'' = if doNJ then treeList' ++ [neighborJoin simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement] - else treeList' - treeList''' = if doWPGMA then treeList'' ++ [wPGMA simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement] - else treeList'' - in - if null treeList''' then errorWithoutStackTrace ("Distance build is specified, but without any method: " ++ show inArgs) - else - let costRangeString = if (not simpleTreeOnly) then (" at cost range " ++ (show (minimum $ fmap snd6 treeList''', maximum $ fmap snd6 treeList'''))) - else "" - in - trace ("\tDistance build yielded " ++ (show $ length treeList''') ++ " trees" ++ costRangeString) ( - if (not simpleTreeOnly) then treeList''' - else GO.selectPhylogeneticGraph [("best", (show (1 :: Int)))] 0 ["best"] treeList''' - ) - ) - - else - -- character build - -- final diagnosis in input graph type - trace ("\tBuilding Character Wagner") ( - let treeList = WB.rasWagnerBuild inGS inData rSeed (fromJust numReplicates) - costRangeString = if (not simpleTreeOnly) then (" at cost range " ++ (show (minimum $ fmap snd6 treeList, maximum $ fmap snd6 treeList))) - else "" - in - if (not simpleTreeOnly) then - trace ("\tCharacter build yielded " ++ (show $ length treeList) ++ " tree(s)" ++ costRangeString) - treeList - else - trace ("\tCharacter build yielded " ++ (show $ length $ GO.selectPhylogeneticGraph [("best", (show (1 :: Int)))] 0 ["best"] treeList) ++ " tree(s)" ++ costRangeString) - GO.selectPhylogeneticGraph [("best", (show (1 :: Int)))] 0 ["best"] treeList - ) - --- | distanceWagner takes Processed data and pairwise distance matrix and returns --- 'best' addition sequence Wagner (defined in Farris, 1972) as fully decorated tree (as Graph) -distanceWagner :: Bool -> GlobalSettings -> ProcessedData -> V.Vector String -> M.Matrix Double -> Int -> String -> PhylogeneticGraph -distanceWagner simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = - let distWagTree = head $ DM.doWagnerS leafNames distMatrix "closest" outgroupValue "best" [] - distWagTree' = head $ DW.performRefinement refinement "best:1" "first" leafNames outgroupValue distWagTree - distWagTreeSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 distWagTree') - charInfoVV = V.map thd3 $ thd3 inData - in - if not simpleTreeOnly then T.multiTraverseFullyLabelGraph inGS inData False False Nothing (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) distWagTreeSimpleGraph) - else - let simpleWag = GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) distWagTreeSimpleGraph - in - (simpleWag, 0.0, LG.empty, V.empty, V.empty, charInfoVV) - --- | randomizedDistanceWagner takes Processed data and pairwise distance matrix and returns --- random addition sequence Wagner trees fully decorated tree (as Graph) -randomizedDistanceWagner :: Bool -> GlobalSettings -> ProcessedData -> V.Vector String -> M.Matrix Double -> Int -> Int -> Int -> Int -> String -> [PhylogeneticGraph] -randomizedDistanceWagner simpleTreeOnly inGS inData leafNames distMatrix outgroupValue numReplicates rSeed numToKeep refinement = - let randomizedAdditionSequences = V.fromList <$> shuffleInt rSeed numReplicates [0..(length leafNames - 1)] - randomizedAdditionWagnerTreeList = DM.doWagnerS leafNames distMatrix "random" outgroupValue "random" randomizedAdditionSequences - randomizedAdditionWagnerTreeList' = take numToKeep $ L.sortOn thd4 randomizedAdditionWagnerTreeList - randomizedAdditionWagnerTreeList'' = head <$> PU.seqParMap PU.myStrategy (DW.performRefinement refinement "best:1" "first" leafNames outgroupValue) randomizedAdditionWagnerTreeList' - randomizedAdditionWagnerSimpleGraphList = fmap (DU.convertToDirectedGraphText leafNames outgroupValue . snd4) randomizedAdditionWagnerTreeList'' - charInfoVV = V.map thd3 $ thd3 inData - in - if not simpleTreeOnly then fmap ((T.multiTraverseFullyLabelGraph inGS inData False False Nothing . GO.renameSimpleGraphNodes . GO.dichotomizeRoot outgroupValue) . LG.switchRootTree (length leafNames)) randomizedAdditionWagnerSimpleGraphList - else - let numTrees = length randomizedAdditionWagnerSimpleGraphList - simpleRDWagList = fmap ((GO.dichotomizeRoot outgroupValue) . LG.switchRootTree (length leafNames)) randomizedAdditionWagnerSimpleGraphList - in - L.zip6 simpleRDWagList (replicate numTrees 0.0) (replicate numTrees LG.empty) (replicate numTrees V.empty) (replicate numTrees V.empty) (replicate numTrees charInfoVV) - --- | neighborJoin takes Processed data and pairwise distance matrix and returns --- Neighbor-Joining tree as fully decorated tree (as Graph) -neighborJoin :: Bool -> GlobalSettings -> ProcessedData -> V.Vector String -> M.Matrix Double -> Int -> String -> PhylogeneticGraph -neighborJoin simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = - let njTree = DM.neighborJoining leafNames distMatrix outgroupValue - njTree' = head $ DW.performRefinement refinement "best:1" "first" leafNames outgroupValue njTree - njSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 njTree') - charInfoVV = V.map thd3 $ thd3 inData - in - if not simpleTreeOnly then T.multiTraverseFullyLabelGraph inGS inData False False Nothing (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) njSimpleGraph) - else - let simpleNJ = GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) njSimpleGraph - in - (simpleNJ, 0.0, LG.empty, V.empty, V.empty, charInfoVV) - - --- | wPGMA takes Processed data and pairwise distance matrix and returns --- WPGMA tree as fully decorated tree (as Graph) --- since root index not nOTUs as with other tres--chanegd as with dWag and NJ to make consistent. -wPGMA :: Bool -> GlobalSettings -> ProcessedData -> V.Vector String -> M.Matrix Double -> Int -> String -> PhylogeneticGraph -wPGMA simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = - let wpgmaTree = DM.wPGMA leafNames distMatrix outgroupValue - wpgmaTree' = head $ DW.performRefinement refinement "best:1" "first" leafNames outgroupValue wpgmaTree - wpgmaSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 wpgmaTree') - charInfoVV = V.map thd3 $ thd3 inData - in - if not simpleTreeOnly then T.multiTraverseFullyLabelGraph inGS inData False False Nothing (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) wpgmaSimpleGraph) - else - let simpleWPGMA = GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) wpgmaSimpleGraph - in - (simpleWPGMA, 0.0, LG.empty, V.empty, V.empty, charInfoVV) diff --git a/pkg/PhyGraph/Search/DistanceMethods.hs b/pkg/PhyGraph/Search/DistanceMethods.hs deleted file mode 100644 index 49710d76b..000000000 --- a/pkg/PhyGraph/Search/DistanceMethods.hs +++ /dev/null @@ -1,381 +0,0 @@ -{- | -Module : DistanceMethods.hs -Description : Module to calculate distance tree construction methods Neightbor-Joining, WPGMA, and WPGMA - -- but with added refinement based on 4-point metric -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - -I --} - -{-# LANGUAGE BangPatterns #-} - -module Search.DistanceMethods (neighborJoining, wPGMA, doWagnerS, performWagnerRefinement) where - -import qualified Data.Number.Transfinite as NT -import qualified Data.Vector as V -import Debug.Trace -import ParallelUtilities as PU -import qualified Search.DistanceWagner as W -import qualified SymMatrix as M -import Types.DistanceTypes -import Utilities.DistanceUtilities ---import qualified LocalSequence as LS -import Control.Parallel.Strategies -import qualified Data.Vector as LS - - --- | wPGMA takes a list of leaves and a distance matrixx and returns --- an WGPGMA tree --- WPGMA not UPGMA since the linkages are from the two descendent linkages and not weighted by --- the number of taxa in each group as in https://en.wikipedia.org/wiki/WPGMA -wPGMA :: V.Vector String -> M.Matrix Double -> Int -> TreeWithData -wPGMA leafNames distMatrix outgroup = - if M.null distMatrix then error "Null matrix in WPGMA" - else - trace "\n\tBuilding WPGMA tree" ( - let numLeaves = V.length leafNames - leafVertexVect = V.fromList [0..(numLeaves - 1)] - ((vertexVect, edgeVect), finalMatrix) = addTaxaWPGMA distMatrix numLeaves (leafVertexVect, V.empty) [] - wPGMATree' = convertLinkagesToEdgeWeights vertexVect (V.toList edgeVect) (V.toList edgeVect) [] numLeaves - -- linkage leves are not same as edgeweights so need to be converted - newickString = convertToNewick leafNames outgroup wPGMATree' - treeCost = getTreeCost wPGMATree' - in - --trace (show wPGMATree ++ "\n" ++ show wPGMATree') - (take (length newickString - 3) newickString ++ "[" ++ show treeCost ++ "];\n", wPGMATree', treeCost, finalMatrix) - ) - --- | pulls dWagner function from module Wagner -doWagnerS :: V.Vector String -> M.Matrix Double -> String -> Int -> String -> [V.Vector Int]-> [TreeWithData] -doWagnerS leafNames distMatrix firstPairMethod outgroup addSequence replicateSequences = - trace ("\tBuilding " ++ (show $ length replicateSequences) ++ " Wagner tree(s)") - W.doWagnerS leafNames distMatrix firstPairMethod outgroup addSequence replicateSequences - --- | pulls Wagner refinement from Wagner module -performWagnerRefinement :: String -> String -> String -> V.Vector String -> Int -> TreeWithData -> [TreeWithData] -performWagnerRefinement = W.performRefinement - --- | neighborJoining takes a list of leaves and a distance matrixx and returns --- an NJ tree -neighborJoining :: V.Vector String -> M.Matrix Double -> Int -> TreeWithData -neighborJoining leafNames distMatrix outgroup = - if M.null distMatrix then error "Null matrix in neighborJoining" - else - trace "\n\tBuilding NJ tree" ( - -- get intial matrices - let -- initialBigDMatrix = makeDMatrix distMatrix [] -- 0 0 [] - numLeaves = V.length leafNames - leafVertexVect = V.fromList [0..(numLeaves - 1)] - (nJTree, finalLittleDMatrix) = addTaxaNJ distMatrix numLeaves (leafVertexVect, V.empty) [] - newickString = convertToNewick leafNames outgroup nJTree - treeCost = getTreeCost nJTree - in - --trace (show nJTree) - (take (length newickString - 3) newickString ++ "[" ++ show treeCost ++ "];\n", nJTree, treeCost, finalLittleDMatrix) - ) - --- | sumAvail sums the row only thos values not already added to tree -sumAvail :: [Int] -> Int -> [Double] -> Double -sumAvail vertInList index distList - | null distList = 0.0 - | index `elem` vertInList = sumAvail vertInList (index + 1) (tail distList) - | otherwise = - let firstDist = head distList - in - firstDist + sumAvail vertInList (index + 1) (tail distList) - --- | makeDMatrixRow make a single row of the bif D matrix -makeDMatrixRow :: M.Matrix Double -> [Int] -> Int -> Int -> V.Vector Double -- LS.Seq Double -makeDMatrixRow inObsMatrix vertInList column row - | M.null inObsMatrix = error "Null matrix in makeInitialDMatrix" - | row `elem` vertInList = LS.replicate (LS.length (inObsMatrix LS.! row)) NT.infinity - | column == LS.length (inObsMatrix LS.! row) = LS.empty - | column == row = LS.cons 0.0 (makeDMatrixRow inObsMatrix vertInList (column + 1) row) - | column `notElem` vertInList = - let dij = inObsMatrix M.! (row, column) - divisor = (fromIntegral (M.rows inObsMatrix) - 2) - fromIntegral (length vertInList) - ri = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix row) - rj = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix column) - bigDij = dij - ((ri + rj) / divisor) - in - LS.cons bigDij (makeDMatrixRow inObsMatrix vertInList (column + 1) row) - | otherwise = LS.cons NT.infinity (makeDMatrixRow inObsMatrix vertInList (column + 1) row) - - --- | makeIDMatrix makes adjusted matrix (D) from observed (d) values --- assumes matrix is square and symmetrical --- makes values Infinity if already added --- adjust ri and rj to bew based on on values not in termInList --- does by row so can be parallelized call with column = 0 update list [] --- makes DMatrix direclty not via M.updateMatrix -makeDMatrix :: M.Matrix Double -> [Int] -> M.Matrix Double -makeDMatrix inObsMatrix vertInList = - if M.null inObsMatrix then error "Null matrix in makeInitialDMatrix" - else - let newMatrix = PU.seqParMap rdeepseq (makeDMatrixRow inObsMatrix vertInList 0) [0..(M.rows inObsMatrix - 1)] -- `using` PU.myParListChunkRDS - in - LS.fromList newMatrix - - -{- --- | makeIDMatrix makes adjusted matrix (D) from observed (d) values --- assumes matrix is square and symmetrical --- makes values Infinity if already added - -- adjust ri and rj to bew based on on values not in termInList -makeDMatrix' :: M.Matrix Double -> [Int] -> Int -> Int -> [(Int, Int, Double)] -> M.Matrix Double -makeDMatrix' inObsMatrix vertInList row column updateList - | M.null inObsMatrix = error "Null matrix in makeInitialDMatrix" - | row == M.rows inObsMatrix = M.updateMatrix inObsMatrix updateList - | column == M.cols inObsMatrix = makeDMatrix' inObsMatrix vertInList (row + 1) 0 updateList - | column == row = makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, 0.0) : updateList) - | (column `elem` vertInList) || (row `elem` vertInList) = - makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, NT.infinity) : updateList) - | otherwise = - let dij = inObsMatrix M.! (row, column) - divisor = (fromIntegral (M.rows inObsMatrix) - 2) - fromIntegral (length vertInList) - ri = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix row) - rj = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix column) - bigDij = dij - ((ri + rj) / divisor) - in - makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, bigDij) : updateList) --} - --- | pickNearestUpdateMatrix takes d and D matrices, pickes nearest based on D --- then updates d and D to reflect new node and distances created --- updates teh column/row for vertices that are joined to be infinity so --- won't be chosen to join again -pickNearestUpdateMatrixNJ :: M.Matrix Double -> [Int] -> (M.Matrix Double, Vertex, Edge, Edge, [Int]) -pickNearestUpdateMatrixNJ littleDMatrix vertInList - | M.null littleDMatrix = error "Null d matrix in pickNearestUpdateMatrix" - | otherwise = - let (iMin, jMin, distIJ) = getMatrixMinPairTabu (makeDMatrix littleDMatrix vertInList) vertInList - --(iMin, jMin, distIJ) = getMatrixMinPairTabu (makeDMatrix' littleDMatrix vertInList 0 0 []) vertInList - in - --trace ("First pair " ++ show (iMin, jMin, distIJ) ++ " matrix: " ++ (show (makeDMatrix littleDMatrix vertInList)) - -- ++ "\nVertsIn: " ++ show vertInList ++ " matrix': " ++ show (makeDMatrix' littleDMatrix vertInList 0 0 [])) ( - if distIJ == NT.infinity then error "No minimum found in pickNearestUpdateMatrix" - else - let -- new vertex is size of distance matrix (0 indexed) - newVertIndex = M.rows littleDMatrix - dij = littleDMatrix M.! (iMin, jMin) - divisor = fromIntegral (M.rows littleDMatrix) - 2 - fromIntegral (length vertInList) - -- only count values of those in - ri = (sumAvail vertInList 0 $ M.getFullRow littleDMatrix iMin) - rj = (sumAvail vertInList 0 $ M.getFullRow littleDMatrix jMin) - -- seem reversed compared to examples, seems arbitrary to me (for leaf pairs at least) - -- diMinNewVert = (dij / 2.0) - ((ri - rj) / (2.0 * divisor)) - -- djMinNewVert = dij - diMinNewVert - djMinNewVert = (dij / 2.0) - ((ri - rj) / (2.0 * divisor)) - diMinNewVert = dij - djMinNewVert - - newVertInList = vertInList ++ [iMin, jMin] - - -- get distances to existing vertices - otherVertList = [0..(M.rows littleDMatrix - 1)] - newLittleDRow = PU.seqParMap rdeepseq (getNewDist littleDMatrix dij iMin jMin diMinNewVert djMinNewVert) otherVertList -- `using` myParListChunkRDS - newLittleDMatrix = M.addMatrixRow littleDMatrix (LS.fromList $ newLittleDRow ++ [0.0]) - -- recalculate whole D matrix since new row affects all the original ones (except those merged) - -- included vertex values set to infinity so won't be chosen later - -- newBigDMatrix = makeDMatrix newLittleDMatrix newVertInList -- 0 0 [] - - -- create new edges - newEdgeI = (newVertIndex, iMin, diMinNewVert) - newEdgeJ = (newVertIndex, jMin, djMinNewVert) - in - (newLittleDMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) - --) - --- | getNewDist get ditance of new vertex to existing vertices -getNewDist :: M.Matrix Double -> Double-> Int -> Int -> Double -> Double -> Int -> Double -getNewDist littleDMatrix dij iMin jMin diMinNewVert djMinNewVert otherVert - | otherVert == iMin = diMinNewVert - | otherVert == jMin = djMinNewVert - | otherwise = - let dik = littleDMatrix M.! (iMin, otherVert) - djk = littleDMatrix M.! (jMin, otherVert) - in - (dik + djk - dij) / 2.0 - --- | addTaxaNJ recursively calls pickNearestUpdateMatrix untill all internal nodes are created --- recursively called until all (n - 2) internal vertices are created. -addTaxaNJ :: M.Matrix Double -> Int -> Tree -> [Int] -> (Tree, M.Matrix Double) -addTaxaNJ littleDMatrix numLeaves (vertexVect, edgeVect) vertInList = - if V.length vertexVect == (2 * numLeaves) - 2 then - let --(iMin, jMin, _) = getMatrixMinPairTabu (makeDMatrix littleDMatrix vertInList) vertInList - last2 = subtractVector (V.fromList vertInList) vertexVect - iMin = last2 V.! 0 - jMin = last2 V.! 1 - {-This is wrong--not last two - iMin = vertexVect V.! ((V.length vertexVect) - 1) - jMin = vertexVect V.! ((V.length vertexVect) - 2) - -} - lastEdge = (iMin, jMin, littleDMatrix M.! (iMin, jMin)) - in - {- - trace (show last2 ++ " from " ++ show vertexVect ++ "\nlast edge: " ++ " size " ++ (show $ V.length vertexVect) ++ " matrix: " ++ (show littleDMatrix) - ++ " edge: " - ++ (show lastEdge) ++ " vertInList: " ++ show vertInList) - -} - ((vertexVect, edgeVect `V.snoc` lastEdge), littleDMatrix) - -- more to add - else - let !(newLittleDMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) = pickNearestUpdateMatrixNJ littleDMatrix vertInList - newVertexVect = vertexVect `V.snoc` newVertIndex - newEdgeVect = edgeVect V.++ V.fromList [newEdgeI, newEdgeJ] - in - --trace (M.showMatrixNicely newLittleDMatrix ++ "\n" ++ M.showMatrixNicely bigDMatrix) - let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (V.length vertexVect - numLeaves))/fromIntegral (numLeaves - 2)) :: Double) - (percentAdded, _) = divMod (100 * (V.length vertexVect - numLeaves)) (numLeaves - 2) - (decileNumber, decileRemainder) = divMod percentAdded 10 - (_, oddRemainder) = divMod (V.length vertexVect - numLeaves) 2 - in - if decileRemainder == 0 && oddRemainder == 0 then - trace ("\t\t"++ (show $ 10 * decileNumber) ++ "%") - addTaxaNJ newLittleDMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList - else addTaxaNJ newLittleDMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList - - --- | addTaxaWPGMA perfomrs recursive reduction of distance matrix until all internal vertices are created -addTaxaWPGMA :: M.Matrix Double -> Int -> Tree -> [Int] -> (Tree, M.Matrix Double) -addTaxaWPGMA distMatrix numLeaves (vertexVect, edgeVect) vertInList = - if V.length vertexVect == (2 * numLeaves) - 2 then - let --(iMin, jMin, _) = getMatrixMinPairTabu distMatrix vertInList -- (-1, -1, NT.infinity) 0 0 - last2 = subtractVector (V.fromList vertInList) vertexVect - iMin = last2 V.! 0 - jMin = last2 V.! 1 - lastEdge = (iMin, jMin, distMatrix M.! (iMin, jMin)) - in - {- - trace ((show last2) ++ " from " ++ show vertexVect ++ "\nlast edge: " ++ " size " ++ (show $ V.length vertexVect) ++ " matrix: " ++ (show distMatrix) - ++ " last edge: " - ++ (show lastEdge) ++ " vertInList: " ++ (show vertInList) - ++ " all edges " ++ show (edgeVect `V.snoc` lastEdge)) - -} - ((vertexVect, edgeVect `V.snoc` lastEdge), distMatrix) - - else -- building - let !(newDistMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) = pickUpdateMatrixWPGMA distMatrix vertInList - newVertexVect = vertexVect `V.snoc` newVertIndex - newEdgeVect = edgeVect V.++ V.fromList [newEdgeI, newEdgeJ] - in - --trace (M.showMatrixNicely distMatrix) - let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (V.length vertexVect - numLeaves))/fromIntegral (numLeaves - 2)) :: Double) - (percentAdded, _) = divMod (100 * (V.length vertexVect - numLeaves)) (numLeaves - 2) - (decileNumber, decileRemainder) = divMod percentAdded 10 - (_, oddRemainder) = divMod (V.length vertexVect - numLeaves) 2 - in - if decileRemainder == 0 && oddRemainder == 0 then - trace ("\t\t"++ (show $ 10 * decileNumber) ++ "%") - addTaxaWPGMA newDistMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList - else addTaxaWPGMA newDistMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList - - --- | pickUpdateMatrixWPGMA takes d matrix, pickes closesst based on d --- then updates d to reflect new node and distances created --- updates the column/row for vertices that are joined to be infinity so --- won't be chosen to join again -pickUpdateMatrixWPGMA :: M.Matrix Double -> [Int] -> (M.Matrix Double, Vertex, Edge, Edge, [Int]) -pickUpdateMatrixWPGMA distMatrix vertInList = - if M.null distMatrix then error "Null d matrix in pickNearestUpdateMatrix" - else - let (iMin, jMin, dij) = getMatrixMinPairTabu distMatrix vertInList -- (-1, -1, NT.infinity) 0 0 - in - --trace ("First pair " ++ show (iMin, jMin, dij)) ( - if dij == NT.infinity then error "No minimum found in pickNearestUpdateMatrix" - else - let -- new vertex is size of distance matrix (0 indexed) - newVertIndex = M.rows distMatrix - - diMinNewVert = dij /2.0 - djMinNewVert = dij /2.0 - - newVertInList = vertInList ++ [iMin, jMin] - - -- get distances to existing vertices - otherVertList = [0..(M.rows distMatrix - 1)] - newDistRow = PU.seqParMap rdeepseq (getNewDistWPGMA distMatrix iMin jMin diMinNewVert djMinNewVert) otherVertList -- `using` myParListChunkRDS - newDistMatrix = M.addMatrixRow distMatrix (LS.fromList $ newDistRow ++ [0.0]) - - -- create new edges - newEdgeI = (newVertIndex, iMin, diMinNewVert) - newEdgeJ = (newVertIndex, jMin, djMinNewVert) - in - (newDistMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) - --) - --- | getNewDistWPGMA get ditance of new vertex to existing vertices WPGMA--cluster levels -getNewDistWPGMA :: M.Matrix Double -> Int -> Int -> Double -> Double -> Int -> Double -getNewDistWPGMA distMatrix iMin jMin diMinNewVert djMinNewVert otherVert - | otherVert == iMin = diMinNewVert - | otherVert == jMin = djMinNewVert - | otherwise = - let dik = distMatrix M.! (iMin, otherVert) - djk = distMatrix M.! (jMin, otherVert) - in - (dik + djk) / 2.0 - --- | convertLinkagesToEdgeWeights converts linake leevs to branch lengths --- by subtracting descnedent linakge from edge weight --- edges are created in linakege order so can use that direction, leaves are --- always 2nd element in edge -convertLinkagesToEdgeWeights :: V.Vector Vertex -> [Edge] -> [Edge] -> [Edge] -> Int -> Tree -convertLinkagesToEdgeWeights vertexVect fullEdgeList inEdgeList curEdgeList numLeaves - | null fullEdgeList = error "Null edge set in convertLinkagesToEdgeWeights" - | V.null vertexVect = error "Null vertex set in convertLinkagesToEdgeWeights" - | null inEdgeList = (vertexVect, V.fromList curEdgeList) - | null curEdgeList = - -- first case, take last edge (highest linkage) special case need - -- to subtract both descendent linkages - let (eVert, uVert, linkage) = last inEdgeList - eDescLinkage = if eVert >= numLeaves then getWeightDescLink eVert fullEdgeList else 0.0 - uDescLinkage = if uVert >= numLeaves then getWeightDescLink uVert fullEdgeList else 0.0 - newEdgeList = [(eVert, uVert, linkage - eDescLinkage - uDescLinkage)] - in - convertLinkagesToEdgeWeights vertexVect fullEdgeList (init inEdgeList) newEdgeList numLeaves - | otherwise = -- not first but still some edegs to go - let (eVert, uVert, linkage) = head inEdgeList - uDescLinkage = if uVert >= numLeaves then getWeightDescLink uVert fullEdgeList else 0.0 - newEdgeList = (eVert, uVert, linkage - uDescLinkage) : curEdgeList - in - convertLinkagesToEdgeWeights vertexVect fullEdgeList (tail inEdgeList) newEdgeList numLeaves - --- | getWeightDescLink takes the m,pore derived (smaller linkage, 2nd vertex of edge) and returns --- the weight of that edge (assumes not leaf--checked earlier) -getWeightDescLink :: Int -> [Edge] -> Double -getWeightDescLink uVert fullEdgeList = - if null fullEdgeList then error "Edge not found in getWeightDescLink" - else - let (eVert, _, weight) = head fullEdgeList - in - if eVert == uVert then weight - else getWeightDescLink uVert (tail fullEdgeList) - diff --git a/pkg/PhyGraph/Search/DistanceWagner.hs b/pkg/PhyGraph/Search/DistanceWagner.hs deleted file mode 100644 index 68eb6be7c..000000000 --- a/pkg/PhyGraph/Search/DistanceWagner.hs +++ /dev/null @@ -1,923 +0,0 @@ -{- | -Module : Wagner.hs -Description : Module with distance tree construction methods Distance Wagner Farris 1972 - -- but with added refinement based on 4-point metric -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -Need to integerize costs for swapping very slow on Double values - do due spurious precision --} - -module Search.DistanceWagner (doWagnerS, performRefinement) where - -import Control.Parallel.Strategies -import Data.Maybe -import qualified Data.Number.Transfinite as NT -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import ParallelUtilities as PU -import qualified SymMatrix as M -import Types.DistanceTypes -import Utilities.DistanceUtilities ---import qualified LocalSequence as LS -import qualified Data.Vector as LS - --- | getStartingPair returns starying pair for Wagner build --- closts mnimal cost pair --- furthest maximal cost pair --- random chooses uniformly at random from leaf set -getStartingPair :: String -> M.Matrix Double -> Edge -getStartingPair choiceOpt distMatrix - | choiceOpt == "closest" = getMatrixMinPair distMatrix (-1, -1, NT.infinity) 0 0 - | choiceOpt == "furthest" = getMatrixMaxPair distMatrix (-1 , -1, 0 :: Double) 0 0 - | choiceOpt == "random" = errorWithoutStackTrace "Initial pair option 'random' not yet implemented" - | otherwise = errorWithoutStackTrace ("Initial pair option " ++ choiceOpt ++ " unrecognized. Must be 'closest', 'furthest', or 'random'") - --- | getBestEdgeTree take list of edge tuples and return trhe one with best addition cost -getBestEdgeTree :: V.Vector (Double, Tree, M.Matrix Double) -> Double -> (Double, Tree, M.Matrix Double) -> (Double, Tree, M.Matrix Double) -getBestEdgeTree edgeTreeList curBestCost curBestResult = - if V.null edgeTreeList then curBestResult - else - let (firstAddCost, _, _) = V.head edgeTreeList - in - if firstAddCost < curBestCost then getBestEdgeTree (V.tail edgeTreeList) firstAddCost (V.head edgeTreeList) - else getBestEdgeTree (V.tail edgeTreeList) curBestCost curBestResult - --- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix --- but this for swap so returns entire new3-edge cost so not Farris triangle it is sum of three diveded by 2 -addToEdgeSwap :: M.Matrix Double -> Int -> Tree -> Int -> Edge -> (Double, Tree, M.Matrix Double) -addToEdgeSwap distMatrix leaf initialTree newLeafIndex inEdge = - let (eVertex, uVertex, inWeight) = inEdge - (initialVertexVect, initialEdgeVect) = initialTree - addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 - eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost - uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost - newVertexVect = V.snoc initialVertexVect leaf - newEdges = V.fromList [(leaf,newLeafIndex, addCost),(eVertex, newLeafIndex, eVertLeafDist),(uVertex, newLeafIndex, uVertLeafDist)] - cleanupEdges = V.filter (/= inEdge) initialEdgeVect - newEdgeVect = cleanupEdges V.++ newEdges - newTree = (newVertexVect, newEdgeVect) - -- add new costs from added vertex to each reamaining leaf - augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf - in - (addCost + eVertLeafDist + uVertLeafDist - inWeight , newTree, augmentedDistMatrix) - - --- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix -addToEdge :: M.Matrix Double -> Int -> Tree -> Int -> Edge -> (Double, Tree, M.Matrix Double) -addToEdge distMatrix leaf initialTree newLeafIndex inEdge = - -- trace ("In addToEdge with " ++ (show (leaf, initialTree, newLeafIndex, (M.rows distMatrix), inEdge))) ( - let (eVertex, uVertex, _) = inEdge - (initialVertexVect, initialEdgeVect) = initialTree - addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 - eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost - uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost - newVertexVect = V.snoc initialVertexVect leaf - newEdges = V.fromList [(leaf,newLeafIndex, addCost),(eVertex, newLeafIndex, eVertLeafDist),(uVertex, newLeafIndex, uVertLeafDist)] - cleanupEdges = V.filter (/= inEdge) initialEdgeVect - newEdgeVect = V.map orderEdge $ cleanupEdges V.++ newEdges - newTree = (newVertexVect, newEdgeVect) - -- add new costs from added vertex to each reamaining leaf - augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf - in - (addCost, newTree, augmentedDistMatrix) - - --- | addTaxonToTree takes distMatrix, an initialTree, Vector of leavesToAdd, and leaf index to add --- and retursn a tuple wiht the addition cost, the new tree, the new leaves to add, and new distance matrix (enhanced) -addTaxonToTree :: M.Matrix Double -> Tree -> V.Vector Int -> Int -> Int -> (Double, Tree, V.Vector Int, M.Matrix Double) -addTaxonToTree distMatrix initialTree leavesToAdd newVertexIndex leaf = - if V.null leavesToAdd then (0.0, initialTree, leavesToAdd, distMatrix) - else - let leavesRemaining = V.filter (/= leaf) leavesToAdd - (_, edgesInitial) = initialTree - - -- Parallelize heretoo much and destroys lazy matrix update - addEdgeList = V.map (addToEdge distMatrix leaf initialTree newVertexIndex) edgesInitial - (firstAddCost, _, _) = V.head addEdgeList -- this to initialize getBestEdge below - in - --filter for best addition point - let (addCost, newTree, augmentedDistMatrix) = getBestEdgeTree (V.tail addEdgeList) firstAddCost (V.head addEdgeList) - in - (addCost, newTree, leavesRemaining, augmentedDistMatrix) - --- | getBestLeafAdd chooses best leaf to add based on cost field -getBestLeafAdd :: V.Vector (Double, Tree, V.Vector Int, M.Matrix Double) -> Double -> (Double, Tree, V.Vector Int, M.Matrix Double) -> (Double, Tree, V.Vector Int, M.Matrix Double) -getBestLeafAdd addPosVect curBestCost curBestLeaf = - if V.null addPosVect then curBestLeaf - else - let (thisCost, _, _, _) = V.head addPosVect - in - if thisCost < curBestCost then getBestLeafAdd (V.tail addPosVect) thisCost (V.head addPosVect) - else getBestLeafAdd (V.tail addPosVect) curBestCost curBestLeaf - - --- | wagBest takes distMatrix, and intial tree of two leaves, a vector of leavesToAdd, the input nuber of leaves --- and returns the Farris 1972 distance Wagner adding the "closest" leaf at each iteration -wagBest :: M.Matrix Double -> Tree -> V.Vector Int -> Int -> Int -> V.Vector Int -> String -> (Tree, V.Vector Int, M.Matrix Double) -wagBest distMatrix inTree leavesToAdd nOTUs newVertexIndex leavesToMap choiceOpt - | length leavesToAdd == nOTUs = - let (eVertex, uVertex, edgeWeight) = orderEdge $ getStartingPair choiceOpt distMatrix - initialTree = (V.fromList[eVertex, uVertex],V.fromList [(eVertex, uVertex, edgeWeight)]) - leavesToAdd' = V.filter (/= eVertex) $ V.filter (/= uVertex) leavesToAdd - in - wagBest distMatrix initialTree leavesToAdd' nOTUs nOTUs leavesToAdd' choiceOpt - | V.null leavesToAdd = (inTree, leavesToAdd, distMatrix) - | otherwise = - let addPosVect = V.map (addTaxonToTree distMatrix inTree leavesToAdd newVertexIndex) leavesToMap - (firstLeafCost, _, _, _ ) = V.head addPosVect -- To initialize below - (_, newTree, newLeavesToAdd, augmentedDistMatrix) = getBestLeafAdd (V.tail addPosVect) firstLeafCost (V.head addPosVect) - in - let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (newVertexIndex - nOTUs))/fromIntegral (nOTUs - 2)) :: Double) - (percentAdded, _) = divMod (100 * (newVertexIndex - nOTUs)) (nOTUs - 2) - (decileNumber, decileRemainder) = divMod percentAdded 10 - (_, oddRemainder) = divMod (newVertexIndex - nOTUs) 2 - in - --trace (show (percentAdded, decileNumber, decileRemainder)) ( - if decileRemainder == 0 && oddRemainder == 0 then - trace ("\t\t"++ (show $ 10 * decileNumber) ++ "%") - wagBest augmentedDistMatrix newTree newLeavesToAdd nOTUs (newVertexIndex + 1) newLeavesToAdd choiceOpt - else wagBest augmentedDistMatrix newTree newLeavesToAdd nOTUs (newVertexIndex + 1) newLeavesToAdd choiceOpt - --) - - --- | calculateWagnerTrees takes an input distance matrix (and options later) and returns --- a tree (V,E) discription of Wagner tree with labelled internal veritices and branch lengths -calculateWagnerTrees :: M.Matrix Double -> String -> (Tree, M.Matrix Double) -calculateWagnerTrees distMatrix choiceOpt = - if M.dim distMatrix == (0,0) then errorWithoutStackTrace "Null distance matrix" - else - -- get initial pair of leaves and create initial tree - let nOTUs = M.cols distMatrix - allLeaves = V.fromList [0..(nOTUs - 1)] - in - let (newTree, _, endMatrix) = wagBest distMatrix (V.empty, V.empty) allLeaves nOTUs nOTUs allLeaves choiceOpt - in - (newTree, endMatrix) - --- | makeTreeFromOrder takes an input order and other arguemnts and cretes tree using a single additoin --- seqeunce, best plaecment for the leaf each round -makeTreeFromOrder :: M.Matrix Double -> Tree -> Int -> Int -> V.Vector Int -> (Tree, M.Matrix Double) -makeTreeFromOrder distMatrix initialTree nOTUs vertexIndex leavesToAdd = - if null leavesToAdd then (initialTree, distMatrix) - else - let leaf = V.head leavesToAdd - (_, newTree, _, augmentedDistMatrix) = addTaxonToTree distMatrix initialTree leavesToAdd vertexIndex leaf - in - {-Too much output - let (percentAdded, _) = divMod (100 * (nOTUs - (V.length leavesToAdd))) (nOTUs - 2) - (decileNumber, decileRemainder) = divMod percentAdded 10 - in - if decileRemainder == 0 then - trace ("\t\t"++ (show $ 10 * decileNumber) ++ "%") - makeTreeFromOrder augmentedDistMatrix newTree nOTUs (vertexIndex + 1) (V.tail leavesToAdd) - else - -} - makeTreeFromOrder augmentedDistMatrix newTree nOTUs (vertexIndex + 1) (V.tail leavesToAdd) - --- | getRandomAdditionSequence initializes based on input sequence and adds in order from there -getRandomAdditionSequence :: V.Vector String -> M.Matrix Double -> Int -> V.Vector Int -> TreeWithData -getRandomAdditionSequence leafNames distMatrix outgroup initiaLeavesToAdd = - let nOTUs = V.length leafNames - in - let eVertex = initiaLeavesToAdd V.! 0 - uVertex = initiaLeavesToAdd V.! 1 - edgeWeight = distMatrix M.! (eVertex, uVertex) - initialTree = (V.fromList[eVertex, uVertex],V.fromList [(eVertex, uVertex, edgeWeight)]) - leavesToAdd = V.filter (/= eVertex) $ V.filter (/= uVertex) initiaLeavesToAdd - in - let thisTree = makeTreeFromOrder distMatrix initialTree nOTUs nOTUs leavesToAdd - -- (_, edgeVect) = fst thisTree - treeCost = getTreeCost $ fst thisTree -- V.sum $ V.map getEdgeCost edgeVect - newickTree = convertToNewick leafNames outgroup (fst thisTree) - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision treeCost ++ "]" ++ ";" - in - (newickTree', fst thisTree, treeCost, snd thisTree) - --- | doWagnerS takes user options and produces the Wagner tree methods desired (best, asis, or random) --- outputs newick rep list -doWagnerS :: V.Vector String -> M.Matrix Double -> String -> Int -> String -> [V.Vector Int]-> [TreeWithData] -doWagnerS leafNames distMatrix firstPairMethod outgroup addSequence replicateSequences = - let nOTUs = V.length leafNames - in - if addSequence == "best" then - let wagnerResult = calculateWagnerTrees distMatrix firstPairMethod - -- (_, edgeVect) = fst wagnerResult - treeCost = getTreeCost $ fst wagnerResult --- V.sum $ V.map getEdgeCost edgeVect - newickTree = convertToNewick leafNames outgroup (fst wagnerResult) - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision treeCost ++ "]" ++ ";" - in - [(newickTree', fst wagnerResult, treeCost, snd wagnerResult)] - else if addSequence == "asis" then - let initialTree = (V.fromList[0, 1],V.fromList [(0, 1, distMatrix M.! (0,1))]) - leavesToAdd = V.fromList [2..(nOTUs-1)] - asIsResult = makeTreeFromOrder distMatrix initialTree nOTUs nOTUs leavesToAdd - treeCost = getTreeCost $ fst asIsResult -- V.sum $ V.map getEdgeCost asIsEdges - newickTree = convertToNewick leafNames outgroup (fst asIsResult) - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision treeCost ++ "]" ++ ";" - in - [(newickTree', fst asIsResult, treeCost, snd asIsResult)] - else if head addSequence == 'r' then - if null replicateSequences then errorWithoutStackTrace "Zero replicate additions specified--could be error in configuration file" - else - let randomAddTrees = PU.seqParMap rdeepseq (getRandomAdditionSequence leafNames distMatrix outgroup) replicateSequences -- `using` myParListChunkRDS -- was rseq not sure whats better - -- randomAddTrees = parmap rseq (getRandomAdditionSequence leafNames distMatrix outgroup) replicateSequences - in - randomAddTrees - else errorWithoutStackTrace ("Addition sequence " ++ addSequence ++ " not implemented") - --- | edgeHasVertex takes an vertex and an edge and returns Maybe Int --- of other vertex -edgeHasVertex :: Vertex -> Edge -> Maybe (Vertex, Edge) -edgeHasVertex inVert inEdge = - let (a, b, _) = inEdge - in - if a == inVert then Just (b, inEdge) - else if b == inVert then Just (a, inEdge) - else Nothing - --- | getSubEdges take a Vector of edges and retuns a list of edges connected to input vertex --- uses nOTUs to know when to stop recursing -getSubEdges :: [Vertex] -> Int -> V.Vector Edge -> V.Vector Edge -> String -> V.Vector Edge -getSubEdges inVertexList nOTUs edgeVect subEdgeVect howMany - | V.null edgeVect = subEdgeVect - | null inVertexList = subEdgeVect - | otherwise = - let startVertex = head inVertexList - foundVect = V.filter (/= Nothing) $ V.map (edgeHasVertex startVertex) edgeVect - in - if V.null foundVect then getSubEdges (tail inVertexList) nOTUs edgeVect subEdgeVect howMany-- only terminals left - else if V.length foundVect /= 2 then error ("Index (" ++ howMany ++ ")" ++ show startVertex ++ "->found " ++ show (V.length foundVect) ++ " but should be two edges in " ++ show foundVect ++ " in " ++ show edgeVect) - else - let thingsFound = V.map fromJust foundVect - verticesFound = V.toList $ V.map fst thingsFound - edgesFound = V.map snd thingsFound - - -- add found edges to subEdgeSet - newSubEdgeVect = subEdgeVect V.++ edgesFound - -- delete delete edges from those to be searched - newEdgeVect = V.filter (/= V.last edgesFound) $ V.filter (/= V.head edgesFound) edgeVect - in - -- recurse on vertices that were found - if howMany == "first2" then edgesFound - else getSubEdges (verticesFound ++ tail inVertexList) nOTUs newEdgeVect newSubEdgeVect howMany - --- | adjustInternalEdgeVertex adjusts vertex of an internal (ie non-pendent, none 2 terminal edge) --- assumes hv > lv -adjustInternalEdgeVertex :: Vertex -> Vertex -> Vertex -> Int -> Int -> Vertex -adjustInternalEdgeVertex inV hV lV maxOffSet nOTUs - | inV <= max nOTUs lV = inV - | inV > hV = inV - maxOffSet - | inV > lV = inV - 1 - | otherwise = error ("This can't happen " ++ show (inV, lV, hV)) - -- ) - --- | adjustVertex reduces vertex (hV,lV,_) is the dge that was deleted --- index by 2,1, or 0 if greater than hVert, lVert, or neither --- asumes hv > lv; and inE > inU --- if selfe edge its a termin and only reduce by 1 --- assumes edge is ordered e > u -adjustVertex :: Edge -> Vertex -> Vertex -> Int -> Edge -adjustVertex (inE, inU, w) hV lV nOTUs - | (inE <= max lV nOTUs) && (inU <= max lV nOTUs) = (inE, inU, w) - | lV < nOTUs = -- update pendant edge was deleted since hV > lV; hV must be > nOTUs - if (inE > max hV nOTUs) && (inU > max hV nOTUs) then (inE - 1, inU - 1, w) - else if (inE <= max hV nOTUs) && (inU > max hV nOTUs) then (inE, inU - 1, w) - else if (inE > max hV nOTUs) && (inU <= max hV nOTUs) then (inE - 1, inU, w) - else (inE, inU, w) - | otherwise = -- internal edge was deleted -- both deleted verteces internal - let newE = adjustInternalEdgeVertex inE hV lV 2 nOTUs - newU = adjustInternalEdgeVertex inU hV lV 2 nOTUs - in - (newE, newU, w) - --- | updateVertexNUmbersOnEdges taked vertex numbers and updates HTU indices to reflect --- the deletion of those vertices --- ASSUMES edges are ordered (a,b,weight) a > b --- subtracts 2 from index if > the bigger of teh two, subtract 1 if bigger than lower, --- otherwise leaves unchanged -updateVertexNUmbersOnEdges :: Vertex -> Vertex -> V.Vector Edge -> Int -> V.Vector Edge -updateVertexNUmbersOnEdges eVert uVert edgeList nOTUs = - -- trace ("In updateVertexNUmbersOnEdges") ( - if V.null edgeList then V.empty - else - let hVert = max eVert uVert - lVert = min eVert uVert - newVertex = adjustVertex (V.head edgeList) hVert lVert nOTUs - in - V.cons newVertex (updateVertexNUmbersOnEdges eVert uVert (V.tail edgeList) nOTUs) - - --- | updateDistMatrix updates distance matrix to remove eVertex and uVertex columns and rows as in updateVertexNUmbersOnEdges --- update costs for two contracte edges c1Edge and c2Edge --- the distance update takes place first and row/coumn removal second --- this to keep the proper indices (since the non-deleted rows and columns indices are changed) -updateDistMatrix :: Vertex -> Vertex -> M.Matrix Double -> Int -> Edge -> Edge -> M.Matrix Double -updateDistMatrix eVert uVert distMatrix nOTUs c1Edge c2Edge = - let validEdgeList = filter ((>= 0).fst3) [c1Edge, c2Edge] - newMatrix = M.unsafeUpdateMatrix distMatrix validEdgeList - newMatrix' = M.deleteRowsAndColumns newMatrix (filter (> (nOTUs - 1)) [eVert, uVert]) - in - newMatrix' - - - --- | getEndVertices takes a pair of edges and returns the non-index vertices -getEndVertices :: V.Vector Edge -> Vertex -> (Vertex, Vertex) -getEndVertices inEdges index = - if V.length inEdges /= 2 then error ("Edge number should be 2 not " ++ show (V.length inEdges) ++ " in getEndVertices") - else - let (fEVertex, fUVertex, _) = V.head inEdges - (sEVertex, sUVertex, _) = V.last inEdges - in - if fEVertex == index then - if sEVertex == index then (fUVertex, sUVertex) - else - (fUVertex, sEVertex) - else if fUVertex == index then - if sEVertex == index then - (fEVertex, sUVertex) - else - (fEVertex, sEVertex) - else error ("Error finding ends of " ++ show (V.head inEdges) ++ " and " ++ show (V.last inEdges)) - --- | getContractedWeight gets edge contractd edge weights ither form distMatrix for OTUs or 1 OTU and 1 --- internal wertex or by 4 point metric (maximum of two estimations) -getContractedWeight :: Vertex -> Vertex -> M.Matrix Double -> Int -> V.Vector Edge -> Double -getContractedWeight aVert bVert distMatrix nOTUs edgeVect - | aVert < nOTUs && bVert < nOTUs = distMatrix M.! (aVert, bVert) - | aVert < nOTUs = distMatrix M.! (aVert, bVert) - | bVert < nOTUs = distMatrix M.! (aVert, bVert) - | otherwise = - -- both internal 4-point mertic estimation - -- get edges connected to the contracted edge - let aEdgeVect = getSubEdges [aVert] nOTUs edgeVect V.empty "first2" - bEdgeVect = getSubEdges [bVert] nOTUs edgeVect V.empty "first2" - (a,b) = getEndVertices aEdgeVect aVert - (c,d) = getEndVertices bEdgeVect bVert - firstEstimate = (distMatrix M.! (a,c)) + (distMatrix M.! (b,d)) - (distMatrix M.! (a,b)) - (distMatrix M.! (c,d)) - secondEstimate = (distMatrix M.! (a,b)) + (distMatrix M.! (b,c)) - (distMatrix M.! (a,b)) - (distMatrix M.! (c,d)) - in - max firstEstimate secondEstimate / 2.0 - - --- | contractEdges takes two edges that share a vertex and fuses them --- If terminal then return selfdge --- update contracted edge weight --- dummy edge (-1,-1,-1) is returned when no edge to contract so won't be updated in distance Matrix -contractEdges :: M.Matrix Double -> Int -> V.Vector Edge -> Vertex -> V.Vector Edge -> Edge -contractEdges distMatrix nOTUs edgeVect index allEdges - | V.null edgeVect = (-1,-1,0) - | V.length edgeVect == 1 = - let (a,b,w) = V.head edgeVect - in - if (a == index) || (b == index) then (index, index, w) - else error ("Contacting single edge: " ++ show (V.head edgeVect) ++ " with vertex " ++ show index ++ " not found") - | otherwise = - let (aVert,bVert) = getEndVertices edgeVect index - newWeight = getContractedWeight aVert bVert distMatrix nOTUs (subtractVector edgeVect allEdges) - in - orderEdge (aVert, bVert, newWeight) - --- | getDistance creates distances from remaing OTUs to new vertex (rowID) via Farris 1972 --- rowID is the row (or column) of distance Matrix -getDistance :: M.Matrix Double -> Double -> Double -> Double -> Int -> Int -> Int -> Int -> Double -getDistance origDist addCost eVertLeafDist uVertLeafDist leafIndex eVertex uVertex rowID = - let first = (origDist M.! (rowID, leafIndex)) - addCost - second = (origDist M.! (rowID, eVertex)) - eVertLeafDist - third = (origDist M.! (rowID, uVertex)) - uVertLeafDist - in - maximum [first, second, third] - --- | getNewDistMatrix takes distMatrix and adds Cost for new eVertLeafDist uVertLeafDist --- created in build process (new HTUs) --- should be complete (on input) for leaves already added (initail paiwise distances and HTUs added) --- adds a single new row (minus last 0.0 as new row at end) which is appended -getNewDistMatrix :: M.Matrix Double -> Double -> Double -> Double -> Int -> Int -> Int -> M.Matrix Double -getNewDistMatrix origDist addCost eVertLeafDist uVertLeafDist eVertex uVertex leafIndex = - let columnHolder = LS.fromList [0..(M.rows origDist - 1)] -- List of HTU and OTU indices in pairwise dist matrix - newDistRow = LS.map (getDistance origDist addCost eVertLeafDist uVertLeafDist leafIndex eVertex uVertex) columnHolder - newDistRow' = newDistRow `LS.snoc` (0.0 :: Double) - in - M.addMatrixRow origDist newDistRow' - --- | enterNewEdgeCost cretes new row of costs from edge vectors --- infty NT.infinity if not there --- this makes update n^2 which is dumb -enterNewEdgeCost :: Int -> V.Vector Edge -> Int -> Double -enterNewEdgeCost columnNumber edgeVect rowNumber = - if V.null edgeVect then NT.infinity --not found so Infty - else - let (a, b, weight) = V.head edgeVect - in - if (columnNumber == a && rowNumber == b) || (columnNumber == b && rowNumber == a) then weight else enterNewEdgeCost columnNumber (V.tail edgeVect) rowNumber - --- | addEdgesToDistMatrix adds new edges from internal node join to existing matrix --- 0 if not created so only the two new vertices and the edges they touch (3 each) --- so need to add two columns and two rows, for new vertices I, and II (here numIn and numIn + 1) -getNewDistMatrixInternal :: M.Matrix Double -> V.Vector Edge -> M.Matrix Double -getNewDistMatrixInternal inMatrix newEdgeVect = - -- trace ("In getNewDistMatrixInternal") ( - if V.length newEdgeVect /= 5 then error ("Wrong size edgeVector shoud be 5 and is " ++ show (V.length newEdgeVect)) - else - let numIn = M.rows inMatrix - columnHolder = [0..(numIn - 1)] - newDistColumnI = fmap (enterNewEdgeCost numIn newEdgeVect) columnHolder ++ [0.0] - newDistColumnII = fmap (enterNewEdgeCost (numIn + 1) newEdgeVect) columnHolder ++ [enterNewEdgeCost numIn newEdgeVect (numIn + 1), 0.0] - in - M.addMatrices inMatrix (LS.fromList [LS.fromList newDistColumnI, LS.fromList newDistColumnII]) - --- | connectEdges takes two vectors of edges and adds and edge between the two in edgesToConnect --- this deletes the two old edges from the edge Vectors and creates a new tree with the five new edges --- added in; the addition cost is for total of created edges minus edges that were destroyed -connectEdges :: M.Matrix Double -> V.Vector Edge -> V.Vector Edge -> V.Vector Edge -> (Double, Tree, M.Matrix Double) -connectEdges distMatrix eEdges uEdges edgesToConnect - | V.null eEdges = error "Empty e Edge vector in connectEdges" - | V.null uEdges = error "Empty u Edge vector in connectEdges" - | V.length edgesToConnect /= 2 = error ("There shoule be 2 edges to connect and ther are " ++ show (V.length edgesToConnect)) - | otherwise = - let edgesKept = subtractVector edgesToConnect eEdges V.++ subtractVector edgesToConnect uEdges - numInTree = M.rows distMatrix - -- order new edges - (a, b, wAB) = V.head edgesToConnect - (c, d, wCD) = V.last edgesToConnect - firstEstimate = (distMatrix M.! (a,c)) + (distMatrix M.! (b,d)) - (distMatrix M.! (a,b)) - (distMatrix M.! (c,d)) - secondEstimate = (distMatrix M.! (a,b)) + (distMatrix M.! (b,c)) - (distMatrix M.! (a,b)) - (distMatrix M.! (c,d)) - centralEdgeCost = max firstEstimate secondEstimate / 2.0 - centralEdge = orderEdge (numInTree, numInTree + 1, centralEdgeCost) - aCost = ((distMatrix M.! (c,a)) - (distMatrix M.! (c,b)) + (distMatrix M.! (a,b))) / 2.0 - bCost = (distMatrix M.! (a,b)) - aCost - cCost = ((distMatrix M.! (a,c)) - (distMatrix M.! (a,d)) + (distMatrix M.! (c,d))) / 2.0 - dCost = (distMatrix M.! (c,d)) - cCost - newAEdge = orderEdge (a, numInTree, aCost) -- edge cost not unique either (a or b, numInTree) could be max or min - newBEdge = orderEdge (b, numInTree, bCost) -- edge cost not unique either (a or b, numInTree) could be max or min - newCEdge = orderEdge (c, numInTree + 1, cCost) -- edge cost not unique either (a or b, numInTree + 1) could be max or min - newDEdge = orderEdge (d, numInTree + 1, dCost) -- edge cost not unique either (a or b, numInTree + 1) could be max or min - newEdgeVect = V.fromList [centralEdge, newAEdge, newBEdge, newCEdge, newDEdge] - newDistMatrix = getNewDistMatrixInternal distMatrix newEdgeVect - in - (centralEdgeCost + aCost + bCost + cCost + dCost - wAB - wCD, (V.empty, newEdgeVect V.++ edgesKept), newDistMatrix) - - --- | addEdgeToSplit adds a new edge to a specifc pair of input edges detemined addition cost --- and creted new edge set with appropriate weights -addEdgeToSplit :: V.Vector Edge -> V.Vector Edge -> Edge -> Edge -> M.Matrix Double -> V.Vector Edge -> (Double, Tree, M.Matrix Double) -addEdgeToSplit eEdges uEdges eTerminal uTerminal distMatrix edgesToConnect - | V.null eEdges && V.null uEdges = error "Empty e/u Edge vectors in addEdgeToSplit" - | V.null edgesToConnect = error "Empty eEdge vector in addEdgeToSplit" - | fst3 (V.head eEdges) == (-1) = -- adding eOTU, V.last since the (-1,-1,0) edge should always be first - addToEdgeSwap distMatrix (fst3 eTerminal) (V.empty, uEdges) (M.rows distMatrix) (V.last edgesToConnect) - | fst3 (V.head uEdges) == (-1) = -- adding uOTU - addToEdgeSwap distMatrix (fst3 uTerminal) (V.empty, eEdges) (M.rows distMatrix) (V.last edgesToConnect) - | otherwise = -- both internal edges - connectEdges distMatrix eEdges uEdges edgesToConnect - --- | splitTree takes a tree description and its edgeList and return pairs of edge list --- split at input edge in tree with "repaird"/contracted edges, delta, and original --- edge (pairs of vertices and weight) for each split -splitTree :: M.Matrix Double -> Tree -> Double -> Edge -> SplitTreeData -splitTree distMatrix inTree inTreeCost edgeToRemove = - -- check if proper tree--remove later - let (_, edgeVect) = inTree - (eVertex, uVertex, _) = edgeToRemove - - -- newEdgeSet = subtractVector (V.cons edgeToRemove $ eEdges V.++ uEdges) edgeVect - newEdgeSet = V.filter (/= edgeToRemove) edgeVect - nOTUs = div (3 + V.length edgeVect) 2 - eSubEdges = getSubEdges [eVertex] nOTUs newEdgeSet V.empty "all" - uSubEdges = getSubEdges [uVertex] nOTUs newEdgeSet V.empty "all" - - -- get edges that need to be contracted and re-estimate weights - eEdges = getSubEdges [eVertex] nOTUs eSubEdges V.empty "first2" - uEdges = getSubEdges [uVertex] nOTUs uSubEdges V.empty "first2" - eMergedEdge = contractEdges distMatrix nOTUs eEdges eVertex edgeVect - uMergedEdge = contractEdges distMatrix nOTUs uEdges uVertex edgeVect - - -- remove non-contracted edges and add in contracted edges - eSubEdges' = V.cons eMergedEdge (subtractVector eEdges eSubEdges) - uSubEdges' = V.cons uMergedEdge (subtractVector uEdges uSubEdges) - - -- need to know this order for SPR/TBR so which edge was in which set - previousEdges = V.fromList [eMergedEdge, uMergedEdge] - - -- map new HTU indices in edges and create new distance matrix - -- HTU indices are updated first--then values updated in distMatrix as rows/columns are - -- deleted to remove info from delted edge - eSubEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge eSubEdges') nOTUs - uSubEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge uSubEdges') nOTUs - previousEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge previousEdges) nOTUs - - -- Update with deleted node and reestimated contracted edges - -- update matrix the costs (2 ij x 2 ji) of contracted edges - distMatrix'' = updateDistMatrix eVertex uVertex distMatrix nOTUs eMergedEdge uMergedEdge -- (V.head previousEdges'') (V.last previousEdges'') - - -- Delta calcualted by differene in original tree cost and split and readdition to split edges (then tail splits later - -- so not remake the original tree) - splitCost = getTreeCost (V.empty, eSubEdges'') + getTreeCost (V.empty, uSubEdges'') - - -- Delta of tree length will be weights of the the edges removed - the weights of the two edges contracted and reestimated - -- delta = weight + (V.sum $ V.map thd3 eEdges) + (V.sum $ V.map thd3 uEdges) - (V.sum $ V.map thd3 previousEdges'') - delta = inTreeCost - splitCost -- readditionCost - in - if eVertex < nOTUs then (V.singleton (eVertex, eVertex, 0.0), uSubEdges'', delta, previousEdges'', distMatrix'') --this so know a pendant edge - else if uVertex < nOTUs then (V.singleton (uVertex, uVertex, 0.0), eSubEdges'', delta, previousEdges'', distMatrix'') --this so know a pendant edge - else - -- neded to have e first then u for SPR/TBR so can coordinate with previous edges - (eSubEdges'', uSubEdges'', delta, previousEdges'', distMatrix'') - - --- | sieveTrees takes a list of (addition cost, Tree, distnce matrix) and returns list --- of better or equal trees to input delta -sieveTrees :: Double -> Double -> V.Vector (Double, Tree, M.Matrix Double) -> V.Vector String -> Int -> [TreeWithData] -> [TreeWithData] -sieveTrees inDelta curBestCost inAddList leafNames outgroup savedTrees = - if null inAddList then savedTrees - else - let firstTuple = V.head inAddList - (firstDelta, firstTree, firstMatrix) = firstTuple - newCost = curBestCost - inDelta + firstDelta - -- checkCost = getTreeCost firstTree - newickTree = convertToNewick leafNames outgroup firstTree - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision newCost ++ "]" ++ ";" - newTuple = (newickTree', firstTree, newCost, firstMatrix) - in - if firstDelta > inDelta then sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup savedTrees - else - if newCost < curBestCost then - sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup [newTuple] - else if withinEpsilon newCost curBestCost then sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup (newTuple : savedTrees) - else sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup savedTrees - - --- | reAddTerminals checks to see if split on pendant edge--if so reads terminal to each edge but saves equal cost if found --- and saveMethod specifies it. Idenitical to the wagner addition process with saveing equal as option --- could use addEdgeToSplit for consistancey with SPR/TBR -reAddTerminals :: String -> Double -> V.Vector String -> Int -> SplitTreeData -> [TreeWithData] -reAddTerminals rejoinType curBestCost leafNames outGroup split = - if rejoinType /= "otu" then error ("Incorrect swap function in reAddTerminals: " ++ rejoinType) - else - let (eEdgeVect, uEdgeVect, delta, _, distMatrix) = split - nOTUs = V.length leafNames - in - if (V.length eEdgeVect > 1) || ((fst3 (V.head eEdgeVect) /= snd3 (V.head eEdgeVect)) && (fst3 (V.head eEdgeVect) < nOTUs) && (snd3 (V.head eEdgeVect) < nOTUs)) then [] - else (if M.rows distMatrix /= ((2 * nOTUs) - 3) then error ("Dist Matrix incorrect size " ++ show (M.dim distMatrix) ++ " should be " ++ show ((2 * nOTUs) - 3, (2 * nOTUs) - 3)) - else - let newLeafIndex = M.rows distMatrix - -- take tail of uEdgeVect so not regerate input tree - additionList = V.map (addToEdgeSwap distMatrix (fst3 $ V.head eEdgeVect) (V.empty,uEdgeVect) newLeafIndex) uEdgeVect -- (V.tail uEdgeVect) -- tail so not hit original tree, leave all to reestimate if necesary - minAdditionCost = V.minimum (V.map fst3 additionList) - in - if minAdditionCost > delta then [] - else sieveTrees delta curBestCost additionList leafNames outGroup []) - - --- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix --- but this for swap so returns entire new3-edge cost so not Farris triangle it is sum of three diveded by 2 -addToEdgeSwapRecurse :: Double -> M.Matrix Double -> Int -> Tree -> Int -> V.Vector Edge -> (Double, Tree, M.Matrix Double) -addToEdgeSwapRecurse inDelta distMatrix leaf initialTree newLeafIndex inEdgeVect = - if V.null inEdgeVect then (inDelta, initialTree, distMatrix) - else - let inEdge@(eVertex, uVertex, inWeight) = V.head inEdgeVect - (initialVertexVect, initialEdgeVect) = initialTree - addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 - eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost - uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost - newVertexVect = V.snoc initialVertexVect leaf - newEdges = V.fromList [(leaf,newLeafIndex, addCost),(eVertex, newLeafIndex, eVertLeafDist),(uVertex, newLeafIndex, uVertLeafDist)] - cleanupEdges = V.filter (/= inEdge) initialEdgeVect - newEdgeVect = cleanupEdges V.++ newEdges - newTree = (newVertexVect, newEdgeVect) - -- add new costs from added vertex to each reamaining leaf - augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf - newDelta = addCost + eVertLeafDist + uVertLeafDist - inWeight - in - if newDelta < inDelta then (newDelta, newTree, augmentedDistMatrix) - else addToEdgeSwapRecurse inDelta distMatrix leaf initialTree newLeafIndex (V.tail inEdgeVect) - --- | getVectorAllVectorPairs takes two vectors and creates a vector of avector of two elements each for each --- pairwise combinatrion of elements -getVectorAllVectorPairs :: V.Vector a -> V.Vector a -> V.Vector (V.Vector a) -getVectorAllVectorPairs firstVect secondVect = - if V.null firstVect then V.empty - else - let firstElement = V.head firstVect - firstPairs = V.map (V.cons firstElement) $ V.map V.singleton secondVect - in - firstPairs V.++ getVectorAllVectorPairs (V.tail firstVect) secondVect - --- | createVectorEdgePairs creates teh Vector of Vectors of edges (2 in each case) to connect --- if SPR then takes the initial (previous e edge) and pairs with all in u edge vect --- if TBR then all conbinations of pairs -createVectorEdgePairs :: String -> V.Vector Edge -> V.Vector Edge -> V.Vector Edge -> V.Vector (V.Vector Edge) -createVectorEdgePairs pairSet previousEdges eEdgeVect uEdgeVect - | pairSet == "spr" = - let eEdgePrev = V.head previousEdges - in - V.map (V.cons eEdgePrev) $ V.map V.singleton uEdgeVect - | pairSet == "tbr" = getVectorAllVectorPairs eEdgeVect uEdgeVect - | otherwise = errorWithoutStackTrace ("Pair set option " ++ pairSet ++ " not implemented") - - --- | addEdgeToSplitRecurse like addToEdgeSplit but recursiblye yeilds a single best tree -addEdgeToSplitRecurse :: V.Vector Edge -> V.Vector Edge -> Edge -> Edge -> M.Matrix Double -> V.Vector (V.Vector Edge) -> (Double, Tree, M.Matrix Double) -> (Double, Tree, M.Matrix Double) -addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix edgesToConnectVect origTriple@(inDelta, _, _) = - if V.null edgesToConnectVect then origTriple - else - let edgesToConnect = V.head edgesToConnectVect - in - if V.null eEdges && V.null uEdges then error "Empty e/u Edge vectors in addEdgeToSplit" - else if V.null edgesToConnect then error "Empty eEdge vector in addEdgeToSplit" - else if fst3 (V.head eEdges) == (-1) then -- adding eOTU, V.last since the (-1,-1,0) edge should always be first - let (newDelta, newTree, newMatrix) = addToEdgeSwap distMatrix (fst3 eTerminal) (V.empty, uEdges) (M.rows distMatrix) (V.last edgesToConnect) - in - if newDelta < inDelta then (newDelta, newTree, newMatrix) - else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple - else if fst3 (V.head uEdges) == (-1) then -- adding uOTU - let (newDelta, newTree, newMatrix) = addToEdgeSwap distMatrix (fst3 uTerminal) (V.empty, eEdges) (M.rows distMatrix) (V.last edgesToConnect) - in - if newDelta < inDelta then (newDelta, newTree, newMatrix) - else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple - else -- both internal edges - let (newDelta, newTree, newMatrix) = connectEdges distMatrix eEdges uEdges edgesToConnect - in - if newDelta < inDelta then (newDelta, newTree, newMatrix) - else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple - - --- | doSPRTBR takes split tree and rejoins by creating edges from a single one of the first edge set to each of the mebers of the second --- important that e and u edges come in correct order and previous edges are e and u sets in order as well -doSPRTBR :: String -> Double -> V.Vector String -> Int -> SplitTreeData -> [TreeWithData] -doSPRTBR rejoinType curBestCost leafNames outGroup split = - -- trace ("In doSPRTBR") ( - let (eEdgeVect, uEdgeVect, delta, previousEdges, distMatrix) = split - in - if V.null eEdgeVect || V.null uEdgeVect then error "Empty edge vectors in doSPRTBR" - else - if fst3 (V.head eEdgeVect) == snd3 (V.head eEdgeVect) then -- if an OTU call readdTerminal - let newLeafIndex = M.rows distMatrix - -- keep whole uEdgeVect so recheck input tree edges - additionList = V.map (addToEdgeSwap distMatrix (fst3 $ V.head eEdgeVect) (V.empty,uEdgeVect) newLeafIndex) uEdgeVect - minAdditionCost = V.minimum (V.map fst3 additionList) - in - if minAdditionCost > delta then [] - else sieveTrees delta curBestCost additionList leafNames outGroup [] - else -- internal edge or edge with two OTUs as vertices - -- check to make sure edges are where they should be can remove later - -- fix e edge and join to each u edge for SPR (n^2) - let edgesToConnect = createVectorEdgePairs rejoinType previousEdges eEdgeVect uEdgeVect - -- e and u terminal should not be used here since OTUs are shortcircuited above - eTerminal = (-1,-1,0) - uTerminal = (-1,-1,0) - additionList = V.map (addEdgeToSplit eEdgeVect uEdgeVect eTerminal uTerminal distMatrix) edgesToConnect - minAdditionCost = V.minimum (V.map fst3 additionList) - in - if minAdditionCost > delta then [] - else sieveTrees delta curBestCost additionList leafNames outGroup [] - - --- | doSPRTBRSteep like doSPRTBR but only saves a sinlge (and better) tree -doSPRTBRSteep :: String -> Double -> V.Vector String -> Int -> SplitTreeData -> TreeWithData -> TreeWithData -doSPRTBRSteep rejoinType curBestCost leafNames outGroup split origTree@(_, inTree, _, inMatrix) = - -- trace ("In doSPRTBR") ( - let (eEdgeVect, uEdgeVect, delta, previousEdges, distMatrix) = split - in - if V.null eEdgeVect || V.null uEdgeVect then error "Empty edge vectors in doSPRTBR" - else - if fst3 (V.head eEdgeVect) == snd3 (V.head eEdgeVect) then -- if an OTU call readdTerminal - let newLeafIndex = M.rows distMatrix - -- keep whole uEdgeVect so recheck input tree edges - (newDelta, newTree, newMatrix) = addToEdgeSwapRecurse delta distMatrix (fst3 $ V.head eEdgeVect) (V.empty,uEdgeVect) newLeafIndex uEdgeVect - newCost = curBestCost - delta + newDelta - newickTree = convertToNewick leafNames outGroup newTree - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision newCost ++ "]" ++ ";" - in - if newCost < curBestCost then (newickTree', newTree, newCost, newMatrix) - else origTree - else -- internal edge or edge with two OTUs as vertices - -- check to make sure edges are where they should be can remove later - -- fix e edge and join to each u edge for SPR (n^2) - let edgesToConnect = createVectorEdgePairs rejoinType previousEdges eEdgeVect uEdgeVect - -- e and u terminal should not be used here since OTUs are shortcircuited above - eTerminal = (-1,-1,0) - uTerminal = (-1,-1,0) - (newDelta, newTree, newMatrix) = addEdgeToSplitRecurse eEdgeVect uEdgeVect eTerminal uTerminal distMatrix edgesToConnect (delta, inTree, inMatrix) - newCost = curBestCost - delta + newDelta - newickTree = convertToNewick leafNames outGroup newTree - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision newCost ++ "]" ++ ";" - in - if newCost < curBestCost then - --trace ("->" ++ show newCost) - (newickTree', newTree, newCost, newMatrix) - else origTree - - --- | reAddTerminalsSteep like readdTerminals but only returns one tree keeping better -reAddTerminalsSteep :: String -> Double -> V.Vector String -> Int -> SplitTreeData -> TreeWithData -> TreeWithData -reAddTerminalsSteep rejoinType curBestCost leafNames outGroup split origTree = - if rejoinType /= "otu" then error ("Incorrect swap function in reAddTerminals: " ++ rejoinType) - else - let (eEdgeVect, uEdgeVect, delta, _, distMatrix) = split - nOTUs = V.length leafNames - in - if (V.length eEdgeVect > 1) || ((fst3 (V.head eEdgeVect) /= snd3 (V.head eEdgeVect)) && (fst3 (V.head eEdgeVect) < nOTUs) && (snd3 (V.head eEdgeVect) < nOTUs)) then origTree - else if M.rows distMatrix /= ((2 * nOTUs) - 3) then error ("Dist Matrix incorrect size " ++ show (M.dim distMatrix) ++ " should be " ++ show ((2 * nOTUs) - 3, (2 * nOTUs) - 3)) - else - let newLeafIndex = M.rows distMatrix - -- take tail of uEdgeVect so not regerate input tree - (newDelta, newTree, newMatrix) = addToEdgeSwapRecurse delta distMatrix (fst3 $ V.head eEdgeVect) (V.empty,uEdgeVect) newLeafIndex uEdgeVect -- (V.tail uEdgeVect) -- tail so not hit original tree, leave all to reestimate if necesary - newCost = curBestCost - delta + newDelta - newickTree = convertToNewick leafNames outGroup newTree - newickTree' = take (length newickTree - 3) newickTree ++ "[" ++ showDouble precision newCost ++ "]" ++ ";" - in - if newCost < curBestCost then - --trace ("->" ++ show newCost) - (newickTree', newTree, newCost, newMatrix) - else origTree - - - --- | filterNewTreesOnCost returns list of all unique new best cost trees from list --- assumes curBestCost = cost of sabed trees -filterNewTreesOnCost :: Double -> [TreeWithData] -> [TreeWithData] -> [TreeWithData] -filterNewTreesOnCost curBestCost firstTreeList savedTrees = - if null firstTreeList then savedTrees - else - let firstTree = head firstTreeList - (_, _, firstCost, _) = firstTree - in - if firstCost < curBestCost then filterNewTreesOnCost firstCost (tail firstTreeList) [firstTree] - else if firstCost > curBestCost then filterNewTreesOnCost curBestCost (tail firstTreeList) savedTrees - else - let uniqueTree = filterNewTrees savedTrees firstTree - in - if isNothing uniqueTree then filterNewTreesOnCost curBestCost (tail firstTreeList) savedTrees - else filterNewTreesOnCost curBestCost (tail firstTreeList) (fromJust uniqueTree : savedTrees ) - --- | filterNewTrees takes the first tree and checks if in the second list -filterNewTrees :: [TreeWithData] -> TreeWithData -> Maybe TreeWithData -filterNewTrees secondTreeList firstTree = - if null secondTreeList then Just firstTree - else - let (firstNewick, _, _, _) = firstTree - (secondNewick, _, _, _) = head secondTreeList - in - if firstNewick == secondNewick then Nothing - else filterNewTrees (tail secondTreeList) firstTree - --- | getSaveNumber returns number ot save or Infty -getSaveNumber :: String -> Int -getSaveNumber inString = - if length inString == 4 then maxBound :: Int - else (read $ drop 5 inString) :: Int - --- | splitJoin does both split and rejoin operations in a fashion that if a better (shorter) tree is found is shortcircuits and --- begins again on the new tree, else proceeds untill all splits and joins are completed, but only on a single tree -splitJoin :: (String -> Double -> V.Vector String -> Int -> SplitTreeData -> TreeWithData -> TreeWithData) -> String -> V.Vector String -> Int -> V.Vector Edge -> TreeWithData -> TreeWithData -splitJoin swapFunction refineType leafNames outGroup edgeVect curTreeWithData@(_, curTree, curTreeCost, curTreeMatrix) = - if V.null edgeVect then curTreeWithData -- All splits tested, nothing better found - else - let firstEdge = V.head edgeVect - firstSplit = splitTree curTreeMatrix curTree curTreeCost firstEdge - firstTree@(_, firstNewTree, firstTreeCost, _) = swapFunction refineType curTreeCost leafNames outGroup firstSplit curTreeWithData - in - if firstTreeCost < curTreeCost then splitJoin swapFunction refineType leafNames outGroup (snd firstNewTree) firstTree - else splitJoin swapFunction refineType leafNames outGroup (V.tail edgeVect) curTreeWithData - - --- | splitJoinWrapper wraps around splitJoin to allow parallel execution --- the reason is to allow the consumption of "edgeVect" recursively within the same tree -splitJoinWrapper :: (String -> Double -> V.Vector String -> Int -> SplitTreeData -> TreeWithData -> TreeWithData) -> String -> V.Vector String -> Int -> TreeWithData -> TreeWithData -splitJoinWrapper swapFunction refineType leafNames outGroup curTreeWithData@(_, curTree, _, _) = - let edgeVect = snd curTree - in - splitJoin swapFunction refineType leafNames outGroup edgeVect curTreeWithData - --- | getGeneralSwapSteepestOne performs refinement as in getGeneralSwap but saves on a single tree (per split/swap) and --- immediately accepts a Better (shorter) tree and resumes the search on that new tree --- relies heavily on laziness of splitTree so not parallel at this level -getGeneralSwapSteepestOne :: String -> (String -> Double -> V.Vector String -> Int -> SplitTreeData -> TreeWithData -> TreeWithData) -> V.Vector String -> Int -> [TreeWithData] -> [TreeWithData] -> [TreeWithData] -getGeneralSwapSteepestOne refineType swapFunction leafNames outGroup inTreeList savedTrees = - if null inTreeList then savedTrees - else - trace ("In "++ refineType ++ " Swap (steepest) with " ++ show (length inTreeList) ++ " trees with minimum length " ++ show (minimum $ fmap thd4 inTreeList)) ( - let steepTreeList = PU.seqParMap rdeepseq (splitJoinWrapper swapFunction refineType leafNames outGroup) inTreeList -- `using` myParListChunkRDS - steepCost = minimum $ fmap thd4 steepTreeList - in - --this to maintina the trajectories untill final swap--otherwise could converge down to single tree prematurely - keepTrees steepTreeList "unique" "first" steepCost - ) - --- | getGeneralSwap performs a "re-add" of terminal identical to wagner build addition to available edges --- performed on all splits recursively until no more better/equal cost trees found --- this won't work to save "all" just unique and best and unique of best --- add "steep-est" descent -getGeneralSwap :: String -> (String -> Double -> V.Vector String -> Int -> SplitTreeData -> [TreeWithData]) -> String -> String -> V.Vector String -> Int -> [TreeWithData] -> [TreeWithData] -> [TreeWithData] -getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup inTreeList savedTrees = - let maxNumSave = getSaveNumber saveMethod - in - if null inTreeList then savedTrees - else - trace ("In "++ refineType ++ " Swap with " ++ show (length inTreeList) ++ " trees with minimum length " ++ show (minimum $ fmap thd4 inTreeList)) ( - let curFullTree = head inTreeList - overallBestCost = minimum $ fmap thd4 savedTrees - (_, curTree, curTreeCost, curTreeMatrix) = curFullTree - -- parallelize here - splitTreeList = PU.seqParMap rdeepseq (splitTree curTreeMatrix curTree curTreeCost) (V.toList $ snd curTree) -- `using` myParListChunkRDS - firstTreeList = PU.seqParMap rdeepseq (swapFunction refineType curTreeCost leafNames outGroup) splitTreeList -- `using` myParListChunkRDS - firstTreeList' = filterNewTreesOnCost overallBestCost (curFullTree : concat firstTreeList) savedTrees -- keepTrees (concat $ V.toList firstTreeList) saveMethod overallBestCost - in - -- Work around for negative NT.infinity tree costs (could be dst matrix issue) - if NT.isInfinite curTreeCost || null firstTreeList' then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup (tail inTreeList) savedTrees else ( - let (_, _, costOfFoundTrees, _) = head firstTreeList' - in - --workaround for negatrive NT.infinity trees - if NT.isInfinite costOfFoundTrees then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup (tail inTreeList) savedTrees - else if costOfFoundTrees < overallBestCost then - let uniqueTreesToAdd = fmap fromJust $ filter (/= Nothing ) $ fmap (filterNewTrees inTreeList) firstTreeList' - treesToSwap = keepTrees (tail inTreeList ++ uniqueTreesToAdd) saveMethod keepMethod costOfFoundTrees - in - getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup treesToSwap (take maxNumSave firstTreeList') - else if costOfFoundTrees == overallBestCost then - if length savedTrees >= maxNumSave then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup (tail inTreeList) savedTrees - else - let uniqueTreesToAdd = fmap fromJust $ filter (/= Nothing ) $ fmap (filterNewTrees inTreeList) firstTreeList' - treesToSwap = keepTrees (tail inTreeList ++ uniqueTreesToAdd) saveMethod keepMethod costOfFoundTrees - in - getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup treesToSwap (take maxNumSave $ savedTrees ++ firstTreeList') - else getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup (tail inTreeList) savedTrees) - ) -- ) - --- | performRefinement takes input trees in TRE and Newick format and performs different forms of tree refinement --- at present just OTU (remove leaves and re-add), SPR and TBR -performRefinement :: String -> String -> String -> V.Vector String -> Int -> TreeWithData -> [TreeWithData] -performRefinement refinement saveMethod keepMethod leafNames outGroup inTree - | refinement == "none" = [inTree] - | refinement == "otu" = - let newTrees = getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees' = getGeneralSwap "otu" reAddTerminals saveMethod keepMethod leafNames outGroup newTrees [([],(V.empty,V.empty), NT.infinity, M.empty)] - in - if refinement == "best:1" then - if not $ null newTrees then newTrees - else [inTree] - else if not (null newTrees') then newTrees' - else - trace "OTU swap did not find any new trees" - [inTree] - | refinement == "spr" = - let newTrees = getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees' = getGeneralSwapSteepestOne "spr" doSPRTBRSteep leafNames outGroup newTrees [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees'' = getGeneralSwap "spr" doSPRTBR saveMethod keepMethod leafNames outGroup newTrees' [([],(V.empty,V.empty), NT.infinity, M.empty)] - in - if saveMethod == "best:1" then - if not $ null newTrees' then newTrees' - else [inTree] - else if not (null newTrees'') then newTrees'' - else - trace "SPR swap did not find any new trees" - [inTree] - | refinement == "tbr" = - let newTrees = getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees' = getGeneralSwapSteepestOne "spr" doSPRTBRSteep leafNames outGroup newTrees [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees'' = getGeneralSwapSteepestOne "tbr" doSPRTBRSteep leafNames outGroup newTrees' [([],(V.empty,V.empty), NT.infinity, M.empty)] - newTrees''' = getGeneralSwap "tbr" doSPRTBR saveMethod keepMethod leafNames outGroup newTrees'' [([],(V.empty,V.empty), NT.infinity, M.empty)] - in - if saveMethod == "best:1" then - if not $ null newTrees' then newTrees'' - else [inTree] - else if not (null newTrees''') then newTrees''' - else - trace "TBR swap did not find any new trees" - [inTree] - | otherwise = errorWithoutStackTrace ("Unrecognized refinement method: " ++ refinement) diff --git a/pkg/PhyGraph/Search/Fuse.hs b/pkg/PhyGraph/Search/Fuse.hs deleted file mode 100644 index a733b2a2b..000000000 --- a/pkg/PhyGraph/Search/Fuse.hs +++ /dev/null @@ -1,582 +0,0 @@ -{- | -Module : Fuse.hs -Description : Module specifying graph fusing recombination functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.Fuse ( fuseAllGraphs - ) where - -import Control.Parallel.Strategies -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import qualified Data.Map as MAP -import Data.Maybe -import qualified Data.Text.Lazy as TL -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import qualified Search.Swap as S -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Data.InfList as IL - - -{- - For redo: - - 1) Key issue is reindexing exchanged components - 2) move to new swap functions on rejoin - with reasonable defaults (steepest, spr) - 3) check logic all through - 4) randomized options to limit number of fuses - --} - --- | fuseAllGraphs takes a list of phylogenetic graphs and performs all pairwise fuses --- later--could limit by options making random choices for fusing --- keeps results according to options (best, unique, etc) --- unique is unique of "best" from individual fusings --- singleRound short circuits recursive continuation on newly found graphs -fuseAllGraphs :: GlobalSettings - -> ProcessedData - -> [Int] - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> Bool - -> Bool - -> Bool - -> Bool - -> Bool - -> Maybe Int - -> Bool - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], Int) -fuseAllGraphs inGS inData rSeedList keepNum maxMoveEdgeDist counter doNNI doSPR doTBR doSteepest doAll returnBest returnUnique singleRound fusePairs randomPairs inGraphList = - if null inGraphList then ([], 0) - else - let -- getting values to be passed for graph diagnosis later - numLeaves = V.length $ fst3 inData - -- leafGraph = T.makeSimpleLeafGraph inData - -- leafDecGraph = T.makeLeafGraph inData - -- leafGraphSoftWired = T.makeLeafGraphSoftWired inData - -- hasNonExactChars = U.getNumberSequenceCharacters (thd3 inData) > 0 - charInfoVV = six6 $ head inGraphList - - curBest = minimum $ fmap snd6 inGraphList - - curBestGraph = head $ filter ((== curBest) . snd6) inGraphList - - -- get net penalty estimate from optimal graph for delta recombine later - inGraphNetPenalty = if (graphType inGS == Tree) then 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then 0.0 - else if (graphFactor inGS) == Wheeler2015Network then - if (graphType inGS) == HardWired then 0.0 - else T.getW15NetPenalty Nothing curBestGraph - else if (graphFactor inGS) == Wheeler2023Network then - if (graphType inGS) == HardWired then 0.0 - else T.getW23NetPenalty Nothing curBestGraph - else if (graphFactor inGS) == PMDLGraph then - let (_, _, _, networkNodeList) = LG.splitVertexList (fst6 curBestGraph) - in - if (graphType inGS) == Tree then fst $ IL.head (graphComplexityList inGS) - else if (graphType inGS) == SoftWired then fst $ (graphComplexityList inGS) IL.!!! (length networkNodeList) - else if (graphType inGS) == HardWired then snd $ (graphComplexityList inGS) IL.!!! (length networkNodeList) - else error ("Graph type " ++ (show $ graphType inGS) ++ " is not yet implemented in fuseAllGraphs") - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - inGraphNetPenaltyFactor = inGraphNetPenalty / curBest - - - -- get fuse pairs - graphPairList' = getListPairs inGraphList - (graphPairList, randString) = if isNothing fusePairs then (graphPairList', "") - else if randomPairs then (takeRandom (head rSeedList) (fromJust fusePairs) graphPairList', " randomized") - else (takeNth (fromJust fusePairs) graphPairList', "") - - - newGraphList = concat (PU.seqParMap rdeepseq (fusePair inGS inData numLeaves charInfoVV inGraphNetPenaltyFactor keepNum maxMoveEdgeDist doNNI doSPR doTBR) graphPairList) -- `using` PU.myParListChunkRDS) - - fuseBest = if not (null newGraphList) then minimum $ fmap snd6 newGraphList - else infinity - - in - - trace ("\tFusing " ++ (show $ length graphPairList) ++ randString ++ " graph pairs") ( - if null newGraphList then (inGraphList, counter + 1) - else if returnUnique then - let uniqueList = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (inGraphList ++ newGraphList) - in - if fuseBest < curBest then - trace ("\t->" ++ (show fuseBest)) -- ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd6 $ head bestSwapGraphList)) - fuseAllGraphs inGS inData (drop 2 rSeedList) keepNum maxMoveEdgeDist (counter + 1) doNNI doSPR doTBR doSteepest doAll returnBest returnUnique singleRound fusePairs randomPairs uniqueList - else (uniqueList, counter + 1) - - else -- return best - -- only do one round of fusing - if singleRound then (take keepNum $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (inGraphList ++ newGraphList), counter + 1) - - -- recursive rounds - else - let allBestList = take keepNum $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (inGraphList ++ newGraphList) - in - - -- found worse - if fuseBest > curBest then (allBestList, counter + 1) - - -- found better - else if fuseBest < curBest then - trace ("\t->" ++ (show fuseBest)) -- ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd6 $ head bestSwapGraphList)) - fuseAllGraphs inGS inData (drop 2 rSeedList) keepNum maxMoveEdgeDist (counter + 1) doNNI doSPR doTBR doSteepest doAll returnBest returnUnique singleRound fusePairs randomPairs allBestList - - -- equal cost just return--could keep finding equal - else (allBestList, counter + 1) - ) - - --- | fusePair recombines a single pair of graphs --- this is done by coopting the split and readd functinos from the Swap.Swap functions and exchanging --- pruned subgraphs with the same leaf complement (as recorded by the subtree root node bit vector field) --- spr-like and tbr-like readds can be performed as with options -fusePair :: GlobalSettings - -> ProcessedData - -> Int - -> V.Vector (V.Vector CharInfo) - -> VertexCost - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> (PhylogeneticGraph, PhylogeneticGraph) - -> [PhylogeneticGraph] -fusePair inGS inData numLeaves charInfoVV netPenalty keepNum maxMoveEdgeDist doNNI doSPR doTBR (leftGraph, rightGraph) = - if (LG.isEmpty $ fst6 leftGraph) || (LG.isEmpty $ fst6 rightGraph) then error "Empty graph in fusePair" - else if (fst6 leftGraph) == (fst6 rightGraph) then [] - else - -- split graphs at all bridge edges (all edges for Tree) - let -- left graph splits - leftDecoratedGraph = thd6 leftGraph - (leftRootIndex, _) = head $ LG.getRoots leftDecoratedGraph - leftBreakEdgeList = if (graphType inGS) == Tree then filter ((/= leftRootIndex) . fst3) $ LG.labEdges leftDecoratedGraph - else filter ((/= leftRootIndex) . fst3) $ LG.getEdgeSplitList leftDecoratedGraph - leftSplitTupleList = PU.seqParMap rdeepseq (LG.splitGraphOnEdge leftDecoratedGraph) leftBreakEdgeList -- `using` PU.myParListChunkRDS - (_, _, leftPrunedGraphRootIndexList, leftOriginalConnectionOfPrunedList) = L.unzip4 leftSplitTupleList - --leftPrunedGraphRootIndexList = fmap thd4 leftSplitTupleList - leftPrunedGraphBVList = fmap bvLabel $ fmap fromJust $ fmap (LG.lab leftDecoratedGraph) leftPrunedGraphRootIndexList - - - -- right graph splits - rightDecoratedGraph = thd6 rightGraph - (rightRootIndex, _) = head $ LG.getRoots rightDecoratedGraph - rightBreakEdgeList = if (graphType inGS) == Tree then filter ((/= rightRootIndex) . fst3) $ LG.labEdges rightDecoratedGraph - else filter ((/= rightRootIndex) . fst3) $ LG.getEdgeSplitList rightDecoratedGraph - rightSplitTupleList = PU.seqParMap rdeepseq (LG.splitGraphOnEdge rightDecoratedGraph) rightBreakEdgeList -- `using` PU.myParListChunkRDS - (_, _, rightPrunedGraphRootIndexList, rightOriginalConnectionOfPrunedList) = L.unzip4 rightSplitTupleList - -- rightPrunedGraphRootIndexList = fmap thd4 rightSplitTupleList - rightPrunedGraphBVList = fmap bvLabel $ fmap fromJust $ fmap (LG.lab rightDecoratedGraph) rightPrunedGraphRootIndexList - - - -- need to get all pairs of split graphs - (leftSplitTupleList', rightSplitTupleList') = unzip $ cartProd leftSplitTupleList rightSplitTupleList - (leftPrunedGraphBVList', rightPrunedGraphBVList') = unzip $ cartProd leftPrunedGraphBVList rightPrunedGraphBVList - -- (leftBaseBVList, rightBaseBVList) = unzip $ cartProd leftBaseGraphBVList rightBaseGraphBVList - - - -- get compatible split pairs via checking bv of root index of pruned subgraphs - leftRightMatchList = zipWith (==) leftPrunedGraphBVList' rightPrunedGraphBVList' - - -- only take compatible, non-identical pairs with > 2 terminal--otherwise basically SPR move or nmothing (if identical) - -- oalso checks that prune and splits don't match between the grap[hs to be recombined] - recombinablePairList = L.zipWith (getCompatibleNonIdenticalSplits numLeaves) leftRightMatchList leftPrunedGraphBVList' - (leftValidTupleList, rightValidTupleList, _) = L.unzip3 $ filter ((==True) . thd3) $ zip3 leftSplitTupleList' rightSplitTupleList' recombinablePairList - - - -- create new "splitgraphs" by replacing nodes and edges of pruned subgraph in reciprocal graphs - -- retuns reindexed list of base graph root, pruned component root, parent of pruned component root, original graph break edge - (leftBaseRightPrunedSplitGraphList, leftRightGraphRootIndexList, leftRightPrunedParentRootIndexList, leftRightPrunedRootIndexList, leftRightOriginalConnectionOfPrunedList) = L.unzip5 (PU.seqParMap rdeepseq (exchangePrunedGraphs numLeaves) (zip3 leftValidTupleList rightValidTupleList leftOriginalConnectionOfPrunedList)) -- `using` PU.myParListChunkRDS) - - (rightBaseLeftPrunedSplitGraphList, rightLeftGraphRootIndexList, rightLeftPrunedParentRootIndexList, rightLeftPrunedRootIndexList, rightLeftOriginalConnectionOfPrunedList) = L.unzip5 (PU.seqParMap rdeepseq (exchangePrunedGraphs numLeaves) (zip3 rightValidTupleList leftValidTupleList rightOriginalConnectionOfPrunedList)) -- `using` PU.myParListChunkRDS) - - -- reoptimize splitGraphs so ready for readdition--using updated base and prune indices - -- False for doIA - leftRightOptimizedSplitGraphCostList = PU.seqParMap rdeepseq (S.reoptimizeSplitGraphFromVertexTuple inGS inData False netPenalty) (zip3 leftBaseRightPrunedSplitGraphList leftRightGraphRootIndexList leftRightPrunedRootIndexList) -- `using` PU.myParListChunkRDS - - rightLeftOptimizedSplitGraphCostList = PU.seqParMap rdeepseq (S.reoptimizeSplitGraphFromVertexTuple inGS inData False netPenalty) (zip3 rightBaseLeftPrunedSplitGraphList rightLeftGraphRootIndexList rightLeftPrunedRootIndexList) -- `using` PU.myParListChunkRDS - - -- Check if base graphs are different as well (nneded to be reoptimized to get base root bv) - -- otherwise no point in recombination - {- - leftBaseGraphBVList = fmap bvLabel $ fmap fromJust $ zipWith LG.lab (fmap fst leftRightOptimizedSplitGraphCostList) leftRightGraphRootIndexList - rightBaseGraphBVList =fmap bvLabel $ fmap fromJust $ zipWith LG.lab (fmap fst rightLeftOptimizedSplitGraphCostList) rightLeftGraphRootIndexList - baseGraphDifferentList = zipWith (/=) leftBaseBVList rightBaseBVList - -} - baseGraphDifferentList = L.replicate (length leftRightOptimizedSplitGraphCostList) True - - (_, leftRightOptimizedSplitGraphCostList', _, leftRightPrunedRootIndexList', leftRightPrunedParentRootIndexList', leftRightOriginalConnectionOfPrunedList') = L.unzip6 $ filter ((== True) . fst6) $ L.zip6 baseGraphDifferentList leftRightOptimizedSplitGraphCostList leftRightGraphRootIndexList leftRightPrunedRootIndexList leftRightPrunedParentRootIndexList leftRightOriginalConnectionOfPrunedList - - (_, rightLeftOptimizedSplitGraphCostList', _, rightLeftPrunedRootIndexList', rightLeftPrunedParentRootIndexList', rightLeftOriginalConnectionOfPrunedList') = L.unzip6 $ filter ((== True) . fst6) $ L.zip6 baseGraphDifferentList rightLeftOptimizedSplitGraphCostList rightLeftGraphRootIndexList rightLeftPrunedRootIndexList rightLeftPrunedParentRootIndexList rightLeftOriginalConnectionOfPrunedList - - -- re-add pruned component to base component left-right and right-left - -- need cure best cost - curBetterCost = min (snd6 leftGraph) (snd6 rightGraph) - - -- get network penalty factors to pass on - networkCostFactor = min (getNetworkPentaltyFactor inGS (snd6 leftGraph) leftGraph) (getNetworkPentaltyFactor inGS (snd6 rightGraph) rightGraph) - - - -- left and right root indices shold be the same - leftRightFusedGraphList = recombineComponents inGS inData keepNum maxMoveEdgeDist doNNI doSPR doTBR charInfoVV curBetterCost leftRightOptimizedSplitGraphCostList' leftRightPrunedRootIndexList' leftRightPrunedParentRootIndexList' leftRightOriginalConnectionOfPrunedList' leftRootIndex networkCostFactor - rightLeftFusedGraphList = recombineComponents inGS inData keepNum maxMoveEdgeDist doNNI doSPR doTBR charInfoVV curBetterCost rightLeftOptimizedSplitGraphCostList' rightLeftPrunedRootIndexList' rightLeftPrunedParentRootIndexList' rightLeftOriginalConnectionOfPrunedList' rightRootIndex networkCostFactor - - - -- get "best" fused graphs from leftRight and rightLeft - bestFusedGraphs = take keepNum $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (leftRightFusedGraphList ++ rightLeftFusedGraphList) - - -- | get fuse graphs via swap function - in - if null leftValidTupleList then [] - else - {- - trace ("FP: " ++ (show (length leftValidTupleList, length rightValidTupleList)) ++ " num (Left,Right) " ++ (show (length leftSplitTupleList, length rightSplitTupleList)) - ++ "\nLeftRight splitCost " ++ (show $ fmap snd leftRightOptimizedSplitGraphCostList) - ++ "\nrightLeft splitCost " ++ (show $ fmap snd rightLeftOptimizedSplitGraphCostList)) - -} - bestFusedGraphs - --- | recombineComponents takes readdition arguments (swap, steepest etc) and wraps the swap-stype rejoining of components --- ignores doSteepeast for now--doesn't seem to have meaning in rejoining since not then taking that graph for fusion and shortcircuiting --- not doing original connection first (originalConnectionOfPrunedComponentList)-since so much work might as well do soem SPR at least -recombineComponents :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> V.Vector (V.Vector CharInfo) - -> VertexCost - -> [(DecoratedGraph, VertexCost)] - -> [Int] - -> [Int] - -> [Int] - -> LG.Node - -> VertexCost - -> [PhylogeneticGraph] -recombineComponents inGS inData numToKeep inMaxMoveEdgeDist doNNI doSPR doTBR charInfoVV curBestCost splitGraphCostPairList prunedRootIndexList prunedParentRootIndexList _ graphRoot networkCostFactor = - -- check and see if any reconnecting to do - --trace ("RecombineComponents " ++ (show $ length splitGraphCostPairList)) ( - if null splitGraphCostPairList then [] - else - -- top line to cover SPR HarWired bug - let swapType = if doTBR then "tbr" - else if doSPR then "spr" - else if doNNI then "nni" - else "spr" -- will be set with 2 as maxMoveEdgeDist - - doIA = False --- since splits not created together, IA won't be consistent between components - steepest = False -- should look at all better - - -- network costs--using an input value that is minimum of inputs - netPenaltyFactorList = L.replicate (length splitGraphCostPairList) networkCostFactor - - -- no simulated annealling functionality here - inSimAnnealParams = Nothing - - -- get edges in pruned (to be exchanged) graphs - edgesInPrunedList = fmap LG.getEdgeListAfter $ zip (fmap fst splitGraphCostPairList) prunedParentRootIndexList - - -- get edges in base (not to be exchanged) graphs - rejoinEdgesList = fmap (getBaseGraphEdges graphRoot) $ zip (fmap fst splitGraphCostPairList) edgesInPrunedList - - --huge zip to fit arguments into revised join function - graphDataList = zip9 (fmap fst splitGraphCostPairList) - (fmap GO.convertDecoratedToSimpleGraph $ fmap fst splitGraphCostPairList) - (fmap snd splitGraphCostPairList) - (L.replicate (length splitGraphCostPairList) graphRoot) - prunedRootIndexList - prunedParentRootIndexList - rejoinEdgesList - edgesInPrunedList - netPenaltyFactorList - - -- do "all additions" - - recombinedGraphList = concat $ PU.seqParMap rdeepseq (S.rejoinGraphTuple swapType inGS inData numToKeep inMaxMoveEdgeDist steepest curBestCost [] doIA charInfoVV inSimAnnealParams) graphDataList - - - -- this based on heuristic deltas - bestFuseCost = if null recombinedGraphList then infinity - else minimum $ fmap snd6 recombinedGraphList - - in - --trace ("Checking in fusing") ( - if null recombinedGraphList then [] - else if bestFuseCost <= curBestCost then - take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] recombinedGraphList - else [] - -- ) - -- ) - --- | getNetworkPentaltyFactor get scale network penalty for graph -getNetworkPentaltyFactor :: GlobalSettings -> VertexCost -> PhylogeneticGraph -> VertexCost -getNetworkPentaltyFactor inGS graphCost inGraph = - if LG.isEmpty $ thd6 inGraph then 0.0 - else - let inGraphNetPenalty = if (graphType inGS == Tree) || (graphType inGS == HardWired) then 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then 0.0 - else if (graphFactor inGS) == Wheeler2015Network then T.getW15NetPenalty Nothing inGraph - else if (graphFactor inGS) == Wheeler2023Network then T.getW23NetPenalty Nothing inGraph - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - in - inGraphNetPenalty / graphCost - - - - --- | getBaseGraphEdges gets teh edges in the base graph teh trhe exchanged sub graphs can be rejoined --- basically all edges except at root and those in the subgraph -getBaseGraphEdges :: (Eq b) => LG.Node -> (LG.Gr a b, [LG.LEdge b]) -> [LG.LEdge b] -getBaseGraphEdges graphRoot (inGraph, edgesInSubGraph) = - if LG.isEmpty inGraph then [] - else - filter ((/= graphRoot) . fst3) $ (LG.labEdges inGraph) L.\\ edgesInSubGraph - --- | getCompatibleNonIdenticalSplits takes the number of leaves, splitGraph of the left graph, the splitGraph if the right graph, --- the bitVector equality list of pruned roots, the bitvector of the root of the pruned graph on left --- (this could be either since filter for identity--just to check leaf numbers) --- checks that the leaf sets of the pruned subgraphs are equal, greater than 1 leaf, fewer thanm nleaves - 2, and non-identical --- removed identity check fo now--so much time to do that (O(n)) may not be worth it -getCompatibleNonIdenticalSplits :: Int - -> Bool - -> BV.BitVector - -> Bool -getCompatibleNonIdenticalSplits numLeaves leftRightMatch leftPrunedGraphBV = - - if not leftRightMatch then False - else if popCount leftPrunedGraphBV < 3 then False - else if popCount leftPrunedGraphBV > (numLeaves - 3) then False - else True - {- - -- check for pruned components non-identical - - let -- (leftNodesInPrunedGraph, _) = LG.nodesAndEdgesAfter (fst4 leftSplitTuple) [((thd4 leftSplitTuple), fromJust $ LG.lab (fst4 leftSplitTuple) (thd4 leftSplitTuple))] - -- leftPrunedBVNodeList = L.sort $ filter ((> 1) . popCount) $ fmap (bvLabel . snd) leftNodesInPrunedGraph - -- (rightNodesInPrunedGraph, _) = LG.nodesAndEdgesAfter (fst4 rightSplitTuple) [((thd4 rightSplitTuple), fromJust $ LG.lab (fst4 rightSplitTuple) (thd4 rightSplitTuple))] - -- rightPrunedBVNodeList = L.sort $ filter ((> 1) . popCount) $ fmap (bvLabel . snd) rightNodesInPrunedGraph - - - -- (leftNodesInBaseGraph, _) = LG.nodesAndEdgesAfter (fst4 leftSplitTuple) [((snd4 leftSplitTuple), fromJust $ LG.lab (fst4 leftSplitTuple) (snd4 leftSplitTuple))] - -- leftBaseBVNodeList = L.sort $ filter ((> 1) . popCount) $ fmap (bvLabel . snd) leftNodesInPrunedGraph - -- (rightNodesInBaseGraph, _) = LG.nodesAndEdgesAfter (fst4 rightSplitTuple) [((snd4 rightSplitTuple), fromJust $ LG.lab (fst4 rightSplitTuple) (snd4 rightSplitTuple))] - -- rightBaseBVNodeList = L.sort $ filter ((> 1) . popCount) $ fmap (bvLabel . snd) rightNodesInPrunedGraph - - in - --if leftPrunedBVNodeList == rightPrunedBVNodeList then False - -- else if leftBaseBVNodeList == rightBaseBVNodeList then False - --else True - True - -} - --- | exchangePrunedGraphs creates a new "splitGraph" containing both first (base) and second (pruned) graph components --- both components need to have HTU and edges reindexed to be in sync, oringal edge terminal node is also reindexed and returned for limit readd distance -exchangePrunedGraphs :: Int -> ((DecoratedGraph, LG.Node, LG.Node, LG.Node), (DecoratedGraph, LG.Node, LG.Node, LG.Node), LG.Node) -> (DecoratedGraph, Int , Int, Int, Int) -exchangePrunedGraphs numLeaves (firstGraphTuple, secondGraphTuple, breakEdgeNode) = - let (firstSplitGraph, firstGraphRootIndex, _, _) = firstGraphTuple - (secondSplitGraph, _, secondPrunedGraphRootIndex, _) = secondGraphTuple - - -- get nodes and edges of firstBase - firstGraphRootLabel = fromJust $ LG.lab firstSplitGraph firstGraphRootIndex - firstGraphRootNode = (firstGraphRootIndex, firstGraphRootLabel) - (firstBaseGraphNodeList', firstBaseGraphEdgeList) = LG.nodesAndEdgesAfter firstSplitGraph [firstGraphRootNode] - - --add in root nodes of partitions since not included in "nodesAfter" function - firstBaseGraphNodeList = firstGraphRootNode : firstBaseGraphNodeList' - - - -- get nodes and edges of second pruned - secondPrunedGraphRootLabel = fromJust $ LG.lab secondSplitGraph secondPrunedGraphRootIndex - secondPrunedGraphRootNode = (secondPrunedGraphRootIndex, secondPrunedGraphRootLabel) - secondPrunedParentNode = head $ LG.labParents secondSplitGraph secondPrunedGraphRootIndex - (secondPrunedGraphNodeList', secondPrunedGraphEdgeList') = LG.nodesAndEdgesAfter secondSplitGraph [secondPrunedGraphRootNode] - - -- add root node of second pruned since not included in "nodesAfter" function - -- add in gandparent nodes of pruned and its edges to pruned graphs - secondPrunedGraphNodeList = [secondPrunedGraphRootNode, secondPrunedParentNode] ++ secondPrunedGraphNodeList' - secondPrunedGraphEdgeList = (head $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) : secondPrunedGraphEdgeList' - - -- reindex base and pruned partitions (HTUs and edges) to get in sync and make combinable - -- 0 is dummy since won't be in base split - (baseGraphNodes, baseGraphEdges, numBaseHTUs, reindexedBreakEdgeNodeBase) = reindexSubGraph numLeaves 0 firstBaseGraphNodeList firstBaseGraphEdgeList breakEdgeNode - (prunedGraphNodes, prunedGraphEdges, _, _) = reindexSubGraph numLeaves numBaseHTUs secondPrunedGraphNodeList secondPrunedGraphEdgeList breakEdgeNode - - -- should always be in base graph--should be in first (base) component--if not use original node - reindexedBreakEdgeNode = if (reindexedBreakEdgeNodeBase /= Nothing) then fromJust reindexedBreakEdgeNodeBase - else breakEdgeNode - - - -- create and reindex new split graph - newSplitGraph = LG.mkGraph (baseGraphNodes ++ prunedGraphNodes) (baseGraphEdges ++ prunedGraphEdges) - - -- get graph root Index, pruned root index, pruned root parent index - -- firstGraphRootIndex should not have changed in reindexing--same as numLeaves - prunedParentRootIndex = fst $ head $ (LG.getRoots newSplitGraph) L.\\ [firstGraphRootNode] - prunedRootIndex = head $ LG.descendants newSplitGraph prunedParentRootIndex - - in - if (length $ LG.getRoots newSplitGraph) /= 2 then error ("Not 2 components in split graph: " ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph newSplitGraph)) - else if (length $ LG.descendants newSplitGraph prunedParentRootIndex) /= 1 then error ("Too many children of parentPrunedNode: " ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph newSplitGraph)) - else if (length $ LG.parents secondSplitGraph secondPrunedGraphRootIndex) /= 1 then error ("Parent number not equal to 1 in node " - ++ (show secondPrunedGraphRootIndex) ++ " of second graph\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph secondSplitGraph)) - else if (length $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) /= 1 then error ("Edge incedent tor pruned graph not equal to 1 in node " - ++ (show $ fmap LG.toEdge $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) ++ " of second graph\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph secondSplitGraph)) - else - {- - } trace ("Nodes: " ++ (show (firstGraphRootIndex, prunedParentRootIndex, prunedRootIndex)) ++ " First Graph\n:" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph firstSplitGraph) - ++ "\nSecond Graph\n:" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph secondSplitGraph) - ++ "\nNew split graph\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph newSplitGraph) - ) - -} - (newSplitGraph, firstGraphRootIndex, prunedParentRootIndex, prunedRootIndex, reindexedBreakEdgeNode) - - --- | reindexSubGraph reindexes the non-leaf nodes and edges of a subgraph to allow topological combination of subgraphs --- the leaf indices are unchanges but HTUs are changes ot in order enumeration statting with an input offset --- new BreakEdge is returned as a Maybe becuase may be either in base or pruned subgraphs -reindexSubGraph :: Int -> Int -> [LG.LNode VertexInfo] -> [LG.LEdge b] -> LG.Node -> ([LG.LNode VertexInfo], [LG.LEdge b], Int, Maybe LG.Node) -reindexSubGraph numLeaves offset nodeList edgeList origBreakEdge = - if null nodeList || null edgeList then ([],[], offset, Nothing) - else - -- create map of node indices from list - let (newNodeList, indexList) = unzip $ getPairList numLeaves offset nodeList - indexMap = MAP.fromList indexList - newEdgeList = fmap (reIndexEdge indexMap) edgeList - newBreakEdge = MAP.lookup origBreakEdge indexMap - in - {- - if newBreakEdge == Nothing then error ("Map index for break edge node not found: " ++ (show origBreakEdge) ++ " in Map " ++ (show $ MAP.toList indexMap)) - else - -} - -- trace ("RISG:" ++ (show (fmap fst nodeList, fmap fst newNodeList, numLeaves)) ++ " map " ++ (show $ MAP.toList indexMap)) - (newNodeList, newEdgeList, 1 + (maximum $ fmap fst newNodeList) - numLeaves, newBreakEdge) - --- | reIndexEdge takes a map and a labelled edge and returns new indices same label edge based on map -reIndexEdge :: MAP.Map Int Int -> LG.LEdge b -> LG.LEdge b -reIndexEdge indexMap (u,v,l) = - let u' = MAP.lookup u indexMap - v' = MAP.lookup v indexMap - in - if u' == Nothing || v' == Nothing then error ("Error in map lookup in reindexEdge: " ++ show (u,v)) - else (fromJust u', fromJust v', l) - - --- | getPairList returns an original index new index lits of pairs --- assumes leaf nmodes are first numleaves -getPairList :: Int -> Int -> [LG.LNode VertexInfo] -> [(LG.LNode VertexInfo, (Int, Int))] -getPairList numLeaves counter nodeList = - if null nodeList then [] - else - let (firstIndex, firstLabel) = head nodeList - newLabel = firstLabel {vertName = TL.pack ("HTU" ++ (show $ counter + numLeaves))} - in - if firstIndex < numLeaves then (head nodeList, (firstIndex, firstIndex)) : getPairList numLeaves counter (tail nodeList) - else ((counter + numLeaves, newLabel) , (firstIndex, (counter + numLeaves))) : getPairList numLeaves (counter + 1) (tail nodeList) - -{- --- | recombineComponents' takes readdition arguments (swap, steepest etc) and wraps the swap-stype rejoining of components --- ignores doSteepeast for now--doesn't seem to have meaning in rejoining since not then taking that graph for fusion and shortcircuiting -recombineComponents' :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> V.Vector (V.Vector CharInfo) - -> VertexCost - -> [(DecoratedGraph, VertexCost)] - -> [Int] - -> [Int] - -> [Int] - -> [PhylogeneticGraph] -recombineComponents' inGS inData numToKeep inMaxMoveEdgeDist doNNI' doSPR' doTBR' charInfoVV curBestCost splitGraphCostPairList prunedRootIndexList prunedParentRootIndexList originalConnectionOfPrunedComponentList = - -- check and see if any reconnecting to do - --trace ("RecombineComponents " ++ (show $ length splitGraphCostPairList)) ( - if null splitGraphCostPairList then [] - else - -- top line to cover SPR HarWired bug - let (doTBR, doSPR, doNNI, maxMoveEdgeDist, hardWiredSPR) = if (graphType inGS /= HardWired) then (doTBR', doSPR', doNNI', inMaxMoveEdgeDist, False) - else - if doNNI' then (True, False, False, 2, True) - else if doSPR' then (True, False, False, inMaxMoveEdgeDist, True) - else (doTBR', doSPR', doNNI', inMaxMoveEdgeDist, False) - swapType = if doTBR then "tbr" - else if doSPR then "spr" - else if doNNI then "nni" - else "spr" -- will be set with 2 as maxMoveEdgeDist - - doIA = False --- since splits not created together, IA won't be consistent between components - - graphDataList = L.zip4 splitGraphCostPairList prunedRootIndexList prunedParentRootIndexList originalConnectionOfPrunedComponentList - - -- do "all additions" --steepest really doens't have meaning here since will not pop out to recombine on new graph - -- False for doSteepest - recombinedSimpleGraphCostPairList = concat (PU.seqParMap rdeepseq (S.rejoinGraphKeepBestTuple inGS swapType hardWiredSPR curBestCost maxMoveEdgeDist doIA charInfoVV) graphDataList) -- `using` PU.myParListChunkRDS) - - -- this based on heuristic deltas - bestFuseCost = minimum $ fmap snd recombinedSimpleGraphCostPairList - - -- check harwired for cycles - bestFuseSimpleGraphs = if (graphType inGS == HardWired ) then fmap fst $ filter ((== bestFuseCost) . snd) $ filter ((== False) . (LG.cyclic . fst)) $ recombinedSimpleGraphCostPairList - else fmap fst $ filter ((== bestFuseCost) . snd) recombinedSimpleGraphCostPairList - - in - --trace ("Checking in fusing") ( - if null recombinedSimpleGraphCostPairList then [] - else if (bestFuseCost / (dynamicEpsilon inGS)) <= curBestCost then - let rediagnodedGraphList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) bestFuseSimpleGraphs -- `using` PU.myParListChunkRDS - bestRediagnosedGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] rediagnodedGraphList - in - if (snd6 $ head bestRediagnosedGraphList) <= curBestCost then bestRediagnosedGraphList - else [] - else [] - -- ) - -- ) --} - diff --git a/pkg/PhyGraph/Search/GeneticAlgorithm.hs b/pkg/PhyGraph/Search/GeneticAlgorithm.hs deleted file mode 100644 index d15fe9562..000000000 --- a/pkg/PhyGraph/Search/GeneticAlgorithm.hs +++ /dev/null @@ -1,187 +0,0 @@ -{- | -Module : GeneticAlgorithm.hs -Description : Module specifying graph sGeneticAlgorithm functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.GeneticAlgorithm ( geneticAlgorithm - ) where - -import GeneralUtilities -import qualified Graphs.GraphOperations as GO -import qualified Search.Fuse as F -import qualified Search.NetworkAddDelete as N -import qualified Search.Swap as S -import Types.Types -import qualified Utilities.LocalGraph as LG - - --- | geneticAlgorithm takes arguments and performs genetic algorithm on input graphs --- the process follows several steps --- 1) input graphs are mutated --- this step is uncharacteristically first so that is can operate on --- graphs that have been "fused" (recombined) already --- mutated graphs are added up to popsize --- if input graphs are already at the population size, an equal number of mutants are added (exceeding input popsize) --- 2) graph are recombined using fusing operations --- 3) population undergoes selection to population size (unique graphs) --- selection based on delta with best graph and severity factor on (0,Inf) 1 pure cost delta < 1 more severe, > 1 less severe --- if "elitist" (default) 'best' graphs are always selected to ensure no worse. --- 4) operation repearts for number of generations -geneticAlgorithm :: GlobalSettings -> ProcessedData -> Int -> Bool -> Int -> Int -> Int -> Int -> Int -> Double -> Int -> [PhylogeneticGraph] -> ([PhylogeneticGraph], Int) -geneticAlgorithm inGS inData rSeed doElitist maxNetEdges keepNum popSize generations generationCounter severity recombinations inGraphList = - if null inGraphList then ([], 0) - else if generationCounter == generations then (inGraphList, generationCounter) - else - let seedList = randomIntList rSeed - - -- get elite list of best solutions - initialEliteList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] inGraphList - - -- mutate input graphs, produces number input, limited to popsize - mutatedGraphList' = zipWith (mutateGraph inGS inData maxNetEdges) (randomIntList $ head seedList) $ takeRandom (seedList !! 1) popSize inGraphList - - -- adjust to correct populationsize if input number < popSize - mutatedGraphList = if length mutatedGraphList' >= popSize then mutatedGraphList' - else - let numShort = popSize - (length mutatedGraphList') - additionalMutated = zipWith (mutateGraph inGS inData maxNetEdges) (randomIntList $ seedList !! 2) $ takeRandom (seedList !! 3) numShort inGraphList - in - mutatedGraphList' ++ additionalMutated - - -- get unique graphs, no point in recombining repetitions - uniqueMutatedGraphList = GO.selectPhylogeneticGraph [("unique","")] 0 ["unique"] (mutatedGraphList ++ inGraphList) - - -- recombine elite with mutated and mutated with mutated - recombineSwap = getRandomElement (seedList !! 4) ["nni", "spr", "tbr"] - (doNNI, doSPR, doTBR) = if recombineSwap == "nni" then (True, False, False) - else if recombineSwap == "spr" then (False, True, False) - else (False, False, True) - doSteepest = True - doAll = False - returnBest = False - returnUnique = True - singleRound = False - fusePairs = Just recombinations - randomPairs = True - - (recombinedGraphList, _) = F.fuseAllGraphs inGS inData (drop 6 seedList) (2 * popSize) (maxBound :: Int) 0 doNNI doSPR doTBR doSteepest doAll returnBest returnUnique singleRound fusePairs randomPairs uniqueMutatedGraphList - - -- selection of graphs population - -- unique sorted on cost so getting unique with lowest cost - selectedGraphs = take popSize $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] recombinedGraphList - newCost = snd6 $ head selectedGraphs - - in - {- - trace ("\tGA " ++ (show $ snd6 $ head initialEliteList) ++ " -> " ++ (show newCost) ++ "\nInGraphs " ++ (show $ L.sort $ fmap snd6 inGraphList) - ++ "\nMutated " ++ (show $ L.sort $ fmap snd6 mutatedGraphList) - ++ "\nRecombined " ++ recombineSwap ++ " " ++ (show $ L.sort $ fmap snd6 recombinedGraphList)) ( - -} - -- if new graphs better cost then take those - if newCost < (snd6 $ head initialEliteList) then - geneticAlgorithm inGS inData (seedList !! 5) doElitist maxNetEdges keepNum popSize generations (generationCounter + 1) severity recombinations selectedGraphs - - -- if noew graphs not better then add in elites toi ensure monotonic decrease in cost - else - let newGraphList = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (initialEliteList ++ selectedGraphs) - in - geneticAlgorithm inGS inData (seedList !! 5) doElitist maxNetEdges keepNum popSize generations (generationCounter + 1) severity recombinations newGraphList - -- ) - --- | mutateGraph mutates a graph using drift functionality -mutateGraph :: GlobalSettings -> ProcessedData -> Int -> Int -> PhylogeneticGraph -> PhylogeneticGraph -mutateGraph inGS inData maxNetEdges rSeed inGraph = - if LG.isEmpty (fst6 inGraph) then error "Empty graph in mutateGraph" - else - let randList = randomIntList rSeed - saValues = Just $ SAParams { method = Drift - , numberSteps = 0 - , currentStep = 0 - , randomIntegerList = randomIntList rSeed - , rounds = 1 - , driftAcceptEqual = 0.67 - , driftAcceptWorse = 0.0 - -- this could be an important factor don't want too severe, but significant - , driftMaxChanges = getRandomElement (randList !! 1) [1,2,4] -- or something - , driftChanges = 0 - } - in - let --randomize edit type - editType = getRandomElement (randList !! 0) ["swap", "netEdge"] - - -- randomize Swap parameters - alternate = True - numToKeep = 1 - maxMoveEdgeDist = 10 - steepest = True - doIA = False - returnMutated = True - inSimAnnealParams = saValues - swapType = getRandomElement (randList !! 2) ["spr","tbr"] - - --randomize network edit parameters - netEditType = getRandomElement (randList !! 3) ["netAdd", "netDelete", "netMove", "netAddDelete"] - doRandomOrder = True - maxRounds = getRandomElement (randList !! 4) [1..5] - - in - - -- only swap stuff for tree - if graphType inGS == Tree || (LG.isTree (fst6 inGraph) && netEditType /= "netadd") then - head $ fst $ S.swapSPRTBR swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate doIA returnMutated (inSimAnnealParams, inGraph) - - -- graphs choose what type of mutation at random - else - if editType == "swap" then - head $ fst $ S.swapSPRTBR swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate doIA returnMutated (inSimAnnealParams, inGraph) - else - -- move only for Hardwired - if (graphType inGS) == HardWired then - head $ fst $ N.moveAllNetEdges inGS inData (randList !! 4) maxNetEdges numToKeep 0 returnMutated steepest doRandomOrder ([], infinity) (inSimAnnealParams, [inGraph]) - - - -- SoftWired - else - if netEditType == "netMove" then - head $ fst $ N.moveAllNetEdges inGS inData (randList !! 4) maxNetEdges numToKeep 0 returnMutated steepest doRandomOrder ([], infinity) (inSimAnnealParams, [inGraph]) - - else if netEditType == "netAdd" then - head $ fst $ N.insertAllNetEdges inGS inData (randList !! 4) maxNetEdges numToKeep maxRounds 0 returnMutated steepest doRandomOrder ([], infinity) (inSimAnnealParams, [inGraph]) - - else if netEditType == "netAddDelete" then - head $ fst $ N.addDeleteNetEdges inGS inData (randList !! 4) maxNetEdges numToKeep maxRounds 0 returnMutated steepest doRandomOrder ([], infinity) (inSimAnnealParams, [inGraph]) - - -- net delete - else - head $ fst $ N.deleteAllNetEdges inGS inData (randList !! 4) maxNetEdges numToKeep 0 returnMutated steepest doRandomOrder ([], infinity) (inSimAnnealParams, [inGraph]) diff --git a/pkg/PhyGraph/Search/NetworkAddDelete.hs b/pkg/PhyGraph/Search/NetworkAddDelete.hs deleted file mode 100644 index 1edda5502..000000000 --- a/pkg/PhyGraph/Search/NetworkAddDelete.hs +++ /dev/null @@ -1,1651 +0,0 @@ -{- | -Module : NetworkAddDelete.hs -Description : Module specifying graph egde adding and deleting functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.NetworkAddDelete ( deleteAllNetEdges - , insertAllNetEdges - , moveAllNetEdges - , deltaPenaltyAdjustment - , deleteNetEdge - , deleteOneNetAddAll - , addDeleteNetEdges - ) where - -import Control.Parallel.Strategies -import Data.Bits -import Data.Maybe -import qualified Data.Text.Lazy as TL -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified GraphOptimization.PostOrderFunctions as POS -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Utilities.Utilities as U -import qualified Data.InfList as IL -import qualified GraphOptimization.PostOrderSoftWiredFunctions as POSW -import qualified GraphOptimization.PreOrderFunctions as PRE --- import qualified Data.List as L - - --- | addDeleteNetEdges is a wrapper for addDeleteNetEdges' allowing for multiple simulated annealing rounds -addDeleteNetEdges :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -addDeleteNetEdges inGS inData rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - let leafGraph = LG.extractLeafGraph $ (thd6 . head) inPhyloGraphList - in - if inSimAnnealParams == Nothing then addDeleteNetEdges' inGS inData leafGraph rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) Nothing inPhyloGraphList - else - let -- create list of params with unique list of random values for rounds of annealing - annealingRounds = rounds $ fromJust inSimAnnealParams - saPAramList = (U.generateUniqueRandList annealingRounds inSimAnnealParams) -- (replicate annealingRounds inPhyloGraphList) - - (annealRoundsList, counterList) = unzip (PU.seqParMap rdeepseq (addDeleteNetEdges'' inGS inData leafGraph rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost)) (zip saPAramList (replicate annealingRounds inPhyloGraphList))) - - in - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (concat annealRoundsList) , sum counterList) - --- | addDeleteNetEdges'' is wrapper around addDeleteNetEdges' to use parmap -addDeleteNetEdges'' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -addDeleteNetEdges'' inGS inData leafGraph rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - addDeleteNetEdges' inGS inData leafGraph rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList - --- | addDeleteNetEdges' removes each edge and adds an edge to all possible places (or steepest) each round --- until no better or additional graphs are found (or max rounds met) --- call with ([], infinity) [single input graph] --- doesn't have to be random, but likely to converge quickly if not -addDeleteNetEdges' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> Maybe SAParams - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], Int) -addDeleteNetEdges' inGS inData leafGraph rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList = - if null inPhyloGraphList then (take numToKeep curBestGraphList, counter) - - -- if hit maxmimum rounds then return - else if counter == maxRounds then (take numToKeep curBestGraphList, counter) - - -- other wise add/delete - else - trace ("\tRound " ++ (show counter)) ( - let -- insert edges first - randIntList = randomIntList rSeed - (insertGraphList, _) = insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList - - -- if no better--take input for delte phase - randIntList2 = randomIntList (randIntList !! 1) - (insertGraphList', insertGraphCost, toDeleteList) = if null insertGraphList then (curBestGraphList, curBestGraphCost, inPhyloGraphList) - else - let newList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] insertGraphList - in - (newList, snd6 $ head newList, newList) - - -- delete edges - (deleteGraphList, _) = deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (insertGraphList', insertGraphCost) randIntList2 inSimAnnealParams toDeleteList - - -- gather beter if any - (newBestGraphList, newBestGraphCost, graphsToDoNext) = if null deleteGraphList then (curBestGraphList, curBestGraphCost, inPhyloGraphList) - else - let newDeleteGraphs = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] deleteGraphList - in - (newDeleteGraphs, snd6 $ head newDeleteGraphs, newDeleteGraphs) - in - -- check is same then return - if newBestGraphCost == curBestGraphCost then - (take numToKeep curBestGraphList, counter) - - -- if better (or nothing) keep going - else - addDeleteNetEdges' inGS inData leafGraph (randIntList !! 2) maxNetEdges numToKeep maxRounds (counter + 1) returnMutated doSteepest doRandomOrder (newBestGraphList, newBestGraphCost) inSimAnnealParams graphsToDoNext - ) - - - - - --- | moveAllNetEdges is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds -moveAllNetEdges :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -moveAllNetEdges inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - if inSimAnnealParams == Nothing then moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) Nothing inPhyloGraphList - else - let -- create list of params with unique list of random values for rounds of annealing - annealingRounds = rounds $ fromJust inSimAnnealParams - saPAramList = (U.generateUniqueRandList annealingRounds inSimAnnealParams) -- (replicate annealingRounds inPhyloGraphList) - - (annealRoundsList, counterList) = unzip (PU.seqParMap rdeepseq (moveAllNetEdges'' inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost)) (zip saPAramList (replicate annealingRounds inPhyloGraphList))) - - in - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (concat annealRoundsList) , sum counterList) - --- | moveAllNetEdges'' is wrapper around moveAllNetEdges' to use parmap -moveAllNetEdges'' :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -moveAllNetEdges'' inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList - --- | moveAllNetEdges' removes each edge and adds an edge to all possible places (or steepest) each round --- until no better or additional graphs are found --- call with ([], infinity) [single input graph] -moveAllNetEdges' :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> Maybe SAParams - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], Int) -moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList = - if null inPhyloGraphList then (take numToKeep curBestGraphList, counter) - else - let firstPhyloGraph = head inPhyloGraphList - leafGraph = LG.extractLeafGraph $ thd6 firstPhyloGraph - currentCost = min curBestGraphCost (snd6 firstPhyloGraph) - - -- randomize order of edges to try moving - netEdgeList = if not doRandomOrder then - LG.labNetEdges (thd6 firstPhyloGraph) - else - permuteList rSeed $ LG.labNetEdges (thd6 firstPhyloGraph) - - - newGraphList' = deleteOneNetAddAll inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder firstPhyloGraph (fmap LG.toEdge netEdgeList) rSeed inSimAnnealParams - newGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList' - newGraphCost = if (not . null) newGraphList' then snd6 $ head newGraphList - else infinity - in - - -- if graph is a tree no edges to delete - {- - if LG.isTree (fst6 firstPhyloGraph) then - trace ("\tGraph in move network edges is tree--skipping") - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (firstPhyloGraph : curBestGraphList, currentCost) inSimAnnealParams (tail inPhyloGraphList) - -} - - -- if graph is a tree no edges to delete - if null netEdgeList then trace ("\t\tGraph in move has no network edges to move--skipping") (inPhyloGraphList, counter) - - -- regular move keeping best - else if inSimAnnealParams == Nothing then - if newGraphCost > currentCost then - -- trace ("\t MANE : Worse") - moveAllNetEdges' inGS inData maxNetEdges rSeed numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (firstPhyloGraph : curBestGraphList, currentCost) inSimAnnealParams (tail inPhyloGraphList) - else if newGraphCost < currentCost then - trace ("\tMANE-> " ++ (show newGraphCost)) ( - if doSteepest then - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newGraphList, newGraphCost) inSimAnnealParams newGraphList - else - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newGraphList, newGraphCost) inSimAnnealParams (newGraphList ++ (tail inPhyloGraphList)) - ) - - else - -- new graph list contains the input graph if equal and filterd unique already in moveAllNetEdges - let newCurSameBestList = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (curBestGraphList ++ newGraphList) - in - -- trace ("\t MANE : Equal") - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newCurSameBestList, currentCost) inSimAnnealParams (tail inPhyloGraphList) - - -- sim anneal choice - else - let -- abstract stopping criterion to continue - numDone = if (method $ fromJust inSimAnnealParams) == SimAnneal then currentStep $ fromJust inSimAnnealParams - else driftChanges $ fromJust inSimAnnealParams - numMax = if (method $ fromJust inSimAnnealParams) == SimAnneal then numberSteps $ fromJust inSimAnnealParams - else driftMaxChanges $ fromJust inSimAnnealParams - - -- get acceptance based on heuristic costs - uniqueGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] newGraphList' - annealBestCost = if (not . null) uniqueGraphList then min curBestGraphCost (snd6 $ head uniqueGraphList) - else curBestGraphCost - (acceptFirstGraph, newSAParams) = if (not . null) uniqueGraphList then U.simAnnealAccept inSimAnnealParams annealBestCost (snd6 $ head uniqueGraphList) - else (False, U.incrementSimAnnealParams inSimAnnealParams) - in - -- trace ("ACG" ++ (show acceptFirstGraph) ++ " " ++ (show $ snd6 $ head uniqueGraphList)) ( - if (numDone < numMax) then - -- this fixes tail fail - let nextUniqueList = if (not . null) uniqueGraphList then tail uniqueGraphList - else [] - in - - if acceptFirstGraph then - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head uniqueGraphList) : curBestGraphList, annealBestCost) newSAParams (nextUniqueList ++ (tail inPhyloGraphList)) - else - moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (curBestGraphList, annealBestCost) newSAParams (nextUniqueList ++ (tail inPhyloGraphList)) - - -- if want non-optimized list for GA or whatever - else if returnMutated then (take numToKeep curBestGraphList, counter) - - -- optimize list and return - else - let (bestMoveList', counter') = moveAllNetEdges' inGS inData rSeed maxNetEdges numToKeep (counter + 1) False doSteepest doRandomOrder ([], annealBestCost) Nothing (take numToKeep curBestGraphList) - bestMoveList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] bestMoveList' - in - --trace ("BM: " ++ (show $ snd6 $ head bestMoveList)) - (take numToKeep bestMoveList, counter') - -- ) - --- | (curBestGraphList, annealBestCost) is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds -insertAllNetEdges :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -insertAllNetEdges inGS inData rSeed maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - let leafGraph = LG.extractLeafGraph $ (thd6 . head) inPhyloGraphList - in - if inSimAnnealParams == Nothing then - - -- check for multiple rounds of addition--if > 1 then need to randomize order - if maxRounds == 1 then - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (randomIntList rSeed) Nothing inPhyloGraphList - else - -- need to concat and send different randomization lists for each "round" - let randSeedList = take maxRounds (randomIntList rSeed) - randIntListList = fmap randomIntList randSeedList - (insertGraphList, counterList) = unzip $ PU.seqParMap rdeepseq (insertAllNetEdgesRand inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest (curBestGraphList, curBestGraphCost) Nothing inPhyloGraphList) randIntListList - in - -- insert functions take care of returning "better" or empty - -- should be empty if nothing better - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (concat insertGraphList), sum counterList) - else - let -- create list of params with unique list of random values for rounds of annealing - annealingRounds = rounds $ fromJust inSimAnnealParams - annealParamGraphList = U.generateUniqueRandList annealingRounds inSimAnnealParams - replicateRandIntList = fmap randomIntList (take annealingRounds (randomIntList rSeed)) - - (annealRoundsList, counterList) = unzip (PU.seqParMap rdeepseq (insertAllNetEdges'' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost)) (zip3 replicateRandIntList annealParamGraphList (replicate annealingRounds inPhyloGraphList))) - in - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (concat annealRoundsList) , sum counterList) - --- | insertAllNetEdgesRand is a wrapper around insertAllNetEdges'' to pass unique randomLists to insertAllNetEdges' -insertAllNetEdgesRand :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> Maybe SAParams - -> [PhylogeneticGraph] - -> [Int] - -> ([PhylogeneticGraph], Int) -insertAllNetEdgesRand inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList randIntList = - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest True (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList - - --- | insertAllNetEdges'' is a wrapper around insertAllNetEdges' to allow for seqParMap -insertAllNetEdges'' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> ([Int], Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -insertAllNetEdges'' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (randIntList, inSimAnnealParams, inPhyloGraphList) = - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList - - --- | insertAllNetEdges' adds network edges one each each round until no better or additional --- graphs are found --- call with ([], infinity) [single input graph] -insertAllNetEdges' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> [Int] - -> Maybe SAParams - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], Int) -insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList = - if null inPhyloGraphList then (take numToKeep curBestGraphList, counter) - else - let currentCost = min curBestGraphCost (snd6 $ head inPhyloGraphList) - - --check for max net edges - (_, _, _, netNodes) = LG.splitVertexList (thd6 $ head inPhyloGraphList) - - (newGraphList, _) = insertEachNetEdge inGS inData leafGraph (head randIntList) maxNetEdges numToKeep doSteepest doRandomOrder Nothing inSimAnnealParams (head inPhyloGraphList) - - - bestNewGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList - newGraphCost = if (not . null) bestNewGraphList then snd6 $ head bestNewGraphList - else infinity - - -- this to deal with potential tail of empty list - nextNewGraphList = if (not . null) newGraphList then tail newGraphList - else [] - - in - -- trace ("IANE: " ++ (show $ length netNodes)) ( - if length netNodes >= maxNetEdges then - trace ("Maximum number of network edges reached: " ++ (show $ length netNodes)) - (take numToKeep curBestGraphList, counter) - - - else if null newGraphList then - trace ("\t\tNumber of network edges: " ++ (show $ length netNodes)) - (take numToKeep curBestGraphList, counter) - - -- regular insert keeping best - else if inSimAnnealParams == Nothing then - -- "steepest style descent" abandons existing list if better cost found - trace ("\t\tNumber of network edges: " ++ (show $ length netNodes)) ( - if newGraphCost < currentCost then - -- check if graph OK--done in insert function - let --- isCyclicList = filter (== True) $ fmap LG.cyclic $ fmap thd6 newGraphList - --- hasDupEdges = filter (== True) $ fmap LG.hasDuplicateEdge $ fmap thd6 newGraphList - - graphsToInsert = if doSteepest then newGraphList - else take numToKeep $ newGraphList ++ (tail inPhyloGraphList) - in - trace ("\t-> " ++ (show newGraphCost)) - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newGraphList, newGraphCost) (tail randIntList) inSimAnnealParams graphsToInsert - - - -- worse graphs found--go on - else if newGraphCost > currentCost then - -- trace ("IANE: Worse") - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (curBestGraphList, currentCost) (tail randIntList) inSimAnnealParams (tail inPhyloGraphList) - - -- equal cost - -- not sure if should add new graphs to queue to do edge deletion again - else - -- new graph list contains the input graph if equal and filterd unique already in insertAllNetEdges - let newCurSameBestList = GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ take numToKeep (curBestGraphList ++ newGraphList) - in - -- trace ("IANE: same " ++ (show $ length (tail inPhyloGraphList))) - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newCurSameBestList, currentCost) (tail randIntList) inSimAnnealParams (tail inPhyloGraphList) - ) - - -- simulated annealing - else - -- if steepest -- simAnneal/Drift stuff done during net add function so jusyt take results and move on - -- create new - trace ("\t\tNumber of network edges: " ++ (show $ length netNodes)) ( - if doSteepest then - let annealBestCost = min curBestGraphCost newGraphCost - newSAParams = Just $ (fromJust (U.incrementSimAnnealParams inSimAnnealParams)) {currentStep = 0, driftChanges = 0} - in - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head newGraphList) : curBestGraphList, annealBestCost) (tail randIntList) newSAParams (nextNewGraphList ++ (tail inPhyloGraphList)) - - -- simmAnneal on "all list" - else - -- sim anneal choice - let numDone = if (method $ fromJust inSimAnnealParams) == SimAnneal then currentStep $ fromJust inSimAnnealParams - else driftChanges $ fromJust inSimAnnealParams - numMax = if (method $ fromJust inSimAnnealParams) == SimAnneal then numberSteps $ fromJust inSimAnnealParams - else driftMaxChanges $ fromJust inSimAnnealParams - annealBestCost = min curBestGraphCost (snd6 $ head newGraphList) - (acceptFirstGraph, newSAParams) = U.simAnnealAccept inSimAnnealParams annealBestCost (snd6 $ head newGraphList) - in - -- trace ("ACG" ++ (show acceptFirstGraph) ++ " " ++ (show $ snd6 $ head uniqueGraphList)) ( - if (numDone < numMax) then - -- this fixes tail fail - if acceptFirstGraph then - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head newGraphList) : curBestGraphList, annealBestCost) (tail randIntList) newSAParams (nextNewGraphList ++ (tail inPhyloGraphList)) - else - insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (curBestGraphList, annealBestCost) (tail randIntList) newSAParams (nextNewGraphList ++ (tail inPhyloGraphList)) - - -- returns non-optimized list for GA or whatever - else if returnMutated then (take numToKeep curBestGraphList, counter) - - -- run net delete regular to get back to optimized edges - else - let (bestList', counter') = deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) False doSteepest doRandomOrder ([], annealBestCost) (tail randIntList) Nothing (take numToKeep curBestGraphList) - bestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] bestList' - in - --trace ("BM: " ++ (show $ snd6 $ head bestMoveList)) - (take numToKeep bestList, counter') - ) - - --- | insertEachNetEdge takes a phylogenetic graph and inserts all permissible network edges one at time --- and returns unique list of new Phylogenetic Graphs and cost --- even if worse--could be used for simulated annealing later --- if equal returns unique graph list -insertEachNetEdge :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Maybe VertexCost - -> Maybe SAParams - -> PhylogeneticGraph - -> ([PhylogeneticGraph], VertexCost) -insertEachNetEdge inGS inData leafGraph rSeed maxNetEdges numToKeep doSteepest doRandomOrder preDeleteCost inSimAnnealParams inPhyloGraph = - if LG.isEmpty $ fst6 inPhyloGraph then error "Empty input insertEachNetEdge graph in deleteAllNetEdges" - else - let currentCost = if preDeleteCost == Nothing then snd6 inPhyloGraph - else fromJust preDeleteCost - - candidateNetworkEdgeList' = getPermissibleEdgePairs (thd6 inPhyloGraph) - - -- radomize pair list - rSeedList = randomIntList rSeed - candidateNetworkEdgeList = if doRandomOrder then permuteList (head rSeedList) candidateNetworkEdgeList' - else candidateNetworkEdgeList' - - -- newGraphList = concat (fmap (insertNetEdgeBothDirections inGS inData inPhyloGraph) candidateNetworkEdgeList `using` PU.myParListChunkRDS) - newGraphList = if not doSteepest then filter (/= emptyPhylogeneticGraph) (PU.seqParMap rdeepseq (insertNetEdge inGS inData leafGraph inPhyloGraph preDeleteCost) candidateNetworkEdgeList) -- `using` PU.myParListChunkRDS) - else insertNetEdgeRecursive inGS inData leafGraph (tail rSeedList) maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost Nothing candidateNetworkEdgeList - - --minCostGraphList = GO.selectPhylogeneticGraph [("best", (show numToKeep))] 0 ["best"] newGraphList - minCost = if null candidateNetworkEdgeList || null newGraphList then infinity - else minimum $ fmap snd6 newGraphList - in - trace ("\tExamining at most " ++ (show $ length candidateNetworkEdgeList) ++ " candidate edge pairs") ( - - -- no network edges to insert - -- trace ("IENE: " ++ (show minCost)) ( - if null candidateNetworkEdgeList then - -- trace ("IENE num cand edges:" ++ (show $ length candidateNetworkEdgeList)) - ([inPhyloGraph], currentCost) - -- filter later - -- single if steepest so no neeed to unique - else if doSteepest then - -- trace ("IENE: Steepest " ++ (show minCost)) - (newGraphList, minCost) - - else - -- trace ("IENE: All " ++ (show minCost)) - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] $ newGraphList, minCost) - ) -- ) - --- | insertNetEdgeRecursive recursively inserts edges and returns new graph only if better --- if parallel ebvaluated numThreads each time -insertNetEdgeRecursive :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> [Int] - -> Int - -> Bool - -> Bool - -> PhylogeneticGraph - -> Maybe VertexCost - -> Maybe SAParams - -> [(LG.LEdge EdgeInfo, LG.LEdge EdgeInfo)] - -> [PhylogeneticGraph] -insertNetEdgeRecursive inGS inData leafGraph rSeedList maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost inSimAnnealParams inEdgePairList = - -- trace ("Edges pairs to go : " ++ (show $ length edgePairList)) ( - if null inEdgePairList then [inPhyloGraph] - else - let numGraphsToExamine = PU.getNumThreads - -- firstEdgePair = head edgePairList - edgePairList = take numGraphsToExamine inEdgePairList - - --check for max net edges - (_, _, _, netNodes) = LG.splitVertexList (thd6 inPhyloGraph) - - -- needf to check disapaly/charcter trees not conical graph - -- newGraph = insertNetEdge inGS inData leafGraph inPhyloGraph preDeleteCost firstEdgePair - newGraphList'' = PU.seqParMap rdeepseq (insertNetEdge inGS inData leafGraph inPhyloGraph preDeleteCost) edgePairList - newGraphList' = filter (/= emptyPhylogeneticGraph) newGraphList'' - newGraphList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList' - newGraphCost = snd6 $ head newGraphList - - - - in - traceNoLF ("*") ( -- trace ("INER: " ++ (show $ snd6 newGraph) ++ " " ++ (show preDeleteCost)) ( - if length netNodes >= maxNetEdges then - trace ("Maximum number of network edges reached: " ++ (show $ length netNodes)) - [inPhyloGraph] - - -- malformed graph - else if null newGraphList' then - -- trace ("INER: Empty more to go : " ++ (show $ length $ tail edgePairList)) - insertNetEdgeRecursive inGS inData leafGraph rSeedList maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost inSimAnnealParams (drop numGraphsToExamine inEdgePairList) - - else if (inSimAnnealParams == Nothing) then - -- better cost - if newGraphCost < snd6 inPhyloGraph then - -- cyclic check in insert edge function - -- trace ("INER: Better -> " ++ (show $ snd6 newGraph)) - newGraphList - - {- - let isCyclic = V.filter (== True) $ fmap LG.cyclic $ fmap head $ fth6 newGraph - dupList = V.filter (== True) $ fmap LG.hasDuplicateEdge $ fmap head $ fth6 newGraph - in - if (null isCyclic && null dupList) then - -- trace ("INER: Better -> " ++ (show $ snd6 newGraph)) - -- trace ("INER:" ++ (LG.prettyIndices $ thd6 newGraph)) - [newGraph] - else - trace ("Cycle " ++ (show isCyclic) ++ " Duplicate Edges " ++ (show dupList)) - insertNetEdgeRecursive inGS inData rSeedList maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost inSimAnnealParams (drop numGraphsToExamine edgePairList) - -} - - -- not better - else -- trace ("INER: Really Not Better") - insertNetEdgeRecursive inGS inData leafGraph rSeedList maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost inSimAnnealParams (drop numGraphsToExamine inEdgePairList) - - -- sim annealing/drift - else - let numDone = if (method $ fromJust inSimAnnealParams) == SimAnneal then currentStep $ fromJust inSimAnnealParams - else driftChanges $ fromJust inSimAnnealParams - numMax = if (method $ fromJust inSimAnnealParams) == SimAnneal then numberSteps $ fromJust inSimAnnealParams - else driftMaxChanges $ fromJust inSimAnnealParams - (acceptGraph, nextSAParams) = U.simAnnealAccept inSimAnnealParams (snd6 inPhyloGraph) newGraphCost - in - if (numDone < numMax) then - if acceptGraph then newGraphList - else insertNetEdgeRecursive inGS inData leafGraph rSeedList maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost nextSAParams (drop numGraphsToExamine inEdgePairList) - - -- hit end of SA/Drift - else [inPhyloGraph] - ) - -- ) - --- | insertNetEdge inserts an edge between two other edges, creating 2 new nodes and rediagnoses graph --- contacts deletes 2 orginal edges and adds 2 nodes and 5 new edges --- does not check any edge reasonable-ness properties --- new edge directed from first to second edge --- naive for now --- predeletecost of edge move -insertNetEdge :: GlobalSettings -> ProcessedData -> DecoratedGraph -> PhylogeneticGraph -> Maybe VertexCost -> (LG.LEdge b, LG.LEdge b) -> PhylogeneticGraph -insertNetEdge inGS inData leafGraph inPhyloGraph preDeleteCost edgePair@((u,v, _), (u',v', _)) = - -- trace ("InsertEdge between: " ++ (show ((u,v), (u',v'))) )( -- ++ " into:\n " ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd6 inPhyloGraph)) ( - if LG.isEmpty $ thd6 inPhyloGraph then error "Empty input phylogenetic graph in insNetEdge" - - else - let inSimple = fst6 inPhyloGraph - - -- get children of u' to make sure no net children--moved to permissiable edges - --u'ChildrenNetNodes = filter (== True) $ fmap (LG.isNetworkNode inSimple) $ LG.descendants inSimple u' - - numNodes = length $ LG.nodes inSimple - newNodeOne = (numNodes, TL.pack ("HTU" ++ (show numNodes))) - newNodeTwo = (numNodes + 1, TL.pack ("HTU" ++ (show $ numNodes + 1))) - newEdgeList = [(u, fst newNodeOne, 0.0),(fst newNodeOne, v, 0.0),(u', fst newNodeTwo, 0.0),(fst newNodeTwo, v', 0.0),(fst newNodeOne, fst newNodeTwo, 0.0)] - edgesToDelete = [(u,v), (u',v')] - newSimple = LG.insEdges newEdgeList $ LG.delEdges edgesToDelete $ LG.insNodes [newNodeOne, newNodeTwo] inSimple - - - -- do not prune other edges if now unused - pruneEdges = False - - -- don't warn that edges are being pruned - warnPruneEdges = False - - -- graph optimization from root - startVertex = Nothing - - -- conversion as if input--see if affects length - -- removed check after checks moved to permissible edges - -- can add back if there are malformed graphs being generated - -- newSimple' = GO.convertGeneralGraphToPhylogeneticGraph newSimple - -- permissibale not catching timeconsistency issues with edges - -- newSimple' = GO.makeGraphTimeConsistent "fail" newSimple - -- newSimple' = GO.convertGeneralGraphToPhylogeneticGraph "fail" newSimple - - - -- full two-pass optimization - newPhyloGraph = T.multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex newSimple - - -- calculates heursitic graph delta - -- (heuristicDelta, _, _, _, _) = heuristicAddDelta inGS inPhyloGraph edgePair (fst newNodeOne) (fst newNodeTwo) - -- heuristicDelta = heuristicAddDelta' inGS inPhyloGraph edgePair - - - -- edgeAddDelta = deltaPenaltyAdjustment inGS inPhyloGraph "add" - - in - - -- remove these checks when working - - if ((not . LG.isGraphTimeConsistent) newSimple) then - trace ("\tWarning: Time consistency error") - emptyPhylogeneticGraph - - {- - else if LG.hasChainedNetworkNodes newSimple then - trace ("\tWarning: Chained network nodes in insertNetEdge skipping deletion") - emptyPhylogeneticGraph - -} - else if (graphType inGS) == HardWired then newPhyloGraph - - else - -- need heuristics in here - -- if (heuristicDelta + edgeAddDelta) < 0 then newPhyloGraph - -- let oldPhyloGraph = T.multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inSimple - -- in - -- trace ("INE: OK " ++ (show (numNodes, newNodeOne, newNodeTwo, newEdgeList, edgesToDelete, snd6 oldPhyloGraph)) ++ "\nOrig\n" ++ (LG.prettyDot inSimple) ++ "\nNew\n" ++ (LG.prettyDot newSimple)) ( - {- - if True then - trace ("INE: " ++ (show (heuristicDelta, edgeAddDelta, snd6 inPhyloGraph)) ++ " -> " ++ (show (heuristicDelta + edgeAddDelta + (snd6 inPhyloGraph), snd6 inPhyloGraph))) - newPhyloGraph - -} - -- if (heuristicDelta + edgeAddDelta) < 0 then newPhyloGraph - if LG.isEmpty newSimple then emptyPhylogeneticGraph - else newPhyloGraph - -- ) - - --- | (curBestGraphList, annealBestCost) is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds -deleteAllNetEdges :: GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> (Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -deleteAllNetEdges inGS inData rSeed maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = - let leafGraph = LG.extractLeafGraph $ (thd6 . head) inPhyloGraphList - in - if inSimAnnealParams == Nothing then - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (randomIntList rSeed) inSimAnnealParams inPhyloGraphList - else - let -- create list of params with unique list of random values for rounds of annealing - annealingRounds = rounds $ fromJust inSimAnnealParams - annealParamGraphList = U.generateUniqueRandList annealingRounds inSimAnnealParams - replicateRandIntList = fmap randomIntList (take annealingRounds (randomIntList rSeed)) - - -- (annealRoundsList, counterList) = unzip (zipWith3 (deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost)) replicateRandIntList annealParamGraphList (replicate annealingRounds inPhyloGraphList) `using` PU.myParListChunkRDS) - (annealRoundsList, counterList) = unzip (PU.seqParMap rdeepseq (deleteAllNetEdges'' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost)) (zip3 replicateRandIntList annealParamGraphList (replicate annealingRounds inPhyloGraphList))) - in - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (concat annealRoundsList) , sum counterList) - - --- | deleteAllNetEdges'' is a wrapper around deleteAllNetEdges' to allow use of seqParMap -deleteAllNetEdges'' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> ([Int], Maybe SAParams, [PhylogeneticGraph]) - -> ([PhylogeneticGraph], Int) -deleteAllNetEdges'' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (randIntList, inSimAnnealParams, inPhyloGraphList) = - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList - - --- | deleteAllNetEdges deletes network edges one each each round until no better or additional --- graphs are found --- call with ([], infinity) [single input graph] -deleteAllNetEdges' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> ([PhylogeneticGraph], VertexCost) - -> [Int] - -> Maybe SAParams - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], Int) -deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) randIntList inSimAnnealParams inPhyloGraphList = - -- trace ("In deleteAllNetEdges " ++ (show $ length inPhyloGraphList)) ( - if null inPhyloGraphList then (take numToKeep curBestGraphList, counter) - else - let currentCost = min curBestGraphCost (snd6 $ head inPhyloGraphList) - - (newGraphList', _) = deleteEachNetEdge inGS inData leafGraph (head randIntList) numToKeep doSteepest doRandomOrder False inSimAnnealParams (head inPhyloGraphList) - - newGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList' - newGraphCost = if (not . null) newGraphList then snd6 $ head newGraphList - else infinity - - in - -- trace ("DANE: " ++ (show (newGraphCost, length newGraphList))) ( - -- if graph is a tree no edges to delete - if LG.isTree (fst6 $ head inPhyloGraphList) then - let (a,b,c,d) = LG.splitVertexList (fst6 $ head inPhyloGraphList) - in - trace ("\tGraph in delete network edges is tree--skipping") -- :" ++ (show $ (snd6 $ head inPhyloGraphList, length a, length b, length c, length d))) - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head inPhyloGraphList) : curBestGraphList, currentCost) (tail randIntList) inSimAnnealParams (tail inPhyloGraphList) - - else if null newGraphList then (take numToKeep curBestGraphList, counter + 1) - - -- regular delte wihtout simulated annealing - -- worse graphs found--go on - else if inSimAnnealParams == Nothing then - if newGraphCost > currentCost then deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head inPhyloGraphList) : curBestGraphList, currentCost) (tail randIntList) inSimAnnealParams (tail inPhyloGraphList) - - -- "steepest style descent" abandons existing list if better cost found - else if newGraphCost < currentCost then - trace ("\t-> " ++ (show newGraphCost)) ( - if doSteepest then - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newGraphList, newGraphCost) (tail randIntList) inSimAnnealParams newGraphList - else - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newGraphList, newGraphCost) (tail randIntList) inSimAnnealParams (newGraphList ++ (tail inPhyloGraphList)) - ) - - -- equal cost - -- not sure if should add new graphs to queue to do edge deletion again - else - -- new grapjh list contains the input graph if equal and filterd unique already in deleteEachNetEdge - let newCurSameBestList = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (curBestGraphList ++ newGraphList) - in - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (newCurSameBestList, currentCost) (tail randIntList) inSimAnnealParams (tail inPhyloGraphList) - - -- simulated annealing - else - -- if steepest -- simAnneal/Drift stuff done during net delete function so jusyt take results and move on - -- create new - if doSteepest then - let annealBestCost = min curBestGraphCost newGraphCost - newSAParams = Just $ (fromJust (U.incrementSimAnnealParams newSAParams)) {currentStep = 0, driftChanges = 0} - in - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head newGraphList') : curBestGraphList, annealBestCost) (tail randIntList) newSAParams (newGraphList' ++ (tail inPhyloGraphList)) - - -- simmAnneal on "all list" - else - -- sim anneal choice - let numDone = if (method $ fromJust inSimAnnealParams) == SimAnneal then currentStep $ fromJust inSimAnnealParams - else driftChanges $ fromJust inSimAnnealParams - numMax = if (method $ fromJust inSimAnnealParams) == SimAnneal then numberSteps $ fromJust inSimAnnealParams - else driftMaxChanges $ fromJust inSimAnnealParams - annealBestCost = min curBestGraphCost (snd6 $ head newGraphList') - (acceptFirstGraph, nextSAParams) = U.simAnnealAccept inSimAnnealParams annealBestCost (snd6 $ head newGraphList') - in - -- trace ("ACG" ++ (show acceptFirstGraph) ++ " " ++ (show $ snd6 $ head uniqueGraphList)) ( - if (numDone < numMax) then - let nextNewGraphList = if (not . null) newGraphList' then tail newGraphList' - else [] - in - - if acceptFirstGraph then - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder ((head newGraphList') : curBestGraphList, annealBestCost) (tail randIntList) nextSAParams (nextNewGraphList ++ (tail inPhyloGraphList)) - else - deleteAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) returnMutated doSteepest doRandomOrder (curBestGraphList, annealBestCost) (tail randIntList) nextSAParams (nextNewGraphList ++ (tail inPhyloGraphList)) - - -- if want non-optimized list for GA or whatever - else if returnMutated then (take numToKeep curBestGraphList, counter) - - -- optimize with net insert to add back edges to optimiality - else - let (bestList', counter') = insertAllNetEdges' inGS inData leafGraph maxNetEdges numToKeep (counter + 1) False doSteepest doRandomOrder ([], annealBestCost) (tail randIntList) Nothing (take numToKeep curBestGraphList) - bestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] bestList' - in - --trace ("BM: " ++ (show $ snd6 $ head bestMoveList)) - (take numToKeep bestList, counter') - -- ) -- ) - --- | deleteOneNetAddAll version deletes net edges in turn and readds-based on original cost --- but this cost in graph (really not correct) but allows logic of insert edge to function better --- unlike deleteOneNetAddAll' only deals with single edge deletion at a time -deleteOneNetAddAll :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Bool - -> Bool - -> PhylogeneticGraph - -> [LG.Edge] - -> Int - -> Maybe SAParams - -> [PhylogeneticGraph] -deleteOneNetAddAll inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph edgeToDeleteList rSeed inSimAnnealParams = - if null edgeToDeleteList then - -- trace ("\tGraph has no edges to move---skipping") - [inPhyloGraph] - else if LG.isEmpty $ thd6 inPhyloGraph then error "Empty graph in deleteOneNetAddAll" - else - -- trace ("DONAA-New: " ++ (show $ snd6 inPhyloGraph) ++ " Steepest:" ++ (show doSteepest)) ( - trace ("Moving " ++ (show $ length edgeToDeleteList) ++ " network edges, current best cost: " ++ (show $ snd6 inPhyloGraph)) ( - -- start with initial graph cost - let inGraphCost = snd6 inPhyloGraph - - -- get deleted simple graphs and bool for changed - delGraphBoolPair = deleteNetworkEdge (fst6 inPhyloGraph) (head edgeToDeleteList) - - in - - -- no change in network structure - if snd delGraphBoolPair == False then - deleteOneNetAddAll inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph (tail edgeToDeleteList) rSeed inSimAnnealParams - - else - let simpleGraphToInsert = fst delGraphBoolPair - - (_, _, _, curNetNodes) = LG.splitVertexList simpleGraphToInsert - curNumNetNodes = length curNetNodes - - -- optimize deleted graph and update cost with input cost - graphToInsert = T.multiTraverseFullyLabelSoftWired inGS inData False False leafGraph Nothing simpleGraphToInsert -- `using` PU.myParListChunkRDS - - -- keep same cost and just keep better--check if better than original later - graphToInsert' = T.updatePhylogeneticGraphCost graphToInsert inGraphCost - - - insertedGraphPairList = if not doSteepest then insertEachNetEdge inGS inData leafGraph rSeed (curNumNetNodes + 1) numToKeep doSteepest doRandomOrder Nothing inSimAnnealParams graphToInsert' - else insertEachNetEdgeRecursive inGS inData leafGraph (curNumNetNodes + 1) numToKeep doSteepest doRandomOrder (head $ randomIntList rSeed) inSimAnnealParams [graphToInsert'] - - - newMinimumCost = snd insertedGraphPairList - - newBestGraphs = filter ((== newMinimumCost) . snd6) $ fst insertedGraphPairList - - in - -- trace ("DONAA-New: " ++ (show (inGraphCost, fmap snd6 graphsToInsert, fmap snd6 graphsToInsert', newMinimumCost))) ( - if newMinimumCost < inGraphCost then - trace ("DONA-> ") - newBestGraphs - - else deleteOneNetAddAll inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph (tail edgeToDeleteList) rSeed inSimAnnealParams - - ) -- ) - --- | deleteOneNetAddAll' new version deletes net edges in turn and readds-based on original cost --- but his cost in graph (really not correct) but allows logic of insert edge to function better --- seems like after delete--or insert--the graphs are improper condition and returnin infinite cost due to that --- seems likely true for deleteOneNetAddAll' as well --- this has no SA in it -deleteOneNetAddAll' :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Bool - -> Bool - -> PhylogeneticGraph - -> [LG.Edge] - -> Int - -> Maybe SAParams - -> [PhylogeneticGraph] -deleteOneNetAddAll' inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph edgeToDeleteList rSeed inSimAnnealParams = - if null edgeToDeleteList then - -- trace ("\tGraph has no edges to move---skipping") - [inPhyloGraph] - else if LG.isEmpty $ thd6 inPhyloGraph then error "Empty graph in deleteOneNetAddAll" - else - -- trace ("DONAA-New: " ++ (show $ snd6 inPhyloGraph) ++ " Steepest:" ++ (show doSteepest)) ( - trace ("Moving " ++ (show $ length edgeToDeleteList) ++ " network edges, current best cost: " ++ (show $ snd6 inPhyloGraph)) ( - -- start with initial graph cost - let inGraphCost = snd6 inPhyloGraph - - -- get deleted simple graphs and bool for changed - delGraphBoolPairList = PU.seqParMap rdeepseq (deleteNetworkEdge (fst6 inPhyloGraph)) edgeToDeleteList - (simpleGraphsToInsert, _) = unzip $ filter ((== True ) . snd) delGraphBoolPairList - - -- check for cycles -- already done - -- simpleGraphsToInsert' = filter ((== False) . LG.cyclic) simpleGraphsToInsert - - -- optimize deleted graph and update cost with input cost - graphsToInsert = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelSoftWired inGS inData False False leafGraph Nothing) simpleGraphsToInsert -- `using` PU.myParListChunkRDS - - -- keep same cost and just keep better--check if better than original later - graphsToInsert' = PU.seqParMap rdeepseq (flip T.updatePhylogeneticGraphCost inGraphCost) graphsToInsert - - - insertedGraphPairList = if not doSteepest then PU.seqParMap rdeepseq (insertEachNetEdge inGS inData leafGraph rSeed maxNetEdges numToKeep doSteepest doRandomOrder Nothing inSimAnnealParams) graphsToInsert' -- `using` PU.myParListChunkRDS - else - let -- potentially randomize order of list - graphsToInsert'' = if not doRandomOrder then graphsToInsert' - else permuteList rSeed graphsToInsert' - in - [insertEachNetEdgeRecursive inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder (head $ randomIntList rSeed) inSimAnnealParams graphsToInsert''] - - - newMinimumCost = minimum $ fmap snd insertedGraphPairList - - newBestGraphs = filter ((== newMinimumCost) . snd6) $ concat $ fmap fst insertedGraphPairList - - in - -- trace ("DONAA-New: " ++ (show (inGraphCost, fmap snd6 graphsToInsert, fmap snd6 graphsToInsert', newMinimumCost))) ( - if null simpleGraphsToInsert then [emptyPhylogeneticGraph] - - else if newMinimumCost < inGraphCost then - trace ("-> ") - newBestGraphs - - else [inPhyloGraph] - - ) -- ) - - --- | insertEachNetEdgeRecursive is a wrapper arounf insertEachNet each for edge move heuristic and returns better graph when found immediately -insertEachNetEdgeRecursive :: GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> Int - -> Int - -> Bool - -> Bool - -> Int - -> Maybe SAParams - -> [PhylogeneticGraph] - -> ([PhylogeneticGraph], VertexCost) -insertEachNetEdgeRecursive inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder rSeed inSimAnnealParams inPhyloGraphList = - if null inPhyloGraphList then ([],infinity) - else - let firstGraph = head inPhyloGraphList - (newGraphList, newCost) = insertEachNetEdge inGS inData leafGraph rSeed maxNetEdges numToKeep doSteepest doRandomOrder Nothing inSimAnnealParams firstGraph - - minCost = if null newGraphList then infinity - else newCost - in - -- return immediately - if minCost < (snd6 firstGraph) then - trace ("IENER->" ++ (show minCost)) - (take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] $ newGraphList, minCost) - else - insertEachNetEdgeRecursive inGS inData leafGraph maxNetEdges numToKeep doSteepest doRandomOrder rSeed inSimAnnealParams (tail inPhyloGraphList) - - --- | getPermissibleEdgePairs takes a DecoratedGraph and returns the list of all pairs --- of edges that can be joined by a network edge and meet all necessary conditions - --- add in other conditions --- reproducable--ie not tree noide with two net node children--other stuff -getPermissibleEdgePairs :: DecoratedGraph -> [(LG.LEdge EdgeInfo, LG.LEdge EdgeInfo)] -getPermissibleEdgePairs inGraph = - if LG.isEmpty inGraph then error "Empty input graph in isEdgePairPermissible" - else - let edgeList = LG.labEdges inGraph - - -- edges to potentially conenct - edgePairs = cartProd edgeList edgeList - - -- get coeval node pairs in existing grap - coevalNodeConstraintList = LG.coevalNodePairs inGraph - coevalNodeConstraintList' = PU.seqParMap rdeepseq (LG.addBeforeAfterToPair inGraph) coevalNodeConstraintList -- `using` PU.myParListChunkRDS - - edgeTestList = PU.seqParMap rdeepseq (isEdgePairPermissible inGraph coevalNodeConstraintList') edgePairs -- `using` PU.myParListChunkRDS - pairList = fmap fst $ filter ((== True) . snd) $ zip edgePairs edgeTestList - in - -- trace ("Edge Pair list :" ++ (show $ fmap f pairList) ++ "\n" - -- ++ "GPEP\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) - pairList - -- where f (a, b) = (LG.toEdge a, LG.toEdge b) - -getPermissibleEdgePairs' :: DecoratedGraph -> [(LG.LEdge EdgeInfo, LG.LEdge EdgeInfo)] -getPermissibleEdgePairs' inGraph = - if LG.isEmpty inGraph then error "Empty input graph in isEdgePairPermissible" - else - let edgeList = LG.labEdges inGraph - edgePairs = cartProd edgeList edgeList - contraintList = LG.getGraphCoevalConstraints inGraph - edgeTestList = PU.seqParMap rdeepseq (isEdgePairPermissible' inGraph contraintList) edgePairs -- `using` PU.myParListChunkRDS - pairList = fmap fst $ filter ((== True) . snd) $ zip edgePairs edgeTestList - in - -- trace ("Edge Pair list :" ++ (show $ fmap f pairList) ++ "\n" - -- ++ "GPEP\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) - pairList - -- where f (a, b) = (LG.toEdge a, LG.toEdge b) - --- | isEdgePairPermissible takes a graph and two edges, coeval contraints, and tests whether a --- pair of edges can be linked by a new edge and satify three consitions: --- 1) neither edge is a network edge --- 2) one edge cannot be "before" while the other is "after" in any of the constraint pairs --- 3) neither neither edge is an ancestor or descndent edge of the other (tested via bv of nodes) --- the result should apply to a new edge in either direction --- new edge to be creted is edge1 -> ege2 -isEdgePairPermissible :: DecoratedGraph -> [(LG.LNode a, LG.LNode a, [LG.LNode a], [LG.LNode a], [LG.LNode a], [LG.LNode a])] -> (LG.LEdge EdgeInfo, LG.LEdge EdgeInfo) -> Bool -isEdgePairPermissible inGraph constraintList (edge1@(u,v,_), edge2@(u',v',_)) = - if LG.isEmpty inGraph then error "Empty input graph in isEdgePairPermissible" - else - if u == u' then False - else if v == v' then False - -- equality implied in above two - -- else if LG.toEdge edge1 == LG.toEdge edge2 then False - else if (LG.isNetworkNode inGraph u) || (LG.isNetworkNode inGraph u') then False - else if (LG.isNetworkLabEdge inGraph edge1) || (LG.isNetworkLabEdge inGraph edge2) then False - else if not (LG.meetsAllCoevalConstraintsNodes (fmap removeNodeLabels constraintList) edge1 edge2) then False - else if (isAncDescEdge inGraph edge1 edge2) then False - -- get children of u' to make sure no net children - else if (not . null) $ filter (== True) $ fmap (LG.isNetworkNode inGraph) $ LG.descendants inGraph u' then False - else True - where removeNodeLabels (a,b,c,d,e,f) = (LG.toNode a, LG.toNode b, fmap LG.toNode c, fmap LG.toNode d, fmap LG.toNode e, fmap LG.toNode f) - - --- | isEdgePairPermissible takes a graph and two edges, coeval contraints, and tests whether a --- pair of edges can be linked by a new edge and satify three consitions: --- 1) neither edge is a network edge --- 2) one edge cannot be "before" while the other is "after" in any of the constraint pairs --- 3) neither neither edge is an ancestor or descndent edge of the other (tested via bv of nodes) --- the result should apply to a new edge in either direction --- new edge to be creted is edge1 -> ege2 -isEdgePairPermissible' :: DecoratedGraph -> [([LG.LEdge EdgeInfo],[LG.LEdge EdgeInfo])] -> (LG.LEdge EdgeInfo, LG.LEdge EdgeInfo) -> Bool -isEdgePairPermissible' inGraph constraintList (edge1@(u,v,_), edge2@(u',v',_)) = - if LG.isEmpty inGraph then error "Empty input graph in isEdgePairPermissible" - else - if u == u' then False - else if v == v' then False - -- equality implied in above two - -- else if LG.toEdge edge1 == LG.toEdge edge2 then False - else if (LG.isNetworkNode inGraph u) || (LG.isNetworkNode inGraph u') then False - else if (LG.isNetworkLabEdge inGraph edge1) || (LG.isNetworkLabEdge inGraph edge2) then False - else if not (LG.meetsAllCoevalConstraintsEdges constraintList edge1 edge2) then False - else if (isAncDescEdge inGraph edge1 edge2) then False - -- get children of u' to make sure no net children - else if (not . null) $ filter (== True) $ fmap (LG.isNetworkNode inGraph) $ LG.descendants inGraph u' then False - else True - - - --- | isAncDescEdge takes a graph and two edges and examines whethe either edge is the ancestor or descendent of the other --- this is done via examination of teh bitvector fields of the node -isAncDescEdge :: DecoratedGraph -> LG.LEdge EdgeInfo -> LG.LEdge EdgeInfo -> Bool -isAncDescEdge inGraph (a,_,_) (b, _, _) = - if LG.isEmpty inGraph then error "Empty input graph in isAncDescEdge" - else - let aBV = bvLabel $ fromJust $ LG.lab inGraph a - bBV = bvLabel $ fromJust $ LG.lab inGraph b - in - if aBV .&. bBV == aBV then True - else if aBV .&. bBV == bBV then True - else False - -{- --- | insertNetEdgeBothDirections calls insertNetEdge for both u -> v and v -> u new edge orientations -insertNetEdgeBothDirections :: GlobalSettings -> ProcessedData -> PhylogeneticGraph -> Maybe VertexCost -> (LG.LEdge b, LG.LEdge b) -> [PhylogeneticGraph] -insertNetEdgeBothDirections inGS inData inPhyloGraph preDeleteCost (u,v) = fmap (insertNetEdge inGS inData inPhyloGraph preDeleteCost) [(u,v), (v,u)] --} - -{- These heuristics do not seem tom work well at all-} - --- | heuristic add delta' based on new display tree and delta from existing costs by block--assumming < 0 --- original edges subtree1 ((u,l),(u,v)) and subtree2 ((u',v'),(u',l')) create a directed edge from --- subtree 1 to subtree 2 via --- 1) Add node x and y, delete edges (u,v) and (u'v') and create edges (u,x), (x,v), (u',y), and (y,v') --- 2) real cost is the sum of block costs that are lower for new graph versus older --- 3) heuristic is when new subtree is lower than existing block by block --- so calculate d(u,v) + d(u',v') [existing display tree cost estimate] compared to --- d((union u,v), v') - d(u'.v') [New display tree cost estimate] over blocks --- so blockDelta = if d((union u,v), v') - d(u'.v') < d(u,v) + d(u',v') then d((union u,v), v') - d(u'.v') --- else 0 [existing better] --- graphDelta = egdeAddDelta (separately calcualated) + sum [blockDelta] --- Compare to real delta to check behavior --- original subtrees u -> (a,v) and u' -> (v',b) -heuristicAddDelta' :: GlobalSettings -> PhylogeneticGraph -> (LG.LEdge b, LG.LEdge b) -> VertexCost -heuristicAddDelta' inGS inPhyloGraph ((u,v, _), (u',v', _)) = - if LG.isEmpty (fst6 inPhyloGraph) then error "Empty graph in heuristicAddDelta" - else - let a = head $ filter (/= v) $ LG.descendants (fst6 inPhyloGraph) u - b = head $ filter (/= v') $ LG.descendants (fst6 inPhyloGraph) u' - blockDeltaV = V.zipWith (getBlockDelta (u,v,u',v',a,b)) (fft6 inPhyloGraph) (six6 inPhyloGraph) - in - V.sum blockDeltaV - --- | getBlockDelta determines the network add delta for each block (vector of characters) --- if existing is lower then zero, else (existing - new) -getBlockDelta :: (LG.Node, LG.Node, LG.Node, LG.Node, LG.Node, LG.Node) -> V.Vector DecoratedGraph -> V.Vector CharInfo -> VertexCost -getBlockDelta (u,v,u',v',a,b) inCharV charInfoV = - if V.null inCharV then error "Empty charcter tree vector in getBlockDelta" - else - let (charNewV, charExistingV) = V.unzip $ V.zipWith (getCharacterDelta (u,v,u',v',a,b)) inCharV charInfoV - newCost = V.sum charNewV - existingCost = V.sum charExistingV - in - -- trace ("GBD: " ++ (show (newCost, existingCost))) ( - if (newCost < existingCost) then newCost - existingCost - else 0.0 - -- ) - --- | getCharacterDelta determines the network add delta for each block (vector of characters) --- if existing is lower then zero, else (existing - new) --- calculate d(u,v) + d(u',v') [existing display tree cost estimate] compared to --- d((union u,v), v') - d(u'.v') --- need to use final assignemnts--so set prelim to final first -getCharacterDelta :: (LG.Node, LG.Node, LG.Node, LG.Node, LG.Node, LG.Node) -> DecoratedGraph -> CharInfo -> (VertexCost, VertexCost) -getCharacterDelta (u,v,u',v',a,b) inCharTree charInfo = - let doIA = False - filterGaps = True - uData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree u - vData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree v - vFinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree v - u'Data = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree u' - v'Data = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree v' - v'FinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree v' - aData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree a - aFinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree a - bData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree b - - -- unionUV = M.union2Single doIA filterGaps uData vData charInfo - -- (_,dUV) = M.median2Single doIA uData vData charInfo - -- dUV = vertexCost $ fromJust $ LG.lab inCharTree u - -- dU'V' = vertexCost $ fromJust $ LG.lab inCharTree u' - -- (_, dUnionUVV') = M.median2Single doIA unionUV v'Data charInfo - - (newX, dVV') = M.median2Single doIA vFinalData v'FinalData charInfo - (_, dAX) = M.median2Single doIA aFinalData newX charInfo - (_, dAV) = M.median2Single doIA aData vData charInfo - (_, dV'B) = M.median2Single doIA v'Data bData charInfo - in - -- trace ("GCD: " ++ (show (dVV' + dAX, dAV + dV'B))) ( - (dVV' + dAX, dAV + dV'B) - -- if dUnionUVV' - dU'V' < dU'V' then dUnionUVV' - dU'V' - -- else 0.0 - -- ) - --- | heuristicAddDelta takes the existing graph, edge pair, and new nodes to create and makes --- the new nodes and reoptimizes starting nodes of two edges. Returns cost delta based on --- previous and new node resolution caches --- returns cost delta and the reoptimized nodes for use in incremental optimization --- original edges (to be deleted) (u,v) and (u',v'), n1 inserted in (u,v) and n2 inserted into (u',v') --- creates (n1, n2), (u,n1), (n1,v), (u',n2), (n2, v') -heuristicAddDelta :: GlobalSettings -> PhylogeneticGraph -> (LG.LEdge b, LG.LEdge b) -> LG.Node -> LG.Node -> (VertexCost, LG.LNode VertexInfo, LG.LNode VertexInfo, LG.LNode VertexInfo, LG.LNode VertexInfo) -heuristicAddDelta inGS inPhyloGraph ((u,v, _), (u',v', _)) n1 n2 = - if LG.isEmpty (fst6 inPhyloGraph) then error "Empty graph in heuristicAddDelta" - else if graphType inGS == HardWired then - let uvVertData = M.makeEdgeData False (thd6 inPhyloGraph) (six6 inPhyloGraph) (u, v, dummyEdge) - uvPrimeData = M.makeEdgeData False (thd6 inPhyloGraph) (six6 inPhyloGraph) (u', v', dummyEdge) - hardDelta = V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks uvVertData uvPrimeData (six6 inPhyloGraph) [] - in - (hardDelta, dummyNode, dummyNode, dummyNode, dummyNode) - - -- softwired - else - let uLab = fromJust $ LG.lab (thd6 inPhyloGraph) u - uPrimeLab = fromJust $ LG.lab (thd6 inPhyloGraph) u' - vLab = fromJust $ LG.lab (thd6 inPhyloGraph) v - vPrimeLab = fromJust $ LG.lab (thd6 inPhyloGraph) v' - uPrimeOtherChild = head $ filter ((/= v') . fst) $ LG.labDescendants (thd6 inPhyloGraph) (u', uPrimeLab) - uOtherChild = head $ filter ((/= v) . fst) $ LG.labDescendants (thd6 inPhyloGraph) (u, uLab) - - -- direction first edge to second so n2 is outdegree 1 to v' - n2Lab = POSW.getOutDegree1VertexSoftWired n2 vPrimeLab (thd6 inPhyloGraph) [n2] - uPrimeLabAfter = POSW.getOutDegree2VertexSoftWired inGS (six6 inPhyloGraph) u' (n2, n2Lab) uPrimeOtherChild (thd6 inPhyloGraph) - n1Lab = POSW.getOutDegree2VertexSoftWired inGS (six6 inPhyloGraph) n1 (v, vLab) (n2, n2Lab) (thd6 inPhyloGraph) - uLabAfter = POSW.getOutDegree2VertexSoftWired inGS (six6 inPhyloGraph) u uOtherChild (n1, n1Lab) (thd6 inPhyloGraph) - - -- cost of resolutions - (_, uCostBefore) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLab) - (_, uPrimeCostBefore) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLab) - (_, uCostAfter) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLabAfter) - (_, uPrimeCostAfter) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLabAfter) - - addNetDelta = uCostAfter - uCostBefore + uPrimeCostAfter - uPrimeCostBefore - - - in - trace ("HAD: " ++ (show (uCostAfter, uCostBefore, uPrimeCostAfter, uPrimeCostBefore) ++ " -> " ++ (show $ uCostAfter - uCostBefore + uPrimeCostAfter - uPrimeCostBefore))) ( - if null (filter ((/= v') . fst) $ LG.labDescendants (thd6 inPhyloGraph) (u', uPrimeLab)) || null (filter ((/= v) . fst) $ LG.labDescendants (thd6 inPhyloGraph) (u, uLab)) then (infinity, dummyNode, dummyNode, dummyNode, dummyNode) - -- this should not happen--should try to crete new edges from children of net edges - else if (length $ LG.descendants (thd6 inPhyloGraph) u) < 2 || (length $ LG.descendants (thd6 inPhyloGraph) u') < 2 then error ("Outdegree 1 nodes in heuristicAddDelta") - else - (addNetDelta, (u, uLabAfter), (u', uPrimeLabAfter), (n1, n1Lab), (n2, n2Lab)) - ) - - - --- | deltaPenaltyAdjustment takes number of leaves and Phylogenetic graph and returns a heuristic graph penalty for adding a single network edge --- if Wheeler2015Network, this is based on a all changes affecting a single block (most permissive) and Wheeler 2015 calculation of penalty --- if PMDLGraph -- KMDL not yet implemented --- if NoNetworkPenalty then 0 --- modification "add" or subtrct to calculate delta --- always delta is positive--whether neg or pos is deltermined when used -deltaPenaltyAdjustment :: GlobalSettings -> PhylogeneticGraph -> String -> VertexCost -deltaPenaltyAdjustment inGS inGraph modification = - -- trace ("DPA: entering: " ++ (show $ graphFactor inGS)) ( - let numLeaves = numDataLeaves inGS - edgeCostModel = graphFactor inGS - (_, _, _, networkNodeList) = LG.splitVertexList (fst6 inGraph) - in - if edgeCostModel == NoNetworkPenalty then - -- trace ("DPA: No penalty") - 0.0 - - else if length networkNodeList == 0 then - -- trace ("DPA: No cost") - 0.0 - - else if edgeCostModel == Wheeler2015Network then - -- trace ("DPW: In Wheeler2015Network") ( - let graphCost = snd6 inGraph -- this includes any existing penalties--would be better not to include - numBlocks = V.length $ fth6 inGraph - in - if (graphType inGS) == HardWired then 0.0 - -- trace ("DPA Value: " ++ (show $ graphCost / (fromIntegral $ numBlocks * 2 * ((2 * numLeaves) - 2)))) - else graphCost / (fromIntegral $ numBlocks * 2 * ((2 * numLeaves) - 2)) - -- ) - - else if edgeCostModel == PMDLGraph then - -- trace ("DPW: In PMDLGraph") ( - if graphType inGS == Tree then - fst $ IL.head (graphComplexityList inGS) - - else if graphType inGS == SoftWired then - let currentComplexity = fst $ (graphComplexityList inGS) IL.!!! (length networkNodeList) - nextComplexity = if modification == "add" then fst $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) + 1) - else if modification == "delete" then fst $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) - 1) - else error ("SoftWired deltaPenaltyAdjustment modification not recognized: " ++ modification) - in - abs (currentComplexity - nextComplexity) - - else if graphType inGS == HardWired then - let currentComplexity = snd $ (graphComplexityList inGS) IL.!!! (length networkNodeList) - nextComplexity = if modification == "add" then snd $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) + 1) - else if modification == "delete" then snd $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) - 1) - else error ("HardWired deltaPenaltyAdjustment modification not recognized: " ++ modification) - in - abs (currentComplexity - nextComplexity) - - - else error ("Graph type not yet implemented: " ++ (show $ graphType inGS)) - -- ) - - else error ("Network edge cost model not yet implemented: " ++ (show edgeCostModel)) - -- ) - - - --- | deleteEachNetEdge takes a phylogenetic graph and deletes all network edges one at time --- and returns best list of new Phylogenetic Graphs and cost --- even if worse--could be used for simulated annealing later --- if equal returns unique graph list -deleteEachNetEdge :: GlobalSettings -> ProcessedData -> DecoratedGraph -> Int -> Int -> Bool -> Bool -> Bool-> Maybe SAParams -> PhylogeneticGraph -> ([PhylogeneticGraph], VertexCost) -deleteEachNetEdge inGS inData leafGraph rSeed numToKeep doSteepest doRandomOrder force inSimAnnealParams inPhyloGraph = - -- trace ("DENE start") ( - if LG.isEmpty $ thd6 inPhyloGraph then ([], infinity) -- error "Empty input phylogenetic graph in deleteAllNetEdges" - else - let currentCost = snd6 inPhyloGraph - - -- potentially randomize order of list - networkEdgeList' = LG.netEdges $ thd6 inPhyloGraph - networkEdgeList = if not doRandomOrder then networkEdgeList' - else permuteList rSeed networkEdgeList' - - - newGraphList = if not doSteepest then PU.seqParMap rdeepseq (deleteNetEdge inGS inData leafGraph inPhyloGraph force) networkEdgeList -- `using` PU.myParListChunkRDS - else deleteNetEdgeRecursive inGS inData leafGraph inPhyloGraph force inSimAnnealParams networkEdgeList - - uniqueCostGraphList = filter ((/= infinity) . snd6) $ take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] newGraphList - minCost = if null uniqueCostGraphList then infinity - else minimum $ fmap snd6 uniqueCostGraphList - in - -- no network edges to delete - if null networkEdgeList then trace ("\tNo network edges to delete") ([inPhyloGraph], currentCost) - - else - -- single if steepest so no neeed to unique - if doSteepest then (newGraphList, minCost) - - else - -- trace ("DENE end: " ++ (show (currentCost, minCost, length newGraphList, length uniqueCostGraphList))) ( - if minCost < currentCost then - -- trace ("DENE--Delete net edge return:" ++ (show (minCost,length uniqueCostGraphList))) ( - let nextGraphPairList = PU.seqParMap rdeepseq (deleteEachNetEdge inGS inData leafGraph rSeed numToKeep doSteepest doRandomOrder force inSimAnnealParams) (filter ((== minCost) .snd6) uniqueCostGraphList) -- `using` PU.myParListChunkRDS - newMinCost = minimum $ fmap snd nextGraphPairList - newGraphListBetter = filter ((== newMinCost) . snd6) $ concatMap fst nextGraphPairList - in - (take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ newGraphListBetter, newMinCost) - - -- ) - - else - -- filter later - -- trace ("DENE returning same") - (take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ uniqueCostGraphList, currentCost) - --- | deleteNetworkEdge deletes a network edges from a simple graph --- retuns newGraph if can be modified or input graph with Boolean to tell if modified --- and contracts, reindexes/names internaledges/veritices around deletion --- can't raise to general graph level due to vertex info --- in edges (b,a) (c,a) (a,d), deleting (a,b) deletes node a, inserts edge (b,d) --- contacts node c since now in1out1 vertex --- checks for chained network edges--can be created by progressive deletion --- checks for cycles now --- shouldn't need for check for creting a node with children that are both network nodes --- sine that woudl require that condition coming in and shodl be there--ie checked earlier in addition and input -deleteNetworkEdge :: SimpleGraph -> LG.Edge -> (SimpleGraph, Bool) -deleteNetworkEdge inGraph inEdge@(p1, nodeToDelete) = - if LG.isEmpty inGraph then error ("Cannot delete edge from empty graph") - else - let childrenNodeToDelete = LG.descendants inGraph nodeToDelete - parentsNodeToDelete = LG.parents inGraph nodeToDelete - --parentNodeToKeep = head $ filter (/= p1) parentsNodeToDelete - --newEdge = (parentNodeToKeep, head childrenNodeToDelete, 0.0) - -- newGraph = LG.insEdge newEdge $ LG.delNode nodeToDelete inGraph - newGraph = LG.delEdge inEdge inGraph - -- newGraph' = GO.contractIn1Out1EdgesRename newGraph - - -- conversion as if input--see if affects length - newGraph'' = GO.convertGeneralGraphToPhylogeneticGraph "fail" newGraph - -- newGraph'' = GO.contractIn1Out1EdgesRename newGraph - - in - -- error conditions and creation of chained network edges (forbidden in phylogenetic graph--causes resolutoin cache issues) - if length childrenNodeToDelete /= 1 then error ("Cannot delete non-network edge in deleteNetworkEdge: (1)" ++ (show inEdge) ++ "\n" ++ (LG.prettyIndices inGraph)) - else if length parentsNodeToDelete /= 2 then error ("Cannot delete non-network edge in deleteNetworkEdge (2): " ++ (show inEdge) ++ "\n" ++ (LG.prettyIndices inGraph)) - - -- warning if chained on input, skip if chained net edges in output - else if (LG.isNetworkNode inGraph p1) then - -- error ("Error: Chained network nodes in deleteNetworkEdge : " ++ (show inEdge) ++ "\n" ++ (LG.prettyIndices inGraph) ++ " skipping") - trace ("\tWarning: Chained network nodes in deleteNetworkEdge skipping deletion") - (LG.empty, False) - - else if LG.hasChainedNetworkNodes newGraph'' then - trace ("\tWarning: Chained network nodes in deleteNetworkEdge skipping deletion (2)") - (LG.empty, False) - - else if LG.isEmpty newGraph'' then (LG.empty, False) - - else - {-trace ("DNE: Edge to delete " ++ (show inEdge) ++ " cnd " ++ (show childrenNodeToDelete) ++ " pnd " ++ (show parentsNodeToDelete) ++ " pntk " ++ (show parentNodeToKeep) - ++ " ne " ++ (show newEdge) ++ "\nInGraph: " ++ (LG.prettyIndices inGraph) ++ "\nNewGraph: " ++ (LG.prettyIndices newGraph) ++ "\nNewNewGraph: " - ++ (LG.prettyIndices newGraph')) -} - (newGraph'', True) - - --- | deleteNetEdgeRecursive like deleteEdge, deletes an edge (checking if network) and rediagnoses graph --- contacts in=out=1 edges and removes node, reindexing nodes and edges --- except returns on first better (as opposed to do all deletes first) --- or sim annleal/drift -deleteNetEdgeRecursive :: GlobalSettings -> ProcessedData -> DecoratedGraph -> PhylogeneticGraph -> Bool -> Maybe SAParams -> [LG.Edge] -> [PhylogeneticGraph] -deleteNetEdgeRecursive inGS inData leafGraph inPhyloGraph force inSimAnnealParams inEdgeToDeleteList = - if null inEdgeToDeleteList then [] - else - let numGraphsToExamine = PU.getNumThreads - -- edgeToDelete = head inEdgeToDeleteList - edgeToDeleteList = take numGraphsToExamine inEdgeToDeleteList - - -- calls general funtion to remove network graph edge - -- (delSimple, wasModified) = deleteNetworkEdge (fst6 inPhyloGraph) edgeToDelete - simpleGraphList = fmap fst $ filter ((== True) . snd) $ PU.seqParMap rdeepseq (deleteNetworkEdge (fst6 inPhyloGraph)) edgeToDeleteList - - -- delSimple = GO.contractIn1Out1EdgesRename $ LG.delEdge edgeToDelete $ fst6 inPhyloGraph - - -- prune other edges if now unused - pruneEdges = False - - -- don't warn that edges are being pruned - warnPruneEdges = False - - -- graph optimization from root - startVertex = Nothing - - -- (heuristicDelta, _, _) = heuristicDeleteDelta inGS inPhyloGraph edgeToDelete - heuristicDelta = 0.0 - - edgeAddDelta = 0.0 -- deltaPenaltyAdjustment inGS inPhyloGraph "delete" - - - -- full two-pass optimization - newPhyloGraphList' = if (graphType inGS == SoftWired) then - PU.seqParMap rdeepseq (T.multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex) simpleGraphList - else if (graphType inGS == HardWired) then - PU.seqParMap rdeepseq (T.multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex) simpleGraphList - else error "Unsupported graph type in deleteNetEdge. Must be soft or hard wired" - - newPhyloGraphList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newPhyloGraphList' - - in - - -- trace ("DNERec: " ++ (show edgeToDelete) ++ " at " ++ (show $ snd6 newPhyloGraph)) ( - -- if not modified retiurn original graph - - -- This check seems to be issue with delete not functioning properly - if null simpleGraphList then [inPhyloGraph] - - -- forcing delete for move - else if force then - -- trace ("DNERec forced") - newPhyloGraphList - - -- regular search not sim anneal/drift - else if (inSimAnnealParams == Nothing) then - - -- harwired must use fiull optimization cost - if (graphType inGS) == HardWired then - -- better - if (snd6 $ head newPhyloGraphList) < (snd6 inPhyloGraph) then newPhyloGraphList - -- not better continue - else deleteNetEdgeRecursive inGS inData leafGraph inPhyloGraph force inSimAnnealParams (drop numGraphsToExamine inEdgeToDeleteList) - - -- better for tree / softwired - -- heuristic no good so just checing result - else if (snd6 $ head newPhyloGraphList) < (snd6 inPhyloGraph) then - -- trace ("DNERec Better -> " ++ (show $ snd6 newPhyloGraph)) - newPhyloGraphList - - else - -- need to update edge list for new graph - -- potentially randomize order of list - deleteNetEdgeRecursive inGS inData leafGraph inPhyloGraph force inSimAnnealParams (drop numGraphsToExamine inEdgeToDeleteList) - - -- sim anneal/drift - else - let numDone = if (method $ fromJust inSimAnnealParams) == SimAnneal then currentStep $ fromJust inSimAnnealParams - else driftChanges $ fromJust inSimAnnealParams - numMax = if (method $ fromJust inSimAnnealParams) == SimAnneal then numberSteps $ fromJust inSimAnnealParams - else driftMaxChanges $ fromJust inSimAnnealParams - - candidateGraphCost = if (graphType inGS) == HardWired then snd6 $ head newPhyloGraphList - else ((snd6 $ head newPhyloGraphList) + (heuristicDelta - edgeAddDelta)) - - (acceptGraph, nextSAParams) = U.simAnnealAccept inSimAnnealParams (snd6 $ head newPhyloGraphList) candidateGraphCost - in - if (numDone < numMax) then - if acceptGraph then newPhyloGraphList - else deleteNetEdgeRecursive inGS inData leafGraph inPhyloGraph force nextSAParams (drop numGraphsToExamine inEdgeToDeleteList) - - -- hit end of SA/Drift - else [inPhyloGraph] - -- ) - --- | deleteEdge deletes an edge (checking if network) and rediagnoses graph --- contacts in=out=1 edgfes and removes node, reindexing nodes and edges --- naive for now --- force requires reoptimization no matter what--used for net move --- slipping heuristics for now--awful -deleteNetEdge :: GlobalSettings -> ProcessedData -> DecoratedGraph -> PhylogeneticGraph -> Bool -> LG.Edge -> PhylogeneticGraph -deleteNetEdge inGS inData leafGraph inPhyloGraph force edgeToDelete = - if LG.isEmpty $ thd6 inPhyloGraph then error "Empty input phylogenetic graph in deleteNetEdge" - else if not (LG.isNetworkEdge (fst6 inPhyloGraph) edgeToDelete) then error ("Edge to delete: " ++ (show edgeToDelete) ++ " not in graph:\n" ++ (LG.prettify $ fst6 inPhyloGraph)) - else - -- trace ("DNE: " ++ (show edgeToDelete)) ( - let (delSimple, wasModified) = deleteNetworkEdge (fst6 inPhyloGraph) edgeToDelete - - -- delSimple = GO.contractIn1Out1EdgesRename $ LG.delEdge edgeToDelete $ fst6 inPhyloGraph - - -- prune other edges if now unused - pruneEdges = False - - -- don't warn that edges are being pruned - warnPruneEdges = False - - -- graph optimization from root - startVertex = Nothing - - -- (heuristicDelta, _, _) = heuristicDeleteDelta inGS inPhyloGraph edgeToDelete - - - -- edgeAddDelta = deltaPenaltyAdjustment inGS inPhyloGraph "delete" - - - -- full two-pass optimization--cycles checked in edge deletion function - newPhyloGraph = if (graphType inGS == SoftWired) then T.multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex delSimple - else if (graphType inGS == HardWired) then - T.multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex delSimple - else error "Unsupported graph type in deleteNetEdge. Must be soft or hard wired" - in - --check if deletino modified graph - if not wasModified then inPhyloGraph - - else if force || (graphType inGS) == HardWired then - -- trace ("DNE forced") - newPhyloGraph - else -- if (heuristicDelta / (dynamicEpsilon inGS)) - edgeAddDelta < 0 then newPhyloGraph - if (snd6 newPhyloGraph) < (snd6 inPhyloGraph) then - -- trace ("DNE Better: " ++ (show $ snd6 newPhyloGraph)) - newPhyloGraph - else - -- trace ("DNE Not Better: " ++ (show $ snd6 newPhyloGraph)) - inPhyloGraph - -- ) - - - - --- | heuristicDeleteDelta takes the existing graph, edge to delete, --- reoptimizes starting nodes of two created edges. Returns cost delta based on --- previous and new node resolution caches --- delete n1 -> n2, create u -> v, u' -> v' --- assumes original is edge n1 -> n2, u' -> (n2, X), n1 -> (n2,v), u (n1,Y) -heuristicDeleteDelta :: GlobalSettings -> PhylogeneticGraph -> LG.Edge -> (VertexCost, LG.LNode VertexInfo, LG.LNode VertexInfo) -heuristicDeleteDelta inGS inPhyloGraph (n1, n2) = - if LG.isEmpty (fst6 inPhyloGraph) then error "Empty graph in heuristicDeleteDelta" - else if graphType inGS == HardWired then - -- ensures delete--will always be lower or equakl cost if delete edge from HardWired - (-1, dummyNode, dummyNode) - else - let inGraph = thd6 inPhyloGraph - u = head $ LG.parents inGraph n1 - u' = head $ filter (/= n1) $ LG.parents inGraph n2 - v' = head $ LG.descendants inGraph n2 - v = head $ filter (/= n2) $ LG.descendants inGraph n1 - - uLab = fromJust $ LG.lab inGraph u - uPrimeLab = fromJust $ LG.lab inGraph u' - vLab = fromJust $ LG.lab inGraph v - vPrimeLab = fromJust $ LG.lab inGraph v' - - uOtherChild = head $ filter ((/= n1) . fst) $ LG.labDescendants inGraph (u, uLab) - uPrimeOtherChild = head $ filter ((/= n2) . fst) $ LG.labDescendants inGraph (u', uPrimeLab) - - -- skip over netnodes - uLabAfter = POSW.getOutDegree2VertexSoftWired inGS (six6 inPhyloGraph) u (v, vLab) uOtherChild inGraph - uPrimeLabAfter = POSW.getOutDegree2VertexSoftWired inGS (six6 inPhyloGraph) u' (v', vPrimeLab) uPrimeOtherChild inGraph - - -- cost of resolutions - (_, uCostBefore) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLab) - (_, uPrimeCostBefore) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLab) - (_, uCostAfter) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLabAfter) - (_, uPrimeCostAfter) = POSW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLabAfter) - - addNetDelta = uCostAfter - uCostBefore + uPrimeCostAfter - uPrimeCostBefore - - - in - -- this should not happen--should try to crete new edges from children of net edges - if null (LG.parents inGraph n1) || null (filter (/= n1) $ LG.parents inGraph n2) || null (LG.descendants inGraph n2) || null (filter (/= n2) $ LG.descendants inGraph n1) || null (filter ((/= n2) . fst) $ LG.labDescendants inGraph (u', uPrimeLab)) || null (filter ((/= n1) . fst) $ LG.labDescendants inGraph (u, uLab)) then (infinity, dummyNode, dummyNode) - -- this should not happen--should try to crete new edges from children of net edges - else if (length (LG.parents inGraph n1) /= 1) || (length (LG.parents inGraph n2) /= 2) || (length (LG.descendants inGraph n2) /= 1) || (length (LG.descendants inGraph n1) /= 2) then error ("Graph malformation in numbers of parents and children in heuristicDeleteDelta") - else - (addNetDelta, (u, uLabAfter), (u', uPrimeLabAfter)) - diff --git a/pkg/PhyGraph/Search/Refinement.hs b/pkg/PhyGraph/Search/Refinement.hs deleted file mode 100644 index 9e84fcc07..000000000 --- a/pkg/PhyGraph/Search/Refinement.hs +++ /dev/null @@ -1,511 +0,0 @@ -{- | -Module : Refinement.hs -Description : Module controlling graph refinement functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.Refinement ( refineGraph - , netEdgeMaster - , fuseGraphs - , swapMaster - , geneticAlgorithmMaster - ) where - -import Control.Parallel.Strategies -import Debug.Trace -import GeneralUtilities -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import Types.Types --- import qualified Search.Swap as S -import Data.Char -import Data.Maybe -import qualified Search.Fuse as F -import qualified Search.GeneticAlgorithm as GA -import qualified Search.NetworkAddDelete as N -import qualified Search.SwapMaster as SM -import Text.Read -import Utilities.Utilities as U -import qualified Commands.Verify as VER - --- | swapMaster moved to Search.SwapMaster due to very long (>20') compile times --- with --enalble-profinling -swapMaster :: [Argument] - -> GlobalSettings - -> ProcessedData - -> Int - -> [PhylogeneticGraph] - -> [PhylogeneticGraph] -swapMaster = SM.swapMaster - - --- | driver for overall refinement -refineGraph :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -refineGraph inArgs inGS inData rSeed inGraphList = - if null inGraphList then errorWithoutStackTrace ("No graphs input to refine") - else - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "refineGraph" fstArgList VER.refineArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'GeneticAlgorithm': " ++ show inArgs) - else - let doNetAdd = any ((=="netadd").fst) lcArgList - doNetDel = any ((=="netdel").fst) lcArgList || (any ((=="netdelete").fst) lcArgList) - doNetAddDel = (any ((=="netadddel").fst) lcArgList) || (any ((=="netadddelete").fst) lcArgList) - doNetMov = any ((=="netmove").fst) lcArgList - doGenAlg = any ((=="ga").fst) lcArgList || any ((=="geneticalgorithm").fst) lcArgList - in - - -- network edge edits - if doNetAdd || doNetDel || doNetAddDel || doNetMov then - netEdgeMaster inArgs inGS inData rSeed inGraphList - - -- genetic algorithm - else if doGenAlg then - geneticAlgorithmMaster inArgs inGS inData rSeed inGraphList - - else error "No refinement operation specified" - --- | geneticAlgorithmMaster takes arguments and performs genetic algorithm on input graphs --- the process follows several steps --- 1) input graphs are mutated --- this step is uncharacteristically first so that is can operate on --- graphs that have been "fused" (recombined) already --- mutated graphs are added up to popsize --- if input graphs are already at the population size, an equal number of mutants are added (exceeding input popsize) --- 2) graph are recombined using fusing operations --- 3) population undergoes selection to population size (unique graphs) --- selection based on delta with best graph and severity factor on (0,Inf) 1 pure cost delta < 1 more severe, > 1 less severe --- if "elitist" (default) 'best' graphs are always selected to ensure no worse. --- 4) operation repearts for number of generations -geneticAlgorithmMaster :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -geneticAlgorithmMaster inArgs inGS inData rSeed inGraphList = - if null inGraphList then trace ("No graphs to undergo Genetic Algorithm") [] - else - trace ("Genetic Algorithm operating on population of " ++ (show $ length inGraphList) ++ " input graph(s) with cost range ("++ (show $ minimum $ fmap snd6 inGraphList) ++ "," ++ (show $ maximum $ fmap snd6 inGraphList) ++ ")") ( - - -- process args - let (doElitist, keepNum, popSize, generations, severity, recombinations, maxNetEdges) = getGeneticAlgParams inArgs - (newGraphList, generationCounter) = GA.geneticAlgorithm inGS inData rSeed doElitist (fromJust maxNetEdges) (fromJust keepNum) (fromJust popSize) (fromJust generations) 0 (fromJust severity) (fromJust recombinations) inGraphList - in - trace ("\tGenetic Algorithm: " ++ (show $ length newGraphList) ++ " resulting graphs with cost range (" ++ (show $ minimum $ fmap snd6 newGraphList) ++ "," ++ (show $ maximum $ fmap snd6 newGraphList) ++ ")" ++ " after " ++ (show generationCounter) ++ " generation(s)") - newGraphList - ) - --- | getGeneticAlgParams returns paramlist from arglist -getGeneticAlgParams :: [Argument] -> (Bool, Maybe Int, Maybe Int, Maybe Int, Maybe Double, Maybe Int, Maybe Int) -getGeneticAlgParams inArgs = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "geneticalgorithm" fstArgList VER.geneticAlgorithmArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'GeneticAlgorithm': " ++ show inArgs) - else - let keepList = filter ((=="keep").fst) lcArgList - keepNum - | length keepList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in fuse command--can have only one: " ++ show inArgs) - | null keepList = Just 10 - | otherwise = readMaybe (snd $ head keepList) :: Maybe Int - - popSizeList = filter ((=="popsize").fst) lcArgList - popSize - | length popSizeList > 1 = - errorWithoutStackTrace ("Multiple 'popsize' number specifications in genetic algorithm command--can have only one: " ++ show inArgs) - | null popSizeList = Just 20 - | otherwise = readMaybe (snd $ head popSizeList) :: Maybe Int - - generationsList = filter ((=="generations").fst) lcArgList - generations - | length generationsList > 1 = - errorWithoutStackTrace ("Multiple 'generations' number specifications in genetic algorithm command--can have only one: " ++ show inArgs) - | null generationsList = Just 10 - | otherwise = readMaybe (snd $ head generationsList) :: Maybe Int - - severityList = filter ((=="severity").fst) lcArgList - severity - | length severityList > 1 = - errorWithoutStackTrace ("Multiple 'severity' number specifications in genetic algorithm command--can have only one: " ++ show inArgs) - | null severityList = Just 1.0 - | otherwise = readMaybe (snd $ head severityList) :: Maybe Double - - recombinationsList = filter ((=="recombinations").fst) lcArgList - recombinations - | length recombinationsList > 1 = - errorWithoutStackTrace ("Multiple 'recombinations' number specifications in genetic algorithm command--can have only one: " ++ show inArgs) - | null recombinationsList = Just 100 - | otherwise = readMaybe (snd $ head recombinationsList) :: Maybe Int - - maxNetEdgesList = filter ((=="maxnetedges").fst) lcArgList - maxNetEdges - | length maxNetEdgesList > 1 = - errorWithoutStackTrace ("Multiple 'maxNetEdges' number specifications in genetic algorithm command--can have only one: " ++ show inArgs) - | null maxNetEdgesList = Just 10 - | otherwise = readMaybe (snd $ head maxNetEdgesList) :: Maybe Int - - -- in case want to make it an option - -- doElitist' = any ((=="nni").fst) lcArgList - doElitist = True - in - - --check arguments - if isNothing keepNum then errorWithoutStackTrace ("Keep specification not an integer in Genetic Algorithm: " ++ show (head keepList)) - else if isNothing popSize then errorWithoutStackTrace ("PopSize specification not an integer in Genetic Algorithm: " ++ show (head popSizeList)) - else if isNothing generations then errorWithoutStackTrace ("Generations specification not an integer in Genetic Algorithm: " ++ show (head generationsList)) - else if isNothing severity then errorWithoutStackTrace ("Severity factor specification not an integer in Genetic Algorithm: " ++ show (head severityList)) - else if isNothing recombinations then errorWithoutStackTrace ("Severity factor specification not an integer in Genetic Algorithm: " ++ show (head recombinationsList)) - - else - (doElitist, keepNum, popSize, generations, severity, recombinations, maxNetEdges) - --- | fuseGraphs is a wrapper for graph recombination --- the functions make heavy use of branch swapping functions in Search.Swap -fuseGraphs :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -fuseGraphs inArgs inGS inData rSeed inGraphList = - if null inGraphList then trace ("No graphs to fuse") [] - else - trace ("Fusing " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList)) ( - - -- process args for fuse placement - let (keepNum, maxMoveEdgeDist, fusePairs, lcArgList) = getFuseGraphParams inArgs - - -- steepest off by default due to wanteing to check all addition points - doSteepest' = any ((=="steepest").fst) lcArgList - doAll = any ((=="all").fst) lcArgList - - doSteepest = if (not doSteepest' && not doAll) then False - else doSteepest' - - -- readdition options, specified as swap types - doNNI' = any ((=="nni").fst) lcArgList - doSPR' = any ((=="spr").fst) lcArgList - doTBR = any ((=="tbr").fst) lcArgList - doSPR = if doTBR then False - else doSPR' - doNNI = if doSPR || doTBR then False - else doNNI' - - returnBest = any ((=="best").fst) lcArgList - returnUnique = any ((=="unique").fst) lcArgList - doSingleRound = any ((=="once").fst) lcArgList - randomPairs = any ((=="atrandom").fst) lcArgList - fusePairs' = if fusePairs == Just (maxBound :: Int) then Nothing - else fusePairs - - seedList = randomIntList rSeed - in - -- perform graph fuse operations - let (newGraphList, counterFuse) = F.fuseAllGraphs inGS inData seedList (fromJust keepNum) (2 * (fromJust maxMoveEdgeDist)) 0 doNNI doSPR doTBR doSteepest doAll returnBest returnUnique doSingleRound fusePairs' randomPairs inGraphList - - in - trace ("\tAfter fusing: " ++ (show $ length newGraphList) ++ " resulting graphs with minimum cost " ++ (show $ minimum $ fmap snd6 newGraphList) ++ " after fuse rounds (total): " ++ (show counterFuse)) - newGraphList - ) - --- | getFuseGraphParams returns fuse parameters from arglist -getFuseGraphParams :: [Argument] -> (Maybe Int, Maybe Int, Maybe Int, [(String, String)]) -getFuseGraphParams inArgs = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "fuse" fstArgList VER.fuseArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'fuse': " ++ show inArgs) - else - let keepList = filter ((=="keep").fst) lcArgList - keepNum - | length keepList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in fuse command--can have only one: " ++ show inArgs) - | null keepList = Just 10 - | otherwise = readMaybe (snd $ head keepList) :: Maybe Int - - moveLimitList = filter (not . null) $ fmap snd $ filter ((/="keep").fst) lcArgList - maxMoveEdgeDist - | length moveLimitList > 1 = - errorWithoutStackTrace ("Multiple maximum edge distance number specifications in fuse command--can have only one (e.g. spr:2): " ++ show inArgs) - | null moveLimitList = Just ((maxBound :: Int) `div` 3) - | otherwise = readMaybe (head moveLimitList) :: Maybe Int - - pairList = filter ((=="pairs").fst) lcArgList - fusePairs - | length pairList > 1 = - errorWithoutStackTrace ("Multiple 'pair' number specifications in fuse command--can have only one: " ++ show inArgs) - | null pairList = Just (maxBound :: Int) - | otherwise = readMaybe (snd $ head pairList) :: Maybe Int - - in - - --check arguments - if isNothing keepNum then errorWithoutStackTrace ("Keep specification not an integer in swap: " ++ show (head keepList)) - else if isNothing maxMoveEdgeDist then errorWithoutStackTrace ("Maximum edge move distance specification in fuse command not an integer (e.g. spr:2): " ++ show (head moveLimitList)) - else if isNothing fusePairs then errorWithoutStackTrace ("fusePairs specification not an integer in fuse: " ++ show (head pairList)) - - else - (keepNum, maxMoveEdgeDist, fusePairs, lcArgList) - --- | netEdgeMaster overall master for add/delete net edges -netEdgeMaster :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -netEdgeMaster inArgs inGS inData rSeed inGraphList = - if null inGraphList then trace ("No graphs to edit network edges") [] - else if graphType inGS == Tree then trace ("\tCannot perform network edge operations on graphtype tree--set graphtype to SoftWired or HardWired") inGraphList - - -- process args for netEdgeMaster - else - let (keepNum, steps', annealingRounds', driftRounds', acceptEqualProb, acceptWorseFactor, maxChanges, maxNetEdges, lcArgList, maxRounds) = getNetEdgeParams inArgs - doNetAdd = any ((=="netadd").fst) lcArgList - doNetDelete = (any ((=="netdel").fst) lcArgList) || (any ((=="netdelete").fst) lcArgList) - doAddDelete = (any ((=="netadddel").fst) lcArgList) || (any ((=="netadddelete").fst) lcArgList) - doMove = any ((=="netmove").fst) lcArgList - doSteepest' = any ((=="steepest").fst) lcArgList - doAll = any ((=="all").fst) lcArgList - - -- do steepest default - doSteepest = if (not doSteepest' && not doAll) then True - else if doSteepest' && doAll then True - else doSteepest' - doRandomOrder = any ((=="atrandom").fst) lcArgList - - -- simulated annealing parameters - -- returnMutated to return annealed Graphs before swapping fir use in Genetic Algorithm - doAnnealing = any ((=="annealing").fst) lcArgList - - doDrift = any ((=="drift").fst) lcArgList - - returnMutated = any ((=="returnmutated").fst) lcArgList - - simAnnealParams = if (not doAnnealing && not doDrift) then Nothing - else - let steps = max 3 (fromJust steps') - annealingRounds = if annealingRounds' == Nothing then 1 - else if fromJust annealingRounds' < 1 then 1 - else fromJust annealingRounds' - - driftRounds = if driftRounds' == Nothing then 1 - else if fromJust driftRounds' < 1 then 1 - else fromJust driftRounds' - - saMethod = if doDrift && doAnnealing then - trace ("\tSpecified both Simulated Annealing (with temperature steps) and Drifting (without)--defaulting to drifting.") - Drift - else if doDrift then Drift - else SimAnneal - - equalProb = if fromJust acceptEqualProb < 0.0 then 0.0 - else if fromJust acceptEqualProb > 1.0 then 1.0 - else fromJust acceptEqualProb - - worseFactor = if fromJust acceptWorseFactor < 0.0 then 0.0 - else fromJust acceptWorseFactor - - changes = if fromJust maxChanges < 0 then 15 - else fromJust maxChanges - - saValues = SAParams { method = saMethod - , numberSteps = steps - , currentStep = 0 - , randomIntegerList = randomIntList rSeed - , rounds = max annealingRounds driftRounds - , driftAcceptEqual = equalProb - , driftAcceptWorse = worseFactor - , driftMaxChanges = changes - , driftChanges = 0 - } - in - Just saValues - - - -- create simulated annealing random lists uniquely for each fmap - newSimAnnealParamList = U.generateUniqueRandList (length inGraphList) simAnnealParams - - -- perform add/delete/move operations - bannerText = if simAnnealParams /= Nothing then - if (method $ fromJust simAnnealParams) == SimAnneal then - ("Simulated Annealing (Network edge moves) " ++ (show $ rounds $ fromJust simAnnealParams) ++ " rounds " ++ (show $ length inGraphList) ++ " with " ++ (show $ numberSteps $ fromJust simAnnealParams) ++ " cooling steps " ++ (show $ length inGraphList) ++ " input graph(s) at minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " keeping maximum of " ++ (show $ fromJust keepNum) ++ " graphs") - else - ("Drifting (Network edge moves) " ++ (show $ rounds $ fromJust simAnnealParams) ++ " rounds " ++ (show $ length inGraphList) ++ " with " ++ (show $ numberSteps $ fromJust simAnnealParams) ++ " cooling steps " ++ (show $ length inGraphList) ++ " input graph(s) at minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " keeping maximum of " ++ (show $ fromJust keepNum) ++ " graphs") - else if doNetDelete then - ("Network edge delete on " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList)) - else if doNetAdd then - ("Network edge add on " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " and maximum " ++ (show $ fromJust maxRounds) ++ " rounds") - else if doAddDelete then - ("Network edge add/delete on " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " and maximum " ++ (show $ fromJust maxRounds) ++ " rounds") - else if doMove then - ("Network edge move on " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList)) - else "" - - in - trace (bannerText) ( - - let (newGraphList, counterAdd) = if doNetAdd then - -- trace ("REFINE Add") ( - let graphPairList = PU.seqParMap rdeepseq (N.insertAllNetEdges inGS inData rSeed (fromJust maxNetEdges) (fromJust keepNum) (fromJust maxRounds) 0 returnMutated doSteepest doRandomOrder ([], infinity)) (zip newSimAnnealParamList (fmap (: []) inGraphList)) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - -- ) - else (inGraphList, 0) - - - (newGraphList', counterDelete) = if doNetDelete then - -- trace ("REFINE Delete") ( - let graphPairList = PU.seqParMap rdeepseq (N.deleteAllNetEdges inGS inData rSeed (fromJust maxNetEdges) (fromJust keepNum) 0 returnMutated doSteepest doRandomOrder ([], infinity)) (zip newSimAnnealParamList (fmap (: []) newGraphList)) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - -- ) - else (newGraphList, 0) - - - (newGraphList'', counterMove) = if doMove then - trace ("Network move option currently disabled--skipping.") - (newGraphList', 0 :: Int) - {- - let graphPairList = PU.seqParMap rdeepseq (N.moveAllNetEdges inGS inData rSeed (fromJust maxNetEdges) (fromJust keepNum) 0 returnMutated doSteepest doRandomOrder ([], infinity)) (zip newSimAnnealParamList (fmap (: []) newGraphList')) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - -} - else (newGraphList', 0) - - (newGraphList''', counterAddDelete) = if doAddDelete then - let graphPairList = PU.seqParMap rdeepseq (N.addDeleteNetEdges inGS inData rSeed (fromJust maxNetEdges) (fromJust keepNum) (fromJust maxRounds) 0 returnMutated doSteepest doRandomOrder ([], infinity)) (zip newSimAnnealParamList (fmap (: []) newGraphList'')) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - - else (newGraphList'', 0) - - in - let resultGraphList = if null newGraphList''' then inGraphList - else take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ newGraphList''' - in - trace ("\tAfter network edge add/delete/move: " ++ (show $ length resultGraphList) ++ " resulting graphs at cost " ++ (show $ minimum $ fmap snd6 resultGraphList) ++ " with add/delete/move rounds (total): " ++ (show counterAdd) ++ " Add, " - ++ (show counterDelete) ++ " Delete, " ++ (show counterMove) ++ " Move, " ++ (show counterAddDelete) ++ " AddDelete") - resultGraphList - ) - --- | getNetEdgeParams returns net edge cparameters from argument list -getNetEdgeParams :: [Argument] -> (Maybe Int, Maybe Int, Maybe Int, Maybe Int, Maybe Double, Maybe Double, Maybe Int, Maybe Int, [(String, String)], Maybe Int) -getNetEdgeParams inArgs = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "netEdgeMaster" fstArgList VER.netEdgeArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'netEdge': " ++ show inArgs) - else - let keepList = filter ((=="keep").fst) lcArgList - keepNum - | length keepList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in netEdge command--can have only one: " ++ show inArgs) - | null keepList = Just 10 - | otherwise = readMaybe (snd $ head keepList) :: Maybe Int - - -- simulated anealing options - stepsList = filter ((=="steps").fst) lcArgList - steps' - | length stepsList > 1 = - errorWithoutStackTrace ("Multiple annealing steps value specifications in netEdge command--can have only one (e.g. steps:10): " ++ show inArgs) - | null stepsList = Just 10 - | otherwise = readMaybe (snd $ head stepsList) :: Maybe Int - - annealingList = filter ((=="annealing").fst) lcArgList - annealingRounds' - | length annealingList > 1 = - errorWithoutStackTrace ("Multiple 'annealing' rounds number specifications in netEdge command--can have only one: " ++ show inArgs) - | null annealingList = Just 1 - | otherwise = readMaybe (snd $ head annealingList) :: Maybe Int - - -- drift options - driftList = filter ((=="drift").fst) lcArgList - driftRounds' - | length driftList > 1 = - errorWithoutStackTrace ("Multiple 'drift' rounds number specifications in swap command--can have only one: " ++ show inArgs) - | null driftList = Just 1 - | otherwise = readMaybe (snd $ head driftList) :: Maybe Int - - acceptEqualList = filter ((=="acceptequal").fst) lcArgList - acceptEqualProb - | length acceptEqualList > 1 = - errorWithoutStackTrace ("Multiple 'drift' acceptEqual specifications in swap command--can have only one: " ++ show inArgs) - | null acceptEqualList = Just 0.5 - | otherwise = readMaybe (snd $ head acceptEqualList) :: Maybe Double - - acceptWorseList = filter ((=="acceptworse").fst) lcArgList - acceptWorseFactor - | length acceptWorseList > 1 = - errorWithoutStackTrace ("Multiple 'drift' acceptWorse specifications in swap command--can have only one: " ++ show inArgs) - | null acceptWorseList = Just 1.0 - | otherwise = readMaybe (snd $ head acceptWorseList) :: Maybe Double - - maxChangesList = filter ((=="maxchanges").fst) lcArgList - maxChanges - | length maxChangesList > 1 = - errorWithoutStackTrace ("Multiple 'drift' maxChanges number specifications in swap command--can have only one: " ++ show inArgs) - | null maxChangesList = Just 15 - | otherwise = readMaybe (snd $ head maxChangesList) :: Maybe Int - - maxNetEdgesList = filter ((=="maxnetedges").fst) lcArgList - maxNetEdges - | length maxNetEdgesList > 1 = - errorWithoutStackTrace ("Multiple 'maxNetEdges' number specifications in netEdge command--can have only one: " ++ show inArgs) - | null maxNetEdgesList = Just 10 - | otherwise = readMaybe (snd $ head maxNetEdgesList) :: Maybe Int - - maxRoundsList = filter ((=="rounds").fst) lcArgList - maxRounds - | length maxRoundsList > 1 = - errorWithoutStackTrace ("Multiple 'rounds' number specifications in netEdge command--can have only one: " ++ show inArgs) - | null maxRoundsList = Just 1 - | otherwise = readMaybe (snd $ head maxRoundsList) :: Maybe Int - - in - - -- check inputs - if isNothing keepNum then errorWithoutStackTrace ("Keep specification not an integer in netEdge: " ++ show (head keepList)) - else if isNothing steps' then errorWithoutStackTrace ("Annealing steps specification not an integer (e.g. steps:10): " ++ show (snd $ head stepsList)) - else if isNothing acceptEqualProb then errorWithoutStackTrace ("Drift 'acceptEqual' specification not a float (e.g. acceptEqual:0.75): " ++ show (snd $ head acceptEqualList)) - else if isNothing acceptWorseFactor then errorWithoutStackTrace ("Drift 'acceptWorse' specification not a float (e.g. acceptWorse:1.0): " ++ show (snd $ head acceptWorseList)) - else if isNothing maxChanges then errorWithoutStackTrace ("Drift 'maxChanges' specification not an integer (e.g. maxChanges:10): " ++ show (snd $ head maxChangesList)) - - else if isNothing maxRounds then errorWithoutStackTrace ("Network edit 'rounds' specification not an integer (e.g. rounds:10): " ++ show (snd $ head maxRoundsList)) - - else - (keepNum, steps', annealingRounds', driftRounds', acceptEqualProb, acceptWorseFactor, maxChanges, maxNetEdges, lcArgList, maxRounds) diff --git a/pkg/PhyGraph/Search/Search.hs b/pkg/PhyGraph/Search/Search.hs deleted file mode 100644 index 2225cf594..000000000 --- a/pkg/PhyGraph/Search/Search.hs +++ /dev/null @@ -1,375 +0,0 @@ -{- | -Module : Search.hs -Description : Module controlling timed randomized search functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.Search (search - ) where - -import qualified Commands.Transform as TRANS -import qualified Commands.Verify as VER -import Control.Concurrent.Async -import Control.DeepSeq -import Data.Bifunctor (bimap) -import Data.Char -import Data.Foldable -import qualified Data.List as L -import Data.Maybe -import GeneralUtilities -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified Search.Build as B -import qualified Search.Refinement as R -import System.Timing -import Text.Read -import Types.Types - --- | A strict, three-way version of 'uncurry'. -uncurry3' :: (Functor f, NFData d) => (a -> b -> c -> f d) -> (a, b, c) -> f d -uncurry3' f (a, b, c) = force <$> f a b c - --- | search timed randomized search returns graph list and comment list with info String for each serch instance -search :: [Argument] -> GlobalSettings -> ProcessedData -> [[VertexCost]] -> Int -> [PhylogeneticGraph] -> IO ([PhylogeneticGraph], [[String]]) -search inArgs inGS inData pairwiseDistances rSeed inGraphList = - let (searchTime, keepNum, instances) = getSearchParams inArgs - threshold = fromSeconds . fromIntegral $ (9 * searchTime) `div` 10 - searchTimed = uncurry3' $ searchForDuration inGS inData pairwiseDistances keepNum threshold [] - infoIndices = [1..] - seadStreams = randomIntList <$> randomIntList rSeed - in do -- threadCount <- (max 1) <$> getNumCapabilities - let threadCount = instances -- <- (max 1) <$> getNumCapabilities - let startGraphs = replicate threadCount (inGraphList, mempty) - let threadInits = zip3 infoIndices seadStreams startGraphs - resultList <- mapConcurrently searchTimed threadInits - pure $ - let (newGraphList, commentList) = unzip resultList - completeGraphList = inGraphList <> fold newGraphList - filteredGraphList = GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] completeGraphList - selectedGraphList = take keepNum filteredGraphList - in (selectedGraphList, commentList) - - -searchForDuration :: GlobalSettings -> ProcessedData -> [[VertexCost]] -> Int -> CPUTime -> [String] -> Int -> [Int] -> ([PhylogeneticGraph], [String]) -> IO ([PhylogeneticGraph], [String]) -searchForDuration inGS inData pairwiseDistances keepNum allotedSeconds inCommentList refIndex seedList input@(inGraphList, infoStringList) = do - (elapsedSeconds, output) <- timeOp $ - let result = force $ performSearch inGS inData pairwiseDistances keepNum (head seedList) input - in pure result - let remainingTime = allotedSeconds `timeDifference` elapsedSeconds - putStrLn $ unlines [ "Thread \t" <> show refIndex - , "Alloted \t" <> show allotedSeconds - , "Ellapsed \t" <> show elapsedSeconds - , "Remaining\t" <> show remainingTime - ] - if elapsedSeconds >= allotedSeconds - then pure output - else searchForDuration inGS inData pairwiseDistances keepNum remainingTime (inCommentList ++ (snd output)) refIndex (tail seedList) $ bimap (inGraphList <>) (infoStringList <>) output - - --- | perform search takes in put graphs and performs randomized build and search with time limit --- if run completres before 90% of time limit then will keep going -performSearch :: GlobalSettings -> ProcessedData -> [[VertexCost]] -> Int -> Int -> ([PhylogeneticGraph], [String]) -> ([PhylogeneticGraph], [String]) -performSearch inGS' inData' pairwiseDistances keepNum rSeed (inGraphList', _) = - -- set up basic parameters for search/refine methods - let randIntList = randomIntList rSeed - buildType = getRandomElement (randIntList !! 0) ["distance", "character"] - buildMethod = getRandomElement (randIntList !! 1) ["unitary", "block"] - - -- build options - numToCharBuild = keepNum - numToDistBuild = (100 :: Int) - - -- build block options - reconciliationMethod = getRandomElement (randIntList !! 2) ["eun", "cun"] - - distOptions = if buildType == "distance" then - if buildMethod == "block" then [("replicates", show numToDistBuild), ("rdwag", ""), ("best", show (1 :: Int))] - else [("replicates", show numToDistBuild), ("rdwag", ""), ("best", show numToCharBuild)] - else - if buildMethod == "block" then [("replicates", show numToCharBuild)] - else [("replicates", show (1 :: Int))] - - blockOptions = if buildMethod == "block" then [("block", ""),("atRandom", ""),("displaytrees", show numToCharBuild),(reconciliationMethod, "")] - else [] - - buildArgs = [(buildType, "")] ++ distOptions ++ blockOptions - - -- swap options - swapType = getRandomElement (randIntList !! 3) ["nni", "spr", "tbr"] - swapKeep = keepNum - swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep)] - - - -- fuse options - fuseSwap = getRandomElement (randIntList !! 8) ["nni", "spr", "tbr"] - fusePairs = getRandomElement (randIntList !! 9) ["20", "40", "100"] - fuseKeep = 2 * keepNum - fuseArgs = [(fuseSwap, ""), ("steepest",""), ("unique",""), ("atrandom", ""), ("pairs", fusePairs), ("keep", show fuseKeep)] - - -- net edit options - netGenArgs = [("keep", show keepNum), ("steepest", ""), ("atRandom", "")] - - -- net move options - netMoveArgs = ("netMove", "") : netGenArgs - - -- net add options - netAddArgs = ("netAdd", "") : netGenArgs - - -- net delete options - netDelArgs = ("netDel", "") : netGenArgs - - - -- Genetic Algorithm Arguments - popSize = getRandomElement (randIntList !! 11) ["10", "20", "40"] - generations = getRandomElement (randIntList !! 12) ["1"] -- , "2" , "4"] - severity = getRandomElement (randIntList !! 13) ["0.0", "1.0", "2.0"] - recombinations = getRandomElement (randIntList !! 14) ["20", "40", "100"] - - gaArgs = [("popsize", popSize), ("generations", generations), ("severity", severity), ("recombinations", recombinations)] - - -- drift options - maxChanges = getRandomElement (randIntList !! 15) ["5", "10", "15"] - acceptEqual = getRandomElement (randIntList !! 16) ["0.1", "0.5", "0.75"] - acceptWorse = getRandomElement (randIntList !! 17) ["0.0", "2.0", "10.0"] - - drfitArgs = [("drift", ""),("maxChanges", maxChanges), ("acceptEqual", acceptEqual), ("acceptWorse", acceptWorse)] - - -- simulated annealing options - tempSteps = getRandomElement (randIntList !! 18) ["5", "10", "15"] - - simulatedAnnealArgs = [("annealing", ""),("steps", tempSteps)] - - -- swap with drift arguments - swapDriftArgs = swapArgs ++ drfitArgs - - -- swap with simulated anneling options - swapAnnealArgs = swapArgs ++ simulatedAnnealArgs - - -- choose staticApproximation or not - transformToStaticApproximation = (not . null) inGraphList' && getRandomElement (randIntList !! 19) [True, False, False] - ((inGS, _, inData, inGraphList), staticApproxString) = if transformToStaticApproximation then - (TRANS.transform [("staticapprox",[])] inGS inData' inData' 0 inGraphList', "StaticApprox ") - else ((inGS', inData', inData', inGraphList'), "") - - - in - - -- no input graphs so must build and refine a bit to start - if null inGraphList then - - -- do build + Net add (if network) + swap to crete initial solutions - let buildGraphs = B.buildGraph buildArgs inGS inData pairwiseDistances (randIntList !! 4) - uniqueBuildGraphs = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] buildGraphs - - netAddGraphs = if (graphType inGS == Tree) then uniqueBuildGraphs - else R.netEdgeMaster netAddArgs inGS inData (randIntList !! 10) uniqueBuildGraphs - - swapGraphs = R.swapMaster swapArgs inGS inData (randIntList !! 5) netAddGraphs - - uniqueSwapGraphs = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] swapGraphs - searchString = if (graphType inGS == Tree) then "Build " ++ (L.intercalate "," $ fmap showArg buildArgs) ++ " Swap " ++ (L.intercalate "," $ fmap showArg swapArgs) - else "Build " ++ (L.intercalate "," $ fmap showArg buildArgs) ++ " Net Add " ++ (L.intercalate "," $ fmap showArg netAddArgs) ++ " Swap " ++ (L.intercalate "," $ fmap showArg swapArgs) - in (uniqueSwapGraphs, [searchString]) - -- performSearch inArgs inGS inData pairwiseDistances keepNum startTime searchTime (searchString : infoStringList) (randIntList !! 6) uniqueSwapGraphs - - - -- already have some input graphs - -- choose a method and paramteres at random - else - let operation = if (graphType inGS == Tree) then getRandomElement (randIntList !! 7) ["buildSwap","fuse", "GeneticAlgorithm", "swapAnneal", "swapDrift"] - else getRandomElement (randIntList !! 7) ["buildSwap","fuse", "GeneticAlgorithm", "swapAnneal", "swapDrift", "netAdd", "netDelete", "netMove"] -- add/del/move edges with and without drifting - - -- this so 1/2 time annealing - saDrift = getRandomElement (randIntList !! 19) ["noSA", "noSA", "drift", "anneal"] - - -- for rediagnosis after static approx - -- sequential rediagnosis since assuming that if in parallel teh search operations are running in parallel - pruneEdges = False - warnPruneEdges = False - startVertex = Nothing - in - if operation == "buildSwap" then - let buildGraphs = B.buildGraph buildArgs inGS inData pairwiseDistances (randIntList !! 4) - uniqueBuildGraphs = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] buildGraphs - swapGraphs = R.swapMaster swapArgs inGS inData (randIntList !! 5) uniqueBuildGraphs - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (swapGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "Build " ++ (L.intercalate "," $ fmap showArg buildArgs) ++ " Swap " ++ (L.intercalate "," $ fmap showArg swapArgs) - in - (uniqueGraphs, [searchString]) - - else if operation == "fuse" then - let fuseGraphs = R.fuseGraphs fuseArgs inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (fuseGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "Fuse " ++ (L.intercalate "," $ fmap showArg fuseArgs) - in - (uniqueGraphs, [searchString]) - - else if operation == "GeneticAlgorithm" then - let gaGraphs = R.geneticAlgorithmMaster gaArgs inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (gaGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "Genetic Algorithm " ++ (L.intercalate "," $ fmap showArg gaArgs) - in - (uniqueGraphs, [searchString]) - - else if operation == "swapDrift" then - let swapDriftGraphs = R.swapMaster swapDriftArgs inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (swapDriftGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "SwapDrift " ++ (L.intercalate "," $ fmap showArg swapDriftArgs) - in - (uniqueGraphs, [searchString]) - - else if operation == "swapAnneal" then - let swapAnnealGraphs = R.swapMaster swapAnnealArgs inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (swapAnnealGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "SwapAnneal " ++ (L.intercalate "," $ fmap showArg swapAnnealArgs) - in - (uniqueGraphs, [searchString]) - - else if operation == "netMove" then - let netMoveArgs' = if saDrift == "noSA" then netMoveArgs - else if saDrift == "drift" then netMoveArgs ++ drfitArgs - else netMoveArgs ++ simulatedAnnealArgs - netMoveGraphs = R.netEdgeMaster netMoveArgs' inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (netMoveGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "NetMove " ++ (L.intercalate "," $ fmap showArg netMoveArgs') - in - (uniqueGraphs, [searchString]) - - else if operation == "netAdd" then - let netAddArgs' = if saDrift == "noSA" then netAddArgs - else if saDrift == "drift" then netAddArgs ++ drfitArgs - else netAddArgs ++ simulatedAnnealArgs - netAddGraphs = R.netEdgeMaster netAddArgs' inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (netAddGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "NetAdd " ++ (L.intercalate "," $ fmap showArg netAddArgs') - in - (uniqueGraphs, [searchString]) - - else if operation == "netDelete" then - let netDelArgs' = if saDrift == "noSA" then netDelArgs - else if saDrift == "drift" then netDelArgs ++ drfitArgs - else netDelArgs ++ simulatedAnnealArgs - netDelGraphs = R.netEdgeMaster netDelArgs' inGS inData (randIntList !! 10) inGraphList - uniqueGraphs' = take keepNum $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (netDelGraphs ++ inGraphList) - uniqueGraphs = if not transformToStaticApproximation then uniqueGraphs' - else fmap (T.multiTraverseFullyLabelGraph inGS' inData' pruneEdges warnPruneEdges startVertex) (fmap fst6 uniqueGraphs') - searchString = staticApproxString ++ "netDelete " ++ (L.intercalate "," $ fmap showArg netDelArgs') - in - (uniqueGraphs, [searchString]) - - - else error ("Unknown/unimplemented method in search: " ++ operation) - where showArg a = "(" ++ (fst a) ++ "," ++ (snd a) ++ ")" - --- | getSearchParams takes arguments and returns search params -getSearchParams :: [Argument] -> (Int, Int, Int) -getSearchParams inArgs = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "search" fstArgList VER.searchArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'search': " ++ show inArgs) - else - let instancesList = filter ((=="instances").fst) lcArgList - instances - | length instancesList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in search command--can have only one: " ++ show inArgs) - | null instancesList = Just 1 - | otherwise = readMaybe (snd $ head instancesList) :: Maybe Int - - keepList = filter ((=="keep").fst) lcArgList - keepNum - | length keepList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in search command--can have only one: " ++ show inArgs) - | null keepList = Just 10 - | otherwise = readMaybe (snd $ head keepList) :: Maybe Int - - daysList = filter ((=="days").fst) lcArgList - days - | length daysList > 1 = - errorWithoutStackTrace ("Multiple 'days' number specifications in search command--can have only one: " ++ show inArgs) - | null daysList = Just 0 - | otherwise = readMaybe (snd $ head daysList) :: Maybe Int - - hoursList = filter ((=="hours").fst) lcArgList - hours - | length hoursList > 1 = - errorWithoutStackTrace ("Multiple 'hours' number specifications in search command--can have only one: " ++ show inArgs) - | null hoursList = Just 0 - | otherwise = readMaybe (snd $ head hoursList) :: Maybe Int - - minutesList = filter ((=="minutes").fst) lcArgList - minutes - | length minutesList > 1 = - errorWithoutStackTrace ("Multiple 'minutes' number specifications in search command--can have only one: " ++ show inArgs) - | null minutesList = Just 0 - | otherwise = readMaybe (snd $ head minutesList) :: Maybe Int - - secondsList = filter ((=="seconds").fst) lcArgList - seconds - | length secondsList > 1 = - errorWithoutStackTrace ("Multiple 'seconds' number specifications in search command--can have only one: " ++ show inArgs) - | null secondsList = Just 30 - | otherwise = readMaybe (snd $ head secondsList) :: Maybe Int - - in - if isNothing keepNum then errorWithoutStackTrace ("Keep specification not an integer in search: " ++ show (head keepList)) - else if isNothing instances then errorWithoutStackTrace ("Instnaces specification not an integer in search: " ++ show (head instancesList)) - else if isNothing days then errorWithoutStackTrace ("Days specification not an integer in search: " ++ show (head daysList)) - else if isNothing hours then errorWithoutStackTrace ("Hours specification not an integer in search: " ++ show (head hoursList)) - else if isNothing minutes then errorWithoutStackTrace ("Minutes specification not an integer in search: " ++ show (head minutesList)) - else if isNothing seconds then errorWithoutStackTrace ("seconds factor specification not an integer in search: " ++ show (head secondsList)) - - else - let seconds' = if ((fromJust minutes > 0) || (fromJust hours > 0) || (fromJust days > 0)) && (null secondsList) then Just 0 - else seconds - searchTime = (fromJust seconds') + (60 * (fromJust minutes)) + (3600 * (fromJust hours)) - in - (searchTime, fromJust keepNum, fromJust instances) - diff --git a/pkg/PhyGraph/Search/Swap.hs b/pkg/PhyGraph/Search/Swap.hs deleted file mode 100644 index e98e85430..000000000 --- a/pkg/PhyGraph/Search/Swap.hs +++ /dev/null @@ -1,1634 +0,0 @@ -{- | -Module : Swap.hs -Description : Module specifying graph swapping rearrangement functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.Swap ( swapSPRTBR - , reoptimizeSplitGraphFromVertexTuple - , rejoinGraphTuple - ) where - -import Control.Parallel.Strategies -import qualified Data.List as L -import Data.Maybe -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified GraphOptimization.PostOrderFunctions as POS -import qualified GraphOptimization.PreOrderFunctions as PRE -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import Types.Types -import qualified Utilities.LocalGraph as LG -import Utilities.Utilities as U -import qualified GraphOptimization.PostOrderSoftWiredFunctions as POSW ---import Debug.Trace - - --- | swapSPRTBR perfomrs SPR or TBR branch (edge) swapping on graphs --- runs both SPR and TBR depending on argument since so much duplicated functionality --- 'steepest' abandons swap graph and switces to found graph as soon as anyhting 'better' --- is found. The alternative (all) examines the entire neighborhood and retuns the best result --- the retuns is a list of better graphs and the number of swapping rounds were required to ge there -swapSPRTBR :: String - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Bool - -> Bool - -> Bool - -> (Maybe SAParams, PhylogeneticGraph) - -> ([PhylogeneticGraph], Int) -swapSPRTBR swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate doIA returnMutated (inSimAnnealParams, inGraph) = - -- trace ("In swapSPRTBR:") ( - if LG.isEmpty (fst6 inGraph) then ([], 0) - else - let numLeaves = V.length $ fst3 inData - leafGraph = T.makeSimpleLeafGraph inData - leafDecGraph = T.makeLeafGraph inData - leafGraphSoftWired = POSW.makeLeafGraphSoftWired inData - charInfoVV = six6 inGraph - - - inGraphNetPenalty = if (graphType inGS == Tree) || (graphType inGS == HardWired) then 0.0 - else if (graphFactor inGS) == NoNetworkPenalty then 0.0 - else if (graphFactor inGS) == Wheeler2015Network then T.getW15NetPenalty Nothing inGraph - else if (graphFactor inGS) == Wheeler2023Network then T.getW23NetPenalty Nothing inGraph - else error ("Network penalty type " ++ (show $ graphFactor inGS) ++ " is not yet implemented") - inGraphNetPenaltyFactor = inGraphNetPenalty / (snd6 inGraph) - in - -- trace ("SSPRTBR:" ++ (show inGraphNetPenaltyFactor)) ( - - if inSimAnnealParams == Nothing then - -- trace ("Non SA swap") ( - -- steepest takes immediate best--does not keep equall cost-- for now--disabled not working correctly so goes to "all" - -- Nothing for SimAnneal Params - let (swappedGraphs, counter) = swapAll' swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate 0 (snd6 inGraph) [] [inGraph] numLeaves leafGraph leafDecGraph leafGraphSoftWired charInfoVV doIA inGraphNetPenaltyFactor inSimAnnealParams - in - -- trace ("SWAPSPRTBR: " ++ (show $ fmap snd6 swappedGraphs)) ( - if null swappedGraphs then ([inGraph], counter) - else (swappedGraphs, counter) - -- ) - - - -- simulated annealing/drifting acceptance does a steepest with SA acceptance - -- then a swap steepest and all on annealed graph - -- same at this level method (SA, Drift) choice occurs at lower level - else - -- annealed should only yield a single graph - --trace ("\tAnnealing/Drifting Swap") ( - let -- create list of params with unique list of random values for rounds of annealing - annealingRounds = rounds $ fromJust inSimAnnealParams - newSimAnnealParamList = U.generateUniqueRandList annealingRounds inSimAnnealParams - - -- this to ensure current step set to 0 - (annealedGraphs', anealedCounter) = unzip $ (PU.seqParMap rdeepseq (swapAll' swapType inGS inData 1 maxMoveEdgeDist True alternate 0 (snd6 inGraph) [] [inGraph] numLeaves leafGraph leafDecGraph leafGraphSoftWired charInfoVV doIA inGraphNetPenaltyFactor) newSimAnnealParamList) -- `using` PU.myParListChunkRDS) - - annealedGraphs = take numToKeep $ GO.selectPhylogeneticGraph [("unique","")] 0 ["unique"] $ concat annealedGraphs' - - (swappedGraphs, counter) = swapAll' swapType inGS inData numToKeep maxMoveEdgeDist True alternate 0 (min (snd6 inGraph) (snd6 $ head annealedGraphs)) [] annealedGraphs numLeaves leafGraph leafDecGraph leafGraphSoftWired charInfoVV doIA inGraphNetPenaltyFactor Nothing - - -- swap "all" after steepest descent - (swappedGraphs', counter') = swapAll' swapType inGS inData numToKeep maxMoveEdgeDist True alternate counter (snd6 $ head swappedGraphs) [] swappedGraphs numLeaves leafGraph leafDecGraph leafGraphSoftWired charInfoVV doIA inGraphNetPenaltyFactor Nothing - - uniqueGraphs = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (inGraph : swappedGraphs') - in - -- trace ("Steepest SSPRTBR: " ++ (show (length swappedGraphs, counter))) - --trace ("AC:" ++ (show $ fmap snd6 $ concat annealedGraphs') ++ " -> " ++ (show $ fmap snd6 $ swappedGraphs')) ( - - -- this Bool for Genetic Algorithm mutation step - if not returnMutated then (uniqueGraphs, counter' + (sum anealedCounter)) - else (annealedGraphs, counter' + (sum anealedCounter)) - -- ) - -- ) - - --- | swapAll' performs branch swapping on all 'break' edges and all readditions --- this not a "map" version to reduce memory footprint to a more mangeable level --- "steepest" a passable option to short circuit readdition action to return immediately --- if better graph found --- 1) takes first graph --- 2) if steepeast checks to make sure <= current best cost --- 3) gets list of "break-able" edges --- all non-root edges if tree --- all non-root bridge edges if network --- 4) send list (and other info) to split-join function --- goes on if empty list returned or > current best --- add graphs todo list if == current best cost --- 5) returns all of minimum cost found --- if Alternate then when found better do SPR first then TBR - -- assumes SPR done before "alternate" entering so can star with TBR and iff get better - -- go back to SPR. NBest for "steepest" descent -swapAll' :: String - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Bool - -> Int - -> VertexCost - -> [PhylogeneticGraph] - -> [PhylogeneticGraph] - -> Int - -> SimpleGraph - -> DecoratedGraph - -> DecoratedGraph - -> V.Vector (V.Vector CharInfo) - -> Bool - -> VertexCost - -> Maybe SAParams - -> ([PhylogeneticGraph], Int) -swapAll' swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate counter curBestCost curSameBetterList inGraphList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams = - -- trace (" In cost " ++ (show curBestCost) ++ (" " ++ swapType)) ( - if null inGraphList then - -- trace (" Out cost " ++ (show curBestCost) ++ (" " ++ swapType)) - (take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] curSameBetterList, counter) - else - let firstGraph = head inGraphList - firstDecoratedGraph = thd6 firstGraph - (firstRootIndex, _) = head $ LG.getRoots firstDecoratedGraph - - -- determine edges to break on--'bridge' edges only for network - -- filter out edges from root since no use--would just rejoin - -- sort longest edge to shortest--option to speeed up steepest nd conditions for all as well - breakEdgeList = if (graphType inGS) == Tree || LG.isTree firstDecoratedGraph then GO.sortEdgeListByLength $ filter ((/= firstRootIndex) . fst3) $ LG.labEdges firstDecoratedGraph - else GO.sortEdgeListByLength $ filter ((/= firstRootIndex) . fst3) $ LG.getEdgeSplitList firstDecoratedGraph - - -- perform split and rejoin on each edge in first graph - newGraphList' = - -- trace ("List of edges to break:" ++ (show $ fmap LG.toEdge breakEdgeList)) - splitJoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curSameBetterList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams firstGraph breakEdgeList breakEdgeList - - -- get best return graph list-can be empty if nothing better ort smame cost - newGraphList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList' - - newMinCost = if (not . null) newGraphList then minimum $ fmap snd6 newGraphList - else infinity - - in - -- trace ("Curent min cost: " ++ (show (newMinCost, curBestCost))) ( - -- found better cost graph - if newMinCost < curBestCost then - traceNoLF ("\t->" ++ (show newMinCost)) ( - -- for alternarte do SPR first then TBR - if alternate then - let (sprList, _) = swapAll' "spr" inGS inData numToKeep maxMoveEdgeDist steepest alternate (counter + 1) newMinCost newGraphList (newGraphList ++ (tail inGraphList)) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams - sprMinCost = min curBestCost ((snd6 . head) sprList) - - in - swapAll' "tbr" inGS inData numToKeep maxMoveEdgeDist steepest alternate (counter + 2) sprMinCost sprList (sprList ++ (tail inGraphList)) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams - else swapAll' swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate (counter + 1) newMinCost newGraphList (newGraphList ++ (tail inGraphList)) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams - ) - - -- found only worse graphs--never happens due to the way splitjoin returns only better or equal - else if newMinCost > curBestCost then - -- trace ("Worse " ++ (show newMinCost)) ( - let newCurSameBetterList = GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (curSameBetterList ++ newGraphList) - in - swapAll' swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate (counter + 1) curBestCost newCurSameBetterList (tail inGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams - -- ) - - -- found same cost graphs - else - -- trace ("Equal " ++ (show newMinCost)) ( - let newCurSameBetterList = GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (curSameBetterList ++ newGraphList) - ( _, newNovelGraphList) = unzip $ filter ((== True) .fst) $ zip (fmap (GO.isNovelGraph (curSameBetterList ++ (tail inGraphList))) newGraphList) newGraphList - -- newNovelGraphList = newGraphList L.\\ curSameBetterList - graphsToDo = (tail inGraphList) ++ newNovelGraphList - -- graphsToDo' = if length graphsToDo >= (numToKeep - 1) then (tail inGraphList) - -- else graphsToDo - --graphsToDo' = (tail inGraphList) - in - -- trace ("Num in best: " ++ (show $ length curSameBetterList) ++ " Num to do: " ++ (show $ length graphsToDo) ++ " from: " ++ (show (length newNovelGraphList, length newGraphList))) - swapAll' swapType inGS inData numToKeep maxMoveEdgeDist steepest alternate (counter + 1) curBestCost newCurSameBetterList graphsToDo numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams - -- ) - -- ) - --- | splitJoinGraph splits a graph on a single input edge (recursively though edge list) and rejoins to all possible other edges --- if steepest == True then returns on finding a better graph (lower cost) --- this will traverse entire SPR neighbohood if nothing better found (or steepest == False) --- different from swapALL (original) in that it doesn't build up split list so lower memory footprint --- breakEdgeList Complete keeps original edge list so can create readdition edge lists more easily --- parallel map on rejoin if not steepest, if steepest do number of parallel threads so can reurn if any one is better --- NB -- need to verify NNI/SPR/TBR rearrangement numbers --- assumes break edges are bridge edges --- graph split into two peices "base" graph with original root and "pruned" graph that was split off. --- the edge connecting the two is (originalConnectionOfPruned -> prunedGraphRootIndex) --- the edges in pruned graph do not contaiun that edge since are enumerated via preorder pass from prunedGraphRootIndex --- this is the edge that is reconnected when graphs are joined, it is often delted and rejoined to update info and to deal with --- conditions where the pruned graph is a single terminal -splitJoinGraph :: String - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> VertexCost - -> [PhylogeneticGraph] - -> Int - -> SimpleGraph - -> DecoratedGraph - -> DecoratedGraph - -> V.Vector (V.Vector CharInfo) - -> Bool - -> VertexCost - -> Maybe SAParams - -> PhylogeneticGraph - -> [LG.LEdge EdgeInfo] - -> [LG.LEdge EdgeInfo] - -> [PhylogeneticGraph] -splitJoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curSameBetterList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams firstGraph breakEdgeListComplete breakEdgeList = - if null breakEdgeList then curSameBetterList - else - -- split on first input edge - let edgeToBreakOn = head breakEdgeList - - -- split input graph into a part with the original root ("base") and the "pruned" graph -- the piece split off w/o original root - (splitGraph, graphRoot, prunedGraphRootIndex, originalConnectionOfPruned) = LG.splitGraphOnEdge (thd6 firstGraph) edgeToBreakOn - - -- reoptimize split graph for re-addition heuristics - (reoptimizedSplitGraph, splitCost) = reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor splitGraph graphRoot prunedGraphRootIndex - - -- get root in base (for readdition) and edges in pruned section for rerooting during TBR readdition - (_, edgesInPrunedGraph) = LG.nodesAndEdgesAfter splitGraph [(originalConnectionOfPruned, fromJust $ LG.lab splitGraph originalConnectionOfPruned)] - - edgesInBaseGraph = breakEdgeListComplete L.\\ (edgeToBreakOn : edgesInPrunedGraph) - - -- determine those edges within distance of original if limited (ie NNI etc) - rejoinEdges = if maxMoveEdgeDist >= ((maxBound :: Int) `div` 3) then edgesInBaseGraph - else take maxMoveEdgeDist $ (LG.sortEdgeListByDistance splitGraph [graphRoot] [graphRoot]) - - -- rejoin graph to all possible edges in base graph - newGraphList = - {- - trace ("Edge to break on:" ++ (show $ LG.toEdge edgeToBreakOn) - ++ "\nBase graph edges: " ++ (show $ fmap LG.toEdge edgesInBaseGraph) - ++ "\nPruned graph edges: " ++ (show $ fmap LG.toEdge edgesInPrunedGraph) - ++ "\nTarget edges to rejoin: " ++ (show $ fmap LG.toEdge rejoinEdges) - ++ "\nFull edgelist: " ++ (show $ fmap LG.toEdge breakEdgeListComplete)) - -} - rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost [] doIA netPenaltyFactor reoptimizedSplitGraph (GO.convertDecoratedToSimpleGraph splitGraph) splitCost graphRoot prunedGraphRootIndex originalConnectionOfPruned rejoinEdges edgesInPrunedGraph charInfoVV inSimAnnealParams - - newGraphList' = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] newGraphList - in - -- only returns graphs if same of better else empty - if (not . null) newGraphList then newGraphList' - else splitJoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curSameBetterList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealParams firstGraph breakEdgeListComplete (tail breakEdgeList) - --- | rejoinGraphTuple is a wrapper around rejoinGraph for fmapping -rejoinGraphTuple :: String - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> VertexCost - -> [PhylogeneticGraph] - -> Bool - -> V.Vector (V.Vector CharInfo) - -> Maybe SAParams - -> (DecoratedGraph, SimpleGraph, VertexCost, LG.Node,LG.Node, LG.Node, [LG.LEdge EdgeInfo], [LG.LEdge EdgeInfo], VertexCost) - -> [PhylogeneticGraph] -rejoinGraphTuple swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curBestGraphs doIA charInfoVV inSimAnnealParams (reoptimizedSplitGraph, splitGraphSimple, splitGraphCost, graphRoot, prunedGraphRootIndex, originalConnectionOfPruned, rejoinEdges, edgesInPrunedGraph, netPenaltyFactor) = - rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curBestGraphs doIA netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned rejoinEdges edgesInPrunedGraph charInfoVV inSimAnnealParams - --- | rejoinGraph rejoins a split graph at all edges (if not steepest and found better) --- in "base" graph. --- if not steepest then do all as map, else recursive on base graph edge list -rejoinGraph :: String - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> VertexCost - -> [PhylogeneticGraph] - -> Bool - -> VertexCost - -> DecoratedGraph - -> SimpleGraph - -> VertexCost - -> LG.Node - -> LG.Node - -> LG.Node - -> [LG.LEdge EdgeInfo] - -> [LG.LEdge EdgeInfo] - -> V.Vector (V.Vector CharInfo) - -> Maybe SAParams - -> [PhylogeneticGraph] -rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curBestGraphs doIA netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned rejoinEdges edgesInPrunedGraph charInfoVV inSimAnnealParams = - - -- found no better--but return equal cost graphs - -- trace ("In rejoinGraph with num rejoining edges: " ++ (show $ length rejoinEdges)) ( - if null rejoinEdges then curBestGraphs - - -- check if split graph cost same as graph then return since graph can only get longer on readdition - else if splitGraphCost >= curBestCost then [] - - else - -- fmap over all edges in base graph - if not steepest then - let -- rejoinGraphList = concatMap (singleJoin swapType steepest inGS inData reoptimizedSplitGraph splitGraphSimple splitGraphCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph) rejoinEdges `using` PU.myParListChunkRDS - rejoinGraphList = concat $ PU.seqParMap rdeepseq (singleJoin swapType steepest inGS inData reoptimizedSplitGraph splitGraphSimple splitGraphCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph) rejoinEdges - - {-Checking only min but seems to make slower - newMinCost = if null rejoinGraphList then infinity - else minimum $ fmap snd rejoinGraphList - (minEstCostNewGraphList, _) = unzip $ filter ((== newMinCost) . snd) rejoinGraphList - -} - - -- newGraphList = fmap (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (fmap fst rejoinGraphList) `using` PU.myParListChunkRDS - newGraphList' = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] rejoinGraphList -- newGraphList - in - -- will only return graph if <= curBest cost - if null rejoinGraphList then [] - else if (snd6 . head) newGraphList' <= curBestCost then newGraphList' - else [] - - -- famp over number of threads edges in base graph - -- then recurse - else - -- trace ("In steepest: " ++ (show PU.getNumThreads) ++ " " ++ (show $ length $ take PU.getNumThreads rejoinEdges)) ( - let -- this could be made a little paralle--but if lots of threads basically can do all - numGraphsToExamine = PU.getNumThreads - rejoinEdgeList = take numGraphsToExamine rejoinEdges - --rejoinGraphList = concatMap (singleJoin swapType steepest inGS inData reoptimizedSplitGraph splitGraphSimple splitGraphCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph) rejoinEdgeList `using` PU.myParListChunkRDS - rejoinGraphList = concat $ PU.seqParMap rdeepseq (singleJoin swapType steepest inGS inData reoptimizedSplitGraph splitGraphSimple splitGraphCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph) rejoinEdgeList - - {--Checking only min but seems to make slower - newMinCost = if null rejoinGraphList then infinity - else minimum $ fmap snd rejoinGraphList - (minEstCostNewGraphList, _) = unzip $ filter ((== newMinCost) . snd) rejoinGraphList - -} - - -- newGraphList = fmap (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (fmap fst rejoinGraphList) `using` PU.myParListChunkRDS - newGraphList' = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] rejoinGraphList -- newGraphList - in - -- found nothing better or equal - if null rejoinGraphList then - -- trace ("In steepest worse: " ++ (show $ length (drop PU.getNumThreads rejoinEdges))) - rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curBestGraphs doIA netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned (drop numGraphsToExamine rejoinEdges) edgesInPrunedGraph charInfoVV inSimAnnealParams - - -- found better graph - else if (snd6 . head) newGraphList' < curBestCost then - -- trace ("Steepest better") - newGraphList' - - -- found equal cost graph - else if (snd6 . head) newGraphList' == curBestCost then - let newBestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (curBestGraphs ++ newGraphList') - in - rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost newBestList doIA netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned (drop numGraphsToExamine rejoinEdges) edgesInPrunedGraph charInfoVV inSimAnnealParams - -- found worse graphs only - else - -- trace ("In steepest worse (after recalculation): " ++ (show $ length (drop PU.getNumThreads rejoinEdges))) - rejoinGraph swapType inGS inData numToKeep maxMoveEdgeDist steepest curBestCost curBestGraphs doIA netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned (drop numGraphsToExamine rejoinEdges) edgesInPrunedGraph charInfoVV inSimAnnealParams - -- )) - --- | singleJoin takes optimized split graph, split cost, target edge, swap type (ie TBR/SPR/NNI) --- and "rejoins" the split graph to a single graph--creates joined graph and calculates a heuristic graph cost --- based on the union assignment of the edge and its distance to the root vertex of the pruned graph --- if TBR checks all edges in pruned graph with readdition edge (shortcircuits if steepest == True) --- always deletes connecting edge to pruned part and readds--this because sometimes it is there and sometimes not (depending on --- if SPR for terminal etc) and can create parallel edges with different weights (0.0 or not) so just remove to be sure. --- TBR uses dynamic epsilon even in SPR moves--SPR does not -singleJoin :: String - -> Bool - -> GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> SimpleGraph - -> VertexCost - -> Bool - -> LG.Node - -> LG.Node - -> V.Vector (V.Vector CharInfo) - -> VertexCost - -> [LG.LEdge EdgeInfo] - -> LG.LEdge EdgeInfo - -> [PhylogeneticGraph] -singleJoin swapType steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph targetEdge@(u,v, _) = - -- trace ("Rejoinging: " ++ (show $ LG.toEdge targetEdge)) ( - let newEdgeList = [(u, originalConnectionOfPruned, 0.0),(originalConnectionOfPruned, v, 0.0),(originalConnectionOfPruned, prunedGraphRootIndex, 0.0)] - targetEdgeData = M.makeEdgeData doIA splitGraph charInfoVV targetEdge - - --this for SPR/NNI only - prunedRootVertexData = vertData $ fromJust $ LG.lab splitGraph prunedGraphRootIndex - - sprReJoinCost = edgeJoinDelta doIA charInfoVV prunedRootVertexData targetEdgeData - - sprNewGraph = LG.insEdges newEdgeList $ LG.delEdges [(u,v),(originalConnectionOfPruned, prunedGraphRootIndex)] splitGraphSimple - - -- here when needed - rediagnosedSPRGraph = T.multiTraverseFullyLabelGraph inGS inData False False Nothing sprNewGraph - - - in - if originalConnectionOfPruned `elem` [u,v] then [] - - -- SPR or no TBR rearrangements - else if (swapType == "spr") || ((length edgesInPrunedGraph) < 4) then - if (sprReJoinCost + splitCost) <= curBestCost then - if (graphType inGS /= Tree) && ((not . LG.isGraphTimeConsistent) sprNewGraph) then [] - else if snd6 rediagnosedSPRGraph <= curBestCost then [rediagnosedSPRGraph] - else [] - else [] - - else -- TBR - -- Filter for bridge edges - let edgesInPrunedGraph' = if (graphType inGS == Tree) || LG.isTree splitGraphSimple then edgesInPrunedGraph - else fmap fst $ filter ((== True) . snd) $ zip edgesInPrunedGraph (fmap (LG.isBridge splitGraphSimple) (fmap LG.toEdge edgesInPrunedGraph)) - in - -- check SPR first if steepest - if steepest && (sprReJoinCost + splitCost) <= (curBestCost * (dynamicEpsilon inGS)) then - if (graphType inGS /= Tree) && ((not . LG.isGraphTimeConsistent) sprNewGraph) then [] - else if snd6 rediagnosedSPRGraph <= curBestCost then [rediagnosedSPRGraph] - else - -- do TBR stuff - tbrJoin steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph' targetEdge - else - -- do TBR stuff adding SPR results if heuristic better - let sprResult = if (sprReJoinCost + splitCost) <= curBestCost then - if (graphType inGS /= Tree) && ((not . LG.isGraphTimeConsistent) sprNewGraph) then [] - else if snd6 rediagnosedSPRGraph <= curBestCost then [rediagnosedSPRGraph] - else [] - else [] - in - sprResult ++ tbrJoin steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph' targetEdge - -- ) - --- | edgeJoinDelta calculates heuristic cost for jopineing pair edges -edgeJoinDelta :: Bool -> V.Vector (V.Vector CharInfo) -> VertexBlockData -> VertexBlockData -> VertexCost -edgeJoinDelta doIA charInfoVV edgeA edgeB = - if (not doIA) then V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks edgeA edgeB charInfoVV [] - else V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocksStaticIA edgeA edgeB charInfoVV [] - - --- | tbrJoin performs TBR rearrangements on pruned graph component --- "reroots" pruned graph on each bridge edge and tries join to --- target edge as in SPR --- each edge is tried in turn (except for original root edge covered by singleJoin SPR function) --- if heuristic edge join cost is below current best cost then the component is rerooted, joined to --- target edge and graph fully diagnosed to verify cost --- "steepest" short circuits checking edges to return better verified cost graph immediately --- otherwise ("all") returns all graphs better than or equal to current better cost --- if nothing equal or better found returns empty list --- tests if reroot edges are bridge edges --- uses dynamic epsilon--seems the delta estimate is high -tbrJoin :: Bool - -> GlobalSettings - -> ProcessedData - -> DecoratedGraph - -> SimpleGraph - -> VertexCost - -> Bool - -> LG.Node - -> LG.Node - -> V.Vector (V.Vector CharInfo) - -> VertexCost - -> [LG.LEdge EdgeInfo] - -> LG.LEdge EdgeInfo - -> [PhylogeneticGraph] -tbrJoin steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost edgesInPrunedGraph targetEdge = - if null edgesInPrunedGraph then [] - else - -- get target edge data - let targetEdgeData = M.makeEdgeData doIA splitGraph charInfoVV targetEdge - in - - if not steepest then - -- get heuristic delta joins for edges in pruned graph - let rerootEdgeList = filter ((/= prunedGraphRootIndex) . fst3) $ filter ((/= originalConnectionOfPruned) . fst3) edgesInPrunedGraph - rerootEdgeDataList = PU.seqParMap rdeepseq (M.makeEdgeData doIA splitGraph charInfoVV) rerootEdgeList - rerootEdgeDeltaList = fmap (+ splitCost) $ PU.seqParMap rdeepseq (edgeJoinDelta doIA charInfoVV targetEdgeData) rerootEdgeDataList - - -- check for possible better/equal graphs and verify - candidateEdgeList = fmap fst $ filter ((<= (curBestCost * (dynamicEpsilon inGS))) . snd) (zip rerootEdgeList rerootEdgeDeltaList) - candidateJoinedGraphList = PU.seqParMap rdeepseq (rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned targetEdge) candidateEdgeList - redignosedGraphList = filter ((<= curBestCost) . snd6) $ PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) candidateJoinedGraphList - - -- for debugging - -- allRediagnosedList = PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (PU.seqParMap rdeepseq (rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned targetEdge) rerootEdgeList) - - in - {- - trace ("TBR All equal/better: " ++ (show curBestCost) ++ " " ++ (show rerootEdgeDeltaList) ++ " -> " - ++ (show $ fmap snd6 allRediagnosedList) - ++ " " ++ (show (length rerootEdgeList, length candidateEdgeList, length redignosedGraphList))) ( - -} - if null candidateEdgeList then [] - else if null redignosedGraphList then [] - else redignosedGraphList - -- ) - else - -- trace ("TBR steepest") ( - -- get steepest edges - let numEdgesToExamine = PU.getNumThreads - firstSetEdges = take numEdgesToExamine edgesInPrunedGraph - - -- get heuristic delta joins for steepest edge set - rerootEdgeList = filter ((/= prunedGraphRootIndex) . fst3) $ filter ((/= originalConnectionOfPruned) . fst3) firstSetEdges - rerootEdgeDataList = PU.seqParMap rdeepseq (M.makeEdgeData doIA splitGraph charInfoVV) rerootEdgeList - rerootEdgeDeltaList = fmap (+ splitCost) $ PU.seqParMap rdeepseq (edgeJoinDelta doIA charInfoVV targetEdgeData) rerootEdgeDataList - - -- check for possible better/equal graphs and verify - candidateEdgeList = fmap fst $ filter ((<= (curBestCost * (dynamicEpsilon inGS))) . snd) (zip rerootEdgeList rerootEdgeDeltaList) - candidateJoinedGraphList = PU.seqParMap rdeepseq (rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned targetEdge) candidateEdgeList - redignosedGraphList = filter ((<= curBestCost) . snd6) $ PU.seqParMap rdeepseq (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) candidateJoinedGraphList-- get - - in - if null candidateEdgeList then - tbrJoin steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost (drop numEdgesToExamine edgesInPrunedGraph) targetEdge - else if null redignosedGraphList then - tbrJoin steepest inGS inData splitGraph splitGraphSimple splitCost doIA prunedGraphRootIndex originalConnectionOfPruned charInfoVV curBestCost (drop numEdgesToExamine edgesInPrunedGraph) targetEdge - else redignosedGraphList - -- ) - - --- | rerootPrunedAndMakeGraph reroots the pruned graph component on the rerootEdge and joins to base gaph at target edge -rerootPrunedAndMakeGraph :: SimpleGraph -> LG.Node -> LG.Node -> LG.LEdge EdgeInfo -> LG.LEdge EdgeInfo -> SimpleGraph -rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned (u,v, _) rerootEdge = - -- get edges to delete and edges to add - let (prunedEdgesToAdd, prunedEdgesToDelete) = getTBREdgeEditsSimple splitGraphSimple prunedGraphRootIndex rerootEdge - - -- edges to connect rerooted pruned component and base graph - connectingEdges = [(u, originalConnectionOfPruned, 0.0),(originalConnectionOfPruned, v, 0.0),(originalConnectionOfPruned, prunedGraphRootIndex, 0.0)] - - tbrNewGraph = LG.insEdges (connectingEdges ++ prunedEdgesToAdd) $ LG.delEdges ([(u,v),(originalConnectionOfPruned, prunedGraphRootIndex)] ++ prunedEdgesToDelete) splitGraphSimple - in - tbrNewGraph - --- | getTBREdgeEditsSimple takes and edge and returns the list of edit to pruned subgraph --- as a pair of edges to add and those to delete --- since reroot edge is directed (e,v), edges away from v will have correct --- orientation. Edges between 'e' and the root will have to be flipped --- original root edges and reroort edge are deleted and new root and edge spanning orginal root created --- delete original connection edge and creates a new one--like SPR --- returns ([add], [delete]) -getTBREdgeEditsSimple :: (Show a) => SimpleGraph -> LG.Node -> LG.LEdge a -> ([LG.LEdge Double],[LG.Edge]) -getTBREdgeEditsSimple inGraph prunedGraphRootIndex rerootEdge = - --trace ("Getting TBR Edits for " ++ (show rerootEdge)) ( - let -- originalRootEdgeNodes = LG.descendants inGraph prunedGraphRootIndex - originalRootEdges = LG.out inGraph prunedGraphRootIndex - - -- get path from new root edge fst vertex to orginal root and flip those edges - -- since (u,v) is u -> v u "closer" to root - closerToPrunedRootEdgeNode = (fst3 rerootEdge, fromJust $ LG.lab inGraph $ fst3 rerootEdge) - (nodesInPath, edgesinPath) = LG.postOrderPathToNode inGraph closerToPrunedRootEdgeNode (prunedGraphRootIndex, fromJust $ LG.lab inGraph prunedGraphRootIndex) - - -- don't want original root edges to be flipped since later deleted - edgesToFlip = edgesinPath L.\\ originalRootEdges - flippedEdges = fmap LG.flipLEdge edgesToFlip - - -- new edges on new root position and spanning old root - -- add in closer vertex to root to make sure direction of edge is correct - newEdgeOnOldRoot = if (snd3 $ head originalRootEdges) `elem` ((fst3 rerootEdge) : (fmap fst nodesInPath)) then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, 0.0) - else (snd3 $ last originalRootEdges, snd3 $ head originalRootEdges, 0.0) - newRootEdges = [(prunedGraphRootIndex, fst3 rerootEdge, 0.0 ),(prunedGraphRootIndex, snd3 rerootEdge, 0.0)] - - in - -- assumes we are not checking original root - -- rerooted - -- delete orignal root edges and rerootEdge - -- add new root edges - -- and new edge on old root--but need orientation - -- flip edges from new root to old (delete and add list) - {- - trace ("\n\nIn Graph:\n"++ (LG.prettyIndices inGraph) ++ "\nTBR Edits: " ++ (show (LG.toEdge rerootEdge, prunedGraphRootIndex)) - ++ " NewEdgeOldRoot: " ++ (show $ LG.toEdge newEdgeOnOldRoot) - ++ " New rootEdges: " ++ (show $ fmap LG.toEdge newRootEdges) - ) - -} - -- ++ "\nEdges to add: " ++ (show $ fmap LG.toEdge $ newEdgeOnOldRoot : (flippedEdges ++ newRootEdges)) ++ "\nEdges to delete: " ++ (show $ rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges)))) - (newEdgeOnOldRoot : (flippedEdges ++ newRootEdges), LG.toEdge rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges))) - -- ) - --- | reoptimizeSplitGraphFromVertex fully labels the component graph that is connected to the specified vertex --- retuning that graph with 2 optimized components and their cost --- both components goo through multi-traversal optimizations --- doIA option to only do IA optimization as opposed to full thing--should be enormously faster--but yet more approximate --- creates final for both base graph and priunned component due to rerooting non-concordance of preorder and post order assignments --- terminology bse graph is the component with the original root, pruned that which has been removed form the original --- graph to be readded to edge set --- The function - -- 1) optimizes two components seprately from their "root" - -- 2) takes nodes and edges for each and cretes new graph - -- 3) returns graph and summed cost of two components - -- 4) adds in root and netPenalty factor estimates since net penalty can only be calculated on full graph - -- part of this is turning off net penalty cost when optimizing base and pruned graph components --- if doIA is TRUE then call function that onl;y optimizes the IA assignments on the "original graph" after split. --- this keeps teh IA chracters in sync across the two graphs -reoptimizeSplitGraphFromVertex :: GlobalSettings - -> ProcessedData - -> Bool - -> VertexCost - -> DecoratedGraph - -> Int - -> Int - -> (DecoratedGraph, VertexCost) -reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex = - if doIA then - -- only reoptimize the IA states for dynamic characters - reoptimizeSplitGraphFromVertexIA inGS inData netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex - else - -- perform full optimizations of nodes - -- these required for full optimization - let nonExactCharacters = U.getNumberSequenceCharacters (thd3 inData) - origGraph = inSplitGraph -- thd6 origPhyloGraph - leafGraph = LG.extractLeafGraph origGraph - calcBranchLengths = False - - -- create simple graph version of split for post order pass - splitGraphSimple = GO.convertDecoratedToSimpleGraph inSplitGraph - - - -- create optimized base graph - -- False for staticIA - (postOrderBaseGraph, _) = T.generalizedGraphPostOrderTraversal (inGS {graphFactor = NoNetworkPenalty}) nonExactCharacters inData leafGraph False (Just startVertex) splitGraphSimple - - - fullBaseGraph = PRE.preOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) (finalAssignment inGS) False calcBranchLengths (nonExactCharacters > 0) startVertex True postOrderBaseGraph - - -- create fully optimized pruned graph. Post order tehn preorder - - -- get root node of pruned graph--parent since that is the full pruned piece (keeping that node for addition to base graph and edge creation) - startPrunedNode = (prunedSubGraphRootVertex, fromJust $ LG.lab origGraph prunedSubGraphRootVertex) - startPrunedParentNode = head $ LG.labParents origGraph prunedSubGraphRootVertex - startPrunedParentEdge = (fst startPrunedParentNode, prunedSubGraphRootVertex, dummyEdge) - - - -- False for staticIA - (postOrderPrunedGraph, _) = T.generalizedGraphPostOrderTraversal (inGS {graphFactor = NoNetworkPenalty}) nonExactCharacters inData leafGraph False (Just prunedSubGraphRootVertex) splitGraphSimple - - - -- False for staticIA - fullPrunedGraph = PRE.preOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) (finalAssignment inGS) False calcBranchLengths (nonExactCharacters > 0) prunedSubGraphRootVertex True postOrderPrunedGraph - - -- get root node of base graph - startBaseNode = (startVertex, fromJust $ LG.lab (thd6 fullBaseGraph) startVertex) - - - - -- get nodes and edges in base and pruned graph (both PhylogeneticGrapgs so thd6) - (baseGraphNonRootNodes, baseGraphEdges) = LG.nodesAndEdgesAfter (thd6 fullBaseGraph) [startBaseNode] - - (prunedGraphNonRootNodes, prunedGraphEdges) = if LG.isLeaf origGraph prunedSubGraphRootVertex then ([],[]) - else LG.nodesAndEdgesAfter (thd6 fullPrunedGraph) [startPrunedNode] - - -- make fully optimized graph from base and split components - fullSplitGraph = LG.mkGraph ([startBaseNode, startPrunedNode, startPrunedParentNode] ++ baseGraphNonRootNodes ++ prunedGraphNonRootNodes) (startPrunedParentEdge : (baseGraphEdges ++ prunedGraphEdges)) - - -- cost of split graph to be later combined with re-addition delta for heuristic graph cost - prunedCost = if LG.isLeaf origGraph prunedSubGraphRootVertex then 0 - else snd6 fullPrunedGraph - splitGraphCost = ((1.0 + netPenaltyFactor) * ((snd6 fullBaseGraph) + prunedCost)) - - in - - {- - trace ("Orig graph cost " ++ (show $ subGraphCost $ fromJust $ LG.lab origGraph startVertex) ++ " Base graph cost " ++ (show $ snd6 fullBaseGraph) ++ " pruned subgraph cost " ++ (show prunedCost) ++ " at node " ++ (show prunedSubGraphRootVertex) ++ " parent " ++ (show $ fst startPrunedParentNode) - - ++ "\nBaseGraphNodes\n" ++ (show $ L.sort $ fmap fst baseGraphNonRootNodes) ++ "\nPruned nodes from root: " ++ "\n" ++ (show $ fmap fst $ startPrunedNode : prunedGraphNonRootNodes) - ++ "\nSplit Graph\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph fullSplitGraph) - ++ "\nOrig graph:\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph origGraph)) - -} - --trace ("reoptimizeSplitGraphFromVertex: " ++ (show splitGraphCost)) - (fullSplitGraph, splitGraphCost) - --- | reoptimizeSplitGraphFromVertexTuple wrapper for reoptimizeSplitGraphFromVertex with last 3 args as tuple -reoptimizeSplitGraphFromVertexTuple :: GlobalSettings - -> ProcessedData - -> Bool - -> VertexCost - -> (DecoratedGraph, Int , Int) - -> (DecoratedGraph, VertexCost) -reoptimizeSplitGraphFromVertexTuple inGS inData doIA netPenaltyFactor (inSplitGraph, startVertex, prunedSubGraphRootVertex) = - reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex - - --- | reoptimizeSplitGraphFromVertexIA performs operations of reoptimizeSplitGraphFromVertex for static charcaters --- but dynamic characters--only update IA assignments and initialized from origPhylo graph (at leaves) to keep IA characters in sync --- since all "static" only need single traversal post order pass -reoptimizeSplitGraphFromVertexIA :: GlobalSettings - -> ProcessedData - -> VertexCost - -> DecoratedGraph - -> Int - -> Int - -> (DecoratedGraph, VertexCost) -reoptimizeSplitGraphFromVertexIA inGS inData netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex = - --if graphType inGS /= Tree then error "Networks not yet implemented in reoptimizeSplitGraphFromVertexIA" - --else - let nonExactCharacters = U.getNumberSequenceCharacters (thd3 inData) - origGraph = inSplitGraph -- thd6 origPhyloGraph - - -- create leaf graphs--but copy IA final to prelim - leafGraph = GO.copyIAFinalToPrelim $ LG.extractLeafGraph origGraph - calcBranchLengths = False - - -- create simple graph version of split for post order pass - splitGraphSimple = GO.convertDecoratedToSimpleGraph inSplitGraph - - --Create base graph - -- create postorder assignment--but only from single traversal - -- True flag fior staticIA - postOrderBaseGraph = T.postOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) inData leafGraph True (Just startVertex) splitGraphSimple - baseGraphCost = snd6 postOrderBaseGraph - - -- True flag fior staticIA - fullBaseGraph = PRE.preOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) (finalAssignment inGS) True calcBranchLengths (nonExactCharacters > 0) startVertex True postOrderBaseGraph - - {- - localRootCost = if (rootCost inGS) == NoRootCost then 0.0 - else if (rootCost inGS) == Wheeler2015Root then T.getW15RootCost inGS postOrderBaseGraph - else error ("Root cost type " ++ (show $ rootCost inGS) ++ " is not yet implemented") - -} - - -- get root node of base graph - startBaseNode = (startVertex, fromJust $ LG.lab (thd6 fullBaseGraph) startVertex) - - --Create pruned graph - -- get root node of pruned graph--parent since that is the full pruned piece (keeping that node for addition to base graph and edge creation) - startPrunedNode = GO.makeIAPrelimFromFinal (prunedSubGraphRootVertex, fromJust $ LG.lab origGraph prunedSubGraphRootVertex) - startPrunedParentNode = head $ LG.labParents origGraph prunedSubGraphRootVertex - startPrunedParentEdge = (fst startPrunedParentNode, prunedSubGraphRootVertex, dummyEdge) - - - -- True flag fior staticIA - postOrderPrunedGraph = T.postOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) inData leafGraph True (Just prunedSubGraphRootVertex) splitGraphSimple - prunedGraphCost = snd6 postOrderPrunedGraph - - -- True flag fior staticIA - fullPrunedGraph = PRE.preOrderTreeTraversal (inGS {graphFactor = NoNetworkPenalty}) (finalAssignment inGS) True calcBranchLengths (nonExactCharacters > 0) prunedSubGraphRootVertex True postOrderPrunedGraph - - -- get nodes and edges in base and pruned graph (both PhylogeneticGrapgs so thd6) - (baseGraphNonRootNodes, baseGraphEdges) = LG.nodesAndEdgesAfter (thd6 fullBaseGraph) [startBaseNode] - - (prunedGraphNonRootNodes, prunedGraphEdges) = if LG.isLeaf origGraph prunedSubGraphRootVertex then ([],[]) - else LG.nodesAndEdgesAfter (thd6 fullPrunedGraph) [startPrunedNode] - - -- make fully optimized graph from base and split components - fullSplitGraph = LG.mkGraph ([startBaseNode, startPrunedNode, startPrunedParentNode] ++ baseGraphNonRootNodes ++ prunedGraphNonRootNodes) (startPrunedParentEdge : (baseGraphEdges ++ prunedGraphEdges)) - - splitGraphCost = ((1.0 + netPenaltyFactor) * (baseGraphCost + prunedGraphCost)) - - in - -- remove when working - -- trace ("ROGFVIA split costs:" ++ (show (baseGraphCost, prunedGraphCost, localRootCost)) ++ " -> " ++ (show splitGraphCost)) ( - if splitGraphCost == 0 then - error ("Split costs:" ++ (show (baseGraphCost, prunedGraphCost)) ++ " -> " ++ (show splitGraphCost) - ++ " Split graph simple:\n" ++ (LG.prettify splitGraphSimple) - ++ "\nFull:\n" ++ (show inSplitGraph) - ++ "\nOriginal Graph:\n" ++ (show origGraph)) - else (fullSplitGraph, splitGraphCost) - -- ) - - -{- --- | rejoinGraphKeepBestTuple wrapper for rejoinGraphKeepBest but with last 5 arguments as a tuple -rejoinGraphKeepBestTuple :: GlobalSettings - -> String - -> Bool - -> VertexCost - -> Int - -> Bool - -> V.Vector (V.Vector CharInfo) - -> ((DecoratedGraph, VertexCost), LG.Node, LG.Node, LG.Node) - -> [(SimpleGraph, VertexCost)] -rejoinGraphKeepBestTuple inGS swapType hardwiredSPR curBestCost maxMoveEdgeDist doIA charInfoVV ((splitGraph, splitCost), prunedGraphRootIndex, nakedNode, originalSplitNode) = - rejoinGraphKeepBest inGS swapType hardwiredSPR curBestCost maxMoveEdgeDist doIA charInfoVV (splitGraph, splitCost) prunedGraphRootIndex nakedNode originalSplitNode - - --- | rejoinGraphKeepBest rejoins split trees on available edges (non-root, and not original split) --- if steepest is False does not sort order of edges, other wise sorts in order of closeness to original edge --- uses delta --- NNI sorts edges on propinquity taking first 2 edges --- TBR does the rerooting of pruned subtree --- originalConnectionOfPruned is the "naked" node that was creted when teh graph was split and will --- be used for the rejoin node in the middle of th einvaded edge -rejoinGraphKeepBest :: GlobalSettings - -> String - -> Bool - -> VertexCost - -> Int - -> Bool - -> V.Vector (V.Vector CharInfo) - -> (DecoratedGraph, VertexCost) - -> LG.Node - -> LG.Node - -> LG.Node - -> [(SimpleGraph, VertexCost)] -rejoinGraphKeepBest inGS swapType hardWiredSPR curBestCost maxMoveEdgeDist doIA charInfoVV (splitGraph, splitCost) prunedGraphRootIndex nakedNode originalSplitNode = - -- case where swap split retunred empty because too few nodes in remaining graph to add to - if LG.isEmpty splitGraph then [] - -- this more for fusing situations - else if splitCost > curBestCost then [] - else - let -- outgroupEdges = LG.out splitGraph graphRoot - (_, prunedSubTreeEdges) = LG.nodesAndEdgesAfter splitGraph [(nakedNode, fromJust $ LG.lab splitGraph nakedNode)] - - --only sort if limited egde rejoins - splitEdges = if (graphType inGS) == Tree then LG.labEdges splitGraph - else if LG.isTree splitGraph then LG.labEdges splitGraph - else GO.getEdgeSplitList splitGraph - edgesToInvade = if maxMoveEdgeDist >= ((maxBound :: Int) `div` 3) then splitEdges L.\\ prunedSubTreeEdges -- L.\\ (outgroupEdges ++ prunedSubTreeEdges) - else take maxMoveEdgeDist $ L.intersect splitEdges ((GO.sortEdgeListByDistance splitGraph [originalSplitNode] [originalSplitNode]) L.\\ prunedSubTreeEdges) - - prunedGraphRootNode = (prunedGraphRootIndex, fromJust $ LG.lab splitGraph prunedGraphRootIndex) - (nodesAfterPrunedRoot, edgesInPrunedSubGraph) = LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] - - -- pruned subgraph too small to reroot in TBR - onlySPR = length nodesAfterPrunedRoot < 3 - - - candidateEditList = concatMap (addSubGraph inGS swapType hardWiredSPR doIA splitGraph prunedGraphRootNode splitCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV) edgesToInvade `using` PU.myParListChunkRDS - - - minCandidateCost = if (not $ null candidateEditList) then minimum $ fmap fst3 candidateEditList - else infinity - in - -- trace ("RGKB: " ++ (show $ fmap LG.toEdge edgesToInvade) ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph splitGraph)) ( - if minCandidateCost > curBestCost then [] - else - let bestEdits = filter ((<= (curBestCost * (dynamicEpsilon inGS))). fst3) candidateEditList -- not minimum candidate cost--better if check all equal or better than curent best - splitGraphSimple = GO.convertDecoratedToSimpleGraph splitGraph - swapSimpleGraphList = fmap (applyGraphEdits splitGraphSimple) bestEdits - in - zip swapSimpleGraphList (L.replicate (length swapSimpleGraphList) minCandidateCost) - -- ) - --- | addSubTree "adds" a subtree back into an edge calculating the cost of the graph via the delta of the add and costs of the two components --- does NOT reoptimize candidate trees--happens after return since looking at "all" -addSubGraph :: GlobalSettings - -> String - -> Bool - -> Bool - -> DecoratedGraph - -> LG.LNode VertexInfo - -> VertexCost - -> LG.Node - -> Bool - -> [LG.LEdge EdgeInfo] - -> V.Vector (V.Vector CharInfo) - -> LG.LEdge EdgeInfo - -> [(VertexCost, [LG.LEdge Double], [LG.Edge])] -addSubGraph inGS swapType hardWiredSPR doIA inGraph prunedGraphRootNode splitCost parentPrunedRoot onlySPR edgesInPrunedSubGraph charInfoVV targetEdge@(eNode, vNode, _) = - let -- existingEdgeCost = minLength targetlabel - edge0 = (parentPrunedRoot, vNode, 0.0) - edge1 = (eNode, parentPrunedRoot, 0.0) - targetEdgeData = M.makeEdgeData doIA inGraph charInfoVV targetEdge - -- edge2 = (nakedNode, prunedGraphRootIndex, 0.0) - --newNode = (nakedNode, TL.pack ("HTU" ++ (show nakedNode))) - -- if subtree fewer than 3 leaves then can only do an SPR rearragement--no rerro0ts - -- prunedGraphRootNode = (prunedGraphRootIndex, fromJust $ LG.lab inGraph prunedGraphRootIndex) - -- (nodesAfterPrunedRoot, edgesInPrunedSubGraph) = LG.nodesAndEdgesAfter inGraph [prunedGraphRootNode] - -- onlySPR = length nodesAfterPrunedRoot < 3 - doSPR = if swapType == "spr" || swapType == "nni" || onlySPR then True - else False - subGraphEdgeVertDataTripleList = if doSPR then [((-1, -1), vertData $ snd prunedGraphRootNode, ([],[]))] - else if hardWiredSPR then take 2 $ getPrunedEdgeData (graphType inGS) doIA inGraph prunedGraphRootNode edgesInPrunedSubGraph charInfoVV - else getPrunedEdgeData (graphType inGS) doIA inGraph prunedGraphRootNode edgesInPrunedSubGraph charInfoVV - - -- get deltas and edges for TBR rerooting of pruned subgraph - deltaEdgeTripleList = fmap (getSubGraphDelta targetEdgeData doIA charInfoVV) subGraphEdgeVertDataTripleList `using` PU.myParListChunkRDS - - delta = minimum $ fmap fst3 deltaEdgeTripleList - - minDeltaEditList = fmap thd3 $ filter ((== delta) . fst3) deltaEdgeTripleList - - - -- get TBR edits if rerooting took place - tbrEdgeEditList = if length subGraphEdgeVertDataTripleList == 1 then [] - else minDeltaEditList - - (tbrEdgesToAddList, tbrEdgesToDeleteList) = unzip tbrEdgeEditList - - deltaCostList = replicate (length tbrEdgeEditList) (delta + splitCost) - edgesToAddList = zipWith (++) (replicate (length tbrEdgeEditList) [edge0, edge1]) tbrEdgesToAddList - edgesToDeleteList = zipWith (:) (replicate (length tbrEdgeEditList) (eNode, vNode)) tbrEdgesToDeleteList - - in - - -- do not redo origal edge so retun infinite cost and dummy edits - --trace ("ASG " ++ (show (length subGraphEdgeVertDataTripleList, delta, length tbrEdgeEditList, length deltaCostList, length edgesToAddList, length edgesToDeleteList))) ( - {- - if (eNode == parentPrunedRoot) then - -- trace ("ASG: break edge") - [(infinity, [], [])] - else - -} - -- SPR or single reroot TBR case - if null tbrEdgeEditList then - -- trace ("all single roots") - [(delta + splitCost, [edge0, edge1], [(eNode, vNode)])] - - -- TBR reroots - else - -- trace ("all multiple roots") - zip3 deltaCostList edgesToAddList edgesToDeleteList - -- ) - - --- | getTBREdgeEdits takes and edge and returns the list of edit to pruned subgraph --- as a pair of edges to add and those to delete --- since reroot edge is directed (e,v), edges away from v will have correct --- orientation. Edges between 'e' and the root will have to be flipped --- original root edges and reroort edge are deleted and new root and edge spanning orginal root created --- returns ([add], [delete]) -getTBREdgeEdits :: DecoratedGraph -> LG.LNode VertexInfo -> LG.Edge -> ([LG.LEdge Double],[LG.Edge]) -getTBREdgeEdits inGraph prunedGraphRootNode rerootEdge = - --trace ("Gettiung TBR Edits for " ++ (show rerootEdge)) ( - let prunedGraphRootIndex = fst prunedGraphRootNode - originalRootEdgeNodes = LG.descendants inGraph prunedGraphRootIndex - originalRootEdges = LG.out inGraph prunedGraphRootIndex - - -- get path from new root edge fst vertex to orginal root and flip those edges - closerToPrunedRootEdgeNode = (fst rerootEdge, fromJust $ LG.lab inGraph $ fst rerootEdge) - (nodesInPath, edgesinPath) = LG.postOrderPathToNode inGraph closerToPrunedRootEdgeNode prunedGraphRootNode - - -- don't want original root edges to be flipped since deleted - edgesToFlip = edgesinPath L.\\ originalRootEdges - flippedEdges = fmap GO.convertToSimpleEdge $ fmap LG.flipLEdge edgesToFlip - - -- new edges on new root position and spanning old root - -- ad in closer vertex to root to make sure direction of edge is correct - newEdgeOnOldRoot = if (snd3 $ head originalRootEdges) `elem` ((fst rerootEdge) : (fmap fst nodesInPath)) then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, 0.0) - else (snd3 $ last originalRootEdges, snd3 $ head originalRootEdges, 0.0) - newRootEdges = [(prunedGraphRootIndex, fst rerootEdge, 0.0 ),(prunedGraphRootIndex, snd rerootEdge, 0.0)] - - - in - -- original root edge so no change - if (fst rerootEdge) `elem` originalRootEdgeNodes && (snd rerootEdge) `elem` originalRootEdgeNodes then ([],[]) - - -- rerooted - else - -- delete orignal root edges and rerootEdge - -- add new root edges - -- and new edge on old root--but need orientation - -- flip edges from new root to old (delete and add list) - --trace ("\n\nIn Graph:\n"++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph) ++ "\nTBR Edits: " ++ (show (rerootEdge, prunedGraphRootIndex, fmap LG.toEdge flippedEdges)) - -- ++ "\nEdges to add: " ++ (show $ fmap LG.toEdge $ newEdgeOnOldRoot : (flippedEdges ++ newRootEdges)) ++ "\nEdges to delete: " ++ (show $ rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges)))) - (newEdgeOnOldRoot : (flippedEdges ++ newRootEdges), rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges))) - -- ) - - --- | getPrunedEdgeData takes fully optimized pruned data and returns edge list, edge data for TBR additions, --- and graph edits for that edge reroot -getPrunedEdgeData :: GraphType - -> Bool - -> DecoratedGraph - -> LG.LNode VertexInfo - -> [LG.LEdge EdgeInfo] - -> V.Vector (V.Vector CharInfo) - -> [(LG.Edge, VertexBlockData, ([LG.LEdge Double],[LG.Edge]))] -getPrunedEdgeData inGraphType doIA inGraph prunedGraphRootNode edgesInPrunedSubGraph charInfoVV = - if LG.isEmpty inGraph then error "Empty graph in getPrunedEdgeData" - else - -- trace ("PED") ( - let -- (_, edgeAfterList) = LG.nodesAndEdgesAfter inGraph [prunedGraphRootNode] - - -- this so not rerooted on network edge--screws things up - nonNetWorkEdgeList = if inGraphType /= Tree then filter ((== False) . (LG.isNetworkLabEdge inGraph)) edgesInPrunedSubGraph - else edgesInPrunedSubGraph - - -- oringal pruned root - prunedRootEdges = LG.out inGraph $ fst prunedGraphRootNode - - -- new virtual edge of oringal root 2 edges - virtualRootEdge = (snd3 $ head prunedRootEdges, snd3 $ last prunedRootEdges, thd3 $ head prunedRootEdges) - - -- edges availabel for testing - edgeAfterList' = virtualRootEdge : (nonNetWorkEdgeList L.\\ prunedRootEdges) - - -- could be parallelized - edgeDataList = fmap (M.makeEdgeData doIA inGraph charInfoVV) edgeAfterList' `using` PU.myParListChunkRDS - - -- get potential TBR edits--here so not recalculated multiple times for each prune - edgeEdits = fmap (getTBREdgeEdits inGraph prunedGraphRootNode) (fmap LG.toEdge edgeAfterList') `using` PU.myParListChunkRDS - in - if length prunedRootEdges /= 2 then error ("Incorrect number of out edges (should be 2) in root of pruned graph: " ++ (show $ length prunedRootEdges)) - else - -- trace ("PED Lengths: " ++ (show $ (length edgeAfterList', length edgeDataList, length edgeEdits))) - zip3 (fmap LG.toEdge edgeAfterList') edgeDataList edgeEdits - -- ) - --- | getSubGraphDelta calculated cost of adding a subgraph into and edge --- for SPR use the preliminary of subGraph to final of e and v nodes --- can use median fruntions for postorder if set final-> prelim or e and f -getSubGraphDelta :: VertexBlockData - -> Bool - -> V.Vector (V.Vector CharInfo) - -> (LG.Edge, VertexBlockData, ([LG.LEdge Double],[LG.Edge])) - -> (VertexCost, LG.Edge, ([LG.LEdge Double],[LG.Edge])) -getSubGraphDelta evEdgeData doIA charInfoVV (edgeToJoin, subGraphVertData, edgeSubGraphEdits) = - let -- create edge union 'character' blockData - -- based on final assignments but set to preliminary - -- need to filter gaps if DO, not itIA - - subGraphEdgeUnionCost = if (not doIA) then V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks subGraphVertData evEdgeData charInfoVV [] - else V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocksStaticIA subGraphVertData evEdgeData charInfoVV [] - - in - (subGraphEdgeUnionCost, edgeToJoin, edgeSubGraphEdits) - --- | applyGraphEdits takes a graphs and list of nodes and edges to add and delete and creates new graph -applyGraphEdits :: (Show a, Show b) => LG.Gr a b -> (VertexCost, [LG.LEdge b], [LG.Edge]) -> LG.Gr a b -applyGraphEdits inGraph (_, edgesToAdd, edgesToDelete) = LG.insertDeleteEdges inGraph (edgesToAdd, edgesToDelete) - --- | applyGraphEdits' takes a graphs and list of nodes and edges to add and delete and creates new graph -applyGraphEdits' :: (Show a, Show b) => LG.Gr a b -> (VertexCost, [LG.LEdge b], [LG.Edge]) -> LG.Gr a b -applyGraphEdits' inGraph (_, edgesToAdd, edgesToDelete) = - let editedGraph = LG.insEdges edgesToAdd $ LG.delEdges edgesToDelete inGraph - in - -- trace ("AGE: " ++ (show editStuff) ++ "\nIn graph:\n" ++ (LG.prettify inGraph) ++ "New Graph:\n" ++ (LG.prettify editedGraph)) - editedGraph --} - -{- --- | rejoinGraphKeepBestSteepest rejoins split trees on available edges (non-root, and not original split) --- if steepest is False does not sort order of edges, other wise sorts in order of closeness to original edge --- uses delta --- NNI sorts edges on propinquity taking first 2 edges --- TBR does the rerooting of pruned subtree --- originalConnectionOfPruned is the "naked" node that was creted when teh graph was split and will --- be used for the rejoin node in the middle of the invaded edge -rejoinGraphKeepBestSteepest :: GlobalSettings - -> ProcessedData - -> String - -> Bool - -> VertexCost - -> Int - -> Int - -> Bool - -> Bool - -> V.Vector (V.Vector CharInfo) - -> Maybe SAParams - -> [((DecoratedGraph, VertexCost), LG.Node , LG.Node , LG.Node)] - -> ([(PhylogeneticGraph, VertexCost)], Maybe SAParams) -rejoinGraphKeepBestSteepest inGS inData swapType hardwiredSPR curBestCost numToKeep maxMoveEdgeDist steepest doIA charInfoVV inSimAnnealVals splitInfoList = - if null splitInfoList then ([], inSimAnnealVals) - else - -- trace ("In rejoin steepes with split node " ++ (show $ fft5 $ head splitInfoList)) ( - let ((splitGraph, splitCost), prunedGraphRootIndex, nakedNode, originalSplitNode) = head splitInfoList - -- outgroupEdges = LG.out splitGraph graphRoot - (_, prunedSubTreeEdges) = LG.nodesAndEdgesAfter splitGraph [(nakedNode, fromJust $ LG.lab splitGraph nakedNode)] - - splitEdges = if (graphType inGS) == Tree then LG.labEdges splitGraph - else if LG.isTree splitGraph then LG.labEdges splitGraph - else GO.getEdgeSplitList splitGraph - - edgesToInvade = take maxMoveEdgeDist $ L.intersect splitEdges ((GO.sortEdgeListByDistance splitGraph [originalSplitNode] [originalSplitNode]) L.\\ prunedSubTreeEdges) -- L.\\ (outgroupEdges ++ prunedSubTreeEdges) - - - prunedGraphRootNode = (prunedGraphRootIndex, fromJust $ LG.lab splitGraph prunedGraphRootIndex) - (nodesAfterPrunedRoot, edgesInPrunedSubGraph) = LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] - onlySPR = length nodesAfterPrunedRoot < 3 - - (candidateGraphList, newAnnealVals) = addSubGraphSteepest inGS inData swapType hardwiredSPR doIA splitGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV inSimAnnealVals edgesToInvade - - -- numLeaves = fromIntegral $ V.length $ fst3 inData - in - -- experimental union-type split exclusion - -- skip rearrangements if split delta too small VAron and Wheeler (2013) - -- if (curBestCost / numLeaves) > 1.17 * (curBestCost - splitCost) then ([], inSimAnnealVals) - - -- case where swap split returned empty because too few nodes in remaining graph to add to - -- else - -- trace ("RGKBS:" ++ (show (length candidateGraphList, fmap snd6 candidateGraphList)) ++ " " ++ (show $ LG.isEmpty splitGraph)) ( - if LG.isEmpty splitGraph || null candidateGraphList then ([], inSimAnnealVals) - - -- normal steepest--only return if better if equal does not return, but will return multiple - else if inSimAnnealVals == Nothing then - let candidateGraphList' = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] candidateGraphList - in - if (snd6 $ head candidateGraphList') < curBestCost then - -- ([(head candidateGraphList, snd6 $ head candidateGraphList)], Nothing) - (zip candidateGraphList' (fmap snd6 candidateGraphList'), Nothing) - -- ([(head candidateGraphList, snd6 $ head candidateGraphList)], Nothing) - - else rejoinGraphKeepBestSteepest inGS inData swapType hardwiredSPR curBestCost numToKeep maxMoveEdgeDist steepest doIA charInfoVV Nothing (tail splitInfoList) - - -- simulated Annealing/Drifting--recurse if not at max steps - else - ([(head candidateGraphList, snd6 $ head candidateGraphList)], newAnnealVals) - - -- ) - - -- )) - --- | addSubGraphSteepest "adds" a subtree back into an edge calculating the cost of the graph via the delta of the add and costs of the two components --- used in "steepest" descendt swapping -addSubGraphSteepest :: GlobalSettings - -> ProcessedData - -> String - -> Bool - -> Bool - -> DecoratedGraph - -> LG.LNode VertexInfo - -> VertexCost - -> VertexCost - -> LG.Node - -> Bool - -> [LG.LEdge EdgeInfo] - -> V.Vector (V.Vector CharInfo) - -> Maybe SAParams - -> [LG.LEdge EdgeInfo] - -> ([PhylogeneticGraph], Maybe SAParams) -addSubGraphSteepest inGS inData swapType hardwiredSPR doIA inGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV inSimAnnealVals targetEdgeList = - if null targetEdgeList then ([], inSimAnnealVals) - -- this more for graph fusing checks - else if (splitCost > curBestCost) && (inSimAnnealVals == Nothing) then ([], Nothing) - else - let targetEdge@(eNode, vNode, _) = head targetEdgeList - -- existingEdgeCost = minLength targetlabel - edge0 = (nakedNode, vNode, 0.0) - edge1 = (eNode, nakedNode, 0.0) - targetEdgeData = M.makeEdgeData doIA inGraph charInfoVV targetEdge - -- edge2 = (nakedNode, prunedGraphRootIndex, 0.0) - --newNode = (nakedNode, TL.pack ("HTU" ++ (show nakedNode))) - - -- for SPRT/NNI only need preliminary state of root-pruned node - -- for TBR ther are multiple verticces created for each edge - doSPR = if swapType == "spr" || swapType == "nni" || onlySPR then True - else False - subGraphEdgeVertDataTripleList = if doSPR then [((-1, -1), vertData $ snd prunedGraphRootNode, ([],[]))] - else if hardwiredSPR then take 2 $ getPrunedEdgeData (graphType inGS) doIA inGraph prunedGraphRootNode edgesInPrunedSubGraph charInfoVV - else getPrunedEdgeData (graphType inGS) doIA inGraph prunedGraphRootNode edgesInPrunedSubGraph charInfoVV - - - -- this is SPR/NNI - (delta, _, (tbrEdgesAdd, tbrEdgesDelete)) = getSubGraphDelta targetEdgeData doIA charInfoVV (head subGraphEdgeVertDataTripleList) - - -- this if TBR --need to recurse through all reootings of pruned tree - (newTBRList, newAnnealVals) = getSubGraphDeltaTBR inGS inData targetEdgeData [edge0, edge1] (eNode, vNode) doIA inGraph splitCost curBestCost charInfoVV inSimAnnealVals subGraphEdgeVertDataTripleList - - in - -- trace ("ASGR: " ++ (show (delta, splitCost, delta + splitCost, curBestCost))) ( - -- do not redo origal edge so retun infinite cost and dummy edits - if doSPR && (eNode == nakedNode) then addSubGraphSteepest inGS inData swapType hardwiredSPR doIA inGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV inSimAnnealVals (tail targetEdgeList) - - --TBR case - else if length subGraphEdgeVertDataTripleList > 1 then - if null newTBRList then addSubGraphSteepest inGS inData swapType hardwiredSPR doIA inGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV newAnnealVals (tail targetEdgeList) - - -- must be better if non-empty list - else (newTBRList, newAnnealVals) - - -- SPR case - -- regular non-SA case - else if inSimAnnealVals == Nothing then - -- better heursitic cost - -- reoptimize to check cost - if (delta + splitCost <= (curBestCost * (dynamicEpsilon inGS))) then - let splitGraphSimple = GO.convertDecoratedToSimpleGraph inGraph - swapSimpleGraph = applyGraphEdits splitGraphSimple (delta + splitCost, [edge0, edge1] ++ tbrEdgesAdd, (eNode, vNode) : tbrEdgesDelete) - reoptimizedCandidateGraph = if (graphType inGS == Tree) then T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - else - if (not . LG.cyclic) swapSimpleGraph && (not . GO.parentInChain) swapSimpleGraph then T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - else emptyPhylogeneticGraph - in - if (snd6 reoptimizedCandidateGraph < curBestCost) then - {- - if (graphType inGS /= Tree) then - -- check for cycles and anc/desc time violations in Networks - if (not . LG.cyclic) swapSimpleGraph && (not . GO.parentInChainthen swapSimpleGraph) then (filter ((== False) . (GO.hasNetNodeAncestorViolation . thd6)) [reoptimizedCandidateGraph], Nothing) - else ([], Nothing) - else ([reoptimizedCandidateGraph], Nothing) - -} - ([reoptimizedCandidateGraph], Nothing) - else addSubGraphSteepest inGS inData swapType hardwiredSPR doIA inGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV Nothing (tail targetEdgeList) - - -- not better heuristic cost - else addSubGraphSteepest inGS inData swapType hardwiredSPR doIA inGraph prunedGraphRootNode splitCost curBestCost nakedNode onlySPR edgesInPrunedSubGraph charInfoVV Nothing (tail targetEdgeList) - - - -- simulated annealing/Drift case - else - --simulated Annealing check if at end of steps then return - -- original sim anneal values since is SPR case and not updated yet - let splitGraphSimple = GO.convertDecoratedToSimpleGraph inGraph - swapSimpleGraph = applyGraphEdits splitGraphSimple (delta + splitCost, [edge0, edge1] ++ tbrEdgesAdd, (eNode, vNode) : tbrEdgesDelete) - reoptimizedCandidateGraph = if (graphType inGS == Tree) then T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - else - if (not . LG.cyclic) swapSimpleGraph && (not . GO.parentInChain) swapSimpleGraph then T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - else emptyPhylogeneticGraph - - in - ([reoptimizedCandidateGraph], newAnnealVals) - - - -- ) - - - --- | getSubGraphDeltaTBR calculated cost of adding a subgraph into and edge --- for SPR use the preliminary of subGraph to final of e and v nodes --- can use median functions for postorder if set final-> prelim or e and f -getSubGraphDeltaTBR :: GlobalSettings - -> ProcessedData - -> VertexBlockData - -> [LG.LEdge Double] - -> LG.Edge - -> Bool - -> DecoratedGraph - -> VertexCost - -> VertexCost - -> V.Vector (V.Vector CharInfo) - -> Maybe SAParams - -> [(LG.Edge, VertexBlockData, ([LG.LEdge Double],[LG.Edge]))] - -> ([PhylogeneticGraph], Maybe SAParams) -getSubGraphDeltaTBR inGS inData evEdgeData edgeToAddInList edgeToDeleteIn doIA inGraph splitCost curBestCost charInfoVV inSimAnnealVals subGraphEdgeVertDataTripleList = - -- trace ("SGD") ( - -- found nothing better - if null subGraphEdgeVertDataTripleList then ([], inSimAnnealVals) - else - let (_, subGraphVertData, (tbrEdgesAdd, tbrEdgesDelete)) = head subGraphEdgeVertDataTripleList - -- Use edge union data for delta to edge data - -- costMethod = if doIA then ImpliedAlignment - -- else DirectOptimization - - subGraphEdgeUnionCost = if (not doIA) then V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks subGraphVertData evEdgeData charInfoVV [] - else V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocksStaticIA subGraphVertData evEdgeData charInfoVV [] - - in - - -- regular TBR case - if inSimAnnealVals == Nothing then - if subGraphEdgeUnionCost + splitCost <= (curBestCost * (dynamicEpsilon inGS)) then - -- reoptimize and check - let splitGraphSimple = GO.convertDecoratedToSimpleGraph inGraph - swapSimpleGraph = applyGraphEdits splitGraphSimple (subGraphEdgeUnionCost + splitCost, edgeToAddInList ++ tbrEdgesAdd, edgeToDeleteIn : tbrEdgesDelete) - reoptimizedCandidateGraph = T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - in - if snd6 reoptimizedCandidateGraph < curBestCost then ([reoptimizedCandidateGraph], Nothing) - else getSubGraphDeltaTBR inGS inData evEdgeData edgeToAddInList edgeToDeleteIn doIA inGraph splitCost curBestCost charInfoVV Nothing (tail subGraphEdgeVertDataTripleList) - - -- move on - else - getSubGraphDeltaTBR inGS inData evEdgeData edgeToAddInList edgeToDeleteIn doIA inGraph splitCost curBestCost charInfoVV Nothing (tail subGraphEdgeVertDataTripleList) - - -- Simulated Annealing/Drifting case - else - let -- update annealing values for next round - -- abstract stopping criterion to continue - numDone = if (method $ fromJust inSimAnnealVals) == SimAnneal then currentStep $ fromJust inSimAnnealVals - else driftChanges $ fromJust inSimAnnealVals - numMax = if (method $ fromJust inSimAnnealVals) == SimAnneal then numberSteps $ fromJust inSimAnnealVals - else driftMaxChanges $ fromJust inSimAnnealVals - - -- get acceptance based on heuristic costs - (acceptGraph, nextAnnealVals) = simAnnealAccept inSimAnnealVals curBestCost (subGraphEdgeUnionCost + splitCost) - - -- optimize graph - splitGraphSimple = GO.convertDecoratedToSimpleGraph inGraph - swapSimpleGraph = applyGraphEdits splitGraphSimple (subGraphEdgeUnionCost + splitCost, edgeToAddInList ++ tbrEdgesAdd, edgeToDeleteIn : tbrEdgesDelete) - reoptimizedCandidateGraph = T.multiTraverseFullyLabelGraph inGS inData False False Nothing swapSimpleGraph - - in - if (numDone < numMax) then - -- accept based on heurisrtic cost - if acceptGraph then - ([reoptimizedCandidateGraph], nextAnnealVals) - else - getSubGraphDeltaTBR inGS inData evEdgeData edgeToAddInList edgeToDeleteIn doIA inGraph splitCost curBestCost charInfoVV nextAnnealVals (tail subGraphEdgeVertDataTripleList) - else - ([reoptimizedCandidateGraph], nextAnnealVals) - --} - -{- --- | getSubGraphDelta calculated cost of adding a subgraph into and edge --- for SPR use the preliminary of subGraph to final of e and v nodes --- can use median fruntions for postorder if set final-> prelim or e and f -getSubGraphDelta :: VertexBlockData - -> Bool - -> V.Vector (V.Vector CharInfo) - -> (LG.Edge, VertexBlockData, ([LG.LEdge Double],[LG.Edge])) - -> (VertexCost, LG.Edge, ([LG.LEdge Double],[LG.Edge])) -getSubGraphDelta evEdgeData doIA charInfoVV (edgeToJoin, subGraphVertData, edgeSubGraphEdits) = - let -- create edge union 'character' blockData - -- based on final assignments but set to preliminary - -- need to filter gaps if DO, not itIA - --edgeUnionVertData = M.makeEdgeData doIA inGraph charInfoVV evEdge - -- edgeUnionVertData = M.createEdgeUnionOverBlocks (not doIA) eNodeVertData vNodeVertData charInfoVV [] - - {-- - Use edge union data for delta to edge data - costMethod = if doIA then ImpliedAlignment - else DirectOptimization - -} - - subGraphEdgeUnionCost = if (not doIA) then V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks subGraphVertData evEdgeData charInfoVV [] - else V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocksStaticIA subGraphVertData evEdgeData charInfoVV [] - - -- subGraphEdgeUnionCost = sum $ fmap fst $ V.zipWith3 (PRE.getBlockCostPairsFinal costMethod) subGraphVertData edgeUnionVertData charInfoVV - - - {- - This estimate very close to subGraphEdgeUnionCost. Seems to always underestmate where as subGraphEdgeUnionCost can sometimes overestimate - but 3x as much work if use costEV, 2x if use existingEdgeCost - - -- existingEdgeCost = minLength edgeToJoin - -- eNodeVertData = vertData $ fromJust $ LG.lab inGraph (fst edgeToJoin) - -- vNodeVertData = vertData $ fromJust $ LG.lab inGraph (snd edgeToJoin) - -- subGraphVertData = snd subGraphEdgeVertDataPair - - - -- dummyE = M.createEdgeUnionOverBlocks False eNodeVertData eNodeVertData charInfoVV [] - -- dummyV = M.createEdgeUnionOverBlocks False vNodeVertData vNodeVertData charInfoVV [] - -- dummySGV = M.createEdgeUnionOverBlocks False (PRE.setFinalToPreliminaryStates subGraphVertData) (PRE.setFinalToPreliminaryStates subGraphVertData) charInfoVV [] - - costNewE = V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks dummyE subGraphVertData charInfoVV [] - costNewV = V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks dummyV subGraphVertData charInfoVV [] - costEV = V.sum $ fmap V.sum $ fmap (fmap snd) $ POS.createVertexDataOverBlocks dummyE dummyV charInfoVV [] - - subGraphEdgeUnionCost' = (costNewE + costNewV - costEV) / 2.0 - -} - - in - -- remove this check when things are working - {- - if null subGraphVertData || null evEdgeData || (subGraphEdgeUnionCost == 0.0) then - trace ("SGD null or 0 :" ++ (show (edgeToJoin, length subGraphVertData, length evEdgeData, subGraphEdgeUnionCost)) ) - -- ++ "\nInGraph:\n" - -- ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph) ++ "\n" ++ (show inGraph)) - (subGraphEdgeUnionCost, edgeToJoin, edgeSubGraphEdits) - --trace ("GSD:" ++ (show ((costNewE, costNewV, costEV))) ++ " -> " ++ (show subGraphEdgeUnionCost') ++ " v " ++ (show subGraphEdgeUnionCost)) - -- trace ("Delta: " ++ (show subGraphEdgeUnionCost)) - --min raphEdgeUnionCost subGraphEdgeUnionCost' - else - -} - -- trace ("SGD :" ++ (show subGraphEdgeUnionCost) ++ " vs " ++ (show subGraphEdgeUnionCost')) - (subGraphEdgeUnionCost, edgeToJoin, edgeSubGraphEdits) --} - - -{- --- | swapAll performs branch swapping on all 'break' edges and all readditions --- edges are unsorted since doing all of them -swapAll :: String - -> Bool - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Int - -> VertexCost - -> [PhylogeneticGraph] - -> [PhylogeneticGraph] - -> Int - -> SimpleGraph - -> DecoratedGraph - -> DecoratedGraph - -> V.Vector (V.Vector CharInfo) - -> Bool - -> VertexCost - -> ([PhylogeneticGraph], Int) -swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest counter curBestCost curSameBetterList inGraphList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor = - -- trace ("ALL") ( - if null inGraphList then - (take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] curSameBetterList, counter) - else - let firstGraph = head inGraphList - firstDecoratedGraph = thd6 firstGraph - (firstRootIndex, _) = head $ LG.getRoots firstDecoratedGraph - - -- determine edges to break on--'bridge' edges only for network - -- filter out edges from root since no use--would just rejoin - breakEdgeList = if (graphType inGS) == Tree then filter ((/= firstRootIndex) . fst3) $ LG.labEdges firstDecoratedGraph - else if LG.isTree firstDecoratedGraph then filter ((/= firstRootIndex) . fst3) $ LG.labEdges firstDecoratedGraph - else filter ((/= firstRootIndex) . fst3) $ GO.getEdgeSplitList firstDecoratedGraph - - -- create list of breaks - splitTupleList = fmap (GO.splitGraphOnEdge firstDecoratedGraph) breakEdgeList -- `using` PU.myParListChunkRDS - (splitGraphList, graphRootList, prunedGraphRootIndexList, originalConnectionOfPruned) = L.unzip4 splitTupleList - - reoptimizedSplitGraphList = zipWith3 (reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor) splitGraphList graphRootList prunedGraphRootIndexList -- `using` PU.myParListChunkRDS - - -- create rejoins-- adds in break list so don't remake the initial graph - -- didn't concatMap so can parallelize later - -- this cost prob doesn't include the root/net penalty--so need to figure out - swapPairList = concat (L.zipWith4 (rejoinGraphKeepBest inGS swapType hardwiredSPR curBestCost maxMoveEdgeDist doIA charInfoVV) reoptimizedSplitGraphList prunedGraphRootIndexList originalConnectionOfPruned (fmap head $ fmap ((LG.parents $ thd6 firstGraph).fst3) breakEdgeList) `using` PU.myParListChunkRDS) - - -- keeps better heuristic swap costs graphs based on current best as opposed to minimum heuristic costs - -- minimumCandidateGraphCost = if (null swapPairList) then infinity - -- else minimum $ fmap snd swapPairList - candidateSwapGraphList = if (graphType inGS /= Tree) then - -- checks for cylces--rare but can occur - filter ((== False) . (GO.parentInChain . fst)) $ filter ((== False) . (LG.cyclic . fst)) $ filter ((<= (curBestCost * (dynamicEpsilon inGS))). snd) swapPairList - else filter ((<= (curBestCost * (dynamicEpsilon inGS))). snd) swapPairList - - - -- this should be incremental--full 2-pass for now - reoptimizedSwapGraphList = if (graphType inGS /= Tree) then - -- checks for anc/desc time violation--can occur with some edge splits - let newGraphs = fmap (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (fmap fst candidateSwapGraphList) `using` PU.myParListChunkRDS - in filter ((== False) . (GO.hasNetNodeAncestorViolation . thd6)) newGraphs - else fmap (T.multiTraverseFullyLabelGraph inGS inData False False Nothing) (fmap fst candidateSwapGraphList) `using` PU.myParListChunkRDS - - firstBettterGraphList = takeFirstBetterGraph curBestCost reoptimizedSwapGraphList - - -- selects best graph list based on full optimization - bestSwapGraphList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] reoptimizedSwapGraphList - - bestSwapCost = if null breakEdgeList || null swapPairList || null candidateSwapGraphList || null reoptimizedSwapGraphList || null bestSwapGraphList then infinity - else snd6 $ head bestSwapGraphList - - in - -- trace ("Cycles: " ++ (show $ fmap LG.cyclic $ fmap fst candidateSwapGraphList)) ( - -- trace ("Breakable Edges :" ++ (show $ fmap LG.toEdge breakEdgeList) ++ "\nIn graph:\n" ++ (LG.prettify $ fst6 firstGraph)) ( - -- trace ("(Est, [FP]): " ++ (show minimumCandidateGraphCost) ++ " " ++ (show $ fmap snd6 reoptimizedSwapGraphList)) ( - -- either no better or more of same cost graphs - -- trace ("BSG: " ++ " simple " ++ (LG.prettify $ fst6 $ head bestSwapGraphList) ++ " Decorated " ++ (LG.prettify $ thd6 $ head bestSwapGraphList) ++ "\nCharinfo\n" ++ (show $ charType $ V.head $ V.head $ six6 $ head bestSwapGraphList)) ( - -- trace ("All--Choosing what to do: " ++ (show (bestSwapCost, curBestCost, length curSameBetterList, numToKeep)) ++ " " ++ (show (length reoptimizedSplitGraphList, length swapPairList, length candidateSwapGraphList, length reoptimizedSwapGraphList, length bestSwapGraphList))) ( - if steepest && (not . null) firstBettterGraphList then - trace ("\t->" ++ (show $ (snd6. head) firstBettterGraphList)) - swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) ((snd6. head) firstBettterGraphList) firstBettterGraphList firstBettterGraphList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor - - else if bestSwapCost == curBestCost then - if (length curSameBetterList == numToKeep) then - -- trace ("Same cost and maxed out") - swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) curBestCost curSameBetterList (tail inGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor - else - --equality informed by zero-length edges - let newCurSameBestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (firstGraph : curSameBetterList) - -- if firstGraph `notElem` curSameBetterList then (firstGraph : curSameBetterList) - -- else curSameBetterList - graphsToSwap = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] (tail inGraphList) - - in - -- trace ("Same cost: " ++ (show bestSwapCost) ++ " with " ++ (show $ length $ graphsToSwap) ++ " more to swap and " ++ (show $ length newCurSameBestList) - -- ++ " graphs in 'best' list " ++ " max size " ++ (show numToKeep) ) -- ++ "\n" ++ (concat prettyBestSwapGraphList)) - swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) curBestCost newCurSameBestList graphsToSwap numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor - - -- better cost graphs - else if (bestSwapCost < curBestCost) then - trace ("\t->" ++ (show bestSwapCost)) -- ++ "\n" ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd6 $ head bestSwapGraphList)) - swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) bestSwapCost bestSwapGraphList (bestSwapGraphList ++ (tail inGraphList)) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor - - -- didn't find equal or better graphs - else - -- trace ("Worse cost")( - let newCurSameBestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (firstGraph : curSameBetterList) - -- if firstGraph `notElem` curSameBetterList then (firstGraph : curSameBetterList) - -- else curSameBetterList - in - swapAll swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) curBestCost newCurSameBestList (tail inGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor - -- ) - -- ) - -- ) - - --- | takeFirstBetterGraph returns the first graph with cost lower than input -takeFirstBetterGraph :: VertexCost -> [PhylogeneticGraph] -> [PhylogeneticGraph] -takeFirstBetterGraph curBestCost inGraphList = - if null inGraphList then [] - else if (snd6 . head) inGraphList < curBestCost then [head inGraphList] - else takeFirstBetterGraph curBestCost (tail inGraphList) - --- | swapSteepest performs branch swapping greedily switching to found graph if better --- infomrs evaluation--less parallelism -swapSteepest :: String - -> Bool - -> GlobalSettings - -> ProcessedData - -> Int - -> Int - -> Bool - -> Int - -> VertexCost - -> [PhylogeneticGraph] - -> [PhylogeneticGraph] - -> Int - -> SimpleGraph - -> DecoratedGraph - -> DecoratedGraph - -> V.Vector (V.Vector CharInfo) - -> Bool - -> VertexCost - -> Maybe SAParams - -> ([PhylogeneticGraph], Int) -swapSteepest swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest counter curBestCost curSameBetterList inGraphList numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealVals = - --trace ("steepest") ( - if null inGraphList then - (take numToKeep $ GO.getBVUniqPhylogeneticGraph True curSameBetterList, counter) - else - let firstGraph = head inGraphList - firstDecoratedGraph = thd6 firstGraph - (firstRootIndex, _) = head $ LG.getRoots firstDecoratedGraph - - -- filter out edges from root since no use--would just rejoin - firstEdgeList = filter ((/= firstRootIndex) . fst3) $ LG.labEdges firstDecoratedGraph - - -- determine edges to break on--'bridge' edges only for network - -- longest edges first first - breakEdgeList = if (graphType inGS) == Tree then GO.sortEdgeListByLength firstEdgeList - else if LG.isTree firstDecoratedGraph then GO.sortEdgeListByLength firstEdgeList - else GO.sortEdgeListByLength $ GO.getEdgeSplitList firstDecoratedGraph - - -- create list of breaks - splitTupleList = fmap (GO.splitGraphOnEdge firstDecoratedGraph) breakEdgeList - - (splitGraphList, graphRootList, prunedGraphRootIndexList, originalConnectionOfPruned) = L.unzip4 splitTupleList - - reoptimizedSplitGraphList = zipWith3 (reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor) splitGraphList graphRootList prunedGraphRootIndexList - - -- create rejoins-- reoptimized fully in steepest returns PhylogheneticGraph - (reoptimizedSwapGraphList, newAnnealVals) = rejoinGraphKeepBestSteepest inGS inData swapType hardwiredSPR curBestCost numToKeep maxMoveEdgeDist True doIA charInfoVV inSimAnnealVals $ L.zip4 reoptimizedSplitGraphList prunedGraphRootIndexList originalConnectionOfPruned (fmap head $ fmap ((LG.parents $ thd6 firstGraph).fst3) breakEdgeList) - - -- this should be incremental--full 2-pass for now - -- reoptimizedSwapGraph = T.multiTraverseFullyLabelGraph inGS inData False False Nothing $ fst $ head swapPairList - - - bestSwapCost = if null breakEdgeList || null reoptimizedSplitGraphList || null reoptimizedSwapGraphList then infinity - else snd $ head reoptimizedSwapGraphList - - in - -- trace ("Breakable Edges :" ++ (show $ fmap LG.toEdge breakEdgeList) ++ "\nIn graph:\n" ++ (LG.prettify $ fst6 firstGraph)) ( - - -- either no better or more of same cost graphs - -- trace ("BSG: " ++ " simple " ++ (LG.prettify $ fst6 $ head bestSwapGraphList) ++ " Decorated " ++ (LG.prettify $ thd6 $ head bestSwapGraphList) ++ "\nCharinfo\n" ++ (show $ charType $ V.head $ V.head $ six6 $ head bestSwapGraphList)) ( - - -- trace ("Steepest--Choosing what to do: " ++ (show (bestSwapCost, curBestCost, length curSameBetterList, numToKeep)) ++ " " ++ (show (length reoptimizedSplitGraphList, length reoptimizedSwapGraphList))) ( - -- check that graphs were returned. If nothing then return in Graph - if bestSwapCost == infinity then (inGraphList, counter + 1) - - else if inSimAnnealVals == Nothing then - if (bestSwapCost < curBestCost) then - --trace ("Steepest better") - trace ("\t->" ++ (show bestSwapCost)) - swapSteepest swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) bestSwapCost (fmap fst reoptimizedSwapGraphList) (fmap fst reoptimizedSwapGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor Nothing - - else if (bestSwapCost == curBestCost) then - -- buffer full - if (length curSameBetterList == numToKeep) then - swapSteepest swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) curBestCost curSameBetterList (tail inGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealVals - -- room for another Graph if unnique - else - let newCurSameBestList = take numToKeep $ GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] (firstGraph : curSameBetterList) - -- if firstGraph `notElem` curSameBetterList then (firstGraph : curSameBetterList) - -- else curSameBetterList - graphsToSwap = take numToKeep $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] ((tail inGraphList) L.\\ newCurSameBestList) - in - swapSteepest swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) curBestCost curSameBetterList graphsToSwap numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor inSimAnnealVals - - -- didn't find equal or better graphs - else (inGraphList, counter + 1) - - -- Simulated annealing - else - -- abstract stopping criterion to continue - let numDone = if (method $ fromJust inSimAnnealVals) == SimAnneal then currentStep $ fromJust newAnnealVals - else driftChanges $ fromJust newAnnealVals - numMax = if (method $ fromJust inSimAnnealVals) == SimAnneal then numberSteps $ fromJust newAnnealVals - else driftMaxChanges $ fromJust newAnnealVals - in - - --simulated Annealing/Drift check if at end then return - if (numDone < numMax) then - swapSteepest swapType hardwiredSPR inGS inData numToKeep maxMoveEdgeDist steepest (counter + 1) bestSwapCost (fmap fst reoptimizedSwapGraphList) (fmap fst reoptimizedSwapGraphList) numLeaves leafSimpleGraph leafDecGraph leafGraphSoftWired charInfoVV doIA netPenaltyFactor newAnnealVals - - else - (fmap fst reoptimizedSwapGraphList, counter + 1) - - - -- ) - --} diff --git a/pkg/PhyGraph/Search/SwapMaster.hs b/pkg/PhyGraph/Search/SwapMaster.hs deleted file mode 100644 index 570d79627..000000000 --- a/pkg/PhyGraph/Search/SwapMaster.hs +++ /dev/null @@ -1,268 +0,0 @@ -{- | -Module : SwapMaster.hs -Description : Module controlling grapjh swap functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.SwapMaster ( swapMaster - ) where - -import qualified Commands.Verify as VER -import Control.Parallel.Strategies -import Data.Char -import Data.Maybe -import Debug.Trace -import GeneralUtilities -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import qualified Search.Swap as S -import Text.Read -import Types.Types -import Utilities.Utilities as U - --- | swapMaster processes and spawns the swap functions --- the 2 x maxMoveDist since distance either side to list 2* dist on sorted edges -swapMaster :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -swapMaster inArgs inGS inData rSeed inGraphList = - if null inGraphList then trace ("No graphs to swap") [] - else - - let -- process args for swap - (keepNum, maxMoveEdgeDist, steps', annealingRounds', doDrift, driftRounds', acceptEqualProb, acceptWorseFactor, maxChanges, lcArgList) = getSwapParams inArgs - doNNI = any ((=="nni").fst) lcArgList - doSPR = any ((=="spr").fst) lcArgList - doTBR = any ((=="tbr").fst) lcArgList - - -- randomized orders of split and join-- not implemented - -- doRandomized = any ((=="randomized").fst) lcArgList - - -- set implied alignment swapping - doIA' = any ((=="ia").fst) lcArgList - doIA = if (graphType inGS /= Tree) && doIA' then trace ("\tIgnoring 'IA' swap option for non-Tree") False - else doIA' - - --- steepest/all options - doSteepest' = any ((=="steepest").fst) lcArgList - doAll = any ((=="all").fst) lcArgList - - -- steepest default - doSteepest = if (not doSteepest' && not doAll) then True - else doSteepest' - - -- alternating rounds of SPR ande TBR is default unless NNI, SPR, or TBR specified - -- swap type if alternate will be "TBR" - doAlternate = if (doTBR || doSPR || doNNI) then False - else True - - -- simulated annealing parameters - -- returnMutated to return annealed Graphs before swapping fir use in Genetic Algorithm - doAnnealing = any ((=="annealing").fst) lcArgList - - returnMutated = any ((=="returnmutated").fst) lcArgList - - simAnnealParams = getSimAnnealParams doAnnealing doDrift steps' annealingRounds' driftRounds' acceptEqualProb acceptWorseFactor maxChanges rSeed - - -- create simulated annealing random lists uniquely for each fmap - newSimAnnealParamList = U.generateUniqueRandList (length inGraphList) simAnnealParams - - progressString = if not doAnnealing then ("Swapping " ++ (show $ length inGraphList) ++ " input graph(s) with minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " keeping maximum of " ++ (show $ fromJust keepNum) ++ " graphs per input graph") - else - if (method $ fromJust simAnnealParams) == SimAnneal then - ("Simulated Annealing (Swapping) " ++ (show $ rounds $ fromJust simAnnealParams) ++ " rounds " ++ (show $ length inGraphList) ++ " with " ++ (show $ numberSteps $ fromJust simAnnealParams) ++ " cooling steps " ++ (show $ length inGraphList) ++ " input graph(s) at minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " keeping maximum of " ++ (show $ fromJust keepNum) ++ " graphs") - else - ("Drifting (Swapping) " ++ (show $ rounds $ fromJust simAnnealParams) ++ " rounds " ++ (show $ length inGraphList) ++ " with " ++ (show $ numberSteps $ fromJust simAnnealParams) ++ " cooling steps " ++ (show $ length inGraphList) ++ " input graph(s) at minimum cost "++ (show $ minimum $ fmap snd6 inGraphList) ++ " keeping maximum of " ++ (show $ fromJust keepNum) ++ " graphs") - - in - - trace (progressString) ( - let (newGraphList, counterNNI) = if doNNI then - let graphPairList = PU.seqParMap rdeepseq (S.swapSPRTBR "nni" inGS inData (fromJust keepNum) 2 doSteepest False doIA returnMutated) (zip newSimAnnealParamList inGraphList) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - else (inGraphList, 0) - (newGraphList', counterSPR') = if (doSPR || doAlternate) then - let graphPairList = PU.seqParMap rdeepseq (S.swapSPRTBR "spr" inGS inData (fromJust keepNum) (2 * (fromJust maxMoveEdgeDist)) doSteepest False doIA returnMutated) (zip newSimAnnealParamList newGraphList) -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - else (newGraphList, 0) - - (newGraphList'', counterTBR') = if (doTBR || doAlternate) then - let graphPairList = PU.seqParMap rdeepseq (S.swapSPRTBR "tbr" inGS inData (fromJust keepNum) (2 * (fromJust maxMoveEdgeDist)) doSteepest doAlternate doIA returnMutated) (zip newSimAnnealParamList newGraphList') -- `using` PU.myParListChunkRDS - (graphListList, counterList) = unzip graphPairList - in - (take (fromJust keepNum) $ GO.selectPhylogeneticGraph [("unique", "")] 0 ["unique"] $ concat graphListList, sum counterList) - else (newGraphList', 0) - in - let (counterSPR, counterTBR, counterAlternate) = if doAlternate then (0,0, counterSPR' + counterTBR') - else (counterSPR', counterTBR', 0) - finalGraphList = if null newGraphList'' then inGraphList - else newGraphList'' - endString = if not doAnnealing then ("\n\tAfter swap: " ++ (show $ length finalGraphList) ++ " resulting graphs with minimum cost " ++ (show $ minimum $ fmap snd6 finalGraphList) ++ " with swap rounds (total): " ++ (show counterNNI) ++ " NNI, " ++ (show counterSPR) ++ " SPR, " ++ (show counterTBR) ++ " TBR, " ++ (show counterAlternate) ++ " Alternating SPR/TBR") - else if (method $ fromJust simAnnealParams) == SimAnneal then - ("\n\tAfter Simulated Annealing: " ++ (show $ length finalGraphList) ++ " resulting graphs") - else - ("\n\tAfter Drifting: " ++ (show $ length finalGraphList) ++ " resulting graphs") - - in - trace (endString) - finalGraphList - ) - - --- | getSimumlatedAnnealingParams returns SA parameters -getSimAnnealParams :: Bool -> Bool -> Maybe Int -> Maybe Int -> Maybe Int -> Maybe Double -> Maybe Double -> Maybe Int -> Int -> Maybe SAParams -getSimAnnealParams doAnnealing doDrift steps' annealingRounds' driftRounds' acceptEqualProb acceptWorseFactor maxChanges rSeed = - if (not doAnnealing && not doDrift) then Nothing - else - let steps = max 3 (fromJust steps') - annealingRounds = if annealingRounds' == Nothing then 1 - else if fromJust annealingRounds' < 1 then 1 - else fromJust annealingRounds' - - driftRounds = if driftRounds' == Nothing then 1 - else if fromJust driftRounds' < 1 then 1 - else fromJust driftRounds' - - saMethod = if doDrift && doAnnealing then - trace ("\tSpecified both Simulated Annealing (with temperature steps) and Drifting (without)--defaulting to drifting.") - Drift - else if doDrift then Drift - else SimAnneal - - equalProb = if fromJust acceptEqualProb < 0.0 then 0.0 - else if fromJust acceptEqualProb > 1.0 then 1.0 - else fromJust acceptEqualProb - - - worseFactor = if fromJust acceptWorseFactor < 0.0 then 0.0 - else fromJust acceptWorseFactor - - changes = if fromJust maxChanges < 0 then 15 - else fromJust maxChanges - - saValues = SAParams { method = saMethod - , numberSteps = steps - , currentStep = 0 - , randomIntegerList = randomIntList rSeed - , rounds = max annealingRounds driftRounds - , driftAcceptEqual = equalProb - , driftAcceptWorse = worseFactor - , driftMaxChanges = changes - , driftChanges = 0 - } - in - Just saValues - --- | getSwapParams takes areg list and preocesses returning parameter values -getSwapParams :: [Argument] -> (Maybe Int, Maybe Int, Maybe Int, Maybe Int, Bool, Maybe Int, Maybe Double, Maybe Double, Maybe Int, [(String, String)]) -getSwapParams inArgs = - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "swap" fstArgList VER.swapArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'swap': " ++ show inArgs) - else - let keepList = filter ((=="keep").fst) lcArgList - keepNum - | length keepList > 1 = - errorWithoutStackTrace ("Multiple 'keep' number specifications in swap command--can have only one: " ++ show inArgs) - | null keepList = Just 10 - | otherwise = readMaybe (snd $ head keepList) :: Maybe Int - - moveLimitList = filter (not . null) $ fmap snd $ filter ((`elem` ["spr", "tbr", "nni"]).fst) lcArgList - maxMoveEdgeDist' - | length moveLimitList > 1 = - errorWithoutStackTrace ("Multiple maximum edge distance number specifications in swap command--can have only one (e.g. spr:2): " ++ show inArgs) - | null moveLimitList = Just ((maxBound :: Int) `div` 3) - | otherwise = readMaybe (head moveLimitList) :: Maybe Int - - -- simulated anealing options - stepsList = filter ((=="steps").fst) lcArgList - steps' - | length stepsList > 1 = - errorWithoutStackTrace ("Multiple annealing steps value specifications in swap command--can have only one (e.g. steps:10): " ++ show inArgs) - | null stepsList = Just 10 - | otherwise = readMaybe (snd $ head stepsList) :: Maybe Int - - annealingList = filter ((=="annealing").fst) lcArgList - annealingRounds' - | length annealingList > 1 = - errorWithoutStackTrace ("Multiple 'annealing' rounds number specifications in swap command--can have only one: " ++ show inArgs) - | null annealingList = Just 1 - | otherwise = readMaybe (snd $ head annealingList) :: Maybe Int - - -- drift options - doDrift = any ((=="drift").fst) lcArgList - - driftList = filter ((=="drift").fst) lcArgList - driftRounds' - | length driftList > 1 = - errorWithoutStackTrace ("Multiple 'drift' rounds number specifications in swap command--can have only one: " ++ show inArgs) - | null driftList = Just 1 - | otherwise = readMaybe (snd $ head driftList) :: Maybe Int - - acceptEqualList = filter ((=="acceptequal").fst) lcArgList - acceptEqualProb - | length acceptEqualList > 1 = - errorWithoutStackTrace ("Multiple 'drift' acceptEqual specifications in swap command--can have only one: " ++ show inArgs) - | null acceptEqualList = Just 0.5 - | otherwise = readMaybe (snd $ head acceptEqualList) :: Maybe Double - - acceptWorseList = filter ((=="acceptworse").fst) lcArgList - acceptWorseFactor - | length acceptWorseList > 1 = - errorWithoutStackTrace ("Multiple 'drift' acceptWorse specifications in swap command--can have only one: " ++ show inArgs) - | null acceptWorseList = Just 1.0 - | otherwise = readMaybe (snd $ head acceptWorseList) :: Maybe Double - - maxChangesList = filter ((=="maxchanges").fst) lcArgList - maxChanges - | length maxChangesList > 1 = - errorWithoutStackTrace ("Multiple 'drift' maxChanges number specifications in swap command--can have only one: " ++ show inArgs) - | null maxChangesList = Just 15 - | otherwise = readMaybe (snd $ head maxChangesList) :: Maybe Int - - in - -- check inputs - if isNothing keepNum then errorWithoutStackTrace ("Keep specification not an integer in swap: " ++ show (head keepList)) - else if isNothing maxMoveEdgeDist' then errorWithoutStackTrace ("Maximum edge move distance specification not an integer (e.g. spr:2): " ++ show (head moveLimitList)) - else if isNothing steps' then errorWithoutStackTrace ("Annealing steps specification not an integer (e.g. steps:10): " ++ show (snd $ head stepsList)) - else if isNothing acceptEqualProb then errorWithoutStackTrace ("Drift 'acceptEqual' specification not a float (e.g. acceptEqual:0.75): " ++ show (snd $ head acceptEqualList)) - else if isNothing acceptWorseFactor then errorWithoutStackTrace ("Drift 'acceptWorse' specification not a float (e.g. acceptWorse:1.0): " ++ show (snd $ head acceptWorseList)) - else if isNothing maxChanges then errorWithoutStackTrace ("Drift 'maxChanges' specification not an integer (e.g. maxChanges:10): " ++ show (snd $ head maxChangesList)) - - else - (keepNum, maxMoveEdgeDist', steps', annealingRounds', doDrift, driftRounds', acceptEqualProb, acceptWorseFactor, maxChanges, lcArgList) diff --git a/pkg/PhyGraph/Search/WagnerBuild.hs b/pkg/PhyGraph/Search/WagnerBuild.hs deleted file mode 100644 index f90294dd6..000000000 --- a/pkg/PhyGraph/Search/WagnerBuild.hs +++ /dev/null @@ -1,234 +0,0 @@ -{- | -Module : WagnerBuild.hs -Description : Module specifying charcter-based Wagner tree building functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Search.WagnerBuild ( wagnerTreeBuild - , wagnerTreeBuild' - , rasWagnerBuild - ) where - -import Control.Parallel.Strategies -import qualified Data.List as L -import Data.Maybe -import qualified Data.Text.Lazy as TL -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified GraphOptimization.PreOrderFunctions as PRE -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import Types.Types -import qualified Utilities.LocalGraph as LG -import Utilities.Utilities as U - - --- import qualified ParallelUtilities as PU --need instance for VerexInfo - - --- | rasWagnerBuild generates a series of random addition sequences and then calls wagnerTreeBuild to construct them. --- Does not filter by best, unique etc. That happens with the select() command specified separately. -rasWagnerBuild :: GlobalSettings -> ProcessedData -> Int -> Int -> [PhylogeneticGraph] -rasWagnerBuild inGS inData rSeed numReplicates = - if numReplicates == 0 then [] - else - let numLeaves = V.length $ fst3 inData - randomizedAdditionSequences = V.fromList <$> shuffleInt rSeed numReplicates [0..numLeaves - 1] - - -- "graph" of leaf nodes without any edges - leafGraph = T.makeSimpleLeafGraph inData - leafDecGraph = T.makeLeafGraph inData - - hasNonExactChars = U.getNumberSequenceCharacters (thd3 inData) > 0 - in - trace ("\t\tBuilding " ++ (show numReplicates) ++ " character Wagner replicates") - -- seqParMap better for high level parallel stuff - -- PU.seqParMap PU.myStrategy (wagnerTreeBuild inGS inData) randomizedAdditionSequences - -- zipWith (wagnerTreeBuild inGS inData leafGraph leafDecGraph numLeaves hasNonExactChars) randomizedAdditionSequences [0..numReplicates - 1] `using` PU.myParListChunkRDS - PU.seqParMap rdeepseq (wagnerTreeBuild' inGS inData leafGraph leafDecGraph numLeaves hasNonExactChars) (zip randomizedAdditionSequences [0..numReplicates - 1]) - -- fmap (wagnerTreeBuild' inGS inData leafGraph leafDecGraph numLeaves hasNonExactChars) (zip randomizedAdditionSequences [0..numReplicates - 1]) `using` PU.myParListChunkRDS - - --- | wagnerTreeBuild' is a wrapper around wagnerTreeBuild to allow for better parallation--(zipWith not doing so well?) -wagnerTreeBuild' :: GlobalSettings -> ProcessedData -> SimpleGraph -> DecoratedGraph -> Int -> Bool -> (V.Vector Int, Int) -> PhylogeneticGraph -wagnerTreeBuild' inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars (additionSequence, replicateIndex) = - wagnerTreeBuild inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars additionSequence replicateIndex - --- | wagnerTreeBuild builds a wagner tree (Farris 1970--but using random addition seqeuces--not "best" addition) --- from a leaf addition sequence. Always produces a tree that can be converted to a soft/hard wired network --- afterwards --- basic procs is to add edges to unresolved tree --- currently naive wrt candidate tree costs -wagnerTreeBuild :: GlobalSettings -> ProcessedData -> SimpleGraph -> DecoratedGraph -> Int -> Bool -> V.Vector Int -> Int -> PhylogeneticGraph -wagnerTreeBuild inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars additionSequence replicateIndex = - trace ("\tBuilding Wagner replicate " ++ (show replicateIndex)) ( - let rootHTU = (numLeaves, TL.pack $ "HTU" ++ (show numLeaves)) - nextHTU = (numLeaves + 1, TL.pack $ "HTU" ++ (show $ numLeaves + 1)) - - edge0 = (numLeaves, (additionSequence V.! 0), 0.0) - edge1 = (numLeaves, numLeaves + 1, 0.0) - edge2 = (numLeaves + 1, (additionSequence V.! 1), 0.0) - edge3 = (numLeaves + 1, (additionSequence V.! 2), 0.0) - - initialTree = LG.insEdges [edge0, edge1, edge2, edge3] $ LG.insNodes [rootHTU, nextHTU] leafSimpleGraph - - blockCharInfo = V.map thd3 $ thd3 inData - - -- initialFullyDecoratedTree = T.multiTraverseFullyLabelTree inGS inData initialTree - -- False flag for staticIA--can't be done in build - calculateBranchLengths = False -- must be True for delata using existing edge - initialFullyDecoratedTree = PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False calculateBranchLengths hasNonExactChars numLeaves False $ T.postDecorateTree False initialTree leafDecGraph blockCharInfo numLeaves numLeaves - - wagnerTree = recursiveAddEdgesWagner (V.drop 3 $ additionSequence) numLeaves (numLeaves + 2) inGS inData hasNonExactChars leafDecGraph initialFullyDecoratedTree - in - -- trace ("Initial Tree:\n" ++ (LG.prettify initialTree) ++ "FDT at cost "++ (show $ snd6 initialFullyDecoratedTree) ++":\n" - -- ++ (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd6 initialFullyDecoratedTree)) - wagnerTree - ) - - --- | recursiveAddEdgesWagner adds edges until 2n -1 (n leaves) vertices in graph --- this tested by null additin sequence list --- interface will change with correct final states--using post-order pass for now -recursiveAddEdgesWagner ::V.Vector Int -> Int -> Int -> GlobalSettings -> ProcessedData -> Bool -> DecoratedGraph -> PhylogeneticGraph -> PhylogeneticGraph -recursiveAddEdgesWagner additionSequence numLeaves numVerts inGS inData hasNonExactChars leafDecGraph inGraph@(inSimple, _, inDecGraph, _, _, charInfoVV) = - -- all edges/ taxa in graph - -- trace ("To go " ++ (show additionSequence) ++ " verts " ++ (show numVerts)) ( - if null additionSequence then inGraph - else - -- edges/taxa to add, but not the edges that leads to outgroup--redundant with its sister edge - let outgroupEdges = filter ((< numLeaves) . snd3) $ LG.out inDecGraph numLeaves - edgesToInvade = (LG.labEdges inDecGraph) L.\\ outgroupEdges - leafToAdd = V.head additionSequence - - -- since this is apporximate--can get a bit off - -- candidateEditList = fmap (addTaxonWagner numVerts inGraph leafToAdd) edgesToInvade - candidateEditList = PU.seqParMap rdeepseq (addTaxonWagner numVerts inGraph leafToAdd) edgesToInvade - minDelta = minimum $ fmap fst4 candidateEditList - (_, nodeToAdd, edgesToAdd, edgeToDelete) = head $ filter ((== minDelta). fst4) candidateEditList - - -- create new tree - newSimple = LG.insEdges edgesToAdd $ LG.insNode nodeToAdd $ LG.delEdge edgeToDelete inSimple - - newSimple' = if V.length additionSequence == 1 then LG.rerootTree (outgroupIndex inGS) newSimple - else LG.rerootTree (outgroupIndex inGS) newSimple - -- else newSimple - - -- create fully labelled tree, if all taxa in do full multi-labelled for correct graph type - -- False flag for static IA--can't do when adding in new leaves - calculateBranchLengths = False -- must be True for delata using existing edge - newPhyloGraph = -- T.multiTraverseFullyLabelTree inGS inData leafDecGraph (Just numLeaves) newSimple' - if (V.length additionSequence > 1) then PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False calculateBranchLengths hasNonExactChars numLeaves False $ T.postDecorateTree False newSimple' leafDecGraph charInfoVV numLeaves numLeaves - else T.multiTraverseFullyLabelTree inGS inData leafDecGraph (Just numLeaves) newSimple' - - in - {- - let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (newVertexIndex - nOTUs))/fromIntegral (nOTUs - 2)) :: Double) - (percentAdded, _) = divMod (100 * ((numLeaves - 2) - (V.length additionSequence))) (numLeaves - 2) - (decileNumber, decileRemainder) = divMod percentAdded 10 - (_, oddRemainder) = divMod ((numLeaves - 2) - (V.length additionSequence)) 2 - in - --trace (show (percentAdded, decileNumber, decileRemainder)) ( - - if decileRemainder == 0 && oddRemainder == 0 then - trace ("\t\t"++ (show $ 10 * decileNumber) ++ "%") - recursiveAddEdgesWagner (V.tail additionSequence) numLeaves (numVerts + 1) inGS inData hasNonExactChars leafDecGraph newPhyloGraph - else - -} - recursiveAddEdgesWagner (V.tail additionSequence) numLeaves (numVerts + 1) inGS inData hasNonExactChars leafDecGraph newPhyloGraph - -- ) - --- | addTaxonWagner adds a taxon (really edges) by 'invading' and edge, deleting that adege and creteing 3 more --- to existing tree and gets cost (for now by postorder traversal--so wasteful but will be by final states later) --- returns a tuple of the cost, node to add, edges to add, edge to delete -addTaxonWagner :: Int - -> PhylogeneticGraph - -> Int - -> LG.LEdge EdgeInfo - -> (VertexCost, LG.LNode TL.Text, [LG.LEdge Double], LG.Edge) -addTaxonWagner numVerts (_, _, inDecGraph, _, _, charInfoVV) leafToAdd targetEdge = - let edge0 = (numVerts, leafToAdd, 0.0) - edge1 = (fst3 targetEdge, numVerts, 0.0) - edge2 = (numVerts, snd3 targetEdge, 0.0) - newNode = (numVerts, TL.pack ("HTU" ++ (show numVerts))) - - -- full post order - --newSimpleGraph = LG.insEdges [edge0, edge1, edge2] $ LG.insNode newNode $ LG.delEdge (LG.toEdge targetEdge) inSimple - --newCost = snd6 $ T.postDecorateTree newSimpleGraph leafDecGraph charInfoVV numLeaves - - -- heuristic delta - delta = getDelta leafToAdd targetEdge inDecGraph charInfoVV - - in - (delta, newNode, [edge0, edge1, edge2], LG.toEdge targetEdge) - -- (newCost, newNode, [edge0, edge1, edge2], LG.toEdge targetEdge) - - --- | getDelta estimates the delta in tree cost by adding a leaf taxon in Wagner build --- must be DO for this--isolated leaves won't have IA -getDelta :: Int -> LG.LEdge EdgeInfo -> DecoratedGraph -> V.Vector (V.Vector CharInfo) -> VertexCost -getDelta leafToAdd (eNode, vNode, _) inDecGraph charInfoVV = - let leafToAddVertData = vertData $ fromJust $ LG.lab inDecGraph leafToAdd - eNodeVertData = vertData $ fromJust $ LG.lab inDecGraph eNode - vNodeVertData = vertData $ fromJust $ LG.lab inDecGraph vNode - - -- create edge union 'character' blockData - -- filters gaps (True argument) because using DOm (as must) to add taxa not in IA framework - -- based on final assignments - -- not useing IA--False argument since no IA field for all leaves - edgeUnionVertData = M.createEdgeUnionOverBlocks False True eNodeVertData vNodeVertData charInfoVV [] - - in - -- trace ("GD: " ++ (show edgeUnionVertData)) ( - if (LG.lab inDecGraph leafToAdd == Nothing) || (LG.lab inDecGraph eNode == Nothing) || (LG.lab inDecGraph vNode == Nothing) then error ("Missing label data for vertices") - else - let -- Use edge union data for delta to edge data - dLeafEdgeUnionCost = sum $ fmap fst $ V.zipWith3 (PRE.getBlockCostPairsFinal DirectOptimization) leafToAddVertData edgeUnionVertData charInfoVV - - - -- should be able to use existing information--but for now using this - -- existingEdgeCost' = sum $ fmap fst $ V.zipWith3 (PRE.getBlockCostPairsFinal DirectOptimization) eNodeVertData vNodeVertData charInfoVV - in - -- trace ("Delta: " ++ (show (dLeafENode, dLeafVNode, existingEdgeCost))) - -- dLeafENode + dLeafVNode - existingEdgeCost - -- trace ("Delta: " ++ (show dLeafEdgeUnionCost) ++ " vs " ++ (show dLeafEVAddCost)) - - -- min dLeafEdgeUnionCost dLeafEVAddCost - -- dLeafEVAddCost - dLeafEdgeUnionCost - -- ) - - diff --git a/pkg/PhyGraph/Support/Support.hs b/pkg/PhyGraph/Support/Support.hs deleted file mode 100644 index 155e5fccf..000000000 --- a/pkg/PhyGraph/Support/Support.hs +++ /dev/null @@ -1,821 +0,0 @@ -{- | -Module : Support.hs -Description : Module containing support functions -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Support.Support ( supportGraph - ) where - -import qualified Commands.Verify as VER -import Control.Parallel.Strategies -import Data.Char -import qualified Data.List as L -import Data.Maybe -import qualified Data.Vector as V -import qualified Data.Vector.Unboxed as UV -import qualified Data.Vector.Generic as GV -import Debug.Trace -import GeneralUtilities -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified ParallelUtilities as PU -import qualified Reconciliation.ReconcileGraphs as REC -import qualified Search.Build as B -import qualified Search.NetworkAddDelete as N -import qualified Search.Refinement as R -import Text.Read -import Types.Types -import qualified Utilities.Distances as DD -import qualified Utilities.LocalGraph as LG - --- | driver for overall support -supportGraph :: [Argument] -> GlobalSettings -> ProcessedData -> Int -> [PhylogeneticGraph] -> [PhylogeneticGraph] -supportGraph inArgs inGS inData rSeed inGraphList = - if null inGraphList then error ("No graphs input to calculate support") - else - let fstArgList = fmap (fmap toLower . fst) inArgs - sndArgList = fmap (fmap toLower . snd) inArgs - lcArgList = zip fstArgList sndArgList - checkCommandList = checkCommandArgs "support" fstArgList VER.supportArgList - in - -- check for valid command options - if not checkCommandList then errorWithoutStackTrace ("Unrecognized command in 'support': " ++ show inArgs) - else - let doBootStrap = any ((=="bootstrap").fst) lcArgList - onlyBuild = any ((=="buildonly").fst) lcArgList - - jackList = filter ((=="jackknife").fst) lcArgList - jackFreq' - | length jackList > 1 = - errorWithoutStackTrace ("Multiple jackknife sampling frequency specifications in support command--can have only one (e.g. jackknife:0.62): " ++ show inArgs) - | null jackList = Just 0.6321 -- 1- 1/e - | null (snd $ head jackList) = Just 0.6321 - | otherwise = readMaybe (snd $ head jackList) :: Maybe Double - - replicateList = filter ((=="replicates").fst) lcArgList - replicates' - | length replicateList > 1 = - errorWithoutStackTrace ("Multiple resampling replicate specifications in support command--can have only one (e.g. replicates:100): " ++ show inArgs) - | null replicateList = Just 100 - | otherwise = readMaybe (snd $ head replicateList) :: Maybe Int - - goodBremList = filter ((`elem` ["goodmanbremer", "gb"]).fst) lcArgList - goodBremMethod - | length goodBremList > 1 = - errorWithoutStackTrace ("Multiple Goodman-Bremer method specifications in support command--can have only one (e.g. gb:tbr): " ++ show inArgs) - | null (snd $ head goodBremList) = Just "tbr" - | otherwise = Just $ snd $ head goodBremList - - goodBremSampleList = filter ((`elem` ["gbsample"]).fst) lcArgList - goodBremSample - | length goodBremSampleList > 1 = - errorWithoutStackTrace ("Multiple Goodman-Bremer sample specifications in support command--can have only one (e.g. gbsample:1000): " ++ show inArgs) - | null goodBremSampleList = Just (maxBound :: Int) - | otherwise = readMaybe (snd $ head goodBremSampleList) :: Maybe Int - - in - if isNothing jackFreq' then errorWithoutStackTrace ("Jacknife frequency not a float (e.g. jackknife:0.5) in support: " ++ show (snd $ head jackList)) - else if isNothing replicates' then errorWithoutStackTrace ("Resampling replicates specification not a string (e.g. replicates:100) in support: " ++ show (snd $ head replicateList)) - --else if isNothing goodBremMethod then errorWithoutStackTrace ("Goodman-Bremer method specification not a string (e.g. goodmanBremer:SPR) in support: " ++ (show (snd $ head goodBremList)) ++ (show lcArgList)) - else if isNothing goodBremSample then errorWithoutStackTrace ("Goodman-Bremer sample specification not an integer (e.g. gbsample:1000) in support: " ++ show (snd $ head goodBremSampleList)) - else - let thisMethod = if doBootStrap && (not . null) jackList && (null goodBremList) then trace ("Bootstrap and Jackknife specified--defaulting to Jackknife") "jackknife" - else if (doBootStrap || (not . null) jackList) && (not . null) goodBremList then trace ("Resampling (Bootstrap or Jackknife) and Goodman-Bremer specified--defaulting to Goodman-Bremer") "goodBrem" - else if doBootStrap then - "bootstrap" - else if (not . null) jackList then "jackknife" - else "goodBrem" - - gbSampleSize = if goodBremSample == Just (maxBound :: Int) then Nothing - else goodBremSample - - -- sample trees uniformly at random--or "nth" - gbRandomSample = if gbSampleSize /= Nothing then True -- any ((=="atrandom").fst) lcArgList - else False - - replicates = if fromJust replicates' < 0 then - trace ("Negative replicates number--defaulting to 100") - 100 - else fromJust replicates' - jackFreq = if fromJust jackFreq' <= 0 || fromJust jackFreq' >= 1.0 then - trace ("Jackknife frequency must be on (0.0, 1.0) defaulting to 0.6321") - 0.6321 - else fromJust jackFreq' - - buildOptions = [("distance",""), ("replicates", show (100 :: Int)), ("best", show (1 :: Int)), ("rdwag", ""), ("dWag", "")] -- [("replicates", show 10), ("best", show 1)] - swapOptions = if onlyBuild then [] - else [("tbr", ""), ("steepest", ""), ("keep", show (1 :: Int))] - supportGraphList = if thisMethod == "bootstrap" || thisMethod == "jackknife" then - let extraString = if thisMethod == "jackknife" then (" with delete fraction " ++ (show $ 1 - jackFreq)) - else "" - in - trace ("Generating " ++ thisMethod ++ " resampling support with " ++ (show replicates) ++ " replicates" ++ extraString) - [getResampleGraph inGS inData rSeed thisMethod replicates buildOptions swapOptions jackFreq] - else - let extraString = if gbSampleSize /= Nothing then (" based on " ++ (show $ fromJust gbSampleSize) ++ " samples at random") - else "" - in - trace ("Generating Goodman-Bremer support" ++ extraString) - fmap (getGoodBremGraphs inGS inData rSeed (fromJust goodBremMethod) gbSampleSize gbRandomSample) inGraphList - in - - supportGraphList - --- | getResampledGraphs performs resampling and search for bootstrap and jackknife support -getResampleGraph :: GlobalSettings -> ProcessedData -> Int -> String -> Int -> [(String, String)] -> [(String, String)] -> Double -> PhylogeneticGraph -getResampleGraph inGS inData rSeed resampleType replicates buildOptions swapOptions jackFreq = - let resampledGraphList = PU.seqParMap rdeepseq (makeResampledDataAndGraph inGS inData resampleType buildOptions swapOptions jackFreq) (take replicates $ randomIntList rSeed) -- `using` PU.myParListChunkRDS - -- create appropriate support graph >50% ? - -- need to add args - reconcileArgs = if graphType inGS == Tree then [("method","majority"), ("compare","identity"), ("edgelabel","true"), ("vertexlabel","true"), ("connect","true"), ("threshold","51"), ("outformat", "dot")] - else [("method","eun"), ("compare","identity"), ("edgelabel","true"), ("vertexlabel","true"), ("connect","true"), ("threshold","51"),("outformat", "dot")] - -- majority ruke consensus if no args - (_, reconciledGraph) = REC.makeReconcileGraph VER.reconcileArgList reconcileArgs (fmap fst6 resampledGraphList) - in - -- trace ("GRG: \n" ++ reconciledGraphString) ( - -- generate resampled graph - -- can't really relabel easily wihtout bv and maybe not necessary anyway--node numebrs inconsistent - (reconciledGraph, infinity, LG.empty, V.empty, V.empty, V.empty) - -- ) - --- | makeResampledDataAndGraph takes paramters, resmaples data and find a graph based on search parameters --- returning the resampled graph -makeResampledDataAndGraph :: GlobalSettings -> ProcessedData -> String -> [(String, String)] -> [(String, String)] -> Double -> Int -> PhylogeneticGraph -makeResampledDataAndGraph inGS inData resampleType buildOptions swapOptions jackFreq rSeed = - let randomIntegerList1 = randomIntList rSeed - -- create resampled data - newData = resampleData (randomIntegerList1 !! 0) resampleType jackFreq inData - - -- pairwise distances for distance analysis - pairwiseDistances = DD.getPairwiseDistances newData - - -- build graphs - buildGraphs = B.buildGraph buildOptions inGS newData pairwiseDistances (randomIntegerList1 !! 1) - bestBuildGraphList = GO.selectPhylogeneticGraph [("best", "")] 0 ["best"] buildGraphs - - -- if not a tree then try to add net edges - netAddArgs = [("netAdd", ""), ("keep", show (1 :: Int)), ("steepest", ""), ("atRandom", "")] - netGraphList = if (graphType inGS == Tree) then bestBuildGraphList - else R.netEdgeMaster netAddArgs inGS newData (randomIntegerList1 !! 2) bestBuildGraphList - - --simple swap refinement - swapGraphList = if null swapOptions then netGraphList - else R.swapMaster swapOptions inGS newData (randomIntegerList1 !! 3) netGraphList - in - -- no data in there - if (V.null . thd3) newData then emptyPhylogeneticGraph - else head swapGraphList - --- | resampleData perfoms a single randomized data resampling --- based on either with replacement (bootstrp) or without (jackknife) --- jackknife moves through processed data and cretes a new data set --- based on simple prob --- bootStrap draws chars from input directly copying--currently disabled --- if a block of data end up with zero resampled characters it is deleted -resampleData :: Int -> String -> Double -> ProcessedData -> ProcessedData -resampleData rSeed resampleType sampleFreq (nameVect, nameBVVect, blockDataVect) = - if V.null blockDataVect then error "Null input data in resampleData" - else - let lRandomIntegerList = randomIntList rSeed - in - --Bootstrap or Jackknife resampling - let newBlockDataVect' = if resampleType == "bootstrap" then V.zipWith resampleBlockBootstrap (V.fromList lRandomIntegerList) blockDataVect - else V.zipWith (resampleBlockJackknife sampleFreq) (V.fromList lRandomIntegerList) blockDataVect - -- filter any zero length blocks - newBlockDataVect = V.filter ((not . V.null) . thd3) newBlockDataVect' - in - (nameVect, nameBVVect, newBlockDataVect) - - --- | resampleBlockBootstrap takes BlockData and a seed and creates a Bootstrap resampled BlockData -resampleBlockBootstrap :: Int -> BlockData -> BlockData -resampleBlockBootstrap rSeed (nameText, charDataVV, charInfoV) = - let lRandomIntegerList = randomIntList rSeed - randomIntegerList2 = randomIntList (head lRandomIntegerList) - - -- maps over taxa in data bLock - (newCharDataVV, newCharInfoV) = V.unzip $ fmap (makeSampledPairVectBootstrap lRandomIntegerList randomIntegerList2 charInfoV) charDataVV - in - (nameText, newCharDataVV, V.head newCharInfoV) - --- | makeSampledPairVectBootstrap takes a list of Int and a vectors of charinfo and char data --- and returns new vectors of chardata and charinfo based on randomly sampled character indices --- this to create a bootstrap replicate of equal size --- this for a single taxon hecen pass teh random ints so same for each one -makeSampledPairVectBootstrap :: [Int] -> [Int] -> V.Vector CharInfo -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -makeSampledPairVectBootstrap fullRandIntlList randIntList inCharInfoVect inCharDataVect = - -- get character numbers and set resampling indices for dynamic characters--statric happen within those characters - let -- zip so can filter static and dynamic characters - dataInfoPairV = V.zip inCharDataVect inCharInfoVect - - -- filter static - (staticCharsV, staticCharsInfoV) = V.unzip $ V.filter ((`elem` exactCharacterTypes) . charType . snd ) dataInfoPairV - - -- filter dynamic - (dynamicCharsV, dynamicCharsInfoV) = V.unzip $ V.filter ((`notElem` exactCharacterTypes) . charType .snd ) dataInfoPairV - - numDynamicChars = V.length dynamicCharsV - dynCharIndices = fmap (randIndex numDynamicChars) (take numDynamicChars randIntList) - - in - -- trace ("MSPVB: " ++ (show $ length dynCharIndices) ++ " " ++ (show dynCharIndices)) ( - -- resample dynamic characters - -- straight resample - let resampleDynamicChars = V.map (dynamicCharsV V.!) (V.fromList dynCharIndices) - resmapleDynamicCharInfo = V.map (dynamicCharsInfoV V.!) (V.fromList dynCharIndices) - - -- static chars do each one mapping random choices within the character type - -- but keeping each one--hence char info is staticCharsInfoV - resampleStaticChars = V.zipWith (subSampleStatic fullRandIntlList) staticCharsV staticCharsInfoV - - in - -- cons the vectors for chrater data and character info - (resampleStaticChars V.++ resampleDynamicChars, staticCharsInfoV V.++ resmapleDynamicCharInfo) - -- ) - where randIndex a b = snd $ divMod (abs b) a - - --- | subSampleStatic takes a random int list and a static charcter --- bootstrap resamples that character based on ransom it list and number of "subcharacters" in character -subSampleStatic :: [Int] -> CharacterData -> CharInfo -> CharacterData -subSampleStatic randIntList inCharData inCharInfo = - let (a1, a2, a3) = rangePrelim inCharData - (na1, na2, na3) = stateBVPrelim inCharData - (pa1, pa2, pa3) = packedNonAddPrelim inCharData - m1 = matrixStatesPrelim inCharData - inCharType = charType inCharInfo - - charLength = if inCharType == Add then V.length a2 - else if inCharType == NonAdd then V.length na2 - else if inCharType == Matrix then V.length m1 - else error ("Dynamic character in subSampleStatic: " ++ (show inCharType)) - - -- get character indices based on number "subcharacters" - staticCharIndices = V.fromList $ fmap (randIndex charLength) (take charLength randIntList) - staticCharIndicesUV = UV.fromList $ fmap (randIndex charLength) (take charLength randIntList) - - - in - -- trace ("SSS:" ++ (show $ V.length staticCharIndices) ++ " " ++ (show staticCharIndices)) ( - if inCharType == Add then - inCharData {rangePrelim = (V.map (a1 V.!) staticCharIndices, V.map (a2 V.!) staticCharIndices, V.map (a3 V.!) staticCharIndices)} - - else if inCharType == NonAdd then - inCharData {stateBVPrelim = (V.map (na1 V.!) staticCharIndices, V.map (na2 V.!) staticCharIndices, V.map (na3 V.!) staticCharIndices)} - - else if inCharType `elem` packedNonAddTypes then - inCharData {packedNonAddPrelim = (UV.map (pa1 UV.!) staticCharIndicesUV, UV.map (pa2 UV.!) staticCharIndicesUV, UV.map (pa3 UV.!) staticCharIndicesUV)} - - else if inCharType == Matrix then - inCharData {matrixStatesPrelim = V.map (m1 V.!) staticCharIndices} - - else error ("Incorrect character type in subSampleStatic: " ++ show inCharType) - -- ) - where randIndex a b = snd $ divMod (abs b) a - --- | makeSampledCharCharInfoVect takes a vector of Int and a vector of charData and a vector of charinfo --- if teh data type is not static--the character is returns if Bool is True not otherwise --- if the char is static (add, non add, matrix) then the bool array is applied --- across the vetor of those characters (since they re vectors of charcters themselves --- returned as a pair of vectors (reversed--but shouldn't matter for resampling purposes) --- does not check if equal in length -makeSampledVect :: (GV.Vector v a) => [Bool] -> [a] -> v a -> v a -makeSampledVect boolList accumList inVect = - if GV.null inVect then - -- trace ("MSV R: " ++ (show $ length accumList)) - GV.fromList accumList - else - -- trace ("MSV: " ++ (show $ head boolList)) ( - if head boolList then makeSampledVect (tail boolList) ((GV.head inVect) : accumList) (GV.tail inVect) - - else makeSampledVect (tail boolList) accumList (GV.tail inVect) - -- ) - --- | makeSampledVect takes a liust of Bool and avector and returns those values --- with True as a vector (reversed--but shouldn't matter for resampling purposes) --- does not check if equal in length -makeSampledPairVect :: [Bool] -> [Bool] -> [CharacterData] -> [CharInfo] -> V.Vector CharInfo -> V.Vector CharacterData -> (V.Vector CharacterData, V.Vector CharInfo) -makeSampledPairVect fullBoolList boolList accumCharDataList accumCharInfoList inCharInfoVect inCharDataVect = - if V.null inCharInfoVect then (V.fromList accumCharDataList, V.fromList accumCharInfoList) - else - let firstCharInfo = V.head inCharInfoVect - firstCharData = V.head inCharDataVect - firstCharType = charType firstCharInfo - in - - -- straight resample if dynamic - if firstCharType `notElem` exactCharacterTypes - then - if head boolList then makeSampledPairVect fullBoolList (tail boolList) (firstCharData : accumCharDataList) (firstCharInfo : accumCharInfoList) (V.tail inCharInfoVect) (V.tail inCharDataVect) - - else makeSampledPairVect fullBoolList (tail boolList) accumCharDataList accumCharInfoList (V.tail inCharInfoVect) (V.tail inCharDataVect) - - -- static character--keep in sample, but need to sample in the vector - else - let (a1, a2, a3) = rangePrelim firstCharData - (na1, na2, na3) = stateBVPrelim firstCharData - (pa1, pa2, pa3) = packedNonAddPrelim firstCharData - m1 = matrixStatesPrelim firstCharData - in - if firstCharType == Add then - let newCharData = firstCharData {rangePrelim = (makeSampledVect fullBoolList [] a1, makeSampledVect fullBoolList [] a2, makeSampledVect fullBoolList [] a3)} - in - -- trace ("Length Add: " ++ (show $ V.length $ snd3 $ rangePrelim newCharData)) ( - if V.null (makeSampledVect fullBoolList [] a2) then makeSampledPairVect fullBoolList (tail boolList) accumCharDataList accumCharInfoList (V.tail inCharInfoVect) (V.tail inCharDataVect) - else makeSampledPairVect fullBoolList (tail boolList) (newCharData : accumCharDataList) (firstCharInfo : accumCharInfoList) (V.tail inCharInfoVect) (V.tail inCharDataVect) - -- ) - - else if firstCharType == NonAdd then - let newCharData = firstCharData {stateBVPrelim = (makeSampledVect fullBoolList [] na1, makeSampledVect fullBoolList [] na2, makeSampledVect fullBoolList [] na3)} - in - -- trace ("Length NonAdd: " ++ (show $ V.length $ snd3 $ stateBVPrelim newCharData)) ( - if V.null (makeSampledVect fullBoolList [] na2) then makeSampledPairVect fullBoolList (tail boolList) accumCharDataList accumCharInfoList (V.tail inCharInfoVect) (V.tail inCharDataVect) - else makeSampledPairVect fullBoolList (tail boolList) (newCharData : accumCharDataList) (firstCharInfo : accumCharInfoList) (V.tail inCharInfoVect) (V.tail inCharDataVect) - -- ) - - else if firstCharType `elem` packedNonAddTypes then - let newCharData = firstCharData {packedNonAddPrelim = (makeSampledVect fullBoolList [] pa1, makeSampledVect fullBoolList [] pa2, makeSampledVect fullBoolList [] pa3)} - in - -- trace ("Length NonAdd: " ++ (show $ V.length $ snd3 $ stateBVPrelim newCharData)) ( - if GV.null (makeSampledVect fullBoolList [] pa2) then makeSampledPairVect fullBoolList (tail boolList) accumCharDataList accumCharInfoList (GV.tail inCharInfoVect) (GV.tail inCharDataVect) - else makeSampledPairVect fullBoolList (tail boolList) (newCharData : accumCharDataList) (firstCharInfo : accumCharInfoList) (GV.tail inCharInfoVect) (GV.tail inCharDataVect) - -- ) - - else if firstCharType == Matrix then - let newCharData = firstCharData {matrixStatesPrelim = (makeSampledVect fullBoolList [] m1)} - in - if V.null (makeSampledVect fullBoolList [] m1) then makeSampledPairVect fullBoolList (tail boolList) accumCharDataList accumCharInfoList (V.tail inCharInfoVect) (V.tail inCharDataVect) - else makeSampledPairVect fullBoolList (tail boolList) (newCharData : accumCharDataList) (firstCharInfo : accumCharInfoList) (V.tail inCharInfoVect) (V.tail inCharDataVect) - - - else error ("Incorrect character type in makeSampledPairVect: " ++ show firstCharType) - --- | resampleBlockJackknife takes BlockData and a seed and creates a jackknife resampled BlockData -resampleBlockJackknife :: Double -> Int -> BlockData -> BlockData -resampleBlockJackknife sampleFreq rSeed (nameText, charDataVV, charInfoV) = - let randomIntegerList1 = randomIntList rSeed - randomIntegerList2 = randomIntList (head randomIntegerList1) - acceptanceList = fmap (randAccept sampleFreq) randomIntegerList1 - acceptanceList2 = fmap (randAccept sampleFreq) randomIntegerList2 - -- newCharInfoV = makeSampledVect acceptanceVect [] charInfoV - -- newCharDataV = fmap (makeSampledVect acceptanceVect []) charDataVV - (newCharDataVV, newCharInfoV) = V.unzip $ fmap (makeSampledPairVect acceptanceList acceptanceList2 [] [] charInfoV) charDataVV - - - in - trace ("RB length " ++ (show $ V.length charInfoV) ++ " -> " ++ (show $ V.length $ V.head newCharInfoV) ) - (nameText, newCharDataVV, V.head newCharInfoV) - - where randAccept b a = let (_, randVal) = divMod (abs a) 1000 - critVal = floor (1000 * b) - in - -- trace ("RA : " ++ (show (b,a, randVal, critVal, randVal < critVal))) - randVal < critVal - --- | getGoodBremGraphs performs Goodman-Bremer support --- examines complete SPR or TBR swap neighborhood chekcing for presence/absence of edges in input Graph List --- can do sample of trees either "nth" or at random if specified --- sample based on SPR-- 4n^2 - 26n - 42 for TBR 8n^3 for now --- this will only examine bridge edges for networks, networkedge values willl be doen via net delete --- MAPs for each graph? -getGoodBremGraphs :: GlobalSettings -> ProcessedData -> Int -> String -> Maybe Int -> Bool -> PhylogeneticGraph -> PhylogeneticGraph -getGoodBremGraphs inGS inData rSeed swapType sampleSize sampleAtRandom inGraph = - if LG.isEmpty (fst6 inGraph) then error ("Null graph in getGoodBremGraphs") -- maybe should be error? - else - -- create list of edges for input graph and a structure with egde node indices and bitvector values - -- requires index BV of each node - {- - let egdeList = LG.edges (fst6 inGraph) - - -- graph node list - nodeList = LG.labNodes (thd6 inGraph) - nodeIndexBVPairList = fmap makeindexBVPair nodeList - - -- list of vectors for contant time access via index = fst (a, bv) - nodeIndexBVPairVect= V.fromList nodeIndexBVPairList - - -- make tuple for each edge in each graph - -- (uIndex,vINdex,uBV, vBV, graph cost) - tupleList = makeGraphEdgeTuples nodeIndexBVPairVect infinity egdeList - -} - let tupleList = getGraphTupleList inGraph - - -- traverse neighborhood (and net edge removal) keeping min cost without edges - supportEdgeTupleList = getGBTuples inGS inData rSeed swapType sampleSize sampleAtRandom tupleList inGraph - - simpleGBGraph = LG.mkGraph (LG.labNodes $ fst6 inGraph) (fmap (tupleToSimpleEdge (snd6 inGraph)) supportEdgeTupleList) - in - -- trace ("GGBG: " ++ (show $ length tupleList) ++ " -> " ++ (show $ length supportEdgeTupleList)) - (simpleGBGraph, snd6 inGraph, thd6 inGraph, fth6 inGraph, fft6 inGraph, six6 inGraph) - - where tupleToSimpleEdge d (a,b, _, _, c) = (a, b, c - d) - --- | getGraphTupleList takes a graph and cost (maybe initialized to infinity) returns tuple list -getGraphTupleList :: PhylogeneticGraph -> [(Int, Int, NameBV, NameBV, VertexCost)] -getGraphTupleList inGraph = - if LG.isEmpty (fst6 inGraph) then error ("Null graph in getGraphTupleList") - else - let egdeList = LG.edges (fst6 inGraph) - - -- graph node list - nodeList = LG.labNodes (thd6 inGraph) - nodeIndexBVPairList = fmap makeindexBVPair nodeList - - -- list of vectors for contant time access via index = fst (a, bv) - nodeIndexBVPairVect= V.fromList nodeIndexBVPairList - - -- make tuple for each edge in each graph - -- (uIndex,vINdex,uBV, vBV, graph cost) - tupleList = makeGraphEdgeTuples nodeIndexBVPairVect infinity egdeList - in - tupleList - where makeindexBVPair (a,b) = (a, bvLabel b) - --- | getGBTuples takes a tuple list fomr graph containing initialized values and update those values based --- on each graph in the inGraph neigborhood --- first doess this via swap--for network does edge net edge in turn by removing using netDel -getGBTuples :: GlobalSettings - -> ProcessedData - -> Int - -> String - -> Maybe Int - -> Bool - -> [(Int, Int, NameBV, NameBV, VertexCost)] - -> PhylogeneticGraph - -> [(Int, Int, NameBV, NameBV, VertexCost)] -getGBTuples inGS inData rSeed swapType sampleSize sampleAtRandom inTupleList inGraph = - -- traverse swap (SPR/TBR) neighborhood optimizing each graph fully - let swapTuples = performGBSwap inGS inData rSeed swapType sampleSize sampleAtRandom inTupleList inGraph - - -- network edge support if not Tree - netTuples = if (graphType inGS == Tree) || (LG.isTree $ fst6 inGraph) then - -- swap only for Tree-do nothing - swapTuples - - -- SoftWired => delete edge -- could add net move if needed - else if graphType inGS == SoftWired then - PU.seqParMap rdeepseq (updateDeleteTuple inGS inData (LG.extractLeafGraph $ thd6 inGraph) inGraph) swapTuples -- `using` PU.myParListChunkRDS - - -- HardWired => move edge - else - PU.seqParMap rdeepseq (updateMoveTuple inGS inData (LG.extractLeafGraph $ thd6 inGraph) inGraph) swapTuples -- `using` PU.myParListChunkRDS - in - netTuples - --- | updateDeleteTuple take a graph and and edge and delete a network edge (or retunrs tuple if not network) --- if this were a HardWWired graph--cost would always go down, so only applied to softwired graphs -updateDeleteTuple :: GlobalSettings -> ProcessedData -> DecoratedGraph -> PhylogeneticGraph -> (Int, Int, NameBV, NameBV, VertexCost) -> (Int, Int, NameBV, NameBV, VertexCost) -updateDeleteTuple inGS inData leafGraph inGraph inTuple@(inE, inV, inEBV, inVBV, inCost) = - let isNetworkEdge = LG.isNetworkEdge (fst6 inGraph) (inE, inV) - in - if not isNetworkEdge then inTuple - else - -- True to force full evalutation - let deleteCost = snd6 $ N.deleteNetEdge inGS inData leafGraph inGraph True (inE, inV) - in - (inE, inV, inEBV, inVBV, min inCost deleteCost) - - --- | updateMoveTuple take a graph and and edge and moves a network edge (or returns tuple if not network) --- if this were a HardWWired graph--cost would always go down, so only applied to softwired graphs - -- max bound because its a place holder for max num net edges -updateMoveTuple :: GlobalSettings -> ProcessedData -> DecoratedGraph -> PhylogeneticGraph -> (Int, Int, NameBV, NameBV, VertexCost) -> (Int, Int, NameBV, NameBV, VertexCost) -updateMoveTuple inGS inData leafGraph inGraph inTuple@(inE, inV, inEBV, inVBV, inCost) = - let isNetworkEdge = LG.isNetworkEdge (fst6 inGraph) (inE, inV) - in - if not isNetworkEdge then inTuple - else - -- True to force full evalutation - let steepest = False - randomOrder = False - keepNum = 10 -- really could be one since sorted by cost, but just to make sure)Order - rSeed = 0 - saParams = Nothing - moveCost = minimum $ fmap snd6 $ N.deleteOneNetAddAll inGS inData leafGraph (maxBound :: Int) keepNum steepest randomOrder inGraph [(inE, inV)] rSeed saParams - in - (inE, inV, inEBV, inVBV, min inCost moveCost) - - - --- | performGBSwap takes parameters and graphs and traverses swap neighborhood --- examining each (or nth, or random) Graphs examining each ech in each graph for Goodman-Bremer --- optimality support -performGBSwap :: GlobalSettings - -> ProcessedData - -> Int - -> String - -> Maybe Int - -> Bool - -> [(Int, Int, NameBV, NameBV, VertexCost)] - -> PhylogeneticGraph - -> [(Int, Int, NameBV, NameBV, VertexCost)] -performGBSwap inGS inData rSeed swapType sampleSize sampleAtRandom inTupleList inGraph = - if LG.isEmpty (fst6 inGraph) then error ("Null graph in performGBSwap") - else - let -- work with simple graph - inSimple = fst6 inGraph - (firstRootIndex, _) = head $ LG.getRoots inSimple - - -- determine edges to break on--'bridge' edges only for network - -- filter out edges from root since no use--would just rejoin - breakEdgeList = if (graphType inGS) == Tree then filter ((/= firstRootIndex) . fst3) $ LG.labEdges inSimple - else filter ((/= firstRootIndex) . fst3) $ LG.getEdgeSplitList inSimple - - -- get random integer lists for swap - lRandomIntegerList = randomIntList rSeed - randomIntegerListList = fmap randomIntList lRandomIntegerList - - -- integerized critical value for prob accept - -- based on approx (leaves - netnodes)^2 or (leaves - netnodes)^3 - (_, leafList, _, netVertList) = LG.splitVertexList (fst6 inGraph) - intProbAccept = if swapType == "spr" then floor ((1000.0 * (fromIntegral $ fromJust sampleSize)) / ((2.0 * (fromIntegral $ (length leafList) - (length netVertList))) ** 2) :: Double) - else floor ((1000.0 * (fromIntegral $ fromJust sampleSize)) / ((2.0 * (fromIntegral $ (length leafList) - (length netVertList))) ** 3) :: Double) - - - -- generate tuple lists for each break edge parallelized at this level - tupleListList = PU.seqParMap rdeepseq (splitRejoinGB' inGS inData swapType intProbAccept sampleAtRandom inTupleList inSimple breakEdgeList) (zip randomIntegerListList breakEdgeList) -- `using` PU.myParListChunkRDS - - -- merge tuple lists--should all be in same order - newTupleList = mergeTupleLists (filter (not . null) tupleListList) [] - in - -- trace ("PGBS:" ++ (show $ fmap length tupleListList) ++ " -> " ++ (show $ length newTupleList)) - newTupleList - - --- | splitRejoinGB' is wrapper for splitRejoinGB to allow for seqParMap -splitRejoinGB' :: GlobalSettings - -> ProcessedData - -> String - -> Int - -> Bool - -> [(Int, Int, NameBV, NameBV, VertexCost)] - -> SimpleGraph - -> [LG.LEdge Double] - -> ([Int], LG.LEdge Double) - -> [(Int, Int, NameBV, NameBV, VertexCost)] -splitRejoinGB' inGS inData swapType intProbAccept sampleAtRandom inTupleList inGraph originalBreakEdgeList (inRandomIntegerList, breakEdge) = - splitRejoinGB inGS inData swapType intProbAccept sampleAtRandom inTupleList inGraph originalBreakEdgeList inRandomIntegerList breakEdge - --- | splitRejoinGB take parameters and splits input graph at specified edge and rejoins at all available edge --- (reroots the pruned subgraph if TBR) and creates and gets cost of graph (lazy takes care of post order only) --- with optimized graph, tuple list is creted and compared to input graph tuple list. --- original edge was (parentPrunedGraphRoot, prunedGraphRootIndex) --- working with SimpleGraph -splitRejoinGB :: GlobalSettings - -> ProcessedData - -> String - -> Int - -> Bool - -> [(Int, Int, NameBV, NameBV, VertexCost)] - -> SimpleGraph - -> [LG.LEdge Double] - -> [Int] - -> LG.LEdge Double - -> [(Int, Int, NameBV, NameBV, VertexCost)] -splitRejoinGB inGS inData swapType intProbAccept sampleAtRandom inTupleList inGraph originalBreakEdgeList inRandomIntegerList breakEdge = - - let - -- split graph on breakEdge - (splitGraph, _, prunedGraphRootIndex, _, _, edgeDeleteList) = LG.splitGraphOnEdge' inGraph breakEdge - - -- get edges in base graph to be invaded (ie not in pruned graph) - prunedGraphRootNode = (prunedGraphRootIndex, fromJust $ LG.lab splitGraph prunedGraphRootIndex) - (prunedSubTreeNodes, prunedSubTreeEdges) = LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] - edgesNotToInvade = ((LG.toEdge breakEdge) : edgeDeleteList) ++ (fmap LG.toEdge prunedSubTreeEdges) - edgesToInvade = filter (LG.notMatchEdgeIndices edgesNotToInvade) originalBreakEdgeList - - -- rejoin, evaluate, get better tuple - -- check if there are tbr-type rearrangements to do (rerooting pruned graph) - -- create TBR rerooot split graphs if required - splitGraphList = if (length prunedSubTreeNodes < 3) || swapType == "spr" then [splitGraph] - - -- generate "tbr" rerootings in split graph - else getTBRSplitGraphs inGS splitGraph breakEdge - - -- new random lists for rejoin - randomIntegerListList = fmap randomIntList inRandomIntegerList - - -- parallel at break level above - rejoinTupleListList = zipWith (rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList splitGraphList breakEdge) randomIntegerListList edgesToInvade - - -- merge tuples - newTupleList = mergeTupleLists rejoinTupleListList [] - in - newTupleList - --- | rejoinGB rejoins split graph at specific edge, id SPR then that's it, if TBR reroot pruned subgraph --- splitGraph is SimpleGraph --- the rejoin is SPR type relying on teh list lengt of split graph to present the TBR reroots -rejoinGB :: GlobalSettings - -> ProcessedData - -> Int - -> Bool - -> [(Int, Int, NameBV, NameBV, VertexCost)] - -> [SimpleGraph] - -> (LG.LEdge Double) - -> [Int] - -> LG.LEdge Double - -> [(Int, Int, NameBV, NameBV, VertexCost)] -rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList splitGraphList originalBreakEdge@(eBreak, _, _) randIntList edgeToInvade = - if null splitGraphList then inTupleList - else - let splitGraph = head splitGraphList - doGraph = if sampleAtRandom then - let (_, intRandVal) = divMod (abs (head randIntList)) 1000 - in - if intRandVal < intProbAccept then True - else False - else True - in - if doGraph then - let newGraph = LG.joinGraphOnEdge splitGraph edgeToInvade eBreak - pruneEdges = False - warnPruneEdges = False - startVertex = Nothing - newPhylogeneticGraph = if (graphType inGS == Tree) || (LG.isTree newGraph) then - T.multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex newGraph - else - if (not . LG.cyclic) newGraph && (not . LG.parentInChain) newGraph then T.multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex newGraph - else emptyPhylogeneticGraph - in - -- return original - if newPhylogeneticGraph == emptyPhylogeneticGraph then rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList (tail splitGraphList) originalBreakEdge (tail randIntList) edgeToInvade - - -- update tuple list based on new graph - else - let updatedTupleList = getLowerGBEdgeCost inTupleList newPhylogeneticGraph -- ((2 * numTaxa) -1) - in - rejoinGB inGS inData intProbAccept sampleAtRandom updatedTupleList (tail splitGraphList) originalBreakEdge (tail randIntList) edgeToInvade - - - -- return original - else rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList (tail splitGraphList) originalBreakEdge (tail randIntList) edgeToInvade - --- | mergeTupleLists takes a list of list of tuples and merges them choosing the better each recursive round -mergeTupleLists :: [[(Int, Int, NameBV, NameBV, VertexCost)]] -> [(Int, Int, NameBV, NameBV, VertexCost)] -> [(Int, Int, NameBV, NameBV, VertexCost)] -mergeTupleLists inTupleListList accumList = - if null inTupleListList then accumList - else - if null accumList then mergeTupleLists (tail inTupleListList) (head inTupleListList) - else - let firstTupleList = head inTupleListList - newTupleList = zipWith chooseBetterTuple firstTupleList accumList - in - mergeTupleLists (tail inTupleListList) newTupleList - --- | chooseBetterTuple takes two (Int, Int, NameBV, NameBV, VertexCost) and returns better cost -chooseBetterTuple :: (Int, Int, NameBV, NameBV, VertexCost) -> (Int, Int, NameBV, NameBV, VertexCost) -> (Int, Int, NameBV, NameBV, VertexCost) -chooseBetterTuple (aE, aV, aEBV, aVBV, aCost) (_, _, _, _, bCost) = (aE, aV, aEBV, aVBV, min aCost bCost) - --- | makeGraphEdgeTuples take node and edge,cost tuples from a graph and returns a list of tuples of the form --- (uIndex,vINdex,uBV, vBV, graph cost) --- this for edge comparisons for Goodman-Bremer and other optimality-type support -makeGraphEdgeTuples :: V.Vector (Int, NameBV) -> VertexCost -> [(Int, Int)] -> [(Int, Int, NameBV, NameBV, VertexCost)] -makeGraphEdgeTuples nodeBVVect graphCost edgeList = - -- trace ("MET: " ++ (show $ V.length nodeBVVect)) - fmap (make5Tuple nodeBVVect graphCost) edgeList - where make5Tuple nv c (a, b) = (a, b, snd (nv V.! a), snd (nv V.! b), c) - --- | getLowerGBEdgeCost take a list of edge tuples of (uIndex,vINdex,uBV, vBV, graph cost) from the graph --- whose supports are being calculated and a new graph and updates the edge cost (GB value) if that edge --- is NOT present in the graph taking the minimum of the original GB value and the new graph cost -getLowerGBEdgeCost :: [(Int, Int, NameBV, NameBV, VertexCost)] -> PhylogeneticGraph -> [(Int, Int, NameBV, NameBV, VertexCost)] -getLowerGBEdgeCost edgeTupleList inGraph = - if LG.isEmpty (fst6 inGraph) || null edgeTupleList then error ("Empty graph or null edge tuple list in getLowerGBEdgeCost") - else - let inGraphTupleList = getGraphTupleList inGraph - in - fmap (updateEdgeTuple (snd6 inGraph) inGraphTupleList) edgeTupleList - - --- | updateEdgeTuple checks is edge is NOT in input graph edge tuple list and if not takes minimum --- of edge cost GB value and in graph cost, else returns unchanged -updateEdgeTuple :: VertexCost -> [(Int, Int, NameBV, NameBV, VertexCost)] -> (Int, Int, NameBV, NameBV, VertexCost) -> (Int, Int, NameBV, NameBV, VertexCost) -updateEdgeTuple inGraphCost inGraphTupleList (uIndex, vIndex, uBV, vBV, edgeGBValue) = - let edgeNotFoundCost = getNotFoundCost uBV vBV inGraphCost inGraphTupleList - in - if edgeNotFoundCost == Nothing then (uIndex, vIndex, uBV, vBV, edgeGBValue) - else (uIndex, vIndex, uBV, vBV, min edgeGBValue (fromJust edgeNotFoundCost)) - --- | getNotFoundCost take a pair of BitVectors (of vertices in graph) from an edge --- and a list of (Int, Int, NameBV, NameBV, VertexCost) tuples and returns --- Nothing is the BVs of the two match (= signifying edge in graph) or --- Just graph cost if not present for Goodman-Bremer calculations -getNotFoundCost :: NameBV -> NameBV -> VertexCost -> [(Int, Int, NameBV, NameBV, VertexCost)] -> Maybe VertexCost -getNotFoundCost uBV vBV inTupleCost inTupleList = - if null inTupleList then Just inTupleCost - else - let (_, _, uInBV, vInBV, _) = head inTupleList - in - if uBV == uInBV && vBV == vInBV then Nothing - else getNotFoundCost uBV vBV inTupleCost (tail inTupleList) - - --- | getTBRSplitGraphs takes a split gaph and the original split edge and --- returns a list of rerooted subgrahs split graphs suitable for rejoining --- via SPR-type rejoin each to generate TBR neighborhood --- much of this is modified from Swap.hs but removing data and delta portions -getTBRSplitGraphs :: GlobalSettings -> SimpleGraph -> LG.LEdge Double -> [SimpleGraph] -getTBRSplitGraphs inGS splitGraph splitEdge = - if LG.isEmpty splitGraph then error ("Empty graph in getTBRSplitGraphs") - else - -- get edges in pruned graph and reroot on those edges that are 1) not from original "root" of prune - -- and 2) not network edges - let prunedGraphRootNode = (snd3 splitEdge, fromJust $ LG.lab splitGraph $ snd3 splitEdge) - edgesInPrunedSubGraph = snd $ LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] - - nonNetWorkEdgeList = if graphType inGS /= Tree then filter ((== False) . (LG.isNetworkLabEdge splitGraph)) edgesInPrunedSubGraph - else edgesInPrunedSubGraph - - -- original pruned root edges - prunedRootEdges = LG.out splitGraph $ fst prunedGraphRootNode - - -- edges available for rerooting - edgeAfterList = nonNetWorkEdgeList L.\\ prunedRootEdges - - -- get edges to add and delete for TBR rerooting - tbrEdits = fmap (getTBREdits splitGraph prunedGraphRootNode edgesInPrunedSubGraph) (fmap LG.toEdge edgeAfterList) - - -- TBR split graph list - tbrGraphList = fmap (LG.insertDeleteEdges splitGraph) tbrEdits - - in - splitGraph : tbrGraphList - --- | getTBREdits takes and edge and returns the list of edits to pruned subgraph --- as a pair of edges to add and those to delete --- since reroot edge is directed (e,v), edges away from v will have correct --- orientation. Edges between 'e' and the root will have to be flipped --- original root edges and reroort edge are deleted and new root and edge spanning orginal root created --- returns ([add], [delete]) --- modified from function in swap to be more general and operate on SimpleGraphs as are used here -getTBREdits :: (Eq a, Eq b) => LG.Gr a b -> LG.LNode a -> [LG.LEdge b] -> LG.Edge -> ([LG.LEdge b],[LG.Edge]) -getTBREdits inGraph prunedGraphRootNode edgesInPrunedSubGraph rerootEdge = - --trace ("Gettiung TBR Edits for " ++ (show rerootEdge)) ( - let prunedGraphRootIndex = fst prunedGraphRootNode - originalRootEdgeNodes = LG.descendants inGraph prunedGraphRootIndex - originalRootEdges = LG.out inGraph prunedGraphRootIndex - - -- get path from new root edge fst vertex to orginal root and flip those edges - closerToPrunedRootEdgeNode = (fst rerootEdge, fromJust $ LG.lab inGraph $ fst rerootEdge) - (nodesInPath, edgesinPath) = LG.postOrderPathToNode inGraph closerToPrunedRootEdgeNode prunedGraphRootNode - - -- don't want original root edges to be flipped since deleted - edgesToFlip = edgesinPath L.\\ originalRootEdges - flippedEdges = fmap LG.flipLEdge edgesToFlip - - -- dummyEdgeLabel so can be type "b" - dummyEdgeLabel = thd3 $ head edgesInPrunedSubGraph - - -- new edges on new root position and spanning old root - -- add in closer vertex to root to make sure direction of edge is correct - newEdgeOnOldRoot = if (snd3 $ head originalRootEdges) `elem` ((fst rerootEdge) : (fmap fst nodesInPath)) then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, dummyEdgeLabel) - else (snd3 $ last originalRootEdges, snd3 $ head originalRootEdges, dummyEdgeLabel) - newRootEdges = [(prunedGraphRootIndex, fst rerootEdge, dummyEdgeLabel),(prunedGraphRootIndex, snd rerootEdge, dummyEdgeLabel)] - - - in - -- original root edge so no change - if (fst rerootEdge) `elem` originalRootEdgeNodes && (snd rerootEdge) `elem` originalRootEdgeNodes then ([],[]) - - -- rerooted - else - -- delete orignal root edges and rerootEdge - -- add new root edges - -- and new edge on old root--but need orientation - -- flip edges from new root to old (delete and add list) - --trace ("\n\nIn Graph:\n"++ (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph) ++ "\nTBR Edits: " ++ (show (rerootEdge, prunedGraphRootIndex, fmap LG.toEdge flippedEdges)) - -- ++ "\nEdges to add: " ++ (show $ fmap LG.toEdge $ newEdgeOnOldRoot : (flippedEdges ++ newRootEdges)) ++ "\nEdges to delete: " ++ (show $ rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges)))) - (newEdgeOnOldRoot : (flippedEdges ++ newRootEdges), rerootEdge : (fmap LG.toEdge (edgesToFlip ++ originalRootEdges))) - -- ) diff --git a/pkg/PhyGraph/Types/Types.hs b/pkg/PhyGraph/Types/Types.hs deleted file mode 100644 index 800574712..000000000 --- a/pkg/PhyGraph/Types/Types.hs +++ /dev/null @@ -1,625 +0,0 @@ -{- | -Module : Types.hs -Description : Module specifying data types -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE DerivingStrategies #-} - -module Types.Types where - -import Control.DeepSeq -import Data.Alphabet -import qualified Data.TCM as TCM -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.MetricRepresentation as MR -import qualified Data.TCM.Dense as TCMD -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Data.Word -import Foreign.C.Types (CUInt) -import GHC.Generics -import qualified SymMatrix as S -import qualified Utilities.LocalGraph as LG -import qualified Data.InfList as IL - --- | Debug Flag -isDebug :: Bool -isDebug = False - --- | Program Version -pgVersion :: String -pgVersion = "0.1" - --- | used for comparing graph costs and edge lengths that are Double -epsilon :: Double -epsilon = 0.0001 - - --- | infinity is a large Double for use with Graph costs -infinity :: Double -infinity = (read "Infinity") :: Double - --- |maxAddStatesToRecode maximum size of addditive charcater to recode into ---non-additive charcaters 65 can fit in 4 Word64 since nstates - 1 binaries --- prob could be bigger based on cost of optimizing additive versus but this --- seems a reasonale number (prob should be timed to verify) -maxAddStatesToRecode :: Int -maxAddStatesToRecode = 129 - --- | Types for timed searches -type Days = Int -type Hours = Int -type Minutes = Int -type Seconds = Int -type Time = (Days, Hours, Minutes, Seconds) - - --- | Command types --- data Argument = String | Double | Int | Bool | Time --- deriving stock (Show, Eq) -type Argument = (String, String) - - ---For rename format rename:(a,b,c,...,y,z) => a-y renamed to z -data Instruction = NotACommand | Build | Fuse | Read | Reblock | Refine | Rename | Report | Run | Select | Set | Swap | Search | Support | Transform - deriving stock (Show, Eq, Ord) - --- | Node variety -data NodeType = RootNode | LeafNode | TreeNode | NetworkNode - deriving stock (Show, Eq) - --- | Edge types -data EdgeType = NetworkEdge | TreeEdge | PendantEdge - deriving stock (Show, Eq, Ord) - --- | Command type structure -type Command = (Instruction, [Argument]) - --- | CharType data type for input characters -data CharType = Add | NonAdd | Matrix | SlimSeq | WideSeq | HugeSeq | NucSeq | AminoSeq | AlignedSlim | AlignedWide | AlignedHuge | - Packed2 | Packed4 | Packed5 | Packed8 | Packed64 - deriving stock (Read, Show, Eq) - --- non additive bit packed types (64 not really 'packed' but treated as if were) --- these are not entered but are created by transforming existing non-additive characters -packedNonAddTypes :: [CharType] -packedNonAddTypes = [Packed2, Packed4, Packed5, Packed8, Packed64] - --- aligned not in here because they are not reorganized, and would screw up reroot optimization -exactCharacterTypes :: [CharType] -exactCharacterTypes = [Add, NonAdd, Matrix] ++ packedNonAddTypes - --- | types for character classes -nonExactCharacterTypes :: [CharType] -nonExactCharacterTypes = [SlimSeq, WideSeq, HugeSeq, NucSeq, AminoSeq] -- , AlignedSlim, AlignedWide, AlignedHuge] - --- prealigned types -prealignedCharacterTypes :: [CharType] -prealignedCharacterTypes = [AlignedSlim, AlignedWide, AlignedHuge] - --- sequence types -sequenceCharacterTypes :: [CharType] -sequenceCharacterTypes = nonExactCharacterTypes ++ prealignedCharacterTypes - --- | Graph types for searching etc. Can be modified by 'Set command --- HardWired and SoftWired are network types --- 'Tree' would be a single tree in the sense as produced by typical phylogentic ---seqrch programs--no forests -data GraphType = Tree | HardWired | SoftWired - deriving stock (Show, Eq) - --- | Optimality criterion sets the cost function for graphs and potentially models --- likelihood form is the "self Information" in context of Kolmogorov complexity/MDL/PMDL -data OptimalityCriterion = Parsimony | PMDL | Likelihood - deriving stock (Show, Eq) - -data GraphFactor = NoNetworkPenalty | Wheeler2015Network | Wheeler2023Network | PMDLGraph - deriving stock (Show, Eq) - -data RootCost = NoRootCost | Wheeler2015Root | PMDLRoot | MLRoot - deriving stock (Show, Eq) - --- | Method for makeing final seqeujnce charcatert states assignment --- do an DO-based method--more exact but higher time complexity--single preorder --- pass but worst cae O(n^2) in seqeunce length --- or assign based on Implied alignment --requires additional post/pre order --- traversal but is linear in sequence length -data AssignmentMethod = DirectOptimization | ImpliedAlignment - deriving stock (Show, Eq) - -data SearchData - = SearchData - { instruction :: Instruction - , arguments :: [Argument] - , minGraphCostIn :: VertexCost - , maxGraphCostIn :: VertexCost - , numGraphsIn :: Int - , minGraphCostOut :: VertexCost - , maxGraphCostOut :: VertexCost - , numGraphsOut :: Int - , commentString :: String - , duration :: Int - } deriving stock (Show, Eq) - - -data GlobalSettings - = GlobalSettings - { outgroupIndex :: Int -- Outgroup terminal index, default 0 (first input leaf) - , outGroupName :: T.Text -- Outgroup name - , optimalityCriterion :: OptimalityCriterion - , graphType :: GraphType - , compressResolutions :: Bool -- "nub" resolutions in softwired graph - , finalAssignment :: AssignmentMethod - , graphFactor :: GraphFactor -- net penalty/graph complexity - , partitionCharacter :: String -- 'character' for mparitioning seqeunce data into homologous sections'--checks for length == 1 later - , rootCost :: RootCost - , rootComplexity :: VertexCost -- complexity of root in bits per root for PMDL/ML calculations - , graphComplexityList :: IL.InfList (VertexCost, VertexCost) --complexity of graphs in bits, index for number of network nodes (0= tree etc0 lazy so only evaluate each once when needed O(n) but needlazyness and permanence - , modelComplexity :: Double -- model cost for PMDL, 0.0 for other criteria - , seed :: Int -- random seed - , searchData :: [SearchData] - , numDataLeaves :: Int --number of leaves set after data processing--for conveniance really - , bc2 :: (Double, Double) -- PMDL bitCost for 2 states of no-change and change as pair - , bc4 :: (Double, Double) -- PMDL bitCost for 4 states of no-change and change as pair - , bc5 :: (Double, Double) -- PMDL bitCost for 5 states of no-change and change as pair - , bc8 :: (Double, Double) -- PMDL bitCost for 8 states of no-change and change as pair - , bc64 :: (Double, Double) -- PMDL bitCost for 64 states of no-change and change as pair - , bcgt64 :: (Double, Double) -- PMDL bitCost for > 64 states of no-change and change as pair - , dynamicEpsilon :: Double -- factor of dynamic heuristics overestimating graph deltas detemiend by fraction of data is dynamic and user value - } deriving stock (Show, Eq) - -instance NFData GlobalSettings where rnf x = seq x () - - --- | CharInfo information about characters --- null values for these are in Input.FastAC.hs --- TCMD.DenseTransitionCostMatrix => genDiscreteDenseOfDimension (length alphabet) --- MR.MetricRepresentation Word64 => metricRepresentation <$> TCM.fromRows [[0::Word]] --- MR.MetricRepresentation BV.BitVector => metricRepresentation <$> TCM.fromRows [[0::Word]] --- changeCost and noChange costs are for PMDL costs for packed/non-additive character for --- other character types the cost matrix holds this information in comvcert with weight since the matrix values are integers -data CharInfo = CharInfo { name :: NameText - , charType :: CharType - , activity :: Bool - , weight :: Double - , costMatrix :: S.Matrix Int - , slimTCM :: TCMD.DenseTransitionCostMatrix - , wideTCM :: MR.MetricRepresentation Word64 - , hugeTCM :: MR.MetricRepresentation BV.BitVector - , changeCost :: Double - , noChangeCost :: Double - , alphabet :: Alphabet ST.ShortText - , prealigned :: Bool - , origInfo :: V.Vector (NameText, CharType, Alphabet ST.ShortText) - } deriving stock (Show, Eq) - -instance NFData CharInfo where rnf x = seq x () -instance Ord CharInfo where x `compare` y = (show x) `compare` (show y) - - --- | Types for vertex information -type VertexCost = Double -type StateCost = Int -type VertexIndex = Int - --- | index of child vertices -type ChildStateIndex = Int - --- | unique bitvector labelling for vertex based on descednent labellings --- these labels are used for caching, left/right DO optimizaiton --- thery should be label invariant --- a hash of sorted input data for leaves --- will need a map from NameBV to T.Text name (or not) -type NameBV = BV.BitVector - --- | Human legibale name for vertices, characters, and Blocks -type NameText = T.Text - --- | TYpes for Matrix/Sankoff characters - -- Triple contains info from left and right child--could be only one - -- use fst then - -- finals state also vector of triple--but need to keep min cost - -- for final assignments - -- filter by local costVect to get 'best' states an each node -type MatrixTriple = (StateCost, [ChildStateIndex], [ChildStateIndex]) - --- Only date here that varies by vertex, rest inglobal character info --- vectors so all data of single type can be grouped together --- will need to add masks for bit-packing non-additive chars --- may have to add single assignment for hardwired and IP optimization --- for approximate sakoff (DO-like) costs can use stateBVPrelim/stateBVFinal --- for matrix/Saknoff characters-- Vector of vector of States - --BUT all with same cost matrix/tcm --- triples (add, no-add, sequence) are to keep children of vertex states for pre-order pass --- order is always (left parent median, median, right parent median) --- do not need for matrix since up pass is a traceback from parent --- sequence characters are a vector of bitvectors--so only a single seqeunce character --- per "charctaer" this is so the multi-traversal can take place independently for each --- sequence character, creating a properly "rooted" tree/graph for each non-exact seqeunce character --- prelim is created from gapped, final from (via 3-way minimization) parent final and child alignment (2nd and 3rd fields). --- th ea'alignment' fields hold the im plied alignment data -data CharacterData = CharacterData { stateBVPrelim :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -- preliminary for Non-additive chars, Sankoff Approx - -- for Non-additive ans Sankoff/Matrix approximate state - , stateBVFinal :: V.Vector BV.BitVector - -- for Additive - , rangePrelim :: (V.Vector (Int, Int), V.Vector (Int, Int), V.Vector (Int, Int)) - , rangeFinal :: V.Vector (Int, Int) - -- for multiple Sankoff/Matrix with slim tcm - , matrixStatesPrelim :: V.Vector (V.Vector MatrixTriple) - , matrixStatesFinal :: V.Vector (V.Vector MatrixTriple) - -- preliminary for m,ultiple seqeunce chars with same TCM - , slimPrelim :: SV.Vector CUInt - -- gapped medians of left, right, and preliminary used in preorder pass - , slimGapped :: (SV.Vector CUInt, SV.Vector CUInt, SV.Vector CUInt) - , slimAlignment :: (SV.Vector CUInt, SV.Vector CUInt, SV.Vector CUInt) - , slimFinal :: SV.Vector CUInt - , slimIAPrelim :: (SV.Vector CUInt, SV.Vector CUInt, SV.Vector CUInt) - , slimIAFinal :: SV.Vector CUInt - -- vector of individual character costs (Can be used in reweighting-ratchet) - , widePrelim :: UV.Vector Word64 - -- gapped median of left, right, and preliminary used in preorder pass - , wideGapped :: (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) - , wideAlignment :: (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) - , wideFinal :: UV.Vector Word64 - , wideIAPrelim :: (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) - , wideIAFinal :: UV.Vector Word64 - -- vector of individual character costs (Can be used in reweighting-ratchet) - , hugePrelim :: V.Vector BV.BitVector - -- gapped medians of left, right, and preliminary used in preorder pass - , hugeGapped :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) - , hugeAlignment :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) - , hugeFinal :: V.Vector BV.BitVector - , hugeIAPrelim :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) - , hugeIAFinal :: V.Vector BV.BitVector - - -- vectors for pre-aligned sequences also used in static approx - , alignedSlimPrelim :: (SV.Vector CUInt, SV.Vector CUInt, SV.Vector CUInt) - , alignedSlimFinal :: SV.Vector CUInt - , alignedWidePrelim :: (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) - , alignedWideFinal :: UV.Vector Word64 - , alignedHugePrelim :: (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) - , alignedHugeFinal :: V.Vector BV.BitVector - - -- coiuld be made Storable later is using C or GPU/Accelerate - , packedNonAddPrelim :: (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) - , packedNonAddFinal :: UV.Vector Word64 - - -- vector of individual character costs (Can be used in reweighting-ratchet) - , localCostVect :: V.Vector StateCost - -- weight * V.sum localCostVect - , localCost :: VertexCost - -- unclear if need vector version - , globalCost :: VertexCost - } deriving stock (Show, Eq, Generic) - -instance NFData CharacterData where rnf x = seq x () - - --- | - --- | type TermData type contains termnal name and list of characters --- characters as ShortText to save space on input -type TermData = (NameText, [ST.ShortText]) -type LeafData = (NameText, V.Vector CharacterData) - --- | VertexBlockData vector over blocks of character data in block (Vector) --- blocks of character data for a given vertex -type VertexBlockData = V.Vector (V.Vector CharacterData) - --- | VertexBlockDataMaybe vector over maybe blocks of character data in block (Vector) --- blocks of character data for a given vertex -type VertexBlockDataMaybe = V.Vector (V.Vector (Maybe CharacterData)) - - --- | ResolutionData contains vertex information for soft-wired network components --- these are used in the idenitification of minimal cost display trees for a block of --- data that follow the same display tree -type ResolutionVertexData = V.Vector ResolutionBlockData - --- | ResolutionBlockData contains a list of ResolutionData --- this list contains all the potential resolutions of a softwired --- networtk vertex -type ResolutionBlockData = V.Vector ResolutionData - --- | ResolutionData contains individual block information for a given resoluton of soft-wired network components --- these are used in the idenitification of minimal cost display trees for a block of --- data that follow the same display tree --- nodes are VertexInfo for ease of conversion--but nthe info is largely bogus and not to be trusted, same with EdgeInfo -data ResolutionData = ResolutionData { displaySubGraph :: ([LG.LNode VertexInfo], [LG.LEdge EdgeInfo]) -- holds the post-order display sub-tree for the block - , displayBVLabel :: NameBV -- For comparison of vertices subtrees, left/right, anmd root leaf inclusion - , displayData :: V.Vector CharacterData -- data for characters in block - -- list of left, right resolution indices to create current index, used in traceback to get prelminary states - -- and in compressing reolutions to keep only those that result in differnet preliminary states - -- but allowing traceback of resoliutions to get preliminary states - , childResolutions :: [(Maybe Int, Maybe Int)] - , resolutionCost :: VertexCost -- cost of creating the resolution - , displayCost :: VertexCost -- cost of that display subtree - } deriving stock (Show, Eq) - -instance NFData ResolutionData where rnf x = seq x () - - --- | VertexInfo type -- vertex information for Decorated Graph -data VertexInfo = VertexInfo { index :: Int -- For accessing - , bvLabel :: NameBV -- For comparison of vertices subtrees, left/right - , parents :: V.Vector Int --indegree indices - , children :: V.Vector Int -- outdegree indices - , nodeType :: NodeType -- root, leaf, network, tree - , vertName :: NameText --Text name of vertex either input or HTU# - , vertData :: VertexBlockData -- data as vector of blocks (each a vector of characters) - , vertexResolutionData :: V.Vector ResolutionBlockData -- soft-wired network component resolution information for Blocks - , vertexCost :: VertexCost -- local cost of vertex - , subGraphCost :: VertexCost -- cost of graph to leaves from the vertex - } deriving stock (Generic, Show, Eq) - -instance NFData VertexInfo where rnf x = seq x () - --- | type edge data, source and sink node indices are fst3 and snd3 fields. -data EdgeInfo = EdgeInfo { minLength :: VertexCost - , maxLength :: VertexCost - , midRangeLength :: VertexCost - , edgeType :: EdgeType - } deriving stock (Show, Eq, Ord) - -instance NFData EdgeInfo where rnf x = seq x () - --- | DecortatedGraph is the canonical graph contining all final information --- from preorder traversal trees --- and post-order info usually from an initial root-based traversal -type DecoratedGraph = LG.Gr VertexInfo EdgeInfo - --- | Type BLockDisplayTree is a Forest of tree components (indegree, outdegree) = (0,1|2),(1,2),(1,0) --- these are "resolved" from more general graphs --- will have to allow for indegre=outdegree=1 for dispaly tree generation and reconciliation --- the vertData field will always have a single Bloclk--teh vecor of blocks will be a vector of --- DecoratedGraphs. These woulod better have a single Vector of cChracter info as --- opposed to the Decorated Tree type, but the record naming and use gets screwed up. --- type BlockDisplayForest = LG.Gr VertexInfo EdgeInfo - --- | DecoratedGraph is a forest of tree compnents for a single character --- this is used for non-exact character traversal trees --- there will always be only a single block and single character even though --- expresed as Vector fo Vector of Chaarcters. Would be better as a single character as --- opposed to the Decorated Tree type, but the record naming and use gets screwed up. --- type DecoratedGraph = LG.Gr VertexInfo EdgeInfo - - --- | type RawGraph is input graphs with leaf and edge labels -type SimpleGraph = LG.Gr NameText Double - --- | Type phylogentic Graph is a graph with --- cost, optimality value, --- block display trees, character traversal "foci" (could have multiple) --- Data optimizations exist in Processed Data --- Question of wheterh traversal foci shold be in Graph or Data section --- for now in Graph but could easiily be moved to Processed data --- May need "EdgeData" like VertexData for heuristics. UNclear if local scope during SPR/TBR will do it. --- Fields: --- 1) "Simple" graph with fileds useful for outputting graphs --- 2) Graph optimality value or cost --- 3) Decorated Graph with optimized vertex/Node data --- 4) Vector of display trees for each data Block --- root and vertex costs not updated in rerooting so cannot be trusted --- Each block can have multiple disdpaly trees so extra vector there --- 5) Vector of traversal foci for each character (Vector of Blocks -> Vector of Characters, a single tree for each character) --- vector is over blocks, then characters (could have have multiple for each character, but only single tracked here) --- only important for dynamic (ie non-exact) characters whose costs depend on traversal focus --- one graph per character --- 6) Vector of Block Character Information (whihc is a Vector itself) required to properly optimize characters -type PhylogeneticGraph = (SimpleGraph, VertexCost, DecoratedGraph, V.Vector [DecoratedGraph], V.Vector (V.Vector DecoratedGraph), V.Vector (V.Vector CharInfo)) - - --- | RawData type processed from input to be passed to characterData --- to recode into usable form --- the format is tuple of a list of taxon-data list tuples and charinfo list. --- the data list and charinfo list must have the same length -type RawData = ([TermData], [CharInfo]) - --- | Processed data is the basic data structire for analysis --- can be input to functions --- based on "blocks" that follow same display tree (soft-wired network) --- each block has a set of characters (for each vertex eventually) and character info --- the vector of T.Text are the names--leaves form input, internal HTU ++ (show index) --- ablock are initialy set bu input file, and can be later changed by "set(block,...)" --- command --- "Naive" "Optimized" and "Transformed" darta are this type after different processing steps --- the first and second vectors are size number of leaves, teh third is number of blocks -type ProcessedData = (V.Vector NameText, V.Vector NameBV, V.Vector BlockData) - --- | Block data is the basic data unit that is optimized on a display tree --- Block data contain data fo all leaves and all characters in the block --- it is row, ie vertex dominant --- it has a bitvector name derived from leaf bitvector labels (union of children) --- the bitvector name can vary among blocks (for non-leaves) due to alternate display trees --- a vector of characterer data where assignments and costs reside, and a vector of character info --- leaves will alwasy be first (indices 0..n-1) for simpler updating of data during graph optimization --- NameText is the block label used for assignment and reporting output --- Initially set to input filename of character --- Fields: --- 1) name of the block--intially taken from input filenames --- 2) vector of vertex/leaf data with vector of character data for each leaf --- 3) Vector of character information for characters in the block -type BlockData = (NameText, V.Vector (V.Vector CharacterData), V.Vector CharInfo) - --- | type SAParams parameter structure for simulated alnnealing and Drifting -data SimulatedAnnealingMethod = SimAnneal | Drift - deriving stock (Read, Show, Eq) - -data SAParams = SAParams { method :: SimulatedAnnealingMethod - , numberSteps :: Int - , currentStep :: Int - , randomIntegerList :: [Int] - , rounds :: Int - , driftAcceptEqual :: Double - , driftAcceptWorse :: Double - , driftMaxChanges :: Int - , driftChanges :: Int - } deriving stock (Show, Eq) - -instance NFData SAParams where rnf x = seq x () - --- | empty structures for convenient use - --- | emptyGlobalSettings for early use in parition charcter. Can't have full due to data dependency of outpogroup name -emptyGlobalSettings :: GlobalSettings -emptyGlobalSettings = GlobalSettings { outgroupIndex = 0 - , outGroupName = T.pack "NoOutgroupSet" - , optimalityCriterion = Parsimony - , graphType = Tree - , compressResolutions = True - , finalAssignment = DirectOptimization - , graphFactor = Wheeler2015Network - , rootCost = NoRootCost - , rootComplexity = 0.0 - , graphComplexityList = IL.repeat (0.0, 0.0) - , modelComplexity = 0.0 - , seed = 0 - , searchData = [] - , partitionCharacter = "#" - , numDataLeaves = 0 - , bc2 = (0.0,1.0) - , bc4 = (0.0,1.0) - , bc5 = (0.0,1.0) - , bc8 = (0.0,1.0) - , bc64 = (0.0,1.0) - , bcgt64 = (0.0,1.0) - , dynamicEpsilon = 1.02 - } - --- | emptyPhylogeneticGraph specifies and empty phylogenetic graph --- important cost is infinity for filtering operations -emptyPhylogeneticGraph :: PhylogeneticGraph -emptyPhylogeneticGraph = (LG.empty, infinity, LG.empty, V.empty, V.empty, V.empty) - --- | emptyCharcater useful for intialization and missing data -emptyCharacter :: CharacterData -emptyCharacter = CharacterData { stateBVPrelim = (mempty, mempty, mempty) -- preliminary for Non-additive chars, Sankoff Approx - , stateBVFinal = mempty - -- for Additive - , rangePrelim = (mempty, mempty, mempty) - , rangeFinal = mempty - -- for multiple Sankoff/Matrix with sme tcm - , matrixStatesPrelim = mempty - , matrixStatesFinal = mempty - -- preliminary for m,ultiple seqeunce cahrs with same TCM - , slimPrelim = mempty - -- gapped medians of left, right, and preliminary used in preorder pass - , slimGapped = (mempty, mempty, mempty) - , slimAlignment = (mempty, mempty, mempty) - , slimFinal = mempty - , slimIAPrelim = (mempty, mempty, mempty) - , slimIAFinal = mempty - -- gapped median of left, right, and preliminary used in preorder pass - , widePrelim = mempty - -- gapped median of left, right, and preliminary used in preorder pass - , wideGapped = (mempty, mempty, mempty) - , wideAlignment = (mempty, mempty, mempty) - , wideFinal = mempty - , wideIAPrelim = (mempty, mempty, mempty) - , wideIAFinal = mempty - -- vector of individual character costs (Can be used in reweighting-ratchet) - , hugePrelim = mempty - -- gapped mediasn of left, right, and preliminary used in preorder pass - , hugeGapped = (mempty, mempty, mempty) - , hugeAlignment = (mempty, mempty, mempty) - , hugeFinal = mempty - , hugeIAPrelim = (mempty, mempty, mempty) - , hugeIAFinal = mempty - -- vectors for pre-aligned sequences also used in static approx - , alignedSlimPrelim = (mempty, mempty, mempty) - , alignedSlimFinal = mempty - , alignedWidePrelim = (mempty, mempty, mempty) - , alignedWideFinal = mempty - , alignedHugePrelim = (mempty, mempty, mempty) - , alignedHugeFinal = mempty - - , packedNonAddPrelim = (mempty, mempty, mempty) - , packedNonAddFinal = mempty - - -- vector of individual character costs (Can be used in reweighting-ratchet) - , localCostVect = V.singleton 0 - -- weight * V.sum localCostVect - , localCost = 0 - -- unclear if need vector version - , globalCost = 0 - } - --- | emptyVertex useful for graph rearrangements -emptyVertexInfo :: VertexInfo -emptyVertexInfo = VertexInfo { index = (-1) - , bvLabel = BV.fromBits [False] - , parents = mempty - , children = mempty - , nodeType = TreeNode -- root, leaf, network, tree - , vertName = T.pack "EmptyVertex" - , vertData = mempty - , vertexResolutionData = mempty - , vertexCost = 0.0 - , subGraphCost = 0.0 - } - --- | usefule in some cases -dummyNode :: LG.LNode VertexInfo -dummyNode = (-1, emptyVertexInfo) - --- | dummyEdge for convenience -dummyEdge :: EdgeInfo -dummyEdge = EdgeInfo { minLength = 0 - , maxLength = 0 - , midRangeLength = 0 - , edgeType = TreeEdge - } - --- emptyCharInfo for convenience -emptyCharInfo :: CharInfo -emptyCharInfo = CharInfo { name = T.pack "EmptyCharName" - , charType = NonAdd - , activity = True - , weight = 1.0 - , costMatrix = S.empty - , slimTCM = TCMD.generateDenseTransitionCostMatrix 2 2 . S.getCost $ V.fromList <$> V.fromList [[0,1],[1,0]] -- genDiscreteDenseOfDimension 2 - , wideTCM = snd $ MR.metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] - , hugeTCM = snd $ MR.metricRepresentation <$> TCM.fromRows [[0::Word, 0::Word],[0::Word, 0::Word]] - , changeCost = 1.0 - , noChangeCost = 0.0 - , alphabet = fromSymbols $ fmap ST.fromString ["0", "1"] - , prealigned = False - , origInfo = V.empty - } - diff --git a/pkg/PhyGraph/Utilities/DistanceUtilities.hs b/pkg/PhyGraph/Utilities/DistanceUtilities.hs deleted file mode 100644 index 9c927b01c..000000000 --- a/pkg/PhyGraph/Utilities/DistanceUtilities.hs +++ /dev/null @@ -1,534 +0,0 @@ -{- | -Module : Utilities.hs -Description : Module with useful functionsfor distance tree construction methods dWag, Neightbor-Joining, UPGMA, and WPGMA - -- but with added refinement based on 4-point metric -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Utilities.DistanceUtilities where - -import qualified Data.Graph.Inductive.Graph as G -import qualified Data.Graph.Inductive.PatriciaTree as P -import Data.Maybe -import qualified Data.Number.Transfinite as NT -import qualified Data.Set as Set -import qualified Data.Text.Lazy as T -import qualified Data.Vector as V -import GeneralUtilities -import qualified GraphFormatUtilities as PP -import ParallelUtilities -import qualified SymMatrix as M -import System.IO.Unsafe -import qualified System.Random as Rand -import qualified System.Random.Shuffle as RandS -import Types.DistanceTypes ---import qualified LocalSequence as LS -import qualified Data.List as L -import qualified Data.Vector as LS - - - --- | localRoundtakes a double multiplies by 10^precisoin, rounds to integer then divides --- by precision -localRound :: Double -> Int -> Double -localRound val places = - let factor = 10.0 ^^ places - newVal = val * factor - roundedVal = round newVal :: Int - in - fromIntegral roundedVal / factor - --- | showDouble integer number of string postion )including sign and decimal) and returns string --- of that length--basically truncation -showDouble :: Int -> Double -> String -showDouble places val = show $ localRound val places -- reverse $ dropWhile (== '0') $ reverse $ take places $ show val - --- | test for equality with epsilon -withinEpsilon :: Double -> Double -> Bool -withinEpsilon a b = a == b - {- - if abs (a - b) < epsilon then True - else False - -} - --- | vertex2FGLNode take vertex of Int and a Vector of Strings (leaf names) and returns --- fgl node with type T.Text -vertex2FGLNode :: V.Vector String -> Vertex -> (Int, T.Text) -vertex2FGLNode leafVect vertIndex= - if vertIndex < V.length leafVect then (vertIndex, T.pack (leafVect V.! vertIndex)) - else - let vertexName = "HTU" ++ show (vertIndex - V.length leafVect) - in - (vertIndex, T.pack vertexName) - --- | treeFGL take a Treee type and converts to an fgl graph --- to be used with PhylParsers module hence Text -tree2FGL :: Tree -> V.Vector String -> P.Gr T.Text Double -tree2FGL inTree@(inVertexVect, inEdgeVect) leafNameVect = - if inTree == emptyTree then error "Empty tree in tree2FGL" - else - let fglNodes = V.map (vertex2FGLNode leafNameVect) inVertexVect - in - -- trace ("tree2FGL orig, vertices, edges = " ++ (show $ length inVertexVect ) ++ " " ++ (show $ length fglNodes ) ++ " " ++ (show $ length inEdgeVect )) - G.mkGraph (V.toList fglNodes) (V.toList inEdgeVect) - --- reIndexEdges take a list of vertices as integer indices and an edges (Int, Int, Double) --- and retuns a new vertex with the indives of the vertex labels -reIndexEdges :: [Int] -> Edge -> Edge -reIndexEdges inVertexList (e,u,w) = - if null inVertexList then error "Null inVertexList in reIndexEdges" - else - let newE = L.elemIndex e inVertexList - newU = L.elemIndex u inVertexList - in - --un safe not testing but list is created from edges first - (fromJust newE, fromJust newU, w) - --- | makeVertexNames takes vertgex indices and returns leaf name if < nOTUs and "HTU" ++ show Index --- if not -makeVertexNames :: [Vertex] -> Int -> V.Vector String -> Bool -> [String] -makeVertexNames vertList nOTUs leafNames nameHTUs = - if null vertList then [] - else - let firstVert = head vertList - in - if firstVert < nOTUs then (leafNames V.! firstVert) : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs - else if nameHTUs then ("HTU" ++ show firstVert) : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs - else "" : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs - --- | directSingleEdge takes an Int and makes that 'e' and otehr vertex as 'u' in edge (e->u) -directSingleEdge :: Int -> Edge -> Edge -directSingleEdge index (a,b,w) - | a == index = (a,b,w) - | b == index = (b,a,w) - | otherwise = error ("Index " ++ show index ++ " doesn't match edge " ++ show (a,b,w) ++ " in directSingleEdge") - --- | getChildEdges returns the two edges that are childre of a vertex -getChildEdges :: Int -> Int -> V.Vector Edge -> V.Vector Edge -getChildEdges vertIndex nLeaves inEdgeVect - | V.null inEdgeVect = V.empty - | vertIndex < nLeaves = error ("Looking for child of leaf " ++ show (vertIndex, nLeaves)) - | otherwise = - let (a,b,w) = V.head inEdgeVect - in - if (a == vertIndex) || (b == vertIndex) then V.cons (a,b,w) (getChildEdges vertIndex nLeaves (V.tail inEdgeVect)) else getChildEdges vertIndex nLeaves (V.tail inEdgeVect) - - --- | directexEdges takes a vector of edges and outgrop index and directs the edges (parent -> child vertices) based on that -directEdges :: Int -> Int -> Bool -> V.Vector Edge -> V.Vector Edge -directEdges vertIndex nLeaves isFirst inEdgeVect - | V.null inEdgeVect = V.empty - | isFirst = --to find out group edge order larger to smaller will have outgroup index second - let outgroupEdge = getEdgeRoot vertIndex inEdgeVect - remainingEdgeVect = subtractVector (V.singleton outgroupEdge) inEdgeVect - (a,b,w) = orderEdge outgroupEdge - in - V.cons (a,b,w) (directEdges a nLeaves False remainingEdgeVect) - | vertIndex < nLeaves = V.empty - | otherwise = -- not outgroup but regular node, get two child edges - let descdendantEdges = getChildEdges vertIndex nLeaves inEdgeVect - remainingEdgeVect = subtractVector descdendantEdges inEdgeVect - newDescEdges = V.map (directSingleEdge vertIndex) descdendantEdges - in - if V.length newDescEdges /= 2 then error ("There should be 2 child edges for index " ++ show vertIndex ++ " and there are(is) " ++ show (V.length newDescEdges) ++ " " ++ show newDescEdges) - else - let (_, bf, _) = V.head newDescEdges - (_, bs, _) = V.last newDescEdges - firstSubEdges = directEdges bf nLeaves False remainingEdgeVect - remainingEdgeVect' = subtractVector firstSubEdges remainingEdgeVect - secondSubEdges = directEdges bs nLeaves False remainingEdgeVect' - in - (newDescEdges V.++ (firstSubEdges V.++ secondSubEdges)) - - --- | convertToGraph takes Vertex of names and a tree and return inductive Graph format -convertToDirectedGraph :: V.Vector String -> Int -> Tree -> P.Gr String Double -convertToDirectedGraph leafList outgroupIndex inTree = - let (_, edgeVect) = inTree - nOTUs = length leafList - -- should be stright 0->n-1 but in case some vertex number if missing - vertexList = L.sort $ Set.toList $ getVertexSet edgeVect - vertexNames = makeVertexNames vertexList nOTUs leafList True - labelledVertexList = L.zip vertexList vertexNames - edgeList = V.toList $ directEdges outgroupIndex nOTUs True edgeVect - in - G.mkGraph labelledVertexList edgeList - --- | convertToGraphText takes Vertex of names and a tree and return inductive Graph format -convertToDirectedGraphText :: V.Vector String -> Int -> Tree -> P.Gr T.Text Double -convertToDirectedGraphText leafList outgroupIndex inTree = - let (_, edgeVect) = inTree - nOTUs = length leafList - -- should be stright 0->n-1 but in case some vertex number if missing - vertexList = L.sort $ Set.toList $ getVertexSet edgeVect - vertexNames = makeVertexNames vertexList nOTUs leafList False - labelledVertexList = L.zip vertexList (fmap T.pack vertexNames) - edgeList = V.toList $ directEdges outgroupIndex nOTUs True edgeVect - in - G.mkGraph labelledVertexList edgeList - - --- | convertToNewick generates a newick file by converting Tree type to FGL Graph, --- adds a root and two new edges (deleting root edge) --- and calls convert function from PhyloParsers -convertToNewick :: V.Vector String -> Int -> Tree -> String -convertToNewick leafNames outGroup inTree - | inTree == emptyTree = error "Empty tree in convertToNewick" - | V.null leafNames = error "Empty leaf names in convertToNewick" - | otherwise = - let fglTree = convertToDirectedGraphText leafNames outGroup inTree - in - --trace ("ConverttoNewick in-vertices in-edges" ++ (show $ length inVertexVect ) ++ " " ++ (show $ V.toList inVertexVect ) ++ "\n" ++ (show $ length vertexVect ) ++ " " ++ (show vertexVect ) ++ "\n" ++ (show $ length edgeVect ) ++ " " ++ show edgeVect ++ "\n" ++ show fglTree) - PP.fglList2ForestEnhancedNewickString [fglTree] True False - - --- | getEdgeRootIndex takes edge Vect, Index, and determines edges from root -getEdgeRootIndex :: Int -> Int -> V.Vector Edge -> (Int, Edge) -getEdgeRootIndex edgeIndex outgroup edgeVect = - if V.null edgeVect then error "Root edge not found" - else - let (eVect, uVect, _) = V.head edgeVect - in - if (eVect == outgroup) || (uVect == outgroup) then (edgeIndex, V.head edgeVect) - else getEdgeRootIndex (edgeIndex + 1) outgroup (V.tail edgeVect) - - - --- | convertToNewick wrapper to remove double commas -convertToNewick' :: V.Vector String -> Int -> Tree -> String -convertToNewick' leafNames outGroup wagTree = removeCrap $ convertToNewickGuts leafNames outGroup wagTree - --- | removeDoubleCommas removes second of double comas ",," -> "," --- this a hack to fix problem in convertToNewick -removeCrap :: String -> String -removeCrap inString = - if length inString == 1 then inString - else - let firstChar = head inString - secondChar = inString !! 1 - in - if firstChar == ',' && secondChar == ',' then ',' : removeCrap (drop 2 inString) - else if firstChar == ',' && secondChar == ')' then ')' : removeCrap (drop 2 inString) - else firstChar : removeCrap (tail inString) - --- | convertToNewick converts Tree rep to Newick String --- includes edge cost--splits root edge cost into halves to help --- tree viewers like FigTree (so input is unrooted) --- NEED TO ADD smaller group left larger group right for more legible trees -convertToNewickGuts :: V.Vector String ->Int -> Tree -> String -convertToNewickGuts leafNames outGroup wagTree = - let (inLeaves, inEdges) = wagTree - newEdges = fmap orderEdge inEdges - (_, edgeVect) = orderTree (inLeaves, newEdges) - foundEdge = getEdgeRoot outGroup edgeVect - in - let (firstVert, secondVert, weight) = foundEdge - remainderEdges = V.filter (/= foundEdge) edgeVect - in - -- this is embarassing bullshit -- converting ",," to "," - if firstVert == outGroup then "(" ++ (leafNames V.! outGroup) ++ ":" ++ showDouble 8 (weight/2.0) ++ "," ++ getEdgesNonRoot secondVert remainderEdges (V.length leafNames) leafNames ++ ":" ++ showDouble 8 (weight/2.0) ++ ")" - else "(" ++ (leafNames V.! outGroup) ++ ":" ++ showDouble 8 (weight/2.0) ++ "," ++ getEdgesNonRoot firstVert remainderEdges (V.length leafNames) leafNames ++ ":" ++ showDouble 8 (weight/2.0) ++ ")" - --- | orderEdge takes an Edge and puts high index first then lower -orderEdge :: Edge -> Edge -orderEdge (a,b,w) = - if a > b then (a,b,w) - else (b,a,w) - --- | orderTree puts Tree edges in order based on edges -orderTree :: Tree -> Tree -orderTree (leaves, edges) = - let edgeList = L.sort $ V.toList edges - in - (leaves, V.fromList edgeList) - --- | getEdges takes root Index and determines edges from root -getEdgeRoot :: Int -> V.Vector Edge -> Edge -getEdgeRoot edgeIndex edgeVect = - if V.null edgeVect then (-1,-1,-1.0) - else - let (eVect, uVect, _) = V.head edgeVect - in - if (eVect == edgeIndex) || (uVect == edgeIndex) then V.head edgeVect else getEdgeRoot edgeIndex (V.tail edgeVect) - --- | getEdgesNonRoot takes root Index and determines edges from root returns String of Taxa --- alwasy getting ordered edges and ordered tree --- so eVect > uVect always; eVect can never be < nOTUs ---Need to add smaller tree left, bigger right -getEdgesNonRoot :: Int -> V.Vector Edge -> Int -> V.Vector String -> String -getEdgesNonRoot edgeIndex edgeVect nOTUs leafNames = - --trace (show edgeIndex) ( - let terminal = (-1,-1,-1.0) - in - if V.null edgeVect then "END" - else - let thisEdge = getEdgeRoot edgeIndex edgeVect - in - if thisEdge == terminal then "ERROR" - else - let (eVect, uVect, weight) = thisEdge - remainderEdges = V.filter (/= thisEdge) edgeVect - eDesc = getEdgeRoot eVect remainderEdges - uDesc = getEdgeRoot uVect remainderEdges - eSubTree = getEdgesNonRoot eVect remainderEdges nOTUs leafNames - uSubTree = getEdgesNonRoot uVect remainderEdges nOTUs leafNames - in - if V.null remainderEdges then - (leafNames V.! uVect) ++ ":" ++ showDouble precision weight ++ "," - - else if eVect == edgeIndex then - - if eVect < nOTUs then - if uDesc /= terminal then "(" ++ (leafNames V.! eVect) ++ ":" ++ showDouble precision weight ++ "," ++ uSubTree ++ ")" - else (leafNames V.! eVect) ++ ":" ++ showDouble precision weight ++ "," - else - if uVect < nOTUs then - if eDesc /= terminal then "(" ++ (leafNames V.! uVect) ++ ":" ++ showDouble precision weight ++ "," ++ eSubTree ++ ")" - else (leafNames V.! uVect) ++ ":" ++ showDouble precision weight ++ "," - else - if (eDesc /= terminal) && (uDesc == terminal) then eSubTree ++ ":" ++ showDouble precision weight ++ "," - else if (eDesc == terminal) && (uDesc /= terminal) then uSubTree ++ ":" ++ showDouble precision weight ++ "," - else - if length eSubTree < length uSubTree then "(" ++ eSubTree ++ "," ++ uSubTree ++ ":" ++ showDouble precision weight ++ ")" - else "(" ++ uSubTree ++ "," ++ eSubTree ++ ":" ++ showDouble precision weight ++ ")" - - else if uVect == edgeIndex then - if uVect < nOTUs then - if eDesc /= terminal then "(" ++ (leafNames V.! uVect) ++ ":" ++ showDouble precision weight ++ "," ++ eSubTree ++ ")" - else (leafNames V.!uVect) ++ ":" ++ showDouble precision weight ++ "," - - else if eVect < nOTUs then - if uDesc /= terminal then "(" ++ (leafNames V.! eVect) ++ ":" ++ showDouble precision weight ++ "," ++ uSubTree ++ ")" - else (leafNames V.!eVect) ++ ":" ++ showDouble precision weight ++ "," - - else - if (eDesc /= terminal) && (uDesc == terminal) then eSubTree ++ ":" ++ showDouble precision weight ++ "," - else if (eDesc == terminal) && (uDesc /= terminal) then uSubTree ++ ":" ++ showDouble precision weight ++ "," - else - if length eSubTree < length uSubTree then "(" ++ eSubTree ++ "," ++ uSubTree ++ ":" ++ showDouble precision weight ++ ")" - else "(" ++ uSubTree ++ ":" ++ showDouble precision weight ++ "," ++ eSubTree ++ ")" - - else getEdgesNonRoot edgeIndex remainderEdges nOTUs leafNames ++ ":" ++ showDouble precision weight ++ "," - --- getBestTrees takes newick and L.sorts on comment at end with cost -getBestTrees :: String -> Int -> [TreeWithData] -> Double -> [TreeWithData] -> [TreeWithData] -getBestTrees keepMethod number inList curBestCost curBestTrees = - if null inList then - -- apply keep method if not keeping all - if length curBestTrees <= number then curBestTrees - else if keepMethod == "first" then take number curBestTrees - else if keepMethod == "last" then reverse $ take number $ reverse curBestTrees - else if keepMethod == "random" then - -- not Fitch but shuffles elements and takes first n - let randIntList = betterRandomList (length curBestTrees) (length curBestTrees - 1) - newList = RandS.shuffle curBestTrees randIntList - in - take number newList - else error ("Keep method " ++ keepMethod ++ " not implemented") - else - let firstTree = head inList - (_, _, firstCost, _) = firstTree - in - if firstCost < curBestCost then getBestTrees keepMethod number (tail inList) firstCost [firstTree] - else if withinEpsilon firstCost curBestCost then getBestTrees keepMethod number (tail inList) firstCost (firstTree : curBestTrees) - else getBestTrees keepMethod number (tail inList) curBestCost curBestTrees - --- | getUniqueTrees saves uniqe newick trees --- different paths--ehnce different distMatrices coud result in same newick tree -getUniqueTrees :: [TreeWithData] -> [TreeWithData] -> [TreeWithData] -getUniqueTrees inList uniqueList = - if null inList then uniqueList - else - let firstTree = head inList - fstNewick = fst4 firstTree - in - if fstNewick `notElem` fmap fst4 uniqueList then getUniqueTrees (tail inList) (firstTree : uniqueList) - else getUniqueTrees (tail inList) uniqueList - --- | keepTrees filters newick trees based on options --- all keep all --- best shortest (and unique) allows number of max to save --- unique unique representations irespective of length --- keep metyhod for save first | last | atRandom if buffer full -keepTrees :: [TreeWithData] -> String -> String -> Double -> [TreeWithData] -keepTrees inList saveMethod keepMethod curBestCost - | null inList = [] - | saveMethod == "all" = inList - | take 6 saveMethod == "unique" = - if length saveMethod == 6 then getUniqueTrees inList [] - else - let number = read (drop 7 saveMethod) :: Int - in - take number $ L.sortOn thd4 $ getUniqueTrees inList [] - | take 4 saveMethod == "best" = - if length saveMethod == 4 then getUniqueTrees (getBestTrees keepMethod (maxBound :: Int) inList curBestCost []) [] - else if (saveMethod !! 4) == ':' then - let number = read (drop 5 saveMethod) :: Int - saveTrees = take number $ L.sortOn thd4 $ getUniqueTrees inList [] - (_, _, bestCost, _) = head saveTrees - in - getBestTrees keepMethod number saveTrees bestCost [] - else error ("Save method " ++ saveMethod ++ " improperly formatted") - | otherwise = error ("Save method " ++ saveMethod ++ " not implemented") - - -- | ranList generates random list of positive integers -ranList :: Rand.StdGen -> Int -> Int -> [Int] -ranList sg n maxValue = take n $ Rand.randomRs (0,maxValue) sg - --- | driver function to generate list of positive integers -{-# NOINLINE betterRandomList #-} -betterRandomList :: Int -> Int -> [Int] -betterRandomList n maxValue= unsafePerformIO $ do - sg <- Rand.getStdGen - return $ ranList sg n maxValue - --- | Subtrace vector subtracts elements of vector a from vector b --- is thins n^2 ? --- edges are directed -subtractVector :: (Eq a) => V.Vector a -> V.Vector a -> V.Vector a -subtractVector a b - | V.null a = b - | V.null b = V.empty - | otherwise = - let firstB = V.head b - notFound = V.notElem firstB a - in - if notFound then V.cons firstB (subtractVector a (V.tail b)) - else subtractVector a (V.tail b) - --- | getVertexSet take a vector of edges and creates the set of vertex numbers -getVertexSet :: V.Vector Edge -> Set.Set Vertex -getVertexSet edgeVect = - if V.null edgeVect then Set.empty - else - let (a,b,_) = V.head edgeVect - thisSet = Set.fromList [a,b] - in - Set.union thisSet (getVertexSet $ V.tail edgeVect) - --- | getMinRowDistMatrix distMatrix tabuList -getMinRowDistMatrix :: M.Matrix Double -> [Int] -> (Int, Double) -> Int -> Int -> (Int, Int, Double) -getMinRowDistMatrix distMatrix tabuList minPair@(minCol, minVal) curColumn row - | curColumn == LS.length (distMatrix LS.! row) = (row, minCol, minVal) - | row `elem` tabuList = (-1,-1, NT.infinity) - | curColumn == row = getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row - | curColumn `elem` tabuList = getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row - | otherwise = - let firstVal = distMatrix M.! (row, curColumn) - in - if firstVal < minVal then getMinRowDistMatrix distMatrix tabuList (curColumn, firstVal) (curColumn + 1) row - else getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row - - --- | compareTriples take two triples and orders them based on the smaller thrid element -minTriples :: (Ord c) => (a,b,c) -> (a,b,c) -> Ordering -minTriples (_,__,c) (_,_,d) - | c < d = LT - | c > d = GT - | otherwise = EQ - --- | getMatrixMinPairTabu' takes distMatrix initial integer pair and value --- traverses the matrix (skippiong rows and columns in tabuList and return minimum distance and index pair --- if tie takes first --- gets minimum by row and parallelizes ovcer rows - -- call with (-1, -1, NT.infinity) 0 0 -getMatrixMinPairTabu :: M.Matrix Double -> [Int] -> (Int, Int, Double) -getMatrixMinPairTabu distMatrix tabuList = - if M.null distMatrix then error "Empty matrix in getMatrixPairTabu" - else - let minValueList = seqParMap myStrategy (getMinRowDistMatrix distMatrix tabuList (-1, NT.infinity) 0) [0..(M.rows distMatrix - 1)] - in - L.minimumBy minTriples minValueList - --- | getMatrixMinPairTabu takes distMatrix initial integer pair and value --- traverses the matrix (skippiong rows and columns in tabuList and return minimum distance and index pair --- if tie takes first - -- call with (-1, -1, NT.infinity) 0 0 -getMatrixMinPairTabu' :: M.Matrix Double -> [Int] -> (Int, Int, Double) -> Int -> Int -> (Int, Int, Double) -getMatrixMinPairTabu' distMatrix tabuList curBest curRow curColumn - | curRow == M.rows distMatrix = curBest - | curColumn == M.cols distMatrix = getMatrixMinPairTabu' distMatrix tabuList curBest (curRow + 1) 0 - | curColumn == curRow = getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) - | (curColumn `elem` tabuList) || (curRow `elem` tabuList) = getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) - | otherwise = - let (_, _, currentBestDistance) = curBest - in - if distMatrix M.! (curRow, curColumn) < currentBestDistance then - getMatrixMinPairTabu' distMatrix tabuList (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) - else getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) - - - --- | getMatrixMinPair takes distMatrix initla pinteger pair and value --- traverses teh matrix and return minimum distance and index pair --- if tie takes first - -- call with (-1, -1, NT.infinity) 0 0 -getMatrixMinPair :: M.Matrix Double -> (Int, Int, Double) -> Int -> Int -> (Int, Int, Double) -getMatrixMinPair distMatrix curBest curRow curColumn - | curRow == M.rows distMatrix = curBest - | curColumn == M.cols distMatrix = getMatrixMinPair distMatrix curBest (curRow + 1) 0 - | curColumn == curRow = getMatrixMinPair distMatrix curBest curRow (curColumn + 1) - | otherwise = - let (_, _, currentBestDistance) = curBest - in - if distMatrix M.! (curRow, curColumn) < currentBestDistance then - getMatrixMinPair distMatrix (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) - else getMatrixMinPair distMatrix curBest curRow (curColumn + 1) - - --- | getMatrixMaxPair takes distMatrix initla pinteger pair and value --- traverses teh matrix and return maximum distance and index pair --- if tie takes first --- call with (-1 , -1, 0 :: Double) 0 0 -getMatrixMaxPair :: M.Matrix Double -> (Int, Int, Double) -> Int -> Int -> (Int, Int, Double) -getMatrixMaxPair distMatrix curBest curRow curColumn - | curRow == M.rows distMatrix = curBest - | curColumn == M.cols distMatrix = getMatrixMaxPair distMatrix curBest (curRow + 1) 0 - | curColumn == curRow = getMatrixMaxPair distMatrix curBest curRow (curColumn + 1) - | otherwise = - let (_, _, currentBestDistance) = curBest - in - if distMatrix M.! (curRow, curColumn) > currentBestDistance then - getMatrixMaxPair distMatrix (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) - else getMatrixMaxPair distMatrix curBest curRow (curColumn + 1) - --- | getTreeCost takes Tree and returns cost based on sum of edge weights -getTreeCost :: Tree -> Double -getTreeCost inTree = - V.sum $ V.map getEdgeCost $ snd inTree - --- | getEdgeCost returns weight form edge tuple -getEdgeCost :: (Vertex, Vertex, Weight) -> Double -getEdgeCost (_, _, edgeWeight) = edgeWeight - - diff --git a/pkg/PhyGraph/Utilities/Distances.hs b/pkg/PhyGraph/Utilities/Distances.hs deleted file mode 100644 index 5119a7fe2..000000000 --- a/pkg/PhyGraph/Utilities/Distances.hs +++ /dev/null @@ -1,95 +0,0 @@ -{- | -Module : Distances.hs -Description : Module specifying data types -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Utilities.Distances (getPairwiseDistances - , getBlockDistance - , getPairwiseBlockDistance - ) where - -import Control.Parallel.Strategies -import Data.Foldable -import qualified Data.Vector as V -import Debug.Trace -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified ParallelUtilities as P -import qualified SymMatrix as S -import Types.Types - - --- | getPairwiseDistances takes Processed data --- and retuns a matrix (list of lists of Double) of pairwise --- distances among vertices in data set over blocks ans all character types --- sums over blocks -getPairwiseDistances :: ProcessedData -> [[VertexCost]] -getPairwiseDistances (nameVect, _, blockDataVect) - | V.null nameVect = error "Null name vector in getPairwiseDistances" - | V.null blockDataVect = error "Null Block Data vector in getPairwiseDistances" - | otherwise = - let blockDistancesList = V.toList $ V.map (getPairwiseBlockDistance (V.length nameVect)) blockDataVect - summedBlock = foldl' (S.zipWith (+)) (head blockDistancesList) (tail blockDistancesList) - in - trace ("\tGenerating pairwise distances for " ++ show (V.length blockDataVect) ++ " character blocks") - S.toFullLists summedBlock - - - --- | getBlockDistance takes Block data and returns distance between --- vertices based on block data --- this can be done for leaves only or all via the input processed --- data leaves are first--then HTUs follow -getBlockDistance :: BlockData -> (Int, Int) -> VertexCost -getBlockDistance (_, localVertData, blockCharInfo) (firstIndex, secondIndex) = - let pairCost = V.sum $ V.map snd $ M.median2 (localVertData V.! firstIndex) (localVertData V.! secondIndex) blockCharInfo - in - pairCost - --- | getPairwiseBlocDistance returns pairwsie distances among vertices for --- a block of data --- this can be done for ;leaves only or all via the input processed --- data leaves are first--then HTUs follow -getPairwiseBlockDistance :: Int -> BlockData-> S.Matrix VertexCost -getPairwiseBlockDistance numVerts inData = - let pairList = makeIndexPairs True numVerts numVerts 0 0 - initialPairMatrix = S.fromLists $ replicate numVerts $ replicate numVerts 0.0 - pairListCosts = fmap (getBlockDistance inData) pairList `using` P.myParListChunkRDS - (iLst, jList) = unzip pairList - threeList = zip3 iLst jList pairListCosts - newMatrix = S.updateMatrix initialPairMatrix threeList - in - --trace ("NM:\n" ++ (show threeList) ++ "\n" ++(show $ S.toFullLists newMatrix)) - newMatrix - diff --git a/pkg/PhyGraph/Utilities/LocalGraph.hs b/pkg/PhyGraph/Utilities/LocalGraph.hs deleted file mode 100644 index a63e7528d..000000000 --- a/pkg/PhyGraph/Utilities/LocalGraph.hs +++ /dev/null @@ -1,1910 +0,0 @@ -{- | -Module : Utilities.LocalGraph.hs -Description : Module specifying graph types and functionality - This is for indirection so can change underlying graph library - without polutting the rest of the code -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# LANGUAGE ScopedTypeVariables #-} - -module Utilities.LocalGraph where - -import qualified Data.Graph.Inductive.Basic as B -import qualified Data.Graph.Inductive.PatriciaTree as P -import qualified Data.Graph.Inductive.Query.ArtPoint as AP -import qualified Data.Graph.Inductive.Query.BCC as BCC -import qualified Data.Graph.Inductive.Query.BFS as BFS -import qualified Data.Graph.Inductive.Query.DFS as DFS -import qualified GraphFormatUtilities as GFU ---import qualified Data.Text as T -import Data.GraphViz as GV -import qualified Data.Text.Lazy as T ---import Data.GraphViz.Attributes.Complete (Attribute (Label), - --Label (..)) -import qualified Data.Graph.Inductive.Graph as G -import Data.GraphViz.Commands.IO as GVIO -import Control.Parallel.Strategies -import qualified Cyclic as C -import qualified Data.List as L -import qualified Data.Map as MAP -import Data.Maybe -import qualified Data.Vector as V -import GeneralUtilities -import qualified ParallelUtilities as PU -import System.IO -import Debug.Trace - - - - --- | Gr local graph definition using FGL -type Gr a b = P.Gr a b -type Node = G.Node -type LNode a = G.LNode a -type DotGraph = GV.DotGraph -type Edge = G.Edge -type LEdge b = G.LEdge b - - --- | getFENLocal maps to forestEnhancedNewickStringList2FGLList in GraphFormatUtilities --- to allow for potnetial swapping FGL graph backend --- requires leading and trailing space and newlines to be removed -getFENLocal :: T.Text -> [Utilities.LocalGraph.Gr T.Text Double] -getFENLocal = GFU.forestEnhancedNewickStringList2FGLList - - --- | readDotLocal calls GrapvViz function to allow for substitution later -readDotLocal :: String -> IO (Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node) -readDotLocal = GVIO.readDotFile - --- | dotToGraph local mapo dor -dotToGraph :: Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node -> Utilities.LocalGraph.Gr Attributes Attributes -dotToGraph = GV.dotToGraph - --- | hGetDotLocal calls hGetDot from GraphVoz -hGetDotLocal :: Handle -> IO (Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node) -hGetDotLocal = GVIO.hGetDot - --- | fglToPrettyString calls prettify from FGL -fglToPrettyString :: (Show a, Show b) => P.Gr a b -> String -fglToPrettyString inGraph = - if G.isEmpty inGraph then "Empty Graph" - else G.prettify inGraph - --- | pretty prints graph to String -prettify :: (Show a, Show b) => Gr a b -> String -prettify inGraph = - if G.isEmpty inGraph then "Empty Graph" - else G.prettify inGraph - --- | prettyIndices prints graph to String only using indices -prettyIndices :: (Show a, Show b) => Gr a b -> String -prettyIndices inGraph = - if G.isEmpty inGraph then "Empty Graph" - else - let nodeList = concat $ fmap (++ ",") $ fmap show $ nodes inGraph - edgeList = concat $ fmap (++ ",") $ fmap show $ edges inGraph - in - nodeList ++ "\n" ++ edgeList - --- | prettyDot prints generic graph to generic dot format -prettyDot :: Gr a b -> String -prettyDot inGraph = - if G.isEmpty inGraph then error ("Empty Graph") - else - let topPart = "digraph G {\n\trankdir = LR; node [ shape = rect];\n" - nodeList = concatMap ("\t" ++ ) $ fmap (++ ";\n") $ fmap show $ nodes inGraph - edgeList = concatMap ("\t" ++ ) $ fmap makeEdgeString $ edges inGraph - endPart = "}\n" - in - topPart ++ nodeList ++ edgeList ++ endPart - where makeEdgeString (a, b) = (show a) ++ " -> " ++ (show b) ++ ";\n" - --- these duplicate edge functions should be O(n log n) based on sort--rest linear - --- hasDuplicateEdge checked for duplicate edges based on indices (not label) -hasDuplicateEdge :: Gr a b -> Bool -hasDuplicateEdge inGraph = - if isEmpty inGraph then False - else - let sortedEdges = L.sort $ fmap toEdge $ labEdges inGraph - groupedEdges = L.group sortedEdges - dupEdges = filter ((>1) . length ) groupedEdges - --dupEdges = (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) - in - (not . null) dupEdges - --- | getDuplicateEdges retuns a list of edges that are duplicated --- by indeices--no label comparison --- can be used to delete the extra -getDuplicateEdges :: Gr a b -> [Edge] -getDuplicateEdges inGraph = - if isEmpty inGraph then [] - else - let sortedEdges = L.sort $ fmap toEdge $ labEdges inGraph - groupedEdges = L.group sortedEdges - dupEdges = filter ((>1) . length ) groupedEdges - --dupEdges = (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) - in - fmap head dupEdges - -- (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) - --- | removeDuplicateEdges removes duplicate edges from graph -removeDuplicateEdges :: Gr a b -> Gr a b -removeDuplicateEdges inGraph = - if isEmpty inGraph then inGraph - else - let dupEdges = getDuplicateEdges inGraph - in - if null dupEdges then inGraph - else delEdges dupEdges inGraph - --- | hasTreeNodeWithAllNetworkChildren checks treenodes for all (should be 2) children that --- are netork nodes -hasTreeNodeWithAllNetworkChildren :: Gr a b -> (Bool, [Node]) -hasTreeNodeWithAllNetworkChildren inGraph = - if isEmpty inGraph then (False, []) - else - let (_, _, treeNodeList, _) = splitVertexList inGraph - hasAllNetChildrenList = fmap (hasAllNetChildren inGraph) (fmap fst treeNodeList) - nodesWithAllNetChildren = fmap (fst . fst) $ filter ((== True) .snd) $ zip treeNodeList hasAllNetChildrenList - in - ((not . null) nodesWithAllNetChildren, nodesWithAllNetChildren) - --- | hasAllNetChildren checks whether all (usually 2) childrenb of a vertex are network nodes -hasAllNetChildren :: Gr a b -> Node -> Bool -hasAllNetChildren inGraph inNode = - let children = descendants inGraph inNode - childVertNodes = filter (== True) $ fmap (isNetworkNode inGraph) children - in - length children == length childVertNodes - --- | removeTreeEdgeFromTreeNodeWithAllNetworkChildren takes a greaph and removes the first edge (head) --- from each tree node with all netowork children, tehn contracts those edges and nodes, --- then reindexes -- but doesn not rename graph nodes -removeTreeEdgeFromTreeNodeWithAllNetworkChildren :: Gr a b -> Gr a b -removeTreeEdgeFromTreeNodeWithAllNetworkChildren inGraph = - let (toDo, nodesWithEdgesToDelete) = hasTreeNodeWithAllNetworkChildren inGraph - outEdgesToDeleteList = fmap toEdge $ fmap head $ fmap (out inGraph) nodesWithEdgesToDelete - newGraph = delEdges outEdgesToDeleteList inGraph - newGraph' = reindexGraph $ contractIn1Out1Edges newGraph - in - if not toDo then inGraph - else newGraph' - - --- | hasChainedNetworkNodes checks if a graph has network nodes with at least one parent that is also a network node -hasChainedNetworkNodes :: Gr a b -> Bool -hasChainedNetworkNodes inGraph = - if isEmpty inGraph then False - else - let (_, _, _, netVertexList) = splitVertexList inGraph - chainedNodeList = filter (== True) $ fmap (hasNetParent inGraph) $ fmap fst netVertexList - netWithChildNetList = filter (== True) $ fmap (hasAllNetChildren inGraph) $ fmap fst netVertexList - in - if null netVertexList then False - else (not . null) (chainedNodeList ++ netWithChildNetList) - --- | hasNetParent checks parent of node and retuens True if one or both are network nodes -hasNetParent :: Gr a b -> Node -> Bool -hasNetParent inGraph inNode = - let parentList = parents inGraph inNode - parentNetList = filter (== True) $ fmap (isNetworkNode inGraph) parentList - in - (not . null) parentNetList - --- | removeChainedNetworkNodes detectes and fixes (if possible) chained networtk edges --- if 1 parent of network edge is tree node can be fixed by delete and contracting that node/edge --- else if both parent are netowrks--cannot be fixed and errors out --- doens NOT rename nodes since need vertex info on that--but are reindexed -removeChainedNetworkNodes :: (Show a, Show b) => Gr a b -> Maybe (Gr a b) -removeChainedNetworkNodes inGraph = - if isEmpty inGraph then Just inGraph - else - let (_, _, _, netVertexList) = splitVertexList inGraph - parentNetNodeList = fmap (hasNetParent inGraph) $ fmap fst netVertexList - chainedNodeList = fmap fst $ filter ((== True) . snd) $ zip netVertexList parentNetNodeList - fixableChainedEdgeList = concatMap (getTreeEdgeParent inGraph) (fmap fst chainedNodeList) - newGraph = delEdges fixableChainedEdgeList inGraph - newGraph' = reindexGraph $ contractIn1Out1Edges newGraph - in - if null netVertexList then Just inGraph - else if null chainedNodeList then Just inGraph - else if null fixableChainedEdgeList then - trace ("Warning: Unfixable chained network nodes (both parent and child nodes are indegree > 1). Deleting skipping graph") - Nothing - else - trace ("Warning: Chained network nodes (both parent and child nodes are indegree > 1), removing edges to tree node parents (this may affect graph cost): " ++ (show fixableChainedEdgeList))-- ++ "\n" ++ (prettyIndices inGraph)) - Just newGraph' - --- | getTreeEdgeParent gets the tree edge (as list) into a network node as opposed to the edge from a network parent --- if both parents are netowrk nodes then returns [] -getTreeEdgeParent :: Gr a b -> Node -> [Edge] -getTreeEdgeParent inGraph inNode = - let parentList = parents inGraph inNode - parentTreeList = fmap fst $ filter ((== False) . snd) $ zip parentList (fmap (isNetworkNode inGraph) parentList) - in - if null parentTreeList then [] - else [(head parentTreeList, inNode)] - --- Wrapper functions for fgl so could swap out later if want to - --- | maps to isEmpty -isEmpty :: Gr a b -> Bool -isEmpty = G.isEmpty - --- | maps to empty -empty :: Gr a b -empty = G.empty - --- | maps to equal -equal :: (Eq a, Eq b) => Gr a b -> Gr a b -> Bool -equal x y = G.equal x y - --- | gelem is a node in a graph -gelem :: Node -> Gr a b -> Bool -gelem = G.gelem - --- | maps to labNodes-- I beleive in vertex index order -labNodes :: Gr a b -> [LNode a] -labNodes = G.labNodes - --- | maps to labEdges -labEdges :: Gr a b -> [LEdge b] -labEdges = G.labEdges - --- | toEdge removes label from edge -toEdge :: LEdge b -> Edge -toEdge = G.toEdge - --- | toNode returns fst of LNode a -toNode :: LNode a -> Node -toNode inNode = fst inNode - --- | toLEdge adds a label to an edge -toLEdge :: Edge -> b -> LEdge b -toLEdge = G.toLEdge - --- | toLEdge' flipped version of toLEdge -toLEdge' :: b -> Edge -> LEdge b -toLEdge' inLabel inEdge = G.toLEdge inEdge inLabel - --- | deg mapes to fgl deg -deg :: Gr a b -> Node -> Int -deg inGraph inNode = G.deg inGraph inNode - --- | maps to indeg -indeg :: Gr a b -> LNode a -> Int -indeg inGraph inLNode = G.indeg inGraph $ fst inLNode - --- | maps to outdeg -outdeg :: Gr a b -> LNode a -> Int -outdeg inGraph inLNode = G.outdeg inGraph $ fst inLNode - --- | getInOut takes a node and a graph and returns --- a triple (LNode a, indegree, outDegree) -getInOutDeg :: Gr a b -> LNode a -> (LNode a, Int, Int) -getInOutDeg inGraph inLNode = - if isEmpty inGraph then error "Empty graph in getInOut" - else - let inDeg = indeg inGraph inLNode - outDeg = outdeg inGraph inLNode - in - (inLNode, inDeg, outDeg) - --- | in-bound edge list to node, maps to inn -inn :: Gr a b -> Node -> [LEdge b] -inn = G.inn - --- | lab returns label of node as Maybe -lab :: Gr a b -> Node -> Maybe a -lab = G.lab - --- | out-bound edge list from node, maps to out -out :: Gr a b -> Node -> [LEdge b] -out = G.out - --- | hasEdge maps to fgl function returns True if graphs has directed edge between nodes -hasEdge :: Gr a b -> Edge -> Bool -hasEdge = G.hasEdge - --- | sisterLabNodes returns list of nodes that are "sister" ie share same parent --- as input node -sisterLabNodes :: (Eq a) => Gr a b -> LNode a -> [LNode a] -sisterLabNodes inGraph inNode = - if isEmpty inGraph then error "Empty graph in sisterLabNodes" - else - let parentNodeList = labParents inGraph (fst inNode) - otherChildrenOfParentsList = (concatMap (labDescendants inGraph) parentNodeList) L.\\ [inNode] - in - -- shoudl not need nub for phylogenetic graphs but whatever - L.nub otherChildrenOfParentsList - --- | parents of unlabelled node -parents :: Gr a b -> Node -> [Node] -parents inGraph inNode = fst3 <$> G.inn inGraph inNode - --- | grandParents of unlabelled node -grandParents :: Gr a b -> Node -> [Node] -grandParents inGraph inNode = - let nodeParents = parents inGraph inNode - in - concatMap (parents inGraph) nodeParents - --- | sharedGrandParents sees if a node has common grandparents -sharedGrandParents :: Gr a b -> Node -> Bool -sharedGrandParents inGraph inNode = - if isEmpty inGraph then error "Empty gaph in sharedGrandParents" - else if isRoot inGraph inNode then False - else - let parentList = parents inGraph inNode - grandParentLists = fmap (parents inGraph) parentList - intersection = L.foldl1' L.intersect grandParentLists - in - --True - if null intersection then False else True - --- | labParents returns the labelled parents of a node -labParents :: (Eq a) => Gr a b -> Node -> [LNode a] -labParents inGraph inNode = - let parentNodeList = parents inGraph inNode - parentLabelList = fmap (lab inGraph) parentNodeList - hasNothing = Nothing `elem` parentLabelList - parentLabelList' = fmap fromJust parentLabelList - in - if hasNothing then error "Unlabeled nodes in labParents" - else zip parentNodeList parentLabelList' - - --- | descendants of unlabelled node -descendants :: Gr a b -> Node -> [Node] -descendants inGraph inNode = snd3 <$> G.out inGraph inNode - --- | labDescendants labelled descendents of labelled node -labDescendants :: (Eq a) => Gr a b -> LNode a -> [LNode a] -labDescendants inGraph inNode = - let nodeList = snd3 <$> G.out inGraph (fst inNode) - maybeLabelList = fmap (lab inGraph) nodeList - hasNothing = Nothing `elem` maybeLabelList - labelList = fmap fromJust maybeLabelList - labNodeList = zip nodeList labelList - in - if hasNothing then error "Unlabeled nodes in labDescendants" - else labNodeList - --- | takes a graph and node and returns pair of inbound and noutbound labelled edges -getInOutEdges :: Gr a b -> Node -> ([LEdge b], [LEdge b]) -getInOutEdges inGraph inNode = (inn inGraph inNode, out inGraph inNode ) - --- | nodes returns list of unlabbeled nodes, maps to nodes -nodes :: Gr a b -> [Node] -nodes = G.nodes - --- | edges returns list of unlabeled nodes, maps to nodes -edges :: Gr a b -> [Edge] -edges = G.edges - --- | insEdges inserts a list of labelled edges into a graph -insEdges :: [LEdge b] -> Gr a b -> Gr a b -insEdges = G.insEdges - --- | insEdge inserts a labelled edge into a graph -insEdge :: LEdge b -> Gr a b -> Gr a b -insEdge = G.insEdge - --- | delLEdges delete a labelled edge from a graph --- wrapps around delEdge -delLEdge :: LEdge b -> Gr a b -> Gr a b -delLEdge inEdge = G.delEdge (G.toEdge inEdge) - --- | delEdges delete an unlabelled edge from a graph --- wrapps around delEdge -delEdge :: Edge -> Gr a b -> Gr a b -delEdge inEdge = G.delEdge inEdge - --- | delLEdge deletes a list of unlabelled edges from a graph --- wrapps around delEdges -delEdges :: [Edge] -> Gr a b -> Gr a b -delEdges inEdgeList = G.delEdges inEdgeList - --- | delLEdge deletes a list of labelled edges from a graph --- wrapps around delEdges -delLEdges :: [LEdge b] -> Gr a b -> Gr a b -delLEdges inEdgeList = G.delEdges (fmap G.toEdge inEdgeList) - --- | insNode inserts a labelled node into a graph -insNode :: LNode a -> Gr a b -> Gr a b -insNode = G.insNode - --- | insNodes inserts multiple labelled nodes into a graph -insNodes :: [LNode a] -> Gr a b -> Gr a b -insNodes = G.insNodes - --- | delLNode deletes a labelled node from a graph --- NB Removes any edges involving this node -delLNode :: LNode a -> Gr a b -> Gr a b -delLNode inNode = G.delNode (fst inNode) - --- | delNode deletes an unlabelled node from a graph --- NB Removes any edges involving this node -delNode :: Node -> Gr a b -> Gr a b -delNode = G.delNode - --- | delNodes deletes a list of unlabelled nodes from a graph --- NB I beleive removes any edges involving these nodes -delNodes :: [Node] -> Gr a b -> Gr a b -delNodes = G.delNodes - --- | mkGraph creates a graph from list of nodes and list of edges -mkGraph :: [LNode a] -> [LEdge b] -> Gr a b -mkGraph = G.mkGraph - - --- | mkGraphPair creates a graph from pair of list of nodes and list of edges -mkGraphPair :: ([LNode a], [LEdge b]) -> Gr a b -mkGraphPair (nodeList, edgeList) = G.mkGraph nodeList edgeList - --- | components list of list of nodes (G.Graphalyze can return graph list) -components :: Gr a b -> [[Node]] -components = DFS.components - --- | componentGraphs takes a graph and returns its compnent gaphs -componentGraphs :: Gr a b -> [Gr a b] -componentGraphs inGraph = - if isEmpty inGraph then [] - else - let componentNodeListList = components inGraph - labComponentNodeListList = fmap (fmap (labelNode inGraph)) componentNodeListList - edgeListListList = fmap (fmap (inn inGraph)) componentNodeListList - componentEdgeList = fmap concat edgeListListList - componentGraphList = zipWith mkGraph labComponentNodeListList componentEdgeList - in - if length componentNodeListList == 1 then [inGraph] - else componentGraphList - - --- | labelNode uses lab but checks for Nothing and returns labelled node -labelNode :: Gr a b -> Node -> LNode a -labelNode inGraph inNode = - if isEmpty inGraph then error "Empty graph for label source" - else - let label = lab inGraph inNode - in - if isNothing label then error ("No label for node " ++ show inNode) - else - (inNode, fromJust label) - - --- | noComponents returns number of components -noComponents :: Gr a b -> Int -noComponents = DFS.noComponents - --- | isLeaf checks if node is root -isLeaf :: Gr a b -> Node -> Bool -isLeaf inGraph inNode = G.outdeg inGraph inNode == 0 - --- | isOutDeg1Node checks if node has a single child -isOutDeg1Node :: Gr a b -> Node -> Bool -isOutDeg1Node inGraph inNode = G.outdeg inGraph inNode == 1 - --- | isNetworkNode checks if node is network node -isNetworkNode :: Gr a b -> Node -> Bool -isNetworkNode inGraph inNode = (G.indeg inGraph inNode > 1) && (G.outdeg inGraph inNode > 0) - --- | isNetworkLeaf checks if node is network node and a leaf--usually an error condition in phylogenetic networks -isNetworkLeaf :: Gr a b -> Node -> Bool -isNetworkLeaf inGraph inNode = (G.indeg inGraph inNode > 1) && (G.outdeg inGraph inNode == 0) - - --- | - | isNetworkEdge checks if edge is network edge -isNetworkEdge :: Gr a b -> Edge -> Bool -isNetworkEdge inGraph inEdge = (G.indeg inGraph (snd inEdge) > 1) && (G.outdeg inGraph (snd inEdge) > 0) - --- | - | isNetworkLabEdge checks if edge is network edge -isNetworkLabEdge :: Gr a b -> LEdge b -> Bool -isNetworkLabEdge inGraph inEdge = (G.indeg inGraph (snd3 inEdge) > 1) && (G.outdeg inGraph (snd3 inEdge) > 0) - --- | labNetEdges takes a graph and returns list of network labelled edges -labNetEdges :: Gr a b -> [LEdge b] -labNetEdges inGraph = - if isEmpty inGraph then error "Empty graph in labNetEdges" - else filter (isNetworkLabEdge inGraph) $ labEdges inGraph - --- | netEdges takes a graph and returns list of network edges -netEdges :: Gr a b -> [Edge] -netEdges inGraph = - if isEmpty inGraph then error "Empty graph in labNetEdges" - else filter (isNetworkEdge inGraph) $ edges inGraph - --- | isTreeNode checks if node is network node -isTreeNode :: Gr a b -> Node -> Bool -isTreeNode inGraph inNode = (G.indeg inGraph inNode == 1) && (G.outdeg inGraph inNode > 0) - --- getRoots returns list of graph roots (labelled) -getRoots :: Gr a b -> [LNode a] -getRoots inGraph = - if isEmpty inGraph then [] - else - let nodeList = labNodes inGraph - -- rootBoolList = fmap (isRoot inGraph . fst) nodeList - rootBoolList = fmap ((== 0).length) $ fmap (inn inGraph) $ fmap fst nodeList - pairList = zip rootBoolList nodeList - rootPairList = filter ((==True).fst) pairList - rootList = fmap snd rootPairList - in - rootList - --- | getIsolatedNodes returns list of labelled nodes with indegree=outdegree=0 --- should change to use G.deg == 0 -getIsolatedNodes :: Gr a b -> [LNode a] -getIsolatedNodes inGraph = - if isEmpty inGraph then [] - else - let nodeList = labNodes inGraph - -- rootBoolList = fmap (isRoot inGraph . fst) nodeList - in0BoolList = fmap ((== 0).length) $ fmap (inn inGraph) $ fmap fst nodeList - out0BoolList = fmap ((== 0).length) $ fmap (out inGraph) $ fmap fst nodeList - isolateBoolList = zipWith (&&) in0BoolList out0BoolList - - pairList = zip isolateBoolList nodeList - isolatePairList = filter ((==True).fst) pairList - isolateList = fmap snd isolatePairList - in - isolateList - --- | isRoot checks if node is root -isRoot :: Gr a b -> Node-> Bool -isRoot inGraph inNode = - G.gelem inNode inGraph && (G.indeg inGraph inNode == 0) - --- | pre returns list of nodes linking to a node -pre :: Gr a b -> Node -> [Node] -pre = G.pre - --- | suc returns list of nodes linking from a node -suc :: Gr a b -> Node -> [Node] -suc = G.suc - --- | edgeLabel returns label of edge -edgeLabel :: LEdge b -> b -edgeLabel = G.edgeLabel - --- | getOtherVertex retuns the edge vertex /= index -getOtherVertex :: LEdge b -> Int -> Int -getOtherVertex (u,v,_) index = if u == index then v else u - --- | flipEdge flips orientation of unlabelled edge -flipEdge :: Edge -> Edge -flipEdge (u,v) = (v,u) - --- | flipLEdge flips orientation of labelled edge -flipLEdge :: LEdge b -> LEdge b -flipLEdge (u,v,w) = (v,u,w) - - --- | isTree takes a graph and checks if there are anmy network nodes--if not returns True -isTree :: Gr a b -> Bool -isTree inGraph = - if G.isEmpty inGraph then error "Empty graph in isTree" - else - let (_, _, _, netNodes) = splitVertexList inGraph - in - null netNodes - - - -- | splitVertexList splits the vertices of a graph into ([root], [leaf], [tree], [network]) -splitVertexList :: Gr a b -> ([LNode a], [LNode a], [LNode a], [LNode a]) -splitVertexList inGraph = - if G.isEmpty inGraph then ([],[],[],[]) - else - let -- leaves - degOutList = outdeg inGraph <$> labNodes inGraph - newNodePair = zip degOutList (labNodes inGraph) - leafPairList = filter ((==0).fst ) newNodePair - (_, leafList) = unzip leafPairList - - -- roots - degInList = indeg inGraph <$> labNodes inGraph - newRootPair = zip degInList (labNodes inGraph) - rootPairList = filter ((==0).fst ) newRootPair - (_, rootList) = unzip rootPairList - - -- tree nodes - nodeTripleList = zip3 degInList degOutList (labNodes inGraph) - treeTripleList = filter ((==1).fst3 ) $ filter ((>0).snd3 ) nodeTripleList - (_, _, treeVertexList) = unzip3 treeTripleList - - -- network nodes - networkTripleList = filter ((>1).fst3 ) $ filter ((>0).snd3 ) nodeTripleList - (_, _, networkVertexList) = unzip3 networkTripleList - in - (rootList, leafList, treeVertexList, networkVertexList) - --- | pathToRoot takes a graph and a vertex and returns a pair of lists --- of vertices and edges to root(s) in order of encountering them to root --- if a tree--not necessarily if network--should work -pathToRoot :: (Eq a, Eq b) => Gr a b -> LNode a -> ([LNode a], [LEdge b]) -pathToRoot inGraph inNode = - if G.isEmpty inGraph then error "Empty graph in pathToRoot" - else pathToRoot' inGraph [inNode] [] [] - --- | pathToRoot' with accumulators --- filter operators basically for networks so not retrace paths --- includes roots as nodes -pathToRoot' :: (Eq a, Eq b) => Gr a b -> [LNode a] -> [LNode a] -> [LEdge b] -> ([LNode a], [LEdge b]) -pathToRoot' inGraph inNodeList curNodeList curEdgeList = - if null inNodeList then (reverse curNodeList, reverse curEdgeList) - else - let inNode = head inNodeList - in - -- root would already be inlist of nodes visited - if isRoot inGraph (fst inNode) then pathToRoot' inGraph (tail inNodeList) curNodeList curEdgeList - else - let inLEdges = filter (`notElem` curEdgeList) $ inn inGraph (fst inNode) - inNodes = filter (`notElem` (fmap fst curNodeList)) $ fmap fst3 inLEdges - --inLabNodes = concatMap (labParents inGraph) (fmap fst3 inLEdges) - inLabNodes = zip inNodes (fmap (fromJust . lab inGraph) inNodes) - in - pathToRoot' inGraph (inLabNodes ++ tail inNodeList) (inLabNodes ++ curNodeList) (inLEdges ++ curEdgeList) - --- | postOrderPathToNode takes a graph and two vertices nd returns a pair of lists --- of vertices and edges to beteween them in order of encountering them from first to second --- the path is post order to root so if second vertex is leaf-side of first node will hit root and fail -postOrderPathToNode :: (Eq a, Eq b) => Gr a b -> LNode a -> LNode a -> ([LNode a], [LEdge b]) -postOrderPathToNode inGraph startNode endNode = - if G.isEmpty inGraph then error "Empty graph in pathToRoot" - else postOrderPathToNode' inGraph endNode [startNode] [] [] - --- | postOrderPathToNode' with accumulators --- filter operators basically for networks so not retrace paths -postOrderPathToNode' :: (Eq a, Eq b) => Gr a b -> LNode a -> [LNode a] -> [LNode a] -> [LEdge b] -> ([LNode a], [LEdge b]) -postOrderPathToNode' inGraph endNode inNodeList curNodeList curEdgeList = - if null inNodeList then (reverse curNodeList, reverse curEdgeList) - else - let inNode = head inNodeList - in - -- root would already be inlist of nodes visited - if (fst inNode) == (fst endNode) then postOrderPathToNode' inGraph endNode (tail inNodeList) curNodeList curEdgeList - else if isRoot inGraph (fst inNode) then error ("postOrderPathToNode hit root before end node. Root index " ++ (show $ fst inNode) - ++ " edges " ++ (show $ fmap toEdge curEdgeList)) - else - let inLEdges = filter (`notElem` curEdgeList) $ inn inGraph (fst inNode) - inNodes = filter (`notElem` (fmap fst curNodeList)) $ fmap fst3 inLEdges - inLabNodes = zip inNodes (fmap (fromJust . lab inGraph) inNodes) - in - postOrderPathToNode' inGraph endNode (inLabNodes ++ tail inNodeList) (inLabNodes ++ curNodeList) (inLEdges ++ curEdgeList) - --- | nodesAndEdgesBefore takes a graph and list of nodes to get list of nodes --- and edges 'before' in the sense of leading to--ie between root and --- (not including)) that node --- call with ([], []) --- filter operators basically for networks so not retrace paths -nodesAndEdgesBefore' :: (Eq a, Eq b, Show a) => Gr a b -> ([LNode a], [LEdge b]) -> [LNode a] -> ([LNode a], [LEdge b]) -nodesAndEdgesBefore' inGraph curResults@(curNodes, curEdges) inNodeList - | G.isEmpty inGraph = error "Input Graph is empty in nodesAndEdgesBefore" - | null inNodeList = curResults - | otherwise = - let intoEdgeList = filter (`notElem` curEdges) $ inn inGraph (fst $ head inNodeList) - intoNodeList = filter (`notElem` (fmap fst curNodes)) $ fmap fst3 intoEdgeList - labelMaybeList = fmap (lab inGraph) intoNodeList - labelList = fmap fromJust labelMaybeList - intoLabNodeList = zip intoNodeList labelList - in - if Nothing `elem` labelMaybeList then error ("Empty node label in nodesAndEdgesBefore" ++ show intoLabNodeList) - else nodesAndEdgesBefore' inGraph (intoLabNodeList ++ curNodes, intoEdgeList ++ curEdges) (intoLabNodeList ++ tail inNodeList) - --- | nodesAndEdgesBefore takes a graph and list of nodes to get list of nodes --- and edges 'before' in the sense of leading to--ie between root and --- (not including)) that node --- filter operators basically for networks so not retrace paths --- wrapper without accuulator --- Does NOT Contain starting nodes -nodesAndEdgesBefore :: (Eq a, Eq b, Show a) => Gr a b -> [LNode a] -> ([LNode a], [LEdge b]) -nodesAndEdgesBefore inGraph inNodeList = nodesAndEdgesBefore' inGraph ([],[]) inNodeList - --- | getEdgeListAfter wrapper around nodesAndEdgesAfter to return only edges -getEdgeListAfter :: (Eq a, Eq b,Show a) => (Gr a b, Node) -> [LEdge b] -getEdgeListAfter (inGraph, inNode) = - snd $ nodesAndEdgesAfter inGraph [(inNode, fromJust $ lab inGraph inNode)] - - --- | nodesAndEdgesAfter' takes a graph and list of nodes to get list of nodes --- and edges 'after' in the sense of leading from-ie between (not including)) that node --- and all the way to any leaves is connects to. --- Does NOT Contain starting nodes --- call with ([], []) --- filter operators basically for networks so not retrace paths -nodesAndEdgesAfter' :: (Eq a, Eq b, Show a) => Gr a b -> ([LNode a], [LEdge b]) -> [LNode a] -> ([LNode a], [LEdge b]) -nodesAndEdgesAfter' inGraph curResults@(curNodes, curEdges) inNodeList - | G.isEmpty inGraph = error "Input Graph is empty in nodesAndEdgesAfter" - | null inNodeList = curResults - | otherwise = - let fromEdgeList = filter (`notElem` curEdges) $ out inGraph (fst $ head inNodeList) - fromNodeList = filter (`notElem` (fmap fst curNodes)) $ fmap snd3 fromEdgeList - labelMaybeList = fmap (lab inGraph) fromNodeList - labelList = fmap fromJust labelMaybeList - fromLabNodeList = zip fromNodeList labelList - in - if Nothing `elem` labelMaybeList then error ("Empty node label in nodesAndEdgesAfter" ++ show fromLabNodeList) - else nodesAndEdgesAfter' inGraph (fromLabNodeList ++ curNodes, fromEdgeList ++ curEdges) (fromLabNodeList ++ tail inNodeList) - --- | nodesAndEdgesAfter takes a graph and list of nodes to get list of nodes --- and edges 'after' in the sense of leading from-ie between (not including)) that node --- and all the way to any leaves is connects to. --- wrapper wihtout accumulator --- Does NOT Contain starting nodes -nodesAndEdgesAfter :: (Eq a, Eq b,Show a) => Gr a b -> [LNode a] -> ([LNode a], [LEdge b]) -nodesAndEdgesAfter inGraph inNodeList = nodesAndEdgesAfter' inGraph ([],[]) inNodeList - --- | indexMatchNode returns True if two labelled nodes have same index -indexMatchNode :: LNode a -> LNode a -> Bool -indexMatchNode (a, _) (b, _) = if a == b then True else False - - --- | coevalNodePairs generatres a list of pairs of nodes that must be potentially equal in --- age (ie parents of networkNode) -coevalNodePairs :: (Eq a) => Gr a b -> [(LNode a, LNode a)] -coevalNodePairs inGraph = - if G.isEmpty inGraph then [] - else - let (_, _, _, netVertexList) = splitVertexList inGraph - pairListList = fmap (labParents inGraph) $ fmap fst netVertexList - in - if null netVertexList then [] - else fmap makePairs pairListList - where makePairs a = if length a /= 2 then error ("Not two parents for coevalNodePairs") - else (head a, a !! 1) - - --- | indexMatchEdge returns True if two labelled edges have same node indices -indexMatchEdge :: LEdge b -> LEdge b -> Bool -indexMatchEdge (a,b,_) (c,d,_) = if a == c && b == d then True else False - --- | contractRootOut1Edge contracts indegree 0, outdegree 1, edges and removes the node in the middle --- does one at a time and makes a graph and recurses --- removes "tail" edges (single) from root to single child -contractRootOut1Edge :: (Show a, Show b) => Gr a b -> Gr a b -contractRootOut1Edge inGraph = - if G.isEmpty inGraph then G.empty - else - let inOutDeg = getInOutDeg inGraph <$> labNodes inGraph - out1RootList = filter ((==1) . thd3) $ filter ((==0) . snd3) inOutDeg - out2RootList = filter ((>1) . thd3) $ filter ((==0) . snd3) inOutDeg - in - -- trace ("CRO1E :" ++ (show (inOutDeg, out1RootList, out2RootList))) ( - if null out1RootList then inGraph - else if length out2RootList > 1 then error ("Multiple roots in graph in contractRootOut1Edge: " ++ (show $ length out2RootList)) - else if null out2RootList then - let -- get root with out = 1 and its child, that childs' children, and edges - rootVertex = head out1RootList - childOfRoot = snd3 $ head $ out inGraph ((fst . fst3) rootVertex) - childOfRootEdges = out inGraph childOfRoot - - newEdgeToAdd0 = ((fst . fst3) rootVertex, snd3 $ head childOfRootEdges, thd3 $ head childOfRootEdges) - newEdgeToAdd1 = ((fst . fst3) rootVertex, snd3 $ last childOfRootEdges, thd3 $ last childOfRootEdges) - - -- create new Graph, deleting child node deltes three edges around it - newGraph = insEdges [newEdgeToAdd0, newEdgeToAdd1] $ delNode childOfRoot inGraph - - - in - -- trace ("Removing tail edge root :" ++ (show $ snd3 $ head $ out inGraph ((fst . fst3) rootVertex))) - contractRootOut1Edge $ reindexGraph newGraph - else -- case where mupltiple roots--combine by deleting in0out1 node and creting edge to its child from regular root. - let rootVertex = head out1RootList - in0out1RootIndex = (fst . fst3) rootVertex - - out2RootVertex = (fst . fst3 . head) out2RootList - - -- removes one of root out edges and inserts teh in0 out1 node adding two oew edges - root02EdgeToDelete = last $ out inGraph out2RootVertex - newEdgeFrom02Root = (out2RootVertex, in0out1RootIndex, thd3 root02EdgeToDelete) - newEdgeFrom01Root = (in0out1RootIndex, snd3 root02EdgeToDelete, thd3 root02EdgeToDelete) - - -- this relies on (a) root in first HTU - nonOTUOut0Nodes = fmap (fst . fst3) $ filter ((>= out2RootVertex) . (fst . fst3)) $ filter ((==0) . thd3) $ filter ((>0) . snd3) inOutDeg - - newGraph = insEdges [newEdgeFrom02Root, newEdgeFrom01Root] $ delEdge (toEdge root02EdgeToDelete) $ delNodes nonOTUOut0Nodes inGraph - - in - -- trace ("Removing extra edge root :" ++ (show $ (root02EdgeToDelete, newEdgeFrom02Root, newEdgeFrom01Root))) - contractRootOut1Edge $ reindexGraph $ contractIn1Out1Edges $ newGraph - - -- ) - - - --- | contractIn1Out1Edges contracts indegree 1, outdegree 1, edges and removes the node in the middle --- does one at a time and makes a graph and recurses -contractIn1Out1Edges :: Gr a b -> Gr a b -contractIn1Out1Edges inGraph = - if G.isEmpty inGraph then G.empty - else - let inOutDeg = getInOutDeg inGraph <$> labNodes inGraph - degree11VertexList = filter ((==1) . thd3) $ filter ((==1) . snd3) inOutDeg - in - -- trace ("vertex 11:" ++ show degree11VertexList) ( - if null degree11VertexList then inGraph - else - let nodeToDelete = fst3 $ head degree11VertexList - inEdgeToDelete = head $ inn inGraph $ fst nodeToDelete - outEdgeToDelete = head $ out inGraph $ fst nodeToDelete - newEdgeToAdd = (fst3 inEdgeToDelete, snd3 outEdgeToDelete, thd3 inEdgeToDelete) - reindexedNodes = reindexNodes (fst nodeToDelete) [] $ labNodes inGraph - reindexedEdges = reindexEdges (fst nodeToDelete) [] (newEdgeToAdd : (labEdges inGraph)) - - newGraph = mkGraph reindexedNodes reindexedEdges - --newGraph = insEdge newEdgeToAdd $ delLNode nodeToDelete inGraph - in - -- trace ("Deleting Node " ++ show (fst nodeToDelete) ++ " " ++ show (inEdgeToDelete, outEdgeToDelete) ++ " inserting " ++ show newEdgeToAdd) - contractIn1Out1Edges newGraph - -- ) - --- | reindexNodes takes a node (assumes index and fst of node are the same) and a list of --- nodes deleting the input node and reindexing all the other nodes with indices > than the input are reduced by 1 -reindexNodes :: Int -> [LNode a] -> [LNode a] -> [LNode a] -reindexNodes inNodeIndex curList nodeList = - if null nodeList then reverse curList - else - let firstNode@(index, label) = head nodeList - in - if index < inNodeIndex then reindexNodes inNodeIndex (firstNode : curList) (tail nodeList) - else if index == inNodeIndex then reindexNodes inNodeIndex curList (tail nodeList) - else reindexNodes inNodeIndex ((index - 1, label) : curList) (tail nodeList) - - --- | reindexEdges takes the index of a node that has/is being delted and reindexes indices --- that are not incident on the node and deleted if incedent on node index -reindexEdges :: Int -> [LEdge b] -> [LEdge b] -> [LEdge b] -reindexEdges inNodeIndex curList edgeList = - if null edgeList then curList - else - let (a,b,c) = head edgeList - a' = if a < inNodeIndex then a - else a - 1 - b' = if b < inNodeIndex then b - else b - 1 - in - -- incident on node to be deleted - if a == inNodeIndex || b == inNodeIndex then reindexEdges inNodeIndex curList (tail edgeList) - - -- reindexed edge added in - else reindexEdges inNodeIndex ((a', b', c) : curList) (tail edgeList) - - --- | artPoint calls ap to get articulation points of graph -artPoint :: (Eq b) => Gr a b -> [Node] -artPoint inGraph = AP.ap $ B.undir inGraph - - --- | makeNodeEdgePair takes a graph and extracts list of nodes and edges --- returning a pair -makeNodeEdgePair :: Gr a b -> ([LNode a], [LEdge b]) -makeNodeEdgePair inGraph = - if isEmpty inGraph then ([],[]) - else (labNodes inGraph, labEdges inGraph) - - --- | makeNodeEdgePairVect takes a graph and extracts list of nodes and edges --- returning a pair of a vector of nodes and Vector of edges -makeNodeEdgePairVect :: Gr a b -> (V.Vector (LNode a), V.Vector (LEdge b)) -makeNodeEdgePairVect inGraph = - if isEmpty inGraph then (V.empty, V.empty) - else (V.fromList $ labNodes inGraph, V.fromList $ labEdges inGraph) - --- | extractLeafGraph takes a Decorated graphs and cretes a graph from the leaves and no edges -extractLeafGraph :: Gr a b -> Gr a b -extractLeafGraph inGraph = - if isEmpty inGraph then G.empty - else - let (_, leafNodes, _, _) = splitVertexList inGraph - in - mkGraph leafNodes [] - --- | makes graph undirected -undir :: (Eq b) => Gr a b -> Gr a b -undir inGraph = B.undir inGraph - --- | finds bi connected components of a graph -bcc :: Gr a b -> [Gr a b] -bcc inGraph = BCC.bcc inGraph - - --- | removeNonLeafOut0Nodes removed nodes (and edges attached) that are ourtdegree = zero --- but not in the leaf node list --- does not reindex graph -removeNonLeafOut0Nodes :: (Eq a) => [LNode a] -> Gr a b -> Gr a b -removeNonLeafOut0Nodes leafList inGraph = - if null leafList then error "Null leaf list in removeNonLeafOut0Nodes" - else if isEmpty inGraph then empty - else - let nonLeafList = (labNodes inGraph) L.\\ leafList - outdegreePairList = zip nonLeafList (fmap (outdeg inGraph) nonLeafList) - (zeroOutNodeList, _) = unzip $ filter ((==0) .snd) outdegreePairList - in - if null zeroOutNodeList then inGraph - else - let newGraph = delNodes (fmap fst zeroOutNodeList) inGraph - in - removeNonLeafOut0Nodes leafList newGraph - --- | reindexGraph takes a graph and reindexes nodes and edges such that nodes --- are sequential and the firt field matches their node index -reindexGraph :: Gr a b -> Gr a b -reindexGraph inGraph = - if isEmpty inGraph then empty - else - let nodeList = labNodes inGraph - newIndexList = [0..(length nodeList - 1)] - nodeIndexPair = zip (fmap fst nodeList) newIndexList - nodeIndexMap = MAP.fromList nodeIndexPair - newNodeList = fmap (makeNewNode nodeIndexMap) nodeList - newEdgeList = fmap (makeNewEdge nodeIndexMap) (labEdges inGraph) - in - mkGraph newNodeList newEdgeList - - where makeNewNode indexMap (a,b) = (fromJust $ MAP.lookup a indexMap, b) - makeNewEdge indexMap (a,b,c) = (fromJust $ MAP.lookup a indexMap, fromJust $ MAP.lookup b indexMap, c) - --- | isBridge uses naive (component number) procedure to determine if edge is a bridge O(n) -isBridge :: Gr a b -> Edge -> Bool -isBridge inGraph inNode = - if isEmpty inGraph then error ("Empty graph in isBridge") - else - let numComponents = noComponents inGraph - numComponents' = noComponents $ delEdge inNode inGraph - in - numComponents' > numComponents - - - --- FGL articulation point code--could be modified to get brisge edges in linear time} ------------------------------------------------------------------------------- --- Tree for storing the DFS numbers and back edges for each node in the graph. --- Each node in this tree is of the form (v,n,b) where v is the vertex number, --- n is its DFS number and b is the list of nodes (and their DFS numbers) that --- lead to back back edges for that vertex v. ------------------------------------------------------------------------------- -data DFSTree a = B (a,a,[(a,a)]) [DFSTree a] - deriving (Eq, Show, Read) - ------------------------------------------------------------------------------- --- Tree for storing the DFS and low numbers for each node in the graph. --- Each node in this tree is of the form (v,n,l) where v is the vertex number, --- n is its DFS number and l is its low number. ------------------------------------------------------------------------------- -data LOWTree a = Brc (a,a,a) [LOWTree a] - deriving (Eq, Show, Read) - ------------------------------------------------------------------------------- --- Finds the back edges for a given node. ------------------------------------------------------------------------------- -getBackEdges :: Node -> [[(Node,Int)]] -> [(Node,Int)] -getBackEdges _ [] = [] -getBackEdges v ls = map head (filter (elem (v,0)) (tail ls)) - ------------------------------------------------------------------------------- --- Builds a DFS tree for a given graph. Each element (v,n,b) in the tree --- contains: the node number v, the DFS number n, and a list of backedges b. ------------------------------------------------------------------------------- -dfsTree :: Int -> Node -> [Node] -> [[(Node,Int)]] -> Gr a b -> ([DFSTree Int],Gr a b,Int) -dfsTree n _ [] _ g = ([],g,n) -dfsTree n _ _ _ g | isEmpty g = ([],g,n) -dfsTree n u (v:vs) ls g = case G.match v g of - (Nothing, g1) -> dfsTree n u vs ls g1 - (Just c , g1) -> (B (v,n+1,bck) ts:ts', g3, k) - where bck = getBackEdges v ls - (ts, g2,m) = dfsTree (n+1) v sc ls' g1 - (ts',g3,k) = dfsTree m v vs ls g2 - ls' = ((v,n+1):sc'):ls - sc' = map (\x->(x,0)) sc - sc = G.suc' c - ------------------------------------------------------------------------------- --- Finds the minimum between a dfs number and a list of back edges' dfs --- numbers. ------------------------------------------------------------------------------- -minbckEdge :: Int -> [(Node,Int)] -> Int -minbckEdge n [] = n -minbckEdge n bs = min n (minimum (map snd bs)) - ------------------------------------------------------------------------------- --- Returns the low number for a node in a subtree. ------------------------------------------------------------------------------- -getLow :: LOWTree Int -> Int -getLow (Brc (_,_,l) _) = l - ------------------------------------------------------------------------------- --- Builds a low tree from a DFS tree. Each element (v,n,low) in the tree --- contains: the node number v, the DFS number n, and the low number low. ------------------------------------------------------------------------------- -lowTree :: DFSTree Int -> LOWTree Int -lowTree (B (v,n,[] ) [] ) = Brc (v,n,n) [] -lowTree (B (v,n,bcks) [] ) = Brc (v,n,minbckEdge n bcks) [] -lowTree (B (v,n,bcks) trs) = Brc (v,n,lowv) ts - where lowv = min (minbckEdge n bcks) lowChild - lowChild = minimum (map getLow ts) - ts = map lowTree trs - ------------------------------------------------------------------------------- --- Builds a low tree for a given graph. Each element (v,n,low) in the tree --- contains: the node number v, the DFS number n, and the low number low. ------------------------------------------------------------------------------- -getLowTree :: Gr a b -> Node -> LOWTree Int -getLowTree g v = lowTree (head dfsf) - where (dfsf, _, _) = dfsTree 0 0 [v] [] g - ------------------------------------------------------------------------------- --- Tests if a node in a subtree is an articulation point. An non-root node v --- is an articulation point iff there exists at least one child w of v such --- that lowNumber(w) >= dfsNumber(v). The root node is an articulation point --- iff it has two or more children. ------------------------------------------------------------------------------- -isap :: LOWTree Int -> Bool -isap (Brc (_,_,_) []) = False -isap (Brc (_,1,_) ts) = length ts > 1 -isap (Brc (_,n,_) ts) = not (null ch) - -- modify for bridges - -- where ch = filter ( >= n) (map getLow ts) - where ch = filter ( >= n) (map getLow ts) ------------------------------------------------------------------------------- --- Finds the articulation points by traversing the low tree. ------------------------------------------------------------------------------- -arp :: LOWTree Int -> [Node] -arp (Brc (v,1,_) ts) | length ts > 1 = v:concatMap arp ts - | otherwise = concatMap arp ts -arp (Brc (v,n,l) ts) | isap (Brc (v,n,l) ts) = v:concatMap arp ts - | otherwise = concatMap arp ts - ------------------------------------------------------------------------------- --- Finds the articulation points of a graph starting at a given node. ------------------------------------------------------------------------------- -artpoints :: Gr a b -> Node -> [Node] -artpoints g v = arp (getLowTree g v) - -{-| - Finds the articulation points for a connected undirected graph, - by using the low numbers criteria: - - a) The root node is an articulation point iff it has two or more children. - - b) An non-root node v is an articulation point iff there exists at least - one child w of v such that lowNumber(w) >= dfsNumber(v). --} -ap' :: Gr a b -> [Node] -ap' g = artpoints g v where ((_,v,_,_),_) = G.matchAny g - --- | cyclic maps to cyclic function in module Cyclic.hs -cyclic :: Gr a b -> Bool -cyclic inGraph = - -- trace ("Cyclic:" ++ (show $ C.cyclic inGraph)) - C.cyclic inGraph - --- | testEdge nodeList fullEdgeList) counter --- chnage to input graph and delete edge from graph as opposed to making new graphs each time. --- should be much faster using P.delLEdge (since only one edge to delete) -testEdge :: (Eq b) => P.Gr a b -> G.LEdge b -> [G.LEdge b] -testEdge fullGraph candidateEdge@(e,u,_) = - let newGraph = G.delLEdge candidateEdge fullGraph - bfsNodes = BFS.bfs e newGraph - foundU = L.find (== u) bfsNodes - in - [candidateEdge | isNothing foundU] - --- | transitiveReduceGraph take list of nodes and edges, deletes each edge (e,u) in turn makes graph, --- checks for path between nodes e and u, if there is delete edge otherwise keep edge in list for new graph --- transitive reduction Aho et al. 1972 --- this not iterative with new graphs--shold it be? -transitiveReduceGraph :: (Eq b) => Gr a b -> Gr a b -transitiveReduceGraph fullGraph = - let requiredEdges = fmap (testEdge fullGraph) (labEdges fullGraph) - newGraph = G.mkGraph (labNodes fullGraph) (concat requiredEdges) - in - newGraph - --- | getCoevalConstraintEdges takes a graph and a network node and creates two lists: one of edges --- "before" (ie towards root) and a second "after (ie away from root) --- this defines a coeval constraint. No network edge can be added that would be directed --- from the before group to the after -getCoevalConstraintEdges :: (Eq a, Eq b, Show a) => Gr a b -> LNode a -> ([LEdge b],[LEdge b]) -getCoevalConstraintEdges inGraph inNode = - if isEmpty inGraph then error "Empty input graph in getCoevalConstraintEdges" - else - let (_, edgeBeforeList) = nodesAndEdgesBefore inGraph [inNode] - (_, edgeAfterList) = nodesAndEdgesAfter inGraph [inNode] - in - (edgeBeforeList, edgeAfterList) - - --- | getGraphCoevalConstraints takes a greaph and returns coeval constraints based on network nodes -getGraphCoevalConstraints :: (Eq a, Eq b, Show a, NFData b) => Gr a b -> [([LEdge b],[LEdge b])] -getGraphCoevalConstraints inGraph = - if isEmpty inGraph then error "Empty input graph in getGraphCoevalConstraints" - else - let (_, _, _, networkNodeList) = splitVertexList inGraph - in - if null networkNodeList then [] - else PU.seqParMap rdeepseq (getCoevalConstraintEdges inGraph) networkNodeList -- `using` PU.myParListChunkRDS - --- | getGraphCoevalConstraintsNodes takes a graph and returns coeval constraints based on network nodes --- and nodes as a triple -getGraphCoevalConstraintsNodes :: (Eq a, Eq b, Show a, NFData b) => Gr a b -> [(LNode a, [LEdge b],[LEdge b])] -getGraphCoevalConstraintsNodes inGraph = - if isEmpty inGraph then error "Empty input graph in getGraphCoevalConstraints" - else - let (_, _, _, networkNodeList) = splitVertexList inGraph - in - if null networkNodeList then [] - else - let (edgeBeforeList, edgeAfterList) = unzip (PU.seqParMap rdeepseq (getCoevalConstraintEdges inGraph) networkNodeList) -- `using` PU.myParListChunkRDS) - in zip3 networkNodeList edgeBeforeList edgeAfterList - --- | meetsAllCoevalConstraintsNodes checks constraint pair list and examines --- whether one edge is from before and one after--if so fails False --- else True if all pass --- new edghe that woudl be creatated in edge1 -> edge2 --- checks if starting and ending vertices of edges to be linked are on same side of each --- coeval contraint -meetsAllCoevalConstraintsNodes :: (Eq b) => [(Node, Node, [Node], [Node], [Node], [Node])] -> LEdge b -> LEdge b -> Bool -meetsAllCoevalConstraintsNodes constraintList edge1@(u,v,_) edge2@(u',v',_) = - if null constraintList then True - else - let (_, _, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter) = head constraintList - in - --checks if edge nodes would stradle existing coeveal nodes - if (u `elem` aNodesAfter) && (u' `elem` bNodesBefore) then False - else if (u' `elem` aNodesAfter) && (u `elem` bNodesBefore) then False - else if (v `elem` aNodesAfter) && (v' `elem` bNodesBefore) then False - else if (v' `elem` aNodesAfter) && (v `elem` bNodesBefore) then False - - else if (u `elem` aNodesBefore) && (u' `elem` bNodesAfter) then False - else if (u' `elem` aNodesBefore) && (u `elem` bNodesAfter) then False - else if (v `elem` aNodesBefore) && (v' `elem` bNodesAfter) then False - else if (v' `elem` aNodesBefore) && (v `elem` bNodesAfter) then False - - - else meetsAllCoevalConstraintsNodes (tail constraintList) edge1 edge2 - - --- | meetsAllCoevalConstraintsEdges checks constraint pair list and examines --- whether one edge is from before and one after--if so fails False --- else True if all pass - -- not correct I don't htink -meetsAllCoevalConstraintsEdges :: (Eq b) =>[([LEdge b],[LEdge b])] -> LEdge b -> LEdge b -> Bool -meetsAllCoevalConstraintsEdges constraintList edge1 edge2 = - if null constraintList then True - else - let (beforeList, afterList) = head constraintList - in - if edge1 `elem` beforeList && edge2 `elem` afterList then False - else if edge2 `elem` beforeList && edge1 `elem` afterList then False - else meetsAllCoevalConstraintsEdges (tail constraintList) edge1 edge2 - --- | insertDeleteEdges takes a graphs and list of nodes and edges to add and delete and creates new graph -insertDeleteEdges :: (Show a, Show b) => Gr a b -> ([LEdge b], [Edge]) -> Gr a b -insertDeleteEdges inGraph (edgesToAdd, edgesToDelete) = - let editedGraph = insEdges edgesToAdd $ delEdges edgesToDelete inGraph - in - -- trace ("AGE: " ++ (show editStuff) ++ "\nIn graph:\n" ++ (prettify inGraph) ++ "New Graph:\n" ++ (prettify editedGraph)) - editedGraph - - --- | notMatchEdgeIndices reutnr True if not in edge list but doesn't compare label only indices -notMatchEdgeIndices :: [Edge] -> LEdge b -> Bool -notMatchEdgeIndices unlabeledEdegList labelledEdge = - if toEdge labelledEdge `elem` unlabeledEdegList then False - else True - --- | isGraphTimeConsistent retuns False if graph fails time consistency -isGraphTimeConsistent :: (Show a,Eq a,Eq b, NFData a) => Gr a b -> Bool -isGraphTimeConsistent inGraph = - if isEmpty inGraph then True - else - if isTree inGraph then True - else - let coevalNodeConstraintList = coevalNodePairs inGraph - coevalNodeConstraintList' = PU.seqParMap rdeepseq (addBeforeAfterToPair inGraph) coevalNodeConstraintList -- `using` PU.myParListChunkRDS - coevalPairsToCompareList = getListPairs coevalNodeConstraintList' - timeOffendingEdgeList = getEdgesToRemoveForTime inGraph coevalPairsToCompareList - in - null timeOffendingEdgeList - --- | addBeforeAfterToPair adds before and after node list to pari of nodes for later use --- in time contraint edge removal -addBeforeAfterToPair :: (Show a,Eq a,Eq b) - => Gr a b - -> (LNode a, LNode a) - -> (LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]) -addBeforeAfterToPair inGraph (a,b) = - if isEmpty inGraph then error "Empty graph in addBeforeAfterToPair" - else - let (aNodesBefore, _) = nodesAndEdgesBefore inGraph [a] - (bNodesBefore, _) = nodesAndEdgesBefore inGraph [b] - (aNodesAfter, _) = nodesAndEdgesAfter inGraph [a] - (bNodesAfter, _) = nodesAndEdgesAfter inGraph [b] - in - (a,b, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter) - --- | getEdgesToRemoveForTime recursive looks at each pair of nodes that should --- be potentially coeval based on network vertices. And see if it conflicts with --- other pairs that should also be coeval. --- two pairs of nodes (a,b) and (a',b'), one for each net node tio be compared --- if a "before" a' then b must be before b' --- if a "after" a' then b must be after b' --- otherwise its a time violation --- returns net edge to delte in second pair -getEdgesToRemoveForTime :: (Show a,Eq a,Eq b) - => Gr a b - -> [((LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]),(LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]))] - -> [Edge] -getEdgesToRemoveForTime inGraph inNodePairList = - if isEmpty inGraph then [] - else if null inNodePairList then [] - else - let ((_,_, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter),(a',b', _, _,_,_)) = head inNodePairList - in - -- trace ("GETRT: " ++ (show inNodePairList)) ( - -- condition if a before a' then b before b' to be ok - if (a' `elem` aNodesAfter) && (b' `elem` bNodesBefore) then - let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') - in - trace ("\tRemoving network edge edge due to time consistancy: " ++ (show edgeToRemove)) - -- trace ("GERT Edges0:" ++ (show edgeToRemove) ++ " " ++ (show $ fmap toEdge $ out inGraph $ fst b') ++ " Net: " ++ (show $ fmap (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b')) - edgeToRemove : getEdgesToRemoveForTime inGraph (tail inNodePairList) - else if (a' `elem` aNodesBefore) && (b' `elem` bNodesAfter) then - let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') - in - -- trace ("GERT Edges1:" ++ (show edgeToRemove) ++ " " ++ (show $ fmap toEdge $ out inGraph $ fst b')) - trace ("\tRemoving network edge due to time consistancy : "++ (show edgeToRemove)) - edgeToRemove : getEdgesToRemoveForTime inGraph (tail inNodePairList) - else if (b' `elem` aNodesAfter) && (a' `elem` bNodesBefore) then - let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') - in - trace ("\tRemoving network edge edge due to time consistancy: " ++ (show edgeToRemove)) - -- trace ("GERT Edges0:" ++ (show edgeToRemove) ++ " " ++ (show $ fmap toEdge $ out inGraph $ fst b') ++ " Net: " ++ (show $ fmap (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b')) - edgeToRemove : getEdgesToRemoveForTime inGraph (tail inNodePairList) - else if (b' `elem` aNodesBefore) && (a' `elem` bNodesAfter) then - let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') - in - -- trace ("GERT Edges1:" ++ (show edgeToRemove) ++ " " ++ (show $ fmap toEdge $ out inGraph $ fst b')) - trace ("\tRemoving network edge due to time consistancy : "++ (show edgeToRemove)) - edgeToRemove : getEdgesToRemoveForTime inGraph (tail inNodePairList) - else getEdgesToRemoveForTime inGraph (tail inNodePairList) - -- ) - --- | getEdgeSplitList takes a graph and returns list of edges --- that split a graph increasing the number of components by 1 --- this is quadratic --- should change to Tarjan's algorithm (linear) --- everyhting else in there is O(n^2-3) so maybe doesn't matter --- filters out edges with parent nodes that are out degree 1 and root edges -getEdgeSplitList :: (Show a, Show b, Eq b) => Gr a b -> [LEdge b] -getEdgeSplitList inGraph = - if isEmpty inGraph then error ("Empty graph in getEdgeSplitList") - else - let origNumComponents = noComponents inGraph - origEdgeList = labEdges inGraph - edgeDeleteComponentNumberList = fmap noComponents $ fmap (flip delEdge inGraph) (fmap toEdge origEdgeList) - bridgeList = fmap snd $ filter ((> origNumComponents) . fst) $ zip edgeDeleteComponentNumberList origEdgeList - - -- filter out edges starting in an outdegree 1 node (network or in out 1) node - -- this would promote an HTU to a leaf later. Its a bridge, but not what need - bridgeList' = filter ((not . isRoot inGraph) .fst3 ) $ filter ((not. isNetworkNode inGraph) . snd3) $ filter ((not . isOutDeg1Node inGraph) . fst3) bridgeList - in - - -- trace ("BridgeList" ++ (show $ fmap toEdge bridgeList') ++ "\nGraph\n" ++ (prettyIndices inGraph)) - bridgeList' - --- | splitGraphOnEdge takes a graph and an edge and returns a single graph but with two components --- the roots of each component are returned with two graphs, with broken edge contracted, and 'naked' --- node returned (this is teh original connection vertex on base graph connected to pruned graph). --- The naked node is used for rejoining the two components during rearrangement --- (SplitGraph, root of component that has original root, root of component that was cut off, naked node left over) --- this function does not check whether edge is a 'bridge' -splitGraphOnEdge :: (Show b) => Gr a b -> LEdge b -> (Gr a b, Node, Node, Node) -splitGraphOnEdge inGraph (e,v,l) = - if isEmpty inGraph then error "Empty graph in splitGraphOnEdge" - else if (length $ getRoots inGraph) /= 1 then error ("Incorrect number roots in splitGraphOnEdge--must be 1: " ++ (show $ fmap fst $ getRoots inGraph)) - else - let childrenENode = (descendants inGraph e) L.\\ [v] - parentsENode = parents inGraph e - newEdge = (head parentsENode, head childrenENode, l) - edgesToDelete = [(head parentsENode, e), (e, head childrenENode)] -- (e,v) - - -- make new graph - splitGraph = insEdge newEdge $ delEdges edgesToDelete inGraph - in - if length childrenENode /= 1 then error ("Incorrect number of children of edge to split--must be 1: " ++ (show ((e,v), childrenENode))) - else if length parentsENode /= 1 then error ("Incorrect number of parents of edge to split--must be 1: " ++ (show ((e,v), parentsENode))) - else - -- trace ("SGE:" ++ (show (childrenENode, parentsENode, newEdge, edgesToDelete))) - (splitGraph, fst $ head $ getRoots inGraph, v, e) - --- | splitGraphOnEdge' like splitGrpahOnEdge above but returns edges created and destroyed as well --- used in Goodman-Bermer and could make swap more efficient as well. -splitGraphOnEdge' :: (Show b) => Gr a b -> LEdge b -> (Gr a b, Node, Node, Node, LEdge b, [Edge]) -splitGraphOnEdge' inGraph (e,v,l) = - if isEmpty inGraph then error "Empty graph in splitGraphOnEdge" - else if (length $ getRoots inGraph) /= 1 then error ("Incorrect number roots in splitGraphOnEdge--must be 1: " ++ (show $ fmap fst $ getRoots inGraph)) - else - let childrenENode = (descendants inGraph e) L.\\ [v] - parentsENode = parents inGraph e - newEdge = (head parentsENode, head childrenENode, l) - edgesToDelete = [(head parentsENode, e), (e, head childrenENode)] -- (e,v) - - -- make new graph - splitGraph = insEdge newEdge $ delEdges edgesToDelete inGraph - in - if length childrenENode /= 1 then error ("Incorrect number of children of edge to split--must be 1: " ++ (show ((e,v), childrenENode))) - else if length parentsENode /= 1 then error ("Incorrect number of parents of edge to split--must be 1: " ++ (show ((e,v), parentsENode))) - else - -- trace ("SGE:" ++ (show (childrenENode, parentsENode, newEdge, edgesToDelete))) - (splitGraph, fst $ head $ getRoots inGraph, v, e, newEdge, edgesToDelete) - --- | joinGraphOnEdge takes a graph and adds an edge reducing the component number --- expected ot be two components to one in SPR/TBR --- assumes that first node of edge (e,v,l) is 'naked' ie avaiable to make edges but is in graph --- created from splitGraphOnEdge -joinGraphOnEdge :: (Show a,Show b) => Gr a b -> LEdge b -> Node ->Gr a b -joinGraphOnEdge inGraph (x,y,l) parentofPrunedSubGraph = - if isEmpty inGraph then error ("Empty graph in joinGraphOnEdge") - else - let edgeToCreate0 = (x, parentofPrunedSubGraph, l) - edgeToCreate1 = (parentofPrunedSubGraph, y, l) - -- edgeToCreate2 = (parentofPrunedSubGraph, graphToJoinRoot, l) - in - -- make new graph - -- trace ("JGE:" ++ (show edgeToInvade) ++ " " ++ (show (parentofPrunedSubGraph, graphToJoinRoot))) -- ++ "\n" ++ (prettify inGraph)) - insEdges [edgeToCreate0, edgeToCreate1] $ delEdge (x,y) inGraph - --- | parentsInChain checks for parents in chain ie network edges --- that implies a network event between nodes where one is the ancestor of the other --- a time violation -parentInChain :: (Show a, Eq a, Eq b) => Gr a b -> Bool -parentInChain inGraph = - if isEmpty inGraph then error "Null graph in parentInChain" - else - let (_, _, _, netVertexList) = splitVertexList inGraph - parentNetVertList = fmap (labParents inGraph) $ fmap fst netVertexList - - -- get list of nodes that are transitively equal in age - concurrentList = mergeConcurrentNodeLists parentNetVertList [] - concurrentPairList = concatMap getListPairs concurrentList - - -- get pairs that violate concurrency - violatingConcurrentPairs = concatMap (concurrentViolatePair inGraph) concurrentPairList - in - if null violatingConcurrentPairs then False - else True - --- | getSisterSisterEdgeList take a graph and returns list of edges where two network nodes --- have the same two parents -getSisterSisterEdgeList :: Gr a b -> [Edge] -getSisterSisterEdgeList inGraph = - if isEmpty inGraph then [] - else - let (_, _, _, netVertexList) = splitVertexList inGraph - in - if null netVertexList then [] - else getSisterSisterEdgeByNetVertex inGraph netVertexList - --- | getSisterSisterEdgeByNetVertex takes a list of vertices and recursively generate a list of edges to delete -getSisterSisterEdgeByNetVertex :: Gr a b -> [LNode a] -> [Edge] -getSisterSisterEdgeByNetVertex inGraph netNodeList = - if null netNodeList then [] - else - let firstNode = fst $ head netNodeList - parentsList = parents inGraph firstNode - grandParentsList = fmap (parents inGraph) parentsList - sameGrandParentList = L.foldl1' L.intersect grandParentsList - in - if null sameGrandParentList then getSisterSisterEdgeByNetVertex inGraph (tail netNodeList) - else - -- trace ("Found sister-sister") - (toEdge $ head $ inn inGraph firstNode) : getSisterSisterEdgeByNetVertex inGraph (tail netNodeList) - --- | concurrentViolatePair takes a pair of nodes and sees if either is ancetral to the other--if so returns pair --- as list otherwise null list -concurrentViolatePair :: (Eq a, Show a, Eq b) => Gr a b -> (LNode a, LNode a) -> [(LNode a, LNode a)] -concurrentViolatePair inGraph (node1, node2) = - if isEmpty inGraph then error "Empty graph in concurrentViolatePair" - else - let (nodesBeforeFirst, _) = nodesAndEdgesBefore inGraph [node1] - (nodesBeforeSecond, _) = nodesAndEdgesBefore inGraph [node2] - in - if node2 `elem` nodesBeforeFirst then [(node1, node2)] - else if node1 `elem` nodesBeforeSecond then [(node1, node2)] - else [] - - --- | mergeConcurrentNodeLists takes a list os lists and returns a list os lists of merged lists --- lists are merged if they share any elements -mergeConcurrentNodeLists :: (Eq a) => [[LNode a]] -> [[LNode a]] -> [[LNode a]] -mergeConcurrentNodeLists inListList currentListList = - if null inListList then - -- trace ("MCNL:" ++ (show $ fmap (fmap fst) currentListList)) - currentListList - - -- first case - else if null currentListList then mergeConcurrentNodeLists (tail inListList) [head inListList] - else - let firstList = head inListList - (intersectList, _) = unzip $ filter ((== True) . snd) $ zip (currentListList) (fmap (not . null) $ fmap (L.intersect firstList) currentListList) - - (noIntersectLists, _) = unzip $ filter ((== False) . snd) $ zip (currentListList) (fmap (not . null) $ fmap (L.intersect firstList) currentListList) - - mergedList = if null intersectList then firstList - else L.foldl' L.union firstList intersectList - in - -- trace ("MCL-F:" ++ (show $ fmap fst firstList) ++ " inter " ++ (show $ fmap (fmap fst) intersectList) ++ - -- " noInter " ++ (show $ fmap (fmap fst) noIntersectLists) ++ " curList " ++ (show $ fmap (fmap fst) currentListList)) - mergeConcurrentNodeLists (tail inListList) (mergedList : noIntersectLists) - --- | sortEdgeListByDistance sorts edges by distance (in edges) from edge pair of vertices --- cretes a list of edges into (but traveling away from) an initial eNOde and away from --- an initial vNode adding new nodes to those lists as encountered by traversing edges. --- the eidea is theat the nodes from a directed edge (eNode, vNode) --- the list is creted at each round from the "in" and "out" edge lists --- so they are in order of 1 edge 2 edges etc. -sortEdgeListByDistance :: Gr a b -> [Node] -> [Node] -> [LEdge b] -sortEdgeListByDistance inGraph eNodeList vNodeList = - if isEmpty inGraph then error ("Empty graph in edgeListByDistance") - else if (null eNodeList && null vNodeList) then [] - else - -- get edges 'in' to eNodeList - let inEdgeList = concatMap (inn inGraph) eNodeList - newENodeList = fmap fst3 inEdgeList - - -- get edges 'out' from vNodeList - outEdgeList = concatMap (out inGraph) vNodeList - newVNodeList = fmap snd3 outEdgeList - in - inEdgeList ++ outEdgeList ++ (sortEdgeListByDistance inGraph newENodeList newVNodeList) - --- | switchRootTree takes a new root vertex index of a tree and switches the existing root (and all relevent edges) --- to new index -switchRootTree :: (Show a) => Int -> Gr a b -> Gr a b -switchRootTree newRootIndex inGraph = - if isEmpty inGraph then empty - else - let rootList = getRoots inGraph - (newRootCurInEdges, newRootCurOutEdges) = getInOutEdges inGraph newRootIndex - oldRootEdges = out inGraph $ fst $ head rootList - - in - -- not a directed tree - if length rootList /= 1 then error ("Graph input to switchRootTree is not a tree--not single root:" ++ (show rootList)) - - -- same root - else if newRootIndex == (fst $ head rootList) then inGraph - else - -- create new edges and delete the old ones - let newEdgesToAdd = fmap (flipVertices (fst $ head rootList) newRootIndex) (newRootCurInEdges ++ newRootCurOutEdges ++ oldRootEdges) - in - insEdges newEdgesToAdd $ delLEdges (newRootCurInEdges ++ newRootCurOutEdges ++ oldRootEdges) inGraph - --- | flipVertices takes an old vertex index and a new vertex index and inserts one for the other --- in a labelled edge -flipVertices :: Node -> Node -> LEdge b -> LEdge b -flipVertices a b (u,v,l) = - let newU = if u == a then b - else if u == b then a - else u - newV = if v == a then b - else if v == b then a - else v - in - -- trace (show (u,v,l) ++ "->" ++ show (newU, newV, l)) - (newU, newV, l) - --- | rerootTree takes a graph and reroots based on a vertex index (usually leaf outgroup) --- if input is a forest then only roots the component that contains the vertex wil be rerooted --- unclear how will effect network edges--will need to verify that does not create cycles --- multi-rooted components (as opposed to forests) are unaffected with trace warning thrown --- after checking for existing root and multiroots, should be O(n) where 'n is the length --- of the path between the old and new root -rerootTree :: (Show a, Show b, Eq b) => Node -> Gr a b -> Gr a b -rerootTree rerootIndex inGraph = - --trace ("In reroot Graph: " ++ show rerootIndex) ( - if isEmpty inGraph then inGraph - else - let componentList = components inGraph - parentNewRootList = pre inGraph rerootIndex - newRootOrigEdge = head $ inn inGraph rerootIndex - parentRootList = fmap (isRoot inGraph) parentNewRootList - outgroupInComponent = fmap (rerootIndex `elem`) componentList - componentWithOutgroup = filter ((== True).fst) $ zip outgroupInComponent componentList - -- (_, inNewRoot, outNewRoot) = getInOutDeg inGraph (labelNode inGraph rerootIndex) - in - - -- rerooting on root so no indegree edges - -- this for wagner build reroots where can try to reroot on leaf not yet added - if null $ inn inGraph rerootIndex then inGraph -- error ("Rerooting on indegree 0 node " ++ (show rerootIndex) ++ "\n" ++ prettyIndices inGraph) -- empty - - - else if null componentWithOutgroup then inGraph -- error ("Error rooting wierdness in rerootTree " ++ (show rerootIndex) ++ "\n" ++ prettyIndices inGraph) -- empty - - -- check if new outtaxon has a parent--shouldn't happen-but could if its an internal node reroot - else if null parentNewRootList || (True `elem` parentRootList) then inGraph - else (if null componentWithOutgroup then error ("Outgroup index " ++ show rerootIndex ++ " not found in graph") - else - --trace ("RRT: " ++ (show (rerootIndex, inNewRoot, outNewRoot))) ( - -- reroot component with new outtaxon - let componentWithNewOutgroup = snd $ head componentWithOutgroup - (_, originalRootList) = unzip $ filter ((==True).fst) $ zip (fmap (isRoot inGraph) componentWithNewOutgroup) componentWithNewOutgroup - numRoots = length originalRootList - orginalRoot = head originalRootList - originalRootEdges = out inGraph orginalRoot - - in - - if numRoots == 0 then error ("No root in rerootTree: Attempting to reroot on edge to node " ++ (show rerootIndex) ++ "\n" ++ prettyIndices inGraph) --empty - - -- check if outgroup in a multirooted component - -- if wagner build this is ok - else if numRoots > 1 then inGraph -- error ("Error: Attempting to reroot multi-rooted component") -- inGraph - else - --reroot graph safely automatically will only affect the component with the outgroup - -- delete old root edge and create two new edges from oringal root node. - -- keep orignl root node and delte/crete new edges when they are encounterd - --trace ("Moving root from " ++ (show orginalRoot) ++ " to " ++ (show rerootIndex)) ( - let leftChildEdge = (orginalRoot, rerootIndex, edgeLabel $ head originalRootEdges) - rightChildEdge = (orginalRoot, fst3 newRootOrigEdge, edgeLabel $ last originalRootEdges) - - -- this assumes 2 children of old root -- shouled be correct as Phylogenetic Graph - newEdgeOnOldRoot = if (length originalRootEdges) /= 2 then error ("Number of root out edges /= 2 in rerootGraph: " ++ (show $ length originalRootEdges) - ++ " root index: " ++ (show (orginalRoot, rerootIndex)) ++ "\nGraph:\n" ++ (prettyIndices inGraph)) - else (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, thd3 $ head originalRootEdges) - - newRootEdges = [leftChildEdge, rightChildEdge, newEdgeOnOldRoot] - newGraph = insEdges newRootEdges $ delLEdges (newRootOrigEdge : originalRootEdges) inGraph - - -- get edges that need reversing - newGraph' = preTraverseAndFlipEdges [leftChildEdge,rightChildEdge] newGraph - - in - --trace ("=") - --trace ("Deleting " ++ (show (newRootOrigEdge : originalRootEdges)) ++ "\nInserting " ++ (show newRootEdges)) - --trace ("In " ++ (GFU.showGraph inGraph) ++ "\nNew " ++ (GFU.showGraph newGraph) ++ "\nNewNew " ++ (GFU.showGraph newGraph')) - newGraph') - -- ) -- ) - --- | rerootDisplayTree like reroot but inputs original root position instead of figuring it out. --- assumes graph is tree--not useable fo Wagner builds since they have multiple components while building -rerootDisplayTree :: (Show a, Show b, Eq a, Eq b) => Node -> Node -> Gr a b -> Gr a b -rerootDisplayTree orginalRootIndex rerootIndex inGraph = - --trace ("In reroot Graph: " ++ show rerootIndex) ( - if isEmpty inGraph then inGraph - else - let -- componentList = components inGraph' - - -- hack---remove when figured out - {- - inGraph = if inGraphType == SoftWired then inGraph' -- removeDuplicateEdges inGraph' - else inGraph' - -} - - parentNewRootList = pre inGraph rerootIndex - newRootOrigEdge = head $ inn inGraph rerootIndex - parentRootList = fmap (isRoot inGraph) parentNewRootList - -- outgroupInComponent = fmap (rerootIndex `elem`) componentList - -- componentWithOutgroup = filter ((== True).fst) $ zip outgroupInComponent componentList - (_, inNewRoot, outNewRoot) = getInOutDeg inGraph (labelNode inGraph rerootIndex) - - {- Checks for valid trees - cyclicString = if cyclic inGraph then " Is cyclic " - else " Not cyclic " - - parentInCharinString = if parentInChain inGraph then " Is Parent in Chain " - else " Not Parent in Chain " - - duplicatedsEdgeString = if not (hasDuplicateEdge inGraph) then " No duplicate edges " - else - let dupEdgeList' = getDuplicateEdges inGraph - in (" Has duplicate edges: " ++ (show dupEdgeList')) - -} - in - - -- trace ("RRT In: " ++ cyclicString ++ parentInCharinString ++ duplicatedsEdgeString) ( - - -- check if cycle and exit if so - -- don't reroot on in=out=1 since same as it descendent edge - if (inNewRoot == 1) && (outNewRoot == 1) then - -- trace ("RRT: in 1 out 1") - inGraph - - -- else if cyclic inGraph then empty -- inGraph - - - -- rerooting on root so no indegree edges - -- this for wagner build reroots where can try to reroot on leaf not yet added - else if null $ inn inGraph rerootIndex then inGraph -- error ("Rerooting on indegree 0 node " ++ (show rerootIndex) ++ "\n" ++ prettyIndices inGraph) -- empty - - - -- else if null componentWithOutgroup then inGraph -- error ("Error rooting wierdness in rerootTree " ++ (show rerootIndex) ++ "\n" ++ prettyIndices inGraph) -- empty - - -- check if new outtaxon has a parent--shouldn't happen-but could if its an internal node reroot - else if null parentNewRootList || (True `elem` parentRootList) then inGraph - -- else if null componentWithOutgroup then error ("Outgroup index " ++ show rerootIndex ++ " not found in graph") - else - -- trace ("RRT: " ++ (show (rerootIndex, inNewRoot, outNewRoot))) ( - --reroot component with new outtaxon - let -- componentWithNewOutgroup = snd $ head componentWithOutgroup - -- (_, originalRootList) = unzip $ filter ((==True).fst) $ zip (fmap (isRoot inGraph) componentWithNewOutgroup) componentWithNewOutgroup - -- numRoots = 1 -- length originalRootList - orginalRoot = orginalRootIndex -- head originalRootList - originalRootEdges = (out inGraph orginalRoot) - - in - - {- - if numRoots == 0 then error ("No root in rerootDisplayTree: Attempting to reroot on edge to node " ++ (show (orginalRoot,rerootIndex)) ++ prettyIndices inGraph) --empty - - -- check if outgroup in a multirooted component - -- if wagner build this is ok - -- else if numRoots > 1 then inGraph -- error ("Error: Attempting to reroot multi-rooted component") -- inGraph - else - -} - --reroot graph safely automatically will only affect the component with the outgroup - -- delete old root edge and create two new edges from oringal root node. - -- keep orignl root node and delte/crete new edges when they are encounterd - --trace ("Moving root from " ++ (show orginalRoot) ++ " to " ++ (show rerootIndex)) ( - let leftChildEdge = (orginalRoot, rerootIndex, edgeLabel $ head originalRootEdges) - rightChildEdge = (orginalRoot, fst3 newRootOrigEdge, edgeLabel $ last originalRootEdges) - - -- this assumes 2 children of old root -- shouled be correct as Phylogenetic Graph - newEdgeOnOldRoot = if (length originalRootEdges) /= 2 then error ("Number of root out edges /= 2 in rerootGraph: " ++ (show $ length originalRootEdges) - ++ " root index: " ++ (show (orginalRoot, rerootIndex)) ++ "\nGraph:\n" ++ (prettyIndices inGraph)) - else (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, thd3 $ head originalRootEdges) - - newRootEdges = [leftChildEdge, rightChildEdge, newEdgeOnOldRoot] - newGraph = insEdges newRootEdges $ delLEdges (newRootOrigEdge : originalRootEdges) inGraph - - -- get edges that need reversing - newGraph' = preTraverseAndFlipEdgesTree orginalRootIndex [leftChildEdge,rightChildEdge] newGraph - - - {- Check for valid tree - cyclicString' = if cyclic newGraph' then " Is cyclic " - else " Not cyclic " - - parentInCharinString'= if parentInChain newGraph' then " Is Parent in Chain " - else " Not Parent in Chain " - - duplicatedsEdgeString' = if not (hasDuplicateEdge newGraph') then " No duplicate edges " - else let dupEdgeList' = getDuplicateEdges newGraph' - in - (" Has duplicate edges: " ++ (show dupEdgeList') ++ "\nDeleting " ++ (show $ fmap toEdge $ (newRootOrigEdge : originalRootEdges)) ++ "\nInserting " ++ (show $ fmap toEdge $ newRootEdges)) - -} - in - --trace ("=") - --trace ("In " ++ (GFU.showGraph inGraph) ++ "\nNew " ++ (GFU.showGraph newGraph) ++ "\nNewNew " ++ (GFU.showGraph newGraph')) - - -- trace ("Deleting " ++ (show $ fmap toEdge (newRootOrigEdge : originalRootEdges)) ++ "\nInserting " ++ (show $ fmap toEdge newRootEdges) - -- ++ "\nRRT Out: " ++ cyclicString' ++ parentInCharinString' ++ duplicatedsEdgeString') ( - - -- if cyclicString' == " Is cyclic " then inGraph - -- else if hasDuplicateEdge newGraph' then removeDuplicateEdges newGraph' - -- else - {-Cycle check - if cyclic newGraph' then - trace ("Orignal root: " ++ (show orginalRootIndex) ++ "New root: " ++ (show rerootIndex) ++ " Deleting " ++ (show $ fmap toEdge (newRootOrigEdge : originalRootEdges)) ++ "\nInserting " ++ (show $ fmap toEdge newRootEdges) - ++ "\nOrigGraph: " ++ (prettyIndices inGraph) ++ "\nNewGraph: " ++ (prettyIndices newGraph)++ "\nNewGraph': " ++ (prettyIndices newGraph')) - empty - else newGraph'-} - newGraph' - -- ) - -- ) -- ) - - --- | preTraverseAndFlipEdgesTree traverses a tree from starting edge flipping in-edges since they should --- be out-edges --- when recursion its edges that don't need to be fliped then stops --- assumes input edge is directed correctly --- follows traversal out "pre" order updating graph as edges flipped -preTraverseAndFlipEdgesTree :: (Eq b) => Node -> [LEdge b] -> Gr a b -> Gr a b -preTraverseAndFlipEdgesTree rootIndex inEdgeList inGraph = - if null inEdgeList then inGraph - else - let -- first edge directled correctly - inEdge@(_,v,_) = head inEdgeList - - -- edges "in" to child node of first edge--these should be out and need to be flipped - childEdges = filter ((/= rootIndex) . fst3) $ filter (/= inEdge) $ inn inGraph v - - -- flip to "in" to "out" edges - flippedEdges = fmap flipLEdge childEdges - - -- -- modify graph accordingly - newGraph = insEdges flippedEdges $ delLEdges childEdges inGraph - in - --trace ("PTFE: flipped " ++ (show $ fmap toEdge flippedEdges)) ( - -- edge terminates in leaf or edges in correct orientation - if null childEdges then preTraverseAndFlipEdgesTree rootIndex (tail inEdgeList) inGraph - - -- edge needs to be reversed to follow through its children from a new graph - else preTraverseAndFlipEdgesTree rootIndex (flippedEdges ++ (tail inEdgeList)) newGraph - -- ) - - --- | preTraverseAndFlipEdges traverses graph from starting edge flipping edges as needed --- when recursion its edges that don't need to be fliped then stops --- assumes input edge is directed correctly --- follows traversal out "pre" order ish from edges --- have to check edge orientatins--make sure thay haven't changed as graph was updated earlier -preTraverseAndFlipEdges :: (Eq b) => [LEdge b] -> Gr a b -> Gr a b -preTraverseAndFlipEdges inEdgelist inGraph = - if null inEdgelist then inGraph - else - let inEdge@(_,v,_) = head inEdgelist - childEdges = (out inGraph v) ++ (filter (/= inEdge) $ inn inGraph v) - - -- returns list of edges that had to be flipped - edgesToFlip = getToFlipEdges v childEdges - flippedEdges = fmap flipLEdge edgesToFlip - newGraph = insEdges flippedEdges $ delLEdges edgesToFlip inGraph - in - --trace ("PTFE: flipped " ++ (show $ fmap toEdge flippedEdges)) ( - -- edge terminates in leaf or edges in correct orientation - if null childEdges || null edgesToFlip then preTraverseAndFlipEdges (tail inEdgelist) inGraph - -- edge needs to be reversed to follow through its children from a new graph - else preTraverseAndFlipEdges (flippedEdges ++ (tail inEdgelist)) newGraph - -- ) - --- | getToFlipEdges takes an index and check edge list --- and creates new list of edges that need to be flipped -getToFlipEdges :: Node -> [LEdge b] -> [LEdge b] -getToFlipEdges parentNodeIndex inEdgeList = - if null inEdgeList then [] - else - let firstEdge@(u,_,_) = head inEdgeList - in - if parentNodeIndex /= u then firstEdge : getToFlipEdges parentNodeIndex (tail inEdgeList) - else getToFlipEdges parentNodeIndex (tail inEdgeList) - --- | Random generates display trees up to input number by choosing --- to keep indegree nodes > 1 unifomaly at random -generateDisplayTreesRandom :: (Show a, Show b, Eq a, Eq b, NFData a, NFData b) => Int -> Int -> Gr a b -> [Gr a b] -generateDisplayTreesRandom rSeed numDisplayTrees inGraph = - if isEmpty inGraph then error "Empty graph in generateDisplayTreesRandom" - else - let atRandomList = take numDisplayTrees $ randomIntList rSeed - randDisplayTreeList = PU.seqParMap rdeepseq (randomlyResolveGraphToTree inGraph) atRandomList -- `using` PU.myParListChunkRDS - in - randDisplayTreeList - --- | randomlyResolveGraphToTree resolves a single graph to a tree by choosing single indegree edges --- uniformly at random and deleting all others from graph --- in=out=1 nodes are contracted, HTU's withn outdegree 0 removed, graph reindexed --- but not renamed--edges from root are left alone. -randomlyResolveGraphToTree :: (Show a, Show b, Eq a, Eq b) => Gr a b -> Int -> Gr a b -randomlyResolveGraphToTree inGraph randVal = - if isEmpty inGraph then error "Empty graph in randomlyResolveGraphToTree" - else - let (_, leafList, _, _) = splitVertexList inGraph - -- rootEdgeList = fmap (out inGraph) $ fmap fst rootList - inEdgeListByVertex = (fmap (inn inGraph) (nodes inGraph)) -- L.\\ rootEdgeList - randList = fmap abs $ randomIntList randVal - edgesToDelete = concat $ zipWith chooseOneDumpRest randList (fmap (fmap toEdge) inEdgeListByVertex) - newTree = delEdges edgesToDelete inGraph - newTree' = removeNonLeafOut0Nodes leafList newTree - newTree'' = contractIn1Out1Edges newTree' - reindexTree = reindexGraph newTree'' - in - -- trace ("RRGT\n" ++ (prettify inGraph) ++ "\n to delete " ++ (show edgesToDelete) ++ "\nNew graph:\n" ++ (prettify newTree) - -- ++ "\nnewTree'\n" ++ (prettify newTree') ++ "\nnewTree''\n" ++ (prettify newTree'') ++ "\reindex\n" ++ (prettify reindexTree)) - reindexTree - --- | chooseOneDumpRest takes random val and chooses to keep the edge in list returning list of edges to delete -chooseOneDumpRest :: Int -> [Edge] -> [Edge] -chooseOneDumpRest randVal inEdgeList = - if null inEdgeList then [] - else if length inEdgeList == 1 then [] - else - let numEdgesIn = length inEdgeList - (_, indexToKeep) = divMod randVal numEdgesIn - in - -- trace ("CODR: " ++ (show inEdgeList) ++ " keeping " ++ (show $ inEdgeList !! indexToKeep) ++ " deleting " ++ (show $ inEdgeList L.\\ [inEdgeList !! indexToKeep]) ++ "based on " ++ (show (randVal, indexToKeep))) - inEdgeList L.\\ [inEdgeList !! indexToKeep] - - --- | generateDisplayTrees nice wrapper around generateDisplayTrees' with clean interface -generateDisplayTrees :: (Eq a) => Gr a b -> [Gr a b] -generateDisplayTrees inGraph = - let (_, leafList, _, _) = splitVertexList inGraph - in - generateDisplayTrees' leafList [inGraph] [] - --- | generateDisplayTrees' takes a graph list and recursively generates --- a list of trees created by progresively resolving each network vertex into a tree vertex --- in each input graph --- creating up to 2**m (m network vertices) trees. --- call -> generateDisplayTrees' [startGraph] [] --- the second and third args contain graphs that need more work and graphs that are done (ie trees) -generateDisplayTrees' :: (Eq a) => [LNode a] -> [Gr a b] -> [Gr a b] -> [Gr a b] -generateDisplayTrees' leafList curGraphList treeList = - if null curGraphList then - let treeList' = fmap (removeNonLeafOut0Nodes leafList) treeList - treeList'' = fmap contractIn1Out1Edges treeList' - reindexedTreeList = fmap reindexGraph treeList'' - in - reindexedTreeList - - else - let firstGraph = head curGraphList - in - if isEmpty firstGraph then [] - else - let nodeList = labNodes firstGraph - inNetEdgeList = filter ((>1).length) $ fmap (inn firstGraph) $ fmap fst nodeList - in - if null inNetEdgeList then generateDisplayTrees' leafList (tail curGraphList) (firstGraph : treeList) - else - let newGraphList = splitGraphListFromNode inNetEdgeList [firstGraph] - in - generateDisplayTrees' leafList (newGraphList ++ (tail curGraphList)) treeList - --- | splitGraphListFromNode take a graph and a list of edges for indegree > 1 node --- removes each in edge in turn to create a new graph and maintains any in 1 out 1 nodes --- if these edges are to be contracted out use 'contractOneOneEdges' --- should be a single traversal 'splitting' graph each time. removing edges anmd recursing --- to do it again untill all edges are indegree 1. --- the edges in list for recurvise deleteion should always be in all graphs uuntil --- the end -splitGraphListFromNode :: [[LEdge b]] -> [Gr a b] -> [Gr a b] -splitGraphListFromNode inEdgeListList inGraphList = - if null inEdgeListList then inGraphList - else if null inGraphList then error "Empty graph lits in splitGraphListFromNode" - else - let firstNetEdgeList = head inEdgeListList - indexList = [0.. (length firstNetEdgeList - 1)] - repeatedEdgeList = replicate (length firstNetEdgeList) firstNetEdgeList - netEdgeIndexPairList = zip repeatedEdgeList indexList - newGraphList = concat $ fmap (deleteEdgesCreateGraphs netEdgeIndexPairList 0) inGraphList - in - splitGraphListFromNode (tail inEdgeListList) newGraphList - --- | deleteEdgesCreateGraphs takes a list of edges and an index list and a graph, --- recursively keeps the index-th edge and deletes the others creating a new graph list -deleteEdgesCreateGraphs :: [([LEdge b], Int)] -> Int -> Gr a b -> [Gr a b] -deleteEdgesCreateGraphs netEdgeIndexPairList counter inGraph = - if isEmpty inGraph then error "Empty graph in "deleteEdgesCreateGraphs - else if null netEdgeIndexPairList then [] - else - let (edgeList, lIndex) = head netEdgeIndexPairList - --edgeToKeep = edgeList !! index - edgesToDelete = (take lIndex edgeList) ++ (drop (lIndex + 1) edgeList) - newGraph = delLEdges edgesToDelete inGraph - in - newGraph : deleteEdgesCreateGraphs (tail netEdgeIndexPairList) (counter + 1) inGraph diff --git a/pkg/PhyGraph/Utilities/ThreeWayFunctions.hs b/pkg/PhyGraph/Utilities/ThreeWayFunctions.hs deleted file mode 100644 index 165b55177..000000000 --- a/pkg/PhyGraph/Utilities/ThreeWayFunctions.hs +++ /dev/null @@ -1,465 +0,0 @@ -{- | -Module : ThreeWayFunctions.hs -Description : Module specifying three way optimization functions for use in pre-order - optimization of HardWired graphs and iterative pass-type optimization for Trees -Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{- -ToDo: - Add parallel optimization overblocks and characters? --} - - -module Utilities.ThreeWayFunctions ( threeMedianFinal - , addGapsToChildren - , threeWayGeneric - ) where - -import Data.Alphabet -import Data.Bits -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import qualified Data.MetricRepresentation as MR -import qualified Data.TCM.Dense as TCMD -import qualified Data.Vector as V -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Data.Word -import Foreign.C.Types (CUInt) -import GeneralUtilities -import qualified GraphOptimization.Medians as M -import qualified Input.BitPack as BP -import qualified SymMatrix as S -import Types.Types --- import Debug.Trace - --- | threeMedianFinal calculates a 3-median for data types in a single character --- for dynamic characters this is done by 3 min-trees --- taking best result. Later true 3-way via ffi can be incorporated --- first type of operation two parents and current node-- since prelim --- has (left child preliminary, node preliminary, right child preliminary) --- that information can be used if needed --- since assumes in 2 out 1 only the node preliminary field is used -threeMedianFinal :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> CharacterData -threeMedianFinal charInfo parent1 parent2 curNode = - let localCharType = charType charInfo - - in - if localCharType == Add then - let threeFinal = V.zipWith3 threeWayAdditive (rangeFinal parent1) (rangeFinal parent2) (snd3 $ rangePrelim curNode) - in curNode {rangeFinal = threeFinal} - - else if localCharType == NonAdd then - let threeFinal = V.zipWith3 threeWayNonAdditive (stateBVFinal parent1) (stateBVFinal parent2) (snd3 $ stateBVPrelim curNode) - in curNode {stateBVFinal = threeFinal} - - else if localCharType `elem` packedNonAddTypes then - let threeFinal = BP.threeWayPacked localCharType (packedNonAddFinal parent1) (packedNonAddFinal parent2) (snd3 $ packedNonAddPrelim curNode) - in curNode {packedNonAddFinal = threeFinal} - - else if localCharType == Matrix then - let threeFinal = V.zipWith3 (threeWayMatrix (costMatrix charInfo)) (matrixStatesFinal parent1) (matrixStatesFinal parent2) (matrixStatesPrelim curNode) - in curNode {matrixStatesFinal = threeFinal} - - else if localCharType == AlignedSlim then - let threeFinal = M.getFinal3WaySlim (slimTCM charInfo) (alignedSlimFinal parent1) (alignedSlimFinal parent2) (snd3 $ alignedSlimPrelim curNode) - in curNode {alignedSlimFinal = threeFinal} - - else if localCharType == AlignedWide then - let threeFinal = M.getFinal3WayWideHuge (wideTCM charInfo) (alignedWideFinal parent1) (alignedWideFinal parent2) (snd3 $ alignedWidePrelim curNode) - in curNode {alignedWideFinal = threeFinal} - - else if localCharType == AlignedHuge then - let threeFinal = M.getFinal3WayWideHuge (hugeTCM charInfo) (alignedHugeFinal parent1) (alignedHugeFinal parent2) (snd3 $ alignedHugePrelim curNode) - in curNode {alignedHugeFinal = threeFinal} - - else if (localCharType == SlimSeq) || (localCharType == NucSeq) then - let threeFinal = threeWaySlim charInfo parent1 parent2 curNode - in - curNode { slimFinal = threeFinal - } - - else if (localCharType == WideSeq) || (localCharType == AminoSeq) then - let threeFinal = threeWayWide charInfo parent1 parent2 curNode - in - curNode { wideFinal = threeFinal - } - - else if localCharType == HugeSeq then - let threeFinal = threeWayHuge charInfo parent1 parent2 curNode - in - curNode { hugeFinal = threeFinal - } - - else error ("Unrecognized/implemented character type: " ++ show localCharType) - - --- | threeWayNonAdditive takes the union/intersection operation over 3 non additive states -threeWayNonAdditive :: BV.BitVector -> BV.BitVector -> BV.BitVector -> BV.BitVector -threeWayNonAdditive inA inB inC = - let intersection3 = (inA .&. inB) .&. inC - intersectionAB = inA .&. inB - intersectionAC = inA .&. inC - intersectionBC = inB .&. inC - union3 = (inA .|. inB) .|. inC - in - if not (BV.isZeroVector intersection3) then intersection3 - else if not (BV.isZeroVector intersectionAB) then intersectionAB .|. inC - else if not (BV.isZeroVector intersectionAC) then intersectionAC .|. inB - else if not (BV.isZeroVector intersectionBC) then intersectionBC .|. inA - else union3 - --- | threeWayAdditive take three additive states and returns median --- the idea is the interval between the minimum of all three maximum (min maxA, maxB, maxC) --- and the maximum of all three minima (max minA, minB, minC) --- ordered such that the fst of pair not greater than second -threeWayAdditive :: (Int, Int) -> (Int, Int) -> (Int, Int) -> (Int, Int) -threeWayAdditive (minA, maxA) (minB, maxB) (minC, maxC) = - let minOfMaxs = minimum [maxA, maxB, maxC] - maxOfMins = maximum [minA, minB, minC] - in - if maxOfMins > minOfMaxs then (maxOfMins, minOfMaxs) - else (minOfMaxs, maxOfMins) - --- | threeWayMatrix creates median best state vector from a traceback, since parents could conflict --- on traceback does a minimization. --- The final states of parents will have non-maximum costs and these compared --- to the the child states with pointers to their children are set for --- traceback from current node to child(ren) from preliminary assignment --- since type assumes two children--they are both set to same value so if either left or right --- is set later the process will be correct -threeWayMatrix :: S.Matrix Int -> V.Vector MatrixTriple -> V.Vector MatrixTriple -> V.Vector MatrixTriple -> V.Vector MatrixTriple -threeWayMatrix inCostMatrix parent1 parent2 curNode = - let numStates = S.rows inCostMatrix - - -- get the costs of each state for each node, for prents non-maximal cost will be final states - parent1StatesCost = fmap fst3 parent1 - parent2StatesCost = fmap fst3 parent2 - curNodeStatesCost = fmap fst3 curNode - - -- get the minimum cost for each state given combinations of all three nodes and the min cost child state - minCost3States = getMinStatePair inCostMatrix (maxBound :: StateCost) numStates parent1StatesCost parent2StatesCost curNodeStatesCost - -- minStateCost = V.minimum $ fmap fst minCost3States - -- finalStatesTriple = fmap (assignMinMaxCost minStateCost (maxBound :: StateCost)) minCost3States - in - minCost3States - --- | getMinStatePair takes cost matrix and state costs (vector of Int) and returns best median cost state of child for that best cost --- if either parent or child has maxbound cost then that state get max bound cost -getMinStatePair :: S.Matrix Int -> StateCost -> Int -> V.Vector StateCost -> V.Vector StateCost -> V.Vector StateCost -> V.Vector (StateCost, [ChildStateIndex], [ChildStateIndex]) -getMinStatePair inCostMatrix maxCost numStates p1CostV p2CostV curCostV = - let -- get costs to parents-- will assume parent costs are 0 or max - bestMedianP1Cost = fmap (getBestPairCost inCostMatrix maxCost numStates p1CostV) [0..(numStates - 1)] - bestMedianP2Cost = fmap (getBestPairCost inCostMatrix maxCost numStates p2CostV) [0..(numStates - 1)] - - - -- get costs to single child via preliminary states - medianChildCostPairVect = fmap (getBestPairCostAndState inCostMatrix maxCost numStates curCostV) [0..(numStates - 1)] - - -- get 3 sum costs and best state value - threeWayStateCostList = zipWith3 f bestMedianP1Cost bestMedianP2Cost (fmap fst medianChildCostPairVect) - minThreeWayCost = minimum threeWayStateCostList - - finalStateCostL = zipWith (assignBestMax minThreeWayCost maxCost) threeWayStateCostList medianChildCostPairVect - - in - V.fromList finalStateCostL - where f a b c = a + b + c - --- | assignBestMax checks 3-way median state cost and if minimum sets to that otherwise sets to max --- double 2nd field for 2-child type asumption -assignBestMax :: StateCost -> StateCost -> StateCost -> (StateCost, [ChildStateIndex]) -> (StateCost, [ChildStateIndex], [ChildStateIndex]) -assignBestMax minCost maxCost stateCost (_, stateChildList) = - if stateCost == minCost then (minCost, stateChildList, stateChildList) - else (maxCost, stateChildList, stateChildList) - --- | getBestPairCost gets the baest cost for a state to each of parent states--does not keep parent state -getBestPairCost :: S.Matrix Int -> StateCost -> Int -> V.Vector StateCost -> Int -> StateCost -getBestPairCost inCostMatrix maxCost numStates parentStateCostV medianStateIndex = - let stateCost = V.minimum $ V.zipWith (g inCostMatrix maxCost medianStateIndex)parentStateCostV (V.fromList [0..(numStates - 1)]) - in - stateCost - - where g cM mC mS pC pS = if pC == mC then mC - else cM S.! (mS, pS) - --- | getBestPairCostAndState gets best pair of median state and chikd states based on preliminarr states of node -getBestPairCostAndState :: S.Matrix Int -> StateCost -> Int -> V.Vector StateCost -> Int -> (StateCost, [ChildStateIndex]) -getBestPairCostAndState inCostMatrix maxCost numStates childStateCostV medianStateIndex = - let statecostV = V.zipWith (g inCostMatrix maxCost medianStateIndex) childStateCostV (V.fromList [0..(numStates - 1)]) - minStateCost = V.minimum $ fmap fst statecostV - bestPairs = V.filter ((== minStateCost) . fst) statecostV - bestChildStates = V.toList $ fmap snd bestPairs - in - (minStateCost, L.sort bestChildStates) - - where g cM mC mS pC pS = if pC == mC then (mC, pS) - else (cM S.! (mS, pS), pS) - --- | threeWaySlim take charInfo, 2 parents, and curNOde and creates 3 median via --- 1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) --- 2) inserting gaps to make all 3 line up --- 3) creating 3-medians --- 4) choosing lowest cost median -threeWaySlim :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> SV.Vector CUInt -threeWaySlim charInfo parent1 parent2 curNode = - let -- pairwise median structures - p1p2 = M.getDOMedianCharInfo charInfo parent1 parent2 - p1cN = M.getDOMedianCharInfo charInfo parent1 curNode - p2cN = M.getDOMedianCharInfo charInfo parent2 curNode - - -- get 3rd to pairwise - p1p2cN = M.getDOMedianCharInfo charInfo p1p2 curNode - p1cNp2 = M.getDOMedianCharInfo charInfo p1cN parent2 - p2cNp1 = M.getDOMedianCharInfo charInfo p2cN parent1 - - (a1,b1,c1) = addGapsToChildren (slimGapped p1p2cN) (slimGapped p1p2) - (median1, cost1) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a1 b1 c1 - - (a2,b2,c2) = addGapsToChildren (slimGapped p1cNp2) (slimGapped p1cN) - (median2, cost2) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a2 b2 c2 - - (a3,b3,c3) = addGapsToChildren (slimGapped p2cNp1) (slimGapped p2cN) - (median3, cost3) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a3 b3 c3 - - minCost = minimum [cost1, cost2, cost3] - - in - if cost1 == minCost then median1 - else if cost2 == minCost then median2 - else median3 - - - --- | threeWayWide take charInfo, 2 parents, and curNOde and creates 3 median via --- 1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) --- 2) inserting gaps to make all 3 line up --- 3) creating 3-medians --- 4) choosing lowest cost median -threeWayWide :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> UV.Vector Word64 -threeWayWide charInfo parent1 parent2 curNode = - let -- pairwise median structures - p1p2 = M.getDOMedianCharInfo charInfo parent1 parent2 - p1cN = M.getDOMedianCharInfo charInfo parent1 curNode - p2cN = M.getDOMedianCharInfo charInfo parent2 curNode - - -- get 3rd to pairwise - p1p2cN = M.getDOMedianCharInfo charInfo p1p2 curNode - p1cNp2 = M.getDOMedianCharInfo charInfo p1cN parent2 - p2cNp1 = M.getDOMedianCharInfo charInfo p2cN parent1 - - (a1,b1,c1) = addGapsToChildren (wideGapped p1p2cN) (wideGapped p1p2) - (median1, cost1) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a1 b1 c1 - - (a2,b2,c2) = addGapsToChildren (wideGapped p1cNp2) (wideGapped p1cN) - (median2, cost2) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a2 b2 c2 - - (a3,b3,c3)= addGapsToChildren (wideGapped p2cNp1) (wideGapped p2cN) - (median3, cost3) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a3 b3 c3 - - minCost = minimum [cost1, cost2, cost3] - - in - if cost1 == minCost then median1 - else if cost2 == minCost then median2 - else median3 - --- | threeWayHuge take charInfo, 2 parents, and curNOde and creates 3 median via --- 1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) --- 2) inserting gaps to make all 3 line up --- 3) creating 3-medians --- 4) choosing lowest cost median -threeWayHuge :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> V.Vector BV.BitVector -threeWayHuge charInfo parent1 parent2 curNode = - let -- pairwise median structures - p1p2 = M.getDOMedianCharInfo charInfo parent1 parent2 - p1cN = M.getDOMedianCharInfo charInfo parent1 curNode - p2cN = M.getDOMedianCharInfo charInfo parent2 curNode - - -- get 3rd to pairwise - p1p2cN = M.getDOMedianCharInfo charInfo p1p2 curNode - p1cNp2 = M.getDOMedianCharInfo charInfo p1cN parent2 - p2cNp1 = M.getDOMedianCharInfo charInfo p2cN parent1 - - (a1,b1,c1) = addGapsToChildren (hugeGapped p1p2cN) (hugeGapped p1p2) - (median1, cost1) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a1 b1 c1 - - (a2,b2,c2) = addGapsToChildren (hugeGapped p1cNp2) (hugeGapped p1cN) - (median2, cost2) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a2 b2 c2 - - (a3,b3,c3) = addGapsToChildren (hugeGapped p2cNp1) (hugeGapped p2cN) - (median3, cost3) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a3 b3 c3 - - minCost = minimum [cost1, cost2, cost3] - - in - if cost1 == minCost then median1 - else if cost2 == minCost then median2 - else median3 - --- | addGapsToChildren pads out "new" gaps based on identity--if not identical--adds a gap based on cost matrix size --- importand node filed orders correct--has moved around -addGapsToChildren :: (FiniteBits a, GV.Vector v a) => (v a, v a, v a) -> (v a, v a, v a) -> (v a, v a, v a) -addGapsToChildren (reGappedParentFinal, _, reGappedNodePrelim) (gappedLeftChild, gappedNodePrelim, gappedRightChild) = - let (reGappedLeft, reGappedRight) = slideRegap reGappedNodePrelim gappedNodePrelim gappedLeftChild gappedRightChild mempty mempty - in - if (GV.length reGappedParentFinal /= GV.length reGappedLeft) || (GV.length reGappedParentFinal /= GV.length reGappedRight) then error ("Vectors not same length " - ++ show (GV.length reGappedParentFinal, GV.length reGappedLeft, GV.length reGappedRight)) - else (reGappedParentFinal, reGappedLeft, reGappedRight) - --- | slideRegap takes two version of same vectors (1st and snd) one with additional gaps and if the two aren't equal then adds gaps --- to the 3rd and 4th input vectors -slideRegap :: (FiniteBits a, GV.Vector v a) => v a -> v a -> v a -> v a -> [a] -> [a] -> (v a, v a) -slideRegap reGappedNode gappedNode gappedLeft gappedRight newLeftList newRightList = - -- trace ("SRG: " ++ (show (GV.length reGappedNode, GV.length gappedNode, GV.length gappedLeft, GV.length gappedRight))) ( - if GV.null reGappedNode then (GV.fromList $ reverse newLeftList, GV.fromList $ reverse newRightList) - else - let firstRGN = GV.head reGappedNode - firstGN = GV.head gappedNode - in - - -- gap in reGappedNode, null gappedNode is gap at end of reGappedNode - -- can copmplete the remainder of the slide as gaps only - if GV.null gappedNode then - let gapList = replicate (GV.length reGappedNode) (bit gapIndex) - in - (GV.fromList $ reverse (gapList ++ newLeftList), GV.fromList $ reverse (gapList ++ newRightList)) - - else if firstRGN /= firstGN then - let gap = bit gapIndex - in - slideRegap (GV.tail reGappedNode) gappedNode gappedLeft gappedRight (gap : newLeftList) (gap : newRightList) - - -- no "new gap" - else -- if firstRGN == firstGN then - slideRegap (GV.tail reGappedNode) (GV.tail gappedNode) (GV.tail gappedLeft) (GV.tail gappedRight) (GV.head gappedLeft : newLeftList) (GV.head gappedRight : newRightList) - -- ) - --- | get3WayGeneric takes thee vectors and produces a (median, cost) pair -get3WayGeneric :: (FiniteBits e, GV.Vector v e) => (e -> e -> e -> (e, Word)) -> v e -> v e -> v e -> (v e, Word) -get3WayGeneric tcm in1 in2 in3 = - let len = GV.length in1 - vt = V.generate len $ \i -> tcm (in1 GV.! i) (in2 GV.! i) (in3 GV.! i) - gen v = let med i = fst $ v V.! i in GV.generate len med - add = V.foldl' (\x e -> x + snd e) 0 - in (,) <$> gen <*> add $ vt - - -{-Not using this now--but could. Would need to add Aligned Types-} --- | threeWayGeneric take charInfo, 2 parents, and curNOde and creates 3 median via --- 1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) --- 2) inserting gaps to make all 3 line up --- 3) creating 3-medians --- 4) choosing lowest cost median -threeWayGeneric :: CharInfo -> CharacterData -> CharacterData -> CharacterData -> CharacterData -threeWayGeneric charInfo parent1 parent2 curNode = - let localCharType = charType charInfo - -- pairwise medina structures - p1p2 = M.getDOMedianCharInfo charInfo parent1 parent2 - p1cN = M.getDOMedianCharInfo charInfo parent1 curNode - p2cN = M.getDOMedianCharInfo charInfo parent2 curNode - - -- get 3rd to pairwise - p1p2cN = M.getDOMedianCharInfo charInfo p1p2 curNode - p1cNp2 = M.getDOMedianCharInfo charInfo p1cN parent2 - p2cNp1 = M.getDOMedianCharInfo charInfo p2cN parent1 - - (median1Slim, median1Wide, median1Huge, cost1) = if localCharType `elem` [SlimSeq, NucSeq] then - let (a,b,c) = addGapsToChildren (slimGapped p1p2cN) (slimGapped p1p2) - (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c - in - (median, mempty, mempty, cost) - else if localCharType `elem` [AminoSeq, WideSeq] then - let (a,b,c) = addGapsToChildren (wideGapped p1p2cN) (wideGapped p1p2) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c - in - (mempty, median, mempty, cost) - else if localCharType == HugeSeq then - let (a,b,c) = addGapsToChildren (hugeGapped p1p2cN) (hugeGapped p1p2) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c - in - (mempty, mempty, median, cost) - else error ("Unrecognized character type: " ++ show localCharType) - - (median2Slim, median2Wide, median2Huge, cost2) = if localCharType `elem` [SlimSeq, NucSeq] then - let (a,b,c) = addGapsToChildren (slimGapped p1cNp2) (slimGapped p1cN) - (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c - in - (median, mempty, mempty, cost) - else if localCharType `elem` [AminoSeq, WideSeq] then - let (a,b,c) = addGapsToChildren (wideGapped p1cNp2) (wideGapped p1cN) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c - in - (mempty, median, mempty, cost) - else if localCharType == HugeSeq then - let (a,b,c) = addGapsToChildren (hugeGapped p1cNp2) (hugeGapped p1cN) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c - in - (mempty, mempty, median, cost) - else error ("Unrecognized character type: " ++ show localCharType) - - (median3Slim, median3Wide, median3Huge, cost3) = if localCharType `elem` [SlimSeq, NucSeq] then - let (a,b,c) = addGapsToChildren (slimGapped p2cNp1) (slimGapped p2cN) - (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c - in - (median, mempty, mempty, cost) - else if localCharType `elem` [AminoSeq, WideSeq] then - let (a,b,c) = addGapsToChildren (wideGapped p2cNp1) (wideGapped p2cN) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c - in - (mempty, median, mempty, cost) - else if localCharType == HugeSeq then - let (a,b,c) = addGapsToChildren (hugeGapped p2cNp1) (hugeGapped p2cN) - (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c - in - (mempty, mempty, median, cost) - else error ("Unrecognized character type: " ++ show localCharType) - - - minCost = minimum [cost1, cost2, cost3] - (medianBestSlim, medianBestWide, medianBestHuge) = if cost1 == minCost then (median1Slim, median1Wide, median1Huge) - else if cost2 == minCost then (median2Slim, median2Wide, median2Huge) - else (median3Slim, median3Wide, median3Huge) - - - in - -- set for correct data type - if localCharType `elem` [SlimSeq, NucSeq] then emptyCharacter {slimFinal = medianBestSlim} - - else if localCharType `elem` [AminoSeq, WideSeq] then emptyCharacter {wideFinal = medianBestWide} - - else if localCharType == HugeSeq then emptyCharacter {hugeFinal = medianBestHuge} - - else error ("Unrecognized character type: " ++ show localCharType) diff --git a/pkg/PhyGraph/Utilities/Utilities.hs b/pkg/PhyGraph/Utilities/Utilities.hs deleted file mode 100644 index 9a9e772a6..000000000 --- a/pkg/PhyGraph/Utilities/Utilities.hs +++ /dev/null @@ -1,762 +0,0 @@ -{- | -Module : Utilities.hs -Description : Module specifying utility functions for use with PhyGraph -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Utilities.Utilities where - -import Data.Alphabet -import Data.Alphabet.IUPAC -import qualified Data.BitVector.LittleEndian as BV -import qualified Data.List as L -import qualified Data.List.NonEmpty as NE -import qualified Data.List.Split as SL -import Data.Maybe -import qualified Data.Vector as V --- import Data.Alphabet.Special -import qualified Data.Bimap as BM -import Data.Bits -import Data.Foldable -import qualified Data.Text.Lazy as T -import qualified Data.Text.Short as ST -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Debug.Trace -import GeneralUtilities -import qualified GeneralUtilities as GU -import qualified SymMatrix as S -import Types.Types -import qualified Utilities.LocalGraph as LG -import qualified Data.InfList as IL - --- | collapseGraph collapses zero-length edges in 3rd field of a phylogenetic graph --- does not affect display trees or character graphs --- fst6 and thd6 (both) are modified since this is used for output --- Collapsed frst can no longer be used for greaph optimization since non-binary --- this is done by removing the end vertex 'v' of min length 0 edge (u->v) --- this removes (u,v) and teh two (v, x) and (v, y) out edges from v. New edges are created --- (u, x) and (u,y) with length labels of (v,x) and (v,y) --- assumes all indexing is the same between the simple and decorated graph --- done recusively until no minLength == zero edges so edges renumbered properly --- network edges, pendant edges and root edges, are not collapsed -collapseGraph :: PhylogeneticGraph -> PhylogeneticGraph -collapseGraph inPhylograph@(inSimple, inC, inDecorated, inD, inE, inF) = - if LG.isEmpty inSimple then inPhylograph - else - let inDecTreeEdgeList = filter (not . (LG.isNetworkLabEdge inDecorated)) $ LG.labEdges inDecorated - zeroEdgeList' = filter ((== 0.0) . minLength . thd3) inDecTreeEdgeList - - -- remove cases of pendent edges--don't remove those either - leafIndexList = fmap fst $ snd4 $ LG.splitVertexList inSimple - rootChildIndexList = fmap snd3 $ concatMap (LG.out inSimple) $ fmap fst $ fst4 $ LG.splitVertexList inSimple - zeroEdgeList = filter ((`notElem` (leafIndexList ++ rootChildIndexList)) . snd3) zeroEdgeList' - in - if null zeroEdgeList then inPhylograph - else - -- get node to be deleted and its out edges - let nodeToDelete = snd3 $ head zeroEdgeList - sourceEdgeToDelete = fst3 $ head zeroEdgeList - - -- new dec edges - firstOutEdgeDec = head $ LG.out inDecorated nodeToDelete - secondOutEdgeDec = last $ LG.out inDecorated nodeToDelete - - newFirstEdgeDec = (sourceEdgeToDelete, snd3 firstOutEdgeDec, thd3 firstOutEdgeDec) - newSecondEdgeDec = (sourceEdgeToDelete, snd3 secondOutEdgeDec, thd3 secondOutEdgeDec) - - -- new simple edges - firstOutEdgeSimple = head $ LG.out inSimple nodeToDelete - secondOutEdgeSimple = last $ LG.out inSimple nodeToDelete - - newFirstEdgeSimple = (sourceEdgeToDelete, snd3 firstOutEdgeSimple, thd3 firstOutEdgeSimple) - newSecondEdgeSimple = (sourceEdgeToDelete, snd3 secondOutEdgeSimple, thd3 secondOutEdgeSimple) - - -- make new decorated--deleting node removes all incident edges - newDecGraph = LG.insEdges [newFirstEdgeDec, newSecondEdgeDec] $ LG.delNode nodeToDelete inDecorated - - -- make new simple--deleting node removes all incident edges - newSimpleGraph = LG.insEdges [newFirstEdgeSimple, newSecondEdgeSimple] $ LG.delNode nodeToDelete inSimple - in - (newSimpleGraph, inC, newDecGraph, inD, inE, inF) - --- | calculateGraphComplexity returns an infiniat list of graph complexities indexed by --- number of network nodes-assumes for now--single component gaph not forest --- first in pair is softwired complexity, second hardwired complexity -calculateGraphComplexity :: ProcessedData -> IL.InfList (VertexCost, VertexCost) -calculateGraphComplexity (nameVect, _, _) = - let numNetNodesList = IL.fromList [(0 :: Int)..] - numRoots = 1 - graphComplexity = IL.map (getGraphComplexity (V.length nameVect) numRoots) numNetNodesList - in - graphComplexity - --- | getGraphComplexity takes the number of leaves and number of --- network nodes and calculates the graph complexity --- tree num edges (2n-2) n leaves * 2 nodes for each edge * (log 2n -1 vertices-- min specify) -getGraphComplexity :: Int -> Int -> Int -> (VertexCost, VertexCost) -getGraphComplexity numLeaves numRoots numNetNodes = - -- place holder for now - let nodeComplexity = logBase 2.0 (fromIntegral $ (2 * numLeaves) - 1 + numNetNodes) -- bits to specify each vertex - treeEdges = (2 * numLeaves) - 2 - extraRootEdges = 2 * (numRoots - 1) - baseTreeComplexity = nodeComplexity * (fromIntegral $ 2 * (treeEdges - extraRootEdges)) - numDisplayTrees = 2.0 ** (fromIntegral numNetNodes) - harWiredEdges = (treeEdges - extraRootEdges) + (3 * numNetNodes) - hardwiredAddComplexity = nodeComplexity * (fromIntegral $ 2 * harWiredEdges) - in - -- maybe softwired is numDisplatTrees * harWired since have those edges in input - (baseTreeComplexity * numDisplayTrees, hardwiredAddComplexity) - - --- | calculateW15RootCost creates a root cost as the 'insertion' of character data. For sequence data averaged over --- leaf taxa --- this for a single root -calculateW15RootCost :: ProcessedData -> VertexCost -calculateW15RootCost (nameVect, _, blockDataV) = - let numLeaves = V.length nameVect - insertDataCost = V.sum $ fmap getblockInsertDataCost blockDataV - in - insertDataCost / (fromIntegral numLeaves) - --- | getblockInsertDataCost gets the total cost of 'inserting' the data in a block --- this most easily done before bit packing since won't vary anyway. --- then store value in Global Settings -getblockInsertDataCost :: BlockData -> Double -getblockInsertDataCost (_, characterDataVV, charInfoV) = - V.sum $ fmap (getLeafInsertCost charInfoV) characterDataVV - --- | getLeafInsertCost is the cost or originating or 'inserting' leaf data --- for all characters in a block -getLeafInsertCost :: V.Vector CharInfo -> V.Vector CharacterData -> Double -getLeafInsertCost charInfoV charDataV = - V.sum $ V.zipWith getCharacterInsertCost charDataV charInfoV - --- | getCharacterInsertCost takes a character and characterInfo and returns origination/insert cost for the character -getCharacterInsertCost :: CharacterData -> CharInfo -> Double -getCharacterInsertCost inChar charInfo = - let localCharType = charType charInfo - thisWeight = weight charInfo - inDelCost = (costMatrix charInfo) S.! (0, (length (alphabet charInfo) - 1)) - in - if localCharType == Add then thisWeight * (fromIntegral $ V.length $ GU.snd3 $ rangePrelim inChar) - else if localCharType == NonAdd then thisWeight * (fromIntegral $ V.length $ GU.snd3 $ stateBVPrelim inChar) - -- this wrong--need to count actual characters packed2/32, packed4/32 - else if localCharType `elem` packedNonAddTypes then thisWeight * (fromIntegral $ UV.length $ GU.snd3 $ packedNonAddPrelim inChar) - else if localCharType == Matrix then thisWeight * (fromIntegral $ V.length $ matrixStatesPrelim inChar) - else if localCharType == SlimSeq || localCharType == NucSeq then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ SV.length $ slimPrelim inChar) - else if localCharType == WideSeq || localCharType == AminoSeq then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ UV.length $ widePrelim inChar) - else if localCharType == HugeSeq then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ V.length $ hugePrelim inChar) - else if localCharType == AlignedSlim then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ SV.length $ snd3 $ alignedSlimPrelim inChar) - else if localCharType == AlignedWide then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ UV.length $ snd3 $ alignedWidePrelim inChar) - else if localCharType == AlignedHuge then thisWeight * (fromIntegral inDelCost) * (fromIntegral $ V.length $ snd3 $ alignedHugePrelim inChar) - else error ("Character type unimplemented : " ++ show localCharType) - - --- | splitSequence takes a ShortText divider and splits a list of ShortText on --- that ShortText divider consuming it akin to Text.splitOn -splitSequence :: ST.ShortText -> [ST.ShortText] -> [[ST.ShortText]] -splitSequence partitionST stList = - if null stList then [] - else - let firstPart = takeWhile (/= partitionST) stList - restList = dropWhile (/= partitionST) stList - in - if restList == [partitionST] then firstPart : [[ST.fromString "#"]] - else if not $ null restList then firstPart : splitSequence partitionST (tail restList) - else [firstPart] - --- See Bio.DynamicCharacter.decodeState for a better implementation for dynamic character elements -bitVectToCharState :: Bits b => Alphabet String -> b -> String -bitVectToCharState localAlphabet bitValue = L.intercalate "," $ foldr pollSymbol mempty indices - where - indices = [ 0 .. len - 1 ] - len = length vec - vec = alphabetSymbols localAlphabet - pollSymbol i polled - | bitValue `testBit` i = (vec V.! i) : polled - | otherwise = polled - - --- bitVectToCharState takes a bit vector representation and returns a list states as integers -bitVectToCharState' :: (Bits b) => [String] -> b -> String -bitVectToCharState' localAlphabet bitValue - | isAlphabetDna hereAlphabet = fold $ iupacToDna BM.!> observedSymbols - | isAlphabetAminoAcid hereAlphabet = fold $ iupacToAminoAcid BM.!> observedSymbols - | otherwise = L.intercalate "," $ toList observedSymbols - where - hereAlphabet = fromSymbols localAlphabet - symbolCountH = length localAlphabet - observedSymbols - = NE.fromList - $ foldMap - (\ i -> [localAlphabet !! i | bitValue `testBit` i]) - [0 .. symbolCountH - 1] - --- | matrixStateToStringtakes a matrix state and returns a string representation -matrixStateToString :: V.Vector MatrixTriple -> String -matrixStateToString inStateVect = - let minCost = V.minimum $ fmap fst3 inStateVect - minCostStates = V.toList $ V.filter ((== minCost) . fst3) inStateVect - statesStringList = fmap show minCostStates - in - if length statesStringList == 1 then head statesStringList - else - "[" ++ (L.intercalate " " statesStringList) ++ "]" - - - --- | additivStateToString take an additive range and prints single state if range equal or --- [a-b] if not -additivStateToString :: (Int, Int) -> String -additivStateToString (a,b) = if a == b then show a - else "[" ++ (show a) ++ "-" ++ (show b) ++ "]" - --- | filledDataFields takes rawData and checks taxon to see what percent --- "characters" are found. --- call with (0,0) -filledDataFields :: (Int, Int) -> TermData -> (NameText, Int, Int) -filledDataFields (hasData, totalData) (taxName, taxData) - | null taxData = (taxName, hasData, totalData) - | ST.length (head taxData) == 0 = filledDataFields (hasData, 1 + totalData) (taxName, tail taxData) - | otherwise = filledDataFields (1 + hasData, 1 + totalData) (taxName, tail taxData) - --- | stripComments removes all lines that being with "--" haskell stype comments --- needs to be reversed on return to maintain order. -stripComments :: [String] -> [String] -stripComments inStringList = - if null inStringList then [] - else - let strippedLine = GU.stripString $ head inStringList - in - if null strippedLine then stripComments $ tail inStringList - else if length strippedLine < 2 then strippedLine : stripComments (tail inStringList) - else - if "--" == take 2 strippedLine then stripComments $ tail inStringList - else strippedLine : stripComments (tail inStringList) - --- | getDecoratedGraphBlockCharInformation takes decorated graph and reports number of blosk and size of each -getDecoratedGraphBlockCharInformation :: DecoratedGraph -> ((Int, Int), [V.Vector Int]) -getDecoratedGraphBlockCharInformation inGraph = - if LG.isEmpty inGraph then ((0,0), []) - else - -- get a vertices from graph and take their information - let inVertDataList = fmap (vertData . snd) (LG.labNodes inGraph) - blockNumMax = maximum $ fmap length inVertDataList - blocknumMin = minimum $ fmap length inVertDataList - blockLengthList = fmap (fmap length) inVertDataList - in - ((blockNumMax, blocknumMin), blockLengthList) - --- | vectMaybeHead takes a vector and returns JUst V.head if not V.empty --- Nothing otherwise -vectMaybeHead :: V.Vector a -> Maybe a -vectMaybeHead inVect = - if V.null inVect then Nothing - else Just (V.head inVect) - --- vectResolveMaybe takes a Vector of Maybe a --- and returns Just a or V.empty -vectResolveMaybe :: V.Vector (Maybe a) -> V.Vector a -vectResolveMaybe inVect = - trace ("VRM " ++ show (length inVect)) ( - if isNothing (V.head inVect) then V.empty - else V.singleton $ fromJust $ V.head inVect - ) - --- | getNumberPrealignedCharacters takes processed data and returns the number of prealigned sequence characters --- used to special case procedurs with prealigned sequences -getNumberPrealignedCharacters :: V.Vector BlockData -> Int -getNumberPrealignedCharacters blockDataVect = - if V.null blockDataVect then 0 - else - let firstBlock = GU.thd3 $ V.head blockDataVect - characterTypes = V.map charType firstBlock - sequenceChars = length $ V.filter (== True) $ V.map (`elem` prealignedCharacterTypes) characterTypes - in - sequenceChars + getNumberPrealignedCharacters (V.tail blockDataVect) - --- | getNumberSequenceCharacters takes processed data and returns the number of non-exact (= sequen ce) characters --- ised to special case datasets with limited non-exact characters -getNumberSequenceCharacters :: V.Vector BlockData -> Int -getNumberSequenceCharacters blockDataVect = - if V.null blockDataVect then 0 - else - let firstBlock = GU.thd3 $ V.head blockDataVect - characterTypes = V.map charType firstBlock - sequenceChars = length $ V.filter (== True) $ V.map (`elem` sequenceCharacterTypes) characterTypes - in - sequenceChars + getNumberSequenceCharacters (V.tail blockDataVect) - --- | getNumberExactCharacters takes processed data and returns the number of non-exact characters --- ised to special case datasets with limited non-exact characters -getNumberExactCharacters :: V.Vector BlockData -> Int -getNumberExactCharacters blockDataVect = - if V.null blockDataVect then 0 - else - let firstBlock = GU.thd3 $ V.head blockDataVect - characterTypes = V.map charType firstBlock - exactChars = length $ V.filter (== True) $ V.map (`elem` exactCharacterTypes) characterTypes - in - exactChars + getNumberExactCharacters (V.tail blockDataVect) - --- | splitBlockCharacters takes a block of characters (vector) and splits into two partitions of exact (Add, NonAdd, Matrix) and sequence characters --- (= nonExact) using accumulators -splitBlockCharacters :: V.Vector (V.Vector CharacterData) - -> V.Vector CharInfo - -> Int - -> [([CharacterData], CharInfo)] - -> [([CharacterData], CharInfo)] - -> (BlockData, BlockData) -splitBlockCharacters inDataVV inCharInfoV localIndex exactCharPairList seqCharPairList = - if localIndex == V.length inCharInfoV then - let (exactDataList, exactCharInfoList) = unzip exactCharPairList - (sequenceDataList, sequenceCharInfoList) = unzip seqCharPairList - newExactCharInfoVect = V.fromList $ reverse exactCharInfoList - newSeqCharCharInfoVect = V.fromList $ reverse sequenceCharInfoList - newExactData = V.fromList $ fmap (V.fromList . reverse) (L.transpose exactDataList) - newSeqCharData = V.fromList $ fmap (V.fromList . reverse) (L.transpose sequenceDataList) - in - ((T.pack "ExactCharacters", newExactData, newExactCharInfoVect), (T.pack "Non-ExactCharacters", newSeqCharData, newSeqCharCharInfoVect)) - else - let localCharacterType = charType (inCharInfoV V.! localIndex) - thisCharacterData = V.toList $ fmap (V.! localIndex) inDataVV - newPair = (thisCharacterData, inCharInfoV V.! localIndex) - in - if localCharacterType `elem` exactCharacterTypes then - splitBlockCharacters inDataVV inCharInfoV (localIndex + 1) (newPair : exactCharPairList) seqCharPairList - else if localCharacterType `elem` sequenceCharacterTypes then - splitBlockCharacters inDataVV inCharInfoV (localIndex + 1) exactCharPairList (newPair : seqCharPairList) - else error ("Unrecongized/implemented character type: " ++ show localCharacterType) - - - --- | safeVectorHead safe vector head--throws error if null -safeVectorHead :: V.Vector a -> a -safeVectorHead inVect = - if V.null inVect then error "Empty vector in safeVectorHead" - else V.head inVect - --- | get leftRightChilLabelBV takes a pair of vertex labels and returns left and right --- based on their bitvector representation. This ensures left/right consistancey in --- pre and postoder passes, and with bitvectors of leaves determined by data hash, --- ensures label invariance with repect to leaves --- larger bitvector goes second (bigge) -leftRightChildLabelBV :: (VertexInfo, VertexInfo) -> (VertexInfo, VertexInfo) -leftRightChildLabelBV inPair@(firstNode, secondNode) = - let firstLabel = bvLabel firstNode - secondLabel = bvLabel secondNode - in - if firstLabel > secondLabel then (secondNode, firstNode) - else inPair - --- | get leftRightChildLabelBVNode takes a pair ofnodes and returns left and right --- based on their bitvector representation. This ensures left/right consistancey in --- pre and postoder passes, and with bitvectors of leaves determined by data hash, --- ensures label invariance with repect to leaves --- larger bitvector goes second (bigge) -leftRightChildLabelBVNode :: (LG.LNode VertexInfo, LG.LNode VertexInfo) -> (LG.LNode VertexInfo, LG.LNode VertexInfo) -leftRightChildLabelBVNode inPair@(firstNode, secondNode) = - let firstLabel = bvLabel $ snd firstNode - secondLabel = bvLabel $ snd secondNode - in - if firstLabel > secondLabel then (secondNode, firstNode) - else inPair - - --- | prettyPrintVertexInfo returns a string with formated version of --- vertex info -prettyPrintVertexInfo :: VertexInfo -> String -prettyPrintVertexInfo inVertData = - let zerothPart = "Vertex name " ++ T.unpack (vertName inVertData) ++ " Index " ++ show (index inVertData) - firstPart = "\n\tBitVector (as number) " ++ (show ((BV.toUnsignedNumber $ bvLabel inVertData) :: Int)) - secondPart = "\n\tParents " ++ show (parents inVertData) ++ " Children " ++ show (children inVertData) - thirdPart = "\n\tType " ++ show (nodeType inVertData) ++ " Local Cost " ++ show (vertexCost inVertData) ++ " SubGraph Cost " ++ show (subGraphCost inVertData) - fourthPart = "\n\tData Blocks: " ++ show (V.length $ vertData inVertData) ++ " Characters (by block) " ++ show (V.length <$> vertData inVertData) - fifthPart = "\n\t" ++ show (vertData inVertData) - in - zerothPart ++ firstPart ++ secondPart ++ thirdPart ++ fourthPart ++ fifthPart - - --- | add3 adds three values -add3 :: (Num a) => a -> a -> a -> a -add3 x y z = x + y + z - --- | getProcessDataByBlock takes ProcessData and returns a list of Processed data with one block --- per processed data element --- argument to filter terminals with missing taxa --- wraps around getProcessDataByBlock' with counter -getProcessDataByBlock :: Bool -> ProcessedData -> [ProcessedData] -getProcessDataByBlock filterMissing (nameVect, nameBVVect, blockDataVect) = reverse $ getProcessDataByBlock' filterMissing 0 (nameVect, nameBVVect, blockDataVect) - - --- | getProcessDataByBlock' called by getProcessDataByBlock with counter --- and later reversed -getProcessDataByBlock' :: Bool -> Int -> ProcessedData -> [ProcessedData] -getProcessDataByBlock' filterMissing counter (nameVect, nameBVVect, blockDataVect) = - if V.null blockDataVect then [] - else if counter == (V.length blockDataVect) then [] - else - let thisBlockData = blockDataVect V.! counter - in - if not filterMissing then (nameVect, nameBVVect, V.singleton thisBlockData) : getProcessDataByBlock' filterMissing (counter + 1) (nameVect, nameBVVect, blockDataVect) - else - let (blockName, charDataLeafVect, blockCharInfo) = thisBlockData - isMissingVect = V.map V.null charDataLeafVect - (nonMissingNameVect, nonMissingBVVect, nonMissingLeafData, _) = V.unzip4 $ V.filter ((== False) . GU.fth4) (V.zip4 nameVect nameBVVect charDataLeafVect isMissingVect) - nonMissingBlockData = (blockName, nonMissingLeafData, blockCharInfo) - in - (nonMissingNameVect, nonMissingBVVect, V.singleton nonMissingBlockData) : getProcessDataByBlock' filterMissing (counter + 1) (nameVect, nameBVVect, blockDataVect) - - --- | copyToNothing takes VertexBlockData and copies to VertexBlockDataMaybe --- data as nothing -copyToNothing :: VertexBlockData -> VertexBlockDataMaybe -copyToNothing vbd = fmap setNothing vbd - where setNothing a = V.replicate (V.length a) Nothing - - --- | copyToJust takes VertexBlockData and copies to VertexBlockDataMaybe --- data as Just CharacterData -copyToJust :: VertexBlockData -> VertexBlockDataMaybe -copyToJust vbd = fmap (fmap Just) vbd - --- | simAnnealAccept takes simulated annealing parameters, current best graph (e) cost, --- candidate graph cost (e') and a uniform random integer and returns a Bool to accept or reject --- the candidate solution --- the basic method is --- 1) accepts if current is better --- 2) Other wise prob accept = exp(-(e' -e)/T) --- where T is a step from max to min --- maxT and minT can probbaly be set to 100 and 1 or something but leaving some flexibility --- curStep == 0 random walk (always accept) --- curStep == (numSteps -1) greedy False is not better -simAnnealAccept :: Maybe SAParams -> VertexCost -> VertexCost -> (Bool, Maybe SAParams) -simAnnealAccept inParams curBestCost candCost = - if inParams == Nothing then error "simAnnealAccept Simulated anneling parameters = Nothing" - - -- drifting probs - else if (method $ fromJust inParams) == Drift then - driftAccept inParams curBestCost candCost - - -- simulated annealing probs - else - let simAnealVals = fromJust inParams - numSteps = numberSteps simAnealVals - curStep = currentStep simAnealVals - randIntList = randomIntegerList simAnealVals - - stepFactor = (fromIntegral $ numSteps - curStep) / (fromIntegral numSteps) - tempFactor = curBestCost * stepFactor - - candCost' = if curBestCost == candCost then candCost + 1 - else candCost - -- flipped order - (e' -e) - -- probAcceptance = exp ((curBestCost - candCost) / ((maxTemp - minTemp) * tempFactor)) - probAcceptance = exp ( (fromIntegral (curStep + 1)) * (curBestCost - candCost') / tempFactor) - - -- multiplier for resolution 1000, 100 prob be ok - randMultiplier = 1000 - intAccept = floor $ (fromIntegral randMultiplier) * probAcceptance - - -- use remainder for testing--passing infinite list and take head - (_, intRandVal) = divMod (abs $ head randIntList) randMultiplier - - nextSAParams = Just $ (fromJust inParams) {currentStep = curStep + 1, randomIntegerList = tail randIntList} - in - -- lowest cost-- greedy - -- trace ("RA " ++ (show intAccept)) ( - if candCost < curBestCost then - --trace ("SAB: " ++ (show curStep) ++ " True") - (True, nextSAParams) - - -- not better and at lowest temp - else if curStep >= (numSteps - 1) then - -- trace ("SAEnd: " ++ (show curStep) ++ " False") - (False, nextSAParams) - - -- test for non-lowest temp conditions - else if intRandVal < intAccept then - -- trace ("SAAccept: " ++ (show (curStep, candCost, curBestCost, tempFactor, probAcceptance, intAccept, intRandVal)) ++ " True") - (True, nextSAParams) - else - -- trace ("SAReject: " ++ (show (curStep, candCost, curBestCost, tempFactor, probAcceptance, intAccept, intRandVal)) ++ " False") - (False, nextSAParams) - -- ) - --- | incrementSimAnnealParams increments the step number by 1 but returns all other the same -incrementSimAnnealParams :: Maybe SAParams -> Maybe SAParams -incrementSimAnnealParams inParams = - if inParams == Nothing then error "incrementSimAnnealParams Simulated anneling parameters = Nothing" - else - let curStep = currentStep $ fromJust inParams - curChanges = driftChanges $ fromJust inParams - randList = tail $ randomIntegerList $ fromJust inParams - in - - -- simulated annelaing temperature step - if method (fromJust inParams) == SimAnneal then - Just $ (fromJust inParams) { currentStep = curStep + 1 - , randomIntegerList = randList - } - -- drifting change number - else - Just $ (fromJust inParams) { driftChanges = curChanges + 1 - , randomIntegerList = randList - } - --- | generateUniqueRandList take a int and simulated anealing parameter slist and creates --- a list of SA paramter values with unique rnandomInt lists --- sets current step to 0 -generateUniqueRandList :: Int -> Maybe SAParams -> [Maybe SAParams] -generateUniqueRandList number inParams = - if number == 0 then [] - else if inParams == Nothing then replicate number Nothing - else - let randIntList = randomIntegerList $ fromJust inParams - randSeedList = take number randIntList - randIntListList = fmap GU.randomIntList randSeedList - -- simAnnealParamList = replicate number inParams - newSimAnnealParamList = fmap Just $ fmap (updateSAParams (fromJust inParams)) randIntListList - in - -- trace (show $ fmap (take 1) randIntListList) - newSimAnnealParamList - - where updateSAParams a b = a {randomIntegerList = b} - --- | driftAccept takes SAParams, currrent best cost, and cadidate cost --- and returns a Boolean and an incremented set of params -driftAccept :: Maybe SAParams -> VertexCost -> VertexCost -> (Bool, Maybe SAParams) -driftAccept simAnealVals curBestCost candCost = - if simAnealVals == Nothing then error "Nothing value in driftAccept" - else - let curNumChanges = driftChanges $ fromJust simAnealVals - randIntList = randomIntegerList $ fromJust simAnealVals - - --- prob acceptance for better, same, and worse costs - probAcceptance = if candCost < curBestCost then 1.0 - else if candCost == curBestCost then driftAcceptEqual $ fromJust simAnealVals - else 1.0 / ((driftAcceptWorse $ fromJust simAnealVals) + candCost - curBestCost) - - -- multiplier for resolution 1000, 100 prob be ok - randMultiplier = 1000 - intAccept = floor $ (fromIntegral randMultiplier) * probAcceptance - - -- use remainder for testing--passing infinite list and take head - (_, intRandVal) = divMod (abs $ head randIntList) randMultiplier - - -- not always incrementing becasue may not result in changes - nextSAParams = Just $ (fromJust simAnealVals) {driftChanges = curNumChanges + 1, randomIntegerList = tail randIntList} - nextSAPAramsNoChange = Just $ (fromJust simAnealVals) {randomIntegerList = tail randIntList} - - in - -- only increment nnumberof changes for True values - if candCost < curBestCost then - -- trace ("Drift B: " ++ (show (curNumChanges, candCost, curBestCost, probAcceptance, intAccept, intRandVal)) ++ " True") - (True, nextSAParams) - - else if intRandVal < intAccept then - -- trace ("Drift T: " ++ (show (curNumChanges, candCost, curBestCost, probAcceptance, intAccept, intRandVal)) ++ " True") - (True, nextSAParams) - - else - -- trace ("Drift F: " ++ (show (curNumChanges, candCost, curBestCost, probAcceptance, intAccept, intRandVal)) ++ " False") - (False, nextSAPAramsNoChange) - -- ) - - --- | getTraversalCosts takes a Phylogenetic Graph and returns costs of traversal trees -getTraversalCosts :: PhylogeneticGraph -> [VertexCost] -getTraversalCosts inGraph = - let traversalTrees = V.toList $ fmap V.toList $ fft6 inGraph - traversalRoots = fmap head $ fmap LG.getRoots $ concat traversalTrees - traversalRootCosts = fmap (subGraphCost . snd) traversalRoots - in - traversalRootCosts - - --- | getCharacterLengths returns a the length of block characters -getCharacterLength :: CharacterData -> CharInfo -> Int -getCharacterLength inCharData inCharInfo = - let inCharType = charType inCharInfo - in - -- trace ("GCL:" ++ (show inCharType) ++ " " ++ (show $ snd3 $ stateBVPrelim inCharData)) ( - case inCharType of - x | x `elem` [NonAdd ] -> V.length $ snd3 $ stateBVPrelim inCharData - x | x `elem` packedNonAddTypes -> UV.length $ snd3 $ packedNonAddPrelim inCharData - x | x `elem` [Add ] -> V.length $ snd3 $ rangePrelim inCharData - x | x `elem` [Matrix ] -> V.length $ matrixStatesPrelim inCharData - x | x `elem` [SlimSeq, NucSeq ] -> SV.length $ snd3 $ slimAlignment inCharData - x | x `elem` [WideSeq, AminoSeq] -> UV.length $ snd3 $ wideAlignment inCharData - x | x `elem` [HugeSeq] -> V.length $ snd3 $ hugeAlignment inCharData - x | x `elem` [AlignedSlim] -> SV.length $ snd3 $ alignedSlimPrelim inCharData - x | x `elem` [AlignedWide] -> UV.length $ snd3 $ alignedWidePrelim inCharData - x | x `elem` [AlignedHuge] -> V.length $ snd3 $ alignedHugePrelim inCharData - _ -> error ("Un-implemented data type " ++ show inCharType) - -- ) - --- | getCharacterLengths' flipped arg version of getCharacterLength -getCharacterLength' :: CharInfo -> CharacterData -> Int -getCharacterLength' inCharInfo inCharData = getCharacterLength inCharData inCharInfo - --- | getMaxCharacterLengths get maximum charcter legnth from a list -getMaxCharacterLength :: CharInfo -> [CharacterData] -> Int -getMaxCharacterLength inCharInfo inCharDataList = maximum $ fmap (getCharacterLength' inCharInfo) inCharDataList - - --- | getSingleTaxon takes a taxa x characters block and an index and returns the character vector for that index -getSingleTaxon :: V.Vector (V.Vector CharacterData) -> Int -> V.Vector CharacterData -getSingleTaxon singleCharVect taxonIndex = fmap (V.! taxonIndex) singleCharVect - --- | glueBackTaxChar takes single chartacter taxon vectors and glues them back inot multiple characters for each --- taxon as expected in Blockdata. Like a transpose. FIlters out zero length characters -glueBackTaxChar :: V.Vector (V.Vector CharacterData) -> V.Vector (V.Vector CharacterData) -glueBackTaxChar singleCharVect = - let numTaxa = V.length $ V.head singleCharVect - multiCharVect = fmap (getSingleTaxon singleCharVect) (V.fromList [0.. numTaxa - 1]) - in - multiCharVect - --- | getSingleCharacter takes a taxa x characters block and an index and returns the character vector for that index --- resulting in a taxon by single charcater vector -getSingleCharacter :: V.Vector (V.Vector CharacterData) -> Int -> V.Vector CharacterData -getSingleCharacter taxVectByCharVect charIndex = fmap (V.! charIndex) taxVectByCharVect - --- | concatFastas is used by "report(ia)" to create a single concatenated fasta --- for use by programs such as RAxML andf TNT --- takes a single string of multiple fasta output, each entity to be concated (by taxon name) --- separate by a line starting with "Seqeunce character" from ia output --- there must be the same number of taxa and names in each element to be concatenated -concatFastas :: String -> String -concatFastas inMultFastaString = - if null inMultFastaString then [] - else - -- split on "Sequence character" - let fastaFileStringList = filter (not . null) $ spitIntoFastas inMultFastaString - - -- make pairs from each "file" - fastaTaxDataPairLL = fmap fasta2PairList fastaFileStringList - - fastTaxNameList = fmap fst $ head fastaTaxDataPairLL - - -- merge sequences with (++) - fastaDataLL = fmap (fmap snd) fastaTaxDataPairLL - mergedData = mergeFastaData fastaDataLL - - -- create new tuples - newFastaPairs = zipWith (++) fastTaxNameList mergedData - - in - -- trace ("CF:" ++ (show $ length fastaFileStringList) ++ " " ++ (show $ fmap length fastaFileStringList)) - concat newFastaPairs - --- | spitIntoFastas takes String generated by reprot ia functions, --- splits on "Sequence character" line, creating separate fastas -spitIntoFastas :: String -> [String] -spitIntoFastas inString = - if null inString then [] - else - let linesList = splitFastaLines [] [] $ filter (not .null) $ lines inString - fastaStringList = fmap unlines linesList - in - fastaStringList - --- | splitFastaLines splits a list of lines into lists based on the line "Sequence character" -splitFastaLines :: [String] -> [[String]] -> [String] -> [[String]] -splitFastaLines curFasta curFastaList inLineList = - if null inLineList then reverse ((reverse curFasta) : curFastaList) - else - let firstLine = head inLineList - firstWord = head $ words firstLine - in - if null firstLine then - splitFastaLines curFasta curFastaList (tail inLineList) - - else if firstWord == "Sequence" then - splitFastaLines [] ((reverse curFasta) : curFastaList) (tail inLineList) - - else - splitFastaLines (firstLine : curFasta) curFastaList (tail inLineList) - - --- | mergeFastaData takes list of list of Strings and merges into a single list of Strings -mergeFastaData :: [[String]] -> [String] -mergeFastaData inDataLL = - if null $ head inDataLL then [] - else - let firstData = concat $ fmap head inDataLL - formattedData = concatMap (++ "\n") $ SL.chunksOf 50 firstData - in - formattedData : mergeFastaData (fmap tail inDataLL) - --- | fasta2PairList takes an individual fasta file as single string and returns --- pairs data of taxon name and data --- assumes single character alphabet --- deletes '-' (unless "prealigned"), and spaces -fasta2PairList :: String -> [(String, String)] -fasta2PairList fileContents' = - if null fileContents' then [] - else - let fileContents = unlines $ filter (not.null) $ lines fileContents' - in - if null fileContents then [] - else if head fileContents /= '>' then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" - else - let terminalSplits = SL.splitWhen (== '>') fileContents - - -- tail because initial split will an empty text - pairData = getRawDataPairs (tail terminalSplits) - in - -- trace ("FSPL: " ++ (show $ length pairData) ++ " " ++ (show $ fmap fst pairData)) - pairData - --- | getRawDataPairstakes splits of Text and returns terminalName, Data pairs--minimal error checking -getRawDataPairs :: [String] -> [(String, String)] -getRawDataPairs inList = - if null inList then [] - else - let firstText = head inList - firstName = head $ lines firstText - firstData = concat $ tail $ lines firstText - in - ('>' : firstName ++ "\n", firstData) : getRawDataPairs (tail inList) - - --- | hasResolutionDuplicateEdges checks resolution subtree in verteexInfo for duplicate d edges in softwired --- subtree field -hasResolutionDuplicateEdges :: ResolutionData -> Bool -hasResolutionDuplicateEdges inResData = - let edgeList = snd $ displaySubGraph inResData - edgeDupList = length $ (fmap LG.toEdge edgeList) L.\\ (L.nub $ fmap LG.toEdge edgeList) - in - edgeDupList > 0 \ No newline at end of file diff --git a/pkg/PhyGraph/cabal.project b/pkg/PhyGraph/cabal.project deleted file mode 100644 index 75d99a5d8..000000000 --- a/pkg/PhyGraph/cabal.project +++ /dev/null @@ -1,80 +0,0 @@ ---------------------------------------------------------------------------------- --- Dependency specifications ---------------------------------------------------------------------------------- - -allow-newer: - aeson, - base, - base-compat, - bytestring, - containers, - ghc-prim, - hashable, - lens, - mtl, - primative, - semigroupoids, - template-haskell, - text, - th-abstraction - -preferences: - aeson >= 2.0, - bytestring >= 0.11, - text >= 2.0 - - -with-compiler: ghc-9.2.4 - - ---------------------------------------------------------------------------------- --- Output paths ---------------------------------------------------------------------------------- - -logs-dir: ./log -installdir: ./bin -install-method: copy -overwrite-policy: always ---docdir: ./doc ---htmldir: ./doc/html - - ---------------------------------------------------------------------------------- --- Build metadata flags ---------------------------------------------------------------------------------- - -executable-static: False ---keep-going: True -haddock-html: True -haddock-tests: True -haddock-benchmarks: True -haddock-internal: True -haddock-hyperlink-source: True -jobs: $ncpus -minimize-conflict-set: True - - ---------------------------------------------------------------------------------- --- Package flags (for dependencies) ---------------------------------------------------------------------------------- - --- NOTE: Currently applies to *all* packages, not just dependencies... -package * - benchmarks: False - documentation: True - tests: False - library-profiling: False - library-profiling-detail: all-functions - optimization: 2 - ghc-options: - --- NOTE: Hopefully overrides the above package flags -package phygraph - benchmarks: True - documentation: True - tests: True - library-profiling: False - optimization: 0 - ghc-options: -fwrite-ide-info -hiedir=.hie - -packages: . diff --git a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet.hs b/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet.hs deleted file mode 100644 index 14b096a02..000000000 --- a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet.hs +++ /dev/null @@ -1,69 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Alphabet --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- An 'Alphabet' represents an /ordered/ list of unique symbols with constant --- time random access. Symbols are any data type which are coercible from a --- 'String' through the 'Data.String.IsString' type-class. --- --- An 'Alphabet' is constructed in one of two ways: --- --- 1. Supplying a `Foldable` structure of symbols which are 'Data.String.IsString' --- instances to the 'fromSymbols' function. --- --- 2. Supplying a `Foldable` structure of symbols and state name pairs, --- both of which are 'Data.String.IsString' instances to the --- 'fromSymbolsWithStateNames' function. --- --- Both 'Alphabet' construction methods are order preserving with respect to the --- input symbol order. --- --- Every 'Alphabet' contains a "gap" symbol denoted by the expression: --- > fromString "-" --- The "gap" character is always the /FIRST/ in the ordered --- list regardless of its presence or position in the construction structure. --- --- An 'Alphabet' will never contain the "missing" symbol denoted by the --- expression: --- > fromString "?" --- This symbol will be removed from the 'Alphabet' --- if it is present in the construction structure. ------------------------------------------------------------------------------ - -module Data.Alphabet - ( Alphabet() - -- * Construction - , fromSymbols - , fromSymbolsWOGap - , fromSymbolsWithStateNames - , fromSymbolsWithStateNamesAndTCM - , fromSymbolsWithTCM - -- * Queries - , alphabetStateNames - , alphabetSymbols - -- * Gap Symbol Queries - , gapIndex - , gapSymbol - -- * Truncation - , truncateAtSymbol - , truncateAtMaxSymbol - -- * Special Alphabet Constructions - , aminoAcidAlphabet - , dnaAlphabet - , rnaAlphabet - , discreteAlphabet - -- * Special Alphabet Queries - , isAlphabetAminoAcid - , isAlphabetDna - , isAlphabetRna - , isAlphabetDiscrete - ) where - -import Data.Alphabet.Internal -import Data.Alphabet.Special diff --git a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Codec.hs b/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Codec.hs deleted file mode 100644 index 331089ed4..000000000 --- a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Codec.hs +++ /dev/null @@ -1,204 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Alphabet.Codec --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Facilitates encoding and decoding symbols from an 'Alphabet' into a bit-state. --- Works for any 'Bits' instance. ------------------------------------------------------------------------------ - -{-# LANGUAGE GADTs #-} -{-# LANGUAGE Strict #-} - -module Data.Alphabet.Codec - ( decodeState - , encodeState - ) where - -import Data.Alphabet.Internal -import Data.Bits -import Data.Foldable -import Data.List.NonEmpty (NonEmpty) -import Data.Set (Set) -import qualified Data.Set as Set -import Data.Text.Short (ShortText) -import qualified Data.Vector as V -import Data.Word -import Foreign.C.Types -import GHC.Exts (IsList(fromList), Item) - - -{-# INLINEABLE encodeState #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> CUChar ) -> f s -> CUChar #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> CUShort) -> f s -> CUShort #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> CUInt ) -> f s -> CUInt #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> CULong ) -> f s -> CULong #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> Word ) -> f s -> Word #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> Word8 ) -> f s -> Word8 #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> Word16 ) -> f s -> Word16 #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> Word32 ) -> f s -> Word32 #-} -{-# SPECIALISE encodeState :: (Foldable f, Ord s) => Alphabet s -> (Word -> Word64 ) -> f s -> Word64 #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> CUChar ) -> Set s -> CUChar #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> CUShort) -> Set s -> CUShort #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> CUInt ) -> Set s -> CUInt #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> CULong ) -> Set s -> CULong #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> Word ) -> Set s -> Word #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> Word8 ) -> Set s -> Word8 #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> Word16 ) -> Set s -> Word16 #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> Word32 ) -> Set s -> Word32 #-} -{-# SPECIALISE encodeState :: Ord s => Alphabet s -> (Word -> Word64 ) -> Set s -> Word64 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUChar ) -> Set String -> CUChar #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUShort) -> Set String -> CUShort #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUInt ) -> Set String -> CUInt #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CULong ) -> Set String -> CULong #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word ) -> Set String -> Word #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word8 ) -> Set String -> Word8 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word16 ) -> Set String -> Word16 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word32 ) -> Set String -> Word32 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word64 ) -> Set String -> Word64 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUChar ) -> Set ShortText -> CUChar #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUShort) -> Set ShortText -> CUShort #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUInt ) -> Set ShortText -> CUInt #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CULong ) -> Set ShortText -> CULong #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word ) -> Set ShortText -> Word #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word8 ) -> Set ShortText -> Word8 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word16 ) -> Set ShortText -> Word16 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word32 ) -> Set ShortText -> Word32 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word64 ) -> Set ShortText -> Word64 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUChar ) -> NonEmpty String -> CUChar #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUShort) -> NonEmpty String -> CUShort #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CUInt ) -> NonEmpty String -> CUInt #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> CULong ) -> NonEmpty String -> CULong #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word ) -> NonEmpty String -> Word #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word8 ) -> NonEmpty String -> Word8 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word16 ) -> NonEmpty String -> Word16 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word32 ) -> NonEmpty String -> Word32 #-} -{-# SPECIALISE encodeState :: Alphabet String -> (Word -> Word64 ) -> NonEmpty String -> Word64 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUChar ) -> NonEmpty ShortText -> CUChar #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUShort) -> NonEmpty ShortText -> CUShort #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CUInt ) -> NonEmpty ShortText -> CUInt #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> CULong ) -> NonEmpty ShortText -> CULong #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word ) -> NonEmpty ShortText -> Word #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word8 ) -> NonEmpty ShortText -> Word8 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word16 ) -> NonEmpty ShortText -> Word16 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word32 ) -> NonEmpty ShortText -> Word32 #-} -{-# SPECIALISE encodeState :: Alphabet ShortText -> (Word -> Word64 ) -> NonEmpty ShortText -> Word64 #-} -encodeState - :: ( Bits e - , Foldable f - , Ord s - ) - => Alphabet s -- ^ Alphabet of symbols - -> (Word -> e) -- ^ Constructor for an empty element, taking the alphabet size - -> f s -- ^ ambiguity groups of symbols - -> e -- ^ Encoded dynamic character element -encodeState alphabet f symbols = getSubsetIndex alphabet symbolsSet emptyElement - where - emptyElement = f . toEnum $ length alphabet - symbolsSet = Set.fromList $ toList symbols - - -{-# INLINEABLE decodeState #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUChar -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUShort -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUInt -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> CULong -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word8 -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word16 -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word32 -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word64 -> [s] #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUChar -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUShort -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUInt -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> CULong -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word8 -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word16 -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word32 -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word64 -> [String] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUChar -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUShort -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUInt -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CULong -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word8 -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word16 -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word32 -> [ShortText] #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word64 -> [ShortText] #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> CUChar -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> CUShort -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> CUInt -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> CULong -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> Word -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> Word8 -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> Word16 -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> Word32 -> Set s #-} -{-# SPECIALISE decodeState :: Ord s => Alphabet s -> Word64 -> Set s #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUChar -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUShort -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUInt -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CULong -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word8 -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word16 -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word32 -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word64 -> Set String #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUChar -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUShort -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUInt -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CULong -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word8 -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word16 -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word32 -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word64 -> Set ShortText #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUChar -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUShort -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> CUInt -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> CULong -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word8 -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word16 -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word32 -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet s -> Word64 -> NonEmpty s #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUChar -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUShort -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CUInt -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> CULong -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word8 -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word16 -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word32 -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet String -> Word64 -> NonEmpty String #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUChar -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUShort -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CUInt -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> CULong -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word8 -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word16 -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word32 -> NonEmpty ShortText #-} -{-# SPECIALISE decodeState :: Alphabet ShortText -> Word64 -> NonEmpty ShortText #-} -decodeState - :: ( Bits e - , IsList (f s) - , Item (f s) ~ s - ) - => Alphabet s -- ^ Alphabet of symbols - -> e -- ^ State to decode - -> f s -decodeState alphabet state = fromList $ foldr pollSymbol mempty indices - where - indices = [ 0 .. len - 1 ] - len = length vec - vec = alphabetSymbols alphabet - pollSymbol i polled - | state `testBit` i = (vec V.! i) : polled - | otherwise = polled - diff --git a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/IUPAC.hs b/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/IUPAC.hs deleted file mode 100644 index 4fca1de70..000000000 --- a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/IUPAC.hs +++ /dev/null @@ -1,114 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Alphabet.IUPAC --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -module Data.Alphabet.IUPAC - ( iupacToAminoAcid - , iupacToDna - , iupacToRna - ) where - - -import Control.Arrow ((***)) -import Data.Bimap (Bimap) -import qualified Data.Bimap as BM -import Data.List.NonEmpty (NonEmpty) -import qualified Data.List.NonEmpty as NE -import Data.String - - --- | --- Substitutions for converting to an Amino Acid sequence based on IUPAC codes. -iupacToAminoAcid :: (IsString s, Ord s) => Bimap (NonEmpty s) (NonEmpty s) -iupacToAminoAcid = toBimap - [ ('A', "A") - , ('B', "DN") - , ('C', "C") - , ('D', "D") - , ('E', "E") - , ('F', "F") - , ('G', "G") - , ('H', "H") - , ('I', "I") - , ('K', "K") - , ('L', "L") - , ('M', "M") - , ('N', "N") - , ('P', "P") - , ('Q', "Q") - , ('R', "R") - , ('S', "S") - , ('T', "T") - , ('V', "V") - , ('W', "W") - , ('X', "ACDEFGHIKLMNPQRSTVWY") - , ('Y', "Y") - , ('Z', "EQ") - , ('-', "-") - , ('?', "-ACDEFGHIKLMNPQRSTVWY") - ] - - --- | --- Substitutions for converting to a DNA sequence based on IUPAC codes. -iupacToDna :: (IsString s, Ord s) => Bimap (NonEmpty s) (NonEmpty s) -iupacToDna = toBimap - [ ('A', "A") - , ('C', "C") - , ('G', "G") - , ('T', "T") - , ('R', "AG") - , ('Y', "CT") - , ('S', "CG") - , ('W', "AT") - , ('K', "GT") - , ('M', "AC") - , ('B', "CGT") - , ('D', "AGT") - , ('H', "ACT") - , ('V', "ACG") - , ('N', "ACGT") - , ('-', "-") - , ('?', "-ACGT") - - , ('a', "-A") - , ('c', "-C") - , ('g', "-G") - , ('t', "-T") - , ('r', "-AG") - , ('y', "-CT") - , ('s', "-CG") - , ('w', "-AT") - , ('k', "-GT") - , ('m', "-AC") - , ('b', "-CGT") - , ('d', "-AGT") - , ('h', "-ACT") - , ('v', "-ACG") - ] - - --- | --- Substitutions for converting to a RNA sequence based on IUPAC codes. -iupacToRna :: (IsString s, Ord s) => Bimap (NonEmpty s) (NonEmpty s) -iupacToRna = BM.mapMonotonic setUpdate $ BM.mapMonotonicR setUpdate iupacToDna - where - setUpdate = fmap f - where - f x - | x == fromString "T" = fromString "U" - | otherwise = x - - -toBimap :: (IsString s, Ord s) => [(Char, String)] -> Bimap (NonEmpty s) (NonEmpty s) -toBimap = BM.fromList . fmap transform - where - transform = pure . fromString . pure *** fmap (fromString . pure) . NE.fromList diff --git a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Internal.hs b/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Internal.hs deleted file mode 100644 index 68480869f..000000000 --- a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Internal.hs +++ /dev/null @@ -1,669 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Alphabet.Internal --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- We must ensure that missing and gap are appropriately --- code as "-" & "?", respectively, before this module is used, i.e., as output --- from either parsers or in unification step. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveDataTypeable #-} -{-# LANGUAGE DeriveFunctor #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE TypeFamilies #-} - -module Data.Alphabet.Internal - ( Alphabet(..) - , alphabetStateNames - , alphabetSymbols - , fromSymbols - , fromSymbolsWOGap - , fromSymbolsWithStateNames - , fromSymbolsWithStateNamesAndTCM - , fromSymbolsWithTCM - , gapIndex - , gapSymbol - , symbolSet - , truncateAtSymbol - , truncateAtMaxSymbol - , getSubsetIndex - , getSubsetIndices - ) where - -import Control.Arrow -import Control.DeepSeq (NFData) -import Control.Monad.State.Strict -import Data.Bifunctor (bimap) -import Data.Binary (Binary) -import Data.Bits -import Data.Data -import Data.Foldable -import Data.IntSet (IntSet) -import qualified Data.IntSet as Int -import Data.Key -import Data.List (elemIndex, intercalate, sort) -import Data.List.NonEmpty (NonEmpty(..), unzip) -import qualified Data.List.NonEmpty as NE -import qualified Data.Map as Map -import Data.Matrix.NotStupid (Matrix, matrix) -import Data.Maybe -import Data.Monoid -import Data.Ord -import Data.Semigroup.Foldable -import Data.Set (Set) -import qualified Data.Set as Set -import Data.String -import Data.Text.Short (ShortText) -import qualified Data.Vector as V -import Data.Vector.NonEmpty (Vector) -import qualified Data.Vector.NonEmpty as NEV -import Data.Word -import GHC.Exts (IsList(fromList), Item) -import GHC.Generics (Generic) -import Numeric.Natural -import Prelude hiding (lookup, unzip, zip) -import Test.QuickCheck ---import Test.QuickCheck.Arbitrary.Instances () - - --- | --- The index of the vector where the gap state is stored. --- --- /NOTE:/ This index value is very important for many gap-related operations, --- both internally for the 'Alphabet' module/linbrary and externally in general. -gapIndex :: Int -gapIndex = 0 - - --- | --- A collection of symbols and optional corresponding state names. -data Alphabet a = - Alphabet - { isSorted :: !Bool - , symbolVector :: {-# UNPACK #-} !(Vector a) - , stateNames :: [a] - } - deriving anyclass (Binary, NFData) - deriving stock (Data, Generic, Functor, Typeable) - - -type instance Key Alphabet = Int - - --- Newtypes for corecing and consolidation of alphabet input processing logic -newtype AlphabetInputSingle a = ASI { toSingle :: a } - deriving anyclass (NFData) - deriving stock (Data, Eq, Ord, Generic) - - -newtype AlphabetInputTuple a = ASNI { toTuple :: (a,a) } - deriving anyclass (NFData) - deriving stock (Data, Eq, Ord, Generic) - - --- | --- -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- --- - Supporting code and data structures: --- -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- --- - -newtype UnnamedSymbol a = Unnamed a - deriving anyclass (NFData) - deriving stock (Generic) - - -newtype NamedSymbol a = Named (a,a) - deriving anyclass (NFData) - deriving stock (Generic) - - -class InternalClass a where - - gapSymbol' :: a - - isGapSymboled :: a -> Bool - - isMissingSymboled :: a -> Bool - - -instance (Ord a, IsString a) => Arbitrary (Alphabet a) where - - arbitrary = do - n <- (arbitrary :: Gen Int) `suchThat` (\x -> 0 < x && x <= 62) - pure . fromSymbols $ take n symbolSpace - where - -- We do this to simplify Alphabet generation, ensuring that there is at least one non gap symbol. - symbolSpace = fromString . pure <$> "-" <>['0'..'9'] <> ['A'..'Z'] <> ['a'..'z'] <> "?" - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) -instance Ord a => Eq (Alphabet a) where - - lhs == rhs = - (length lhs == length rhs) && - ( (isSorted lhs && isSorted rhs && symbolVector lhs == symbolVector rhs) - || sort (toList lhs) == sort (toList rhs) - ) - - -instance Foldable Alphabet where - - {-# INLINE toList #-} - toList = toList . symbolVector - - {-# INLINE foldMap #-} - foldMap f = foldMap f . symbolVector - - {-# INLINE foldr #-} - foldr f e = foldr f e . symbolVector - - {-# INLINE foldl #-} - foldl f e = foldl f e . symbolVector - - {-# INLINE foldr1 #-} - foldr1 f = foldr1 f . symbolVector - - {-# INLINE foldl1 #-} - foldl1 f = foldl1 f . symbolVector - - {-# INLINE length #-} - length = length . symbolVector - - -instance Foldable1 Alphabet where - - {-# INLINE fold1 #-} - fold1 = fold1 . symbolVector - - {-# INLINE foldMap1 #-} - foldMap1 f = foldMap1 f . symbolVector - - {-# INLINE toNonEmpty #-} - toNonEmpty = toNonEmpty . symbolVector - - -instance FoldableWithKey Alphabet where - - {-# INLINE foldrWithKey #-} - foldrWithKey f e = foldrWithKey f e . symbolVector - - {-# INLINE foldlWithKey #-} - foldlWithKey f e = foldlWithKey f e . symbolVector - - -instance FoldableWithKey1 Alphabet where - - {-# INLINE foldMapWithKey1 #-} - foldMapWithKey1 f = foldMapWithKey1 f . symbolVector - - -instance Indexable Alphabet where - - {-# INLINE index #-} - index a i = fromMaybe raiseError $ i `lookup` a - where - raiseError = error $ fold - ["Error indexing Alphabet at location " - , show i - , ", valid inclusive index range is [0, " - , show $ length a - 1 - , "]." - ] - - -instance (Eq a, IsString a) => InternalClass (AlphabetInputSingle a) where - - gapSymbol' = ASI $ fromString "-" - isGapSymboled = (gapSymbol' ==) - isMissingSymboled = (ASI (fromString "?") ==) - - -instance (Eq a, IsString a) => InternalClass (AlphabetInputTuple a) where - - gapSymbol' = ASNI (fromString "-", fromString "-") - isGapSymboled (ASNI (x,_)) = x == fromString "-" - isMissingSymboled (ASNI (x,_)) = x == fromString "?" - - -instance Lookup Alphabet where - - {-# INLINE lookup #-} - lookup i = lookup i . symbolVector - - -instance Ord a => Ord (Alphabet a) where - - compare = comparing symbolVector - - -instance Show a => Show (Alphabet a) where - - show x = fold - [ "Alphabet: {" - , intercalate ", " $ show <$> toList x - , "}" - ] - - --- | --- \( \mathcal{O} \left( n \right) \) --- --- Retrieves the state names for the symbols of the 'Alphabet'. --- --- If the symbols of the 'Alphabet' were not given state names during --- construction then an empty list is returned. -alphabetStateNames :: (IsList (f a), Item (f a) ~ a) => Alphabet a -> f a -alphabetStateNames = fromList . toList . stateNames - - --- | --- \( \mathcal{O} \left( n \right) \) --- --- Retrieves the symbols of the 'Alphabet'. Synonym for 'toList'. -{-# INLINE[1] alphabetSymbols #-} -{-# RULES -"alphabetSymbols/Set" forall (x :: Ord a => Alphabet a). alphabetSymbols x = symbolSet x -"alphabetSymbols/Vector" alphabetSymbols = (NEV.unwrap :: Vector a -> V.Vector a) . symbolVector -#-} -alphabetSymbols :: (IsList (f a), Item (f a) ~ a) => Alphabet a -> f a -alphabetSymbols = fromList . toList - - --- | --- \( \mathcal{O} \left( 1 \right) \) --- --- Retrieves the "gap character" from the alphabet. -gapSymbol :: Alphabet a -> a -gapSymbol alphabet = alphabet ! gapIndex - - --- | --- \( \mathcal{O} \left( n \right) \) for a sorted alphabet. --- --- \( \mathcal{O} \left( n log n \right) \) for a /non-sorted/ alphabet. --- --- Retrieves the set of all symbols from the alphabet. -symbolSet :: Ord a => Alphabet a -> Set a -symbolSet a = - case toList $ symbolVector a of - [] -> mempty - -- First element is 'gap' - g:xs -> Set.insert g $ f xs - where - f | isSorted a = Set.fromDistinctAscList - | otherwise = Set.fromList - - --- | --- For a given subset of symbols, this function returns a positive 'Natural' number --- in the range @[0, 2^|alphabet| - 1]@. --- This number is the unique index of the given subset in the powerset of the alphabet. -{-# INLINE getSubsetIndices #-} -{-# SPECIALISE getSubsetIndices :: Alphabet String -> Set String -> IntSet #-} -{-# SPECIALISE getSubsetIndices :: Alphabet ShortText -> Set ShortText -> IntSet #-} -getSubsetIndices :: Ord a => Alphabet a -> Set a -> IntSet -getSubsetIndices a s - | isSorted a = produceSet . go low $ consumeSet s -- /O(log a + n)/, a >= n - | otherwise = produceSet . mo low $ consumeSet s -- /O(a)/ - where - vec = symbolVector a - gap = gapSymbol a - low = gapIndex + 1 - - inputHadGap = Set.member gap s - consumeSet = Set.toAscList . Set.delete gap - produceSet = addGapVal . Int.fromDistinctAscList - - addGapVal - | inputHadGap = Int.insert gapIndex - | otherwise = id - - -- Faster binary search for a sorted alphabet - go _ [] = [] - go !lo (x:xs) = - case withinVec vec x lo of - Right i -> i : go (i+1) xs - Left i -> go i xs - - -- Slower version for an unsorted alphabet - mo _ [] = [] - mo i (x:xs) - | i > length vec = [] - | x == (vec ! i) = i : mo 0 xs - | otherwise = mo (i+1) (x:xs) - - --- | --- For a given subset of symbols, this function returns a positive 'Natural' number --- in the range @[0, 2^|alphabet| - 1]@. --- This number is the unique index of the given subset in the powerset of the alphabet. -{-# INLINE getSubsetIndex #-} -{-# SPECIALISE getSubsetIndex :: Alphabet String -> Set String -> Word -> Word #-} -{-# SPECIALISE getSubsetIndex :: Alphabet ShortText -> Set ShortText -> Word -> Word #-} -{-# SPECIALISE getSubsetIndex :: Alphabet String -> Set String -> Word32 -> Word32 #-} -{-# SPECIALISE getSubsetIndex :: Alphabet ShortText -> Set ShortText -> Word32 -> Word32 #-} -{-# SPECIALISE getSubsetIndex :: Alphabet String -> Set String -> Word64 -> Word64 #-} -{-# SPECIALISE getSubsetIndex :: Alphabet ShortText -> Set ShortText -> Word64 -> Word64 #-} -{-# SPECIALISE getSubsetIndex :: Alphabet String -> Set String -> Natural -> Natural #-} -{-# SPECIALISE getSubsetIndex :: Alphabet ShortText -> Set ShortText -> Natural -> Natural #-} -getSubsetIndex :: (Bits b, Ord a) => Alphabet a -> Set a -> b -> b -getSubsetIndex a s zero - | isSorted a = addGapVal . go zero low $ consumeSet s -- /O(log a + n)/, a >= n - | otherwise = addGapVal . mo zero low $ consumeSet s -- /O(a)/ - where - vec = symbolVector a - gap = gapSymbol a - low = gapIndex + 1 - - consumeSet = Set.toAscList . Set.delete gap - inputHadGap = Set.member gap s - - addGapVal - | inputHadGap = (`setBit` gapIndex) - | otherwise = id - - -- Faster binary search for a sorted alphabet - go !bits _ [] = bits - go !bits !lo (x:xs) = - case withinVec vec x lo of - Right i -> go (bits .|. bit i) (i+1) xs - Left i -> go bits i xs - - -- Slower version for an unsorted alphabet - mo bits _ [] = bits - mo bits i (x:xs) - | i > length vec = bits - | x == (vec ! i) = mo (bits .|. bit i) low xs - | otherwise = mo bits (i+1) (x:xs) - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) --- --- Constructs an 'Alphabet' from a 'Foldable' structure of symbols which are --- 'IsString' values. --- WITHOUT ADDING GAPS -{-# INLINE[1] fromSymbolsWOGap #-} -{-# SPECIALISE fromSymbolsWOGap :: Foldable t => t String -> Alphabet String #-} -{-# SPECIALISE fromSymbolsWOGap :: Foldable t => t ShortText -> Alphabet ShortText #-} -{-# SPECIALISE fromSymbolsWOGap :: [String] -> Alphabet String #-} -{-# SPECIALISE fromSymbolsWOGap :: [ShortText] -> Alphabet ShortText #-} -{-# RULES -"fromSymbols/Set" forall (s :: (IsString x, Ord x) => Set x). fromSymbols s = - let g = fromString "-" - x = g : toList (Set.delete g s) - v = NEV.fromNonEmpty $ NE.fromList x - in Alphabet True v [] -#-} -fromSymbolsWOGap :: (Ord a, IsString a, Foldable t) => t a -> Alphabet a -fromSymbolsWOGap inputSymbols = Alphabet sorted symbols [] - where - symbols = NEV.fromNonEmpty . fmap toSingle . alphabetPreprocessingWOGap . fmap fromSingle $ toList inputSymbols - - sorted = - -- Coerce to a plain Vector, drop the last (gap) element - let v = init $ toList symbols - -- Zip each element with the next element, - -- and assert that all pairs are less-then-equal - in all (uncurry (<=)) . zip v $ tail v - - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) --- --- Constructs an 'Alphabet' from a 'Foldable' structure of symbols which are --- 'IsString' values. -{-# INLINE[1] fromSymbols #-} -{-# SPECIALISE fromSymbols :: Foldable t => t String -> Alphabet String #-} -{-# SPECIALISE fromSymbols :: Foldable t => t ShortText -> Alphabet ShortText #-} -{-# SPECIALISE fromSymbols :: [String] -> Alphabet String #-} -{-# SPECIALISE fromSymbols :: [ShortText] -> Alphabet ShortText #-} -{-# RULES -"fromSymbols/Set" forall (s :: (IsString x, Ord x) => Set x). fromSymbols s = - let g = fromString "-" - x = g : toList (Set.delete g s) - v = NEV.fromNonEmpty $ NE.fromList x - in Alphabet True v [] -#-} -fromSymbols :: (Ord a, IsString a, Foldable t) => t a -> Alphabet a -fromSymbols inputSymbols = Alphabet sorted symbols [] - where - symbols = NEV.fromNonEmpty . fmap toSingle . alphabetPreprocessing . fmap fromSingle $ toList inputSymbols - - sorted = - -- Coerce to a plain Vector, drop the last (gap) element - let v = init $ toList symbols - -- Zip each element with the next element, - -- and assert that all pairs are less-then-equal - in all (uncurry (<=)) . zip v $ tail v - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) --- --- Constructs an 'Alphabet' from a 'Foldable' structure of symbols and --- corresponding state names, both of which are 'IsString' values. --- --- The input ordering is preserved. -{-# SPECIALISE fromSymbolsWithStateNames :: Foldable t => t ( String, String) -> Alphabet String #-} -{-# SPECIALISE fromSymbolsWithStateNames :: Foldable t => t (ShortText, ShortText) -> Alphabet ShortText #-} -{-# SPECIALISE fromSymbolsWithStateNames :: [( String, String)] -> Alphabet String #-} -{-# SPECIALISE fromSymbolsWithStateNames :: [(ShortText, ShortText)] -> Alphabet ShortText #-} -fromSymbolsWithStateNames :: (Ord a, IsString a, Foldable t) => t (a, a) -> Alphabet a -fromSymbolsWithStateNames inputSymbols = Alphabet False symbols names - where - (symbols, names) = bimap NEV.fromNonEmpty toList . unzip . fmap toTuple . alphabetPreprocessing . fmap fromTuple $ toList inputSymbols - - --- | --- Constructs an 'Alphabet' with a corresponding TCM. Permutes TCM rows and --- columns as the 'Alphabet' is reordered. Deletes TCM rows and columns where --- 'Alphabet' symbols are eliminated. --- --- If the alphabet has been permuted the corresponding TCM needs to be permuted in the same manner. --- --- /O(n*log(n) + n^2)/ -{-# SPECIALISE fromSymbolsWithTCM :: FoldableWithKey t => t String -> Matrix b -> (Alphabet String , Matrix b) #-} -{-# SPECIALISE fromSymbolsWithTCM :: FoldableWithKey t => t ShortText -> Matrix b -> (Alphabet ShortText, Matrix b) #-} -{-# SPECIALISE fromSymbolsWithTCM :: [String] -> Matrix b -> (Alphabet String , Matrix b) #-} -{-# SPECIALISE fromSymbolsWithTCM :: [ShortText] -> Matrix b -> (Alphabet ShortText, Matrix b) #-} -fromSymbolsWithTCM :: (Ord a, IsString a, FoldableWithKey t) => t a -> Matrix b -> (Alphabet a, Matrix b) -fromSymbolsWithTCM symbols originalTcm = (alphabet, permutedTcm) - where - (uniqueSymbols, permuted) = removeSpecialSymbolsAndDuplicates symbols - alphabet = Alphabet True (NEV.fromNonEmpty uniqueSymbols) [] - permutedTcm = getPermutionContext alphabet originalTcm permuted - - removeSpecialSymbolsAndDuplicates xs = (uniqueVals, permutedKeys) - where - count = length uniqueSymbols - 1 - uniqueVals = NE.fromList . fmap toSingle . ([gapSymbol']<>) . Set.toAscList $ Map.keysSet uniques - permutedKeys = - case gapMay of - Nothing -> NE.fromList $ toList uniques <> [count] - Just i -> NE.fromList $ toList uniques <> [i] - - (uniques, gapMay) = foldrWithKey go (mempty, Nothing) . fmap fromSingle $ toList xs - go i s acc@(m, g) - | isGapSymboled s = (m, Just i) - | isMissingSymboled s = acc - | otherwise = (Map.insert s i m, g) - - --- | --- Constructs an 'Alphabet' with a corresponding TCM. Permutes TCM rows and --- columns as the 'Alphabet' is reordered. Deletes TCM rows and columns where --- 'Alphabet' symbols are eliminated. --- --- If the alphabet has been permuted the corresponding TCM needs to be permuted in the same manner. --- --- /O(n*log(n) + n^2)/ -{-# SPECIALISE fromSymbolsWithStateNamesAndTCM :: FoldableWithKey t => t ( String, String) -> Matrix b -> (Alphabet String , Matrix b) #-} -{-# SPECIALISE fromSymbolsWithStateNamesAndTCM :: FoldableWithKey t => t (ShortText, ShortText) -> Matrix b -> (Alphabet ShortText, Matrix b) #-} -{-# SPECIALISE fromSymbolsWithStateNamesAndTCM :: [( String, String)] -> Matrix b -> (Alphabet String , Matrix b) #-} -{-# SPECIALISE fromSymbolsWithStateNamesAndTCM :: [(ShortText, ShortText)] -> Matrix b -> (Alphabet ShortText, Matrix b) #-} -fromSymbolsWithStateNamesAndTCM :: (Ord a, IsString a, FoldableWithKey t) => t (a, a) -> Matrix b -> (Alphabet a, Matrix b) -fromSymbolsWithStateNamesAndTCM symbols originalTcm = (alphabet, permutedTcm) - where - (uniqueSymbols, uniqueStates, permuted) = removeSpecialSymbolsAndDuplicates symbols - alphabet = Alphabet True (NEV.fromNonEmpty uniqueSymbols) uniqueStates - permutedTcm = getPermutionContext alphabet originalTcm permuted - - removeSpecialSymbolsAndDuplicates xs = (uniqueVals, uniqueNames, permutedKeys) - where - count = length uniqueSymbols - 1 - (uniqueVals, uniqueNames) = first NE.fromList . unzip . fmap toTuple . ([gapSymbol']<>) . Set.toAscList $ Map.keysSet uniques - permutedKeys = - case gapMay of - Nothing -> NE.fromList $ toList uniques <> [count] - Just i -> NE.fromList $ toList uniques <> [i] - - (uniques, gapMay) = foldrWithKey go (mempty, Nothing) . fmap fromTuple $ toList xs - go i s acc@(m, g) - | isGapSymboled s = (m, Just i) - | isMissingSymboled s = acc - | otherwise = (Map.insert s i m, g) - - -getPermutionContext :: (Foldable1 f, Foldable t) => t a -> Matrix b -> f Int -> Matrix b -getPermutionContext alphabet originalTcm permuted - | isPermuted = matrix len len f - | otherwise = originalTcm - where - len = length alphabet - oldOrdering = NEV.fromNonEmpty permuted - isPermuted = oldOrdering /= NEV.generate len id - f (i,j) = originalTcm ! (oldOrdering ! i, oldOrdering ! j) - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) --- --- Attempts to find the symbol in the 'Alphabet'. --- If the symbol exists, returns an alphabet that includes all symbols occurring --- before the supplied symbol and excludes all symbols occurring after the --- supplied symbol. The gap character is preserved in the alphabet --- regardless of the supplied symbol. --- --- The resulting alphabet /includes/ the input symbol. -truncateAtSymbol :: (Ord a, IsString a) => a -> Alphabet a -> Alphabet a -truncateAtSymbol symbol alphabet = - case elemIndex symbol $ toList alphabet of - Nothing -> alphabet - Just i -> - case alphabetStateNames alphabet of - [] -> fromSymbols . take (i + 1) $ alphabetSymbols alphabet - xs -> fromSymbolsWithStateNames . take (i + 1) $ zip (alphabetSymbols alphabet) xs - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) --- --- Attempts to find the maximum provided symbol in the Alphabet. --- If the any of the provided symbols exists, returns an alphabet including all --- the symbols occurring before the maximum provided symbol and excluding all symbols --- occurring after the maximum supplied symbol. The gap character is --- preserved in the alphabet regardless of the supplied symbol. --- --- The resulting alphabet /includes/ the input symbol. -truncateAtMaxSymbol :: (Foldable t, Ord a, IsString a) => t a -> Alphabet a -> Alphabet a -truncateAtMaxSymbol symbols alphabet = - case maxIndex of - Nothing -> alphabet - Just i -> - case alphabetStateNames alphabet of - [] -> fromSymbols . take (i + 1) $ alphabetSymbols alphabet - xs -> fromSymbolsWithStateNames . take (i + 1) $ zip (alphabetSymbols alphabet) xs - where - maxIndex = foldlWithKey' f Nothing alphabet - f e k v - | v `notElem` symbols = e - | otherwise = - case e of - Nothing -> Just k - Just i -> Just $ max k i - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) -alphabetPreprocessingWOGap :: (Ord a, InternalClass a, Foldable t) => t a -> NonEmpty a -alphabetPreprocessingWOGap = NE.fromList . sort . removeSpecialSymbolsAndDuplicates . toList - where - removeSpecialSymbolsAndDuplicates = (`evalState` mempty) . filterM f - where - f x - | isGapSymboled x = pure False - | isMissingSymboled x = pure False - | otherwise = do - seenSet <- get - _ <- put $ x `Set.insert` seenSet - pure $ x `notElem` seenSet - - - --- | --- \( \mathcal{O} \left( n * \log_2 n \right) \) -alphabetPreprocessing :: (Ord a, InternalClass a, Foldable t) => t a -> NonEmpty a -alphabetPreprocessing = prependGapSymbol . sort . removeSpecialSymbolsAndDuplicates . toList - where - prependGapSymbol xs = - case xs of - [] -> gapSymbol':|[] - y:ys -> gapSymbol':|(y:ys) - - - removeSpecialSymbolsAndDuplicates = (`evalState` mempty) . filterM f - where - f x - | isGapSymboled x = pure False - | isMissingSymboled x = pure False - | otherwise = do - seenSet <- get - _ <- put $ x `Set.insert` seenSet - pure $ x `notElem` seenSet - - -fromSingle :: a -> AlphabetInputSingle a -fromSingle = ASI - - -fromTuple :: (a, a) -> AlphabetInputTuple a -fromTuple = ASNI - - -{-# INLINE withinVec #-} -{-# SPECIALISE withinVec :: Vector String -> String -> Int -> Either Int Int #-} --- {-# SPECIALISE withinVec :: Vector ShortText -> ShortText -> Int -> Either Int Int #-} -withinVec :: Ord a => Vector a -> a -> Int -> Either Int Int -withinVec v e m - | e == gap = Right gapIndex - | otherwise = go m $ length v - 1 - where - gap = v ! gapIndex - -- Perform a binary search on the unboxed vector - -- to determine if a symbol is present. - -- - -- Equally fast, and uses less memory than a Set. - {-# INLINE go #-} - go !lo !hi - | lo > hi = Left hi - | otherwise = let !md = (hi + lo) `div` 2 - !z = v ! md - in case z `compare` e of - EQ -> Right md - LT -> go (md + 1) hi - GT -> go lo (md - 1) diff --git a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Special.hs b/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Special.hs deleted file mode 100644 index b7794976c..000000000 --- a/pkg/PhyGraph/lib/alphabet/src/Data/Alphabet/Special.hs +++ /dev/null @@ -1,123 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Alphabet.Special --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -module Data.Alphabet.Special - ( -- * Special Alphabet constructions - aminoAcidAlphabet - , dnaAlphabet - , rnaAlphabet - , discreteAlphabet - -- * Special Alphabet Queries - , isAlphabetAminoAcid - , isAlphabetDna - , isAlphabetRna - , isAlphabetDiscrete - ) where - -import Data.Alphabet.IUPAC -import Data.Alphabet.Internal -import Data.Bimap (Bimap) -import qualified Data.Bimap as BM -import Data.Char (isUpper) -import Data.Foldable -import Data.List.NonEmpty (NonEmpty) -import qualified Data.List.NonEmpty as NE -import qualified Data.Set as Set -import Data.String - - --- | --- Alphabet of amino acids. -aminoAcidAlphabet :: (IsString s, Ord s) => Alphabet s -aminoAcidAlphabet = fromBimap iupacToAminoAcid - - --- | --- Alphabet of DNA bases. -dnaAlphabet :: (IsString s, Ord s) => Alphabet s -dnaAlphabet = fromBimap iupacToDna - - --- | --- Alphabet of RNA bases. -rnaAlphabet :: (IsString s, Ord s) => Alphabet s -rnaAlphabet = fromBimap iupacToRna - - --- | --- Alphabet of "discrete" values. --- --- The discrete alphabet includes the following 63 values: --- --- @ [\'0\'..\'9\'] <> [\'A\'..\'Z\'] <> [\'a\'..\'z\'] <> "-" @ -discreteAlphabet :: (IsString s, Ord s) => Alphabet s -discreteAlphabet = fromSymbols $ fromString . pure <$> fold [['0'..'9'], ['A'..'Z'], ['a'..'z'], "-"] - - --- | --- /O(n)/ --- --- Determines if the supplied alphabet represents amino acid symbols. --- --- Useful for determining if an 'NonEmpty' should be rendered as an IUPAC --- code. -isAlphabetAminoAcid :: (IsString s, Ord s) => Alphabet s -> Bool -isAlphabetAminoAcid = isAlphabetSubsetOf aminoAcidAlphabet - - --- | --- /O(n)/ --- --- Determines if the supplied alphabet represents DNA symbols. --- --- Useful for determining if an 'NonEmpty' should be rendered as an IUPAC --- code. -isAlphabetDna :: (IsString s, Ord s) => Alphabet s -> Bool -isAlphabetDna = isAlphabetSubsetOf dnaAlphabet - - --- | --- /O(n)/ --- --- Determines if the supplied alphabet represents DNA symbols. --- --- Useful for determining if an 'NonEmpty' should be rendered as an IUPAC --- code. -isAlphabetRna :: (IsString s, Ord s) => Alphabet s -> Bool -isAlphabetRna = isAlphabetSubsetOf rnaAlphabet - - --- | --- /O(n)/ --- --- Determines if the supplied alphabet represents DNA symbols. --- --- Useful for determining if an 'NonEmpty' should be rendered as an IUPAC --- code. -isAlphabetDiscrete :: (IsString s, Ord s) => Alphabet s -> Bool -isAlphabetDiscrete = isAlphabetSubsetOf discreteAlphabet - - -isAlphabetSubsetOf :: Ord s => Alphabet s -> Alphabet s -> Bool -isAlphabetSubsetOf specialAlphabet queryAlphabet = querySet `Set.isSubsetOf` specialSet - where - querySet = symbolSet queryAlphabet - specialSet = symbolSet specialAlphabet - - -fromBimap :: (IsString s, Ord s) => Bimap (NonEmpty String) a -> Alphabet s -fromBimap = fromSymbols . fmap fromString . filter isUpperCaseStr . fmap NE.head . BM.keys - where - isUpperCaseStr (x:_) = isUpper x - isUpperCaseStr _ = False - - diff --git a/pkg/PhyGraph/lib/alphabet/test/Data/Alphabet/Test.hs b/pkg/PhyGraph/lib/alphabet/test/Data/Alphabet/Test.hs deleted file mode 100644 index 18923c9c4..000000000 --- a/pkg/PhyGraph/lib/alphabet/test/Data/Alphabet/Test.hs +++ /dev/null @@ -1,224 +0,0 @@ ------------------------------------------------------------------------------- --- | --- Module : Data.Alphabet.Test --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -module Data.Alphabet.Test - ( testSuite - ) where - -import Data.Alphabet.Internal -import Data.Foldable -import Data.List (nubBy) -import qualified Data.Set as Set -import Test.Tasty -import Test.Tasty.HUnit as HU -import Test.Tasty.QuickCheck as QC - - --- | --- The test-suite for the 'Alphabet' data type. -testSuite :: TestTree -testSuite = testGroup "Alphabet Tests" - [ testPropertyCases - , testExampleCases - ] - - -testPropertyCases :: TestTree -testPropertyCases = testGroup "Invariant properties" - [ alphabetStateNamesProperties - , alphabetSymbolsProperties - , gapSymbolProperties - , truncateAtSymbolProperties - , truncateAtMaxSymbolProperties - ] - - -testExampleCases :: TestTree -testExampleCases = testGroup "Example cases for Data.Alphabet" - [ alphabetDNACases - , subsetIndex - ] - - -alphabetStateNamesProperties :: TestTree -alphabetStateNamesProperties = testGroup "Properties of alphabetStateNames" - [ QC.testProperty "The list of state names is always empty if none are supplied" - noStateNames - ] - where - noStateNames :: [String] -> Bool - noStateNames = null . alphabetStateNames . fromSymbols - - -alphabetSymbolsProperties :: TestTree -alphabetSymbolsProperties = testGroup "Properties of alphabetSymbols" - [ QC.testProperty "The list of symbols is the same if we forget the state names" - forgetStateNames - ] - where - forgetStateNames :: [(String, String)] -> Property - forgetStateNames strs' = - let strs = - nubBy (\p q -> fst p == fst q) strs' in - (alphabetSymbols . fromSymbolsWithStateNames $ strs) - === (alphabetSymbols . fromSymbols . fmap fst $ strs) - - -gapSymbolProperties :: TestTree -gapSymbolProperties = testGroup "Properties of gapSymbol" - [ QC.testProperty "The gap symbol is always \"-\"" constGapSymbol - ] - where - constGapSymbol :: [(String, String)] -> Property - constGapSymbol = (=== "-") . gapSymbol . fromSymbolsWithStateNames - - -truncateAtSymbolProperties :: TestTree -truncateAtSymbolProperties = testGroup "Properties of truncateAtSymbol" - [ QC.testProperty - "TruncateAtSymbol y (fromSymbols (xs ++ [y] ++ ys)) == fromSymbols (xs ++ [y])" - splitOrderedList - ] - where - splitOrderedList :: (NonNegative Int, [String]) -> Property - splitOrderedList (NonNegative n, strs') = - let strs = toList $ fromSymbols strs' - in case splitAt n strs of - (_, []) -> property True - (xs, y:_) -> (truncateAtSymbol y . fromSymbols $ strs) === fromSymbols (xs <> [y]) - - -truncateAtMaxSymbolProperties :: TestTree -truncateAtMaxSymbolProperties = testGroup "Properties of truncateAtMaxSymbol" - [ QC.testProperty - "truncateAtMaxSymbol of the input returns the original alphabet" - truncatePreserve - ] - where - truncatePreserve :: [String] -> Property - truncatePreserve strs = let alpha = fromSymbols strs in - truncateAtMaxSymbol strs alpha === alpha - - --- Cases for unit tests - -alphabetDNAString :: [(String, String)] -alphabetDNAString = - [ ("A", "adenine") - , ("C", "cytosine") - , ("G", "guanine") - , ("T", "thymine") - ] - - -{- -alphabetDNAText :: [(T.Text, T.Text)] -alphabetDNAText = - fmap (\(s1, s2) -> (T.pack s1, T.pack s2)) alphabetDNAString --} - - -alphabetDNA :: Alphabet String -alphabetDNA = fromSymbolsWithStateNames alphabetDNAString - - -alphabetDNACases :: TestTree -alphabetDNACases = - testGroup - (unlines - ["Cases for DNA alphabet given by:" - , " A adenine" - , " C cytosine" - , " G guanine" - , " T thymine" - ] - ) - [ HU.testCase "The symbols are A, C, G, T and -" symbols1 - , HU.testCase "The state names are adenine, cytosine, guanine, thymine and -" states1 - ] - where - symbols1 :: Assertion - symbols1 = alphabetSymbols alphabetDNA @?= ["A", "C", "G", "T", "-"] - - states1 :: Assertion - states1 = alphabetStateNames alphabetDNA - @?= ["adenine", "cytosine", "guanine", "thymine", "-"] - - - -subsetIndex :: TestTree -subsetIndex = - testGroup "Subset Index Tests:" - [ HU.testCase - "getSortedLookup agrees for sorted and unsorted alphabet [0, 1, 2] and input 1" - sortedLookup - , HU.testCase - "getSortedLookup agrees for sorted and unsorted alphabet [0, 1, 2] and gap input" - gapLookup - , QC.testProperty - (unlines - [ "getSortedLookup agrees for sorted and unsorted alphabets [0..n] for inputs" - , " as subsets from [0..i] with i <= n" - ]) - sortedUnsortedAgree - , QC.testProperty - "getSortedLookup agrees for sorted and unsorted alphabets [0..n] for gap input" - sortedUnsortedGapAgree - ] - where - alphabet :: Alphabet String - alphabet = fromSymbols ["0","1","2"] - - sortedAlphabet :: Alphabet String - sortedAlphabet = alphabet {isSorted = True} - - sortedLookup :: Assertion - sortedLookup = - getSubsetIndex alphabet (Set.singleton "1") - @?= - getSubsetIndex sortedAlphabet (Set.singleton "1") - - gapLookup :: Assertion - gapLookup = - getSubsetIndex alphabet (Set.singleton $ gapSymbol alphabet) - @?= - getSubsetIndex sortedAlphabet (Set.singleton $ gapSymbol sortedAlphabet) - --- This is a more elaborate version of the above version making sure --- both the sorted and unsorted branches agree for a given ambiguity group. - sortedUnsortedAgree :: Int -> Int -> Property - sortedUnsortedAgree i n = - let - symbols = fmap show [0..n] - unsortedAlphabet = fromSymbols symbols - sortedAlphabet = unsortedAlphabet {isSorted = True} - lookupGroup = Set.singleton (show i) - -- generator for a subset of the list [0..i] - groupGen = Set.fromList <$> sublistOf (fmap show [0..i]) - in - -- Make sure the pre-condition holds that i is less than or equal to n. - (i <= n) ==> - forAll groupGen $ \ambGroup -> - getSubsetIndex unsortedAlphabet ambGroup === - getSubsetIndex sortedAlphabet ambGroup - - sortedUnsortedGapAgree :: Int -> Property - sortedUnsortedGapAgree n = - let - symbols = fmap show [0..n] - unsortedAlphabet = fromSymbols symbols - sortedAlphabet = unsortedAlphabet {isSorted = True} - gap = Set.singleton $ gapSymbol sortedAlphabet - in - -- Test sorted and unsorted lookup is the same for gap character - getSubsetIndex unsortedAlphabet gap === - getSubsetIndex sortedAlphabet gap diff --git a/pkg/PhyGraph/lib/alphabet/test/TestSuite.hs b/pkg/PhyGraph/lib/alphabet/test/TestSuite.hs deleted file mode 100644 index 31a7e442f..000000000 --- a/pkg/PhyGraph/lib/alphabet/test/TestSuite.hs +++ /dev/null @@ -1,26 +0,0 @@ ------------------------------------------------------------------------------- --- | --- Module : Main --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -module Main where - -import Data.Alphabet.Test -import Test.Tasty -import Test.Tasty.Ingredients.Rerun (rerunningTests) - - --- | --- The entry point for the test-suite of the 'Data.Alphabet.Alphabet' data type. -main :: IO () -main = - defaultMainWithIngredients - [ rerunningTests defaultIngredients ] - testSuite diff --git a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter.hs b/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter.hs deleted file mode 100644 index c71b4f395..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter.hs +++ /dev/null @@ -1,555 +0,0 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE GADTs #-} -{-# LANGUAGE RankNTypes #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE UnboxedTuples #-} - -module Bio.DynamicCharacter - ( -- * Element Varieties of a Dynamic Character - SlimState - , WideState - , HugeState - -- * Generic Dynamic Character Constructions - , OpenDynamicCharacter - , TempOpenDynamicCharacter - -- * Dynamic Character Immutable Varieties - , SlimDynamicCharacter - , WideDynamicCharacter - , HugeDynamicCharacter - -- * Dynamic Character Mutable Varieties - , TempSlimDynamicCharacter - , TempWideDynamicCharacter - , TempHugeDynamicCharacter - -- * Queries - , isAlign - , isDelete - , isInsert - , isGapped - , isGap - , isMissing - , characterLength - -- * Strictness - , forceDynamicCharacter - -- * Mutators - , setAlign - , setDelete - , setInsert - , setGapped - , setFrom - , transposeCharacter - -- * Extractors - , extractMedians - , extractMediansLeft - , extractMediansRight - , extractMediansGapped - , extractMediansLeftGapped - , extractMediansRightGapped - -- * Immutable Constructors - , encodeDynamicCharacter - , generateCharacter - -- * Mutable Constructors - , newTempCharacter - , freezeTempCharacter - , unsafeCharacterBuiltByST - , unsafeCharacterBuiltByBufferedST - -- * Rendering - , renderDynamicCharacter - ) where - -import Control.Monad.Primitive -import Control.Monad.ST -import Data.Alphabet -import Data.Alphabet.Codec -import Data.BitVector.LittleEndian -import Data.Bits -import Data.Foldable -import Data.Ord -import Data.STRef -import Data.Set (Set) -import qualified Data.Vector as V -import Data.Vector.Generic (Mutable, Vector, unsafeFreeze, (!)) -import qualified Data.Vector.Generic as GV -import Data.Vector.Generic.Mutable (unsafeNew, unsafeRead, unsafeWrite) -import qualified Data.Vector.Generic.Mutable as GMV (length) -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import Data.Word -import Foreign.C.Types - - --- | --- Encoding for a dynamic character element with an alphabet size of /8/ or less. --- --- /NOTE:/ This encoding uses more bits than required! This is due to the C FFI --- implementation details. It would be possible to reduce this to a 'CUChar' if --- and only iff the C interface and implementation is updated. -type SlimState = CUInt - - --- | --- Encoding for a dynamic character element with an alphabet size in the range [9, 64]. -type WideState = Word64 - - --- | --- Encoding for a dynamic character element with an alphabet size of /65/ or greater. -type HugeState = BitVector - - --- | --- This triple of vectors is the general structure of all dynamic character --- representations. Each vector is equal length. --- --- A triple of empty vectors represents a missing character. This can be queried --- by 'isMissing'. --- --- All triples are arranged thusly: --- --- * 1: ``Left'' character. When normalized, this will be the shorter character. --- Delete events leave a void in this character. --- --- * 2: Median character. The alignment of both the Left and Right characters. --- Align, Delete, & Insert do *not* leave a void. This vector will always --- contain states with one or more bits set, i.e. never contain voids. --- --- * 3: ``Right'' character. When normalized, this will be the longer character. --- Insert events leave a void in this character. --- -type OpenDynamicCharacter v e = ( v e, v e, v e ) - - --- | --- Encoding for dynamic characters with an alphabet size of /8/ or less. -type SlimDynamicCharacter = OpenDynamicCharacter SV.Vector SlimState - --- | --- Encoding for dynamic characters with an alphabet size in the range [9, 64]. -type WideDynamicCharacter = OpenDynamicCharacter UV.Vector WideState - - --- | --- Encoding for dynamic characters with an alphabet size of /65/ or greater. -type HugeDynamicCharacter = OpenDynamicCharacter V.Vector HugeState - - --- | --- Generic representation of a /mutable/ dynamic character. -type TempOpenDynamicCharacter m v e = OpenDynamicCharacter (Mutable v (PrimState m)) e - - --- | --- Mutable encoding of 'SlimDynamicCharacter'. -type TempSlimDynamicCharacter m = TempOpenDynamicCharacter m SV.Vector SlimState - - --- | --- Mutable encoding of 'WideDynamicCharacter'. -type TempWideDynamicCharacter m = TempOpenDynamicCharacter m UV.Vector WideState - - --- | --- Mutable encoding of 'HugeDynamicCharacter'. -type TempHugeDynamicCharacter m = TempOpenDynamicCharacter m V.Vector HugeState - - -isAlign, isDelete, isInsert, isGapped :: (FiniteBits e, Vector v e) => OpenDynamicCharacter v e -> Int -> Bool -{-# INLINEABLE isAlign #-} -{-# SPECIALISE isAlign :: SlimDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isAlign :: WideDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isAlign :: HugeDynamicCharacter -> Int -> Bool #-} -{-# INLINEABLE isDelete #-} -{-# SPECIALISE isDelete :: SlimDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isDelete :: WideDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isDelete :: HugeDynamicCharacter -> Int -> Bool #-} -{-# INLINEABLE isInsert #-} -{-# SPECIALISE isInsert :: SlimDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isInsert :: WideDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isInsert :: HugeDynamicCharacter -> Int -> Bool #-} -{-# INLINEABLE isGapped #-} -{-# SPECIALISE isGapped :: SlimDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isGapped :: WideDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isGapped :: HugeDynamicCharacter -> Int -> Bool #-} -isAlign (lc,_,rc) i = i < GV.length lc && popCount (lc ! i) /= 0 && popCount (rc ! i) /= 0 -isDelete (lc,_,rc) i = i < GV.length lc && popCount (lc ! i) == 0 && popCount (rc ! i) /= 0 -isInsert (lc,_,rc) i = i < GV.length lc && popCount (lc ! i) /= 0 && popCount (rc ! i) == 0 -isGapped (lc,_,rc) i = i < GV.length lc && popCount (lc ! i) == 0 && popCount (rc ! i) == 0 - - -{-# INLINEABLE isGap #-} -{-# SPECIALISE isGap :: SlimDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isGap :: WideDynamicCharacter -> Int -> Bool #-} -{-# SPECIALISE isGap :: HugeDynamicCharacter -> Int -> Bool #-} -isGap :: (Bits e, Vector v e) => OpenDynamicCharacter v e -> Int -> Bool -isGap (_,mc,_) i = i < GV.length mc && - let val = mc ! i - gap = buildGap val - in gap == val - - -{-# INLINEABLE isMissing #-} -{-# SPECIALISE isMissing :: SlimDynamicCharacter -> Bool #-} -{-# SPECIALISE isMissing :: WideDynamicCharacter -> Bool #-} -{-# SPECIALISE isMissing :: HugeDynamicCharacter -> Bool #-} -isMissing :: Vector v e => OpenDynamicCharacter v e -> Bool -isMissing (x,y,z) = GV.length x == 0 && GV.length y == 0 && GV.length z == 0 - - -{-# INLINEABLE setFrom #-} -{-# SPECIALISE setFrom :: SlimDynamicCharacter -> TempSlimDynamicCharacter (ST s) -> Int -> Int -> ST s () #-} -{-# SPECIALISE setFrom :: WideDynamicCharacter -> TempWideDynamicCharacter (ST s) -> Int -> Int -> ST s () #-} -{-# SPECIALISE setFrom :: HugeDynamicCharacter -> TempHugeDynamicCharacter (ST s) -> Int -> Int -> ST s () #-} -setFrom - :: ( PrimMonad m - , Vector v e - ) - => OpenDynamicCharacter v e -- ^ source - -> TempOpenDynamicCharacter m v e -- ^ destination - -> Int -- ^ Index to read from source - -> Int -- ^ Index to write to destination - -> m () -setFrom (slc,smc,src) (dlc,dmc,drc) i j = - unsafeWrite dlc j (slc ! i) *> unsafeWrite dmc j (smc ! i) *> unsafeWrite drc j (src ! i) - - -{-# INLINEABLE setAlign #-} -{-# SPECIALISE setAlign :: TempSlimDynamicCharacter (ST s) -> Int -> SlimState -> SlimState -> SlimState -> ST s () #-} -{-# SPECIALISE setAlign :: TempWideDynamicCharacter (ST s) -> Int -> WideState -> WideState -> WideState -> ST s () #-} -{-# SPECIALISE setAlign :: TempHugeDynamicCharacter (ST s) -> Int -> HugeState -> HugeState -> HugeState -> ST s () #-} -setAlign - :: ( PrimMonad m - , Vector v e - ) - => TempOpenDynamicCharacter m v e - -> Int -- ^ Index to set - -> e -- ^ Aligned ``Left'' element - -> e -- ^ Median Element - -> e -- ^ Aligned ``Right'' Element - -> m () -setAlign (lc,mc,rc) i le me re = - unsafeWrite lc i le *> unsafeWrite mc i me *> unsafeWrite rc i re - - -{-# INLINEABLE setDelete #-} -{-# SPECIALISE setDelete :: TempSlimDynamicCharacter (ST s) -> Int -> SlimState -> SlimState -> ST s () #-} -{-# SPECIALISE setDelete :: TempWideDynamicCharacter (ST s) -> Int -> WideState -> WideState -> ST s () #-} -{-# SPECIALISE setDelete :: TempHugeDynamicCharacter (ST s) -> Int -> HugeState -> HugeState -> ST s () #-} -setDelete - :: ( Bits e - , PrimMonad m - , Vector v e - ) - => TempOpenDynamicCharacter m v e -- ^ Modifiable character - -> Int -- ^ Index to set - -> e -- ^ Deleted ``Right'' element - -> e -- ^ Median Element - -> m () -setDelete (lc,mc,rc) i me re = - unsafeWrite lc i (me `xor` me) *> unsafeWrite mc i me *> unsafeWrite rc i re - - -{-# INLINEABLE setInsert #-} -{-# SPECIALISE setInsert :: TempSlimDynamicCharacter (ST s) -> Int -> SlimState -> SlimState -> ST s () #-} -{-# SPECIALISE setInsert :: TempWideDynamicCharacter (ST s) -> Int -> WideState -> WideState -> ST s () #-} -{-# SPECIALISE setInsert :: TempHugeDynamicCharacter (ST s) -> Int -> HugeState -> HugeState -> ST s () #-} -setInsert - :: ( Bits e - , PrimMonad m - , Vector v e - ) - => TempOpenDynamicCharacter m v e -- ^ Modifiable character - -> Int -- ^ Index to set - -> e -- ^ Median Element - -> e -- ^ Inserted ``Left'' element - -> m () -setInsert (lc,mc,rc) i le me = - unsafeWrite lc i le *> unsafeWrite mc i me *> unsafeWrite rc i (me `xor` me) - - -{-# INLINEABLE setGapped #-} -{-# SPECIALISE setGapped :: TempSlimDynamicCharacter (ST s) -> Int -> ST s () #-} -{-# SPECIALISE setGapped :: TempWideDynamicCharacter (ST s) -> Int -> ST s () #-} -{-# SPECIALISE setGapped :: TempHugeDynamicCharacter (ST s) -> Int -> ST s () #-} -setGapped - :: ( Bits e - , PrimMonad m - , Vector v e - ) - => TempOpenDynamicCharacter m v e -- ^ Modifiable character - -> Int -- ^ Index to set - -> m () -setGapped (lc,mc,rc) i = do - tmp <- unsafeRead mc 0 -- Get cell dimension context from first cell in vector - let (# gap, nil #) = buildGapAndNil tmp - unsafeWrite lc i nil - unsafeWrite mc i gap - unsafeWrite rc i nil - - -{-# INLINEABLE transposeCharacter #-} -transposeCharacter :: OpenDynamicCharacter v e -> OpenDynamicCharacter v e -transposeCharacter (lc,mc,rc) = (rc,mc,lc) - - -{-# INLINEABLE characterLength #-} -characterLength :: Vector v e => OpenDynamicCharacter v e -> Word -characterLength = toEnum . GV.length . extractMediansGapped - - -{-# INLINEABLE forceDynamicCharacter #-} -{-# SPECIALISE forceDynamicCharacter :: SlimDynamicCharacter -> SlimDynamicCharacter #-} -{-# SPECIALISE forceDynamicCharacter :: WideDynamicCharacter -> WideDynamicCharacter #-} -{-# SPECIALISE forceDynamicCharacter :: HugeDynamicCharacter -> HugeDynamicCharacter #-} -forceDynamicCharacter :: Vector v e => OpenDynamicCharacter v e -> OpenDynamicCharacter v e -forceDynamicCharacter (lv,mv,rv) = ( GV.force lv, GV.force mv, GV.force rv ) - - --- | --- Extract the /ungapped/ medians of a dynamic character. -{-# INLINEABLE extractMedians #-} -{-# SPECIALISE extractMedians :: SlimDynamicCharacter -> SV.Vector SlimState #-} -{-# SPECIALISE extractMedians :: WideDynamicCharacter -> UV.Vector WideState #-} -{-# SPECIALISE extractMedians :: HugeDynamicCharacter -> V.Vector HugeState #-} -extractMedians :: (FiniteBits e, Vector v e) => OpenDynamicCharacter v e -> v e -extractMedians (_,me,_) - | GV.null me = me - | otherwise = - let gap = buildGap $ me ! 0 - in GV.filter (/=gap) me - - --- | --- Extract the left child's /ungapped/ medians used to construct the dynamic character. -{-# INLINEABLE extractMediansLeft #-} -{-# SPECIALISE extractMediansLeft :: SlimDynamicCharacter -> SV.Vector SlimState #-} -{-# SPECIALISE extractMediansLeft :: WideDynamicCharacter -> UV.Vector WideState #-} -{-# SPECIALISE extractMediansLeft :: HugeDynamicCharacter -> V.Vector HugeState #-} -extractMediansLeft :: (FiniteBits e, Vector v e) => OpenDynamicCharacter v e -> v e -extractMediansLeft (lc,_,_) - | GV.null lc = lc - | otherwise = - let nil = buildNil $ lc ! 0 - in GV.filter (/=nil) lc - - --- | --- Extract the right child's /ungapped/ medians used to construct the dynamic character. -{-# INLINEABLE extractMediansRight #-} -{-# SPECIALISE extractMediansRight :: SlimDynamicCharacter -> SV.Vector SlimState #-} -{-# SPECIALISE extractMediansRight :: WideDynamicCharacter -> UV.Vector WideState #-} -{-# SPECIALISE extractMediansRight :: HugeDynamicCharacter -> V.Vector HugeState #-} -extractMediansRight :: (FiniteBits e, Vector v e) => OpenDynamicCharacter v e -> v e -extractMediansRight (_,_,rc) - | GV.null rc = rc - | otherwise = - let nil = buildNil $ rc ! 0 - in GV.filter (/=nil) rc - - --- | --- Extract the /gapped/ medians of a dynamic character. -{-# INLINEABLE extractMediansGapped #-} -extractMediansGapped :: OpenDynamicCharacter v e -> v e -extractMediansGapped (_,me,_) = me - - --- | --- Extract the left child's /gapped/ medians used to construct the dynamic character. -{-# INLINEABLE extractMediansLeftGapped #-} -extractMediansLeftGapped :: OpenDynamicCharacter v e -> v e -extractMediansLeftGapped (lc,_,_) = lc - - --- | --- Extract the right child's /gapped/ medians used to construct the dynamic character. -{-# INLINEABLE extractMediansRightGapped #-} -extractMediansRightGapped :: OpenDynamicCharacter v e -> v e -extractMediansRightGapped (_,_,rc) = rc - - -{-# INLINEABLE encodeDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Foldable g, Ord s) => Alphabet s -> (Word -> SlimState) -> f (g s) -> SlimDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Foldable g, Ord s) => Alphabet s -> (Word -> WideState) -> f (g s) -> WideDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Foldable g, Ord s) => Alphabet s -> (Word -> HugeState) -> f (g s) -> HugeDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Ord s) => Alphabet s -> (Word -> SlimState) -> f (Set s) -> SlimDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Ord s) => Alphabet s -> (Word -> WideState) -> f (Set s) -> WideDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f, Ord s) => Alphabet s -> (Word -> HugeState) -> f (Set s) -> HugeDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f) => Alphabet String -> (Word -> SlimState) -> f (Set String) -> SlimDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f) => Alphabet String -> (Word -> WideState) -> f (Set String) -> WideDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: (Foldable f) => Alphabet String -> (Word -> HugeState) -> f (Set String) -> HugeDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Ord s => Alphabet s -> (Word -> SlimState) -> V.Vector (Set s) -> SlimDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Ord s => Alphabet s -> (Word -> WideState) -> V.Vector (Set s) -> WideDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Ord s => Alphabet s -> (Word -> HugeState) -> V.Vector (Set s) -> HugeDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Alphabet String -> (Word -> SlimState) -> V.Vector (Set String) -> SlimDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Alphabet String -> (Word -> WideState) -> V.Vector (Set String) -> WideDynamicCharacter #-} -{-# SPECIALISE encodeDynamicCharacter :: Alphabet String -> (Word -> HugeState) -> V.Vector (Set String) -> HugeDynamicCharacter #-} -encodeDynamicCharacter - :: ( Bits e - , Foldable f - , Foldable g - , Ord s - , Vector v e - ) - => Alphabet s -- ^ Alphabet of symbols - -> (Word -> e) -- ^ Constructor for an empty element, taking the alphabet size - -> f (g s) -- ^ Sequence of ambiguity groups of symbols - -> OpenDynamicCharacter v e -- ^ Encoded dynamic character -encodeDynamicCharacter alphabet f sequenceOfSymbols = dynamicCharater - where - len = length sequenceOfSymbols - - dynamicCharater = unsafeCharacterBuiltByST (toEnum len) $ \char -> do - iRef <- newSTRef 0 - - let writeElement symbols = - let encodedVal = encodeState alphabet f symbols - in do i <- readSTRef iRef - setAlign char i encodedVal encodedVal encodedVal - modifySTRef iRef succ - - traverse_ writeElement sequenceOfSymbols - - -{-# INLINEABLE newTempCharacter #-} -{-# SPECIALISE newTempCharacter :: Word -> ST s (TempSlimDynamicCharacter (ST s)) #-} -{-# SPECIALISE newTempCharacter :: Word -> ST s (TempWideDynamicCharacter (ST s)) #-} -{-# SPECIALISE newTempCharacter :: Word -> ST s (TempHugeDynamicCharacter (ST s)) #-} -newTempCharacter - :: ( Vector v e - , PrimMonad m - ) - => Word - -> m (TempOpenDynamicCharacter m v e) -newTempCharacter n = - let i = fromEnum n - in (,,) <$> unsafeNew i <*> unsafeNew i <*> unsafeNew i - - -{-# INLINEABLE freezeTempCharacter #-} -{-# SPECIALISE freezeTempCharacter :: TempSlimDynamicCharacter (ST s) -> ST s SlimDynamicCharacter #-} -{-# SPECIALISE freezeTempCharacter :: TempWideDynamicCharacter (ST s) -> ST s WideDynamicCharacter #-} -{-# SPECIALISE freezeTempCharacter :: TempHugeDynamicCharacter (ST s) -> ST s HugeDynamicCharacter #-} -freezeTempCharacter - :: ( PrimMonad m - , Vector v e - ) - => TempOpenDynamicCharacter m v e - -> m (OpenDynamicCharacter v e) -freezeTempCharacter (lc,mc,rc) = - let forcedFreeze = fmap GV.force . unsafeFreeze - in (,,) <$> forcedFreeze lc <*> forcedFreeze mc <*> forcedFreeze rc - - -{-# INLINEABLE generateCharacter #-} -{-# SPECIALISE generateCharacter :: Word -> (Word -> (# SlimState, SlimState, SlimState #)) -> SlimDynamicCharacter #-} -{-# SPECIALISE generateCharacter :: Word -> (Word -> (# WideState, WideState, WideState #)) -> WideDynamicCharacter #-} -{-# SPECIALISE generateCharacter :: Word -> (Word -> (# HugeState, HugeState, HugeState #)) -> HugeDynamicCharacter #-} -generateCharacter - :: Vector v e - => Word - -> (Word -> (# e, e, e #)) - -> OpenDynamicCharacter v e -generateCharacter n f = runST $ do - char <- newTempCharacter n - forM_ [ 0 .. n - 1] $ \i -> - let (# x, y, z #) = f i - in setAlign char (fromEnum i) x y z - freezeTempCharacter char - - -{-# INLINEABLE unsafeCharacterBuiltByST #-} -{-# SPECIALISE unsafeCharacterBuiltByST :: Word -> (forall s. TempSlimDynamicCharacter (ST s) -> ST s ()) -> SlimDynamicCharacter #-} -{-# SPECIALISE unsafeCharacterBuiltByST :: Word -> (forall s. TempWideDynamicCharacter (ST s) -> ST s ()) -> WideDynamicCharacter #-} -{-# SPECIALISE unsafeCharacterBuiltByST :: Word -> (forall s. TempHugeDynamicCharacter (ST s) -> ST s ()) -> HugeDynamicCharacter #-} -unsafeCharacterBuiltByST - :: Vector v e - => Word - -> (forall s. TempOpenDynamicCharacter (ST s) v e -> ST s ()) - -> OpenDynamicCharacter v e -unsafeCharacterBuiltByST n f = runST $ do - char <- newTempCharacter n - f char - freezeTempCharacter char - - --- | --- Allocated a buffer of specified size. --- Uses function to generate character and return the final size. --- Copies character from buffer to character of final size. -{-# INLINEABLE unsafeCharacterBuiltByBufferedST #-} -{-# SPECIALISE unsafeCharacterBuiltByBufferedST :: Word -> (forall s. TempSlimDynamicCharacter (ST s) -> ST s Word) -> SlimDynamicCharacter #-} -{-# SPECIALISE unsafeCharacterBuiltByBufferedST :: Word -> (forall s. TempWideDynamicCharacter (ST s) -> ST s Word) -> WideDynamicCharacter #-} -{-# SPECIALISE unsafeCharacterBuiltByBufferedST :: Word -> (forall s. TempHugeDynamicCharacter (ST s) -> ST s Word) -> HugeDynamicCharacter #-} -unsafeCharacterBuiltByBufferedST - :: Vector v e - => Word -- ^ Buffer length - -> (forall s. TempOpenDynamicCharacter (ST s) v e -> ST s Word) - -> OpenDynamicCharacter v e -unsafeCharacterBuiltByBufferedST b f = runST $ do - buff <- newTempCharacter b - char <- f buff >>= newTempCharacter - copyCharacter buff char - freezeTempCharacter char - where - copyCharacter src@(x,_,_) des@(y,_,_) = - let m = GMV.length x - n = GMV.length y - o = m - n - in forM_ [ 0 .. n - 1 ] $ copyAt src des o - - copyAt (slc,smc,src) (dlc,dmc,drc) o i = - let i' = o + i - in do unsafeRead slc i' >>= unsafeWrite dlc i - unsafeRead smc i' >>= unsafeWrite dmc i - unsafeRead src i' >>= unsafeWrite drc i - - -renderDynamicCharacter - :: ( FiniteBits e - , Show e - , Vector v e - ) - => OpenDynamicCharacter v e - -> String -renderDynamicCharacter (lc,mc,rc) = unlines - [ "Character Length: " <> show (GV.length mc) - , printVector lcStr - , printVector mcStr - , printVector rcStr - ] - where - show' x | popCount x > 0 = show x - | otherwise = [voidC] - voidC = '█' - lcStr = show' <$> GV.toList lc - mcStr = show' <$> GV.toList mc - rcStr = show' <$> GV.toList rc - eSize = length . maximumBy (comparing length) $ lcStr <> mcStr <> rcStr <> [[voidC]] - pad s = - let c | s == [voidC] = voidC - | otherwise = ' ' - in replicate (eSize - length s) c <> s - - intercalate' [] = [] - intercalate' [x] = x - intercalate' (x:xs) = - let sep = case x of - e:_ | e == voidC -> [voidC] - _ -> " " - in x <> sep <> intercalate' xs - - printVector vec = "[ " <> intercalate' (pad <$> vec) <> " ]" - - -buildGap :: Bits e => e -> e -buildGap e = buildNil e `setBit` gapIndex - - -buildNil :: Bits e => e -> e -buildNil e = e `xor` e - - -buildGapAndNil :: Bits e => e -> (# e, e #) -buildGapAndNil e = - let nil = buildNil e - gap = nil `setBit` gapIndex - in (# gap, nil #) diff --git a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/HandleGaps.hs b/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/HandleGaps.hs deleted file mode 100644 index b1fc1c6a2..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/HandleGaps.hs +++ /dev/null @@ -1,225 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Bio.DynamicCharacter.HandleGaps --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE Strict #-} - -module Bio.DynamicCharacter.HandleGaps - ( GapSet() - , nullGapSet - , deleteGaps - , insertGaps - ) where - -import Bio.DynamicCharacter -import Control.Monad (unless, when) -import Control.Monad.Loops (whileM_) -import Control.Monad.ST -import Data.Bits -import Data.Foldable -import Data.IntMap.Strict (IntMap, fromDistinctAscList, lookupMax, toAscList) -import Data.STRef -import Data.Semigroup -import Data.Vector.Generic (Vector, (!)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Generic.Mutable as MGV -import qualified Data.Vector.Unboxed.Mutable as MUV - - --- | --- A set of gaps extracted froma sequence to be re-inserted later. --- --- /NOTE:/ The 'GapSet' type is closed to ensure invaiants hold. -newtype GapSet = GapSet (IntMap Word) - deriving newtype (Eq, Show) - - --- | --- The 'GapSet' which contains no gaps. --- --- /NOTE:/ Useful for making comparisons. -nullGapSet :: GapSet -nullGapSet = GapSet mempty - - --- | --- Strips the gap elements from the supplied character. --- --- Remembers the locations of the gap characters that were deleted --- --- If the character contains /only/ gaps, a missing character is returned. -{-# INLINEABLE deleteGaps #-} -{-# SPECIALISE deleteGaps :: SlimDynamicCharacter -> (GapSet, SlimDynamicCharacter) #-} -{-# SPECIALISE deleteGaps :: WideDynamicCharacter -> (GapSet, WideDynamicCharacter) #-} -{-# SPECIALISE deleteGaps :: HugeDynamicCharacter -> (GapSet, HugeDynamicCharacter) #-} -deleteGaps - :: ( FiniteBits e - , Vector v e - ) - => OpenDynamicCharacter v e -- ^ Dynamic character - -> (GapSet, OpenDynamicCharacter v e) -deleteGaps c@(x,y,z) - | GV.null x = (noGaps, c) - | null gaps = (noGaps, c) - | newLen == 0 = (gapSet, missing) - | otherwise = (gapSet, newVector) - where - noGaps = GapSet mempty - missing = (GV.empty, GV.empty, GV.empty) - newVector = runST $ do - j <- newSTRef 0 - let isGapAtJ = isGap c <$> readSTRef j - - let g v = do - whileM_ isGapAtJ $ modifySTRef j succ - j' <- readSTRef j - modifySTRef j succ - pure $ v ! j' - - let genVec v = do - vec <- GV.generateM newLen (const (g v)) - writeSTRef j 0 - pure vec - - (,,) <$> genVec x <*> genVec y <*> genVec z - - gapCount = fromEnum . getSum $ foldMap Sum gaps - charLen = GV.length x - newLen = charLen - gapCount - - gapSet = GapSet gaps - gaps = fromDistinctAscList $ reverse refs - refs = runST $ do - nonGaps <- newSTRef 0 - prevGap <- newSTRef False - gapLen <- newSTRef 0 - gapRefs <- newSTRef [] - - let handleGapBefore op = do - gapBefore <- readSTRef prevGap - when gapBefore $ do - j <- readSTRef nonGaps - g <- readSTRef gapLen - modifySTRef gapRefs ( (j,g): ) - op - - for_ [0 .. charLen - 1] $ \i -> - if isGap c i - then modifySTRef gapLen succ *> writeSTRef prevGap True - else do handleGapBefore $ do - writeSTRef gapLen 0 - writeSTRef prevGap False - modifySTRef nonGaps succ - - handleGapBefore $ pure () - readSTRef gapRefs - - --- | --- Adds gaps elements to the supplied character. --- --- /NOTE:/ It is important to have the 'gap' state passed in as a parameter! --- There is the possibility that the alignment context is empty, but one or more --- gaps need to be added. We cannot construct a gap state from an empty input so --- the gap state must be externally supplied. -{-# INLINEABLE insertGaps #-} -{-# SPECIALISE insertGaps :: SlimState -> GapSet -> GapSet -> SlimDynamicCharacter -> SlimDynamicCharacter #-} -{-# SPECIALISE insertGaps :: WideState -> GapSet -> GapSet -> WideDynamicCharacter -> WideDynamicCharacter #-} -{-# SPECIALISE insertGaps :: HugeState -> GapSet -> GapSet -> HugeDynamicCharacter -> HugeDynamicCharacter #-} -insertGaps - :: ( FiniteBits e - , Vector v e - ) - => e -- ^ Gap State - -> GapSet -- ^ Removed gap references from "left" (shorter) dynamic character - -> GapSet -- ^ Removed gap references from "right" (larger) dynamic character - -> OpenDynamicCharacter v e -- ^ Alignment context to have gap references inserted - -> OpenDynamicCharacter v e -- ^ Fully gapped alignment context -insertGaps gap (GapSet lGaps) (GapSet rGaps) input@(x,y,z) - | null lGaps && null rGaps = input -- No work needed - | otherwise = - -- Use a Let/In binding here since the module is STRICT! - -- Prevents indexing errors on missing character input. - let nil = gap `xor` gap - totalGaps = fromEnum . getSum . foldMap Sum - gapVecLen = maybe 0 (succ . fst) . lookupMax - lGapCount = totalGaps lGaps - rGapCount = totalGaps rGaps - medLength = GV.length x - newLength = lGapCount + rGapCount + medLength - - ins = (gap, gap, nil) - del = (nil, gap, gap) - - in runST $ do --- xVec <- MGV.unsafeNew newLength --- yVec <- MGV.unsafeNew newLength --- zVec <- MGV.unsafeNew newLength - xVec <- MGV.replicate newLength zeroBits - yVec <- MGV.replicate newLength zeroBits - zVec <- MGV.replicate newLength zeroBits - lVec <- MUV.replicate (gapVecLen lGaps) 0 - rVec <- MUV.replicate (gapVecLen rGaps) 0 - lGap <- newSTRef 0 - mPtr <- newSTRef 0 - rGap <- newSTRef 0 - -- Write out to the mutable vectors - for_ (toAscList lGaps) $ uncurry (MUV.unsafeWrite lVec) - for_ (toAscList rGaps) $ uncurry (MUV.unsafeWrite rVec) - - let inc r = modifySTRef r succ - - let align i = do - m <- readSTRef mPtr - MGV.unsafeWrite xVec i $ x ! m - MGV.unsafeWrite yVec i $ y ! m - MGV.unsafeWrite zVec i $ z ! m - inc mPtr - -- Deletions leave a void in the left (shorter) character. - -- - -- This means we are "consuming" an element from the right (longer) character - -- Hence we increment the right gap reference, as we "progress" through the - -- right character sequence's elements in the alignment context. - when (isAlign input m || isDelete input m) $ - inc rGap - -- Similar logic as above, however, - -- Insertions leave a void in the right (longer) character. - when (isAlign input m || isInsert input m) $ - inc lGap - - let insertGapWith i (xe,ye,ze) gapRef gapVec = do - let len = MGV.length gapVec - rg <- readSTRef gapRef - v <- if rg >= len then pure 0 else MGV.unsafeRead gapVec rg - if v == 0 - then pure False - else do MGV.unsafeWrite xVec i xe - MGV.unsafeWrite yVec i ye - MGV.unsafeWrite zVec i ze - MGV.unsafeWrite gapVec rg $ v - 1 - pure True - - for_ [0 .. newLength - 1] $ \i -> do - written <- insertGapWith i ins lGap lVec - unless written $ do - written' <- insertGapWith i del rGap rVec - unless written' $ align i - - x' <- GV.unsafeFreeze xVec - y' <- GV.unsafeFreeze yVec - z' <- GV.unsafeFreeze zVec - pure (x', y', z') diff --git a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/Measure.hs b/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/Measure.hs deleted file mode 100644 index cca0e1de7..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/Bio/DynamicCharacter/Measure.hs +++ /dev/null @@ -1,118 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Bio.DynamicCharacter.Measure --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE Strict #-} - -module Bio.DynamicCharacter.Measure - ( measureCharacters - , measureCharactersWithoutGaps - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.HandleGaps -import Data.Bits -import Data.Ord -import Data.Vector.Generic (Vector, basicLength) - - --- | --- /O(1)/ for input characters of differing lengths --- --- /O(k)/ for input characters of equal length, where /k/ is the shared prefix of --- both characters. --- --- Returns the dynamic character that is shorter first, longer second, and notes --- whether or not the inputs were swapped to place the characters in this ordering. --- --- Handles equal length characters by considering the lexicographically larger --- character as longer. --- --- Handles equality of inputs by /not/ swapping. -{-# INLINEABLE measureCharacters #-} -{-# SPECIALISE measureCharacters :: SlimDynamicCharacter -> SlimDynamicCharacter -> (Ordering, SlimDynamicCharacter, SlimDynamicCharacter) #-} -{-# SPECIALISE measureCharacters :: WideDynamicCharacter -> WideDynamicCharacter -> (Ordering, WideDynamicCharacter, WideDynamicCharacter) #-} -{-# SPECIALISE measureCharacters :: HugeDynamicCharacter -> HugeDynamicCharacter -> (Ordering, HugeDynamicCharacter, HugeDynamicCharacter) #-} -measureCharacters - :: ( Ord (v e) - , Vector v e - ) - => OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Ordering, OpenDynamicCharacter v e, OpenDynamicCharacter v e) -measureCharacters lhs rhs - | lhsOrdering == GT = (lhsOrdering, rhs, lhs) - | otherwise = (lhsOrdering, lhs, rhs) - where - lhsMedians = extractMediansGapped lhs - rhsMedians = extractMediansGapped rhs - lhsOrdering = - -- First, compare inputs by length. - case comparing basicLength lhsMedians rhsMedians of - -- If the inputs are equal length, - -- Then compare by the (arbitrary) lexicographical ordering of the median states. - EQ -> case lhsMedians `compare` rhsMedians of - -- If the input median states have the same ordering, - -- Lastly, we compare by the lexicographic ordering of the "tagged triples." - -- - -- If they are equal after this step, - -- Then the inputs are representationally equal. - -- Actually, honest to goodness 100% equal! - EQ -> lhs `compare` rhs - v -> v - v -> v - - --- | --- /O(n)/ --- --- Considers the median values of the characters, ignores the left/right tagging. --- --- First remove the gaps from the input characters. --- --- If both "ungapped" inputs are empty, we measure the original "gapped" inputs to --- determine if the inputs need to be swapped. This is required to ensure comutativity --- of subsequent operations which use this method. --- --- Returns the "ungapped" dynamic character that is "shorter" first, "longer" second, --- the removed gap mappings (in the same order), and notes whether or not the inputs --- were swapped to place the characters in this ordering. --- --- Handles equal length characters by considering the lexicographically larger --- character as longer. --- --- Handles equality of inputs by /not/ swapping. -{-# INLINEABLE measureCharactersWithoutGaps #-} -{-# SPECIALISE measureCharactersWithoutGaps :: SlimDynamicCharacter -> SlimDynamicCharacter -> (Bool, GapSet, GapSet, SlimDynamicCharacter, SlimDynamicCharacter) #-} -{-# SPECIALISE measureCharactersWithoutGaps :: WideDynamicCharacter -> WideDynamicCharacter -> (Bool, GapSet, GapSet, WideDynamicCharacter, WideDynamicCharacter) #-} -{-# SPECIALISE measureCharactersWithoutGaps :: HugeDynamicCharacter -> HugeDynamicCharacter -> (Bool, GapSet, GapSet, HugeDynamicCharacter, HugeDynamicCharacter) #-} -measureCharactersWithoutGaps - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => OpenDynamicCharacter v e -- ^ First dynamic character - -> OpenDynamicCharacter v e -- ^ Second dynamic character - -> (Bool, GapSet, GapSet, OpenDynamicCharacter v e, OpenDynamicCharacter v e) -measureCharactersWithoutGaps char1 char2 - | swapInputs = (True , gapsChar2, gapsChar1, ungappedChar2, ungappedChar1) - | otherwise = (False, gapsChar1, gapsChar2, ungappedChar1, ungappedChar2) - where - swapInputs = measure == GT - (gapsChar1, ungappedChar1) = deleteGaps char1 - (gapsChar2, ungappedChar2) = deleteGaps char2 - (measure, _, _) = - case measureCharacters ungappedChar1 ungappedChar2 of - (EQ,_,_) -> measureCharacters char1 char2 - x -> x diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise.hs deleted file mode 100644 index c4c956d02..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise.hs +++ /dev/null @@ -1,34 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Pairwise direct optimization alignment functions using a variety of techniques. --- ------------------------------------------------------------------------------ - -module DirectOptimization.Pairwise - ( - -- * Slim characters - SlimDynamicCharacter - , SlimState - , slimPairwiseDO - -- * Wide characters - , WideDynamicCharacter - , WideState - , widePairwiseDO - -- * Huge characters - , HugeDynamicCharacter - , HugeState - , hugePairwiseDO - ) where - -import DirectOptimization.Pairwise.Huge -import DirectOptimization.Pairwise.Slim -import DirectOptimization.Pairwise.Wide - diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs deleted file mode 100644 index 573d59478..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs +++ /dev/null @@ -1,194 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Direction --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -module DirectOptimization.Pairwise.Direction - ( Direction(..) - -- * Querying - , minimumCostDirection - -- * Rendering - , boldDirection - ) where - -import Data.Int -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Generic.Mutable as MGV -import qualified Data.Vector.Primitive as PV -import qualified Data.Vector.Unboxed as UV -import Data.Word - - --- | --- Which direction to align the character at a given matrix point. --- --- It should be noted that the ordering of the three arrow types are important, --- as it guarantees that the derived 'Ord' instance will have the following --- property: --- --- DiagArrow < LeftArrow < UpArrow --- --- This means: --- --- - DiagArrow has highest precedence when one or more costs are equal --- --- - LeftArrow has second highest precedence when one or more costs are equal --- --- - UpArrow has lowest precedence when one or more costs are equal --- --- Using this 'Ord' instance, we can resolve ambiguous transformations in a --- deterministic way. Without loss of generality in determining the ordering, --- we choose the same biasing as the C code called from the FFI for consistency. -data Direction = DiagArrow | LeftArrow | UpArrow - deriving stock (Eq, Ord) - - -newtype instance UV.MVector s Direction = MV_Direction (PV.MVector s Word8) - - -newtype instance UV.Vector Direction = V_Direction (PV.Vector Word8) - - -instance UV.Unbox Direction - - -instance MGV.MVector UV.MVector Direction where - - {-# INLINE basicLength #-} - basicLength (MV_Direction v) = MGV.basicLength v - - {-# INLINE basicUnsafeSlice #-} - basicUnsafeSlice i n (MV_Direction v) = MV_Direction $ MGV.basicUnsafeSlice i n v - - {-# INLINE basicOverlaps #-} - basicOverlaps (MV_Direction v1) (MV_Direction v2) = MGV.basicOverlaps v1 v2 - - {-# INLINE basicUnsafeNew #-} - basicUnsafeNew n = MV_Direction <$> MGV.basicUnsafeNew n - - {-# INLINE basicInitialize #-} - basicInitialize (MV_Direction v) = MGV.basicInitialize v - - {-# INLINE basicUnsafeReplicate #-} - basicUnsafeReplicate n x = MV_Direction <$> MGV.basicUnsafeReplicate n (fromDirection x) - - {-# INLINE basicUnsafeRead #-} - basicUnsafeRead (MV_Direction v) i = toDirection <$> MGV.basicUnsafeRead v i - - {-# INLINE basicUnsafeWrite #-} - basicUnsafeWrite (MV_Direction v) i x = MGV.basicUnsafeWrite v i (fromDirection x) - - {-# INLINE basicClear #-} - basicClear (MV_Direction v) = MGV.basicClear v - - {-# INLINE basicSet #-} - basicSet (MV_Direction v) x = MGV.basicSet v (fromDirection x) - - {-# INLINE basicUnsafeCopy #-} - basicUnsafeCopy (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeCopy v1 v2 - - basicUnsafeMove (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeMove v1 v2 - - {-# INLINE basicUnsafeGrow #-} - basicUnsafeGrow (MV_Direction v) n = MV_Direction <$> MGV.basicUnsafeGrow v n - - -instance GV.Vector UV.Vector Direction where - - {-# INLINE basicUnsafeFreeze #-} - basicUnsafeFreeze (MV_Direction v) = V_Direction <$> GV.basicUnsafeFreeze v - - {-# INLINE basicUnsafeThaw #-} - basicUnsafeThaw (V_Direction v) = MV_Direction <$> GV.basicUnsafeThaw v - - {-# INLINE basicLength #-} - basicLength (V_Direction v) = GV.basicLength v - - {-# INLINE basicUnsafeSlice #-} - basicUnsafeSlice i n (V_Direction v) = V_Direction $ GV.basicUnsafeSlice i n v - - {-# INLINE basicUnsafeIndexM #-} - basicUnsafeIndexM (V_Direction v) i = toDirection <$> GV.basicUnsafeIndexM v i - - basicUnsafeCopy (MV_Direction mv) (V_Direction v) = GV.basicUnsafeCopy mv v - - {-# INLINE elemseq #-} - elemseq _ = seq - - -instance Show Direction where - - show DiagArrow = "↖" - show LeftArrow = "←" - show UpArrow = "↑" - - -{-# INLINEABLE boldDirection #-} -boldDirection :: Char -> Char -boldDirection '↖' = '⇖' -boldDirection '←' = '⇐' -boldDirection '↑' = '⇑' -boldDirection d = d - - --- | --- Given the cost of deletion, alignment, and insertion (respectively), selects --- the least costly direction. In the case of one or more equal costs, the --- direction arrows are returned in the following descending order of priority: --- --- [ DiagArrow, LeftArrow, UpArrow ] --- -{-# INLINEABLE minimumCostDirection #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int -> Int -> Int -> (# Int , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int8 -> Int8 -> Int8 -> (# Int8 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int16 -> Int16 -> Int16 -> (# Int16 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int32 -> Int32 -> Int32 -> (# Int32 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int64 -> Int64 -> Int64 -> (# Int64 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word -> Word -> Word -> (# Word , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word8 -> Word8 -> Word8 -> (# Word8 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word16 -> Word16 -> Word16 -> (# Word16, Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word32 -> Word32 -> Word32 -> (# Word32, Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word64 -> Word64 -> Word64 -> (# Word64, Direction #) #-} -minimumCostDirection - :: Ord e - => e - -> e - -> e - -> (# e, Direction #) -minimumCostDirection delCost alnCost insCost - | alnCost <= delCost = if alnCost <= insCost - then (# alnCost, DiagArrow #) - else (# insCost, UpArrow #) - | delCost <= insCost = (# delCost, LeftArrow #) - | otherwise = (# insCost, UpArrow #) - - -{-# INLINE fromDirection #-} -fromDirection :: Direction -> Word8 -fromDirection DiagArrow = 0 -fromDirection LeftArrow = 1 -fromDirection UpArrow = 2 - - -{-# INLINE toDirection #-} -toDirection :: Word8 -> Direction -toDirection 0 = DiagArrow -toDirection 1 = LeftArrow -toDirection _ = UpArrow diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs deleted file mode 100644 index d369faebd..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs +++ /dev/null @@ -1,17 +0,0 @@ -module DirectOptimization.Pairwise.Huge - ( HugeDynamicCharacter - , HugeState - , hugePairwiseDO - ) where - -import Bio.DynamicCharacter -import DirectOptimization.Pairwise.Ukkonen - - -hugePairwiseDO - :: Word - -> (HugeState -> HugeState -> (HugeState, Word)) - -> HugeDynamicCharacter - -> HugeDynamicCharacter - -> (Word, HugeDynamicCharacter) -hugePairwiseDO = ukkonenDO diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs deleted file mode 100644 index 9a229e98a..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs +++ /dev/null @@ -1,215 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Internal --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE Strict #-} - -module DirectOptimization.Pairwise.Internal - ( -- * Alignment types - Direction(..) - , TCMλ - -- * Alignment feneric functions - , directOptimization - , directOptimizationFromDirectionMatrix - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.HandleGaps -import Bio.DynamicCharacter.Measure -import Control.Applicative -import Control.Monad (join) -import Control.Monad.Loops (whileM_) -import Data.Bits -import Data.Matrix.Class (Matrix, dim, unsafeIndex) -import qualified Data.Matrix.Unboxed as UM -import Data.STRef -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!), (!?)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import DirectOptimization.Pairwise.Direction - - --- | --- A generalized function representation: the "overlap" between dynamic character --- elements, supplying the corresponding median and cost to align the two --- characters. -type TCMλ e = e -> e -> (e, Word) - - -{-# SCC directOptimization #-} -{-# INLINEABLE directOptimization #-} -{-# SPECIALISE directOptimization :: (SV.Vector SlimState -> SV.Vector SlimState -> (Word, SlimDynamicCharacter)) -> (SlimState -> SlimState -> (SlimState, Word)) -> SlimDynamicCharacter -> SlimDynamicCharacter -> (Word, SlimDynamicCharacter) #-} -{-# SPECIALISE directOptimization :: (UV.Vector WideState -> UV.Vector WideState -> (Word, WideDynamicCharacter)) -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE directOptimization :: ( V.Vector HugeState -> V.Vector HugeState -> (Word, HugeDynamicCharacter)) -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} -directOptimization - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => (v e -> v e -> (Word, OpenDynamicCharacter v e)) -- ^ Alignment function - -> TCMλ e -- ^ Metric for computing state distance and median state - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) -directOptimization alignmentFunction overlapλ = handleMissing generateAlignmentResult - where - generateAlignmentResult lhs rhs = - let -- Build a 'gap' state now that we know that we can access a non-empty sequence. - gap = let tmp = extractMediansGapped rhs ! 0 in (tmp `xor` tmp) `setBit` 0 - -- Remove gaps from the inputs and measure the results to determine - -- which ungapped character is longer and which is shorter. - -- Always pass the shorter character into alignment functions first! - ~(swapped, gapsLesser, gapsLonger, lesser, longer) = measureCharactersWithoutGaps lhs rhs - lesserMeds = extractMediansGapped lesser - longerMeds = extractMediansGapped longer - ~(alignmentCost, ungappedAlignment) = - if GV.length lesserMeds == 0 - -- Neither character was Missing, but one or both are empty when gaps are removed - then alignmentWithAllGaps overlapλ longerMeds - -- Both have some non-gap elements, perform string alignment - else alignmentFunction lesserMeds longerMeds - regappedAlignment = insertGaps gap gapsLesser gapsLonger ungappedAlignment - transformation = if swapped then transposeCharacter else id - alignmentContext = transformation regappedAlignment - in (alignmentCost, forceDynamicCharacter alignmentContext) - - -{-# SCC directOptimizationFromDirectionMatrix #-} -{-# INLINEABLE directOptimizationFromDirectionMatrix #-} -{-# SPECIALISE directOptimizationFromDirectionMatrix :: (WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, UM.Matrix Direction)) -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE directOptimizationFromDirectionMatrix :: (HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, UM.Matrix Direction)) -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} -directOptimizationFromDirectionMatrix - :: ( FiniteBits e - , Matrix m t Direction - , Ord (v e) - , Vector v e - ) - => (e -> TCMλ e -> v e -> v e -> (Word, m t Direction)) -- ^ Alignment matrix generator function - -> TCMλ e -- ^ Metric for computing state distance and median state - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) -directOptimizationFromDirectionMatrix matrixGenerator overlapλ = - handleMissing $ directOptimization alignmentFunction overlapλ - where - alignmentFunction lhs rhs = - let gap = let tmp = rhs ! 0 in (tmp `xor` tmp) `setBit` 0 - (cost, traversalMatrix) = matrixGenerator gap overlapλ lhs rhs - in (cost, traceback gap overlapλ traversalMatrix lhs rhs) - - -{-# SCC traceback #-} -{-# INLINEABLE traceback #-} -{-# SPECIALISE traceback :: WideState -> (WideState -> WideState -> (WideState, Word)) -> UM.Matrix Direction -> UV.Vector WideState -> UV.Vector WideState -> WideDynamicCharacter #-} -{-# SPECIALISE traceback :: HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> UM.Matrix Direction -> V.Vector HugeState -> V.Vector HugeState -> HugeDynamicCharacter #-} -traceback - :: ( Bits e - , Matrix m t Direction - , Vector v e - ) - => e - -> TCMλ e - -> m t Direction - -> v e -- ^ Shorter dynamic character related to the "left column" - -> v e -- ^ Longer dynamic character related to the "top row" - -> OpenDynamicCharacter v e -- ^ Resulting dynamic character alignment context -traceback gap overlapλ directionMatrix lesser longer = forceDynamicCharacter alignment - where - f x y = fst $ overlapλ x y - getDirection = curry $ unsafeIndex directionMatrix - -- The maximum size the alignment could be - bufferLength = toEnum $ GV.length lesser + GV.length longer - - -- Construct the aligned dynamic character by using a buffer of it's maximum - -- possible length. Computet the length will performing the traceback. Then - -- after the traceback is performed, copy from the buffer to create a dynamic - -- character of the correct size. - -- - -- NOTE: The buffer creation, copying, and freeing are all handled by the - -- 'unsafeCharacterBuiltByBufferedST' function. - alignment = unsafeCharacterBuiltByBufferedST bufferLength $ \a -> do - let (m,n) = dim directionMatrix - iR <- newSTRef $ m - 1 - jR <- newSTRef $ n - 1 - kR <- newSTRef $ fromEnum bufferLength - - -- Set up convenience methods for accessing character elements - let getElement char ref = (char !) <$> (modifySTRef ref pred *> readSTRef ref) - let getLesserElement = getElement lesser iR - let getLongerElement = getElement longer jR - - -- Set up convenience methods for setting alignment elements on traceback - let setElementAt = modifySTRef kR pred *> readSTRef kR - let delete re = setElementAt >>= \k -> setDelete a k (f gap re) re - let align le re = setElementAt >>= \k -> setAlign a k le (f le re) re - let insert le = setElementAt >>= \k -> setInsert a k le (f le gap) - - -- Determine when to break the alignment loop - let continueAlignment = - let notAtOrigin i j = i /= 0 || j /= 0 - in liftA2 notAtOrigin (readSTRef iR) $ readSTRef jR - - -- Perform traceback - whileM_ continueAlignment $ do - arrow <- liftA2 getDirection (readSTRef iR) $ readSTRef jR - case arrow of - LeftArrow -> getLongerElement >>= delete - DiagArrow -> join $ liftA2 align getLesserElement getLongerElement - UpArrow -> getLesserElement >>= insert - - -- Return the actual alignment length - k <- readSTRef kR - pure $ bufferLength - toEnum k - - -{-# SCC alignmentWithAllGaps #-} -{-# INLINEABLE alignmentWithAllGaps #-} -alignmentWithAllGaps - :: ( Bits e - , Vector v e - ) - => TCMλ e - -> v e - -> (Word, OpenDynamicCharacter v e) -alignmentWithAllGaps overlapλ character = - case character !? 0 of - -- Neither character was Missing, but both are empty when gaps are removed - Nothing -> (0, (GV.empty, GV.empty, GV.empty)) - -- Neither character was Missing, but one of them is empty when gaps are removed - Just e -> - let len = GV.length character - nil = e `xor` e - gap = nil`setBit` 0 - zed = GV.replicate len nil - med = GV.generate len $ fst . overlapλ gap . (character !) - in (0, (zed, med, character)) - - -{-# SCC handleMissing #-} -handleMissing - :: Vector v e - => (OpenDynamicCharacter v e -> OpenDynamicCharacter v e -> (Word, OpenDynamicCharacter v e)) - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) -handleMissing f lhs rhs = - case (isMissing lhs, isMissing rhs) of - (True , True ) -> (0, lhs) - (True , False) -> (0, rhs) - (False, True ) -> (0, lhs) - (False, False) -> f lhs rhs diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs deleted file mode 100644 index 3fe46a2f5..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs +++ /dev/null @@ -1,16 +0,0 @@ -module DirectOptimization.Pairwise.Slim - ( SlimDynamicCharacter - , SlimState - , slimPairwiseDO - ) where - -import Bio.DynamicCharacter -import DirectOptimization.Pairwise.Slim.FFI - - -slimPairwiseDO - :: DenseTransitionCostMatrix - -> SlimDynamicCharacter - -> SlimDynamicCharacter - -> (Word, SlimDynamicCharacter) -slimPairwiseDO = smallAlphabetPairwiseDO diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs deleted file mode 100644 index a02010311..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs +++ /dev/null @@ -1,138 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Swapping --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -module DirectOptimization.Pairwise.Swapping - ( Direction() - , swappingDO - , buildDirectionMatrix - , minimumCostDirection - ) where - -import Bio.DynamicCharacter -import Control.Monad.ST -import Data.Bits -import Data.Foldable -import Data.Matrix.Unboxed (Matrix, unsafeFreeze) -import qualified Data.Matrix.Unboxed.Mutable as M -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Unboxed as UV -import qualified Data.Vector.Unboxed.Mutable as MUV -import DirectOptimization.Pairwise.Direction -import DirectOptimization.Pairwise.Internal - - --- | --- Performs a naive direct optimization. --- Takes in two characters to run DO on and an overlap function --- Returns an assignment character, the cost of that assignment, the assignment --- character with gaps included, the aligned version of the first input character, --- and the aligned version of the second input character. The process for this --- algorithm is to generate a traversal matrix, then perform a traceback. -{-# SCC swappingDO #-} -{-# INLINEABLE swappingDO #-} -{-# SPECIALISE swappingDO :: (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE swappingDO :: (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} -swappingDO - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => TCMλ e - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) -swappingDO = directOptimizationFromDirectionMatrix buildDirectionMatrix - - -{-# SCC buildDirectionMatrix #-} -{-# INLINEABLE buildDirectionMatrix #-} -{-# SPECIALISE buildDirectionMatrix :: WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, Matrix Direction) #-} -{-# SPECIALISE buildDirectionMatrix :: HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, Matrix Direction) #-} -buildDirectionMatrix - :: Vector v e - => e -- ^ Gap state - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character related to the "left column" - -> v e -- ^ Longer dynamic character related to the "top row" - -> (Word, Matrix Direction) -buildDirectionMatrix gap tcmλ lesserLeft longerTop = fullMatrix - where - costλ x y = snd $ tcmλ x y - rows = GV.length lesserLeft + 1 - cols = GV.length longerTop + 1 - - fullMatrix = runST $ do - mDir <- M.new (rows, cols) - vOne <- MUV.new cols - vTwo <- MUV.new cols - - let write v p@(~(_,j)) c d = MUV.unsafeWrite v j c *> M.unsafeWrite mDir p d - - write vOne (0,0) 0 DiagArrow - - -- Special case the first row - -- We need to ensure that there are only Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and upward values are "out of bounds." - for_ [1 .. cols - 1] $ \j -> - let topElement = longerTop ! (j - 1) - firstCellCost = costλ gap topElement - in do firstPrevCost <- MUV.unsafeRead vOne (j - 1) - write vOne (0,j) (firstCellCost + firstPrevCost) LeftArrow - - for_ [1 .. rows - 1] $ \i -> - let (prev, curr) - | odd i = (vOne, vTwo) - | otherwise = (vTwo, vOne) - leftElement = lesserLeft ! (i - 1) - -- Special case the first cell of each row - -- We need to ensure that there are only Up Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and leftward values are "out of bounds." - firstCellCost = costλ leftElement gap - in do firstPrevCost <- MUV.unsafeRead prev 0 - write curr (i,0) (firstCellCost + firstPrevCost) UpArrow - -- Finish special case for first cell of each row - -- Begin processing all other cells in the curr vector - for_ [1 .. cols - 1] $ \j -> - let topElement = longerTop ! (j - 1) - deleteCost = costλ gap topElement - alignCost = costλ leftElement topElement - insertCost = costλ leftElement gap - in do diagCost <- MUV.unsafeRead prev $ j - 1 - topCost <- MUV.unsafeRead prev j - leftCost <- MUV.unsafeRead curr $ j - 1 - let (# c, d #) = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write curr (i,j) c d - - let v | odd rows = vOne - | otherwise = vTwo - - c <- MUV.unsafeRead v (cols - 1) - m <- unsafeFreeze mDir - pure (c, m) - diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs deleted file mode 100644 index a702c7493..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs +++ /dev/null @@ -1,799 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Ukkonen --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -module DirectOptimization.Pairwise.Ukkonen - ( Direction() - , ukkonenDO - , createUkkonenMethodMatrix - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.Measure -import Control.Monad (unless, when) -import Control.Monad.Loops (iterateUntilM, whileM_) -import Control.Monad.Primitive -import Control.Monad.ST -import Data.Bits -import Data.Foldable -import Data.Matrix.Unboxed (Matrix, unsafeFreeze) -import Data.Matrix.Unboxed.Mutable (MMatrix) -import qualified Data.Matrix.Unboxed.Mutable as M -import Data.STRef -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Unboxed as UV -import DirectOptimization.Pairwise.Internal -import DirectOptimization.Pairwise.Swapping - - --- | --- Performs a naive direct optimization. --- Takes in two characters to run DO on and an overlap function --- Returns an assignment character, the cost of that assignment, the assignment --- character with gaps included, the aligned version of the first input character, --- and the aligned version of the second input character. The process for this --- algorithm is to generate a traversal matrix, then perform a traceback. -{-# SCC ukkonenDO #-} -{-# INLINEABLE ukkonenDO #-} -{-# SPECIALISE ukkonenDO :: Word -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE ukkonenDO :: Word -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} -ukkonenDO - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => Word -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> OpenDynamicCharacter v e -- ^ /1st/ dynamic character - -> OpenDynamicCharacter v e -- ^ /2nd/ dynamic character - -> (Word, OpenDynamicCharacter v e) -ukkonenDO coefficient tcmλ char1 char2 - | noGainFromUkkonenMethod = buildFullMatrix - | otherwise = buildBandMatrix - where - buildFullMatrix = swappingDO tcmλ char1 char2 - buildBandMatrix = directOptimizationFromDirectionMatrix ukkonenBandλ tcmλ char1 char2 - - ukkonenBandλ = createUkkonenMethodMatrix coefficient inputGapAmbiguities - - ~(_,_,_,x,y) = measureCharactersWithoutGaps char1 char2 - - lesser = extractMediansGapped x - longer = extractMediansGapped y - - -- /O(1)/ - -- - -- If the longer character is 50% larger than the shorter character, then - -- there is no point in using the barriers. Rather, we fill the full matrix - -- immediately. - -- - -- Additionally, if the shorter sequence is of length 4 or less, then the - -- initial barrier will be set adjacent to or beyond the lower left and - -- upper right corners. - -- - -- Also, a threshold coefficient is computed as the minimal indel cost from - -- any symbol in the alphabet to gap. However, if the indel cost for any - -- symbol is zero, the algorithm will hang, and a naive approach must be taken. - -- - -- Lastly, if the sum of the gaps in both strings is equal to or exceeds the - -- length of the longer string, then the threshold criteria will never be met - -- by definition. - -- - -- Do not perform Ukkonen's algorithm if and only if: - -- - -- > longerLen >= 1.5 * lesserLen - -- OR - -- > lesserLen <= 4 - -- OR - -- > coefficient == 0 - -- - noGainFromUkkonenMethod = lesserLen <= 4 - || 2 * longerLen >= 3 * lesserLen - || coefficient == 0 - || inputGapAmbiguities >= longerLen - where - longerLen = vLength longer - lesserLen = vLength lesser - - -- /O(n + m)/ - -- - -- NOTE: There will be no *unambiguous* gap elements in the dynamic characters! - -- However, there may be abiguous elements which contain gap as a possibility. - -- - -- If one or more of the character elements contained a gap, diagonal - -- directions in the matrix have an "indel" cost. 'gapsPresentInInputs' is - -- necessary in order to decrement the threshold value to account for this. - -- This was not described in Ukkonen's original paper, as the inputs were assumed - -- not to contain any gaps. - inputGapAmbiguities = char1Gaps + char2Gaps - where - char1Gaps = countGaps lesser - char2Gaps = countGaps longer - countGaps = vLength . GV.filter maybeGap - maybeGap = (`testBit` 0) -- Zero is the gap bit! - - vLength = toEnum . GV.length - - --- | --- /O( (n - m + 1 ) * log(n - m + 1) )/, /n/ >= /m/ --- --- Generates an /optimal/, partially-filled-in matrix using Ukkonen's string --- edit distance algorithm. --- --- Note that the threshold value is lowered more than described in Ukkonen's --- paper. This is to handle input elements that contain a gap. In Ukkonen's --- original description of the algorithm, there was a subtle assumption that --- input did not contain any gap symbols. -{-# SCC createUkkonenMethodMatrix #-} -{-# INLINEABLE createUkkonenMethodMatrix #-} -{-# SPECIALISE createUkkonenMethodMatrix :: Word -> Word -> WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, Matrix Direction) #-} -{-# SPECIALISE createUkkonenMethodMatrix :: Word -> Word -> HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, Matrix Direction) #-} -createUkkonenMethodMatrix - :: Vector v e - => Word -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap - -> Word -- ^ Number of abiguous elements in both inputs which contained gap as a possible state - -> e -- ^ Gap State - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> (Word, Matrix Direction) -createUkkonenMethodMatrix minimumIndelCost inputGapAmbiguities gap tcmλ lesserLeft longerTop = finalMatrix - where - -- General values that need to be in scope for the recursive computations. - lesserLen = GV.length lesserLeft - longerLen = GV.length longerTop - - -- We start the offset at four rather than at one so that the first doubling - -- isn't trivially small. - startOffset = 2 - - -- /O(1)/ - -- - -- Necessary to compute the width of a row in the barrier-constrained matrix. - quasiDiagonalWidth = toEnum $ differenceInLength + 1 - where - differenceInLength = longerLen - lesserLen - - extra = (inputGapAmbiguities +) - - finalMatrix = runST $ do - (mCost, mDir) <- buildInitialBandedMatrix gap tcmλ lesserLeft longerTop $ extra startOffset - let getAlignmentCost = M.unsafeRead mCost (lesserLen, longerLen) - offsetRef <- newSTRef startOffset - - let needToResizeBand = do - offset <- readSTRef offsetRef - -- If the filled row width exceeds the actual row length, - -- Then clearly we are done as we have filled the entire matrix. - if quasiDiagonalWidth + extra offset > toEnum longerLen - then pure False - else let partialWidth = quasiDiagonalWidth + offset - -- Value that the alignment cost must be less than - threshold -- The threshold value must be non-negative - | partialWidth <= inputGapAmbiguities = 0 - | otherwise = minimumIndelCost * (partialWidth - inputGapAmbiguities) - in (threshold <=) <$> getAlignmentCost - - whileM_ needToResizeBand $ do - previousOffset <- readSTRef offsetRef - let currentOffset = previousOffset `shiftL` 1 -- Multiply by 2 - writeSTRef offsetRef currentOffset - expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir - (extra previousOffset) - (extra currentOffset) - - c <- getAlignmentCost - m <- unsafeFreeze mDir - pure (c, m) - - -{-# SCC buildInitialBandedMatrix #-} -buildInitialBandedMatrix - :: Vector v e - => e -- ^ Gap - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> Word - -> ST s (MMatrix s Word, MMatrix s Direction) -buildInitialBandedMatrix gap tcmλ lesserLeft longerTop o = fullMatrix - where - (offset, costλ, rows, cols, width, quasiDiagonalWidth) = ukkonenConstants tcmλ lesserLeft longerTop o - - fullMatrix = do - - --------------------------------------- - -- Allocate required space -- - --------------------------------------- - - mCost <- M.new (rows, cols) - mDir <- M.new (rows, cols) - - --------------------------------------- - -- Define some generalized functions -- - --------------------------------------- - let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = - edgeCellDefinitions gap costλ longerTop mCost mDir - - -- Define how to compute values to an entire row of the Ukkonen matrix. - let writeRow i = - -- Precompute some values that will be used for the whole row - let start = max 0 $ i - offset - stop = min (cols - 1) $ i - offset + width - 1 - leftElement = lesserLeft ! (i - 1) - insertCost = costλ leftElement gap - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the left end of the matrix for the first - -- several rows of the matrix. How many rows, though? - -- There are exactly 'offset' number of cells left of first matrix column - -- in the first row. The number of cells to the left of the matrix - -- decreases by one in each subsequent row. The means that the first - -- row, and then the next 'offset' number of rows require special handling - -- of the boundary. The last row index requiring special handling is index - -- 'offset'. Subsequent rows have the band begin at least one cell away - -- from the matrix boundary. - firstCell - | i <= offset = leftColumn - | otherwise = leftBoundary - - lastCell - | i <= cols - quasiDiagonalWidth - offset = rightBoundary - | otherwise = rightColumn - in do -- Write to the first cell of the Ukkonen band - firstCell leftElement insertCost i start - -- Write to the all the intermediary cells of the Ukkonen band - for_ [start + 1 .. stop - 1] $ \j -> - internalCell leftElement insertCost i j - -- Write to the last cell of the Ukkonen band - lastCell leftElement insertCost i stop - - --------------------------------------- - -- Compute all values of the matrix -- - --------------------------------------- - - -- Write to the origin to seed the first row. - write (0, 0) (# 0, DiagArrow #) - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the end of the matrix for the first & last - -- rows of the matrix. We subtract the 'offest' from the 'width' because - -- there are exactly 'offset' number of cells left of first matrix column. - -- Hence the top row's width is 'width' minus 'offset'. The last cell index - -- in the top row of the band is 'width' minus 'offset' minus 1. - let topRowWidth = width - offset - let topRowWrite !j !cost = write (0,j) (# cost, LeftArrow #) - - -- Write the first row to seed subsequent rows. - for_ [1 .. min (cols - 1) (topRowWidth - 1)] $ \j -> - let topElement = longerTop ! (j - 1) - firstCellCost = costλ gap topElement - in do firstPrevCost <- readCost 0 $ j - 1 - topRowWrite j $ firstCellCost + firstPrevCost - - -- Loop through the remaining rows. - for_ [1 .. rows - 1] writeRow - - -- Return the matricies for possible expansion - pure (mCost, mDir) - - -{-# SCC expandBandedMatrix #-} --- | --- Given a partially computed alignment matrix, --- will expand the computed region to the new specified offset. --- --- --- Dimensions: 13 ⨉ 17 --- ⊗ ┃ ⁎ α1 α2 α3 α4 α5 α6 α7 α8 α9 α0 α1 α2 α3 α4 α5 α6 --- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ --- ⁎ ┃ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← --- α1 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α2 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α3 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0↖ --- α4 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0← 0↖ 0← 0↖ 0← 0← 0← 0← 0← 0← 0← 0← --- α5 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← --- α6 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α7 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α8 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0← 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α9 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α0 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α1 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α2 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- --- ┌───────────────w───────────────┐ --- │ ┏━━━━━━━co━━━━━━━┪ --- ┢━━━━━qd━━━━━━┓┠─po─┐┌────Δo────┨ --- ⊗ ┃ ┃0 1 2 3 4┃┃5 6││7 8 9 10┃11 12 13 14 15 16 --- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ --- 0 ┃ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 1 ┃ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 2 ┃ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 3 ┃ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 4 ┃ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 5 ┃ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ --- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ --- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ --- 0 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ --- 1 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ --- 2 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ --- --- --- w : Width --- qd : Quasi-diagonal --- co : Current Offset --- po : Previous Offset --- Δo : Difference in Offset --- --- Note: --- w = qd + co --- co = po + Δo --- --- And often: --- co = 2*po = 2*Δo --- --- ██ : The core band --- * Previously computed, sections may need to be recomputed --- --- ▓▓ : The previous extension --- * Previously computed, sections may need to be recomputed --- --- ▒▒ : The new extension --- * Needs to be computed --- -expandBandedMatrix - :: Vector v e - => e -- ^ Gap state - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> MMatrix s Word - -> MMatrix s Direction - -> Word - -> Word - -> ST s () -expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir po co = updatedBand - where - (offset, costλ, rows, cols, width, qd) = ukkonenConstants tcmλ lesserLeft longerTop co - prevOffset = fromEnum po - - updatedBand = do - - --------------------------------------- - -- Allocate mutable state variables -- - --------------------------------------- - - tailStart <- newSTRef cols - - t0' <- newSTRef (-1) - t1' <- newSTRef $ qd + fromEnum po - - --------------------------------------- - -- Define some generalized functions -- - --------------------------------------- - let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = - edgeCellDefinitions gap costλ longerTop mCost mDir - - let computeCell !leftElement !insertCost !i !j = {-# SCC recomputeCell #-} - let !topElement = longerTop ! (j - 1) - !deleteCost = costλ gap topElement - !alignCost = costλ leftElement topElement - in do - diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - leftCost <- readCost i $ j - 1 - oldCost <- readCost i j - let !e@(# c, _ #) = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) e - pure (c == oldCost, j+1) --- pure (c /= oldCost, j+1) - - let recomputeRange leftElement insertCost i x y = do - lastDiff <- newSTRef 0 - for_ [x .. y] $ \j -> do - (same, _) <- computeCell leftElement insertCost i j - unless same $ writeSTRef lastDiff j - readSTRef lastDiff - - -- Define how to compute values to an entire row of the Ukkonen matrix. - let extendRow i = - -- Precopmute some values that will be used for the whole row - let start0 = max 0 $ i - offset - start3 = min cols $ i + width - offset - prevOffset - 1 - goUpTo = max 0 ( i - prevOffset) - 1 - stop = min (cols - 1) $ i + width - offset - 1 - leftElement = lesserLeft ! (i - 1) - insertCost = costλ leftElement gap - firstCell - | i <= offset = leftColumn - | otherwise = leftBoundary - - lastCell - | i <= cols - qd - offset = rightBoundary - | otherwise = rightColumn - - b0 = start0 - e0 = goUpTo - b1 = start3 - e1 = stop - - continueRecomputing (same, j) = same || j >= stop - computeCell' ~(_,j) = computeCell leftElement insertCost i j - internalCell' j = internalCell leftElement insertCost i j - recomputeUntilSame j = snd <$> iterateUntilM continueRecomputing computeCell' (False, j) - in do -- First, we fill in 0 or more cells of the left region of - -- the expanded band. This is the region [b0, e0] computed - -- above. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- b0 e0 - -- ┏━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - - -- Conditionally write to the first cell of the Ukkonen band - if i > prevOffset - then firstCell leftElement insertCost i b0 - else pure () - - for_ [b0+1 .. e0] internalCell' - - -- Next, we assign to s0 the value t0 from the previous row. - -- The cell t0 is up to where the values were recomputed in - -- the previous row. - -- We recompute the cells in the range [e0 + 1, s0]. - -- We assign to t0 the last cell in the range [s1, s2] which - -- was updated for the next row. - -- We remember cells t0 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- e0 s0 - -- ┏━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - -- - s0 <- (\x -> min (x+1) e1) <$> readSTRef t0' - writeSTRef t0' (-1) - - when (s0 > e0 && toEnum i > po) $ - recomputeRange leftElement insertCost i (e0+1) s0 >>= writeSTRef t0' - t0 <- readSTRef t0' - - -- If s0 = t0, we recompute the cell (s0 + 1). - -- If the cost is the same, we stop here and remember the cell - -- before we stopped. - -- If the cost is not the same, we update cell (s0 + 1) and - -- move on to (s0 + 2). - -- This procedure continues until (s0 + n) has the same cost - -- as before, or *until we reach b1.* - -- We remember the cell (s0 + n - 1) as t0 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s0 t0 - -- ╔═════╗ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - if s0 == t0 && s0 > 0 - then recomputeUntilSame (s0 + 1) >>= writeSTRef t0' . pred - else if s0 <= e0 && e0 > 0 - then recomputeUntilSame (e0 + 1) >>= writeSTRef t0' . pred - else pure () - - -- Next, we assign to s1 the value t1 from the previous row. - -- We also assign s2 the value t2 from the previous row. - -- The range [t1, t2] is where the values were recomputed in - -- the previous row. - -- We recompute the cells in the range [s1, s2]. - -- We assign to t2 the last cell in the range [s1, s2] which - -- was updated for the next row. - -- We remember cells s1 as t1 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s1 s2 - -- ┏━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - -- NOPE, Try again - -- - -- Next, we assign to s1 the value t1 from the previous row. - -- If s1 is less than t0, we assign to s1 the value t0 + 1. - -- This ensures that we do not start "behind" where we have - -- previously computed. - -- Then if s1 is greater than e1, we assign to s1 the - -- value e1. This ensures one cell is always written to. - -- We recompute the cells in the range [s1, b1 - 1]. - -- If any cell in the range was updated, we assign to s1 to t1. - -- We remember cell t1 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s1 b1 - -- ┏━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - s1 <- do a <- readSTRef t0' - b <- readSTRef t1' - pure . min e1 $ max a b - - t1 <- recomputeRange leftElement insertCost i s1 $ b1 - 1 - - -- If no cells were updated, a zero value is returned. - -- In this case, the "last" updated cell for the next row is b1. - writeSTRef t1' $ if t1 == 0 then b1 else s1 - - -- Lastly, we fill in 0 or more cells of the left region of - -- the expanded band. This is the region [b1, e1] computed - -- above. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- b1 e1 - -- ┏━━━━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - for_ [b1 .. e1 - 1] internalCell' - - -- Conditionally write to the last cell of the Ukkonen band - if i < rows - fromEnum po - then lastCell leftElement insertCost i stop - else pure () - - --------------------------------------- - -- Compute all values of the matrix -- - --------------------------------------- - - -- We start computation in the top row at the index equal to the - -- quasi-diagonal width plus the previous offset. This is because the last - -- cell of the top row which we computed was at the previous index since - -- we previous computed an number of cells from the quasi-diagonal equal to - -- the previous offset. - let topRowStart = qd + prevOffset - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the end of the matrix for the first & last - -- rows of the matrix. We subtract the 'offset' from the 'width' because - -- there are exactly 'offset' number of cells left of first matrix column. - -- Hence the top row's width is 'width' minus 'offset'. The last cell index - -- in the top row of the band is 'width' minus 'offset' minus 1. - let topRowWidth = width - offset - - -- Of course, we must be cetrain that we don't extend past the last column - -- of the matrix. To prevent this, we take the minimum of the top row width - -- and the number of columns. So the last index we will compute in the top - -- row is the minimum of the two options minus one due to zero-indexing. - let topRowCease = pred $ min cols topRowWidth - - -- Write out the left arrow value on the top row. - let topRowWrite !j !cost = write (0,j) (# cost, LeftArrow #) - - -- Extend the first row to seed subsequent rows. - for_ [ topRowStart .. topRowCease ] $ \j -> - let !topElement = longerTop ! (j - 1) - !firstCellCost = costλ gap topElement - in do !firstPrevCost <- readCost 0 $ j - 1 - topRowWrite j $ firstCellCost + firstPrevCost - - writeSTRef tailStart topRowStart - - -- Loop through the remaining rows. - for_ [1 .. rows - 1] extendRow - - -edgeCellDefinitions - :: ( PrimMonad m - , Vector v e - ) - => e -- ^ Gap state - -> (e -> e -> Word) -- ^ Distance between states - -> v e -- ^ Longer dynamic character - -> MMatrix (PrimState m) Word - -> MMatrix (PrimState m) Direction - -> ( Int -> Int -> m Word - , (Int, Int) -> (# Word, Direction #) -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - ) -edgeCellDefinitions gap costλ longerTop mCost mDir = - (readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) - where - -- Read the cost of a cell - readCost = curry $ M.unsafeRead mCost - - -- Write to a single cell of the current vector and directional matrix simultaneously - write !p (# !c, !d #) = M.unsafeWrite mCost p c *> M.unsafeWrite mDir p d - - -- Write to an internal cell (not on a boundary) of the matrix. - internalCell !leftElement !insertCost !i !j = {-# SCC internalCell_expanding #-} - let !topElement = longerTop ! (j - 1) - !deleteCost = costλ gap topElement - !alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - leftCost <- readCost i $ j - 1 - let v = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) v - - - -- Define how to compute the first cell of the first "offset" rows. - -- We need to ensure that there are only Up Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and leftward values are "out of bounds." - leftColumn _leftElement !insertCost !i !j = {-# SCC leftColumn #-} do - firstPrevCost <- readCost (i - 1) j - write (i,j) (# insertCost + firstPrevCost, UpArrow #) - - -- Define how to compute the first cell of the remaining rows. - -- We need to ensure that there are no Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 2, - -- since the leftward values are "out of bounds." - -- Define how to compute the first cell of the remaining rows. - -- We need to ensure that there are no Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 2, - -- since the leftward values are "out of bounds." - leftBoundary !leftElement !insertCost !i !j = {-# SCC leftBoundary #-} - let topElement = longerTop ! (j - 1) - alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - let v = minimumCostDirection - maxBound - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) v - - -- Define how to compute the last cell of the first "rows - offset" rows. - -- We need to ensure that there are only Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and upward values are "out of bounds." - rightBoundary !leftElement _insertCost !i !j = {-# SCC rightBoundary #-} - let topElement = longerTop ! (j - 1) - deleteCost = costλ gap topElement - alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - leftCost <- readCost i $ j - 1 - let v = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - maxBound - write (i,j) v - - rightColumn = {-# SCC rightColumn #-} internalCell - - --- | --- Produces a set of reusable values and functions which are "constant" between --- different incarnations of the Ukkonen algorithms. -ukkonenConstants - :: Vector v e - => TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> Word -- ^ Current offset from quasi-diagonal - -> (Int, e -> e -> Word, Int, Int, Int, Int) -ukkonenConstants tcmλ lesserLeft longerTop co = - (offset, costλ, rows, cols, width, quasiDiagonalWidth) - where - offset = clampOffset co - costλ x = snd . tcmλ x - longerLen = GV.length longerTop - lesserLen = GV.length lesserLeft - rows = GV.length lesserLeft + 1 - cols = GV.length longerTop + 1 - width = quasiDiagonalWidth + (offset `shiftL` 1) -- Multiply by 2 - quasiDiagonalWidth = differenceInLength + 1 - where - differenceInLength = longerLen - lesserLen - - -- Note: "offset" cannot cause "width + quasiDiagonalWidth" to exceed "2 * cols" - clampOffset o = - let o' = fromEnum o in min o' $ cols - quasiDiagonalWidth diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs deleted file mode 100644 index d22dccf97..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs +++ /dev/null @@ -1,359 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Visualization --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -{-# OPTIONS_GHC -Wno-incomplete-patterns #-} -{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} - -module DirectOptimization.Pairwise.Visualization - ( Direction() - -- * Operational - , directOptimizationDiffDirectionMatricies - -- * Rendering - , renderMatrix - , renderDirectionMatrix - , diffDirectionMatrix - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.Measure -import Data.Bits -import Data.Foldable (fold) -import Data.Matrix.Class (Matrix, dim, toLists, unsafeIndex) -import Data.Set (Set, fromDistinctAscList, member) -import Data.Vector.Generic (Vector, basicLength, toList) -import DirectOptimization.Pairwise.Direction - - -directOptimizationDiffDirectionMatricies - :: ( FiniteBits e - , Matrix m t Direction - , Ord (v e) - , Vector v e - ) - => (v e -> v e -> (Word, m t Direction)) - -> (v e -> v e -> (Word, m t Direction)) - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> String -directOptimizationDiffDirectionMatricies matrixGenerator1 matrixGenerator2 lhs rhs = - let -- Remove gaps from the inputs and measure the results to determine - -- which ungapped character is longer and which is shorter. - -- Always pass the shorter character into alignment functions first! - ~(_, _, _, lesser, longer) = measureCharactersWithoutGaps lhs rhs - lesserMeds = extractMediansGapped lesser - longerMeds = extractMediansGapped longer - in case basicLength lesserMeds of - -- Neither character was Missing, but one or both are empty when gaps are removed - 0 -> "One character was all gaps" - -- Both have some non-gap elements, perform string alignment - _ -> let dm1 = snd $ matrixGenerator1 lesserMeds longerMeds - dm2 = snd $ matrixGenerator2 lesserMeds longerMeds - in diffDirectionMatrix lesserMeds longerMeds dm1 dm2 - - --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. -renderMatrix - :: ( Matrix m t x - , Vector v e - , Show x - ) - => v e -- ^ Shorter vector of elements - -> v e -- ^ Longer vector of elements - -> m t x -- ^ Matrix of cells - -> String -renderMatrix lesser longer mtx = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = - getMatrixConstants lesser longer [mtx] -{- - toShownIntegers = fmap (show . showBitsValue) . otoList - - showBitsValue :: FiniteBits b => b -> Word - showBitsValue b = go (finiteBitSize b) 0 - where - go 0 v = v - go i v = let i' = i-1 - v' | b `testBit` i' = v + bit i' - | otherwise = v - in go i' v' --} - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] - - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens - ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens - ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith renderRow ("*":lesserTokens) matrixTokens - where - renderRow e cs = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = concatMap (pad maxColumnWidth) cs - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. -renderDirectionMatrix - :: ( Matrix m t Direction - , Vector v e - ) - => v e - -> v e - -> m t Direction - -> String -renderDirectionMatrix lesser longer mtx = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = - getMatrixConstants lesser longer [mtx] - - tracebackCells = getTracebackIndices mtx - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] - - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens - ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens - ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith3 renderRow [0..] ("*":lesserTokens) matrixTokens - where - renderRow i e cs = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = fold $ zipWith (renderCell i) [0..] cs - - renderCell i j = fmap f . pad maxColumnWidth - where - f | (i,j) `elem` tracebackCells = boldDirection - | otherwise = id - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. -diffDirectionMatrix - :: ( Matrix m t Direction - , Vector v e - ) - => v e - -> v e - -> m t Direction - -> m t Direction - -> String -diffDirectionMatrix lesser longer mtx1 mtx2 = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [tokMtx1,tokMtx2]) = - getMatrixConstants lesser longer [mtx1, mtx2] - - tracebackCells1 = getTracebackIndices mtx1 - tracebackCells2 = getTracebackIndices mtx2 - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] - - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens - ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens - ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith3 renderRow ("*":lesserTokens) strMtx1 strMtx2 - where - renderRow e xs ys = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = fold $ zipWith renderCell xs ys - - renderCell x y - | x' == y' = x - | otherwise = replicate (max (length x) (length y)) ' ' - where - x' = boldDirection <$> x - y' = boldDirection <$> y - - strMtx1 = tok2Str tracebackCells1 tokMtx1 - strMtx2 = tok2Str tracebackCells2 tokMtx2 - - tok2Str s = zipWith f [0..] - where - f i = zipWith (g i) [0..] - g i j = fmap h . pad maxColumnWidth - where - h | (i,j) `member` s = boldDirection - | otherwise = id - - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Get the indices of the traceback route. -getTracebackIndices - :: Matrix m t Direction - => m t Direction - -> Set (Int, Int) -getTracebackIndices mtx = fromDistinctAscList $ go (# m - 1, n - 1 #) - where - getDirection = curry $ unsafeIndex mtx - (m,n) = dim mtx - go (# i, j #) - | i < 0 || j < 0 = [] - | (i,j) == (0,0) = [(0,0)] - | otherwise = - (i,j) : case getDirection i j of - LeftArrow -> go (# i , j - 1 #) - DiagArrow -> go (# i - 1, j - 1 #) - UpArrow -> go (# i - 1, j #) - - -characterVectorToIndices :: Vector v e => v e -> [String] -characterVectorToIndices = - let numbers = tail $ pure <$> cycle ['0'..'9'] - in zipWith const numbers . toList - - -tokenizeMatrix :: (Matrix m t x, Show x) => m t x -> [[String]] -tokenizeMatrix = fmap (fmap show) . toLists - - -maxLengthOfGrid :: (Foldable g, Foldable r, Foldable f, Functor g, Functor r) => g (r (f a)) -> Int -maxLengthOfGrid = maximum . fmap maxLengthOfRow - - -maxLengthOfRow :: (Foldable r, Foldable f, Functor r) => r (f a) -> Int -maxLengthOfRow = maximum . fmap length - - -getMatrixConstants - :: ( Matrix m t x - , Show x - , Vector v e - ) - => v e - -> v e - -> [m t x] - -> (Int, Int, [String], [String], Int, Int, [[[String]]]) -getMatrixConstants lesser longer matrices = - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, matrixTokens) - where - colCount = basicLength longer + 1 - rowCount = basicLength lesser + 1 - lesserTokens = characterVectorToIndices lesser - longerTokens = characterVectorToIndices longer - maxPrefixWidth = maxLengthOfRow lesserTokens - maxHeaderWidth = maxLengthOfRow longerTokens - - matrixTokens = tokenizeMatrix <$> matrices - maxColumnWidth = maximum $ - maxHeaderWidth : (maxLengthOfGrid <$> matrixTokens) - - diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs deleted file mode 100644 index a01c47981..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs +++ /dev/null @@ -1,17 +0,0 @@ -module DirectOptimization.Pairwise.Wide - ( WideDynamicCharacter - , WideState - , widePairwiseDO - ) where - -import Bio.DynamicCharacter -import DirectOptimization.Pairwise.Ukkonen - - -widePairwiseDO - :: Word - -> (WideState -> WideState -> (WideState, Word)) - -> WideDynamicCharacter - -> WideDynamicCharacter - -> (Word, WideDynamicCharacter) -widePairwiseDO = ukkonenDO diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/PreOrder.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/PreOrder.hs deleted file mode 100644 index 6354b873e..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/PreOrder.hs +++ /dev/null @@ -1,68 +0,0 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE Strict #-} - -module DirectOptimization.PreOrder - ( preOrderLogic - ) where - -import Bio.DynamicCharacter -import Control.Monad -import Data.Bits -import Data.STRef -import Data.Vector.Generic (Vector) - - --- | --- Faithful translation of Algorithm 8 (Non-root Node Alignment) from the --- "Efficient Implied Alignment" paper found at: --- -{-# INLINEABLE preOrderLogic #-} -{-# SPECIALISE preOrderLogic :: Bool -> SlimDynamicCharacter -> SlimDynamicCharacter -> SlimDynamicCharacter -> SlimDynamicCharacter #-} -{-# SPECIALISE preOrderLogic :: Bool -> WideDynamicCharacter -> WideDynamicCharacter -> WideDynamicCharacter -> WideDynamicCharacter #-} -{-# SPECIALISE preOrderLogic :: Bool -> HugeDynamicCharacter -> HugeDynamicCharacter -> HugeDynamicCharacter -> HugeDynamicCharacter #-} -preOrderLogic - :: ( FiniteBits a - , Vector v a - ) - => Bool - -> (v a, v a, v a) -- ^ Parent Final Alignment - -> (v a, v a, v a) -- ^ Parent Preliminary Context - -> (v a, v a, v a) -- ^ Child Preliminary Context - -> (v a, v a, v a) -- ^ Child Final Alignment -preOrderLogic isLeftChild pAlignment pContext cContext = forceDynamicCharacter $ unsafeCharacterBuiltByST caLen f - where - ccLen = fromEnum $ characterLength cContext - caLen = characterLength pAlignment - indices = [ 0 .. fromEnum caLen - 1 ] - - f char = pokeTempChar char *> fillTempChar char - - -- We set an arbitrary element from the parent alignemnt context to the first element of the temporary character. - -- This ensures that at least one element can be querried for the determining "nil" and "gap" values. - pokeTempChar char = setFrom pAlignment char 0 0 - - -- Select character building function - fillTempChar - | isMissing cContext = missingλ -- Missing case is all gaps - | otherwise = alignmentλ -- Standard pre-order logic - - missingλ char = - forM_ indices (char `setGapped`) - - alignmentλ char = do - j' <- newSTRef 0 - k' <- newSTRef 0 - - forM_ indices $ \i -> do - k <- readSTRef k' - if k > ccLen || pAlignment `isGapped` i - then char `setGapped` i - else do - j <- readSTRef j' - modifySTRef j' succ - -- Remember that 'Delete' leaves 'voids' in the 'left' character. - if pAlignment `isAlign` i - || (not isLeftChild && pAlignment `isDelete` i && pContext `isDelete` j) - || ( isLeftChild && pAlignment `isInsert` i && pContext `isInsert` j) - then modifySTRef k' succ *> setFrom cContext char k i - else char `setGapped` i diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs deleted file mode 100644 index c2f6abf5c..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs +++ /dev/null @@ -1,25 +0,0 @@ -module DirectOptimization.DOHuge where - -import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Huge -import Data.BitVector.LittleEndian (BitVector, dimension) -import Data.Bits -import Data.Foldable -import Data.MetricRepresentation -import Data.Vector (Vector, head) -import Prelude hiding (head) - -wrapperHugeDO - :: Vector BitVector - -> Vector BitVector - -> MetricRepresentation BitVector - -> (Vector BitVector, Int) -wrapperHugeDO lhs rhs tcmMemo = (medians, fromEnum cost) - where - (cost, (medians,_,_)) = hugePairwiseDO (minInDelCost tcmMemo) gapState (retreivePairwiseTCM tcmMemo) (lhs, lhs, lhs) (rhs, rhs, rhs) - - gapState = bit . fromEnum $ n - 1 - n = case length lhs of - 0 -> case length rhs of - 0 -> 64 - _ -> dimension $ head rhs - _ -> dimension $ head lhs diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs deleted file mode 100644 index 661b6de5f..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs +++ /dev/null @@ -1,70 +0,0 @@ -module DirectOptimization.DOSlim where - -import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Slim -import Data.BitVector.LittleEndian (BitVector) -import qualified Data.BitVector.LittleEndian as BV -import Data.Bits -import Data.TCM.Dense -import Data.Vector (Vector, (!)) -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import Foreign.C.Types (CUInt) - - -wrapperSlimDO :: Vector BitVector -> Vector BitVector -> DenseTransitionCostMatrix -> (Vector BitVector, Int) - --- wrapperPCG_DO_FFI lhs rhs tcm | trace (show tcm) False= undefined -wrapperSlimDO lhs rhs tcmDense = (resultMedians, fromEnum resultCost) - where - (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC - - lhsDC = bvs2SlimDC lhs - rhsDC = bvs2SlimDC rhs - {-} - tcmDense = generateDenseTransitionCostMatrix 0 5 getCost - getCost i j = - let x = SM.getFullVects tcm - in toEnum $ (x ! fromEnum i) ! fromEnum j - -} - -{- -wrapperSlimDO :: Vector BitVector -> Vector BitVector -> Vector (Vector Int) -> (Vector BitVector, Int) --- wrapperPCG_DO_FFI lhs rhs tcm | trace (show tcm) False= undefined -wrapperSlimDO lhs rhs tcm = (resultMedians, fromEnum resultCost) - where - (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC - - lhsDC = bvs2SlimDC lhs - rhsDC = bvs2SlimDC rhs - tcmDense = generateDenseTransitionCostMatrix 0 5 getCost - getCost i j = - let x = SM.getFullVects tcm - in toEnum $ (x ! fromEnum i) ! fromEnum j - --} ---specializedAlphabetToDNA :: Alphabet String ---specializedAlphabetToDNA = fromSymbols $ show <$> (0 :: Word) :| [1 .. 4] - - -bvs2SlimDC :: V.Vector BitVector -> SlimDynamicCharacter -bvs2SlimDC v = (x,x,x) - where - x = SV.generate (V.length v) $ \i -> bv2w (v ! i) - - bv2w :: BitVector -> CUInt - bv2w bv = - let f i a - | bv `testBit` i = a `setBit` i - | otherwise = a - in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] - - -slimDC2BVs :: Int -> SlimDynamicCharacter -> V.Vector BitVector -slimDC2BVs n (x,_,_) = V.generate (SV.length x) $ \i -> w2bv (x SV.! i) - where - w2bv :: CUInt -> BitVector - w2bv w = - let f i a - | w `testBit` i = a `setBit` i - | otherwise = a - in foldr f (BV.fromNumber (toEnum n) (0:: Word)) [0 .. n - 1] diff --git a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs b/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs deleted file mode 100644 index 38fdf9f92..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs +++ /dev/null @@ -1,63 +0,0 @@ -module DirectOptimization.DOWide where - -import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Wide ---import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Ukkonen2 -import Control.Arrow (first) -import Data.BitVector.LittleEndian (BitVector) -import qualified Data.BitVector.LittleEndian as BV -import Data.Bits -import Data.MetricRepresentation -import Data.Vector (Vector, (!)) -import qualified Data.Vector as V -import qualified Data.Vector.Unboxed as UV -import Data.Word - - -wrapperWideDO - :: Vector BitVector - -> Vector BitVector - -> MetricRepresentation BitVector --Word64 - -> (Vector BitVector, Int) -wrapperWideDO lhs rhs metric = (wideDC2BVs (fromIntegral n) resultMedians, fromEnum resultCost) - where - (resultCost, resultMedians) = widePairwiseDO (minInDelCost metric) gap tcm lhsDC rhsDC --- (resultCost, resultMedians) = ukkonenDO (minInDelCost metric) gap tcm lhsDC rhsDC - - gap = bit . fromEnum $ n - 1 - - tcm :: Word64 -> Word64 -> (Word64, Word) - tcm x y = first BV.toUnsignedNumber (retreivePairwiseTCM metric) (BV.fromNumber 64 x) (BV.fromNumber 64 y) - - - n = case length lhs of - 0 -> case length rhs of - 0 -> 64 - _ -> BV.dimension $ V.head rhs - _ -> BV.dimension $ V.head lhs - - lhsDC = bvs2WideDC lhs - rhsDC = bvs2WideDC rhs - - -bvs2WideDC :: V.Vector BitVector -> WideDynamicCharacter -bvs2WideDC v = (x,x,x) - where - x = UV.generate (V.length v) $ \i -> bv2w (v ! i) - - bv2w :: BitVector -> Word64 - bv2w bv = - let f i a - | bv `testBit` i = a `setBit` i - | otherwise = a - in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] - - -wideDC2BVs :: Int -> WideDynamicCharacter -> V.Vector BitVector -wideDC2BVs n (x,_,_) = V.generate (UV.length x) $ \i -> w2bv (x UV.! i) - where - w2bv :: Word64 -> BitVector - w2bv w = - let f i a - | w `testBit` i = a `setBit` i - | otherwise = a - in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1] diff --git a/pkg/PhyGraph/lib/dynamic-character/test/TestSuite.hs b/pkg/PhyGraph/lib/dynamic-character/test/TestSuite.hs deleted file mode 100644 index 1bd5e6b03..000000000 --- a/pkg/PhyGraph/lib/dynamic-character/test/TestSuite.hs +++ /dev/null @@ -1,30 +0,0 @@ ------------------------------------------------------------------------------- --- | --- Module : Main --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -module Main - ( main - ) where - -import qualified DirectOptimization.Pairwise.Test as Pairwise -import Test.Tasty - - --- | --- Entry point for the test-suite of the "dynamic-character" library. -main :: IO () -main = defaultMain testSuite - - -testSuite :: TestTree -testSuite = testGroup "Dynamic Character Test-Suite" - [ Pairwise.testSuite - ] diff --git a/pkg/PhyGraph/lib/tcm/src/Data/Hashable/Memoize.hs b/pkg/PhyGraph/lib/tcm/src/Data/Hashable/Memoize.hs deleted file mode 100644 index 9ed01b14a..000000000 --- a/pkg/PhyGraph/lib/tcm/src/Data/Hashable/Memoize.hs +++ /dev/null @@ -1,254 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.Hashable.Memoize --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Exposes memoization combinators. Assumes that the supplied functions are --- side effect free. If this assumption is violated, undefined and unexpected --- behavior may result. ------------------------------------------------------------------------------ - -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE ScopedTypeVariables #-} - -module Data.Hashable.Memoize - ( memoize - , memoize2 - , memoize3 - ) where - - ---import Control.Concurrent.STM ---import Control.Concurrent.STM.TVar -import Control.DeepSeq ---import Control.Monad (join) ---import Control.Monad.ST ---import Data.Functor -import Data.Hashable ---import qualified Data.HashTable as HT -import Data.HashTable.IO ---import Data.HashTable.ST.Basic -import Data.IORef -import Prelude hiding (lookup) -import System.IO.Unsafe - - --- | --- /O(1)/ --- --- Takes a function with a hashable and equatable first argument and returns a --- memoized function with constant time access for already computed values. --- --- __Note:__ This does /not/ memoize recursively defined functions. --- --- To memoize a recursively defined function, you must redefine the function's --- recursive calls to internally call a memoized definition in a mutually recursive --- manner. --- --- === Example: Does /not/ memoize the recursive definitions --- --- > fib 0 = 0 --- > fib 1 = 1 --- > fib x = fib (x-1) + fib (x-2) --- --- >>> let memo = memoize fib in memo 10000 --- --- --- === Example: Does memoize the recursive definitions --- --- > fibM = f --- > where --- > f 0 = 0 --- > f 1 = 1 --- > f x = g (x-1) + g (x-2) --- > g = memoize f --- --- >>> fibM 10000 --- -{-# NOINLINE memoize #-} -memoize :: forall a b. (Eq a, Hashable a, NFData b) => (a -> b) -> a -> b -memoize f = unsafePerformIO $ do - -{- - modifyIORef memoEntries succ - entries <- readIORef memoEntries - if entries `mod` 50 == 0 then print entries else pure () --} - let initialSize = 2 ^ (16 :: Word) - - -- Create a TVar which holds the ST state and the HashTable --- !htRef <- newTVarIO (newSized initialSize :: IO (BasicHashTable a b)) - !htRef <- (newSized initialSize :: IO (BasicHashTable a b)) >>= newIORef - -- This is the returned closure of a memozized f - -- The closure captures the "mutable" reference to the hashtable above - -- through the TVar. - -- - -- Once the mutable hashtable reference is escaped from the IO monad, - -- this creates a new memoized reference to f. - -- The technique should be safe for all pure functions, probably, I think. - pure $ \k -> unsafePerformIO $ do - -- Read the TVar, we use IO since it is the outer monad - -- and the documentation says that this doesn't perform a complete transaction, - -- it just reads the current value from the TVar --- ht <- join $ readTVarIO htRef - ht <- readIORef htRef - -- We use the HashTable to try and lookup the memoized value --- let result = runST $ (ht `lookup` k :: forall s. ST s (Maybe b)) - result <- ht `lookup` k - -- Here we check if the memoized value exists - case result of - -- If the value exists return it - Just v -> pure v - -- If the value doesn't exist: - Nothing -> - -- Perform the expensive calculation to determine the value - -- associated with the key, fully evaluated. - let v = force $ f k - -- we want to perform the following modification atomically. - in do insert ht k v -- Insert the key-value pair into the HashTable - writeIORef htRef ht -- Place the updated hashtable back in the IO-Ref - -- After performing the update side effects, - -- we return the value associated with the key - pure v -{- - in atomically $ - -- Don't use writeTVar or use a reference to the HashTable from above. - -- It may have been concurrently modified before reaching this point! - -- We *atomically* insert the new key-value pair into the existing - -- HashTable behind the TVar, modifying the results of the TVar. - modifyTVar' htRef - (\st -> st -- Get the ST state from the TVar - >>= (\ht' -> -- Bind the hashtable in the state to x - insert ht' k v -- Insert the key-value pair into the HashTable - $> ht' -- Return the HashTable as the value in ST state - ) - ) - -- After performing the update side effects, - -- we return the value associated with the key - $> v --} - - --- Old TVar based code -{- -{-# NOINLINE memoize' #-} -memoize' :: forall a b. (Eq a, Hashable a, NFData b) => (a -> b) -> a -> b -memoize' f = unsafePerformIO $ do - - let initialSize = 2 ^ (16 :: Word) - - -- Create a TVar which holds the ST state and the HashTable --- !htRef <- newTVarIO (newSized initialSize :: IO (BasicHashTable a b)) - !htRef <- (HT.newWithDefaults initialSize :: IO (HT.HashTable a b)) >>= newIORef - -- This is the returned closure of a memozized f - -- The closure captures the "mutable" reference to the hashtable above - -- through the TVar. - -- - -- Once the mutable hashtable reference is escaped from the IO monad, - -- this creates a new memoized reference to f. - -- The technique should be safe for all pure functions, probably, I think. - pure $ \k -> unsafePerformIO $ do - -- Read the TVar, we use IO since it is the outer monad - -- and the documentation says that this doesn't perform a complete transaction, - -- it just reads the current value from the TVar --- ht <- join $ readTVarIO htRef - ht <- readIORef htRef - -- We use the HashTable to try and lookup the memoized value --- let result = runST $ (ht `lookup` k :: forall s. ST s (Maybe b)) - result <- ht `HT.lookup` k - -- Here we check if the memoized value exists - case result of - -- If the value exists return it - Just v -> pure v - -- If the value doesn't exist: - Nothing -> - -- Perform the expensive calculation to determine the value - -- associated with the key, fully evaluated. - let v = force $ f k - -- we want to perform the following modification atomically. - in do void $ HT.add ht k v -- Insert the key-value pair into the HashTable - writeIORef htRef ht -- Place the updated hashtable back in the IO-Ref - -- After performing the update side effects, - -- we return the value associated with the key - pure v -{- - in atomically $ - -- Don't use writeTVar or use a reference to the HashTable from above. - -- It may have been concurrently modified before reaching this point! - -- We *atomically* insert the new key-value pair into the existing - -- HashTable behind the TVar, modifying the results of the TVar. - modifyTVar' htRef - (\st -> st -- Get the ST state from the TVar - >>= (\ht' -> -- Bind the hashtable in the state to x - insert ht' k v -- Insert the key-value pair into the HashTable - $> ht' -- Return the HashTable as the value in ST state - ) - ) - -- After performing the update side effects, - -- we return the value associated with the key - $> v --} --} - - -{- -{-# NOINLINE memoEntries #-} -memoEntries :: IORef Word -memoEntries = unsafePerformIO $ newIORef 0 --} - - --- | --- A memoizing combinator similar to 'memoize' except that it that acts on a --- function of two inputs rather than one. -{-# NOINLINE memoize2 #-} -memoize2 :: (Eq a, Eq b, Hashable a, Hashable b, NFData c) => (a -> b -> c) -> a -> b -> c -memoize2 f = let f' = memoize (uncurry f) - in curry f' - - --- | --- A memoizing combinator similar to 'memoize' except that it that acts on a --- function of two inputs rather than one. -{-# NOINLINE memoize3 #-} -memoize3 - :: ( Eq a - , Eq b - , Eq c - , Hashable a - , Hashable b - , Hashable c - , NFData d - ) - => (a -> b -> c -> d) - -> a - -> b - -> c - -> d -memoize3 f = let f' = memoize (uncurry3 f) - in curry3 f' - where - curry3 g x y z = g (x,y,z) - uncurry3 g (x,y,z) = g x y z - - - -{- --- These are included for haddock generation -fib 0 = 0 -fib 1 = 1 -fib x = fib (x-1) + fib (x-2) - -fibM :: Integer -> Integer -fibM = f - where - f 0 = 0 - f 1 = 1 - f x = g (x-1) + g (x-2) - g = memoize f --} diff --git a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Overlap.hs b/pkg/PhyGraph/lib/tcm/src/Data/TCM/Overlap.hs deleted file mode 100644 index db035ef8d..000000000 --- a/pkg/PhyGraph/lib/tcm/src/Data/TCM/Overlap.hs +++ /dev/null @@ -1,170 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : Data.TCM.Overlap --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- ------------------------------------------------------------------------------ - -{-# LANGUAGE Strict #-} - -module Data.TCM.Overlap - ( overlap - , overlap2 - , overlap3 - ) where - -import Data.Bits -import Data.Foldable -import Data.List.NonEmpty (NonEmpty(..)) -import Data.Semigroup -import Data.Semigroup.Foldable -import Data.Word -import Foreign.C.Types (CUInt) - - -data Bounds b - = Bounds - { _lBound :: {-# UNPACK #-} Word - , _uBound :: {-# UNPACK #-} Word - , _bValue :: b - } - - --- | --- Takes one or more elements of 'FiniteBits' and a symbol change cost function --- and returns a tuple of a new character, along with the cost of obtaining that --- character. The return character may be (or is even likely to be) ambiguous. --- Will attempt to intersect the two characters, but will union them if that is --- not possible, based on the symbol change cost function. --- --- To clarify, the return character is an intersection of all possible least-cost --- combinations, so for instance, if @ char1 == A,T @ and @ char2 == G,C @, and --- the two (non-overlapping) least cost pairs are A,C and T,G, then the return --- value is A,C,G,T. -{-# INLINEABLE overlap #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f CUInt -> (CUInt , Word) #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f Word -> (Word , Word) #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f Word8 -> (Word8 , Word) #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f Word16 -> (Word16, Word) #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f Word32 -> (Word32, Word) #-} -{-# SPECIALISE overlap :: (Foldable1 f, Functor f) => Word -> (Word -> Word -> Word) -> f Word64 -> (Word64, Word) #-} -{-# SPECIALISE overlap :: FiniteBits b => Word -> (Word -> Word -> Word) -> NonEmpty b -> (b , Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty CUInt -> (CUInt , Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty Word -> (Word , Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty Word8 -> (Word8 , Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty Word16 -> (Word16, Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty Word32 -> (Word32, Word) #-} -{-# SPECIALISE overlap :: Word -> (Word -> Word -> Word) -> NonEmpty Word64 -> (Word64, Word) #-} -overlap - :: ( FiniteBits b - , Foldable1 f - , Functor f - ) - => Word -- ^ Alphabet size - -> (Word -> Word -> Word) -- ^ Symbol change matrix (SCM) to determine cost - -> f b -- ^ List of elements for of which to find the k-median and cost - -> (b, Word) -- ^ K-median and cost -overlap size sigma xs = go size maxBound zero - where - withBounds = getBitBounds <$> xs - wlog = getFirst $ foldMap1 First xs - zero = wlog `xor` wlog - - go 0 theCost bits = (bits, theCost) - go i oldCost bits = - let i' = i - 1 - newCost = foldl' (+) 0 $ getDistance i' <$> withBounds - (minCost, bits') = case oldCost `compare` newCost of - LT -> (oldCost, bits ) - EQ -> (oldCost, bits `setBit` fromEnum i') - GT -> (newCost, zero `setBit` fromEnum i') - in go i' minCost bits' - - getDistance :: FiniteBits b => Word -> Bounds b -> Word - getDistance i (Bounds lo hi b) = go' (hi+1) (maxBound :: Word) - where - go' :: Word -> Word -> Word - go' j a | j <= lo = a - go' j a = - let j' = j - 1 - a' | b `testBit` fromEnum j' = min a $ sigma i j' - | otherwise = a - in go' j' a' - - --- | --- Calculate the median between /two/ states. -{-# INLINEABLE overlap2 #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> CUInt -> CUInt -> (CUInt , Word) #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> Word -> Word -> (Word , Word) #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> Word8 -> Word8 -> (Word8 , Word) #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> Word16 -> Word16 -> (Word16, Word) #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> Word32 -> Word32 -> (Word32, Word) #-} -{-# SPECIALISE overlap2 :: Word -> (Word -> Word -> Word) -> Word64 -> Word64 -> (Word64, Word) #-} -overlap2 - :: FiniteBits b - => Word - -> (Word -> Word -> Word) - -> b - -> b - -> (b, Word) -overlap2 size sigma char1 char2 = overlap size sigma $ char1 :| [char2] - - --- | --- Calculate the median between /three/ states. -{-# INLINE overlap3 #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> CUInt -> CUInt -> CUInt -> (CUInt , Word) #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> Word -> Word -> Word -> (Word , Word) #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> Word8 -> Word8 -> Word8 -> (Word8 , Word) #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> Word16 -> Word16 -> Word16 -> (Word16, Word) #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> Word32 -> Word32 -> Word32 -> (Word32, Word) #-} -{-# SPECIALISE overlap3 :: Word -> (Word -> Word -> Word) -> Word64 -> Word64 -> Word64 -> (Word64, Word) #-} -overlap3 - :: FiniteBits b - => Word - -> (Word -> Word -> Word) - -> b - -> b - -> b - -> (b, Word) -overlap3 size sigma char1 char2 char3 = overlap size sigma $ char1 :| [char2, char3] - - --- | --- Gets the lowest set bit and the highest set bit in the collection. -{-# INLINEABLE getBitBounds #-} -{-# SPECIALISE getBitBounds :: CUInt -> Bounds CUInt #-} -{-# SPECIALISE getBitBounds :: Word -> Bounds Word #-} -{-# SPECIALISE getBitBounds :: Word8 -> Bounds Word8 #-} -{-# SPECIALISE getBitBounds :: Word16 -> Bounds Word16 #-} -{-# SPECIALISE getBitBounds :: Word32 -> Bounds Word32 #-} -{-# SPECIALISE getBitBounds :: Word64 -> Bounds Word64 #-} -{-# SPECIALISE getBitBounds :: FiniteBits b => b -> Bounds b #-} -{-# SPECIALISE getBitBounds :: CUInt -> Bounds CUInt #-} -{-# SPECIALISE getBitBounds :: Word -> Bounds Word #-} -{-# SPECIALISE getBitBounds :: Word8 -> Bounds Word8 #-} -{-# SPECIALISE getBitBounds :: Word16 -> Bounds Word16 #-} -{-# SPECIALISE getBitBounds :: Word32 -> Bounds Word32 #-} -{-# SPECIALISE getBitBounds :: Word64 -> Bounds Word64 #-} -getBitBounds - :: FiniteBits b - => b - -> Bounds b -getBitBounds b = - let bitZero = (b `xor` b) `setBit` 0 - bigEndian = countLeadingZeros bitZero > 0 -- Check the endianness - - (f,g) | bigEndian = (countTrailingZeros, countLeadingZeros ) - | otherwise = (countLeadingZeros , countTrailingZeros) - - lZeroes = f b - uZeroes = g b - lower = toEnum lZeroes - upper = toEnum . max 0 $ finiteBitSize b - uZeroes - 1 - in Bounds lower upper b diff --git a/pkg/PhyGraph/phygraph.hs b/pkg/PhyGraph/phygraph.hs deleted file mode 100644 index 3b9835815..000000000 --- a/pkg/PhyGraph/phygraph.hs +++ /dev/null @@ -1,255 +0,0 @@ -{- | -Module : phygraph.hs -Description : Progam to perform phylogenetic searchs on general graphs with diverse data types -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -module Main (main) where - -import qualified Commands.CommandExecution as CE -import qualified Commands.ProcessCommands as PC -import qualified Commands.Verify as V -import qualified Data.List as L -import qualified Data.Text.Lazy as Text -import qualified Data.Text.Short as ST -import GeneralUtilities -import qualified GraphFormatUtilities as GFU -import qualified GraphOptimization.Traversals as T -import qualified Graphs.GraphOperations as GO -import qualified Input.DataTransformation as DT -import qualified Input.ReadInputFiles as RIF -import qualified Input.Reorganize as R -import System.Environment -import System.IO -import Types.Types -import qualified Utilities.Distances as D -import qualified Utilities.Utilities as U -import qualified Input.BitPack as BP -import qualified Data.Time as DT -import qualified Data.CSV as CSV -import qualified Utilities.LocalGraph as LG -import Debug.Trace -import Data.Maybe - --- | main driver -main :: IO () -main = do - dateFull <- DT.getCurrentTime - let splash = "\nPhyG version " ++ pgVersion ++ "\nCopyright(C) 2022 Ward Wheeler and The American Museum of Natural History\n" - let splash2 = "PhyG comes with ABSOLUTELY NO WARRANTY; This is free software, and may be \nredistributed " - let splash3 = "\tunder the 3-Clause BSD License.\nCompiled " ++ (show dateFull) - hPutStrLn stderr (splash ++ splash2 ++ splash3) - - -- Process arguments--a single file containing commands - args <- getArgs - - if length args /= 1 then errorWithoutStackTrace "\nProgram requires a single argument--the name of command script file.\n\n" - else hPutStr stderr "\nCommand script file: " - hPutStrLn stderr $ head args - - -- System time for Random seed - timeD <- getSystemTimeSeconds - hPutStrLn stderr ("Initial random seed set to " ++ (show timeD)) - - -- hPutStrLn stderr ("Current time is " ++ show timeD) - let seedList = randomIntList timeD - - -- Process commands to get list of actions - commandContents <- readFile $ head args - - -- Process so one command per line? - - commandContents' <- PC.expandRunCommands [] (lines commandContents) - let thingsToDo' = PC.getCommandList commandContents' - --mapM_ (hPutStrLn stderr) (fmap show thingsToDo') - - -- Process Read commands (with prealigned and tcm flags first) - --expand read commands for wildcards - expandedReadCommands <- mapM (RIF.expandReadCommands []) $ filter ((== Read) . fst) thingsToDo' - - -- sort added to sort input read commands for left right consistancy - let thingsToDo = (L.sort $ concat expandedReadCommands) ++ (filter ((/= Read) . fst) thingsToDo') - --hPutStrLn stderr (show $ concat expandedReadCommands) - - -- check commands and options for basic correctness - hPutStrLn stderr "\tChecking command file syntax" - let !commandsOK = V.verifyCommands thingsToDo [] [] - - if commandsOK then hPutStrLn stderr "Commands appear to be properly specified--file availability and contents not checked.\n" - else errorWithoutStackTrace "Commands not properly specified" - - dataGraphList <- mapM RIF.executeReadCommands $ fmap PC.movePrealignedTCM $ fmap snd $ filter ((== Read) . fst) thingsToDo - let (rawData, rawGraphs, terminalsToInclude, terminalsToExclude, renameFilePairs, reBlockPairs) = RIF.extractInputTuple dataGraphList - - if null rawData && null rawGraphs then errorWithoutStackTrace "\n\nNeither data nor graphs entered. Nothing can be done." - else hPutStrLn stderr ("Entered " ++ (show $ length rawData) ++ " data file(s) and " ++ (show $ length rawGraphs) ++ " input graphs") - - -- get set paritions character from Set commands early - let setCommands = filter ((== Set).fst) thingsToDo - (_, partitionCharOptimalityGlobalSettings, _, _) <- CE.executeCommands emptyGlobalSettings 0 [] mempty mempty mempty mempty mempty mempty setCommands - - -- Split fasta/fastc sequences into corresponding pieces based on '#' partition character - let rawDataSplit = DT.partitionSequences (ST.fromString (partitionCharacter partitionCharOptimalityGlobalSettings)) rawData - - -- Process Rename Commands - newNamePairList <- CE.executeRenameReblockCommands Rename renameFilePairs thingsToDo - if (not $ null newNamePairList) then hPutStrLn stderr ("Renaming " ++ (show $ length newNamePairList) ++ " terminals") - else hPutStrLn stderr ("No terminals to be renamed") - - let renamedData = fmap (DT.renameData newNamePairList) rawDataSplit - let renamedGraphs = fmap (GFU.relabelGraphLeaves newNamePairList) rawGraphs - - let numInputFiles = length renamedData - - let thingsToDoAfterReadRename = (filter ((/= Read) .fst) $ filter ((/= Rename) .fst) thingsToDo) - - -- Reconcile Data and Graphs (if input) including ladderization - -- could be sorted, but no real need - -- get taxa to include in analysis - if not $ null terminalsToInclude then hPutStrLn stderr ("Terminals to include:" ++ show terminalsToInclude) - else hPutStrLn stderr ("") - if not $ null terminalsToExclude then hPutStrLn stderr ("Terminals to exclude:" ++ show terminalsToExclude) - else hPutStrLn stderr ("") - - -- Uses names form terminal list if non-null, and remove exckuded terminals - let dataLeafNames' = if (not $ null terminalsToInclude) then L.sort $ L.nub terminalsToInclude - else L.sort $ DT.getDataTerminalNames renamedData - let dataLeafNames = dataLeafNames' L.\\ terminalsToExclude - hPutStrLn stderr ("Data were input for " ++ (show $ length dataLeafNames) ++ " terminals") - - -- this created here and passed to command execution later to remove dependency of renamed data in command execution to - -- reduce memory footprint keeoing that stuff around. - let crossReferenceString = CSV.genCsvFile $ CE.getDataListList renamedData dataLeafNames - - - let reconciledData = fmap (DT.addMissingTerminalsToInput dataLeafNames []) renamedData - let reconciledGraphs = fmap (GFU.reIndexLeavesEdges dataLeafNames) $ fmap (GFU.checkGraphsAndData dataLeafNames) renamedGraphs - - -- Check to see if there are taxa without any observations. Would become total wildcards - let taxaDataSizeList = filter ((==0).snd) $ zip dataLeafNames $ foldl1 (zipWith (+)) $ fmap (fmap snd3) $ fmap (fmap (U.filledDataFields (0,0))) $ fmap fst reconciledData - if (length taxaDataSizeList /= 0) then hPutStrLn stderr ("\nWarning (but a serious one): There are taxa without any data: " - ++ (L.intercalate ", " $ fmap Text.unpack $ fmap fst taxaDataSizeList) ++ "\n") - else hPutStrLn stderr "All taxa contain data" - - -- Ladderizes (resolves) input graphs and ensures that networks are time-consistent - -- chained netowrk nodes should never be introduced later so only checked no - -- checks for children of tree node that are all netowork nodee (causes displayu problem) - let noChainNetNodesList = fmap fromJust $ filter (/=Nothing) $ fmap LG.removeChainedNetworkNodes reconciledGraphs - let noSisterNetworkNodes = fmap LG.removeTreeEdgeFromTreeNodeWithAllNetworkChildren noChainNetNodesList - let ladderizedGraphList = fmap (GO.convertGeneralGraphToPhylogeneticGraph "correct") noSisterNetworkNodes - - {-To do - -- Remove any not "selected" taxa from both data and graphs (easier to remove from fgl) - let reconciledData' = removeTaxaFromData includeList reconciledData - let reconciledGraphs' = removeTaxaFromGraphs includeList reconciledData - -} - - -- Create unique bitvector names for leaf taxa. - let leafBitVectorNames = DT.createBVNames reconciledData - - {- - Data processing here-- there are multiple steps not cvomposed so that - large data files can be precessed and intermediate data goes out - of scope and can be freed back to system - -} - - -- Create Naive data -- basic usable format organized into blocks, but not grouped by types, or packed (bit, sankoff, prealigned etc) - -- Need to check data for equal in character number - let naiveData = DT.createNaiveData reconciledData leafBitVectorNames [] - - -- Group Data--all nonadditives to single character, additives with - -- alphbet < 64 recoded to nonadditive binary, additives with same alphabet - -- combined, - let naiveDataGrouped = R.combineDataByType partitionCharOptimalityGlobalSettings naiveData -- R.groupDataByType naiveData - - -- Bit pack non-additive data - let naiveDataPacked = BP.packNonAdditiveData partitionCharOptimalityGlobalSettings naiveDataGrouped - - -- Optimize Data convert - -- prealigned to non-additive or matrix - -- bitPack resulting non-additive - let optimizedPrealignedData = R.optimizePrealignedData partitionCharOptimalityGlobalSettings naiveDataPacked - - - -- Execute any 'Block' change commands--make reBlockedNaiveData - newBlockPairList <- CE.executeRenameReblockCommands Reblock reBlockPairs thingsToDo - - let reBlockedNaiveData = R.reBlockData newBlockPairList optimizedPrealignedData -- naiveData - let thingsToDoAfterReblock = filter ((/= Reblock) .fst) $ filter ((/= Rename) .fst) thingsToDoAfterReadRename - - -- Combines data of exact types into single vectors in each block - -- thids is final data processing step - let optimizedData = if (not . null) newBlockPairList then - trace ("Reorganizing Block data") - R.combineDataByType partitionCharOptimalityGlobalSettings reBlockedNaiveData - else optimizedPrealignedData - - - - -- Set global vaues before search--should be integrated with executing commands - -- only stuff that is data dependent here (and seed) - let defaultGlobalSettings = emptyGlobalSettings { outgroupIndex = 0 - , outGroupName = head dataLeafNames - , seed = timeD - , numDataLeaves = length leafBitVectorNames - } - --hPutStrLn stderr (show defaultGlobalSettings) - - let initialSetCommands = filter ((== Set).fst) thingsToDoAfterReblock - let commandsAfterInitialDiagnose = filter ((/= Set).fst) thingsToDoAfterReblock - - -- This rather awkward syntax makes sure global settings (outgroup, criterion etc) are in place for initial input graph diagnosis - (_, initialGlobalSettings, seedList', _) <- CE.executeCommands defaultGlobalSettings numInputFiles crossReferenceString optimizedData optimizedData [] [] seedList [] initialSetCommands - let inputGraphList = map (T.multiTraverseFullyLabelGraph initialGlobalSettings optimizedData True True Nothing) (fmap (LG.rerootTree (outgroupIndex initialGlobalSettings)) ladderizedGraphList) - - - -- Create lazy pairwise distances if needed later for build or report - let pairDist = D.getPairwiseDistances optimizedData - - - -- Execute Following Commands (searches, reports etc) - (finalGraphList, _, _, _) <- CE.executeCommands initialGlobalSettings numInputFiles crossReferenceString optimizedData optimizedData inputGraphList pairDist seedList' [] commandsAfterInitialDiagnose - - -- print global setting just to check - --hPutStrLn stderr (show _finalGlobalSettings) - - -- Add in model and root cost if optimality criterion needs it - hPutStrLn stderr ("\tUpdating final graph costs") - let finalGraphList' = fmap (T.updateGraphCostsComplexities initialGlobalSettings) finalGraphList - - -- Final Stderr report - timeDN <- getSystemTimeSeconds - let minCost = if null finalGraphList then 0.0 else minimum $ fmap snd6 finalGraphList' - let maxCost = if null finalGraphList then 0.0 else maximum $ fmap snd6 finalGraphList' - hPutStrLn stderr ("Execution returned " ++ (show $ length finalGraphList') ++ " graph(s) at cost range " ++ (show (minCost, maxCost)) ++ " in "++ show (timeDN - timeD) ++ " second(s)") - diff --git a/pkg/PhyloLib/CHANGELOG.md b/pkg/PhyloLib/CHANGELOG.md deleted file mode 120000 index df460b98c..000000000 --- a/pkg/PhyloLib/CHANGELOG.md +++ /dev/null @@ -1 +0,0 @@ -../../doc/CHANGELOG.md \ No newline at end of file diff --git a/pkg/PhyloLib/LICENSE b/pkg/PhyloLib/LICENSE deleted file mode 120000 index abdaf5ea7..000000000 --- a/pkg/PhyloLib/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../doc/LICENSE \ No newline at end of file diff --git a/pkg/PhyloLib/PhyloLib.cabal b/pkg/PhyloLib/PhyloLib.cabal deleted file mode 100644 index ed68d725c..000000000 --- a/pkg/PhyloLib/PhyloLib.cabal +++ /dev/null @@ -1,71 +0,0 @@ -Cabal-Version: 3.0 -Name: PhyloLib -Version: 0.1.0 -Stability: Alpha -Build-Type: Simple -Tested-With: - GHC == 9.2.4 - -Author: Ward Wheeler -Copyright: (c) 2015-2022 Ward Wheeler -License: BSD-3-Clause -License-File: LICENSE - -Maintainer: Ward Wheeler -Homepage: https://github.com/wardwheeler/PhyGraph#readme -Bug-Reports: https://github.com/wardwheeler/PhyGraph/issues - -Extra-Doc-Files: - CHANGELOG.md - README.md - - -library - - default-language: - Haskell2010 - - hs-source-dirs: - src - - build-depends: - array >= 0.5.4 && < 0.6, - base >= 4.11 && < 5.0, - bv >= 0.5 && < 1.0, - bv-little >= 1.2 && < 2.0, - containers >= 0.6.2 && < 1.0, - deepseq >= 1.4 && < 2.0, - fgl >= 5.7 && < 6.0, - graphviz >= 2999.20 && < 3000, - parallel >= 3.2 && < 4.0, - random >= 1.2 && < 2.0, - sort >= 1.0 && < 2.0, - text >= 1.2 && < 3.0, - time >= 1.12 && < 2.0, - vector >= 0.12.0.3 && < 0.13, - - exposed-modules: - Cyclic - GeneralUtilities - GraphFormatUtilities - LocalSequence - ParallelUtilities - SymMatrix - SymMatrixSeq - - ghc-options: - -- Sanity check warnings: - -- 1. Fail on a warning - -- 2. Include all warnings by default - -- 3. Exclude the undesirable warnings - -Werror - -Weverything - -- Exclusions: - -Wno-all-missed-specialisations - -Wno-implicit-prelude - -Wno-missing-import-lists - -Wno-missing-kind-signatures - -Wno-missing-safe-haskell-mode - -Wno-monomorphism-restriction - -Wno-type-defaults - -Wno-unsafe diff --git a/pkg/PhyloLib/README.md b/pkg/PhyloLib/README.md deleted file mode 120000 index 2c761ea48..000000000 --- a/pkg/PhyloLib/README.md +++ /dev/null @@ -1 +0,0 @@ -../../doc/ReadMe/PhyloLib.md \ No newline at end of file diff --git a/pkg/PhyloLib/src/Cyclic.hs b/pkg/PhyloLib/src/Cyclic.hs deleted file mode 100644 index a1b6520cf..000000000 --- a/pkg/PhyloLib/src/Cyclic.hs +++ /dev/null @@ -1,79 +0,0 @@ -{- | -Module : Cyclic.hs -Description : Topological sorting involves removing nodes from the graph into a list to - determine the order they can appear (let nodes be tasks and edges be - constraints). This algorithm can only work on Directed Acyclic Graphs. In - this variation we do not save the nodes in an order, but if we cannot - remove all nodes from the graph then a topological sort isn't possible - implying the graph has a cycle. In a topological sort, leaf nodes (nodes - without a successor) would be last in the ordering. If we remove a leaf - and the edges that connet to it from a graph, then another leaf must - remain. If no other leaves remain, then the graph cannot be topologically - sorted which indicates the existence of a cycle. - - Source : https://gist.github.com/msanatan/7933189#file-cyclic-hs --} - -module Cyclic -(leafNode, -hasLeaf, -delLeaf, -cyclic -) where - -import Data.Graph.Inductive - -{- Topological sorting involves removing nodes from the graph into a list to - determine the order they can appear (let nodes be tasks and edges be - constraints). This algorithm can only work on Directed Acyclic Graphs. In - this variation we do not save the nodes in an order, but if we cannot - remove all nodes from the graph then a topological sort isn't possible - implying the graph has a cycle. In a topological sort, leaf nodes (nodes - without a successor) would be last in the ordering. If we remove a leaf - and the edges that connet to it from a graph, then another leaf must - remain. If no other leaves remain, then the graph cannot be topologically - sorted which indicates the existence of a cycle. --} - - -{- This method determines whether a nodes is a leaf. It receives a graph and a - node and returns a boolean with True indicating it's a leaf node --} -leafNode :: (DynGraph g) => g a b -> Node -> Bool -leafNode gr node = suc gr node == [] - - -{- This method determines whether the graph has a leaf. This is done by - testing the leaf_node condition on every node of the graph It receives a - graph and returns a boolean with True indicating there is a leaf node --} -hasLeaf :: (DynGraph g) => g a b -> Bool -hasLeaf gr = checkNodes . nodes $ gr - where - checkNodes :: [Node] -> Bool - checkNodes [] = False - checkNodes (x:xs) | leafNode gr x = True - | otherwise = checkNodes(xs) - - -{- This method deletes a leaf node from a graph. The edges connecting to the - leaf node are deleted as well. --} -delLeaf :: (DynGraph g) => g a b -> g a b -delLeaf gr = delNode leaf gr' - where - leaf = head [x | x <- nodes gr, leafNode gr x] - gr' = delEdges newEdges gr - newLedges = inn gr leaf - newEdges = [(x,y) | (x,y,_) <- newLedges] - - -{- This method indicates whether a given graph has a cycle or not. If it does - True is returned, False otherwise. If the graph has no nodes it's acyclic, - if it has no leaf nodes then cyclic. If it does have leaf nodes, remove it - and check again --} -cyclic :: (DynGraph g) => g a b -> Bool -cyclic gr | isEmpty gr = False - | not $ hasLeaf gr = True - | otherwise = cyclic $ delLeaf gr \ No newline at end of file diff --git a/pkg/PhyloLib/src/GeneralUtilities.hs b/pkg/PhyloLib/src/GeneralUtilities.hs deleted file mode 100644 index 11e351e37..000000000 --- a/pkg/PhyloLib/src/GeneralUtilities.hs +++ /dev/null @@ -1,549 +0,0 @@ -{- | -Module : GeneralUtilities.hs -Description : Module with useful functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# Language BangPatterns #-} -{-# Language ImportQualifiedPost #-} -{-# Language ScopedTypeVariables #-} - -module GeneralUtilities - ( module GeneralUtilities - ) where - -import Data.Array -import Data.Text qualified as T -import Data.Text.Lazy qualified as TL -import System.Random -import Data.Array.IO -import Data.Foldable -import Control.Monad -import Control.DeepSeq -import Data.Time -import Data.Time.Clock.POSIX -import System.IO.Unsafe -import Text.Read -import Data.Maybe -import Data.Bits -import Data.BitVector.LittleEndian qualified as BV -import Data.List qualified as L -import Data.Vector qualified as V -import Data.Word -import Data.Char -import Numeric - --- imports for traceNoLFIO -import Foreign.C.String -import Data.List (partition) - - --- | traceNOLF is trace modified from Debug/Trace to not have --- a line feed (\n) after message -traceNoLF :: String -> a -> a -traceNoLF string expr = unsafePerformIO $ do - traceNoLFIO string - return expr - --- | traceNOLFIO is traceIO modified from Debug/Trace to not have --- a line feed (\n) after message -traceNoLFIO :: String -> IO () -traceNoLFIO msg = - withCString "%s" $ \cfmt -> do - -- NB: debugBelch can't deal with null bytes, so filter them - -- out so we don't accidentally truncate the message. See #9395 - let (nulls, msg') = partition (== '\0') msg - withCString msg' $ \cmsg -> - debugBelch cfmt cmsg - when (not (null nulls)) $ - withCString "WARNING: previous trace message had null bytes" $ \cmsg -> - debugBelch cfmt cmsg - --- don't use debugBelch() directly, because we cannot call varargs functions --- using the FFI. -foreign import ccall unsafe "HsBase.h debugBelch2" - debugBelch :: CString -> CString -> IO () - --- | functions for triples, quadruples -fst3 :: (a,b,c) -> a -fst3 (d,_,_) = d - -snd3 :: (a,b,c) -> b -snd3 (_,e,_) = e - -thd3 :: (a,b,c) -> c -thd3 (_,_,e) = e - -fst4 :: (a,b,c,d) -> a -fst4 (e,_,_,_) = e - -snd4 :: (a,b,c,d) -> b -snd4 (_,e,_,_) = e - -thd4 :: (a,b,c,d) -> c -thd4 (_,_,e,_) = e - -fth4 :: (a,b,c,d) -> d -fth4 (_,_,_,f) = f - -fst5 :: (a,b,c,d,e) -> a -fst5 (e,_,_,_,_) = e - -snd5 :: (a,b,c,d,e) -> b -snd5 (_,e,_,_,_) = e - -thd5 :: (a,b,c,d,e)-> c -thd5 (_,_,e,_,_) = e - -fth5 :: (a,b,c,d,e) -> d -fth5 (_,_,_,e,_) = e - -fft5 :: (a,b,c,d,e) -> e -fft5 (_,_,_,_,e) = e - -fst6 :: (a,b,c,d,e,f) -> a -fst6 (e,_,_,_,_,_) = e - -snd6 :: (a,b,c,d,e,f) -> b -snd6 (_,e,_,_,_,_) = e - -thd6 :: (a,b,c,d,e,f)-> c -thd6 (_,_,e,_,_,_) = e - -fth6 :: (a,b,c,d,e,f) -> d -fth6 (_,_,_,e,_,_) = e - -fft6 :: (a,b,c,d,e,f) -> e -fft6 (_,_,_,_,e,_) = e - -six6 :: (a,b,c,d,e,f) -> f -six6 (_,_,_,_,_,e) = e - --- from https://gist.github.com/thealmarty/643c0509dc6e7e4ad6bcd05e7dbb0e44 --- | The 'zipWith8' function takes a function which combines eight --- elements, as well as eight lists and returns a list of their point-wise --- combination, analogous to 'zipWith'. -zipWith8 :: (a->b->c->d->e->f->g->h->i) -> - [a]->[b]->[c]->[d]->[e]->[f]->[g]->[h]->[i] -zipWith8 z (a:as) (b:bs) (c:cs) (d:ds) (e:es) (f:fs) (g:gs) (h:hs) - = z a b c d e f g h : zipWith8 z as bs cs ds es fs gs hs -zipWith8 _ _ _ _ _ _ _ _ _ = [] - --- | The 'zip8' function takes eight lists and returns a list of --- eight-tuples, analogous to 'zip'. -zip8 :: [a] -> [b] -> [c] -> [d] -> [e] -> [f] -> - [g] -> [h] -> [(a,b,c,d,e,f,g,h)] -zip8 = zipWith8 (,,,,,,,) - --- | The 'zipWith9' function takes a function which combines nine --- elements, as well as eight lists and returns a list of their point-wise --- combination, analogous to 'zipWith'. -zipWith9 :: (a->b->c->d->e->f->g->h->i->j) -> - [a]->[b]->[c]->[d]->[e]->[f]->[g]->[h]->[i]->[j] -zipWith9 z (a:as) (b:bs) (c:cs) (d:ds) (e:es) (f:fs) (g:gs) (h:hs) (i:is) - = z a b c d e f g h i : zipWith9 z as bs cs ds es fs gs hs is -zipWith9 _ _ _ _ _ _ _ _ _ _ = [] - --- | The 'zip9' function takes nine lists and returns a list of --- nine-tuples, analogous to 'zip'. -zip9 :: [a] -> [b] -> [c] -> [d] -> [e] -> [f] -> - [g] -> [h] -> [i] ->[(a,b,c,d,e,f,g,h,i)] -zip9 = zipWith9 (,,,,,,,,) - - - --- | showBits cponverts Value to bits as Srting -showBits :: Word64 -> String -showBits inVal = showIntAtBase 2 intToDigit inVal "" - --- | showBitsV shoiw vector of bits -showBitsV :: V.Vector Word64 -> String -showBitsV inValV = concat $ fmap (++ " ") $ V.toList $ fmap showBits inValV - - --- | doubleAsInt takes floor and ceil of Double and retuns Maybe Int --- nothing if not, Just Int if it is -doubleAsInt :: Double -> Maybe Int -doubleAsInt inDouble = - if ceiling inDouble /= floor inDouble then Nothing - else Just (floor inDouble :: Int) - --- | doubleIsInt returns True if Double is an integer -doubleIsInt :: Double -> Bool -doubleIsInt inDouble = - if ceiling inDouble /= floor inDouble then False - else True - --- | doubleIsInt1 returns True if Double is = Integer 1 -doubleIsInt1 :: Double -> Bool -doubleIsInt1 inDouble = - if (ceiling inDouble == 1) && (floor inDouble == 1) then True - else False - --- | editDistance is a naive edit distance between two lists --- takes two lists and returns edit distance ---- from https://wiki.haskell.org/Edit_distance -editDistance :: Eq a => [a] -> [a] -> Int -editDistance xs ys = table ! (m,n) - where - (m,n) = (length xs, length ys) - x = array (1,m) (zip [1..] xs) - y = array (1,n) (zip [1..] ys) - - table :: Array (Int,Int) Int - table = array bnds [(ij, dist ij) | ij <- range bnds] - bnds = ((0,0),(m,n)) - - dist (0,j) = j - dist (i,0) = i - dist (i,j) = minimum [table ! (i-1,j) + 1, table ! (i,j-1) + 1, - if x ! i == y ! j then table ! (i-1,j-1) else 1 + table ! (i-1,j-1)] - --- | checkCommandArgs takes comamnd and args and verifies that they are in list -checkCommandArgs :: String -> [String] -> [String] -> Bool -checkCommandArgs commandString commandList permittedList = - null commandList || ( - let firstCommand = head commandList - foundCommand = firstCommand `elem` permittedList - in - if foundCommand then checkCommandArgs commandString (tail commandList) permittedList - else - let errorMatch = snd $ getBestMatch (maxBound :: Int ,"no suggestion") permittedList firstCommand - in errorWithoutStackTrace $ fold - [ "\nError: Unrecognized '" - , commandString - , "' option. By '" - , firstCommand - , "' did you mean '" - , errorMatch - , "'?\n" - ] ) - - --- | getBestMatch compares input to allowable commands and checks if in list and if not outputs --- closest match --- call with (maxBound :: Int ,"no suggestion") commandList inString -getBestMatch :: (Int, String) -> [String] -> String -> (Int, String) -getBestMatch curBest@(minDist, _) allowedStrings inString = - if null allowedStrings then curBest - else - let candidate = head allowedStrings - candidateEditCost = editDistance candidate inString - in - if candidateEditCost == 0 then (0, candidate) - else if candidateEditCost < minDist then getBestMatch (candidateEditCost, candidate) (tail allowedStrings) inString - else getBestMatch curBest (tail allowedStrings) inString - --- | getCommandErrorString takes list of non zero edits to allowed commands and reurns meaningful error string -getCommandErrorString :: [(Int, String, String)] -> String -getCommandErrorString noMatchList = - if null noMatchList then "" - else - let (_, firstCommand, firstMatch) = head noMatchList - firstError = "\tBy \'" ++ firstCommand ++ "\' did you mean \'" ++ firstMatch ++ "\'?\n" - in - firstError ++ getCommandErrorString (tail noMatchList) - --- | isSequentialSubsequence takes two lists and determines if the first List is --- a subsequence of the second but the elements must be sequencetial unlike --- isSubsequenceOf in Data.List --- Uses Text.filter to see if there is a match ---isSequentialSubsequence :: (Eq a) => [a] -> [a] -> Bool -isSequentialSubsequence :: String -> String -> Bool -isSequentialSubsequence firstL secondL - | null firstL = False - | length firstL > length secondL = False - | otherwise = - let foundNumber = T.count (T.pack firstL) (T.pack secondL) - in - foundNumber /= 0 - --- | shuffle Randomly shuffles a list --- /O(N)/ --- from https://wiki.haskell.org/Random_shuffle -shuffleIO :: [a] -> IO [a] -shuffleIO xs = do - ar <- newArrayLocal n xs - forM [1..n] $ \i -> do - j <- randomRIO (i,n) - vi <- readArray ar i - vj <- readArray ar j - writeArray ar j vi - return vj - where - n = length xs - newArrayLocal :: Int -> [a] -> IO (IOArray Int a) - newArrayLocal nL xsL = newListArray (1,nL) xsL - --- | shuffleInt takes a seed, number of replicates and a list of Ints and --- repeately shuffles the order -shuffleInt :: Int -> Int -> [Int] -> [[Int]] -shuffleInt seed numReplicates inIntList = - if null inIntList then [] - else if numReplicates < 1 then [] - else - let randList = take (length inIntList) $ randomIntList seed - pairList = L.sortOn fst $ zip randList inIntList - (_, newList) = unzip pairList - in - newList : shuffleInt (seed + 1) (numReplicates - 1) inIntList - - -{-# NOINLINE randomList #-} --- | randomList generates an infinite random list from a seed--no IO or ST monad --- but needs a good seed--perhaps system tiem --- can cast to to other types like :: [Int] -randomList :: Int -> [Double] -randomList seed = randoms (mkStdGen seed) :: [Double] - -{-# NOINLINE randomIntList #-} --- | randomIntList generates an infinite random list of Ints -randomIntList :: Int -> [Int] -randomIntList seed = randoms (mkStdGen seed) :: [Int] - -{-# NOINLINE permuteList #-} --- | permuteList ranomzes list order with seed -permuteList :: Int -> [a] -> [a] -permuteList rSeed inList = - if null inList then [] - else if length inList == 1 then inList - else - fst $ unzip $ L.sortOn snd $ zip inList (randomIntList rSeed) - --- | takeRandom premutes a list and takes a number b ased on sed aned number to take -takeRandom :: Int -> Int -> [a] -> [a] -takeRandom rSeed number inList = - if null inList then [] - else if number >= length inList then inList - else - L.take number $ permuteList rSeed inList - --- | takeNth takes n elments (each nth) of a list of length m -takeNth :: Int -> [a] -> [a] -takeNth number inList = - if null inList then [] - else if number == 0 then [] - else if number == 1 then [head inList] - else if number >= length inList then inList - else - let (value, _) = divMod (length inList) number - indexList = [0..(length inList - 1)] - (_, remList) = unzip $ zipWith divMod indexList (L.replicate (length inList) value) - (outList, _) = unzip $ filter ((== 1) . snd) $ zip inList remList - in - take number outList - --- | getRandomElement returns the nth random element uniformly --- at random -getRandomElement :: Int -> [a] -> a -getRandomElement rVal inList = - if null inList then error "Null list in getRandomElement" - else if length inList == 1 then head inList - else - let (_, idx) = divMod (abs rVal) (length inList) - in - inList !! idx - --- | selectListCostPairs is general to list of (a, Double) --- but here used for graph sorting and selecting)takes a pair of graph representation (such as String or fgl graph), and --- a Double cost and returns the whole of number of 'best', 'unique' or 'random' cost --- need an Eq function such as '==' for Strings or equal for fgl --- assumes options are all lower case --- options are pairs of String and number for number or graphs to keeep, if number is set to (-1) then all are kept --- if the numToKeep to return graphs is lower than number of graphs, the "best" number are returned --- except for random. -selectListCostPairs :: forall a . (a -> a -> Bool) -> [(a, Double)] -> [String] -> Int -> Int -> [(a, Double)] -selectListCostPairs compFun pairList optionList numToKeep seed = - if null optionList then error "No options specified for selectGraphCostPairs" - else if null pairList then [] - else - let firstPass = - let compFunPair :: forall b c. (a, b) -> (a, c) -> Bool - compFunPair x = compFun (fst x) . fst - in if ("unique" `elem` optionList) - then L.nubBy compFunPair pairList - else pairList - secondPass - | ("best" `elem` optionList) = reverse $ L.sortOn snd firstPass - | otherwise = firstPass - in - if ("random" `notElem` optionList) then take numToKeep secondPass - else -- shuffling with hash of structure as seed (not the best but simple for here) - let randList = randomList seed - pairListWRand = zip randList secondPass - thirdPass = fmap snd $ L.sortOn fst pairListWRand - - in take numToKeep thirdPass - - --- | getSystemTimeSeconds gets teh syste time and returns IO Int -getSystemTimeSeconds :: IO Int -getSystemTimeSeconds = do - systemTime <- getCurrentTime - let timeD = (round $ utcTimeToPOSIXSeconds systemTime) :: Int - return timeD - -{-# NOINLINE getSystemTimeNDT #-} --- | getSystemTimeNDT gets the syste time and returns IO NominalDiffTime -getSystemTimeNDT :: IO NominalDiffTime -getSystemTimeNDT = do - systemTime <- getCurrentTime - let !timeD = utcTimeToPOSIXSeconds systemTime - return timeD - -{-# NOINLINE getSystemTimeNDTUnsafe #-} --- | getSystemTimeNDTUnsafe gets the syste time and returns IO NominalDiffTime -getSystemTimeNDTUnsafe :: NominalDiffTime -getSystemTimeNDTUnsafe = unsafePerformIO getSystemTimeNDT - - -{-# NOINLINE getSystemTimeSecondsUnsafe #-} --- | getSystemTimeSecondsUnsafe gets the system time and returns Int via unsafePerformIO --- without the NOINLINE the function would probbaly be comverted to a --- constant which would be "safe" and OK as a random seed or if only called once -getSystemTimeSecondsUnsafe :: Int -getSystemTimeSecondsUnsafe = unsafePerformIO $ force <$> getSystemTimeSeconds - --- | stringToInt converts a String to an Int -stringToInt :: String -> String -> Int -stringToInt fileName inStr = - let result = readMaybe inStr :: Maybe Int - in - if result == Nothing then errorWithoutStackTrace ("\n\n'Read' 'tcm' format error non-Integer value " ++ inStr ++ " in " ++ fileName) - else fromJust result - --- | makeIndexPairs takes n and creates upper triangular matrix pairs (0,m) -makeIndexPairs :: Bool -> Int -> Int -> Int -> Int -> [(Int, Int)] -makeIndexPairs doDiagValues numI numJ indexI indexJ = - if indexI == numI then [] - else if indexJ == numJ then makeIndexPairs doDiagValues numI numJ (indexI + 1) 0 - else - if doDiagValues && (indexI == indexJ) then (indexI, indexJ) : makeIndexPairs doDiagValues numI numJ indexI (indexJ + 1) - else if (indexI < indexJ) then (indexI, indexJ) : makeIndexPairs doDiagValues numI numJ indexI (indexJ + 1) - else makeIndexPairs doDiagValues numI numJ indexI (indexJ + 1) - --- | stripString removes leading and trailing spaces from String --- akin to Text 'strip' -stripString :: String -> String -stripString inString = - if null inString then inString - else - let firstS = dropWhile (== ' ') inString - secondS = dropWhile (== ' ') $ reverse firstS - in - reverse secondS - --- | replaceVal replaces first value with second value e.g. carriage return '\r' with line newlinme '\n' --- call with [] accumulator -replaceVal :: (Eq a) => a -> a -> [a] -> [a] -> [a] -replaceVal target replacement inList curList = - if null inList then reverse curList - else - let firstVal = head inList - in - if firstVal == target then replaceVal target replacement (tail inList) (replacement : curList) - else replaceVal target replacement (tail inList) (firstVal : curList) - - --- | cartProd takes two lists and retuns carteian product as list of pairs -cartProd :: [a] -> [b] -> [(a,b)] -cartProd xs ys = [(x,y) | x <- xs, y <- ys] - - --- | cartProdPair takes a pair of lists and retuns carteian product as list of pairs -cartProdPair :: ([a], [b]) -> [(a,b)] -cartProdPair (xs, ys) = [(x,y) | x <- xs, y <- ys] - - --- | isCompatible takes a bit vector and a list of bit vectors --- and returns True if the fist bit vector is compatible will all in the list -isBVCompatible :: BV.BitVector -> [BV.BitVector] -> Bool -isBVCompatible inBV bvList = - if null bvList then True - else - let firstBV = head bvList - bvVal = inBV .&. firstBV - in - if bvVal == inBV then isBVCompatible inBV (tail bvList) - else if bvVal == firstBV then isBVCompatible inBV (tail bvList) - else False - --- | textMatchWildcards takes two Text's first may have wildcards and second without --- return True if they match, False otherwise. -textMatchWildcards :: TL.Text -> TL.Text -> Bool -textMatchWildcards straightText wildText = - if TL.null wildText && TL.null straightText then - True - else if TL.null wildText then - False - else if ((TL.head wildText == '*') && (TL.length wildText == 1)) then - True - else if TL.null straightText then - False - else if ((TL.head wildText == '*') && ((TL.length $ TL.dropWhile (== '*') wildText ) > 0)) && (TL.null straightText) then - False - else if (TL.head wildText == '?') || (TL.head wildText == TL.head straightText) then - textMatchWildcards (TL.tail straightText) (TL.tail wildText) - else if (TL.head wildText == '*') then - (textMatchWildcards (TL.tail straightText) wildText) || (textMatchWildcards straightText (TL.tail wildText)) - else - False - --- | elemWildards checks if a Text matches (without wildcards) at least one element of a List of Wildcard Text -elemWildcards :: TL.Text -> [TL.Text] -> Bool -elemWildcards straightText wildTextList = - if null wildTextList then False - else - if textMatchWildcards straightText (head wildTextList) then True - else elemWildcards straightText (tail wildTextList) - - --- | notElemWildcards checks if a Text matches (without wildcards) no elements of a List of Wildcard Text -notElemWildcards :: TL.Text -> [TL.Text] -> Bool -notElemWildcards straightText wildTextList = - if null wildTextList then True - else - if textMatchWildcards straightText (head wildTextList) then False - else notElemWildcards straightText (tail wildTextList) - --- | getListPairs takes a list and returns all unique pairs of elements --- order is (first found in list, second found in list) -getListPairs :: [a] -> [(a,a)] -getListPairs inList = - if null inList then [] - else - let firstElem = head inList - firstPairs = zip (replicate (length $ tail inList) firstElem) (tail inList) - in - firstPairs ++ (getListPairs (tail inList)) - diff --git a/pkg/PhyloLib/src/GraphFormatUtilities.hs b/pkg/PhyloLib/src/GraphFormatUtilities.hs deleted file mode 100644 index 8b44ecad7..000000000 --- a/pkg/PhyloLib/src/GraphFormatUtilities.hs +++ /dev/null @@ -1,1182 +0,0 @@ -{- | -Module : GraphFormatUtilities.hs -Description : module witb interconversion functions for commonly used phylogentic graph formats (newick. dot, fgl) - graphs parsed to fgl types. -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - - -Forest Extended Newick defined here as a series of ENewick representations -within '<' ans '>'. Nodes can be shared among consituent ENewick representations -(';' from enewick itself, just for illustration, not doubled) - - - -ExtendedNewick from Cardona et al. 2008. BMC Bioinformatics 2008:9:532 - - The labels and comments are as in Olsen Newick formalization below, except - that underscores in unquoted label are NOT converted to spaces and quoted labels - ar left as is with quotes and all. - Other elements as in Cardona et all ENewick. - - - -Gary Olsen's Interpretation of the "Newick's 8:45" Tree Format Standard -https://evolution.genetics.washington.edu/phylip/newick_doc.html - -Conventions: - Items in { } may appear zero or more times. - Items in [ ] are optional, they may appear once or not at all. - All other punctuation marks (colon, semicolon, parentheses, comma and - single quote) are required parts of the format. - - tree ==> descendant_list [ root_label ] [ : branch_length ] ; - - descendant_list ==> ( subtree { , subtree } ) - - subtree ==> descendant_list [internal_node_label] [: branch_length] - ==> leaf_label [: branch_length] - - root_label ==> label - internal_node_label ==> label - leaf_label ==> label - - label ==> unquoted_label - ==> quoted_label - - unquoted_label ==> string_of_printing_characters - quoted_label ==> ' string_of_printing_characters ' - - branch_length ==> signed_number - ==> unsigned_number - -Notes: - Unquoted labels may not contain blanks, parentheses, square brackets, - single_quotes, colons, semicolons, or commas. - Underscore characters in unquoted labels are converted to blanks. - Single quote characters in a quoted label are represented by two single - quotes. - Blanks or tabs may appear anywhere except within unquoted labels or - branch_lengths. - Newlines may appear anywhere except within labels or branch_lengths. - Comments are enclosed in square brackets and may appear anywhere - newlines are permitted. - -Other notes: - PAUP (David Swofford) allows nesting of comments. - TreeAlign (Jotun Hein) writes a root node branch length (with a value of - 0.0). - PHYLIP (Joseph Felsenstein) requires that an unrooted tree begin with a - trifurcation; it will not "uproot" a rooted tree. - -Example: - (((One:0.2,Two:0.3):0.3,(Three:0.5,Four:0.3):0.2):0.3,Five:0.7):0.0; - - +-+ One - +--+ - | +--+ Two - +--+ - | | +----+ Three - | +-+ - | +--+ Four - + - +------+ Five ---} - -{-# Language ImportQualifiedPost #-} - -{- -ToDo: - Need to verify FEN output and input with Dendroscope (errors/non-parse reported) --} - -module GraphFormatUtilities - ( forestEnhancedNewickStringList2FGLList - , fglList2ForestEnhancedNewickString - , component2Newick - , checkIfLeaf - , stringGraph2TextGraph - , textGraph2StringGraph - , stringGraph2TextGraphDouble - , showGraph - , relabelFGL - , convertGraphToStrictText - , splitVertexList - , relabelFGLEdgesDouble - , getDistToRoot - , fgl2DotString - , modifyVertexEdgeLabels - , relabelGraphLeaves - , checkGraphsAndData - , cyclic - , reIndexLeavesEdges - ) where - -import Control.Parallel.Strategies -import Data.Char (isSpace) -import Data.Graph.Inductive.Graph qualified as G -import Data.Graph.Inductive.PatriciaTree qualified as P -import Data.List qualified as L -import Data.Map.Strict qualified as Map -import Data.Maybe -import Data.Text qualified as StrictT -import Data.Text.Lazy qualified as T -import Data.GraphViz qualified as GV -import Data.GraphViz.Attributes.Complete (Attribute (Label), Attributes, Label (..)) -import Data.GraphViz.Printing qualified as GVP -import Data.Monoid -import GeneralUtilities -import ParallelUtilities -import Cyclic qualified as C - - ---import qualified Data.Graph.Analysis as GAC currently doesn't compile (wanted to use for cycles) ---import Debug.Trace - - --- | showGraph a semi-formatted show for Graphs -showGraph :: (Show a, Show b) => P.Gr a b -> String -- BV.BV (BV.BV, BV.BV) -> String -showGraph inGraph = - if G.isEmpty inGraph then "Empty Graph" - else - let nodeString = show $ G.labNodes inGraph - edgeString = show $ G.labEdges inGraph - in - ("Nodes:" <> nodeString <> "\n" <> "Edges: " <> edgeString) - --- | getForestEnhancedNewickList takes String file contents and returns a list --- of fgl graphs with Text labels for nodes and edges or error if not ForestEnhancedNewick or Newick formats. -forestEnhancedNewickStringList2FGLList :: T.Text -> [P.Gr T.Text Double] -forestEnhancedNewickStringList2FGLList fileText = - if T.null fileText then [] - else - let feNewickList = fmap (removeNewickSpaces . removeNewickComments) (divideGraphText fileText) - in - -- trace ("There are " <> (show $ length feNewickList) <> " graphs to convert: " <> (show feNewickList)) - parmap rdeepseq text2FGLGraph feNewickList - --- | divideGraphText splits multiple Text representations of graphs (Newick styles) --- and returns a list of Text graph descriptions -divideGraphText :: T.Text -> [T.Text] -divideGraphText inText = - if T.null inText then [] - else - let firstChar = T.head inText - in - if firstChar == '<' then - let firstPart = T.snoc (T.takeWhile (/= '>') inText) '>' - restPart = T.tail $ T.dropWhile (/= '>') inText - in - firstPart : divideGraphText restPart - else if firstChar == '(' then - let firstPart = T.snoc (T.takeWhile (/= ';') inText) ';' - restPart = T.tail $ T.dropWhile (/= ';') inText - in - firstPart : divideGraphText restPart - else error ("First character in graph representation " <> T.unpack inText <> " : " <> show firstChar <> " is not either '<' or '('") - --- | removeNewickComments take text and removes all "[...]" -removeNewickComments :: T.Text -> T.Text -removeNewickComments inString - | T.null inString = T.empty - | not (T.any (== ']') inString) = inString - | otherwise = - let firstPart = T.takeWhile (/= '[') inString - secondPart = T.tail $ T.dropWhile (/= ']') inString - in - T.append firstPart (removeNewickComments secondPart) - --- | convertQuotedText takes single quoted Text and removes single quotes and converts spaces --- to underscores -convertQuotedText :: T.Text -> (T.Text, T.Text) -convertQuotedText inText = - if T.null inText then error "Emmpty Text in convertQuotedText" - else - let firstPart = T.replace (T.singleton ' ') (T.singleton '_') $ T.takeWhile (/= '\'') (T.tail inText) - restPart = T.tail $ T.dropWhile (/= '\'') (T.tail inText) - in - (firstPart, restPart) - --- | removeNewickSpaces removes spaces and converts single quoted strings --- with spaces to unquoted strings with underscores replacing spaces: ' blah bleh ' => blah_bleh -removeNewickSpaces :: T.Text -> T.Text -removeNewickSpaces inText = - if T.null inText then T.empty - else - let firstChar = T.head inText - in - if firstChar == '\'' then - let (newText, restText) = convertQuotedText inText - in - T.concat [newText, removeNewickSpaces restText] - else if isSpace firstChar then removeNewickSpaces $ T.tail inText - else T.cons firstChar (removeNewickSpaces $ T.tail inText) - --- | text2FGLGraph takes Text of newick (forest or enhanced or OG) and --- retns fgl graph representation -text2FGLGraph :: T.Text -> P.Gr T.Text Double -text2FGLGraph inGraphText = - if T.null inGraphText then error "Empty graph text in text2FGLGraph" - else - let firstChar = T.head inGraphText - lastChar = T.last inGraphText - in - if firstChar == '<' && lastChar == '>' then fENewick2FGL inGraphText -- getFENewick inGraphText - else if firstChar == '(' && lastChar == ';' then mergeNetNodesAndEdges $ makeGraphFromPairList $ eNewick2FGL [] [] [(inGraphText, (-1, T.empty))] - else error "Graph text not in ForestEnhancedNewick or (Enhanced)Newick format" - - --- | fENewick2FGL takes a Forest Extended Newick (Text) string and returns FGL graph --- breaks up forest and parses seprate eNewicks then modifes for any --- common network nodes in the sub-graphs -fENewick2FGL :: T.Text -> P.Gr T.Text Double -fENewick2FGL inText = - if T.null inText then error "Empty graph text in fENewick2FGL" - else - -- split eNewicks - let eNewickTextList = splitForest inText - startNodeList = replicate (length eNewickTextList) (-1, T.empty) - textNodeList = zip eNewickTextList startNodeList - -- init to remove trailing ';' from eNewick - eNewickGraphList = fmap ((mergeNetNodesAndEdges . makeGraphFromPairList) . (eNewick2FGL [] [] . (:[]))) textNodeList - in - if length eNewickGraphList == 1 then head eNewickGraphList - else - -- merge graphs then merge network nodes and edges edges - let fENewickGraph = mergeNetNodesAndEdges $ mergeFGLGraphs G.empty eNewickGraphList - in - fENewickGraph - --- | splitForest takes a Text (string) Forest Enhanced Newick representation and splits into --- its consituent Extended Newick representations -splitForest :: T.Text -> [T.Text] -splitForest inText - | T.null inText = [] - | (T.head inText /= '<') || (T.last inText /= '>') = error ("Invalid Forest Extended Newick representation," <> - " must begin with \'<\'' and end with \'>\' : " <> T.unpack inText) - | otherwise = - let partsList = filter (not.T.null) $ T.splitOn (T.singleton ';') (T.init $ T.tail inText) - eNewickList = fmap (`T.append` T.singleton ';') partsList - in - eNewickList - --- | makeGraphFromPairList takes pair of node list and edge list and returns Graph --- | filters to remove place holder node and edges creted during eNewick pass -makeGraphFromPairList :: [(G.LNode T.Text,G.LEdge Double)] -> P.Gr T.Text Double -makeGraphFromPairList pairList = - if null pairList then G.empty - else - let (nodeList, edgeList) = unzip pairList - in - G.mkGraph (filter ((> (-1)).fst) nodeList) (filter ((> (-1)).fst3) edgeList) - --- | getBranchLength extracts branch length from Text label and puts in '1' if there is no --- branch length--makes sure after last ')' -getBranchLength :: T.Text -> Double -getBranchLength inText = - --trace ("Getting branch length of " <> show inText) ( - if T.null inText then error "Null text in getBranchLength" - else - let a = T.dropWhile (/= ':') $ T.reverse $ T.takeWhile (/= ')') $ T.reverse inText - in - if T.null a then 1 - else if T.length a == 1 then error "Need branch length after \':\')" - else (read (T.unpack $ T.tail a) :: Double) - --) - --- | getNodeLabel get--or makes--a label for a node --- after last ')' before any ':', without ',' after last ')' -getNodeLabel :: Int -> T.Text -> T.Text -getNodeLabel nodeNumber inText = - --trace ("Getting node label of " <> show inText) ( - if T.null inText then error "Null text in getNodeLabel" - else - let a = T.takeWhile (/= ':') $ T.reverse $ T.takeWhile (/= ')') $ T.reverse inText - in - if T.any (== ',') a || T.null a then T.append (T.pack "HTU") (T.pack $ show nodeNumber) else a - --) - --- | getLeafInfo takes Text of teminal (no ',') and parses to yeild --- either a single leaf label, edge, and edge weight, or two --- leaves with labels and costs if there is a network node as parent --- need to merge network nodes later -getLeafInfo :: T.Text -> G.LNode T.Text -> [G.LNode T.Text] -> [(G.LNode T.Text,G.LEdge Double)] -getLeafInfo leafText parentNode nodeList - | T.null leafText = error "Empty leaf text in getLeafInfo" - | not (T.any (== '(') leafText) = - let leafLabel = T.takeWhile (/= ':') leafText - edgeWeight = getBranchLength leafText - -- CHECK FOR EXISTING - thisNode = (length nodeList, leafLabel) - thisEdge = (fst parentNode, length nodeList, edgeWeight) - --preexistingNode = checkForExistingNode leafLabel nodeList - in - [(thisNode, thisEdge)] - | otherwise = - let -- leaf parent info - -- (leafLabel)leafParentLabel:leafParentBranchLength - leafParentEdgeWeight = getBranchLength leafText - leafParentLabel = getNodeLabel (length nodeList) leafText - leafParentNode = (length nodeList, leafParentLabel) - leafParentEdge = (fst parentNode, fst leafParentNode, leafParentEdgeWeight) - - -- leaf info - -- (leafLabel)X#H:000 => leafLabel - leafLabelText = T.takeWhile (/= ')') $ T.tail leafText - -- check for existing - leafLabel = T.takeWhile (/= ':') leafLabelText - leafEdgeWeight = getBranchLength leafLabelText - leafNode = (1 + length nodeList, leafLabel) - leafEdge = (fst leafParentNode, fst leafNode, leafEdgeWeight) - in - [(leafNode, leafEdge),(leafParentNode, leafParentEdge)] - --- | getBodyParts takes a Text of a subTree and splits out the group description '(blah)', any node label --- and any branch length -getBodyParts :: T.Text -> Int -> (T.Text, T.Text, Double) -getBodyParts inRep nodeNumber = - if T.null inRep then error "No group to parse in getBodyParts" - else - --trace ("In body parts") ( - let subGraphPart = T.reverse $ T.dropWhile (/= ')') $ T.reverse inRep - branchLength = getBranchLength inRep - subGraphLabel = getNodeLabel nodeNumber inRep - in - --trace (show (subGraphPart, subGraphLabel, branchLength)) - (subGraphPart, subGraphLabel, branchLength) - --) - --- | getParenBoundedGraph tkaes a Text String and returns the first graph component --- with balanced parens and remainder of Text -getParenBoundedGraph :: Int -> Int -> T.Text -> T.Text -> (T.Text, T.Text) -getParenBoundedGraph leftParenCounter rightParenCounter curText inText = - --trace ("GB " <> show curText <> " " <> show inText) ( - if T.null inText then (curText, inText) - else - let firstChar = T.head inText - in - if firstChar == '(' then getParenBoundedGraph (leftParenCounter + 1) rightParenCounter (T.snoc curText firstChar) (T.tail inText) - else if firstChar /= ')' then getParenBoundedGraph leftParenCounter rightParenCounter (T.snoc curText firstChar) (T.tail inText) - else -- right paren - if rightParenCounter + 1 == leftParenCounter then -- closing matrched paren - let restOfComponent = T.takeWhile (/= ',') inText - remainderText = T.dropWhile (/= ',') inText - in - (curText `T.append` restOfComponent, remainderText) - else getParenBoundedGraph leftParenCounter (rightParenCounter + 1) (T.snoc curText firstChar) (T.tail inText) - -- ) - --- | getSubComponents takes a Text String and reurns 1 or more subcomponents of graph --- scenarios include leaf, leaf in parens, subgraph in parens -getSubComponents :: T.Text -> [T.Text] -getSubComponents inText - | T.null inText = [] - | T.head inText == ',' = getSubComponents (T.tail inText) - | T.head inText /= '(' = -- simple leaf (no net node labels) - let subGraph = T.takeWhile (/= ',') inText - restGraph = T.dropWhile (/= ',') inText - in - subGraph : getSubComponents restGraph - | otherwise = -- "regular" paren defined element - let (subGraph, restGraph) = getParenBoundedGraph 0 0 T.empty inText - in - subGraph : getSubComponents restGraph - --- | getChildren splits a subGraph Text '(blah, blah)' by commas, removing outer parens -getChildren :: T.Text -> [T.Text] -getChildren inText - | T.null inText = [] - | (T.head inText /= '(') || (T.last inText /= ')') = error ("Invalid Extended Newick component," <> - " must begin with \'(\'' and end with \')\' : " <> T.unpack inText <> "\nPerhaps missing commas ',' in F/E/Newick format?") - | otherwise = - -- modify for indegree 1 outdegree 1 and print warning. - let guts = T.init $ T.tail inText -- removes leading and training parens - subComponents = filter (not.T.null) $ getSubComponents guts - in - subComponents - --- | checkForExistingNode takes a node label and checs the node list for the first --- node with the same label and returns a Maybe node, else Nothing -checkForExistingNode :: T.Text -> [G.LNode T.Text] -> Maybe (G.LNode T.Text) -checkForExistingNode nodeLabel nodeList = - if null nodeList then Nothing - else - let matchList = filter ((== nodeLabel) . snd) nodeList - in - if null matchList then Nothing - else Just $ head matchList - --- | checkIfLeaf checks text to see if leaf. --- if the number of left parens is 1 and right parens 1 and no ',' then leaf --- if no left parens and no right paren then leaf --- then its a leaf --- either "bleh", "bleh:00", or "(bleh)label:00" -checkIfLeaf :: T.Text -> Bool -checkIfLeaf inText = - if T.null inText then error "Null text to check if leaf in checkIfLeaf" - else - let leftParenCount = T.count (T.singleton '(') inText - rightParenCount = T.count (T.singleton ')') inText - commaCount = T.count (T.singleton ',') inText - in - ((leftParenCount == 0) && (rightParenCount == 0) && (commaCount == 0)) || (if (leftParenCount == 0) && (rightParenCount == 0) && (commaCount > 0) then error ("Comma within leaf label" <> show inText) - else (leftParenCount == 1) && (rightParenCount == 1) && (commaCount == 0)) - --- | eNewick2FGL takes a single Extended Newick (Text) string and returns FGL graph --- allows arbitrary in and out degree except for root and leaves -eNewick2FGL :: [G.LNode T.Text] -> [G.LEdge Double] -> [(T.Text, G.LNode T.Text)] -> [(G.LNode T.Text,G.LEdge Double)] -eNewick2FGL nodeList edgeList inTextParentList = - if null inTextParentList then [] - else - let inTextFirst = fst $ head inTextParentList - parentNode = snd $ head inTextParentList - isRoot = null nodeList - in - -- see if initial call and check format - if isRoot && ((T.head inTextFirst /= '(') || (T.last inTextFirst /= ';')) then error ("Invalid Extended Newick component," <> - " must begin with \'(\'' and end with \')\' : " <> T.unpack inTextFirst) - -- not first call and/or format OK - else - let inText = if isRoot then T.takeWhile (/= ';') inTextFirst else inTextFirst -- remove trailing ';' if first (a bit wasteful--but intial check on format) - isLeaf = checkIfLeaf inText - in - -- trace ("Parsing " <> show inText <> " from parent " <> show parentNode <> " " <> show isLeaf)( - -- is a single leaf - -- need better could be series of indegree `1 outdegree 1 nodes to a single leaf with no ',' - -- ike (a(b(c(d)))) - if isLeaf then - -- parse label ala Gary Olsen formalization - -- since could have reticulate label yeilding two edges and two nodes - -- Cardona et al 2008 Extended Newick - let newLeafList = getLeafInfo inText parentNode nodeList - newNodeList = fmap fst newLeafList - newEdgeList = fmap snd newLeafList - in - newLeafList <> eNewick2FGL (newNodeList <> nodeList) (newEdgeList <> edgeList) (tail inTextParentList) - else - -- is subtree assumes start and end with parens '(blah)' - let (subTree, nodeLabel, edgeWeight) = getBodyParts inText (length nodeList) - thisNode = (length nodeList, nodeLabel) - thisEdge = (fst parentNode, length nodeList, edgeWeight) - childTextList = getChildren subTree - parentNodeList = replicate (length childTextList) thisNode - childParentList = zip childTextList parentNodeList - - in - (thisNode, thisEdge) : eNewick2FGL (thisNode : nodeList) (thisEdge : edgeList) (childParentList <> tail inTextParentList) - --- | reindexNode takes an offset and adds to the node index --- returning new node -reindexNode :: Int -> G.LNode T.Text -> G.LNode T.Text -reindexNode offSet (index, label) = (index + offSet, label) - --- | reindexEdge takes an offset and adds to the two indices of the edge --- returning the new edge -reindexEdge :: Int -> G.LEdge Double -> G.LEdge Double -reindexEdge offSet (e, u, label) = (e + offSet, u + offSet, label) - --- | mergeFGLGraphs takes multiple graphs and merges --- nodes and edges via reindexing --- just adds progessive offsets from graph node indices as added -mergeFGLGraphs :: P.Gr T.Text Double -> [P.Gr T.Text Double] -> P.Gr T.Text Double -mergeFGLGraphs curGraph inGraphList - | null inGraphList = curGraph - | G.isEmpty curGraph = mergeFGLGraphs (head inGraphList) (tail inGraphList) - | otherwise = - let firstGraph = head inGraphList - firstNodes = G.labNodes firstGraph - firstEdges = G.labEdges firstGraph - curNodes = G.labNodes curGraph - curEdges = G.labEdges curGraph - newNodes = fmap (reindexNode (length curNodes)) firstNodes - newEdges = fmap (reindexEdge (length curNodes)) firstEdges - in - mergeFGLGraphs (G.mkGraph (curNodes <> newNodes) (curEdges <> newEdges)) (tail inGraphList) - --- | getNodeIndexPair take a list of unique nodes and checks successive nodes and --- adds to unique list, also creating a full list of pairs of indicess for non-unique that --- can be used as an index map for edges --- length of unique list to keep the node indices sequential -getNodeIndexPair :: [G.LNode T.Text] -> [(Int, Int)] -> [G.LNode T.Text] -> ([G.LNode T.Text], [(Int, Int)]) -getNodeIndexPair uniqueList pairList nodeToCheckList = - if null nodeToCheckList then (reverse uniqueList, reverse pairList) - else - let firstNode@(index, label) = head nodeToCheckList - matchingNode = checkForExistingNode label uniqueList - in - if isNothing matchingNode then getNodeIndexPair (firstNode : uniqueList) ((index, length uniqueList) : pairList) (tail nodeToCheckList) - else - let existingNode = fromJust matchingNode - newPair = (index, fst existingNode) - in - getNodeIndexPair uniqueList (newPair : pairList) (tail nodeToCheckList) - --- | mergeNetNodesAndEdges takes a single graph and merges --- nodes and edges due to network nodes and edges --- uses checkForExistingNode and creates a map from nodes to reindex edges --- needs to be merged first if graphs are combined--or indices will be wrong -mergeNetNodesAndEdges :: P.Gr T.Text Double -> P.Gr T.Text Double -mergeNetNodesAndEdges inGraph = - --trace (showGraph inGraph) ( - if G.isEmpty inGraph then G.empty - else - let nodeList = G.labNodes inGraph - graphDelta = getMergeNodeEdgeDelta inGraph nodeList - in - - -- nothing to do (no repeated node labels) - if isNothing graphDelta then - -- need to reindex nodes and edges so nodes are sequential - let (_, nodeIndexPairs) = getNodeIndexPair [] [] nodeList - nodeMap = Map.fromList nodeIndexPairs - reindexedNodeList = fmap (reIndexLNode nodeMap) (G.labNodes inGraph) - reIndexedEdgeList = fmap (reIndexLEdge nodeMap) (G.labEdges inGraph) - in - -- to make nodes sequencentioal required later - G.mkGraph reindexedNodeList reIndexedEdgeList - - -- modifications were made to graph - else - let (nodesToDelete, edgesToDelete, edgesToCreate) = fromJust graphDelta - newGraph = G.insEdges edgesToCreate $ G.delNodes (fmap fst nodesToDelete) $ G.delEdges (fmap G.toEdge edgesToDelete) inGraph - in - --trace (showGraph newGraph) - mergeNetNodesAndEdges newGraph - --) - --- | getMergeNodeEdgeDelta takes a graph and list of labelled nodes --- merges the nodes with identical labels, dfeltes all but the first --- and creates new edges to and from first node, deleting the others --- works recursively to update graph to keep node and edge indices in synch --- this is n^2 in number of network nodes--prob could be made linear -getMergeNodeEdgeDelta :: P.Gr T.Text Double -> [G.LNode T.Text] -> Maybe ([G.LNode T.Text], [G.LEdge Double], [G.LEdge Double]) -getMergeNodeEdgeDelta inGraph inNodeList - | G.isEmpty inGraph = error "Empty graph in getMergeNodeEdgeDelta" - | null inNodeList = Nothing - | otherwise = -- Nodes to examine - let curNode = head inNodeList - nodeList = filter ((== snd curNode). snd) inNodeList - in - -- node label not repeated - if length nodeList == 1 then getMergeNodeEdgeDelta inGraph (tail inNodeList) - else -- node label repeated - let nodesToDelete = tail nodeList - inEdgesToDelete = concatMap (G.inn inGraph) (fmap fst nodesToDelete) - outEdgesToDelete = concatMap (G.out inGraph) (fmap fst nodesToDelete) - - -- Make new in-edges - newInEdgeUs = fmap fst3 inEdgesToDelete - newInEdgeLabels = fmap thd3 inEdgesToDelete - newInEdgeVs = replicate (length inEdgesToDelete) (fst curNode) - inEdgesToAdd = zip3 newInEdgeUs newInEdgeVs newInEdgeLabels - - -- make new out-edges - newOutEdgeUs = replicate (length outEdgesToDelete) (fst curNode) - newOutEdgeLabels = fmap thd3 outEdgesToDelete - newOutEdgeVs = fmap snd3 outEdgesToDelete - outEdgesToAdd = zip3 newOutEdgeUs newOutEdgeVs newOutEdgeLabels - - in - Just (nodesToDelete, inEdgesToDelete <> outEdgesToDelete, inEdgesToAdd <> outEdgesToAdd) - - - --- | subTreeSize takes a nodeList and retuns the number of leaves that can be --- traced back to those nodes (for single just pass list of 1 node) --- this used for ordering of groups left (smaller) right (larger) -subTreeSize :: P.Gr a b -> Int -> [G.LNode a] -> Int -subTreeSize inGraph counter nodeList = - if null nodeList then counter - else - let firstNode = head nodeList - children = G.suc inGraph $ fst firstNode - -- assumes all childrfen have a label (if not--problems) - labelList = fmap (fromJust . G.lab inGraph) children - labChildren = zip children labelList - in - subTreeSize inGraph (counter + length labChildren) (tail nodeList <> labChildren) - --- | getRoot takes a greaph and list of nodes and returns vertex with indegree 0 --- so assumes a connected graph--with a single root--not a forest -getRoots :: P.Gr a b -> [G.LNode a] -> [G.LNode a] -getRoots inGraph nodeList = - if null nodeList then [] --error "Root vertex not found in getRoot" - else - let firstNode@(index, _) = head nodeList - in - if G.indeg inGraph index == 0 then firstNode : getRoots inGraph (tail nodeList) - else getRoots inGraph (tail nodeList) - -{- --- | removeDuplicateSubtreeText removes duplicate subtree textx that come from indegree > 1 nodes --- there should be at least two of each network texts. --- for each case, the first instance is kept, and the remainders are replaced with the node label --- and edge weight if specified (:000) -removeDuplicateSubtreeText :: (Show b) => T.Text -> [G.LNode T.Text] -> P.Gr T.Text b -> Bool -> Bool -> T.Text -removeDuplicateSubtreeText inRep netNodeList fglGraph writeEdgeWeight writeNodeLable = - if null netNodeList then inRep - else - let netNodeText = T.init $ component2Newick fglGraph writeEdgeWeight writeNodeLable (head netNodeList) - -- edge weight already removed or may not match all occurences - -- I have no idea why--but there are extraneous double quotes that have to be removed. - nodeText' = T.filter (/= '\"') netNodeText - -- checks to see if leaf is indegree > 1 - nodeText = if not (checkIfLeaf nodeText') then nodeText' else T.filter (/= '(') $ T.filter (/= ')') nodeText' - nodeLabel = T.reverse $ T.takeWhile (/= ')') $ T.reverse nodeText - textList = T.splitOn nodeText inRep - -- isFound = T.isInfixOf nodeText inRep -- (T.pack "(4:1.0)Y#H1") inRep - in - -- trace ("Removing ? " <> show isFound <> " " <> show nodeText <> " " <> show nodeLabel <> " from " <> show inRep <> " in list (" <> show (length textList) <> ") " <> show textList) ( - -- since root cannot be network neither first nor last pieces should be empty - if T.null (head textList) || T.null (last textList) then error ("Text representation of graph is incorrect with subtree:\n" <> T.unpack nodeText - <> " first or last in representation:\n " <> T.unpack inRep) - --else if length textList == 1 then error ("Text representation of graph is incorrect with subtree:\n" <> T.unpack nodeText - -- <> " not found in representation:\n " <> T.unpack inRep) - else if length textList == 2 then - trace "Warning: Network subtree present only once--extraneous use of \'#\' perhaps--" - inRep - else -- should be minimum of 3 pieces (two occurences of subtree text removed) is subtree found 2x and not sister to itself (which should never happen) - -- edge weights (if they occur) remain the beginning of each text list (after the first) - let firstPart = head textList `T.append` nodeText - secondPart = T.intercalate nodeLabel (tail textList) - in - removeDuplicateSubtreeText (firstPart `T.append` secondPart) (tail netNodeList) fglGraph writeEdgeWeight writeNodeLable - --) --} - --- | getDistToRoot takes a node and a graph and gets the shortes path to root --- and returns the number of links -getDistToRoot :: P.Gr T.Text b -> Int -> G.Node -> Int -getDistToRoot fglGraph counter inNode = - if counter > length (G.nodes fglGraph) then error "Cycle likely in graph, path to root larger than number of nodes" - else - let parents = G.pre fglGraph inNode - in - if null parents then counter - else - let parentPaths = fmap (getDistToRoot fglGraph (counter + 1)) parents - in - minimum parentPaths - --- | modifyInDegGT1Leaves operates on the leaf nodes with indegree > 1 to prepare them for enewick representation --- leaves with indegree greater than one are modified such that: --- 1) a new node is created as parent to the leaf and added to the "add" list --- 2) edges incident on the leaf are put in the "delete" list --- 3) edges are created from teh new node to the leaf, and edges are "added" from teh leaf's parent to the new node -modifyInDegGT1Leaves :: P.Gr T.Text Double -> Int -> [G.LNode T.Text] -> ([G.LNode T.Text],[G.LEdge Double],[G.LEdge Double]) -> ([G.LNode T.Text],[G.LEdge Double],[G.LEdge Double]) -modifyInDegGT1Leaves origGraph totalNumberNodes leafNodes graphDelta@(nodesToAdd, edgesToAdd, edgesToDelete) - | G.isEmpty origGraph = error "Empty graph in modifyInDegGT1Leaves" - | null leafNodes = graphDelta - | otherwise = - let firstLeaf = head leafNodes - firstInDegree = G.indeg origGraph (fst firstLeaf) - in - if firstInDegree == 1 then modifyInDegGT1Leaves origGraph totalNumberNodes (tail leafNodes) graphDelta - else -- in a "network leaf" - let inEdgeList = G.inn origGraph (fst firstLeaf) - parentNodeList = fmap fst3 inEdgeList - inEdgeLabels = fmap thd3 inEdgeList - newNode = (totalNumberNodes, T.pack $ "HTU" <> show totalNumberNodes) - repeatedNodeNumber = replicate (length inEdgeList) totalNumberNodes - newEdgeList = (totalNumberNodes, fst firstLeaf, 0 :: Double) : zip3 parentNodeList repeatedNodeNumber inEdgeLabels - in - modifyInDegGT1Leaves origGraph (totalNumberNodes + 1) (tail leafNodes) (newNode : nodesToAdd, newEdgeList <> edgesToAdd, inEdgeList <> edgesToDelete) - --- | modifyInDegGT1HTU operates on the HTU nodes with indegree > 1 to prepare them for enewick representation --- HTUs with indegree greater than one are modified such that: --- 1) the original HTU is maintained the first edge to that HTU is Maintained also --- 2) other edges to that HTU are put in "delete" list --- 3) a new HTU node is created for each deleted edge with same label as original HTU --- 4) edges are created from the parents (except for the firt one) to the new nodes such thayt each one is indegree=1 and outdegree=0 --- 5) ne edges are put in teh "add list" --- Graphs are remade at eash recursivfe step tpo keep node/edge indexing correct -modifyInDegGT1HTU :: P.Gr T.Text Double -> Int -> [G.LNode T.Text] -> P.Gr T.Text Double -modifyInDegGT1HTU origGraph nodeIndex htuNodes - | G.isEmpty origGraph = error "Empty graph in modifyInDegGT1HTU" - | null htuNodes = origGraph - | otherwise = - let firstHTU = head htuNodes - firstInDegree = G.indeg origGraph (fst firstHTU) - in - if firstInDegree == 1 then modifyInDegGT1HTU origGraph nodeIndex (tail htuNodes) - else -- in a "network leaf" - --trace ("Mod " <> show firstHTU) ( - let inEdgeList = G.inn origGraph (fst firstHTU) - outEdgeList = G.out origGraph (fst firstHTU) - parentNodeList = fmap fst3 inEdgeList - childNodeList = fmap snd3 outEdgeList - inEdgeLabels = fmap thd3 inEdgeList - - -- Create new nodes and edges - numNewNodes = length inEdgeList - nodeIndexList = [nodeIndex .. (nodeIndex + numNewNodes - 1)] - nodeForChildenList = replicate (length childNodeList) nodeIndex - nodeLabelList = replicate numNewNodes (T.cons '#' (snd firstHTU)) - newNodeList = zip nodeIndexList nodeLabelList - - -- Edges to new nodes and edges from first new node ot children - newEdgeList = zip3 parentNodeList nodeIndexList (fmap thd3 inEdgeList) <> zip3 nodeForChildenList childNodeList inEdgeLabels - - newGraph = G.insEdges newEdgeList $ G.insNodes newNodeList $ G.delNode (fst firstHTU) $ G.delEdges (fmap G.toEdge (inEdgeList <> outEdgeList)) origGraph - in - --trace (show inEdgeList <> "\n" <> show outEdgeList <> "\n" <> show (newNodeList <> nodesToAdd, newEdgeList <> edgesToAdd, firstHTU : nodesToDelete, - -- (inEdgeList <> outEdgeList) <> edgesToDelete)) - - modifyInDegGT1HTU newGraph (nodeIndex + numNewNodes) (tail htuNodes) - --) - --- | modifyFGLForEnewick takes an FGl graphs and modified for enewick output --- 1) makes leaves that are indegree > 1 indegree 1 by creationg a new parent node with --- the leafs parents as parents and teh leaf as single child --- 2) convertes each indegree > 1 node (non--leaf) to a series of indegree 1 nodes --- the first of which gets the first parent and all the children. Each subsequent --- parent gets a node with the name label and no children -modifyFGLForEnewick :: P.Gr T.Text Double -> P.Gr T.Text Double -modifyFGLForEnewick inGraph = - if G.isEmpty inGraph then error "Empty graph to modify in modifyFGLForEnewick" - else - let (_, leafNodes, nonLeafNodes) = splitVertexList inGraph - - -- leaf nodes - (nodesToAdd, edgesToAdd, edgesToDelete) = modifyInDegGT1Leaves inGraph (G.order inGraph) leafNodes ([],[],[]) - leafModGraph = G.insEdges edgesToAdd $ G.insNodes nodesToAdd $ G.delEdges (fmap G.toEdge edgesToDelete) inGraph - - -- HTU nodes - --(nodesToAddHTU, edgesToAddHTU, nodesToDeleteHTU, edgesToDeleteHTU) = modifyInDegGT1HTU leafModGraph (G.order leafModGraph) (nonLeafNodes <> nodesToAdd) - --htuModGraph = G.insEdges edgesToAddHTU $ G.insNodes nodesToAddHTU $ G.delNodes (fmap fst nodesToDeleteHTU) $ G.delEdges (fmap G.toEdge edgesToDeleteHTU) leafModGraph - htuModGraph = modifyInDegGT1HTU leafModGraph (G.order leafModGraph) (nonLeafNodes <> nodesToAdd) - in - --trace (showGraph leafModGraph <> "\n" <> showGraph htuModGraph) - htuModGraph - - --- | fgl2FEN take a fgl graph and returns a Forest Enhanced Newick Text --- Can be simplified along lines of Cardona with change to graph before --- generating the rep bu splitting Hybrid nodes. --- enewick requires (it seems) indegree 1 for leaves --- so need to creates nodes for indegree > 1 leaves --- these are not issues for dot files -fgl2FEN :: Bool -> Bool -> P.Gr T.Text Double -> T.Text -fgl2FEN writeEdgeWeight writeNodeLable inFGLGraph = - if G.isEmpty inFGLGraph then error "Empty graph to convert in fgl2FEN" - else - -- Modify greaph for enewick stuff (leaves -> indegree 1, 'split' network nodes) - --trace ("Original:\n" <> showGraph inFGLGraph) ( - let fglGraph = modifyFGLForEnewick inFGLGraph - in - -- get forest roots - --trace ("Final:\n" <> showGraph fglGraph) ( - let numRoots = getRoots fglGraph (G.labNodes fglGraph) - rootGraphSizeList = fmap (subTreeSize fglGraph 0 . (:[])) numRoots - rootAndSizes = zip rootGraphSizeList numRoots - rootOrder = L.sortOn fst rootAndSizes - fenTextList = fmap (component2Newick fglGraph writeEdgeWeight writeNodeLable . snd) rootOrder - wholeRep = T.concat $ (`T.append` T.singleton '\n') <$> fenTextList - in - -- trace ("fgl2FEN " <> (show $ length numRoots) <> " " <> show rootOrder <> "->" <> show fenTextList) ( - if length fenTextList == 1 then wholeRep -- just a single tree/network - else T.snoc (T.cons '<' wholeRep) '>' -- forest - --)) - - --- | fglList2ForestEnhancedNewickString takes FGL representation of forest and returns --- list of Forest Enhanced Newick as a single String -fglList2ForestEnhancedNewickString :: [P.Gr T.Text Double] -> Bool -> Bool -> String -fglList2ForestEnhancedNewickString inFGLList writeEdgeWeight writeNodeLable = - if null inFGLList then "\n" - else - let forestTextList = (`T.append` T.singleton '\n') <$> parmap rdeepseq (fgl2FEN writeEdgeWeight writeNodeLable) inFGLList - forestListString = T.unpack $ T.concat forestTextList - in - forestListString - --- | component2Newick take a graph and root and creates enhanced newick from that root -component2Newick :: (Show a) => P.Gr T.Text a -> Bool -> Bool -> G.LNode T.Text -> T.Text -component2Newick fglGraph writeEdgeWeight writeNodeLable (index, label) = - if G.isEmpty fglGraph then error "Empty graph to convert in component2Newick" - else - -- start with root (no in edge weight) issue at root not seeing multiple components properly - let -- preorder traversal - middlePartList = concatMap (getNewick fglGraph writeEdgeWeight writeNodeLable) (fmap (replicate 1) (G.out fglGraph index)) - label' = if writeNodeLable then label else T.empty -- trivial trees or write node name - in - --trace ("MPL (" <> (show $ length middlePartList) <> ") " <> show middlePartList <> " " <> show (G.out fglGraph index)) ( - -- "naked" root - let firstText - | null middlePartList = T.concat [T.singleton '(', label, T.singleton ')', T.singleton ';'] - | length middlePartList == 1 = - T.concat [T.singleton '(', head middlePartList, T.singleton ')', label', T.singleton ';'] - | otherwise = T.concat [T.singleton '(', T.intercalate (T.singleton ',') middlePartList, T.singleton ')', label', T.singleton ';'] - in T.replace (T.pack ",)") (T.singleton ')') $ T.replace (T.pack ",,") (T.singleton ',') firstText - - - --- | makeLabel takes Maybe T.Text and retuns T.empty if Nothing, Text otherwise -makeLabel :: Maybe T.Text -> T.Text -makeLabel = fromMaybe T.empty - --- | fix for newick lack of paren in specific situation--inelegant -endStart :: T.Text -endStart = T.pack ")(" - -newEndStart :: T.Text -newEndStart = T.pack "),(" - --- | getNewick takes an edge of a graph and either creates the text if a leaf --- or recurses down tree if has descendents, adding commas, outer parens, labels, and edge weights if they exist. --- need to filter redundant subtrees later at the forest level (Can have shared node between rooted components) -getNewick :: (Show a) => P.Gr T.Text a -> Bool -> Bool -> [G.LEdge a] -> [T.Text] -getNewick fglGraph writeEdgeWeight writeNodeLable inEdgeList - | G.isEmpty fglGraph = [T.empty] - | null inEdgeList = [] - | otherwise = - let (_, curNodeIndex, edgeLabel) = head inEdgeList - outEdges = G.out fglGraph curNodeIndex - in - -- is a leaf, no children - if null outEdges then - let leafLabel = G.lab fglGraph curNodeIndex - in - if isNothing leafLabel then error ("Leaf without label in getNewick: node " <> show curNodeIndex <> " edge: " <> show (head inEdgeList) <> "\n" <> (G.prettify fglGraph)) - else - let newLabelList = if writeEdgeWeight then [T.concat [fromJust leafLabel, T.singleton ':', T.pack $ show edgeLabel]] else [fromJust leafLabel] - in - if length inEdgeList == 1 then newLabelList - else [T.concat $ newLabelList <> [T.singleton ','] <> getNewick fglGraph writeEdgeWeight writeNodeLable (tail inEdgeList)] - -- is HTU recurse - else - let nodeLabel = if not writeNodeLable then T.empty else makeLabel $ G.lab fglGraph curNodeIndex - middlePartList = getNewick fglGraph writeEdgeWeight writeNodeLable (G.out fglGraph curNodeIndex) - in - if length middlePartList == 1 then -- outdegree 1 - let middleText = T.replace endStart newEndStart (head middlePartList) - in - if not writeEdgeWeight then T.concat [T.singleton '(', middleText, T.singleton ')', nodeLabel, T.singleton ','] : getNewick fglGraph writeEdgeWeight writeNodeLable (tail inEdgeList) - else T.concat [T.singleton '(', middleText, T.singleton ')', nodeLabel, T.singleton ':', T.pack $ show edgeLabel, T.singleton ','] : getNewick fglGraph writeEdgeWeight writeNodeLable (tail inEdgeList) - else -- multiple children, outdegree > 1 - let middleText = T.intercalate (T.singleton ',') middlePartList - in - if not writeEdgeWeight then - T.replace (T.pack ",)") (T.singleton ')') (T.replace (T.pack ",,") (T.singleton ',') $ T.concat [T.singleton '(', middleText, T.singleton ')', nodeLabel]) : getNewick fglGraph writeEdgeWeight writeNodeLable (tail inEdgeList) - else - T.replace (T.pack ",)") (T.singleton ')') (T.replace (T.pack ",,") (T.singleton ',') $ T.concat [T.singleton '(', middleText, T.singleton ')', nodeLabel, T.singleton ':', T.pack $ show edgeLabel]) : getNewick fglGraph writeEdgeWeight writeNodeLable (tail inEdgeList) - - --- | stringGraph2TextGraph take P.Gr String Doble and converts to P.Gr Text a -stringGraph2TextGraph :: P.Gr String Double -> P.Gr T.Text Double -stringGraph2TextGraph inStringGraph = - let (indices, labels) = unzip $ G.labNodes inStringGraph - edges = G.labEdges inStringGraph - textLabels = fmap T.pack labels - newNodes = zip indices textLabels - in - G.mkGraph newNodes edges - --- | stringGraph2TextGraphDouble take P.Gr String a and converts to P.Gr Text Double --- ignores the edge label and reurns "0.0" -stringGraph2TextGraphDouble :: P.Gr String String -> P.Gr T.Text Double -stringGraph2TextGraphDouble inStringGraph = - let (indices, labels) = unzip $ G.labNodes inStringGraph - textLabels = fmap T.pack labels - newNodes = zip indices textLabels - origEdges = G.labEdges inStringGraph - newEdges = fmap dummyRelabelEdges origEdges - in - G.mkGraph newNodes newEdges - where - dummyRelabelEdges :: (a, b, String) -> (a, b, Double) - dummyRelabelEdges (a,b,c) = (a,b, read c :: Double) - --- | textGraph2StringGraph take P.Gr String a and converts to P.Gr Text a -textGraph2StringGraph :: P.Gr T.Text b -> P.Gr String b -textGraph2StringGraph inTextGraph = - let (indices, labels) = unzip $ G.labNodes inTextGraph - edges = G.labEdges inTextGraph - stringLabels = fmap T.unpack labels - newNodes = zip indices stringLabels - in - G.mkGraph newNodes edges - -{- -Fucntions to relabel Dot greaph to RawGraph format --} - --- | findStrLabel checks Attributes (list f Attribute) from Graphvz to extract the String label of node --- returns Maybe Text -findStrLabel :: Attributes -> Maybe T.Text -findStrLabel = getFirst . foldMap getStrLabel - --- | getStrLabel takes an Attribute and reurns Text if StrLabel found, mempty otherwise -getStrLabel :: Attribute -> First T.Text -getStrLabel (Label (StrLabel txt)) = First . Just $ txt -getStrLabel _ = mempty - --- | getLeafText takes a pairs (node vertex number, graphViz Attributes) --- and returns Text name of leaf of Stringified nude number if unlabbeled -getLeafText :: (Int, Attributes) -> T.Text -getLeafText (nodeIndex, nodeLabel) = - let maybeTextLabel = findStrLabel nodeLabel - in - fromMaybe (T.pack $ show nodeIndex) maybeTextLabel - --- | splitVertexList splits the vertices of a graph into ([root], [leaf], [non-leaf-non-root]) -splitVertexList :: P.Gr a b -> ([G.LNode a], [G.LNode a], [G.LNode a]) -splitVertexList inGraph = - if G.isEmpty inGraph then ([],[],[]) - else - let -- leaves - degOutList = G.outdeg inGraph <$> G.nodes inGraph - newNodePair = zip degOutList (G.labNodes inGraph) - leafPairList = filter ((== 0) . fst) newNodePair - (_, leafList) = unzip leafPairList - - -- roots - degInList = G.indeg inGraph <$> G.nodes inGraph - newRootPair = zip degInList (G.labNodes inGraph) - rootPairList = filter ((== 0) . fst) newRootPair - (_, rootList) = unzip rootPairList - - -- non-leaves, non-root - nodeTripleList = zip3 degOutList degInList (G.labNodes inGraph) - nonLeafTripleList = filter ((> 0) . fst3) $ filter ((> 0) . snd3) nodeTripleList - (_, _, nonLeafList) = unzip3 nonLeafTripleList - in - (rootList, leafList, nonLeafList) - --- | getVertexList returns vertex complement of graph from DOT file -getVertexList :: P.Gr Attributes Attributes -> [G.LNode T.Text] -getVertexList inGraph = - if G.isEmpty inGraph then [] - else - let (nodeVerts, _) = unzip $ G.labNodes inGraph - newLabels = fmap getLeafText $ G.labNodes inGraph - vertexList' = zip nodeVerts newLabels - in - vertexList' - --- | relabelFGL takes P.Gr Attributes Attributes and converts to P.Gr T.Text Double -relabelFGL :: P.Gr Attributes Attributes -> P.Gr T.Text Double -relabelFGL inGraph = - if G.isEmpty inGraph then G.empty - else - let newVertexList = getVertexList inGraph - newEdgeList = fmap relabeLEdge (G.labEdges inGraph) - in - G.mkGraph newVertexList newEdgeList - --- | relabeLEdge convertes edhe labels to Double -relabeLEdge :: G.LEdge b -> G.LEdge Double -relabeLEdge (u,v,_) = (u,v,0.0:: Double) - --- | relabelFGL takes P.Gr Attributes Attributes and converts to P.Gr T.Text Double -relabelFGLEdgesDouble :: P.Gr a b -> P.Gr a Double -relabelFGLEdgesDouble inGraph = - if G.isEmpty inGraph then G.empty - else - let newEdgeList = fmap relabeLEdge (G.labEdges inGraph) - in - G.mkGraph (G.labNodes inGraph) newEdgeList - --- | convertGraphToStrictText take a graphs with laze Text and makes it strict. -convertGraphToStrictText :: P.Gr T.Text Double -> P.Gr StrictT.Text Double -convertGraphToStrictText inGraph = - if G.isEmpty inGraph then G.empty - else - let nodeList = G.labNodes inGraph - nodesStrictText = fmap (T.toStrict . snd) nodeList - nodeIndices = fmap fst nodeList - in - G.mkGraph (zip nodeIndices nodesStrictText) (G.labEdges inGraph) - --- | fgl2DotString takes an FGL graph and returns a String -fgl2DotString :: (GV.Labellable a, GV.Labellable b) => P.Gr a b -> String -fgl2DotString inGraph = - T.unpack $ GVP.renderDot $ GVP.toDot $ GV.graphToDot GV.quickParams inGraph - --- | modifyVertexEdgeLabels keeps or removes vertex and edge labels -modifyVertexEdgeLabels :: (Show b) => Bool -> Bool -> P.Gr String b -> P.Gr String String -modifyVertexEdgeLabels keepVertexLabel keepEdgeLabel inGraph = - let inLabNodes = G.labNodes inGraph - degOutList = G.outdeg inGraph <$> G.nodes inGraph - nodeOutList = zip degOutList inLabNodes - leafNodeList = snd <$> filter ((== 0) . fst) nodeOutList - nonLeafNodeList = snd <$> filter ((> 0) . fst) nodeOutList - newNonLeafNodes = if keepVertexLabel then nonLeafNodeList - else zip (fmap fst nonLeafNodeList) (replicate (length nonLeafNodeList) "") - inLabEdges = G.labEdges inGraph - inEdges = fmap G.toEdge inLabEdges - newEdges = if keepEdgeLabel then fmap showLabel inLabEdges - else fmap (`G.toLEdge` "") inEdges - in G.mkGraph (leafNodeList <> newNonLeafNodes) newEdges - where - showLabel :: Show c => (a, b, c) -> (a, b, String) - showLabel (e, u, l) = (e, u, show l) - --- | relabelLeaf takes list of pairs and if current leaf label --- is snd in a pair, it replaces the label with the first of the pair -relabelLeaf :: [(T.Text, T.Text)] -> G.LNode T.Text -> G.LNode T.Text -relabelLeaf namePairList leafNode = - if null namePairList then leafNode - else - let foundName = L.find ((== (snd leafNode)) .snd) namePairList - in - if foundName == Nothing then leafNode - else (fst leafNode, (fst $ fromJust foundName)) - - --- | relabelGraphLeaves takes and FGL graph T.Text Double and renames based on pair of Text --- old name second, new name first in pair -relabelGraphLeaves :: [(T.Text, T.Text)] -> P.Gr T.Text Double -> P.Gr T.Text Double -relabelGraphLeaves namePairList inGraph = - if null namePairList then inGraph - else if G.isEmpty inGraph then inGraph - else - let (rootVerts, leafList, otherVerts) = splitVertexList inGraph - edgeList = G.labEdges inGraph - newLeafList = fmap (relabelLeaf namePairList) leafList - in - G.mkGraph (newLeafList <> rootVerts <> otherVerts) edgeList - --- | checkGraphsAndData leaf names (must be sorted) and a graph --- nedd to add other sanity checks --- does not check for cycles becasue that is done on input -checkGraphsAndData :: [T.Text] -> P.Gr T.Text Double -> P.Gr T.Text Double -checkGraphsAndData leafNameList inGraph = - if G.isEmpty inGraph then inGraph - else if null leafNameList then error "Empty leaf name list" - else - let (_, leafList, _) = splitVertexList inGraph - graphLeafNames = L.sort $ fmap snd leafList - nameGroupsGT1 = filter ((> 1).length) $ L.group graphLeafNames - in - -- check for repeated terminals - if not $ null nameGroupsGT1 then errorWithoutStackTrace ("Input graph has repeated leaf labels" <> - (show $ fmap head nameGroupsGT1)) - -- check for leaf complement identity - else if leafNameList /= graphLeafNames then - let inBoth = L.intersect leafNameList graphLeafNames - onlyInData = leafNameList L.\\ inBoth - onlyInGraph = graphLeafNames L.\\ inBoth - in - errorWithoutStackTrace ("Data leaf list does not match graph leaf list: \n\tOnly in data : " <> show onlyInData - <> "\n\tOnly in Graph : " <> (show onlyInGraph) <> " (concatenated names could be due to lack of commas ',' or unbalanced parentheses '()') in grap[h specification") - - else inGraph - --- | cyclic maps to cyclic funcitn in moduel Cyclic.hs -cyclic :: (G.DynGraph g) => g a b -> Bool -cyclic inGraph = C.cyclic inGraph - --- | makeHTULabel take HTU index and amkes into HTU# -makeHTULabel :: Int -> T.Text -makeHTULabel index = T.pack $ "HTU" <> show index - --- | getLeafLabelMatches tyakes the total list and looks for elements in the smaller local leaf set --- retuns int index of the match or (-1) if not found so that leaf can be added in orginal order -getLeafLabelMatches ::[G.LNode T.Text] -> G.LNode T.Text -> (Int, Int) -getLeafLabelMatches localLeafList totNode = - if null localLeafList then (-1, fst totNode) - else - let (index, leafString) = head localLeafList - in - if snd totNode == leafString then (index, fst totNode) - else getLeafLabelMatches (tail localLeafList) totNode - --- | reIndexLeavesEdges Leaves takes input fgl graph and total input leaf sets and reindexes node, and edges --- such that leaves are nodes 0-n-1, then roots and then other htus and edges are reindexed based on that via a map -reIndexLeavesEdges :: [T.Text] -> P.Gr T.Text Double -> P.Gr T.Text Double -reIndexLeavesEdges leafList inGraph = - if G.isEmpty inGraph then G.empty - else - --trace ("In Graph :" <> (show $ G.order inGraph) <> " " <> (show $ G.size inGraph) <> "\n" <> (showGraph inGraph)) ( - --trace ("LL:" <> (show $ length leafList) <> " " <> (show $ length $ G.nodes inGraph)) ( - -- reindex nodes and edges and add in new nodes (total leaf set + local HTUs) - -- create a map between inputLeafSet and graphLeafSet which is the canonical enumeration - -- then add in local HTU nodes and for map as well - -- trace ("Original graph: " <> (showGraph inGraph)) ( - let canonicalLeafOrder = zip [0..((length leafList) - 1)] leafList - (rootList, leafVertexList, nonRootHTUList) = splitVertexList inGraph - --correspondanceList = parmap rdeepseq (getLeafLabelMatches canonicalLeafOrder) leafVertexList - correspondanceList = fmap (getLeafLabelMatches leafVertexList) canonicalLeafOrder - matchList = filter ((/= (-1)) . fst) correspondanceList - htuList = fmap fst $ rootList <> nonRootHTUList - --htuList = fmap fst (G.labNodes inGraph) \\ fmap fst leafVertexList - htuNumber = length htuList - newHTUNumbers = [(length leafList)..(length leafList + htuNumber - 1)] - newHTULabels = fmap makeHTULabel newHTUNumbers - htuMatchList = zip htuList newHTUNumbers - - in - --trace (show canonicalLeafOrder <> "\n" <> show leafVertexList <> "\n" <> show matchList <> "\n" <> show htuMatchList) ( - let - --remove order dependancey - -- htuList = [(length inputLeafList)..(length inputLeafList + htuNumber - 1)] - vertexMap = Map.fromList (matchList <> htuMatchList) - reIndexedEdgeList = fmap (reIndexLEdge vertexMap) (G.labEdges inGraph) - - --newNodeNumbers = [0..(length leafList + htuNumber - 1)] - --attributeList = replicate (length leafList + htuNumber) (T.pack "") -- origAttribute - --newNodeList = zip newNodeNumbers attributeList - newNodeList = canonicalLeafOrder <> (zip newHTUNumbers newHTULabels) - newGraph = G.mkGraph newNodeList reIndexedEdgeList - in - --trace ("Out Graph :" <> (show $ G.order newGraph) <> " " <> (show $ G.size newGraph) <> "\n" <> (showGraph newGraph)) - --trace ("Orig graph: " <> (G.prettify inGraph) <> "\nNew graph: " <> (G.prettify newGraph)) - newGraph - --)) - - --- | reIndexEdge takes an (Int, Int) map, labelled edge, and returns a new labelled edge with new e,u vertices -reIndexLEdge :: Map.Map Int Int -> G.LEdge Double -> G.LEdge Double -reIndexLEdge vertexMap inEdge = - if Map.null vertexMap then error "Null vertex map" - else - let (e,u,label) = inEdge - newE = Map.lookup e vertexMap - newU = Map.lookup u vertexMap - in - --trace ((show $ Map.size vertexMap) <> " " <> (show $ Map.toList vertexMap)) ( - if isNothing newE then error ("Edge error looking up vertex " <> show e <> " in " <> show (e,u)) - else if isNothing newU then error ("Edge error looking up vertex " <> show u <> " in " <> show (e,u)) - else (fromJust newE, fromJust newU, label) - --) - --- | reIndexNode takes an (Int, Int) map, labelled node, and returns a new labelled node with new vertex -reIndexLNode :: Map.Map Int Int -> G.LNode T.Text -> G.LNode T.Text -reIndexLNode vertexMap inNode = - if Map.null vertexMap then error "Null vertex map" - else - let (index,label) = inNode - newIndex = Map.lookup index vertexMap - in - if isNothing newIndex then error ("Error looking up vertex " <> show index <> " in " <> show inNode) - else (fromJust newIndex, label) - diff --git a/pkg/PhyloLib/src/LocalSequence.hs b/pkg/PhyloLib/src/LocalSequence.hs deleted file mode 100644 index ec85b4245..000000000 --- a/pkg/PhyloLib/src/LocalSequence.hs +++ /dev/null @@ -1,158 +0,0 @@ -{- | -Module : LocalSequence -Description : Functions that map Data.Sequence to list-like functtions (head, tail etc) -Copyright : (c) 2014 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# Language ImportQualifiedPost #-} - -module LocalSequence - ( module LocalSequence - ) where - -import Data.Sequence ((<|), (|>), (><)) -import Data.Sequence qualified as S -import Data.Vector qualified as V -import Data.Foldable qualified as F - --- | sequjence type for exporting -type Seq = S.Seq - --- | head maps to Take -head :: Seq a -> a -head inSeq = S.index inSeq 0 - --- | tail maps to drop Seq -tail :: Seq a -> Seq a -tail inSeq = S.drop 1 inSeq - --- | (!!) index -(!) :: Seq a -> Int -> a -(!) inSeq index = S.index inSeq index - --- | cons maps to (<|) -cons :: a -> Seq a -> Seq a -cons newElem inSeq = newElem <| inSeq - --- | snoc maps to (|>) -snoc ::Seq a -> a -> Seq a -snoc inSeq newElem = inSeq |> newElem - --- | snoc with args reversed -snocFlip :: a -> Seq a -> Seq a -snocFlip newElem inSeq = inSeq |> newElem - --- | empty to empty -empty :: Seq a -empty = S.empty - --- | null equal to empty -null :: (Eq a) => Seq a -> Bool -null inSeq = if inSeq == S.empty then True - else False - --- | singleton to singleton -singleton :: a -> Seq a -singleton newElem = S.singleton newElem - --- | ++ maps to >< -(++) :: Seq a -> Seq a -> Seq a -(++) inSeqA inSeqB = inSeqA >< inSeqB - --- | concat fold over >< -concat :: (Eq a) => Seq (Seq a) -> Seq a -concat inSeqSeq = concatInternal inSeqSeq LocalSequence.empty - --- | concatInternal internal concat function with accumulator -concatInternal :: (Eq a) => Seq (Seq a) -> Seq a -> Seq a -concatInternal inSeqSeq newSeq - | LocalSequence.null inSeqSeq = newSeq - | otherwise = - let firstSeq = LocalSequence.head inSeqSeq - in concatInternal (LocalSequence.tail inSeqSeq) (firstSeq >< newSeq) - --- | zip maps to zip -zip :: Seq a -> Seq b -> Seq (a,b) -zip inSeqA inSeqB = S.zip inSeqA inSeqB - --- | length maps to length -length :: Seq a -> Int -length inSeq = S.length inSeq - --- | toList from Foldable -toList :: Seq a -> [a] -toList inSeq = F.toList inSeq - --- | fromList from fromList -fromList :: [a] -> Seq a -fromList aList = S.fromList aList - --- | toVector via intemediate List (alas) -toVector :: Seq a -> V.Vector a -toVector inSeq = V.fromList $ toList inSeq - --- | toVector via intemediate List (alas) -fromVector :: V.Vector a -> Seq a -fromVector inVect = S.fromList $ V.toList inVect - --- | reverse to reverse -reverse :: Seq a -> Seq a -reverse inSeq = S.reverse inSeq - --- | last should be connstant time -last :: Seq a -> a -last inSeq = S.index inSeq $ (S.length inSeq) - 1 - --- | map to fmap for ease of migrating libraries -map :: Traversable t => (a->b) -> t a -> t b -map f = fmap f - --- | drop maps to drop -drop :: Int -> Seq a -> Seq a -drop number inSeq = S.drop number inSeq - --- | take maps to take -take :: Int -> Seq a -> Seq a -take number inSeq = S.take number inSeq - --- | unsafeTake maps to take -unsafeTake :: Int -> Seq a -> Seq a -unsafeTake number inSeq = S.take number inSeq - --- | unsafeDrop maps to drop -unsafeDrop :: Int -> Seq a -> Seq a -unsafeDrop number inSeq = S.drop number inSeq - --- | replicate maps to replicate -replicate :: Int -> a -> Seq a -replicate number value = S.replicate number value diff --git a/pkg/PhyloLib/src/ParallelUtilities.hs b/pkg/PhyloLib/src/ParallelUtilities.hs deleted file mode 100644 index 7324e3dd4..000000000 --- a/pkg/PhyloLib/src/ParallelUtilities.hs +++ /dev/null @@ -1,105 +0,0 @@ -{- | -Module : ParallelUtilities.hs -Description : Utilities for parallel traversals, and other related functions -Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# Language ImportQualifiedPost #-} - -{-# Options_GHC -fno-warn-orphans #-} - -module ParallelUtilities - ( parmap - , seqParMap - , getNumThreads - , rnf - , myStrategy - , myStrategyRS - , myStrategyRPAR - , myParListChunk - , myParListChunkRDS - , myChunkParMapRDS - ) where - -import Control.Concurrent -import Control.DeepSeq -import Control.Parallel.Strategies -import Data.BitVector qualified as BV -import System.IO.Unsafe - - --- Map a function over a traversable structure in parallel --- Preferred over parMap which is limited to lists --- Add chunking (with arguement) (via chunkList) "fmap blah blah `using` parListChunk chunkSize rseq/rpar" --- but would have to do one for lists (with Chunk) and one for vectors (splitAt recusively) -parmap :: Traversable t => Strategy b -> (a -> b) -> t a -> t b -parmap strat f = withStrategy (parTraversable strat).fmap f - --- | seqParMap takes strategy, if numThread == 1 retuns fmap otherwise parmap and -seqParMap :: Traversable t => Strategy b -> (a -> b) -> t a -> t b -seqParMap strat f = - if getNumThreads > 1 then parmap strat f - else fmap f - -myParListChunk :: Strategy a -> Strategy [a] -myParListChunk localStrategy = parListChunk getNumThreads localStrategy - -myParListChunkRDS :: (NFData a) => Strategy [a] -myParListChunkRDS = parListChunk getNumThreads myStrategy - -myStrategy :: (NFData b) => Strategy b -myStrategy = rdeepseq - -myStrategyRS :: Strategy b -myStrategyRS = rseq - -myStrategyRPAR :: Strategy b -myStrategyRPAR = rpar - --- | getNumThreads gets number of COncurrent threads -{-# NOINLINE getNumThreads #-} -getNumThreads :: Int -getNumThreads = unsafePerformIO getNumCapabilities - - --- NFData instance for parmap/rdeepseq Bit Vectory types -instance NFData BV.BV where - - rnf bv = BV.size bv `seq` BV.nat bv `seq` () - - --- | myChunkParMapRDS chuncked parmap that defaults to fmap if not paralell -myChunkParMapRDS :: NFData c => (b -> c) -> [b] -> [c] -myChunkParMapRDS f inList = - if getNumThreads == 1 then fmap f inList - else fmap f inList `using` myParListChunkRDS diff --git a/pkg/PhyloLib/src/SymMatrix.hs b/pkg/PhyloLib/src/SymMatrix.hs deleted file mode 100644 index 96e864251..000000000 --- a/pkg/PhyloLib/src/SymMatrix.hs +++ /dev/null @@ -1,450 +0,0 @@ -{- | -Module : SymMatrix.hs -Description : Progam to manipulate square symmetric lower diagonal matrices with diagnonal values - as if they were normal matrices -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# Language ImportQualifiedPost #-} - -module SymMatrix - ( empty - , dim - , fromLists - , Matrix - , SymMatrix.null - , cols - , rows - , (!) - , toLists - , toRows - , fromRows - , toFullLists - , getFullRow - , isSymmetric - , updateMatrix - , unsafeUpdateMatrix - , addMatrixRow - , addMatrices - , deleteRowsAndColumns - , showMatrixNicely - , SymMatrix.map - , SymMatrix.flatten - , getFullRowVect - , SymMatrix.zipWith - , SymMatrix.zip - , combine - , safeIndex - , makeDefaultMatrix - , getFullVects - , getCost - ) where - -import Data.List qualified as L -import Data.Sort qualified as S -import Data.Vector qualified as V -import Data.Vector.Generic qualified as G - --- | Matrix type as Vector of Vectors -type Matrix a = V.Vector (V.Vector a) - --- | functions for triples -fst3 :: (a,b,c) -> a -fst3 (d,_,_) = d - --- | empty matrix value from Vector -empty :: Matrix a -empty = G.empty - --- | dim returns dimmension (rows = cols) -dim :: (Eq a) => Matrix a -> (Int, Int) -dim inMatrix = - if SymMatrix.null inMatrix then (0,0) - else (V.length inMatrix, V.length inMatrix) - --- | rows returns number rows in Matrix (rows = cols) -rows :: (Eq a) => Matrix a -> Int -rows inM = fst $ dim inM - --- | cols returns number cols in Matrix (rows = cols) -cols :: (Eq a) => Matrix a -> Int -cols inM = fst $ dim inM - --- | null returns True of row number is 0 -null :: (Eq a) => Matrix a -> Bool -null inMatrix = inMatrix == empty - --- | isSymmetric is true by defineition--when created error if not -isSymmetric :: (Eq a) => Matrix a -> Bool -isSymmetric inM = - not (SymMatrix.null inM) || error "Null matrix in isSymmetric" - --- | fromRows creates a lower diagonal matrix (with diagonal) -fromRows :: (Eq a, Show a) => [V.Vector a] -> Matrix a -fromRows inVectList = fromLists $ fmap V.toList inVectList - --- | toRows converts a Matrix to a list of Vectors --- unequal in length -toRows :: Matrix a -> [V.Vector a] -toRows = V.toList - --- | fromLists takes list of list of a and returns lower diagnoal (with diagonal) --- matrix as Matrix (Vector of Vectors) -fromLists :: (Eq a, Show a) => [[a]] -> Matrix a -fromLists inListList = - if L.null inListList then empty - else - let initialSquare = V.map V.fromList $ V.fromList inListList - colsH = V.length $ V.head initialSquare - rowsH = V.length initialSquare - in - if colsH /= rowsH then error ("Input matrix is not square " ++ (show (colsH, rowsH)) ++ " " ++ show inListList) - else - let indexPairs = cartProd [0..(rowsH - 1)] [0..(rowsH - 1)] - sym = checkSymmetry initialSquare indexPairs - in - if not sym then error "Input matrix not symmetrical" - else makeLowerDiag initialSquare 0 rowsH - --- | toLists takes a Matrix and returns a list of lists (not all same length) -toLists :: Eq a => Matrix a -> [[a]] -toLists inM = - if SymMatrix.null inM then [] - else - V.toList $ V.map V.toList inM - --- | toFullLists takes a Matrix and returns a list of lists of full length --- square matrix -toFullLists :: Eq a => Matrix a -> [[a]] -toFullLists inM = - if SymMatrix.null inM then [] - else - fmap (getFullRow inM) [0..(rows inM - 1)] - --- | getFullRow returns a specific full row (is if matrix were square) --- as a list -getFullRow :: Eq a => Matrix a -> Int -> [a] -getFullRow inM index = - if SymMatrix.null inM then [] - else - let firstPart = V.toList $ inM V.! index -- initial [0..index] of elements - restMatrix = V.drop (index + 1) inM - restByColumn = V.toList $ V.map (V.! index) restMatrix - in - firstPart ++ restByColumn - --- | getFullVects returns full size verctor of vector not lower triangular -getFullVects :: Eq a => Matrix a -> Matrix a -getFullVects inLV = - if SymMatrix.null inLV then SymMatrix.empty - else - V.fromList $ fmap (getFullRowVect inLV) [0..((SymMatrix.rows inLV) - 1)] - --- | getFullRowVect reurns a specific full row (is if matrix were square) --- as a Vector -getFullRowVect :: Eq a => Matrix a -> Int -> V.Vector a -getFullRowVect inM index = - if SymMatrix.null inM then V.empty - else - let firstPart = inM V.! index -- initial [0..index] of elements - restMatrix = V.drop (index + 1) inM - restByColumn = V.map (V.! index) restMatrix - in - firstPart V.++ restByColumn - --- | indexing lower diag matrix -(!) :: Matrix a -> (Int, Int) -> a -(!) inM (iIndex,jIndex) = - if iIndex > jIndex then - (inM V.! iIndex) V.! jIndex - else - (inM V.! jIndex) V.! iIndex - --- | indexing lower diag matrix -safeIndex :: Matrix a -> (Int, Int) -> a -safeIndex inM (iIndex,jIndex) = - if iIndex >= (length inM) then error ("First out of bounds " ++ show (iIndex, (length inM))) - else if jIndex >= (length inM) then error ("Second out of bounds " ++ show (jIndex, (length inM))) - else if iIndex > jIndex then - (inM V.! iIndex) V.! jIndex - else - (inM V.! jIndex) V.! iIndex - --- | makeLowerDiag take a Vector of Vetors (Matrix) and returns a lower diagonal matrix --- including diagonal -makeLowerDiag :: (Eq a) => Matrix a -> Int -> Int -> Matrix a -makeLowerDiag inM row numRows - | SymMatrix.null inM = error "Input matrix is empty in makeLowerDiag" - | row == numRows = V.empty - | otherwise = - let origRow = inM V.! row - newRow = V.take (row + 1) origRow - in V.cons newRow (makeLowerDiag inM (row + 1) numRows) - --- | makeDefaultMatrix creates an all 1 wth diagonal 0 matrix of size n x n -makeDefaultMatrix :: Int -> Matrix Int -makeDefaultMatrix n = - let row = replicate n 1 - rowList = replicate n row - initialMattrix = fromLists rowList - updateIndexList = [0..(n-1)] - zeroList = replicate n 0 - updateList = zip3 updateIndexList updateIndexList zeroList - in - updateMatrix initialMattrix updateList - - --- | cartesian product of two lists -cartProd :: [a] -> [a] -> [(a,a)] -cartProd xs ys = [(x,y) | x <- xs, y <- ys] - --- | checkSymmetry takes a Vector of VEctors of a --- and checks for symmetry -checkSymmetry :: (Eq a, Show a) => Matrix a -> [(Int, Int)] -> Bool -checkSymmetry inVV pairList - | SymMatrix.null inVV = error "Input matrix is empty" - | L.null pairList = True - | otherwise = - let (iIndex, jIndex) = head pairList - firstCheck = ((inVV V.! iIndex) V.! jIndex) == ((inVV V.! jIndex) V.! iIndex) - in - if firstCheck then checkSymmetry inVV (tail pairList) - else error ("Matrix is not symmetrical:" ++ show (iIndex, jIndex) ++ "=>" ++ show ((inVV V.! iIndex) V.! jIndex) ++ " /= " ++ show ((inVV V.! jIndex) V.! iIndex)) - --- | addMatrixRow add a row to existing matrix to extend Matrix dimension --- used when adding HTU distances to existing distance matrix as Wagner tree is built -addMatrixRow :: Matrix a -> V.Vector a -> Matrix a -addMatrixRow inM newRow = - if V.null newRow then inM - else - inM `V.snoc` newRow - - --- | addMatrices adds a Matrix to existing matrix to extend Matrix dimension -addMatrices :: (Eq a) => Matrix a -> Matrix a -> Matrix a -addMatrices inM newMatrix = - if SymMatrix.null newMatrix then inM - else - inM V.++ newMatrix - --- | reIndexTriple taske (i,j,k) and returns (max i j, min i j, k) -reIndexTriple :: (Ord a) => (a, a, b) -> (a, a, b) -reIndexTriple trip@(iIndex, jIndex, value) = - if iIndex > jIndex then trip - else (jIndex, iIndex, value) - --- | updateMatrix takes a list of triples and update matrix --- update all at once checking for bounds --- could naively do each triple in turn, but would be alot of copying -updateMatrix :: (Show a, Ord a) => Matrix a -> [(Int, Int, a)] -> Matrix a -updateMatrix inM modList = - if L.null modList then inM - else - let orderedTripleList = S.uniqueSort $ fmap reIndexTriple modList - minRow = fst3 $ head orderedTripleList - maxRow = fst3 $ last orderedTripleList - in - if minRow < 0 then error ("Update matrix out of bounds: " ++ show orderedTripleList) - else if maxRow >= rows inM then error ("Update matrix out of bounds, row = " ++ show (rows inM) ++ " and trying to update row " ++ show maxRow) - else - let firstPart = V.unsafeTake minRow inM - restPart = V.unsafeDrop minRow inM - modifiedRemainder = updateRows restPart orderedTripleList minRow - in - addMatrices firstPart modifiedRemainder - --- | unsafeUpdateMatrix unsafe version of updateMatrix -unsafeUpdateMatrix :: (Show a, Ord a) => Matrix a -> [(Int, Int, a)] -> Matrix a -unsafeUpdateMatrix inM modList = - if L.null modList then inM - else - let orderedTripleList = S.uniqueSort $ fmap reIndexTriple modList - minRow = fst3 $ head orderedTripleList - firstPart = V.unsafeTake minRow inM - restPart = V.unsafeDrop minRow inM - modifiedRemainder = updateRows restPart orderedTripleList minRow - in - addMatrices firstPart modifiedRemainder - --- | updateRows takes the section of the matrix containing rows that wil be modified --- (some not) and modifes or copies rows and rerns a Matrix (vector of roow vectors) -updateRows :: (Show a, Eq a) => Matrix a -> [(Int, Int, a)] -> Int -> Matrix a -updateRows inM tripList currentRow - | L.null tripList = inM - | SymMatrix.null inM = error ("Matrix is empty and there are modifications that remain: " ++ show tripList) - | otherwise = - let (rowIndex, columnIndex, value) = L.head tripList - firstOrigRow = V.head inM - in - if currentRow /= rowIndex then firstOrigRow `V.cons` updateRows (V.tail inM) tripList (currentRow + 1) - else -- account for multiple modifications to same row - let (newRow, newTripList) = modifyRow firstOrigRow columnIndex value currentRow (L.tail tripList) - in - -- This for debug--remove after test - newRow `V.cons` updateRows (V.tail inM) newTripList (currentRow + 1) - - --- | modifyRow takes an initial modification (column and value) and then checks to see if there are more modifications in that --- row (rowNumber) in the remainder of the list of modifications, returning the new row and mod list as a pair --- assumes that sorted triples sort by first, second, then third elements -modifyRow :: V.Vector a -> Int -> a -> Int -> [(Int, Int, a)] -> (V.Vector a, [(Int, Int, a)]) -modifyRow inRow colIndex value rowNumber modList = - if colIndex >= V.length inRow then error ("Column to modify is outside length of row " ++ show (rowNumber, colIndex)) - else - let firstPart = V.unsafeTake colIndex inRow - remainderPart = V.unsafeDrop (colIndex + 1) inRow - newRow = firstPart V.++ (value `V.cons` remainderPart) - in - if L.null modList then (newRow, modList) - else continueRow (firstPart `V.snoc` value) inRow (colIndex + 1) rowNumber modList - - --- | continueRow continues to modify a row with multiple column modifcations --- assumes that sorted triples sorted by first, second, then third elements -continueRow :: V.Vector a ->V.Vector a -> Int -> Int -> [(Int, Int, a)] -> (V.Vector a, [(Int, Int, a)]) -continueRow partRow origRow colIndex rowNumber modList - | colIndex == V.length origRow = (partRow, modList) - | L.null modList = (partRow V.++ V.unsafeDrop colIndex origRow, modList) - | otherwise = - let (nextRowNumber, nextColIndex, nextValue) = L.head modList - in - if nextRowNumber /= rowNumber then (partRow V.++ V.unsafeDrop colIndex origRow, modList) - else - if nextColIndex /= colIndex then continueRow (partRow `V.snoc` (origRow V.! colIndex)) origRow (colIndex + 1) rowNumber modList - else continueRow (partRow `V.snoc` nextValue) origRow (colIndex + 1) rowNumber (L.tail modList) - --- | makeNiceRow pretty preints a list -makeNiceRow :: (Show a) => V.Vector a -> String -makeNiceRow aVect = - if V.null aVect then "\n" - else - show (V.head aVect) ++ " " ++ makeNiceRow (V.tail aVect) - --- | showNicely pretty prins matrix -showMatrixNicely :: (Show a, Eq a) => Matrix a -> String -showMatrixNicely inM = - let mRows = rows inM - mCols = cols inM - niceRows = V.map makeNiceRow inM - in - ("Dimensions: :" ++ show mRows ++ " " ++ show mCols ++ "\n" ++ concat niceRows) - --- | deleteRowsAndColumns take a list of rows (and same index for columns) --- to delete from Matrix. Uses lisyt to do in single pass -deleteRowsAndColumns :: (Show a, Eq a) => Matrix a -> [Int] -> Matrix a -deleteRowsAndColumns inM deleteList = - if L.null deleteList then inM - else deleteRC inM deleteList (rows inM) 0 - - --- | deleteRC takes matri delete list and counter to delte coumns and rows -deleteRC :: (Show a, Eq a) => Matrix a -> [Int] -> Int -> Int -> Matrix a -deleteRC inM deleteList origRows rowCounter = - if rowCounter == origRows then empty - else - let firstRow = V.head inM - toKeep = rowCounter `L.notElem` deleteList - newRow = deleteColumn firstRow deleteList (rowCounter + 1) 0 - in - if toKeep then newRow `V.cons` deleteRC (V.tail inM) deleteList origRows (rowCounter + 1) - else deleteRC (V.tail inM) deleteList origRows (rowCounter + 1) - --- | deleteColumn takes a row of a matrix (lower diagnonal), its length, --- a list of cilumns to delete and a column counter and creates a new row -deleteColumn :: (Show a, Eq a) => V.Vector a -> [Int] -> Int -> Int -> V.Vector a -deleteColumn origRow deleteList rowLength colCounter = - if colCounter == rowLength then V.empty - else - let firstValue = V.head origRow - toKeep = colCounter `L.notElem` deleteList - in - if toKeep then firstValue `V.cons` deleteColumn (V.tail origRow) deleteList rowLength (colCounter + 1) - else deleteColumn (V.tail origRow) deleteList rowLength (colCounter + 1) - --- | map maps a function over the matrix returning a matrix of new type -map :: (Eq a) => (a->b) -> Matrix a -> Matrix b -map f m = - if SymMatrix.null m then empty - else V.map (V.map f) m - --- | flatten concats rows of matrix to make a single Vector -flatten :: Eq a => Matrix a -> V.Vector a -flatten m = - if SymMatrix.null m then V.empty - else - let rowList = fmap (getFullRowVect m) [0..(rows m - 1)] - in - V.concat rowList - --- | zip takes two matrices and zips into a matrix of pairs -zip :: (Eq a, Eq b) => Matrix a -> Matrix b -> Matrix (a,b) -zip m1 m2 = - if dim m1 /= dim m2 then error ("Cannot zip matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else if V.null m1 then V.empty - else - let m1r = V.head m1 - m2r = V.head m2 - newRow = V.zip m1r m2r - in - V.cons newRow (SymMatrix.zip (V.tail m1) (V.tail m2)) - --- | zip takes two matrices and zips into a matrix using f -zipWith :: (Eq a, Eq b) => (a -> b -> c) -> Matrix a -> Matrix b -> Matrix c -zipWith f m1 m2 = - if dim m1 /= dim m2 then error ("Cannot zip matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else if V.null m1 then V.empty - else - let m1r = V.head m1 - m2r = V.head m2 - newRow = V.zipWith f m1r m2r - in - V.cons newRow (SymMatrix.zipWith f (V.tail m1) (V.tail m2)) - --where g (a,b) = f a b - --- | combine takes an operator f (Enforcing Num as opposed to zipWith) and two matrices --- applying f to each element of the two matrices M1 f M2 --- to create the output -combine :: Eq a => (a -> a -> a) -> Matrix a -> Matrix a -> Matrix a -combine f m1 m2 = - if SymMatrix.null m1 then error "Null matrix 1 in combine" - else if SymMatrix.null m2 then error "Null matrix 2 in combine" - else if dim m1 /= dim m2 then error ("Cannot combine matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else - SymMatrix.map g $ SymMatrix.zip m1 m2 - where g (a,b) = f a b - --- | getCost is helper function for generation for a dense TCM -getCost :: Matrix Int -> Word -> Word -> Word -getCost localCM i j = - let x = SymMatrix.getFullVects localCM - in toEnum $ (x V.! fromEnum i) V.! fromEnum j diff --git a/pkg/PhyloLib/src/SymMatrixSeq.hs b/pkg/PhyloLib/src/SymMatrixSeq.hs deleted file mode 100644 index 46f29ed81..000000000 --- a/pkg/PhyloLib/src/SymMatrixSeq.hs +++ /dev/null @@ -1,439 +0,0 @@ -{- | -Module : SymMatrixSeq.hs -Description : Fu\nctions to manipulate square symmetric lower diagonal matrices with diagnonal values - as if they were normal matrices, based on Sequence type -Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. -License : - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. - -Maintainer : Ward Wheeler -Stability : unstable -Portability : portable (I hope) - --} - -{-# Language ImportQualifiedPost #-} - -module SymMatrixSeq - ( empty - , dim - , fromLists - , Matrix - , SymMatrixSeq.null - , cols - , rows - , (!) - , toLists - , toRows - , fromRows - , toFullLists - , getFullRow - , isSymmetric - , updateMatrix - , unsafeUpdateMatrix - , addMatrixRow - , addMatrices - , deleteRowsAndColumns - , showMatrixNicely - , SymMatrixSeq.map - , SymMatrixSeq.flatten - , getFullRowVect - , SymMatrixSeq.zipWith - , SymMatrixSeq.zip - , combine - , safeIndex - , makeDefaultMatrix - , toVectorVector - ) where - -import Data.List qualified as L -import Data.Sort qualified as S -import LocalSequence qualified as LS -import Data.Vector qualified as V - --- | Matrix type as Vector of Vectors -type Matrix a = LS.Seq (LS.Seq a) - --- | functions for triples -fst3 :: (a,b,c) -> a -fst3 (d,_,_) = d - --- | empty matrix value from Vector -empty :: Matrix a -empty = LS.empty - --- | dim returns dimmension (rows = cols) -dim :: (Eq a) => Matrix a -> (Int, Int) -dim inMatrix = - if SymMatrixSeq.null inMatrix then (0,0) - else (LS.length inMatrix, LS.length inMatrix) - --- | rows returns number rows in Matrix (rows = cols) -rows :: (Eq a) => Matrix a -> Int -rows inM = fst $ dim inM - --- | cols returns number cols in Matrix (rows = cols) -cols :: (Eq a) => Matrix a -> Int -cols inM = fst $ dim inM - --- | null returns True of row number is 0 -null :: (Eq a) => Matrix a -> Bool -null inMatrix = inMatrix == empty - --- | isSymmetric is true by defineition--when created error if not -isSymmetric :: (Eq a) => Matrix a -> Bool -isSymmetric inM = - not (SymMatrixSeq.null inM) || error "Null matrix in isSymmetric" - --- | fromRows creates a lower diagonal matrix (with diagonal) -fromRows :: (Eq a, Show a) => [LS.Seq a] -> Matrix a -fromRows inVectList = fromLists $ fmap LS.toList inVectList - --- | toRows converts a Matrix to a list of Vectors --- unequal in length -toRows :: Matrix a -> [LS.Seq a] -toRows = LS.toList - --- | fromLists takes list of list of a and returns lower diagnoal (with diagonal) --- matrix as Matrix (Vector of Vectors) -fromLists :: (Eq a, Show a) => [[a]] -> Matrix a -fromLists inListList = - if L.null inListList then empty - else - let initialSquare = LS.map LS.fromList $ LS.fromList inListList - colsH = LS.length $ LS.head initialSquare - rowsH = LS.length initialSquare - in - if colsH /= rowsH then error ("Input matrix is not square " ++ (show (colsH, rowsH)) ++ " " ++ show inListList) - else - let indexPairs = cartProd [0..(rowsH - 1)] [0..(rowsH - 1)] - sym = checkSymmetry initialSquare indexPairs - in - if not sym then error "Input matrix not symmetrical" - else makeLowerDiag initialSquare 0 rowsH - --- | toLists takes a Matrix and returns a list of lists (not all same length) -toLists :: Eq a => Matrix a -> [[a]] -toLists inM = - if SymMatrixSeq.null inM then [] - else - LS.toList $ LS.map LS.toList inM - --- | toFullLists takes a Matrix and returns a list of lists of full length --- square matrix -toFullLists :: Eq a => Matrix a -> [[a]] -toFullLists inM = - if SymMatrixSeq.null inM then [] - else - fmap (getFullRow inM) [0..(rows inM - 1)] - --- | getFullRow returns a specific full row (is if matrix were square) --- as a list -getFullRow :: Eq a => Matrix a -> Int -> [a] -getFullRow inM index = - if SymMatrixSeq.null inM then [] - else - let firstPart = LS.toList $ inM LS.! index -- initial [0..index] of elements - restMatrix = LS.drop (index + 1) inM - restByColumn = LS.toList $ LS.map (LS.! index) restMatrix - in - firstPart ++ restByColumn - --- | getFullRowVect reurns a specific full row (is if matrix were square) --- as a Vector -getFullRowVect :: Eq a => Matrix a -> Int -> LS.Seq a -getFullRowVect inM index = - if SymMatrixSeq.null inM then LS.empty - else - let firstPart = inM LS.! index -- initial [0..index] of elements - restMatrix = LS.drop (index + 1) inM - restByColumn = LS.map (LS.! index) restMatrix - in - firstPart LS.++ restByColumn - --- | indexing lower diag matrix -(!) :: Matrix a -> (Int, Int) -> a -(!) inM (iIndex,jIndex) = - if iIndex > jIndex then - (inM LS.! iIndex) LS.! jIndex - else - (inM LS.! jIndex) LS.! iIndex - --- | indexing lower diag matrix -safeIndex :: Matrix a -> (Int, Int) -> a -safeIndex inM (iIndex,jIndex) = - if iIndex >= (length inM) then error ("First out of bounds " ++ show (iIndex, (length inM))) - else if jIndex >= (length inM) then error ("Second out of bounds " ++ show (jIndex, (length inM))) - else if iIndex > jIndex then - (inM LS.! iIndex) LS.! jIndex - else - (inM LS.! jIndex) LS.! iIndex - --- | makeLowerDiag take a Vector of Vetors (Matrix) and returns a lower diagonal matrix --- including diagonal -makeLowerDiag :: (Eq a) => Matrix a -> Int -> Int -> Matrix a -makeLowerDiag inM row numRows - | SymMatrixSeq.null inM = error "Input matrix is empty in makeLowerDiag" - | row == numRows = LS.empty - | otherwise = - let origRow = inM LS.! row - newRow = LS.take (row + 1) origRow - in - LS.cons newRow (makeLowerDiag inM (row + 1) numRows) - --- | makeDefaultMatrix creates an all 1 wth diagonal 0 matrix of size n x n -makeDefaultMatrix :: Int -> Matrix Int -makeDefaultMatrix n = - let row = replicate n 1 - rowList = replicate n row - initialMattrix = fromLists rowList - updateIndexList = [0..(n-1)] - zeroList = replicate n 0 - updateList = zip3 updateIndexList updateIndexList zeroList - in - updateMatrix initialMattrix updateList - - --- | cartesian product of two lists -cartProd :: [a] -> [a] -> [(a,a)] -cartProd xs ys = [(x,y) | x <- xs, y <- ys] - --- | checkSymmetry takes a Vector of VEctors of a --- and checks for symmetry -checkSymmetry :: (Eq a, Show a) => Matrix a -> [(Int, Int)] -> Bool -checkSymmetry inVV pairList - | SymMatrixSeq.null inVV = error "Input matrix is empty" - | L.null pairList = True - | otherwise = - let (iIndex, jIndex) = head pairList - firstCheck = ((inVV LS.! iIndex) LS.! jIndex) == ((inVV LS.! jIndex) LS.! iIndex) - in - if firstCheck then checkSymmetry inVV (tail pairList) - else error ("Matrix is not symmetrical:" ++ show (iIndex, jIndex) ++ "=>" ++ show ((inVV LS.! iIndex) LS.! jIndex) ++ " /= " ++ show ((inVV LS.! jIndex) LS.! iIndex)) - --- | addMatrixRow add a row to existing matrix to extend Matrix dimension --- used when adding HTU distances to existing distance matrix as Wagner tree is built -addMatrixRow :: (Eq a) => Matrix a -> LS.Seq a -> Matrix a -addMatrixRow inM newRow = - if LS.null newRow then inM - else - inM `LS.snoc` newRow - - --- | addMatrices adds a Matrix to existing matrix to extend Matrix dimension -addMatrices :: (Eq a) => Matrix a -> Matrix a -> Matrix a -addMatrices inM newMatrix = - if SymMatrixSeq.null newMatrix then inM - else - inM LS.++ newMatrix - --- | reIndexTriple taske (i,j,k) and returns (max i j, min i j, k) -reIndexTriple :: (Ord a) => (a, a, b) -> (a, a, b) -reIndexTriple trip@(iIndex, jIndex, value) = - if iIndex > jIndex then trip - else (jIndex, iIndex, value) - --- | updateMatrix takes a list of triples and update matrix --- update all at once checking for bounds --- could naively do each triple in turn, but would be alot of copying -updateMatrix :: (Show a, Ord a) => Matrix a -> [(Int, Int, a)] -> Matrix a -updateMatrix inM modList = - if L.null modList then inM - else - let orderedTripleList = S.uniqueSort $ fmap reIndexTriple modList - minRow = fst3 $ head orderedTripleList - maxRow = fst3 $ last orderedTripleList - in - if minRow < 0 then error ("Update matrix out of bounds: " ++ show orderedTripleList) - else if maxRow >= rows inM then error ("Update matrix out of bounds, row = " ++ show (rows inM) ++ " and trying to update row " ++ show maxRow) - else - let firstPart = LS.unsafeTake minRow inM - restPart = LS.unsafeDrop minRow inM - modifiedRemainder = updateRows restPart orderedTripleList minRow - in - addMatrices firstPart modifiedRemainder - --- | unsafeUpdateMatrix unsafe version of updateMatrix -unsafeUpdateMatrix :: (Show a, Ord a) => Matrix a -> [(Int, Int, a)] -> Matrix a -unsafeUpdateMatrix inM modList = - if L.null modList then inM - else - let orderedTripleList = S.uniqueSort $ fmap reIndexTriple modList - minRow = fst3 $ head orderedTripleList - firstPart = LS.unsafeTake minRow inM - restPart = LS.unsafeDrop minRow inM - modifiedRemainder = updateRows restPart orderedTripleList minRow - in - addMatrices firstPart modifiedRemainder - --- | updateRows takes the section of the matrix containing rows that wil be modified --- (some not) and modifes or copies rows and rerns a Matrix (vector of roow vectors) -updateRows :: (Show a, Eq a) => Matrix a -> [(Int, Int, a)] -> Int -> Matrix a -updateRows inM tripList currentRow - | L.null tripList = inM - | SymMatrixSeq.null inM = error ("Matrix is empty and there are modifications that remain: " ++ show tripList) - | otherwise = - let (rowIndex, columnIndex, value) = L.head tripList - firstOrigRow = LS.head inM - in - if currentRow /= rowIndex then firstOrigRow `LS.cons` updateRows (LS.tail inM) tripList (currentRow + 1) - else -- account for multiple modifications to same row - let (newRow, newTripList) = modifyRow firstOrigRow columnIndex value currentRow (L.tail tripList) - in - -- This for debug--remove after test - newRow `LS.cons` updateRows (LS.tail inM) newTripList (currentRow + 1) - - --- | modifyRow takes an initial modification (column and value) and then checks to see if there are more modifications in that --- row (rowNumber) in the remainder of the list of modifications, returning the new row and mod list as a pair --- assumes that sorted triples sort by first, second, then third elements -modifyRow :: LS.Seq a -> Int -> a -> Int -> [(Int, Int, a)] -> (LS.Seq a, [(Int, Int, a)]) -modifyRow inRow colIndex value rowNumber modList = - if colIndex >= LS.length inRow then error ("Column to modify is outside length of row " ++ show (rowNumber, colIndex)) - else - let firstPart = LS.unsafeTake colIndex inRow - remainderPart = LS.unsafeDrop (colIndex + 1) inRow - newRow = firstPart LS.++ (value `LS.cons` remainderPart) - in - if L.null modList then (newRow, modList) - else continueRow (firstPart `LS.snoc` value) inRow (colIndex + 1) rowNumber modList - - --- | continueRow continues to modify a row with multiple column modifcations --- assumes that sorted triples sorted by first, second, then third elements -continueRow :: LS.Seq a ->LS.Seq a -> Int -> Int -> [(Int, Int, a)] -> (LS.Seq a, [(Int, Int, a)]) -continueRow partRow origRow colIndex rowNumber modList - | colIndex == LS.length origRow = (partRow, modList) - | L.null modList = (partRow LS.++ LS.unsafeDrop colIndex origRow, modList) - | otherwise = - let (nextRowNumber, nextColIndex, nextValue) = L.head modList - in - if nextRowNumber /= rowNumber then (partRow LS.++ LS.unsafeDrop colIndex origRow, modList) - else - if nextColIndex /= colIndex then continueRow (partRow `LS.snoc` (origRow LS.! colIndex)) origRow (colIndex + 1) rowNumber modList - else continueRow (partRow `LS.snoc` nextValue) origRow (colIndex + 1) rowNumber (L.tail modList) - --- | makeNiceRow pretty preints a list -makeNiceRow :: (Eq a, Show a) => LS.Seq a -> String -makeNiceRow aVect = - if LS.null aVect then "\n" - else - show (LS.head aVect) ++ " " ++ makeNiceRow (LS.tail aVect) - --- | showNicely pretty prins matrix -showMatrixNicely :: (Show a, Eq a) => Matrix a -> String -showMatrixNicely inM = - let mRows = rows inM - mCols = cols inM - niceRows = LS.map makeNiceRow inM - in - ("Dimensions: :" ++ show mRows ++ " " ++ show mCols ++ "\n" ++ concat niceRows) - --- | deleteRowsAndColumns take a list of rows (and same index for columns) --- to delete from Matrix. Uses lisyt to do in single pass -deleteRowsAndColumns :: (Show a, Eq a) => Matrix a -> [Int] -> Matrix a -deleteRowsAndColumns inM deleteList = - if L.null deleteList then inM - else deleteRC inM deleteList (rows inM) 0 - - --- | deleteRC takes matri delete list and counter to delte coumns and rows -deleteRC :: (Show a, Eq a) => Matrix a -> [Int] -> Int -> Int -> Matrix a -deleteRC inM deleteList origRows rowCounter = - if rowCounter == origRows then empty - else - let firstRow = LS.head inM - toKeep = rowCounter `L.notElem` deleteList - newRow = deleteColumn firstRow deleteList (rowCounter + 1) 0 - in - if toKeep then newRow `LS.cons` deleteRC (LS.tail inM) deleteList origRows (rowCounter + 1) - else deleteRC (LS.tail inM) deleteList origRows (rowCounter + 1) - --- | deleteColumn takes a row of a matrix (lower diagnonal), its length, --- a list of cilumns to delete and a column counter and creates a new row -deleteColumn :: (Show a, Eq a) => LS.Seq a -> [Int] -> Int -> Int -> LS.Seq a -deleteColumn origRow deleteList rowLength colCounter = - if colCounter == rowLength then LS.empty - else - let firstValue = LS.head origRow - toKeep = colCounter `L.notElem` deleteList - in - if toKeep then firstValue `LS.cons` deleteColumn (LS.tail origRow) deleteList rowLength (colCounter + 1) - else deleteColumn (LS.tail origRow) deleteList rowLength (colCounter + 1) - --- | map maps a function over the matrix returning a matrix of new type -map :: (Eq a) => (a->b) -> Matrix a -> Matrix b -map f m = - if SymMatrixSeq.null m then empty - else LS.map (LS.map f) m - --- | flatten concats rows of matrix to make a single Vector -flatten :: Eq a => Matrix a -> LS.Seq a -flatten m = - if SymMatrixSeq.null m then LS.empty - else LS.concat m - --- | zip takes two matrices and zips into a matrix of pairs -zip :: (Eq a, Eq b) => Matrix a -> Matrix b -> Matrix (a,b) -zip m1 m2 = - if dim m1 /= dim m2 then error ("Cannot zip matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else if LS.null m1 then LS.empty - else - let m1r = LS.head m1 - m2r = LS.head m2 - newRow = LS.zip m1r m2r - in - LS.cons newRow (SymMatrixSeq.zip (LS.tail m1) (LS.tail m2)) - --- | zip takes two matrices and zips into a matrix using f -zipWith :: (Eq a, Eq b) => (a -> b -> c) -> Matrix a -> Matrix b -> Matrix c -zipWith f m1 m2 = - if dim m1 /= dim m2 then error ("Cannot zip matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else if LS.null m1 then LS.empty - else - let m1r = LS.head m1 - m2r = LS.head m2 - newRow = LS.map g $ LS.zip m1r m2r - in - LS.cons newRow (SymMatrixSeq.zipWith f (LS.tail m1) (LS.tail m2)) - where g (a,b) = f a b - --- | combine takes an operator f (Enforcing Num as opposed to zipWith) and two matrices --- applying f to each element of the two matrices M1 f M2 --- to create the output -combine :: Eq a => (a -> a -> a) -> Matrix a -> Matrix a -> Matrix a -combine f m1 m2 = - if SymMatrixSeq.null m1 then error "Null matrix 1 in combine" - else if SymMatrixSeq.null m2 then error "Null matrix 2 in combine" - else if dim m1 /= dim m2 then error ("Cannot combine matrices with unequal dimensions " ++ (show $ dim m1) ++ " " ++ (show $ dim m2)) - else - SymMatrixSeq.map g $ SymMatrixSeq.zip m1 m2 - where g (a,b) = f a b - - --- | convert to Vector based matrix -toVectorVector :: Matrix a -> V.Vector (V.Vector a) -toVectorVector inMat = LS.toVector $ LS.map LS.toVector inMat diff --git a/src/Commands/CommandExecution.hs b/src/Commands/CommandExecution.hs new file mode 100644 index 000000000..6b488f52e --- /dev/null +++ b/src/Commands/CommandExecution.hs @@ -0,0 +1,1417 @@ +{-# LANGUAGE CPP #-} + +{- | +Module to coordinate command execution. +-} +module Commands.CommandExecution ( + executeCommands, + executeRenameReblockCommands, + getDataListList, +) where + +import Commands.CommandUtilities +import Commands.Transform qualified as TRANS +import Commands.Verify qualified as VER +import Control.Arrow ((&&&)) +import Control.Monad (unless, when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Bifunctor (bimap) +import Data.CSV qualified as CSV +import Data.Char +import Data.Char qualified as C +import Data.Foldable (fold) +import Data.InfList qualified as IL +import Data.List qualified as L +import Data.List.Split qualified as SL +import Data.Maybe +import Data.Ord +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import Data.Version qualified as DV +import GeneralUtilities +import GraphOptimization.Traversals qualified as TRAV +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import Reconciliation.ReconcileGraphs qualified as R +import Search.Build qualified as B +import Search.Refinement qualified as REF +import Search.Search qualified as S +import Support.Support qualified as SUP +import System.CPU qualified as SC +import System.IO +import System.IO.Unsafe qualified as SIOU +import System.Info qualified as SI +import System.Timing +import Text.Read +import Types.Types +import Utilities.Distances qualified as D +import Utilities.Utilities qualified as U + + +{- | executeCommands reads input files and returns raw data +need to close files after read +-} +executeCommands + ∷ GlobalSettings + → ([NameText], [(NameText, NameText)]) + → Int + → String + → ProcessedData + → ProcessedData + → ProcessedData + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → [Command] + → Bool + → PhyG ([ReducedPhylogeneticGraph], GlobalSettings, [ReducedPhylogeneticGraph]) +executeCommands globalSettings excludeRename numInputFiles crossReferenceString origProcessedData processedData reportingData curGraphs supportGraphList commandList isFirst = case commandList of + [] → pure (curGraphs, globalSettings, supportGraphList) + (firstOption, firstArgs) : otherCommands → case firstOption of + -- skip "Read" and "Rename "commands already processed + Read → error ("Read command should already have been processed: " <> show (firstOption, firstArgs)) + Rename → error ("Rename command should already have been processed: " <> show (firstOption, firstArgs)) + Reblock → error ("Reblock command should already have been processed: " <> show (firstOption, firstArgs)) + Run → error ("Run command should already have been processed: " <> show (firstOption, firstArgs)) + -- other commands + Build → do + (elapsedSeconds, newGraphList') ← + timeOp . pure $ + B.buildGraph firstArgs globalSettings processedData + newGraphList ← newGraphList' + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + (curGraphs <> newGraphList) + supportGraphList + otherCommands + isFirst + Refine → do + (elapsedSeconds, newGraphList') ← + timeOp . pure $ + REF.refineGraph firstArgs globalSettings processedData curGraphs + + newGraphList ← newGraphList' + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + newGraphList + supportGraphList + otherCommands + isFirst + Fuse → do + (elapsedSeconds, newGraphList) ← + timeOp $ + REF.fuseGraphs firstArgs globalSettings processedData curGraphs + + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + newGraphList + supportGraphList + otherCommands + isFirst + Report → do + let doDotPDF = elem "dotpdf" $ fmap (fmap toLower . fst) firstArgs + let collapse' = elem "collapse" $ fmap (fmap toLower . fst) firstArgs + let noCollapse' = elem "nocollapse" $ fmap (fmap toLower . fst) firstArgs + let reconcile = any ((== "reconcile") . fst) firstArgs + + -- set default collapse for dotPDF to True, False otherwise + let collapse -- this will casue problems with reconcile + | reconcile = False + | collapse' = True + | noCollapse' = False + -- | doDotPDF = True + | otherwise = False + + let curGraphs' + | not collapse = curGraphs + | otherwise = U.collapseReducedGraph <$> curGraphs + + -- use 'temp' updated graphs s don't repeatedly add model and root complexityies + -- reporting collapsed + -- reverse sorting graphs by cost + let rediagnoseWithReportingData = optimalityCriterion globalSettings == NCM && U.has4864PackedChars (thd3 processedData) + updatedCostGraphs ← + TRAV.updateGraphCostsComplexities globalSettings reportingData processedData rediagnoseWithReportingData curGraphs' + let graphsWithUpdatedCosts = + L.sortOn + (Data.Ord.Down . snd5) + updatedCostGraphs + -- (TRAV.updateGraphCostsComplexities globalSettings reportingData processedData rediagnoseWithReportingData curGraphs') + reportStuff@(reportString, outFile, writeMode) ← + reportCommand + globalSettings + firstArgs + excludeRename + numInputFiles + crossReferenceString + reportingData + graphsWithUpdatedCosts + supportGraphList + + unless (null reportString) . logWith LogInfo $ "Report writing to \"" <> outFile <> "\"\n" + + case reportString of + "" → + executeCommands + globalSettings + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + curGraphs + supportGraphList + otherCommands + isFirst + _ | doDotPDF → do + let reportString' = changeDotPreamble "digraph {" "digraph G {\n\trankdir = LR;\tedge [colorscheme=spectral11];\tnode [shape = none];\n" reportString + printGraphVizDot reportString' outFile + executeCommands + globalSettings + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + curGraphs + supportGraphList + otherCommands + isFirst + _ → do + case outFile of + "stderr" → liftIO $ hPutStr stderr reportString + "stdout" → liftIO $ putStr reportString + _ → case writeMode of + "overwrite" → liftIO $ writeFile outFile reportString + "append" → liftIO $ appendFile outFile reportString + _ → + failWithPhase Parsing $ + "Error 'report' command not properly formatted" <> show reportStuff + + executeCommands + globalSettings + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + curGraphs + supportGraphList + otherCommands + isFirst + Search → do + (elapsedSeconds, output) ← + timeOp $ + S.search firstArgs globalSettings processedData curGraphs + let searchInfo = + makeSearchRecord + firstOption + firstArgs + curGraphs + (fst output) + (fromIntegral $ toMilliseconds elapsedSeconds) + (concatMap (L.intercalate "\n") (snd output)) + let newSearchData = searchInfo : searchData globalSettings + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + (fst output) + supportGraphList + otherCommands + isFirst + Select → do + (elapsedSeconds, newGraphList) ← + timeOp $ + GO.selectPhylogeneticGraphReduced firstArgs curGraphs + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + let typeSelected = case firstArgs of + [] → "best" + (a, _) : _ → C.toLower <$> a + logWith LogInfo $ unwords ["Selecting", typeSelected, "graphs", "\n"] + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + newGraphList + supportGraphList + otherCommands + isFirst + Set → do + -- if set changes graph aspects--may need to reoptimize + (newGlobalSettings, newProcessedData) ← setCommand firstArgs globalSettings reportingData processedData isFirst + let needReoptimize = requireReoptimization globalSettings newGlobalSettings + newGraphList ← + if not needReoptimize + then pure curGraphs + else -- then logWith LogInfo "No need to reoptimize graphs\n" $> curGraphs + do + logWith LogInfo "Reoptimizing graphs\n" + -- TODO should be parallel + mapM (TRAV.multiTraverseFullyLabelGraphReduced newGlobalSettings newProcessedData True True Nothing) $ fst5 <$> curGraphs + + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList 0 "No Comment" + let newSearchData = searchInfo : searchData newGlobalSettings + + executeCommands + (newGlobalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + newGraphList + supportGraphList + otherCommands + False + Swap → do + (elapsedSeconds, newGraphList) ← + timeOp $ + REF.swapMaster firstArgs globalSettings processedData curGraphs + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + newGraphList + supportGraphList + otherCommands + isFirst + Support → do + (elapsedSeconds, newSupportGraphList') ← + timeOp . pure $ + SUP.supportGraph firstArgs globalSettings processedData curGraphs + + newSupportGraphList ← newSupportGraphList' + let searchInfo = + makeSearchRecord firstOption firstArgs curGraphs newSupportGraphList (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + executeCommands + (globalSettings{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + origProcessedData + processedData + reportingData + curGraphs + (supportGraphList <> newSupportGraphList) + otherCommands + isFirst + Transform → do + (elapsedSeconds, (newGS, newOrigData, newProcessedData, newGraphs)) ← + timeOp $ + TRANS.transform firstArgs globalSettings origProcessedData processedData curGraphs + + let searchInfo = makeSearchRecord firstOption firstArgs curGraphs newGraphs (fromIntegral $ toMilliseconds elapsedSeconds) "No Comment" + let newSearchData = searchInfo : searchData globalSettings + executeCommands + (newGS{searchData = newSearchData}) + excludeRename + numInputFiles + crossReferenceString + newOrigData + newProcessedData + reportingData + newGraphs + supportGraphList + otherCommands + isFirst + val → error $ "Command " <> show val <> " not recognized/implemented" + + +-- | makeSearchRecord take sbefore and after data of a commend and returns SearchData record +makeSearchRecord + ∷ Instruction → [Argument] → [ReducedPhylogeneticGraph] → [ReducedPhylogeneticGraph] → Int → String → SearchData +makeSearchRecord firstOption firstArgs curGraphs newGraphList elapsedTime comment = + SearchData + { instruction = firstOption + , arguments = firstArgs + , minGraphCostIn = + if null curGraphs + then infinity + else minimum $ fmap snd5 curGraphs + , maxGraphCostIn = + if null curGraphs + then infinity + else maximum $ fmap snd5 curGraphs + , numGraphsIn = length curGraphs + , minGraphCostOut = + if null newGraphList + then infinity + else minimum $ fmap snd5 newGraphList + , maxGraphCostOut = + if null newGraphList + then infinity + else maximum $ fmap snd5 newGraphList + , numGraphsOut = length newGraphList + , commentString = comment + , duration = elapsedTime + } + + +{- | setCommand takes arguments to change globalSettings and multiple data aspects (e.g. 'blocks') +needs to be abtracted--too long +if seed list is empty [] then processes first set--confusing--shold be refactored +-} +setCommand ∷ [Argument] → GlobalSettings → ProcessedData → ProcessedData → Bool → PhyG (GlobalSettings, ProcessedData) +setCommand argList globalSettings origProcessedData processedData isFirst = + let material ∷ Argument → Bool + material (k, v) = not $ null k || null v + normalize = fmap C.toLower + niceArgs = bimap normalize normalize <$> filter material argList + (commandList, optionList) = unzip niceArgs + checkCommandList = checkCommandArgs "set" commandList VER.setArgList + leafNameVect = fst3 processedData + in case niceArgs of + [] → failWithPhase Computing "Attempting to process a non existant 'set' command" + (firstCommand, firstOption) : _otherArgs → do + when (not checkCommandList) . failWithPhase Parsing $ + "Unrecognized command in 'set': " <> show argList + + when (length commandList > 1 || length optionList > 1) . failWithPhase Parsing $ + "Set option error: can only have one set argument for each command: " <> show (commandList, optionList) + + case isFirst of + True → case firstCommand of + "partitioncharacter" → case firstOption of + localPartitionChar@[_] → + do + logWith LogInfo $ "PartitionCharacter set to '" <> localPartitionChar <> "'\n" + let x = globalSettings{partitionCharacter = localPartitionChar} + pure (x, processedData) + val → + failWithPhase Parsing $ + "Error in 'set' command. Partitioncharacter '" <> val <> "' must be a single character\n" + "missingthreshold" → case readMaybe (firstOption) ∷ Maybe Int of + Nothing → + failWithPhase Parsing $ + "Set option 'missingThreshold' must be set to an integer value (e.g. missingThreshold:50): " <> firstOption + Just val + | val < 0 || 100 < val → + failWithPhase Parsing $ + "Set option 'missingThreshold' must be set to an integer value between 0 and 100: " <> firstOption + Just val → do + logWith LogInfo $ "MissingThreshold set to " <> firstOption <> "\n" + pure (globalSettings{missingThreshold = val}, processedData) + + -- sets root cost as well-- need in both places--one to process data and one to + -- keep in current global + -- MUST be set aheard of data packing so correct--otherwise alhobaet etc modified + "criterion" → do + localCriterion ← case firstOption of + "parsimony" → pure Parsimony + "pmdl" → pure PMDL + "si" → pure SI + "mapa" → pure MAPA + "ncm" → pure NCM + val → + failWithPhase Parsing $ + "Error in 'set' command. Criterion '" <> val <> "' is not 'parsimony', 'ml', or 'pmdl'" + + -- create lazy list of graph complexity indexed by number of network nodes--need leaf number for base tree complexity + (lGraphComplexityList, lRootComplexity) ← case localCriterion of + -- NCM | origProcessedData /= emptyProcessedData → + -- pure (IL.repeat (0.0, 0.0), U.calculateNCMRootCost origProcessedData) + + NCM → pure (IL.repeat (0.0, 0.0), U.calculateNCMRootCost origProcessedData) + Parsimony → pure $ (IL.repeat (0.0, 0.0), 0.0) + MAPA → pure $ (IL.repeat (0.0, 0.0), U.calculateMAPARootCost origProcessedData) + val + | val `elem` [PMDL, SI] → + pure $ (U.calculateGraphComplexity &&& (U.calculatePMDLRootCost True)) origProcessedData + val → failWithPhase Parsing $ "Optimality criterion not recognized: " <> show val + + let lGraphFactor + | localCriterion `elem` [PMDL, SI, MAPA] = PMDLGraph + | otherwise = graphFactor globalSettings + + logWith LogInfo $ case localCriterion of + NCM → unwords ["Optimality criterion set to", show NCM, "in -log (base 10) likelihood units\n"] + val + | val `elem` [PMDL, SI] → + unwords ["Optimality criterion set to", show val, "Tree Complexity =", show . fst $ IL.head lGraphComplexityList, "bits\n"] + val → "Optimality criterion set to " <> show val <> "\n" + + pure $ + ( globalSettings + { optimalityCriterion = localCriterion + , graphComplexityList = lGraphComplexityList + , rootComplexity = lRootComplexity + , graphFactor = lGraphFactor + } + , processedData + ) + "bc2" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc2 globalSettings /= val) . logWith LogInfo $ + "bit cost 2 state set to " <> show val <> "\n" + pure (globalSettings{bc2 = val}, processedData) + "bc4" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc4 globalSettings /= val) . logWith LogInfo $ + "bit cost 4 state set to " <> show val <> "\n" + pure (globalSettings{bc4 = val}, processedData) + "bc5" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc5 globalSettings /= val) . logWith LogInfo $ + "bit cost 5 state set to " <> show val <> "\n" + pure (globalSettings{bc5 = val}, processedData) + "bc8" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc8 globalSettings /= val) . logWith LogInfo $ + "bit cost 8 state set to " <> show val <> "\n" + pure (globalSettings{bc8 = val}, processedData) + "bc64" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc64 globalSettings /= val) . logWith LogInfo $ + "bit cost 64 state set to " <> show val <> "\n" + pure (globalSettings{bc64 = val}, processedData) + "bcgt64" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bcgt64 globalSettings /= val) . logWith LogInfo $ + "bit cost > 64 state set to " <> show val <> "\n" + pure (globalSettings{bcgt64 = val}, processedData) + + + val → do + when (val `notElem` VER.setArgList) . logWith LogWarn $ + fold ["Warning: Unrecognized/missing 'set' option '", val, "' in ", show argList, "\n"] + pure (globalSettings, processedData) + + -- =-=-=-=-=-=-=-=-=-=-=-=-= + -- = = + -- = regular command stuff = + -- = not initial at start = + -- = = + -- =-=-=-=-=-=-=-=-=-=-=-=-= + _ → case firstCommand of + "bc2" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc2' must be set to a pair of double values in parens, separated by a comma (e.g. bc2:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc2 globalSettings /= val) . logWith LogInfo $ + "bit cost 2 state set to " <> show val <> "\n" + pure (globalSettings{bc2 = val}, processedData) + "bc4" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc4' must be set to a pair of double values in parens, separated by a comma (e.g. bc4:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc4 globalSettings /= val) . logWith LogInfo $ + "bit cost 4 state set to " <> show val <> "\n" + pure (globalSettings{bc4 = val}, processedData) + "bc5" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc5' must be set to a pair of double values in parens, separated by a comma (e.g. bc5:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc5 globalSettings /= val) . logWith LogInfo $ + "bit cost 5 state set to " <> show val <> "\n" + pure (globalSettings{bc5 = val}, processedData) + "bc8" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc8' must be set to a pair of double values in parens, separated by a comma (e.g. bc8:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc8 globalSettings /= val) . logWith LogInfo $ + "bit cost 8 state set to " <> show val <> "\n" + pure (globalSettings{bc8 = val}, processedData) + "bc64" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bc64' must be set to a pair of double values in parens, separated by a comma (e.g. bc64:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bc64 globalSettings /= val) . logWith LogInfo $ + "bit cost 64 state set to " <> show val <> "\n" + pure (globalSettings{bc64 = val}, processedData) + "bcgt64" → + let (noChangeString, changeString) = changingStrings firstOption + noChangeMaybe = readMaybe noChangeString ∷ Maybe Double + changeMaybe = readMaybe changeString ∷ Maybe Double + in do + when (length commandList /= length optionList) . failWithPhase Parsing $ + "Set option error: number of values and options do not match:" <> show (commandList, optionList) + when (null $ firstOption) . failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no values found " + when (',' `notElem` firstOption) . failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): no comma found " + case liftA2 (,) noChangeMaybe changeMaybe of + Nothing → + failWithPhase Parsing $ + "Set option 'bcgt64' must be set to a pair of double values in parens, separated by a comma (e.g. bcgt64:(0.1, 1.1): " + <> firstOption + Just val@(_, _) → do + when (bcgt64 globalSettings /= val) . logWith LogInfo $ + "bit cost > 64 state set to " <> show val <> "\n" + pure (globalSettings{bcgt64 = val}, processedData) + + -- processed above, but need here since put in different value + "criterion" → do + localCriterion ← case firstOption of + "parsimony" → pure Parsimony + "pmdl" → pure PMDL + "si" → pure SI + "mapa" → pure MAPA + "ncm" → pure NCM + val → + failWithPhase Parsing $ + "Error in 'set' command. Criterion '" <> val <> "' is not 'parsimony', 'ml', or 'pmdl'" + + -- create lazy list of graph complexity indexed by number of network nodes--need leaf number for base tree complexity + (lGraphComplexityList, lRootComplexity) ← case localCriterion of + -- NCM | origProcessedData /= emptyProcessedData → + -- pure (IL.repeat (0.0, 0.0), U.calculateNCMRootCost origProcessedData) + + NCM → pure (IL.repeat (0.0, 0.0), U.calculateNCMRootCost origProcessedData) + Parsimony → pure $ (IL.repeat (0.0, 0.0), 0.0) + MAPA → pure $ (IL.repeat (0.0, 0.0), U.calculateMAPARootCost origProcessedData) + val + | val `elem` [PMDL, SI] → + pure $ (U.calculateGraphComplexity &&& (U.calculatePMDLRootCost True)) origProcessedData + val → failWithPhase Parsing $ "Optimality criterion not recognized: " <> show val + + let lGraphFactor + | localCriterion `elem` [PMDL, SI, MAPA] = PMDLGraph + | otherwise = graphFactor globalSettings + + logWith LogInfo $ case localCriterion of + NCM → unwords ["Optimality criterion set to", show NCM, "in -log (base 10) likelihood units\n"] + val + | val `elem` [PMDL, SI] → + unwords ["Optimality criterion set to", show val, "Tree Complexity =", show . fst $ IL.head lGraphComplexityList, "bits\n"] + val → "Optimality criterion set to " <> show val <> "\n" + + pure $ + ( globalSettings + { optimalityCriterion = localCriterion + , graphComplexityList = lGraphComplexityList + , rootComplexity = lRootComplexity + , graphFactor = lGraphFactor + } + , processedData + ) + + -- modify the behavior of resolutionCache softwired optimization + "compressresolutions" → do + localCriterion ← case toLower <$> firstOption of + "true" → pure True + "false" → pure False + val → + failWithPhase Parsing $ + "Error in 'set' command. CompressResolutions '" <> val <> "' is not 'true' or 'false'" + logWith LogInfo $ "CompressResolutions set to " <> show localCriterion <> "\n" + pure (globalSettings{compressResolutions = localCriterion}, processedData) + + -- this not intended to be for users + "dynamicepsilon" → case readMaybe (firstOption) ∷ Maybe Double of + Nothing → + failWithPhase Parsing $ + "Set option 'dynamicEpsilon' must be set to a double value >= 0.0 (e.g. dynamicepsilon:0.02): " <> firstOption + Just val + | val < 0.0 → + failWithPhase Parsing $ + "Set option 'dynamicEpsilon' must be set to a double value >= 0.0 (e.g. dynamicepsilon:0.02): " <> show val + Just localValue → do + logWith LogInfo $ "Dynamic Epsilon factor set to " <> firstOption <> "\n" + pure (globalSettings{dynamicEpsilon = 1.0 + (localValue * fractionDynamic globalSettings)}, processedData) + "finalassignment" → do + localMethod ← case firstOption of + "do" → pure DirectOptimization + "directoptimization" → pure DirectOptimization + "ia" → pure ImpliedAlignment + "impliedalignment" → pure ImpliedAlignment + val → + failWithPhase Parsing $ + fold + [ "Error in 'set' command. FinalAssignment '" + , val + , "' is not 'DirectOptimization (DO)' or 'ImpliedAlignment (IA)'" + ] + + case graphType globalSettings of + Tree → do + logWith LogInfo $ "FinalAssignment set to " <> show localMethod <> "\n" + pure (globalSettings{finalAssignment = localMethod}, processedData) + _ → do + unless (localMethod == DirectOptimization) $ + logWith LogInfo "FinalAssignment set to DO (ignoring IA option) for non-Tree graphs\n" + pure (globalSettings{finalAssignment = DirectOptimization}, processedData) + "graphfactor" → do + localMethod ← case toLower <$> firstOption of + "nopenalty" → pure NoNetworkPenalty + "w15" → pure Wheeler2015Network + "w23" → pure Wheeler2023Network + "pmdl" → pure PMDLGraph + val → + failWithPhase Parsing $ + "Error in 'set' command. GraphFactor '" <> val <> "' is not 'NoPenalty', 'W15', 'W23', or 'PMDL'" + logWith LogInfo $ "GraphFactor set to " <> show localMethod <> "\n" + pure (globalSettings{graphFactor = localMethod}, processedData) + "graphssteepest" → case readMaybe (firstOption) ∷ Maybe Int of + Nothing → + failWithPhase Parsing $ + "Set option 'graphsSteepest' must be set to an integer value (e.g. graphsSteepest:5): " <> firstOption + Just localValue → do + logWith LogInfo $ "GraphsStreepest set to " <> show localValue <> "\n" + pure (globalSettings{graphsSteepest = localValue}, processedData) + "graphtype" → do + localGraphType ← case firstOption of + "tree" → pure Tree + "softwired" → pure SoftWired + "hardwired" → pure HardWired + val → + failWithPhase Parsing $ + "Error in 'set' command. Graphtype '" <> val <> "' is not 'tree', 'hardwired', or 'softwired'" + + let netPenalty = case localGraphType of + HardWired → NoNetworkPenalty + _ → graphFactor globalSettings + + let settingResult = case localGraphType of + Tree → globalSettings{graphType = localGraphType} + _ → + globalSettings + { graphType = localGraphType + , finalAssignment = DirectOptimization + , graphFactor = netPenalty + } + when (localGraphType /= Tree) $ + logWith LogInfo $ + unwords + ["Graphtype set to", show localGraphType, "with graph factor NoPenalty and final assignment to DO\n"] + + pure (settingResult, processedData) + + -- In first to do stuff above also + "missingthreshold" → case readMaybe (firstOption) ∷ Maybe Int of + Nothing → + failWithPhase Parsing $ + "Set option 'missingThreshold' must be set to an integer value (e.g. missingThreshold:50): " <> firstOption + Just localValue | localValue == missingThreshold globalSettings → pure (globalSettings, processedData) + Just localValue → do + logWith LogWarn $ "MissingThreshold set to " <> show localValue <> "\n" + pure (globalSettings{missingThreshold = localValue}, processedData) + "modelcomplexity" → case readMaybe (firstOption) ∷ Maybe Double of + Nothing → + failWithPhase Parsing $ + "Set option 'modelComplexity' must be set to a double value (e.g. modelComplexity:123.456): " <> firstOption + Just localValue → do + logWith LogInfo $ "Model Complexity set to " <> firstOption <> "\n" + pure (globalSettings{modelComplexity = localValue}, processedData) + + -- modify the behavior of rerooting character trees for all graph types + "multitraverse" → do + localCriterion ← case toLower <$> firstOption of + "true" → pure True + "false" → pure False + _ → + failWithPhase Parsing $ + "Error in 'set' command. MultiTraverse '" <> firstOption <> "' is not 'true' or 'false'" + logWith LogInfo $ "MultiTraverse set to " <> show localCriterion <> "\n" + pure (globalSettings{multiTraverseCharacters = localCriterion}, processedData) + "outgroup" → + let outTaxonName = T.pack $ filter (/= '"') $ head $ filter (/= "") $ fmap snd argList + in case V.elemIndex outTaxonName leafNameVect of + Nothing → + failWithPhase Parsing $ + unwords + ["Error in 'set' command. Out-taxon", T.unpack outTaxonName, "not found in input leaf list", show $ T.unpack <$> leafNameVect] + Just outTaxonIndex → do + logWith LogInfo $ "Outgroup set to " <> T.unpack outTaxonName <> "\n" + pure (globalSettings{outgroupIndex = outTaxonIndex, outGroupName = outTaxonName}, processedData) + "partitioncharacter" → case firstOption of + localPartitionChar@[_] → do + when (localPartitionChar /= partitionCharacter globalSettings) . logWith LogInfo $ + "PartitionCharacter set to '" <> firstOption <> "'" + pure (globalSettings{partitionCharacter = localPartitionChar}, processedData) + val → + failWithPhase Parsing $ + "Error in 'set' command. Partitioncharacter '" <> val <> "' must be a single character" + "reportnaivedata" → do + localMethod ← case toLower <$> firstOption of + "true" → pure True + "false" → pure False + val → failWithPhase Parsing $ "Error in 'set' command. NeportNaive '" <> val <> "' is not 'True' or 'False'" + logWith LogInfo $ "ReportNaiveData set to " <> show localMethod <> "\n" + pure (globalSettings{reportNaiveData = localMethod}, processedData) + "rootcost" → do + localMethod ← case toLower <$> firstOption of + "mapa" → pure MAPARoot + "ncm" → pure NCMRoot + "norootcost" → pure NoRootCost + "pmdl" → pure PMDLRoot + "si" → pure SIRoot + "w15" → pure Wheeler2015Root + val → failWithPhase Parsing $ "Error in 'set' command. RootCost '" <> val <> "' is not 'NoRootCost', 'W15', or 'PMDL'" + + lRootComplexity ← case localMethod of + NoRootCost → pure 0.0 + val | val `elem` [Wheeler2015Root, PMDLRoot] → pure $ U.calculatePMDLRootCost True origProcessedData + val → failWithPhase Parsing $ "Error in 'set' command. No determined root complexity of '" <> show val <> "'" + + logWith LogInfo $ unwords ["RootCost set to", show localMethod, show lRootComplexity, "bits\n"] + pure (globalSettings{rootCost = localMethod, rootComplexity = lRootComplexity}, processedData) + "seed" → case readMaybe firstOption ∷ Maybe Int of + Nothing → failWithPhase Parsing $ "Set option 'seed' must be set to an integer value (e.g. seed:123): " <> firstOption + Just localValue → do + logWith LogInfo $ "Random Seed set to " <> firstOption <> "\n" + setRandomSeed localValue + pure (globalSettings, processedData) + "softwiredmethod" → do + localMethod ← case firstOption of + "naive" → pure Naive + "exhaustive" → pure Naive + "resolutioncache" → pure ResolutionCache + _ → + failWithPhase Parsing $ + fold ["Error in 'set' command. SoftwiredMethod '", firstOption, "' is not 'Exhaustive' or 'ResolutionCache'"] + + logWith LogInfo $ "SoftwiredMethod " <> show localMethod <> "\n" + pure (globalSettings{softWiredMethod = localMethod}, processedData) + + -- modify the use of Network Add heurisitcs in network optimization + "usenetaddheuristic" → do + localCriterion ← case firstOption of + "true" → pure True + "false" → pure False + val → + failWithPhase Parsing $ + "Error in 'set' command. UseNetAddHeuristic '" <> val <> "' is not 'true' or 'false'" + logWith LogInfo $ "UseNetAddHeuristic set to " <> show localCriterion <> "\n" + pure (globalSettings{useNetAddHeuristic = localCriterion}, processedData) + + -- these not intended for users + "jointhreshold" → case readMaybe (firstOption) ∷ Maybe Double of + Nothing → + failWithPhase Parsing $ + "Set option 'joinThreshold' must be set to an double value >= 1.0 (e.g. joinThreshold:1.17): " <> firstOption + Just localValue + | localValue < 1.0 → + failWithPhase Parsing $ + "Set option 'joinThreshold' must be set to a double value >= 1.0 (e.g. joinThreshold:1.17): " <> show localValue + Just localValue → do + logWith LogInfo $ "JoinThreshold set to " <> show localValue <> "\n" + pure (globalSettings{unionThreshold = localValue}, processedData) + + -- parallel strategy settings options + "defparstrat" → do + localMethod ← case firstOption of + "r0" → pure R0 + "rpar" → pure RPar + "rseq" → pure RSeq + "rdeepseq" → pure RDeepSeq + val → + failWithPhase Parsing $ + "Error in 'set' command. DefParStrat '" <> val <> "' is not 'r0', 'WrPar', 'rSeq', or 'rDeepSeq'" + logWith LogInfo $ "DefParStrat set to " <> show localMethod <> "\n" + pure (globalSettings{defaultParStrat = localMethod}, processedData) + "lazyparstrat" → do + localMethod ← case firstOption of + "r0" → pure R0 + "rpar" → pure RPar + "rseq" → pure RSeq + "rdeepseq" → pure RDeepSeq + val → + failWithPhase Parsing $ + "Error in 'set' command. DefParStrat '" <> val <> "' is not 'r0', 'WrPar', 'rSeq', or 'rDeepSeq'" + logWith LogInfo $ "LazyParStrat set to " <> show localMethod <> "\n" + pure (globalSettings{lazyParStrat = localMethod}, processedData) + "strictparstrat" → do + localMethod ← case firstOption of + "r0" → pure R0 + "rpar" → pure RPar + "rseq" → pure RSeq + "rdeepseq" → pure RDeepSeq + val → + failWithPhase Parsing $ + "Error in 'set' command. DefParStrat '" <> val <> "' is not 'r0', 'WrPar', 'rSeq', or 'rDeepSeq'" + logWith LogInfo $ "StrictParStrat set to " <> show localMethod <> "\n" + pure (globalSettings{strictParStrat = localMethod}, processedData) + + -- modify the use of implied alkignemnt in heuristics + "useia" → do + localCriterion ← case firstOption of + "true" → pure True + "false" → pure False + val → + failWithPhase Parsing $ + "Error in 'set' command. UseIA '" <> val <> "' is not 'true' or 'false'" + logWith LogInfo $ "UseIA set to " <> show localCriterion <> "\n" + pure (globalSettings{useIA = localCriterion}, processedData) + val → do + logWith LogWarn $ fold ["Warning: Unrecognized/missing 'set' option '", val, "' in ", show argList, "\n"] + pure (globalSettings, processedData) + + +{- | +'reportCommand' takes report options, current data and graphs and returns a +(potentially large) String to print and the channel to print it to and write mode +overwrite/append if global settings reportNaiveData is True then need to rediagnose +graph with processed data since naiveData was sent to command and will not match +what is in the optimized graphs. +-} +reportCommand + ∷ GlobalSettings + → [Argument] + → ([NameText], [(NameText, NameText)]) + → Int + → String + → ProcessedData + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → PhyG (String, String, String) +reportCommand globalSettings argList excludeRename numInputFiles crossReferenceString processedData curGraphs supportGraphs = + let argListWithoutReconcileCommands = filter ((`notElem` VER.reconcileArgList) . fst) argList + -- check for balances double quotes and only one pair + outFileNameList = filter (`notElem` ["", "min", "max", "mid"]) $ fmap snd argListWithoutReconcileCommands -- argList + edgeWeigthVal = + let valList = filter (`elem` ["min", "max", "mid"]) $ fmap snd argListWithoutReconcileCommands + in + if null valList then "min" + else head valList + commandList = fmap (fmap C.toLower) $ filter (/= "") $ fmap fst argListWithoutReconcileCommands + in -- reconcileList = filter (/= "") $ fmap fst argList + + if length outFileNameList > 1 + then do failWithPhase Outputting ("Report can only have one file name: " <> show outFileNameList <> " " <> show argList) + else + let checkCommandList = checkCommandArgs "report" commandList VER.reportArgList + outfileName = + if null outFileNameList + then "stderr" + else tail $ L.init $ head outFileNameList + writeMode = + if "overwrite" `elem` commandList + then "overwrite" + else "append" + in -- error too harsh, lose everything else + -- if (null $ filter (/= "overwrite") $ filter (/= "append") commandList) then errorWithoutStackTrace ("Error: Missing 'report' option in " <> show commandList) + -- else + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in report: " <> show argList) + else -- This for edge complexities + if ("complexity" `elem` commandList) && (optimalityCriterion globalSettings `notElem` [PMDL, SI]) + then do + logWith LogInfo "Cannot report edge complexities unless optimality criterion is either PMDL or SI\n" + pure ("Cannot report edge complexities unless optimality criterion is either PMDL or SI", outfileName, writeMode) + else if ("complexity" `elem` commandList) && (optimalityCriterion globalSettings `elem` [PMDL, SI]) + then -- else if (not .null) (L.intersect ["graphs", "newick", "dot", "dotpdf"] commandList) then + let relabelledDecoratedGraph = fmap (relabelEdgeComplexity globalSettings processedData ) (fmap thd5 curGraphs) + graphString = outputGraphString edgeWeigthVal commandList (outgroupIndex globalSettings) relabelledDecoratedGraph (fmap snd5 curGraphs) + in if null curGraphs + then do + logWith LogInfo "No graphs to report edge compleities \n" + pure ("No graphs to edge complexities report", outfileName, writeMode) + else do + logWith LogInfo ("Reporting " <> show (length curGraphs) <> " edge complexities of graphs at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (graphString, outfileName, writeMode) + else if "crossrefs" `elem` commandList + then + let dataString = crossReferenceString + in pure (dataString, outfileName, writeMode) + else + if "data" `elem` commandList + then + let blocks = thd3 processedData + numChars = V.sum $ fmap (V.length . thd3) blocks + dataString = phyloDataToString 0 blocks + -- \$ thd3 processedData + baseData = + [ ["Input data contained:"] + , ["", show (length $ fst3 processedData) <> " terminal taxa"] + , ["", show numInputFiles <> " input data files"] + , ["", show (length blocks) <> " character blocks"] + , ["", show numChars <> " total characters"] + ] + leafNames = V.toList (T.unpack <$> fst3 processedData) + leafField = ["Terminal taxa:"] : fmap (" " :) (SL.chunksOf 10 leafNames) + excludedTaxa = + if (not . null . fst) excludeRename + then T.unpack <$> fst excludeRename + else ["None"] + excludedField = ["Excluded taxa:"] : fmap (" " :) (SL.chunksOf 10 excludedTaxa) + renameFirstList = fmap (((: []) . T.unpack) . fst) (snd excludeRename) + renameSecondList = fmap (((: []) . T.unpack) . snd) (snd excludeRename) + renamePairList = + if (not . null . snd) excludeRename + then (" " :) <$> zipWith (<>) renameFirstList renameSecondList + else [[" ", "None", "None"]] + renameField = ["Renamed taxa:", "New Name", "Original Name"] : renamePairList + charInfoFields = ["Index", "Block", "Name", "Type", "Activity", "Weight", "Prealigned", "Alphabet", "TCM"] + in pure + ( CSV.genCsvFile + (baseData <> [[""]] <> leafField <> [[""]] <> excludedField <> [[""]] <> renameField <> [[""]] <> (charInfoFields : dataString)) + , outfileName + , writeMode + ) + else + if "diagnosis" `elem` commandList + then do + curGraphs' ← + if False -- not (reportNaiveData globalSettings) + then pure curGraphs + else + let action ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + action = TRAV.multiTraverseFullyLabelGraphReduced globalSettings processedData False False Nothing + in getParallelChunkTraverse >>= \pTraverse → + (action . fst5) `pTraverse` curGraphs + + dataStringList <- + let action :: (ReducedPhylogeneticGraph, Int) → PhyG [[String]] + action = getGraphDiagnosis globalSettings processedData + in do + diagPar ← getParallelChunkTraverse + diagPar action (zip curGraphs' [0 .. (length curGraphs' - 1)]) + + let dataString = CSV.genCsvFile $ concat dataStringList + if null curGraphs + then do + logWith LogInfo "No graphs to diagnose\n" + pure ("No graphs to diagnose", outfileName, writeMode) + else do + logWith + LogInfo + ("Diagnosing " <> show (length curGraphs) <> " graphs at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (dataString, outfileName, writeMode) + else + if "displaytrees" `elem` commandList + then do + -- need to specify -O option for multiple graphs + -- TODO parallelize + rediagnodesGraphs ← + mapM (TRAV.multiTraverseFullyLabelGraph globalSettings processedData False False Nothing) (fmap fst5 curGraphs) + let inputDisplayVVList = fmap fth6 rediagnodesGraphs + let costList = fmap snd5 curGraphs + let displayCostListList = fmap GO.getDisplayTreeCostList rediagnodesGraphs + let displayInfoString = + if ("dot" `elem` commandList) || ("dotpdf" `elem` commandList) + then ("//DisplayTree costs : " <> show (fmap (sum . fst) displayCostListList, displayCostListList)) + else -- newick + + let middle = fmap bracketToCurly $ show (fmap (sum . fst) displayCostListList, displayCostListList) + in ("[DisplayTree costs : " <> middle <> "]") + + let treeIndexStringList = + if ("dot" `elem` commandList) || ("dotpdf" `elem` commandList) + then fmap (((<> "\n") . ("//Canonical Tree " <>)) . show) [0 .. (length inputDisplayVVList - 1)] + else -- newick + fmap (((<> "]\n") . ("[Canonical Tree " <>)) . show) [0 .. (length inputDisplayVVList - 1)] + let canonicalGraphPairList = zip treeIndexStringList inputDisplayVVList + let blockStringList = unlines (fmap (outputBlockTrees commandList costList (outgroupIndex globalSettings)) canonicalGraphPairList) + -- graphString = outputGraphString commandList (outgroupIndex globalSettings) (fmap thd6 curGraphs) (fmap snd6 curGraphs) + + if null curGraphs || graphType globalSettings /= SoftWired + then do + logWith LogInfo "No soft-wired graphs to report display trees\n" + pure ("No soft-wired graphs to report display trees", outfileName, writeMode) + else pure (displayInfoString <> "\n" <> blockStringList, outfileName, writeMode) + else + if ("graphs" `elem` commandList) && ("reconcile" `notElem` commandList) + then -- else if (not .null) (L.intersect ["graphs", "newick", "dot", "dotpdf"] commandList) then + + let graphString = outputGraphString edgeWeigthVal commandList (outgroupIndex globalSettings) (fmap thd5 curGraphs) (fmap snd5 curGraphs) + in if null curGraphs + then do + logWith LogInfo "No graphs to report\n" + pure ("No graphs to report", outfileName, writeMode) + else do + logWith + LogInfo + ("Reporting " <> show (length curGraphs) <> " graph(s) at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (graphString, outfileName, writeMode) + else + if "ia" `elem` commandList || "impliedalignment" `elem` commandList + then + if null curGraphs + then do + logWith LogInfo "No graphs to create implied alignments\n" + pure ("No impliedAlgnments to report", outfileName, writeMode) + else + let includeMissing = elem "includemissing" commandList + concatSeqs = elem "concatenate" commandList + in do + iaContentList ← + mapM + (getImpliedAlignmentString globalSettings (includeMissing || concatSeqs) concatSeqs processedData) + (zip curGraphs [0 .. (length curGraphs - 1)]) + logWith + LogInfo + "\tWarning: Prealigned sequence data with non-additive type costs (all change values equal) have been recoded to non-additive characters and will not appear in implied alignment output.\n" + pure (concat iaContentList, outfileName, writeMode) + else + if "metadata" `elem` commandList + then do + curGraphs' ← + if False -- not (reportNaiveData globalSettings) + then pure curGraphs + else + let action ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + action = TRAV.multiTraverseFullyLabelGraphReduced globalSettings processedData False False Nothing + in getParallelChunkTraverse >>= \pTraverse → + (action . fst5) `pTraverse` curGraphs + + dataStringList <- + let action :: (ReducedPhylogeneticGraph, Int) → PhyG [[String]] + action = getGraphMetaData globalSettings processedData + in do + diagPar ← getParallelChunkTraverse + diagPar action (zip curGraphs' [0 .. (length curGraphs' - 1)]) + + let dataString = CSV.genCsvFile $ concat dataStringList + if null curGraphs + then do + logWith LogInfo "No graphs to get metaData\n" + pure ("No graphs to get metaData", outfileName, writeMode) + else do + logWith + LogInfo + ("Getting metaData from " <> show (length curGraphs) <> " graphs at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (dataString, outfileName, writeMode) + else if "pairdist" `elem` commandList + then + let nameData = L.intercalate "," (V.toList (T.unpack <$> fst3 processedData)) <> "\n" + in do + pairwiseDistanceMatrix' ← D.getPairwiseDistances processedData + let dataString = CSV.genCsvFile $ fmap (fmap show) pairwiseDistanceMatrix' + pure (nameData <> dataString, outfileName, writeMode) + else + if "parameterestimation" `elem` commandList + then do + curGraphs' ← + if False -- not (reportNaiveData globalSettings) + then pure curGraphs + else + let action ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + action = TRAV.multiTraverseFullyLabelGraphReduced globalSettings processedData False False Nothing + in getParallelChunkTraverse >>= \pTraverse → + (action . fst5) `pTraverse` curGraphs + + dataStringList <- + let action :: (ReducedPhylogeneticGraph, Int) → PhyG [[String]] + action = getGraphParameters globalSettings processedData + in do + diagPar ← getParallelChunkTraverse + diagPar action (zip curGraphs' [0 .. (length curGraphs' - 1)]) + + let dataString = CSV.genCsvFile $ concat dataStringList + + if null curGraphs + then do + logWith LogInfo "No graphs for metaData\n" + pure ("No graphs for metaData", outfileName, writeMode) + else do + logWith + LogInfo + ("Getting metaData for " <> show (length curGraphs) <> " graphs at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (dataString, outfileName, writeMode) + + else if "reconcile" `elem` commandList + then do + recResult ← R.makeReconcileGraph VER.reconcileArgList argList (fmap fst5 curGraphs) + -- let (reconcileString, ) = recResult + let (_, reconcileGraph) = recResult + let reconcileString = outputGraphString edgeWeigthVal commandList (outgroupIndex globalSettings) [GO.convertSimpleToDecoratedGraph reconcileGraph] [0] + if null curGraphs + then do + logWith LogInfo "No graphs to reconcile\n" + pure ([], outfileName, writeMode) + else pure (reconcileString, outfileName, writeMode) + else + if "search" `elem` commandList + then + let dataString' = fmap showSearchFields $ reverse $ searchData globalSettings + -- reformat the "search" command fields a bit + dataString = processSearchFields dataString' + sysInfoData = + "System Info, OS: " + <> SI.os + <> ", Chip Arch: " + <> SI.arch + <> ", Compiler: " + <> SI.compilerName + <> " " + <> DV.showVersion SI.compilerVersion + <> ", Compile Date: " + <> (__DATE__ <> " " <> __TIME__) + cpuInfoString = + if SI.os /= "linux" + then "CPU Info, No /proc/cpuinfo on darwin" + else + let cpuInfoM = SIOU.unsafePerformIO SC.tryGetCPUs + in if isNothing cpuInfoM + then "CPU Info, Couldn't parse CPU Info" + else + "CPU Info, Physical Processors: " + <> show (SC.physicalProcessors (fromJust cpuInfoM)) + <> ", Physical Cores: " + <> show (SC.physicalCores (fromJust cpuInfoM)) + <> ", Logical Cores: " + <> show (SC.logicalCores (fromJust cpuInfoM)) + baseData = sysInfoData <> "\n" <> cpuInfoString <> "\nSearchData\nRandom seed, " <> show (seed globalSettings) <> "\n" + charInfoFields = + [ "Command" + , "Arguments" + , "Min cost in" + , "Max cost in" + , "Num graphs in" + , "Min cost out" + , "Max cost out" + , "Num graphs out" + , "CPU time (secs)" + , "Comment" + ] + in pure (baseData <> CSV.genCsvFile (charInfoFields : dataString), outfileName, writeMode) + else + if "support" `elem` commandList + then + let graphString = outputGraphStringSimple commandList (outgroupIndex globalSettings) (fmap fst5 supportGraphs) (fmap snd5 supportGraphs) + in -- trace ("Rep Sup: " <> (LG.prettify $ fst5 $ head supportGraphs)) ( + if null supportGraphs + then do + logWith LogInfo "\tNo support graphs to report\n" + pure ([], outfileName, writeMode) + else do + logWith LogInfo ("Reporting " <> show (length curGraphs) <> " support graph(s)" <> "\n") + pure (graphString, outfileName, writeMode) + else -- ) + + if "tnt" `elem` commandList + then + if null curGraphs + then do + logWith LogInfo "No graphs to create implied alignments for TNT output\n" + pure ("No impliedAlgnments for TNT to report", outfileName, writeMode) + else do + curGraphs' ← + if not (reportNaiveData globalSettings) + then pure $ GO.convertReduced2PhylogeneticGraph <$> curGraphs + else + let action ∷ SimpleGraph → PhyG PhylogeneticGraph + action = TRAV.multiTraverseFullyLabelGraph globalSettings processedData False False Nothing + in getParallelChunkTraverse >>= \pTraverse → + pTraverse (action . fst5) curGraphs + + tntContentList' ← traverse (getTNTString globalSettings processedData) $ zip curGraphs' [0 .. length curGraphs' - 1] + let tntContentList = concat tntContentList' + pure (tntContentList, outfileName, writeMode) + else do + logWith LogWarn ("\nUnrecognized/missing report option in " <> show commandList <> " defaulting to 'graphs'" <> "\n") + let graphString = outputGraphString edgeWeigthVal commandList (outgroupIndex globalSettings) (fmap thd5 curGraphs) (fmap snd5 curGraphs) + if null curGraphs + then do + logWith LogInfo "No graphs to report\n" + pure ("No graphs to report", outfileName, writeMode) + else do + logWith + LogInfo + ("Reporting " <> show (length curGraphs) <> " graph(s) at minimum cost " <> show (minimum $ fmap snd5 curGraphs) <> "\n") + pure (graphString, outfileName, writeMode) + where + bracketToCurly = \case + '(' → '{' + ')' → '}' + val → val + + +changingStrings ∷ String → (String, String) +changingStrings str = case span (/= ',') $ filter (`notElem` ['(', ')']) str of + (prefix, ',' : suffix) → (prefix, suffix) + (prefix, _) → (prefix, mempty) diff --git a/src/Commands/CommandUtilities.hs b/src/Commands/CommandUtilities.hs new file mode 100644 index 000000000..24b1d85fd --- /dev/null +++ b/src/Commands/CommandUtilities.hs @@ -0,0 +1,2692 @@ +{- | +Module exposing helper functions used for command processing. +-} +module Commands.CommandUtilities where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Parallel.Strategies +import Data.Alphabet +import Data.Alphabet.Codec (decodeState) +import Data.Alphabet.Special +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Char qualified as C +import Complexity.Utilities qualified as CU +import Data.Foldable +import Data.List qualified as L +import Data.List.NonEmpty qualified as NE +import Data.List.Split qualified as LS +import Data.Maybe +import Data.MetricRepresentation qualified as MR +import Data.Set qualified as SET +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import Debug.Trace +import DirectOptimization.Pairwise +import GeneralUtilities +import GraphFormatUtilities +import GraphOptimization.Medians qualified as M +import GraphOptimization.PreOrderFunctions qualified as PRE +import GraphOptimization.Traversals qualified as TRAV +import Graphs.GraphOperations qualified as GO +import Input.Reorganize qualified as IR +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as S +import System.Directory +import System.IO +import System.Info +import System.Process +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U +import Debug.Trace + +{- relabelEdgeComplexity relabels decorated graphs with edge complexity indices +-} +relabelEdgeComplexity :: GlobalSettings -> ProcessedData -> DecoratedGraph -> DecoratedGraph +relabelEdgeComplexity inGS inData inGraph = + if LG.isEmpty inGraph then inGraph + else + -- get complexity edge values and relable graph edges + let vertexList = LG.labNodes inGraph + edgeList = LG.labEdges inGraph + edgeComplexityList = fmap thd3 $ U.getEdgeComplexityFactors inGS inData vertexList edgeList + newEdgeList = zipWith relabel edgeList edgeComplexityList + in + LG.mkGraph vertexList newEdgeList + + where relabel (a,b,c) d = + let newEdgeInfo = EdgeInfo { minLength = d + , maxLength = d + , midRangeLength = d + , edgeType = TreeEdge + } + in (a, b, newEdgeInfo) + +{- | processSearchFields takes a [String] and reformats the String associated with the +"search" commands and especially Thompson sampling data, +otherwise leaves the list unchanged +-} +processSearchFields ∷ [[String]] → [[String]] +processSearchFields inStringListList = + if null inStringListList + then [] + else + let firstList = head inStringListList + in if head firstList /= "Search" + then firstList : processSearchFields (tail inStringListList) + else + let newHeader = ["Iteration", "Search Type", "Delta", "Min Cost out", "CPU time (secs)"] + instanceSplitList = LS.splitOn "*" (L.last firstList) + hitsMinimum = filter (/= '*') $ last $ LS.splitOn "," (L.last firstList) + (instanceStringListList, searchBanditListList) = unzip $ fmap processSearchInstance instanceSplitList -- (L.last firstList) + in -- trace ("GSI: " <> (show firstList) <> "\nLF: " <> (hitsMinimum) <> "\nILL: " <> (show instanceStringListList) <> "\nSB: " <> (show searchBanditListList)) + fmap (fmap (filter (/= '\n'))) $ + [L.init firstList] + <> [newHeader <> (head searchBanditListList) <> ["Arguments"]] + <> concat instanceStringListList + <> [[hitsMinimum]] + <> processSearchFields (tail inStringListList) + + +-- processSearchInstance takes the String of instance information and +-- returns appropriate [[String]] for pretty csv output +processSearchInstance ∷ String → ([[String]], [String]) +processSearchInstance inString = + if null inString + then ([], []) + else + let tempList = getSearchIterations inString + iterationList = L.init tempList + iterationCounterList = fmap ((: []) . show) [0 .. (length iterationList - 1)] + searchBanditList' = getBanditNames $ tail $ dropWhile (/= '[') $ head iterationList + preArgStringList = fmap getPreArgString iterationList + searchArgStringList = fmap getSearchArgString iterationList + searchBanditProbsList' = fmap (getBanditProbs . tail . (dropWhile (/= '['))) iterationList + + -- add columns between graph and search bandits + searchBanditList = (take 3 searchBanditList') <> [" "] <> (drop 3 searchBanditList') + searchBanditProbsList = fmap (addColumn 3) searchBanditProbsList' + + processedSearchList = L.zipWith4 concat4 iterationCounterList preArgStringList searchBanditProbsList searchArgStringList + finalString = addColumn 8 $ LS.splitOn "," $ L.last tempList + instanceStringList = processedSearchList <> [finalString] + in if null iterationList + then ([], []) + else (instanceStringList, searchBanditList) + where + concat4 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a → a + concat4 a b c d = a <> b <> c <> d + + +-- | addColumn takea list of strings and adds a list of empty strings after third string in each +addColumn ∷ Int → [String] → [String] +addColumn index inList = + if null inList + then [] + else (take index inList) <> [" "] <> (drop index inList) + + +-- | getBanditProbs parses bandit prob line for probabilities +getBanditProbs ∷ String → [String] +getBanditProbs inString = + if null inString + then [] + else + let stuff = dropWhile (/= ',') inString + stuff2 = tail $ takeWhile (/= ')') stuff + remainder = tail $ dropWhile (/= ')') stuff + remainder' = + if length remainder > 2 + then drop 2 remainder + else [] + in stuff2 : getBanditProbs remainder' + + +-- | getSearchArgString get seach iteration arguments and concats, removing ',' +getSearchArgString ∷ String → [String] +getSearchArgString inString = + [L.intercalate " " $ drop 4 $ tail $ LS.splitOn "," $ takeWhile (/= '[') inString | not (null inString)] + + +-- getPreArgString gets search srtategy fields (type, delta, min cost, CPUtime) +-- befroe arguments fields +getPreArgString ∷ String → [String] +getPreArgString inString = + if null inString + then [] + else take 4 $ tail $ LS.splitOn "," inString + + +{- | getBanditNames extracts the names of search bandits from comment list +already first part filtered out so only pairs in "(,)" +-} +getBanditNames ∷ String → [String] +getBanditNames inString = + if null inString + then [] + else + let firstBanditName = takeWhile (/= ',') $ tail inString + remainder = dropWhile (/= '(') $ tail inString + in firstBanditName : getBanditNames remainder + + +-- | getSearchIterations breaks up comment feild into individual iteration lines +getSearchIterations ∷ String → [String] +getSearchIterations inList = + if null inList + then [] + else + let commentField = filter (/= '"') inList + commentLines = LS.splitOn "]" commentField + in commentLines + + +-- changeDotPreamble takes an input string to search for and a new one to add in its place +-- searches through dot file (can have multipl graphs) replacing teh search string each time. +changeDotPreamble ∷ String → String → String → String +changeDotPreamble findString newString inDotString = + if null inDotString + then [] + else changePreamble' findString newString [] (lines inDotString) + + +-- changeDotPreamble' internal process for changeDotPreamble +changePreamble' ∷ String → String → [String] → [String] → String +changePreamble' findString newString accumList inLineList = + if null inLineList + then unlines $ reverse accumList + else -- trace ("CP':" <> (head inLineList) <> " " <> findString <> " " <> newString) ( + + let firstLine = head inLineList + in if firstLine == findString + then changePreamble' findString newString (newString : accumList) (tail inLineList) + else changePreamble' findString newString (firstLine : accumList) (tail inLineList) + + +-- ) + +-- printGraph graphviz simple dot file of graph +-- execute with "dot -Teps test.dot -o test.eps" +-- need to add output to argument filename and call +-- graphviz via System.Process.runprocess +-- also, reorder GenForest so smalles (num leaves) is either first or +-- last so can print small to large all the way so easier to read +-- eps on OSX because ps gets cutt off for some reason and no pdf onOSX +-- -O foir multiple graphs I htink +printGraphVizDot ∷ String → String → PhyG () +printGraphVizDot graphDotString dotFile = + if null graphDotString + then do failWithPhase Outputting "No graph to report" + else do + myHandle ← liftIO $ openFile dotFile WriteMode + if os /= "darwin" + then do logWith LogInfo ("\tOutputting graphviz to " <> dotFile <> ".pdf.\n") + else do logWith LogInfo ("\tOutputting graphviz to " <> dotFile <> ".eps.\n") + let outputType = + if os == "darwin" + then "-Teps" + else "-Tpdf" + -- hPutStrLn myHandle "digraph G {" + -- hPutStrLn myHandle "\trankdir = LR;" + -- hPutStrLn myHandle "\tnode [ shape = rect];" + -- hPutStr myHandle $ (unlines . tail . lines) graphDotString + liftIO $ hPutStr myHandle graphDotString + -- hPutStrLn myHandle "}" + liftIO $ hClose myHandle + pCode ← liftIO $ findExecutable "dot" -- system "dot" --check for Graphviz + {- + hPutStrLn stderr + (if isJust pCode then --pCode /= Nothing then + "executed dot " <> outputType <> dotFile <> " -O " else + "Graphviz call failed (not installed or found). Dot file still created. Dot can be obtained from https://graphviz.org/download") + -} + if isJust pCode + then do + liftIO $ createProcess (proc "dot" [outputType, dotFile, "-O"]) + logWith LogInfo ("\tExecuted dot " <> outputType <> " " <> dotFile <> " -O \n") + else + logWith + LogInfo + "\tGraphviz call failed (not installed or found). Dot file still created. Dot can be obtained from https://graphviz.org/download\n" + + +{- | showSearchFields creates a String list for SearchData Fields +special processing for notACommand that contains info for initial data and graph processing +-} +showSearchFields ∷ SearchData → [String] +showSearchFields sD = + let inInstruction = instruction sD + (instructionString, durationString, commentString') = + if inInstruction /= NotACommand + then (show $ instruction sD, show ((fromIntegral $ duration sD) / 1000 ∷ Double), commentString sD) + else (commentString sD, show ((fromIntegral $ duration sD) / 1000000000000 ∷ Double), "No Comment") + in [ instructionString + , unwords $ (showArg <$> arguments sD) + , show $ minGraphCostIn sD + , show $ maxGraphCostIn sD + , show $ numGraphsIn sD + , show $ minGraphCostOut sD + , show $ maxGraphCostOut sD + , show $ numGraphsOut sD + , durationString + , commentString' + ] + where + showArg a = fst a <> ":" <> snd a + + +{- | requireReoptimization checks if set command in globalk settings requires reoptimization of graphs due to change in +graph type, optimality criterion etc. +-} +requireReoptimization ∷ GlobalSettings → GlobalSettings → Bool +requireReoptimization gsOld gsNew + | graphType gsOld /= graphType gsNew = True + | optimalityCriterion gsOld /= optimalityCriterion gsNew = True + | finalAssignment gsOld /= finalAssignment gsNew = True + | graphFactor gsOld /= graphFactor gsNew = True + | rootCost gsOld /= rootCost gsNew = True + | otherwise = False + + +-- | outputBlockTrees takes a PhyloGeneticTree and outputs BlockTrees +outputBlockTrees ∷ [String] → [VertexCost] → Int → (String, V.Vector [DecoratedGraph]) → String +outputBlockTrees commandList costList lOutgroupIndex (labelString, graphLV) = + let blockIndexStringList = + if ("dot" `elem` commandList) || ("dotpdf" `elem` commandList) + then -- dot comments + fmap (((<> "\n") . ("//Block " <>)) . show) [0 .. ((V.length graphLV) - 1)] + else -- newick + fmap (((<> "]\n") . ("[Block " <>)) . show) [0 .. ((V.length graphLV) - 1)] + blockStrings = unlines (makeBlockGraphStrings commandList costList lOutgroupIndex <$> zip blockIndexStringList (V.toList graphLV)) + in labelString <> blockStrings + + +-- | makeBlockGraphStrings makes individual block display trees--potentially multiple +makeBlockGraphStrings ∷ [String] → [VertexCost] → Int → (String, [DecoratedGraph]) → String +makeBlockGraphStrings commandList costList lOutgroupIndex (labelString, graphL) = + let diplayIndexString = + if ("dot" `elem` commandList) || ("dotpdf" `elem` commandList) + then ("//Display Tree(s): " <> show (length graphL) <> "\n") + else ("[Display Tree[s]: " <> show (length graphL) <> "]\n") + edgeWeightString = "min" -- minimum length edges + --displayString = (<> "\n") $ outputDisplayString commandList costList lOutgroupIndex graphL + displayString = (<> "\n") $ outputGraphString edgeWeightString commandList lOutgroupIndex graphL costList + in labelString <> diplayIndexString <> displayString + +{- Don't think need this--redundant with outputGraphString +-- | outputDisplayString is a wrapper around graph output function with different argument order for some rason +outputDisplayString ∷ [String] → [VertexCost] → Int → [DecoratedGraph] → String +outputDisplayString commandList costList lOutgroupIndex graphList + | "dot" `elem` commandList = + makeDotList + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + (elem "color" commandList) + costList + lOutgroupIndex + (fmap GO.convertDecoratedToSimpleGraph graphList) + | "newick" `elem` commandList = + GO.makeNewickList + False + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + lOutgroupIndex + (fmap GO.convertDecoratedToSimpleGraph graphList) + (replicate (length graphList) 0.0) + | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex (fmap GO.convertDecoratedToSimpleGraph graphList) + | otherwise -- "dot" as default + = + makeDotList + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + (elem "color" commandList) + costList + lOutgroupIndex + (fmap GO.convertDecoratedToSimpleGraph graphList) +-} + +-- | outputGraphString is a wrapper around graph output functions +outputGraphString ∷ String -> [String] → Int → [DecoratedGraph] → [VertexCost] → String +outputGraphString edgeWeight commandList lOutgroupIndex graphList costList + | "dot" `elem` commandList = + makeDotList + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + (elem "color" commandList) + costList + lOutgroupIndex + (fmap (GO.convertDecoratedToSimpleGraphBranchLength edgeWeight) graphList) + | "newick" `elem` commandList = + GO.makeNewickList + False + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + lOutgroupIndex + (fmap (GO.convertDecoratedToSimpleGraphBranchLength edgeWeight) graphList) + costList + | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex (fmap (GO.convertDecoratedToSimpleGraphBranchLength edgeWeight) graphList) + | otherwise -- "dot" as default + = + makeDotList + (not (elem "nobranchlengths" commandList)) + (not (elem "nohtulabels" commandList)) + (elem "color" commandList) + costList + lOutgroupIndex + (fmap (GO.convertDecoratedToSimpleGraphBranchLength edgeWeight) graphList) + + +-- | outputGraphStringSimple is a wrapper around graph output functions +outputGraphStringSimple ∷ [String] → Int → [SimpleGraph] → [VertexCost] → String +outputGraphStringSimple commandList lOutgroupIndex graphList costList + | "dot" `elem` commandList = + makeDotList (not (elem "nobranchlengths" commandList)) (not (elem "nohtulabels" commandList)) (elem "color" commandList) costList lOutgroupIndex graphList + | "newick" `elem` commandList = GO.makeNewickList False True True lOutgroupIndex graphList costList + | "ascii" `elem` commandList = makeAsciiList lOutgroupIndex graphList + | otherwise -- "dot" as default + = + makeDotList (not (elem "nobranchlengths" commandList)) (not (elem "nohtulabels" commandList)) (elem "color" commandList) costList lOutgroupIndex graphList + + +{- | makeDotList takes a list of fgl trees and outputs a single String cointaining the graphs in Dot format +need to specify -O option for multiple graph(outgroupIndex globalSettings)s +-} +makeDotList ∷ Bool → Bool → Bool-> [VertexCost] → Int → [SimpleGraph] → String +makeDotList writeEdgeWeight writeNodeLabel colorEdges costList rootIndex graphList = + let graphStringList'' = fmap (fgl2DotString . LG.rerootTree rootIndex) graphList + graphStringList' = fmap (stripDotLabels writeEdgeWeight writeNodeLabel) graphStringList'' + graphStringList = if colorEdges then zipWith addEdgeColor graphList graphStringList' + else graphStringList' + costStringList = fmap (("\n//" <>) . show) costList + in L.intercalate "\n" (zipWith (<>) graphStringList costStringList) + +{- addEdgeColor adds field of edge color based on a pallete and edge weight + here using GraphViz colorscheme spectral11 +-} +addEdgeColor :: SimpleGraph -> String -> String +addEdgeColor inGraph inString = + if LG.isEmpty inGraph || null inString then [] + else + let edgeWeightList = fmap thd3 $ LG.labEdges inGraph + maxColorNumber = 11 + edgeColorList = fmap show $ U.getEdgeColor maxColorNumber edgeWeightList + newEdgeInfo = zip3 (fmap (show .fst3) $ LG.labEdges inGraph) (fmap (show . snd3) $ LG.labEdges inGraph) edgeColorList + in + -- trace ("AEC: " <> (show edgeWeightList)) $ + addColor newEdgeInfo inString + +-- | addColor removes edge labels from HTUs in graphviz format string +addColor ∷ [(String, String, String)] -> String → String +addColor colorEdgeList inString = + if null inString + then inString + else + let lineStringList = lines inString + newLines = fmap (makeNewLine colorEdgeList) lineStringList + in unlines newLines + where + makeNewLine cl a = + if (null $ L.intersect "->" a) + then a + else + let b = words a + newB3 = getEdgeColor cl b + in + if (length b > 2) then " " <> (concat [b !! 0, " ", b !! 1, " ", b !! 2, " ", newB3]) + else a + + getEdgeColor cl b = + if null cl then unwords b -- error ("Edge not found in getEdgeColor: " <> (show b) <> " " <> (show cl)) + else if (b !! 0) == (fst3 $ head cl) && (b !! 2) == (snd3 $ head cl) then + if (length b > 3) then + "[" <> "color=" <> (thd3 $ head cl) <> "," <> (tail $ b !! 3) + else "[" <> "color=" <> (thd3 $ head cl) <> "]" + else getEdgeColor (tail cl) b + + +-- | stripDotLabels strips away edge and HTU labels from dot files +stripDotLabels ∷ Bool → Bool → String → String +stripDotLabels writeEdgeWeight writeNodeLabel inGraphString = + if null inGraphString + then inGraphString + else + if writeNodeLabel && writeEdgeWeight + then inGraphString + else + let firstString = + if writeEdgeWeight + then inGraphString + else stripEdge inGraphString + + secondString = + if writeNodeLabel + then firstString + else stripNode firstString + in secondString + + +-- | stripEdge removes edge labels from HTUs in graphviz format string +stripEdge ∷ String → String +stripEdge inString = + if null inString + then inString + else + let lineStringList = lines inString + newLines = fmap makeNewLine lineStringList + in unlines newLines + where + makeNewLine a = + if (null $ L.intersect "->" a) + then a + else + let b = words a + in " " <> (concat [b !! 0, " ", b !! 1, " ", b !! 2]) <> " [];" + + +-- | stripNode removes edge labels from HTUs in graphviz format string +stripNode ∷ String → String +stripNode inString = + if null inString + then inString + else + let lineStringList = lines inString + newLines = fmap makeNewLine lineStringList + in unlines newLines + where + makeNewLine a = + if (null $ L.intersect "HTU" a) + then a + else + let b = words a + c = take 10 $ b !! 1 + newLine = + if c == "[label=HTU" + then " " <> (head b) <> " [];" + else a + in newLine + + +-- | makeAsciiList takes a list of fgl trees and outputs a single String cointaining the graphs in ascii format +makeAsciiList ∷ Int → [SimpleGraph] → String +makeAsciiList rootIndex graphList = + concatMap LG.prettify (fmap (LG.rerootTree rootIndex) graphList) + + +{- Older version wiht more data dependenncy +-- | getDataListList returns a list of lists of Strings for data output as csv +-- for row is source file names, suubsequent rows by taxon with +/- for present absent taxon in +-- input file +getDataListList :: [RawData] -> ProcessedData -> [[String]] +getDataListList inDataList processedData = + if null inDataList then [] + else + let fileNames = " " : fmap (takeWhile (/= ':')) (fmap T.unpack $ fmap name $ fmap head $ fmap snd inDataList) + fullTaxList = V.toList $ fst3 processedData + presenceAbsenceList = fmap (isThere inDataList) fullTaxList + fullMatrix = zipWith (:) (fmap T.unpack fullTaxList) presenceAbsenceList + in + --trace (show fileNames) + fileNames : fullMatrix +-} + +{- | getDataListList returns a list of lists of Strings for data output as csv +for row is source file names, subsequent rows by taxon with +/- for present absent taxon in +input file +different from getDataListList in removeal or processed data requiremenrt replaced with taxan name list +-} +getDataListList ∷ (Foldable f) ⇒ [RawData] → f T.Text → [[String]] +getDataListList inDataList comprehensiveTaxaSet + | null inDataList = [] + | otherwise = + let taxaList = toList comprehensiveTaxaSet + fileNames = " " : fmap (takeWhile (/= ':') . T.unpack) (fmap name $ fmap head $ fmap snd inDataList) + presenceAbsenceList = fmap (isThere inDataList) taxaList + fullMatrix = zipWith (:) (fmap T.unpack taxaList) presenceAbsenceList + in -- trace (show fileNames) + fileNames : fullMatrix + + +-- | isThere takes a list of Rawdata and reurns a String of + - +isThere ∷ [RawData] → T.Text → [String] +isThere inData inName = + if null inData + then [] + else + let firstTaxList = fmap fst $ fst $ head inData + in if inName `elem` firstTaxList + then "+" : isThere (tail inData) inName + else "-" : isThere (tail inData) inName + + +{- | phyloDataToString converts RawData type to String +for additive chars--multiply states by weight is < 1 when outputtting due to conversion on input +-} +phyloDataToString ∷ Int → V.Vector BlockData → [[String]] +phyloDataToString charIndexStart inDataVect = + if V.null inDataVect + then [] + else + let (blockName, blockData, charInfoVect) = V.head inDataVect + charStrings = zipWith (:) (replicate (V.length charInfoVect) (T.unpack blockName)) (getCharInfoStrings <$> V.toList charInfoVect) + charNumberString = fmap show [charIndexStart .. (charIndexStart + length charStrings - 1)] + fullMatrix = zipWith (:) charNumberString charStrings + in fullMatrix <> phyloDataToString (charIndexStart + length charStrings) (V.tail inDataVect) + + +-- | extractAlphabetStrings takes vector of block data and returns block x character string list +extractAlphabetStrings ∷ V.Vector BlockData → [[[String]]] +extractAlphabetStrings inBlockDataV = + let result = V.toList $ fmap extractAlphabetStringsBlock inBlockDataV + in trace + ("EAS: " <> (show result)) + result + + +-- | extractAlphabetStringsBlock takes block data and returns character string list +extractAlphabetStringsBlock ∷ BlockData → [[String]] +extractAlphabetStringsBlock inBlockData = V.toList $ fmap extractAlphabetStringsChar (thd3 inBlockData) + + +-- | extractAlphabetStringsChar takes char data and returns character string list +extractAlphabetStringsChar ∷ CharInfo → [String] +extractAlphabetStringsChar inCharInfo = (alphabetSymbols $ ST.toString <$> alphabet inCharInfo) + + +{- | getDataElementTransformations takes alphabet, parent and child final states +and calculates and formats the transition matrix in frequency and raw numbers +this for list of blocks each with list of characters +over all edges +need to ddeal woith missing data +-} +getDataElementTransformations ∷ [[[String]]] → [[String]] → [[String]] → [[String]] → [[String]] +getDataElementTransformations alphabetStrings parentLL childLL parentChildLL = + if null childLL + then [] + else -- convert to block x character (Parent String, Child String) for counting exercise + + let numBlocks = length alphabetStrings + numCharsList = fmap length alphabetStrings + + numParent = length parentLL + numChild = length childLL + in -- dataByBlock = reorderBySection numBlocks diffLL [] + -- dataByBlockChar = zipeWith reorderBySection numCharsList dataByBlock + + trace + ("GDET: " <> (show (numBlocks, numCharsList, numParent, numChild)) <> " " <> (show alphabetStrings)) -- <> + -- (show parentLL) <> "\n\n" <> (show childLL)) $ + parentChildLL + + +-- | reorderBySection takes a list of list of Strings and reorders klists into first diominant (as in blocks here) +reorderBySection ∷ Int → [[String]] → [[String]] → [[String]] +reorderBySection numSections inData reorgData = + if null inData + then reorgData + else + if length inData < numSections + then error ("Incorrect input list size: " <> (show numSections) <> " sections and " <> (show $ length inData) <> " pieces") + else + let firstGroup = filter (not . null) $ take numSections inData + newData = + if null reorgData + then firstGroup + else zipWith (<>) reorgData firstGroup + in trace ("RBS: " <> (show firstGroup)) $ + reorderBySection numSections (drop numSections inData) newData + + +{- | getBlockElementTransformations +need to deal with missing data +-} +getBlockElementTransformations ∷ [[String]] → [String] → [String] +getBlockElementTransformations alphabetStringL diffL = + if null diffL + then [] + else + trace ("GBET: " <> (show alphabetStringL) <> " " <> (concat diffL)) $ + zipWith getCharElementTransformations alphabetStringL diffL + + +{- | getCharElementTransformations +need to deal with missing data +-} +getCharElementTransformations ∷ [String] → String → String +getCharElementTransformations alphabetString diffState = + trace + ("GCET: " <> (show alphabetString) <> " " <> diffState) + [] + + +-- | getDataElementFrequencies takes a vecor of block data and returns character element frequencies +-- as tiple (in String) of element, frequncy and number of elements +-- +getDataElementFrequencies ∷ Bool → V.Vector BlockData → [[String]] +getDataElementFrequencies useIA inBlockDataV = + if V.null inBlockDataV + then [] + else + let blockCharFreqLLL = V.toList $ V.zipWith (getBlockCharElementFrequencies useIA) (fmap snd3 inBlockDataV) (fmap thd3 inBlockDataV) + in + -- this for triple info (element, frequency, and number) + -- fmap (fmap show) blockCharFreqLLL + -- this for frequency only + fmap (fmap show) $ fmap (fmap (fmap snd3)) blockCharFreqLLL + + +{- | getBlockElementFrequencies gets the element grequencies for each character in a block +if an unaligned sequence type infers based on length differences of inputs using number of gaps to make +inputs square (so minimum number of gaps) +-} +getBlockCharElementFrequencies ∷ Bool → V.Vector (V.Vector CharacterData) → V.Vector CharInfo → [[(String, Double, Int)]] +getBlockCharElementFrequencies useIA charDataV charInfoV = + if V.null charDataV + then [] + else -- set to preliminary states or IA states + + let dataV = + if not useIA + then PRE.setFinalToPreliminaryStates charDataV + else charDataV + in V.toList $ V.zipWith (getCharElementFrequencies useIA) (U.transposeVector dataV) charInfoV + + +{- | getCharElementFrequencies gets element frequencies for a character +if an unaligned sequence type infers based on length differences of inputs using number of gaps to make +inputs square (so minimum number of gaps) +returns frequencies and number for each alphabet element +ignores ambiguities/polymorphism +-} +getCharElementFrequencies ∷ Bool → V.Vector CharacterData → CharInfo → [(String, Double, Int)] +getCharElementFrequencies useIA charData charInfo = + if V.null charData + then [] + else + let -- alphabet element strings + alphabetElementStrings = (alphabetSymbols $ ST.toString <$> alphabet charInfo) + + charInfoV = V.replicate (V.length charData) charInfo + charPairV = V.zip charData charInfoV + taxonElementList = V.toList $ fmap L.last $ fmap (makeCharLine useIA) charPairV + + -- get implicit gaps from unaligned seqs (will end up zero if all equal in length) + numExtraGaps = + if "-" `notElem` alphabetElementStrings + then 0 + else + let maxLength = maximum $ fmap length taxonElementList + in getMinGapNumber maxLength 0 taxonElementList + + totalElementList = concat $ (L.replicate numExtraGaps '-') : taxonElementList + elementsGroups = L.group $ L.sort totalElementList + elementList = fmap (: []) $ fmap head elementsGroups + doubleList = zip elementList (fmap length elementsGroups) + elementNumberList = reorderAsInAlphabet alphabetElementStrings doubleList + numElements = fromIntegral $ sum elementNumberList + elementfreqList = fmap (/ numElements) $ fmap fromIntegral elementNumberList + in -- trace ("GCEF: " <> totalElementList ) $ -- <> " -> " <> (concat $ concat $ fmap (makeCharLine usaIA) charPairV)) + zip3 alphabetElementStrings elementfreqList elementNumberList + + +-- | getMinGapNumber gets implicit gap number by summing length differneces among sequences in order +getMinGapNumber ∷ Int → Int → [String] → Int +getMinGapNumber maxLength curNum inElementLL = + if null inElementLL + then curNum + else + let increment = maxLength - (length $ head inElementLL) + in getMinGapNumber maxLength (curNum + increment) (tail inElementLL) + + +-- | reorderAsInAlphabet +reorderAsInAlphabet ∷ [String] → [(String, Int)] → [Int] +reorderAsInAlphabet inAlphList inDoubleList = + -- trace ("RASIA: " <> (show inAlphList) <> " " <> (show inDoubleList)) $ + if null inDoubleList + then [] + else fmap (findDouble inDoubleList) inAlphList + + +-- | findDouble takes list of strings then pulls the tipple wthat matches the string +findDouble ∷ [(String, Int)] → String → Int +findDouble inDoubleList matchString = + if null inDoubleList + then 0 + else + let (alphElement, number) = head inDoubleList + in if alphElement == matchString + then number + else findDouble (tail inDoubleList) matchString + + +-- | getCharInfoStrings takes charInfo and returns list of Strings of fields +getCharInfoStrings ∷ CharInfo → [String] +getCharInfoStrings inChar = + let activityString = + if activity inChar + then "active" + else "inactive" + prealignedString = + if prealigned inChar + then "prealigned" + else "unaligned" + in [T.unpack $ name inChar, show $ charType inChar, activityString, show $ weight inChar, prealignedString] + <> [init $ concat $ fmap (<> ",") $ fmap ST.toString . toList $ alphabet inChar] + <> [show $ costMatrix inChar] + + +{- | executeRenameReblockCommands takes all the "Rename commands" pairs and +creates a list of pairs of new name and list of old names to be converted +as Text +-} +executeRenameReblockCommands ∷ Instruction → [(T.Text, T.Text)] → [Command] → IO [(T.Text, T.Text)] +executeRenameReblockCommands thisInStruction curPairs commandList = + if null commandList + then return curPairs + else do + let (firstOption, firstArgs) = head commandList + + -- skip "Read" and "Rename "commands already processed + if firstOption /= thisInStruction + then executeRenameReblockCommands thisInStruction curPairs (tail commandList) + else + let newName = T.filter C.isPrint $ T.filter (/= '"') $ T.pack $ snd $ head firstArgs + newNameList = replicate (length $ tail firstArgs) newName + oldNameList = fmap (T.filter (/= '"') . T.pack) (fmap snd $ tail firstArgs) + newPairs = zip newNameList oldNameList + changeNamePairs = filter areDiff (curPairs <> newPairs) + newChangeNames = fmap fst changeNamePairs + origChangeNames = fmap snd changeNamePairs + intersectionChangeNames = L.intersect newChangeNames origChangeNames + in if (not $ null intersectionChangeNames) + then errorWithoutStackTrace ("Renaming of " <> (show intersectionChangeNames) <> " as both a new name and to be renamed") + else executeRenameReblockCommands thisInStruction (curPairs <> newPairs) (tail commandList) + where + areDiff (a, b) = + if a /= b + then True + else False + +{- | getGraphParameters estimates basic parameters--element frequencies and transformation frequencies + From uniquely optimized (ie not 'R') edges and veritces +-} +getGraphParameters ∷ GlobalSettings → ProcessedData → (ReducedPhylogeneticGraph, Int) → PhyG [[String]] +getGraphParameters _ inData (inGraph, graphIndex) = + let decGraph = thd5 inGraph + in + if LG.isEmpty decGraph then pure [] + else do + let useIA = True + let useDO = False + let vertexList = LG.labNodes decGraph + let edgeList = LG.labEdges decGraph + + vertexInfo <- + let action :: LG.LNode VertexInfo → [[String]] + action = getVertexCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) + in do + diagPar ← getParallelChunkMap + let result = diagPar action vertexList + pure result + + -- False for Use IA here + + let alphabetInfo = getDataElementFrequencies False (thd3 inData) + + -- Get changes based on edges + let edgeIndexList = fmap LG.toEdge edgeList + + -- should be parallel + let vertexVect = V.fromList $ vertexList + edgeTransformationTotalList <- + let action :: (LG.Node, LG.Node) → [[(String, [(String, Int)], [[Int]])]] + action = getEdgeTransformations vertexVect (fft5 inGraph) + in do + actionPar <- getParallelChunkMap + let result = actionPar action edgeIndexList + pure result + + -- extract relevent info + let edgeTransformationList = fmap (fmap (fmap fst3)) edgeTransformationTotalList + let transformationNumbersLLL = fmap (fmap (fmap thd3)) edgeTransformationTotalList + + let overallElementTransformations = sumEdgeTransformationLists [] transformationNumbersLLL + overallElementTransformationsFreq <- + let action :: [[[Int]]] → [[[Double]]] + action = fmap U.normalizeMatrix + in do + actionPar <- getParallelChunkMap + let result = actionPar action overallElementTransformations + pure result + + let vertexHeader = fmap (fmap (take 9)) vertexInfo + + let edgeListLists = knitTitlesChangeInfo vertexHeader edgeTransformationList + + let transformationHeader = fmap (drop 5) $ tail $ head vertexHeader + + let statsListList = + addBlockCharStrings + transformationHeader + alphabetInfo + (fmap (fmap show) overallElementTransformationsFreq) + (fmap (fmap show) overallElementTransformations) + + let edgeTransformationTitle = + [ [" "] + , ["Edge Transformation Statistics"] + , + [ "" + , "Data Block" + , "Character Name" + , "Character Type" + , "Element Frequencies" + , "Transformation Frequencies" + ] + ] + pure $ edgeTransformationTitle + <> statsListList + where + -- <> fmap show overallElementTransformations + + concat4 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a → a + concat4 a b c d = a <> b <> c <> d + concat3 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a + concat3 a b c = a <> b <> c + + + +{- | getGraphMetaData creates basic strings for CSV of graph vertex and node information + nodes first then vertices + Now--spawned from getDiagnosis, but assuming they will diverge and want metData to + be stable to visualization codes +-} +getGraphMetaData ∷ GlobalSettings → ProcessedData → (ReducedPhylogeneticGraph, Int) → PhyG [[String]] +getGraphMetaData _ inData (inGraph, graphIndex) = + let decGraph = thd5 inGraph + in + if LG.isEmpty decGraph then pure [] + else do + let useIA = True + let useDO = False + let vertexList = LG.labNodes decGraph + let edgeList = LG.labEdges decGraph + let numLeaves = V.length $ fst3 inData + + -- Vertex final character states for currect node + let vertexTitle = ["Vertex Character State Information"] + -- topHeaderList = ["Graph Index", "Vertex Index", "Vertex Name", "Vertex Type", "Child Vertices", "Parent Vertices", "Data Block", "Character Name", "Character Type", "Preliminary State", "Final State", "Local Cost"] + let topHeaderList = + [ "Graph Index" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Final State" + ] + + -- let vertexInfo = fmap (getVertexCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph)) vertexList + vertexInfo <- + let action :: LG.LNode VertexInfo → [[String]] + action = getVertexCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) + in do + diagPar ← getParallelChunkMap + let result = diagPar action vertexList + pure result + let vertexInfoList = concat vertexInfo + + -- this using IA fields to get changes + let vertexInfoListChanges = vertexInfoList -- concatMap (getVertexCharInfo useIA (thd3 inData) (fst5 inGraph) (fft5 inGraph)) vertexList + + -- Edge length information + let edgeTitle = [[" "], ["Edge Weight/Length Information"]] + let edgeHeaderList = [[" ", "Edge Head Vertex", "Edge Tail Vertex", "Edge Type", "Minimum Length", "Maximum Length", "MidRange Length"]] + + -- this is basically a Show + let edgeInfoList = fmap (U.getEdgeInfo numLeaves) edgeList + + -- Alphabet element numbers + let alphabetTitle = [["Alphabet (element, frequency, number) Gap, if estimated from unaligned sequences, is a minimum"]] + -- False for Use IA here + let alphabetInfo = getDataElementFrequencies False (thd3 inData) + + let alphbetStringLL = extractAlphabetStrings (thd3 inData) + + let vertexChangeTitle = + [ [" "] + , ["Vertex Character Changes"] + , + [ "Graph Index" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Parent Final State" + , "Node Final State" + -- , "Sequence Changes (position, parent final state, node final state)" + ] + ] + + -- let vertexParentStateList = fmap ((: []) . last) (concatMap (getVertexAndParentCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) (V.fromList vertexList)) vertexList) + vertParCharInfoList <- + let action :: LG.LNode VertexInfo → [[String]] + action = getVertexAndParentCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) (V.fromList vertexList) + in do + actionPar <- getParallelChunkMap + let result = actionPar action vertexList + pure result + + let vertexParentStateList = fmap ((: []) . last) $ concat vertParCharInfoList + let vertexStateList = fmap (drop 9) vertexInfoListChanges + + -- process to change to lines of individual changes--basically a transpose + -- True to only report diffs + -- vertexChangeListByPosition = fmap (getAlignmentBasedChanges' True 0) (zip vertexParentStateList vertexStateList) + -- let parentChildStatesList = fmap (getAlignmentBasedChanges' False 0) (zip vertexParentStateList vertexStateList) + parentChildStatesList <- + let action :: ([String], [String]) → [String] + action = getAlignmentBasedChanges' False 0 + in do + actionPar <- getParallelChunkMap + let result = actionPar action (zip vertexParentStateList vertexStateList) + pure result + + -- putting parent states before current state + let vertexStateInfoList = fmap (take 9) vertexInfoListChanges + + let vertexChangeList = L.zipWith3 concat3 vertexStateInfoList vertexParentStateList vertexStateList -- + -- filter out those that are the same states + let differenceList = removeNoChangeLines vertexChangeList + + -- element transformation numbers + let elementTransformationTitle = [["Element Transformations (element<->element, frequency, number) based in Implied Alignment for unaligned sequences"]] + -- get element transfomation by re-parsing formated results--uses teh character states strings this way + let elementTransformationInfo = getDataElementTransformations alphbetStringLL vertexParentStateList vertexStateList parentChildStatesList + + -- Get changes based on edges + let edgeIndexList = fmap LG.toEdge edgeList + + -- should be parallel + let vertexVect = V.fromList $ vertexList + -- let edgeTransformationTotalList = fmap (getEdgeTransformations vertexVect (fft5 inGraph)) edgeIndexList + edgeTransformationTotalList <- + let action :: (LG.Node, LG.Node) → [[(String, [(String, Int)], [[Int]])]] + action = getEdgeTransformations vertexVect (fft5 inGraph) + in do + actionPar <- getParallelChunkMap + let result = actionPar action edgeIndexList + pure result + + -- extract relevent info + let edgeTransformationList = fmap (fmap (fmap fst3)) edgeTransformationTotalList + -- elementNumbersLLL = fmap (fmap (fmap snd3)) edgeTransformationTotalList + let transformationNumbersLLL = fmap (fmap (fmap thd3)) edgeTransformationTotalList + + -- overallElementNumbers = sumEdgeElementLists [] $ take numLeaves elementNumbersLLL + let overallElementTransformations = sumEdgeTransformationLists [] transformationNumbersLLL + -- let overallElementTransformationsFreq = fmap (fmap U.normalizeMatrix) overallElementTransformations + overallElementTransformationsFreq <- + let action :: [[[Int]]] → [[[Double]]] + action = fmap U.normalizeMatrix + in do + actionPar <- getParallelChunkMap + let result = actionPar action overallElementTransformations + pure result + + let vertexHeader = fmap (fmap (take 9)) vertexInfo + + let edgeListLists = knitTitlesChangeInfo vertexHeader edgeTransformationList + + let transformationHeader = fmap (drop 5) $ tail $ head vertexHeader + + + let vertexChangeTitleNew = + [ [" "] + , ["Vertex Character Changes"] + , + [ "" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Parent Final State" + , "Node Final State" + , "Unambiguous Transformations" + ] + ] + + pure $ [vertexTitle, topHeaderList, [show graphIndex]] + <> vertexInfoList + <> edgeTitle + <> edgeHeaderList + <> edgeInfoList + -- <> vertexChangeTitle + -- <> differenceList + <> vertexChangeTitleNew + <> edgeListLists + -- <> elementTransformationTitle + -- <> elementTransformationInfo + -- <> alphabetTitle + -- <> alphabetInfo + where + -- <> fmap show overallElementTransformations + + concat4 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a → a + concat4 a b c d = a <> b <> c <> d + concat3 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a + concat3 a b c = a <> b <> c + +{- | getGraphDiagnosis creates basic strings for CSV of graph vertex and node information +nodes first then vertices +-} +getGraphDiagnosis ∷ GlobalSettings → ProcessedData → (ReducedPhylogeneticGraph, Int) → PhyG [[String]] +getGraphDiagnosis inGS inData (inGraph, graphIndex) = + let decGraph = thd5 inGraph + in + if LG.isEmpty decGraph then pure [] + else do + let useIA = True + let useDO = False + let vertexList = LG.labNodes decGraph + let edgeList = LG.labEdges decGraph + let numLeaves = V.length $ fst3 inData + + -- Vertex final character states for currect node + let vertexTitle = ["Vertex Character State Information"] + -- topHeaderList = ["Graph Index", "Vertex Index", "Vertex Name", "Vertex Type", "Child Vertices", "Parent Vertices", "Data Block", "Character Name", "Character Type", "Preliminary State", "Final State", "Local Cost"] + let topHeaderList = + [ "Graph Index" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Final State" + ] + + -- let vertexInfo = fmap (getVertexCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph)) vertexList + vertexInfo <- + let action :: LG.LNode VertexInfo → [[String]] + action = getVertexCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) + in do + diagPar ← getParallelChunkMap + let result = diagPar action vertexList + pure result + let vertexInfoList = concat vertexInfo + + -- this using IA fields to get changes + let vertexInfoListChanges = vertexInfoList -- concatMap (getVertexCharInfo useIA (thd3 inData) (fst5 inGraph) (fft5 inGraph)) vertexList + + + let edgeInfoList = fmap (U.getEdgeInfo numLeaves) edgeList + + + {- + -- Get complexity information--empty of not complexity + let edgeTripleList = U.getEdgeComplexityFactors inGS inData vertexList edgeList + let edgeComplexityFactor = zipWith (:)(fmap show $ fmap fst3 edgeTripleList) (fmap (:[]) $fmap show $ fmap thd3 edgeTripleList) + + let (vertexComplexityLabel, vertexComplexityList) = if (optimalityCriterion inGS `elem` [PMDL, SI]) then + (["Tail Vertex Complexity", "Complexity Factor"], edgeComplexityFactor) -- fmap (:[]) $ fmap show edgeComplexityFactor) + else (["",""], fmap (:[]) $ replicate (length edgeList) "") + -} + + -- Edge length information + let edgeTitle = [[" "], ["Edge Weight/Length Information"]] + let edgeHeaderList = [[" ", "Edge Head Vertex", "Edge Tail Vertex", "Edge Type", "Minimum Length", "Maximum Length", "MidRange Length"]] + -- , vertexComplexityLabel !! 0, vertexComplexityLabel !! 1]] + + + -- Alphabet element numbers + let alphabetTitle = [["Alphabet (element, frequency, number) Gap, if estimated from unaligned sequences, is a minimum"]] + -- False for Use IA here + let alphabetInfo = getDataElementFrequencies False (thd3 inData) + + let alphbetStringLL = extractAlphabetStrings (thd3 inData) + + let vertexChangeTitle = + [ [" "] + , ["Vertex Character Changes"] + , + [ "Graph Index" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Parent Final State" + , "Node Final State" + -- , "Sequence Changes (position, parent final state, node final state)" + ] + ] + + -- let vertexParentStateList = fmap ((: []) . last) (concatMap (getVertexAndParentCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) (V.fromList vertexList)) vertexList) + vertParCharInfoList <- + let action :: LG.LNode VertexInfo → [[String]] + action = getVertexAndParentCharInfo useDO (thd3 inData) (fst5 inGraph) (fft5 inGraph) (V.fromList vertexList) + in do + actionPar <- getParallelChunkMap + let result = actionPar action vertexList + pure result + + let vertexParentStateList = fmap ((: []) . last) $ concat vertParCharInfoList + let vertexStateList = fmap (drop 9) vertexInfoListChanges + + -- process to change to lines of individual changes--basically a transpose + -- True to only report diffs + -- vertexChangeListByPosition = fmap (getAlignmentBasedChanges' True 0) (zip vertexParentStateList vertexStateList) + -- let parentChildStatesList = fmap (getAlignmentBasedChanges' False 0) (zip vertexParentStateList vertexStateList) + parentChildStatesList <- + let action :: ([String], [String]) → [String] + action = getAlignmentBasedChanges' False 0 + in do + actionPar <- getParallelChunkMap + let result = actionPar action (zip vertexParentStateList vertexStateList) + pure result + + -- putting parent states before current state + let vertexStateInfoList = fmap (take 9) vertexInfoListChanges + + let vertexChangeList = L.zipWith3 concat3 vertexStateInfoList vertexParentStateList vertexStateList -- + -- filter out those that are the same states + let differenceList = removeNoChangeLines vertexChangeList + + -- element transformation numbers + let elementTransformationTitle = [["Element Transformations (element<->element, frequency, number) based in Implied Alignment for unaligned sequences"]] + -- get element transfomation by re-parsing formated results--uses teh character states strings this way + let elementTransformationInfo = getDataElementTransformations alphbetStringLL vertexParentStateList vertexStateList parentChildStatesList + + -- Get changes based on edges + let edgeIndexList = fmap LG.toEdge edgeList + + -- should be parallel + let vertexVect = V.fromList $ vertexList + -- let edgeTransformationTotalList = fmap (getEdgeTransformations vertexVect (fft5 inGraph)) edgeIndexList + edgeTransformationTotalList <- + let action :: (LG.Node, LG.Node) → [[(String, [(String, Int)], [[Int]])]] + action = getEdgeTransformations vertexVect (fft5 inGraph) + in do + actionPar <- getParallelChunkMap + let result = actionPar action edgeIndexList + pure result + + -- extract relevent info + let edgeTransformationList = fmap (fmap (fmap fst3)) edgeTransformationTotalList + -- elementNumbersLLL = fmap (fmap (fmap snd3)) edgeTransformationTotalList + let transformationNumbersLLL = fmap (fmap (fmap thd3)) edgeTransformationTotalList + + -- overallElementNumbers = sumEdgeElementLists [] $ take numLeaves elementNumbersLLL + let overallElementTransformations = sumEdgeTransformationLists [] transformationNumbersLLL + -- let overallElementTransformationsFreq = fmap (fmap U.normalizeMatrix) overallElementTransformations + overallElementTransformationsFreq <- + let action :: [[[Int]]] → [[[Double]]] + action = fmap U.normalizeMatrix + in do + actionPar <- getParallelChunkMap + let result = actionPar action overallElementTransformations + pure result + + let vertexHeader = fmap (fmap (take 9)) vertexInfo + + let edgeListLists = knitTitlesChangeInfo vertexHeader edgeTransformationList + + -- test code to get conditional info of child given parent K(c|p) = K(c<>p) - K(p), K(c)/K(c|p) randomnes index + let characterParentFields = fmap (getIndex 9) edgeListLists + let characterChildFields = fmap (getIndex 10) edgeListLists + let characterEdgeInformationContent = zipWith getEdgeInformationContent characterChildFields characterParentFields + + let transformationHeader = fmap (drop 5) $ tail $ head vertexHeader + + + let vertexChangeTitleNew = + [ [" "] + , ["Vertex Character Changes"] + , + [ "" + , "Vertex Index" + , "Vertex Name" + , "Vertex Type" + , "Child Vertices" + , "Parent Vertices" + , "Data Block" + , "Character Name" + , "Character Type" + , "Parent Final State" + , "Node Final State" + , "Unambiguous Transformations" + ] + ] + + pure $ [vertexTitle, topHeaderList, [show graphIndex]] + <> vertexInfoList + <> edgeTitle + <> edgeHeaderList + <> edgeInfoList + -- <> zipWith (<>) edgeInfoList vertexComplexityList + -- <> vertexChangeTitle + -- <> differenceList + <> vertexChangeTitleNew + <> edgeListLists + --compression lists + -- <> vertexComplexityList + -- <> zipWith (<>) edgeListLists (fmap (:[]) $ fmap showInfo characterEdgeInformationContent) + -- <> elementTransformationTitle + -- <> elementTransformationInfo + -- <> alphabetTitle + -- <> alphabetInfo + where + -- <> fmap show overallElementTransformations + + concat4 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a → a + concat4 a b c d = a <> b <> c <> d + concat3 ∷ ∀ {a}. (Semigroup a) ⇒ a → a → a → a + concat3 a b c = a <> b <> c + + getIndex a s = if (length s > a) then s !! a + else "" + + showInfo a = if snd5 a == -1.0 then "" + else show a + +{- U.getEdgeInformationContent--Experimental + returns conditional complexity of child state given parent--K(c|p) and + ration of child complexity to child conditinal complexity-- K(c) / K(c|p) + compression based (zip) +-} +getEdgeInformationContent :: String -> String -> (Double, Double,Double, Double, Double) +getEdgeInformationContent childString' parentString' = + -- filter out solitary gaps--put in by IA for transformation counting + let childString = filter (/= '-') childString' + parentString = filter (/= '-') parentString' + in + -- empty line + if null childString && null parentString then (0,-1.0, 0,0,0) + -- child empty or missing + else if null childString then (0,-1.0, 0,0,0) + -- parent empty or missing + else if null parentString then + -- only K(c) + let (_, _, _, kc) = CU.getInformationContent childString + in (kc, 1.0, 0,0,0) + else + let (_, _, _, kc) = CU.getInformationContent childString + (_, _, _, kcGp) = CU.getInformationContent (childString <> parentString) + (_, _, _, kp) = CU.getInformationContent parentString + in + (kcGp - kp, kcGp / kc, kc, kp, kcGp) + + +{- | sumEdgeElementLists takes list of edges of block of characters transomftion matricwes and +sums over edge (outermost) list for eventual csv output +-} +sumEdgeElementLists ∷ [[[(String, Int)]]] → [[[[(String, Int)]]]] → [[[(String, Int)]]] +sumEdgeElementLists curSum inEdgeList = + if null inEdgeList + then curSum + else + let firstEdge = head inEdgeList + newSum = + if null curSum + then firstEdge + else addBlockCharacterElements curSum firstEdge + in sumEdgeElementLists newSum (tail inEdgeList) + + +-- | addBlockCharacterElements adds two block lists of element lists +addBlockCharacterElements ∷ [[[(String, Int)]]] → [[[(String, Int)]]] → [[[(String, Int)]]] +addBlockCharacterElements inBlockEdgeL1 inBlockEdgeL2 = + if null inBlockEdgeL1 + then inBlockEdgeL2 + else + if null inBlockEdgeL2 + then inBlockEdgeL1 + else zipWith addCharacterElementLists inBlockEdgeL1 inBlockEdgeL2 + + +-- | addCharacterElementLists adds two lists of charcter element lists +addCharacterElementLists ∷ [[(String, Int)]] → [[(String, Int)]] → [[(String, Int)]] +addCharacterElementLists inCharList1 inCharList2 = + if null inCharList1 + then inCharList2 + else + if null inCharList2 + then inCharList1 + else zipWith addElementLists inCharList1 inCharList2 + + +{- | addElementLists adds two lists of elements and numbers + Assumes lists are in same element order +-} +addElementLists ∷ [(String, Int)] → [(String, Int)] → [(String, Int)] +addElementLists elementL1 elementL2 = + if null elementL1 + then elementL2 + else + if null elementL2 + then elementL1 + else + if (fmap fst elementL1) /= (fmap fst elementL2) + then error ("Element lists do not match: " <> (show $ fmap fst elementL1) <> " versus " <> (show $ fmap fst elementL2)) + else + let numberL1 = fmap snd elementL1 + numberL2 = fmap snd elementL2 + newNUmberL = zipWith (+) numberL1 numberL2 + in zip (fmap fst elementL1) newNUmberL + + +{- | sumEdgeTransformationLists takes list of edges of block of characters transomftion matricwes and +sums over edge (outermost) list for eventual csv output +-} +sumEdgeTransformationLists ∷ [[[[Int]]]] → [[[[[Int]]]]] → [[[[Int]]]] +sumEdgeTransformationLists curSum inEdgeList = + if null inEdgeList + then curSum + else + let firstEdge = head inEdgeList + newSum = + if null curSum + then firstEdge + else addBlockCharacterMatrix curSum firstEdge + in sumEdgeTransformationLists newSum (tail inEdgeList) + + +-- | addBlockCharacterMatrix adds two block lists of character lists of transrmation matrices +addBlockCharacterMatrix ∷ [[[[Int]]]] → [[[[Int]]]] → [[[[Int]]]] +addBlockCharacterMatrix inBlockEdgeL1 inBlockEdgeL2 = + if null inBlockEdgeL1 + then inBlockEdgeL2 + else + if null inBlockEdgeL2 + then inBlockEdgeL1 + else zipWith addCharacterLists inBlockEdgeL1 inBlockEdgeL2 + + +-- | addCharacterLists adds two lists of charcter transfomratin matrices +addCharacterLists ∷ [[[Int]]] → [[[Int]]] → [[[Int]]] +addCharacterLists inCharList1 inCharList2 = + if null inCharList1 + then inCharList2 + else + if null inCharList2 + then inCharList1 + else zipWith (U.combineMatrices (+)) inCharList1 inCharList2 + + +{- | addBlockCharStrings adds block and character strings to transformation info + for CSV output +-} +addBlockCharStrings ∷ [[String]] → [[String]] → [[String]] → [[String]] → [[String]] +addBlockCharStrings labelStringList elementStringList matrixStringList matrixStringList2 = + if null labelStringList || null matrixStringList + then [] + else + let blockTitle = head labelStringList + charMatrixList = head matrixStringList + charMatrixList2 = head matrixStringList2 + elementList = head elementStringList + charNameList = take (length charMatrixList) (tail labelStringList) + in -- first doesn't ahve numbers of transforms + -- (blockTitle : (zipWith3 (concat3) charNameList (fmap (:[]) elementList) (fmap (:[]) charMatrixList))) <> addBlockCharStrings (drop (1 + length charMatrixList) labelStringList) (tail elementStringList) (tail matrixStringList) + ( blockTitle + : (L.zipWith4 (concat4) charNameList (fmap (: []) elementList) (fmap (: []) charMatrixList) (fmap (: []) charMatrixList2)) + ) + <> addBlockCharStrings + (drop (1 + length charMatrixList) labelStringList) + (tail elementStringList) + (tail matrixStringList) + (tail matrixStringList2) + where + -- concat3 a b c = a <> b <> c + concat4 a b c d = a <> b <> c <> d + + +{- | knitTitlesChangeInfo takes [[[String]]] of title info and knits with [[[String]]] of character change info + into [[String]] for CSV output + Edges x Blocks x characters (final is transformation info) +-} +knitTitlesChangeInfo ∷ [[[String]]] → [[[String]]] → [[String]] +knitTitlesChangeInfo titlesLLL transLLL = + if null titlesLLL || null transLLL + then [] + else -- trace ("KTCI: " <> (show titlesLLL)) $ + -- trace ("KTIC2: " <> (show transLLL)) + concat $ zipWith formatEdge titlesLLL transLLL + + +{- | formatEdge formats edges string for CSV + divides list into blocks by lengfth olf charcters and block titles haveing an extra line +-} +formatEdge ∷ [[String]] → [[String]] → [[String]] +formatEdge titleLL transLL = + if null titleLL || null transLL + then [] + else + let edgeTitle = head titleLL + numCharsL = fmap length transLL + blockTitlesLLL = U.divideList (fmap (+ 1) numCharsL) (tail titleLL) + blockTransLLL = transLL -- U.divideList numCharsL transLL + in -- trace ("FE: " <> (show edgeTitle)) $ + edgeTitle : (concat $ zipWith (formatBlock numCharsL) blockTitlesLLL blockTransLLL) + + +-- | formatBlocks formats block string for CSV +formatBlock ∷ [Int] → [[String]] → [String] → [[String]] +formatBlock charLengthL titleLL transL = + if null titleLL || null transL + then [] + else + let blockTitle = head titleLL + charTitleLL = tail titleLL + in -- trace ("FB: " <> (show blockTitle)) $ + blockTitle : (zipWith formatCharacter charTitleLL transL) + + +-- | formatCharacter formats charcater string for CSV +formatCharacter ∷ [String] → String → [String] +formatCharacter titleLine transS = + if null titleLine || null transS + then [] + else -- trace ("FC: " <> (show titleLine)) $ + titleLine <> (LS.splitOn "," transS) + + +{- | getEdgeTransformations get tranformations for an edge by block and character + changes by block then character +-} +getEdgeTransformations + ∷ V.Vector (LG.LNode VertexInfo) → V.Vector (V.Vector CharInfo) → (LG.Node, LG.Node) → [[(String, [(String, Int)], [[Int]])]] +getEdgeTransformations nodeVect charInfoVV (parentIndex, childIndex) = + let parentNodeLabel = snd $ nodeVect V.! parentIndex + childNodeLabel = snd $ nodeVect V.! childIndex + parentBlockData = vertData parentNodeLabel + childBlockData = vertData childNodeLabel + in V.toList $ V.zipWith3 getEdgeBlockChanges parentBlockData childBlockData charInfoVV + + +-- | getEdgeBlockChanges takes VertexBlockData from parent and child node and gets transformations by character +getEdgeBlockChanges + ∷ V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → [(String, [(String, Int)], [[Int]])] +getEdgeBlockChanges parentBlockData childBlockData charInfoV = + V.toList $ V.zipWith3 getCharacterChanges parentBlockData childBlockData charInfoV + + +{- | getCharacterChanges takes strings of character pairs and outputs differences as pairs, + for unaligned sequences performs a DO and uses aligned states +-} +getCharacterChanges ∷ CharacterData → CharacterData → CharInfo → (String, [(String, Int)], [[Int]]) +getCharacterChanges parentChar nodeChar charInfo = + let localType = charType charInfo + localAlphabet = (ST.toString <$> alphabet charInfo) + + -- this to avoid recalculations and list access issues + lANES = (fromJust $ NE.nonEmpty $ alphabetSymbols localAlphabet) + lAVect = V.fromList $ NE.toList $ lANES + + -- getCharState + -- ∷ ∀ {b} + -- . (Show b, Bits b) + -- ⇒ b + -- → String + getCharState a = U.bitVectToCharState localAlphabet lANES lAVect a -- concat $ NE.toList $ decodeState localAlphabet a -- + (slimParent, slimChild) = + if localType `notElem` [NucSeq, SlimSeq] + then ([], []) + else + let (_, r) = + slimPairwiseDO + (slimTCM charInfo) + (M.makeDynamicCharacterFromSingleVector $ slimFinal parentChar) + (M.makeDynamicCharacterFromSingleVector $ slimFinal nodeChar) + in -- trace (show (r, length $ SV.foldMap getCharState $ extractMediansLeftGapped r, length $ SV.foldMap getCharState $ extractMediansRightGapped r)) + (SV.foldMap getCharState $ extractMediansLeftGapped r, SV.foldMap getCharState $ extractMediansRightGapped r) + (wideParent, wideChild) = + if localType `notElem` [WideSeq, AminoSeq] + then ([], []) + else + let coefficient = MR.minInDelCost (wideTCM charInfo) + (_, r) = + widePairwiseDO + coefficient + (MR.retreivePairwiseTCM $ wideTCM charInfo) + (M.makeDynamicCharacterFromSingleVector $ wideFinal parentChar) + (M.makeDynamicCharacterFromSingleVector $ wideFinal nodeChar) + in (UV.foldMap getCharState $ extractMediansLeftGapped r, UV.foldMap getCharState $ extractMediansRightGapped r) + (hugeParent, hugeChild) = + if localType `notElem` [HugeSeq] + then ([], []) + else + let coefficient = MR.minInDelCost (hugeTCM charInfo) + (_, r) = + hugePairwiseDO + coefficient + (MR.retreivePairwiseTCM $ hugeTCM charInfo) + (M.makeDynamicCharacterFromSingleVector $ hugeFinal parentChar) + (M.makeDynamicCharacterFromSingleVector $ hugeFinal nodeChar) + in (foldMap getCharState $ extractMediansLeftGapped r, foldMap getCharState $ extractMediansRightGapped r) + + (parentState, nodeState) + | localType == Add = (show $ rangeFinal parentChar, show $ rangeFinal nodeChar) + | localType == NonAdd = + ( concat $ V.map (U.bitVectToCharStateQual localAlphabet) $ stateBVFinal parentChar + , concat $ V.map (U.bitVectToCharStateQual localAlphabet) $ stateBVFinal nodeChar + ) + | localType `elem` packedNonAddTypes = + (UV.foldMap getCharState $ packedNonAddFinal parentChar, UV.foldMap getCharState $ packedNonAddFinal nodeChar) + | localType == Matrix = + (show $ fmap (fmap fst3) $ matrixStatesFinal parentChar, UV.foldMap getCharState $ packedNonAddFinal nodeChar) + | localType `elem` sequenceCharacterTypes = case localType of + x | x `elem` [NucSeq, SlimSeq] → (slimParent, slimChild) + x | x `elem` [WideSeq, AminoSeq] → (wideParent, wideChild) + x | x `elem` [HugeSeq] → (hugeParent, hugeChild) + x + | x `elem` [AlignedSlim] → + (SV.foldMap getCharState $ alignedSlimFinal parentChar, SV.foldMap getCharState $ alignedSlimFinal nodeChar) + x + | x `elem` [AlignedWide] → + (UV.foldMap getCharState $ alignedWideFinal parentChar, UV.foldMap getCharState $ alignedWideFinal nodeChar) + x + | x `elem` [AlignedHuge] → + (foldMap getCharState $ alignedHugeFinal parentChar, foldMap getCharState $ alignedHugeFinal nodeChar) + _ → error ("Un-implemented data type " <> show localType) + | otherwise = error ("Un-implemented data type " <> show localType) + + -- character states and transformation numbers (only counting unambiguous) + elementsParent = getElementNumbers (alphabetSymbols localAlphabet) parentState + elementsChild = getElementNumbers (alphabetSymbols localAlphabet) nodeState + elementsCombined = elementsChild -- zip (alphabetSymbols localAlphabet) (zipWith (+) (fmap snd elementsParent) (fmap snd elementsChild)) + elementsCombinedString = L.intercalate "," $ fmap makeElementString elementsCombined + + emptyMatrix = replicate (length localAlphabet) (replicate (length localAlphabet) 0) + elementTransformations = getTransformations (alphabetSymbols localAlphabet) parentState nodeState emptyMatrix + in -- convert String to pair list + -- (parentState <> "," <> nodeState <> "," <> elementsCombinedString <> "," <> (replaceComma $ show elementTransformations), elementsCombined, elementTransformations) + ( parentState <> "," <> nodeState <> "," <> (replaceComma $ show elementTransformations) + , elementsCombined + , elementTransformations + ) + where + -- ((replaceComma $ show elementTransformations), elementsCombined, elementTransformations) + + makeElementString (a, b) = a <> " " <> (show b) + replaceComma a = + if null a + then [] + else + if head a == ',' + then ' ' : replaceComma (tail a) + else (head a) : replaceComma (tail a) + + +{- | getTransformations takes alphabet and parent and child SINGE CHARACXTRE STATES +and returns matrix of numbers of changes +This an absurdely slow implementation n^2 at least +-} +getTransformations ∷ [String] → String → String → [[Int]] → [[Int]] +getTransformations alphabet parentCharList childCharList curMatrix = + if null parentCharList || null childCharList + then curMatrix + else + let pChar = head parentCharList + cChar = head childCharList + in -- trace ("GT: " <> (show curMatrix)) $ + if pChar == cChar + then getTransformations alphabet (tail parentCharList) (tail childCharList) curMatrix + else + if (pChar : []) `elem` alphabet && (cChar : []) `elem` alphabet + then + let pIndex = fromJust $ L.elemIndex (pChar : []) alphabet + cIndex = fromJust $ L.elemIndex (cChar : []) alphabet + newMatrix = incrementMatrix curMatrix pIndex cIndex + in getTransformations alphabet (tail parentCharList) (tail childCharList) newMatrix + else getTransformations alphabet (tail parentCharList) (tail childCharList) curMatrix + + +-- | incrementMatrix updates matrix very stupidly +incrementMatrix ∷ [[Int]] → Int → Int → [[Int]] +incrementMatrix inMatrix p c = + if null inMatrix + then [] + else + let firstRows = take p inMatrix + lastRows = drop (p + 1) inMatrix + updateRow = inMatrix !! p + firstPart = take c updateRow + lastPart = drop (c + 1) updateRow + newRow = firstPart <> [1 + (updateRow !! c)] <> lastPart + in firstRows <> [newRow] <> lastRows + + +-- | getElementNumbers takes a String of SINGLECHARACTRE states and checks versus alphabet for unambiguous states numbers +getElementNumbers ∷ [String] → String → [(String, Int)] +getElementNumbers alphabet charList = + if null charList + then [] + else + if null alphabet + then [] + else + let stringList = fmap (: []) charList + alphNumber = length $ L.findIndices (== (head alphabet)) stringList + in ((head alphabet), alphNumber) : getElementNumbers (tail alphabet) charList + + +{- | getAlignmentBasedChanges' takes two equal length implied Alignments and outputs list of element changes between the two +assumes single String in each list +-} +getAlignmentBasedChanges' ∷ Bool → Int → ([String], [String]) → [String] +getAlignmentBasedChanges' onlyDiffs index (a, b) + | length a > 1 || length b < 1 = error ("Should only have length 1 lists here: " <> (show (length a, length b))) + | null a = [] + | otherwise = + -- empty spaces sometimes + let stringList1 = filter (not . null) $ LS.splitOn (" ") (head a) + stringList2 = filter (not . null) $ LS.splitOn (" ") (head b) + in -- this so returns empty for non--sequence characters + if null stringList1 + then [] + else -- this shouldn't happen but there seem to be extraneous '-' at time in wide and huge IAs + -- since all zip functions as well--changed getAlignmentBasedChangesGuts to check for either null stringList1 or stringList2 + -- to avoid issue-a bit of a hack + -- else if length stringList1 /= length stringList2 then + -- error ("Unequal characters in parent and node state lists in getAlignmentBasedChanges': " + -- <> (show (length stringList1, length stringList2) <> "\n" <> (show stringList1) <> "\n" <> (show stringList2))) + getAlignmentBasedChangesGuts onlyDiffs index stringList1 stringList2 + + +-- | getAlignmentBasedChangesGuts takes processed element lists and creates string of changes +getAlignmentBasedChangesGuts ∷ Bool → Int → [String] → [String] → [String] +getAlignmentBasedChangesGuts onlyDiffs index a b + | null a || null b = [] + | (head a) == (head b) && onlyDiffs = getAlignmentBasedChangesGuts onlyDiffs (index + 1) (tail a) (tail b) + | otherwise = + ((show index) <> ":" <> (head a) <> "," <> (head b)) : getAlignmentBasedChangesGuts onlyDiffs (index + 1) (tail a) (tail b) + + +{- | getAlignmentBasedChanges takes two equal length implied Alignments and outputs list of element changes between the two +only working for nucleotide prealigned or not +-} +getAlignmentBasedChanges ∷ Int → ([String], [String]) → [String] +getAlignmentBasedChanges index (a, b) = + if null a + then [] + else -- empty spaces sometimes + + let string1 = + if (head $ head a) == ' ' + then tail $ head a + else head a + string2 = + if (head $ head b) == ' ' + then tail $ head b + else head b + in if null string1 + then [] + else -- this so returns empty for non--sequence characters + + if (length string1 < 2) || (length string2 < 2) + then [] + else + if length string1 /= length string2 + then [] + else -- index stuff in case gets out of sync in list (missing data, no changes etc) + + if (take 1 string1) == (take 1 string2) + then getAlignmentBasedChanges (index + 1) ([tail string1], [tail string2]) + else + ((show index) <> ":" <> (take 1 string1) <> "," <> (take 1 string2)) + : getAlignmentBasedChanges (index + 1) ([tail string1], [tail string2]) + + +{- | removeNoChangeLines takes lines of vertex changes and removes lines where parent and child startes are the same +so missing or ambiguous in one and not the other will be maintained +-} +removeNoChangeLines ∷ [[String]] → [[String]] +removeNoChangeLines inStringList = + if null inStringList + then [] + else + let parentState = head inStringList !! 9 + childState = head inStringList !! 10 + in ( if (parentState == " ") || (parentState /= childState) + then (head inStringList) : removeNoChangeLines (tail inStringList) + else removeNoChangeLines (tail inStringList) + ) + + +{- | getVertexCharInfo returns a list of list of Strings of vertex information +one list for each character at the vertex +-} +getVertexCharInfo + ∷ Bool → V.Vector BlockData → SimpleGraph → V.Vector (V.Vector CharInfo) → LG.LNode VertexInfo → [[String]] +getVertexCharInfo useIA blockDataVect inGraph charInfoVectVect inVert = + let nodeParents = LG.parents inGraph (fst inVert) + parentNodes + | nodeType (snd inVert) == RootNode = "None" + | nodeType (snd inVert) == LeafNode = show nodeParents + | otherwise = show $ parents (snd inVert) + childNodes = if nodeType (snd inVert) == LeafNode then "None" else show $ children (snd inVert) + basicInfoList = + [ " " + , show $ fst inVert + , T.unpack $ vertName (snd inVert) + , show $ nodeType (snd inVert) + , childNodes + , parentNodes + , " " + , " " + , " " + , " " + , " " + ] + blockCharVect = V.zip3 (V.map fst3 blockDataVect) (vertData (snd inVert)) charInfoVectVect + blockInfoList = concat $ V.toList $ V.map (getBlockList useIA) blockCharVect + in basicInfoList : blockInfoList + + +{- | getVertexAndParentCharInfo returns a list of list of Strings of vertex information +for child and its parent +-} +getVertexAndParentCharInfo + ∷ Bool + → V.Vector BlockData + → SimpleGraph + → V.Vector (V.Vector CharInfo) + → V.Vector (LG.LNode VertexInfo) + → LG.LNode VertexInfo + → [[String]] +getVertexAndParentCharInfo useIA blockDataVect inGraph charInfoVectVect allVertVect inVert = + let nodeParents = LG.parents inGraph (fst inVert) + parentNodes + | nodeType (snd inVert) == RootNode = "None" + | nodeType (snd inVert) == LeafNode = show nodeParents + | otherwise = show $ parents (snd inVert) + childNodes = if nodeType (snd inVert) == LeafNode then "None" else show $ children (snd inVert) + basicInfoList = + [ " " + , show $ fst inVert + , T.unpack $ vertName (snd inVert) + , show $ nodeType (snd inVert) + , childNodes + , parentNodes + , " " + , " " + , " " + , " " + , " " + ] + blockCharVectNode = V.zip3 (V.map fst3 blockDataVect) (vertData (snd inVert)) charInfoVectVect + + -- for root--gets its own values as parent--filtered out in diff list later + blockCharVectParent = + if parentNodes == "None" + then blockCharVectNode + else V.zip3 (V.map fst3 blockDataVect) (vertData (snd $ allVertVect V.! head nodeParents)) charInfoVectVect + blockInfoListParent = concat $ V.toList $ V.map (getBlockList useIA) blockCharVectParent + in basicInfoList : blockInfoListParent + + +-- | getBlockList takes a pair of Vector of chardata and vector of charInfo and returns Strings +getBlockList ∷ Bool → (NameText, V.Vector CharacterData, V.Vector CharInfo) → [[String]] +getBlockList useIA (blockName, blockDataVect, charInfoVect) = + let firstLine = [" ", " ", " ", " ", " ", " ", T.unpack blockName, " ", " ", " ", " ", " "] + charlines = V.toList $ V.map (makeCharLine useIA) (V.zip blockDataVect charInfoVect) + in firstLine : charlines + + +{- | makeCharLine takes character data +will be less legible for optimized data--so should use a diagnosis +based on "naive" data for human legible output +need to add back-converting to observed states using alphabet in charInfo +nothing here for packed since not "entered" +useIA for using alignment fields for changes in diagnosis +is useIA == False then just printing final sequence and removes spaces +for single character sequences (e.g. DNA/Protein) +-} +makeCharLine ∷ Bool → (CharacterData, CharInfo) → [String] +makeCharLine useIA (blockDatum, charInfo) = + let localType = charType charInfo + localAlphabet = (ST.toString <$> alphabet charInfo) + isPrealigned = + if prealigned charInfo + then "Prealigned " + else "" + enhancedCharType + | localType `elem` sequenceCharacterTypes = (isPrealigned <> (show localType)) + | localType `elem` exactCharacterTypes = (show localType) + | otherwise = error ("Character Type :" <> (show localType) <> "unrecogniized or not implemented") + + -- set where get string from, IA for change lists + (slimField, wideField, hugeField) = + if useIA + then (slimIAFinal blockDatum, wideIAFinal blockDatum, hugeIAFinal blockDatum) + else (slimFinal blockDatum, wideFinal blockDatum, hugeFinal blockDatum) + + -- this to avoid recalculations and list access issues + lANES = (fromJust $ NE.nonEmpty $ alphabetSymbols localAlphabet) + lAVect = V.fromList $ NE.toList $ lANES + + getCharState + ∷ ∀ {b} + . (Show b, Bits b) + ⇒ b + → String + getCharState a = U.bitVectToCharState localAlphabet lANES lAVect a + + -- (stringPrelim, stringFinal) = if localType == Add then (show $ snd3 $ rangePrelim blockDatum, show $ rangeFinal blockDatum) + stringFinal + | localType == Add = (show $ rangeFinal blockDatum) + | localType == NonAdd = (concat $ V.map (U.bitVectToCharStateQual localAlphabet) $ stateBVFinal blockDatum) + | localType `elem` packedNonAddTypes = (UV.foldMap getCharState $ packedNonAddFinal blockDatum) + | localType == Matrix = (show $ fmap (fmap fst3) $ matrixStatesFinal blockDatum) + | localType `elem` sequenceCharacterTypes = case localType of + x | x `elem` [NucSeq] → (SV.foldMap getCharState slimField) + x | x `elem` [SlimSeq] → (SV.foldMap getCharState slimField) + x | x `elem` [WideSeq] → (UV.foldMap getCharState wideField) + x | x `elem` [AminoSeq] → (UV.foldMap getCharState wideField) + x | x `elem` [HugeSeq] → (foldMap getCharState hugeField) + x | x `elem` [AlignedSlim] → (SV.foldMap getCharState $ alignedSlimFinal blockDatum) + x | x `elem` [AlignedWide] → (UV.foldMap getCharState $ alignedWideFinal blockDatum) + x | x `elem` [AlignedHuge] → (foldMap getCharState $ alignedHugeFinal blockDatum) + _ → error ("Un-implemented data type " <> show localType) + | otherwise = error ("Un-implemented data type " <> show localType) + + -- this removes ' ' between elements if sequence elements are a single character (e.g. DNA) + stringFinal' + | useIA = stringFinal + | localType `elem` [NucSeq, AminoSeq] = filter (/= ' ') stringFinal + | otherwise = + let maxSymbolLength = maximum $ fmap length $ SET.toList (alphabetSymbols localAlphabet) + in if maxSymbolLength > 1 + then removeRecurrrentSpaces $ fmap nothingToNothing stringFinal + else filter (/= ' ') stringFinal + in -- trace ("MCL:" <> (show localType) <> " " <> stringFinal) + -- [" ", " ", " ", " ", " ", " ", " ", T.unpack $ name charInfo, enhancedCharType, stringPrelim, stringFinal, show $ localCost blockDatum] + [" ", " ", " ", " ", " ", " ", " ", T.unpack $ name charInfo, enhancedCharType, stringFinal'] + where + nothingToNothing a = + if a == '\8220' + then '\00' + else a + + +-- | removeRecurrrentSpaces removes spaces that are followed by spaces +removeRecurrrentSpaces ∷ String → String +removeRecurrrentSpaces inString + | null inString = [] + | length inString == 1 = inString + | head inString == ' ' = + if (head $ tail inString) == ' ' + then removeRecurrrentSpaces (tail inString) + else (head inString) : removeRecurrrentSpaces (tail inString) + | otherwise = (head inString) : removeRecurrrentSpaces (tail inString) + + +-- | TNT report functions + +{- | getTNTStrings returns as a set of interleaved blocks==one for each "character" so not mix numerical and +sequence characters +softwired use display trees, hardWired transform to softwired then proceed with display trees +key to keep cost matrices and weights +Uses Phylogenetic graph to noit repeat functions for display and charcter trees +-} +getTNTString ∷ GlobalSettings → ProcessedData → (PhylogeneticGraph, Int) → PhyG String +getTNTString inGS inData (inGraph, graphNumber) = + if LG.isEmpty (fst6 inGraph) + then error "No graphs for create TNT data for in getTNTString" + else + let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) + leafNameList = fmap ((<> "\t") . T.unpack) (fmap (vertName . snd) leafList) + headerString = + "taxname + 90;\nxread\n'TNT data for Graph " + <> show graphNumber + <> " generated by PhylogeneticGraph (PhyG)\n\tSource characters:\n" + finalString = "proc/;\nlength;\n\n" + numTaxa = V.length $ fst3 inData + charInfoVV = six6 inGraph + + -- get character information in 3-tuples and list of lengths--to match lengths + ccCodeInfo = getCharacterInfo charInfoVV + + -- only first of block list type--so assumes single sequence per block and only that type--no exact types + charTypeList = V.toList $ fmap charType $ fmap V.head charInfoVV + + -- Tree in TNT format + tntTreeString = + "tread 'Trees from PhyG'\n" + <> (GO.makeNewickList True False False (outgroupIndex inGS) [GO.convertDecoratedToSimpleGraph $ thd6 inGraph] [snd6 inGraph]) + in if graphType inGS == Tree + then + let leafDataList = fmap (vertData . snd) leafList + in do + -- get character strings + blockStringListstList ← mapM (getTaxonCharStringListPair charInfoVV) (zip leafDataList leafNameList) + -- let blockStringListstList = V.toList blockStringListstList' + interleavedBlocks' ← makePairInterleave blockStringListstList charTypeList + let interleavedBlocks = concat interleavedBlocks' + -- taxonCharacterStringList = V.toList $ fmap ((<> "\n") . getTaxonCharString charInfoVV) leafDataList + -- nameCharStringList = concat $ zipWith (<>) leafNameList taxonCharacterStringList + + -- length information for cc code extents + let charLengthList = concat $ V.toList $ V.zipWith getBlockLength (head leafDataList) charInfoVV + + -- Block/Character names for use in comment to show sources of new characters + let charNameList = concat $ V.toList $ fmap getBlockNames charInfoVV + + let nameLengthPairList = zip charNameList charLengthList + let nameLengthString = concat $ pairListToStringList nameLengthPairList 0 + + -- merge lengths and cc codes + let ccCodeString = mergeCharInfoCharLength ccCodeInfo charLengthList 0 + + -- trace ("GTNTS:" <> (show charLengthList)) + pure $ + headerString + <> nameLengthString + <> "'\n" + <> show (sum charLengthList) + <> " " + <> show numTaxa + <> "\n" + <> interleavedBlocks + <> ";\n" + <> ccCodeString + <> tntTreeString + <> finalString + else -- for softwired networks--use display trees + + if graphType inGS == SoftWired + then do + -- get display trees for each data block-- takes first of potentially multiple + middleStuffString ← createDisplayTreeTNT inGS inData inGraph + + pure $ headerString <> middleStuffString <> tntTreeString <> finalString + else -- for hard-wired networks--transfoirm to softwired and use display trees + + if graphType inGS == HardWired + then + let newGS = inGS{graphType = SoftWired} + + pruneEdges = False + warnPruneEdges = False + + startVertex ∷ ∀ {a}. Maybe a + startVertex = Nothing + in do + newGraph ← TRAV.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex (fst6 inGraph) + + middleStuffString ← createDisplayTreeTNT inGS inData newGraph + + logWith + LogWarn + "There is no implied alignment for hard-wired graphs--at least not yet. Ggenerating TNT text via softwired transformation\n" + -- headerString <> nameLengthString <> "'\n" <> (show $ sum charLengthList) <> " " <> (show numTaxa) <> "\n" + -- <> nameCharStringList <> ";\n" <> ccCodeString <> finalString + pure $ headerString <> middleStuffString <> tntTreeString <> finalString + else do + logWith LogWarn ("TNT not yet implemented for graphtype " <> show (graphType inGS) <> "\n") + pure $ ("There is no implied alignment for " <> show (graphType inGS)) + + +{- | makePairInterleave creates interleave block strings from character name block list +list of taxa and blocck with taxon name +-} +makePairInterleave ∷ [[String]] → [CharType] → PhyG [String] +makePairInterleave inTaxCharStringList alphabetType = + -- concat $ fmap concat inTaxCharStringList + if null inTaxCharStringList + then do pure [] + else + if null $ head inTaxCharStringList + then do pure [] + else + let firstInterleave = concatMap (<> "\n") $ fmap head inTaxCharStringList + remainder = fmap tail inTaxCharStringList + interleaveMarkerString + | head alphabetType `elem` exactCharacterTypes = "& [num]" + | head alphabetType `elem` [NucSeq, AlignedSlim] = "& [dna gaps]" + | head alphabetType `elem` [AminoSeq, AlignedWide] = "& [prot gaps]" + | otherwise = "& [other]" + in do + endString ← makePairInterleave remainder (tail alphabetType) + let returnString = (interleaveMarkerString <> "\n") : (firstInterleave : endString) + if (head alphabetType `notElem` (exactCharacterTypes <> [NucSeq, AlignedSlim] <> [AminoSeq, AlignedWide])) + then do + logWith LogWarn ("Warning--sequence data in tnt ouput not of type TNT accepts" <> "\n") + pure returnString + else do + pure returnString + + +-- | createDisplayTreeTNT take a softwired graph and creates TNT data string +createDisplayTreeTNT ∷ GlobalSettings → ProcessedData → PhylogeneticGraph → PhyG String +createDisplayTreeTNT inGS inData inGraph = + let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) + leafNameList = fmap ((<> "\t") . T.unpack) (fmap (vertName . snd) leafList) + charInfoVV = six6 inGraph + + -- only first of block list type--so assumes single sequence per block and only that type--no exact types + charTypeList = V.toList $ fmap charType $ fmap V.head charInfoVV + + numTaxa = V.length $ fst3 inData + ccCodeInfo = getCharacterInfo charInfoVV + + -- parallel stuff + contract ∷ [DecoratedGraph] → SimpleGraph + contract = GO.contractIn1Out1EdgesRename . GO.convertDecoratedToSimpleGraph . head + + block ∷ BlockData → ProcessedData + block = makeBlockData (fst3 inData) (snd3 inData) + + traverseAction ∷ (ProcessedData, SimpleGraph) → PhyG PhylogeneticGraph + traverseAction = TRAV.multiTraverseFullyLabelGraphPair (inGS{graphType = Tree}) False False Nothing + + taxonString ∷ (VertexBlockData, String) → PhyG [String] + taxonString = getTaxonCharStringListPair charInfoVV + in do + contractPar ← getParallelChunkMap + let blockDisplayList = contractPar contract (V.toList $ fth6 inGraph) + + -- blockDisplayList = fmap (GO.contractIn1Out1EdgesRename . GO.convertDecoratedToSimpleGraph . head) (V.toList $ fth6 inGraph) `using` PU.myParListChunkRDS + -- blockDisplayList = PU.seqParMap PU.myStrategyHighLevel (GO.contractIn1Out1EdgesRename . GO.convertDecoratedToSimpleGraph . head) (V.toList $ fth6 inGraph) + + -- create separate processed data for each block + blockPar ← getParallelChunkMap + let blockProcessedDataList = blockPar block (V.toList $ thd3 inData) + -- blockProcessedDataList = PU.seqParMap PU.myStrategyHighLevel (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) + + -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to creeate IAs + decoratedBlockTreeList ← + getParallelChunkTraverse >>= \pTraverse → + traverseAction `pTraverse` zip blockProcessedDataList blockDisplayList + + -- create leaf data by merging display graph block data (each one a phylogentic graph) + let (leafDataList, mergedCharInfoVV) = mergeDataBlocks decoratedBlockTreeList [] [] + + -- get character block strings as interleaved groups + blockStringListstList ← + getParallelChunkTraverse >>= \pTraverse → + taxonString `pTraverse` zip (V.toList leafDataList) leafNameList + + interleavedBlocks ← fold <$> makePairInterleave blockStringListstList charTypeList + + -- taxonCharacterStringList = V.toList $ fmap ((<> "\n") . getTaxonCharString mergedCharInfoVV) leafDataList + -- nameCharStringList = concat $ zipWith (<>) leafNameList taxonCharacterStringList + + -- length information for cc code extents + let charLengthList = concat $ V.toList $ V.zipWith getBlockLength (V.head leafDataList) mergedCharInfoVV + + -- Block/Character names for use in comment to show sources of new characters + let charNameList = concat $ V.toList $ fmap getBlockNames charInfoVV + + let nameLengthPairList = zip charNameList charLengthList + let nameLengthString = concat $ pairListToStringList nameLengthPairList 0 + + -- merge lengths and cc codes + let ccCodeString = mergeCharInfoCharLength ccCodeInfo charLengthList 0 + pure $ + nameLengthString + <> "'\n" + <> show (sum charLengthList) + <> " " + <> show numTaxa + <> "\n" + <> interleavedBlocks + <> ";\n" + <> ccCodeString + + +-- | pairListToStringList takes alist of (String, Int) and a starting index and returns scope of charcter for leading comment +pairListToStringList ∷ [(String, Int)] → Int → [String] +pairListToStringList pairList startIndex = + if null pairList + then [] + else + let (a, b) = head pairList + in ("\t\t" <> show startIndex <> "-" <> show (b + startIndex - 1) <> " : " <> a <> "\n") + : pairListToStringList (tail pairList) (startIndex + b) + + +{- | mergeDataBlocks takes a list of Phylogenetic Graphs (Trees) and merges the data blocks (each graph should have only 1) +and merges the charInfo Vectors returning data and charInfo +-} +mergeDataBlocks + ∷ [PhylogeneticGraph] + → [[V.Vector CharacterData]] + → [V.Vector CharInfo] + → (V.Vector (V.Vector (V.Vector CharacterData)), V.Vector (V.Vector CharInfo)) +mergeDataBlocks inGraphList curDataList curInfoList = + if null inGraphList + then (V.fromList $ fmap (V.fromList . reverse) curDataList, V.fromList $ reverse curInfoList) + else + let firstGraph = head inGraphList + firstTree = thd6 firstGraph + firstCharInfo = V.head $ six6 firstGraph + leafList = snd4 $ LG.splitVertexList firstTree + + -- since each graph has a single block--take head to get vector of characters + leafCharacterList = V.toList $ fmap (V.head . vertData . snd) (V.fromList leafList) + + -- zip data for each taxon + newDataList = + if null curDataList + then (: []) <$> leafCharacterList + else zipWith (:) leafCharacterList curDataList + in mergeDataBlocks (tail inGraphList) newDataList (firstCharInfo : curInfoList) + + +{- | getTaxonCharString returns the total character string for a taxon +length and zipping for missing data +-} +getTaxonCharString ∷ V.Vector (V.Vector CharInfo) → VertexBlockData → PhyG String +getTaxonCharString charInfoVV charDataVV = + -- False for not use IA field + let lengthBlock = maximum $ V.zipWith (U.getCharacterLength False) (V.head charDataVV) (V.head charInfoVV) + -- parallel stuff + action ∷ (V.Vector CharInfo, V.Vector CharacterData) → PhyG String + action = getBlockStringPair lengthBlock + in getParallelChunkTraverse >>= \pTraverse → + fmap fold . pTraverse action . zip (V.toList charInfoVV) $ V.toList charDataVV + + +-- concat (zipWith (getBlockString lengthBlock) (V.toList charInfoVV) (V.toList charDataVV) `using` PU.myParListChunkRDS) + +-- | getTaxonCharStringListPair wrapper for getTaxonCharStringList with differnet parameter passing +getTaxonCharStringListPair ∷ V.Vector (V.Vector CharInfo) → (VertexBlockData, String) → PhyG [String] +getTaxonCharStringListPair charInfoVV (charDataVV, leafName) = getTaxonCharStringList charInfoVV charDataVV leafName + + +{- | getTaxonCharStringList returns the total character string list (over blocks) for a taxon +length and zipping for missing data +-} +getTaxonCharStringList ∷ V.Vector (V.Vector CharInfo) → VertexBlockData → String → PhyG [String] +getTaxonCharStringList charInfoVV charDataVV leafName = + let -- False for not use IA field + lengthBlock = maximum $ V.zipWith (U.getCharacterLength False) (V.head charDataVV) (V.head charInfoVV) + -- parallel stuff + action ∷ (V.Vector CharInfo, V.Vector CharacterData) → PhyG String + action = getBlockStringPair lengthBlock + prefix = (fmap (leafName <>)) + in getParallelChunkTraverse >>= \pTraverse → + fmap prefix . pTraverse action . zip (V.toList charInfoVV) $ V.toList charDataVV + + +-- fmap (leafName <>) $ (zipWith (getBlockString lengthBlock) (V.toList charInfoVV) (V.toList charDataVV) `using` PU.myParListChunkRDS) + +-- | getBlockStringPair wrapper around getBlockString +getBlockStringPair ∷ Int → (V.Vector CharInfo, V.Vector CharacterData) → PhyG String +getBlockStringPair lengthBlock (charInfoV, charDataV) = getBlockString lengthBlock charInfoV charDataV + + +{- | getBlockString returns the String for a character block +returns all '?' if missing +-} +getBlockString ∷ Int → V.Vector CharInfo → V.Vector CharacterData → PhyG String +getBlockString lengthBlock charInfoV charDataV = + -- this to deal with missing characters + -- trace ("GBS: " <> (show $ V.length charDataV)) ( + if V.null charDataV + then pure $ L.replicate lengthBlock '?' + else -- parallel stuff + + let action ∷ (CharacterData, CharInfo) → String + action = getCharacterStringPair + in do + actionPar ← getParallelChunkMap + let result = actionPar action (zip (V.toList charDataV) (V.toList charInfoV)) + pure $ concat result + + +-- concat (zipWith getCharacterString (V.toList charDataV) (V.toList charInfoV) `using` PU.myParListChunkRDS) +-- ) + +-- | mergeCharInfoCharLength merges cc code char info and char lengths for scope +mergeCharInfoCharLength ∷ [(String, String, String)] → [Int] → Int → String +mergeCharInfoCharLength codeList lengthList charIndex = + if null codeList + then [] + else + let (ccCodeString, costsString, weightString) = head codeList + charLength = head lengthList + startScope = show charIndex + endScope = show (charIndex + charLength - 1) + scope = startScope <> "." <> endScope + weightString' = + if null weightString + then [] + else "cc /" <> weightString <> scope <> ";\n" + costsString' = + if null costsString + then [] + else "costs " <> scope <> " = " <> costsString <> ";\n" + ccCodeString' = "cc " <> ccCodeString <> " " <> scope <> ";\n" + in (ccCodeString' <> weightString' <> costsString') + <> mergeCharInfoCharLength (tail codeList) (tail lengthList) (charIndex + charLength) + + +{- | getCharacterInfo takes charInfo vect vect and reiurns triples of ccCode, costs, and weight values +for each character +-} +getCharacterInfo ∷ V.Vector (V.Vector CharInfo) → [(String, String, String)] +getCharacterInfo inCharInfoVV = + concat $ V.toList $ fmap getBlockInfo inCharInfoVV + + +-- | getBlockInfo gets character code info for a block +getBlockInfo ∷ V.Vector CharInfo → [(String, String, String)] +getBlockInfo inCharInfoV = V.toList $ fmap getCharCodeInfo inCharInfoV + + +{- | getCharCodeInfo extracts 3-tuple of cc code, costs and weight as strings +from charInfo +-} +getCharCodeInfo ∷ CharInfo → (String, String, String) +getCharCodeInfo inCharInfo = + let inCharType = charType inCharInfo + charWeightString = + if weight inCharInfo == 1 + then "" + else show $ weight inCharInfo + inAlph = alphabet inCharInfo + inMatrix = costMatrix inCharInfo + (costMatrixType, _) = IR.getRecodingType inMatrix + matrixString = + if costMatrixType == "nonAdd" + then "" + else makeMatrixString inAlph inMatrix + in let codeTriple = case inCharType of + x | x == Add → ("+", "", charWeightString) + x | x == NonAdd → ("-", "", charWeightString) + x | x `elem` packedNonAddTypes → ("-", "", charWeightString) + x | x == Matrix → ("(", matrixString, charWeightString) + x + | x `elem` sequenceCharacterTypes → + if costMatrixType == "nonAdd" + then ("-", "", charWeightString) + else ("(", matrixString, charWeightString) + _ → error ("Un-implemented data type " <> show inCharType) + in codeTriple + + +{- | makeMatrixString takes alphabet and input cost matrix and creates TNT +matrix cost line +could be lesser but might not be symmetrical +-} +makeMatrixString ∷ Alphabet ST.ShortText → S.Matrix Int → String +makeMatrixString inAlphabet inMatrix = + let elementList = (ST.toString <$> toList inAlphabet) + + -- get element pairs (Strings) + elementPairList = filter notDiag $ getListPairs elementList + + -- index pairs for accessing matrix + elementIndexPairList = filter notDiag $ getListPairs [0 .. (length elementList - 1)] + elementPairCosts = fmap (inMatrix S.!) elementIndexPairList + + -- make strings of form state_i / state_j cost ... + costString = makeCostString elementPairList elementPairCosts + in costString + where + notDiag ∷ ∀ {a}. (Ord a) ⇒ (a, a) → Bool + notDiag (a, b) = a < b + + +-- | makeCostString takes list of state pairs and list of costs and creates tnt cost string +makeCostString ∷ [(String, String)] → [Int] → String +makeCostString namePairList costList = + if null namePairList + then [] + else + let (a, b) = head namePairList + c = head costList + in (a <> "/" <> b <> " " <> show c <> " ") <> makeCostString (tail namePairList) (tail costList) + + +-- | getBlockLength returns a list of the lengths of all characters in a blocks +getBlockLength ∷ V.Vector CharacterData → V.Vector CharInfo → [Int] +getBlockLength inCharDataV inCharInfoV = + -- trace ("GBL:" <> (show $ V.zipWith U.getCharacterLength inCharDataV inCharInfoV)) + -- False so not use IA field + V.toList $ V.zipWith (U.getCharacterLength False) inCharDataV inCharInfoV + + +-- | getBlockNames returns a list of the lengths of all characters in a blocks +getBlockNames ∷ V.Vector CharInfo → [String] +getBlockNames inCharInfoV = + -- trace ("GBL:" <> (show $ V.zipWith U.getCharacterLength inCharDataV inCharInfoV)) + V.toList $ fmap (T.unpack . name) inCharInfoV + + +-- | getCharacterStringPair is a wrapper around getCharacterString +getCharacterStringPair ∷ (CharacterData, CharInfo) → String +getCharacterStringPair (inCharData, inCharInfo) = getCharacterString inCharData inCharInfo + + +{- | getCharacterString returns a string of character states +need to add space between (large alphabets etc) +local alphabet for characters where that is input. Matrix and additive are integers +-} +getCharacterString ∷ CharacterData → CharInfo → String +getCharacterString inCharData inCharInfo = + let inCharType = charType inCharInfo + localAlphabet = + if inCharType /= NonAdd + then ST.toString <$> alphabet inCharInfo + else fmap ST.toString discreteAlphabet + -- alphSize = S.rows $ costMatrix inCharInfo + + -- this to avoid recalculations and list access issues + lANES = (fromJust $ NE.nonEmpty $ alphabetSymbols localAlphabet) + lAVect = V.fromList $ NE.toList $ lANES + + getCharState + ∷ ∀ {b} + . (Show b, Bits b) + ⇒ b + → String + getCharState a = U.bitVectToCharState localAlphabet lANES lAVect a + in let charString = case inCharType of + x + | x == NonAdd → + filter (/= ' ') $ foldMap (U.bitVectToCharStateNonAdd localAlphabet) $ snd3 $ stateBVPrelim inCharData + x | x `elem` packedNonAddTypes → UV.foldMap (U.bitVectToCharStateQual localAlphabet) $ snd3 $ packedNonAddPrelim inCharData + x | x == Add → filter (/= ' ') $ foldMap (U.additivStateToString lAVect) $ snd3 $ rangePrelim inCharData + x | x == Matrix → filter (/= ' ') $ foldMap U.matrixStateToString $ matrixStatesPrelim inCharData + x | x `elem` [SlimSeq, NucSeq] → filter (/= ' ') $ SV.foldMap getCharState $ snd3 $ slimAlignment inCharData + x | x `elem` [WideSeq, AminoSeq] → filter (/= ' ') $ UV.foldMap getCharState $ snd3 $ wideAlignment inCharData + x | x == HugeSeq → foldMap getCharState $ snd3 $ hugeAlignment inCharData + x | x == AlignedSlim → filter (/= ' ') $ SV.foldMap getCharState $ snd3 $ alignedSlimPrelim inCharData + x | x == AlignedWide → filter (/= ' ') $ UV.foldMap getCharState $ snd3 $ alignedWidePrelim inCharData + x | x == AlignedHuge → foldMap getCharState $ snd3 $ alignedHugePrelim inCharData + _ → error ("Un-implemented data type " <> show inCharType) + in if not (isAllGaps charString) + then charString + else fmap replaceDashWithQuest charString + where + replaceDashWithQuest s = + if s == '-' + then '?' + else s + +-- | Implied Alignment report functions + +{- | getImpliedAlignmentString returns as a single String the implied alignments of all sequence characters +softwired use display trees, hardWired transform to softwired then proceed with display trees +-} +getImpliedAlignmentString + ∷ GlobalSettings + → Bool + → Bool + → ProcessedData + → (ReducedPhylogeneticGraph, Int) + → PhyG String +getImpliedAlignmentString inGS includeMissing concatSeqs inData (inReducedGraph, graphNumber) = + if LG.isEmpty (fst5 inReducedGraph) + then error "No graphs for create IAs for in getImpliedAlignmentStrings" + else + let headerString = "Implied Alignments for Graph " <> show graphNumber <> "\n" + inGraph = GO.convertReduced2PhylogeneticGraph inReducedGraph + + -- parallel stuff + reoptimize ∷ (ProcessedData, SimpleGraph) → PhyG PhylogeneticGraph + reoptimize = TRAV.multiTraverseFullyLabelGraphPair (inGS{graphType = Tree}) False False Nothing + + getIAAction ∷ PhylogeneticGraph → PhyG String + getIAAction = getTreeIAString includeMissing + in if graphType inGS == Tree + then do + resultIA ← getTreeIAString includeMissing inGraph + if not concatSeqs + then do + pure $ headerString <> resultIA + else do + pure $ headerString <> U.concatFastas resultIA -- (getTreeIAString includeMissing inGraph) + else -- for softwired networks--use display trees + + if graphType inGS == SoftWired + then -- get display trees for each data block-- takes first of potentially multiple + + let blockDisplayList = fmap (GO.contractIn1Out1EdgesRename . GO.convertDecoratedToSimpleGraph . head) (fth6 inGraph) + + -- create seprate processed data for each block + blockProcessedDataList = fmap (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) + in do + decoratedBlockTreeList' ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse reoptimize . zip (V.toList blockProcessedDataList) $ V.toList blockDisplayList + -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to create IAs + let decoratedBlockTreeList = V.fromList decoratedBlockTreeList' + + -- extract IA strings as if multiple graphs + diplayIAStringList ← mapM (getTreeIAString includeMissing) (V.toList decoratedBlockTreeList) + + if not concatSeqs + then do + pure $ headerString <> concat diplayIAStringList + else do + pure $ headerString <> U.concatFastas (concat diplayIAStringList) + else -- There is no IA for Hardwired at least as of yet + -- so convert to softwired and use display trees + + if graphType inGS == HardWired + then + let newGS = inGS{graphType = SoftWired} + + pruneEdges = False + warnPruneEdges = False + + startVertex ∷ ∀ {a}. Maybe a + startVertex = Nothing + in do + -- create seprate processed data for each block + newGraph ← TRAV.multiTraverseFullyLabelGraph newGS inData pruneEdges warnPruneEdges startVertex (fst6 inGraph) + + let blockDisplayList = fmap (GO.convertDecoratedToSimpleGraph . head) (fth6 newGraph) + + let blockProcessedDataList = fmap (makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) + + decoratedBlockTreeList' ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse reoptimize . zip (V.toList blockProcessedDataList) $ V.toList blockDisplayList + -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to creeate IAs + let decoratedBlockTreeList = V.fromList decoratedBlockTreeList' + + -- extract IA strings as if mutiple graphs + diplayIAStringList ← + getParallelChunkTraverse >>= \pTraverse → + getIAAction `pTraverse` V.toList decoratedBlockTreeList + + logWith + LogWarn + "There is no implied alignment for hard-wired graphs--at least not yet. Transfroming to softwired and generate an implied alignment that way\n" + if not concatSeqs + then do + pure $ headerString <> concat diplayIAStringList + else do + pure $ headerString <> U.concatFastas (concat diplayIAStringList) + else do + logWith LogWarn ("IA not yet implemented for graphtype " <> show (graphType inGS) <> "\n") + pure $ "There is no implied alignment for " <> show (graphType inGS) + + +-- | getTreeIAString takes a Tree Decorated Graph and returns Implied AlignmentString +getTreeIAString ∷ Bool → PhylogeneticGraph → PhyG String +getTreeIAString includeMissing inGraph = + let leafList = snd4 $ LG.splitVertexList (thd6 inGraph) + leafNameList = fmap (vertName . snd) leafList + leafDataList = V.fromList $ fmap (vertData . snd) leafList + charInfoVV = six6 inGraph + in do + characterStringList ← makeFullIAStrings includeMissing charInfoVV leafNameList leafDataList + + pure $ concat characterStringList + + +-- | makeBlockData cretes new single block processed data +makeBlockData ∷ V.Vector NameText → V.Vector NameBV → BlockData → ProcessedData +makeBlockData a b c = (a, b, V.singleton c) + + +-- | makeFullIAStrings goes block by block, creating fasta strings for each +makeFullIAStrings ∷ Bool → V.Vector (V.Vector CharInfo) → [NameText] → V.Vector VertexBlockData → PhyG [String] +makeFullIAStrings includeMissing charInfoVV leafNameList leafDataList = + let numBlocks = V.length charInfoVV + -- parallel stuff + action ∷ Int → PhyG [String] + action = makeBlockIAStrings includeMissing leafNameList leafDataList charInfoVV + in getParallelChunkTraverse >>= \pTraverse → + fold <$> pTraverse action [0 .. numBlocks - 1] + + +-- | makeBlockIAStrings extracts data for a block (via index) and calls function to make iaStrings for each character +makeBlockIAStrings + ∷ Bool → [NameText] → V.Vector (V.Vector (V.Vector CharacterData)) → V.Vector (V.Vector CharInfo) → Int → PhyG [String] +makeBlockIAStrings includeMissing leafNameList leafDataList charInfoVV blockIndex = + let thisBlockCharInfo = V.toList $ charInfoVV V.! blockIndex + numChars = length thisBlockCharInfo + thisBlockCharData = fmap (V.! blockIndex) leafDataList + + -- parallel stuff + action ∷ (CharInfo, Int) → String + action = makeBlockCharacterStringPair includeMissing leafNameList thisBlockCharData + in do + actionPar ← getParallelChunkMap + let blockCharacterStringList = actionPar action (zip thisBlockCharInfo [0 .. (numChars - 1)]) + pure blockCharacterStringList + + +-- blockCharacterStringList = zipWith (makeBlockCharacterStringPair includeMissing leafNameList thisBlockCharData) thisBlockCharInfo [0 .. (numChars - 1)] `using` PU.myParListChunkRDS + +-- | isAllGaps checks whether a sequence is all gap charcaters '-' +isAllGaps ∷ String → Bool +isAllGaps inSeq + | null inSeq = True + | length (filter (`notElem` [' ', '-', '\n']) inSeq) == 0 = True + | otherwise = False + + +-- | makeBlockCharacterStringPair is a wrapper for makeBlockCharacterString +makeBlockCharacterStringPair ∷ Bool → [NameText] → V.Vector (V.Vector CharacterData) → (CharInfo, Int) → String +makeBlockCharacterStringPair includeMissing leafNameList leafDataVV (thisCharInfo, charIndex) = + makeBlockCharacterString includeMissing leafNameList leafDataVV thisCharInfo charIndex + + +-- | makeBlockCharacterString creates implied alignmennt string for sequence charactes and null if not +makeBlockCharacterString ∷ Bool → [NameText] → V.Vector (V.Vector CharacterData) → CharInfo → Int → String +makeBlockCharacterString includeMissing leafNameList leafDataVV thisCharInfo charIndex = + -- check if sequence type character + let thisCharType = charType thisCharInfo + thisCharName = name thisCharInfo + in if thisCharType `notElem` sequenceCharacterTypes + then [] + else + let -- thisCharData = fmap (V.! charIndex) leafDataVV + thisCharData = getTaxDataOrMissing leafDataVV charIndex 0 [] + nameDataPairList = zip leafNameList thisCharData + fastaString = pairList2Fasta includeMissing thisCharInfo nameDataPairList + in -- trace ("MBCS: " <> (show $ length leafNameList) <> " " <> (show $ V.length thisCharData) <> "\n" <> (show leafDataVV)) + "\nSequence character " <> T.unpack thisCharName <> "\n" <> fastaString <> "\n" + + +{- +-- | getCharacterDataOrMissing takes a vector of vector of charcter data and returns list +-- of taxa for a given sequnce character. If there are no data for that character for a taxon +getCharacterDataOrMissing :: V.Vector (V.Vector CharacterData) -> Int -> [[CharacterData]] -> [[CharacterData]] +getCharacterDataOrMissing leafDataVV charIndex newCharList = + if charIndex == V.length leafDataVV then reverse newCharList + else + let firstCharData = getTaxDataOrMissing leafDataVV charIndex 0 [] + in + getCharacterDataOrMissing leafDataVV (charIndex + 1) (firstCharData : newCharList) +-} + +-- | getTaxDataOrMissing gets the index character if data not null, empty character if not +getTaxDataOrMissing ∷ V.Vector (V.Vector CharacterData) → Int → Int → [CharacterData] → [CharacterData] +getTaxDataOrMissing charDataV charIndex taxonIndex newTaxList + | taxonIndex == V.length charDataV = reverse newTaxList + | V.null (charDataV V.! taxonIndex) = getTaxDataOrMissing charDataV charIndex (taxonIndex + 1) (emptyCharacter : newTaxList) + | otherwise = getTaxDataOrMissing charDataV charIndex (taxonIndex + 1) (((charDataV V.! taxonIndex) V.! charIndex) : newTaxList) + + +{- | pairList2Fasta takes a character type and list of pairs of taxon names (as T.Text) +and character data and returns fasta formated string +-} +pairList2Fasta ∷ Bool → CharInfo → [(NameText, CharacterData)] → String +pairList2Fasta includeMissing inCharInfo nameDataPairList = + if null nameDataPairList + then [] + else + let (firstName, blockDatum) = head nameDataPairList + inCharType = charType inCharInfo + + -- this to avoid recalculations and list access issues + localAlphabet = (ST.toString <$> alphabet inCharInfo) + lANES = (fromJust $ NE.nonEmpty $ alphabetSymbols localAlphabet) + lAVect = V.fromList $ NE.toList $ lANES + + getCharState + ∷ ∀ {b} + . (Show b, Bits b) + ⇒ b + → String + getCharState a = U.bitVectToCharState localAlphabet lANES lAVect a + + sequenceString = case inCharType of + -- x | x `elem` [SlimSeq, NucSeq ] -> SV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ slimAlignment blockDatum + x | x == SlimSeq → SV.foldMap getCharState $ snd3 $ slimAlignment blockDatum + x | x == NucSeq → SV.foldMap getCharState $ snd3 $ slimAlignment blockDatum + -- x | x `elem` [WideSeq, AminoSeq] -> UV.foldMap (U.bitVectToCharState localAlphabet) $ snd3 $ wideAlignment blockDatum + x | x == WideSeq → UV.foldMap getCharState $ snd3 $ wideAlignment blockDatum + x | x == AminoSeq → UV.foldMap getCharState $ snd3 $ wideAlignment blockDatum + x | x == HugeSeq → foldMap getCharState $ snd3 $ hugeAlignment blockDatum + x | x == AlignedSlim → SV.foldMap getCharState $ snd3 $ alignedSlimPrelim blockDatum + x | x == AlignedWide → UV.foldMap getCharState $ snd3 $ alignedWidePrelim blockDatum + x | x == AlignedHuge → foldMap getCharState $ snd3 $ alignedHugePrelim blockDatum + _ → error ("Un-implemented data type " <> show inCharType) + + -- If all gaps then change to all question marks since missing + sequenceString' = + if isAllGaps sequenceString + then fmap replaceDashWithQuest sequenceString + else sequenceString + + -- Make lines 50 chars long + sequenceChunks = ((<> "\n") <$> LS.chunksOf 50 sequenceString') + in if ((not includeMissing) && (isAllGaps sequenceString)) || (blockDatum == emptyCharacter) + then pairList2Fasta includeMissing inCharInfo (tail nameDataPairList) + else + (concat $ (('>' : (T.unpack firstName)) <> "\n") : sequenceChunks) + <> (pairList2Fasta includeMissing inCharInfo (tail nameDataPairList)) + where + replaceDashWithQuest s = + if s == '-' + then '?' + else s diff --git a/src/Commands/ProcessCommands.hs b/src/Commands/ProcessCommands.hs new file mode 100644 index 000000000..0e5e5b9cb --- /dev/null +++ b/src/Commands/ProcessCommands.hs @@ -0,0 +1,381 @@ +{- To add commands: + 1) add new command in Types.hs + 2) add string name of command to allowedCommandList + 3) add instruction processing to getInstruction + 4) add argument processing to parseCommandArg + 5) add argument processing function + with meaningful errors + 6) Add amchinery of command in general code +-} + +{- | +Module to process commands. +-} +module Commands.ProcessCommands ( + expandRunCommands, + getCommandList, + movePrealignedTCM, + moveSetOutgroupFirst, + preprocessOptimalityCriteriaScripts, +) where + +import Commands.Verify qualified as V +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Char +import Data.Foldable +import Data.List qualified as L +import Data.List.Split +import Data.Maybe +import GeneralUtilities +import Input.ReadInputFiles qualified as RIF +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import Types.Types +--import Debug.Trace + + +{- | preprocessOptimalityCriteriaScripts takes a processed command list and +processes for optimlity criteria that change tcms and such for +Parsimony, PMDL, SI, MAPA, and NCM +-} +preprocessOptimalityCriteriaScripts ∷ [Command] → [Command] +preprocessOptimalityCriteriaScripts inCommandList = inCommandList + + +{- | expandRunCommands takes raw coomands and if a "run" command is found it reads that file +and adds those commands in place +ensures one command per line +-} +expandRunCommands ∷ [String] → [String] → PhyG [String] +expandRunCommands curLines inLines = + -- trace ("EXP " <> (show curLines) <> show inLines) ( + if null inLines + then return $ reverse curLines + else do + let firstLineRead = removeComments [filter (/= ' ') $ head inLines] + (firstLine, restLine) ← + if null firstLineRead + then return ([], []) + else splitCommandLine $ head firstLineRead + + let leftParens = length $ filter (== '(') firstLine + let rightParens = length $ filter (== ')') firstLine + + -- trace ("FL " <> firstLine) ( + -- only deal with run lines + if leftParens /= rightParens + then do failWithPhase Parsing ("Command line with unbalances parens '()': " <> firstLine <> "\n") + else + if null firstLine + then expandRunCommands curLines (tail inLines) + else + if take 3 (fmap toLower firstLine) /= "run" + then expandRunCommands (firstLine : curLines) (restLine : tail inLines) + else do + -- is a "run command" + parsedFirst ← parseCommand firstLine + let (_, runFileList) = head parsedFirst + runFileNames ← mapM (checkFileNames . snd) runFileList + fileListContents ← liftIO $ mapM readFile runFileNames + let newLines = concatMap lines fileListContents + expandRunCommands (newLines <> curLines) (restLine : tail inLines) + + +-- ) +-- ) + +{- | splitCommandLine takes a line with potentially multiple commands and splits +between the first command and all others. +-} +splitCommandLine ∷ String → PhyG (String, String) +splitCommandLine inLine = + if null inLine + then return ([], []) + else + let leftParens = length $ filter (== '(') inLine + rightParens = length $ filter (== ')') inLine + firstPart = takeWhile (/= '(') inLine + parenPart = getBalancedParenPart "" (dropWhile (/= '(') inLine) 0 0 + firstCommand = firstPart <> parenPart + restPart = drop (length firstCommand) inLine + in if leftParens /= rightParens + then do failWithPhase Parsing ("Command line with unbalances parens '()': " <> inLine <> "\n") + else return (firstCommand, restPart) + + +-- | checkFileNames checks if first and last element of String are double quotes and removes them +checkFileNames ∷ String → PhyG String +checkFileNames inName + | null inName = do failWithPhase Parsing "Error: Null file name" + | head inName /= '"' = do failWithPhase Parsing ("Error: File name must be in double quotes (b): " <> inName <> "\n") + | last inName /= '"' = do failWithPhase Parsing ("Error: File name must be in double quotes (e): " <> inName <> "\n") + | otherwise = return $ init $ tail inName + + +{- | getCommandList takes a String from a file and returns a list of commands and their arguments +these are syntactically verified, but any input files are not checked +commands in lines one to a line +-} +getCommandList ∷ [String] → PhyG [Command] +getCommandList rawContents = + if null rawContents + then do + -- errorWithoutStackTrace "Error: Empty command file" + failWithPhase Parsing "Empty command file" + else do + let rawList = removeComments $ fmap (filter (/= ' ')) rawContents + -- expand for read wildcards here--cretge a new, potentially longer list + parsedRaw ← mapM parseCommand rawList + let processedCommands = concat parsedRaw + + let reportCommands = filter ((== Report) . fst) processedCommands + + let reportGraphsArgs = filter (== "graphs") $ fmap (fmap toLower) $ fmap fst $ concatMap snd reportCommands + let reportNewickArgs = filter (== "newick") $ fmap (fmap toLower) $ fmap fst $ concatMap snd reportCommands + let reportDotArgs = filter (== "dot") $ fmap (fmap toLower) $ fmap fst $ concatMap snd reportCommands + + if null reportCommands || null (reportGraphsArgs <> reportNewickArgs <> reportDotArgs) + then -- trace ("Warning: No reporting of resulting graphs is specified. Adding default report graph file 'defaultGraph.dot'") $ + + let addedReport = (Report, [("graphs", []), ([], "_defaultGraph.dot_"), ("dotpdf", [])]) + in do + logWith LogWarn "Warning: No reporting of resulting graphs is specified. Adding default report graph file 'defaultGraph.dot'\n" + return (processedCommands <> [addedReport]) + else -- trace (show rawList) + do + return processedCommands + + +{- | removeComments deletes anyhting on line (including line) +after double dash "--" +-} +removeComments ∷ [String] → [String] +removeComments = \case + [] → [] + firstLine : otherLines → case firstLine of + [] → removeComments otherLines + '-' : '-' : _ → removeComments otherLines + _ → + let nonComment = case splitOn "--" firstLine of + [] → firstLine + prefix : _ → filter isPrint prefix + in nonComment : removeComments otherLines + + +{- | getInstruction returns the command type from an input String +all operations on lower case +-} +getInstruction ∷ String → [String] → PhyG Instruction +getInstruction inString possibleCommands + | null inString = do failWithPhase Parsing "Empty command String" + | fmap toLower inString == "build" = return Build + | fmap toLower inString == "fuse" = return Fuse + | fmap toLower inString == "read" = return Read + | fmap toLower inString == "reblock" = return Reblock + | fmap toLower inString == "refine" = return Refine + | fmap toLower inString == "rename" = return Rename + | fmap toLower inString == "report" = return Report + | fmap toLower inString == "run" = return Run + | fmap toLower inString == "search" = return Search + | fmap toLower inString == "select" = return Select + | fmap toLower inString == "set" = return Set + | fmap toLower inString == "support" = return Support + | fmap toLower inString == "swap" = return Swap + | fmap toLower inString == "transform" = return Transform + | otherwise = + let errorMatch = snd $ getBestMatch (maxBound ∷ Int, "no suggestion") possibleCommands inString + in do + failWithPhase Parsing $ + fold + ["Error: Unrecognized command. By \'", inString, "\' did you mean \'", errorMatch, "\'?\n"] + + +{- | parseCommand takes a command file line and processes the String into a command and its arguemnts +assumes single command per line +-} +parseCommand ∷ String → PhyG [Command] +parseCommand = \case + [] → pure [] + inLine → + let (firstString, restString) = getSubCommand inLine False + instructionString = takeWhile (/= '(') firstString -- inLine + in do + -- this does not allow recursive multi-option arguments + -- NEED TO FIX + -- make in to a more sophisticated split outside of parens + argList ← argumentSplitter inLine $ init $ tail $ dropWhile (/= '(') $ filter (/= ' ') firstString + + localInstruction ← getInstruction instructionString V.allowedCommandList + processedArg ← parseCommandArg firstString localInstruction argList + parsedRest ← parseCommand restString + + pure $ (localInstruction, processedArg) : parsedRest + + +{- | getSubCommand takes a string and extracts the first occurrence of the +structure bleh(...), and splits the string on that, the sub command can contain +parens and commas +-} +getSubCommand ∷ String → Bool → (String, String) +getSubCommand inString hasComma = + if null inString + then ([], []) + else + let firstPart = takeWhile (/= '(') inString + secondPart = dropWhile (/= '(') inString + parenPart = getBalancedParenPart "" secondPart 0 0 + incrCounter = if hasComma then 1 else 0 + remainderPart = drop (length (firstPart <> parenPart) + incrCounter) inString -- to remove ',' + in (firstPart <> parenPart, remainderPart) + + +{- | getBalancedParenPart stakes a string starting with '(' and takes all +characters until (and including) the balancing ')' +call with getBalancedParenPart "" inString 0 0 +-} +getBalancedParenPart ∷ String → String → Int → Int → String +getBalancedParenPart curString inString countLeft countRight = + if null inString + then reverse curString + else + let firstChar = head inString + in if firstChar == '(' + then getBalancedParenPart (firstChar : curString) (tail inString) (countLeft + 1) countRight + else + if firstChar == ')' + then + if countLeft == countRight + 1 + then reverse (firstChar : curString) + else getBalancedParenPart (firstChar : curString) (tail inString) countLeft (countRight + 1) + else getBalancedParenPart (firstChar : curString) (tail inString) countLeft countRight + + +{- | argumentSplitter takes argument string and returns individual strings of arguments +which can include null, single, multiple or sub-command arguments +these are each pairs of an option string (could be null) and a subarguments String (also could be null) +-} +argumentSplitter ∷ String → String → PhyG [(String, String)] +argumentSplitter commandLineString inString + | null inString = return [] + | otherwise = + let commaIndex = fromMaybe (maxBound ∷ Int) (L.elemIndex ',' inString) + semiIndex = fromMaybe (maxBound ∷ Int) (L.elemIndex ':' inString) + leftParenIndex = fromMaybe (maxBound ∷ Int) (L.elemIndex '(' inString) + firstDivider = minimum [commaIndex, semiIndex, leftParenIndex] + in do + checkErrors ← freeOfSimpleErrors inString + if not checkErrors + then do failWithPhase Parsing "Error in command specification format\n" + else -- simple no argument arg + + if firstDivider == (maxBound ∷ Int) + then + if head inString == '"' + then return [([], inString)] + else return [(inString, [])] + else + if commaIndex == firstDivider + then -- no arg + + if null (take firstDivider inString) + then do failWithPhase Parsing ("Error in command '" <> commandLineString <> "' perhaps due to extraneous commas (',')\n") + else + if head (take firstDivider inString) == '"' + then do + restPart ← argumentSplitter commandLineString (drop (firstDivider + 1) inString) + return $ ([], take firstDivider inString) : restPart + else do + restPart ← argumentSplitter commandLineString (drop (firstDivider + 1) inString) + return $ (take firstDivider inString, []) : restPart + else + if semiIndex == firstDivider + then -- has arg after ':' + + if inString !! (semiIndex + 1) == '(' + then do + restPart ← argumentSplitter commandLineString (drop 2 $ dropWhile (/= ')') inString) + return $ (take firstDivider inString, takeWhile (/= ')') (drop (firstDivider + 1) inString) <> ")") : restPart + else + let nextStuff = dropWhile (/= ',') inString + remainder = if null nextStuff then [] else tail nextStuff + in do + restPart ← argumentSplitter commandLineString remainder + return $ (take firstDivider inString, takeWhile (/= ',') (drop (firstDivider + 1) inString)) : restPart + else -- arg is sub-commnd + + let (subCommand, remainderString) = getSubCommand inString True + in do + restPart ← argumentSplitter commandLineString remainderString + return $ (subCommand, []) : restPart + + +{- | freeOfSimpleErrors take command string and checks for simple for atting errors +lack of separators ',' between args +add as new errors are found +-} +freeOfSimpleErrors ∷ String → PhyG Bool +freeOfSimpleErrors commandString + | null commandString = errorWithoutStackTrace "\n\nError in command string--empty" + | isSequentialSubsequence ['"', '"'] commandString = do + failWithPhase + Parsing + ( "\n\nCommand format error: " <> commandString <> "\n\tPossibly missing comma ',' between arguments or extra double quotes'\"'." + ) + | isSequentialSubsequence [',', ')'] commandString = do + failWithPhase + Parsing + ("\n\nCommand format error: " <> commandString <> "\n\tPossibly terminal comma ',' after or within arguments.") + | isSequentialSubsequence [',', '('] commandString = do failWithPhase Parsing ("\n\nCommand format error: " <> commandString <> "\n\tPossibly comma ',' before '('.") + | isSequentialSubsequence ['(', ','] commandString = do failWithPhase Parsing ("\n\nCommand format error: " <> commandString <> "\n\tPossibly starting comma ',' before arguments.") + | otherwise = + let beforeDoubleQuotes = dropWhile (/= '"') commandString + in if null beforeDoubleQuotes + then do return True + else -- likely more conditions to develop + do return True + + +{- | parseCommandArg takes an Instruction and arg list of Strings and returns list +of parsed srguments for that instruction +-} +parseCommandArg ∷ String → Instruction → [(String, String)] → PhyG [Argument] +parseCommandArg fullCommand localInstruction argList + | localInstruction == Read = + if not $ null argList + then RIF.getReadArgs fullCommand argList + else + failWithPhase + Parsing + ("\n\n'Read' command error '" <> fullCommand <> "': Need to specify at least one filename in double quotes") + | otherwise = return argList + + +-- | movePrealignedTCM move prealigned and tcm commands to front of argument list +movePrealignedTCM ∷ [Argument] → PhyG [Argument] +movePrealignedTCM inArgList = + if null inArgList + then return [] + else + let firstPart = filter ((== "prealigned") . fst) inArgList + secondPart = filter ((== "tcm") . fst) inArgList + restPart = filter ((/= "tcm") . fst) $ filter ((/= "prealigned") . fst) inArgList + in if length secondPart > 1 + then failWithPhase Parsing ("\n\n'Read' command error '" <> show inArgList <> "': can only specify a single tcm file") + else return $ firstPart <> secondPart <> restPart + + +{- | moveSetOutgroupFirst reofers set command order such that set outgroup if first of set commands + This seems to be requirted due some stricness issues with processing data +-} +moveSetOutgroupFirst :: [Command] → [Command] +moveSetOutgroupFirst inCommandList = + if null inCommandList then inCommandList + else + let setList = filter ((== Set) . fst) inCommandList + outGroupCommandList = filter ((== "outgroup") . fst) $ concat $ fmap snd setList + newComandList = (Set, outGroupCommandList) : (filter (/= (Set, outGroupCommandList)) inCommandList) + in + if null setList then inCommandList + else if null outGroupCommandList then inCommandList + else newComandList \ No newline at end of file diff --git a/src/Commands/Transform.hs b/src/Commands/Transform.hs new file mode 100644 index 000000000..692c03a3c --- /dev/null +++ b/src/Commands/Transform.hs @@ -0,0 +1,900 @@ +{-# OPTIONS_GHC -Wno-missed-specialisations #-} + +{- | +Module to coordinate transform command execution. +-} +module Commands.Transform ( + transform, + makeStaticApprox, +) where + +import Bio.DynamicCharacter (HugeDynamicCharacter) +import Bio.DynamicCharacter.Element (HugeState, SlimState, WideState, fromBits, toUnsignedNumber) +import Commands.CommandUtilities qualified as CU +import Commands.Verify qualified as VER +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Parallel.Strategies +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Char +import Data.Char qualified as C +import Data.Foldable (fold) +import Data.Functor ((<&>)) +import Data.List qualified as L +import Data.Maybe +import Data.Text.Lazy qualified as TL +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import GeneralUtilities +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import Input.BitPack qualified as BP +import Input.Reorganize qualified as R +import Numeric.Natural (Natural) +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Text.Read +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +-- import Debug.Trace + +{- | transform changes aspects of data sande settings during execution +as opposed to Set with all happens at begginign of program execution +-} +transform + ∷ [Argument] + → GlobalSettings + → ProcessedData + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG (GlobalSettings, ProcessedData, ProcessedData, [ReducedPhylogeneticGraph]) +transform inArgs inGS origData inData inGraphList = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "transform" fstArgList VER.transformArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'transform': " <> show inArgs) + else + let displayBlock = filter ((== "displaytrees") . fst) lcArgList + numDisplayTrees + | length displayBlock > 1 = + errorWithoutStackTrace ("Multiple displayTree number specifications in transform--can have only one: " <> show inArgs) + | null displayBlock = Just 10 + | null (snd $ head displayBlock) = Just 10 + | otherwise = readMaybe (snd $ head displayBlock) ∷ Maybe Int + + toTree = any ((== "totree") . fst) lcArgList + toSoftWired = any ((== "tosoftwired") . fst) lcArgList + toHardWired = any ((== "tohardwired") . fst) lcArgList + toStaticApprox = any ((== "staticapprox") . fst) lcArgList + toDynamic = any ((== "dynamic") . fst) lcArgList + atRandom = any ((== "atrandom") . fst) lcArgList + chooseFirst = any ((== "first") . fst) lcArgList + reWeight = any ((== "weight") . fst) lcArgList + changeEpsilon = any ((== "dynamicepsilon") . fst) lcArgList + reRoot = any ((== "outgroup") . fst) lcArgList + changeGraphsSteepest = any ((== "graphssteepest") . fst) lcArgList + changeSoftwiredMethod = any ((== "softwiredmethod") . fst) lcArgList + changeGraphFactor = any ((== "graphfactor") . fst) lcArgList + changeCompressionResolutions = any ((== "compressresolutions") . fst) lcArgList + changeMultiTraverse = any ((== "multitraverse") . fst) lcArgList + changeUnionThreshold = any ((== "jointhreshold") . fst) lcArgList + + reweightBlock = filter ((== "weight") . fst) lcArgList + weightValue + | length reweightBlock > 1 = + errorWithoutStackTrace ("Multiple weight specifications in transform--can have only one: " <> show inArgs) + | null reweightBlock = Just 1.0 + | null (snd $ head reweightBlock) = Just 1 + | otherwise = readMaybe (snd $ head reweightBlock) ∷ Maybe Double + + changeCompressionBlock = filter ((== "compressresolutions") . fst) lcArgList + compressionValue + | length changeCompressionBlock > 1 = + errorWithoutStackTrace ("Multiple compressResolutions specifications in transform--can have only one: " <> show inArgs) + | null changeCompressionBlock = Just $ fmap toLower $ show $ compressResolutions inGS + | null (snd $ head changeCompressionBlock) = Just $ fmap toLower $ show $ compressResolutions inGS + | otherwise = readMaybe (show $ snd $ head changeCompressionBlock) ∷ Maybe String + + changeEpsilonBlock = filter ((== "dynamicepsilon") . fst) lcArgList + epsilonValue + | length changeEpsilonBlock > 1 = + errorWithoutStackTrace ("Multiple dynamicEpsilon specifications in transform--can have only one: " <> show inArgs) + | null changeEpsilonBlock = Just $ dynamicEpsilon inGS + | null (snd $ head changeEpsilonBlock) = Just $ dynamicEpsilon inGS + | otherwise = readMaybe (snd $ head changeEpsilonBlock) ∷ Maybe Double + + changeGraphFactorBlock = filter ((== "graphfactor") . fst) lcArgList + newGraphFactor + | length changeGraphFactorBlock > 1 = + errorWithoutStackTrace ("Multiple graphFactor specifications in transform--can have only one: " <> show inArgs) + | null changeGraphFactorBlock = Just $ fmap toLower $ show $ graphFactor inGS + | null (snd $ head changeGraphFactorBlock) = Just $ fmap toLower $ show $ graphFactor inGS + | otherwise = readMaybe (show $ snd $ head changeGraphFactorBlock) ∷ Maybe String + + changeGraphsSteepestBlock = filter ((== "graphssteepest") . fst) lcArgList + newGraphsSteepest + | length changeGraphsSteepestBlock > 1 = + errorWithoutStackTrace ("Multiple graphsSteepest specifications in transform--can have only one: " <> show inArgs) + | null changeGraphsSteepestBlock = Just $ graphsSteepest inGS + | null (snd $ head changeGraphsSteepestBlock) = Just $ graphsSteepest inGS + | otherwise = readMaybe (snd $ head changeGraphsSteepestBlock) ∷ Maybe Int + + changeMultiTraverseBlock = filter ((== "multitraverse") . fst) lcArgList + multiTraverseValue + | length changeMultiTraverseBlock > 1 = + errorWithoutStackTrace ("Multiple multiTraverse specifications in transform--can have only one: " <> show inArgs) + | null changeMultiTraverseBlock = Just $ fmap toLower $ show $ multiTraverseCharacters inGS + | null (snd $ head changeMultiTraverseBlock) = Just $ fmap toLower $ show $ multiTraverseCharacters inGS + | otherwise = readMaybe (show $ snd $ head changeMultiTraverseBlock) ∷ Maybe String + + changeSoftwiredMethodBlock = filter ((== "softwiredmethod") . fst) lcArgList + newSoftwiredMethod + | length changeSoftwiredMethodBlock > 1 = + errorWithoutStackTrace ("Multiple softwiredMethod specifications in transform--can have only one: " <> show inArgs) + | null changeSoftwiredMethodBlock = Just $ fmap toLower $ show $ softWiredMethod inGS + | null (snd $ head changeSoftwiredMethodBlock) = Just $ fmap toLower $ show $ softWiredMethod inGS + | otherwise = readMaybe (show $ snd $ head changeSoftwiredMethodBlock) ∷ Maybe String + + reRootBlock = filter ((== "outgroup") . fst) lcArgList + outgroupValue + | length reRootBlock > 1 = + errorWithoutStackTrace ("Multiple outgroup specifications in transform--can have only one: " <> show inArgs) + | null reRootBlock = Just $ outGroupName inGS + | null (snd $ head reRootBlock) = Just $ outGroupName inGS + | otherwise = readMaybe (snd $ head reRootBlock) ∷ Maybe TL.Text + + changeUnionBlock = filter ((== "jointhreshold") . fst) lcArgList + unionValue + | length changeUnionBlock > 1 = + errorWithoutStackTrace ("Multiple joinThreshold specifications in transform--can have only one: " <> show inArgs) + | null changeUnionBlock = Just $ unionThreshold inGS + | null (snd $ head changeUnionBlock) = Just $ unionThreshold inGS + | otherwise = readMaybe (snd $ head changeUnionBlock) ∷ Maybe Double + + nameList = fmap TL.pack $ fmap (filter (/= '"')) $ fmap snd $ filter ((== "name") . fst) lcArgList + charTypeList = fmap snd $ filter ((== "type") . fst) lcArgList + in if (length $ filter (== True) [toTree, toSoftWired, toHardWired]) > 1 + then errorWithoutStackTrace ("Multiple graph transform commands--can only have one : " <> (show inArgs)) + else + if toStaticApprox && toDynamic + then errorWithoutStackTrace ("Multiple staticApprox/Dynamic transform commands--can only have one : " <> (show inArgs)) + else + if atRandom && chooseFirst + then + errorWithoutStackTrace + ("Multiple display tree choice commands in transform (first, atRandom)--can only have one : " <> (show inArgs)) + else + if (toTree || toSoftWired || toHardWired) && (toDynamic || toStaticApprox) + then + errorWithoutStackTrace + ("Multiple transform operations in transform (e.g. toTree, staticApprox)--can only have one at a time: " <> (show inArgs)) + else + let pruneEdges = False + warnPruneEdges = False + startVertex ∷ Maybe a + startVertex = Nothing + + -- setup parallel + reoptimizeAction ∷ GlobalSettings → ProcessedData → Bool → Bool → Maybe Int → SimpleGraph → PhyG ReducedPhylogeneticGraph + reoptimizeAction = T.multiTraverseFullyLabelGraphReduced + in -- transform nets to tree + if toTree + then -- already Tree return + + if (graphType inGS == Tree) + then do + pure (inGS, origData, inData, inGraphList) + else + let newGS = inGS{graphType = Tree} + + -- generate and return display trees-- displayTreNUm / graph + contractIn1Out1Nodes = True + in do + displayGraphList ← + if chooseFirst + then pure $ fmap (take (fromJust numDisplayTrees) . (LG.generateDisplayTrees) contractIn1Out1Nodes) (fmap fst5 inGraphList) + else traverse (LG.generateDisplayTreesRandom (fromJust numDisplayTrees)) $ fst5 <$> inGraphList + + -- prob not required + let displayGraphs = fmap GO.ladderizeGraph $ fmap GO.renameSimpleGraphNodes (concat displayGraphList) + -- reoptimize as Trees + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + reoptimizeAction newGS inData pruneEdges warnPruneEdges startVertex `pTraverse` displayGraphs + pure (newGS, origData, inData, newPhylogeneticGraphList) + else -- transform to softwired + + if toSoftWired + then + if (graphType inGS == SoftWired) + then do + pure (inGS, origData, inData, inGraphList) + else + let newGS = inGS{graphType = SoftWired} + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction newGS inData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + pure (newGS, origData, inData, newPhylogeneticGraphList) + else -- transform to hardwired + + if toHardWired + then + if (graphType inGS == HardWired) + then do + pure (inGS, origData, inData, inGraphList) + else + let newGS = inGS{graphType = HardWired, graphFactor = NoNetworkPenalty} + in do + logWith LogInfo ("Changing GraphFactor to NoNetworkPenalty for HardWired graphs" <> "\n") + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction newGS inData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + pure (newGS, origData, inData, newPhylogeneticGraphList) + else -- roll back to dynamic data from static approx + + if toDynamic + then do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction inGS origData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + + logWith + LogInfo + ( "Transforming data to dynamic: " + <> (show $ minimum $ fmap snd5 inGraphList) + <> " -> " + <> (show $ minimum $ fmap snd5 newPhylogeneticGraphList) + <> "\n" + ) + pure (inGS, origData, origData, newPhylogeneticGraphList) + else -- transform to static approx--using first Tree + + if toStaticApprox + then do + newData ← makeStaticApprox inGS False inData (head $ L.sortOn snd5 inGraphList) + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction inGS newData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + if null inGraphList + then do + logWith LogInfo ("No graphs to base static approximation on--skipping." <> "\n") + pure (inGS, origData, origData, inGraphList) + else do + logWith + LogInfo + ( "Transforming data to staticApprox: " + <> (show $ minimum $ fmap snd5 inGraphList) + <> " -> " + <> (show $ minimum $ fmap snd5 newPhylogeneticGraphList) + <> "\n" + ) + pure (inGS, origData, newData, newPhylogeneticGraphList) + else -- change weight values in charInfo and reoptimize + -- reweights both origData and inData so weighting doens't get undone by static approc to and from transfomrations + + if reWeight + then + let newOrigData = reWeightData (fromJust weightValue) charTypeList nameList origData + newData = reWeightData (fromJust weightValue) charTypeList nameList inData + in if isNothing weightValue + then + errorWithoutStackTrace + ("Reweight value is not specified correcty. Must be a double (e.g. 1.2): " <> (show (snd $ head reweightBlock))) + else do + logWith + LogInfo + ( "Reweighting types " + <> (show charTypeList) + <> " and/or characters " + <> (L.intercalate ", " $ fmap TL.unpack nameList) + <> " to " + <> (show $ fromJust weightValue) + <> "\n\tReoptimizing graphs" + <> "\n" + ) + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction inGS newData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (T.multiTraverseFullyLabelGraphReduced inGS newData pruneEdges warnPruneEdges startVertex) (fmap fst5 inGraphList) -- `using` PU.myParListChunkRDS + pure (inGS, newOrigData, newData, newPhylogeneticGraphList) + else -- changes the softwired optimization algorithm--this really for experimental use + + if changeCompressionResolutions + then + if isNothing compressionValue + then + errorWithoutStackTrace + ( "CompressResolutions value is not specified correcty. Must be either 'True' or 'False': " + <> (show (snd $ head changeCompressionBlock)) + ) + else + let newMethod = + if fromJust compressionValue == "true" + then True + else + if fromJust compressionValue == "false" + then False + else + errorWithoutStackTrace + ( "CompressResolutions value is not specified correcty. Must be either 'True' or 'False': " + <> (show (snd $ head changeSoftwiredMethodBlock)) + ) + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse + (reoptimizeAction (inGS{compressResolutions = newMethod}) origData pruneEdges warnPruneEdges startVertex . fst5) + inGraphList + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (T.multiTraverseFullyLabelGraphReduced (inGS {compressResolutions = newMethod}) origData pruneEdges warnPruneEdges startVertex) (fmap fst5 inGraphList) + if newMethod /= compressResolutions inGS + then do + logWith LogInfo ("Changing compressResolutions method to " <> (show newMethod) <> "\n") + pure (inGS{compressResolutions = newMethod}, origData, inData, newPhylogeneticGraphList) + else do + pure (inGS{compressResolutions = newMethod}, origData, inData, inGraphList) + else -- changes dynamicEpsilon error check factor + + if changeEpsilon + then + if isNothing epsilonValue + then + errorWithoutStackTrace + ("DynamicEpsilon value is not specified correcty. Must be a double (e.g. 0.02): " <> (show (snd $ head changeEpsilonBlock))) + else do + logWith LogInfo ("Changing dynamicEpsilon factor to " <> (show $ fromJust epsilonValue) <> "\n") + pure (inGS{dynamicEpsilon = 1.0 + ((fromJust epsilonValue) * (fractionDynamic inGS))}, origData, inData, inGraphList) + else -- changes the softwired optimization algorithm--this really for experimental use + + if changeGraphFactor + then + if isNothing newGraphFactor + then + errorWithoutStackTrace + ( "GraphFactor value is not specified correcty. Must be either 'NoPenalty', 'W15'. 'W23', or 'PMDL': " + <> (show (snd $ head changeGraphFactorBlock)) + ) + else + let newMethod = + if fromJust newGraphFactor == "nopenalty" + then NoNetworkPenalty + else + if fromJust newGraphFactor == "w15" + then Wheeler2015Network + else + if fromJust newGraphFactor == "w23" + then Wheeler2023Network + else + if fromJust newGraphFactor == "pmdl" + then PMDLGraph + else + errorWithoutStackTrace + ( "GraphFactor value is not specified correcty. Must be either 'NoPenalty', 'W15'. 'W23', or 'PMDL': " + <> (show (snd $ head changeGraphFactorBlock)) + ) + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (reoptimizeAction (inGS{graphFactor = newMethod}) origData pruneEdges warnPruneEdges startVertex . fst5) inGraphList + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (T.multiTraverseFullyLabelGraphReduced (inGS {graphFactor = newMethod}) origData pruneEdges warnPruneEdges startVertex) (fmap fst5 inGraphList) -- `using` PU.myParListChunkRDS + if newMethod /= graphFactor inGS + then do + logWith LogInfo ("Changing graphFactor method to " <> (show newMethod) <> "\n") + pure (inGS{graphFactor = newMethod}, origData, inData, newPhylogeneticGraphList) + else do + pure (inGS{graphFactor = newMethod}, origData, inData, inGraphList) + else -- changes graphsSteepest -- maximum number of graphs evaluated in paralell at each "steepest" phse in swpa dn netadd/delete + + if changeGraphsSteepest + then + if isNothing newGraphsSteepest + then + errorWithoutStackTrace + ( "GraphsSteepest value is not specified correcty. Must be an Integer (e.g. 5): " <> (show (snd $ head changeGraphsSteepestBlock)) + ) + else do + logWith LogInfo ("Changing GraphsSteepest factor to " <> (show $ fromJust newGraphsSteepest) <> "\n") + pure (inGS{graphsSteepest = fromJust newGraphsSteepest}, origData, inData, inGraphList) + else -- changes the multiTraverse behavior for all graphs + + if changeMultiTraverse + then + if isNothing multiTraverseValue + then + errorWithoutStackTrace + ( "MultiTraverse value is not specified correcty. Must be either 'True' or 'False': " + <> (show (snd $ head changeMultiTraverseBlock)) + ) + else + let newMethod = + if fromJust multiTraverseValue == "true" + then True + else + if fromJust multiTraverseValue == "false" + then False + else + errorWithoutStackTrace + ( "MultiTraverse value is not specified correcty. Must be either 'True' or 'False': " + <> (show (snd $ head changeMultiTraverseBlock)) + ) + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse + (reoptimizeAction (inGS{multiTraverseCharacters = newMethod}) origData pruneEdges warnPruneEdges startVertex . fst5) + inGraphList + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (T.multiTraverseFullyLabelGraphReduced (inGS {multiTraverseCharacters = newMethod}) origData pruneEdges warnPruneEdges startVertex) (fmap fst5 inGraphList) -- `using` PU.myParListChunkRDS + if newMethod /= multiTraverseCharacters inGS + then + let lengthChangeString = + if null inGraphList + then "" + else (":" <> (show $ minimum $ fmap snd5 inGraphList) <> " -> " <> (show $ minimum $ fmap snd5 newPhylogeneticGraphList)) + in do + logWith LogInfo ("Changing multiTraverse to " <> (show newMethod) <> lengthChangeString <> "\n") + pure (inGS{multiTraverseCharacters = newMethod}, origData, inData, newPhylogeneticGraphList) + else do + pure (inGS{multiTraverseCharacters = newMethod}, origData, inData, inGraphList) + else -- changes the softwired optimization algorithm--this really for experimental use + + if changeSoftwiredMethod + then + if isNothing newSoftwiredMethod + then + errorWithoutStackTrace + ( "SoftwiredMethod value is not specified correcty. Must be either 'Exhaustive' or 'ResolutionCache': " + <> (show (snd $ head changeSoftwiredMethodBlock)) + ) + else + let newMethod = + if fromJust newSoftwiredMethod == "naive" + then Naive + else + if fromJust newSoftwiredMethod == "exhaustive" + then Naive + else + if fromJust newSoftwiredMethod == "resolutioncache" + then ResolutionCache + else + errorWithoutStackTrace + ( "SoftwiredMethod value is not specified correcty. Must be either 'Naive' or 'ResolutionCache': " + <> (show (snd $ head changeSoftwiredMethodBlock)) + ) + newMethodString = + if newMethod == ResolutionCache + then "ResolutionCache" + else "Exhaustive" + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse + (reoptimizeAction (inGS{softWiredMethod = newMethod}) origData pruneEdges warnPruneEdges startVertex . fst5) + inGraphList + + if newMethod /= softWiredMethod inGS + then do + logWith LogInfo ("Changing softwired optimization method to " <> newMethodString <> "\n") + pure (inGS{softWiredMethod = newMethod}, origData, inData, newPhylogeneticGraphList) + else do + pure (inGS{softWiredMethod = newMethod}, origData, inData, inGraphList) + else -- changes outgroup + + if reRoot + then + if isNothing outgroupValue + then errorWithoutStackTrace ("Outgroup is not specified correctly. Must be a string (e.g. \"Name\"): " <> (snd $ head reRootBlock)) + else + let newOutgroupName = TL.filter (/= '"') $ fromJust outgroupValue + newOutgroupIndex = V.elemIndex newOutgroupName (fst3 origData) + in do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse + (reoptimizeAction inGS origData pruneEdges warnPruneEdges startVertex . LG.rerootTree (fromJust newOutgroupIndex) . fst5) + inGraphList + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (T.multiTraverseFullyLabelGraphReduced inGS origData pruneEdges warnPruneEdges startVertex) (fmap (LG.rerootTree (fromJust newOutgroupIndex)) $ fmap fst5 inGraphList) + if isNothing newOutgroupIndex + then errorWithoutStackTrace ("Outgoup name not found: " <> (snd $ head reRootBlock)) + else do + newPhylogeneticGraphList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse + (reoptimizeAction inGS origData pruneEdges warnPruneEdges startVertex . LG.rerootTree (fromJust newOutgroupIndex) . fst5) + inGraphList + + logWith LogInfo ("Changing outgroup to " <> (TL.unpack newOutgroupName) <> "\n") + pure + (inGS{outgroupIndex = fromJust newOutgroupIndex, outGroupName = newOutgroupName}, origData, inData, newPhylogeneticGraphList) + else -- changes unionThreshold error check factor + + if changeUnionThreshold + then + if isNothing unionValue + then + errorWithoutStackTrace + ("UninThreshold value is not specified correcty. Must be a double (e.g. 1.17): " <> (show (snd $ head changeUnionBlock))) + else do + logWith LogInfo ("Changing uninoTHreshold factor to " <> (show $ fromJust unionValue) <> "\n") + pure (inGS{dynamicEpsilon = (fromJust unionValue)}, origData, inData, inGraphList) + else -- modify the use of Network Add heurisitcs in network optimization + + if (fst $ head lcArgList) == "usenetaddheuristic" + then + let localCriterion + | ((snd $ head lcArgList) == "true") = True + | ((snd $ head lcArgList) == "false") = False + | otherwise = + errorWithoutStackTrace + ("Error in 'transform' command. UseNetAddHeuristic '" <> (snd $ head lcArgList) <> "' is not 'true' or 'false'") + in do + logWith LogInfo ("UseNetAddHeuristic set to " <> (snd $ head lcArgList) <> "\n") + pure (inGS{useNetAddHeuristic = localCriterion}, origData, inData, inGraphList) + else error ("Transform type not implemented/recognized" <> (show inArgs)) + + +-- | reWeightData sets weights to new values based on +reWeightData ∷ Double → [String] → [NameText] → ProcessedData → ProcessedData +reWeightData weightValue charTypeStringList charNameList (inName, inNameBV, inBlockDataV) = + let charTypeList = concatMap stringToType charTypeStringList + newBlockData = fmap (reweightBlockData weightValue charTypeList charNameList) inBlockDataV + in (inName, inNameBV, newBlockData) + + +-- | stringToType takes String and returns typelist +stringToType ∷ String → [CharType] +stringToType = + fmap C.toLower <&> \case + "" → [] + "all" → exactCharacterTypes <> sequenceCharacterTypes + "prealigned" → prealignedCharacterTypes + "nonadditive" → [NonAdd, Packed2, Packed4, Packed5, Packed8, Packed64] + "additive" → [Add] + "matrix" → [Matrix] + "sequence" → sequenceCharacterTypes + "packed" → packedNonAddTypes + "packed2" → [Packed2] + "packed4" → [Packed4] + "packed5" → [Packed5] + "packed8" → [Packed8] + "packed64" → [Packed64] + val | val `elem` ["nonexact", "dynamic"] → nonExactCharacterTypes + val | val `elem` ["static", "exact", "qualitative"] → exactCharacterTypes + -- TODO: replace error with failWithPhase + val → errorWithoutStackTrace $ fold ["Error in transform : Unrecognized character type '", val, "'"] + + +-- | reweightBlockData applies new weight to catagories of data +reweightBlockData ∷ Double → [CharType] → [NameText] → BlockData → BlockData +reweightBlockData weightValue charTypeList charNameList (blockName, blockData, charInfoV) = + let newCharacterInfoV = fmap (reweightCharacterData weightValue charTypeList charNameList) charInfoV + in (blockName, blockData, newCharacterInfoV) + + +-- | reweightCharacterData changes weight in charInfo based on type or name +reweightCharacterData ∷ Double → [CharType] → [NameText] → CharInfo → CharInfo +reweightCharacterData weightValue charTypeList charNameList charInfo = + let wildCardMatchCharName = filter (== True) $ fmap (textMatchWildcards (name charInfo)) charNameList + in -- trace ("RWC Wildcards: " <> (show $ fmap (textMatchWildcards (name charInfo)) charNameList)) ( + if null wildCardMatchCharName && (charType charInfo) `notElem` charTypeList + then -- trace ("RWC not : " <> (show $ name charInfo) <> " of " <> (show charNameList) <> " " <> (show $ charType charInfo) <> " of " <> (show charTypeList)) + charInfo + else -- trace ("RWC: " <> (show $ name charInfo) <> " " <> (show $ charType charInfo)) + charInfo{weight = weightValue} + + +-- ) + +{- | makeStaticApprox takes ProcessedData and returns static approx (implied alignment recoded) ProcessedData +if Tree take SA fields and recode appropriatrely given cost regeme of character +if Softwired--use display trees for SA +if hardWired--convert to softwired and use display trees for SA +since for heuristic searcing--uses additive weight for sequences and simple cost matrices, otherwise +matrix characters +-} +makeStaticApprox ∷ GlobalSettings → Bool → ProcessedData → ReducedPhylogeneticGraph → PhyG ProcessedData +makeStaticApprox inGS leavePrealigned inData@(nameV, nameBVV, blockDataV) inGraph = + if LG.isEmpty (fst5 inGraph) + then error "Empty graph in makeStaticApprox" + else -- tree type + -- trace ("MSA: " ) $ + + if graphType inGS == Tree + then do + let decGraph = thd5 inGraph + + -- parallel setup + -- action :: Int -> BlockData + let action = pullGraphBlockDataAndTransform leavePrealigned decGraph blockDataV + + pTraverse ← getParallelChunkMap + -- do each block in turn pulling and transforming data from inGraph + let newBlockDataV = pTraverse action [0 .. (length blockDataV - 1)] + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (pullGraphBlockDataAndTransform leavePrealigned decGraph blockDataV) [0..(length blockDataV - 1)] -- `using` PU.myParListChunkRDS + + if leavePrealigned + then do + pure (nameV, nameBVV, blockDataV) + else do + -- convert prealigned to non-additive if all 1's tcm + + -- remove constants from new prealigned this may be redundant since bit packing also removes constants + -- error here in case where there is missing seqeunce data for all but one input block for a character + -- if SI/PMDL need no change cost so cant remove contant characters + newProcessedData ← + if not (optimalityCriterion inGS `notElem` [SI, PMDL]) + then R.removeConstantCharactersPrealigned (nameV, nameBVV, V.fromList newBlockDataV) + else pure (nameV, nameBVV, blockDataV) + + -- bit pack any new non-additive characters + newProcessedData' ← BP.packNonAdditiveData inGS newProcessedData -- (nameV, nameBVV, V.fromList newBlockDataV) -- newProcessedData + + -- trace ("MSA:" <> (show (fmap (V.length . thd3) blockDataV, fmap (V.length . thd3) newBlockDataV))) + -- issues if no variation in block reducing length to zero so need leave "prealigned" if so + pure newProcessedData' + else -- network static approx relies on display tree implied alignments after contacting out in=out=1 vertices + -- harwired based on softwired optimization + + if graphType inGS `elem` [SoftWired, HardWired] + then + let -- parallel setup + action ∷ (ProcessedData, SimpleGraph) → PhyG PhylogeneticGraph + action = T.multiTraverseFullyLabelGraphPair (inGS{graphType = Tree}) False False Nothing + in do + -- get display trees for each data block-- takes first of potentially multiple + inFullGraph ← + if graphType inGS == SoftWired + then pure $ GO.convertReduced2PhylogeneticGraph inGraph + else -- rediagnose HardWired as softwired + T.multiTraverseFullyLabelGraph (inGS{graphType = SoftWired}) inData False False Nothing (fst5 inGraph) + + let blockDisplayList = fmap (GO.contractIn1Out1EdgesRename . GO.convertDecoratedToSimpleGraph . head) (fth6 inFullGraph) + + -- create seprate processed data for each block + let blockProcessedDataList = fmap (CU.makeBlockData (fst3 inData) (snd3 inData)) (thd3 inData) + + decoratedBlockTreeList' ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse action . zip (V.toList blockProcessedDataList) $ V.toList blockDisplayList + + -- Perform full optimizations on display trees (as trees) with single block data (blockProcessedDataList) to create IAs + let decoratedBlockTreeList = V.fromList decoratedBlockTreeList' + + -- get new processed (leaf) data + let newBlockDataV = V.zipWith (getBlockLeafDataFromDisplayTree leavePrealigned) (fmap thd6 decoratedBlockTreeList) blockDataV + + if leavePrealigned + then do + pure (nameV, nameBVV, newBlockDataV) + else do + -- remove constants from new prealigned-- this may be redundant since bit packing also removes constants + -- error here in case where there is missing seqeunce data for all but one input block for a character + -- need to leave in constant charcters for SI/PMDL + newProcessedData ← + if not (optimalityCriterion inGS `notElem` [SI, PMDL]) + then R.removeConstantCharactersPrealigned (nameV, nameBVV, newBlockDataV) + else pure (nameV, nameBVV, blockDataV) + + -- bit pack any new non-additive characters + newProcessedData' ← BP.packNonAdditiveData inGS newProcessedData -- (nameV, nameBVV, newBlockDataV) -- newProcessedData + pure newProcessedData' + else do + logWith LogWarn ("Static Approx not yet implemented for graph type : " <> (show $ graphType inGS) <> " skipping" <> "\n") + pure inData + + +{- | getBlockLeafDataFromDisplayTree take a display tree and the block data for that tree +and returns leaf data--prealiged IA data +like pullGraphBlockDataAndTransform but for a single block and display tree +-} +getBlockLeafDataFromDisplayTree ∷ Bool → DecoratedGraph → BlockData → BlockData +getBlockLeafDataFromDisplayTree leavePrealigned inDecGraph blockCharInfo = + let (_, leafVerts, _, _) = LG.splitVertexList inDecGraph + (_, leafLabelList) = unzip leafVerts + leafBlockData = fmap V.toList $ V.fromList $ fmap V.head $ fmap vertData leafLabelList + + -- new recoded data-- need to filter out constant chars after recoding + -- need character length for missing values + -- True for IA lengths + charLengthV = V.zipWith (U.getMaxCharacterLength True) (thd3 blockCharInfo) leafBlockData + + (transformedLeafBlockData, transformedBlockInfo) = unzip $ fmap (transformData leavePrealigned (thd3 blockCharInfo) charLengthV) (fmap V.fromList $ V.toList leafBlockData) + in (fst3 blockCharInfo, V.fromList transformedLeafBlockData, head transformedBlockInfo) + + +{- | pullGraphBlockDataAndTransform takes a DecoratedGraph and block index and pulls +the character data of the block and transforms the leaf data by using implied alignment +feilds for dynamic characters +-} +pullGraphBlockDataAndTransform ∷ Bool → DecoratedGraph → V.Vector BlockData → Int → BlockData +pullGraphBlockDataAndTransform leavePrealigned inDecGraph blockCharInfoV blockIndex = + let (_, leafVerts, _, _) = LG.splitVertexList inDecGraph + (_, leafLabelList) = unzip leafVerts + leafBlockData = fmap (V.! blockIndex) (fmap vertData leafLabelList) + + -- new recoded data-- need to filter out constant chars after recoding + -- need character length for missing values + -- True for IA lengths + charLengthV = V.zipWith (U.getMaxCharacterLength True) (thd3 $ blockCharInfoV V.! blockIndex) (V.fromList $ fmap V.toList leafBlockData) + + (transformedLeafBlockData, transformedBlockInfo) = unzip $ fmap (transformData leavePrealigned (thd3 $ blockCharInfoV V.! blockIndex) charLengthV) leafBlockData + in -- trace ("PGDT: " <> show charLengthV) + (fst3 $ blockCharInfoV V.! blockIndex, V.fromList transformedLeafBlockData, head transformedBlockInfo) + + +-- | transformData takes original Character info and character data and transforms to static if dynamic noting character type +transformData + ∷ Bool → V.Vector CharInfo → V.Vector Int → V.Vector CharacterData → (V.Vector CharacterData, V.Vector CharInfo) +transformData leavePrealigned inCharInfoV inCharLengthV inCharDataV = + if V.null inCharInfoV + then (V.empty, V.empty) + else + let (outCharDataV, outCharInfoV) = V.unzip $ V.zipWith3 (transformCharacter leavePrealigned) inCharDataV inCharInfoV inCharLengthV + in (outCharDataV, outCharInfoV) + + +-- transformCharacter takes a single character info and character and returns IA if dynamic as is if not +-- checks if all gaps with the GV.filter. If all gaps--it means the sequence char was missing and +-- implied alignment produced all gaps. The passing of character length is not necessary when changed missing seq to empty +-- character--but leaving in case change back to [] +-- "nonAddGap" not currently implemented +transformCharacter ∷ Bool → CharacterData → CharInfo → Int → (CharacterData, CharInfo) +transformCharacter leavePrealigned inCharData inCharInfo charLength = + let inCharType = charType inCharInfo + inCostMatrix = costMatrix inCharInfo + alphSize = length $ alphabet inCharInfo + + -- determine if matrix is all same costs => nonadditive + -- all same except fort single indel costs => non add with gap binary chars + -- not either => matrix char + (inCostMatrixType, gapCost) = R.getRecodingType inCostMatrix + in -- trace ("TC:" <> (show alphSize) <> " " <> (show $ alphabet inCharInfo)) ( + -- trace ("TC:" <> (show charLength) <> " " <> (show (GV.length $ snd3 $ slimAlignment inCharData, GV.length $ snd3 $ wideAlignment inCharData, GV.length $ snd3 $ hugeAlignment inCharData))) ( + if inCharType `elem` exactCharacterTypes + then (inCharData, inCharInfo) + else + if inCharType `elem` prealignedCharacterTypes + then (inCharData, inCharInfo) + else -- trace ("TC: " <> inCostMatrixType) ( + -- different types--vector wrangling + -- missing data fields set if no implied alignment ie missing data + + if inCharType `elem` [SlimSeq, NucSeq] + then + let gapChar = (0 ∷ SlimState) `setBit` fromEnum gapIndex + missingState = L.foldl' (setBit) (0 ∷ SlimState) [0 .. alphSize - 1] + impliedAlignChar = + if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ slimAlignment inCharData) + then slimAlignment inCharData + else + let missingElement = SV.replicate charLength missingState -- if simple all ON then segfault do to lookup outside of cost matrix + {- + impliedAlignChar = if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ slimAlignment inCharData) then slimAlignment inCharData + else + let missingElement = SV.replicate charLength $ B.complement (0 :: SlimState) -- TRANS.setMissingBits (0 :: SlimState) 0 alphSize + -} + in (missingElement, missingElement, missingElement) + + newPrelimBV = R.convert2BV 32 impliedAlignChar + newPrelimBVGaps = addGaps2BV gapCost newPrelimBV + in -- trace ("TC-Slim:" <> (show $ GV.length $ snd3 $ slimAlignment inCharData) <> " " <> (show $ snd3 $ impliedAlignChar)) ( + + if leavePrealigned + then (inCharData{alignedSlimPrelim = impliedAlignChar}, inCharInfo{charType = AlignedSlim}) + else + if inCostMatrixType == "nonAdd" + then (inCharData{stateBVPrelim = newPrelimBV}, inCharInfo{charType = NonAdd}) + else + if inCostMatrixType == "nonAddGap" + then (inCharData{stateBVPrelim = newPrelimBVGaps}, inCharInfo{charType = NonAdd}) + else -- matrix recoding + (inCharData{alignedSlimPrelim = impliedAlignChar}, inCharInfo{charType = AlignedSlim}) + else + if inCharType `elem` [WideSeq, AminoSeq] + then + let gapChar = (0 ∷ WideState) `setBit` fromEnum gapIndex + missingState = L.foldl' (setBit) (0 ∷ WideState) [0 .. alphSize - 1] + impliedAlignChar = + if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ wideAlignment inCharData) + then wideAlignment inCharData + else + let missingElement = UV.replicate charLength missingState -- if simple all ON then segfault do to lookup outside of cost matrix + in {- + impliedAlignChar = if (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ wideAlignment inCharData) then wideAlignment inCharData + else + let missingElement = UV.replicate charLength $ B.complement (0 :: WideState) -- TRANS.setMissingBits (0 :: WideState) 0 alphSize + -} + (missingElement, missingElement, missingElement) + + newPrelimBV = R.convert2BV 64 impliedAlignChar + newPrelimBVGaps = addGaps2BV gapCost newPrelimBV + in if leavePrealigned + then (inCharData{alignedWidePrelim = impliedAlignChar}, inCharInfo{charType = AlignedWide}) + else + if inCostMatrixType == "nonAdd" + then (inCharData{stateBVPrelim = newPrelimBV}, inCharInfo{charType = NonAdd}) + else + if inCostMatrixType == "nonAddGap" + then (inCharData{stateBVPrelim = newPrelimBVGaps}, inCharInfo{charType = NonAdd}) + else -- matrix recoding + (inCharData{alignedWidePrelim = impliedAlignChar}, inCharInfo{charType = AlignedWide}) + else + if inCharType == HugeSeq + then + let gapChar = (fromBits $ replicate alphSize False) `setBit` fromEnum gapIndex + missingState = fromBits $ replicate alphSize True + impliedAlignChar ∷ HugeDynamicCharacter + impliedAlignChar + | (not . GV.null $ GV.filter (/= gapChar) $ snd3 $ hugeAlignment inCharData) = hugeAlignment inCharData + | otherwise = + let missingElement = V.replicate alphSize missingState + in (missingElement, missingElement, missingElement) + + conversion ∷ HugeState → Natural + conversion = toUnsignedNumber + naturalize + ∷ (V.Vector HugeState, V.Vector HugeState, V.Vector HugeState) + → (V.Vector Natural, V.Vector Natural, V.Vector Natural) + naturalize (x, y, z) = (conversion <$> x, conversion <$> y, conversion <$> z) + newPrelimBV = R.convert2BV (toEnum alphSize) $ naturalize impliedAlignChar + newPrelimBVGaps = addGaps2BV gapCost newPrelimBV + in if leavePrealigned + then (inCharData{alignedHugePrelim = impliedAlignChar}, inCharInfo{charType = AlignedHuge}) + else + if inCostMatrixType == "nonAdd" + then (inCharData{stateBVPrelim = newPrelimBV}, inCharInfo{charType = NonAdd}) + else + if inCostMatrixType == "nonAddGap" + then (inCharData{stateBVPrelim = newPrelimBVGaps}, inCharInfo{charType = NonAdd}) + else -- matrix recoding + (inCharData{alignedHugePrelim = impliedAlignChar}, inCharInfo{charType = AlignedHuge}) + else error ("Unrecognized character type in transformCharacter: " <> (show inCharType)) + + +-- ) + +{- | addGaps2BV adds gap characters 0 = nonGap, 1 = Gap to Vector +of states to non-additive charcaters for static approx. gapCost - 1 characters are added +sems wasteful, but comctant filtered out and recoded later when non-add/add charsa re optimized and bitpacked +since this only for leaves assume inM good for all +-} +addGaps2BV + ∷ Int + → (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) + → (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) +addGaps2BV gapCost (_, inM, _) = + -- trace ("AG2BV: " <> (show inM)) ( + let gapChar = BV.fromNumber (BV.dimension $ V.head inM) (1 ∷ Int) + noGap = L.replicate (gapCost - 1) $ BV.fromNumber (BV.dimension $ V.head inM) (1 ∷ Int) + hasGap = L.replicate (gapCost - 1) $ BV.fromNumber (BV.dimension $ V.head inM) (2 ∷ Int) + gapCharV = createGapChars inM gapChar [] noGap hasGap + outM = inM <> gapCharV + in (outM, outM, outM) + + +-- ) + +{- | createGapChars takes a vector of bitvector coded states and checks if first states == 1 (= gap) +if so a number based on gap cost are created.. Will create n * original klength so need to +filter out constant characters later +-} +createGapChars + ∷ V.Vector BV.BitVector → BV.BitVector → [BV.BitVector] → [BV.BitVector] → [BV.BitVector] → V.Vector BV.BitVector +createGapChars origBVV gapCharacter newCharL noGapL hasGapL = + if V.null origBVV + then V.fromList newCharL + else + if V.head origBVV == gapCharacter + then createGapChars (V.tail origBVV) gapCharacter (hasGapL <> newCharL) noGapL hasGapL + else createGapChars (V.tail origBVV) gapCharacter (noGapL <> newCharL) noGapL hasGapL diff --git a/src/Commands/Verify.hs b/src/Commands/Verify.hs new file mode 100644 index 000000000..72e78e765 --- /dev/null +++ b/src/Commands/Verify.hs @@ -0,0 +1,585 @@ +{- | +Module : Verify.hs +Description : Module to verify (more or less) input commands +Copyright : (c) 2022-2023 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} + +{- +ToDo + 1) actually check all sub options to make sure valid + 2) Check for "bad" combinations or irrelevent options and warn + +-} +module Commands.Verify ( + verifyCommands, + allowedCommandList, + buildArgList, + fuseArgList, + geneticAlgorithmArgList, + netEdgeArgList, + readArgList, + reconcileArgList, + refineArgList, + reportArgList, + searchArgList, + setArgList, + selectArgList, + supportArgList, + swapArgList, + transformArgList, +) where + +import Data.Char qualified as C +import Data.List qualified as L +import GeneralUtilities +import Text.Read +import Types.Types + + +-- import Debug.Trace + +-- | allowedCommandList is the permitted command string list +allowedCommandList ∷ [String] +allowedCommandList = + [ "build" + , "fuse" + , "read" + , "reblock" + , "refine" + , "rename" + , "report" + , "run" + , "search" + , "select" + , "set" + , "support" + , "swap" + , "transform" + ] + + +-- list of valid instructions +validInstructionList ∷ [Instruction] +validInstructionList = + [ Build + , Fuse + , Read + , Reblock + , Refine + , Rename + , Report + , Run + , Search + , Select + , Set + , Support + , Swap + , Transform + ] + + +-- | buildArgList is the list of valid build arguments +buildArgList ∷ [String] +buildArgList = + [ "atrandom" + , "best" + , "block" + , "character" + , "cun" + , "displaytrees" + , "distance" + , "dwag" + , "eun" + , "first" + , "graph" + , "none" + , "nj" + , "otu" + , "rdwag" + , "return" + , "replicates" + , "spr" + , "tbr" + , "wpgma" + ] + + +-- | fuseArgList arguments +fuseArgList ∷ [String] +fuseArgList = + [ "all" + , "atrandom" + , "best" + , "keep" + , "once" + , "pairs" + , "none" + , "spr" + , "steepest" + , "tbr" + , "unique" + , "reciprocal" + , "noreciprocal" + ] + + +-- | geneticAlgorithm arguments +geneticAlgorithmArgList ∷ [String] +geneticAlgorithmArgList = + [ "elitist" + , "ga" + , "generations" + , "geneticalgorithm" + , "maxnetedges" + , "popsize" + , "recombinations" + , "severity" + , "stop" + ] + + +-- | netEdgeArgList arguments for network edge add/delete operations +netEdgeArgList ∷ [String] +netEdgeArgList = + [ "acceptequal" + , "acceptworse" + , "all" + , "annealing" + , "atrandom" + , "drift" + , "inorder" + , "keep" + , "maxnetedges" + , "netadd" + , "netadddel" + , "netadddelete" + , "netdel" + , "netdelete" + , "netmove" + , "returnmutated" + , "rounds" + , "steepest" + , "steps" + ] + + +-- | Read arg list allowable modifiers in read +readArgList ∷ [String] +readArgList = + [ "aminoacid" + , "block" + , "dot" + , "enewick" + , "exclude" + , "fasta" + , "fastc" + , "fenewick" + , "gapopening" + , "hugeseq" + , "include" + , "newick" + , "nucleotide" + , "preaminoacid" + , "prefasta" + , "prefastc" + , "prehugeseq" + , "prenucleotide" + , "rename" + , "tcm" + , "tnt" -- "prealigned", "csv", + ] + + +-- should be moved to a single file for import + +-- | reconcileCommandList list of allowable commands +reconcileArgList ∷ [String] +reconcileArgList = + [ "compare" + , "connect" + , "edgelabel" + , "method" + , "threshold" + , "vertexlabel" + ] -- "outfile" + + +-- | reconcileOptionsList list of allowable command options of method, compare, threshhold, and outformat +reconcileOptionsList ∷ [String] +reconcileOptionsList = + [ "adams" + , "combinable" + , "cun" + , "eun" + , "false" + , "fen" + , "identity" + , "majority" + , "strict" + , "true"] + + +-- | refinement arguments +refineArgList ∷ [String] +refineArgList = fuseArgList <> netEdgeArgList <> geneticAlgorithmArgList + + +-- | reportArgList contains valid 'report' arguments +reportArgList ∷ [String] +reportArgList = + [ "append" + , "ascii" + , "branchlengths" + , "collapse" + , "complexity" + , "concatenate" + , "color" + , "crossrefs" + , "data" + , "diagnosis" + , "displaytrees" + , "dot" + , "dotpdf" + , "graphs" + , "htulabels" + , "ia" + , "includemissing" + , "impliedalignment" + , "metadata" + , "newick" + , "nobranchlengths" + , "nocollapse" + , "nohtulabels" + , "overwrite" + , "pairdist" + , "parameterestimation" + , "reconcile" + , "search" + , "support" + , "tnt" + ] + <> reconcileArgList + + +-- | search arguments +searchArgList ∷ [String] +searchArgList = + [ "days" + , "exponential" + , "hours" + , "instances" + , "linear" + , "maxnetedges" + , "minutes" + , "seconds" + , "simple" + , "stop" + , "thompson" + ] + + +-- | selectArgList is the list of valid select arguments +selectArgList ∷ [String] +selectArgList = + [ "all" + , "atrandom" + , "best" + , "threshold" + , "unique" + ] + + +{- | setArgList contains valid 'set' arguments +joinThreshold and dynamicEpsilon are not intended for users--but could be of course +-} +setArgList ∷ [String] +setArgList = + [ "bc2" + , "bc4" + , "bc5" + , "bc8" + , "bc64" + , "bcgt64" + , "compressresolutions" + , "criterion" + , "defparstrat" + , "dynamicepsilon" + , "finalassignment" + , "graphfactor" + , "graphssteepest" + , "graphtype" + , "jointhreshold" + , "lazyparstrat" + , "missingthreshold" + , "modelcomplexity" + , "multitraverse" + , "outgroup" + , "partitioncharacter" + , "reportnaive" + , "rootcost" + , "seed" + , "softwiredmethod" + , "strictparstrat" + , "usenetaddheuristic" + , "useia" + ] + + +-- | refinement arguments +supportArgList ∷ [String] +supportArgList = + [ "atrandom" + , "bootstrap" + , "buildonly" + , "gb" + , "gbsample" + , "goodmanbremer" + , "jackknife" + , "replicates" + , "spr" + , "tbr" + ] + + +-- | swapArgList is the list of valid swap arguments +swapArgList ∷ [String] +swapArgList = + [ "acceptequal" + , "acceptworse" + , "all" + , "alternate" + , "annealing" + , "atrandom" + , "drift" + , "ia" + , "inorder" + , "joinall" + , "joinpruned" + , "joinalternate" + , "keep" + , "maxchanges" + , "nni" + , "replicates" + , "returnmutated" + , "spr" + , "steepest" + , "steps" + , "tbr" + ] + + +-- | transform arguments +transformArgList ∷ [String] +transformArgList = + [ "atrandom" + , "compressresolutions" + , "displaytrees" + , "dynamic" + , "dynamicepsilon" + , "first" + , "graphfactor" + , "graphssteepest" + , "jointhreshold" + , "multitraverse" + , "name" + , "outgroup" + , "softwiredmethod" + , "staticapprox" + , "tohardwired" + , "tosoftwired" + , "totree" + , "type" + , "usenetaddheuristic" + , "weight" + ] + + +{- | verifyCommands takes a command list and tests whether the commands +and arguments are permissible before program execution--prevents late failure +after alot of processing time. +bit does not check for files existance, write/read ability, or contents for format or +anyhhting else for that matter +does check if files are both read from and written to +-} +verifyCommands ∷ [Command] → [String] → [String] → Bool +verifyCommands inCommandList inFilesToRead inFilesToWrite = + if null inCommandList + then True + else + let firstCommand = head inCommandList + commandInstruction = fst firstCommand + inArgs = snd firstCommand + + -- check valid commandInstructions + -- this is done earlier but might get oved so putting here just in case + checkInstruction = commandInstruction `elem` validInstructionList + in if not checkInstruction + then errorWithoutStackTrace ("Invalid command was specified : " <> (show commandInstruction)) + else -- check each command for valid arguments + -- make lower-case arguments + + let fstArgList = filter (/= []) $ fmap (fmap C.toLower . fst) inArgs + sndArgList = filter (/= []) $ fmap (fmap C.toLower . snd) inArgs + fileNameList = fmap (filter (/= '"')) $ filter (/= []) $ fmap snd inArgs + + -- Read + (checkOptions, filesToReadFrom, filesToWriteTo) = + -- Build + if commandInstruction == Build + then (checkCommandArgs "build" fstArgList buildArgList, [""], [""]) + else -- Fuse + + if commandInstruction == Fuse + then (checkCommandArgs "fuse" fstArgList fuseArgList, [""], [""]) + else + if commandInstruction == Read + then + let fileArgs = concat $ filter (/= []) $ fmap snd inArgs + numDoubleQuotes = length $ filter (== '"') fileArgs + has02DoubleQuotes = (numDoubleQuotes == 2) || (numDoubleQuotes == 0) + in if not has02DoubleQuotes + then errorWithoutStackTrace ("Unbalanced quotation marks in 'read' file argument: " <> fileArgs) + else (checkCommandArgs "read" fstArgList readArgList, fileNameList, [""]) + else -- Reblock -- no arguments--but reads string and blocknames + + if commandInstruction == Reblock + then + let fileArgs = concat $ filter (/= []) $ fmap snd inArgs + numDoubleQuotes = length $ filter (== '"') fileArgs + (numDouble, numUnbalanced) = divMod numDoubleQuotes 2 + in -- trace (show (fstArgList, sndArgList,fileNameList)) ( + if numDouble < 2 + then + errorWithoutStackTrace + ("Need at least two fields in 'rebock' command, new block name and old block(s) in double quotes: " <> fileArgs) + else + if numUnbalanced /= 0 + then errorWithoutStackTrace ("Unbalanced quotation marks in 'reblock' command: " <> fileArgs) + else (True, [""], [""]) + else -- ) + + -- Reconcile -- part of report + + -- Refine + + if commandInstruction == Refine + then (checkCommandArgs "refine" fstArgList refineArgList, [""], [""]) + else -- Rename -- -- no arguments--but reads string and taxon names + + if commandInstruction == Rename + then + let fileArgs = concat $ filter (/= []) $ fmap snd inArgs + numDoubleQuotes = length $ filter (== '"') fileArgs + (numDouble, numUnbalanced) = divMod numDoubleQuotes 2 + in -- trace (show (fstArgList, sndArgList,fileNameList)) ( + if numDouble < 2 + then + errorWithoutStackTrace + ("Need at least two fields in 'rename' command, new taxon name and old taxon name(s) in double quotes: " <> fileArgs) + else + if numUnbalanced /= 0 + then errorWithoutStackTrace ("Unbalanced quotation marks in 'rename' command: " <> fileArgs) + else (True, [""], [""]) + else -- ) + + -- Report + + if commandInstruction == Report + then + let fileArgs = concat $ filter (/= []) $ fmap snd inArgs + numDoubleQuotes = length $ filter (== '"') fileArgs + has02DoubleQuotes = (numDoubleQuotes == 2) || (numDoubleQuotes == 0) + in if not has02DoubleQuotes + then errorWithoutStackTrace ("Unbalanced quotation marks in report file argument: " <> fileArgs) + else + if "reconcile" `notElem` fstArgList + then (checkCommandArgs "report" fstArgList reportArgList, [""], fileNameList) + else + let reconcilePairList = filter ((`notElem` ("reconcile" : reportArgList)) . fst) $ zip fstArgList sndArgList + nonThresholdreconcileModPairList = filter ((/= "threshold") . fst) $ reconcilePairList + thresholdreconcileModPairList = filter ((== "threshold") . fst) $ reconcilePairList + checkReconcile1 = checkCommandArgs "reconcile" (fmap fst nonThresholdreconcileModPairList) reconcileArgList + checkReconcile2 = checkCommandArgs "reconcile" (fmap fst thresholdreconcileModPairList) reconcileArgList + checkReconcile3 = + checkCommandArgs + "reconcile modifier (method, compare, outformat, connect, edgelabel, vertexlabel)" + (fmap snd nonThresholdreconcileModPairList) + reconcileOptionsList + checkReconcile4 = L.foldl1' (&&) $ True : (fmap isInt (filter (/= []) (fmap snd thresholdreconcileModPairList))) + checkReconcile = checkReconcile1 && checkReconcile2 && checkReconcile3 && checkReconcile4 + in if checkReconcile + then (checkCommandArgs "report" fstArgList reportArgList, [""], [""]) + else (False, [], []) + else -- Run -- processed out before this into command list + + -- Search + + if commandInstruction == Search + then (checkCommandArgs "search" fstArgList searchArgList, [""], [""]) + else -- Select + + if commandInstruction == Select + then (checkCommandArgs "select" fstArgList selectArgList, [""], [""]) + else -- Set + + if commandInstruction == Set + then (checkCommandArgs "set" fstArgList setArgList, [""], [""]) + else -- Support + + if commandInstruction == Support + then (checkCommandArgs "support" fstArgList supportArgList, [""], [""]) + else -- Swap + + if commandInstruction == Swap + then (checkCommandArgs "swap" fstArgList swapArgList, [""], [""]) + else -- Transform + + if commandInstruction == Transform + then (checkCommandArgs "transform" fstArgList transformArgList, [""], [""]) + else errorWithoutStackTrace ("Unrecognized command was specified : " <> (show commandInstruction)) + in if checkOptions + then + let allFilesToReadFrom = filter (/= "") $ filesToReadFrom <> inFilesToRead + allFilesToWriteTo = filter (/= "") $ filesToWriteTo <> inFilesToWrite + readAndWriteFileList = L.intersect allFilesToReadFrom allFilesToWriteTo + in -- trace (show (allFilesToReadFrom, allFilesToWriteTo)) ( + if (not . null) readAndWriteFileList + then + errorWithoutStackTrace + ("Error--Both reading from and writing to files (could cause errors and/or loss of data): " <> (show readAndWriteFileList)) + else verifyCommands (tail inCommandList) allFilesToReadFrom allFilesToWriteTo + else -- ) + + -- Won't get to here--will error at earlier stages + False + where + isInt a = if (readMaybe a ∷ Maybe Int) /= Nothing then True else False diff --git a/src/Complexity/CodeStrings.hs b/src/Complexity/CodeStrings.hs new file mode 100644 index 000000000..266723517 --- /dev/null +++ b/src/Complexity/CodeStrings.hs @@ -0,0 +1,1659 @@ +{- | +Module : CodeStrings +Description: Strings for generated minimal Haskell code--functions for Algorithmic (Kolmogorov) complexity +Copyright : (c) 2019-2023 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project + +Maintainer : Ward Wheeler +Stability : unstable +Portability: portable (I hope) +-} +module Complexity.CodeStrings where + +import Complexity.Constants + + +-- In general, function names are letter number and internal variable are letters only +-- exception \aG, \bG, cG, dG for \aGraph etc since 10 \a functions +-- \fM for \fModel when 0-9 used as for graph ones above + +-- remove prelude stuff for production +-- only for debugging +programStartString ∷ String +programStartString = "module Main where\n" -- import Debug.Trace\n" + + +recursivePrintString ∷ String +recursivePrintString = + "\ + \p0 a b=\n\ + \ if nU b then putStr a\n\ + \ else\n\ + \ let ((c,d):e)=b\n\ + \ in p0(a++(show c++\" \"++show d++\"\\n\")) e\n" + + +programStartStringGraph ∷ String +programStartStringGraph = "module Main where\n" ++ recursivePrintString + + +getSingletonEdgesString ∷ String +getSingletonEdgesString = + "\ + \aG b c=\n\ + \ if c==0 then []\n\ + \ else (2*b,2*b+1):aG(b+1)(c-1)\n" + + +minimalTreesString ∷ String +minimalTreesString = + "\ + \bG a c d=\n\ + \ if (d<2)||(c==0) then []\n\ + \ else [(a,a+1),(a,a+2)]++bG(a+3)(c-2)(d-1)\n" + + +fullTreeString ∷ String +fullTreeString = + "\ + \cG a b d\n\ + \ |d==0=[]\n\ + \ |a=[(b+2,b),(b+2,b+1)]++cG False (b+2)(d-2)\n\ + \ |otherwise=[(b+2,b),(b+2,b+1)]++cG False (b+2)(d-1)\n" + + +addEdgeString ∷ String +addEdgeString = + "\ + \dG a ((e,f):(g,h):i) c=\n\ + \ if c==0 then (e,f):(g,h):i\n\ + \ else\n\ + \ let j=a\n\ + \ k=a+1\n\ + \ l=i++[(j,k),(e,j),(j,f),(g,k),(k,h)]\n\ + \ in dG(a+2) l (c-1)\n" + + +factorialString ∷ String +factorialString = + "\ + \fM 0=1\n\ + \fM 1=1\n\ + \fM n=n*fM (n-1)\n" + + +-- power :: Double -> Int -> Int -> Double +-- power base exponent counter= +powerString ∷ String +powerString = + "\ + \p1 b x c\n\ + \ |x==0=1\n\ + \ |x==c=1\n\ + \ |otherwise=b*(p1 b x(c+1))\n" + + +{- +\p1 b x c=\n\ +\ if x==0 then 1\n\ +\ else if x==c then 1\n\ +\ else b*(p1 b x (c+1))\n" +-} + +-- expE :: Double -> Int -> Int -> Double +-- expE exponent iterations counter= +-- This needs alot of iterations to behave well--100 for sure +-- Is n^2 due to use of factorial could be made linear +-- by saving previous factorial value and just mulgtipolying +-- the use of p1 also makes n^2--same preocedure--need a new value to pass +expEString ∷ String +expEString = + "\ + \e0 x i c=\n\ + \ if c==" + ++ fixedPrecisionString + ++ " then 0\n\ + \ else ((p1 x c 0)/(fM$fromIntegral c))+e0 x i (c+1)\n" + + +{- +-- \e x i c=exp x\n" test with library +expEString="\ +\e x i c=p(1+(x/(fromIntegral "++ fixedPrecisionString ++")))" ++ fixedPrecisionString ++ " 0\n" +-} + +-- logE :: Double -> Int -> Int -> Double -> Double +-- logE value iterations counter curValue= +logEString ∷ String +logEString = + "\ + \l0 v i c w=\n\ + \ if c==i then 2*w\n\ + \ else l0 v i (c+1) (w+((p1 ((v-1)/(v+1)) (1+2*c) 0)/(fromIntegral$1+2*c)))\n" + + +-- log2 :: Double -> Int-> Int -> Double -> Double +-- log2 value iterations blah bleh=(logE value iterations 0 0.0)/(logE 2.0 iterations 0 0.0) +-- the 'c' and 'w' params are not used but there for function as parame with logE +log2String ∷ String +log2String = + "\ + \b0 v i c w=(l0 v i 0 0)/(l0 2 i 0 0)\n" + + +-- start with 0 +-- \r0 :: Int -> a -> Int -> [a]\n\ +replicateString ∷ String +replicateString = + "\ + \r0 n v c=\n\ + \ if n==c then []\n\ + \ else v:r0 n v (c+1)\n" + + +-- makeSimpleMatrix :: Int -> Double -> Double -> Int -> String -> [[Double]] +-- makeSimpleMatrix size diag nondiag rowCounter lastElement= +-- \m :: Int -> Double -> Double -> Int -> String -> [[Double]]\n\ +makeMatrixString ∷ String +makeMatrixString = + "\ + \m0 s d n c e\n\ + \ |s==c=[]\n\ + \ |(e/=\"-\")||(c [a] -> Bool +elemString ∷ String +elemString = + "\ + \eS a b\n\ + \ |nU b=False\n\ + \ |a==a5 b=True\n\ + \ |otherwise=eS a (a6 b)\n" + + +-- is the first arg not an element of second (list) +-- eS :: a -> [a] -> Bool +notElemString ∷ String +notElemString = + "\ + \nE a b\n\ + \ |nU b=True\n\ + \ |a==a5 b=False\n\ + \ |otherwise=nE a (a6 b)\n" + + +-- getRepeatedElements in list +getRepeatedElementsString ∷ String +getRepeatedElementsString = + "\ + \rE l@(x:y)\n\ + \ |nU l=[]\n\ + \ |nU y=[]\n\ + \ |eS x y=x:(rE y)\n\ + \ |otherwise=rE y\n" + + +-- childrenParentsOfNode call with 2 nU lists +-- childrenParentsOfNodeString :: Int -> [(Int,Int)] -> [(Int,Int)] -> [(Int,Int)] -> ([(Int,Int)], [(Int,Int)]) +childrenParentsOfNodeString ∷ String +childrenParentsOfNodeString = + "\ + \cN n l c p=\n\ + \ if nU l then (c,p)\n\ + \ else\n\ + \ let (e,u)=a5 l\n\ + \ in\n\ + \ if n==e then cN n (a6 l) (u:c) p\n\ + \ else if n==u then cN n (a6 l) c (e:p)\n\ + \ else cN n (a6 l) c p\n" + + +-- get display edges from full input edge list (n) +-- not checking for nU list +-- first arg is 0/1 bit list 1 per net node +-- uses binary strem fo detemine left right +displayEdgesString ∷ String +displayEdgesString = + "\ + \dE b n=\n\ + \ let v=gM iM n\n\ + \ r=rE v\n\ + \ in\n\ + \ if nU r then n\n\ + \ else\n\ + \ let h=a5 r\n\ + \ (c,p)=cN h n [] []\n\ + \ (_,pL')=cN (a5 p) n [] []\n\ + \ (_,pR')=cN (a3 p) n [] []\n\ + \ (pL, pR)=if(a5 b)==0 then (pL',pR') else (pR',pL')\n\ + \ gR=a5 pR\n\ + \ (cR, _)=cN (a3 p) n [] []\n\ + \ sR=a5$g2 (/=h) cR\n\ + \ d=[(h,a5 c),(a5 p, h),(a3 p, h),(gR,a3 p),(a3 p,sR)]\n\ + \ a=[(a5 p, a5 c),(gR, sR)]\n\ + \ w=(g2(`nE` d)n)++a\n\ + \ in dE (a6 b) w\n" + + +fmapString ∷ String +fmapString = + "\ + \gM f l=\n\ + \ if nU l then []\n\ + \ else f (a5 l):gM f (a6 l)\n" + + +nullString ∷ String +nullString = + "\ + \nU a=(a==[])\n" + + +matrix2StringString ∷ String +matrix2StringString = + "\ + \s0 m=\n\ + \ if nU m then \"\\n\"\n\ + \ else (a4$gM (++ \" \")$gM show$a5 m)++\"\\n\"++s0 (a6 m)\n" + + +neymanUniformString ∷ String +neymanUniformString = + "\ + \u0 n a i=((a-((((fromIntegral n)-1))*(((e0 (-1*a) i 0)-1))))/((fromIntegral n)*a),(a-1+(e0 (-1*a) i 0))/((fromIntegral n)*a))\n" + + +-- p argument here is ignored by exponential +neymanExponentialString ∷ String +neymanExponentialString = + "\ + \y0 n a p=((1+(a*(fromIntegral n)))/((a+1)*(fromIntegral n)),1/((a+1)*(fromIntegral n)))\n" + + +{- | this passes function of Neyman Uniform or NeymanExponential as argument + f is functino, n=alpabet size, a is branch distribution param, p is precision integer, + \c9 :: (Int -> Double -> Int -> (Double, Double)) -> Int -> Double -> Int -> String -> [[Double]]\n\ +-} +makeTCMBitsString ∷ String +makeTCMBitsString = + "\ + \c9 f n a p s=\n\ + \ let (d,e)=f n a p\n\ + \ in m0 n (-1*b0 d p 0 0)(-1*b0 e p 0 0) 0 s\n" + + +fstString ∷ String +fstString = + "\ + \hM(a,_)=a\n" + + +sndString ∷ String +sndString = + "\ + \iM(_,b)=b\n" + + +{- | does the fmap/sum operation over multiple pii/pij for weights if [(1,1)] + same as simple Neyman functions +makeNeymanMatrix :: (Double -> Int-> Int -> Double -> Double) -> Distribution -> Int -> DistributionParameter -> Int -> [(Double, Double)] -> [[Double]] +makeNeymanMatrix logType distribution cbetSize cParam iterations modifiers + codes in log2 unlike general cde that passes log type + \n :: Int -> Int -> Double -> Int -> [(Double, Double)] -> String -> [[Double]]\n\ +-} +makeNeymanGeneralMatrixString ∷ String +makeNeymanGeneralMatrixString = + "\ + \n0 r l a p o s=\n\ + \ let k=gM(jM r l a p)o\n\ + \ q=c3(+)0$gM hM k\n\ + \ c=c3(+)0$gM iM k\n\ + \ e=(-1)*b0 q p 0 0\n\ + \ f=(-1)*b0 c p 0 0\n\ + \ in m0 l e f 0 s\n" + + +-- inner part of Neym,ena general--get the pii/pij for weight and fractions +-- d=0 is "uniform" distribution +-- \j :: Int -> Int -> Double -> Int -> (Double, Double) -> (Double, Double)\n\ +neymanGeneralWithKString ∷ String +neymanGeneralWithKString = + "\ + \jM d r a i (w, f)\n\ + \ |w<" + ++ epsilonString + ++ "=(f,0)\n\ + \ |d==0=\n\ + \ let r1=fromIntegral r\n\ + \ b=(e0(-1 *a*w)i 0)\n\ + \ l=((b+(a*w)-1)/(a*w*r1))\n\ + \ m=1-((r1-1)*l)\n\ + \ in (f*m,f*l)\n\ + \ |otherwise=\n\ + \ let r1=fromIntegral r\n\ + \ l=(w/(r1*(w+a)))\n\ + \ m=1-((r1-1)*l)\n\ + \ in (f*m,f*l)\n" + + +{- +\j d r a i (w,f)=\n\ +\ if w<" ++ epsilonString ++ " then (f,0)\n\ +\ else if d==0 then\n\ +\ let r1=fromIntegral r\n\ +\ b=(e(-1*a*w)i 0)\n\ +\ l=((b+(a*w)-1)/(a*w*r1))\n\ +\ m=1-((r1-1)*l)\n\ +\ in (f*m,f*l)\n\ +\ else\n\ +\ let r1=fromIntegral r\n\ +\ l=(w/(r1*(w+a)))\n\ +\ m=1-((r1-1)*l)\n\ +\ in (f*m,f*l)\n" +-} + +-- \x :: Double -> Int -> Double -> Int -> Int -> [Double]\n\ +discreteGammaString ∷ String +discreteGammaString = + "\ + \x0 a n m i e=\n\ + \ if n==1 then [1]\n\ + \ else\n\ + \ let b=n*e\n\ + \ v=m/(fromIntegral b)\n\ + \ z=(k2(r0 b v 0)0)\n\ + \ c=k2(gM(t0 a i v)z)0\n\ + \ j=gM(+(1-(a3 c)))c\n\ + \ s=w0 n j z 1 True\n\ + \ in gM(*((fromIntegral n)/(c3(+)0 s)))s\n" + + +-- \k2:: [Double] -> Double -> [Double]\n\ +cumulativeSumString ∷ String +cumulativeSumString = + "\ + \k2(a:c)b=\n\ + \ if nU c then [b+a]\n\ + \ else (b+a):k2 c(b+a)\n" + + +-- \t :: Double -> Int -> Double -> Double -> Double\n\ +gammaPDFString ∷ String +gammaPDFString = + "\ + \t0 a i l r=l*(q0 a a i)*(q0 r(a-1)i)*(e0(-1*a*r)i 0)/(v0 a i 0 1)\n" + + +-- \w0 :: Int -> [Double] -> [Double] -> Int -> Bool-> [Double]\n\ +getNTilesString ∷ String +getNTilesString = + "\ + \w0 c d g t o=\n\ + \ if nU d then []\n\ + \ else\n\ + \ let f=a5 d\n\ + \ e=a5 g\n\ + \ v=(fromIntegral t)/(2*(fromIntegral c))\n\ + \ in\n\ + \ if f>=v then\n\ + \ if o then e:w0 c(a6 d)(a6 g)(t+1)False\n\ + \ else w0 c(a6 d)(a6 g)(t+1)True\n\ + \ else w0 c(a6 d)(a6 g)t o\n" + + +-- \q :: Double -> Double -> Int -> Double\n\ +-- maybe replace with expE when used +expX2YString ∷ String +expX2YString = + "\ + \q0 a x i=e0(x*(l0 a i 0 0))i 0\n" + + +-- \v0 :: Double -> Int -> Int -> Double -> Double\n\ +gammaFunString ∷ String +gammaFunString = + "\ + \v0 u i c l=\n\ + \ if c>i then (fM$fromIntegral i)*(q0(fromIntegral i)u i)/l\n\ + \ else v0 u i(c+1)(l*(u+(fromIntegral c)))\n" + + +-- \a9 :: Int -> Double -> Int -> (Double, Double) -> (Double, Double)\n\ +neymanUniformWithKString ∷ String +neymanUniformWithKString = + "\ + \a9 r c i(w,f)=\n\ + \ if w<" + ++ epsilonString + ++ " then (f,0)\n\ + \ else\n\ + \ let r1=fromIntegral r\n\ + \ b=(e0(-1*c*w)i 0)\n\ + \ l=((b+(c*w)-1)/(c*w*r1))\n\ + \ m=1-((r1-1)*l)\n\ + \ in (f*m,f*l)\n" + + +-- \z :: Int -> Double -> Int -> (Double, Double) -> (Double, Double)\n\ +neymanExponentialWithKString ∷ String +neymanExponentialWithKString = + "\ + \z0 r a i(w,f)=\n\ + \ if w<" + ++ epsilonString + ++ " then (f,0)\n\ + \ else\n\ + \ let r1=fromIntegral r\n\ + \ l=(w/(r1*(w+a)))\n\ + \ m=1-((r1-1)*l)\n\ + \ in (f*m,f*l)\n" + + +-- \a1 :: Int -> Double -> Int -> Int -> Double -> Int -> [(Double, Double)]\n\ +getModifierListSmallString ∷ String +getModifierListSmallString = + "\ + \a1 v t h n a i\n\ + \ |v==0 && h==0=[(1,1)]\n\ + \ |v==1 && h==0=[(0,t),(1/(1-t),1-t)]\n\ + \ |otherwise=\n\ + \ let w=x0 a n 10.0 i(n*i)\n\ + \ f=r0 n(1.0/(fromIntegral n))0\n\ + \ in\n\ + \ if t==0 then a2 w f else\n\ + \ a2(0:(gM(*(1/(1-t)))w))(t:(gM(*(1-t))f))\n" + + +{- +\a1 v t h n a i=\n\ +\ if v==0 && h==0 then [(1,1)]\n\ +\ else if v==1 && h==0 then [(0,t),(1/(1-t),(1-t))]\n\ +\ else\n\ +\ let w=x a n " ++ maxGammaRateString ++ " i(n*i)\n\ +\ f=r n (1.0/(fromIntegral n))0\n\ +\ in\n\ +\ if t==0 then a2 w f\n\ +\ else a2(0:(g(*(1/(1-t)))w))(t:(g(*(1-t))f))\n" +-} + +-- \a2 :: [a] -> [b] -> [(a,b)]\n\ +zipString ∷ String +zipString = + "\ + \a2(l:m)(n:o)=\n\ + \ if nU m then [(l,n)]\n\ + \ else (l,n):a2 m o\n" + + +-- \a3 :: [a] -> a\n\ +lastString ∷ String +lastString = + "\ + \a3(a:b)=\n\ + \ if nU b then a\n\ + \ else a3 b\n" + + +-- \a4 :: [[a]] -> [a]\n\ +concatString ∷ String +concatString = + "\ + \a4(a:b)=\n\ + \ if nU b then a\n\ + \ else a++a4 b\n" + + +headString ∷ String +headString = + "\ + \a5(a:_)=a\n" + + +-- \a5[]=error"BadHead"\n" + +tailString ∷ String +tailString = + "\ + \a6(_:b)=b\n" + + +-- \a7 :: Int -> Double -> Int -> [(Double, Double)] -> String -> [[Double]]\n\ +makeNeymanUniformMatrixString ∷ String +makeNeymanUniformMatrixString = + "\ + \a7 l j p o s=\n\ + \ let k=gM(a9 l j p)o\n\ + \ q=c3(+)0$gM hM k\n\ + \ c=c3(+)0$gM iM k\n\ + \ e=(-1)*b0 q p 0 0\n\ + \ f=(-1)*b0 c p 0 0\n\ + \ in m0 l e f 0 s\n" + + +-- \a8 :: Int -> Double -> Int -> [(Double, Double)] -> String -> [[Double]]\n\ +makeNeymanExponentialMatrixString ∷ String +makeNeymanExponentialMatrixString = + "\ + \a8 l j p o s=\n\ + \ let k=gM(z0 l j p)o\n\ + \ q=c3(+)0$gM hM k\n\ + \ c=c3(+)0$gM iM k\n\ + \ e=(-1)*b0 q p 0 0\n\ + \ f=(-1)*b0 c p 0 0\n\ + \ in m0 l e f 0 s\n" + + +--- \b7 :: [[Double]] -> [[Double]]\n\ +transposeMatrixString ∷ String +transposeMatrixString = + "\ + \b7 a=\n\ + \ if nU$a5 a then []\n\ + \ else gM a5 a:b7(gM a6 a)\n" + + +absString ∷ String +absString = + "\ + \c8 a=\n\ + \ if a<0 then -1*a\n\ + \ else a\n" + + +-- start at 0 +-- \d5 :: Int -> [a] -> Int\n\ +lengthString ∷ String +lengthString = + "\ + \d5 b a=\n\ + \ if nU a then b\n\ + \ else d5(b+1)(a6 a)\n" + + +-- \c0 :: [[Double]] -> [[Double]]\n\ +invertMatrixString ∷ String +invertMatrixString = + "\ + \c0 a=d8(1/(d7 a))(d6 a 1)\n" + + +-- \zipWith :: (Double -> Double -> Double) -> [Double] -> [Double] -> [Double]\n\ +zipWithString ∷ String +zipWithString = + "\ + \e5 f a b=\n\ + \ if nU a ||nU b then []\n\ + \ else f(a5 a)(a5 b):e5 f(a6 a)(a6 b)\n" + + +-- \b2 :: Int -> [Double] -> [[Double]]\n\ +split2MatrixString ∷ String +split2MatrixString = + "\ + \b2 a b=\n\ + \ if nU b then []\n\ + \ else (d3 0 a b):b2 a(d4 0 a b)\n" + + +-- \c4 :: [[Double]] -> [[Double]] -> [[Double]]\n\ +addMatricesString ∷ String +addMatricesString = + "\ + \c4 a b=\n\ + \ if nU a && nU b then []\n\ + \ else (e5(+)(a5 a)(a5 b)):c4(a6 a)(a6 b)\n" + + +-- \c3 :: (a -> b -> a) -> a -> [b] -> a\n\ +foldlString ∷ String +foldlString = + "\ + \c3 f a b=\n\ + \ if nU b then a\n\ + \ else c3 f(f a$a5 b)(a6 b)\n" + + +-- \c2 :: [Double] -> [[Double]] -> [[Double]]-> Int -> Int -> Double -> Double -> Int -> Int -> (Double -> Int -> Double -> Double) -> (Double, Double) -> [Double]\n\ +-- 2.0 is maxTime--should be String +integrateGTRMatrixWithKString ∷ String +integrateGTRMatrixWithKString = + "\ + \c2 a b c d e f g h i j(k,l)\n\ + \ |d==i=[]\n\ + \ |e==i=c2 a b c(d+1)0 f g h i j(k,l)\n\ + \ |k<" + ++ epsilonString + ++ "=0:(c2 a b c d(e+1)f g h i j(k,l))\n\ + \ |otherwise=(l*(e6 e7 j a b c d e f 2.0 h 0 k)):(c2 a b c d(e+1)f g h i j(k,l))\n" + + +-- \c5 :: [[Double]] -> Int -> Int -> [Double]\n\ +adjustSymString ∷ String +adjustSymString = + "\ + \c5 a b c=\n\ + \ let e=c5 a b(c+1)\n\ + \ in\n\ + \ if b==d5 0 a then []\n\ + \ else if c==d5 0 a then c5 a(b+1)0\n\ + \ else if c==b then ((a!!b)!!c):e\n\ + \ else\n\ + \ let d=((a!!b)!!c+(a!!c)!!b)/2\n\ + \ in\n\ + \ if d>0 then d:e\n\ + \ else " + ++ epsilonString + ++ ":e\n" + + +-- \c6 :: [[Double]] -> [[Double]] -> Int -> [[Double]]\n\ +adjustDiagString ∷ String +adjustDiagString = + "\ + \c6 a b c=\n\ + \ if c==d5 0 b then []\n\ + \ else\n\ + \ let e=a5 a\n\ + \ in ((d3 0 c e)++[1-((c3(+)0 e)-((b!!c)!!c))]++(d4 0(c+1)e)):c6(a6 a)b(c+1)\n" + + +-- \e7 :: [Double] -> [[Double]] -> [[Double]] -> Double -> Int -> Int -> Int -> Int -> Double\n\ +getPijString ∷ String +getPijString = + "\ + \e7 a b c d f g h i=\n\ + \ if f==d5 0 b then 0\n\ + \ else (e0((a!!f)*d)i 0*((b!!g)!!f)*((c!!f)!!h))+e7 a b c d(f+1)g h i\n" + + +-- removeColumn :: Int -> [[a]] -> [[a]]\n\ +removeColumnString ∷ String +removeColumnString = + "\ + \e9 a b=\n\ + \ if nU b then []\n\ + \ else\n\ + \ let c=a5 b\n\ + \ in ((d3 0(a-1)c)++(d4 0 a c)):(e9 a$a6 b)\n" + + +-- \f0 :: Int -> Int -> [[a]] -> [[a]]\n\ +removeRowAndColumnString ∷ String +removeRowAndColumnString = + "\ + \f0 a b c=\n\ + \ if nU c then []\n\ + \ else e9 b ((d3 0 (a -1) c) ++ (d4 0 a c))\n" + + +-- \d8 :: Double -> [[Double]] -> [[Double]]\n\ +matrixMultiplyScalarString ∷ String +matrixMultiplyScalarString = + "\ + \d8 a b=\n\ + \ if nU b then []\n\ + \ else gM(a *)(a5 b):d8 a(a6 b)\n" + + +-- \d7 :: [[Double]] -> Double\n\ +determinantNumericalString ∷ String +determinantNumericalString = + "\ + \d7 e=\n\ + \ if (d5 0$a5 e)==2 then\n\ + \ let [a,b]=a5 e\n\ + \ [c,d]=a3 e\n\ + \ in a*d-b*c\n\ + \ else f2 1 e\n" + + +-- \f2 :: Int -> [[Double]] -> Double\n\ +getCofactor1String ∷ String +getCofactor1String = + "\ + \f2 a b=\n\ + \ if (a>(d5 0$a5 b)) then 0\n\ + \ else (((a5 b)!!(a-1))*(p1(-1)(1+a)0)*(d7(e9 a(a6 b))))+f2(a+1)b\n" + + +-- \d6 :: [[Double]] -> Int -> [[Double]]\n\ +cofactorTMatrixString ∷ String +cofactorTMatrixString = + "\ + \d6 a b=\n\ + \ if b>(d5 0$a5 a) then []\n\ + \ else f3 b 1 a:d6 a(b+1)\n" + + +-- \e8 :: Double -> Int -> Double -> Double\n\ +getUniformPdfString ∷ String +getUniformPdfString = + "\ + \e8 a b c=1/a\n" + + +-- numerical issue? +-- a*exp (-1*a*c)\n" +-- gM[a*(e(-1 *a*c)b 0)]0\n" +-- \h1 a b c=a*(e(-1*a*c)b 0)\n" +-- \getExponentialPdf :: Double -> Int -> Double -> Double\n\ +getExponentialPdfString ∷ String +getExponentialPdfString = + "\ + \h1 a b c=a*(e0(-1*a*c)b 0)\n" + + +-- \f7 :: (a -> Bool) -> [a] -> [a]\n\ +takeWhileString ∷ String +takeWhileString = + "\ + \f7 f a\n\ + \ |nU a=[]\n\ + \ |(not$f(a5 a))=[]\n\ + \ |otherwise=(a5 a):(f7 f(a6 a))\n" + + +{- +\f7 f a=\n\ +\ if nU a then []\n\ +\ else if (not$f(a5 a)) then []\n\ +\ else (a5 a):(f7 f(a6 a))\n" +-} + +-- \b1 :: [[Double]] -> [[Double]]\n\ +regularizeRString ∷ String +regularizeRString = + "\ + \b1 a=\n\ + \ let b=fromIntegral$d5 0 a\n\ + \ in gM(gM(*((((b*b)-b)/2)/(c3(+)0(gM(c3(+)0)(gM(f7(>0))a))))))a\n" + + +-- need to start with 0 as first arg +-- \d3 :: Int -> Int -> [a] -> [a]\n\ +takeString ∷ String +takeString = + "\ + \d3 m n a\n\ + \ |nU a=[]\n\ + \ |m==n=[]\n\ + \ |otherwise=a5 a:d3(m+1)n(a6 a)\n" + + +{- +\d3 m n a=\n\ +\ if nU a then []\n\ +\ else if m==n then []\n\ +\ else a5 a:d3(m+1)n(a6 a)\n" +-} + +-- need to start with first arg 0 +-- \d4 :: Int -> Int -> [a] -> [a]\n\ +dropString ∷ String +dropString = + "\ + \d4 m n a\n\ + \ |nU a=[]\n\ + \ |m==n=a\n\ + \ |otherwise=d4(m+1)n(a6 a)\n" + + +{- +\d4 m n a=\n\ +\ if nU a then []\n\ +\ else if m==n then a\n\ +\ else d4(m+1)n(a6 a)\n" +-} + +{- +-- \reverse :: [a] -> [a]\n\ +reverseString :: String +reverseString="\ +\f8 a=\n\ +\ if nU a then []\n\ +\ else\n\ +\ a3 a:f8(f6 a)\n" +-} + +-- \b4 :: [[Double]] -> Int -> Int -> Int -> [Double]\n\ +addDiagValuesString ∷ String +addDiagValuesString = + "\ + \b4 a b c e\n\ + \ |c==b=[]\n\ + \ |e==b=b4 a b(c+1)0\n\ + \ |c/=e=((a!!c)!!e):b4 a b c(e+1)\n\ + \ |otherwise=(-1 *(c3(+)0(a!!c))):b4 a b c(e+1)\n" + + +{- +\b4 a b c e=\n\ +\ if c==b then []\n\ +\ else if e==b then b4 a b(c+1)0\n\ +\ else if c/=e then ((a!!c)!!e):b4 a b c(e+1)\n\ +\ else (-1 *(c3(+)0(a!!c))):b4 a b c(e+1)\n" +-} + +-- \b3 :: [[Double]] -> [Double] -> Int -> Int -> Int -> [Double]\n\ +makeQString ∷ String +makeQString = + "\ + \b3 a b c d e\n\ + \ |d==c=[]\n\ + \ |e==c=b3 a b c(d+1)0\n\ + \ |d==e=0:b3 a b c d(e+1)\n\ + \ |otherwise=((a!!d)!!e)*(b!!e):b3 a b c d(e+1)\n" + + +{- +\b3 a b c d e=\n\ +\ if d==c then []\n\ +\ else if e==c then b3 a b c(d+1)0\n\ +\ else if d==e then 0:b3 a b c d(e+1)\n\ +\ else ((a!!d)!!e)*(b!!e):b3 a b c d(e+1)\n" +-} + +-- \f3 :: Int -> Int -> [[Double]] -> [Double]\n\ +getRowString ∷ String +getRowString = + "\ + \f3 a b c=\n\ + \ if b>(d5 0$a5 c) then []\n\ + \ else (p1(-1)(b+a)0)*(d7$f0 b a c):f3 a(b+1)c\n" + + +-- /f9 :: Double -> Int -> Int -> Double +-- value iterations counter (start at 0) +sqrtString ∷ String +sqrtString = + "\ + \f9 x i c=e0(0.5*l0 x i c 0)i c\n" + + +-- \g0 ::(Ord a)=> [a] -> a -> a\n\ +maximumString ∷ String +maximumString = + "\ + \g0 a c\n\ + \ |nU a=c\n\ + \ |c>a5 a=g0(a6 a)c\n\ + \ |otherwise=g0(a6 a)(a5 a)\n" + + +{- +\g0 a c=\n\ +\ if nU a then c\n\ +\ else if c>a5 a then g0(a6 a)c\n\ +\ else g0(a6 a)(a5 a)\n" +-} + +-- \g1 ::(Ord a)=> [a] -> a -> a\n\ +minimumString ∷ String +minimumString = + "\ + \g1 a c\n\ + \ |nU a=c\n\ + \ |c [Double]-> [Double]\n\ +normalizeVectorWithSignString ∷ String +normalizeVectorWithSignString = + "\ + \b8 i a=\n\ + \ let s=f9(c3(+)0$e5(*)a a)i 0\n\ + \ in\n\ + \ if g0(a6 a)(a5 a)>=(c8$g1(a6 a)(a5 a)) then gM(/s)a\n\ + \ else gM(/(-1 *s))a\n" + + +-- \g2 :: (a -> Bool) -> [a] -> [a]\n\ +filterString ∷ String +filterString = + "\ + \g2 f b\n\ + \ |nU b=[]\n\ + \ |f(a5 b)=(a5 b):g2 f(a6 b)\n\ + \ |otherwise=g2 f(a6 b)\n" + + +-- \g3 :: [[Double]] -> Int -> [[Double]]\n\ +reduceRowString ∷ String +reduceRowString = + "\ + \g3 c r=\n\ + \ let d=(g4 c r$a5$g2(\\x -> c!!x!!r /=0)[r..(d5 0 c)-1])\n\ + \ e=d!!r\n\ + \ f=g(\\x -> x/(e!!r))e\n\ + \ h nr=let k=nr!!r in e5(\\a b -> k*a -b)f nr\n\ + \ i=g h$d4 0(r+1)d\n\ + \ in d3 0 r d++[f]++i\n" + + +-- \fixlastrow :: [[Double]] -> [[Double]]\n\ +fixlastrowString ∷ String +fixlastrowString = + "\ + \g5 b=(f6 b)++[f6(f6(a3 b))++[1,(a3(a3 b))/a3(f6(a3 b))]]\n" + + +-- \substitute :: [[Double]] -> [Double]\n\ +substituteString ∷ String +substituteString = + "\ + \g6 a=h0 b [a3(a3 a)] (f6 a) where\n\ + \ b c e=a3 c -c3(+)0(e5(*)e$f6$d4 0(d5 0 a -d5 0 e)c):e\n" + + +-- \g9 :: Int -> [a] -> ([a],[a])\n\ +splitAtString ∷ String +splitAtString = + "\ + \g9 a b=(d3 0 a b,d4 0 a b)\n" + + +-- \h0 :: (a -> b -> b) -> b -> [a] -> b\n\ +foldrString ∷ String +foldrString = + "\ + \h0 f b a=\n\ + \ if nU a then b\n\ + \ else h0 f (f(a5 a)b) (a6 a)\n" + + +-- error \"\"\n\ if maxiterations exceeded precision issue? +-- \h3 :: [[Double]] -> [[Double]] -> Int -> Int -> ([[Double]], [[Double]], [[Double]])\n\ +qrFactorizationString ∷ String +qrFactorizationString = + "\ + \h3 a u c i=\n\ + \ let (q,r)=h4 a i\n\ + \ n=h2 r q\n\ + \ o=h2 u q\n\ + \ l=a4 a\n\ + \ m=a4 n\n\ + \ in\n\ + \ if (((c3(+)0$gM c8$e5(-)l m)/fromIntegral (d5 0 a * d5 0 a))<" + ++ epsilonString + ++ ") ||(c<" + ++ maxIterationsString + ++ ") then (q,r,o)\n\ + \ else h3 n o (c+1)i\n" + + +-- \h5 :: [[Double]] -> Int -> [Double]\n\ +getDiagValuesString ∷ String +getDiagValuesString = + "\ + \h5 i c\n\ + \ |nU i=error \"\"\n\ + \ |c==d5 0 i=[]\n\ + \ |otherwise=((i!!c)!!c):h5 i(c+1)\n" + + +{- +\h5 i c=\n\ +\ if nU i then error \"\"\n\ +\ else if c==d5 0 i then []\n\ +\ else ((i!!c)!!c):h5 i (c+1)\n" +-} + +-- \h4 :: [[Double]] -> Int -> ([[Double]], [[Double]])\n\ +qrDecompositionString ∷ String +qrDecompositionString = + "\ + \h4 a i=\n\ + \ let c=gM b7(h6 a a(d5 0 a)0 i)\n\ + \ d=c3 h2(a5 c)(a6 c)\n\ + \ in (d,h2(b7 d)a)\n" + + +-- \h2 :: [[Double]] -> [[Double]] -> [[Double]]\n\ +matrixMultiplyString ∷ String +matrixMultiplyString = + "\ + \h2 a b=h7 0(d5 0 a)(d5 0$a5 b)a b\n" + + +-- \h6 :: [[Double]] -> [[Double]] -> Int -> Int -> Int -> [[[Double]]]\n\ +getHouseholderListString ∷ String +getHouseholderListString = + "\ + \h6 a b d c i=\n\ + \ let j=h8 a i\n\ + \ k=h9 j d\n\ + \ in\n\ + \ if d5 0 j==2 then [k]\n\ + \ else k:h6(i0 c c(h2 k b))b d(c+1)i\n" + + +-- \h8 :: [[Double]]-> Int -> [[Double]]\n\ +getHouseholderString ∷ String +getHouseholderString = + "\ + \h8 a i=\n\ + \ let v=i2 0 a\n\ + \ e=i5(i4 v(d8(i3(a4 v)i)(b7[i1(d5 0 a)0 1])))i\n\ + \ in i4(i6(d5 0 a)1)(d8 2$h2 e(b7 e))\n" + + +-- \h9 :: [[Double]] ->Int -> [[Double]]\n\ +padOutMinorString ∷ String +padOutMinorString = + "\ + \h9 a d=\n\ + \ if nU a then i6 d 1 --identity matrix\n\ + \ else\n\ + \ let m=d5 0 a\n\ + \ in\n\ + \ if m==d then a\n\ + \ else\n\ + \ let r=d-m\n\ + \ l=i6 d 1\n\ + \ in d3 0 r l++e5(++)(gM(d3 0 r)$d4 0 r l)a\n" + + +-- \i0 :: Int -> Int -> [[Double]] -> [[Double]]\n\ +makeMatrixMinorString ∷ String +makeMatrixMinorString = + "\ + \i0 r c a=gM(d4 0(c+1))(d4 0(r+1)a)\n" + + +-- \i6 :: Int -> Double -> [[Double]]\n\ +makeDiagMatrixString ∷ String +makeDiagMatrixString = + "\ + \i6 d v=i7 d v 0\n" + + +-- \i2 :: Int -> [[Double]] -> [[Double]]\n\ +getColumnVectorString ∷ String +getColumnVectorString = + "\ + \i2 c a=gM(:[])$gM(!!c)a\n" + + +-- \i1 :: Int -> Int -> Double -> [Double]\n\ +makeEVectorString ∷ String +makeEVectorString = + "\ + \i1 d p v=r0 p 0 0++[v]++r0(d-p-1)0 0\n" + + +-- \i4 :: [[Double]] -> [[Double]] -> [[Double]]\n\ +subtractMatricesString ∷ String +subtractMatricesString = + "\ + \i4 a b=\n\ + \ if nU a && nU b then []\n\ + \ else e5(-)(a5 a)(a5 b):i4(a6 a)(a6 b)\n" + + +-- \i5 :: [[Double]] -> Int -> [[Double]]\n\ +normalizeColumnVectorString ∷ String +normalizeColumnVectorString = + "\ + \i5 c i=gM(:[])(i8(a4 c)i)\n" + + +-- \i7 :: Int -> Double -> Int -> [[Double]]\n\ +makeDiagRowString ∷ String +makeDiagRowString = + "\ + \i7 d v c=\n\ + \ if c==d then []\n\ + \ else (r0 c 0 0++[v]++r0(d-c-1)0 0):i7 d v(c+1)\n" + + +-- \h7 :: Int -> Int -> Int -> [[Double]] -> [[Double]] -> [[Double]]\n\ +getRowsString ∷ String +getRowsString = + "\ + \h7 c a b d e=\n\ + \ if c==a then []\n\ + \ else i9 c 0 b d e:h7(c+1)a b d e\n" + + +-- \i9 :: Int -> Int -> Int -> [[Double]] -> [[Double]] -> [Double]\n\ +getElementString ∷ String +getElementString = + "\ + \i9 a c b d e=\n\ + \ if c==b then []\n\ + \ else c3(+)0(e5(*)(d!!a)(gM(!!c)e)):i9 a(c+1)b d e\n" + + +-- \i8 :: [Double] -> Int -> [Double]\n\ +normalizeVectorString ∷ String +normalizeVectorString = + "\ + \i8 a i=gM(/i3 a i)a\n" + + +-- \i3 :: [Double] -> Int -> Double\n\ +euclidNormString ∷ String +euclidNormString = + "\ + \i3 a i=f9(c3(+)0(e5(*)a a))i 0\n" + + +-- \j0 :: [[Double]] -> Int -> Int -> Int -> [Double]\n\ +isPosRString ∷ String +isPosRString = + "\ + \j0 i a r c\n\ + \ |r==a=[]\n\ + \ |c==a=j0 i a(r+1)0\n\ + \ |r==c=0:j0 i a r(c+1)\n\ + \ |otherwise=((i!!r)!!c):j0 i a r(c+1)\n" + + +{- +\j0 i a r c=\n\ +\ if r==a then []\n\ +\ else if c==a then j0 i a(r+1)0\n\ +\ else if r==c then 0:j0 i a r(c+1)\n\ +\ else ((i!!r)!!c):j0 i a r(c+1)\n" +-} + +-- \makeGTRMatrixLocal :: Int -> [[Double]] -> [Double]-> Int -> ([Double], [[Double]], [[Double]])\n\ +-- \a0 :: Int -> [[Double]] -> [Double]-> Int -> ([Double], [[Double]], [[Double]])\n\ +makeGTRMatrixLocalString ∷ String +makeGTRMatrixLocalString = + "\ + \a0 a r p i=\n\ + \ let q=(b2 a$b4(b2 a$b3(b1(b2 a$j0 r a 0 0))p a 0 0)a 0 0)\n\ + \ (j,k,l)=h3 q(i6(d5 0 q)1)0 i\n\ + \ in (h5(h2 k j)0,l,c0 l)\n" + + +-- \c1 :: (Double -> Int-> Int -> Double -> Double) -> String -> [Double] -> [[Double]] -> [[Double]] -> Int -> Double -> Double -> Int -> (Double -> Int -> Double -> Double) -> [(Double, Double)] -> [[Double]]\n\ +makeGTRLogMatrixString ∷ String +makeGTRLogMatrixString = + "\ + \c1 l m e u v a p x i d f=\n\ + \ let y=b2 a$c5(c3 c4(r0 a(r0 a 0.0 0)0)(gM(b2 a)$gM(c2 e u v 0 0 p x i a d)f))0 0\n\ + \ in b2 a$gM(*(-1))$c7 l(c6 y y 0)a m 0 0 i\n" + + +-- \makeGTRLogMatrix4State :: (Double -> Int-> Int -> Double -> Double) -> ([Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]) -> Double -> Int -> [Double] -> [Double] -> [(Double, Double)] -> [[Double]]\n +makeGTRLogMatrix4StateString ∷ String +makeGTRLogMatrix4StateString = + "\ + \j1 l m p i o q f=\n\ + \ let y=b2 4$c5 (c3 c4(r0 4(r0 4 0.0 0)0)(gM(m o q p i)f)) 0 0\n\ + \ in b2 4$gM(* (-1))$c7 l(c6 y y 0)4 \"T\" 0 0 i\n" + + +-- \c7 :: (Double -> Int-> Int -> Double -> Double) -> [[Double]] -> Int -> String -> Int -> Int -> Int -> [Double]\n\ +getLogMatrixString ∷ String +getLogMatrixString = + "\ + \c7 l m a e r c i\n\ + \ |r==a=[]\n\ + \ |c==a=c7 l m a e(r+1)0 i\n\ + \ |(r==(a-1))&&(c==(a-1))=if e==\"-\" then [0] else (l((m!!r)!!c)i 0 0):c7 l m a e r(c+1)i\n\ + \ |otherwise=(l((m!!r)!!c)i 0 0):c7 l m a e r(c+1)i\n" + + +{- +\c7 l m a e r c i=\n\ +\ if r==a then []\n\ +\ else if c==a then c7 l m a e(r+1)0 i\n\ +\ else if (r==(a-1))&&(c==(a-1)) then\n\ +\ if e==\"-\" then [0]\n\ +\ else (l((m!!r)!!c)i 0 0):c7 l m a e r(c+1)i\n\ +\ else (l((m!!r)!!c)i 0 0):c7 l m a e r(c+1)i\n" +-} + +-- \e6 :: ([Double] -> [[Double]] -> [[Double]] -> Double -> Int -> Int -> Int -> Int -> Double) -> (Double -> Int -> Double -> Double) -> [Double] -> [[Double]] -> [[Double]]-> Int -> Int -> Double -> Double -> Int -> Int -> Double -> Double\n\ +trapezoidIntegrationString ∷ String +trapezoidIntegrationString = + "\ + \e6 f s e u v r j p x i c k=\n\ + \ if c==2*i then 0\n\ + \ else\n\ + \ let l=x/fromIntegral(2*i)\n\ + \ t=fromIntegral c*l\n\ + \ in (l*((f e u v(t*k)0 r j i*s p i t)+(f e u v((t*k)+l)0 r j i*s p i (t+l)))/2)+e6 f s e u v r j p x i(c+1)k\n" + + +{- +The variable names with letter number and the 'l' like k2l are to avoid name shadowing with functions. +some simpler models have unused parasm to allow for function as parameter +-} + +-- \tn93ExponentialWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +tn93ExponentialWithKString ∷ String +tn93ExponentialWithKString = + "\ + \j2 [a,c,b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ f1l=(pY*a+pR*b)\n\ + \ f2l=(pR*c+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ m2l=x*x\n\ + \ km=x*w\n\ + \ k2l=w*w\n\ + \ p00=(((d0l+c0l+pA)*m2l)+((((c0l+pA)*f1l)+(b*d0l)+(pA *b))*km)+(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p01=pC*b*w/(x+(b*w))\n\ + \ p02=(-1)*(((d0l-c2l-pG)*m2l)+(((((-1*c2l)-pG)*f1l)+(b*d0l)-(pG *b))*km)-(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p03=pT*b*w/(x+(b*w))\n\ + \ p10=pA*b*w/(x+(b*w))\n\ + \ p11=(((d1l+c1l+pC)*m2l)+((((c1l+pC)*f2l)+(b*d1l)+(pC *b))*km)+(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p12=pG*b*w/(x+(b*w))\n\ + \ p13=(-1)*(((d1l-c3l-pT)*m2l)+(((((-1*c3l)-pT)*f2l)+(b*d1l)-(pT *b))*km)-(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p20=(-1)*(((d2l-c0l-pA)*m2l)+(((((-1*c0l)-pA)*f1l)+(b*d2l)-(pA *b))*km)-(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p21=pC*b*w/(x+(b*w))\n\ + \ p22=(((d2l+c2l+pG)*m2l)+((((c2l+pG)*f1l)+(b*d2l)+(pG *b))*km)+(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p23=pT*b*w/(x+(b*w))\n\ + \ p30=pA*b*w/(x+(b*w))\n\ + \ p31=(-1)*(((d3l-c1l-pC)*m2l)+(((((-1*c1l)-pC)*f2l)+(b*d3l)-(pC *b))*km)-(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p32=pG*b*w/(x+(b*w))\n\ + \ p33=(((d3l+c3l+pT)*m2l)+((((c3l+pT)*f2l)+(b*d3l)+(pT *b))*km)+(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \tn93UniformWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +tn93UniformWithKString ∷ String +tn93UniformWithKString = + "\ + \j3 [a,c,b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ f1l=(pY*a+pR*b)\n\ + \ f2l=(pR*c+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ ebkm=e0(b*w*x)i 0\n\ + \ enbkm=e0(-1*b*w*x)i 0\n\ + \ enf1lkm=e0(-1*f1l*w*x)i 0\n\ + \ enf2lkm=e0(-1*f2l*w*x)i 0\n\ + \ bkmfactor=(enbkm+((b*w*x)-1))/(b*w*x)\n\ + \ p00=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)+(b*d0l)))-(c0l*f1l)))-(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p01=pC*bkmfactor\n\ + \ p02=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)-(b*d0l)))-(c2l*f1l)))+(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p03=pT*bkmfactor\n\ + \ p10=pA*bkmfactor\n\ + \ p11=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)+(b*d1l)))-(c1l*f2l)))-(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p12=pG*bkmfactor\n\ + \ p13=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)-(b*d1l)))-(c3l*f2l)))+(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p20=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)-(b*d2l)))-(c0l*f1l)))+(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p21=pC*bkmfactor\n\ + \ p22=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)+(b*d2l)))-(c2l*f1l)))-(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p23=pG*bkmfactor\n\ + \ p30=pA*bkmfactor\n\ + \ p31=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)-(b*d3l)))-(c1l*f2l)))+(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ p32=pG*bkmfactor\n\ + \ p33=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)+(b*d3l)))-(c3l*f2l)))-(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \f84ExponentialWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +f84ExponentialWithKString ∷ String +f84ExponentialWithKString = + "\ + \j4 [p, b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ a=(1+(p/pY))*b\n\ + \ c=(1+(p/pR))*b\n\ + \ f1l=(pY*a+pR*b)\n\ + \ f2l=(pR*c+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ m2l=x*x\n\ + \ km=x*w\n\ + \ k2l=w*w\n\ + \ p00=(((d0l+c0l+pA)*m2l)+((((c0l+pA)*f1l)+(b*d0l)+(pA *b))*km)+(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p01=pC*b*w/(x+(b*w))\n\ + \ p02=(-1)*(((d0l-c2l-pG)*m2l)+(((((-1*c2l)-pG)*f1l)+(b*d0l)-(pG *b))*km)-(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p03=pT*b*w/(x+(b*w))\n\ + \ p10=pA*b*w/(x+(b*w))\n\ + \ p11=(((d1l+c1l+pC)*m2l)+((((c1l+pC)*f2l)+(b*d1l)+(pC *b))*km)+(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p12=pG*b*w/(x+(b*w))\n\ + \ p13=(-1)*(((d1l-c3l-pT)*m2l)+(((((-1*c3l)-pT)*f2l)+(b*d1l)-(pT *b))*km)-(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p20=(-1)*(((d2l-c0l-pA)*m2l)+(((((-1*c0l)-pA)*f1l)+(b*d2l)-(pA *b))*km)-(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p21=pC*b*w/(x+(b*w))\n\ + \ p22=(((d2l+c2l+pG)*m2l)+((((c2l+pG)*f1l)+(b*d2l)+(pG *b))*km)+(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p23=pT*b*w/(x+(b*w))\n\ + \ p30=pA*b*w/(x+(b*w))\n\ + \ p31=(-1)*(((d3l-c1l-pC)*m2l)+(((((-1*c1l)-pC)*f2l)+(b*d3l)-(pC *b))*km)-(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p32=pG*b*w/(x+(b*w))\n\ + \ p33=(((d3l+c3l+pT)*m2l)+((((c3l+pT)*f2l)+(b*d3l)+(pT *b))*km)+(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \f84UniformWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +f84UniformWithKString ∷ String +f84UniformWithKString = + "\ + \j5 [p, b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ a=(1+(p/pY))*b\n\ + \ c=(1+(p/pR))*b\n\ + \ f1l=(pY*a+pR*b)\n\ + \ f2l=(pR*c+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ ebkm=e0(b*w*x)i 0\n\ + \ enbkm=e0(-1*b*w*x)i 0\n\ + \ enf1lkm=e0(-1*f1l*w*x)i 0\n\ + \ enf2lkm=e0(-1*f2l*w*x)i 0\n\ + \ bkmfactor=(enbkm+((b*w*x)-1))/(b*w*x)\n\ + \ p00=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)+(b*d0l)))-(c0l*f1l)))-(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p01=pC*bkmfactor\n\ + \ p02=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)-(b*d0l)))-(c2l*f1l)))+(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p03=pT*bkmfactor\n\ + \ p10=pA*bkmfactor\n\ + \ p11=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)+(b*d1l)))-(c1l*f2l)))-(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p12=pG*bkmfactor\n\ + \ p13=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)-(b*d1l)))-(c3l*f2l)))+(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p20=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)-(b*d2l)))-(c0l*f1l)))+(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p21=pC*bkmfactor\n\ + \ p22=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)+(b*d2l)))-(c2l*f1l)))-(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p23=pG*bkmfactor\n\ + \ p30=pA*bkmfactor\n\ + \ p31=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)-(b*d3l)))-(c1l*f2l)))+(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ p32=pG*bkmfactor\n\ + \ p33=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)+(b*d3l)))-(c3l*f2l)))-(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \hky85ExponentialWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +hky85ExponentialWithKString ∷ String +hky85ExponentialWithKString = + "\ + \j6 [h, b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ f1l=(pY*h+pR*b)\n\ + \ f2l=(pR*h+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ m2l=x*x\n\ + \ km=x*w\n\ + \ k2l=w*w\n\ + \ p00=(((d0l+c0l+pA)*m2l)+((((c0l+pA)*f1l)+(b*d0l)+(pA *b))*km)+(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p01=pC*b*w/(x+(b*w))\n\ + \ p02=(-1)*(((d0l-c2l-pG)*m2l)+(((((-1*c2l)-pG)*f1l)+(b*d0l)-(pG *b))*km)-(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p03=pT*b*w/(x+(b*w))\n\ + \ p10=pA*b*w/(x+(b*w))\n\ + \ p11=(((d1l+c1l+pC)*m2l)+((((c1l+pC)*f2l)+(b*d1l)+(pC *b))*km)+(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p12=pG*b*w/(x+(b*w))\n\ + \ p13=(-1)*(((d1l-c3l-pT)*m2l)+(((((-1*c3l)-pT)*f2l)+(b*d1l)-(pT *b))*km)-(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p20=(-1)*(((d2l-c0l-pA)*m2l)+(((((-1*c0l)-pA)*f1l)+(b*d2l)-(pA *b))*km)-(pA*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p21=pC*b*w/(x+(b*w))\n\ + \ p22=(((d2l+c2l+pG)*m2l)+((((c2l+pG)*f1l)+(b*d2l)+(pG *b))*km)+(pG*b*f1l*k2l))/((x+(b*w))*(x+(f1l*w)))\n\ + \ p23=pT*b*w/(x+(b*w))\n\ + \ p30=pA*b*w/(x+(b*w))\n\ + \ p31=(-1)*(((d3l-c1l-pC)*m2l)+(((((-1*c1l)-pC)*f2l)+(b*d3l)-(pC *b))*km)-(pC*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ p32=pG*b*w/(x+(b*w))\n\ + \ p33=(((d3l+c3l+pT)*m2l)+((((c3l+pT)*f2l)+(b*d3l)+(pT *b))*km)+(pT*b*f2l*k2l))/((x+(b*w))*(x+(f2l*w)))\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \hky85UniformWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +hky85UniformWithKString ∷ String +hky85UniformWithKString = + "\ + \j7 [h, b] [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ f1l=(pY*h+pR*b)\n\ + \ f2l=(pR*h+pY*b)\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ ebkm=e0(b*w*x)i 0\n\ + \ enbkm=e0(-1*b*w*x)i 0\n\ + \ enf1lkm=e0(-1*f1l*w*x)i 0\n\ + \ enf2lkm=e0(-1*f2l*w*x)i 0\n\ + \ bkmfactor=(enbkm+((b*w*x)-1))/(b*w*x)\n\ + \ p00=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)+(b*d0l)))-(c0l*f1l)))-(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p01=pC*bkmfactor\n\ + \ p02=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)-(b*d0l)))-(c2l*f1l)))+(b*d0l*enf1lkm))/(b*f1l*w*x)\n\ + \ p03=pT*bkmfactor\n\ + \ p10=pA*bkmfactor\n\ + \ p11=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)+(b*d1l)))-(c1l*f2l)))-(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p12=pG*bkmfactor\n\ + \ p13=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)-(b*d1l)))-(c3l*f2l)))+(b*d1l*enf2lkm))/(b*f2l*w*x)\n\ + \ p20=((enbkm*((ebkm*((pA*b*f1l*w*x)+(c0l*f1l)-(b*d2l)))-(c0l*f1l)))+(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p21=pC*bkmfactor\n\ + \ p22=((enbkm*((ebkm*((pG*b*f1l*w*x)+(c2l*f1l)+(b*d2l)))-(c2l*f1l)))-(b*d2l*enf1lkm))/(b*f1l*w*x)\n\ + \ p23=pG*bkmfactor\n\ + \ p30=pA*bkmfactor\n\ + \ p31=((enbkm*((ebkm*((pC*b*f2l*w*x)+(c1l*f2l)-(b*d3l)))-(c1l*f2l)))+(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ p32=pG*bkmfactor\n\ + \ p33=((enbkm*((ebkm*((pT*b*f2l*w*x)+(c3l*f2l)+(b*d3l)))-(c3l*f2l)))-(b*d3l*enf2lkm))/(b*f2l*w*x)\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \f81ExponentialWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +f81ExponentialWithKString ∷ String +f81ExponentialWithKString = + "\ + \j8 z [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ m2l=x*x\n\ + \ km=x*w\n\ + \ k2l=w*w\n\ + \ p00=(((d0l+c0l+pA)*m2l)+((((c0l+pA))+d0l+pA)*km)+(pA*k2l))/((x+w)*(x+w))\n\ + \ p01=pC*w/(x+w)\n\ + \ p02=(-1)*(((d0l-c2l-pG)*m2l)+(((((-1*c2l)-pG))+d0l-pG)*km)-(pG*k2l))/((x+w)*(x+w))\n\ + \ p03=pT*w/(x+w)\n\ + \ p10=pA*w/(x+w)\n\ + \ p11=(((d1l+c1l+pC)*m2l)+((((c1l+pC))+d1l+pC)*km)+(pC*k2l))/((x+w)*(x+w))\n\ + \ p12=pG*w/(x+w)\n\ + \ p13=(-1)*(((d1l-c3l-pT)*m2l)+(((((-1*c3l)-pT))+d1l-pT)*km)-(pT*k2l))/((x+w)*(x+w))\n\ + \ p20=(-1)*(((d2l-c0l-pA)*m2l)+(((((-1*c0l)-pA))+(d2l)-pA)*km)-(pA*k2l))/((x+w)*(x+w))\n\ + \ p21=pC*w/(x+w)\n\ + \ p22=(((d2l+c2l+pG)*m2l)+((((c2l+pG))+(d2l)+pG)*km)+(pG*k2l))/((x+w)*(x+w))\n\ + \ p23=pT*w/(x+w)\n\ + \ p30=pA*w/(x+w)\n\ + \ p31=(-1)*(((d3l-c1l-pC)*m2l)+(((((-1*c1l)-pC))+(d3l)-pC)*km)-(pC*k2l))/((x+w)*(x+w))\n\ + \ p32=pG*w/(x+w)\n\ + \ p33=(((d3l+c3l+pT)*m2l)+((((c3l+pT))+(d3l)+pT)*km)+(pT*k2l))/((x+w)*(x+w))\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \f81UniformWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +f81UniformWithKString ∷ String +f81UniformWithKString = + "\ + \j9 z [pA,pC,pG,pT] x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let pR=pA+pG\n\ + \ pY=pC+pT\n\ + \ c0l=pA*pY/pR\n\ + \ c1l=pC*pR/pY\n\ + \ c2l=pG*pY/pR\n\ + \ c3l=pT*pR/pY\n\ + \ d0l=pG/pR\n\ + \ d1l=pT/pY\n\ + \ d2l=pA/pR\n\ + \ d3l=pC/pY\n\ + \ ebkm=e0(w*x)i 0\n\ + \ enbkm=e0(-1*w*x)i 0\n\ + \ enf1lkm=e0(-1*w*x)i 0\n\ + \ enf2lkm=e0(-1*w*x)i 0\n\ + \ bkmfactor=(enbkm+((w*x)-1))/(w*x)\n\ + \ p00=((enbkm*((ebkm*((pA*w*x)+(c0l)+d0l))-(c0l)))-(d0l*enf1lkm))/(w*x)\n\ + \ p01=pC*bkmfactor\n\ + \ p02=((enbkm*((ebkm*((pG*w*x)+(c2l)-d0l))-(c2l)))+(d0l*enf1lkm))/(w*x)\n\ + \ p03=pT*bkmfactor\n\ + \ p10=pA*bkmfactor\n\ + \ p11=((enbkm*((ebkm*((pC*w*x)+(c1l)+d1l))-(c1l)))-(d1l*enf2lkm))/(w*x)\n\ + \ p12=pG*bkmfactor\n\ + \ p13=((enbkm*((ebkm*((pT*w*x)+(c3l)-d1l))-(c3l)))+(d1l*enf2lkm))/(w*x)\n\ + \ p20=((enbkm*((ebkm*((pA*w*x)+(c0l)-(d2l)))-(c0l)))+(d2l*enf1lkm))/(w*x)\n\ + \ p21=pC*bkmfactor\n\ + \ p22=((enbkm*((ebkm*((pG*w*x)+(c2l)+(d2l)))-(c2l)))-(d2l*enf1lkm))/(w*x)\n\ + \ p23=pG*bkmfactor\n\ + \ p30=pA*bkmfactor\n\ + \ p31=((enbkm*((ebkm*((pC*w*x)+(c1l)-(d3l)))-(c1l)))+(d3l*enf2lkm))/(w*x)\n\ + \ p32=pG*bkmfactor\n\ + \ p33=((enbkm*((ebkm*((pT*w*x)+(c3l)+(d3l)))-(c3l)))-(d3l*enf2lkm))/(w*x)\n\ + \ in d8 k [[p00,p01,p02,p03],[p10,p11,p12,p13],[p20,p21,p22,p23],[p30,p31,p32,p33]]\n" + + +-- \k80ExponentialWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +k80ExponentialWithKString ∷ String +k80ExponentialWithKString = + "\ + \k0 [h, b] z x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let f=(h +b)/2\n\ + \ c=0.25\n\ + \ d=0.5\n\ + \ m2l=x*x\n\ + \ km=x*w\n\ + \ k2l=w*w\n\ + \ o=0.25*b*w/(x+(b*w))\n\ + \ p=(((d+c+0.25)*m2l)+((((c+0.25)*f)+(b*d)+(0.25 *b))*km)+(0.25*b*f*k2l))/((x+(b*w))*(x+(f*w)))\n\ + \ m=(-1)*(((d-c-0.25)*m2l)+(((((-1*c)-0.25)*f)+(b*d)-(0.25*b))*km)-(0.25*b*f*k2l))/((x+(b*w))*(x+(f*w)))\n\ + \ in d8 k [[p,o,m,o],[o,p,o,m],[m,o,p,o],[o,m,o,p]]\n" + + +-- \k80UniformWithK :: [Double] -> [Double] -> Double -> Int -> (Double, Double) -> [[Double]]\n\ +k80UniformWithKString ∷ String +k80UniformWithKString = + "\ + \k1 [h, b] z x i (w,k)=\n\ + \ if w<" + ++ epsilonString + ++ " then [[k,0,0,0],[0,k,0,0],[0,0,k,0],[0,0,0,k]]\n\ + \ else\n\ + \ let f=(h +b)/2\n\ + \ c=0.25\n\ + \ d=0.5\n\ + \ ebkm=e0(b*w*x)i 0\n\ + \ enbkm=e0(-1*b*w*x)i 0\n\ + \ enfkm=e0(-1*f*w*x)i 0\n\ + \ o=0.25*(enbkm+((b*w*x)-1))/(b*w*x)\n\ + \ p=((enbkm*((ebkm*((0.25*b*f*w*x)+(c*f)+(b*d)))-(c*f)))-(b*d*enfkm))/(b*f*w*x)\n\ + \ m=((enbkm*((ebkm*((0.25*b*f*w*x)+(c*f)-(b*d)))-(c*f)))+(b*d*enfkm))/(b*f*w*x)\n\ + \ in d8 k [[p,o,m,o],[o,p,o,m],[m,o,p,o],[o,m,o,p]]\n" diff --git a/src/Complexity/Constants.hs b/src/Complexity/Constants.hs new file mode 100644 index 000000000..156be9dea --- /dev/null +++ b/src/Complexity/Constants.hs @@ -0,0 +1,83 @@ +{- | +Module : Constants +Description : Constant values used by functions +Copyright : (c) 2019-2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Complexity.Constants where + + +-- | epsilon value for floating point comparisons +epsilon ∷ Double +epsilon = 0.0001 ∷ Double + + +-- | maximum iterations for Newton's method +maxIterations ∷ Int +maxIterations = 100 ∷ Int + + +-- maximum rate modifier for discrete Gamma +maxGammaRate ∷ Double +maxGammaRate = 10.0 ∷ Double + + +-- maximum time parameter for exponential distribution +maxTime ∷ Double +maxTime = 2.0 ∷ Double + + +-- fixed precision for some functins like expE +-- this becuase of precision issues +-- ned this high with facvtorial on Doubles +fixedPrecision ∷ Int +fixedPrecision = 100 ∷ Int + + +-- strings for use in code generation +maxTimeString ∷ String +maxTimeString = show maxTime + + +epsilonString ∷ String +epsilonString = show epsilon + + +maxIterationsString ∷ String +maxIterationsString = show maxIterations + + +maxGammaRateString ∷ String +maxGammaRateString = show maxGammaRate + + +fixedPrecisionString ∷ String +fixedPrecisionString = show fixedPrecision diff --git a/src/Complexity/Graphs.hs b/src/Complexity/Graphs.hs new file mode 100644 index 000000000..f7321f573 --- /dev/null +++ b/src/Complexity/Graphs.hs @@ -0,0 +1,396 @@ +-- \|E| = |L| + (|L|-2) - 2(|R|-1) + 3|N| + |S| +-- = 2|L| - 2|R| + 3|N| + |S| +-- |V| = |L| + (|L|-1) - (|R|-1) + 2|N| + |S| +-- = 2|L| - |R| + 2|N| + |S| +-- E = edge set, L = leave set, R = root set, N = network edge set, +-- S = "singleton" (single leaf + root) components. +-- +-- To Do: read graph from file and do get numbers that way? +{-# LANGUAGE Safe #-} + +{- | +Module : Graphs +Description : Functions to generate (algorithmic) complexity of graphs + Generates a Haskell program (compiles) + with a description of a graph. The output program can be executed with + GHCi interpreter. ghci --:load progName + also outputs Huffman binary code of program +Copyright : (c) 2018-2019 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Complexity.Graphs ( + makeProgramStringGraph, + makeDisplayGraphString, +) where + +import Complexity.CodeStrings +import Data.Foldable (fold) + + +-- import Debug.Trace + +mainStartString ∷ String +mainStartString = "main=do\n" + + +-- | makeProgramString is wrapper for makeGraphProgramString to simplify interface +makeProgramStringGraph ∷ Int → Int → Int → Int → String +makeProgramStringGraph numLeaves numSingle numRoots numNetEdges = + let (outProgram, middleString, sumString) = + makeGraphProgramString + numLeaves + numRoots + numSingle + programStartStringGraph + mainStartString + "" + numLeaves + numRoots + numNetEdges + numSingle + in outProgram <> middleString <> sumString + + +-- | makeGraphProgramString puts apropriate code and values into string to define a general graph +makeGraphProgramString + ∷ Int → Int → Int → String → String → String → Int → Int → Int → Int → (String, String, String) +makeGraphProgramString numLeavesOrig numRootsOrig numSingleOrig beforeMain afterMain sumString numLeaves numRoots numNetEdges numSingle = + let numLeaves2 = numLeavesOrig - numSingleOrig + numRoots2 = numRootsOrig - numSingleOrig + numLeaves3 = numLeaves2 - max 0 (2 * (numRootsOrig - numSingleOrig - 1)) + maxVertex = (2 * numSingleOrig) + max 0 (3 * (numRootsOrig - numSingleOrig - 1)) + maxVertex2 = maxVertex + (2 * numLeaves3) - 1 + doSingle = (numSingleOrig > 0) + doMinimal = (numRootsOrig > (numSingleOrig + 1)) + doTree = numLeavesOrig > max 0 (2 * (numRootsOrig - numSingleOrig - 1)) + numSingleOrig + doEdges = numNetEdges > 0 + in if numSingle > 0 -- make singleton trees one root -> one leaf each + then + if doMinimal || doTree -- More stuff to come + then + makeGraphProgramString + numLeavesOrig + numRootsOrig + numSingleOrig + (beforeMain <> getSingletonEdgesString <> nullString) + (afterMain <> " let s=aG 0 " <> show numSingleOrig <> "\n") + " p0 \"\" (s<>" + (numLeaves - numSingle) + (numRoots - numSingle) + numNetEdges + 0 + else -- singles only + + makeGraphProgramString + numLeavesOrig + numRootsOrig + numSingleOrig + (beforeMain <> getSingletonEdgesString <> nullString) + (afterMain <> " let s=aG 0 " <> show numSingleOrig <> "\n") + " p0 \"\" s" + (numLeaves - numSingle) + (numRoots - numSingle) + numNetEdges + 0 + else + if numRoots > (numSingle + 1) -- have trivial trees. one root -> two leaves--always do a tree so always a tree follows if this is done. hence "m<>"" + then + if not doSingle + then + makeGraphProgramString + numLeavesOrig + numRootsOrig + numSingleOrig + (beforeMain <> minimalTreesString <> nullString) + (afterMain <> " let m=bG " <> show (max 0 $ 2 * numSingleOrig) <> " " <> show numLeaves2 <> " " <> show numRoots2 <> "\n") + (sumString <> " p0 \"\" (m<>") + (numLeaves - 2 * (numRoots - 1)) + 1 + numNetEdges + 0 + else + makeGraphProgramString + numLeavesOrig + numRootsOrig + numSingleOrig + (beforeMain <> minimalTreesString <> nullString) + (afterMain <> " let m=bG " <> show (max 0 $ 2 * numSingleOrig) <> " " <> show numLeaves2 <> " " <> show numRoots2 <> "\n") + (sumString <> "m<>") + (numLeaves - 2 * (numRoots - 1)) + 1 + numNetEdges + 0 + else + if numLeaves > 0 -- pectinate tree for remaining leaves one root -> all remining leaves + then + if not doEdges + then + if not doSingle && not doMinimal + then + ( beforeMain <> fullTreeString <> nullString + , afterMain <> " let t=cG True " <> show maxVertex <> " " <> show numLeaves3 <> "\n" + , sumString <> " p0 \"\" t" + ) + else + ( beforeMain <> fullTreeString + , afterMain <> " let t=cG True " <> show maxVertex <> " " <> show numLeaves3 <> "\n" + , sumString <> "t)" + ) + else + if not doSingle && not doMinimal + then + ( beforeMain <> fullTreeString <> nullString <> addEdgeString + , afterMain + <> " let t=cG True " + <> show maxVertex + <> " " + <> show numLeaves3 + <> "\n" + <> " let n=dG " + <> show maxVertex2 + <> " t " + <> show numNetEdges + <> "\n" + , sumString <> " p0 \"\" n" + ) + else + ( beforeMain <> fullTreeString <> nullString <> addEdgeString + , afterMain + <> " let t=cG True " + <> show maxVertex + <> " " + <> show numLeaves3 + <> "\n" + <> " let n=dG " + <> show maxVertex2 + <> " t " + <> show numNetEdges + <> "\n" + , sumString <> "n)" + ) + else (beforeMain, afterMain, sumString) + + +{- | makeBaseStringGraph creates graph code for a graph that can become a display tree +so conditions already checked--leaves > 0, numSunbgle == 0, numRoots == 1 +-} +makeBaseStringGraph ∷ Int → String → String → String → Int → Int → (String, String, String) +makeBaseStringGraph numLeavesOrig beforeMain afterMain sumString _numLeaves numNetEdges = + let maxVertex2 = (2 * numLeavesOrig) - 1 + zero0 = 0 ∷ Int -- quiets warning + necessaryFunctionStrings = + fold + [ beforeMain + , fullTreeString + , addEdgeString + , fmapString + , sndString + , headString + , tailString + , elemString + , notElemString + , getRepeatedElementsString + , childrenParentsOfNodeString + , lastString + , filterString + , displayEdgesString + , nullString + ] + bitList = replicate numNetEdges (0 ∷ Int) + in ( necessaryFunctionStrings + , fold + [ afterMain + , " let t=cG True " + , show zero0 + , " " + , show numLeavesOrig + , "\n" + , " let n=dG " + , show maxVertex2 + , " t " + , show numNetEdges + , "\n" + , " let b=" + , show bitList + , "\n" + , " let d=dE b n\n" + ] + , sumString <> " p0 \"\" d" + ) + + +{- | makeDisplayGraphString cretges code to generate a general gaph and then output a display graph based on that graph +the ouput is the display graph +if input is tree then return graph string as is +-} +makeDisplayGraphString ∷ Int → Int → Int → Int → String +makeDisplayGraphString numLeaves numSingle numRoots numNetEdges = + if numNetEdges == 0 + then makeProgramStringGraph numLeaves numSingle numRoots numNetEdges + else + let (outProgram, middleString, sumString) = makeBaseStringGraph numLeaves programStartStringGraph mainStartString "" numLeaves numNetEdges + in outProgram <> middleString <> sumString + +{- +Resolve general graph to display tree +(Assuming here can be ie 1 root, leaves >4, net edges > 1, no isolated nodes etc) + +0) Get node list from edge list (don't care about root--only once there for sure + so can just get all n odes that are terminations of edges +1) get indegree > 1 nodes +2) take first netNode +3) get edges and nodes that will be effected +4) delete edges from list that need to be +5) add new edges to list that need to be create +6) since only dealing with edges no need to update nodes +7) recurse till no net nodes +-} + +{- -- | Functions that are needed + need + n = list of edges from previous code + gM = fmapString + iM = sndString + a5 = headString + a6 = tailString + eS = elemString + nE = notElementString + eR = getRepeatedElementsString + cN = childrenParentsOfNode + a3 = lastString + g2 = filterString + dE = displayEdgesString + nU = nullString +-} + +{- general flow +-- get list of nodes that terminate an edge +-- fmap snd [(u,v)] +"let v=gM iM n\n" + +--for each member of list, is it repeated--ie found in tail +-- return list of repeated elements + +"let r=rE v\n" + +-- if r is empty then return current node list for final print and termination +"if null r then p0 \"\" n\n" + +-- else resolve (arbitrarily first since only need cost of doing one resolution really) first vertex in list +"else " + vertex = head r +"let h=a5 r\n" + +-- get children and parents of network node from list of edges + +-- call with empty lists for child and parent +childrenParentsOfNode :: Int -> [(Int,Int)] -> [Int] -> [Int] -> ([Int], [Int]) +childrenParentsOfNode node edgeList childList parentList = + if null edgeList then (childList, parentList) + else + let (e,u) = head edgeList + if node == e then childrenParentsOfNode node (tail edgeList) (u:childList) parentList + else if node == u then childrenParentsOfNode node (tail edgeList) childList (e:parentList) + else childrenParentsOfNode node (tail edgeList) childList parentList + +"let (c,p)=cN h n [] []\n" + +-- get grandparents of network edge (need for new edges) +"let (_,pL)=cN (a5 p) n [] []\n" +" (_,pR)=cn (a3 p) n [] []\n" +--" gL=a5 pL\n" +" gR=a5 pR\n" + +-- get left and right siblings of net node (h) +--" (cL, _)=cN (a5 p) n [] []\n" +" (cR, _)=cN (a3 p) n [] []\n" +--" sL=a5 $ g2 (/=h) cL\n" +" sR=a5 $ g2 (/=h) cR\n" + +-- get list of edges to delete + -- connecting child of network node (h) to "left" side + -- net node to child of net node + -- parent left to net node + -- parent right to net node + -- grand parent right to parent right + -- parent right to sibling right +"let d=[(h,a5 c),(a5 p, h),(a3 p, h),(gR,a3 p),(a3 p,sR)]\n" + +-- get edges to add +" a=[(a5 p, a5 c),(gR, sR)]\n" + +-- filter out nodes to delete and add new ones +-- displayNodes = filter (`notElem` d) n +" w=(gM(`nE` d)h)<>a \n" + +-- print edge set and terminate + +" p0 "" d\n" + +-- wrap all inside of a recursive function +displayEdges :: [(Int, Int)] -> [(Int, Int)] +displayEdges n = + if null n then [] + else + -- get node list + "let v=gM iM n\n" + -- get repeated node list + " r=rE v\n" + "in\n" + -- if no repeated nodes then return current list + "if null r then n\n" + -- else resolve first net (repeated) node by removing 5 edges and adding 2 new + "else\n" + " let h=a5 r\n" + -- get children and parents of net node + " (c,p)=cN h n [] []\n" + -- get left and right parent and right grandparent (since adding net child to left parent of net node) + -- of network edge (need for new edges) + " (_,pL)=cN (a5 p) n [] []\n" + " (_,pR)=cn (a3 p) n [] []\n" + " gR=a5 pR\n" + -- get right sibling of net node (h) + " (cR, _)=cN (a3 p) n [] []\n" + " sR=a5 $ g2 (/=h) cR\n" + -- get list of edges to delete + -- connecting child of network node (h) to "left" side + -- net node to child of net node + -- parent left to net node + -- parent right to net node + -- grand parent right to parent right + -- parent right to sibling right + " d=[(h,a5 c),(a5 p, h),(a3 p, h),(gR,a3 p),(a3 p,sR)]\n" + -- get edges to add + " a=[(a5 p, a5 c),(gR, sR)]\n" + -- filter out nodes to delete and add new ones and return + " w=(gM(`nE` d)h)<>a \n" + " in displayEdges w\n" +-} diff --git a/src/Complexity/Huffman.hs b/src/Complexity/Huffman.hs new file mode 100644 index 000000000..9f3a228cd --- /dev/null +++ b/src/Complexity/Huffman.hs @@ -0,0 +1,123 @@ +{-# LANGUAGE Safe #-} + +{- +Module from: +http://hackage.haskell.org/package/huffman-1.0.1/docs/Data-Compression-Huffman.html +modified to work with newer versino of Containers +-} + +module Complexity.Huffman ( + HuffmanTree (..), + Bit (..), + Code, + huffman, + huffmanSorted, + codewords, + ppCode, +) where + +import Control.Arrow (first, second) +import Data.List (intercalate) +import Data.PriorityQueue.FingerTree qualified as PQ +import Data.Sequence ( + ViewL (EmptyL, (:<)), + viewl, + (|>), + ) +import Data.Sequence qualified as S + + +data Bit = Zero | One + deriving stock (Eq) + + +instance Show Bit where + show Zero = "0" + show One = "1" + + +data HuffmanTree a + = Empty + | Node (HuffmanTree a) (HuffmanTree a) + | Leaf a + deriving stock (Show) + + +type role HuffmanTree representational + + +type Code a = [(a, [Bit])] + + +{- | +__Time:__ \( \mathcal{O}\left( n \log n \right) \) + +Simple implementation. +-} +huffman ∷ ∀ a w. (Ord w, Num w) ⇒ [(a, w)] → HuffmanTree a +huffman = + let build ∷ PQ.PQueue w (HuffmanTree a) → HuffmanTree a + build pq = case PQ.minViewWithKey pq of + Nothing → Empty + Just ((w, x), pq') → + case PQ.minViewWithKey pq' of + Nothing → x + Just ((w', y), pq'') → build $ PQ.insert (w + w') (Node x y) pq'' + + prepare ∷ [(a, w)] → PQ.PQueue w (HuffmanTree a) + prepare = PQ.fromList . map (\(x, w) → (w, Leaf x)) + in build . prepare + + +{- | +__Time:__ \( \mathcal{O}\left( n \right) \) + +More efficient implementation. +Requires that the input list of symbols and weight is sorted by increasing weight. +-} +huffmanSorted ∷ ∀ a w. (Ord w, Num w) ⇒ [(a, w)] → HuffmanTree a +huffmanSorted = + let prepare ∷ [(a, w)] → S.Seq (HuffmanTree a, w) + prepare = S.fromList . map (first Leaf) + + dequeue + ∷ S.Seq (HuffmanTree a, w) + → S.Seq (HuffmanTree a, w) + → Maybe ((HuffmanTree a, w), S.Seq (HuffmanTree a, w), S.Seq (HuffmanTree a, w)) + dequeue s t = + case (viewl s, viewl t) of + (EmptyL, EmptyL) → Nothing + (EmptyL, x :< ts) → Just (x, s, ts) + (x :< ss, EmptyL) → Just (x, ss, t) + ((x, w) :< ss, (y, w') :< ts) + | w < w' → Just ((x, w), ss, t) + | otherwise → Just ((y, w'), s, ts) + + build ∷ S.Seq (HuffmanTree a, w) → S.Seq (HuffmanTree a, w) → HuffmanTree a + build s t = + case dequeue s t of + Nothing → Empty + Just ((x, w), s', t') → + case dequeue s' t' of + Nothing → x + Just ((y, w'), s'', t'') → build (s'' |> (Node x y, w + w')) t'' + in build S.empty . prepare + + +-- Derive the prefix-free binary code from a huffman tree. +codewords ∷ HuffmanTree a → Code a +codewords = code' [] + where + code' ∷ [Bit] → HuffmanTree a → [(a, [Bit])] + code' _ Empty = [] + code' bits (Leaf x) = [(x, bits)] + code' bits (Node l r) = + map (second (Zero :)) (code' bits l) + ++ map (second (One :)) (code' bits r) + + +-- Pretty-print a binary code. Mostly useful for debugging. +ppCode ∷ (Show a) ⇒ Code a → String +ppCode = + intercalate "\n" + . map (\(x, bits) → show x ++ ": " ++ concatMap show bits) diff --git a/src/Complexity/Utilities.hs b/src/Complexity/Utilities.hs new file mode 100644 index 000000000..63ae98ef5 --- /dev/null +++ b/src/Complexity/Utilities.hs @@ -0,0 +1,166 @@ +{-# LANGUAGE Safe #-} + +{- | +Module : Utilities +Description : Functions to generate (algorithmic) complexity of objects +Copyright : (c) 2018-2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Complexity.Utilities ( + getInformationContent, + split2Matrix, + -- , invertMatrixExt + ppMatrix, +) where + +import Codec.Compression.GZip qualified as GZ +import Complexity.Huffman +import Data.ByteString.Lazy qualified as B +import Data.List (nub) +import Data.String.Encode qualified as E + + +-- import Complexity.Constants +-- import Numeric.LinearAlgebra + +-- | occurencesInList counts occurences elements in list--makes result double so can divide later +occurencesInList ∷ (Eq a) ⇒ [a] → a → Double +occurencesInList elementList element = fromIntegral $ (length . filter (== element)) elementList + + +-- | getTotalBits takes occurence and bit encoding to return total bits of program +getTotalBits ∷ [Double] → [Double] → Double +getTotalBits = \case + [] → const 0 + occurrences : rest → \case + [] → 0 + bitCoding : bitList → + occurrences * bitCoding + getTotalBits rest bitList + + +-- | getShannon gets Shannon entropy bits by freqency for string (list of symbols) +getShannon ∷ String → Int +getShannon inCharList = + if null inCharList + then error "Input list to getShannon is empty" + else + let totalSymbols = length inCharList + symbolList = nub inCharList + symbolOccurences = fmap (occurencesInList inCharList) symbolList + symbolFrequency = fmap (/ fromIntegral totalSymbols) symbolOccurences + symbolBits = fmap (logBase 2.0) symbolFrequency + totalBits = getTotalBits symbolOccurences symbolBits -- This and above line could be a single function + in -- trace ("There were " ++ show (length symbolList) ++ " unique symbols in program.") + ceiling $ abs totalBits + + +-- | getHuffCode takes the code map list and rturns the binary code +getHuffCode ∷ Char → Code Char → [Bit] +getHuffCode inChar = \case + [] → error "Code empty or character not found" + (candChar, bitList) : _ | candChar == inChar → bitList + _ : rest → getHuffCode inChar rest + + +bitToChar ∷ Bit → Char +bitToChar inBit = + if inBit == Zero + then '0' + else '1' + + +-- | getHuffman takes input list of chars and returns Huffman code representation +getHuffman ∷ String → (Int, String) +getHuffman inString = + if null inString + then error "Input list to getHuffman is empty" + else + let symbolList = nub inString + symbolOccurences = fmap (occurencesInList inString) symbolList + symbolOccurencePairs = zip symbolList symbolOccurences + huffTree = huffman symbolOccurencePairs + huffCodes = codewords huffTree + bitList = foldMap (flip getHuffCode huffCodes) inString + bitString = fmap bitToChar bitList + in -- trace (ppCode huffCodes) + (length bitList, bitString) + + +{- | getInformation takes a program string and returns Shannon, Huffman bits, Huffman binary program, and comopressed length bits . +for min on compression--short things can increase in bits when compressed. +Using Shannon bits, but could use Huffman since Shannon can't be realized +-} +getInformationContent ∷ String → (Double, Int, String, Double) +getInformationContent programString = + if null programString + then error "Empty program in getInformation" + else + let (huffBits, huffBinary) = getHuffman programString + compressedStream = GZ.compressWith GZ.defaultCompressParams{GZ.compressLevel = GZ.bestCompression} (E.convertString programString) + shannonBits = getShannon programString + compressedBits = min shannonBits $ 8 * length (B.unpack compressedStream) + in (fromIntegral shannonBits, huffBits, huffBinary, fromIntegral compressedBits) + + +-- | split2Matrix takes alist and splits after n elements to make it a list of lists +split2Matrix ∷ Int → [Double] → [[Double]] +split2Matrix lengthPieces inList + | null inList = [] + | lengthPieces > length inList = error ("List too short to split " ++ show lengthPieces ++ " " ++ show inList) + | otherwise = + take lengthPieces inList : split2Matrix lengthPieces (drop lengthPieces inList) + + +{- +-- | exportable versions of functions +--invertMatrixExt uses library to invert matrix but uses simple +invertMatrixExt :: [[Double]] -> [[Double]] +invertMatrixExt a = + if null a then error "Null matrix to invert" + else + let dimension = length a + aFlat = concat a + a' = matrix dimension aFlat + invA' = inv a' + in + toLists invA' +-} + +-- | ppMatrix makes a nice(er) string for matrix +ppMatrix ∷ [[Double]] → String +ppMatrix inMatrix = + if null inMatrix + then "[]" + else + let rowS = fmap show inMatrix + rowStrings = fmap (++ "\n") rowS + matString = concat rowStrings + in matString diff --git a/src/Debug/Debug.hs b/src/Debug/Debug.hs new file mode 100644 index 000000000..3725fa888 --- /dev/null +++ b/src/Debug/Debug.hs @@ -0,0 +1,96 @@ +{- | +Module : Debug.hs +Description : Module with Debug version of functions +Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Debug.Debug where + +-- import Data.List +import Data.Vector qualified as V +import Types.Types + + +debugZip ∷ [a] → [b] → [(a, b)] +debugZip la lb + | not isDebug = zip la lb + | length la /= length lb = error ("Zip arguments not equal in length: " <> show (length la, length lb)) + | null la = error ("First list null in debugZip " <> show (length la, length lb)) + | null lb = error ("Second list null in debugZip " <> show (length la, length lb)) + | otherwise = zip la lb + + +debugZip3 ∷ [a] → [b] → [c] → [(a, b, c)] +debugZip3 la lb lc + | not isDebug = zip3 la lb lc + | (length la /= length lb) || (length la /= length lc) || (length lb /= length lc) = + error ("Zip3 arguments not equal in length: " <> show (length la, length lb, length lc)) + | null la = error ("First list null in debugZip3 " <> show (length la, length lb, length lc)) + | null lb = error ("Second list null in debugZip3 " <> show (length la, length lb, length lc)) + | null lc = error ("Third list null in debugZip3 " <> show (length la, length lb, length lc)) + | otherwise = zip3 la lb lc + + +debugVectorZip ∷ V.Vector a → V.Vector b → V.Vector (a, b) +debugVectorZip la lb + | not isDebug = V.zip la lb + | V.length la /= V.length lb = error ("Zip arguments not equal in length: " <> show (V.length la, V.length lb)) + | V.null la = error ("First vector null in debugZip " <> show (V.length la, V.length lb)) + | V.null lb = error ("Second vector null in debugZip " <> show (V.length la, V.length lb)) + | otherwise = V.zip la lb + + +debugVectorZip3 ∷ V.Vector a → V.Vector b → V.Vector c → V.Vector (a, b, c) +debugVectorZip3 la lb lc + | not isDebug = V.zip3 la lb lc + | (V.length la /= V.length lb) || (V.length la /= V.length lc) || (V.length lb /= V.length lc) = + error ("Zip3 arguments not equal in length: " <> show (V.length la, V.length lb, V.length lc)) + | V.null la = error ("First vector null in debugZip3 " <> show (V.length la, V.length lb, V.length lc)) + | V.null lb = error ("Second vector null in debugZip3 " <> show (V.length la, V.length lb, V.length lc)) + | V.null lc = error ("Third vector null in debugZip3 " <> show (V.length la, V.length lb, V.length lc)) + | otherwise = V.zip3 la lb lc + + +debugVectorZip4 ∷ V.Vector a → V.Vector b → V.Vector c → V.Vector d → V.Vector (a, b, c, d) +debugVectorZip4 la lb lc ld + | not isDebug = V.zip4 la lb lc ld + | (V.length la /= V.length lb) + || (V.length la /= V.length lc) + || (V.length la /= V.length ld) + || (V.length lb /= V.length lc) + || (V.length lb /= V.length ld) + || (V.length lc /= V.length ld) = + error ("Zip3 arguments not equal in length: " <> show (V.length la, V.length lb, V.length lc, V.length ld)) + | V.null la = error ("First vector null in debugZip4 " <> show (V.length la, V.length lb, V.length lc, V.length ld)) + | V.null lb = error ("Second vector null in debugZip4 " <> show (V.length la, V.length lb, V.length lc, V.length ld)) + | V.null lc = error ("Third vector null in debugZip4 " <> show (V.length la, V.length lb, V.length lc, V.length ld)) + | V.null ld = error ("Fourth vector null in debugZip4 " <> show (V.length la, V.length lb, V.length lc, V.length ld)) + | otherwise = V.zip4 la lb lc ld diff --git a/src/GraphOptimization/Medians.hs b/src/GraphOptimization/Medians.hs new file mode 100644 index 000000000..f3bd694db --- /dev/null +++ b/src/GraphOptimization/Medians.hs @@ -0,0 +1,1890 @@ +{-- +TODO: + + Parallelize median2Vect +--} + +{- | +Module : Medians.hs +Description : Module specifying data type medians +Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module GraphOptimization.Medians ( + addMatrix, + createEdgeUnionOverBlocks, + createEdgeUnionOverBlocksM, + diagonalNonZero, + distance2Unions, + distance2UnionsM, + get2WaySlim, + get2WayWideHuge, + getDOMedian, + getDOMedianCharInfo, + getFinal3WaySlim, + getFinal3WayWideHuge, + getNewRange, + getNoGapPrelimContext, + getPreAligned2Median, + generalSequenceDiff, + interUnion, + intervalAdd, + makeDynamicCharacterFromSingleVector, + makeEdgeData, + makeEdgeDataM, + makeIAFinalCharacter, + makeIAPrelimCharacter, + makeIAUnionPrelimLeaf, + median2, + median2M, + median2NonExact, + median2NonExactM, + median2Single, + median2StaticIA, + median2StaticIAM, + pairwiseDO, + union2Single +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Foldable +import Data.Kind (Type) +import Data.List qualified as L +import Data.Maybe +import Data.MetricRepresentation qualified as MR +import Data.TCM.Dense qualified as TCMD +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import DirectOptimization.Pairwise +import GeneralUtilities +import Input.BitPack qualified as BP +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as S +import Types.Types +import Utilities.LocalGraph qualified as LG +import Debug.Trace + + +{- These top level function need to be parallel to evaluate a given graph in parallel + and also for heurstic costs in searches +-} + +{- | median2 takes the vectors of characters and applies median2Single to each +character +for parallel fmap over all then parallelized by type and sequences +used for distances and post-order assignments +-} +median2 + ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → V.Vector (CharacterData, VertexCost) +median2 isMedian = V.zipWith3 (median2Single isMedian False) + +{- | median2M takes the vectors of characters and applies median2Single to each character in parallel + used for distances and post-order assignments +-} +median2M ∷ Bool -> V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → PhyG (V.Vector (CharacterData, VertexCost)) +median2M isMedian firstDataV secondDataV charInfoV = + let medianAction :: (CharacterData, CharacterData, CharInfo) → (CharacterData, VertexCost) + medianAction = median2SingleTuple isMedian False + in + do + makeMedianPar ← getParallelChunkMap + let medianList = makeMedianPar medianAction (V.toList $ V.zip3 firstDataV secondDataV charInfoV) + pure $ V.fromList medianList + + +{- | median2NonExact takes the vectors of characters and applies median2NonExact to each +character for parallel fmap over all then parallelized by type and sequences +this only reoptimized the nonexact characters (sequence characters for now, perhpas otehrs later) +and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. +-} +median2NonExact + ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → V.Vector (CharacterData, VertexCost) +median2NonExact isMedian = V.zipWith3 (median2SingleNonExact isMedian) + + +{- | median2NonExactM takes the vectors of characters and applies median2NonExact to each +character in parallel +this only reoptimized the nonexact characters (sequence characters for now, perhpas otehrs later) +and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. +-} +median2NonExactM ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → PhyG (V.Vector (CharacterData, VertexCost)) +median2NonExactM isMedian firstDataV secondDataV charInfoV = + let medianAction :: (CharacterData, CharacterData, CharInfo) → (CharacterData, VertexCost) + medianAction = median2SingleNonExactTuple isMedian + in + do + makeMedianPar ← getParallelChunkMap + let medianList = makeMedianPar medianAction (V.toList $ V.zip3 firstDataV secondDataV charInfoV) + pure $ V.fromList medianList + +{- | median2StaticIA takes the vectors of characters and applies median2SingleStaticIA to each +character for parallel fmap over all then parallelized by type and sequences +this reoptimized only IA fields for the nonexact characters (sequence characters for now, perhpas others later) +and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. +-} +median2StaticIA + ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → V.Vector (CharacterData, VertexCost) +median2StaticIA isMedian = V.zipWith3 (median2Single isMedian True) + +{- | median2StaticIAM takes the vectors of characters and applies median2SingleStaticIA to each +character in parallel +this reoptimized only IA fields for the nonexact characters (sequence characters for now, perhpas others later) +and takes the existing optimization for exact (Add, NonAdd, Matrix) for the others. +-} +median2StaticIAM ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → PhyG (V.Vector (CharacterData, VertexCost)) +median2StaticIAM isMedian firstDataV secondDataV charInfoV = + let medianAction :: (CharacterData, CharacterData, CharInfo) → (CharacterData, VertexCost) + medianAction = median2SingleTuple isMedian True + in + do + makeMedianPar ← getParallelChunkMap + let medianList = makeMedianPar medianAction (V.toList $ V.zip3 firstDataV secondDataV charInfoV) + pure $ V.fromList medianList + +{- | distance2Unions is a block wrapper around distance2UnionsBlock +really only to get union distance--keeping union states in there in csae needed later +-} +distance2Unions ∷ Bool → VertexBlockData → VertexBlockData → V.Vector (V.Vector CharInfo) → (VertexBlockData, VertexCost) +distance2Unions isMedian firstBlock secondBlock charInfoVV = + let (newBlockV, newCostV) = V.unzip $ V.zipWith3 (distance2UnionsBlock isMedian) firstBlock secondBlock charInfoVV + in (newBlockV, V.sum newCostV) + +{- | distance2UnionsM is a block wrapper around distance2UnionsBlock + really only to get union distance--keeping union states in there in case needed later + operates in parallel +-} +distance2UnionsM ∷ Bool → VertexBlockData → VertexBlockData → V.Vector (V.Vector CharInfo) → PhyG VertexCost +distance2UnionsM isMedian firstBlock secondBlock charInfoVV = + let distanceAction :: (V.Vector CharacterData, V.Vector CharacterData, V.Vector CharInfo) → PhyG VertexCost + distanceAction = distance2UnionsBlockM isMedian + in + do + distPar <- getParallelChunkTraverse + newCostL <- distPar distanceAction (V.toList $ V.zip3 firstBlock secondBlock charInfoVV) + pure $ sum newCostL + +-- | distance2UnionsBlock is a block wrapper around distance2UnionsCharacter +distance2UnionsBlock ∷ Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → (V.Vector CharacterData, VertexCost) +distance2UnionsBlock isMedian firstBlock secondBlock charInfoV = + let (newBlockV, newCostV) = V.unzip $ V.zipWith3 (distance2UnionsCharacter isMedian) firstBlock secondBlock charInfoV + in (newBlockV, V.sum newCostV) + +-- | distance2UnionsBlockM is a block wrapper around distance2UnionsCharacterM +-- operates in parallel +distance2UnionsBlockM ∷ Bool → (V.Vector CharacterData, V.Vector CharacterData, V.Vector CharInfo) → PhyG VertexCost +distance2UnionsBlockM isMedian (firstBlock, secondBlock, charInfoV) = + let distanceAction :: (CharacterData, CharacterData, CharInfo) → VertexCost + distanceAction = distance2UnionsCharacterTuple isMedian + in + do + distPar <- getParallelChunkMap + let newCostL = distPar distanceAction (V.toList $ V.zip3 firstBlock secondBlock charInfoV) + pure $ sum newCostL + +{- | union2 takes the vectors of characters and applies union2Single to each character +used for edge states in build and rearrangement +-} +union2 ∷ Bool → Bool → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → V.Vector CharacterData +union2 useIA filterGaps = V.zipWith3 (union2Single useIA filterGaps) + +{- | union2M takes the vectors of characters and applies union2Single to each character +used for edge states in build and rearrangement +-} +union2M ∷ Bool → Bool → (V.Vector CharacterData, V.Vector CharacterData, V.Vector CharInfo) → PhyG (V.Vector CharacterData) +union2M useIA filterGaps (firstBlock, secondBlock, charInfoV) = + let unionAction :: (CharacterData, CharacterData, CharInfo) → CharacterData + unionAction = union2SingleTuple useIA filterGaps + in + do + unionPar <- getParallelChunkMap + let result = unionPar unionAction (V.toList $ V.zip3 firstBlock secondBlock charInfoV) + pure $ V.fromList result + + +{- | createEdgeUnionOverBlocks creates the union of the final states characters on an edge +The function takes data in blocks and block vector of char info and +extracts the triple for each block and creates new block data +this is used for delta's in edge invastion in Wagner and SPR/TBR +filter gaps for using with DO (flterGaps = True) or IA (filterGaps = False) +-} +createEdgeUnionOverBlocks + ∷ Bool + → Bool + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → [V.Vector CharacterData] + → V.Vector (V.Vector CharacterData) +createEdgeUnionOverBlocks useIA filterGaps leftBlockData rightBlockData blockCharInfoVect curBlockData = + if V.null leftBlockData + then -- trace ("Blocks: " <> (show $ length curBlockData) <> " Chars B0: " <> (show $ V.map snd $ head curBlockData)) + V.fromList $ reverse curBlockData + else + let leftBlockLength = length $ V.head leftBlockData + rightBlockLength = length $ V.head rightBlockData + -- firstBlock = V.zip3 (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) + + -- missing data cases first or zip defaults to zero length + firstBlockMedian + | (leftBlockLength == 0) = V.head rightBlockData + | (rightBlockLength == 0) = V.head leftBlockData + | otherwise = union2 useIA filterGaps (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) + in createEdgeUnionOverBlocks + useIA + filterGaps + (V.tail leftBlockData) + (V.tail rightBlockData) + (V.tail blockCharInfoVect) + (firstBlockMedian : curBlockData) + +{- | createEdgeUnionOverBlocks creates the union of the final states characters on an edge +The function takes data in blocks and block vector of char info and +extracts the triple for each block and creates new block data +this is used for delta's in edge invastion in Wagner and SPR/TBR +filter gaps for using with DO (flterGaps = True) or IA (filterGaps = False) +Operates in parallel +-} +createEdgeUnionOverBlocksM + ∷ Bool + → Bool + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → PhyG (V.Vector (V.Vector CharacterData)) +createEdgeUnionOverBlocksM useIA filterGaps leftBlockData rightBlockData blockCharInfoVect = + let action :: (V.Vector CharacterData, V.Vector CharacterData, V.Vector CharInfo) → PhyG (V.Vector CharacterData) + action = makeBlockMedianM useIA filterGaps + in + do + actionPar <- getParallelChunkTraverse + result <- actionPar action (V.toList $ V.zip3 leftBlockData rightBlockData blockCharInfoVect) + pure $ V.fromList result + +{- makeBlockMedianM calls union2M to make a block median for use in + createEdgeUnionOverBlocksM +-} +makeBlockMedianM :: Bool -> Bool -> (V.Vector CharacterData, V.Vector CharacterData, V.Vector CharInfo) -> PhyG (V.Vector CharacterData) +makeBlockMedianM useIA filterGaps (leftDataV, rightDataV, charInfoV) = + if length leftDataV == 0 then pure rightDataV + else if length rightDataV == 0 then pure leftDataV + else do + union2M useIA filterGaps (leftDataV, rightDataV, charInfoV) + + +{- | makeEdgeData takes and edge and makes the VertData for the edge from the union of the two vertices +using IA assignments not so great for search deltas +-} +makeEdgeData ∷ Bool → Bool → DecoratedGraph → V.Vector (V.Vector CharInfo) → LG.LEdge b → VertexBlockData +makeEdgeData useIA filterGaps inGraph charInfoVV (eNode, vNode, _) = + let eNodeVertData = vertData $ fromJust $ LG.lab inGraph eNode + vNodeVertData = vertData $ fromJust $ LG.lab inGraph vNode + in createEdgeUnionOverBlocks useIA filterGaps eNodeVertData vNodeVertData charInfoVV [] + +{- | makeEdgeDataM takes and edge and makes the VertData for the edge from the union of the two vertices + using IA assignments not so great for search deltas + calls parallel version of createEdgeUnionOverBlocksM +-} +makeEdgeDataM ∷ Bool → Bool → DecoratedGraph → V.Vector (V.Vector CharInfo) → LG.LEdge b → PhyG VertexBlockData +makeEdgeDataM useIA filterGaps inGraph charInfoVV (eNode, vNode, _) = + let eNodeVertData = vertData $ fromJust $ LG.lab inGraph eNode + vNodeVertData = vertData $ fromJust $ LG.lab inGraph vNode + in do + createEdgeUnionOverBlocksM useIA filterGaps eNodeVertData vNodeVertData charInfoVV + +{- + Most of these functions operate on single characters +-} + +{- | diagonalNonZero checks if any diagonal values are == 0 +assumes square +call with index = 0 +-} +diagonalNonZero ∷ V.Vector (V.Vector Int) → Int → Bool +diagonalNonZero inMatrix index = + if index == V.length inMatrix + then False + else + if (inMatrix V.! index) V.! index /= 0 + then True + else diagonalNonZero inMatrix (index + 1) + + +{- | makeDynamicCharacterFromSingleVector takes a single vector (usually a 'final' state) +and returns a dynamic character that canbe used with other functions +-} +makeDynamicCharacterFromSingleVector ∷ (GV.Vector v a) ⇒ v a → (v a, v a, v a) +makeDynamicCharacterFromSingleVector dc = unsafeCharacterBuiltByST (toEnum $ GV.length dc) $ \dc' → GV.imapM_ (\k v → setAlign dc' k v v v) dc + +{- median2SingleTuple is a wrapper around median2Single to allow paralle execution +-} +median2SingleTuple :: Bool → Bool → (CharacterData, CharacterData, CharInfo) → (CharacterData, VertexCost) +median2SingleTuple a b (c,d,e) = median2Single a b c d e + +{- | median2Single takes character data and returns median character and cost +median2single assumes that the character vectors in the various states are the same length +that is--all leaves (hence other vertices later) have the same number of each type of character +used for post-order assignments +this is from preliminary states +staticIA for dynm,aic assumes all same length +PMDL costs are calculated by type--additive by conversion to non-additive --but if states> 129 won't do it so warning in docs +bp2,4,5,8,64, nonadd are by weights vis set command, matrix, sequence are set by tcm with non-zero diagnonal +-} +median2Single ∷ Bool → Bool → CharacterData → CharacterData → CharInfo → (CharacterData, VertexCost) +median2Single isMedian staticIA firstVertChar secondVertChar inCharInfo = + let thisType = charType inCharInfo + thisWeight = weight inCharInfo + thisMatrix = costMatrix inCharInfo + thisSlimTCM = slimTCM inCharInfo + thisWideTCM = wideTCM inCharInfo + thisHugeTCM = hugeTCM inCharInfo + thisActive = activity inCharInfo + thisNoChangeCost = noChangeCost inCharInfo + thisChangeCost = changeCost inCharInfo + in if not thisActive + then (firstVertChar, 0) + else + if thisType == Add + then + let newCharVect = intervalAdd thisWeight firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType == NonAdd + then + let newCharVect = interUnion isMedian thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` packedNonAddTypes + then -- assumes all weight 1 + + let newCharVect = BP.median2Packed isMedian thisType thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType == Matrix + then + let newCharVect = addMatrix thisWeight thisMatrix firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` prealignedCharacterTypes + then + let newCharVect = getPreAligned2Median isMedian inCharInfo emptyCharacter firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` nonExactCharacterTypes + then + let newCharVect + | staticIA = makeIAPrelimCharacter isMedian inCharInfo emptyCharacter firstVertChar secondVertChar + | otherwise = + getDOMedian isMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType firstVertChar secondVertChar + in -- trace ("M2S: " <> (show $ localCost newCharVect)) + (newCharVect, localCost newCharVect) + else error ("Character type " <> show thisType <> " unrecognized/not implemented") + +{- median2SingleNonExactTuple is wrapper around median2SingleNonExact + for parallelism +-} +median2SingleNonExactTuple :: Bool → (CharacterData, CharacterData, CharInfo) → (CharacterData, VertexCost) +median2SingleNonExactTuple a (b,c,d) = median2SingleNonExact a b c d + +{- | median2SingleNonExact takes character data and returns median character and cost +median2single assumes that the character vectors in the various states are the same length +that is--all leaves (hencee other vertices later) have the same number of each type of character +this only reoptimized the nonexact characters (sequence characters for now, perhpas otehrs later) +and skips optimization placing a dummy value exact (Add, NonAdd, Matrix) for the others. +-} +median2SingleNonExact ∷ Bool → CharacterData → CharacterData → CharInfo → (CharacterData, VertexCost) +median2SingleNonExact isMedian firstVertChar secondVertChar inCharInfo = + let thisType = charType inCharInfo + thisWeight = weight inCharInfo + thisMatrix = costMatrix inCharInfo + thisSlimTCM = slimTCM inCharInfo + thisWideTCM = wideTCM inCharInfo + thisHugeTCM = hugeTCM inCharInfo + thisActive = activity inCharInfo + dummyStaticCharacter = emptyCharacter + in if not thisActive || (thisType `elem` exactCharacterTypes) + then (dummyStaticCharacter, 0) + else + if thisType `elem` prealignedCharacterTypes + then + let newCharVect = getPreAligned2Median isMedian inCharInfo dummyStaticCharacter firstVertChar secondVertChar + in -- trace ("M2S:" <> (show $ localCost newCharVect) <> (show (firstVertChar, secondVertChar))) + -- trace ("M2SNEP: " <> (show thisType)) + (newCharVect, localCost newCharVect) + else + if thisType `elem` nonExactCharacterTypes + then + let newCharVect = getDOMedian isMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType firstVertChar secondVertChar + in -- trace ("M2SNE: " <> (show thisType) <> (show $ localCost newCharVect)) + (newCharVect, localCost newCharVect) + else error ("Character type " <> show thisType <> " unrecognized/not implemented") + + + + +{- | distance2UnionsCharacterTuple wrapper around distance2UnionsCharacter for parallel use + only returns cost-} +distance2UnionsCharacterTuple :: Bool → (CharacterData, CharacterData, CharInfo) → VertexCost +distance2UnionsCharacterTuple a (b,c,d) = snd $ distance2UnionsCharacter a b c d + + +{- | distance2UnionsCharacter takes character data and returns median of union characters and cost +median2Unions assumes that the character vectors in the various states are the same length +this is from union states +assumes all same length +PMDL costs are calculated by type--additive by conversion to non-additive --but if states> 129 won't do it so warning in docs +bp2,4,5,8,64, nonadd are by weights vis set command, matrix, sequence are set by tcm with non-zero diagnonal +-} +-- wrong for prealigned--needs to DO +-- 1) Check length/composition union fields +-- 2) check cost for each type +-- 3) use DO for alignments even on unoin fields (triple) +-- as a result can use non-alignred dynamic characters as unions for comparison +distance2UnionsCharacter ∷ Bool → CharacterData → CharacterData → CharInfo → (CharacterData, VertexCost) +distance2UnionsCharacter isMedian firstVertChar secondVertChar inCharInfo = + let thisType = charType inCharInfo + thisWeight = weight inCharInfo + thisMatrix = costMatrix inCharInfo + thisActive = activity inCharInfo + thisNoChangeCost = noChangeCost inCharInfo + thisChangeCost = changeCost inCharInfo + in if not thisActive + then (firstVertChar, 0) + else + if thisType == Add + then + let newCharVect = intervalAddUnionField thisWeight firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType == NonAdd + then + let newCharVect = interUnionUnionField isMedian thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` packedNonAddTypes + then -- assumes all weight 1 + + let newCharVect = BP.median2PackedUnionField isMedian thisType thisWeight (thisNoChangeCost, thisChangeCost) firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType == Matrix + then + let newCharVect = addMatrixUnionField thisWeight thisMatrix firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` prealignedCharacterTypes + then + let newCharVect = getPreAligned2MedianUnionFields isMedian inCharInfo emptyCharacter firstVertChar secondVertChar + in (newCharVect, localCost newCharVect) + else + if thisType `elem` nonExactCharacterTypes + then + let newCharVect = getDOMedianCharInfoUnion isMedian inCharInfo firstVertChar secondVertChar + in -- let newCharVect = getNonExactUnionFields inCharInfo emptyCharacter firstVertChar secondVertChar + + -- trace ("M2S: " <> (show $ localCost newCharVect)) + (newCharVect, localCost newCharVect) + else error ("Character type " <> show thisType <> " unrecognized/not implemented") + + +-- | localOr wrapper for BV.or for vector elements +localOr ∷ BV.BitVector → BV.BitVector → BV.BitVector +localOr lBV rBV = lBV .|. rBV + + +-- | interUnionBV takes two bitvectors and returns new state, nochange number (1 or 0), change number (0 or 1) +interUnionBV ∷ BV.BitVector → BV.BitVector → (BV.BitVector, Int, Int) +interUnionBV leftBV rightBV = + if BV.isZeroVector (leftBV .&. rightBV) + then (leftBV .|. rightBV, 0, 1) + else (leftBV .&. rightBV, 1, 0) + + +{- | interUnion takes two non-additive chars and creates newCharcter as 2-median +in post-order pass to create preliminary states assignment +assumes a single weight for all +performs two passes though chars to get cost of assignments +snd3 $ rangePrelim left/rightChar due to triple in prelim +could have done noChnageCoast/Chaneg cost with length subtraction but very small issue in real use since +only for nonadd characters with > 64 states. +-} +interUnion ∷ Bool → Double → (Double, Double) → CharacterData → CharacterData → CharacterData +interUnion isMedian thisWeight (lNoChangeCost, lChangeCost) leftChar rightChar = + let (newStateVect, noChangeCostVect, changeCostVect) = V.unzip3 $ V.zipWith interUnionBV (snd3 $ stateBVPrelim leftChar) (snd3 $ stateBVPrelim rightChar) + newCost + | isMedian = + thisWeight * ((lNoChangeCost * fromIntegral (V.sum noChangeCostVect)) + (lChangeCost * fromIntegral (V.sum changeCostVect))) + | otherwise = + let noChangeCostNum = (2 * (V.sum noChangeCostVect)) + (V.sum changeCostVect) + in thisWeight * ((lNoChangeCost * fromIntegral noChangeCostNum) + (lChangeCost * fromIntegral (V.sum changeCostVect))) + + newCharacter = + emptyCharacter + { stateBVPrelim = (snd3 $ stateBVPrelim leftChar, newStateVect, snd3 $ stateBVPrelim rightChar) + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in -- trace ("NonAdditive: " <> (show numUnions) <> " " <> (show newCost) <> "\t" <> (show $ stateBVPrelim leftChar) <> "\t" <> (show $ stateBVPrelim rightChar) <> "\t" + -- <> (show intersectVect) <> "\t" <> (show unionVect) <> "\t" <> (show newStateVect)) + newCharacter + + +{- | interUnionUnionField takes two non-additive chars and creates newCharcter as 2-median +in post-order pass to create union states assignment and cost +assumes a single weight for all +performs two passes though chars to get cost of assignments +snd3 $ rangePrelim left/rightChar due to triple in prelim +could have done noChnageCoast/Chaneg cost with length subtraction but very small issue in real use since +only for nonadd characters with > 64 states. +-} +interUnionUnionField ∷ Bool → Double → (Double, Double) → CharacterData → CharacterData → CharacterData +interUnionUnionField isMedian thisWeight (lNoChangeCost, lChangeCost) leftChar rightChar = + let (newStateVect, noChangeCostVect, changeCostVect) = V.unzip3 $ V.zipWith interUnionBV (stateBVUnion leftChar) (stateBVUnion rightChar) + newCost + | isMedian = + thisWeight * ((lNoChangeCost * fromIntegral (V.sum noChangeCostVect)) + (lChangeCost * fromIntegral (V.sum changeCostVect))) + | otherwise = + let noChangeCostNum = (2 * (V.sum noChangeCostVect)) + (V.sum changeCostVect) + in thisWeight * ((lNoChangeCost * fromIntegral noChangeCostNum) + (lChangeCost * fromIntegral (V.sum changeCostVect))) + + newCharacter = + emptyCharacter + { stateBVUnion = newStateVect + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in -- trace ("NonAdditive: " <> (show numUnions) <> " " <> (show newCost) <> "\t" <> (show $ stateBVPrelim leftChar) <> "\t" <> (show $ stateBVPrelim rightChar) <> "\t" + -- <> (show intersectVect) <> "\t" <> (show unionVect) <> "\t" <> (show newStateVect)) + newCharacter + + +{- | localUnion takes two non-additive chars and creates newCharcter as 2-union/or +assumes a single weight for all +performs single +bsaed on final states +-} +localUnion ∷ CharacterData → CharacterData → CharacterData +localUnion leftChar rightChar = + let unionVect = V.zipWith localOr (stateBVFinal leftChar) (stateBVFinal rightChar) + newCharacter = + emptyCharacter + { stateBVPrelim = (unionVect, unionVect, unionVect) + , stateBVFinal = unionVect + } + in -- trace ("NonAdditive: " <> (show numUnions) <> " " <> (show newCost) <> "\t" <> (show $ stateBVPrelim leftChar) <> "\t" <> (show $ stateBVPrelim rightChar) <> "\t" + -- <> (show intersectVect) <> "\t" <> (show unionVect) <> "\t" <> (show newStateVect)) + newCharacter + + +{- | getNewRange takes min and max range of two additive characters and returns +a triple of (newMin, newMax, Cost) +-} +getNewRange ∷ Int → Int → Int → Int → (Int, Int, Int) +getNewRange lMin lMax rMin rMax = + -- subset + if (rMin >= lMin) && (rMax <= lMax) + then (rMin, rMax, 0) + else + if (lMin >= rMin) && (lMax <= rMax) + then (lMin, lMax, 0) + else -- overlaps + + if (rMin >= lMin) && (rMax >= lMax) && (rMin <= lMax) + then (rMin, lMax, 0) + else + if (lMin >= rMin) && (lMax >= rMax) && (lMin <= rMax) + then (lMin, rMax, 0) + else -- newInterval + + if lMax <= rMin + then (lMax, rMin, rMin - lMax) + else + if rMax <= lMin + then (rMax, lMin, lMin - rMax) + else error ("This can't happen " <> show (lMin, lMax, rMin, rMax)) + + +{- | intervalAdd takes two additive chars and creates newCharcter as 2-median +in post-order pass to create preliminary states assignment +assumes a single weight for all +snd3 $ rangePrelim left/rightChar due to triple in prelim +-} +intervalAdd ∷ Double → CharacterData → CharacterData → CharacterData +intervalAdd thisWeight leftChar rightChar = + let newRangeCosts = + V.zipWith4 + getNewRange + (V.map fst $ snd3 $ rangePrelim leftChar) + (V.map snd $ snd3 $ rangePrelim leftChar) + (V.map fst $ snd3 $ rangePrelim rightChar) + (V.map snd $ snd3 $ rangePrelim rightChar) + newMinRange = V.map fst3 newRangeCosts + newMaxRange = V.map snd3 newRangeCosts + newCost = thisWeight * fromIntegral (V.sum $ V.map thd3 newRangeCosts) + newCharacter = + emptyCharacter + { rangePrelim = (snd3 $ rangePrelim leftChar, V.zip newMinRange newMaxRange, snd3 $ rangePrelim rightChar) + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in newCharacter + + +{- | intervalAddUnionField takes two additive chars and creates newCharcter as 2-median +in post-order pass to create union states assignment and cost +assumes a single weight for all +-} +intervalAddUnionField ∷ Double → CharacterData → CharacterData → CharacterData +intervalAddUnionField thisWeight leftChar rightChar = + let newRangeCosts = + V.zipWith4 + getNewRange + (V.map fst $ rangeUnion leftChar) + (V.map snd $ rangeUnion leftChar) + (V.map fst $ rangeUnion rightChar) + (V.map snd $ rangeUnion rightChar) + newMinRange = V.map fst3 newRangeCosts + newMaxRange = V.map snd3 newRangeCosts + newCost = thisWeight * fromIntegral (V.sum $ V.map thd3 newRangeCosts) + newCharacter = + emptyCharacter + { rangeUnion = V.zip newMinRange newMaxRange + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in newCharacter + + +{- | getUnionRange takes min and max range of two additive characters and returns +a pair of (newMin, newMax) +-} +getUnionRange ∷ Int → Int → Int → Int → (Int, Int) +getUnionRange lMin lMax rMin rMax = + (min lMin rMin, max lMax rMax) + + +{- | intervalUnion takes two additive chars and creates newCharcter as 2-union +min of all lower, max of all higher +final states used and assigned to obthe prelim and final for use in swap delta +-} +intervalUnion ∷ CharacterData → CharacterData → CharacterData +intervalUnion leftChar rightChar = + let newRangeCosts = + V.zipWith4 + getUnionRange + (V.map fst $ rangeFinal leftChar) + (V.map snd $ rangeFinal leftChar) + (V.map fst $ rangeFinal rightChar) + (V.map snd $ rangeFinal rightChar) + newMinRange = V.map fst newRangeCosts + newMaxRange = V.map snd newRangeCosts + newRange = V.zip newMinRange newMaxRange + newCharacter = + emptyCharacter + { rangePrelim = (newRange, newRange, newRange) + , rangeFinal = newRange + } + in -- trace ("Additive: " <> (show newCost) <> "\t" <> (show $ rangeFinal leftChar) <> "\t" <> (show $ rangeFinal rightChar) + -- <> (show newRangeCosts)) + newCharacter + + +-- | getMinCostStates takes cost matrix and vector of states (cost, _, _) and retuns a list of (totalCost, best child state) +getMinCostStates + ∷ S.Matrix Int → V.Vector MatrixTriple → Int → Int → Int → [(Int, ChildStateIndex)] → Int → [(Int, ChildStateIndex)] +getMinCostStates thisMatrix childVect bestCost numStates childState currentBestStates stateIndex = + -- trace (show thisMatrix <> "\n" <> (show childVect) <> "\n" <> show (numStates, childState, stateIndex)) ( + if V.null childVect + then reverse (filter ((== bestCost) . fst) currentBestStates) + else + let (childCost, _, _) = V.head childVect + childStateCost = + if childCost /= (maxBound ∷ Int) + then childCost + (thisMatrix S.! (childState, stateIndex)) + else (maxBound ∷ Int) + in if childStateCost > bestCost + then getMinCostStates thisMatrix (V.tail childVect) bestCost numStates (childState + 1) currentBestStates stateIndex + else + if childStateCost == bestCost + then + getMinCostStates + thisMatrix + (V.tail childVect) + bestCost + numStates + (childState + 1) + ((childStateCost, childState) : currentBestStates) + stateIndex + else + getMinCostStates + thisMatrix + (V.tail childVect) + childStateCost + numStates + (childState + 1) + [(childStateCost, childState)] + stateIndex + + +-- ) + +{- | getNewVector takes the vector of states and costs from the child nodes and the +cost matrix and calculates a new vector n^2 in states +-} +getNewVector ∷ S.Matrix Int → Int → (V.Vector MatrixTriple, V.Vector MatrixTriple) → V.Vector MatrixTriple +getNewVector thisMatrix numStates (lChild, rChild) = + let newStates = [0 .. (numStates - 1)] + leftPairs = fmap (getMinCostStates thisMatrix lChild (maxBound ∷ Int) numStates 0 []) newStates + rightPairs = fmap (getMinCostStates thisMatrix rChild (maxBound ∷ Int) numStates 0 []) newStates + stateCosts = zipWith (+) (fmap (fst . head) leftPairs) (fmap (fst . head) rightPairs) + newStateTripleList = zip3 stateCosts (fmap (fmap snd) leftPairs) (fmap (fmap snd) rightPairs) + in V.fromList newStateTripleList + + +{- | addMatrix thisWeight thisMatrix firstVertChar secondVertChar matrix character +assumes each character has same cost matrix +Need to add approximation ala DO tcm lookup later +Local and global costs are based on current not necessaril;y optimal minimum cost states +-} +addMatrix ∷ Double → S.Matrix Int → CharacterData → CharacterData → CharacterData +addMatrix thisWeight thisMatrix firstVertChar secondVertChar = + if null thisMatrix + then error "Null cost matrix in addMatrix" + else + let numStates = length thisMatrix + initialMatrixVector = getNewVector thisMatrix numStates <$> V.zip (matrixStatesPrelim firstVertChar) (matrixStatesPrelim secondVertChar) + initialCostVector = fmap (V.minimum . fmap fst3) initialMatrixVector + newCost = thisWeight * fromIntegral (V.sum initialCostVector) + newCharacter = + emptyCharacter + { matrixStatesPrelim = initialMatrixVector + , localCost = newCost - globalCost firstVertChar - globalCost secondVertChar + , globalCost = newCost + } + in -- trace ("Matrix: " <> (show newCost) <> "\n\t" <> (show $ matrixStatesPrelim firstVertChar) <> "\n\t" <> (show $ matrixStatesPrelim secondVertChar) <> + -- "\n\t" <> (show initialMatrixVector) <> "\n\t" <> (show initialCostVector)) + newCharacter + + +{- | addMatrixUnionField thisWeight thisMatrix firstVertChar secondVertChar matrix character +uinion fields +assumes each character has same cost matrix +Need to add approximation ala DO tcm lookup later +Local and global costs are based on current not necessarily optimal minimum cost states +-} +addMatrixUnionField ∷ Double → S.Matrix Int → CharacterData → CharacterData → CharacterData +addMatrixUnionField thisWeight thisMatrix firstVertChar secondVertChar = + if null thisMatrix + then error "Null cost matrix in addMatrix" + else + let numStates = length thisMatrix + initialMatrixVector = getNewVector thisMatrix numStates <$> V.zip (matrixStatesUnion firstVertChar) (matrixStatesUnion secondVertChar) + initialCostVector = fmap (V.minimum . fmap fst3) initialMatrixVector + newCost = thisWeight * fromIntegral (V.sum initialCostVector) + newCharacter = + emptyCharacter + { matrixStatesUnion = initialMatrixVector + , localCost = newCost - globalCost firstVertChar - globalCost secondVertChar + , globalCost = newCost + } + in newCharacter + + +{- | getUnionVector takes the vector of states and costs from two nodes +and sets the states with min cost in the two vertices and maxBound in other states +-} +getUnionVector ∷ S.Matrix Int → Int → (V.Vector MatrixTriple, V.Vector MatrixTriple) → V.Vector MatrixTriple +getUnionVector thisMatrix numStates (lChild, rChild) = + let newStates = [0 .. (numStates - 1)] + leftPairs = fmap (getMinCostStates thisMatrix lChild (maxBound ∷ Int) numStates 0 []) newStates + rightPairs = fmap (getMinCostStates thisMatrix rChild (maxBound ∷ Int) numStates 0 []) newStates + stateCosts = zipWith (+) (fmap (fst . head) leftPairs) (fmap (fst . head) rightPairs) + minStateCost = minimum stateCosts + stateCosts' = fmap (minOrMax minStateCost) stateCosts + newStateTripleList = zip3 stateCosts' (fmap (fmap snd) leftPairs) (fmap (fmap snd) rightPairs) + in V.fromList newStateTripleList + where + minOrMax minVal curVal = + if curVal == minVal + then minVal + else maxBound ∷ Int + + +{- | unionMatrix thisMatrix firstVertChar secondVertChar matrix character +assumes each character has same cost matrix +Need to add approximation ala DO tcm lookup later +Local and global costs are based on current not necessaril;y optimal minimum cost states +-} +unionMatrix ∷ S.Matrix Int → CharacterData → CharacterData → CharacterData +unionMatrix thisMatrix firstVertChar secondVertChar = + if null thisMatrix + then error "Null cost matrix in addMatrix" + else + let numStates = length thisMatrix + initialMatrixVector = getUnionVector thisMatrix numStates <$> V.zip (matrixStatesFinal firstVertChar) (matrixStatesFinal secondVertChar) + newCharacter = + emptyCharacter + { matrixStatesPrelim = initialMatrixVector + , matrixStatesFinal = initialMatrixVector + } + in -- trace ("Matrix: " <> (show newCost) <> "\n\t" <> (show $ matrixStatesPrelim firstVertChar) <> "\n\t" <> (show $ matrixStatesPrelim secondVertChar) <> + -- "\n\t" <> (show initialMatrixVector) <> "\n\t" <> (show initialCostVector)) + newCharacter + + +{- | pairwiseDO is a wrapper around slim/wise/hugeParwiseDO to allow direct call and return of +DO medians and cost. This is used in final state assignment +The adjustments are False since only uses the prealigned distance--hence no plus-one iussue +-} +pairwiseDO + ∷ Bool + → CharInfo + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter, Double) +pairwiseDO isMedian charInfo (slim1, wide1, huge1) (slim2, wide2, huge2) = case charType charInfo of + thisType + | thisType `elem` [SlimSeq, NucSeq] → + let cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (cost, r) = slimPairwiseDO (slimTCM charInfo) slim1 slim2 + -- adjustment based on aligned left and right + noChangeAdjust + | not isMedian = 0 + | otherwise = snd $ get2WaySlim (slimTCM charInfo) (extractMedians r) (extractMedians r) + + in --trace ("PDO: " <> (show (cost,cost'))) $ + (r, mempty, mempty, weight charInfo * fromIntegral cost') + thisType + | thisType `elem` [WideSeq, AminoSeq] → + let coefficient = MR.minInDelCost (wideTCM charInfo) + cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (cost, r) = widePairwiseDO coefficient (MR.retreivePairwiseTCM $ wideTCM charInfo) wide1 wide2 + noChangeAdjust + | not isMedian = 0 + | otherwise =snd $ get2WayWideHuge (wideTCM charInfo) (extractMedians r) (extractMedians r) + + in (mempty, r, mempty, weight charInfo * fromIntegral cost') + HugeSeq → + let coefficient = MR.minInDelCost (hugeTCM charInfo) + cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (cost, r) = hugePairwiseDO coefficient (MR.retreivePairwiseTCM $ hugeTCM charInfo) huge1 huge2 + noChangeAdjust + | not isMedian = 0 + | otherwise = snd $ get2WayWideHuge (hugeTCM charInfo) (extractMedians r) (extractMedians r) + + in (mempty, mempty, r, weight charInfo * fromIntegral cost') + thisType → error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch"] + + +{- | getDOMedianCharInfoUnion is a wrapper around getDOMedian with CharInfo-based interface +for union fields. +Strips out solo gaps (0/1) before DO step +-} +getDOMedianCharInfoUnion ∷ Bool → CharInfo → CharacterData → CharacterData → CharacterData +getDOMedianCharInfoUnion isMedian charInfo = + getDOMedianUnion + isMedian + (weight charInfo) + (costMatrix charInfo) + (slimTCM charInfo) + (wideTCM charInfo) + (hugeTCM charInfo) + (charType charInfo) + + +{- | getDOMedianUnion calls appropriate pairwise DO to create sequence median after some type wrangling +works on union states +filters out gaps (0/1) values before DO (>1) +-} +getDOMedianUnion + ∷ Bool + → Double + → S.Matrix Int + → TCMD.DenseTransitionCostMatrix + → MR.MetricRepresentation WideState + → MR.MetricRepresentation HugeState + → CharType + → CharacterData + → CharacterData + → CharacterData +getDOMedianUnion isMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType leftChar rightChar + | null thisMatrix = error "Null cost matrix in getDOMedian" + | thisType `elem` [SlimSeq, NucSeq] = newSlimCharacterData + | thisType `elem` [WideSeq, AminoSeq] = newWideCharacterData + | thisType == HugeSeq = newHugeCharacterData + | otherwise = error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch"] + where + blankCharacterData = emptyCharacter + + newSlimCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + subtreeCost = sum [newCost, globalCost leftChar, globalCost rightChar] + slimIAUnionNoGapsLeft = extractMediansSingle $ slimIAUnion leftChar + slimIAUnionNoGapsRight = extractMediansSingle $ slimIAUnion rightChar + (cost, r) = + slimPairwiseDO + thisSlimTCM + (makeDynamicCharacterFromSingleVector slimIAUnionNoGapsLeft) + (makeDynamicCharacterFromSingleVector slimIAUnionNoGapsRight) + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WaySlim thisSlimTCM (extractMedians r) (extractMedians r) + + in --trace ("GDOMU:" <> show (cost, extractMedians r, slimIAUnionNoGapsLeft, slimIAUnionNoGapsRight)) $ + blankCharacterData + { slimIAUnion = extractMedians r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + newWideCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + coefficient = MR.minInDelCost thisWideTCM + subtreeCost = sum [newCost, globalCost leftChar, globalCost rightChar] + wideIAUnionNoGapsLeft = extractMediansSingle $ wideIAUnion leftChar + wideIAUnionNoGapsRight = extractMediansSingle $ wideIAUnion rightChar + (cost, r) = + widePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisWideTCM) + (makeDynamicCharacterFromSingleVector wideIAUnionNoGapsLeft) + (makeDynamicCharacterFromSingleVector wideIAUnionNoGapsRight) + + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge thisWideTCM (extractMedians r) (extractMedians r) + + in blankCharacterData + { wideIAUnion = extractMedians r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + newHugeCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + coefficient = MR.minInDelCost thisHugeTCM + subtreeCost = newCost + globalCost leftChar + globalCost rightChar + hugeIAUnionNoGapsLeft = extractMediansSingle $ hugeIAUnion leftChar + hugeIAUnionNoGapsRight = extractMediansSingle $ hugeIAUnion rightChar + (cost, r) = + hugePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisHugeTCM) + (makeDynamicCharacterFromSingleVector hugeIAUnionNoGapsLeft) + (makeDynamicCharacterFromSingleVector hugeIAUnionNoGapsRight) + + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge thisHugeTCM (extractMedians r) (extractMedians r) + + in blankCharacterData + { hugeIAUnion = extractMedians r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + +-- | getDOMedianCharInfo is a wrapper around getDOMedian with CharInfo-based interface +getDOMedianCharInfo ∷ Bool → CharInfo → CharacterData → CharacterData → CharacterData +getDOMedianCharInfo isMedian charInfo = + getDOMedian + isMedian + (weight charInfo) + (costMatrix charInfo) + (slimTCM charInfo) + (wideTCM charInfo) + (hugeTCM charInfo) + (charType charInfo) + + +{- | adjustNoCostNonZeroDiag makes adjustment to DO/Prealigned cost based on no cost asjustment for non-zero diags + + The way costs are detemined is via minimum cost medians ie State1 <- Median -> State2 + Hence, is teh cost of a state is staying the same, then the best median will include both the + change cost and the stasis cost--rthis is correct when diagnosing a graph post order since we need + costs to account for both edges coming out of median sequence + However, is we disty wantt he distance between two states, we either want q single statis cost (not two) + or just the change cost, so we must subtract the stasis cost (diag non zero) to be correct (if /=0) + isMedian == True leaves cost alone (ie including "both" costs from median), + isMedian == False (is a distance) subtracts off the "extra" noChange cost +-} +adjustNoCostNonZeroDiag ∷ Bool → S.Matrix Int → (Word, Word) → Word +adjustNoCostNonZeroDiag isMedian thisMatrix (inCost, inNoChangeAdjust) = + if isMedian then inCost + else if not (diagonalNonZero thisMatrix 0) || (inCost == 0) then inCost + else + -- this for distance so subtract off the "extra" noChange cost from median + let inNoChangeCost' = toEnum $ round ((fromIntegral inNoChangeAdjust) / 2.0) + in + inCost - inNoChangeCost' + + +{- | getDOMedian calls appropriate pairwise DO to create sequence median after some type wrangling +works on preliminary states +*** Has adjustment (perhaps only for a while) for non-zero diag matrices adding 1 to each cost perlength of sequence +-} +getDOMedian + ∷ Bool + → Double + → S.Matrix Int + → TCMD.DenseTransitionCostMatrix + → MR.MetricRepresentation WideState + → MR.MetricRepresentation HugeState + → CharType + → CharacterData + → CharacterData + → CharacterData +getDOMedian isMedian thisWeight thisMatrix thisSlimTCM thisWideTCM thisHugeTCM thisType leftChar rightChar + | null thisMatrix = error "Null cost matrix in getDOMedian" + | thisType `elem` [SlimSeq, NucSeq] = newSlimCharacterData + | thisType `elem` [WideSeq, AminoSeq] = newWideCharacterData + | thisType == HugeSeq = newHugeCharacterData + | otherwise = error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch"] + where + blankCharacterData = emptyCharacter + + newSlimCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + subtreeCost = sum [newCost, globalCost leftChar, globalCost rightChar] + (cost, r) = + slimPairwiseDO + thisSlimTCM + (slimGapped leftChar) + (slimGapped rightChar) + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WaySlim thisSlimTCM (extractMedians r) (extractMedians r) + in -- trace ("GDM: " <> (show (cost, noChangeAdjust, cost'))) $ + blankCharacterData + { slimPrelim = extractMedians r + , slimGapped = r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + newWideCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + coefficient = MR.minInDelCost thisWideTCM + subtreeCost = sum [newCost, globalCost leftChar, globalCost rightChar] + (cost, r) = + widePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisWideTCM) + (wideGapped leftChar) + (wideGapped rightChar) + + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge thisWideTCM (extractMedians r) (extractMedians r) + in blankCharacterData + { widePrelim = extractMedians r + , wideGapped = r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + newHugeCharacterData = + let cost' = adjustNoCostNonZeroDiag isMedian thisMatrix (cost, noChangeAdjust) + newCost = thisWeight * fromIntegral cost' + coefficient = MR.minInDelCost thisHugeTCM + subtreeCost = newCost + globalCost leftChar + globalCost rightChar + (cost, r) = + hugePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisHugeTCM) + (hugeGapped leftChar) + (hugeGapped rightChar) + + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge thisHugeTCM (extractMedians r) (extractMedians r) + in blankCharacterData + { hugePrelim = extractMedians r + , hugeGapped = r + , localCostVect = V.singleton $ fromIntegral cost' + , localCost = newCost + , globalCost = subtreeCost + } + + +{- | getPrealignedUnion calls appropriate pairwise function to create sequence union of final states +for prealigned states +-} +getPrealignedUnion + ∷ CharType + → CharacterData + → CharacterData + → CharacterData +getPrealignedUnion thisType leftChar rightChar = + let blankCharacterData = emptyCharacter + in if thisType == AlignedSlim + then + let finalUnion = GV.zipWith (.|.) (alignedSlimFinal leftChar) (alignedSlimFinal rightChar) + prelimState = (finalUnion, finalUnion, finalUnion) + in blankCharacterData + { alignedSlimPrelim = prelimState + , alignedSlimFinal = finalUnion + } + else + if thisType == AlignedWide + then + let finalUnion = GV.zipWith (.|.) (alignedWideFinal leftChar) (alignedWideFinal rightChar) + prelimState = (finalUnion, finalUnion, finalUnion) + in blankCharacterData + { alignedWidePrelim = prelimState + , alignedWideFinal = finalUnion + } + else + if thisType == AlignedHuge + then + let finalUnion = GV.zipWith (.|.) (alignedHugeFinal leftChar) (alignedHugeFinal rightChar) + prelimState = (finalUnion, finalUnion, finalUnion) + in blankCharacterData + { alignedHugePrelim = prelimState + , alignedHugeFinal = finalUnion + } + else error ("Unrecognised character type '" <> show thisType <> "' in getPrealignedUnion") + + +{- | getDynamicUnion calls appropriate pairwise function to create sequence median after some type wrangling +if using IA--takes IAFInal for each node, creates union of IAFinals states +if DO then calculated DO medians and takes union of left and right states +gaps need to be fitered if DO used later (as in Wagner), or as in SPR/TBR rearragement +sets final and IA states for Swap delta heuristics +-} +getDynamicUnion + ∷ Bool + → Bool + → CharType + → CharacterData + → CharacterData + → TCMD.DenseTransitionCostMatrix + → MR.MetricRepresentation WideState + → MR.MetricRepresentation HugeState + → CharacterData +getDynamicUnion useIA filterGaps thisType leftChar rightChar thisSlimTCM thisWideTCM thisHugeTCM + | thisType `elem` [SlimSeq, NucSeq] = newSlimCharacterData + | thisType `elem` [WideSeq, AminoSeq] = newWideCharacterData + | thisType == HugeSeq = newHugeCharacterData + | otherwise = error $ fold ["Unrecognised character type '", show thisType, "'in a DYNAMIC character branch"] + where + blankCharacterData = emptyCharacter + + newSlimCharacterData = + let r = + if useIA + then GV.zipWith (.|.) (slimIAFinal leftChar) (slimIAFinal rightChar) + else + let (_, (lG, _, rG)) = + slimPairwiseDO + thisSlimTCM + (makeDynamicCharacterFromSingleVector $ slimFinal leftChar) + (makeDynamicCharacterFromSingleVector $ slimFinal rightChar) + in GV.zipWith (.|.) lG rG + + r' = + if filterGaps + then extractMediansSingle r + else r + in blankCharacterData + { slimPrelim = r' + , slimGapped = (r', r', r') + , slimFinal = r' + , slimIAPrelim = (r, r, r) + , slimIAFinal = r + } + + newWideCharacterData = + let r = + if useIA + then GV.zipWith (.|.) (wideIAFinal leftChar) (wideIAFinal rightChar) + else + let coefficient = MR.minInDelCost thisWideTCM + (_, (lG, _, rG)) = + widePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisWideTCM) + (makeDynamicCharacterFromSingleVector $ wideFinal leftChar) + (makeDynamicCharacterFromSingleVector $ wideFinal rightChar) + in GV.zipWith (.|.) lG rG + -- r = GV.zipWith (.|.) (wideIAFinal leftChar) (wideIAFinal rightChar) + + r' = + if filterGaps + then extractMediansSingle r + else r + in blankCharacterData + { widePrelim = r' + , wideGapped = (r', r', r') + , wideFinal = r' + , wideIAPrelim = (r, r, r) + , wideIAFinal = r + } + + newHugeCharacterData = + let r = + if useIA + then GV.zipWith (.|.) (hugeIAFinal leftChar) (hugeIAFinal rightChar) + else + let coefficient = MR.minInDelCost thisHugeTCM + (_, (lG, _, rG)) = + hugePairwiseDO + coefficient + (MR.retreivePairwiseTCM thisHugeTCM) + (makeDynamicCharacterFromSingleVector $ hugeFinal leftChar) + (makeDynamicCharacterFromSingleVector $ hugeFinal rightChar) + in GV.zipWith (.|.) lG rG + + -- r = GV.zipWith (.|.) (hugeIAFinal leftChar) (hugeIAFinal rightChar) + + r' = + if filterGaps + then extractMediansSingle r + else r + in blankCharacterData + { hugePrelim = r' + , hugeGapped = (r', r', r') + , hugeFinal = r' + , hugeIAPrelim = (r, r, r) + , hugeIAFinal = r + } + + +{- union2SingleTuple is a wrapper around union2Single for parallelism -} +union2SingleTuple ∷ Bool → Bool → (CharacterData, CharacterData, CharInfo) → CharacterData +union2SingleTuple a b (c,d,e) = union2Single a b c d e + +{- | union2Single takes character data and returns union character data +union2Single assumes that the character vectors in the various states are the same length +that is--all leaves (hence other vertices later) have the same number of each type of character +used IAFinal states for dynamic characters +used in heurstic graph build and rearrangement +-} +union2Single ∷ Bool → Bool → CharacterData → CharacterData → CharInfo → CharacterData +union2Single useIA filterGaps firstVertChar secondVertChar inCharInfo = + let thisType = charType inCharInfo + thisMatrix = costMatrix inCharInfo + thisActive = activity inCharInfo + thisSlimTCM = slimTCM inCharInfo + thisWideTCM = wideTCM inCharInfo + thisHugeTCM = hugeTCM inCharInfo + in if not thisActive + then firstVertChar + else + if thisType == Add + then intervalUnion firstVertChar secondVertChar + else + if thisType == NonAdd + then localUnion firstVertChar secondVertChar + else + if thisType `elem` packedNonAddTypes + then BP.unionPacked firstVertChar secondVertChar + else + if thisType == Matrix + then unionMatrix thisMatrix firstVertChar secondVertChar + else + if thisType `elem` prealignedCharacterTypes + then getPrealignedUnion thisType firstVertChar secondVertChar + else + if thisType `elem` nonExactCharacterTypes + then getDynamicUnion useIA filterGaps thisType firstVertChar secondVertChar thisSlimTCM thisWideTCM thisHugeTCM + else error ("Character type " <> show thisType <> " unrecognized/not implemented") + + + +{- | getPreAligned2Median takes prealigned character types (AlignedSlim, AlignedWide, AlignedHuge) and returns 2-median and cost +uses IA-type functions for slim/wide/huge +adjustNoCost distance between two nodes or creating a median with two edges +calcuated by the no change for entire length based min of the two nodes to self for non-change +-} +getPreAligned2Median ∷ Bool → CharInfo → CharacterData → CharacterData → CharacterData → CharacterData +getPreAligned2Median isMedian charInfo nodeChar leftChar rightChar = + let setCost + ∷ ∀ {a} + . (Integral a) + ⇒ a + → CharacterData + → CharacterData + setCost cVal r = + r + { localCost = weight charInfo * fromIntegral cVal + , globalCost = sum [weight charInfo * fromIntegral cVal, globalCost leftChar, globalCost rightChar] + } + + setSlimPrelim v r = r{alignedSlimPrelim = v} + setWidePrelim v r = r{alignedWidePrelim = v} + setHugePrelim v r = r{alignedHugePrelim = v} + + getCharL + ∷ ∀ {k} {v ∷ k → Type} {e ∷ k} + . (CharacterData → OpenDynamicCharacter v e) + → v e + getCharL f = extractMediansGapped $ f leftChar + + getCharR + ∷ ∀ {k} {v ∷ k → Type} {e ∷ k} + . (CharacterData → OpenDynamicCharacter v e) + → v e + getCharR f = extractMediansGapped $ f rightChar + + (setter, cost) = case charType charInfo of + AlignedSlim → + let cL = getCharL alignedSlimPrelim + cR = getCharR alignedSlimPrelim + (cM, score) = get2WaySlim (slimTCM charInfo) cL cR + noChangeAdjustment = + if isMedian + then 0 + else + let (_, lCost) = get2WaySlim (slimTCM charInfo) cL cL + (_, rCost) = get2WaySlim (slimTCM charInfo) cR cR + in min lCost rCost + score' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (score, noChangeAdjustment) + in -- trace ("GPA2MS: " <> (show (score, score'))) $ + (setSlimPrelim (cL, cM, cR), score') + AlignedWide → + let cL = getCharL alignedWidePrelim + cR = getCharR alignedWidePrelim + (cM, score) = get2WayWideHuge (wideTCM charInfo) cL cR + noChangeAdjustment = + if isMedian + then 0 + else + let (_, lCost) = get2WayWideHuge (wideTCM charInfo) cL cL + (_, rCost) = get2WayWideHuge (wideTCM charInfo) cR cR + in min lCost rCost + score' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (score, noChangeAdjustment) + in (setWidePrelim (cL, cM, cR), score') + AlignedHuge → + let cL = getCharL alignedHugePrelim + cR = getCharR alignedHugePrelim + (cM, score) = get2WayWideHuge (hugeTCM charInfo) cL cR + noChangeAdjustment + | not isMedian = 0 + | otherwise = + let (_, lCost) = get2WayWideHuge (hugeTCM charInfo) cL cL + (_, rCost) = get2WayWideHuge (hugeTCM charInfo) cR cR + in min lCost rCost + score' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (score, noChangeAdjustment) + in (setHugePrelim (cL, cM, cR), score') + other → error $ "Unrecognized character type: " <> show other + in setter $ setCost cost nodeChar + +{- | getPreAligned2MedianUnionFields takes prealigned character types (AlignedSlim, AlignedWide, AlignedHuge) and returns 2-median and cost +uses IA-type functions for slim/wide/huge-- based on union fields +not sure if ever need adjust cost but in there to be complete +-} +getPreAligned2MedianUnionFields ∷ Bool → CharInfo → CharacterData → CharacterData → CharacterData → CharacterData +getPreAligned2MedianUnionFields isMedian charInfo nodeChar leftChar rightChar = + let characterType = charType charInfo + in if characterType == AlignedSlim + then + let cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (prelimChar, cost) = get2WaySlim (slimTCM charInfo) (alignedSlimUnion leftChar) (alignedSlimUnion rightChar) + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WaySlim (slimTCM charInfo) prelimChar prelimChar + in -- trace ("GPA2M-slim: " <> (show (GV.length prelimChar, GV.length $ alignedSlimUnion leftChar, GV.length $ alignedSlimUnion rightChar))) + nodeChar + { alignedSlimUnion = prelimChar + , localCost = weight charInfo * fromIntegral cost' + , globalCost = sum [weight charInfo * fromIntegral cost', globalCost leftChar, globalCost rightChar] + } + else + if characterType == AlignedWide + then + let cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (prelimChar, cost) = get2WayWideHuge (wideTCM charInfo) (alignedWideUnion leftChar) (alignedWideUnion rightChar) + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge (wideTCM charInfo) prelimChar prelimChar + in -- trace ("GPA2M-wide: " <> (show $ GV.length prelimChar)) + nodeChar + { alignedWideUnion = prelimChar + , localCost = weight charInfo * fromIntegral cost' + , globalCost = sum [weight charInfo * fromIntegral cost', globalCost leftChar, globalCost rightChar] + } + else + if characterType == AlignedHuge + then + let cost' = adjustNoCostNonZeroDiag isMedian (costMatrix charInfo) (cost, noChangeAdjust) + (prelimChar, cost) = get2WayWideHuge (hugeTCM charInfo) (alignedHugeUnion leftChar) (alignedHugeUnion rightChar) + noChangeAdjust = + if isMedian + then 0 + else snd $ get2WayWideHuge (hugeTCM charInfo) prelimChar prelimChar + in -- trace ("GPA2M-huge: " <> (show $ GV.length prelimChar)) + nodeChar + { alignedHugeUnion = prelimChar + , localCost = weight charInfo * fromIntegral cost' + , globalCost = sum [weight charInfo * fromIntegral cost', globalCost leftChar, globalCost rightChar] + } + else error ("Unrecognized character type " <> show characterType) + + +-- | makeIAUnionPrelimLeaf makes union and sets for leaf characters--leaves alignment fields unchanged +makeIAUnionPrelimLeaf ∷ CharInfo → CharacterData → CharacterData +makeIAUnionPrelimLeaf charInfo nodeChar = + let characterType = charType charInfo + in if characterType == NonAdd + then + let prelimState = snd3 $ stateBVPrelim nodeChar + in nodeChar{stateBVUnion = prelimState} + else + if characterType == Add + then + let prelimState = snd3 $ rangePrelim nodeChar + in nodeChar{rangeUnion = prelimState} + else + if characterType == Matrix + then + let prelimState = matrixStatesPrelim nodeChar + in nodeChar{matrixStatesUnion = prelimState} + else -- the checking for null in prelimstate (slim/wide/huge sequences) comes form case of splitting graph + -- and reoptimizing in swap and fuse but single leaf is split off + + if characterType `elem` [SlimSeq, NucSeq] + then + let prelimState = extractMediansGapped $ slimAlignment nodeChar + unionState = unionBVOrMissing prelimState (length $ alphabet charInfo) (slimGapped nodeChar) + in nodeChar + { slimIAPrelim = makeDynamicCharacterFromSingleVector unionState -- slimAlignment nodeChar + , slimIAFinal = unionState -- prelimState + , slimIAUnion = unionState + } + else + if characterType `elem` [WideSeq, AminoSeq] + then + let prelimState = extractMediansGapped $ wideAlignment nodeChar + unionState = unionBVOrMissing prelimState (length $ alphabet charInfo) (wideGapped nodeChar) + in nodeChar + { wideIAPrelim = makeDynamicCharacterFromSingleVector unionState -- ideAlignment nodeChar + , wideIAFinal = unionState -- prelimState + , wideIAUnion = unionState + } + else + if characterType == HugeSeq + then + let prelimState = extractMediansGapped $ hugeAlignment nodeChar + unionState = unionBVOrMissing prelimState (length $ alphabet charInfo) (hugeGapped nodeChar) + in nodeChar + { hugeIAPrelim = makeDynamicCharacterFromSingleVector unionState -- hugeAlignment nodeChar + , hugeIAFinal = unionState -- prelimState + , hugeIAUnion = unionState + } + else + if characterType == AlignedSlim + then + let prelimState = snd3 $ alignedSlimPrelim nodeChar + in nodeChar{alignedSlimUnion = prelimState} + else + if characterType == AlignedWide + then + let prelimState = snd3 $ alignedWidePrelim nodeChar + in nodeChar{alignedWideUnion = prelimState} + else + if characterType == AlignedHuge + then + let prelimState = snd3 $ alignedHugePrelim nodeChar + in nodeChar{alignedHugeUnion = prelimState} + else + if characterType `elem` [Packed2, Packed4, Packed5, Packed8, Packed64] + then + let prelimState = snd3 $ packedNonAddPrelim nodeChar + in nodeChar{packedNonAddUnion = prelimState} + else error ("Unrecognized character type " <> show characterType) + + +{- | unionBVOrMissing returns leaf algnment state, if all '-' converts to missing characters + so BV unions and IAFinal get a zero cost as opposed to checking against all '-' seqquence +-} +unionBVOrMissing ∷ (FiniteBits e, GV.Vector v e) ⇒ v e → Int → (v e, v e, v e) → v e +unionBVOrMissing prelimState alphSize nodeGapped = + if not $ GV.null prelimState + then + if GV.null $ extractMediansSingle prelimState + then -- trace ("MIAUPL: " <> (show $ convertIfAllGapsToAllBitsOn (length $ alphabet charInfo) prelimState)) + convertAllGapsToAllBitsOn alphSize prelimState + else prelimState + else extractMedians nodeGapped + + +{- | convertAllGapsToAllBitsOn takes a single fields of a dynamic character and +converts replaces +'gaps' with all bits on--in essence '?' or missing element +-} +convertAllGapsToAllBitsOn ∷ (FiniteBits e, GV.Vector v e) ⇒ Int → v e → v e +convertAllGapsToAllBitsOn alphSize inVect = + if GV.null inVect + then inVect + else + let numElements = GV.length inVect + onBitList = fmap (setBit (inVect GV.! 0)) [0 .. alphSize - 1] + onBits = L.foldl1' (.|.) onBitList + in GV.replicate numElements onBits + + +-- | allMissingBits test if all bits in alphabet size are ON +allMissingBits ∷ (FiniteBits e, GV.Vector v e) ⇒ Int → v e → Bool +allMissingBits alphSize inVect = + not (GV.null inVect) + && ( let onBitList = fmap (setBit (inVect GV.! 0)) [0 .. alphSize - 1] + missingBits = L.foldl1' (.|.) onBitList + offBits = GV.filter (/= missingBits) inVect + in GV.null offBits + ) + +{- | makeIAPrelimCharacter takes two characters and performs 2-way assignment +based on character type and nodeChar +modifies both IA and union fields +union fields are unions of parent states (aligned, or IA, or static) +does not adjust for no change cost stuff since never used for actual cost--just assignment (I hope) +-} +makeIAPrelimCharacter ∷ Bool → CharInfo → CharacterData → CharacterData → CharacterData → CharacterData +makeIAPrelimCharacter _ charInfo nodeChar leftChar rightChar = + let characterType = charType charInfo + in if characterType == NonAdd + then + let leftState = stateBVUnion leftChar + rightState = stateBVUnion rightChar + unionState = V.zipWith (.|.) leftState rightState + in nodeChar{stateBVUnion = unionState} + else + if characterType == Add + then + let prelimState = + V.zipWith4 + getUnionRange + (V.map fst $ rangeUnion leftChar) + (V.map snd $ rangeUnion leftChar) + (V.map fst $ rangeUnion rightChar) + (V.map snd $ rangeUnion rightChar) + in nodeChar{rangeUnion = prelimState} + else + if characterType == Matrix + then + let numStates = length $ costMatrix charInfo + newMatrixVector = getUnionVector (costMatrix charInfo) numStates <$> V.zip (matrixStatesUnion leftChar) (matrixStatesUnion rightChar) + in nodeChar{matrixStatesUnion = newMatrixVector} + else + if characterType == AlignedSlim + then + let prelimState = GV.zipWith (.|.) (alignedSlimUnion leftChar) (alignedSlimUnion rightChar) + in nodeChar{alignedSlimUnion = prelimState} + else + if characterType == AlignedWide + then + let prelimState = GV.zipWith (.|.) (alignedWideUnion leftChar) (alignedWideUnion rightChar) + in nodeChar{alignedWideUnion = prelimState} + else + if characterType == AlignedHuge + then + let prelimState = GV.zipWith (.|.) (alignedHugeUnion leftChar) (alignedHugeUnion rightChar) + in nodeChar{alignedHugeUnion = prelimState} + else + if characterType `elem` [Packed2, Packed4, Packed5, Packed8, Packed64] + then + let prelimState = GV.zipWith (.|.) (packedNonAddUnion leftChar) (packedNonAddUnion leftChar) + in nodeChar{packedNonAddUnion = prelimState} + else + if characterType `elem` [SlimSeq, NucSeq] + then + let (prelimChar, cost) = get2WaySlim (slimTCM charInfo) (extractMediansGapped $ slimIAPrelim leftChar) (extractMediansGapped $ slimIAPrelim rightChar) + in -- trace ("MPC: " <> (show prelimChar) <> "\nleft: " <> (show $ extractMediansGapped $ slimIAPrelim leftChar) <> "\nright: " <> (show $ extractMediansGapped $ slimIAPrelim rightChar)) + -- trace ("MIAUP-C: " <> (show $ GV.length $ GV.zipWith (.|.) (slimIAUnion leftChar) (slimIAUnion rightChar))) + + -- the check for all missing basically creates an intersection result so all missing isn't porpagated post-order + nodeChar + { slimIAPrelim = + ( extractMediansGapped $ slimIAPrelim leftChar + , prelimChar + , extractMediansGapped $ slimIAPrelim rightChar + ) + , slimIAUnion = orBVOrMissingIntersection (length $ alphabet charInfo) (slimIAUnion leftChar) (slimIAUnion rightChar) + , localCost = + if GV.null (slimIAUnion leftChar) || GV.null (slimIAUnion rightChar) + then 0 + else weight charInfo * fromIntegral cost + , globalCost = sum [weight charInfo * fromIntegral cost, globalCost leftChar, globalCost rightChar] + } + else + if characterType `elem` [WideSeq, AminoSeq] + then + let (prelimChar, minCost) = + get2WayWideHuge + (wideTCM charInfo) + (extractMediansGapped $ wideIAPrelim leftChar) + (extractMediansGapped $ wideIAPrelim rightChar) + in nodeChar + { wideIAPrelim = + ( extractMediansGapped $ wideIAPrelim leftChar + , prelimChar + , extractMediansGapped $ wideIAPrelim rightChar + ) + , wideIAUnion = orBVOrMissingIntersection (length $ alphabet charInfo) (wideIAUnion leftChar) (wideIAUnion rightChar) + , localCost = weight charInfo * fromIntegral minCost + , globalCost = sum [weight charInfo * fromIntegral minCost, globalCost leftChar, globalCost rightChar] + } + else + if characterType == HugeSeq + then + let (prelimChar, minCost) = + get2WayWideHuge + (hugeTCM charInfo) + (extractMediansGapped $ hugeIAPrelim leftChar) + (extractMediansGapped $ hugeIAPrelim rightChar) + in nodeChar + { hugeIAPrelim = + ( extractMediansGapped $ hugeIAPrelim leftChar + , prelimChar + , extractMediansGapped $ hugeIAPrelim rightChar + ) + , hugeIAUnion = orBVOrMissingIntersection (length $ alphabet charInfo) (hugeIAUnion leftChar) (hugeIAUnion rightChar) + , localCost = weight charInfo * fromIntegral minCost + , globalCost = sum [weight charInfo * fromIntegral minCost, globalCost leftChar, globalCost rightChar] + } + else nodeChar -- error ("Unrecognized character type " <> show characterType) + + +-- | orBVOrMissingIntersection takes two uninBV seqs and returns union or intersectino if one/both missing +orBVOrMissingIntersection ∷ (FiniteBits e, GV.Vector v e) ⇒ Int → v e → v e → v e +orBVOrMissingIntersection alphSize unionIALeft unionIARight + | GV.null unionIALeft || allMissingBits alphSize unionIALeft = unionIARight + | GV.null unionIARight || allMissingBits alphSize unionIARight = unionIALeft + | otherwise = GV.zipWith (.|.) unionIALeft unionIARight + + +{- | makeIAFinalCharacterStaticIA takes two characters and performs 2-way assignment +based on character type and nodeChar--only IA fields are modified +this pulls from current node for left and right states +skips other than unaligned sequence characters +-} +makeIAFinalCharacter ∷ AssignmentMethod → CharInfo → CharacterData → CharacterData → CharacterData +makeIAFinalCharacter finalMethod charInfo nodeChar parentChar = + let characterType = charType charInfo + in if characterType `elem` [SlimSeq, NucSeq] + then + let finalIAChar = + getFinal3WaySlim + (slimTCM charInfo) + (slimIAFinal parentChar) + (extractMediansLeftGapped $ slimIAPrelim nodeChar) + (extractMediansRightGapped $ slimIAPrelim nodeChar) + finalChar = + if finalMethod == ImpliedAlignment + then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar + else slimFinal nodeChar + in nodeChar + { slimIAFinal = finalIAChar + , slimFinal = finalChar + } + else + if characterType `elem` [WideSeq, AminoSeq] + then + let finalIAChar = + getFinal3WayWideHuge + (wideTCM charInfo) + (wideIAFinal parentChar) + (extractMediansLeftGapped $ wideIAPrelim nodeChar) + (extractMediansRightGapped $ wideIAPrelim nodeChar) + finalChar = + if finalMethod == ImpliedAlignment + then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar + else wideFinal nodeChar + in nodeChar + { wideIAFinal = finalIAChar + , wideFinal = finalChar + } + else + if characterType == HugeSeq + then + let finalIAChar = + getFinal3WayWideHuge + (hugeTCM charInfo) + (hugeIAFinal parentChar) + (extractMediansLeftGapped $ hugeIAPrelim nodeChar) + (extractMediansRightGapped $ hugeIAPrelim nodeChar) + finalChar = + if finalMethod == ImpliedAlignment + then extractMedians $ makeDynamicCharacterFromSingleVector finalIAChar + else hugeFinal nodeChar + in nodeChar + { hugeIAFinal = finalIAChar + , hugeFinal = finalChar + } + else nodeChar -- error ("Unrecognized character type " <> show characterType) + + +-- | get2WaySlim takes two slim vectors an produces a preliminary median +get2WayGeneric + ∷ ∀ e (v ∷ Type → Type). (GV.Vector v e) ⇒ (e → e → (e, Word)) → v e → v e → (v e, Word) +get2WayGeneric tcm descendantLeftPrelim descendantRightPrelim = + let -- this should not be needed some problems at times with IA + -- len = GV.length descendantLeftPrelim + len = min (GV.length descendantLeftPrelim) (GV.length descendantRightPrelim) + vt = V.generate len $ \i → tcm (descendantLeftPrelim GV.! i) (descendantRightPrelim GV.! i) -- :: V.Vector (SlimState, Word) + gen ∷ (GV.Vector v a) ⇒ V.Vector (a, b) → v a + gen v = let med i = fst $ v V.! i in GV.generate len med + + add ∷ (Num b) ⇒ V.Vector (a, b) → b + add = V.foldl' (\x e → x + snd e) 0 + in (,) <$> gen <*> add $ vt + + +-- | get2WaySlim takes two slim vectors an produces a preliminary median +get2WaySlim ∷ TCMD.DenseTransitionCostMatrix → SV.Vector SlimState → SV.Vector SlimState → (SV.Vector SlimState, Word) +get2WaySlim lSlimTCM = get2WayGeneric (TCMD.lookupPairwise lSlimTCM) + + +-- | get2WayWideHuge like get2WaySlim but for wide and huge characters +get2WayWideHuge ∷ (FiniteBits a, GV.Vector v a) ⇒ MR.MetricRepresentation a → v a → v a → (v a, Word) +get2WayWideHuge whTCM = get2WayGeneric (MR.retreivePairwiseTCM whTCM) + + +{- | getFinal3Way takes parent final assignment (including indel characters) and descendent +preliminary gapped assingment from postorder and creates a gapped final assignment based on +minimum cost median for the three inputs. THis is done to preserve the ImpliedAlignment +information to create a final assingment with out an additional DO call to keep the +creation linear in sequence length. Since gaps remain--they must be filtered when output or +used as true final sequence assignments using M.createUngappedMedianSequence +-} +getFinal3WaySlim + ∷ TCMD.DenseTransitionCostMatrix → SV.Vector SlimState → SV.Vector SlimState → SV.Vector SlimState → SV.Vector SlimState +getFinal3WaySlim lSlimTCM parentFinal descendantLeftPrelim descendantRightPrelim = + let newFinal = removeGapAndNil $ SV.zipWith3 (local3WaySlim lSlimTCM) parentFinal descendantLeftPrelim descendantRightPrelim + in newFinal + + +-- | getFinal3WayWideHuge like getFinal3WaySlim but for wide and huge characters +getFinal3WayWideHuge ∷ (FiniteBits a, GV.Vector v a) ⇒ MR.MetricRepresentation a → v a → v a → v a → v a +getFinal3WayWideHuge whTCM parentFinal descendantLeftPrelim descendantRightPrelim = + let newFinal = removeGapAndNil $ GV.zipWith3 (local3WayWideHuge whTCM) parentFinal descendantLeftPrelim descendantRightPrelim + in newFinal + + +-- | local3WayWideHuge takes tripples for wide and huge sequence types and returns median +local3WayWideHuge ∷ (FiniteBits a) ⇒ MR.MetricRepresentation a → a → a → a → a +local3WayWideHuge lWideTCM b c d = + let -- b' = if b == zeroBits then gap else b + -- c' = if c == zeroBits then gap else c + -- d' = if d == zeroBits then gap else d + (median, _) = MR.retreiveThreewayTCM lWideTCM b c d + in -- trace ((show b) <> " " <> (show c) <> " " <> (show d) <> " => " <> (show median)) + median + + +-- | local3WaySlim takes triple of SlimState and retuns median +local3WaySlim ∷ TCMD.DenseTransitionCostMatrix → SlimState → SlimState → SlimState → SlimState +local3WaySlim lSlimTCM b c d = + -- trace ("L3WS: " <> (show (b,c,d))) ( + let + in -- b' = if b == zeroBits then gap else b + -- c' = if c == zeroBits then gap else c + -- d' = if d == zeroBits then gap else d + + -- trace ("L3WS: " <> (show (b',c',d'))) ( + let (median, _) = TCMD.lookupThreeway lSlimTCM b c d + in -- trace ("3way: " <> (show b) <> " " <> (show c) <> " " <> (show d) <> " => " <> (show (median, cost))) + median + + +-- ) + +-- | generalSequenceDiff takes two sequence elemental bit types and returns min and max integer +-- cost differences using matrix values +-- if value has no bits on--it is set to 0th bit on for GAP or 0 for minimum +-- the Nil Stuff can be an issue for counting +-- can't short circuit if equal due to PMDL stuff +generalSequenceDiff ∷ (FiniteBits a, Show a) ⇒ S.Matrix Int → Int → a → a → (Int, Int) +generalSequenceDiff thisMatrix numStates uState vState = + --trace ("GSD: " <> (show (numStates, uState, vState))) $ + -- these are checks for Nil + let minState = if uState == (uState `xor` uState) then 0 + else if vState == (uState `xor` uState) then 0 + else maxBound :: Int + in + let (minBits, maxBits) = + let gapIfNil ∷ ∀ {a}. (Bits a) ⇒ a → a + gapIfNil x + | popCount x == 0 = (x `xor` x) `setBit` fromEnum gapIndex + | otherwise = x + uState' = gapIfNil uState + vState' = gapIfNil vState + uStateList = fmap snd $ filter fst $ zip (fmap (testBit uState') [0 .. numStates - 1]) [0 .. numStates - 1] + vStateList = fmap snd $ filter fst $ zip (fmap (testBit vState') [0 .. numStates - 1]) [0 .. numStates - 1] + uvCombinations = cartProd uStateList vStateList + costOfPairs = fmap (thisMatrix S.!) uvCombinations + in -- trace ("GSD: " <> (show uStateList) <> " " <> (show vStateList) <> " min " <> (show $ minimum costOfPairs) <> " max " <> (show $ maximum costOfPairs)) + (minimum costOfPairs, maximum costOfPairs) + in + (min minState minBits, maxBits) + + + +-- ) + +-- | getNoGapPrelimContext takes gaps and nils out of left, median, and right of gapped structure +getNoGapPrelimContext + ∷ ( FiniteBits e + , GV.Vector v e + ) + ⇒ OpenDynamicCharacter v e + → OpenDynamicCharacter v e +getNoGapPrelimContext prelimContext = + let lhs = extractMediansLeft prelimContext + med = extractMedians prelimContext + rhs = extractMediansRight prelimContext + in (lhs, med, rhs) diff --git a/src/GraphOptimization/PostOrderSoftWiredFunctions.hs b/src/GraphOptimization/PostOrderSoftWiredFunctions.hs new file mode 100644 index 000000000..3ee750134 --- /dev/null +++ b/src/GraphOptimization/PostOrderSoftWiredFunctions.hs @@ -0,0 +1,1318 @@ +{- | +Module : PostOrderSoftWiredFunctions.hs +Description : Module specifying post-order softwiired graph functions +Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module GraphOptimization.PostOrderSoftWiredFunctions ( + updateAndFinalizePostOrderSoftWired, + postOrderSoftWiredTraversal, + makeLeafGraphSoftWired, + getDisplayBasedRerootSoftWired, + divideDecoratedGraphByBlockAndCharacterTree, + postOrderTreeTraversal, + postDecorateTree, + createVertexDataOverBlocks, + createVertexDataOverBlocksStaticIA, + createVertexDataOverBlocksNonExact, + getBlockCost, + getW15NetPenalty, + getW15NetPenaltyFull, + getW23NetPenalty, + getW23NetPenaltyReduced, + getW15RootCost, + -- , getNetPenalty + -- , getNetPenaltyReduced +) where + +import Control.DeepSeq +import Control.Parallel.Strategies +import Data.Bits +import Data.Functor ((<&>)) +import Data.List qualified as L +import Data.Maybe +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import GeneralUtilities +import GraphFormatUtilities qualified as GFU +import GraphOptimization.Medians qualified as M +import GraphOptimization.PostOrderSoftWiredFunctionsNew qualified as NEW +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U +import Debug.Trace + + +{- | naivePostOrderSoftWiredTraversal produces the post-order result for a softwired graph using +a naive algorithm where all display trees are generated and diagnosed, keeping the best results +to return a phylogenetic graph +any network penalty is not applied as in postOrderSoftWiredTraversal +not contracting in=1 out =1 nodes so that indexing will be consistent +does not create resolution cache data structures +really only for diagnosis and time complexity comparison with resolution cache algorithm +-} +naivePostOrderSoftWiredTraversal + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +naivePostOrderSoftWiredTraversal inGS inData@(_, _, blockDataVect) leafGraph startVertex inSimpleGraph = + -- this is a lazy list so can be consumed and not an issue with exponential number of Trees + let contractIn1Out1Nodes = False + -- (_, _, _, netVertexList) = LG.splitVertexList inSimpleGraph + displayTreeList = LG.generateDisplayTrees contractIn1Out1Nodes inSimpleGraph + + -- get root index + rootIndex = + if isJust startVertex + then fromJust startVertex + else fst $ head $ LG.getRoots inSimpleGraph + in do + -- get the best traversals of the best display trees for each block + bestDisplayResult ← getBestDisplayCharBlockList inGS inData leafGraph rootIndex 0 [] [] displayTreeList + let (bestTripleInfo, _) = bestDisplayResult -- getBestDisplayCharBlockList inGS inData leafGraph rootIndex 0 [] [] displayTreeList + let (blockCostList, bestDisplayTreeList, charTreeVectList) = unzip3 bestTripleInfo + + -- extract specific information to create the phylogenetic graph + let graphCost = sum blockCostList + let displayTreeVect = V.fromList bestDisplayTreeList + let charTreeVectVect = V.fromList charTreeVectList + + -- propagate display node assignment to canonical graph + -- does not have correct VertInfo--just character assignments + -- to fox would need to propagate (and update other vertinfo like BV) via postorder pass + let newCononicalGraph = NEW.backPortBlockTreeNodesToCanonicalGraph (GO.convertSimpleToDecoratedGraph inSimpleGraph) displayTreeVect + + -- create postorder Phylgenetic graph + let postOrderPhyloGraph = (inSimpleGraph, graphCost, newCononicalGraph, fmap (: []) displayTreeVect, charTreeVectVect, (fmap thd3 blockDataVect)) + + -- trace ("NPOSW: " <> (show $ fmap bvLabel $ fmap snd $ LG.labNodes newCononicalGraph) <> "\nDisplay :" <> (show $ fmap bvLabel $ fmap snd $ LG.labNodes $ V.head displayTreeVect)) + pure postOrderPhyloGraph + + +{- | getBestDisplayCharBlockList takes a Tree gets best rootings, compares to input list if there is one and takes better +returning triple of block cost, display tree, char vect from better tree +-} +getBestDisplayCharBlockList + ∷ GlobalSettings + → ProcessedData + → DecoratedGraph + → Int + → Int + → [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] + → [PhylogeneticGraph] + → [SimpleGraph] + → PhyG ([(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)], [PhylogeneticGraph]) +getBestDisplayCharBlockList inGS inData leafGraph rootIndex treeCounter currentBestTriple currentBestTreeList displayTreeList = + if null displayTreeList + then -- trace ("\tExamined " <> (show treeCounter) <> " display trees") + pure (currentBestTriple, currentBestTreeList) + else -- trace ("GBDCBL Trees: " <> (show $ length displayTreeList)) ( + -- take first graph + + let -- get number of threads for parallel evaluation of display trees + -- can set +RTS -N1 if CPUTime is off + + numDisplayTreesToEvaluate = graphsSteepest inGS -- PU.getNumThreads + firstGraphList = take numDisplayTreesToEvaluate displayTreeList + + staticIA = False + + postOrderAction ∷ SimpleGraph → PhyG PhylogeneticGraph + postOrderAction = postOrderTreeTraversal inGS inData leafGraph staticIA (Just rootIndex) + + displayAction ∷ PhylogeneticGraph → PhyG PhylogeneticGraph + displayAction = getDisplayBasedRerootSoftWired' inGS Tree rootIndex + + tripleAction ∷ PhylogeneticGraph → [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] + tripleAction = getTreeTriple rootIndex + in do + -- diagnose post order as Tree + postOrderPar ← getParallelChunkTraverse + outgroupDiagnosedTreeList <- postOrderPar postOrderAction firstGraphList + -- PU.seqParMap (parStrategy $ lazyParStrat inGS) (postOrderTreeTraversal inGS inData leafGraph staticIA (Just rootIndex)) firstGraphList + + -- do rerooting of character trees + multiTraverseTreeList ← + getParallelChunkTraverse >>= \pTraverse → + displayAction `pTraverse` outgroupDiagnosedTreeList + + -- extract triple (relevent info)--sets if multitraverse (reroot characters) or not + multiTraverseTripleList ← + getParallelChunkMap <&> \pMap → + pMap tripleAction $ + if multiTraverseCharacters inGS + then multiTraverseTreeList + else outgroupDiagnosedTreeList + + -- choose better vs currentBestTriple + -- this can be folded for a list > 2 + let newBestTriple = L.foldl' chooseBetterTriple currentBestTriple multiTraverseTripleList -- multiTraverseTree + + -- save best overall dysplay trees for later use in penalty phase + newBestTreeList ← GO.selectGraphsFull Best (maxBound ∷ Int) 0.0 $ multiTraverseTreeList <> currentBestTreeList + + -- trace ("GBDCBL: " <> (show (fmap snd6 currentBestTreeList, fmap snd6 newBestTreeList, fmap snd6 multiTraverseTreeList))) + getBestDisplayCharBlockList + inGS + inData + leafGraph + rootIndex + (treeCounter + (length firstGraphList)) + newBestTriple + newBestTreeList + (drop numDisplayTreesToEvaluate displayTreeList) + + +-- | getTreeTriple takes a phylogenetic gaph and returns the triple list of block cost, display tree, and character graphs +getTreeTriple ∷ LG.Node → PhylogeneticGraph → [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] +getTreeTriple rootIndex inGraph@(g, _, _, _, _, _) + | LG.isEmpty g = [] + | otherwise = + let blockCostList = V.toList $ fmap (getBlockCost rootIndex) (fft6 inGraph) + in zip3 blockCostList (L.replicate (length blockCostList) (thd6 inGraph)) . V.toList $ fft6 inGraph + + +{- | chooseBetterTriple takes the current best triplet of graph data and compares to Phylogenetic graph +and creates a new triple of better block cost, displayGraph for blocks, and character graphs +-} +chooseBetterTriple + ∷ [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] + → [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] + → [(VertexCost, DecoratedGraph, V.Vector DecoratedGraph)] +chooseBetterTriple inTripleList newTripleList = + if null inTripleList + then newTripleList + else -- compare current to previous, take better of two + + zipWith chooseBetterBlock inTripleList newTripleList + where + chooseBetterBlock (a, b, c) (a', b', c') = if a' > a then (a, b, c) else (a', b', c') + + +-- | getBlockCost takes a block and returns sum of root character tree costs +getBlockCost ∷ LG.Node → V.Vector DecoratedGraph → VertexCost +getBlockCost rootIndex characterTreeVect = + if V.null characterTreeVect + then error "Empty character block vector in getBlockCost" + else V.sum $ fmap (getCharacterTreeCost rootIndex) characterTreeVect + + +-- | getCharacterTreeCost takes a character Tree and returns costs +getCharacterTreeCost ∷ LG.Node → DecoratedGraph → VertexCost +getCharacterTreeCost rootIndex characterTree = + if LG.isEmpty characterTree + then error "Empty charcter tree in getCharacterTreeCost" + else (subGraphCost . snd) $ LG.labelNode characterTree rootIndex + + +-- | postOrderSoftWiredTraversal is a wrapper to allow correct function choice for alternate softwired algorithms +postOrderSoftWiredTraversal + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Bool → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +postOrderSoftWiredTraversal inGS inData leafGraph _ startVertex inSimpleGraph = + -- firt case shouldn't happen--just checking if naive is chosen + if graphType inGS == Tree + then postOrderSoftWiredTraversal' inGS inData leafGraph startVertex inSimpleGraph + else + if softWiredMethod inGS == ResolutionCache + then postOrderSoftWiredTraversal' inGS inData leafGraph startVertex inSimpleGraph + else naivePostOrderSoftWiredTraversal inGS inData leafGraph startVertex inSimpleGraph + + +{- | postOrderSoftWiredTraversal' sets up and calls postorder traversal on Soft-wired graph +at root +staticIA is ignored--but kept for functional polymorphism +ur-root = ntaxa is an invariant +-} +postOrderSoftWiredTraversal' + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +postOrderSoftWiredTraversal' inGS inData@(_, _, blockDataVect) leafGraph startVertex inSimpleGraph = + if LG.isEmpty inSimpleGraph + then pure emptyPhylogeneticGraph + else -- Assumes root is Number of Leaves--should be invariant everywhere + + let rootIndex = + if startVertex == Nothing + then V.length $ fst3 inData + else fromJust startVertex + blockCharInfo = V.map thd3 blockDataVect + in do + -- newSoftWired = postDecorateSoftWired inGS inSimpleGraph leafGraph blockCharInfo rootIndex rootIndex + newSoftWired ← NEW.postDecorateSoftWired inGS inSimpleGraph leafGraph blockCharInfo rootIndex rootIndex + + if (startVertex == Nothing) && (not $ LG.isRoot inSimpleGraph rootIndex) + then + let localRootList = fst <$> LG.getRoots inSimpleGraph + localRootEdges = concatMap (LG.out inSimpleGraph) localRootList + currentRootEdges = LG.out inSimpleGraph rootIndex + in error + ( "Index " + <> show rootIndex + <> " with edges " + <> show currentRootEdges + <> " not root in graph:" + <> show localRootList + <> " edges:" + <> show localRootEdges + <> "\n" + <> LG.prettify inSimpleGraph + ) + else pure newSoftWired + + +-- | getDisplayBasedRerootSoftWired is a wrapper to allow correct function choice for alternate softwired algorithms +getDisplayBasedRerootSoftWired ∷ GlobalSettings → GraphType → LG.Node → PhylogeneticGraph → PhyG PhylogeneticGraph +getDisplayBasedRerootSoftWired inGS inGraphType rootIndex inPhyloGraph = + -- check if doing rerooting--if not then return existing graph + if inGraphType == Tree + then getDisplayBasedRerootSoftWired' inGS inGraphType rootIndex inPhyloGraph + else + if softWiredMethod inGS == ResolutionCache + then getDisplayBasedRerootSoftWired' inGS inGraphType rootIndex inPhyloGraph + else pure $ naiveGetDisplayBasedRerootSoftWired inGS inGraphType rootIndex inPhyloGraph + + +{- | naiveGetDisplayBasedRerootSoftWired is the naive (based on all resolution display trees) +the work of getDisplayBasedRerootSoftWired' is already done (rerooting and all) in naivePostOrderSoftWiredTraversal +-} +naiveGetDisplayBasedRerootSoftWired ∷ GlobalSettings → GraphType → LG.Node → PhylogeneticGraph → PhylogeneticGraph +naiveGetDisplayBasedRerootSoftWired _ _ _ inPhyloGraph = + inPhyloGraph + + +{- | getDisplayBasedRerootSoftWired' takes a graph and generates reroot costs for each character of each block +based on rerooting the display tree for that block. +Written for soft-wired, but could be modified for tree (data split on vertdata not resolution data) +this is a differnt approach from that of "tree" where the decorated, canonical tree is rerooted and each character and block +cost determined from that single rerooting +this should help avoid the issue of rerooting complex, reticulate graphs and maintaining +all the condition (cycles, time consistency etc) that occur. +done correcly this should be able to be used for trees (all display trees same = cononical graph) as +well as softwired, but not for hardwired where reticulations are maintained. +-} + +-- this can be modified for Tree data structres--basically by starting with vertdata initially without +-- resolutoin data trace back--should be more efficient in many was than existing code + +-- Input display trees are for reporting only and do not contain actual character data so must be "pulled" +-- from cononical Decorated graph (thd field) +-- the list :[] stuff due to potential list of diplay trees not employed here +getDisplayBasedRerootSoftWired' ∷ GlobalSettings → GraphType → LG.Node → PhylogeneticGraph → PhyG PhylogeneticGraph +getDisplayBasedRerootSoftWired' inGS inGraphType rootIndex inPhyloGraph@(a, b, decGraph, _, _, f) = + if LG.isEmpty (fst6 inPhyloGraph) + then pure inPhyloGraph + else do + -- update with pass to retrieve vert data from resolution data + -- Trfee allready has data in vertData field + (inSimpleGraph, _, inDecGraph, inBlockGraphV', inBlockCharGraphVV', charInfoVV) ← + if inGraphType == Tree + then + let (displayTrees, charTrees) = divideDecoratedGraphByBlockAndCharacterTree decGraph + in pure (a, b, decGraph, displayTrees, charTrees, f) + else updateAndFinalizePostOrderSoftWired (Just rootIndex) rootIndex inPhyloGraph + + -- purge double edges from display and character graphs + -- this should not be happening--issue with postorder network resolutions data + let (inBlockGraphV, inBlockCharGraphVV) = + if inGraphType == Tree + then (inBlockGraphV', inBlockCharGraphVV') + else (fmap (fmap LG.removeDuplicateEdges) inBlockGraphV', fmap (fmap LG.removeDuplicateEdges) inBlockCharGraphVV') + + -- reroot block character trees + rerootResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (rerootBlockCharTrees' inGS rootIndex) . zip3 (V.toList $ fmap head inBlockGraphV) (V.toList inBlockCharGraphVV) $ + V.toList charInfoVV + + let (newBlockDisplayTreeVect, newBlockCharGraphVV, blockCostV) = unzip3 rerootResult + let newCononicalGraph = NEW.backPortBlockTreeNodesToCanonicalGraph inDecGraph (V.fromList newBlockDisplayTreeVect) + pure + ( inSimpleGraph + , sum blockCostV + , newCononicalGraph + , V.fromList $ fmap (: []) newBlockDisplayTreeVect + , V.fromList newBlockCharGraphVV + , charInfoVV + ) + + +-- | rerootBlockCharTrees' wrapper around rerootBlockCharTrees to allow for parMap +rerootBlockCharTrees' + ∷ GlobalSettings + → LG.Node + → (DecoratedGraph, V.Vector DecoratedGraph, V.Vector CharInfo) + → PhyG (DecoratedGraph, V.Vector DecoratedGraph, VertexCost) +rerootBlockCharTrees' inGS rootIndex (blockDisplayTree, charTreeVect, charInfoVect) = rerootBlockCharTrees inGS rootIndex blockDisplayTree charTreeVect charInfoVect + + +{- | rerootBlockCharTrees reroots all character trees (via fmap) in block returns best block char trees and costs +with best character tree node assignment back ported to display tree +-} +rerootBlockCharTrees + ∷ GlobalSettings + → LG.Node + → DecoratedGraph + → V.Vector DecoratedGraph + → V.Vector CharInfo + → PhyG (DecoratedGraph, V.Vector DecoratedGraph, VertexCost) +rerootBlockCharTrees inGS rootIndex blockDisplayTree charTreeVect charInfoVect = + if V.null charTreeVect + then error "Empty tree vector in rerootBlockCharTrees" + else + let -- next edges (to vertex in list) to perform rerooting + -- progresses recursively over adjacent edges to minimize node reoptimization + -- since initially all same graph can get initial reroot nodes from display tree + childrenOfRoot = LG.descendants blockDisplayTree rootIndex + grandChildrenOfRoot = concatMap (LG.descendants blockDisplayTree) childrenOfRoot + + getCharAction ∷ (DecoratedGraph, CharInfo) → (DecoratedGraph, VertexCost) + getCharAction = getCharTreeBestRoot' inGS rootIndex grandChildrenOfRoot + in do + -- leaving parallel since can be few blocks + -- (rerootedCharTreeVect, rerootedCostVect) = unzip (PU.seqParMap (parStrategy $ lazyParStrat inGS) (getCharTreeBestRoot' rootIndex grandChildrenOfRoot) (zip (V.toList charTreeVect) (V.toList charInfoVect))) + + (updateBlockDisplayTree, updatedDisplayVect, blockCost) ← + if multiTraverseCharacters inGS == True + then do + getCharPar ← getParallelChunkMap + let charResult = getCharPar getCharAction (zip (V.toList charTreeVect) (V.toList charInfoVect)) + let (rerootedCharTreeVect, rerootedCostVect) = unzip charResult + pure + ( backPortCharTreeNodesToBlockTree blockDisplayTree (V.fromList rerootedCharTreeVect) + , V.fromList rerootedCharTreeVect + , sum rerootedCostVect + ) + else + let rootCharLabelNodes = fmap (LG.labelNodeFlip rootIndex) charTreeVect + existingCost = sum $ fmap (subGraphCost . snd) rootCharLabelNodes + in pure (backPortCharTreeNodesToBlockTree blockDisplayTree charTreeVect, charTreeVect, existingCost) + pure (updateBlockDisplayTree, updatedDisplayVect, blockCost) + + +-- | getCharTreeBestRoot' is awrapper around getCharTreeBestRoot to use parMap +getCharTreeBestRoot' ∷ GlobalSettings → LG.Node → [LG.Node] → (DecoratedGraph, CharInfo) → (DecoratedGraph, VertexCost) +getCharTreeBestRoot' inGS rootIndex nodesToRoot (inCharacterGraph, charInfo) = + getCharTreeBestRoot inGS rootIndex nodesToRoot inCharacterGraph charInfo + + +-- | getCharTreeBestRoot takes the root index, a character tree (from a block) and its character info + +--- and prerforms the rerootings of that character tree to get the best reroot cost and preliminary assignments +getCharTreeBestRoot ∷ GlobalSettings → LG.Node → [LG.Node] → DecoratedGraph → CharInfo → (DecoratedGraph, VertexCost) +getCharTreeBestRoot inGS rootIndex nodesToRoot inCharacterGraph charInfo = + -- if prealigned should be rerooted? + let (bestRootCharGraph, bestRootCost) = + if (charType charInfo `notElem` sequenceCharacterTypes) + then (inCharacterGraph, (subGraphCost . snd) $ LG.labelNode inCharacterGraph rootIndex) + else rerootCharacterTree inGS rootIndex nodesToRoot charInfo inCharacterGraph + in (bestRootCharGraph, bestRootCost) + + +-- | rerootCharacterTree wrapper around rerootCharacterTree' with cleaner interface for "best" results +rerootCharacterTree ∷ GlobalSettings → LG.Node → [LG.Node] → CharInfo → DecoratedGraph → (DecoratedGraph, VertexCost) +rerootCharacterTree inGS rootIndex nodesToRoot charInfo inCharacterGraph = + rerootCharacterTree' + inGS + rootIndex + nodesToRoot + charInfo + ((subGraphCost . snd) $ LG.labelNode inCharacterGraph rootIndex) + inCharacterGraph + inCharacterGraph + + +{- | rerootCharacterTree' takes a character tree and root index and returns best rooted character tree and cost +this is recursive taking best cost to save on memory over an fmap and minimum +since does reroot stuff over character trees--that component is less efficient +root index always same--just edges conenct to change with rerooting +graph is prgressively rerooted to be efficient +-} +rerootCharacterTree' + ∷ GlobalSettings + → LG.Node + → [LG.Node] + → CharInfo + → VertexCost + → DecoratedGraph + → DecoratedGraph + → (DecoratedGraph, VertexCost) +rerootCharacterTree' inGS rootIndex nodesToRoot charInfo bestCost bestGraph inGraph = + if null nodesToRoot + then (bestGraph, bestCost) + else + let firstRerootIndex = head nodesToRoot + nextReroots = (LG.descendants inGraph firstRerootIndex) <> tail nodesToRoot + newGraph = rerootAndDiagnoseTree inGS rootIndex firstRerootIndex charInfo inGraph + newGraphCost = ((subGraphCost . snd) $ LG.labelNode newGraph rootIndex) + (bestGraph', bestCost') = + if newGraphCost < bestCost + then (newGraph, newGraphCost) + else (bestGraph, bestCost) + in -- if LG.isEmpty newGraph then rerootCharacterTree' rootIndex (tail nodesToRoot) charInfo bestCost bestGraph inGraph + -- trace ("RRCT:" <> (show (rootIndex, firstRerootIndex, bestCost, newGraphCost))) + -- else + -- trace ("RRCT: " <> (show (newGraphCost, bestCost))) + rerootCharacterTree' inGS rootIndex nextReroots charInfo bestCost' bestGraph' newGraph + + +-- | rerootAndDiagnoseTree takes tree and reroots and reoptimizes nodes +rerootAndDiagnoseTree ∷ GlobalSettings → LG.Node → LG.Node → CharInfo → DecoratedGraph → DecoratedGraph +rerootAndDiagnoseTree inGS rootIndex newRerootIndex charInfo inGraph = + let reRootGraph = LG.rerootDisplayTree rootIndex newRerootIndex inGraph + (nodesToOptimize, _) = LG.pathToRoot inGraph (LG.labelNode inGraph newRerootIndex) + reOptimizedGraph = reOptimizeCharacterNodes inGS charInfo reRootGraph nodesToOptimize + in if LG.isEmpty reRootGraph + then inGraph + else reOptimizedGraph + + +{- | reOptimizeCharacterNodes takes a decorated graph and a list of nodes and reoptimizes (relabels) +them based on children in input graph +simple recursive since each node depends on children +check for out-degree 1 since can be resolved form diplay trees +-} +reOptimizeCharacterNodes ∷ GlobalSettings → CharInfo → DecoratedGraph → [LG.LNode VertexInfo] → DecoratedGraph +reOptimizeCharacterNodes inGS charInfo inGraph oldNodeList = + -- trace ("RON:" <> (show $ fmap fst oldNodeList)) ( + if null oldNodeList + then inGraph + else + let curNode@(curNodeIndex, curNodeLabel) = head oldNodeList + nodeChildren = LG.descendants inGraph curNodeIndex -- should be 1 or 2, not zero since all leaves already in graph + foundCurChildern = filter (`elem` nodeChildren) $ fmap fst (tail oldNodeList) + in {-These are checks that were in for network code--should be unncesesary for charactaer trees + -- make sure that nodes are optimized in correct order so that nodes are only reoptimized using updated children + -- this should really not have to happen--order should be determined a priori + -} + -- if LG.isLeaf inGraph curNodeIndex then trace ("Should not be a leaf in reoptimize nodes: " <> (show curNodeIndex) <> " children " <> (show nodeChildren) <> "\nGraph:\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) inGraph + -- else + + if not $ null foundCurChildern + then -- trace ("Current node " <> (show curNodeIndex) <> " has children " <> (show nodeChildren) <> " in optimize list (optimization order error)" <> (show $ fmap fst $ tail oldNodeList)) + reOptimizeCharacterNodes inGS charInfo inGraph (tail oldNodeList <> [curNode]) + else -- somehow root before others -- remove if not needed after debug + -- else if LG.isRoot inGraph curNodeIndex && length oldNodeList > 1 then + -- error ("Root first :" <> (show $ fmap fst oldNodeList) <> "RC " <> show (LG.descendants inGraph curNodeIndex)) -- <> "\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) + -- reOptimizeNodes localGraphType charInfoVectVect inGraph ((tail oldNodeList) <> [curNode]) + + if length nodeChildren > 2 + then error ("Node has >2 children: " <> (show nodeChildren)) + else -- trace ("RON: " <> (show curNodeIndex) <> " children " <> (show nodeChildren)) ( + + let leftChild = head nodeChildren + rightChild = last nodeChildren + + -- this ensures that left/right choices are based on leaf BV for consistency and label invariance + (leftChildLabel, rightChildLabel) = U.leftRightChildLabelBV (fromJust $ LG.lab inGraph leftChild, fromJust $ LG.lab inGraph rightChild) + (newVertexData, newVertexCost) = + M.median2Single + (U.needTwoEdgeNoCostAdjust inGS True) + False + ((V.head . V.head . vertData) leftChildLabel) + ((V.head . V.head . vertData) rightChildLabel) + charInfo + in let (newCost, newBVLabel, newVertData, newSubGraphCost) = + if length nodeChildren < 2 + then (0, bvLabel leftChildLabel, vertData leftChildLabel, subGraphCost leftChildLabel) + else + ( newVertexCost + , bvLabel leftChildLabel .|. bvLabel rightChildLabel + , V.singleton (V.singleton newVertexData) + , subGraphCost leftChildLabel + subGraphCost rightChildLabel + newCost + ) + newVertexLabel = + VertexInfo + { index = curNodeIndex + , -- this bit labelling incorect for outdegree = 1, need to prepend bits + bvLabel = newBVLabel + , parents = V.fromList $ LG.parents inGraph curNodeIndex + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType inGraph curNodeIndex -- nodeType curNodeLabel + , vertName = vertName curNodeLabel + , vertexResolutionData = mempty + , vertData = newVertData + , vertexCost = newCost + , subGraphCost = newSubGraphCost + } + + -- this to add back edges deleted with nodes (undocumented but sensible in fgl) + replacementEdges = LG.inn inGraph curNodeIndex <> LG.out inGraph curNodeIndex + newGraph = LG.insEdges replacementEdges $ LG.insNode (curNodeIndex, newVertexLabel) $ LG.delNode curNodeIndex inGraph + in -- trace ("New vertexCost " <> show newCost) -- <> " lcn " <> (show (vertData leftChildLabel, vertData rightChildLabel, vertData curnodeLabel))) + reOptimizeCharacterNodes inGS charInfo newGraph (tail oldNodeList) + + +{- | backPortCharTreeNodesToBlockTree assigned nodes states (labels) of character trees to block display Tree +updates vertData, vertexCost, and subGraphCost for each. Subgraph cost questionable since relies on rooting +-} +backPortCharTreeNodesToBlockTree ∷ DecoratedGraph → V.Vector DecoratedGraph → DecoratedGraph +backPortCharTreeNodesToBlockTree blockDisplayTree rerootedCharTreeVect = + let blockDisplayNodes = LG.labNodes blockDisplayTree + blockDisplayEdges = LG.labEdges blockDisplayTree + + -- vector (characters) of vector (nodes) of labels + charTreeLabelsVV = fmap V.fromList $ fmap (fmap snd) $ fmap LG.labNodes rerootedCharTreeVect + + -- for each node index extract (head head) vertdata, vertexCost and subgraphcost + -- (vertDataVV, vertCostVV, subGraphCostVV) = V.unzip3 $ fmap (extractTripleVect charTreeLabelsVV) (V.fromList [0..(length blockDisplayNodes - 1)]) + (vertDataVV, vertCostVV, subGraphCostVV) = V.unzip3 $ fmap (extractTripleVect charTreeLabelsVV) (V.fromList [0 .. (length blockDisplayNodes - 1)]) + + -- update labels for block data nodes + updatedDisplayNodes = V.zipWith4 updateNodes (V.fromList blockDisplayNodes) vertDataVV vertCostVV subGraphCostVV + in LG.mkGraph (V.toList updatedDisplayNodes) blockDisplayEdges + + +{- | extractTripleVect takes a vector of vector character tree labels and a node index and +retuns a triple of data (vertData, VertCost, and subgraphCost) from a given node index in all labels +-} +extractTripleVect ∷ V.Vector (V.Vector VertexInfo) → Int → (V.Vector CharacterData, V.Vector VertexCost, V.Vector VertexCost) +extractTripleVect inLabelVV charIndex = + let nodeLabelV = fmap (V.! charIndex) inLabelVV + vertDataV = fmap vertData nodeLabelV + vertCostV = fmap vertexCost nodeLabelV + subGraphCostV = fmap subGraphCost nodeLabelV + in (fmap (V.head . V.head) vertDataV, vertCostV, subGraphCostV) + + +-- | updateNodes takes vectors of labelled nodes and updates vertData, VerTCost, and subgraphCost fields +updateNodes ∷ LG.LNode VertexInfo → V.Vector CharacterData → V.Vector VertexCost → V.Vector VertexCost → LG.LNode VertexInfo +updateNodes (inIndex, inLabel) charDataV vertexCostV subGraphCostV = + let newVertCost = V.sum vertexCostV + newSubGraphCost = V.sum subGraphCostV + newLabel = inLabel{vertData = V.singleton charDataV, vertexCost = newVertCost, subGraphCost = newSubGraphCost} + in (inIndex, newLabel) + + +{- | divideDecoratedGraphByBlockAndCharacterTree takes a DecoratedGraph with (potentially) multiple blocks +and (potentially) multiple character per block and creates a Vector of Vector of Decorated Graphs +over blocks and characters with the same graph, but only a single block and character for each graph +this to be used to create the "best" cost over alternate graph traversals +vertexCost and subGraphCost will be taken from characterData localcost/localcostVect and globalCost +-} +divideDecoratedGraphByBlockAndCharacterTree ∷ DecoratedGraph → (V.Vector [DecoratedGraph], V.Vector (V.Vector DecoratedGraph)) +divideDecoratedGraphByBlockAndCharacterTree inGraph = + if LG.isEmpty inGraph + then (V.empty, V.empty) + else + let numBlocks = V.length $ vertData $ snd $ head $ LG.labNodes inGraph + blockGraphList = fmap (pullBlock inGraph) [0 .. (numBlocks - 1)] + characterGraphList = fmap makeCharacterGraph blockGraphList + in -- trace ("DDGBCT: Blocks " <> (show numBlocks) <> " Characters " <> (show $ fmap length $ vertData $ snd $ head $ LG.labNodes inGraph) <> "\n" <> (show characterGraphList)) + (V.fromList (fmap (: []) blockGraphList), V.fromList characterGraphList) + + +{- | pullBlocks take a DecoratedGraph and creates a newDecorated graph with +only data from the input block index +-} +pullBlock ∷ DecoratedGraph → Int → DecoratedGraph +pullBlock inGraph blockIndex = + if LG.isEmpty inGraph + then LG.empty + else + let (inNodeIndexList, inNodeLabelList) = unzip $ LG.labNodes inGraph + blockNodeLabelList = fmap (makeBlockNodeLabels blockIndex) inNodeLabelList + in LG.mkGraph (zip inNodeIndexList blockNodeLabelList) (LG.labEdges inGraph) + + +{- | makeBlockNodeLabels takes a block index and an orginal nodel label +and creates a new list of a singleton block from the input block index +-} +makeBlockNodeLabels ∷ Int → VertexInfo → VertexInfo +makeBlockNodeLabels blockIndex inVertexInfo = + let newVertexData = vertData inVertexInfo V.! blockIndex + newVertexCost = V.sum $ fmap localCost newVertexData + newsubGraphCost = V.sum $ fmap globalCost newVertexData + in -- trace ("MBD " <> (show $ length newVertexData) <> " from " <> (show $ length (vertData inVertexInfo))) + inVertexInfo + { vertData = V.singleton newVertexData + , vertexCost = newVertexCost + , subGraphCost = newsubGraphCost + } + + +{- | updateAndFinalizePostOrderSoftWired performs the pre-order traceback on the resolutions of a softwired graph to create the correct vertex states, +ports the post order assignments to the display trees, and creates the character trees from the block trees +-} +updateAndFinalizePostOrderSoftWired ∷ Maybe Int → Int → PhylogeneticGraph → PhyG PhylogeneticGraph +updateAndFinalizePostOrderSoftWired startVertexMaybe startVertex inGraph = + if isNothing startVertexMaybe + then NEW.softWiredPostOrderTraceBack startVertex inGraph + else NEW.softWiredPostOrderTraceBack (fromJust startVertexMaybe) inGraph + + +{- | makeLeafGraphSoftWired takes input data and creates a 'graph' of leaves with Vertex information +but with zero edges. This 'graph' can be reused as a starting structure for graph construction +to avoid remaking of leaf vertices +includes leave resolution data +-} +makeLeafGraphSoftWired ∷ GlobalSettings → ProcessedData → DecoratedGraph +makeLeafGraphSoftWired inGS inData@(nameVect, bvNameVect, blocDataVect) = + if V.null nameVect + then error "Empty ProcessedData in makeLeafGraph" + else + if softWiredMethod inGS == Naive + then GO.makeLeafGraph inData + else + let leafVertexList = V.toList $ V.map (makeLeafVertexSoftWired nameVect bvNameVect blocDataVect) (V.fromList [0 .. V.length nameVect - 1]) + in LG.mkGraph leafVertexList [] + + +-- | makeLeafVertexSoftWired makes a single unconnected vertex for a leaf in a Soft-wired graph +makeLeafVertexSoftWired ∷ V.Vector NameText → V.Vector NameBV → V.Vector BlockData → Int → LG.LNode VertexInfo +makeLeafVertexSoftWired nameVect bvNameVect inData localIndex = + -- trace ("Making leaf " <> (show localIndex) <> " Data " <> (show $ length inData) <> " " <> (show $ fmap length $ fmap snd3 inData)) ( + let centralData = V.map snd3 inData + thisData = V.map (V.! localIndex) centralData + thisBVLabel = bvNameVect V.! localIndex + thisResolutionData = makeLeafResolutionBlockData thisBVLabel ([(localIndex, minimalVertex)], []) thisData + minimalVertex = + VertexInfo + { index = localIndex + , bvLabel = thisBVLabel + , parents = V.empty + , children = V.empty + , nodeType = LeafNode + , vertName = nameVect V.! localIndex + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + newVertex = + VertexInfo + { index = localIndex + , bvLabel = thisBVLabel + , parents = V.empty + , children = V.empty + , nodeType = LeafNode + , vertName = nameVect V.! localIndex + , vertData = mempty + , vertexResolutionData = thisResolutionData + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + in -- trace ("MVSW" <> (show (localIndex, vertexResolutionData newVertex))) + (localIndex, newVertex) + + +-- ) + +{- | makeLeafResolutionBlockData creates leaf resolution data from leav BVLabel, leave node, and data. +The return type is a vertor over character blocks each containing a list of potential resolutions (display trees) for that block +the resolutoins include subtree (display) for that resolution the bv l;abel for the node given that resolution, and the character data +in the block (Vector CharacterData) also given that resolution +thiis is repeaatted for each bloick in VertexBlockData +-} +makeLeafResolutionBlockData + ∷ NameBV → ([LG.LNode VertexInfo], [LG.LEdge EdgeInfo]) → VertexBlockData → V.Vector ResolutionBlockData +makeLeafResolutionBlockData inBV inSubGraph inVertData = + let defaultResolutionData = + ResolutionData + { displaySubGraph = inSubGraph + , displayBVLabel = inBV + , displayData = mempty + , childResolutionIndices = (Just 0, Just 0) + , resolutionCost = 0.0 + , displayCost = 0.0 + } + + blockIndexList = [0 .. (V.length inVertData - 1)] + blockDataList = fmap (inVertData V.!) blockIndexList + resolutionDataList = modifyDisplayData defaultResolutionData blockDataList [] + resolutionData = V.fromList $ fmap (V.fromList . (: [])) resolutionDataList + in resolutionData + + +{- | modifyDisplayData modifies displatData filed in ResolutionData +stas list doesn't change number of V.fromList calls +-} +modifyDisplayData ∷ ResolutionData → [V.Vector CharacterData] → [ResolutionData] → [ResolutionData] +modifyDisplayData resolutionTemplate characterDataVList curResolutionList = + if null characterDataVList + then reverse curResolutionList + else + let curBlockData = head characterDataVList + in modifyDisplayData + resolutionTemplate + (tail characterDataVList) + ((resolutionTemplate{displayData = curBlockData}) : curResolutionList) + + +{- | makeCharacterGraph takes a blockGraph and creates a vector of character graphs +each with a single block and single character +updating costs +-} +makeCharacterGraph ∷ DecoratedGraph → V.Vector DecoratedGraph +makeCharacterGraph inBlockGraph = + if LG.isEmpty inBlockGraph + then V.empty + else + let numCharacters = V.length $ V.head $ vertData $ snd $ head $ LG.labNodes inBlockGraph + characterGraphList = + if numCharacters > 0 + then fmap (pullCharacter False inBlockGraph) [0 .. (numCharacters - 1)] + else -- missing data + [pullCharacter True inBlockGraph 0] + in if V.length (vertData $ snd $ head $ LG.labNodes inBlockGraph) /= 1 + then error ("Number of blocks /= 1 in makeCharacterGraph :" <> (show $ V.length (vertData $ snd $ head $ LG.labNodes inBlockGraph))) + else -- trace ("Chars: " <> show numCharacters) + V.fromList characterGraphList + + +{- | pullCharacter takes a DecoratedGraph with a single block and +creates a new DecoratedGraph with a single character from the input index +-} +pullCharacter ∷ Bool → DecoratedGraph → Int → DecoratedGraph +pullCharacter isMissing inBlockGraph characterIndex = + if LG.isEmpty inBlockGraph + then LG.empty + else + let (inNodeIndexList, inNodeLabelList) = unzip $ LG.labNodes inBlockGraph + characterLabelList = fmap (makeCharacterLabels isMissing characterIndex) inNodeLabelList + in LG.mkGraph (zip inNodeIndexList characterLabelList) (LG.labEdges inBlockGraph) + + +{- | makeCharacterLabels pulls the index character label form the singleton block (via head) +and creates a singleton character label, updating costs to that of the character +NB the case of missing data is answered here by an "empty charcter" +could be better to have V.empty +isMIssingChar seems to be extraneous--not sure whey it was there. +-} +makeCharacterLabels ∷ Bool → Int → VertexInfo → VertexInfo +makeCharacterLabels isMissing characterIndex inVertexInfo = + -- trace ("MCl in:" <> (show inVertexInfo) <> " " <> (show characterIndex)) ( + let -- isMissingChar = (V.length $ (vertData inVertexInfo) V.! characterIndex) == 0 + newVertexData = V.head (vertData inVertexInfo) V.! characterIndex + (newVertexCost, newSubGraphCost) = + if isMissing + then (0, 0) + else -- if isMissing || isMissingChar then (0, 0) + (localCost newVertexData, globalCost newVertexData) + in -- newVertexCost = localCost newVertexData + -- newSubGraphCost = globalCost newVertexData + + -- trace ("MCL " <> (show $ V.length $ vertData inVertexInfo) <> " " <> (show $ fmap V.length $ vertData inVertexInfo) ) ( + -- trace ("MCL: " <> (show isMissing) <> " CI: " <> (show characterIndex) <> " " <> (show $ V.length $ (vertData inVertexInfo) V.! characterIndex)) + inVertexInfo + { vertData = + if not isMissing + then V.singleton $ V.singleton newVertexData + else -- if not isMissing && not isMissingChar then V.singleton $ V.singleton newVertexData + V.singleton $ V.singleton emptyCharacter -- V.empty + , vertexCost = newVertexCost + , subGraphCost = newSubGraphCost + } + + +-- ) ) + +{- | postOrderTreeTraversal takes a 'simple' graph and generates 'preliminary' assignments +vi post-order traversal, yields cost as well +for a binary tree only +depending on optimality criterion--will calculate root cost +-} +postOrderTreeTraversal + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Bool → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +postOrderTreeTraversal inGS (_, _, blockDataVect) leafGraph staticIA startVertex inGraph = + if LG.isEmpty inGraph + then pure emptyPhylogeneticGraph + else -- Assumes root is Number of Leaves + + let rootIndex = + if startVertex == Nothing + then fst $ head $ LG.getRoots inGraph + else fromJust startVertex + blockCharInfo = V.map thd3 blockDataVect + + {- + -- Hardwired--Not sure whey these edges can occur--somethng about adding edges in after not deleting them when assuming so + inGraph' = if graphType inGS == Tree then inGraph + else (LG.removeNonLeafOut0NodesAfterRoot . LG.removeDuplicateEdges) inGraph + -} + newTree = postDecorateTree inGS staticIA inGraph leafGraph blockCharInfo rootIndex rootIndex + in -- trace ("It Begins at " <> (show $ fmap fst $ LG.getRoots inGraph) <> "\n" <> show inGraph) ( + if (startVertex == Nothing) && (not $ LG.isRoot inGraph rootIndex) + then + let localRootList = fst <$> LG.getRoots inGraph + localRootEdges = concatMap (LG.out inGraph) localRootList + currentRootEdges = LG.out inGraph rootIndex + in error + ( "Index " + <> show rootIndex + <> " with edges " + <> show currentRootEdges + <> " not root in graph:" + <> show localRootList + <> " edges:" + <> show localRootEdges + <> "\n" + <> GFU.showGraph inGraph + ) + else do + postDecorateTree inGS staticIA inGraph leafGraph blockCharInfo rootIndex rootIndex + + +-- ) + +{- | postDecorateTree begins at start index (usually root, but could be a subtree) and moves preorder till children are labelled and then returns postorder +labelling vertices and edges as it goes back to root +this for a tree so single root +-} +postDecorateTree + ∷ GlobalSettings + → Bool + → SimpleGraph + → DecoratedGraph + → V.Vector (V.Vector CharInfo) + → LG.Node + → LG.Node + → PhyG PhylogeneticGraph +postDecorateTree inGS staticIA simpleGraph curDecGraph blockCharInfo rootIndex curNode = + -- if node in there (leaf) or Hardwired network nothing to do and return + if LG.gelem curNode curDecGraph + then + let nodeLabel = LG.lab curDecGraph curNode + in if isNothing nodeLabel + then error ("Null label for node " <> show curNode) + else -- checks for node already in graph--either leaf or pre-optimized node in Hardwired + -- trace ("In graph :" <> (show curNode) <> " " <> (show nodeLabel)) + pure $ (simpleGraph, subGraphCost (fromJust nodeLabel), curDecGraph, mempty, mempty, blockCharInfo) + else -- Need to make node + + -- check if children in graph + + let nodeChildren = LG.descendants simpleGraph curNode -- should be 1 or 2, not zero since all leaves already in graph + leftChild = head nodeChildren + rightChild = last nodeChildren + in do + leftChildTree <- postDecorateTree inGS staticIA simpleGraph curDecGraph blockCharInfo rootIndex leftChild + rightLeftChildTree <- + if length nodeChildren == 2 + then postDecorateTree inGS staticIA simpleGraph (thd6 leftChildTree) blockCharInfo rootIndex rightChild + else pure leftChildTree + let newSubTree = thd6 rightLeftChildTree + let ((_, leftChildLabel), (_, rightChildLabel)) = U.leftRightChildLabelBVNode (LG.labelNode newSubTree leftChild, LG.labelNode newSubTree rightChild) + if length nodeChildren > 2 + then error ("Graph not dichotomous in postDecorateTree node " <> show curNode <> "\n" <> LG.prettify simpleGraph) + else + if null nodeChildren + then error ("Leaf not in graph in postDecorateTree node " <> show curNode <> "\n" <> LG.prettify simpleGraph) + else -- out-degree 1 should not happen with Tree but will with HardWired graph + + if length nodeChildren == 1 + then -- make node from single child and single new edge to child + -- takes characters in blocks--but for tree really all same block + + let childVertexData = vertData leftChildLabel + newVertex = + VertexInfo + { index = curNode + , -- same as child--could and perhaps should prepend 1 to make distinct + bvLabel = bvLabel leftChildLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType simpleGraph curNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = childVertexData + , -- this not used for Hardwired or Tree + vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = subGraphCost leftChildLabel + } + newEdgesLabel = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + newEdges = LG.toEdge <$> LG.out simpleGraph curNode + newLEdges = fmap (LG.toLEdge' newEdgesLabel) newEdges + newGraph = LG.insEdges newLEdges $ LG.insNode (curNode, newVertex) newSubTree + + (newDisplayVect, newCharTreeVV) = divideDecoratedGraphByBlockAndCharacterTree newGraph + in -- th curnode == root index for pruned subtrees + -- trace ("New vertex:" <> (show newVertex) <> " at cost " <> (show newCost)) ( + -- Do we need to PO.divideDecoratedGraphByBlockAndCharacterTree if not root? probbaly not + + -- if nodeType newVertex == RootNode then (simpleGraph, subGraphCost newVertex, newGraph, mempty, PO.divideDecoratedGraphByBlockAndCharacterTree newGraph, blockCharInfo) + if nodeType newVertex == RootNode || curNode == rootIndex + then pure (simpleGraph, subGraphCost newVertex, newGraph, newDisplayVect, newCharTreeVV, blockCharInfo) + else pure (simpleGraph, subGraphCost newVertex, newGraph, mempty, mempty, blockCharInfo) + else do -- make node from 2 children + + -- make node from children and new edges to children + -- takes characters in blocks--but for tree really all same block + + -- this ensures that left/right choices are based on leaf BV for consistency and label invariance + -- larger bitvector is Right, smaller or equal Left + + newCharData <- + if staticIA + then createVertexDataOverBlocksStaticIA inGS (vertData leftChildLabel) (vertData rightChildLabel) blockCharInfo [] + else createVertexDataOverBlocks inGS (vertData leftChildLabel) (vertData rightChildLabel) blockCharInfo [] + + let newCost = V.sum $ V.map V.sum $ V.map (V.map snd) newCharData + + let newVertex = VertexInfo + { index = curNode + , bvLabel = bvLabel leftChildLabel .|. bvLabel rightChildLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType simpleGraph curNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = V.map (V.map fst) newCharData + , vertexResolutionData = mempty + , vertexCost = newCost + , -- this cost is incorrrect for Harwired netwqork fix at root + subGraphCost = subGraphCost leftChildLabel + subGraphCost rightChildLabel + newCost + } + let newEdgesLabel = EdgeInfo + { minLength = newCost / 2.0 + , maxLength = newCost / 2.0 + , midRangeLength = newCost / 2.0 + , edgeType = TreeEdge + } + let newEdges = LG.toEdge <$> LG.out simpleGraph curNode + let newLEdges = fmap (LG.toLEdge' newEdgesLabel) newEdges + let newGraph = LG.insEdges newLEdges $ LG.insNode (curNode, newVertex) newSubTree + + let (newDisplayVect, newCharTreeVV) = divideDecoratedGraphByBlockAndCharacterTree newGraph + + -- Graph cost is calculated differently for Tree and Hardwired. Sub trees can be counted multiple times + -- in hardwired for outdegree two nodes with one or more network nodes as descendents + -- this cannot be dealt with at the local node since there can be network all over the graph + -- so simple add up the local costs of all nodes + + if nodeType newVertex == RootNode || curNode == rootIndex + then -- Need full info for building trees + + let localCostSum = sum $ fmap vertexCost $ fmap snd $ LG.labNodes newGraph + in -- updatedDisplayVect = V.zipWith NEW.backPortBlockTreeNodesToCanonicalGraph (fmap head newDisplayVect) newCharTreeVV + -- updatedCanonicalGraph = NEW.backPortBlockTreeNodesToCanonicalGraph newGraph updatedDisplayVect + + -- trace ("PDT End: " <> (show (newCost, subGraphCost newVertex, localCostSum))) + -- (LG.removeDuplicateEdges simpleGraph, localCostSum, LG.removeDuplicateEdges newGraph, fmap (fmap LG.removeDuplicateEdges) newDisplayVect, fmap (fmap LG.removeDuplicateEdges) newCharTreeVV, blockCharInfo) + -- (simpleGraph, localCostSum, updatedCanonicalGraph, fmap (:[]) updatedDisplayVect, newCharTreeVV, blockCharInfo) + pure (simpleGraph, localCostSum, newGraph, newDisplayVect, newCharTreeVV, blockCharInfo) + else pure(simpleGraph, subGraphCost newVertex, newGraph, mempty, mempty, blockCharInfo) + + +-- | createVertexDataOverBlocks is a partial application of generalCreateVertexDataOverBlocks with full (all charcater) median calculation +createVertexDataOverBlocks + ∷ GlobalSettings + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → [V.Vector (CharacterData, VertexCost)] + → PhyG (V.Vector (V.Vector (CharacterData, VertexCost))) +createVertexDataOverBlocks inGS = generalCreateVertexDataOverBlocks (M.median2M (U.needTwoEdgeNoCostAdjust inGS True)) + + +-- | createVertexDataOverBlocksNonExact is a partial application of generalCreateVertexDataOverBlocks with partial (non-exact charcater) median calculation +createVertexDataOverBlocksNonExact + ∷ GlobalSettings + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → [V.Vector (CharacterData, VertexCost)] + → PhyG (V.Vector (V.Vector (CharacterData, VertexCost))) +createVertexDataOverBlocksNonExact inGS = generalCreateVertexDataOverBlocks (M.median2NonExactM (U.needTwoEdgeNoCostAdjust inGS True)) + + +{- | createVertexDataOverBlocksStaticIA is an application of generalCreateVertexDataOverBlocks with exact charcater median calculation +and IA claculation for dynmaic characters--not full optimizations +-} +createVertexDataOverBlocksStaticIA + ∷ GlobalSettings + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → [V.Vector (CharacterData, VertexCost)] + → PhyG (V.Vector (V.Vector (CharacterData, VertexCost))) +createVertexDataOverBlocksStaticIA inGS = generalCreateVertexDataOverBlocks (M.median2StaticIAM (U.needTwoEdgeNoCostAdjust inGS True)) + + +{- | generalCreateVertexDataOverBlocks is a genreal version for optimizing all (Add, NonAdd, Matrix) +and only non-exact (basically sequence) characters based on the median function passed +The function takes data in blocks and block vector of char info and +extracts the triple for each block and creates new block data for parent node (usually) +not checking if vectors are equal in length +-} +generalCreateVertexDataOverBlocks + ∷ (V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → PhyG (V.Vector (CharacterData, VertexCost))) + → VertexBlockData + → VertexBlockData + → V.Vector (V.Vector CharInfo) + → [V.Vector (CharacterData, VertexCost)] + → PhyG (V.Vector (V.Vector (CharacterData, VertexCost))) +generalCreateVertexDataOverBlocks medianFunction leftBlockData rightBlockData blockCharInfoVect curBlockData = + if V.null leftBlockData + then -- trace ("Blocks: " <> (show $ length curBlockData) <> " Chars B0: " <> (show $ V.map snd $ head curBlockData)) + pure $ V.fromList $ reverse curBlockData + else + let leftBlockLength = length $ V.head leftBlockData + rightBlockLength = length $ V.head rightBlockData + -- firstBlock = V.zip3 (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) + in do + -- missing data cases first or zip defaults to zero length + firstBlockMedian <- + if (leftBlockLength == 0) then pure $ V.zip (V.head rightBlockData) (V.replicate rightBlockLength 0) + else if (rightBlockLength == 0) then pure $ V.zip (V.head leftBlockData) (V.replicate leftBlockLength 0) + else medianFunction (V.head leftBlockData) (V.head rightBlockData) (V.head blockCharInfoVect) + generalCreateVertexDataOverBlocks + medianFunction + (V.tail leftBlockData) + (V.tail rightBlockData) + (V.tail blockCharInfoVect) + (firstBlockMedian : curBlockData) + + +{-_ +-- | getNetPenaltyReduced returns appropriate network penalty for a reduced graph +getNetPenaltyReduced :: GlobalSettings -> ProcessedData -> ReducedPhylogeneticGraph -> PhyG VertexCost +getNetPenaltyReduced inGS inData inGraph = + getNetPenalty inGS inData (GO.convertReduced2PhylogeneticGraph inGraph) + +-- | getNetPenalty returns appropriate network penalty +getNetPenalty :: GlobalSettings -> ProcessedData -> PhylogeneticGraph -> PhyG VertexCost +getNetPenalty inGS inData inGraph = + if (graphType inGS == Tree) then pure 0.0 + else if (graphFactor inGS) == NoNetworkPenalty then pure 0.0 + else if (graphFactor inGS) == Wheeler2015Network then getW15NetPenaltyFull Nothing inGS inData Nothing inGraph + else if (graphFactor inGS) == Wheeler2023Network then pure $ getW23NetPenalty inGraph + else error ("Network penalty type " <> (show $ graphFactor inGS) <> " is not yet implemented") +-} + +{- | getW15RootCost creates a root cost as the 'insertion' of character data. For sequence data averaged over +leaf taxa +-} +getW15RootCost ∷ GlobalSettings → PhylogeneticGraph → VertexCost +getW15RootCost inGS inGraph = + if LG.isEmpty $ thd6 inGraph + then 0.0 + else + let (rootList, _, _, _) = LG.splitVertexList $ fst6 inGraph + numRoots = length rootList + in (fromIntegral numRoots) * (rootComplexity inGS) + + +{- | getW15NetPenaltyFull takes a Phylogenetic tree and returns the network penalty of Wheeler (2015) +does not use resolution cache's so can be used with Naive or Resolution cache SoftWired +has to be a single display tree--or could have no penalty for network since edges would be in one or other +display tree +-} +getW15NetPenaltyFull + ∷ Maybe ([VertexCost], [SimpleGraph], PhylogeneticGraph, Int) + → GlobalSettings + → ProcessedData + → Maybe Int + → PhylogeneticGraph + → PhyG VertexCost +getW15NetPenaltyFull blockInfo inGS inData@(nameVect, _, _) startVertex inGraph = + if LG.isEmpty $ fst6 inGraph + then pure 0.0 + else + if LG.isTree $ fst6 inGraph + then pure 0.0 + else -- have to do full data pasess on display trees + + if isNothing blockInfo + then + let rootIndex = + if startVertex == Nothing + then fst $ head $ LG.getRoots (fst6 inGraph) + else fromJust startVertex + numLeaves' = V.length nameVect + blockTreeList = V.toList $ fmap GO.convertDecoratedToSimpleGraph $ fmap head $ fth6 inGraph + blockCostList = V.toList $ fmap (getBlockCost rootIndex) (fft6 inGraph) + + -- get lowest cost display tree + staticIA = False + + postOrderAction ∷ SimpleGraph → PhyG PhylogeneticGraph + postOrderAction = postOrderTreeTraversal inGS inData (GO.makeLeafGraph inData) staticIA (Just rootIndex) + + displayAction ∷ PhylogeneticGraph → PhyG PhylogeneticGraph + displayAction = getDisplayBasedRerootSoftWired' inGS Tree rootIndex + in do + actionPar <- getParallelChunkTraverse + outgroupRootedList ← actionPar postOrderAction blockTreeList + -- getParallelChunk <&> \pMap → + -- postOrderAction `pMap` blockTreeList + + multiTraverseTreeList ← + getParallelChunkTraverse >>= \pTraverse → + displayAction `pTraverse` outgroupRootedList + + lowestCostDisplayTree ← head <$> GO.selectGraphsFull Best 1 0.0 multiTraverseTreeList + + -- now can do as input (below) + let lowestCostEdgeList = (LG.edges . fst6) lowestCostDisplayTree + let lowestCostEdgesFlipped = fmap LG.flipEdge lowestCostEdgeList + let blockEdgeList = fmap LG.edges blockTreeList + let numBlockExtraEdgesList = fmap length $ fmap (L.\\ (lowestCostEdgeList <> lowestCostEdgesFlipped)) blockEdgeList + let blockPenalty = sum $ zipWith (*) blockCostList (fmap fromIntegral numBlockExtraEdgesList) + + -- trace ("GW15N: " <> (show (blockEdgeList, numBlockExtraEdgesList, blockPenalty, blockPenalty / (4.0 * (fromIntegral numLeaves') - 4.0)))) + pure $ blockPenalty / (4.0 * (fromIntegral numLeaves') - 4.0) + else + let (blockCostList, blockTreeList, lowestCostDisplayTree, numLeaves) = fromJust blockInfo + lowestCostEdgeList = (LG.edges . fst6) lowestCostDisplayTree + lowestCostEdgesFlipped = fmap LG.flipEdge lowestCostEdgeList + blockEdgeList = fmap LG.edges blockTreeList + numBlockExtraEdgesList = fmap length $ fmap (L.\\ (lowestCostEdgeList <> lowestCostEdgesFlipped)) blockEdgeList + blockPenalty = sum $ zipWith (*) blockCostList (fmap fromIntegral numBlockExtraEdgesList) + in -- trace ("GW15N: " <> (show (blockEdgeList, numBlockExtraEdgesList, blockPenalty, blockPenalty / (4.0 * (fromIntegral numLeaves) - 4.0)))) + pure $ blockPenalty / (4.0 * (fromIntegral numLeaves) - 4.0) + + +{- | getW15NetPenalty takes a Phylogenetic tree and returns the network penalty of Wheeler (2015) +modified to take the union of all edges of trees of minimal length +currently modified -- not exactlty W15 +-} +getW15NetPenalty ∷ Maybe Int → PhylogeneticGraph → PhyG VertexCost +getW15NetPenalty startVertex inGraph = + if LG.isEmpty $ thd6 inGraph + then pure 0.0 + else + if LG.isTree $ fst6 inGraph + then pure 0.0 + else + let -- (bestTreeList, _) = extractLowestCostDisplayTree startVertex inGraph + bestTreeList = V.toList $ fmap head $ fth6 inGraph + bestTreesEdgeList = L.nubBy LG.undirectedEdgeEquality $ concat $ fmap LG.edges bestTreeList + rootIndex = + if startVertex == Nothing + then fst $ head $ LG.getRoots (fst6 inGraph) + else fromJust startVertex + + blockAction ∷ [DecoratedGraph] → VertexCost + blockAction = getBlockW2015 bestTreesEdgeList rootIndex + in do + blockPar ← getParallelChunkMap + let blockPenaltyList = blockPar blockAction (V.toList $ fth6 inGraph) + -- PU.seqParMap PU.myStrategy (getBlockW2015 bestTreesEdgeList rootIndex) (fth6 inGraph) + + -- leaf list for normalization + let (_, leafList, _, _) = LG.splitVertexList (fst6 inGraph) + let numLeaves = length leafList + let numTreeEdges = 4.0 * (fromIntegral numLeaves) - 4.0 + let divisor = numTreeEdges + + -- trace ("W15:" <> (show ((sum $ blockPenaltyList) / divisor ))) + pure $ (sum $ blockPenaltyList) / divisor + + +{- | getW23NetPenaltyReduced takes a ReducedPhylogeneticGraph tree and returns the network penalty of Wheeler and Washburn (2023) +basic idea is new edge improvement must be better than average existing edge cost +penalty for each added edge (unlike W15 which was on a block by block basis--and requires additional tree diagnoses) +num extra edges/2 since actually add 2 new edges when one network edge +requires resolution cache data structures +-} +getW23NetPenaltyReduced ∷ ReducedPhylogeneticGraph → VertexCost +getW23NetPenaltyReduced inGraph = + if LG.isEmpty $ thd5 inGraph + then 0.0 + else + if LG.isTree $ fst5 inGraph + then 0.0 + else + let -- (bestTreeList, _) = extractLowestCostDisplayTree startVertex inGraph + bestTreeList = V.toList $ fmap head $ fth5 inGraph + bestTreesEdgeList = L.nubBy LG.undirectedEdgeEquality $ concat $ fmap LG.edges bestTreeList + + -- leaf list for normalization + (_, leafList, _, _) = LG.splitVertexList (fst5 inGraph) + numLeaves = length leafList + numTreeEdges = 2.0 * (fromIntegral numLeaves) - 2.0 + numExtraEdges = ((fromIntegral $ length bestTreesEdgeList) - numTreeEdges) / 2.0 + in -- divisor = numTreeEdges - numExtraEdges + + -- trace ("W23:" <> (show ((numExtraEdges * (snd6 inGraph)) / (2.0 * numTreeEdges))) <> " from " <> (show (numTreeEdges, numExtraEdges))) ( + -- if divisor == 0.0 then infinity + -- else (sum blockPenaltyList) / divisor + -- else (numExtraEdges * (sum blockPenaltyList)) / divisor + -- else + (numExtraEdges * (snd5 inGraph)) / (2.0 * numTreeEdges) + + +-- ) + +{- | getW23NetPenalty takes a Phylogenetic tree and returns the network penalty of Wheeler and Washburn (2023) +basic idea is new edge improvement must be better than average existing edge cost +penalty for each added edge (unlike W15 which was on a block by block basis--and requires additional tree diagnoses) +num extra edges/2 since actually add 2 new edges when one network edge +requires resolution cache data structures +-} +getW23NetPenalty ∷ PhylogeneticGraph → VertexCost +getW23NetPenalty inGraph = + if LG.isEmpty $ thd6 inGraph + then 0.0 + else + if LG.isTree $ fst6 inGraph + then 0.0 + else + let -- (bestTreeList, _) = extractLowestCostDisplayTree startVertex inGraph + bestTreeList = V.toList $ fmap head $ fth6 inGraph + bestTreesEdgeList = L.nubBy LG.undirectedEdgeEquality $ concat $ fmap LG.edges bestTreeList + + -- leaf list for normalization + (_, leafList, _, _) = LG.splitVertexList (fst6 inGraph) + numLeaves = length leafList + numTreeEdges = 2.0 * (fromIntegral numLeaves) - 2.0 + numExtraEdges = ((fromIntegral $ length bestTreesEdgeList) - numTreeEdges) / 2.0 + in -- divisor = numTreeEdges - numExtraEdges + + -- trace ("W23:" <> (show ((numExtraEdges * (snd6 inGraph)) / (2.0 * numTreeEdges))) <> " from " <> (show (numTreeEdges, numExtraEdges))) ( + -- if divisor == 0.0 then infinity + -- else (sum blockPenaltyList) / divisor + -- else (numExtraEdges * (sum blockPenaltyList)) / divisor + -- else + (numExtraEdges * (snd6 inGraph)) / (2.0 * numTreeEdges) + + +-- ) + +{- | getBlockW2015 takes the list of trees for a block, gets the root cost and determines the individual +penalty cost of that block +-} +getBlockW2015 ∷ [LG.Edge] → Int → [DecoratedGraph] → VertexCost +getBlockW2015 treeEdgeList rootIndex blockTreeList = + if null treeEdgeList || null blockTreeList + then 0.0 + else + let blockTreeEdgeList = L.nubBy LG.undirectedEdgeEquality $ concatMap LG.edges blockTreeList + numExtraEdges = length $ LG.undirectedEdgeMinus blockTreeEdgeList treeEdgeList + blockCost = subGraphCost $ fromJust $ LG.lab (head blockTreeList) rootIndex + in -- trace ("GBW: " <> (show (numExtraEdges, blockCost, blockTreeEdgeList)) <> "\n" <> (show $ fmap (subGraphCost . snd) $ LG.labNodes (head blockTreeList))) + blockCost * (fromIntegral numExtraEdges) diff --git a/src/GraphOptimization/PostOrderSoftWiredFunctionsNew.hs b/src/GraphOptimization/PostOrderSoftWiredFunctionsNew.hs new file mode 100644 index 000000000..fe0f92cc9 --- /dev/null +++ b/src/GraphOptimization/PostOrderSoftWiredFunctionsNew.hs @@ -0,0 +1,1121 @@ +{- | +Module : PostOrderSoftWiredFunctionsNew.hs +Description : Module specifying post-order softwiired graph functions +Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module GraphOptimization.PostOrderSoftWiredFunctionsNew ( + postDecorateSoftWired, + softWiredPostOrderTraceBack, + createBlockResolutions, + addNodeAndEdgeToResolutionData, + updateRootCost, + getOutDegree1VertexAndGraph, + getOutDegree1VertexSoftWired, + getOutDegree2VertexSoftWired, + extractDisplayTrees, + backPortBlockTreeNodesToCanonicalGraph, +) where + +import Control.DeepSeq +import Control.Parallel.Strategies +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.List qualified as L +import Data.Maybe +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +-- import Debug.Trace + +{-Intial Postorder softwired pass. All functions with 'New" appended-} + +{- | postDecorateSoftWired begins at start index (usually root, but could be a subtree) and moves preorder till children are labelled +and then recurses to root postorder labelling vertices and edges as it goes +this for a single root +-} +postDecorateSoftWired + ∷ GlobalSettings → SimpleGraph → DecoratedGraph → V.Vector (V.Vector CharInfo) → LG.Node → LG.Node → PhyG PhylogeneticGraph +postDecorateSoftWired inGS simpleGraph curDecGraph blockCharInfo rootIndex curNode = + -- if node in current decorated graph then nothing to do and return it + -- this because will hit node twice if network node + if LG.gelem curNode curDecGraph + then + let nodeLabel = LG.lab curDecGraph curNode + in if isNothing nodeLabel + then error ("Null label for node " <> show curNode) + else pure (simpleGraph, subGraphCost (fromJust nodeLabel), curDecGraph, mempty, mempty, blockCharInfo) + else -- node is not in decorated graph--ie has not been creted/optimized + + -- get postorder assignment of children + -- checks for single child of node + -- result is single graph after left and right child traversals + -- trace ("PDSW making node " <> show curNode <> " in\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph curDecGraph)) ( + + let nodeChildren = LG.descendants simpleGraph curNode -- should be 1 or 2, not zero since all leaves already in graph + leftChild = head nodeChildren + rightChild = last nodeChildren -- will be same is first for out 1 (network) node + in do + leftChildTree ← postDecorateSoftWired inGS simpleGraph curDecGraph blockCharInfo rootIndex leftChild + rightLeftChildTree ← + if length nodeChildren == 2 + then postDecorateSoftWired inGS simpleGraph (thd6 leftChildTree) blockCharInfo rootIndex rightChild + else pure leftChildTree + + -- Checks on children + if length nodeChildren > 2 + then error ("Graph not dichotomous in postDecorateSoftWired node " <> show curNode <> "\n" <> LG.prettyIndices simpleGraph) + else + if null nodeChildren + then error ("Leaf not in graph in postDecorateSoftWired node " <> show curNode <> "\n" <> LG.prettyIndices simpleGraph) + else -- after recursing to any children can optimize current node + + -- make node from child block resolutions + -- child resolution made earler in post order pass + -- sub tree is updated graph from children--ie not iuncluding current node + + let newSubTree = thd6 rightLeftChildTree + in -- single child of node--network node + if length nodeChildren == 1 + then -- use left child for out degree = 1 nmodes, right should be "Nothing" + + let (newGraph, _, _, _, _) = getOutDegree1VertexAndGraph curNode (fromJust $ LG.lab newSubTree leftChild) simpleGraph nodeChildren newSubTree + in -- display graphs and character block are not done yet since will need traceback to get preliminary states + -- graph ahas all soft wired decorations + pure (simpleGraph, 0, newGraph, mempty, mempty, blockCharInfo) + else -- 2 children--tree node + + -- trace ("Outdegree 2: " <> (show curNode) <> " " <> (show $ GO.getNodeType simpleGraph curNode) <> " Children: " <> (show nodeChildren)) ( + -- need to create resolutions and add to existing sets + + let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance + -- larger bitvector is Right, smaller or equal Left + ((leftChild', leftChildLabel), (rightChild', rightChildLabel)) = + U.leftRightChildLabelBVNode + ((leftChild, fromJust $ LG.lab newSubTree leftChild), (rightChild, fromJust $ LG.lab newSubTree rightChild)) + + -- create resolution caches for blocks + leftChildNodeType = GO.getNodeType simpleGraph leftChild' -- nodeType leftChildLabel + rightChildNodeType = GO.getNodeType simpleGraph rightChild' -- nodeType rightChildLabel + leftEdgeType + | leftChildNodeType == NetworkNode = NetworkEdge + | leftChildNodeType == LeafNode = PendantEdge + | otherwise = TreeEdge + rightEdgeType + | rightChildNodeType == NetworkNode = NetworkEdge + | rightChildNodeType == LeafNode = PendantEdge + | otherwise = TreeEdge + + edgeLable = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + in do + resolutionBlockVL ← + mapM + ( createBlockResolutions' + inGS + (compressResolutions inGS) + curNode + leftChild' + rightChild' + leftChildNodeType + rightChildNodeType + (GO.getNodeType simpleGraph curNode) + ) + (V.zip3 (vertexResolutionData leftChildLabel) (vertexResolutionData rightChildLabel) blockCharInfo) + + -- create canonical Decorated Graph vertex + -- 0 cost becasue can't know cosrt until hit root and get best valid resolutions + let newVertexLabel = + VertexInfo + { index = curNode + , bvLabel = bvLabel leftChildLabel .|. bvLabel rightChildLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType simpleGraph curNode -- TreeNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty -- empty because of resolution data + , vertexResolutionData = resolutionBlockVL + , vertexCost = 0.0 -- newCost + , subGraphCost = 0.0 -- (subGraphCost leftChildLabel) + (subGraphCost rightChildLabel) + newCost + } + let leftEdge = (curNode, leftChild', edgeLable{edgeType = leftEdgeType}) + let rightEdge = (curNode, rightChild', edgeLable{edgeType = rightEdgeType}) + let newGraph = LG.insEdges [leftEdge, rightEdge] $ LG.insNode (curNode, newVertexLabel) newSubTree + + let (displayGraphVL, lDisplayCost) = + if curNode == rootIndex + then + let (displayG, displayCost') = extractDisplayTrees (Just curNode) True resolutionBlockVL + in (displayG, displayCost') + else -- (fmap (fmap LG.removeDuplicateEdges) displayG, displayCost') + (mempty, 0.0) + + pure (simpleGraph, lDisplayCost, newGraph, displayGraphVL, mempty, blockCharInfo) + + +-- | getOutDegree1VertexAndGraph makes parent node from single child for soft-wired resolutions +getOutDegree1VertexAndGraph + ∷ (Show a, Show b) + ⇒ LG.Node + → VertexInfo + → LG.Gr a b + → [LG.Node] + → DecoratedGraph + → (DecoratedGraph, Bool, VertexInfo, VertexCost, V.Vector [DecoratedGraph]) +getOutDegree1VertexAndGraph curNode childLabel simpleGraph nodeChildren subTree = + -- trace ("In out=1: " <> (show curNode)) ( + let childResolutionData = vertexResolutionData childLabel + + curNodeResolutionData = addNodeAndEdgeToResolutionData newDisplayNode newLEdge childResolutionData + + newEdgeLabel = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + newMinVertex = + VertexInfo + { index = curNode + , bvLabel = bvLabel childLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType simpleGraph curNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + + newVertex = + VertexInfo + { index = curNode + , bvLabel = bvLabel childLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = GO.getNodeType simpleGraph curNode -- NetworkNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = curNodeResolutionData + , vertexCost = 0.0 + , subGraphCost = subGraphCost childLabel + } + + newLEdge = (curNode, index childLabel, newEdgeLabel) + newLNode = (curNode, newVertex) + newDisplayNode = (curNode, newMinVertex) + newGraph = LG.insEdge newLEdge $ LG.insNode newLNode subTree + + -- Root node should be out degree 2 so this should not happen in general--but could during some + -- graph rearangemnts in fusing and swapping + (displayGraphVL, lDisplayCost) = + if nodeType newVertex == RootNode + then extractDisplayTrees (Just curNode) True (vertexResolutionData childLabel) + else (mempty, 0.0) + in -- trace ("NV1: " <> show newVertex) + -- trace ("GOD1VG: " <> (show $ LG.toEdge newLEdge) <> " has edges " <> (show $ LG.hasEdge subTree $ LG.toEdge newLEdge) <> "Resolutions " <> (show $ fmap (fmap U.hasResolutionDuplicateEdges) curNodeResolutionData)) + -- trace ("PDSW-1:" <> (show $ bvLabel newVertex)) + (newGraph, nodeType newVertex == RootNode, newVertex, lDisplayCost, displayGraphVL) + + +-- (newGraph, False, newVertex, 0.0, mempty) +-- ) + +-- | getOutDegree1VertexSoftWired returns new vertex only from single child for soft-wired resolutions +getOutDegree1VertexSoftWired + ∷ (Show a, Show b) + ⇒ LG.Node + → VertexInfo + → LG.Gr a b + → [LG.Node] + → VertexInfo +getOutDegree1VertexSoftWired curNode childLabel simpleGraph nodeChildren = + -- trace ("In out=1: " <> (show curNode)) ( + let childResolutionData = vertexResolutionData childLabel + + newEdgeLabel = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + newMinVertex = + VertexInfo + { index = curNode + , bvLabel = bvLabel childLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = NetworkNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + + newDisplayNode = (curNode, newMinVertex) + newLEdge = (curNode, index childLabel, newEdgeLabel) + + curNodeResolutionData = addNodeAndEdgeToResolutionData newDisplayNode newLEdge childResolutionData + + newVertexLabel = + VertexInfo + { index = curNode + , bvLabel = bvLabel childLabel + , parents = V.fromList $ LG.parents simpleGraph curNode + , children = V.fromList nodeChildren + , nodeType = NetworkNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = curNodeResolutionData + , vertexCost = 0.0 + , subGraphCost = subGraphCost childLabel + } + in newVertexLabel + + +{- | getOutDegree2VertexSoftWired returns new vertex only from two child nodes for soft-wired resolutions +used in Net Add Delete heuristics +-} +getOutDegree2VertexSoftWired + ∷ GlobalSettings + → V.Vector (V.Vector CharInfo) + → LG.Node + → LG.LNode VertexInfo + → LG.LNode VertexInfo + → DecoratedGraph + → PhyG VertexInfo +getOutDegree2VertexSoftWired inGS charInfoVectVect curNodeIndex leftChild@(leftChildIndex, _) rightChild@(rightChildIndex, _) inGraph = + let -- this ensures that left/right choices are based on leaf BV for consistency and label invariance + -- larger bitvector is Right, smaller or equal Left + ((leftChild', leftChildLabel'), (rightChild', rightChildLabel')) = U.leftRightChildLabelBVNode (leftChild, rightChild) + + -- create resolution caches for blocks + leftChildNodeType = nodeType leftChildLabel' + rightChildNodeType = nodeType rightChildLabel' + in do + -- TODO PArallelize? its parallel in lower call + resolutionBlockVL ← + mapM + ( createBlockResolutions' + inGS + (compressResolutions inGS) + curNodeIndex + leftChild' + rightChild' + leftChildNodeType + rightChildNodeType + TreeNode + ) + (V.zip3 (vertexResolutionData leftChildLabel') (vertexResolutionData rightChildLabel') charInfoVectVect) + + -- create canonical Decorated Graph vertex + -- 0 cost becasue can't know cosrt until hit root and get best valid resolutions + let newVertexLabel = + VertexInfo + { index = curNodeIndex + , bvLabel = bvLabel leftChildLabel' .|. bvLabel rightChildLabel' + , parents = V.fromList $ LG.parents inGraph curNodeIndex + , children = V.fromList [leftChildIndex, rightChildIndex] + , nodeType = TreeNode + , vertName = T.pack $ "HTU" <> show curNodeIndex + , vertData = mempty -- empty because of resolution data + , vertexResolutionData = resolutionBlockVL + , vertexCost = 0.0 -- newCost + , subGraphCost = 0.0 -- (subGraphCost leftChildLabel) + (subGraphCost rightChildLabel) + newCost + } + + pure newVertexLabel + + +{- | addNodeAndEdgeToResolutionData adds node and edge to resolution data in outdegree = 1 nodes +straight copy would not add this node or edge to subtree in resolutions +-} +addNodeAndEdgeToResolutionData + ∷ LG.LNode VertexInfo → LG.LEdge EdgeInfo → V.Vector ResolutionBlockData → V.Vector ResolutionBlockData +addNodeAndEdgeToResolutionData newNode newEdge = fmap (addNodeEdgeToResolutionBlock newNode newEdge True) + + +-- | addNodeEdgeToResolutionBlock adds node and edge to resolutoin block data +addNodeEdgeToResolutionBlock ∷ LG.LNode VertexInfo → LG.LEdge EdgeInfo → Bool → ResolutionBlockData → ResolutionBlockData +addNodeEdgeToResolutionBlock newNode newEdge isIn1Out1Node inResBlockData = + V.zipWith + (addNodeEdgeToResolutionList newNode newEdge isIn1Out1Node) + inResBlockData + (V.fromList [0 .. V.length inResBlockData - 1]) + + +{- | addNodeEdgeToResolutionList adds node and edge to single subGraph in ResolutionData +adds resolution pairs to be equal to the child straight one-for-one correpondance +although only a single child-both indieces are set to resolution index since singels can be added to +paired resolitions if one or other is a network node +-} +addNodeEdgeToResolutionList ∷ LG.LNode VertexInfo → LG.LEdge EdgeInfo → Bool → ResolutionData → Int → ResolutionData +addNodeEdgeToResolutionList newNode newEdge _ inResData resolutionIndex = + let (inNodeList, inEdgeList) = displaySubGraph inResData + + -- childResolutionIndexPairList = childResolutions inResData + newNodeList = newNode : inNodeList + + -- this check for redundant edges in resolution cash from combinations + newEdgeList = + if newEdge `notElem` inEdgeList + then newEdge : inEdgeList + else -- trace "Should not happen: Extra edge in addNodeEdgeToResolutionListNew" + inEdgeList + newFirstData = + inResData + { displaySubGraph = (newNodeList, newEdgeList) + , -- both set because can be a display node left right added to 2 child resolutoins + childResolutionIndices = (Just resolutionIndex, Just resolutionIndex) + } + in -- trace ("ANETRL:" <> (show $ Just resolutionIndex)) + newFirstData + + +-- | createBlockResolutions' is a wrapper around createBlockResolutions +createBlockResolutions' + ∷ GlobalSettings + → Bool + → LG.Node + → Int + → Int + → NodeType + → NodeType + → NodeType + → (ResolutionBlockData, ResolutionBlockData, V.Vector CharInfo) + → PhyG ResolutionBlockData +createBlockResolutions' inGS compressResolutions curNode leftIndex rightIndex leftChildNodeType rightChildNodeType curNodeNodeType (leftChild, rightChild, charInfoV) = + createBlockResolutions + inGS + compressResolutions + curNode + leftIndex + rightIndex + leftChildNodeType + rightChildNodeType + curNodeNodeType + leftChild + rightChild + charInfoV + + +{- | createBlockResolutions takes left and right child resolution data for a block (same display tree) +and generates node resolution data +-} +createBlockResolutions + ∷ GlobalSettings + → Bool + → LG.Node + → Int + → Int + → NodeType + → NodeType + → NodeType + → ResolutionBlockData + → ResolutionBlockData + → V.Vector CharInfo + → PhyG ResolutionBlockData +createBlockResolutions + inGS + compressResolutions + curNode + leftIndex + rightIndex + leftChildNodeType + rightChildNodeType + curNodeNodeType + leftChild + rightChild + charInfoV + | null leftChild && null rightChild = pure mempty + | null leftChild = pure rightChild + | null rightChild = pure leftChild + | otherwise = + -- trace ("CBR:" <> (show (leftIndex, leftChildNodeType, rightIndex, rightChildNodeType)) <> (show $fmap BV.toBits $ fmap displayBVLabel leftChild) <> " and " <> (show $fmap BV.toBits $ fmap displayBVLabel rightChild)) ( + -- trace ("CNR: " <> (show (length leftChild, length rightChild))) ( + let childResolutionPairs = cartProd (V.toList leftChild) (V.toList rightChild) + -- need to keep these indices correct (hence reverse in checkLeafOverlap ) for traceback and compress + childResolutionIndices = cartProd [0 .. (length leftChild - 1)] [0 .. (length rightChild - 1)] + validPairs = concatMap checkLeafOverlap (zip childResolutionPairs childResolutionIndices) + + -- need to add in node and edge to left and right + edgeLable = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + newMinVertex = + VertexInfo + { index = curNode + , bvLabel = BV.fromBits [False] + , parents = mempty + , children = mempty + , nodeType = curNodeNodeType + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + + newNode = (curNode, newMinVertex) + + addLeft = + if leftChildNodeType == NetworkNode + then + let newEdge = (curNode, rightIndex, edgeLable) + newRightChildBlockResolutionData = addNodeEdgeToResolutionBlock newNode newEdge False rightChild + in -- trace ("ANEL:" <> (show $ (curNode, rightIndex))) + newRightChildBlockResolutionData + else -- trace ("ANEL-Nothing") + mempty + + addRight = + if rightChildNodeType == NetworkNode + then + let newEdge = (curNode, leftIndex, edgeLable) + newLeftChildBlockResolutionData = addNodeEdgeToResolutionBlock newNode newEdge False leftChild + in -- trace ("ANER:" <> (show $ (curNode, leftIndex))) + newLeftChildBlockResolutionData + else -- trace ("ANER-Nothing") + mempty + + resolutionAction ∷ ((ResolutionData, ResolutionData), (Int, Int)) → PhyG ResolutionData + resolutionAction = createNewResolution inGS curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV + in do + resolutionPar ← getParallelChunkTraverse + newResolutionList <- resolutionPar resolutionAction validPairs + -- newResolutionList = PU.seqParMap PU.myStrategy (createNewResolution curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV) validPairs + + -- trace ("CNR: " <> (show (length leftChild, length rightChild))) ( -- <> "\n" <> (show childResolutionIndices) <> "\n" <> (show $ fmap snd validPairs)) ( + if compressResolutions + then pure $ compressBlockResolution (newResolutionList <> V.toList addLeft <> V.toList addRight) + else pure $ V.fromList newResolutionList <> (addLeft <> addRight) + + +{- | compressBlockResolution 'compresses' resolutions of a block by taking only the first of resolutions with the +same set of leaves (via bitvector) and lowest cost +can speed up graph diagnosis, but at the cost of potentially loosing resolutions whihc would be better later +(ie closer to root) +-} +compressBlockResolution ∷ [ResolutionData] → V.Vector ResolutionData +compressBlockResolution inResList = + if null inResList + then V.empty + else + let -- group by bitvectors of subtree + resLL = L.groupBy compareBVLabel inResList + minCostVV = fmap getMinCostList resLL + in V.fromList minCostVV + where + compareBVLabel a b = displayBVLabel a == displayBVLabel b + + +-- | getMinCostList takes a list resolutions and returns first lowest cost resolution +getMinCostList ∷ [ResolutionData] → ResolutionData +getMinCostList inList = + if null inList + then error "Empty resolution list in getMinCostList" + else + let minResCost = minimum $ fmap displayCost inList + in head $ filter ((== minResCost) . displayCost) inList + + +{- | createNewResolution takes a pair of resolutions and creates the median resolution +need to watch let/right (based on BV) for preorder stuff +-} +createNewResolution + ∷ GlobalSettings + → LG.Node + → Int + → Int + → NodeType + → NodeType + → V.Vector CharInfo + → ((ResolutionData, ResolutionData), (Int, Int)) + → PhyG ResolutionData +createNewResolution inGS curNode leftIndex rightIndex leftChildNodeType rightChildNodeType charInfoV ((leftRes, rightRes), (leftResIndex, rightResIndex)) = + let -- make bvLabel for resolution + resBV = displayBVLabel leftRes .|. displayBVLabel rightRes + + -- Make resolution Display tree infomation + leftEdgeType + | leftChildNodeType == NetworkNode = NetworkEdge + | leftChildNodeType == LeafNode = PendantEdge + | otherwise = TreeEdge + rightEdgeType + | rightChildNodeType == NetworkNode = NetworkEdge + | rightChildNodeType == LeafNode = PendantEdge + | otherwise = TreeEdge + + edgeLable = + EdgeInfo + { minLength = 0.0 + , maxLength = 0.0 + , midRangeLength = 0.0 + , edgeType = TreeEdge + } + + leftEdge = (curNode, leftIndex, edgeLable{edgeType = leftEdgeType}) + rightEdge = (curNode, rightIndex, edgeLable{edgeType = rightEdgeType}) + leftChildTree = displaySubGraph leftRes + rightChildTree = displaySubGraph rightRes + + -- Data fields empty for display tree data--not needed and multiple copies of everything + newNodeLabel = + VertexInfo + { index = curNode + , bvLabel = resBV + , parents = V.empty + , children = V.fromList [leftIndex, rightIndex] + , nodeType = TreeNode + , vertName = T.pack $ "HTU" <> show curNode + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + + newNode = (curNode, newNodeLabel) + + -- this check for redundant edges in resoluton cash from combinations + -- resolutionEdgeList = leftEdge : (rightEdge: (snd leftChildTree <> snd rightChildTree)) + -- is this required? + existingEdges = snd leftChildTree <> snd rightChildTree + resolutionEdgeList + | (leftEdge `notElem` existingEdges) && (rightEdge `notElem` existingEdges) = leftEdge : (rightEdge : existingEdges) + | (leftEdge `elem` existingEdges) && (rightEdge `elem` existingEdges) = existingEdges + | (leftEdge `notElem` existingEdges) = leftEdge : existingEdges + | otherwise = rightEdge : existingEdges + + resolutionNodeList = newNode : (fst leftChildTree <> fst rightChildTree) + + -- Make the data and cost for the resolution + -- No chnage cost adjustment is True here if PMDL/SI + leftBlockLength = V.length $ displayData leftRes + rightBlockLength = V.length $ displayData rightRes + in do + resolutionMedianCostV <- + if (leftBlockLength == 0) then pure $ V.zip (displayData rightRes) (V.replicate rightBlockLength 0) + else if (rightBlockLength == 0) then pure $ V.zip (displayData leftRes) (V.replicate leftBlockLength 0) + else M.median2M (U.needTwoEdgeNoCostAdjust inGS True) (displayData leftRes) (displayData rightRes) charInfoV + let (resolutionMedianV, resolutionCostV) = V.unzip resolutionMedianCostV + let thisResolutionCost = V.sum resolutionCostV + let displaySubTreeCost = displayCost leftRes + displayCost rightRes + thisResolutionCost + + pure ResolutionData + { displaySubGraph = (resolutionNodeList, resolutionEdgeList) + , displayBVLabel = resBV + , displayData = resolutionMedianV + , childResolutionIndices = (Just leftResIndex, Just rightResIndex) + , resolutionCost = thisResolutionCost + , displayCost = displaySubTreeCost + } + + +{- | extractDisplayTrees takes resolutions and pulls out best cost (head for now) need to change type for multiple best +option for filter based on pop-count for root cost and complete display tree check +-} +extractDisplayTrees ∷ Maybe Int → Bool → V.Vector ResolutionBlockData → (V.Vector [DecoratedGraph], VertexCost) +extractDisplayTrees startVertex checkPopCount inRBDV = + if V.null inRBDV + then (V.empty, 0.0) + else + let (bestBlockDisplayResolutionList, costVect, _) = V.unzip3 $ fmap (getBestResolutionList startVertex checkPopCount) inRBDV + in -- trace ("EDT: " <> (show (V.length bestBlockDisplayResolutionList, fmap length bestBlockDisplayResolutionList, V.sum costVect))) + (bestBlockDisplayResolutionList, V.sum costVect) + + +{- | checkLeafOverlap takes a left right resolution pair list and checks if +there is leaf overlap via comparing displayBVLabel if & = 0 then no +overlap, and adds to resulting list--reverses order--sholdn't matter +-} +checkLeafOverlap ∷ ((ResolutionData, ResolutionData), (Int, Int)) → [((ResolutionData, ResolutionData), (Int, Int))] +checkLeafOverlap inPair@((leftRes, rightRes), (_, _)) = + let leftBV = displayBVLabel leftRes + rightBV = displayBVLabel rightRes + in ([inPair | BV.isZeroVector (leftBV .&. rightBV)]) + + +{- | getBestResolutionList takes ResolutionBlockData and retuns a list of the best valid (ie all leaves in subtree) display trees +for that block with cost and resolujtion data in triple-- if checkPopCount is True--otherwise all display trees of any cost and contitution +startVertex for a component-- to allow for not every leaf being in componnet but still getting softwired cost +-} +getBestResolutionList ∷ Maybe Int → Bool → ResolutionBlockData → ([DecoratedGraph], VertexCost, ResolutionBlockData) +getBestResolutionList startVertex checkPopCount inRDList = + -- trace ("GBRL: " <> (show $ V.length inRDList)) ( + if null inRDList + then error "Null resolution list" + else + let displayTreeList = fmap displaySubGraph inRDList + displayCostList = fmap displayCost inRDList + displayPopList = fmap (complement . displayBVLabel) inRDList + in if not checkPopCount + then + let minCost = minimum displayCostList + displayCostTripleList = V.zip3 displayTreeList displayCostList inRDList + (bestDisplayList, _, bestResList) = V.unzip3 $ V.filter ((== minCost) . snd3) displayCostTripleList + in (fmap LG.mkGraphPair (V.toList bestDisplayList), minCost, bestResList) + else + let minPopCount = minimum $ fmap popCount displayPopList -- max since complemented above + displayBVList = V.zip4 displayTreeList displayCostList displayPopList inRDList + + -- must have all leaves if startvzertex == Nothing, component maximum otherwise + -- this for getting cost of component of a softwired network + validDisplayList = + if isNothing startVertex + then V.filter (BV.isZeroVector . thd4) displayBVList + else V.filter ((== minPopCount) . (popCount . thd4)) displayBVList + validMinCost = V.minimum $ fmap snd4 validDisplayList + (bestDisplayList, _, _, bestResList) = V.unzip4 $ V.filter ((== validMinCost) . snd4) validDisplayList + in -- trace ("Valid display list number:" <> (show $ length validDisplayList)) ( + if V.null validDisplayList + then + error + ( "Null root validDisplayList in getBestResolutionList " + <> show (startVertex, inRDList) + <> " This can be caused if the graphType not set correctly." + ) + else + let lDisplayTreeList = fmap LG.mkGraphPair (V.toList bestDisplayList) + + -- update root cost of display trees for use later (e.g. net penalties, outputting display forrests) + lDisplayTreeList' = fmap (updateRootCost validMinCost) lDisplayTreeList + in -- trace ("GBRL: " <> (show (length lDisplayTreeList', validMinCost))) + (lDisplayTreeList', validMinCost, bestResList) + + +-- ) + +{- | updateRootCost updates the subGraphCost of the root node(s) with input value + node is created, so original is deleted, added, and original edges added back +since deleted when node is +assumes its a tree wiht a single root +-} +updateRootCost ∷ VertexCost → DecoratedGraph → DecoratedGraph +updateRootCost newRootCost inGraph = + let (rootIndex, rootLabel) = head $ LG.getRoots inGraph + rootEdges = LG.out inGraph rootIndex + newRootLabel = rootLabel{subGraphCost = newRootCost} + in -- trace ("DCC: " <> (show newRootCost)) + LG.insEdges rootEdges $ LG.insNode (rootIndex, newRootLabel) $ LG.delNode rootIndex inGraph + + +{- + Traceback code for after intial Postorder softwired pass. All functions with 'New" appended +-} + +{- | softWiredPostOrderTraceBack takes resolution data and assigns correct resolution median +from vertexResolutionData to preliminary data assignments in vertData. +Proceeds via typical pre-order pass over display tree for each block +using the indices of left and right (if present) of resolutions +first gets root assignment from resolution data and then each block is traversed given its block display tree +-} +softWiredPostOrderTraceBack ∷ Int → PhylogeneticGraph → PhyG PhylogeneticGraph +softWiredPostOrderTraceBack rootIndex inGraph@(inSimpleGraph, b, canonicalGraph, _, _, f) = + if LG.isEmpty canonicalGraph + then pure emptyPhylogeneticGraph + else -- this condition can arise due to strictness in graph evaluation in parallel + + if length (LG.descendants canonicalGraph rootIndex) /= 2 + then pure emptyPhylogeneticGraph + else -- error ("Root node has improper number of children: " <> show (LG.descendants canonicalGraph rootIndex) <>"\n" <> (LG.prettyIndices canonicalGraph)) + + let -- extract display trees and bloxck char trees from PhylogeneticGraph + -- block character trees do not exist yet + displayTreeV = (head <$> fth6 inGraph) + + -- get root node resolution data from canonical Graph created in postorder + rootLabel = fromJust $ LG.lab canonicalGraph rootIndex + rootResData = vertexResolutionData rootLabel + + -- traceback for each block based on its display tree, updating trees as it goes, left descendent then right + -- at this stage all character trees will have same root descendents sionce all rooted from outgropu postorder traversal + -- later (after reroot pass) this will not be the case since each charcater may have a unique traversal root/edge + leftChild = head $ LG.descendants canonicalGraph rootIndex + rightChild = last $ LG.descendants canonicalGraph rootIndex + + -- get left right from BV as in postorder + ((leftChild', _), (rightChild', _)) = + U.leftRightChildLabelBVNode + ((leftChild, fromJust $ LG.lab canonicalGraph leftChild), (rightChild, fromJust $ LG.lab canonicalGraph rightChild)) + + getBestAction ∷ ResolutionBlockData → ([DecoratedGraph], VertexCost, ResolutionBlockData) + getBestAction = getBestResolutionList (Just rootIndex) True + + updateAction ∷ (ResolutionData, DecoratedGraph) → (DecoratedGraph, V.Vector DecoratedGraph) + updateAction = updateRootBlockTrees rootIndex + + traceBackAction + ∷ LG.Node → (DecoratedGraph, V.Vector DecoratedGraph, Maybe Int, Int) → (DecoratedGraph, V.Vector DecoratedGraph) + traceBackAction = traceBackBlock canonicalGraph + in do + -- let (rootNodes, leafNode, treeNodes,networkNodes) = LG.splitVertexList inSimpleGraph + -- logWith LogInfo ("SWPOT: " <> (show (length rootNodes, length leafNode, length treeNodes, length networkNodes))) + + -- extract (first) best resolution for each block--there can be more than one for each, but only use the first for + -- traceback, preliminary and final assignment etc--part of the heuristic + resolutionPar ← getParallelChunkMap + let resolutionResult = resolutionPar getBestAction $ V.toList rootResData + let (_, _, rootDisplayBlockCharResolutionV) = unzip3 resolutionResult + -- \$ PU.seqParMap PU.myStrategy (getBestResolutionList (Just rootIndex) True) rootResData + let firstOfEachRootRes = fmap V.head rootDisplayBlockCharResolutionV + + -- get preliminary character data for blocks + -- these should be ok wihtout left right check since were creted with that check on post order + let (leftIndexList, rightIndexList) = V.unzip $ fmap childResolutionIndices $ V.fromList firstOfEachRootRes + + -- update root vertex info for display and character trees for each block + -- this includes preliminary data and other fields + updatePar ← getParallelChunkMap + let updateResult = updatePar updateAction (zip firstOfEachRootRes (V.toList displayTreeV)) + + let (rootUpdatedDisplayTreeV, rootUpdatedCharTreeVV) = unzip updateResult + -- \$ PU.seqParMap PU.myStrategy (updateRootBlockTrees rootIndex) (V.zip (V.fromList firstOfEachRootRes) displayTreeV) + + traceLeftPar ← getParallelChunkMap + let leftResult = + traceLeftPar + (traceBackAction leftChild') + (L.zip4 rootUpdatedDisplayTreeV rootUpdatedCharTreeVV (V.toList leftIndexList) ([0 .. (length rootUpdatedDisplayTreeV - 1)])) + let (traceBackDisplayTreeVLeft, traceBackCharTreeVVLeft) = unzip leftResult + -- \$ PU.seqParMap PU.myStrategy (traceBackBlock canonicalGraph leftChild') (V.zip4 rootUpdatedDisplayTreeV rootUpdatedCharTreeVV leftIndexList (V.fromList [0..(V.length rootUpdatedDisplayTreeV - 1)])) + + traceRightPar ← getParallelChunkMap + let rightResult = + traceRightPar + (traceBackAction rightChild') + ( L.zip4 traceBackDisplayTreeVLeft traceBackCharTreeVVLeft (V.toList rightIndexList) ([0 .. (length rootUpdatedDisplayTreeV - 1)]) + ) + let (traceBackDisplayTreeV, traceBackCharTreeVV) = unzip rightResult + -- \$ PU.seqParMap PU.myStrategy (traceBackBlock canonicalGraph rightChild') (V.zip4 traceBackDisplayTreeVLeft traceBackCharTreeVVLeft rightIndexList (V.fromList [0..(V.length rootUpdatedDisplayTreeV - 1)])) + + let newCanonicalGraph = backPortBlockTreeNodesToCanonicalGraph canonicalGraph (V.fromList traceBackDisplayTreeV) + + -- this is a hack due to missing nodes in some character trees--perhpas issue with postorder resolutions? + if LG.isEmpty newCanonicalGraph + then pure emptyPhylogeneticGraph + else + pure $ + (inSimpleGraph, b, newCanonicalGraph, fmap (: []) (V.fromList traceBackDisplayTreeV), (V.fromList traceBackCharTreeVV), f) + + +{- | traceBackBlock performs softwired traceback on block data returns updated display and character trees +the block index specifies which resolution list from the canonical tree at node nodeIndex +the resoliution index is the resolution element that was used to create the parent's state +need to account for out degree 2 and 1 in recursive calls +-} +traceBackBlock + ∷ DecoratedGraph + → LG.Node + → (DecoratedGraph, V.Vector DecoratedGraph, Maybe Int, Int) + → (DecoratedGraph, V.Vector DecoratedGraph) +traceBackBlock canonicalGraph nodeIndex (displayTree, charTreeV, resolutionIndex, blockIndex) = + -- trace ("TBB: " <> "Node " <> (show nodeIndex) <> " Block " <> (show blockIndex) <> " Resolution " <> (show (resolutionIndex, LG.descendants displayTree nodeIndex)) + -- <> " nrd: " <> (show (length (vertexResolutionData (fromJust $ LG.lab canonicalGraph nodeIndex)), fmap length (vertexResolutionData (fromJust $ LG.lab canonicalGraph nodeIndex)))) ) ( + -- <> "\n" <> (show (vertexResolutionData (fromJust $ LG.lab canonicalGraph nodeIndex)))) ( + if LG.isEmpty displayTree || V.null charTreeV + then error "Null data in traceBackBlock" + else + let -- get block resolution data from canonical graph + nodeCanonicalLabel = fromJust $ LG.lab canonicalGraph nodeIndex + nodeResolutionData = vertexResolutionData nodeCanonicalLabel + + -- hack checking for too high res index here + newIndex = + if (fromJust resolutionIndex) < length (nodeResolutionData V.! blockIndex) + then fromJust resolutionIndex + else -- traceNoLF ("(" <> (show (fromJust resolutionIndex, length (nodeResolutionData V.! blockIndex))) <> ")") + (length (nodeResolutionData V.! blockIndex)) - 1 + + -- blockResolutionData = (nodeResolutionData V.! blockIndex) V.! fromJust resolutionIndex + blockResolutionData = (nodeResolutionData V.! blockIndex) V.! newIndex + + -- update display tree and character tree nodes + (newDisplayTree, newCharTreeV) = updateNodeBlockTrees nodeIndex (blockResolutionData, displayTree, charTreeV) + + -- get left and right display resolution indices + (leftResIndex, rightResIndex) = childResolutionIndices blockResolutionData + + -- get left and right node children, won't recurse if leaf so no issue there + childList = LG.descendants displayTree nodeIndex + in -- trace ("TBB: " <> (show (length childList, length nodeResolutionData, blockIndex, length (nodeResolutionData V.! blockIndex), fromJust resolutionIndex))) $ + if isNothing resolutionIndex + then + error + ( "Nothing resolution in traceBackBlock of node " + <> show nodeIndex + <> " with children " + <> show childList + <> LG.prettyIndices displayTree + ) + else + if length childList > 2 + then error ("Node " <> show nodeIndex <> " with > 2 children: " <> show childList) + else + if null childList + then -- its a leaf + (newDisplayTree, newCharTreeV) + else + if length childList == 1 + then -- recurse to left of in 1 out 1 node + traceBackBlock canonicalGraph (head childList) (newDisplayTree, newCharTreeV, leftResIndex, blockIndex) + else -- two children to recurse to + + let leftChild = head childList + rightChild = last childList + -- get left right from BV as in postorder + ((leftChild', _), (rightChild', _)) = + U.leftRightChildLabelBVNode + ((leftChild, fromJust $ LG.lab canonicalGraph leftChild), (rightChild, fromJust $ LG.lab canonicalGraph rightChild)) + + (leftDisplayTree, leftCharTreeV) = traceBackBlock canonicalGraph leftChild' (newDisplayTree, newCharTreeV, leftResIndex, blockIndex) + (rightDisplayTree, rightCharTreeV) = traceBackBlock canonicalGraph rightChild' (leftDisplayTree, leftCharTreeV, rightResIndex, blockIndex) + in (rightDisplayTree, rightCharTreeV) + + +-- ) + +{- | updateNodeBlockTrees takes root resolution data and sets various fields in block display +and creates character trees from block display tree +-} +updateNodeBlockTrees + ∷ LG.Node → (ResolutionData, DecoratedGraph, V.Vector DecoratedGraph) → (DecoratedGraph, V.Vector DecoratedGraph) +updateNodeBlockTrees nodeIndex (nodeRes, displayTree, charTreeV) = + if LG.isEmpty displayTree + then error "Null data in updateNodeBlockTrees" + else -- update Display tree vertex info + -- data are a singleton vector since only one "block" (with characters) per block + -- same, but double for characters V.singleton of V.singleton of CharacterData + + let origNodeLabel = fromJust $ LG.lab displayTree nodeIndex + newNodeLabel = + origNodeLabel + { vertData = V.singleton $ displayData nodeRes + , vertexResolutionData = mempty + , vertexCost = resolutionCost nodeRes + , subGraphCost = displayCost nodeRes + } + newDisplayTree = LG.updateNodeLabel displayTree nodeIndex newNodeLabel + newCharTreeV = + V.zipWith + (updateNodeCharacterTree nodeIndex (displayData nodeRes)) + charTreeV + (V.fromList [0 .. (V.length (displayData nodeRes) - 1)]) + in -- trace ("URBT: " <> (show $ LG.prettyIndices displayTree) <> "\n" <> (show $ LG.prettyIndices newDisplayTree)) + (newDisplayTree, newCharTreeV) + + +-- | updateNodeCharacterTree updates an individual character tree with node info from a display tree +updateNodeCharacterTree ∷ LG.Node → V.Vector CharacterData → DecoratedGraph → Int → DecoratedGraph +updateNodeCharacterTree nodeIndex nodeBlockCharData charTree charIndex = + if LG.isEmpty charTree + then error "Empty tree in updateNodeCharacterTree" + else + let charData = nodeBlockCharData V.! charIndex + origNodeLabel = fromJust $ LG.lab charTree nodeIndex + newNodeLabel = + origNodeLabel + { vertData = V.singleton (V.singleton charData) + , vertexResolutionData = mempty + , vertexCost = localCost charData + , subGraphCost = globalCost charData + } + newCharTree = LG.updateNodeLabel charTree nodeIndex newNodeLabel + in -- trace ("URCT: " <> (show charData)) + newCharTree + + +{- | updateRootBlockTrees takes root resolution data and sets various fields in block display +and creates character trees from block display tree +-} +updateRootBlockTrees ∷ LG.Node → (ResolutionData, DecoratedGraph) → (DecoratedGraph, V.Vector DecoratedGraph) +updateRootBlockTrees nodeIndex (nodeRes, displayTree) = + if LG.isEmpty displayTree + then error "Null data in updateNodeBlockTrees" + else -- update Display tree vertex info + -- data are a singleton vector since only one "block" (with characters) per block + -- same, but double for characters V.singleton of V.singleton of CharacterData + + let origNodeLabel = fromJust $ LG.lab displayTree nodeIndex + newNodeLabel = + origNodeLabel + { vertData = V.singleton $ displayData nodeRes + , vertexResolutionData = mempty + , vertexCost = resolutionCost nodeRes + , subGraphCost = displayCost nodeRes + } + newDisplayTree = LG.updateNodeLabel displayTree nodeIndex newNodeLabel + newCharTreeV = + fmap + (createNodeCharacterTree nodeIndex (displayData nodeRes) newDisplayTree) + (V.fromList [0 .. (V.length (displayData nodeRes) - 1)]) + in -- trace ("URBT: " <> (show $ LG.prettyIndices displayTree) <> "\n" <> (show $ LG.prettyIndices newDisplayTree)) + (newDisplayTree, newCharTreeV) + + +-- | createNodeCharacterTree creates an individual character tree with node info from a display tree +createNodeCharacterTree ∷ LG.Node → V.Vector CharacterData → DecoratedGraph → Int → DecoratedGraph +createNodeCharacterTree nodeIndex nodeBlockCharData displayTree charIndex = + if LG.isEmpty displayTree + then error "Empty tree in updateNodeCharacterTree" + else + let charData = nodeBlockCharData V.! charIndex + origNodeLabel = fromJust $ LG.lab displayTree nodeIndex + newNodeLabel = + origNodeLabel + { vertData = V.singleton (V.singleton charData) + , vertexResolutionData = mempty + , vertexCost = localCost charData + , subGraphCost = globalCost charData + } + newCharTree = LG.updateNodeLabel displayTree nodeIndex newNodeLabel + in -- trace ("URCT: " <> (show charData)) + newCharTree + + +{- | backPortBlockTreeNodesToCanonicalGraph takes block display trees (updated presumably) and ports the block tree node +labels to the cononical Graph--very similar to backPortCharTreeNodesToBlockTree but character vector is not singleton +back port is based on indices of block characters that may have fewer nodes than canonical in a +split graph/ sub graph situation like in swap and fuse +hence not all canonical nodes may be updated--left unchanged, so need to add back those unchanged +-} +backPortBlockTreeNodesToCanonicalGraph ∷ DecoratedGraph → V.Vector DecoratedGraph → DecoratedGraph +backPortBlockTreeNodesToCanonicalGraph inCanonicalGraph blockTreeVect = + let canonicalNodes = LG.labNodes inCanonicalGraph + canonicalEdges = LG.labEdges inCanonicalGraph + + -- vector (characters) of vector (nodes) of labels + blockTreeNodeLabelsVV = fmap ((V.fromList . fmap snd) . LG.labNodes) blockTreeVect + -- blockTreeNodeIndicesV = V.fromList $ V.head $ fmap (fmap fst) $ fmap LG.labNodes blockTreeVect + blockTreeNodeIndicesV = V.fromList $ fmap fst $ LG.labNodes $ V.head blockTreeVect + + -- setting max index if things aren't all same length--that's the prob + updatedCanonicalNodes = + V.toList $ + fmap + (updateCanonicalNodes inCanonicalGraph blockTreeNodeLabelsVV) + (V.zip blockTreeNodeIndicesV (V.fromList [0 .. (length blockTreeNodeIndicesV - 1)])) + -- maxIndex = minimum $ fmap length blockTreeNodeLabelsVV + -- updatedCanonicalNodes = V.toList $ fmap (updateCanonicalNodes inCanonicalGraph blockTreeNodeLabelsVV) (V.zip blockTreeNodeIndicesV (V.fromList [0..(maxIndex- 1)])) + + -- add in nodes not in characters trees--can happen during swap split trees + -- based on vertex index first field of node + unModifiedNodes = orderedNodeMinus canonicalNodes updatedCanonicalNodes + in -- update BV vector of unModified nodes? + + -- trace ("BPTCG: " <> (show (fmap fst unModifiedNodes))) + + -- this is a hack if graph is imporper--not sure why htis happens yet + if (V.null blockTreeVect) || (not $ allSameLength blockTreeNodeLabelsVV) + then -- trace ("BPTCG: " <> (show (fmap length blockTreeNodeLabelsVV))) + LG.empty + else LG.mkGraph (updatedCanonicalNodes <> unModifiedNodes) canonicalEdges + where + allSameLength a = + if V.null a + then True + else + if length a == 1 + then True + else + if (length $ V.head a) /= (length $ a V.! 1) + then False + else allSameLength (V.tail a) + + +{- | updateCanonicalNodes takes a pair of block node index and vector of labels and +assigns data to canonical node of same index +-} +updateCanonicalNodes ∷ DecoratedGraph → V.Vector (V.Vector VertexInfo) → (LG.Node, Int) → LG.LNode VertexInfo +updateCanonicalNodes canonicalGraph blockNodeLabelVV (blockNodeIndex, vectIndex) = + let canonicalLabel = fromJust $ LG.lab canonicalGraph blockNodeIndex + blockNodeLabelV = fmap (V.! vectIndex) blockNodeLabelVV + vertDataV = V.concatMap vertData blockNodeLabelV + vertCostV = fmap vertexCost blockNodeLabelV + subGraphCostV = fmap subGraphCost blockNodeLabelV + + -- this for Naive softwired where there is no canonical bitvecotr labelling for nodes + -- and set to default [False] + canonicalBV = + if bvLabel canonicalLabel /= BV.fromBits [False] + then bvLabel canonicalLabel + else L.foldl1 (.|.) $ fmap bvLabel blockNodeLabelV + + -- update Info + newVertCost = V.sum vertCostV + newSubGraphCost = V.sum subGraphCostV + newLabel = canonicalLabel{bvLabel = canonicalBV, vertData = vertDataV, vertexCost = newVertCost, subGraphCost = newSubGraphCost} + in -- trace ("UCN:" <> (show (blockNodeIndex, vectIndex, fmap length blockNodeLabelVV))) + (blockNodeIndex, newLabel) + + +{- | orderedNodeMinus takes two lists of pairs where first pair is Int and +pairs are orderd by first element and returns a list of nodes in first list +not in second +assumes lists are orderd (haven't thought if non-unique elements) +should be O(n) +-} +orderedNodeMinus ∷ [(Int, a)] → [(Int, b)] → [(Int, a)] +orderedNodeMinus firstList secondList + | null firstList = [] + | null secondList = firstList + | otherwise = + let firstFirst@(af, _) = head firstList + (as, _) = head secondList + in if af < as + then firstFirst : orderedNodeMinus (tail firstList) secondList + else + if af == as + then orderedNodeMinus (tail firstList) (tail secondList) + else -- asf > as + orderedNodeMinus firstList (tail secondList) diff --git a/src/GraphOptimization/PreOrderFunctions.hs b/src/GraphOptimization/PreOrderFunctions.hs new file mode 100644 index 000000000..05d34902b --- /dev/null +++ b/src/GraphOptimization/PreOrderFunctions.hs @@ -0,0 +1,2016 @@ +{-# OPTIONS_GHC -Wno-missed-specialisations #-} + +{- | +Module specifying pre-order graph functions +-} +module GraphOptimization.PreOrderFunctions ( + createFinalAssignmentOverBlocks, + getBlockCostPairsFinal, + makeIAUnionAssignments, + preOrderTreeTraversal, + setFinalToPreliminaryStates, + setPreliminaryToFinalStates, + updateLeafIABlock, + zero2Gap, +) where + +import Bio.DynamicCharacter +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.List qualified as L +import Data.Map qualified as MAP +import Data.Maybe +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import DirectOptimization.PreOrder qualified as DOP +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import Graphs.GraphOperations qualified as GO +import Input.BitPack qualified as BP +import PHANE.Evaluation +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as S +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.ThreeWayFunctions qualified as TW +import Utilities.Utilities qualified as U + + +-- import Debug.Trace +-- import ParallelUtilities qualified as PU + +{- | +preOrderTreeTraversal takes a preliminarily labelled PhylogeneticGraph +and returns a full labels with 'final' assignments based on character decorated graphs +created postorder (5th of 6 fields). +the preorder states are created by traversing the traversal DecoratedGraphs in the 5th filed of PhylogeneticGraphs +these are by block and character, Exact characters are vectors of standard characters and each sequence (non-exact) +has its own traversal graph. These should be trees here (could be forests later) and should all have same root (number taxa) +but worth checking to make sure. +these were created by "splitting" them after preorder, or by separate passes in softwired graphs + +For sequence characters (slim/wide/huge) final states are created either by DirectOptimization or ImpliedAlignment +if DO--then does a median between parent andchgaps wherre needed--then doing a 3way state assignmeent filteringgaps out +if IA--then a separate post and preorder pass are donne on the slim/wide/huge/AI fields to crete full IA assignments +that are then filtered of gaps and assigned to th efinal fields + +The final states are propagated back to the second field +DecoratedGraph of the full Phylogenetic graph--which does NOT have the character based preliminary assignments +ie postorder--since those are traversal specific +the character specific decorated graphs have appropriate post and pre-order assignments +the traversal begins at the root (for a tree) and proceeds to leaves. + +Hardfwired dos not have IA fileds so skipped--so medians for edges etc must be do calculated on final states + +In, general, no Change adjustment (for PMDL and SI) are not perfomred (False option) since graph costs are not calculated in the preorder passes +-} +preOrderTreeTraversal + ∷ GlobalSettings → AssignmentMethod → Bool → Bool → Bool → Int → Bool → PhylogeneticGraph → PhyG PhylogeneticGraph +preOrderTreeTraversal inGS finalMethod staticIA calculateBranchLengths hasNonExact rootIndex useMap (inSimple, inCost, inDecorated, blockDisplayV, blockCharacterDecoratedVV, inCharInfoVV) = + let -- parallel setup + doBlockAction ∷ (V.Vector CharInfo, V.Vector DecoratedGraph) → V.Vector DecoratedGraph + doBlockAction = doBlockTraversal' inGS finalMethod staticIA rootIndex + + updateLeafAction ∷ Int → (V.Vector DecoratedGraph, V.Vector DecoratedGraph, V.Vector CharInfo) → V.Vector DecoratedGraph + updateLeafAction = updateLeafIABlock' + + makeIAAction ∷ (V.Vector DecoratedGraph, V.Vector CharInfo) → V.Vector DecoratedGraph + makeIAAction = makeIAUnionAssignments' finalMethod rootIndex + in if LG.isEmpty inDecorated + then pure emptyPhylogeneticGraph -- error "Empty tree in preOrderTreeTraversal" + else + if null blockCharacterDecoratedVV + then pure emptyPhylogeneticGraph + else do + -- trace ("In PreOrder\n" <> "Simple:\n" <> (LG.prettify inSimple) <> "Decorated:\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph inDecorated) <> "\n" <> (GFU.showGraph inDecorated)) ( + -- mapped recursive call over blocks, later character + blockPar ← getParallelChunkMap + let preOrderBlockVect = V.fromList $ blockPar doBlockAction (V.toList $ V.zip inCharInfoVV blockCharacterDecoratedVV) + + -- if final non-exact states determined by IA then perform passes and assignments of final and final IA fields + -- always do IA pass if Tree--but only assign to final if finalMethod == ImpliedAlignment + -- also assignes unions for use in rearrangemenrts + -- update leaf IA assignments based on cotracted edge for softWired to make IAs + -- no need for trees--can't for hardWired + softwiredUpdatedLeafIA ← + if graphType inGS /= SoftWired + then pure preOrderBlockVect + else + let -- get display trees for each data block-- takes first of potentially multiple + contractedBlockCharacterDecoratedVV = fmap (fmap LG.contractIn1Out1Edges) blockCharacterDecoratedVV + in do + -- preform full passes on contracted graphs on blocks to create corrext IA fields for leaves + contractPar ← getParallelChunkMap + let contractedBlockVect = contractPar doBlockAction (zip (V.toList inCharInfoVV) (V.toList contractedBlockCharacterDecoratedVV)) + + -- update leaf IA fields with contracted IAs + let maxLeafIndex = maximum $ filter (LG.isLeaf inSimple) $ LG.nodes inSimple + + blockIAPar ← getParallelChunkMap + let blockCharDecNewLeafIA = + V.fromList $ + blockIAPar (updateLeafAction maxLeafIndex) $ + V.toList $ + V.zip3 preOrderBlockVect (V.fromList contractedBlockVect) inCharInfoVV + -- holder for now + pure blockCharDecNewLeafIA + + preOrderBlockVect' ← + if hasNonExact && (graphType inGS /= HardWired) + then do + preOrderPar ← getParallelChunkMap + pure . V.fromList . preOrderPar makeIAAction . V.toList $ V.zip softwiredUpdatedLeafIA inCharInfoVV + else pure preOrderBlockVect + + let fullyDecoratedGraph = + assignPreorderStatesAndEdges + inGS + finalMethod + calculateBranchLengths + rootIndex + preOrderBlockVect' + useMap + inCharInfoVV + inDecorated + + if null blockCharacterDecoratedVV + then + error + ( "Empty preOrderBlockVect in preOrderTreeTraversal at root index rootIndex: " + <> show rootIndex + <> " This can be caused if the graphType not set correctly: " + <> show (graphType inGS) + ) + else pure (inSimple, inCost, fullyDecoratedGraph, blockDisplayV, preOrderBlockVect, inCharInfoVV) + + +-- | updateLeafIABlock' is a triple argument to allow for parMap +updateLeafIABlock' ∷ Int → (V.Vector DecoratedGraph, V.Vector DecoratedGraph, V.Vector CharInfo) → V.Vector DecoratedGraph +updateLeafIABlock' maxLeafIndex (origCharV, newCharV, charInfoV) = V.zipWith3 (updateLeafIAChar maxLeafIndex) origCharV newCharV charInfoV + + +{- | updateLeafIABlock takes a graph, existing character info and updates IA fields in leaves +for IA post and preorder passes on softwored graphs that may have indegree=outdegree=1 vertices +these nodes screw up the implied alignment algorithm +-} +updateLeafIABlock ∷ Int → V.Vector DecoratedGraph → V.Vector DecoratedGraph → V.Vector CharInfo → V.Vector DecoratedGraph +updateLeafIABlock maxLeafIndex origCharV newCharV charInfoV = V.zipWith3 (updateLeafIAChar maxLeafIndex) origCharV newCharV charInfoV + + +{- | ujpdates single charracter leaf IA assignments +uses max net edge thing +-} +updateLeafIAChar ∷ Int → DecoratedGraph → DecoratedGraph → CharInfo → DecoratedGraph +updateLeafIAChar maxLeafIndex origCharGraph newCharGraph charInfo = + let origLeafVertexList = filter ((<= maxLeafIndex) . fst) $ LG.labNodes origCharGraph + originalNonLeafVertexList = filter ((> maxLeafIndex) . fst) $ LG.labNodes origCharGraph + newLeafVertexList = filter ((<= maxLeafIndex) . fst) $ LG.labNodes newCharGraph + updatedVertexList = zipWith (updateIAFields charInfo) origLeafVertexList newLeafVertexList + origEdgeList = LG.labEdges origCharGraph + in LG.mkGraph (updatedVertexList <> originalNonLeafVertexList) origEdgeList + + +{- | updateIAFields updates the IA field in the first node with that of second if its non-exact sequence character +assumes single character trees +-} +updateIAFields ∷ CharInfo → LG.LNode VertexInfo → LG.LNode VertexInfo → LG.LNode VertexInfo +updateIAFields charInfo origNode@(origIndex, origLabel) (_, newLabel) = + -- trace ("USF: Node " <> (show origIndex) <> " " <> (show (V.length $ vertData origLabel, V.length $ vertData newLabel)) <> " " <> (show (fmap V.length $ vertData origLabel)) <> " " <> (show (fmap V.length $ vertData newLabel))) $ + let characterType = charType charInfo + origChar = V.head $ V.head $ vertData origLabel + newChar = V.head $ V.head $ vertData newLabel + in if characterType `notElem` nonExactCharacterTypes + then origNode + else + let updatedChar = + if characterType `elem` [SlimSeq, NucSeq] + then origChar{slimAlignment = slimAlignment newChar} + else + if characterType `elem` [WideSeq, AminoSeq] + then origChar{wideAlignment = wideAlignment newChar} + else + if characterType == HugeSeq + then origChar{hugeAlignment = hugeAlignment newChar} + else error ("Character type unimplemented : " <> show characterType) + in (origIndex, origLabel{vertData = V.singleton (V.singleton updatedChar)}) + + +-- | makeIAUnionAssignments' version of makeIAUnionAssignments allowing tuple for parMap +makeIAUnionAssignments' ∷ AssignmentMethod → Int → (V.Vector DecoratedGraph, V.Vector CharInfo) → V.Vector DecoratedGraph +makeIAUnionAssignments' finalMethod rootIndex (a, b) = V.zipWith (makeCharacterIAUnion finalMethod rootIndex) a b + + +{- | makeIAUnionAssignments takes the vector of vector of character trees and (if) slim/wide/huge +does an additional post and pre order pass to assign IAand final fields in all sequece types slim/wide/huge +assigns union for all types +-} +makeIAUnionAssignments ∷ AssignmentMethod → Int → V.Vector DecoratedGraph → V.Vector CharInfo → V.Vector DecoratedGraph +makeIAUnionAssignments finalMethod rootIndex = V.zipWith (makeCharacterIAUnion finalMethod rootIndex) + + +{- | makeCharacterIAUnion takes an individual character postorder tree and if perform post and preorder IA passes +and assignment to final field in slim/wide/huge +also assignes unions for all types +-} +makeCharacterIAUnion ∷ AssignmentMethod → Int → DecoratedGraph → CharInfo → DecoratedGraph +makeCharacterIAUnion finalMethod rootIndex inGraph charInfo = + -- if charType charInfo `notElem` nonExactCharacterTypes then inGraph + if False + then inGraph + else + let postOrderIATree = postOrderIAUnion inGraph charInfo [(rootIndex, fromJust $ LG.lab inGraph rootIndex)] + preOrderIATree = + preOrderIA postOrderIATree rootIndex finalMethod charInfo $ + zip [(rootIndex, fromJust $ LG.lab postOrderIATree rootIndex)] [(rootIndex, fromJust $ LG.lab postOrderIATree rootIndex)] + in -- trace ("MCIAU:" <> (U.getUnionFieldsNode $ vertData $ fromJust $ LG.lab postOrderIATree rootIndex) <> "\n" <> (U.getUnionFieldsNode $ vertData $ fromJust $ LG.lab postOrderIATree 0) + -- <> "\nAfter preorder:\t" <> (U.getUnionFieldsNode $ vertData $ fromJust $ LG.lab preOrderIATree rootIndex) <> "\n" <> (U.getUnionFieldsNode $ vertData $ fromJust $ LG.lab preOrderIATree 0)) + preOrderIATree + + +{- | postOrderIAUnion performs a post-order IA pass assigning leaf preliminary states +from the "alignment" fields and setting HTU preliminary by calling the apropriate 2-way +matrix +should be OK for any root--or partial graph as in for branch swapping +also sets unions for all character types, IA based for sequence +Since costs are not used here, No Change Adjust is set to False +-} +postOrderIAUnion ∷ DecoratedGraph → CharInfo → [LG.LNode VertexInfo] → DecoratedGraph +postOrderIAUnion inGraph charInfo inNodeList = + if null inNodeList + then inGraph + else + let inNode@(nodeIndex, nodeLabel) = head inNodeList + (inNodeEdges, outNodeEdges) = LG.getInOutEdges inGraph nodeIndex + inCharacter = V.head $ V.head $ vertData nodeLabel + nodeType' = GO.getNodeType inGraph nodeIndex + in -- checking sanity of data + if V.null $ vertData nodeLabel + then error "Null vertData in postOrderIA" + else + if V.null $ V.head $ vertData nodeLabel + then -- missing data for taxon + error "Null vertData data in postOrderIA" + else -- leaf take assignment from alignment field + + if nodeType' == LeafNode + then -- set leaf union fields to preliminary or IA fields + + let newCharacter = M.makeIAUnionPrelimLeaf charInfo inCharacter + newLabel = nodeLabel{vertData = V.singleton (V.singleton newCharacter), nodeType = nodeType'} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph + in postOrderIAUnion newGraph charInfo (tail inNodeList) + else -- HTU take create assignment from children + + let childNodes = LG.labDescendants inGraph inNode + childTree = postOrderIAUnion inGraph charInfo childNodes + in -- trace ("Children: " <> (show $ fmap fst childNodes)) ( + + if length childNodes > 2 + then error ("Too many children in postOrderIA: " <> show (length childNodes)) + else -- in 1 out 1 vertex + + if length childNodes == 1 + then + let childIndex = fst $ head childNodes + childLabel = fromJust $ LG.lab childTree childIndex + childCharacter = V.head $ V.head $ vertData childLabel + in -- sanity checks + if isNothing (LG.lab childTree (fst $ head childNodes)) + then error ("No label for node: " <> show (fst $ head childNodes)) + else + if V.null $ vertData childLabel + then error "Null vertData in postOrderIA" + else + if V.null $ V.head $ vertData childLabel + then error "Null head vertData data in postOrderIA" + else + let newLabel = nodeLabel{vertData = V.singleton (V.singleton childCharacter), nodeType = nodeType'} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex childTree + in -- trace ("PostO1Child: " <> (show nodeIndex) <> " " <> (show $ slimFinal childCharacter)) + postOrderIAUnion newGraph charInfo (tail inNodeList) + else -- two children + + if length childNodes == 2 + then + let childIndices = fmap fst childNodes + childlabels = fmap (fromJust . LG.lab childTree) childIndices + childCharacters = fmap vertData childlabels + leftChar = V.head $ V.head $ head childCharacters + rightChar = V.head $ V.head $ last childCharacters + + -- Since this is a median + isMedian = True + newCharacter = M.makeIAPrelimCharacter isMedian charInfo inCharacter leftChar rightChar + + newLabel = nodeLabel{vertData = V.singleton (V.singleton newCharacter), nodeType = nodeType'} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex childTree + in -- trace ("PostO2hildren: " <> (show nodeIndex) <> " " <> (show $ slimFinal newCharacter) <> " " <> (show $ nodeType newLabel)) -- <> " From: " <> (show childlabels)) + postOrderIAUnion newGraph charInfo (tail inNodeList) + else error ("No children in non-leaf node: " <> (show nodeIndex) <> "\n" <> (LG.prettyIndices inGraph)) + + +-- ) +-- ) + +{- | preOrderIA performs a pre-order IA pass assigning via the apropriate 3-way matrix +the "final" fields are also set by filtering out gaps and 0. +skips non-unaligned-sequence types +-} +preOrderIA + ∷ DecoratedGraph → Int → AssignmentMethod → CharInfo → [(LG.LNode VertexInfo, LG.LNode VertexInfo)] → DecoratedGraph +preOrderIA inGraph rootIndex finalMethod charInfo inNodePairList = + if null inNodePairList + then inGraph + else + let (inNode@(nodeIndex, nodeLabel), (_, parentNodeLabel)) = head inNodePairList + (inNodeEdges, outNodeEdges) = LG.getInOutEdges inGraph nodeIndex + characterType = charType charInfo + inCharacter = V.head $ V.head $ vertData nodeLabel + inCharacter' = inCharacter + parentCharacter = V.head $ V.head $ vertData parentNodeLabel + childNodes = LG.labDescendants inGraph inNode + in -- trace ("PreIA Node:" <> (show nodeIndex) <> " " <> (show $ nodeType nodeLabel) <> " " <> (show (fmap fst $ fmap fst inNodePairList,fmap fst $ fmap snd inNodePairList))) ( + -- checking sanity of data + if V.null $ vertData nodeLabel + then error "Null vertData in preOrderIA" + else + if V.null $ V.head $ vertData nodeLabel + then error "Null vertData data in preOrderIA" + else + if length childNodes > 2 + then error ("Too many children in preOrderIA: " <> show (length childNodes)) + else -- leaf done in post-order + + if nodeType nodeLabel == LeafNode + then preOrderIA inGraph rootIndex finalMethod charInfo (tail inNodePairList) + else + if nodeType nodeLabel == RootNode || nodeIndex == rootIndex + then + let newCharacter + | characterType `elem` [SlimSeq, NucSeq] = + inCharacter + { slimIAFinal = extractMediansGapped $ slimIAPrelim inCharacter' + , slimFinal = extractMedians $ slimGapped inCharacter' + } + | characterType `elem` [WideSeq, AminoSeq] = + inCharacter + { wideIAFinal = extractMediansGapped $ wideIAPrelim inCharacter' + , wideFinal = extractMedians $ wideGapped inCharacter' + } + | characterType == HugeSeq = + inCharacter + { hugeIAFinal = extractMediansGapped $ hugeIAPrelim inCharacter' + , hugeFinal = extractMedians $ hugeGapped inCharacter' + } + | otherwise = inCharacter -- error ("Unrecognized character type " <> show characterType) + newLabel = nodeLabel{vertData = V.singleton (V.singleton newCharacter)} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph + parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) + in -- trace ("PreIARoot: " <> (show nodeIndex) <> " IAFinal: " <> (show $ slimIAFinal newCharacter) <> " Final: " <> (show $ slimFinal newCharacter)) + preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList <> zip childNodes parentNodeList) + else -- single child, take parent final assignments, but keep postorder assignments + + if length childNodes == 1 + then + let newCharacter + | characterType `elem` [SlimSeq, NucSeq] = + inCharacter + { slimFinal = slimFinal parentCharacter + , slimIAFinal = slimIAFinal parentCharacter + } + | characterType `elem` [WideSeq, AminoSeq] = + inCharacter + { wideFinal = wideFinal parentCharacter + , wideIAFinal = wideIAFinal parentCharacter + } + | characterType == HugeSeq = + inCharacter + { hugeFinal = hugeFinal parentCharacter + , hugeIAFinal = hugeIAFinal parentCharacter + } + | otherwise = inCharacter -- error ("Unrecognized character type " <> show characterType) + newLabel = nodeLabel{vertData = V.singleton (V.singleton newCharacter)} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph + parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) + in -- trace ("PreIANet: " <> (show nodeIndex) <> " IAFinal: " <> (show $ slimIAFinal newCharacter) <> " Final: " <> (show $ slimFinal newCharacter)) + preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList <> zip childNodes parentNodeList) + else -- 2 children, make 3-way + + let finalCharacter = M.makeIAFinalCharacter finalMethod charInfo inCharacter parentCharacter -- leftChar rightChar + newLabel = nodeLabel{vertData = V.singleton (V.singleton finalCharacter)} + newGraph = LG.insEdges (inNodeEdges <> outNodeEdges) $ LG.insNode (nodeIndex, newLabel) $ LG.delNode nodeIndex inGraph + parentNodeList = replicate (length childNodes) (nodeIndex, newLabel) + in -- trace ("PreIATree: " <> (show nodeIndex) <> " IAFinal: " <> (show $ slimIAFinal finalCharacter) <> " Final: " <> (show $ slimFinal finalCharacter)) + preOrderIA newGraph rootIndex finalMethod charInfo (tail inNodePairList <> zip childNodes parentNodeList) + + +-- ) + +-- | doBlockTraversal' is a wrapper around doBlockTraversal fo seqParMap +doBlockTraversal' + ∷ GlobalSettings → AssignmentMethod → Bool → Int → (V.Vector CharInfo, V.Vector DecoratedGraph) → V.Vector DecoratedGraph +doBlockTraversal' inGS finalMethod staticIA rootIndex (inCharInfoV, traversalDecoratedVect) = + doBlockTraversal inGS finalMethod staticIA rootIndex inCharInfoV traversalDecoratedVect + + +{- | doBlockTraversal takes a block of postorder decorated character trees character info +could be moved up preOrderTreeTraversal, but like this for legibility +-} +doBlockTraversal + ∷ GlobalSettings → AssignmentMethod → Bool → Int → V.Vector CharInfo → V.Vector DecoratedGraph → V.Vector DecoratedGraph +doBlockTraversal inGS finalMethod staticIA rootIndex inCharInfoV traversalDecoratedVect = + -- trace ("BlockT:" <> (show $ fmap charType inCharInfoV)) + V.zipWith (doCharacterTraversal inGS finalMethod staticIA rootIndex) inCharInfoV traversalDecoratedVect + + +{- | doCharacterTraversal performs preorder traversal on single character tree +with single charInfo +this so each character can be independently "rooted" for optimal traversals. +-} +doCharacterTraversal ∷ GlobalSettings → AssignmentMethod → Bool → Int → CharInfo → DecoratedGraph → DecoratedGraph +doCharacterTraversal inGS finalMethod staticIA rootIndex inCharInfo inGraph = + -- find root--index should = number of leaves + -- trace ("charT:" <> (show $ charType inCharInfo)) ( + let -- this is a hack--remve after fixed + -- inGraph = LG.removeDuplicateEdges inGraph' + + isolateNodeList = LG.getIsolatedNodes inGraph + -- (_, leafVertexList, _, _) = LG.splitVertexList inGraph + inEdgeList = LG.labEdges inGraph + in -- remove these two lines if working + -- if rootIndex /= length leafVertexList then error ("Root index not = number leaves in doCharacterTraversal" <> show (rootIndex, length leafVertexList)) + -- else + -- root vertex, repeat of label info to avoid problem with zero length zip later, second info ignored for root + -- since root cannot have 2nd parent + let rootLabel = fromJust $ LG.lab inGraph rootIndex + nothingVertData = U.copyToNothing (vertData rootLabel) + rootFinalVertData = + createFinalAssignmentOverBlocks + inGS + finalMethod + staticIA + RootNode + (vertData rootLabel) + (vertData rootLabel) + nothingVertData + inCharInfo + True + False + False + rootChildren = LG.labDescendants inGraph (rootIndex, rootLabel) + + -- left / right to match post-order + rootChildrenBV = fmap (bvLabel . snd) rootChildren + rootChildrenIsLeft + | length rootChildrenBV == 1 = [True] + | head rootChildrenBV > (rootChildrenBV !! 1) = [False, True] + | otherwise = [True, False] + newRootNode = (rootIndex, rootLabel{vertData = rootFinalVertData}) + rootChildrenPairs = zip3 rootChildren (replicate (length rootChildren) newRootNode) rootChildrenIsLeft + upDatedNodes = makeFinalAndChildren inGS finalMethod staticIA inGraph rootChildrenPairs [newRootNode] inCharInfo + + -- update isolated nodes with final == preliminary as with root nodes (and leaves, but without postorder logic) + updatedIsolateNodes = fmap (updateIsolatedNode inGS finalMethod staticIA inCharInfo) isolateNodeList + in -- hope this is the most efficient way since all nodes have been remade + -- trace (U.prettyPrintVertexInfo $ snd newRootNode) + LG.mkGraph (upDatedNodes <> updatedIsolateNodes) inEdgeList + + +-- ) + +{- | updateIsolatedNode updates the final states of an isolated node as if it were a root with final=preliminary +states without preorder logic as in regular leaves +NB IA length won't match if compared since not in graph +-} +updateIsolatedNode ∷ GlobalSettings → AssignmentMethod → Bool → CharInfo → LG.LNode VertexInfo → LG.LNode VertexInfo +updateIsolatedNode inGS finalMethod staticIA inCharInfo (inNodeIndex, inNodeLabel) = + -- root so final = preliminary + let nothingVertData = U.copyToNothing (vertData inNodeLabel) + newVertData = + createFinalAssignmentOverBlocks + inGS + finalMethod + staticIA + RootNode + (vertData inNodeLabel) + (vertData inNodeLabel) + nothingVertData + inCharInfo + True + False + False + in (inNodeIndex, inNodeLabel{vertData = newVertData}) + + +{- | makeFinalAndChildren takes a graph, list of pairs of (labelled nodes,parent node) to make final assignment and a list of updated nodes +the input nodes are relabelled by preorder functions and added to the list of processed nodes and recursed to other nodes first, then +their children -- thi sis important for preorder of hardwired graphs since can have 2 parents a single child. +nodes are retuned in reverse order at they are made--need to check if this will affect graph identity or indexing in fgl +-} +makeFinalAndChildren + ∷ GlobalSettings + → AssignmentMethod + → Bool + → DecoratedGraph + → [(LG.LNode VertexInfo, LG.LNode VertexInfo, Bool)] + → [LG.LNode VertexInfo] + → CharInfo + → [LG.LNode VertexInfo] +makeFinalAndChildren inGS finalMethod staticIA inGraph nodesToUpdate updatedNodes inCharInfo = + -- trace ("mFAC:" <> (show $ charType inCharInfo)) ( + if null nodesToUpdate + then updatedNodes + else + let (firstNode, firstParent, isLeft) = head nodesToUpdate + + -- get current node data + firstLabel = snd firstNode + firstNodeType' = GO.getNodeType inGraph $ fst firstNode -- nodeType firstLabel + firstNodeType = + if firstNodeType' /= NetworkNode + then firstNodeType' + else NetworkNode + -- not issue if hardwired I don't think + -- this really a debug sort of issue + -- if graphType inGS /= HardWired then trace ("NetNode:" <> show (LG.getInOutDeg inGraph firstNode) <> " DuplicateEdges (?): " <> show (LG.getDuplicateEdges inGraph)) NetworkNode + -- else NetworkNode + firstVertData = vertData firstLabel + + -- get node parent data--check if more than one + firstParents = LG.labParents inGraph $ fst firstNode + + -- if single parent then as usual--else take head of two so no confusion as to whichn is which + -- this is holdover from no indegree 2 nodes--could be simplified and return structures changed + firstParentVertData = + if length firstParents == 1 + then vertData $ snd firstParent + else vertData $ snd $ head firstParents + + secondParentData = + if length firstParents == 1 + then U.copyToNothing firstParentVertData + else U.copyToJust $ vertData $ snd $ last firstParents + + -- child data + firstChildren = LG.labDescendants inGraph firstNode + + -- booleans for further pass + isIn1Out1 = (length firstChildren == 1) && (length firstParents == 1) -- softwired can happen, need to pass "grandparent" node to skip in 1 out 1 + isIn2Out1 = (length firstChildren == 1) && (length firstParents == 2) -- hardwired can happen, need to pass both parents + + -- this OK with one or two children + firstChildrenBV = fmap (bvLabel . snd) firstChildren + firstChildrenIsLeft + | length firstChildrenBV == 1 = [True] + | head firstChildrenBV > (firstChildrenBV !! 1) = [False, True] + | otherwise = [True, False] + firstFinalVertData = + createFinalAssignmentOverBlocks + inGS + finalMethod + staticIA + firstNodeType + firstVertData + firstParentVertData + secondParentData + inCharInfo + isLeft + isIn1Out1 + isIn2Out1 + newFirstNode = (fst firstNode, firstLabel{vertData = firstFinalVertData}) + + -- check children if indegree == 2 then don't add to nodes to do if in there already + + childrenTriple = zip3 firstChildren (replicate (length firstChildren) newFirstNode) firstChildrenIsLeft + childrenTriple' = + if graphType inGS == HardWired + then filter (indeg2NotInNodeList inGraph (tail nodesToUpdate)) childrenTriple + else childrenTriple + in -- trace (U.prettyPrintVertexInfo $ snd newFirstNode) + -- makeFinalAndChildren inGS finalMethod staticIA inGraph (childrenPairs <> tail nodesToUpdate) (newFirstNode : updatedNodes) inCharInfo + -- childrenPair after nodess to do for hardWired to ensure both parent done before child + makeFinalAndChildren + inGS + finalMethod + staticIA + inGraph + (tail nodesToUpdate <> childrenTriple') + (newFirstNode : updatedNodes) + inCharInfo + + +-- ) + +{- | indeg2NotInNodeList checcks a node agains a list by index (fst) if node is indegree 2 and +already in the list of n odes "todo" filter out as will already be optimized in appropriate pre-order +-} +indeg2NotInNodeList ∷ LG.Gr a b → [(LG.LNode a, LG.LNode a, Bool)] → (LG.LNode a, LG.LNode a, Bool) → Bool +indeg2NotInNodeList inGraph checkNodeList (childNode@(childIndex, _), _, _) + | LG.isEmpty inGraph = error "Empty graph in indeg2NotInNodeList" + | LG.indeg inGraph childNode < 2 = True + | childIndex `elem` fmap (fst . fst3) checkNodeList = False + | otherwise = True + + +{- | assignPreorderStatesAndEdges takes a postorder decorated graph (should be but not required) and propagates +preorder character states from individual character trees. Exact characters (Add, nonAdd, matrix) postorder +states should be based on the outgroup rooted tree. +root should be median of finals of two descendets--for non-exact based on final 'alignments' field with gaps filtered +postorder assignment and preorder will be out of whack--could change to update with correponding postorder +but that would not allow use of base decorated graph for incremental optimization (which relies on postorder assignments) in other areas +optyion code ikn there to set root final to outgropu final--but makes thigs scewey in matrix character and some pre-order assumptions +-} +assignPreorderStatesAndEdges + ∷ GlobalSettings + → AssignmentMethod + → Bool + → Int + → V.Vector (V.Vector DecoratedGraph) + → Bool + → V.Vector (V.Vector CharInfo) + → DecoratedGraph + → DecoratedGraph +assignPreorderStatesAndEdges inGS finalMethd calculateBranchEdges rootIndex preOrderBlockTreeVV useMap inCharInfoVV inGraph = + -- trace ("aPSAE:" <> (show $ fmap (fmap charType) inCharInfoVV)) ( + if LG.isEmpty inGraph + then error "Empty graph in assignPreorderStatesAndEdges" + else -- trace ("In assign") ( + + let postOrderNodes = LG.labNodes inGraph + postOrderEdgeList = LG.labEdges inGraph + + -- update node labels + newNodeList = fmap (updateNodeWithPreorder preOrderBlockTreeVV inCharInfoVV) postOrderNodes + + -- create a vector of vector of pair of nodes and edges for display x charcater trees + blockTreePairVV = fmap (fmap LG.makeNodeEdgePairVect) preOrderBlockTreeVV + + -- update edge labels--for softwired need to account for not all edges in all block/display trees + -- map for case where tree does not contain all leaves as in swap procedures + -- needs to be updated for softwired as well + nodeMap = MAP.fromList $ zip (fmap fst newNodeList) newNodeList + newEdgeList = + if graphType inGS == Tree || graphType inGS == HardWired + then + if useMap + then fmap (updateEdgeInfoTreeMap finalMethd inCharInfoVV nodeMap) postOrderEdgeList + else fmap (updateEdgeInfoTree finalMethd inCharInfoVV (V.fromList newNodeList)) postOrderEdgeList + else fmap (updateEdgeInfoSoftWired finalMethd inCharInfoVV blockTreePairVV rootIndex) postOrderEdgeList + in -- make new graph + -- LG.mkGraph newNodeList' newEdgeList + if calculateBranchEdges + then LG.mkGraph newNodeList newEdgeList + else LG.mkGraph newNodeList postOrderEdgeList + + +-- ) + +{- | updateNodeWithPreorder takes the preorder decorated graphs (by block and character) and updates the +the preorder fields only using character info. This leaves post and preorder assignment out of sync. +but that so can use incremental optimization on base decorated graph in other areas. +-} +updateNodeWithPreorder + ∷ V.Vector (V.Vector DecoratedGraph) → V.Vector (V.Vector CharInfo) → LG.LNode VertexInfo → LG.LNode VertexInfo +updateNodeWithPreorder preOrderBlockTreeVV inCharInfoVV postOrderNode = + let nodeLabel = snd postOrderNode + nodeVertData = vertData nodeLabel + newNodeVertData = V.zipWith3 (updateVertexBlock (fst postOrderNode)) preOrderBlockTreeVV nodeVertData inCharInfoVV + in (fst postOrderNode, nodeLabel{vertData = newNodeVertData}) + + +-- | updateVertexBlock takes a block of vertex data and updates preorder states of charactes via fmap +updateVertexBlock ∷ Int → V.Vector DecoratedGraph → V.Vector CharacterData → V.Vector CharInfo → V.Vector CharacterData +updateVertexBlock nodeIndex = V.zipWith3 (updatePreorderCharacter nodeIndex) + + +{- | updatePreorderCharacter updates the pre-order fields of character data for a vertex from a traversal +since there is single character optimized for each character decorated graph-- it is always teh 0th 0th character +exact are vectors so take care of multiple there. +need to care for issues of missing data +-} +updatePreorderCharacter ∷ Int → DecoratedGraph → CharacterData → CharInfo → CharacterData +updatePreorderCharacter nodeIndex preOrderTree postOrderCharacter charInfo = + -- trace ("N:" <> (show nodeIndex) <> " B:" <> (show blockIndex) <> " C:" <> (show characterIndex) <> "\n" <> (show $ vertData $ fromJust $ LG.lab preOrderTree nodeIndex)) ( + let maybePreOrderNodeLabel = LG.lab preOrderTree nodeIndex + preOrderVertData = vertData $ fromJust maybePreOrderNodeLabel + preOrderCharacterData + | V.null preOrderVertData = emptyCharacter + | V.null $ V.head preOrderVertData = emptyCharacter + | otherwise = V.head $ V.head preOrderVertData -- (preOrderVertData V.! 0) V.! 0 + in -- this can heppen in naked parent node of prunned subGraph in branch swapping + if isNothing maybePreOrderNodeLabel + then emptyCharacter + else -- error ("Nothing node label in updatePreorderCharacter node: " <> show nodeIndex) + + updateCharacter postOrderCharacter preOrderCharacterData (charType charInfo) + + +-- ) + +{- | updateCharacter takes a postorder character and updates the preorder (final) fields with preorder data and character type +only updating preorder assignment--except for root, that is needed to draw final state for branch lengths +-} +updateCharacter ∷ CharacterData → CharacterData → CharType → CharacterData +updateCharacter postOrderCharacter preOrderCharacter localCharType + | localCharType == Add = + postOrderCharacter + { rangeFinal = rangeFinal preOrderCharacter + , rangeUnion = rangeUnion preOrderCharacter + } + | localCharType == NonAdd = + postOrderCharacter + { stateBVFinal = stateBVFinal preOrderCharacter + , stateBVUnion = stateBVUnion preOrderCharacter + } + | localCharType `elem` packedNonAddTypes = + postOrderCharacter + { packedNonAddFinal = packedNonAddFinal preOrderCharacter + , packedNonAddUnion = packedNonAddUnion preOrderCharacter + } + | localCharType == Matrix = + postOrderCharacter + { matrixStatesFinal = matrixStatesFinal preOrderCharacter + , matrixStatesUnion = matrixStatesUnion preOrderCharacter + } + | localCharType == AlignedSlim = + postOrderCharacter + { alignedSlimPrelim = alignedSlimPrelim preOrderCharacter + , alignedSlimFinal = alignedSlimFinal preOrderCharacter + , alignedSlimUnion = alignedSlimUnion preOrderCharacter + } + | localCharType == AlignedWide = + postOrderCharacter + { alignedWidePrelim = alignedWidePrelim preOrderCharacter + , alignedWideFinal = alignedWideFinal preOrderCharacter + , alignedWideUnion = alignedWideUnion preOrderCharacter + } + | localCharType == AlignedHuge = + postOrderCharacter + { alignedHugePrelim = alignedHugePrelim preOrderCharacter + , alignedHugeFinal = alignedHugeFinal preOrderCharacter + , alignedHugeUnion = alignedHugeUnion preOrderCharacter + } + | localCharType == SlimSeq || localCharType == NucSeq = + postOrderCharacter + { slimAlignment = slimAlignment preOrderCharacter + , slimFinal = slimFinal preOrderCharacter + , slimIAFinal = slimIAFinal preOrderCharacter + , slimIAUnion = slimIAUnion preOrderCharacter + } + | localCharType == WideSeq || localCharType == AminoSeq = + postOrderCharacter + { wideAlignment = wideAlignment preOrderCharacter + , wideFinal = wideFinal preOrderCharacter + , wideIAFinal = wideIAFinal preOrderCharacter + , wideIAUnion = wideIAUnion preOrderCharacter + } + | localCharType == HugeSeq = + postOrderCharacter + { hugeAlignment = hugeAlignment preOrderCharacter + , hugeFinal = hugeFinal preOrderCharacter + , hugeIAFinal = hugeIAFinal preOrderCharacter + , hugeIAUnion = hugeIAUnion preOrderCharacter + } + | otherwise = error ("Character type unimplemented : " <> show localCharType) + + +{- | updateEdgeInfoSoftWired gets edge weights via block trees as opposed to canonical graph +this because not all edges present in all block/display trees +-} +updateEdgeInfoSoftWired + ∷ AssignmentMethod + → V.Vector (V.Vector CharInfo) + → V.Vector (V.Vector (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo))) + → Int + → LG.LEdge EdgeInfo + → LG.LEdge EdgeInfo +updateEdgeInfoSoftWired finalMethod inCharInfoVV blockTreePairVV rootIndex (uNode, vNode, edgeLabel) = + if V.null blockTreePairVV + then error "Empty node-edge pair vector in updateEdgeInfoSoftWired" + else + let (minWList, maxWList) = V.unzip $ V.zipWith (getEdgeBlockWeightSoftWired finalMethod uNode vNode rootIndex) inCharInfoVV blockTreePairVV + localEdgeType = edgeType edgeLabel + newEdgeLabel = + EdgeInfo + { minLength = V.sum minWList + , maxLength = V.sum maxWList + , midRangeLength = (V.sum minWList + V.sum maxWList) / 2.0 + , edgeType = localEdgeType + } + in (uNode, vNode, newEdgeLabel) + + +-- | getEdgeBlockWeightSoftWired takes a block of character trees and maps character distances if edge exists in block tree +getEdgeBlockWeightSoftWired + ∷ AssignmentMethod + → Int + → Int + → Int + → V.Vector CharInfo + → V.Vector (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo)) + → (VertexCost, VertexCost) +getEdgeBlockWeightSoftWired finalMethod uNode vNode rootIndex inCharInfoV blockTreePairV = + let (minWList, maxWList) = V.unzip $ V.zipWith (getEdgeCharacterWeightSoftWired finalMethod uNode vNode rootIndex) inCharInfoV blockTreePairV + in (V.sum minWList, V.sum maxWList) + + +{- | getEdgeCharacterWeightSoftWired gets the edge weight for an individual character +matches edge in either direction +need examine edge root as two edges from rootIndex +-} +getEdgeCharacterWeightSoftWired + ∷ AssignmentMethod + → Int + → Int + → Int + → CharInfo + → (V.Vector (LG.LNode VertexInfo), V.Vector (LG.LEdge EdgeInfo)) + → (VertexCost, VertexCost) +getEdgeCharacterWeightSoftWired finalMethod uNode vNode rootIndex inCharInfo (nodeVect, edgeVect) = + let foundVertexPair = getEdgeVerts uNode vNode rootIndex nodeVect edgeVect + (uLabel, vLabel) = fromJust foundVertexPair + uCharacter = V.head $ V.head $ vertData uLabel + vCharacter = V.head $ V.head $ vertData vLabel + in -- if edge not present and not around root then return no costs + if isNothing foundVertexPair + then (0, 0) + else getCharacterDistFinal finalMethod uCharacter vCharacter inCharInfo + + +-- | getEdgeVerts returns vertex labels if edge in vect or if a virtual edge including root +getEdgeVerts + ∷ Int → Int → Int → V.Vector (LG.LNode VertexInfo) → V.Vector (LG.LEdge EdgeInfo) → Maybe (VertexInfo, VertexInfo) +getEdgeVerts uNode vNode rootIndex nodeVect edgeVect = + -- trace ("GEV:" <> (show (uNode, vNode, rootIndex) <> " nodes " <> (show $ fmap fst nodeVect) <> " edges " <> (show $ fmap LG.toEdge edgeVect))) ( + + -- hack or display edge check I'm not sure--not all edges are in all display trees + if (uNode >= V.length nodeVect) || (vNode >= V.length nodeVect) + then Nothing + else + ( if edgeInVect (uNode, vNode) edgeVect || (edgeInVect (rootIndex, uNode) edgeVect && edgeInVect (rootIndex, vNode) edgeVect) + then Just (snd $ nodeVect V.! uNode, snd $ nodeVect V.! vNode) + else Nothing + ) + + +-- ) + +-- | edgeInVect takes an edges and returns True if in Vector, False otherwise +edgeInVect ∷ (Int, Int) → V.Vector (LG.LEdge EdgeInfo) → Bool +edgeInVect (u, v) edgeVect = + not (V.null edgeVect) + && ( let (a, b, _) = V.head edgeVect + in ((((u, v) == (a, b)) || ((v, u) == (a, b))) || edgeInVect (u, v) (V.tail edgeVect)) + ) + + +{- | updateEdgeInfoTree takes a Decorated graph--fully labelled post and preorder and and edge and +gets edge info--basically lengths +this for a tree in that all edges are present in all character/block trees +-} +updateEdgeInfoTree + ∷ AssignmentMethod → V.Vector (V.Vector CharInfo) → V.Vector (LG.LNode VertexInfo) → LG.LEdge EdgeInfo → LG.LEdge EdgeInfo +updateEdgeInfoTree finalMethod inCharInfoVV nodeVector (uNode, vNode, edgeLabel) = + if V.null nodeVector + then error "Empty node list in updateEdgeInfo" + else + let (minW, maxW) = getEdgeWeight finalMethod inCharInfoVV nodeVector (uNode, vNode) + midW = (minW + maxW) / 2.0 + localEdgeType = edgeType edgeLabel + newEdgeLabel = + EdgeInfo + { minLength = minW + , maxLength = maxW + , midRangeLength = midW + , edgeType = localEdgeType + } + in (uNode, vNode, newEdgeLabel) + + +{- | updateEdgeInfoTreeMap takes a Decorated graph--fully labelled post and preorder and and edge and +gets edge info--basically lengths +this for a tree in that all edges are present in all character/block trees +uses MAP as opposed to index +-} +updateEdgeInfoTreeMap + ∷ AssignmentMethod → V.Vector (V.Vector CharInfo) → MAP.Map Int (LG.LNode VertexInfo) → LG.LEdge EdgeInfo → LG.LEdge EdgeInfo +updateEdgeInfoTreeMap finalMethod inCharInfoVV nodeMap (uNode, vNode, edgeLabel) = + if MAP.null nodeMap + then error "Empty node MAP in updateEdgeInfo" + else + let (minW, maxW) = getEdgeWeightMap finalMethod inCharInfoVV nodeMap (uNode, vNode) + midW = (minW + maxW) / 2.0 + localEdgeType = edgeType edgeLabel + newEdgeLabel = + EdgeInfo + { minLength = minW + , maxLength = maxW + , midRangeLength = midW + , edgeType = localEdgeType + } + in (uNode, vNode, newEdgeLabel) + + +{- | getEdgeWeight takes a preorder decorated decorated graph and an edge and gets the weight information for that edge +basically a min/max distance between the two +the indexing depends on the graph having all leaves in the graph which may not happen +during graph swapping +-} +getEdgeWeight + ∷ AssignmentMethod → V.Vector (V.Vector CharInfo) → V.Vector (LG.LNode VertexInfo) → (Int, Int) → (VertexCost, VertexCost) +getEdgeWeight finalMethod inCharInfoVV nodeVector (uNode, vNode) = + if V.null nodeVector + then error "Empty node list in getEdgeWeight" + else -- trace ("GEW: " <> (show $ fmap fst nodeVector) <> " " <> (show (uNode, vNode))) ( + + let uNodeInfo = vertData $ snd $ nodeVector V.! uNode + vNodeInfo = vertData $ snd $ nodeVector V.! vNode + blockCostPairs = V.zipWith3 (getBlockCostPairsFinal finalMethod) uNodeInfo vNodeInfo inCharInfoVV + minCost = sum $ fmap fst blockCostPairs + maxCost = sum $ fmap snd blockCostPairs + in (minCost, maxCost) + + +-- ) + +{- | getEdgeWeightMap takes a preorder decorated decorated graph and an edge and gets the weight information for that edge +basically a min/max distance between the two +in this case based on map or vertices rather than direct indexing. +the indexing depends on the graph having all leaves in the graph which may not happen +during graph swapping +-} +getEdgeWeightMap + ∷ AssignmentMethod → V.Vector (V.Vector CharInfo) → MAP.Map Int (LG.LNode VertexInfo) → (Int, Int) → (VertexCost, VertexCost) +getEdgeWeightMap finalMethod inCharInfoVV nodeMap (uNode, vNode) = + if MAP.null nodeMap + then error "Empty node map in getEdgeWeight" + else -- trace ("GEWM: " <> (show $ MAP.toList nodeMap) <> " " <> (show (uNode, vNode))) ( + + let uNodeInfo = vertData $ snd $ nodeMap MAP.! uNode + vNodeInfo = vertData $ snd $ nodeMap MAP.! vNode + blockCostPairs = V.zipWith3 (getBlockCostPairsFinal finalMethod) uNodeInfo vNodeInfo inCharInfoVV + minCost = sum $ fmap fst blockCostPairs + maxCost = sum $ fmap snd blockCostPairs + in (minCost, maxCost) + + +-- ) + +-- | getBlockCostPairsFinal takes a block of two nodes and character infomation and returns the min and max block branch costs +getBlockCostPairsFinal + ∷ AssignmentMethod → V.Vector CharacterData → V.Vector CharacterData → V.Vector CharInfo → (VertexCost, VertexCost) +getBlockCostPairsFinal finalMethod uNodeCharDataV vNodeCharDataV charInfoV = + let characterCostPairs = V.zipWith3 (getCharacterDistFinal finalMethod) uNodeCharDataV vNodeCharDataV charInfoV + minCost = sum $ fmap fst characterCostPairs + maxCost = sum $ fmap snd characterCostPairs + in (minCost, maxCost) + + +{- | getCharacterDistFinal takes a pair of characters and character type, returning the minimum and maximum character distances +for sequence characters this is based on slim/wide/hugeAlignment field, hence all should be O(n) in num characters/sequence length +Since these are charcater distances--no need for no change cost adjustment +-} +getCharacterDistFinal ∷ AssignmentMethod → CharacterData → CharacterData → CharInfo → (VertexCost, VertexCost) +getCharacterDistFinal finalMethod uCharacter vCharacter charInfo = + let thisWeight = weight charInfo + thisMatrix = costMatrix charInfo + thisCharType = charType charInfo + lChangeCost = changeCost charInfo + lNoChangeCost = noChangeCost charInfo + isMedian = False --since want distances so adjust for extra noChnage costs + + in -- no nded to do nochange/change--all recoded in that case + if thisCharType == Add + then + let -- minCost = localCost (M.intervalAdd thisWeight uCharacter vCharacter) + (minDiffV, maxDiffV) = V.unzip $ V.zipWith maxMinIntervalDiff (rangeFinal uCharacter) (rangeFinal vCharacter) + + minCost = thisWeight * fromIntegral (V.sum minDiffV) + maxCost = thisWeight * fromIntegral (V.sum maxDiffV) + in (minCost, maxCost) + else -- assumes noChangeCost < changeCost for PMDL/ML + + if thisCharType == NonAdd + then + let -- minCost = localCost (M.interUnion thisWeight uCharacter vCharacter) + minDiff = length $ V.filter not $ V.zipWith hasBVIntersection (stateBVFinal uCharacter) (stateBVFinal vCharacter) + maxDiff = length $ V.filter not $ V.zipWith equalAndSingleState (stateBVFinal uCharacter) (stateBVFinal vCharacter) + maxCost = thisWeight * fromIntegral maxDiff + minCost = thisWeight * fromIntegral minDiff + minNoChange = length (stateBVFinal uCharacter) - minDiff + maxNoChange = length (stateBVFinal uCharacter) - maxDiff + minCost' = thisWeight * ((lNoChangeCost * fromIntegral minNoChange) + (lChangeCost * fromIntegral minDiff)) + maxCost' = thisWeight * ((lNoChangeCost * fromIntegral maxNoChange) + (lChangeCost * fromIntegral maxDiff)) + in if lNoChangeCost == 0.0 + then (minCost, maxCost) + else (minCost', maxCost') + else + if thisCharType `elem` packedNonAddTypes + then + let -- minCost = localCost (BP.median2Packed thisCharType uCharacter vCharacter) + (minDiffV, maxDiffV) = + UV.unzip $ + UV.zipWith + (BP.minMaxCharDiff isMedian thisCharType (lNoChangeCost, lChangeCost)) + (packedNonAddFinal uCharacter) + (packedNonAddFinal vCharacter) + maxCost = thisWeight * UV.sum maxDiffV + minCost = thisWeight * UV.sum minDiffV + in (minCost, maxCost) + else + if thisCharType == Matrix + then + let minMaxListList = + V.zipWith + (minMaxMatrixDiff thisMatrix) + (fmap getLowestCostMatrixStates (matrixStatesFinal uCharacter)) + (fmap getLowestCostMatrixStates (matrixStatesFinal vCharacter)) + minDiff = V.sum $ fmap fst minMaxListList + maxDiff = V.sum $ fmap snd minMaxListList + minCost = thisWeight * fromIntegral minDiff + maxCost = thisWeight * fromIntegral maxDiff + in (minCost, maxCost) + else + if thisCharType `elem` prealignedCharacterTypes + then + let (minDiff, maxDiff) = + unzip $ + zipWith + (M.generalSequenceDiff thisMatrix (length thisMatrix)) + (GV.toList $ alignedSlimFinal uCharacter) + (GV.toList $ alignedSlimFinal vCharacter) + minCost = thisWeight * fromIntegral (sum minDiff) + maxCost = thisWeight * fromIntegral (sum maxDiff) + in (minCost, maxCost) + else + if thisCharType == SlimSeq || thisCharType == NucSeq + then + let minMaxDiffList = + if finalMethod == DirectOptimization + then + let uFinal = M.makeDynamicCharacterFromSingleVector (slimFinal uCharacter) + vFinal = M.makeDynamicCharacterFromSingleVector (slimFinal vCharacter) + + -- Since these are character distances--not a median + newEdgeCharacter = M.getDOMedianCharInfo isMedian charInfo (uCharacter{slimGapped = uFinal}) (vCharacter{slimGapped = vFinal}) + (newU, _, newV) = slimGapped newEdgeCharacter + in -- trace ("GCD:\n" <> (show (slimFinal uCharacter, newU)) <> "\n" <> (show (slimFinal vCharacter, newV)) <> "\nDO Cost:" <> (show doCOST)) + zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) + else + zipWith + (M.generalSequenceDiff thisMatrix (length thisMatrix)) + (GV.toList $ slimIAFinal uCharacter) + (GV.toList $ slimIAFinal vCharacter) + + (minDiff, maxDiff) = unzip minMaxDiffList + minCost = thisWeight * fromIntegral (sum minDiff) + maxCost = thisWeight * fromIntegral (sum maxDiff) + in -- trace ("MMDL: " <> (show minCost) <> " " <> (show maxCost)) + (minCost, maxCost) + else + if thisCharType == WideSeq || thisCharType == AminoSeq + then + let minMaxDiffList = + if finalMethod == DirectOptimization + then + let uFinal = M.makeDynamicCharacterFromSingleVector (wideFinal uCharacter) + vFinal = M.makeDynamicCharacterFromSingleVector (wideFinal vCharacter) + + -- Since these are character distances--no need for no change cost adjustment + newEdgeCharacter = M.getDOMedianCharInfo isMedian charInfo (uCharacter{wideGapped = uFinal}) (vCharacter{wideGapped = vFinal}) + (newU, _, newV) = wideGapped newEdgeCharacter + in -- trace ("GCD:\n" <> (show m) <> "\n" <> (show (uFinal, newU)) <> "\n" <> (show (vFinal, newV))) + zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) + else GV.toList $ GV.zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (wideIAFinal uCharacter) (wideIAFinal vCharacter) + (minDiff, maxDiff) = unzip minMaxDiffList + minCost = thisWeight * fromIntegral (sum minDiff) + maxCost = thisWeight * fromIntegral (sum maxDiff) + in (minCost, maxCost) + else + if thisCharType == HugeSeq + then + let minMaxDiffList = + if finalMethod == DirectOptimization + then + let uFinal = M.makeDynamicCharacterFromSingleVector (hugeFinal uCharacter) + vFinal = M.makeDynamicCharacterFromSingleVector (hugeFinal vCharacter) + + -- Since these are character distances--no need for no change cost adjustment + newEdgeCharacter = M.getDOMedianCharInfo isMedian charInfo (uCharacter{hugeGapped = uFinal}) (vCharacter{hugeGapped = vFinal}) + (newU, _, newV) = hugeGapped newEdgeCharacter + in -- trace ("GCD:\n" <> (show (uFinal, newU)) <> "\n" <> (show (vFinal, newV))) + zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (GV.toList newU) (GV.toList newV) + else GV.toList $ GV.zipWith (M.generalSequenceDiff thisMatrix (length thisMatrix)) (hugeIAFinal uCharacter) (hugeIAFinal vCharacter) + (minDiff, maxDiff) = unzip minMaxDiffList + minCost = thisWeight * fromIntegral (sum minDiff) + maxCost = thisWeight * fromIntegral (sum maxDiff) + in (minCost, maxCost) + else error ("Character type not recognized/unimplemented : " <> show thisCharType) + where + hasBVIntersection a b = (not . BV.isZeroVector) (a .&. b) + + equalAndSingleState ∷ (Bits a) ⇒ a → a → Bool + equalAndSingleState a b = (a == b) && (popCount a == 1) + + +-- | zero2Gap converts a '0' or no bits set to gap (indel) value +zero2Gap ∷ (FiniteBits a) ⇒ a → a +zero2Gap inVal + | popCount inVal == 0 = (inVal `xor` inVal) `setBit` fromEnum gapIndex + | otherwise = inVal + + +{- | maxIntervalDiff takes two ranges and gets the maximum difference between the two based on differences +in upp and lower ranges. +-} +maxMinIntervalDiff ∷ (Int, Int) → (Int, Int) → (Int, Int) +maxMinIntervalDiff (a, b) (x, y) = + let upper = max b y - min b y + lower = max a x - min a x + in (min upper lower, max upper lower) + + +{- | getLowestCostMatrixStates takes a Vector Triple for matrix charxcter and returns lowest cost states as vector +of Ints +-} +getLowestCostMatrixStates ∷ V.Vector MatrixTriple → V.Vector Int +getLowestCostMatrixStates tripleVect = + if V.null tripleVect + then V.empty + else + let minCost = minimum $ fmap fst3 tripleVect + stateCostPairList = V.zip (V.fromList [0 .. (V.length tripleVect - 1)]) (fmap fst3 tripleVect) + (minStateVect, _) = V.unzip $ V.filter ((== minCost) . snd) stateCostPairList + in minStateVect + + +{- | minMaxMatrixDiff takes twovetors of states and calculates the minimum and maximum state differnce cost +between the two +-} +minMaxMatrixDiff ∷ S.Matrix Int → V.Vector Int → V.Vector Int → (Int, Int) +minMaxMatrixDiff localCostMatrix uStatesV vStatesV = + let statePairs = (V.toList uStatesV, V.toList vStatesV) + cartesianPairs = cartProdPair statePairs + costList = fmap (localCostMatrix S.!) cartesianPairs + in {-THis ti check for errors + if (not . null) costList then (minimum costList, maximum costList) + else (-1, -1) + -} + -- trace ("MMD: " <> (show (statePairs,cartesianPairs))) + (minimum costList, maximum costList) + + +{- | createFinalAssignment takes vertex data (child or current vertex) and creates the final +assignment from parent (if not root or leaf) and 'child' ie current vertex +if root or leaf preliminary is assigned to final +need to watch zipping for missing sequence data +this creates the IA during preorder from which final assignments are contructed +via addition post and preorder passes on IA fields. +-} +createFinalAssignmentOverBlocks + ∷ GlobalSettings + → AssignmentMethod + → Bool + → NodeType + → VertexBlockData + → VertexBlockData + → VertexBlockDataMaybe -- second parent if indegree 2 node + → CharInfo + → Bool + → Bool + → Bool + → VertexBlockData +createFinalAssignmentOverBlocks inGS finalMethod staticIA childType childBlockData parentBlockData parent2BlockDataM charInfo isLeft isInOutDegree1 isIn2Out1 = + -- if root or leaf final assignment <- preliminary asssignment + V.zipWith3 + (assignFinal inGS finalMethod staticIA childType isLeft charInfo isInOutDegree1 isIn2Out1) + childBlockData + parentBlockData + parent2BlockDataM + + +{- | assignFinal takes a vertex type and single block of zip3 of child info, parent info, and character type +to create pre-order assignments +-} +assignFinal + ∷ GlobalSettings + → AssignmentMethod + → Bool + → NodeType + → Bool + → CharInfo + → Bool + → Bool + → V.Vector CharacterData + → V.Vector CharacterData + → V.Vector (Maybe CharacterData) + → V.Vector CharacterData +assignFinal inGS finalMethod staticIA childType isLeft charInfo isOutDegree1 isIn2Out1 = V.zipWith3 (setFinal inGS finalMethod staticIA childType isLeft charInfo isOutDegree1 isIn2Out1) + + +{- | setFinal takes a vertex type and single character of zip3 of child info, parent info, and character type +to create pre-order assignments +| setFinalHTU takes a single character and its parent and sets the final state to prelim based +on character info. +non exact characters are vectors of characters of same type +this does the same things for sequence types, but also +performs preorder logic for exact characters +staticIA flage is for IA and static only optimization used in IA heuriastics for DO +no IA for networks--at least for now.Bool -> +-} +setFinal + ∷ GlobalSettings + → AssignmentMethod + → Bool + → NodeType + → Bool + → CharInfo + → Bool + → Bool + → CharacterData + → CharacterData + → Maybe CharacterData + → CharacterData +setFinal inGS finalMethod staticIA childType isLeft charInfo isIn1Out1 isIn2Out1 childChar parentChar parent2CharM = + let localCharType = charType charInfo + symbolCount = toEnum $ length $ costMatrix charInfo ∷ Int + isTree = graphType inGS == Tree + in -- Three cases, Root, leaf, HTU + -- trace ("set final:" <> (show (finalMethod, staticIA)) <> " " <> (show childType) <> " " <> (show isLeft) <> " " <> (show isIn1Out1) <> " " <> (show isIn2Out1)) ( + if childType == RootNode + then + if localCharType == Add + then childChar{rangeFinal = snd3 $ rangePrelim childChar} + else + if localCharType == NonAdd + then childChar{stateBVFinal = snd3 $ stateBVPrelim childChar} + else + if localCharType `elem` packedNonAddTypes + then childChar{packedNonAddFinal = snd3 $ packedNonAddPrelim childChar} + else + if localCharType == Matrix + then childChar{matrixStatesFinal = setMinCostStatesMatrix (fromEnum symbolCount) (matrixStatesPrelim childChar)} + else + if localCharType == AlignedSlim + then childChar{alignedSlimFinal = extractMedians $ alignedSlimPrelim childChar} + else + if localCharType == AlignedWide + then childChar{alignedWideFinal = extractMedians $ alignedWidePrelim childChar} + else + if localCharType == AlignedHuge + then childChar{alignedHugeFinal = extractMedians $ alignedHugePrelim childChar} + else -- need to set both final and alignment for sequence characters + + if (localCharType == SlimSeq) || (localCharType == NucSeq) + then + let finalAssignment' = extractMedians $ slimGapped childChar + in -- trace ("TNFinal-Root: " <> (show finalAssignment') <> " " <> (show (GV.length finalAssignment', slimGapped childChar))) $ + -- traceNoLF ("TNFinal-Root") $ + if staticIA + then childChar{slimIAFinal = extractMediansGapped $ slimIAPrelim childChar} + else + childChar + { slimFinal = finalAssignment' + , slimAlignment -- slimGapped childChar + = + if isTree + then slimGapped childChar + else mempty + } + else + if (localCharType == WideSeq) || (localCharType == AminoSeq) + then + let finalAssignment' = extractMedians $ wideGapped childChar + in if staticIA + then childChar{wideIAFinal = extractMediansGapped $ wideIAPrelim childChar} + else + childChar + { wideFinal = finalAssignment' + , wideAlignment -- wideGapped childChar + = + if isTree + then wideGapped childChar + else mempty + } + else + if localCharType == HugeSeq + then + let finalAssignment' = extractMedians $ hugeGapped childChar + in if staticIA + then childChar{hugeIAFinal = extractMediansGapped $ hugeIAPrelim childChar} + else + childChar + { hugeFinal = finalAssignment' + , hugeAlignment -- hugeGapped childChar + = + if isTree + then hugeGapped childChar + else mempty + } + else error ("Unrecognized/implemented character type: " <> show localCharType) + else + if childType == LeafNode + then -- since leaf no neeed to precess final alignment fields for sequence characters + + if localCharType == Add + then childChar{rangeFinal = snd3 $ rangePrelim childChar} + else + if localCharType == NonAdd + then childChar{stateBVFinal = snd3 $ stateBVPrelim childChar} + else + if localCharType `elem` packedNonAddTypes + then childChar{packedNonAddFinal = snd3 $ packedNonAddPrelim childChar} + else + if localCharType == Matrix + then childChar{matrixStatesFinal = setMinCostStatesMatrix (fromEnum symbolCount) (matrixStatesPrelim childChar)} + else + if localCharType == AlignedSlim + then childChar{alignedSlimFinal = extractMediansGapped $ alignedSlimPrelim childChar} + else + if localCharType == AlignedWide + then childChar{alignedWideFinal = extractMediansGapped $ alignedWidePrelim childChar} + else + if localCharType == AlignedHuge + then childChar{alignedHugeFinal = extractMediansGapped $ alignedHugePrelim childChar} + else -- need to set both final and alignment for sequence characters + + if (localCharType == SlimSeq) || (localCharType == NucSeq) + then + let finalAlignment = doPreOrderWithParentCheck isLeft (slimAlignment parentChar) (slimGapped parentChar) (slimGapped childChar) + in -- traceNoLF ("TNFinal-Leaf " <> (show isSingleParentIn1Out1) <> " ") $ + if staticIA + then childChar{slimIAFinal = extractMediansGapped $ slimIAPrelim childChar} + else + childChar + { slimFinal = extractMedians $ slimGapped childChar -- finalAssignment' + , slimAlignment -- slimAlignment parentChar + = + if isTree + then finalAlignment + else mempty + , slimIAPrelim -- finalAlignment + = + if isTree + then finalAlignment + else mempty + , slimIAFinal -- extractMediansGapped finalAlignment + = + if isTree + then extractMediansGapped $ finalAlignment + else mempty + } + else -- ) + + if (localCharType == WideSeq) || (localCharType == AminoSeq) + then + let finalAlignment = doPreOrderWithParentCheck isLeft (wideAlignment parentChar) (wideGapped parentChar) (wideGapped childChar) + in if staticIA + then childChar{wideIAFinal = extractMediansGapped $ wideIAPrelim childChar} + else + childChar + { wideFinal = extractMedians $ wideGapped childChar -- finalAssignment' + , wideAlignment -- finalAlignment + = + if isTree + then finalAlignment + else mempty + , wideIAPrelim -- finalAlignment + = + if isTree + then finalAlignment + else mempty + , wideIAFinal -- extractMediansGapped finalAlignment + = + if isTree + then extractMediansGapped $ finalAlignment + else mempty + } + else + if localCharType == HugeSeq + then + let finalAlignment = doPreOrderWithParentCheck isLeft (hugeAlignment parentChar) (hugeGapped parentChar) (hugeGapped childChar) + in -- finalAssignment' = extractMedians finalAlignment + + if staticIA + then childChar{hugeIAFinal = extractMediansGapped $ hugeIAPrelim childChar} + else + childChar + { hugeFinal = extractMedians $ hugeGapped childChar -- finalAssignment' + , hugeAlignment -- finalAlignment + = + if isTree + then finalAlignment + else mempty + , hugeIAPrelim -- finalAlignment + = + if isTree + then finalAlignment + else mempty + , hugeIAFinal -- extractMediansGapped finalAlignment + = + if isTree + then extractMediansGapped $ finalAlignment + else mempty + } + else error ("Unrecognized/implemented character type: " <> show localCharType) + else + if childType == TreeNode && not isIn1Out1 + then + if localCharType == Add + then -- add logic for pre-order + + let finalAssignment' = additivePreorder (rangePrelim childChar) (rangeFinal parentChar) + in childChar{rangeFinal = finalAssignment'} + else + if localCharType == NonAdd + then -- add logic for pre-order + + let finalAssignment' = nonAdditivePreorder (stateBVPrelim childChar) (stateBVFinal parentChar) + in childChar{stateBVFinal = finalAssignment'} + else + if localCharType `elem` packedNonAddTypes + then + let finalAssignment' = BP.packedPreorder localCharType (packedNonAddPrelim childChar) (packedNonAddFinal parentChar) + in childChar{packedNonAddFinal = finalAssignment'} + else + if localCharType == Matrix + then -- add logic for pre-order + + let finalAssignment' = matrixPreorder isLeft (matrixStatesPrelim childChar) (matrixStatesFinal parentChar) + in childChar{matrixStatesFinal = finalAssignment'} + else + if localCharType == AlignedSlim + then + let alignedFinal = + M.getFinal3WaySlim + (slimTCM charInfo) + (alignedSlimFinal parentChar) + (fst3 $ alignedSlimPrelim childChar) + (thd3 $ alignedSlimPrelim childChar) + in childChar{alignedSlimFinal = alignedFinal} + else + if localCharType == AlignedWide + then + let alignedFinal = + M.getFinal3WayWideHuge + (wideTCM charInfo) + (alignedWideFinal parentChar) + (fst3 $ alignedWidePrelim childChar) + (thd3 $ alignedWidePrelim childChar) + in childChar{alignedWideFinal = alignedFinal} + else + if localCharType == AlignedHuge + then + let alignedFinal = + M.getFinal3WayWideHuge + (hugeTCM charInfo) + (alignedHugeFinal parentChar) + (fst3 $ alignedHugePrelim childChar) + (thd3 $ alignedHugePrelim childChar) + in childChar{alignedHugeFinal = alignedFinal} + else -- need to set both final and alignment for sequence characters + + if (localCharType == SlimSeq) || (localCharType == NucSeq) + then -- traceNoLF ("TNFinal-Tree " <> (show isSingleParentIn1Out1) <> " ") $ + + let finalGapped = doPreOrderWithParentCheck isLeft (slimAlignment parentChar) (slimGapped parentChar) (slimGapped childChar) + + finalAssignmentDO = + if finalMethod == DirectOptimization + then + let parentFinalDC = M.makeDynamicCharacterFromSingleVector (slimFinal parentChar) + parentFinal = (parentFinalDC, mempty, mempty) + -- parentGapped = (slimGapped parentChar, mempty, mempty) + childGapped = (slimGapped childChar, mempty, mempty) + finalAssignmentDONonGapped = fst3 $ getDOFinal charInfo parentFinal childGapped + in extractMedians finalAssignmentDONonGapped + else -- really could/should be mempty since overwritten by IA later + removeGapAndNil $ extractMedians finalGapped + in -- trace ("TNFinal-Tree:" <> (show (SV.length $ fst3 (slimAlignment parentChar), SV.length $ fst3 finalGapped,isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar))) <> "->" <> (show finalGapped)) ( + if staticIA + then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar + else + childChar + { slimFinal = finalAssignmentDO + , slimAlignment -- finalGapped + = + if isTree + then finalGapped + else mempty + } + else + if (localCharType == WideSeq) || (localCharType == AminoSeq) + then + let finalGapped = doPreOrderWithParentCheck isLeft (wideAlignment parentChar) (wideGapped parentChar) (wideGapped childChar) + + finalAssignmentDO = + if finalMethod == DirectOptimization + then + let parentFinalDC = M.makeDynamicCharacterFromSingleVector (wideFinal parentChar) + parentFinal = (mempty, parentFinalDC, mempty) + -- parentGapped = (mempty, wideGapped parentChar, mempty) + childGapped = (mempty, wideGapped childChar, mempty) + finalAssignmentDONonGapped = snd3 $ getDOFinal charInfo parentFinal childGapped + in extractMedians finalAssignmentDONonGapped + else removeGapAndNil $ extractMedians finalGapped + in if staticIA + then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar + else + childChar + { wideFinal = finalAssignmentDO + , wideAlignment -- finalGapped + = + if isTree + then finalGapped + else mempty + } + else + if localCharType == HugeSeq + then + let finalGapped = doPreOrderWithParentCheck isLeft (hugeAlignment parentChar) (hugeGapped parentChar) (hugeGapped childChar) + + finalAssignmentDO = + if finalMethod == DirectOptimization + then + let parentFinalDC = M.makeDynamicCharacterFromSingleVector (hugeFinal parentChar) + parentFinal = (mempty, mempty, parentFinalDC) + -- parentGapped = (mempty, mempty, hugeGapped parentChar) + childGapped = (mempty, mempty, hugeGapped childChar) + finalAssignmentDONonGapped = thd3 $ getDOFinal charInfo parentFinal childGapped + in snd3 finalAssignmentDONonGapped + else removeGapAndNil $ extractMedians finalGapped + in if staticIA + then M.makeIAFinalCharacter finalMethod charInfo childChar parentChar + else + childChar + { hugeFinal = finalAssignmentDO + , hugeAlignment -- finalGapped + = + if isTree + then finalGapped + else mempty + } + else error ("Unrecognized/implemented character type: " <> show localCharType) + else -- display tree indegree=outdegree=1 + -- since display trees here--indegree should be one as well + -- this doens't work--need to redo pass logic--perhaps by doing "grandchild" + -- churrently using childChild + + if isIn1Out1 + then -- trace ("InOut1 preorder") ( + + if localCharType == Add + then childChar{rangeFinal = rangeFinal parentChar} + else + if localCharType == NonAdd + then childChar{stateBVFinal = stateBVFinal parentChar} + else + if localCharType `elem` packedNonAddTypes + then childChar{packedNonAddFinal = packedNonAddFinal parentChar} + else + if localCharType == Matrix + then childChar{matrixStatesFinal = matrixStatesFinal parentChar} + else + if localCharType == AlignedSlim + then childChar{alignedSlimFinal = alignedSlimFinal parentChar} + else + if localCharType == AlignedWide + then childChar{alignedWideFinal = alignedWideFinal parentChar} + else + if localCharType == AlignedHuge + then childChar{alignedHugeFinal = alignedHugeFinal parentChar} + else -- need to set both final and alignment for sequence characters + + if (localCharType == SlimSeq) || (localCharType == NucSeq) -- parentChar + then -- traceNoLF ("TNFinal-1/1") $ + -- trace ("TNFinal-1/1:" <> (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) $ + + if staticIA + then childChar{slimIAFinal = slimIAFinal parentChar} + else + childChar + { slimFinal = slimFinal parentChar + , slimAlignment -- slimAlignment parentChar + = + if isTree + then slimAlignment parentChar -- finalGappedO -- slimAlignment parentChar -- finalGappedO-- slimAlignment parentChar + else mempty + , slimGapped = slimGapped parentChar -- slimGapped' -- slimGapped parentChar -- finalGappedO --slimGapped parentChar + -- , slimIAPrelim = slimIAPrelim parentChar + , slimIAFinal -- slimFinal parentChar + = + if isTree + then slimFinal parentChar + else mempty + } + else + if (localCharType == WideSeq) || (localCharType == AminoSeq) -- parentChar + -- trace ("TNFinal-1/1:" <> (show (isLeft, (slimAlignment parentChar), (slimGapped parentChar) ,(slimGapped childChar)))) ( + then + if staticIA + then childChar{wideIAFinal = wideIAFinal parentChar} + else + childChar + { wideFinal = wideFinal parentChar + , wideAlignment -- wideAlignment parentChar + = + if isTree + then wideAlignment parentChar -- finalGappedO -- wideAlignment parentChar -- finalGappedO-- wideAlignment parentChar + else mempty + , wideGapped = wideGapped parentChar -- wideGapped' -- wideGapped parentChar -- finalGappedO --wideGapped parentChar + -- , wideIAPrelim = wideIAPrelim parentChar + , wideIAFinal -- wideFinal parentChar + = + if isTree + then wideFinal parentChar + else mempty + } + else -- ) + + if localCharType == HugeSeq -- parentChar + -- trace ("TNFinal-1/1:" <> (show (isLeft, (hugeAlignment parentChar), (hugeGapped parentChar) ,(hugeGapped childChar)))) ( + then + if staticIA + then childChar{hugeIAFinal = hugeIAFinal parentChar} + else + childChar + { hugeFinal = hugeFinal parentChar + , hugeAlignment -- hugeAlignment parentChar + = + if isTree + then hugeAlignment parentChar -- finalGappedO -- hugeAlignment parentChar -- finalGappedO-- hugeAlignment parentChar + else mempty + , hugeGapped = hugeGapped parentChar -- hugeGapped' -- hugeGapped parentChar -- finalGappedO --hugeGapped parentChar + -- , hugeIAPrelim = hugeIAPrelim parentChar + , hugeIAFinal -- hugeFinal parentChar + = + if isTree + then hugeFinal parentChar + else mempty + } + else -- ) + + error ("Unrecognized/implemented character type: " <> show localCharType) + else -- ) + + -- for Hardwired graphs + + if isIn2Out1 + then + if isNothing parent2CharM + then error "Nothing parent2char in setFinal" + else -- trace ("SF: " <> "makeing 3-way final") + TW.threeMedianFinal charInfo parentChar (fromJust parent2CharM) childChar + else error ("Node type should not be here (pre-order on tree node only): " <> show childType) + + +-- ) + +-- | doPreOrderWithParentCheck performs post order logic if parent non-zero--otherwise returns preliminary assignment +doPreOrderWithParentCheck + ∷ (FiniteBits e, GV.Vector v e) ⇒ Bool → (v e, v e, v e) → (v e, v e, v e) → (v e, v e, v e) → (v e, v e, v e) +doPreOrderWithParentCheck isLeft alignmentParent gappedParent gappedChild = + if not $ GV.null $ extractMediansGapped alignmentParent + then -- if True then + DOP.preOrderLogic isLeft alignmentParent gappedParent gappedChild + else gappedChild + + +{- | getDOFinal takes parent final, and node gapped (including its parent gapped) and performs a DO median +to get the final state. This takes place in several steps + 1) align (DOMedian) parent final with node gapped (ie node preliminary) + 2) propagate new gaps in aligned node preliminary to child gapped in node triple (snd and thd) + creating a 3-way alignment with parent final and child preliminary + 3) apply appropriate get3way for the structure +The final is then returned--with gaps to be filtered afterwards +getDOFinal :: (FiniteBits a, GV.Vector v a) => v a -> (v a, v a, v a) -> CharInfo -> v a +Since just getting final states--not using noChangeCostAdjust +-} +getDOFinal + ∷ CharInfo + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) +getDOFinal charInfo parentFinal nodeGapped = + let isMedian = True + (a, b, c, _) = M.pairwiseDO isMedian charInfo parentFinal nodeGapped + parentNodeChar = (a, b, c) + + -- put "new" gaps into 2nd and thd gapped fields of appropriate seqeunce type + nonGappedFinal = makeNonGappedLeftRight charInfo parentNodeChar nodeGapped + in nonGappedFinal + + +{- | makeNonGappedLeftRight takes an alignment parent character and original node character and inserts "new" gaps into nodeCharcater +to line them up with parent, vreates 3-way median, and strips out gaps to retujrn final DO assignment +-} +makeNonGappedLeftRight + ∷ CharInfo + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) + → (SlimDynamicCharacter, WideDynamicCharacter, HugeDynamicCharacter) +makeNonGappedLeftRight charInfo gappedLeftRight nodeChar = + let localCharType = charType charInfo + in if localCharType `elem` [SlimSeq, NucSeq] + then + let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (fst3 gappedLeftRight) (fst3 nodeChar) + newFinalGapped = M.getFinal3WaySlim (slimTCM charInfo) parentGapped leftChildGapped rightChildGapped + in (M.makeDynamicCharacterFromSingleVector newFinalGapped, mempty, mempty) + else + if localCharType `elem` [AminoSeq, WideSeq] + then + let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (snd3 gappedLeftRight) (snd3 nodeChar) + newFinalGapped = M.getFinal3WayWideHuge (wideTCM charInfo) parentGapped leftChildGapped rightChildGapped + in (mempty, M.makeDynamicCharacterFromSingleVector newFinalGapped, mempty) + else + if localCharType == HugeSeq + then + let (parentGapped, leftChildGapped, rightChildGapped) = TW.addGapsToChildren (thd3 gappedLeftRight) (thd3 nodeChar) + newFinalGapped = M.getFinal3WayWideHuge (hugeTCM charInfo) parentGapped leftChildGapped rightChildGapped + in (mempty, mempty, M.makeDynamicCharacterFromSingleVector newFinalGapped) + else error ("Unrecognized character type: " <> show localCharType) + + +{- | additivePreorder assignment takes preliminary triple of child (= current vertex) and +final states of parent to create preorder final states of child +-} +additivePreorder + ∷ (V.Vector (Int, Int), V.Vector (Int, Int), V.Vector (Int, Int)) → V.Vector (Int, Int) → V.Vector (Int, Int) +additivePreorder (leftChild, nodePrelim, rightChild) parentFinal = + if null nodePrelim + then mempty + else V.zipWith4 makeAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal + + +{- | nonAdditivePreorder assignment takes preliminary triple of child (= current vertex) and +final states of parent to create preorder final states of child +-} +nonAdditivePreorder + ∷ (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) → V.Vector BV.BitVector → V.Vector BV.BitVector +nonAdditivePreorder (leftChild, nodePrelim, rightChild) parentFinal = + if null nodePrelim + then mempty + else V.zipWith4 makeNonAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal + + +{- | matrixPreorder assigment akes preliminary matrix states of child (= current vertex) and +final states of parent to create preorder final states of child +th eboolean says whether the node is a 'left' node or right based on bitvetor label +-} +matrixPreorder + ∷ Bool → V.Vector (V.Vector MatrixTriple) → V.Vector (V.Vector MatrixTriple) → V.Vector (V.Vector MatrixTriple) +matrixPreorder isLeft nodePrelim parentFinal = + if null nodePrelim + then mempty + else V.zipWith (makeMatrixCharacterFinal isLeft) nodePrelim parentFinal + + +{- | makeAdditiveCharacterFinal takes vertex preliminary, and child preliminary states with well as parent final state +and constructs final state assignment +-} +makeAdditiveCharacterFinal ∷ (Int, Int) → (Int, Int) → (Int, Int) → (Int, Int) → (Int, Int) +makeAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal = + -- From Wheeler (20012) after Goloboff (1993) + let interNodeParent = intervalIntersection nodePrelim parentFinal + in -- trace (show inData) ( + -- Rule 1 + if interNodeParent == Just parentFinal + then -- trace ("R1 " <> show parentFinal) + parentFinal + else -- Rule 2 + + if isJust ((leftChild `intervalUnion` rightChild) `intervalIntersection` parentFinal) + then + let xFactor = ((leftChild `intervalUnion` rightChild) `intervalUnion` nodePrelim) `intervalIntersection` parentFinal + in if isNothing xFactor + then + error + ("I don't think this should happen in makeAdditiveCharacterFinal" <> show (nodePrelim, leftChild, rightChild, parentFinal)) + else + if isJust (fromJust xFactor `intervalIntersection` nodePrelim) + then -- trace ("R2a " <> show (fromJust xFactor)) + fromJust xFactor + else -- trace ("Rb " <> show (lciClosest (fromJust xFactor) nodePrelim)) + lciClosest (fromJust xFactor) nodePrelim + else -- Rule 3 + + let unionLR = leftChild `intervalUnion` rightChild + closestPtoA = stateFirstClosestToSecond nodePrelim parentFinal + closestULRtoA = stateFirstClosestToSecond unionLR parentFinal + in -- trace ("R3 " <> show (min closestPtoA closestULRtoA, max closestPtoA closestULRtoA)) + (min closestPtoA closestULRtoA, max closestPtoA closestULRtoA) + + +-- ) + +{- | stateFirstClosestToSecond takes teh states of the first interval and finds the state wiht smallest distance +to either state in the second +assumes a <= b, x<= y +-} +stateFirstClosestToSecond ∷ (Int, Int) → (Int, Int) → Int +stateFirstClosestToSecond (a, b) (x, y) = + let distASecond + | x > b = x - a + | y < a = a - y + | otherwise = error ("I don't think this should happen in makeAdditiveCharacterFinal" <> show (a, b, x, y)) + distBSecond + | x > b = x - b + | y < a = b - y + | otherwise = error ("I don't think this should happen in makeAdditiveCharacterFinal" <> show (a, b, x, y)) + in if distASecond <= distBSecond + then a + else b + + +{- | lciClosest returns the "largest closed interval" between the first interval +and the closest state in the second interval +assumes a <= b, x<= y +-} +lciClosest ∷ (Int, Int) → (Int, Int) → (Int, Int) +lciClosest (a, b) (x, y) + | x > b = (a, x) + | y < a = (y, b) + | otherwise = error ("I don't think this should happen in lciClosest" <> show (a, b, x, y)) + + +-- \| intervalIntersection is bit-analogue intersection for additive character operations +-- takes two intervals and returnas range intersection +-- Nothing signifies an empty intersection +-- assumes a <= b, x<= y +intervalIntersection ∷ (Int, Int) → (Int, Int) → Maybe (Int, Int) +intervalIntersection (a, b) (x, y) = + let newPair = (max a x, min b y) + in if max a x > min b y + then Nothing + else Just newPair + + +{- | intervalUnion is bit-analogue union for additive character operations +takes two intervals and returnas union +assumes a <= b, x<= y +-} +intervalUnion ∷ (Int, Int) → (Int, Int) → (Int, Int) +intervalUnion (a, b) (x, y) = (min a x, max b y) + + +{- | makeNonAdditiveCharacterFinal takes vertex preliminary, and child preliminary states with well as parent final state +and constructs final state assignment +-} +makeNonAdditiveCharacterFinal ∷ BV.BitVector → BV.BitVector → BV.BitVector → BV.BitVector → BV.BitVector +makeNonAdditiveCharacterFinal nodePrelim leftChild rightChild parentFinal = + -- From Wheeler (2012) after Fitch (1971) + -- trace (show inData) ( + if BV.isZeroVector (complement nodePrelim .&. parentFinal) + then -- trace ("R1 " <> show parentFinal) + parentFinal + else + if BV.isZeroVector (leftChild .&. rightChild) + then -- trace ("R2 " <> show (nodePrelim .|. parentFinal)) + nodePrelim .|. parentFinal + else -- trace ("R3 " <> show (nodePrelim .|. (leftChild .&. parentFinal) .|. (rightChild .&. parentFinal))) + nodePrelim .|. (parentFinal .&. (leftChild .|. rightChild)) + + +-- ) + +{- | makeMatrixCharacterFinal vertex preliminary and parent final state +and constructs final state assignment +really just tracks the states on a traceback and sets the cost to maxBound :: Int for states not in the traceback +path +Bool for left right node +-} +makeMatrixCharacterFinal ∷ Bool → V.Vector MatrixTriple → V.Vector MatrixTriple → V.Vector MatrixTriple +makeMatrixCharacterFinal isLeft nodePrelim parentFinal = + let numStates = length nodePrelim + stateIndexList = V.fromList [0 .. (numStates - 1)] + (stateCostList, stateLeftChildList, stateRightChildList) = V.unzip3 parentFinal + (_, prelimStateLeftChildList, prelimStateRightChildList) = V.unzip3 nodePrelim + allThree = + if isLeft + then V.zip3 stateCostList stateLeftChildList stateIndexList + else V.zip3 stateCostList stateRightChildList stateIndexList + bestParentThree = V.filter ((/= (maxBound ∷ StateCost)) . fst3) allThree + bestPrelimStates = L.sort $ L.nub $ concatMap snd3 bestParentThree + allFour = V.zipWith3 (setCostsAndStates bestPrelimStates) prelimStateLeftChildList prelimStateRightChildList stateIndexList + finalBestTriple = V.filter ((/= (maxBound ∷ StateCost)) . fst3) allFour + in finalBestTriple + + +{- | setCostsAndStates takes a list of states that are in teh set of 'best' and a four-tuple +of a matrix triple annd a fourth field of the state index +if the state is in the list of `best' indices it is kept and not if it isn't +-} +setCostsAndStates ∷ [Int] → [ChildStateIndex] → [ChildStateIndex] → Int → (StateCost, [ChildStateIndex], [ChildStateIndex]) +setCostsAndStates bestPrelimStates leftChildState rightChildStates stateIndex = + if stateIndex `elem` bestPrelimStates + then (stateIndex, leftChildState, rightChildStates) + else (maxBound ∷ StateCost, leftChildState, rightChildStates) + + +-- | setMinCostStatesMatrix sets the cost of non-minimal cost states to maxBounnd :: StateCost (Int) +setMinCostStatesMatrix ∷ Int → V.Vector (V.Vector MatrixTriple) → V.Vector (V.Vector MatrixTriple) +setMinCostStatesMatrix numStates inStateVect = + let outStates = + V.filter ((/= (maxBound ∷ StateCost)) . fst3) + <$> fmap (nonMinCostStatesToMaxCost (V.fromList [0 .. (numStates - 1)])) inStateVect + in outStates + + +{- | nonMinCostStatesToMaxCost takes an individual pair of minimum state cost and matrix character triple +returns a new character with the states cost either the minium value or maxBound if not +this only applied at root or leaf--other vertices minimum costs may not be part of the +miniumm cost assignment, but may be useful heuristically +-} +nonMinCostStatesToMaxCost ∷ V.Vector StateCost → V.Vector MatrixTriple → V.Vector MatrixTriple +nonMinCostStatesToMaxCost stateIndexList tripleVect = + let minStateCost = V.minimum $ fmap fst3 tripleVect + result = V.zipWith (modifyStateCost minStateCost) tripleVect stateIndexList + in -- trace ((show stateIndexList) <> " " <> (show $ V.zip tripleVect stateIndexList)) + result + where + modifyStateCost ∷ (Eq a) ⇒ a → (a, b, c) → StateCost → (StateCost, b, c) + modifyStateCost d (a, b, c) e + | a == d = (e, b, c) + | otherwise = (maxBound ∷ StateCost, b, c) + + +-- | setFinalToPreliminaryStates takes VertexBlockData and sets the final values to Preliminary +setFinalToPreliminaryStates ∷ VertexBlockData → VertexBlockData +setFinalToPreliminaryStates inVertBlockData = + if V.null inVertBlockData + then mempty + else fmap setBlockFinalToPrelim inVertBlockData + + +-- | setBlockFinalToPrelim sets characters in a block final values to Preliminary +setBlockFinalToPrelim ∷ V.Vector CharacterData → V.Vector CharacterData +setBlockFinalToPrelim inCharVect = + if V.null inCharVect + then mempty + else fmap setFinalToPrelimCharacterData inCharVect + + +-- | setFinalFinalToPrelimCharacterData takes a single chartcater and sets final values to Preliminary +setFinalToPrelimCharacterData ∷ CharacterData → CharacterData +setFinalToPrelimCharacterData inChar = + inChar + { stateBVFinal = snd3 $ stateBVPrelim inChar + , rangeFinal = snd3 $ rangePrelim inChar + , matrixStatesFinal = matrixStatesPrelim inChar + , slimAlignment = slimGapped inChar + , slimFinal = slimPrelim inChar + , slimIAFinal = snd3 $ slimIAPrelim inChar + , wideAlignment = wideGapped inChar + , wideFinal = widePrelim inChar + , wideIAFinal = snd3 $ wideIAPrelim inChar + , hugeAlignment = hugeGapped inChar + , hugeFinal = hugePrelim inChar + , hugeIAFinal = snd3 $ hugeIAPrelim inChar + , alignedSlimFinal = snd3 $ alignedSlimPrelim inChar + , alignedWideFinal = snd3 $ alignedWidePrelim inChar + , alignedHugeFinal = snd3 $ alignedHugePrelim inChar + , packedNonAddFinal = snd3 $ packedNonAddPrelim inChar + } + + +-- | setPreliminaryToFinalStates takes VertexBlockData and sets the Preliminary states to final values +setPreliminaryToFinalStates ∷ VertexBlockData → VertexBlockData +setPreliminaryToFinalStates inVertBlockData = + if V.null inVertBlockData + then mempty + else fmap setBlockPrelimToFinal inVertBlockData + + +-- | setBlockPrelimToFinal sets characters in a block preliminary data to final +setBlockPrelimToFinal ∷ V.Vector CharacterData → V.Vector CharacterData +setBlockPrelimToFinal inCharVect = + if V.null inCharVect + then mempty + else fmap setPrelimToFinalCharacterData inCharVect + + +-- | setPrelimToFinalCharacterData takes a single chartcater and sets preliminary values to final +setPrelimToFinalCharacterData ∷ CharacterData → CharacterData +setPrelimToFinalCharacterData inChar = + inChar + { stateBVPrelim = (stateBVFinal inChar, stateBVFinal inChar, stateBVFinal inChar) + , rangePrelim = (rangeFinal inChar, rangeFinal inChar, rangeFinal inChar) + , matrixStatesPrelim = matrixStatesFinal inChar + , -- , slimAlignment = slimGapped inChar + slimGapped = (slimFinal inChar, slimFinal inChar, slimFinal inChar) + , slimIAPrelim = (slimIAFinal inChar, slimIAFinal inChar, slimIAFinal inChar) + , -- , wideAlignment = wideGapped inChar + wideGapped = (wideFinal inChar, wideFinal inChar, wideFinal inChar) + , wideIAPrelim = (wideIAFinal inChar, wideIAFinal inChar, wideIAFinal inChar) + , -- , hugeAlignment = hugeGapped inChar + hugeGapped = (hugeFinal inChar, hugeFinal inChar, hugeFinal inChar) + , hugeIAPrelim = (hugeIAFinal inChar, hugeIAFinal inChar, hugeIAFinal inChar) + , alignedSlimPrelim = (alignedSlimFinal inChar, alignedSlimFinal inChar, alignedSlimFinal inChar) + , alignedWidePrelim = (alignedWideFinal inChar, alignedWideFinal inChar, alignedWideFinal inChar) + , alignedHugePrelim = (alignedHugeFinal inChar, alignedHugeFinal inChar, alignedHugeFinal inChar) + , packedNonAddPrelim = (packedNonAddFinal inChar, packedNonAddFinal inChar, packedNonAddFinal inChar) + } diff --git a/src/GraphOptimization/Traversals.hs b/src/GraphOptimization/Traversals.hs new file mode 100644 index 000000000..c2f68a10e --- /dev/null +++ b/src/GraphOptimization/Traversals.hs @@ -0,0 +1,471 @@ +{- | +Module specifying graph traversal functions for PhyGraph +-} +module GraphOptimization.Traversals ( + multiTraverseFullyLabelTree, + multiTraverseFullyLabelGraph, + multiTraverseFullyLabelGraph', + multiTraverseFullyLabelSoftWired, + multiTraverseFullyLabelSoftWiredReduced, + multiTraverseFullyLabelHardWired, + multiTraverseFullyLabelHardWiredReduced, + checkUnusedEdgesPruneInfty, + generalizedGraphPostOrderTraversal, + getPenaltyFactor, + getPenaltyFactorList, + updateGraphCostsComplexities, + updatePhylogeneticGraphCost, + updatePhylogeneticGraphCostReduced, + multiTraverseFullyLabelGraphReduced, + multiTraverseFullyLabelGraphPair, +) where + +import Control.DeepSeq +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.InfList qualified as IL +import Data.List qualified as L +import Data.Maybe +import Debug.Trace +import GHC.Stack (errorWithStackTrace) +import GeneralUtilities +import GraphOptimization.PostOrderSoftWiredFunctions qualified as POSW +import GraphOptimization.PreOrderFunctions qualified as PRE +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities as U + + +{- | +'multiTraverseFullyLabelGraphReduced' wrapper to return 'ReducedPhylogeneticGraph'. +-} +multiTraverseFullyLabelGraphReduced + ∷ GlobalSettings → ProcessedData → Bool → Bool → Maybe Int → SimpleGraph → PhyG ReducedPhylogeneticGraph +multiTraverseFullyLabelGraphReduced inGS inData pruneEdges warnPruneEdges startVertex inGraph = + GO.convertPhylogeneticGraph2Reduced <$> multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph + + + +-- \$ multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph + +{- | multiTraverseFullyLabelGraph is a wrapper around multi-traversal functions for Tree, +Soft-wired network graph, and Hard-wired network graph +can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest +-} +multiTraverseFullyLabelGraph + ∷ GlobalSettings → ProcessedData → Bool → Bool → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph + | LG.isEmpty inGraph = pure emptyPhylogeneticGraph + | otherwise = + {-# SCC multiTraverseFullyLabelGraph_TOP_DEF #-} + case graphType inGS of + SoftWired → + let leafGraph = POSW.makeLeafGraphSoftWired inGS inData + in multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inGraph + HardWired → + let leafGraph = GO.makeLeafGraph inData + in multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex inGraph + Tree → + {-# SCC multiTraverseFullyLabelGraph_CASE_OF_Tree #-} + -- test for Tree + let (_, _, _, networkVertexList) = LG.splitVertexList inGraph + in -- in do when (not $ null networkVertexList) . failWithPhase Computing $ unlines + do + when (not $ null networkVertexList) $ do + logWith LogFail $ + unlines + [ "Input graph is not a tree/forest, but graph type has been specified (perhaps by default) as Tree." + , "Modify input graph or use 'set()' command to specify network type." + , "\tNetwork vertices: " <> show (fst <$> networkVertexList) + , LG.prettify inGraph + ] + errorWithStackTrace "Exceptional state reached: MALFORMED GRAPH" + + let leafGraph = GO.makeLeafGraph inData + multiTraverseFullyLabelTree inGS inData leafGraph startVertex inGraph + val → failWithPhase Computing $ "Unknown graph type specified: " <> show val + + +-- | multiTraverseFullyLabelGraphPair maps to multiTraverseFullyLabelGraph with paired arguments used by report IA and tnt output +multiTraverseFullyLabelGraphPair + ∷ GlobalSettings → Bool → Bool → Maybe Int → (ProcessedData, SimpleGraph) → PhyG PhylogeneticGraph +multiTraverseFullyLabelGraphPair inGS pruneEdges warnPruneEdges startVertex (inData, inGraph) = multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph + + +-- | multiTraverseFullyLabelGraph' maps to multiTraverseFullyLabelGraph with differnet order of arguments used by report IA and tnt output +multiTraverseFullyLabelGraph' + ∷ GlobalSettings → Bool → Bool → Maybe Int → ProcessedData → SimpleGraph → PhyG PhylogeneticGraph +multiTraverseFullyLabelGraph' inGS pruneEdges warnPruneEdges startVertex inData inGraph = multiTraverseFullyLabelGraph inGS inData pruneEdges warnPruneEdges startVertex inGraph + + +-- | multiTraverseFullyLabelHardWiredReduced is a wrapper for ReducedPhylogeneticTrees +multiTraverseFullyLabelHardWiredReduced + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Maybe Int → SimpleGraph → PhyG ReducedPhylogeneticGraph +multiTraverseFullyLabelHardWiredReduced inGS inData leafGraph startVertex inSimpleGraph = do + result ← multiTraverseFullyLabelTree inGS inData leafGraph startVertex inSimpleGraph + pure $ GO.convertPhylogeneticGraph2Reduced result + + +multiTraverseFullyLabelHardWired + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +multiTraverseFullyLabelHardWired inGS inData leafGraph startVertex inSimpleGraph = multiTraverseFullyLabelTree inGS inData leafGraph startVertex inSimpleGraph + + +-- | multiTraverseFullyLabelSoftWiredReduced is a wrapper for ReducedPhylogeneticTrees +multiTraverseFullyLabelSoftWiredReduced + ∷ GlobalSettings → ProcessedData → Bool → Bool → DecoratedGraph → Maybe Int → SimpleGraph → PhyG ReducedPhylogeneticGraph +multiTraverseFullyLabelSoftWiredReduced inGS inData pruneEdges warnPruneEdges leafGraph startVertex inSimpleGraph = do + result ← multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inSimpleGraph + pure $ GO.convertPhylogeneticGraph2Reduced result + + +{- | multiTraverseFullyLabelSoftWired fully labels a softwired network component forest +including traversal rootings-- does not reroot on network edges +allows indegree=outdegree=1 vertices +pruneEdges and warnPruneEdges specify if unused edges (ie not in diuaplytrees) are pruned from +canonical tree or if an infinity cost is returned and if a trace warning is thrown if so. +in general--input trees should use "pruneEdges" during search--not +can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest +first Bool for calcualting breanch edger weights +-} +multiTraverseFullyLabelSoftWired + ∷ GlobalSettings → ProcessedData → Bool → Bool → DecoratedGraph → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph startVertex inSimpleGraph = + if LG.isEmpty inSimpleGraph + then pure emptyPhylogeneticGraph + else do + let sequenceChars = U.getNumberSequenceCharacters (thd3 inData) + (postOrderGraph, localStartVertex) ← + generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph False startVertex inSimpleGraph + fullyOptimizedGraph ← + PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False True (sequenceChars > 0) localStartVertex False postOrderGraph + checkUnusedEdgesPruneInfty inGS inData pruneEdges warnPruneEdges leafGraph $ + updatePhylogeneticGraphCost fullyOptimizedGraph (snd6 fullyOptimizedGraph) + + +{- | multiTraverseFullyLabelTree performs potorder on default root and other traversal foci, taking the minimum +traversal cost for all nonexact charcters--the initial rooting is used for exact characters +operates with Tree functions +need to add forest functionality--in principle just split into components and optimize them independently +but get into root index issues the way this is written now. +can either find root, be given root, or start somwhere else (startVertex) do optimize only a component of a forest +-} +multiTraverseFullyLabelTree + ∷ GlobalSettings → ProcessedData → DecoratedGraph → Maybe Int → SimpleGraph → PhyG PhylogeneticGraph +multiTraverseFullyLabelTree inGS inData leafGraph startVertex inSimpleGraph = + if LG.isEmpty inSimpleGraph + then pure emptyPhylogeneticGraph + else + let sequenceChars = U.getNumberSequenceCharacters (thd3 inData) + staticIA = False + in do + (postOrderGraph, localStartVertex) ← + generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph staticIA startVertex inSimpleGraph + PRE.preOrderTreeTraversal inGS (finalAssignment inGS) False True (sequenceChars > 0) localStartVertex False postOrderGraph + + +{- | generalizedGraphPostOrderTraversal performs the postorder pass +on a graph (tree, softWired, or hardWired) to determine the "preliminary" character states +include penalty factor cost but not root cost which may or may not be wanted depending on context +if full graph--yes, if a component yes or no. +hence returns the pair +Adds root complexity cost if the star vertex is Nothing (e.g. the graph root), so should be corrext when graphs are split in swap. +-} +generalizedGraphPostOrderTraversal + ∷ GlobalSettings → Int → ProcessedData → DecoratedGraph → Bool → Maybe Int → SimpleGraph → PhyG (PhylogeneticGraph, Int) +generalizedGraphPostOrderTraversal inGS sequenceChars inData leafGraph staticIA startVertex inSimpleGraph = do + -- next edges (to vertex in list) to perform rerroting + -- progresses recursivey over adjacent edges to minimize node reoptimization + -- childrenOfRoot = concatMap (LG.descendants (thd6 outgroupRooted)) startVertexList + -- grandChildrenOfRoot = concatMap (LG.descendants (thd6 outgroupRooted)) childrenOfRoot + + -- create list of multi-traversals with original rooting first + -- subsequent rerooting do not reoptimize exact characters (add nonadd etc) + -- they are taken from the fully labelled first input decorated graph later when output graph created + -- it is important that the first graph be the ourgroup rooted graph (outgroupRootedPhyloGraph) so this + -- will have the preorder assignments for the outgroup rooted graph as 3rd field. This can be used for incremental + -- optimization to get O(log n) initial postorder assingment when mutsating graph. + -- hardwired reroot cause much pain + -- the head startvertex list for reoptimizing spit trees ni swapping + + -- create optimal final graph with best costs and best traversal (rerooting) forest for each character + -- traversal for exact characters (and costs) are the first of each least since exact only optimizaed for that + -- traversal graph. The result has approprotate post-order assignments for traversals, preorder "final" assignments + -- are propagated to the Decorated graph field after the preorder pass. + -- doesn't have to be sorted, but should minimize assignments + -- graphWithBestAssignments = head recursiveRerootList -- L.foldl1' setBetterGraphAssignment recursiveRerootList' + + {- root and model complexities moved to output + -- same root cost if same data and number of roots + localRootCost = rootComplexity inGS + {-if (rootCost inGS) == NoRootCost then 0.0 + else error ("Root cost type " <> (show $ rootCost inGS) <> " is not yet implemented") + -} + -} + -- let (rootNodes, leafNode, treeNodes,networkNodes) = LG.splitVertexList inSimpleGraph + -- logWith LogInfo ("GPOT: " <> (show (length rootNodes, length leafNode, length treeNodes, length networkNodes))) + + -- first traversal on outgroup root + outgroupRooted ← + if (graphType inGS) `elem` [Tree, HardWired] + then POSW.postOrderTreeTraversal inGS inData leafGraph staticIA startVertex inSimpleGraph + else + if (graphType inGS) == SoftWired + then POSW.postOrderSoftWiredTraversal inGS inData leafGraph staticIA startVertex inSimpleGraph + else error ("Graph type not implemented: " <> (show $ graphType inGS)) + + -- start at start vertex--for components or ur-root for full graph + -- root cost for overall cost is only added for thopse greaophs that include overall root + -- model complexity for PMDL is also added here + let (startVertexList, rootAndModelCost) = + if isJust startVertex + then ([fromJust startVertex], 0) + else + if optimalityCriterion inGS == PMDL + then (fmap fst $ LG.getRoots $ thd6 outgroupRooted, rootComplexity inGS + modelComplexity inGS) + else (fmap fst $ LG.getRoots $ thd6 outgroupRooted, rootComplexity inGS) + + -- only static characters + if sequenceChars == 0 + then do + penaltyFactor ← getPenaltyFactor inGS inData startVertex outgroupRooted + {- if (graphType inGS == Tree) + then pure 0.0 + else -- it is its own penalty due to counting all changes in in2 out 1 nodes + -- else if (graphType inGS == HardWired) then 0.0 + + -- softwired versions + + if (graphFactor inGS) == NoNetworkPenalty + then pure 0.0 + else + if (graphFactor inGS) == Wheeler2015Network + then POSW.getW15NetPenaltyFull Nothing inGS inData startVertex outgroupRooted + else + if (graphFactor inGS) == Wheeler2023Network + then pure $ POSW.getW23NetPenalty outgroupRooted + else error ("Network penalty type " <> (show $ graphFactor inGS) <> " is not yet implemented") + -} + + staticOnlyGraph ← + if (graphType inGS) == SoftWired + then POSW.updateAndFinalizePostOrderSoftWired startVertex (head startVertexList) outgroupRooted + else pure outgroupRooted + -- staticOnlyGraph = head recursiveRerootList' + let staticOnlyGraph' = + if startVertex == Nothing + then updatePhylogeneticGraphCost staticOnlyGraph (rootAndModelCost + penaltyFactor + (snd6 staticOnlyGraph)) + else updatePhylogeneticGraphCost staticOnlyGraph (rootAndModelCost + penaltyFactor + (snd6 staticOnlyGraph)) + pure (staticOnlyGraph', head startVertexList) + else do + recursiveRerootList ← + if (graphType inGS == HardWired) + then pure [outgroupRooted] + else + if (graphType inGS == SoftWired) + then do + displayResult ← POSW.getDisplayBasedRerootSoftWired inGS SoftWired (head startVertexList) outgroupRooted + pure [displayResult] + else + if (graphType inGS == Tree) + then do + displayResult ← POSW.getDisplayBasedRerootSoftWired inGS Tree (head startVertexList) outgroupRooted + -- logWith LogInfo ("RRL: " <> (show (snd6 displayResult, snd6 outgroupRooted)) <> "\n") + pure [displayResult] + else error ("Graph type not implemented: " <> (show $ graphType inGS)) + + -- single sequence (prealigned, dynamic) only (ie no static) + + if sequenceChars == 1 && (U.getNumberExactCharacters (thd3 inData) == 0) + then do + let finalizedPostOrderGraphList = L.sortOn snd6 recursiveRerootList + penaltyFactorList ← getPenaltyFactorList inGS inData startVertex finalizedPostOrderGraphList + {-if (graphType inGS == Tree) + then pure $ replicate (length finalizedPostOrderGraphList) 0.0 + else -- else if (graphType inGS == HardWired) then replicate (length finalizedPostOrderGraphList) 0.0 + + if (graphFactor inGS) == NoNetworkPenalty + then pure $ replicate (length finalizedPostOrderGraphList) 0.0 + else + if (graphFactor inGS) == Wheeler2015Network + then mapM (POSW.getW15NetPenaltyFull Nothing inGS inData startVertex) finalizedPostOrderGraphList + else + if (graphFactor inGS) == Wheeler2023Network + then pure $ fmap POSW.getW23NetPenalty finalizedPostOrderGraphList + else error ("Network penalty type " <> (show $ graphFactor inGS) <> " is not yet implemented") + -} + let newCostList = + zipWith3 + sum3 + penaltyFactorList + (fmap snd6 finalizedPostOrderGraphList) + (replicate (length finalizedPostOrderGraphList) rootAndModelCost) + + let finalizedPostOrderGraph = head $ L.sortOn snd6 $ zipWith updatePhylogeneticGraphCost finalizedPostOrderGraphList newCostList + + pure (finalizedPostOrderGraph, head startVertexList) + else do + -- multiple dynamic characters--checks for best root for each character + -- important to have outgroup rooted graph first for fold so don't use sorted recursive list + let graphWithBestAssignments = head recursiveRerootList + penaltyFactor ← getPenaltyFactor inGS inData startVertex graphWithBestAssignments + {-if (graphType inGS == Tree) + then pure 0.0 + else -- else if (graphType inGS == HardWired) then 0.0 + + if (graphFactor inGS) == NoNetworkPenalty + then pure 0.0 + else + if (graphFactor inGS) == Wheeler2015Network + then POSW.getW15NetPenaltyFull Nothing inGS inData startVertex graphWithBestAssignments + else + if (graphFactor inGS) == Wheeler2023Network + then pure $ POSW.getW23NetPenalty graphWithBestAssignments + else error ("Network penalty type " <> (show $ graphFactor inGS) <> " is not yet implemented") + -} + + let graphWithBestAssignments' = updatePhylogeneticGraphCost graphWithBestAssignments (rootAndModelCost + penaltyFactor + (snd6 graphWithBestAssignments)) + -- logWith LogInfo ("GGPOT: " <> (show (snd6 graphWithBestAssignments', snd6 graphWithBestAssignments)) <> "\n") + pure (graphWithBestAssignments', head startVertexList) + where + sum3 ∷ VertexCost → VertexCost → VertexCost → VertexCost + sum3 a b c = a + b + c + + +{- | getPenaltyFactorList takes graph type and other options and a list of graphs to return a + list of penalty factors +-} +getPenaltyFactorList ∷ GlobalSettings → ProcessedData → Maybe Int → [PhylogeneticGraph] → PhyG [VertexCost] +getPenaltyFactorList inGS inData startVertex inGraphList = + if null inGraphList + then pure [] + else mapM (getPenaltyFactor inGS inData startVertex) inGraphList + + +{- | getPenaltyFactor take graph information and optimality criterion and returns penalty +or graph complexity for single graph +-} +getPenaltyFactor ∷ GlobalSettings → ProcessedData → Maybe Int → PhylogeneticGraph → PhyG VertexCost +getPenaltyFactor inGS inData startVertex inGraph = + if LG.isEmpty $ fst6 inGraph + then pure 0.0 + else + if (optimalityCriterion inGS) `elem` [PMDL, SI] + then + let (_, _, _, networkNodeList) = LG.splitVertexList (fst6 inGraph) + in if graphType inGS == Tree + then pure $ fst $ IL.head (graphComplexityList inGS) + else + if graphType inGS == SoftWired + then pure $ fst $ (graphComplexityList inGS) IL.!!! (length networkNodeList) + else + if graphType inGS == HardWired + then pure $ snd $ (graphComplexityList inGS) IL.!!! (length networkNodeList) + else error ("Graph type " <> (show (graphType inGS)) <> " is not yet implemented") + else + if graphType inGS == Tree + then pure 0.0 + else + if (graphFactor inGS) == NoNetworkPenalty + then pure 0.0 + else + if (graphFactor inGS) == Wheeler2015Network + then POSW.getW15NetPenaltyFull Nothing inGS inData startVertex inGraph + else + if (graphFactor inGS) == Wheeler2023Network + then pure $ POSW.getW23NetPenalty inGraph + else error ("Network penalty type " <> (show (graphFactor inGS)) <> " is not yet implemented") + + +{- | checkUnusedEdgesPruneInfty checks if a softwired phylogenetic graph has +"unused" edges sensu Wheeler 2015--that an edge in the canonical graph is +not present in any of the block display trees (that are heurstically optimal) +the options specify if the cost returned is Infinity (really max bound Double) +with no pruning of edges or the cost is left unchanged and unused edges are +pruned from the canonical graph +this is unDirected due to rerooting heuristic in post/preorder optimization +inifinity defined in Types.hs +-} +checkUnusedEdgesPruneInfty + ∷ GlobalSettings → ProcessedData → Bool → Bool → DecoratedGraph → PhylogeneticGraph → PhyG PhylogeneticGraph +checkUnusedEdgesPruneInfty inGS inData pruneEdges warnPruneEdges leafGraph inGraph@(inSimple, _, inCanonical, blockTreeV, charTreeVV, charInfoVV) = + let simpleEdgeList = LG.edges inSimple + displayEdgeSet = L.nubBy LG.undirectedEdgeEquality $ concat $ concat $ fmap (fmap LG.edges) blockTreeV + unusedEdges = LG.undirectedEdgeMinus simpleEdgeList displayEdgeSet + in -- no unused edges all OK + if null unusedEdges + then pure inGraph + else -- unused edges--do not prune return "infinite cost" + + if not pruneEdges + then -- trace ("Unused edge->Infinity") + pure (inSimple, infinity, inCanonical, blockTreeV, charTreeVV, charInfoVV) + else -- unused but pruned--need to prune nodes and reoptimize to get final assignments correct + + let newSimpleGraph = LG.delEdges unusedEdges inSimple + contractedSimple = GO.contractIn1Out1EdgesRename newSimpleGraph + in if warnPruneEdges + then do + -- too lazy to thread PhyG logging throuhg everything + logWith LogWarn ("Pruning " <> (show $ length unusedEdges) <> " unused edges and reoptimizing graph") + multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph Nothing contractedSimple + else multiTraverseFullyLabelSoftWired inGS inData pruneEdges warnPruneEdges leafGraph Nothing contractedSimple + + +{- | updateGraphCostsComplexities adds root and model complexities if appropriate to graphs +updates NCM with original data due to weights of bitpacking +-} +updateGraphCostsComplexities + ∷ GlobalSettings → ProcessedData → ProcessedData → Bool → [ReducedPhylogeneticGraph] → PhyG [ReducedPhylogeneticGraph] +updateGraphCostsComplexities inGS reportingData processedData rediagnoseWithReportingData inGraphList = + -- parallel setup + -- trace ("UGCC: " <> (show (optimalityCriterion inGS, rootComplexity inGS))) $ + let traverseAction ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + traverseAction = multiTraverseFullyLabelGraphReduced inGS reportingData False False Nothing + in if optimalityCriterion inGS /= NCM + then do + pure inGraphList + else -- NCM re-check graph cost (not root or model) due to bit packing + do + updatedGraphList ← + if (reportingData == emptyProcessedData) || (not rediagnoseWithReportingData) || (not $ U.has4864PackedChars (thd3 processedData)) + then -- trace ("\t\tCannot update cost with original data--skipping") + pure inGraphList + else + getParallelChunkTraverse >>= \pTraverse → + (traverseAction . fst5) `pTraverse` inGraphList + + logWith LogInfo ("\tFinalizing graph cost (updating NCM)" <> "\n") + pure updatedGraphList + + +{- | +'updatePhylogeneticGraphCostList' is a list wrapper for 'updatePhylogeneticGraphCost'. +-} +updatePhylogeneticGraphCostList ∷ VertexCost → [ReducedPhylogeneticGraph] → [ReducedPhylogeneticGraph] +updatePhylogeneticGraphCostList rootCost inGraphList = + fmap (updateCost rootCost) inGraphList + where + updateCost ∷ ∀ {b} {a} {c} {d} {e}. (Num b) ⇒ b → (a, b, c, d, e) → (a, b, c, d, e) + updateCost z (a, oldCost, b, c, e) = (a, oldCost + z, b, c, e) + + +{- | +'updatePhylogeneticGraphCost' takes a 'PhylgeneticGraph' and 'Double' and replaces the cost (snd of 6 fields) +and returns Phylogenetic graph. +-} +updatePhylogeneticGraphCost ∷ PhylogeneticGraph → VertexCost → PhylogeneticGraph +updatePhylogeneticGraphCost (a, _, b, c, d, e) newCost = (a, newCost, b, c, d, e) + + +{- | +'updatePhylogeneticGraphCost' takes a 'ReducedPhylogeneticGraph' and 'Double' and replaces the cost (snd of 6 fields) +and returns Phylogenetic graph. +-} +updatePhylogeneticGraphCostReduced ∷ ReducedPhylogeneticGraph → VertexCost → ReducedPhylogeneticGraph +updatePhylogeneticGraphCostReduced (a, _, b, c, e) newCost = (a, newCost, b, c, e) diff --git a/src/Graphs/GraphOperations.hs b/src/Graphs/GraphOperations.hs new file mode 100644 index 000000000..e83342040 --- /dev/null +++ b/src/Graphs/GraphOperations.hs @@ -0,0 +1,1431 @@ +{-# OPTIONS_GHC -Wno-missed-specialisations #-} + +{- | +Module specifying general graph functions--with types specific to Types.hs +graph functions that a re general are in LocalGraph.hs +-} +module Graphs.GraphOperations ( + contractIn1Out1EdgesRename, + convertDecoratedToSimpleGraph, + convertDecoratedToSimpleGraphBranchLength, + convertGeneralGraphToPhylogeneticGraph, + convertPhylogeneticGraph2Reduced, + convertReduced2PhylogeneticGraph, + convertReduced2PhylogeneticGraphSimple, + convertSimpleToDecoratedGraph, + convertToSimpleEdge, + copyIAFinalToPrelim, + copyIAPrelimToFinal, + dichotomizeRoot, + getBVUniqPhylogeneticGraph, + getDecoratedDisplayTreeList, + getDisplayTreeCostList, + getNodeType, + getTopoUniqPhylogeneticGraph, + getUniqueGraphs, + graphCostFromNodes, + hasNetNodeAncestorViolation, + isNovelGraph, + isPhylogeneticDecoratedGraph, + ladderizeGraph, + makeDummyLabEdge, + makeGraphTimeConsistent, + makeIAFinalFromPrelim, + makeIAPrelimFromFinal, + makeLeafGraph, + makeNewickList, + makeSimpleLeafGraph, + parentsInChainGraph, + phylogeneticGraphListMinus, + reducedphylogeneticGraphListMinus, + remakePhylogeneticGraph, + removeParentsInChain, + renameSimpleGraphNodes, + renameSimpleGraphNodesString, + selectGraphStochastic, + selectGraphs, + selectGraphsFull, + selectPhylogeneticGraph, + selectPhylogeneticGraphReduced, + showDecGraphs, + sortEdgeListByLength, + topologicalEqual, +) where + +import Bio.DynamicCharacter +import Commands.Verify qualified as V +import Control.Monad.Random (filterM) +import Control.Monad.Random.Class +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Char qualified as C +import Data.Functor ((<&>)) +import Data.List qualified as L +import Data.Maybe +import Data.Ord +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import GeneralUtilities +import GraphFormatUtilities qualified as GFU +import GraphOptimization.Medians qualified as M +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (Computing)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Text.Read +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +-- | convertPhylogeneticGraph2Reduced takes a Phylogenetic graph and returns a reduced phylogenetiv graph +convertPhylogeneticGraph2Reduced ∷ PhylogeneticGraph → ReducedPhylogeneticGraph +convertPhylogeneticGraph2Reduced inPhyloGraph@(a, b, c, displayTreeV, _, f) = + if inPhyloGraph == emptyPhylogeneticGraph + then emptyReducedPhylogeneticGraph + else + let newDisplayTreeV = fmap (fmap convertDecoratedToSimpleGraph) displayTreeV + in (a, b, c, newDisplayTreeV, f) + + +{- | convertReduced2PhylogeneticGraphSimple just adds in a correctly typed fifth field +bit decorartions are not extractged +-} +convertReduced2PhylogeneticGraphSimple ∷ ReducedPhylogeneticGraph → PhylogeneticGraph +convertReduced2PhylogeneticGraphSimple (a, b, c, d, f) = + let newDisplayTreeV = fmap (fmap convertSimpleToDecoratedGraph) d + in (a, b, c, newDisplayTreeV, mempty, f) + + +-- | convertReduced2PhylogeneticGraph takes a reduced phylogenetic graph and returns a phylogenetiv graph +convertReduced2PhylogeneticGraph ∷ ReducedPhylogeneticGraph → PhylogeneticGraph +convertReduced2PhylogeneticGraph inReducedPhyloGraph@(a, b, canonicalGraph, displayTreeV, charInfoVV) = + if inReducedPhyloGraph == emptyReducedPhylogeneticGraph + then emptyPhylogeneticGraph + else + let newDisplayTreeVL = fmap (getDecoratedDisplayTreeList canonicalGraph) $ V.zip displayTreeV $ V.fromList [0 .. (V.length charInfoVV - 1)] + newCharacterTreeVV = fmap getCharacterTree $ V.zip (fmap head newDisplayTreeVL) (fmap V.length charInfoVV) + in (a, b, canonicalGraph, newDisplayTreeVL, newCharacterTreeVV, charInfoVV) + + +{- | getCharacterTree takes an input Decotate disply tree list and returns a vectopr of character trees +based on teh first display tree in list +-} +getCharacterTree ∷ (DecoratedGraph, Int) → V.Vector DecoratedGraph +getCharacterTree (inDisplayTree, numCharTrees) = + if LG.isEmpty inDisplayTree + then error "Empty display tree getCharaterTree" + else + let displayNodeList = LG.labNodes inDisplayTree + displayEdgeList = LG.labEdges inDisplayTree + charNodeListList = fmap (makeCharacterNodes displayNodeList) [0 .. numCharTrees - 1] + in V.fromList $ zipWith LG.mkGraph charNodeListList (L.replicate numCharTrees displayEdgeList) + + +-- | makeCharacterNodes makes character nodes from dispaly tree nodes +makeCharacterNodes ∷ [LG.LNode VertexInfo] → Int → [LG.LNode VertexInfo] +makeCharacterNodes displayTreeNodeList charIndex = + if null displayTreeNodeList + then error "Null node list in extractCharacterNodeInfo" + else fmap (extractCharNodeInfo charIndex) displayTreeNodeList + + +-- | extractCharNodeInfo takes a charcter indfex and produces a node with newinfo of single block and single character +extractCharNodeInfo ∷ Int → LG.LNode VertexInfo → LG.LNode VertexInfo +extractCharNodeInfo charIndex (a, cLabel) = + let charVertData = V.singleton $ V.singleton $ (V.head (vertData cLabel)) V.! charIndex + charVertexResolutionData = V.singleton $ V.singleton $ (V.head (vertexResolutionData cLabel)) V.! charIndex + newLabel = + cLabel + { vertData = charVertData + , vertexResolutionData = charVertexResolutionData + } + in (a, newLabel) + + +{- | getDecoratedDisplayTreeList takes a cononical graph and pair of list of display trees as ASimpleGraph and an index +and creates a new display tree list from the canonical infomation +all display treelist nodes are assigned same decorations +edge weights are not recalculated +-} +getDecoratedDisplayTreeList ∷ DecoratedGraph → ([SimpleGraph], Int) → [DecoratedGraph] +getDecoratedDisplayTreeList inCanonicalGraph (inDisplayL, blockIndex) + | LG.isEmpty inCanonicalGraph = error "Empty canonical graph in getDecoratedDisplayTree" + | otherwise = + -- get edges and nodes for canonical graph + let canNodList = LG.labNodes inCanonicalGraph + canEdgedList = LG.labEdges inCanonicalGraph + newNodeListList = fmap (makeDisplayNodes canNodList blockIndex) (fmap LG.nodes inDisplayL) + newDisplayEdgeList = fmap (makeDisplayEdges canEdgedList) inDisplayL + in zipWith LG.mkGraph newNodeListList newDisplayEdgeList + + +{- | makeDisplayNodes takes canincal node label data and pulls out specific block +infomartion and cretes new nodes with single block data only +assumes nodes have same indices +-} +makeDisplayNodes ∷ [LG.LNode VertexInfo] → Int → [LG.Node] → [LG.LNode VertexInfo] +makeDisplayNodes canonicalNodeList blockIndex inDisplayNodeList + | length canonicalNodeList /= length inDisplayNodeList = error "Canonical and display node lists are unequal in length" + | otherwise = extractBlockNodeInfo blockIndex <$> canonicalNodeList + + +-- | extractBlockNodeInfo takes a single node and block inde and extracts block data to make a new, single block +extractBlockNodeInfo ∷ Int → LG.LNode VertexInfo → LG.LNode VertexInfo +extractBlockNodeInfo blockIndex (a, cLabel) = + let blockVertData = V.singleton $ (vertData cLabel) V.! blockIndex + blockVertexResolutionData = V.singleton $ (vertexResolutionData cLabel) V.! blockIndex + newLabel = + cLabel + { vertData = blockVertData + , vertexResolutionData = blockVertexResolutionData + } + in (a, newLabel) + + +{- | makeDisplayEdges tcretes edges in display tree +by taking corresponding edge in canonical tree +edge weight info is not recalculated and is likely incorrect +-} +makeDisplayEdges ∷ [LG.LEdge EdgeInfo] → SimpleGraph → [LG.LEdge EdgeInfo] +makeDisplayEdges canonicalEdgeList displayTree = + let displayEdgeList = LG.labEdges displayTree + in fmap (copyLabelInfo canonicalEdgeList) displayEdgeList + + +-- | copyLabelInfo finds corresponding edge (undirected) and takes relevant data from it to make new labelled edge +copyLabelInfo ∷ [LG.LEdge EdgeInfo] → LG.LEdge Double → LG.LEdge EdgeInfo +copyLabelInfo canonicalEdgeList displayEdge@(u, v, dl) = + if null canonicalEdgeList + then error "Empty edge list in copyLabelInfo" + else + let canonicalEdge = L.find (matchIndices displayEdge) canonicalEdgeList + newLabel = + (thd3 $ fromJust canonicalEdge) + { minLength = dl + , maxLength = dl + , midRangeLength = dl + } + in if isNothing canonicalEdge + then error "Cannot find canonical edge in copyLabelInfo" + else (u, v, newLabel) + where + matchIndices (a, b, _) (a', b', _) = + if a == a' && b == b' + then True + else + if a == b' && b == a' + then True + else False + + +{- | isPhylogeneticDecoratedGraph checks various issues to see if +there is wierdness in graph +-} +isPhylogeneticDecoratedGraph ∷ DecoratedGraph → PhyG Bool +isPhylogeneticDecoratedGraph inGraph = + if LG.isEmpty inGraph + then pure False + else + let nodeList = fmap fst $ LG.labNodes inGraph + indegreeList = fmap (LG.inn inGraph) nodeList + outdegreeList = fmap (LG.out inGraph) nodeList + in if LG.hasDuplicateEdgesNub inGraph + then pure False + else + if length (LG.getRoots inGraph) /= 1 + then pure False + else + if LG.outdeg inGraph (head $ LG.getRoots inGraph) /= 2 + then pure False + else + if (not . null) (LG.getIsolatedNodes inGraph) + then pure False + else + if (not . null) (filter ((> 2) . length) indegreeList) + then pure False + else + if (not . null) (filter ((> 2) . length) outdegreeList) + then pure False + else + if parentsInChainGraph inGraph + then pure False + else do + consistent ← LG.isGraphTimeConsistent inGraph + if not consistent + then pure False + else pure True + + +{- | parentsInChainGraph checks all network vertices in graph to see if +parents in any network vertex is ancestor of other +-} +parentsInChainGraph ∷ DecoratedGraph → Bool +parentsInChainGraph inGraph = + if LG.isEmpty inGraph + then False + else + let (_, _, _, netVertexList) = LG.splitVertexList inGraph + parentChainList = fmap (parentsInChainVertex inGraph) $ fmap fst netVertexList + in -- if ((not . null) $ filter (== True) parentChainList) then traceNoLF ("NPDG+ ") $ ((not . null) $ filter (== True) parentChainList) + -- else traceNoLF ("NPDG- ") $ (not . null) $ filter (== True) parentChainList + (not . null) $ filter (== True) parentChainList + + +{- | parentsInChainVertex checks for a network vertex if +one parent is ancestor of other +retuens false if not net vertex +-} +parentsInChainVertex ∷ DecoratedGraph → LG.Node → Bool +parentsInChainVertex inGraph inNode = + if LG.isEmpty inGraph + then False + else + let parentNodes = LG.labParents inGraph inNode + firstParent = head parentNodes + secondParent = last parentNodes + firstBV = bvLabel $ snd firstParent + secondBV = bvLabel $ snd secondParent + oneAncOther = (firstBV .&. secondBV) `elem` [firstBV, secondBV] + in if length parentNodes < 2 + then False + else + if oneAncOther + then True + else -- traceNoLF ("PCV:" <> (show (firstBV, secondBV, firstBV .&. secondBV))) + False + + +{- | ReducedphylogeneticGraphListMinus subtracts teh secoind argiument list from first +if an element is multiple times in firt list each will be removed +equality comparison is based on String rep of graphs vertes and edges (prettyVertices) +does not take cost into account--or edge weight--only topology. +result like (minuendList - subtrahendList) +-} +reducedphylogeneticGraphListMinus ∷ [ReducedPhylogeneticGraph] → [ReducedPhylogeneticGraph] → [ReducedPhylogeneticGraph] +reducedphylogeneticGraphListMinus minuendList subtrahendList + | null minuendList = [] + | null subtrahendList = minuendList + | otherwise = + let minuendSimpleStringList = fmap (LG.prettyIndices . fst5) minuendList + subtrahendSinpleStringList = fmap (LG.prettyIndices . fst5) subtrahendList + inSubtrahendList = fmap (`elem` subtrahendSinpleStringList) minuendSimpleStringList + differenceList = fmap fst $ filter (not . snd) $ zip minuendList inSubtrahendList + in differenceList + + +{- | phylogeneticGraphListMinus subtracts teh secoind argiument list from first +if an element is multiple times in firt list each will be removed +equality comparison is based on String rep of graphs vertes and edges (prettyVertices) +does not take cost into account--or edge weight--only topology. +result like (minuendList - subtrahendList) +-} +phylogeneticGraphListMinus ∷ [PhylogeneticGraph] → [PhylogeneticGraph] → [PhylogeneticGraph] +phylogeneticGraphListMinus minuendList subtrahendList + | null minuendList = [] + | null subtrahendList = minuendList + | otherwise = + let minuendSimpleStringList = fmap (LG.prettyIndices . fst6) minuendList + subtrahendSinpleStringList = fmap (LG.prettyIndices . fst6) subtrahendList + inSubtrahendList = fmap (`elem` subtrahendSinpleStringList) minuendSimpleStringList + differenceList = fmap fst $ filter (not . snd) $ zip minuendList inSubtrahendList + in differenceList + + +-- | makeNewickList takes a list of fgl trees and outputs a single String cointaining the graphs in Newick format +makeNewickList ∷ Bool → Bool → Bool → Int → [SimpleGraph] → [VertexCost] → String +makeNewickList isTNT writeEdgeWeight writeNodeLabel' rootIndex graphList costList = + let allTrees = L.foldl' (&&) True (fmap LG.isTree graphList) + + -- check for network HTU label requirement + writeNodeLabel + | allTrees = writeNodeLabel' + | writeNodeLabel' = writeNodeLabel' + | otherwise -- trace "HTU labels are required for ENewick Output" + = + True + + graphString = GFU.fglList2ForestEnhancedNewickString (fmap (LG.rerootTree rootIndex) graphList) writeEdgeWeight writeNodeLabel + newickStringList = fmap init $ filter (not . null) $ lines graphString + costStringList = + if not isTNT + then fmap ((('[' :) . (<> "];\n")) . show) costList + else L.replicate (length costList) "*\n" + + graphStringCost = concat $ zipWith (<>) newickStringList costStringList + graphStringCostTNT = fmap commaToSpace graphStringCost + + graphStringCost' = + if not isTNT + then graphStringCost + else (take (length graphStringCostTNT - 2) graphStringCostTNT) <> ";\n" + in graphStringCost' + where + commaToSpace a = if a /= ',' then a else ' ' + + +{- | convertGeneralGraphToPhylogeneticGraph inputs a SimpleGraph and converts it to a Phylogenetic graph by: + 1) transitive reduction -- removes anc <-> desc netork edges + 2) ladderizes -- all vertices are (in degree, outdegree) (0,1|2) (1,2) (2,1) (1,0) + by adding extra HTIs and edges + 3) checks time consistency and removes edges stepwise from + those that violate the most before/after splits of network edges + arbitrary but deterministic + 4) contracts out any remaning indegree 1 outdegree 1 nodes and renames HTUs in order +these tests can be screwed up by imporperly formated graphs comming in (self edges, chained network edge etc) +-} +convertGeneralGraphToPhylogeneticGraph ∷ Bool → SimpleGraph → PhyG SimpleGraph +convertGeneralGraphToPhylogeneticGraph correct inGraph = + if LG.isEmpty inGraph + then pure LG.empty + else + let -- remove single "tail" edge from root with single child, replace child node with root + noTailGraph = LG.contractRootOut1Edge inGraph + + -- remove non-leaf nodes (index > root) with outdegree 0 + nonNonLeafOut0 = LG.removeNonLeafOut0NodesAfterRoot noTailGraph + + -- remove indeg 1 out deg 1 edges + noIn1Out1Graph = contractIn1Out1EdgesRename nonNonLeafOut0 -- noTailGraph + + -- transitive reduction + -- only wanted to EUN and CUN--but they do it + -- reducedGraph = LG.transitiveReduceGraph noIn1Out1Graph + + -- caused problems at one point + -- laderization of indegree and outdegree edges + ladderGraph = ladderizeGraph noIn1Out1Graph -- reducedGraph + in do + -- time consistency (after those removed by transitrive reduction) + timeConsistentGraph ← makeGraphTimeConsistent correct ladderGraph + + -- removes parent child network edges + let noChainedGraph = LG.removeChainedNetworkNodes False timeConsistentGraph + + -- removes ancestor descendent edges transitiveReduceGraph should do this + -- but that looks at all nodes not just vertex + let noParentChainGraph = removeParentsInChain correct $ fromJust noChainedGraph -- timeConsistentGraph -- + + -- deals the nodes with all network children + let noTreeEdgeGraph = LG.removeTreeEdgeFromTreeNodeWithAllNetworkChildren noParentChainGraph + + -- remove sister-sister edge. where two network nodes have same parents + let noSisterSisterGraph = removeSisterSisterEdges correct noTreeEdgeGraph + + -- remove and new zero nodes + let finalGraph = LG.removeNonLeafOut0NodesAfterRoot noSisterSisterGraph + + if LG.isEmpty timeConsistentGraph + then pure LG.empty + else + if isNothing noChainedGraph + then pure LG.empty + else + if LG.isEmpty noParentChainGraph + then pure LG.empty + else + if LG.isEmpty noSisterSisterGraph + then pure LG.empty + else -- trace ("CGP orig:\n" <> (LG.prettify inGraph) <> "\nNew:" <> (LG.prettify timeConsistentGraph)) + -- cycle check to make sure--can be removed when things working + -- else if LG.cyclic noSisterSisterGraph then error ("Cycle in graph : \n" <> (LG.prettify noSisterSisterGraph)) + + -- this final need to ladderize or recontract? + + if finalGraph == inGraph + then pure finalGraph + else convertGeneralGraphToPhylogeneticGraph correct finalGraph + + +-- | removeParentsInChain checks the parents of each netowrk node are not anc/desc of each other +removeParentsInChain ∷ Bool → SimpleGraph → SimpleGraph +removeParentsInChain correct inGraph = + if LG.isEmpty inGraph + then LG.empty + else + let (_, _, _, netVertexList) = LG.splitVertexList inGraph + parentNetVertList = fmap (LG.labParents inGraph . fst) netVertexList + + -- get list of nodes that are transitively equal in age + concurrentList = LG.mergeConcurrentNodeLists parentNetVertList [] + concurrentPairList = concatMap getListPairs concurrentList + + -- get pairs that violate concurrency + violatingConcurrentPairs = concatMap (LG.concurrentViolatePair inGraph) concurrentPairList + + -- get network nodes with violations + parentNodeViolateList = concatMap pairToList violatingConcurrentPairs + childNodeViolateList = concatMap (LG.descendants inGraph) parentNodeViolateList + netNodeViolateList = filter (LG.isNetworkNode inGraph) childNodeViolateList + + netEdgesThatViolate = fmap LG.toEdge $ LG.inn inGraph $ head netNodeViolateList + in if null violatingConcurrentPairs + then inGraph + else + if null netNodeViolateList + then error "Should be neNode that violate" + else + if null netEdgesThatViolate + then error "Should be violating in edges" + else + if not correct + then LG.empty + else + let edgeDeletedGraph = LG.delEdge (head netEdgesThatViolate) inGraph + newGraph = contractIn1Out1EdgesRename edgeDeletedGraph + in -- trace ("PIC") + removeParentsInChain correct newGraph + where + pairToList (a, b) = [fst a, fst b] + + +{- | removeSisterSisterEdges takes a graph and recursively removes a single edge fomr where two network +edges have the same two parents +-} +removeSisterSisterEdges ∷ Bool → SimpleGraph → SimpleGraph +removeSisterSisterEdges correct inGraph = + if LG.isEmpty inGraph + then LG.empty + else + let sisterSisterEdges = LG.getSisterSisterEdgeList inGraph + -- newGraph = LG.delEdge (head sisterSisterEdges) inGraph + newGraph = LG.delEdges sisterSisterEdges inGraph + newGraph' = contractIn1Out1EdgesRename newGraph + in if null sisterSisterEdges + then inGraph + else + if not correct + then LG.empty + else -- trace ("Sister") + -- removeSisterSisterEdges + newGraph' + + +{- | makeGraphTimeConsistent takes laderized, transitive reduced graph and deletes +network edges in an arbitrary but deterministic sequence to produce a phylogentic graphs suitable +for swapping etc +looks for violation of time between netork edges based on "before" and "after" +tests of nodes that should be potentially same age +removes second edge of second pair of two network edges in each case adn remakes graph +strict paralle since will need each recursive run +-} +makeGraphTimeConsistent ∷ Bool → SimpleGraph → PhyG SimpleGraph +makeGraphTimeConsistent correct inGraph + | LG.isEmpty inGraph = pure LG.empty + | LG.isTree inGraph = pure inGraph + | otherwise = + let coevalNodeConstraintList = LG.coevalNodePairs inGraph + -- parallel setup + -- action :: (Show a,Eq a,Eq b) => (LG.LNode a, LG.LNode a) -> (LG.LNode a, LG.LNode a, [LG.LNode a], [LG.LNode a], [LG.LNode a], [LG.LNode a]) + action = LG.addBeforeAfterToPair inGraph + in do + pTraverse ← getParallelChunkMap + let coevalNodeConstraintList' = pTraverse action coevalNodeConstraintList + -- PU.seqParMap PU.myStrategyRDS (LG.addBeforeAfterToPair inGraph) coevalNodeConstraintList -- `using` PU.myParListChunkRDS + let coevalPairsToCompareList = getListPairs coevalNodeConstraintList' + let timeOffendingEdgeList = LG.getEdgesToRemoveForTime inGraph coevalPairsToCompareList + let newGraph = LG.delEdges timeOffendingEdgeList inGraph + -- trace ("MGTC:" <> (show timeOffendingEdgeList)) + if (not correct) && (not . null) timeOffendingEdgeList + then pure LG.empty + else pure $ contractIn1Out1EdgesRename newGraph + + +{- | contractIn1Out1EdgesRename contracts in degree and outdegree edges and renames HTUs in index order +does one at a time and makes a graph and recurses +-} +contractIn1Out1EdgesRename ∷ SimpleGraph → SimpleGraph +contractIn1Out1EdgesRename inGraph = + if LG.isEmpty inGraph + then LG.empty + else + let newGraph = LG.contractIn1Out1Edges inGraph + in renameSimpleGraphNodes newGraph + + +-- | renameSimpleGraphNodes takes nodes and renames HTU nodes based on index +renameSimpleGraphNodes ∷ SimpleGraph → SimpleGraph +renameSimpleGraphNodes inGraph + | LG.isEmpty inGraph = LG.empty + | otherwise = + let inNodes = LG.labNodes inGraph + nodeLabels = fmap (makeSimpleLabel inGraph) inNodes + newNodes = zip (fmap fst inNodes) nodeLabels + newEdges = LG.labEdges inGraph + makeSimpleLabel g (a, b) + | not $ LG.isLeaf g a = T.pack $ "HTU" <> show a + | otherwise = b + in LG.mkGraph newNodes newEdges + + +-- | renameSimpleGraphNodesString takes nodes and renames HTU nodes based on index +renameSimpleGraphNodesString ∷ LG.Gr String String → LG.Gr String String +renameSimpleGraphNodesString inGraph = + if LG.isEmpty inGraph + then LG.empty + else + let inNodes = LG.labNodes inGraph + nodeLabels = fmap (makeSimpleLabel inGraph) inNodes + newNodes = zip (fmap fst inNodes) nodeLabels + newEdges = LG.labEdges inGraph + in -- newGraph + -- trace ("C11: " <> (show $ LG.getIsolatedNodes newGraph) <> " => " <> (show newNodes) <> " " <> (show $ fmap LG.toEdge newEdges)) + LG.mkGraph newNodes newEdges + where + makeSimpleLabel g (a, b) = + if not $ LG.isLeaf g a + then "HTU" <> show a + else b + + +-- | sortEdgeListByLength sorts edge list by length (midRange), highest to lowest +sortEdgeListByLength ∷ [LG.LEdge EdgeInfo] → [LG.LEdge EdgeInfo] +sortEdgeListByLength inEdgeList = + if null inEdgeList + then [] + else L.sortOn (Data.Ord.Down . midRangeLength . thd3) inEdgeList + + +{- | ladderizeGraph is a wrapper around ladderizeGraph' to allow for mapping with +local nodelist +-} +ladderizeGraph ∷ SimpleGraph → SimpleGraph +ladderizeGraph inGraph = ladderizeGraph' inGraph (LG.nodes inGraph) + + +{- | ladderize takes an input graph and ensures/creates nodes +such that all vertices are (indegree, outdegree) (0,>0), (1,2) (2,1) (1,0) +ladderizeGraph' :: SimpleGraph -> [LG.Node] -> SimpleGraph +-} +ladderizeGraph' ∷ SimpleGraph → [LG.Node] → SimpleGraph +ladderizeGraph' inGraph nodeList + | LG.isEmpty inGraph = LG.empty + | null nodeList = inGraph + | otherwise = + let -- these are roots, network, tree, leaf nodes + okNodeDegrees = [(0, 2), (1, 2), (2, 1), (1, 0)] + firstNode = head nodeList + (inEdgeList, outEdgeList) = LG.getInOutEdges inGraph firstNode + inOutPairLength = (length inEdgeList, length outEdgeList) + in -- trace ("node " <> (show firstNode) <> " " <> (show inOutPairLength)) ( + -- node ok to keep + if inOutPairLength `elem` okNodeDegrees + then ladderizeGraph' inGraph (tail nodeList) + else -- node edges need modification + + let newGraph = resolveNode inGraph firstNode (inEdgeList, outEdgeList) inOutPairLength + in -- trace ("resolving " <> "node " <> (show firstNode) <> " " <> (show inOutPairLength) ) + ladderizeGraph' newGraph (LG.nodes newGraph) + + +-- ) + +{- | resolveNode takes a graph and node and inbound edgelist and outbound edge list +and converts node to one of (indeg, outdeg) (0,1),(0,2),(1,2),(2,1),(1,0) +this only resolves a single nodes edges at a time and then returns new graph +when more hase to be done--that will occur on lultiple passes through nodes. +perhaps not the most efficient, but only done once per input graph +contracts indegree 1 outdegree 1 nodes +-} +resolveNode ∷ SimpleGraph → LG.Node → ([LG.LEdge Double], [LG.LEdge Double]) → (Int, Int) → SimpleGraph +resolveNode inGraph curNode inOutPair@(inEdgeList, outEdgeList) (inNum, outNum) = + if LG.isEmpty inGraph + then LG.empty + else -- trace ("Resolving " <> show (inNum, outNum)) ( + + let numNodes = length $ LG.nodes inGraph + in -- isolated node -- throw warning and delete + if inNum == 0 && outNum == 0 + then -- this is debug info + -- trace ("Warning: ResolveNode deleting isolated vertex " <> show curNode) ( -- <> " in graph\n" <> LG.prettify inGraph ) + + let newGraph = LG.delNode curNode inGraph + in newGraph + else -- node with too many parents and too many children + -- converts to tree node--biased in that direction + + if (inNum > 2) && (outNum > 2) + then + let first2Edges = take 2 inEdgeList + newNode = (numNodes, T.pack ("HTU" <> show numNodes)) + newEdge1 = (fst3 $ head first2Edges, numNodes, 0.0 ∷ Double) + newEdge2 = (fst3 $ last first2Edges, numNodes, 0.0 ∷ Double) + newEdge3 = (numNodes, curNode, 0.0 ∷ Double) + newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph + in newGraph + else -- leaf leaf with too many parents + + if (inNum > 1) && (outNum == 0) || (inNum > 2) && (outNum == 1) || (inNum > 1) && (outNum == 2) + then + let first2Edges = take 2 inEdgeList + newNode = (numNodes, T.pack ("HTU" <> show numNodes)) + newEdge1 = (fst3 $ head first2Edges, numNodes, 0.0 ∷ Double) + newEdge2 = (fst3 $ last first2Edges, numNodes, 0.0 ∷ Double) + newEdge3 = (numNodes, curNode, 0.0 ∷ Double) + newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph + in newGraph + else -- indegree 1 outdegree 1 node to contract + + if inNum == 1 && outNum == 1 + then + let newEdge = (fst3 $ head inEdgeList, snd3 $ head outEdgeList, 0.0 ∷ Double) + newGraph = LG.insEdge newEdge $ LG.delNode curNode $ LG.delLEdges (inEdgeList <> outEdgeList) inGraph + in newGraph + else + if inNum < 2 || outNum > 2 + then + let first2Edges = take 2 outEdgeList + newNode = (numNodes, T.pack ("HTU" <> show numNodes)) + newEdge1 = (numNodes, snd3 $ head first2Edges, 0.0 ∷ Double) + newEdge2 = (numNodes, snd3 $ last first2Edges, 0.0 ∷ Double) + newEdge3 = (curNode, numNodes, 0.0 ∷ Double) + newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph + in newGraph + else -- root or simple network indegree node + + if inNum == 0 || outNum > 2 + then + let first2Edges = take 2 outEdgeList + newNode = (numNodes, T.pack ("HTU" <> show numNodes)) + newEdge1 = (numNodes, snd3 $ head first2Edges, 0.0 ∷ Double) + newEdge2 = (numNodes, snd3 $ last first2Edges, 0.0 ∷ Double) + newEdge3 = (curNode, numNodes, 0.0 ∷ Double) + newGraph = LG.insEdges [newEdge1, newEdge2, newEdge3] $ LG.delLEdges first2Edges $ LG.insNode newNode inGraph + in newGraph + else -- check if indegree 0 is a leaf (ie index < root) + + if outNum == 0 + then -- get root index + + let rootIndex = fst $ head $ LG.getRoots inGraph + in if curNode < rootIndex + then inGraph + else LG.delNode curNode inGraph + else + error + ( "This can't happen in resolveNode in/out edge lists don't need to be resolved " <> show inOutPair <> "\n" <> LG.prettify inGraph + ) + + +-- ) + +{- | convertSimpleToDecoratedGraph takes a sinple graph and creates a Decorated graph +but with dummy info--basically just with the correct type and structures +-} +convertSimpleToDecoratedGraph ∷ SimpleGraph → DecoratedGraph +convertSimpleToDecoratedGraph inSimple = + if LG.isEmpty inSimple + then LG.empty + else + let simpleNodeList = LG.labNodes inSimple + simpleEdgeList = LG.labEdges inSimple + decNodeList = fmap simpleNodeToDecorated simpleNodeList + decEdgeList = fmap simpleEdgeToDecorated simpleEdgeList + in LG.mkGraph decNodeList decEdgeList + + +-- | simpleNodeToDecorated takes a simple node and cretes a decoraetd node with info available +simpleNodeToDecorated ∷ LG.LNode T.Text → LG.LNode VertexInfo +simpleNodeToDecorated (indexNode, nameNode) = + -- probbaly need to add other info--but cn;t get all of it (e..g. BV and costs) + ( indexNode + , emptyVertexInfo + { index = indexNode + , vertName = nameNode + } + ) + + +-- | simpleEdgeToDecorated takes a Double edge label and returns EdgInfo +simpleEdgeToDecorated ∷ LG.LEdge Double → LG.LEdge EdgeInfo +simpleEdgeToDecorated (a, b, weightDouble) = (a, b, dummyEdge{minLength = weightDouble, maxLength = weightDouble, midRangeLength = weightDouble}) + +{-convertDecoratedToSimpleGraphBranchLength convertgs decorated to simple graph but allows + specification of edge/branch weight/length ("min", "max", "mid") -} +convertDecoratedToSimpleGraphBranchLength :: String -> DecoratedGraph → SimpleGraph +convertDecoratedToSimpleGraphBranchLength branchWeight inDec = + if LG.isEmpty inDec then LG.empty + else if branchWeight `notElem` ["min", "max", "mid"] then + errorWithoutStackTrace ("Edge/Branch weight/length not recognized ('min', 'max', 'mid') : " <> branchWeight) + else + let edgeWeight = if branchWeight == "min" then minLength + else if branchWeight == "max" then maxLength + else midRangeLength + + decNodeList = LG.labNodes inDec + newNodeLabels = fmap (vertName . snd) decNodeList + simpleNodes = zip (fmap fst decNodeList) newNodeLabels + labEdgeList = LG.labEdges inDec + edgeWeightList = fmap (edgeWeight . thd3) labEdgeList + simpleEdgeList = fmap convertToSimpleEdge (zip edgeWeightList labEdgeList) + in LG.mkGraph simpleNodes simpleEdgeList + + +-- | convertDecoratedToSimpleGraph takes a decorated graph and returns the simple graph equivalent +convertDecoratedToSimpleGraph ∷ DecoratedGraph → SimpleGraph +convertDecoratedToSimpleGraph inDec = convertDecoratedToSimpleGraphBranchLength "min" inDec + +-- | convertToSimpleEdge takes a lables edge and relabels with input +convertToSimpleEdge ∷ (VertexCost, LG.LEdge EdgeInfo) → LG.LEdge Double +convertToSimpleEdge (edgeWeight, (a, b, _)) = (a, b, edgeWeight) + + +{- | graphCostFromNodes takes a Decorated graph and returns its cost by summing up the local costs + of its nodes +-} +graphCostFromNodes ∷ DecoratedGraph → Double +graphCostFromNodes inGraph = + if LG.isEmpty inGraph + then 0.0 + else sum $ fmap (vertexCost . snd) (LG.labNodes inGraph) + +-- | dichotomizeRoot takes greaph and dichotimizes not dichotomous roots in graph +dichotomizeRoot ∷ Int → SimpleGraph → SimpleGraph +dichotomizeRoot lOutgroupIndex inGraph = + if LG.isEmpty inGraph + then LG.empty + else + let rootList = LG.getRoots inGraph + currentRoot = fst $ head rootList + rootEdgeList = LG.out inGraph currentRoot + in -- not a tree error + if length rootList /= 1 + then error ("Graph input to dichotomizeRoot is not a tree--not single root:" <> show rootList) + else -- nothing to do + + if length rootEdgeList < 3 + then inGraph + else + let numVertices = length $ LG.nodes inGraph + newNode = (numVertices, T.pack $ show numVertices) + edgesToDelete = filter ((/= lOutgroupIndex) . snd3) rootEdgeList + newEdgeDestinations = fmap snd3 edgesToDelete + newEdgeStarts = replicate (length newEdgeDestinations) numVertices + newEdgeLabels = replicate (length newEdgeDestinations) 0.0 + -- nub for case where root edge in "wrong" direction + -- doesn't filter edges to delete properly + newEdgesNewNode = L.nub $ zip3 newEdgeStarts newEdgeDestinations newEdgeLabels + newRootEdge = (currentRoot, numVertices, 0.0) + in LG.delLEdges edgesToDelete $ LG.insEdges (newRootEdge : newEdgesNewNode) $ LG.insNode newNode inGraph + + +-- | showBlockGraphs takes a vector of vector of DecoratedGraphs and converte and prettifies outputting a String +showDecGraphs ∷ V.Vector (V.Vector DecoratedGraph) → String +showDecGraphs inDecVV = + if V.null inDecVV + then [] + else concatMap concat (V.toList $ fmap ((V.toList . fmap LG.prettify) . fmap convertDecoratedToSimpleGraph) inDecVV) + + +{- | +A wrapper around selectGraphsFull for ReducedPhylogeneticGraph. +-} +selectGraphs ∷ SelectGraphType → Int → Double → [ReducedPhylogeneticGraph] → PhyG [ReducedPhylogeneticGraph] +selectGraphs selectType numberToKeep threshold inGraphList = + let convertReduced2GenPhyloGraph (a, b, c, d, f) = (a, b, c, d, mempty, f) + + convertGenPhyloGraph2Reduced (a, b, c, d, _, f) = (a, b, c, d, f) + + fullPhyloGraphList = convertReduced2GenPhyloGraph <$> inGraphList + + newFullGraphs = selectGraphsFull selectType numberToKeep threshold fullPhyloGraphList + in fmap convertGenPhyloGraph2Reduced <$> newFullGraphs + + +-- Basically a Phylogenetic Graph with abstract graph types--can't seem to get a type with this to compile +-- (SimpleGraph, VertexCost, LG.Gr a b, V.Vector [LG.Gr a b], V.Vector (V.Vector (LG.Gr a b)), V.Vector (V.Vector CharInfo)) + +{- | +A wrapper around selectPhylogeneticGraph with a better interface. +-} +selectGraphsFull + ∷ SelectGraphType + → Int + → Double + → [GenPhyloGraph a b] + → PhyG [GenPhyloGraph a b] +selectGraphsFull selectType numberToKeep threshold = \case + [] → pure [] + inGraphList → + let stringArgs + | threshold > 0.0 = ("threshold", show threshold) + | otherwise = case selectType of + Best → ("best", "") + Unique → ("unique", "") + AtRandom → ("atrandom", "") + All → ("all", "") + _ → ("best", "") + in take numberToKeep <$> selectPhylogeneticGraph [stringArgs] [] inGraphList + + +{- | +A wrapper for ReducedPhylogeneticGraph. + +/Note: Inefficient in conversions./ +-} +selectPhylogeneticGraphReduced ∷ [Argument] → [ReducedPhylogeneticGraph] → PhyG [ReducedPhylogeneticGraph] +selectPhylogeneticGraphReduced inArgs curGraphs = + let phylographList = fmap convertReduced2PhylogeneticGraph curGraphs + in do + selectedPhylographs ← selectPhylogeneticGraph inArgs [] phylographList + pure $ convertPhylogeneticGraph2Reduced <$> selectedPhylographs + + +{- | selectPhylogeneticGraph takes a series OF arguments and an input list ot PhylogeneticGraphs +and returns or filters that list based on options. +uses selectListCostPairs in GeneralUtilities +-} +selectPhylogeneticGraph + ∷ [Argument] + → [String] + → [GenPhyloGraph a b] + → PhyG [GenPhyloGraph a b] +selectPhylogeneticGraph inArgs _ [] = pure [] +selectPhylogeneticGraph inArgs _ curGraphs = + let fstArgList = fmap (fmap C.toLower . fst) inArgs + sndArgList = fmap (fmap C.toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "select" fstArgList V.selectArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'select': " <> show inArgs) + else + if length inArgs > 1 + then errorWithoutStackTrace ("Can only have a single select type per command: " <> show inArgs) + else + let doBest = keyExists "best" lcArgList + doAll = keyExists "all" lcArgList + doRandom = keyExists "atrandom" lcArgList + doUnique = keyExists "unique" lcArgList + doThreshold = keyExists "threshold" lcArgList + + thresholdList = filter ((== "threshold") . fst) lcArgList + thresholdParam + | length thresholdList > 1 = + errorWithoutStackTrace ("Multiple 'threshold' number specifications in select command--can have only one: " <> show inArgs) + | null thresholdList = Just 0.1 + | otherwise = readMaybe (snd $ head thresholdList) ∷ Maybe Double + + nonThresholdList = filter ((/= "threshold") . fst) lcArgList + numberToKeep + | length nonThresholdList > 1 = + errorWithoutStackTrace + ("Multiple 'best/unique/atRandom' number specifications in select command--can have only one: " <> show inArgs) + | null nonThresholdList = Just (maxBound ∷ Int) + | null (snd $ head nonThresholdList) = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head nonThresholdList) ∷ Maybe Int + in case numberToKeep of + Nothing → + failWithPhase Computing $ + "Keep specification not an integer in select: " <> show (snd $ head nonThresholdList) <> show lcArgList + Just keep → case thresholdParam of + Nothing → failWithPhase Computing $ "Threshold specification not a float in select: " <> show (snd $ head thresholdList) + Just _ | doAll → pure curGraphs + Just threshold → + let -- minimum graph cost + minGraphCost = minimum $ fmap snd6 curGraphs + + -- collapse zero-length branchs for unique + curGraphsCollapsed = fmap U.collapseGraph curGraphs + + -- keep only unique graphs based on non-zero edges--in sorted by cost + uniqueGraphList = L.sortOn snd6 $ getUniqueGraphs'' (zip curGraphs curGraphsCollapsed) -- curGraphs -- True curGraphs -- getBVUniqPhylogeneticGraph True curGraphs -- getTopoUniqPhylogeneticGraph True curGraphs + + -- this to avaoid alot of unncesesary graph comparisons for 'best' graphs + bestCostGraphs = filter ((== minGraphCost) . snd6) curGraphs + uniqueBestGraphs = getUniqueGraphs'' (zip bestCostGraphs (fmap U.collapseGraph bestCostGraphs)) + + result + | doUnique = pure $ take keep uniqueGraphList + | doThreshold = + let baseGraphValue = snd6 $ head uniqueGraphList + thresholdLimit = thresholdValue * baseGraphValue + thresholdValue + | threshold < 0.0 = 1 + | threshold > 1.0 = threshold + | otherwise = 1.0 + threshold + + thresholdGraphList = filter ((<= thresholdLimit) . snd6) uniqueGraphList + in pure thresholdGraphList + | doBest = pure $ take keep uniqueBestGraphs + | doRandom = shuffleList $ take keep curGraphs + | otherwise = pure uniqueBestGraphs -- default is all best and unique + in result + + +{- | getUniqueGraphs takes each pair of non-zero edges and conpares them--if equal not added to list +maybe chnge to nub LG.pretify graphList? +-} +getUniqueGraphs + ∷ Bool + → [GenPhyloGraph a b] + → [GenPhyloGraph a b] +getUniqueGraphs removeZeroEdges inGraphList = + if null inGraphList + then [] + else + let inGraphEdgeList = + if removeZeroEdges + then fmap ((filter ((> 0.0) . minLength . thd3) . LG.labEdges) . thd6) inGraphList + else fmap (LG.labEdges . thd6) inGraphList + in getUniqueGraphs' (zip inGraphEdgeList inGraphList) [] + + +{- | getUniqueGraphs Using fgl == +basically a nub +need to add a collapse function for compare as well +takes pairs of (noCollapsed, collapsed) phylogenetic graphs, +make strings based on collapsed and returns not collpased +-} +getUniqueGraphs'' + ∷ [(GenPhyloGraph a b, GenPhyloGraph a b)] + → [GenPhyloGraph a b] +getUniqueGraphs'' = nubGraph [] + + +{- | isNovelGraph checks if a graph is in list of existing graphs +uses colapsed representation +-} +isNovelGraph + ∷ [GenPhyloGraph a b] + → GenPhyloGraph a b + → Bool +isNovelGraph graphList testGraph = + null graphList + || ( let collapsedInGraph = (LG.prettyIndices . fst6 . U.collapseGraph) testGraph + collapseGraphList = fmap (LG.prettyIndices . fst6 . U.collapseGraph) graphList + matchList = filter (== collapsedInGraph) collapseGraphList + in -- trace ("IsNovel: " <> (show $ null matchList)) + null matchList + ) + + +{- | keeps and returns unique graphs based on Eq of Topological Simple Graph +String prettyIndices w/0 HTU names and branch lengths +arbitrarily rooted on 0 for oonsistency +reversed to keep original order in case sorted on length +-} +nubGraph + ∷ [(GenPhyloGraph a b, GenPhyloGraph a b, String)] + → [(GenPhyloGraph a b, GenPhyloGraph a b)] + → [GenPhyloGraph a b] +nubGraph curList inList = + if null inList + then reverse $ fmap fst3 curList + else + let (firstGraphNC, firstGraphC) = head inList + + -- nub on newick topology only--should be collapsed already + firstString = makeNewickList False False False (fst $ head $ LG.getRoots $ fst6 firstGraphNC) [fst6 firstGraphNC] [snd6 firstGraphNC] + + -- nub on prettty string + -- firstString = LG.prettyIndices $ thd6 firstGraphNC + isMatch = filter (== firstString) (fmap thd3 curList) + in -- trace ("NG: " <> (show $ null isMatch) <> "->" <> firstString) $ + if null curList + then nubGraph [(firstGraphNC, firstGraphC, firstString)] (tail inList) + else + if null isMatch + then nubGraph ((firstGraphNC, firstGraphC, firstString) : curList) (tail inList) + else nubGraph curList (tail inList) + + +-- ) + +-- | getUniqueGraphs takes each pair of non-zero edges and compares them--if equal not added to list +getUniqueGraphs' + ∷ [([LG.LEdge EdgeInfo], GenPhyloGraph a b)] + → [([LG.LEdge EdgeInfo], GenPhyloGraph a b)] + → [GenPhyloGraph a b] +getUniqueGraphs' inGraphPairList currentUniquePairs = + if null inGraphPairList + then fmap snd currentUniquePairs + else + let firstPair@(firstEdges, _) = head inGraphPairList + in if null currentUniquePairs + then getUniqueGraphs' (tail inGraphPairList) [firstPair] + else + let equalList = filter id $ fmap ((== firstEdges) . fst) currentUniquePairs + in if null equalList + then getUniqueGraphs' (tail inGraphPairList) (firstPair : currentUniquePairs) + else getUniqueGraphs' (tail inGraphPairList) currentUniquePairs + + +-- | getNodeType returns node type for Node +getNodeType ∷ (Show a, Show b) ⇒ LG.Gr a b → LG.Node → NodeType +getNodeType inGraph inNode + | not $ LG.gelem inNode inGraph = error ("Node " <> show inNode <> " not in graph\n" <> GFU.showGraph inGraph) + | LG.isLeaf inGraph inNode = LeafNode + | LG.isTreeNode inGraph inNode = TreeNode + | LG.isNetworkNode inGraph inNode = NetworkNode + | LG.isRoot inGraph inNode = RootNode + | LG.isIn1Out1 inGraph inNode = In1Out1 + | otherwise = error ("Node type " <> show inNode <> " not Leaf, Tree, Network, or Root in graph\n" <> GFU.showGraph inGraph) + + +{- | copyIAFinalToPrelim takes a Decorated graph and copies +the IA final fields to preliminary IA states--this for IA only optimization +inswapping and other operations. This is done because the "preliminary" IA states +are only known after full post/pre traversals +-} +copyIAFinalToPrelim ∷ DecoratedGraph → DecoratedGraph +copyIAFinalToPrelim inGraph = + if LG.isEmpty inGraph + then error "Empty input graph in copyIAFinalToPrelim" + else + let nodes = LG.labNodes inGraph + edges = LG.labEdges inGraph + newNodes = fmap makeIAPrelimFromFinal nodes + in LG.mkGraph newNodes edges + + +{- | makeIAPrelimFromFinal updates the label of a node for IA states +setting preliminary to final +-} +makeIAPrelimFromFinal ∷ LG.LNode VertexInfo → LG.LNode VertexInfo +makeIAPrelimFromFinal (inIndex, label) = + let labData = vertData label + newLabData = fmap (fmap f) labData + in (inIndex, label{vertData = newLabData}) + where + f c + | GV.null (slimIAFinal c) && GV.null (wideIAFinal c) && GV.null (hugeIAFinal c) = c + | not $ GV.null $ slimIAFinal c = c{slimIAPrelim = M.makeDynamicCharacterFromSingleVector $ slimIAFinal c} + | not $ GV.null $ wideIAFinal c = c{wideIAPrelim = M.makeDynamicCharacterFromSingleVector $ wideIAFinal c} + | otherwise = c{hugeIAPrelim = M.makeDynamicCharacterFromSingleVector $ hugeIAFinal c} + + +{- | copyIAPrelimToFinal takes a Decorated graph and copies +the IA prelim fields to final IA states--this for IA only optimization +inswapping and other operations. THis is fdone for root and leaf vertices +-} +copyIAPrelimToFinal ∷ DecoratedGraph → DecoratedGraph +copyIAPrelimToFinal inGraph = + if LG.isEmpty inGraph + then error "Empty input graph in copyIAFinalToPrelim" + else + let nodes = LG.labNodes inGraph + edges = LG.labEdges inGraph + newNodes = fmap makeIAFinalFromPrelim nodes + in LG.mkGraph newNodes edges + + +{- | makeIAFinalFomPrelim updates the label of a node for IA states +setting final to preliminary +-} +makeIAFinalFromPrelim ∷ LG.LNode VertexInfo → LG.LNode VertexInfo +makeIAFinalFromPrelim (inIndex, label) = + let labData = vertData label + newLabData = fmap (fmap f) labData + in (inIndex, label{vertData = newLabData}) + where + f c = + let newSlimIAFinal = extractMediansGapped $ slimIAPrelim c + newWideIAFinal = extractMediansGapped $ wideIAPrelim c + newHugeIAFinal = extractMediansGapped $ hugeIAPrelim c + in if GV.null (snd3 $ slimIAPrelim c) && GV.null (snd3 $ wideIAPrelim c) && GV.null (snd3 $ hugeIAPrelim c) + then c + else + if not $ GV.null $ snd3 $ slimIAPrelim c + then c{slimIAFinal = newSlimIAFinal} + else + if not $ GV.null $ snd3 $ wideIAPrelim c + then c{wideIAFinal = newWideIAFinal} + else c{hugeIAFinal = newHugeIAFinal} + + +{- | getTopoUniqPhylogeneticGraph takes a list of phylogenetic graphs and returns +list of topologically unique graphs--operatres on simple graph field +noZeroEdges flag passed to remove zero weight edges +-} +getTopoUniqPhylogeneticGraph ∷ Bool → [PhylogeneticGraph] → [PhylogeneticGraph] +getTopoUniqPhylogeneticGraph nonZeroEdges inPhyloGraphList = + if null inPhyloGraphList + then [] + else + let uniqueBoolList = createUniqueBoolList nonZeroEdges (fmap fst6 inPhyloGraphList) [] + boolPair = zip inPhyloGraphList uniqueBoolList + in (fst <$> filter snd boolPair) + + +-- | createUniqueBoolList creates a list of Bool if graphs are unique--first occurrence is True, others False +createUniqueBoolList ∷ Bool → [SimpleGraph] → [(SimpleGraph, Bool)] → [Bool] +createUniqueBoolList nonZeroEdges inGraphList boolAccum = + if null inGraphList + then reverse $ fmap snd boolAccum + else + let firstGraph = head inGraphList + in if null boolAccum + then createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph, True) : boolAccum) + else + let checkList = filter id $ fmap (topologicalEqual nonZeroEdges firstGraph . fst) boolAccum + in if null checkList + then createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph, True) : boolAccum) + else createUniqueBoolList nonZeroEdges (tail inGraphList) ((firstGraph, False) : boolAccum) + + +{- | topologicalEqual takes two simple graphs and returns True if graphs have same nodes and edges +option to exclude zero weight edges +-} +topologicalEqual ∷ Bool → SimpleGraph → SimpleGraph → Bool +topologicalEqual nonZeroEdges g1 g2 + | LG.isEmpty g1 && LG.isEmpty g2 = True + | LG.isEmpty g1 || LG.isEmpty g2 = False + | otherwise = + let nodesG1 = LG.labNodes g1 + nodesG2 = LG.labNodes g2 + edgesG1 = + if nonZeroEdges + then fmap LG.toEdge $ filter ((> 0) . thd3) $ LG.labEdges g1 + else LG.edges g1 + edgesG2 = + if nonZeroEdges + then fmap LG.toEdge $ filter ((> 0) . thd3) $ LG.labEdges g2 + else LG.edges g2 + in nodesG1 == nodesG2 && edgesG1 == edgesG2 + + +{- | getEdgeMinLengthToNode takes a labelled node and returns the min length of +the edge leading to the node +-} +getEdgeMinLengthToNode ∷ [LG.LEdge EdgeInfo] → LG.LNode a → Double +getEdgeMinLengthToNode edgeList (node, _) = + let foundEdge = L.find ((== node) . snd3) edgeList + in -- root node will be nor be in in edge set and need so set > 0 + if isNothing foundEdge + then 1.0 -- error ("Edge not found in getEdgeMinLengthToNode: node " <> (show node) <> " edge list " <> (show edgeList)) + else minLength $ thd3 $ fromJust foundEdge + + +{- | getBVUniqPhylogeneticGraph takes a list of phylogenetic graphs and returns +list of topologically unique graphs based on their node bitvector assignments +operatres on Decorated graph field +noZeroEdges flag passed to remove zero weight edges +-} +getBVUniqPhylogeneticGraph ∷ Bool → [PhylogeneticGraph] → [PhylogeneticGraph] +getBVUniqPhylogeneticGraph nonZeroEdges inPhyloGraphList = + if null inPhyloGraphList + then [] + else + let bvGraphList = fmap (getBVNodeList nonZeroEdges . thd6) inPhyloGraphList + uniqueBoolList = createBVUniqueBoolList bvGraphList [] + boolPair = zip inPhyloGraphList uniqueBoolList + in (fst <$> filter snd boolPair) + + +{- | getBVNodeList takes a DecoratedGraph and returns sorted list (by BV) of nodes +removes node with zero edge weight to them if specified +-} +getBVNodeList ∷ Bool → DecoratedGraph → [BV.BitVector] +getBVNodeList nonZeroEdges inGraph = + if LG.isEmpty inGraph + then [] + else + let nodeList = LG.labNodes inGraph + edgeList = LG.labEdges inGraph + minLengthList = fmap (getEdgeMinLengthToNode edgeList) nodeList + nodePairList = filter ((> 0) . snd) $ zip nodeList minLengthList + bvNodeList = + if nonZeroEdges + then L.sort $ fmap ((bvLabel . snd) . fst) nodePairList + else L.sort $ fmap (bvLabel . snd) nodeList + in bvNodeList + + +{- | createBVUniqueBoolList creates a list of Bool if graphs are unique by bitvecector node list +first occurrence is True, others False +assumes edges filterd b=y lenght already +-} +createBVUniqueBoolList ∷ [[BV.BitVector]] → [([BV.BitVector], Bool)] → [Bool] +createBVUniqueBoolList inBVGraphListList boolAccum = + if null inBVGraphListList + then reverse $ fmap snd boolAccum + else + let firstGraphList = head inBVGraphListList + in if null boolAccum + then createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList, True) : boolAccum) + else + let checkList = filter id $ fmap ((== firstGraphList) . fst) boolAccum + in if null checkList + then createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList, True) : boolAccum) + else createBVUniqueBoolList (tail inBVGraphListList) ((firstGraphList, False) : boolAccum) + + +-- | makeDummyLabEdge takes an unlabelled edge and adds a dummy label +makeDummyLabEdge ∷ EdgeInfo → LG.Edge → LG.LEdge EdgeInfo +makeDummyLabEdge edgeLab (u, v) = (u, v, edgeLab) + + +{- | netNodeAncestorViolation checks whether one of the edge into a netowrk node (in 2) +is cinnected to an ancestor (via the other parent) of the node +this is a form of time violation since the parents of a network node must be +at least possibly coeval +this uses the bit vector label of nodes. If the other child of either parent node +of a network node has non-zero intersection between the BV label of the network node +and that other child of parent then they conecting edge is from and ancestral node hence a time violation +O(n) n netork nodes in Graph, but checks all nodes to see if network +-} +hasNetNodeAncestorViolation ∷ LG.Gr VertexInfo b → Bool +hasNetNodeAncestorViolation inGraph = + if LG.isEmpty inGraph + then error "Empty graph in hasNetNodeAncestorViolation" + else + let (_, _, _, netWorkNodeList) = LG.splitVertexList inGraph + hasAncViolationList = filter id $ fmap (nodeAncViolation inGraph) netWorkNodeList + in -- trace ("HNV: " <> (show $ (not . null) hasAncViolationList)) + (not . null) hasAncViolationList + + +{- | nodeAncViolation checks a single node fo ancestrpo connection--he ceviolation +should be O(1). Return True if violation +-} +nodeAncViolation ∷ LG.Gr VertexInfo b → LG.LNode VertexInfo → Bool +nodeAncViolation inGraph inNode = + let parentList = LG.labParents inGraph (fst inNode) + in if length parentList /= 2 + then error ("Parent number should be 2: " <> show (fst inNode) <> " <- " <> show (fmap fst parentList)) + else + let sisterNodes = concatMap (LG.sisterLabNodes inGraph) parentList + sisterBVData = fmap (bvLabel . snd) sisterNodes + inNodeBVData = bvLabel $ snd inNode + sisterBVIntersections = fmap (.&. inNodeBVData) sisterBVData + isAncInNode = filter (== inNodeBVData) sisterBVIntersections + in (not . null) isAncInNode + + +{- | selectGraphStochastic takes a list of graphs and returns a list of graphs chosen at Random +using an exponential distribution based on graph cost difference divided by an input factor +if factor is 0 then stringth graphs cost +mprob acceptance = -exp [(cost - minCost)/ factor] +returns n graphs by random criterion without replacment +-} +selectGraphStochastic ∷ Int → Double → [PhylogeneticGraph] → PhyG [PhylogeneticGraph] +selectGraphStochastic number factor inGraphList + | null inGraphList = pure inGraphList + | number >= length inGraphList = pure inGraphList + | otherwise = + let getProb ∷ Double → Double + getProb a = exp ((-1) * factor / a) + + minCost = minimum $ snd6 <$> inGraphList + + selectGraphByWeighting ∷ (Double → Double → Bool) → PhylogeneticGraph → PhyG Bool + selectGraphByWeighting cmp v = + let extractKey = getProb . (-) minCost . snd6 + in getRandomR (0, 1) <&> cmp (extractKey v) + in do + -- so no more than specified + selectedGraphs ← take number <$> filterM (selectGraphByWeighting (<)) inGraphList + + -- takes some random remainder to fill out length of list + suffixPad ← case number - length selectedGraphs of + 0 → pure [] + numLucky → filterM (selectGraphByWeighting (>=)) inGraphList >>= (fmap (take numLucky) . shuffleList) + + pure $ selectedGraphs <> suffixPad + + +{- | getDisplayTreeCostList returns a list of teh "block" costs of display trees +in a piar with any graph 'penalty' cost +-} +getDisplayTreeCostList ∷ PhylogeneticGraph → ([VertexCost], VertexCost) +getDisplayTreeCostList inGraph = + if LG.isEmpty $ thd6 inGraph + then ([], 0.0) + else + let rootIndex = fst $ head $ LG.getRoots $ fst6 inGraph + displayTreeCharVect = fft6 inGraph + displayTreeCostVect = fmap (getBlockCost rootIndex) displayTreeCharVect + nonGraphCost = V.sum displayTreeCostVect + in (V.toList displayTreeCostVect, snd6 inGraph - nonGraphCost) + + +-- | getBlockCost returns the cost, summed over characters, of a character block +getBlockCost ∷ LG.Node → V.Vector DecoratedGraph → VertexCost +getBlockCost rootIndex charGraphVect = + if V.null charGraphVect + then 0.0 + else V.sum $ fmap (getCharacterCost rootIndex) charGraphVect + + +-- | getCharacterCost returns charcter cost as root of character tree +getCharacterCost ∷ LG.Node → DecoratedGraph → VertexCost +getCharacterCost rootIndex inGraph = + if LG.isEmpty inGraph + then 0.0 + else + let rootLabel = LG.lab inGraph rootIndex + in maybe (error ("Root index without label: " <> show rootIndex)) subGraphCost rootLabel + + +{- | makeLeafGraph takes input data and creates a 'graph' of leaves with Vertex informnation +but with zero edges. This 'graph' can be reused as a starting structure for graph construction +to avoid remaking of leaf vertices +-} +makeLeafGraph ∷ ProcessedData → DecoratedGraph +makeLeafGraph (nameVect, bvNameVect, blocDataVect) = + if V.null nameVect + then error "Empty ProcessedData in makeLeafGraph" + else + let leafVertexList = V.toList $ V.map (makeLeafVertex nameVect bvNameVect blocDataVect) (V.fromList [0 .. V.length nameVect - 1]) + in LG.mkGraph leafVertexList [] + + +{- | makeSimpleLeafGraph takes input data and creates a 'graph' of leaves with Vertex informnation +but with zero edges. This 'graph' can be reused as a starting structure for graph construction +to avoid remaking of leaf vertices +-} +makeSimpleLeafGraph ∷ ProcessedData → SimpleGraph +makeSimpleLeafGraph (nameVect, _, _) = + if V.null nameVect + then error "Empty ProcessedData in makeSimpleLeafGraph" + else + let leafVertexList = V.toList $ V.map (makeSimpleLeafVertex nameVect) (V.fromList [0 .. V.length nameVect - 1]) + in LG.mkGraph leafVertexList [] + where + makeSimpleLeafVertex ∷ ∀ {b}. V.Vector b → Int → (Int, b) + makeSimpleLeafVertex a b = (b, a V.! b) + + +-- | makeLeafVertex makes a single unconnected vertex for a leaf +makeLeafVertex ∷ V.Vector NameText → V.Vector NameBV → V.Vector BlockData → Int → LG.LNode VertexInfo +makeLeafVertex nameVect bvNameVect inData localIndex = + -- trace ("Making leaf " <> (show localIndex) <> " Data " <> (show $ length inData) <> " " <> (show $ fmap length $ fmap snd3 inData)) ( + let centralData = V.map snd3 inData + thisData = V.map (V.! localIndex) centralData + newVertex = + VertexInfo + { index = localIndex + , bvLabel = bvNameVect V.! localIndex + , parents = V.empty + , children = V.empty + , nodeType = LeafNode + , vertName = nameVect V.! localIndex + , vertData = thisData + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + in -- trace (show (length thisData) <> (show $ fmap length thisData)) + (localIndex, newVertex) + + +-- ) + +{- | remakePhylogeneticGraph remakes (rebuilds from scratch) phylogenetic graph +fst, thd. 4th and 5th fields +-} +remakePhylogeneticGraph ∷ PhylogeneticGraph → PhylogeneticGraph +remakePhylogeneticGraph inGraph@(a, b, c, d, e, f) = + if inGraph == emptyPhylogeneticGraph + then inGraph + else + let a' = LG.remakeGraph a + c' = LG.remakeGraph c + d' = fmap (fmap LG.remakeGraph) d + e' = fmap (fmap LG.remakeGraph) e + in (a', b, c', d', e', f) + + +keyExists ∷ ∀ f k v. (Eq k, Foldable f) ⇒ k → f (k, v) → Bool +keyExists key = any ((== key) . fst) diff --git a/src/Input/BitPack.hs b/src/Input/BitPack.hs new file mode 100644 index 000000000..2042a0fcb --- /dev/null +++ b/src/Input/BitPack.hs @@ -0,0 +1,2403 @@ +{- | +Module : BitPack.hs +Description : Module with functionality to transform NonAdditive data to bit packed + Word64 structures +Copyright : (c) 2022 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Input.BitPack ( + getSingleCharacter, + packNonAdditiveData, + median2Packed, + median2PackedUnionField, + packedPreorder, + threeWayPacked, + threeWayPacked', + unionPacked, + minMaxCharDiff, +) where + +-- import Bio.DynamicCharacter +-- import Bio.DynamicCharacter.Element (SlimState, WideState) + +-- import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.List qualified as L +import Data.List.Split qualified as SL +import Data.Maybe +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import Data.Word +import GeneralUtilities +import PHANE.Evaluation +import Types.Types + +-- import ParallelUtilities qualified as PU +-- import Debug.Trace + +{- +This module contains structures and functions for bit-packing operations +the basic idea is to transform the Bit-vector representation of non-additive +characters (whihc have effectively unlimited number of states) to more efficient +operations based on Word64 encodings. Additionally, this should reduce +memory footprint when large number of non-additive characters are +input (such as genomic SNP data). + +Several new types are cretged and manipulated based on the n umber of states +in the character--Packed2, Packed4, Packed5, Packed8, Packed64. These types hold +as subsets of bits (1 per state) multiple (other than for Packed64) original +non-additive characters. + +The optimizations are based on using bi-wise operations specific +to each packed type to create preliminary (post-order with cost) and final +(pre-order) states. Ideally the functions should contain no logic branches +or recursion (= loops) so that operations are solely bit-based. + +Methods are similar too those of Lamport 1975, Ronquist 1998, Moelannen 1999, Goloboff 2002, +and White and Holland 2011, but differ in detail due to +the programming language (Haskell) and need to maintain data structures +useful for network analysis. + +The basic character data structure is a vector of Word64, the original vector of bit-vectors +is split into a series of new characters depending on the number of states (in non-missing cells). + +A single Word64 can then hold 32 2-state. 16 4-state, 12 5-state (awkward but useful for DNA), +4 8-state, 1 64-state, and a bit-vector for >64 states. + +Character weights are all = 1 in static characters. This is ensured by organizeBlockData +in Input.Reorganize.hs basically--characters are multiplied by weight (if integer--otherwise not recoded) +So can check and only recode characters with weight of 1. +-} + +{- +Functions for median2 calculations of packed types +These are used in post-order graph traversals and pairwise +distance functions among others. +-} + +{- +Masks for various operations and state numbers +-} + +{- | mask2A first Mask for 2 state 64 bit +32 x (01) +6148914691236517205 +-} +mask2A ∷ Word64 +mask2A = 0x5555555555555555 + + +{- | mask2B second Mask for 2 state 64 bit +32 x (10) +12297829382473034410 +-} +mask2B ∷ Word64 +mask2B = 0xAAAAAAAAAAAAAAAA + + +{- | mask4A first mask for 4 state 64 bits +8608480567731124087 +16 X (0111) +-} +mask4A ∷ Word64 +mask4A = 0x7777777777777777 + + +{- | mask4B second mask for 4 states 64 bits +9838263505978427528 +16 X (1000) +-} +mask4B ∷ Word64 +mask4B = 0x8888888888888888 + + +{- | mask4C mak for 4 states an 64 bits +4919131752989213764 +16 x (0100) +-} +mask4C ∷ Word64 +mask4C = 0x4444444444444444 + + +{- | mask4D mak for 4 states an 64 bits +2459565876494606882 +16 x (0010) +-} +mask4D ∷ Word64 +mask4D = 0x2222222222222222 + + +{- | mask4E mak for 4 states an 64 bits +1229782938247303441 +16 x (0001) +-} +mask4E ∷ Word64 +mask4E = 0X1111111111111111 + + +{- +5 state masks top 4 bits OFF may require to mask out top 4 bits for states and cost +top 4 OFF rest ON 0xFFFFFFFFFFFFFFF + 1152921504606846975 +top 4 ON rest OFF 0xF000000000000000 + 17293822569102704640 +-} + +{- | mask5A first mask for 5 states 64 bits -- need to check what state of top 4 bits--should be OFF I think +12 x (01111) +557865244164603375 +-} +mask5A ∷ Word64 +mask5A = 0x7BDEF7BDEF7BDEF + + +{- | mask5B second mask for 5 states 64 bits -- need to check what state of top 4 bits-these are OFF +12 x (10000) +595056260442243600 (top 4 OFF) v 17888878829544948240 (top 4 ON) +0x842108421084210 (top 4 OFF) v F842108421084210 (top 4 ON) +-} +mask5B ∷ Word64 +mask5B = 0x842108421084210 + + +{- | mask5C mask 5 states 64 bits +12 x (01000) +297528130221121800 +-} +mask5C ∷ Word64 +mask5C = 0x421084210842108 + + +{- | mask5D mask 5 states 64 bits +12 x (00100) +148764065110560900 +-} +mask5D ∷ Word64 +mask5D = 0x210842108421084 + + +{- | mask5E mask 5 states 64 bits +12 x (00010) +74382032555280450 +-} +mask5E ∷ Word64 +mask5E = 0x108421084210842 + + +{- | mask5F mask 5 states 64 bits +12 x (00001) +37191016277640225 +-} +mask5F ∷ Word64 +mask5F = 0x84210842108421 + + +{- | mask8A first mask for 8 states 64 bits +8 x (01111111) +9187201950435737471 +-} +mask8A ∷ Word64 +mask8A = 0x7F7F7F7F7F7F7F7F + + +{- | mask8B second mask for 8 states 64 bits +8 x (10000000) +9259542123273814144 +-} +mask8B ∷ Word64 +mask8B = 0x8080808080808080 + + +{- | mask8C mask for 8 states 64 bits +8 x (01000000) +4629771061636907072 +-} +mask8C ∷ Word64 +mask8C = 0x4040404040404040 + + +{- | mask8D mask for 8 states 64 bits +8 x (00100000) +2314885530818453536 +-} +mask8D ∷ Word64 +mask8D = 0x2020202020202020 + + +{- | mask8E mask for 8 states 64 bits +8 x (00010000) +1157442765409226768 +-} +mask8E ∷ Word64 +mask8E = 0x1010101010101010 + + +{- | mask8F mask for 8 states 64 bits +8 x (00001000) +578721382704613384 +-} +mask8F ∷ Word64 +mask8F = 0x808080808080808 + + +{- | mask8G mask for 8 states 64 bits +8 x (00000100) +289360691352306692 +-} +mask8G ∷ Word64 +mask8G = 0x404040404040404 + + +{- | mask8H mask for 8 states 64 bits +8 x (00000010) +144680345676153346 +-} +mask8H ∷ Word64 +mask8H = 0x202020202020202 + + +{- | mask8I mask for 8 states 64 bits +8 x (00000001) +72340172838076673 +-} +mask8I ∷ Word64 +mask8I = 0x101010101010101 + + +-- | mask2scN 11(32x00) mask to reveal states of Nth subcharacter in 64Bit Word +mask2sc0 ∷ Word64 +mask2sc0 = 0x3 + + +mask2sc1 ∷ Word64 +mask2sc1 = shiftL mask2sc0 2 + + +mask2sc2 ∷ Word64 +mask2sc2 = shiftL mask2sc0 (2 * 2) + + +mask2sc3 ∷ Word64 +mask2sc3 = shiftL mask2sc0 (2 * 3) + + +mask2sc4 ∷ Word64 +mask2sc4 = shiftL mask2sc0 (2 * 4) + + +mask2sc5 ∷ Word64 +mask2sc5 = shiftL mask2sc0 (2 * 5) + + +mask2sc6 ∷ Word64 +mask2sc6 = shiftL mask2sc0 (2 * 6) + + +mask2sc7 ∷ Word64 +mask2sc7 = shiftL mask2sc0 (2 * 7) + + +mask2sc8 ∷ Word64 +mask2sc8 = shiftL mask2sc0 (2 * 8) + + +mask2sc9 ∷ Word64 +mask2sc9 = shiftL mask2sc0 (2 * 9) + + +mask2sc10 ∷ Word64 +mask2sc10 = shiftL mask2sc0 (2 * 10) + + +mask2sc11 ∷ Word64 +mask2sc11 = shiftL mask2sc0 (2 * 11) + + +mask2sc12 ∷ Word64 +mask2sc12 = shiftL mask2sc0 (2 * 12) + + +mask2sc13 ∷ Word64 +mask2sc13 = shiftL mask2sc0 (2 * 13) + + +mask2sc14 ∷ Word64 +mask2sc14 = shiftL mask2sc0 (2 * 14) + + +mask2sc15 ∷ Word64 +mask2sc15 = shiftL mask2sc0 (2 * 15) + + +mask2sc16 ∷ Word64 +mask2sc16 = shiftL mask2sc0 (2 * 16) + + +mask2sc17 ∷ Word64 +mask2sc17 = shiftL mask2sc0 (2 * 17) + + +mask2sc18 ∷ Word64 +mask2sc18 = shiftL mask2sc0 (2 * 18) + + +mask2sc19 ∷ Word64 +mask2sc19 = shiftL mask2sc0 (2 * 19) + + +mask2sc20 ∷ Word64 +mask2sc20 = shiftL mask2sc0 (2 * 20) + + +mask2sc21 ∷ Word64 +mask2sc21 = shiftL mask2sc0 (2 * 21) + + +mask2sc22 ∷ Word64 +mask2sc22 = shiftL mask2sc0 (2 * 22) + + +mask2sc23 ∷ Word64 +mask2sc23 = shiftL mask2sc0 (2 * 23) + + +mask2sc24 ∷ Word64 +mask2sc24 = shiftL mask2sc0 (2 * 24) + + +mask2sc25 ∷ Word64 +mask2sc25 = shiftL mask2sc0 (2 * 25) + + +mask2sc26 ∷ Word64 +mask2sc26 = shiftL mask2sc0 (2 * 26) + + +mask2sc27 ∷ Word64 +mask2sc27 = shiftL mask2sc0 (2 * 27) + + +mask2sc28 ∷ Word64 +mask2sc28 = shiftL mask2sc0 (2 * 28) + + +mask2sc29 ∷ Word64 +mask2sc29 = shiftL mask2sc0 (2 * 29) + + +mask2sc30 ∷ Word64 +mask2sc30 = shiftL mask2sc0 (2 * 30) + + +mask2sc31 ∷ Word64 +mask2sc31 = shiftL mask2sc0 (2 * 31) + + +-- | mask4scN 1111(16x0000) mask to reveal states of Nth subcharacter in 64Bit Word +mask4sc0 ∷ Word64 +mask4sc0 = 0xF + + +mask4sc1 ∷ Word64 +mask4sc1 = shiftL mask4sc0 4 + + +mask4sc2 ∷ Word64 +mask4sc2 = shiftL mask4sc0 (4 * 2) + + +mask4sc3 ∷ Word64 +mask4sc3 = shiftL mask4sc0 (4 * 3) + + +mask4sc4 ∷ Word64 +mask4sc4 = shiftL mask4sc0 (4 * 4) + + +mask4sc5 ∷ Word64 +mask4sc5 = shiftL mask4sc0 (4 * 5) + + +mask4sc6 ∷ Word64 +mask4sc6 = shiftL mask4sc0 (4 * 6) + + +mask4sc7 ∷ Word64 +mask4sc7 = shiftL mask4sc0 (4 * 7) + + +mask4sc8 ∷ Word64 +mask4sc8 = shiftL mask4sc0 (4 * 8) + + +mask4sc9 ∷ Word64 +mask4sc9 = shiftL mask4sc0 (4 * 9) + + +mask4sc10 ∷ Word64 +mask4sc10 = shiftL mask4sc0 (4 * 10) + + +mask4sc11 ∷ Word64 +mask4sc11 = shiftL mask4sc0 (4 * 11) + + +mask4sc12 ∷ Word64 +mask4sc12 = shiftL mask4sc0 (4 * 12) + + +mask4sc13 ∷ Word64 +mask4sc13 = shiftL mask4sc0 (4 * 13) + + +mask4sc14 ∷ Word64 +mask4sc14 = shiftL mask4sc0 (4 * 14) + + +mask4sc15 ∷ Word64 +mask4sc15 = shiftL mask4sc0 (4 * 15) + + +-- | mask5scN 11111(12x00000) mask to reveal states of Nth subcharacter in 64Bit Word +mask5sc0 ∷ Word64 +mask5sc0 = 0x1F + + +mask5sc1 ∷ Word64 +mask5sc1 = shiftL mask5sc0 5 + + +mask5sc2 ∷ Word64 +mask5sc2 = shiftL mask5sc0 (5 * 2) + + +mask5sc3 ∷ Word64 +mask5sc3 = shiftL mask5sc0 (5 * 3) + + +mask5sc4 ∷ Word64 +mask5sc4 = shiftL mask5sc0 (5 * 4) + + +mask5sc5 ∷ Word64 +mask5sc5 = shiftL mask5sc0 (5 * 5) + + +mask5sc6 ∷ Word64 +mask5sc6 = shiftL mask5sc0 (5 * 6) + + +mask5sc7 ∷ Word64 +mask5sc7 = shiftL mask5sc0 (5 * 7) + + +mask5sc8 ∷ Word64 +mask5sc8 = shiftL mask5sc0 (5 * 8) + + +mask5sc9 ∷ Word64 +mask5sc9 = shiftL mask5sc0 (5 * 9) + + +mask5sc10 ∷ Word64 +mask5sc10 = shiftL mask5sc0 (5 * 10) + + +mask5sc11 ∷ Word64 +mask5sc11 = shiftL mask5sc0 (5 * 11) + + +-- | mask8scN 11111111(7x00000000) mask to reveal states of Nth subcharacter in 64Bit Word +mask8sc0 ∷ Word64 +mask8sc0 = 0xFF + + +mask8sc1 ∷ Word64 +mask8sc1 = shiftL mask8sc0 8 + + +mask8sc2 ∷ Word64 +mask8sc2 = shiftL mask8sc0 (8 * 2) + + +mask8sc3 ∷ Word64 +mask8sc3 = shiftL mask8sc0 (8 * 3) + + +mask8sc4 ∷ Word64 +mask8sc4 = shiftL mask8sc0 (8 * 4) + + +mask8sc5 ∷ Word64 +mask8sc5 = shiftL mask8sc0 (8 * 5) + + +mask8sc6 ∷ Word64 +mask8sc6 = shiftL mask8sc0 (8 * 6) + + +mask8sc7 ∷ Word64 +mask8sc7 = shiftL mask8sc0 (8 * 7) + + +{-- +Lists of sub-character masks for operations over packed characters +-} + +packed2SubCharList ∷ [Word64] +packed2SubCharList = + [ mask2sc0 + , mask2sc1 + , mask2sc2 + , mask2sc3 + , mask2sc4 + , mask2sc5 + , mask2sc6 + , mask2sc7 + , mask2sc8 + , mask2sc9 + , mask2sc10 + , mask2sc11 + , mask2sc12 + , mask2sc13 + , mask2sc14 + , mask2sc15 + , mask2sc16 + , mask2sc17 + , mask2sc18 + , mask2sc19 + , mask2sc20 + , mask2sc21 + , mask2sc22 + , mask2sc23 + , mask2sc24 + , mask2sc25 + , mask2sc26 + , mask2sc27 + , mask2sc28 + , mask2sc29 + , mask2sc30 + , mask2sc31 + ] + + +packed4SubCharList ∷ [Word64] +packed4SubCharList = + [ mask4sc0 + , mask4sc1 + , mask4sc2 + , mask4sc3 + , mask4sc4 + , mask4sc5 + , mask4sc6 + , mask4sc7 + , mask4sc8 + , mask4sc9 + , mask4sc10 + , mask4sc11 + , mask4sc12 + , mask4sc13 + , mask4sc14 + , mask4sc15 + ] + + +packed5SubCharList ∷ [Word64] +packed5SubCharList = + [ mask5sc0 + , mask5sc1 + , mask5sc2 + , mask5sc3 + , mask5sc4 + , mask5sc5 + , mask5sc6 + , mask5sc7 + , mask5sc8 + , mask5sc9 + , mask5sc10 + , mask5sc11 + ] + + +packed8SubCharList ∷ [Word64] +packed8SubCharList = [mask8sc0, mask8sc1, mask8sc2, mask8sc3, mask8sc4, mask8sc5, mask8sc6, mask8sc7] + + +{- +Packed character minimum and maximum length functions +-} + +{- | mainMxCharDiff get the approximate minimum and maximum difference in number of states +uses masking with &/== +-} +minMaxCharDiff ∷ Bool → CharType → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxCharDiff adjustNoCost inCharType bitCosts a b = + let (minVal, maxVal) = + if inCharType == Packed2 + then minMaxPacked2 adjustNoCost bitCosts a b + else + if inCharType == Packed4 + then minMaxPacked4 adjustNoCost bitCosts a b + else + if inCharType == Packed5 + then minMaxPacked5 adjustNoCost bitCosts a b + else + if inCharType == Packed8 + then minMaxPacked8 adjustNoCost bitCosts a b + else + if inCharType == Packed64 + then minMaxPacked64 adjustNoCost bitCosts a b + else error ("Character type " <> show inCharType <> " unrecognized/not implemented") + in (minVal, maxVal) + + +{- | minMaxPacked2 minium and maximum cost 32x2 bit nonadditive character +the popcount for equality A/C -> A/C is identical but could be A->C so max 1 +basically unrolled to make faster +-} +minMaxPacked2 ∷ Bool → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxPacked2 adjustNoCost (lNoChangeCost, lChangeCost) a b = + let a0 = a .&. mask2sc0 + b0 = b .&. mask2sc0 + max0 + | a0 == (0 ∷ Word64) = 0 + | (a0 .&. b0) == (0 ∷ Word64) = 1 + | (a0 == b0) = 0 + | otherwise = 1 + + a1 = a .&. mask2sc1 + b1 = b .&. mask2sc1 + max1 + | a1 == (0 ∷ Word64) = 0 + | (a1 .&. b1) == (0 ∷ Word64) = 1 + | (a1 == b1) = 0 + | otherwise = 1 + + a2 = a .&. mask2sc2 + b2 = b .&. mask2sc2 + max2 + | a2 == (0 ∷ Word64) = 0 + | (a2 .&. b2) == (0 ∷ Word64) = 1 + | (a2 == b2) = 0 + | otherwise = 1 + + a3 = a .&. mask2sc3 + b3 = b .&. mask2sc3 + max3 + | a3 == (0 ∷ Word64) = 0 + | (a3 .&. b3) == (0 ∷ Word64) = 1 + | (a3 == b3) = 0 + | otherwise = 1 + + a4 = a .&. mask2sc4 + b4 = b .&. mask2sc4 + max4 + | a4 == (0 ∷ Word64) = 0 + | (a4 .&. b4) == (0 ∷ Word64) = 1 + | (a4 == b4) = 0 + | otherwise = 1 + + a5 = a .&. mask2sc5 + b5 = b .&. mask2sc5 + max5 + | a5 == (0 ∷ Word64) = 0 + | (a5 .&. b5) == (0 ∷ Word64) = 1 + | (a5 == b5) = 0 + | otherwise = 1 + + a6 = a .&. mask2sc6 + b6 = b .&. mask2sc6 + max6 + | a6 == (0 ∷ Word64) = 0 + | (a6 .&. b6) == (0 ∷ Word64) = 1 + | (a6 == b6) = 0 + | otherwise = 1 + + a7 = a .&. mask2sc7 + b7 = b .&. mask2sc7 + max7 + | a7 == (0 ∷ Word64) = 0 + | (a7 .&. b7) == (0 ∷ Word64) = 1 + | (a7 == b7) = 0 + | otherwise = 1 + + a8 = a .&. mask2sc8 + b8 = b .&. mask2sc8 + max8 + | a8 == (0 ∷ Word64) = 0 + | (a8 .&. b8) == (0 ∷ Word64) = 1 + | (a8 == b8) = 0 + | otherwise = 1 + + a9 = a .&. mask2sc9 + b9 = b .&. mask2sc9 + max9 + | a9 == (0 ∷ Word64) = 0 + | (a9 .&. b9) == (0 ∷ Word64) = 1 + | (a9 == b9) = 0 + | otherwise = 1 + + a10 = a .&. mask2sc10 + b10 = b .&. mask2sc10 + max10 + | a10 == (0 ∷ Word64) = 0 + | (a10 .&. b10) == (0 ∷ Word64) = 1 + | (a10 == b10) = 0 + | otherwise = 1 + + a11 = a .&. mask2sc11 + b11 = b .&. mask2sc11 + max11 + | a11 == (0 ∷ Word64) = 0 + | (a11 .&. b11) == (0 ∷ Word64) = 1 + | (a11 == b11) = 0 + | otherwise = 1 + + a12 = a .&. mask2sc12 + b12 = b .&. mask2sc12 + max12 + | a12 == (0 ∷ Word64) = 0 + | (a12 .&. b12) == (0 ∷ Word64) = 1 + | (a12 == b12) = 0 + | otherwise = 1 + + a13 = a .&. mask2sc13 + b13 = b .&. mask2sc13 + max13 + | a13 == (0 ∷ Word64) = 0 + | (a13 .&. b13) == (0 ∷ Word64) = 1 + | (a13 == b13) = 0 + | otherwise = 1 + + a14 = a .&. mask2sc14 + b14 = b .&. mask2sc14 + max14 + | a14 == (0 ∷ Word64) = 0 + | (a14 .&. b14) == (0 ∷ Word64) = 1 + | (a14 == b14) = 0 + | otherwise = 1 + + a15 = a .&. mask2sc15 + b15 = b .&. mask2sc15 + max15 + | a15 == (0 ∷ Word64) = 0 + | (a15 .&. b15) == (0 ∷ Word64) = 1 + | (a15 == b15) = 0 + | otherwise = 1 + + a16 = a .&. mask2sc16 + b16 = b .&. mask2sc16 + max16 + | a16 == (0 ∷ Word64) = 0 + | (a16 .&. b16) == (0 ∷ Word64) = 1 + | (a16 == b16) = 0 + | otherwise = 1 + + a17 = a .&. mask2sc17 + b17 = b .&. mask2sc17 + max17 + | a17 == (0 ∷ Word64) = 0 + | (a17 .&. b17) == (0 ∷ Word64) = 1 + | (a17 == b17) = 0 + | otherwise = 1 + + a18 = a .&. mask2sc18 + b18 = b .&. mask2sc18 + max18 + | a18 == (0 ∷ Word64) = 0 + | (a18 .&. b18) == (0 ∷ Word64) = 1 + | (a1 == b18) = 0 + | otherwise = 1 + + a19 = a .&. mask2sc19 + b19 = b .&. mask2sc19 + max19 + | a19 == (0 ∷ Word64) = 0 + | (a19 .&. b19) == (0 ∷ Word64) = 1 + | (a19 == b19) = 0 + | otherwise = 1 + + a20 = a .&. mask2sc20 + b20 = b .&. mask2sc20 + max20 + | a20 == (0 ∷ Word64) = 0 + | (a20 .&. b20) == (0 ∷ Word64) = 1 + | (a20 == b20) = 0 + | otherwise = 1 + + a21 = a .&. mask2sc21 + b21 = b .&. mask2sc21 + max21 + | a21 == (0 ∷ Word64) = 0 + | (a21 .&. b21) == (0 ∷ Word64) = 1 + | (a21 == b21) = 0 + | otherwise = 1 + + a22 = a .&. mask2sc22 + b22 = b .&. mask2sc22 + max22 + | a22 == (0 ∷ Word64) = 0 + | (a22 .&. b22) == (0 ∷ Word64) = 1 + | (a22 == b22) = 0 + | otherwise = 1 + + a23 = a .&. mask2sc23 + b23 = b .&. mask2sc23 + max23 + | a23 == (0 ∷ Word64) = 0 + | (a23 .&. b23) == (0 ∷ Word64) = 1 + | (a23 == b23) = 0 + | otherwise = 1 + + a24 = a .&. mask2sc24 + b24 = b .&. mask2sc24 + max24 + | a24 == (0 ∷ Word64) = 0 + | (a24 .&. b24) == (0 ∷ Word64) = 1 + | (a24 == b24) = 0 + | otherwise = 1 + + a25 = a .&. mask2sc25 + b25 = b .&. mask2sc25 + max25 + | a25 == (0 ∷ Word64) = 0 + | (a25 .&. b25) == (0 ∷ Word64) = 1 + | (a25 == b25) = 0 + | otherwise = 1 + + a26 = a .&. mask2sc26 + b26 = b .&. mask2sc26 + max26 + | a26 == (0 ∷ Word64) = 0 + | (a26 .&. b26) == (0 ∷ Word64) = 1 + | (a26 == b26) = 0 + | otherwise = 1 + + a27 = a .&. mask2sc27 + b27 = b .&. mask2sc27 + max27 + | a27 == (0 ∷ Word64) = 0 + | (a27 .&. b27) == (0 ∷ Word64) = 1 + | (a27 == b27) = 0 + | otherwise = 1 + + a28 = a .&. mask2sc28 + b28 = b .&. mask2sc28 + max28 + | a28 == (0 ∷ Word64) = 0 + | (a28 .&. b28) == (0 ∷ Word64) = 1 + | (a28 == b28) = 0 + | otherwise = 1 + + a29 = a .&. mask2sc29 + b29 = b .&. mask2sc29 + max29 + | a29 == (0 ∷ Word64) = 0 + | (a29 .&. b29) == (0 ∷ Word64) = 1 + | (a29 == b29) = 0 + | otherwise = 1 + + a30 = a .&. mask2sc30 + b30 = b .&. mask2sc30 + max30 + | a30 == (0 ∷ Word64) = 0 + | (a30 .&. b30) == (0 ∷ Word64) = 1 + | (a30 == b30) = 0 + | otherwise = 1 + + a31 = a .&. mask2sc31 + b31 = b .&. mask2sc31 + max31 + | a31 == (0 ∷ Word64) = 0 + | (a31 .&. b31) == (0 ∷ Word64) = 1 + | (a31 == b31) = 0 + | otherwise = 1 + + -- sum up values + (_, minNumNoChange, minNumChange) = andOR2 a b + maxVal = + sum + [ max0 + , max1 + , max2 + , max3 + , max4 + , max5 + , max6 + , max7 + , max8 + , max9 + , max10 + , max11 + , max12 + , max13 + , max14 + , max15 + , max16 + , max17 + , max18 + , max19 + , max20 + , max21 + , max22 + , max23 + , max24 + , max25 + , max26 + , max27 + , max28 + , max29 + , max30 + , max31 + ] + in -- trace ("2-state: " <> (show (minNumNoChange, minNumChange))) $ + -- trace ("MM2:" <> "\t" <> (showBits a0) <> " " <> (showBits b0) <> "->" <> (showBits $ a0 .&. b0) <> "=>" <> (show max0) <> "\n\t" <> (showBits a10) <> " " <> (showBits b10) <> "->" <> (showBits $ a10 .&. b10) <> "=>" <> (show max10)) + if lNoChangeCost == 0.0 + then (fromIntegral minNumChange, fromIntegral maxVal) + else + if not adjustNoCost + then + ( (lNoChangeCost * fromIntegral minNumNoChange) + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + else + let minNumNoChange' = (2 * minNumNoChange) + minNumChange + in ( (lNoChangeCost * fromIntegral minNumNoChange') + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + + +{- | minMaxPacked4 minium and maximum cost 16x4 bit nonadditive character +could add popcount == 1 for equality A/C -> A/C is identical but could be A->C so max 1 +basically unrolled to make faster +any diffference between states gets 1 for max +-} +minMaxPacked4 ∷ Bool → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxPacked4 adjustNoCost (lNoChangeCost, lChangeCost) a b = + let a0 = a .&. mask4sc0 + b0 = b .&. mask4sc0 + max0 + | a0 == (0 ∷ Word64) = 0 + | (a0 .&. b0) == (0 ∷ Word64) = 1 + | (a0 == b0) = 0 + | otherwise = 1 + + a1 = a .&. mask4sc1 + b1 = b .&. mask4sc1 + max1 + | a1 == (0 ∷ Word64) = 0 + | (a1 .&. b1) == (0 ∷ Word64) = 1 + | (a1 == b1) = 0 + | otherwise = 1 + + a2 = a .&. mask4sc2 + b2 = b .&. mask4sc2 + max2 + | a2 == (0 ∷ Word64) = 0 + | (a2 .&. b2) == (0 ∷ Word64) = 1 + | (a2 == b2) = 0 + | otherwise = 1 + + a3 = a .&. mask4sc3 + b3 = b .&. mask4sc3 + max3 + | a3 == (0 ∷ Word64) = 0 + | (a3 .&. b3) == (0 ∷ Word64) = 1 + | (a3 == b3) = 0 + | otherwise = 1 + + a4 = a .&. mask4sc4 + b4 = b .&. mask4sc4 + max4 + | a4 == (0 ∷ Word64) = 0 + | (a4 .&. b4) == (0 ∷ Word64) = 1 + | (a4 == b4) = 0 + | otherwise = 1 + + a5 = a .&. mask4sc5 + b5 = b .&. mask4sc5 + max5 + | a5 == (0 ∷ Word64) = 0 + | (a5 .&. b5) == (0 ∷ Word64) = 1 + | (a5 == b5) = 0 + | otherwise = 1 + + a6 = a .&. mask4sc6 + b6 = b .&. mask4sc6 + max6 + | a6 == (0 ∷ Word64) = 0 + | (a6 .&. b6) == (0 ∷ Word64) = 1 + | (a6 == b6) = 0 + | otherwise = 1 + + a7 = a .&. mask4sc7 + b7 = b .&. mask4sc7 + max7 + | a7 == (0 ∷ Word64) = 0 + | (a7 .&. b7) == (0 ∷ Word64) = 1 + | (a7 == b7) = 0 + | otherwise = 1 + + a8 = a .&. mask4sc8 + b8 = b .&. mask4sc8 + max8 + | a8 == (0 ∷ Word64) = 0 + | (a8 .&. b8) == (0 ∷ Word64) = 1 + | (a8 == b8) = 0 + | otherwise = 1 + + a9 = a .&. mask4sc9 + b9 = b .&. mask4sc9 + max9 + | a9 == (0 ∷ Word64) = 0 + | (a9 .&. b9) == (0 ∷ Word64) = 1 + | (a9 == b9) = 0 + | otherwise = 1 + + a10 = a .&. mask4sc10 + b10 = b .&. mask4sc10 + max10 + | a10 == (0 ∷ Word64) = 0 + | (a10 .&. b10) == (0 ∷ Word64) = 1 + | (a10 == b10) = 0 + | otherwise = 1 + + a11 = a .&. mask4sc11 + b11 = b .&. mask4sc11 + max11 + | a11 == (0 ∷ Word64) = 0 + | (a11 .&. b11) == (0 ∷ Word64) = 1 + | (a11 == b11) = 0 + | otherwise = 1 + + a12 = a .&. mask4sc12 + b12 = b .&. mask4sc12 + max12 + | a12 == (0 ∷ Word64) = 0 + | (a12 .&. b12) == (0 ∷ Word64) = 1 + | (a12 == b12) = 0 + | otherwise = 1 + + a13 = a .&. mask4sc13 + b13 = b .&. mask4sc13 + max13 + | a13 == (0 ∷ Word64) = 0 + | (a13 .&. b13) == (0 ∷ Word64) = 1 + | (a13 == b13) = 0 + | otherwise = 1 + + a14 = a .&. mask4sc14 + b14 = b .&. mask4sc14 + max14 + | a14 == (0 ∷ Word64) = 0 + | (a14 .&. b14) == (0 ∷ Word64) = 1 + | (a14 == b14) = 0 + | otherwise = 1 + + a15 = a .&. mask4sc15 + b15 = b .&. mask4sc15 + max15 + | a15 == (0 ∷ Word64) = 0 + | (a15 .&. b15) == (0 ∷ Word64) = 1 + | (a15 == b15) = 0 + | otherwise = 1 + + -- sum up values + (_, minNumNoChange, minNumChange) = andOR4 a b + maxVal = + sum + [ max0 + , max1 + , max2 + , max3 + , max4 + , max5 + , max6 + , max7 + , max8 + , max9 + , max10 + , max11 + , max12 + , max13 + , max14 + , max15 + ] + in -- trace ("4-state: " <> (show (minNumNoChange, minNumChange))) $ + -- trace ("MM2:" <> "\t" <> (showBits a0) <> " " <> (showBits b0) <> "->" <> (showBits $ a0 .&. b0) <> "=>" <> (show max0) <> "\n\t" <> (showBits a10) <> " " <> (showBits b10) <> "->" <> (showBits $ a10 .&. b10) <> "=>" <> (show max10)) + if lNoChangeCost == 0.0 + then (fromIntegral minNumChange, fromIntegral maxVal) + else + if not adjustNoCost + then + ( (lNoChangeCost * fromIntegral minNumNoChange) + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + else + let minNumNoChange' = (2 * minNumNoChange) + minNumChange + in ( (lNoChangeCost * fromIntegral minNumNoChange') + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + + +{- | minMaxPacked5 minium and maximum cost 12x5 bit nonadditive character +the popcount for equality A/C -> A/C is identical but could be A->C so max 1 +basically unrolled to make faster +-} +minMaxPacked5 ∷ Bool → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxPacked5 adjustNoCost (lNoChangeCost, lChangeCost) a b = + let a0 = a .&. mask8sc0 + b0 = b .&. mask5sc0 + max0 + | a0 == (0 ∷ Word64) = 0 + | (a0 .&. b0) == (0 ∷ Word64) = 1 + | (a0 == b0) = 0 + | otherwise = 1 + + a1 = a .&. mask5sc1 + b1 = b .&. mask5sc1 + max1 + | a1 == (0 ∷ Word64) = 0 + | (a1 .&. b1) == (0 ∷ Word64) = 1 + | (a1 == b1) = 0 + | otherwise = 1 + + a2 = a .&. mask5sc2 + b2 = b .&. mask5sc2 + max2 + | a2 == (0 ∷ Word64) = 0 + | (a2 .&. b2) == (0 ∷ Word64) = 1 + | (a2 == b2) = 0 + | otherwise = 1 + + a3 = a .&. mask5sc3 + b3 = b .&. mask5sc3 + max3 + | a3 == (0 ∷ Word64) = 0 + | (a3 .&. b3) == (0 ∷ Word64) = 1 + | (a3 == b3) = 0 + | otherwise = 1 + + a4 = a .&. mask5sc4 + b4 = b .&. mask5sc4 + max4 + | a4 == (0 ∷ Word64) = 0 + | (a4 .&. b4) == (0 ∷ Word64) = 1 + | (a4 == b4) = 0 + | otherwise = 1 + + a5 = a .&. mask5sc5 + b5 = b .&. mask5sc5 + max5 + | a5 == (0 ∷ Word64) = 0 + | (a5 .&. b5) == (0 ∷ Word64) = 1 + | (a5 == b5) = 0 + | otherwise = 1 + + a6 = a .&. mask5sc6 + b6 = b .&. mask5sc6 + max6 + | a6 == (0 ∷ Word64) = 0 + | (a6 .&. b6) == (0 ∷ Word64) = 1 + | (a6 == b6) = 0 + | otherwise = 1 + + a7 = a .&. mask5sc7 + b7 = b .&. mask5sc7 + max7 + | a7 == (0 ∷ Word64) = 0 + | (a7 .&. b7) == (0 ∷ Word64) = 1 + | (a7 == b7) = 0 + | otherwise = 1 + + a8 = a .&. mask5sc8 + b8 = b .&. mask5sc8 + max8 + | a8 == (0 ∷ Word64) = 0 + | (a8 .&. b8) == (0 ∷ Word64) = 1 + | (a8 == b8) = 0 + | otherwise = 1 + + a9 = a .&. mask5sc9 + b9 = b .&. mask5sc9 + max9 + | a9 == (0 ∷ Word64) = 0 + | (a9 .&. b9) == (0 ∷ Word64) = 1 + | (a9 == b9) = 0 + | otherwise = 1 + + a10 = a .&. mask5sc10 + b10 = b .&. mask5sc10 + max10 + | a10 == (0 ∷ Word64) = 0 + | (a10 .&. b10) == (0 ∷ Word64) = 1 + | (a10 == b10) = 0 + | otherwise = 1 + + a11 = a .&. mask5sc11 + b11 = b .&. mask5sc11 + max11 + | a11 == (0 ∷ Word64) = 0 + | (a11 .&. b11) == (0 ∷ Word64) = 1 + | (a11 == b11) = 0 + | otherwise = 1 + + -- sum up values + (_, minNumNoChange, minNumChange) = andOR5 a b + maxVal = + sum + [ max0 + , max1 + , max2 + , max3 + , max4 + , max5 + , max6 + , max7 + , max8 + , max9 + , max10 + , max11 + ] + in -- trace ("MM2:" <> "\t" <> (showBits a0) <> " " <> (showBits b0) <> "->" <> (showBits $ a0 .&. b0) <> "=>" <> (show max0) <> "\n\t" <> (showBits a10) <> " " <> (showBits b10) <> "->" <> (showBits $ a10 .&. b10) <> "=>" <> (show max10)) + if lNoChangeCost == 0.0 + then (fromIntegral minNumChange, fromIntegral maxVal) + else + if not adjustNoCost + then + ( (lNoChangeCost * fromIntegral minNumNoChange) + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + else + let minNumNoChange' = (2 * minNumNoChange) + minNumChange + in ( (lNoChangeCost * fromIntegral minNumNoChange') + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + + +{- | minMaxPacked8 minium and maximum cost 12x5 bit nonadditive character +the popcount for equality A/C -> A/C is identical but could be A->C so max 1 +basically unrolled to make faster +-} +minMaxPacked8 ∷ Bool → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxPacked8 adjustNoCost (lNoChangeCost, lChangeCost) a b = + let a0 = a .&. mask8sc0 + b0 = b .&. mask8sc0 + max0 + | a0 == (0 ∷ Word64) = 0 + | (a0 .&. b0) == (0 ∷ Word64) = 1 + | (a0 == b0) = 0 + | otherwise = 1 + + a1 = a .&. mask8sc1 + b1 = b .&. mask8sc1 + max1 + | a1 == (0 ∷ Word64) = 0 + | (a1 .&. b1) == (0 ∷ Word64) = 1 + | (a1 == b1) = 0 + | otherwise = 1 + + a2 = a .&. mask8sc2 + b2 = b .&. mask8sc2 + max2 + | a2 == (0 ∷ Word64) = 0 + | (a2 .&. b2) == (0 ∷ Word64) = 1 + | (a2 == b2) = 0 + | otherwise = 1 + + a3 = a .&. mask8sc3 + b3 = b .&. mask8sc3 + max3 + | a3 == (0 ∷ Word64) = 0 + | (a3 .&. b3) == (0 ∷ Word64) = 1 + | (a3 == b3) = 0 + | otherwise = 1 + + a4 = a .&. mask8sc4 + b4 = b .&. mask8sc4 + max4 + | a4 == (0 ∷ Word64) = 0 + | (a4 .&. b4) == (0 ∷ Word64) = 1 + | (a4 == b4) = 0 + | otherwise = 1 + + a5 = a .&. mask8sc5 + b5 = b .&. mask8sc5 + max5 + | a5 == (0 ∷ Word64) = 0 + | (a5 .&. b5) == (0 ∷ Word64) = 1 + | (a5 == b5) = 0 + | otherwise = 1 + + a6 = a .&. mask8sc6 + b6 = b .&. mask8sc6 + max6 + | a6 == (0 ∷ Word64) = 0 + | (a6 .&. b6) == (0 ∷ Word64) = 1 + | (a6 == b6) = 0 + | otherwise = 1 + + a7 = a .&. mask8sc7 + b7 = b .&. mask8sc7 + max7 + | a7 == (0 ∷ Word64) = 0 + | (a7 .&. b7) == (0 ∷ Word64) = 1 + | (a7 == b7) = 0 + | otherwise = 1 + + -- sum up values + (_, minNumNoChange, minNumChange) = andOR8 a b + maxVal = sum [max0, max1, max2, max3, max4, max5, max6, max7] + in -- trace ("MM2:" <> "\t" <> (showBits a0) <> " " <> (showBits b0) <> "->" <> (showBits $ a0 .&. b0) <> "=>" <> (show max0) <> "\n\t" <> (showBits a10) <> " " <> (showBits b10) <> "->" <> (showBits $ a10 .&. b10) <> "=>" <> (show max10)) + if lNoChangeCost == 0.0 + then (fromIntegral minNumChange, fromIntegral maxVal) + else + if not adjustNoCost + then + ( (lNoChangeCost * fromIntegral minNumNoChange) + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + else + let minNumNoChange' = (2 * minNumNoChange) + minNumChange + in ( (lNoChangeCost * fromIntegral minNumNoChange') + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + + +{- | minMaxPacked64 minium and maximum cost 64 bit nonadditive character +the popcount for equality A/C -> A/C is identical but could be A->C so max 1 +operattion over each sub-character +-} +minMaxPacked64 ∷ Bool → (Double, Double) → Word64 → Word64 → (Double, Double) +minMaxPacked64 adjustNoCost (lNoChangeCost, lChangeCost) a b = + let maxVal = if (a == b) && (popCount a == 1) then (0 ∷ Int) else (1 ∷ Int) + minVal = if (a .&. b) == (0 ∷ Word64) then (1 ∷ Int) else (0 ∷ Int) + in if lNoChangeCost == 0.0 + then (fromIntegral minVal, fromIntegral maxVal) + else + if not adjustNoCost + then + ( (lNoChangeCost * fromIntegral maxVal) + (lChangeCost * fromIntegral minVal) + , (lNoChangeCost * fromIntegral minVal) + (lChangeCost * fromIntegral maxVal) + ) + else + let (_, minNumNoChange, minNumChange) = andOR64 a b + minNumNoChange' = (2 * minNumNoChange) + minNumChange + in ( (lNoChangeCost * fromIntegral minNumNoChange') + (lChangeCost * fromIntegral minNumChange) + , (lNoChangeCost * fromIntegral ((32 ∷ Int) - maxVal)) + (lChangeCost * fromIntegral maxVal) + ) + + +{- | median2Packed takes two characters of packedNonAddTypes +and retuns new character data based on 2-median and cost +the change/no change numbers are adjuted based on two edgwes gogin into a singel vertex + each change would be coupled by a nochange on the sidter edge + each noChnage is really for two erdges so multiplied by two + this will enforce 2 edges from root so 2n - 2 edges for possible changes +-} +median2Packed ∷ Bool → CharType → Double → (Double, Double) → CharacterData → CharacterData → CharacterData +median2Packed adjustNoCost inCharType inCharWeight (thisNoChangeCost, thisChangeCost) leftChar rightChar = + let (newStateVect, numNoChange, numChange) = + if inCharType == Packed2 + then median2Word64 andOR2 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) + else + if inCharType == Packed4 + then median2Word64 andOR4 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) + else + if inCharType == Packed5 + then median2Word64 andOR5 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) + else + if inCharType == Packed8 + then median2Word64 andOR8 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) + else + if inCharType == Packed64 + then median2Word64 andOR64 (snd3 $ packedNonAddPrelim leftChar) (snd3 $ packedNonAddPrelim rightChar) + else error ("Character type " <> show inCharType <> " unrecognized/not implemented") + + -- this for PMDL/ML costs + newCost = + if thisNoChangeCost == 0.0 + then inCharWeight * (fromIntegral numChange) + else + if not adjustNoCost + then inCharWeight * ((thisChangeCost * (fromIntegral numChange)) + (thisNoChangeCost * (fromIntegral numNoChange))) + else + let adjustedNoChangeNumber = numChange + (2 * numNoChange) + in inCharWeight * ((thisChangeCost * (fromIntegral numChange)) + (thisNoChangeCost * (fromIntegral adjustedNoChangeNumber))) + + newCharacter = + emptyCharacter + { packedNonAddPrelim = (snd3 $ packedNonAddPrelim leftChar, newStateVect, snd3 $ packedNonAddPrelim rightChar) + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in -- trace ("MSP: " <> (show inCharType) <> " " <> (show (numChange, numNoChange)) <> (show newCost)) $ + -- trace ("M2P: " <> (showBitsV $ (snd3 . packedNonAddPrelim) leftChar) <> " " <> (showBitsV $ (snd3 . packedNonAddPrelim) rightChar) <> " -> " <> (showBitsV $ (snd3 . packedNonAddPrelim) newCharacter) <> " at cost " <> (show newCost)) + newCharacter + + +{- | median2PackedUnionField takes two characters of packedNonAddTypes +and retuns new character data based on 2-median and cost +-} +median2PackedUnionField ∷ Bool → CharType → Double → (Double, Double) → CharacterData → CharacterData → CharacterData +median2PackedUnionField adjustNoCost inCharType inCharWeight (thisNoChangeCost, thisChangeCost) leftChar rightChar = + let (newStateVect, numNoChange, numChange) = + if inCharType == Packed2 + then median2Word64 andOR2 (packedNonAddUnion leftChar) (packedNonAddUnion rightChar) + else + if inCharType == Packed4 + then median2Word64 andOR4 (packedNonAddUnion leftChar) (packedNonAddUnion rightChar) + else + if inCharType == Packed5 + then median2Word64 andOR5 (packedNonAddUnion leftChar) (packedNonAddUnion rightChar) + else + if inCharType == Packed8 + then median2Word64 andOR8 (packedNonAddUnion leftChar) (packedNonAddUnion rightChar) + else + if inCharType == Packed64 + then median2Word64 andOR64 (packedNonAddUnion leftChar) (packedNonAddUnion rightChar) + else error ("Character type " <> show inCharType <> " unrecognized/not implemented") + + -- this for PMDL/ML costs + newCost = + if thisNoChangeCost == 0.0 + then inCharWeight * (fromIntegral numChange) + else + if not adjustNoCost + then inCharWeight * ((thisChangeCost * (fromIntegral numChange)) + (thisNoChangeCost * (fromIntegral numNoChange))) + else + let adjustedNoChangeNumber = numChange + (2 * numNoChange) + in inCharWeight * ((thisChangeCost * (fromIntegral numChange)) + (thisNoChangeCost * (fromIntegral adjustedNoChangeNumber))) + + newCharacter = + emptyCharacter + { packedNonAddUnion = newStateVect + , localCost = newCost + , globalCost = newCost + globalCost leftChar + globalCost rightChar + } + in newCharacter + + +{- | unionPacked returns character that is the union (== OR) for bit packed characters +of the final fields as preliminary and final +-} +unionPacked ∷ CharacterData → CharacterData → CharacterData +unionPacked charL charR = + let newVect = UV.zipWith (.|.) (packedNonAddFinal charL) (packedNonAddFinal charR) + in emptyCharacter + { packedNonAddPrelim = (newVect, newVect, newVect) + , packedNonAddFinal = newVect + } + + +{- +andOrN functions derived from White and Holland 2011 +-} + +{- +-- | andOR2 Packed2 modified from Goloboff 2002 +-- this is incomplete-- Goloboff uses look up table for on bits to length +andOR2 :: Word64 -> Word64 -> (Word64, Int) +andOR2 x y = + let x1 = x .&. y + c1 = xor mask2B (shiftR ((x1 .&. mask2B) .|. (x1 .&. mask2A)) 1) + c2 = c1 .|. (shiftL c1 1) + newState = x1 .|. c2 + numChanges = lookUpLegnth((c1 .|. (shiftR c1 31)) .&. 0xFFFFFFFF) + in + (newState, numChanges) + -} + +-- For all biut packed characters--post order median 2 +-- for now--for somwe reason either mis-diagnosed or mis-coded (from White and Holland 2011) +-- the "on" bit in u is reflective of number of intersections not unions. +-- hence subtracting the number of unions from numbers of characters +-- determined by leading OFF bits since packing will likely have ragged edges +-- no not always the pack-able number + +-- | median2Word64 driver function for median of two PackedN states +median2Word64 + ∷ (Word64 → Word64 → (Word64, Int, Int)) → UV.Vector Word64 → UV.Vector Word64 → (UV.Vector Word64, Int, Int) +median2Word64 andOrFun leftVect rightVect = + let (stateVect, noChangeVect, changeVect) = UV.unzip3 $ UV.zipWith andOrFun leftVect rightVect + in (stateVect, UV.sum noChangeVect, UV.sum changeVect) + + +-- | andOR2 and or function for Packed2 encoding +andOR2 ∷ Word64 → Word64 → (Word64, Int, Int) +andOR2 x y = + let u = shiftR ((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B) 1 + z = (x .&. y) .|. ((x .|. y) .&. ((u + mask2A) `xor` mask2B)) + + -- get number of characters by checking states (may not be full) + numEmptyBits = countLeadingZeros x --- could be y just as well + + -- shift divide by 2 states + numNonCharacters = shiftR numEmptyBits 1 + numChars = 32 - numNonCharacters + in {- + trace ("AO2 numChars:" <> (show numChars) <> " x & y:" <> (showBits $ x .&. y) <> "\nx .&. y .&. mask2A:" <> (showBits $ (x .&. y .&. mask2A)) <> "\n((x .&. y .&. mask2A) + mask2A):" <> (showBits $ ((x .&. y .&. mask2A) + mask2A)) + <> "\n:(((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)): " <> (showBits $ (((x .&. y .&. mask2A) + mask2A) .|. (x .&. y))) <> "\n:((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B):" + <> (showBits $ ((((x .&. y .&. mask2A) + mask2A) .|. (x .&. y)) .&. mask2B)) <> "\nu: " <> (showBits u) + <>"\npc: " <> (show $ popCount u) <> " x:" <> (showBits x) <> " y:" <> (showBits y) <> " => u:" <> (showBits u) <> " z:" <> (showBits z)) -- <> " mask2A:" <> (showBits mask2A) <> " mask2B:" <> (showBits mask2B)) + -} + (z, popCount u, numChars - popCount u) + + +-- | andOR4 and or function for Packed4 encoding +andOR4 ∷ Word64 → Word64 → (Word64, Int, Int) +andOR4 x y = + let u = shiftR ((((x .&. y .&. mask4A) + mask4A) .|. (x .&. y)) .&. mask4B) 3 + z = (x .&. y) .|. ((x .|. y) .&. ((u + mask4A) `xor` mask4B)) + + -- get number of characters by checking states (may not be full) + numEmptyBits = countLeadingZeros x --- could be y just as well + + -- shift divide by 4 states + numNonCharacters = shiftR numEmptyBits 2 + numChars = 16 - numNonCharacters + in (z, popCount u, numChars - popCount u) + + +{- | andOR5 and or function for Packed5 encoding +potential issue with top 4 bits--not sure on mask5B whether top 4 should be on or OFF. +can always mask top 4 with AND 0000111... (0xFFFFFFFFFFFFFFF or 1152921504606846975) +to remove bits for counting +and calcualted state +-} +andOR5 ∷ Word64 → Word64 → (Word64, Int, Int) +andOR5 x y = + let u = shiftR ((((x .&. y .&. mask5A) + mask5A) .|. (x .&. y)) .&. mask5B) 4 + z = (x .&. y) .|. ((x .|. y) .&. ((u + mask5A) `xor` mask5B)) + + -- get number of characters by checking states (may not be full) + numEmptyBits = countLeadingZeros x --- could be y just as well + + -- since top 4 bits always off (can't put anyhting in there) need to subtract those zeros + -- to get character number. Cant shift to get / 5 so integer divide + (numNonCharacters, _) = divMod (numEmptyBits - 4) 5 + numChars = 12 - numNonCharacters + in -- trace ("AO5 numChars:" <> (show numChars) <> " x & y:" <> (showBits $ x .&. y) <> " u:" <> (showBits u) <> " z:" <> (showBits z) <> " leading 0:" <> (show numEmptyBits) <> " non-chars:" <> (show numNonCharacters) <> " popCount u:" <> (show $ popCount u)) + (z, popCount u, numChars - popCount u) + + +-- | andOR8 and or function for Packed8 encoding +andOR8 ∷ Word64 → Word64 → (Word64, Int, Int) +andOR8 x y = + let u = shiftR ((((x .&. y .&. mask8A) + mask8A) .|. (x .&. y)) .&. mask8B) 7 + z = (x .&. y) .|. ((x .|. y) .&. ((u + mask8A) `xor` mask8B)) + + -- get number of characters by checking states (may not be full) + numEmptyBits = countLeadingZeros x --- could be y just as well + + -- shift divide by 8 states + numNonCharacters = shiftR numEmptyBits 3 + numChars = 8 - numNonCharacters + in (z, popCount u, numChars - popCount u) + + +{- | andOR64 and or function for Packed64 encoding +x `xor` x to make sure all 0 bits +-} +andOR64 ∷ Word64 → Word64 → (Word64, Int, Int) +andOR64 x y = + if (x .&. y) /= (x `xor` x) + then (x .&. y, 1, 0) + else (x .|. y, 0, 1) + + +{- +Functions for median3 calculations of packed types +These are used in pre-order graph traversals and final state assignment +among others. +-} + +{- | packePreorder takes character type, current node (and children preliminary assignments) +and parent final assignment and creates final assignment for current node +a bit clumsy since uses Goloboff modifications and have to do some of the preOrder pass +in Goloboff but not done here + +Doubt this could be speeded up much by parallel -- but could be done +-} +packedPreorder ∷ CharType → (UV.Vector Word64, UV.Vector Word64, UV.Vector Word64) → UV.Vector Word64 → UV.Vector Word64 +packedPreorder inCharType (leftPrelim, childPrelim, rightPrelim) parentFinal = + let newStateVect + | inCharType == Packed2 = UV.zipWith4 preOrder2 leftPrelim childPrelim rightPrelim parentFinal + | inCharType == Packed4 = UV.zipWith4 preOrder4 leftPrelim childPrelim rightPrelim parentFinal + | inCharType == Packed5 = UV.zipWith4 preOrder5 leftPrelim childPrelim rightPrelim parentFinal + | inCharType == Packed8 = UV.zipWith4 preOrder8 leftPrelim childPrelim rightPrelim parentFinal + | inCharType == Packed64 = UV.zipWith4 preOrder64 leftPrelim childPrelim rightPrelim parentFinal + | otherwise = error ("Character type " <> show inCharType <> " unrecognized/not implemented") + in newStateVect + + +{- | preOrder2 performs bitpacked Fitch preorder based on Goloboff 2002 +less efficient than it could be due to not using Goloboff for post-order +assignment so have to calculate some post-order values that would +already exist otherwise. Given that pre-order should be much less frequent than +pre-order shouldn't be that bad +-} +preOrder2 ∷ Word64 → Word64 → Word64 → Word64 → Word64 +preOrder2 leftPrelim childPrelim rightPrelim parentFinal = + -- post-order stuff to get "temp" state used to calculate final + let t = leftPrelim .&. rightPrelim + + -- preOrder values + x2 = parentFinal .&. complement childPrelim + c3 = (mask2A .&. x2) .|. shiftR (mask2B .&. x2) 1 + c4 = c3 .|. shiftL c3 1 + + finalState = (parentFinal .&. complement c4) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) + in -- trace ("PO2: " <> " in " <> (show (showBits leftPrelim, showBits childPrelim, showBits rightPrelim, showBits parentFinal)) <> "->" <> (show $ showBits finalState)) + finalState + + +-- | preOrder4 from preOrder2 but for 4 states +preOrder4 ∷ Word64 → Word64 → Word64 → Word64 → Word64 +preOrder4 leftPrelim childPrelim rightPrelim parentFinal = + -- post-order stuff to get "temp" state used to calculate final + let x1 = leftPrelim .&. rightPrelim + y1 = leftPrelim .|. rightPrelim + c1 = xor mask4E ((mask4E .&. x1) .|. shiftR (mask4D .&. x1) 1 .|. shiftR (mask4C .&. x1) 2 .|. shiftR (mask4B .&. x1) 3) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 + t = c2 .|. y1 + + -- preOrder values + x2 = parentFinal .&. complement childPrelim + c3 = (mask4E .&. x2) .|. shiftR (mask4D .&. x2) 1 .|. shiftR (mask4C .&. x2) 2 .|. shiftR (mask4B .&. x2) 3 + c4 = c3 .|. shiftL c3 1 .|. shiftL c3 2 .|. shiftL c3 3 + + finalState = (parentFinal .&. complement c4) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) + in finalState + + +-- | preOrder5 from preOrder2 but for 5 states +preOrder5 ∷ Word64 → Word64 → Word64 → Word64 → Word64 +preOrder5 leftPrelim childPrelim rightPrelim parentFinal = + -- post-order stuff to get "temp" state used to calculate final + let x1 = leftPrelim .&. rightPrelim + y1 = leftPrelim .|. rightPrelim + c1 = + xor + mask5F + ( (mask5F .&. x1) + .|. shiftR (mask5E .&. x1) 1 + .|. shiftR (mask5D .&. x1) 2 + .|. shiftR (mask5C .&. x1) 3 + .|. shiftR (mask5B .&. x1) 4 + ) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 .|. shiftL c1 4 + t = c2 .|. y1 + + -- preOrder values + x2 = parentFinal .&. complement childPrelim + c3 = + (mask5F .&. x2) + .|. shiftR (mask5E .&. x2) 1 + .|. shiftR (mask5D .&. x2) 2 + .|. shiftR (mask5C .&. x2) 3 + .|. shiftR (mask5B .&. x2) 4 + c4 = c3 .|. shiftL c3 1 .|. shiftL c3 2 .|. shiftL c3 3 .|. shiftL c3 4 + + finalState = (parentFinal .&. complement c4) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) + in finalState + + +-- | preOrder8 from preOrder2 but for 8 states +preOrder8 ∷ Word64 → Word64 → Word64 → Word64 → Word64 +preOrder8 leftPrelim childPrelim rightPrelim parentFinal = + -- post-order stuff to get "temp" state used to calculate final + let x1 = leftPrelim .&. rightPrelim + y1 = leftPrelim .|. rightPrelim + c1 = + xor + mask8I + ( (mask8I .&. x1) + .|. shiftR (mask8H .&. x1) 1 + .|. shiftR (mask8G .&. x1) 2 + .|. shiftR (mask8F .&. x1) 3 + .|. shiftR (mask8E .&. x1) 4 + .|. shiftR (mask8D .&. x1) 5 + .|. shiftR (mask8C .&. x1) 6 + .|. shiftR (mask8B .&. x1) 7 + ) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 .|. shiftL c1 4 .|. shiftL c1 5 .|. shiftL c1 6 .|. shiftL c1 7 + t = c2 .|. y1 + + -- preOrder values + x2 = parentFinal .&. complement childPrelim + c3 = + (mask8I .&. x2) + .|. shiftR (mask8H .&. x2) 1 + .|. shiftR (mask8G .&. x2) 2 + .|. shiftR (mask8F .&. x2) 3 + .|. shiftR (mask8E .&. x2) 4 + .|. shiftR (mask8D .&. x2) 5 + .|. shiftR (mask8C .&. x2) 6 + .|. shiftR (mask8B .&. x2) 7 + c4 = c1 .|. shiftL c3 1 .|. shiftL c3 2 .|. shiftL c3 3 .|. shiftL c3 4 .|. shiftL c3 5 .|. shiftL c3 6 .|. shiftL c3 7 + + finalState = (parentFinal .&. complement c4) .|. (c4 .&. (childPrelim .|. parentFinal .&. t)) + in finalState + + +-- | preOrder64 performs simple Fitch preorder ("up-pass") on Word64 +preOrder64 ∷ Word64 → Word64 → Word64 → Word64 → Word64 +preOrder64 leftPrelim childPrelim rightPrelim parentFinal = + let a = parentFinal .&. complement childPrelim + b = leftPrelim .&. rightPrelim + c = parentFinal .|. childPrelim + d = childPrelim .|. (parentFinal .&. (leftPrelim .|. rightPrelim)) + in if a == (leftPrelim `xor` leftPrelim ∷ Word64) + then parentFinal + else + if b == (leftPrelim `xor` leftPrelim ∷ Word64) + then c + else d + + +{- +Functions for hard-wired 3-way optimization + basically + C & P1 & P2 -> if not 0 + else (C & P1) | (C & P2) | (P1 & P2) -> if not 0 + else C | P1 | P2 + bit operations based on Goloboff (2002) for trichotomous trees +-} +-- \| threeWayPacked median 3 for hard-wired networks +-- this is based on Goloboff (2002) for trichotomous trees +threeWayPacked ∷ CharType → UV.Vector Word64 → UV.Vector Word64 → UV.Vector Word64 → UV.Vector Word64 +threeWayPacked inCharType parent1 parent2 curNode = + let newStateVect + | inCharType == Packed2 = UV.zipWith3 threeWay2 parent1 parent2 curNode + | inCharType == Packed4 = UV.zipWith3 threeWay4 parent1 parent2 curNode + | inCharType == Packed5 = UV.zipWith3 threeWay5 parent1 parent2 curNode + | inCharType == Packed8 = UV.zipWith3 threeWay8 parent1 parent2 curNode + | inCharType == Packed64 = UV.zipWith3 threeWay64 parent1 parent2 curNode + | otherwise = error ("Character type " <> show inCharType <> " unrecognized/not implemented") + in newStateVect + + +{- | threeWayPacked' median 3 for hard-wired networks +this uses lists of masks so likely slower than Goloboff +this approach could also be used for min/max to be simpler but alos likelu slower since previous is +manually unrolled +-} +threeWayPacked' ∷ CharType → UV.Vector Word64 → UV.Vector Word64 → UV.Vector Word64 → UV.Vector Word64 +threeWayPacked' inCharType parent1 parent2 curNode = + let newStateVect + | inCharType == Packed2 = UV.zipWith3 (threeWayNWord64 packed2SubCharList) parent1 parent2 curNode + | inCharType == Packed4 = UV.zipWith3 (threeWayNWord64 packed4SubCharList) parent1 parent2 curNode + | inCharType == Packed5 = UV.zipWith3 (threeWayNWord64 packed5SubCharList) parent1 parent2 curNode + | inCharType == Packed8 = UV.zipWith3 (threeWayNWord64 packed8SubCharList) parent1 parent2 curNode + | inCharType == Packed64 = UV.zipWith3 threeWay64 parent1 parent2 curNode + | otherwise = error ("Character type " <> show inCharType <> " unrecognized/not implemented") + in newStateVect + + +{- | threeWayNWord64 3-way hardwired optimization for Packed N Word64 +non-additive character--maps over sub-characters with appropriate masks +lists of subcharacters with all ovther bits OFF are created via masks +then zipped over threeway function and ORed to create 32 bit final state +this is an alternate approach to the three node optimization of Golobiff below. +both should yield same result-- this is polymoprhic and simple--but not parallel +as in Goloboff so likely slower +-} +threeWayNWord64 ∷ [Word64] → Word64 → Word64 → Word64 → Word64 +threeWayNWord64 packedSubCharList p1 p2 cN = + let p1SubCharList = fmap (p1 .&.) packedSubCharList + p2SubCharList = fmap (p2 .&.) packedSubCharList + cNSubCharList = fmap (cN .&.) packedSubCharList + threeWayList = zipWith3 threeWay64 p1SubCharList p2SubCharList cNSubCharList + in L.foldl1' (.|.) threeWayList + + +{- | threeWay2 3-way hardwired optimization for Packed2 Word64 +but used on subCharacters +-} +threeWay2 ∷ Word64 → Word64 → Word64 → Word64 +threeWay2 p1 p2 cN = + let x = p1 .&. p2 .&. cN + y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) + z = p1 .|. p2 .|. cN + c1 = xor mask2B ((mask2B .&. x) .|. shiftR (mask2A .&. x) 1) + d1 = xor mask2B ((mask2B .&. y) .|. shiftR (mask2A .&. y) 1) + c2 = c1 .|. shiftL c1 1 + d2 = d1 .|. shiftL d1 1 + newState = x .|. (y .&. c2) .|. (z .&. d2) + in newState + + +-- | threeWay4 3-way hardwired optimization for Packed4 Word64 +threeWay4 ∷ Word64 → Word64 → Word64 → Word64 +threeWay4 p1 p2 cN = + let x = p1 .&. p2 .&. cN + y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) + z = p1 .|. p2 .|. cN + c1 = xor mask4B ((mask4B .&. x) .|. shiftR (mask4C .&. x) 1 .|. shiftR (mask4D .&. x) 2 .|. shiftR (mask4E .&. x) 3) + d1 = xor mask4B ((mask4B .&. y) .|. shiftR (mask4C .&. y) 1 .|. shiftR (mask4D .&. y) 2 .|. shiftR (mask4E .&. y) 3) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 + d2 = d1 .|. shiftL d1 1 .|. shiftL d1 2 .|. shiftL d1 3 + newState = x .|. (y .&. c2) .|. (z .&. d2) + in newState + + +-- | threeWay5 3-way hardwired optimization for Packed5 Word64 +threeWay5 ∷ Word64 → Word64 → Word64 → Word64 +threeWay5 p1 p2 cN = + let x = p1 .&. p2 .&. cN + y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) + z = p1 .|. p2 .|. cN + c1 = + xor + mask5B + ((mask5B .&. x) .|. shiftR (mask5C .&. x) 1 .|. shiftR (mask5D .&. x) 2 .|. shiftR (mask5E .&. x) 3 .|. shiftR (mask5F .&. x) 4) + d1 = + xor + mask5B + ((mask5B .&. y) .|. shiftR (mask5C .&. y) 1 .|. shiftR (mask5D .&. y) 2 .|. shiftR (mask5E .&. y) 3 .|. shiftR (mask5F .&. y) 4) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 .|. shiftL c1 4 + d2 = d1 .|. shiftL d1 1 .|. shiftL d1 2 .|. shiftL d1 3 .|. shiftL d1 4 + newState = x .|. (y .&. c2) .|. (z .&. d2) + in newState + + +-- | threeWay8 3-way hardwired optimization for Packed8 Word64 +threeWay8 ∷ Word64 → Word64 → Word64 → Word64 +threeWay8 p1 p2 cN = + let x = p1 .&. p2 .&. cN + y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) + z = p1 .|. p2 .|. cN + c1 = + xor + mask8B + ( (mask8B .&. x) + .|. shiftR (mask8C .&. x) 1 + .|. shiftR (mask8D .&. x) 2 + .|. shiftR (mask8E .&. x) 3 + .|. shiftR (mask8F .&. x) 4 + .|. shiftR (mask8G .&. x) 5 + .|. shiftR (mask8H .&. x) 6 + .|. shiftR (mask8I .&. x) 7 + ) + d1 = + xor + mask8B + ( (mask8B .&. y) + .|. shiftR (mask8C .&. y) 1 + .|. shiftR (mask8D .&. y) 2 + .|. shiftR (mask8E .&. y) 3 + .|. shiftR (mask8F .&. y) 4 + .|. shiftR (mask8G .&. y) 5 + .|. shiftR (mask8H .&. y) 6 + .|. shiftR (mask8I .&. y) 7 + ) + c2 = c1 .|. shiftL c1 1 .|. shiftL c1 2 .|. shiftL c1 3 .|. shiftL c1 4 .|. shiftL c1 5 .|. shiftL c1 6 .|. shiftL c1 7 + d2 = d1 .|. shiftL d1 1 .|. shiftL d1 2 .|. shiftL d1 3 .|. shiftL d1 4 .|. shiftL d1 5 .|. shiftL d1 6 .|. shiftL d1 7 + newState = x .|. (y .&. c2) .|. (z .&. d2) + in newState + + +-- | threeWay64 3-way hardwired optimization for straight Word64 +threeWay64 ∷ Word64 → Word64 → Word64 → Word64 +threeWay64 p1 p2 cN = + let x = p1 .&. p2 .&. cN + y = (p1 .&. p2) .|. (p1 .&. cN) .|. (p2 .&. cN) + z = p1 .|. p2 .|. cN + in if x /= (p1 `xor` p1 ∷ Word64) + then x + else + if y /= (p1 `xor` p1 ∷ Word64) + then y + else z + + +{- +Functions to encode ("pack") non-additive characters into new Word64 characters +based on their number of states +-} + +{- | getSingleCharacter takes a taxa x characters block and an index and returns the character vector for that index +resulting in a taxon by single charcater vector +-} +getSingleCharacter ∷ V.Vector (V.Vector CharacterData) → Int → V.Vector CharacterData +getSingleCharacter taxVectByCharVect charIndex = fmap (V.! charIndex) taxVectByCharVect + +{- | packData takes input data and creates a variety of bit-packed data types +to increase efficiency and reduce footprint of non-additive characters +that are encoded as bitvectors +-} +packNonAdditiveData ∷ GlobalSettings → ProcessedData → PhyG ProcessedData +packNonAdditiveData inGS (nameVect, bvNameVect, blockDataVect) = + -- need to check if this blowws out memory on big data sets (e.g. genomic) + let -- parallel setup + action ∷ BlockData → PhyG BlockData + action = recodeNonAddCharacters inGS + in do + newBlockDataList ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` V.toList blockDataVect + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (recodeNonAddCharacters inGS) (V.toList blockDataVect) -- could be an option to save memory etc + + pure (nameVect, bvNameVect, V.fromList newBlockDataList) + + +{- | recodeNonAddCharacters takes block data, goes through characters +and recodes NonAdditive. +Concat and list for charInfoV because new characters can be created +and newCharInfo then as well, could be multiple per input 'charcater' +-} +recodeNonAddCharacters ∷ GlobalSettings → BlockData → PhyG BlockData +recodeNonAddCharacters inGS (nameBlock, charDataVV, charInfoV) = + let numChars = V.length charInfoV + + -- create vector of single characters with vector of taxon data of sngle character each + singleCharVectList = V.toList $ fmap (getSingleCharacter charDataVV) (V.fromList [0 .. numChars - 1]) + + packAction ∷ (V.Vector CharacterData, CharInfo) → PhyG ([[CharacterData]], [CharInfo]) + packAction = packNonAddPair inGS + in do + -- bit pack the nonadd + result ← + getParallelChunkTraverse >>= \pTraverse → + packAction `pTraverse` zip singleCharVectList (V.toList charInfoV) + let (recodedSingleVecList, newCharInfoLL) = unzip result + -- \$ zipWith (packNonAdd inGS) singleCharVectList (V.toList charInfoV) + + -- recreate BlockData, tacxon dominant structure + let newTaxVectByCharVect = V.fromList $ fmap V.fromList $ L.transpose $ concat recodedSingleVecList + + -- trace ("RNAC: " <> (show (length recodedSingleVecList, fmap length recodedSingleVecList)) <> " -> " <> (show $ fmap length newTaxVectByCharVect) <> " " <> (show $ length $ V.fromList $ concat newCharInfoLL)) + pure (nameBlock, newTaxVectByCharVect, V.fromList $ concat newCharInfoLL) + + +-- | packNonAddPair is a wrapper arpounf packNonAdd +packNonAddPair ∷ GlobalSettings → (V.Vector CharacterData, CharInfo) → PhyG ([[CharacterData]], [CharInfo]) +packNonAddPair inGS (inCharDataV, charInfo) = packNonAdd inGS inCharDataV charInfo + + +{- | packNonAdd takes (vector of taxa) by character data and list of character information +and returns bit packed and recoded non-additive characters and charInfo +input int is character index in block +the weight is skipping because of the weight replication in reorganize +if characters have non integer weight then they were not reorganized and left +as single BV--here as well. Should be very few (if any) of them. +-} +packNonAdd ∷ GlobalSettings → V.Vector CharacterData → CharInfo → PhyG ([[CharacterData]], [CharInfo]) +packNonAdd inGS inCharDataV charInfo = + -- trace ("PNA in weight: " <> (show $ weight charInfo)) ( + if charType charInfo /= NonAdd + then pure ([V.toList inCharDataV], [charInfo]) + else -- recode non-additive characters + + let leafNonAddV = V.toList $ fmap (snd3 . stateBVPrelim) inCharDataV + + -- there is a problem with this index--they should all be the same but there are two classes in some cases + -- I believe due to missing data + -- numNonAdd = (length . head) leafNonAddV + numNonAdd = minimum $ fmap length leafNonAddV + + -- parallel setup + stateAction ∷ Int → (Int, [BV.BitVector]) + stateAction = getStateNumber leafNonAddV + + charAction ∷ (Int, [[BV.BitVector]]) → PhyG ([CharacterData], [CharInfo]) + charAction = makeStateNCharacterTuple inGS charInfo + in do + -- split characters into groups by states number 2,4,5,8,64, >64 (excluding missing) + statePar ← getParallelChunkMap + let stateNumDataPairList = statePar stateAction [0 .. numNonAdd - 1] + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (getStateNumber leafNonAddV) [0.. numNonAdd - 1] + + -- sort characters by states number (2, 4, 5, 8, 64, >64 -> 128) + let (state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL) = binStateNumber stateNumDataPairList ([], [], [], [], [], []) + + -- make new characters based on state size + result ← + getParallelChunkTraverse >>= \pTraverse → + charAction + `pTraverse` zip [2, 4, 5, 8, 64, 128] [state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL] + let (newStateCharListList, newCharInfoList) = unzip result + -- (PU.seqParMap (parStrategy $ strictParStrat inGS) (makeStateNCharacterTuple inGS charInfo) (zip [2,4,5,8,64,128] [state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL])) + + -- this here in case recoding and removing constant (with missing) characters yields no data to be bitpacked + if (L.foldl1' (&&) $ fmap null [state2CharL, state4CharL, state5CharL, state8CharL, state64CharL, state128CharL]) + then pure ([V.toList inCharDataV], [charInfo]) + else pure (newStateCharListList, concat newCharInfoList) + + +-- ) + +-- | makeStateNCharacterTuple is a wrapper for makeStateNCharacter to allow for parMap use +makeStateNCharacterTuple ∷ GlobalSettings → CharInfo → (Int, [[BV.BitVector]]) → PhyG ([CharacterData], [CharInfo]) +makeStateNCharacterTuple inGS charInfo (stateNumber, charDataLL) = makeStateNCharacter inGS charInfo stateNumber charDataLL + + +{- | makeStateNCharacter takes a list of characters each of which is a list of taxon character values and +creates a new character of all characters for give taxon and packs (64/ state number) characters into a 64 bit Word64 +via chuncksOf--or if 64, not packing, if 128 stays bitvector +check for non-sequential states (A,T) or (0,2) etc +return is list of taxa x single new (packed) character +-} +makeStateNCharacter ∷ GlobalSettings → CharInfo → Int → [[BV.BitVector]] → PhyG ([CharacterData], [CharInfo]) +makeStateNCharacter inGS charInfo stateNumber charDataLL = do + result ← + if stateNumber > 64 + then recodeBV2BV inGS charInfo charDataLL + else + if stateNumber == 64 + then recodeBV2Word64Single inGS charInfo charDataLL + else recodeBV2Word64 inGS charInfo stateNumber charDataLL + + pure result + + +{- | recodeBV2BV take a list of BV.bitvector non-add characters and creates a list (taxa) +BV non-additive characters of type NonAdd. +this results in a single character and charInfo in list so can be concatenated +and removed if empty +-} +recodeBV2BV ∷ GlobalSettings → CharInfo → [[BV.BitVector]] → PhyG ([CharacterData], [CharInfo]) +recodeBV2BV inGS charInfo charTaxBVLL = + if null charTaxBVLL + then pure ([], []) + else + let -- convert to taxon by characgter data list + newStateList = makeNewCharacterData charTaxBVLL + + -- rename with thype + newCharName = T.append (name charInfo) $ T.pack "LargeState" + + -- create new characters for each taxon + newCharDataList = fmap (makeNewData emptyCharacter) newStateList + in do + pure + ( newCharDataList + , [charInfo{name = newCharName, charType = NonAdd, noChangeCost = (fst . bcgt64) inGS, changeCost = (snd . bcgt64) inGS}] + ) + where + makeNewData a b = a{stateBVPrelim = (b, b, b), stateBVFinal = b} + + +{- | recodeBV2Word64Single take a list of BV.bitvector non-add characters and creates a list (taxa) +of Word64 unpacked non-additive characters of type Packed64. +this results in a single character and charInfo in list so can be concatenated +and removed if empty +Aassumes a leaf only sets snd3 +-} +recodeBV2Word64Single ∷ GlobalSettings → CharInfo → [[BV.BitVector]] → PhyG ([CharacterData], [CharInfo]) +recodeBV2Word64Single inGS charInfo charTaxBVLL = + if null charTaxBVLL + then pure ([], []) + else + let newCharName = T.append (name charInfo) $ T.pack "64State" + + -- convert BV to Word64 + taxWord64BLL = fmap (fmap BV.toUnsignedNumber) charTaxBVLL + + -- convert to taxon by character data lisyt + newStateList = makeNewCharacterData taxWord64BLL + + -- make new character data + newCharDataList = fmap (makeNewData emptyCharacter) newStateList + in do + pure + ( newCharDataList + , [charInfo{name = newCharName, charType = Packed64, noChangeCost = (fst . bc64) inGS, changeCost = (snd . bc64) inGS}] + ) + where + makeNewData a b = a{packedNonAddPrelim = (b, b, b), packedNonAddFinal = b} + + +{- | makeNewCharacterData takes a list of characters, each of which is a list of taxon states +of type a (bitvector or Word64) and returns a list of taxa each of which is a vector +of type a charactyer data +generic verctor so can have Unboxed V and Boxed V +-} +makeNewCharacterData ∷ (GV.Vector v a) ⇒ [[a]] → [v a] +makeNewCharacterData charByTaxSingleCharData = + let taxonByCharL = L.transpose charByTaxSingleCharData + taxonByCharV = fmap GV.fromList taxonByCharL + in taxonByCharV + + +{- | recodeBV2Word64 take a list of BV.bitvector non-add characters and the states number of creates +Word64 representaions where subcharacters are created and shifted to proper positions and ORd +to create packed reresentation--new character types Packed2, Packed4, Packed5, and Packed8. +this results in a single character and charInfo in list so can be concatenated +and removed if empty +-} +recodeBV2Word64 ∷ GlobalSettings → CharInfo → Int → [[BV.BitVector]] → PhyG ([CharacterData], [CharInfo]) +recodeBV2Word64 inGS charInfo stateNumber charTaxBVLL = + -- trace ("Enter RBV2W64 In: " <> (show stateNumber) <> " " <> (show (length charTaxBVLL, fmap length charTaxBVLL))) ( + if null charTaxBVLL + then pure ([], []) + else + let newCharType = + if stateNumber == 2 + then Packed2 + else + if stateNumber == 4 + then Packed4 + else + if stateNumber == 5 + then Packed5 + else + if stateNumber == 8 + then Packed8 + else error ("State number " <> (show stateNumber) <> " not to be packed in recodeBV2Word64") + + newCharName = T.append (name charInfo) $ T.pack ((show stateNumber) <> "State") + + -- get number of characters that can be packed into Word64 for that state number + numCanPack = fst $ divMod 64 stateNumber + + -- convert to taxon by character data list + taxCharBVLL = L.transpose charTaxBVLL + + -- parallel setup + stateAction ∷ [BV.BitVector] → [Int] + stateAction = getStateIndexList + + packAction ∷ [[Int]] → [BV.BitVector] → CharacterData + packAction = packIntoWord64 stateNumber numCanPack + in do + -- get state index list for all characters (could be non sequential 0|2; A|T etc) + statePar ← getParallelChunkMap + let stateIndexLL = statePar stateAction charTaxBVLL + -- PU.seqParMap (parStrategy $ strictParStrat inGS) getStateIndexList charTaxBVLL + + -- convert data each taxon into packedWord64 + packPar ← getParallelChunkMap + let packedDataL = packPar (packAction stateIndexLL) taxCharBVLL + -- PU.seqParMap (parStrategy $ strictParStrat inGS) (packIntoWord64 stateNumber numCanPack stateIndexLL) taxCharBVLL + + -- get noChange and Change cost for char type + let (lNoChangeCost, lChangeCost) = + if stateNumber == 2 + then bc2 inGS + else + if stateNumber == 4 + then bc4 inGS + else + if stateNumber == 5 + then bc5 inGS + else + if stateNumber == 8 + then bc8 inGS + else error ("Change/NoChange costs for state number " <> (show stateNumber) <> " not to be set in recodeBV2Word64") + + -- trace ("RBV2W64 Out: " <> (show $ fmap (snd3 . packedNonAddPrelim) packedDataL)) + -- trace ("RBV2W64: " <> (show (lNoChangeCost, lChangeCost))) + pure + (packedDataL, [charInfo{name = newCharName, charType = newCharType, noChangeCost = lNoChangeCost, changeCost = lChangeCost}]) + + +-- ) + +{- | packIntoWideState takes a list of bitvectors for a taxon, the state number and number that can be packed into +a WideState and performs appropriate bit settting and shifting to create WideState +paralle looked to bag out here +-} +packIntoWord64 ∷ Int → Int → [[Int]] → [BV.BitVector] → CharacterData +packIntoWord64 stateNumber numToPack stateCharacterIndexL inBVList = + -- get packable chunk of bv and correcsponding state indices + let packBVList = SL.chunksOf numToPack inBVList + packIndexLL = SL.chunksOf numToPack stateCharacterIndexL + + -- pack each chunk + packedWordVect = UV.fromList $ zipWith (makeWord64FromChunk stateNumber) packIndexLL packBVList + in emptyCharacter + { packedNonAddPrelim = (packedWordVect, packedWordVect, packedWordVect) + , packedNonAddFinal = packedWordVect + } + + +{- | makeWord64FromChunk takes a list (= chunk) of bitvectors and creates bit subcharacter (Word64) +with adjacent bits for each BV in chunk. It then bit shifts the appropriate number of bits for each member +of the chunk and finally ORs all (64/stateNumber) Word64s to make the final packed representation +-} +makeWord64FromChunk ∷ Int → [[Int]] → [BV.BitVector] → Word64 +makeWord64FromChunk stateNumber stateIndexLL bvList = + if null bvList + then (0 ∷ Word64) + else + let subCharacterList = zipWith3 (makeSubCharacter stateNumber) stateIndexLL bvList [0 .. (length bvList - 1)] + in -- trace ("MW64FC: " <> (show subCharacterList) <> " " <> (show $ L.foldl1' (.|.) subCharacterList)) + L.foldl1' (.|.) subCharacterList + + +{- | makeSubCharacter makes sub-character (ie only those bits for states) from single bitvector and shifts appropriate number of bits +to make Word64 with sub character bits set and all other bits OFF and in correct bit positions for that sub-character +-} +makeSubCharacter ∷ Int → [Int] → BV.BitVector → Int → Word64 +makeSubCharacter stateNumber stateIndexList inBV subCharacterIndex = + -- trace ("Making sub character:" <> (show stateNumber <> " " <> (show stateIndexList) <> " " <> (show subCharacterIndex) <> (show inBV))) ( + let -- get bit of state indices + bitStates = fmap (testBit inBV) stateIndexList + + -- get index of states when only minimally bit encoded (0101, 0001 -> 11, 01) + newBitStates = setOnBits (0 `xor` 0 ∷ Word64) bitStates 0 + subCharacter = shiftL newBitStates (subCharacterIndex * stateNumber) + in -- trace ("MSC: " <> (show subCharacterIndex) <> " " <> (show bitStates) <> " " <> (show newBitStates) <> " " <> (show subCharacter)) $ + -- cna remove this check when working + + if length stateIndexList `notElem` [(fst (divMod 2 stateNumber) + 1) .. stateNumber] + then error ("State number of index list do not match: " <> show (stateNumber, length stateIndexList, stateIndexList)) + else subCharacter + + +-- ) +-- ) + +-- | setOnBits recursively sets On bits in a list of Bool +setOnBits ∷ Word64 → [Bool] → Int → Word64 +setOnBits baseVal onList bitIndex = + let c = L.uncons onList + in if isNothing c + then baseVal + else -- if null onList then baseVal + + let (a, b) = fromJust c + -- newVal = if head onList then setBit baseVal bitIndex + newVal = + if a + then setBit baseVal bitIndex + else baseVal + in setOnBits newVal b (bitIndex + 1) + + +-- setOnBits newVal (tail onList) (bitIndex + 1) + +{- | getStateIndexList takes list of list bit vectors and for each taxon for a given bv character +and returns a list of +bit indices of states in the bv this because states can be non-seqeuntial (0|3) +used to have a list of all states used (ON) in a character in all taxa +-} +getStateIndexList ∷ [BV.BitVector] → [Int] +getStateIndexList taxBVL = + if null taxBVL + then [] + else + let inBV = L.foldl1' (.|.) taxBVL + onList = fmap (testBit inBV) [0 .. finiteBitSize inBV - 1] + onIndexPair = zip onList [0 .. finiteBitSize inBV - 1] + indexList = (snd <$> filter fst onIndexPair) + in -- trace ("GSIL: " <> (show indexList)) + indexList + + +{- | binStateNumber takes a list of pairs of char states number and data column as list of bitvectors and +into list for 2,4,5,8,64,>64 +-} +binStateNumber + ∷ [(Int, [BV.BitVector])] + → ([[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]]) + → ([[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]], [[BV.BitVector]]) +binStateNumber inPairList (cur2, cur4, cur5, cur8, cur64, cur128) = + let c = L.uncons inPairList + in if isNothing c + then -- dont' really need to reverse here but seems hygenic + -- trace ("Recoding NonAdditive Characters : " <> (show (length cur2, length cur4, length cur5, length cur8, length cur64, length cur128))) + (L.reverse cur2, L.reverse cur4, L.reverse cur5, L.reverse cur8, L.reverse cur64, L.reverse cur128) + else + let (a, b) = fromJust c + (stateNum, stateData) = a + in -- skip--constant + if stateNum < 2 + then binStateNumber b (cur2, cur4, cur5, cur8, cur64, cur128) + else + if stateNum == 2 + then binStateNumber b (stateData : cur2, cur4, cur5, cur8, cur64, cur128) + else + if stateNum <= 4 + then binStateNumber b (cur2, stateData : cur4, cur5, cur8, cur64, cur128) + else + if stateNum <= 5 + then binStateNumber b (cur2, cur4, stateData : cur5, cur8, cur64, cur128) + else + if stateNum <= 8 + then binStateNumber b (cur2, cur4, cur5, stateData : cur8, cur64, cur128) + else + if stateNum <= 64 + then binStateNumber b (cur2, cur4, cur5, cur8, stateData : cur64, cur128) + else binStateNumber b (cur2, cur4, cur5, cur8, cur64, stateData : cur128) + + +{- | getStateNumber returns the number of uniqe (non missing) states for a 'column' +of nonadd bitvector values +the charState values are in ranges for 2,4,5,8,64 and bigger numbers +missingVal not takeen from alphabet size since that was not updated in reorganize. +So take OR of all on bits--may be non-sequential--ie 0 2 7 so need to watch that. +returns pair of stateNUmber class (2,4,5,8,64, >64 as 128) and list of states +for efficient glueing back together later +checks for min length vchars to make those missing-- can happen with implied alignment recoding +-} +getStateNumber ∷ [V.Vector BV.BitVector] → Int → (Int, [BV.BitVector]) +getStateNumber characterDataVV characterIndex = + -- trace ("GSN:" <> (show characterIndex) <> " " <> (show $ fmap V.length characterDataVV) <> "\n\t" <> (show $ fmap (V.! characterIndex) characterDataVV)) ( + if null characterDataVV + then (0, []) + else + let thisCharV = fmap (V.! characterIndex) characterDataVV + missingVal = L.foldl1' (.|.) thisCharV + nonMissingStates = filter (/= missingVal) thisCharV + nonMissingBV = L.foldl1' (.|.) nonMissingStates + numStates = popCount nonMissingBV + + -- this turns off non-missing bits + thisCharL = fmap (.&. nonMissingBV) thisCharV + in -- trace ("GSN:" <> (show nonMissingStates) <> "\nNMBV " <> (show nonMissingBV) <> " MBV " <> (show missingVal) <> " -> " <> (show numStates) ) ( + ( if null nonMissingStates || (numStates == 1) + then (1, []) + else + ( if numStates == 2 + then (2, thisCharL) + else + if numStates <= 4 + then (4, thisCharL) + else + if numStates == 5 + then (5, thisCharL) + else + if numStates <= 8 + then (8, thisCharL) + else + if numStates <= 64 + then (64, thisCharL) + else (128, thisCharL) + ) + ) + +-- ) -- ) diff --git a/src/Input/BitPackingDimension.hs b/src/Input/BitPackingDimension.hs new file mode 100644 index 000000000..4caa4db67 --- /dev/null +++ b/src/Input/BitPackingDimension.hs @@ -0,0 +1,65 @@ +module BitPackingDimension where + + +data BitDimension + = BitDim0 + | BitDim1 + | BitDim2 + | BitDim3 + | BitDim4 + | BitDim5 + | BitDim6 + | BitDimN + deriving (Bounded, Eq, Ord) + + +instance Enum BitDimension where + fromEnum BitDim0 = 0 + fromEnum BitDim1 = 1 + fromEnum BitDim2 = 2 + fromEnum BitDim3 = 3 + fromEnum BitDim4 = 4 + fromEnum BitDim5 = 5 + fromEnum BitDim6 = 6 + fromEnum BitDimN = maxBound + + + toEnum i = case i `mod` 7 of + 0 -> BitDim0 + 1 -> BitDim1 + 2 -> BitDim2 + 3 -> BitDim3 + 4 -> BitDim4 + 5 -> BitDim5 + 6 -> BitDim6 + _ -> BitDimN + + +instance Show BitDimension where + show d = + let getExp BitDim0 = '⁰' + getExp BitDim1 = '¹' + getExp BitDim2 = '²' + getExp BitDim3 = '³' + getExp BitDim4 = '⁴' + getExp BitDim5 = '⁵' + getExp BitDim6 = '⁶' + getExp BitDimN = 'ⁿ' + in '❬' : '2' : getExp d : "❭" + + +bitWidth :: BitDimension -> Maybe Word +bitWidth BitDimN = Nothing +bitWidth bitDimV = Just . (2 ^) $ fromEnum bitDimV + + +computeDimension :: Word -> BitDimension +computeDimension x + | x > 2 ^ 6 = BitDimN + | x > 2 ^ 5 = BitDim6 + | x > 2 ^ 4 = BitDim5 + | x > 2 ^ 3 = BitDim4 + | x > 2 ^ 2 = BitDim3 + | x > 2 ^ 1 = BitDim2 + | x > 2 ^ 0 = BitDim1 + | otherwise = BitDim0 diff --git a/src/Input/DataTransformation.hs b/src/Input/DataTransformation.hs new file mode 100644 index 000000000..f248f9e1c --- /dev/null +++ b/src/Input/DataTransformation.hs @@ -0,0 +1,1097 @@ +{-# LANGUAGE OverloadedStrings #-} + +{- | +Module with functionality to transform phylogenetic data +-} +module Input.DataTransformation ( + renameData, + getDataTerminalNames, + addMissingTerminalsToInput, + checkDuplicatedTerminals, + createNaiveData, + createBVNames, + partitionSequences, + missingAligned, + setMissingBits, + removeAllMissingCharacters, + checkLeafMissingData, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Coerce (coerce) +import Data.Alphabet +import Data.Alphabet.Codec +import Data.Alphabet.IUPAC +import Data.Alphabet.Special +import Data.Bifunctor +import Data.Bimap (Bimap) +import Data.Bimap qualified as BM +import Bio.DynamicCharacter.Element qualified as Elem +import Data.BitVector.LittleEndian qualified as BV +import Data.BitVector.LittleEndian (BitVector) +import Data.Bits +import Data.Char qualified as C +import Data.Foldable +import Data.Functor (($>)) +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.List.NonEmpty qualified as NE +import Data.Maybe +import Data.String +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import Data.Vector.Generic qualified as GV +import Foreign.C.Types +import GeneralUtilities +import Numeric.Natural +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Text.Read +import Types.Types +import Utilities.Utilities qualified as U + +import Debug.Trace + +{- | checkLeafMissingData checks missing data in inputs +missing data is defined as not in an input file +if missing number / numInputFiles > threshold then put in list of +leaves to exclude +-} +checkLeafMissingData ∷ Int → [RawData] → [NameText] +checkLeafMissingData theshold inDataList = + if theshold == 100 + then [] + else + let numInputFiles = length inDataList + criterion = (fromIntegral (100 - theshold)) / 100.0 + minOccurence = (ceiling $ criterion * (fromIntegral numInputFiles)) ∷ Int + leafList = fmap fst $ concat $ fmap fst inDataList + groupedLeafList = L.group $ L.sort leafList + leafOccurence = fmap length groupedLeafList + leafNameOccurencePair = zip (fmap head groupedLeafList) leafOccurence + leafToExclude = fmap fst $ filter ((< minOccurence) . snd) leafNameOccurencePair + in -- trace ("CLMD :" <> (show (theshold, numInputFiles, criterion, minOccurence, leafOccurence, leafToExclude))) $ + leafToExclude + + +{- | removeAllMissingCharacters removes characters from list in rawData if all taxa are missing +this can happen when taxa are renamed or added in terminals file +only checks a list length of 1 basically a sequence character +static chars passed on +-} +removeAllMissingCharacters ∷ RawData → PhyG [RawData] +removeAllMissingCharacters inData@(termData, charData) = + let lengthCheck = exists . filter (> 0) $ length . snd <$> termData + -- check for non-single sequence character + multipleSeqChar = length charData /= 1 || ((charType $ head charData) `elem` exactCharacterTypes) + warnMessage = + unwords + [ "Input file" + , T.unpack . name $ head charData + , "contains all missing data (perhaps due to renaming or adding/deleting terminals) and has been skipped." + , "\n" + ] + result + | multipleSeqChar || lengthCheck = pure [inData] + | otherwise = logWith LogWarn warnMessage $> [] + in result + + +{- | partitionSequences takes a character to split sequnces, usually '#'' as in POY, but can be changed +and divides the seqeunces into corresponding partitions. Replicate character info appending +a number to character name +assumes that input rawdata are a single character (as in form a single file) for sequence data +-} +partitionSequences ∷ ST.ShortText → [RawData] → PhyG [RawData] +partitionSequences partChar inDataList = + if null inDataList + then do + pure [] + else + let firstRawData@(taxDataList, charInfoList) = head inDataList + in -- for raw seqeunce data this will always be a single character + if (length charInfoList > 1) || (charType (head charInfoList) `notElem` sequenceCharacterTypes) + then do + restStuff ← partitionSequences partChar (tail inDataList) + pure $ firstRawData : restStuff + else + ( let (leafNameList, leafDataList) = unzip taxDataList + partitionCharList = fmap (U.splitSequence partChar) leafDataList + partitionCharListByPartition = makePartitionList partitionCharList + firstPartNumber = length $ head partitionCharList + allSame = filter (== firstPartNumber) $ length <$> tail partitionCharList + pairPartitions = zip (fmap T.unpack leafNameList) (fmap length partitionCharList) + in -- check partition numbers consistent + 1 because of tail + if (length allSame + 1) /= length partitionCharList + then + errorWithoutStackTrace + ( "Number of sequence partitions not consistent in " + <> T.unpack (name $ head charInfoList) + <> " " + <> (show pairPartitions) + <> "\n\tThis may be due to occurence of the partition character '" + <> (show partChar) + <> "' in sequence data. This value can be changed with the 'set' command." + ) + else -- if single partition then nothing to do + + if firstPartNumber == 1 + then do + restStuff ← partitionSequences partChar (tail inDataList) + pure $ firstRawData : restStuff + else -- split data + + -- make new structures to create RawData list + + let leafNameListList = replicate firstPartNumber leafNameList + + -- these filtered from terminal partitions + leafDataListList = fmap (fmap (filter (/= partChar))) partitionCharListByPartition + + -- create TermData + newTermDataList = joinLists leafNameListList leafDataListList + + -- filter out taxa with empty data-- so can be reconciled proerly later + newTermDataList' = fmap removeTaxaWithNoData newTermDataList + + -- create final [RawData] + charInfoListList = replicate firstPartNumber charInfoList + newRawDataList = zip newTermDataList' charInfoListList + in do + logWith LogInfo ("\nPartitioning " <> T.unpack (name $ head charInfoList) <> " into " <> show firstPartNumber <> " segments\n") + restStuff ← partitionSequences partChar (tail inDataList) + -- trace (" NCI " <> (show $ newTermDataList')) + pure $ newRawDataList <> restStuff + ) + + +{- | removeTaxaWithNoData takes a single TermData list and removes taxa with empty data +these can be created from paritioning sequences where there are no data in a +partitition. This allows for data reconciliation/renaming later. +-} +removeTaxaWithNoData ∷ [TermData] → [TermData] +removeTaxaWithNoData inTermData = + if null inTermData + then [] + else + let newData = filter (exists . snd) inTermData + in -- trace ((show $ length inTermData) <> " -> " <> (show $ length newData)) + newData + + +{- | joinLists takes two lists of lists (of same length) and zips the +heads of each, then continues till all joined +-} +joinLists ∷ [[a]] → [[b]] → [[(a, b)]] +joinLists listA listB + | length listA /= length listB = error ("Input lists not equal " <> show (length listA, length listB)) + | null listA = [] + | otherwise = + let firstList = zip (head listA) (head listB) + in firstList : joinLists (tail listA) (tail listB) + + +-- | makePartitionList take list by taxon and retuns list by partition +makePartitionList ∷ [[[ST.ShortText]]] → [[[ST.ShortText]]] +makePartitionList inListList = + if null $ head inListList + then [] + else + let firstParList = fmap head inListList + in firstParList : makePartitionList (fmap tail inListList) + + +{- | renameData takes a list of rename Text pairs (new name, oldName) +and replaces the old name with the new +-} +renameData ∷ [(T.Text, T.Text)] → RawData → RawData +renameData newNamePairList inData = + if null newNamePairList + then inData + else + let terminalData = fst inData + in if null terminalData + then inData + else + let newTerminalData = fmap (relabelterminalData newNamePairList) terminalData + in (newTerminalData, snd inData) + + +{- | relabelterminalData takes a list of Text pairs and the terminals with the +second name in the pairs is changed to the first +-} +relabelterminalData ∷ [(T.Text, T.Text)] → TermData → TermData +relabelterminalData namePairList terminalData@(leafName, leafData) = + if null namePairList + then terminalData + else + let foundName = find ((== leafName) . snd) namePairList + in if isNothing foundName + then -- trace ("Not renaming " <> (T.unpack leafName)) -- <> " " <> show namePairList) + terminalData + else -- trace ("Renaming " <> (T.unpack leafName) <> " to " <> (T.unpack $ fst $ fromJust foundName)) + (fst $ fromJust foundName, leafData) + + +{- | getDataTerminalNames takes all input data and gets full terminal list +and adds missing data for terminals not in input files +-} +getDataTerminalNames ∷ [RawData] → [T.Text] +getDataTerminalNames inDataList = + if null inDataList + then [] + else L.sort $ L.nub $ fst <$> concatMap fst inDataList + + +-- | addMissingTerminalsToInput dataLeafNames renamedData +addMissingTerminalsToInput ∷ NonEmpty T.Text → [TermData] → RawData → RawData +addMissingTerminalsToInput dataLeafNames@(firstLeafName :| xs) curTermData inData@(termDataList, charInfoList) = + let rData = case find ((== firstLeafName) . fst) termDataList of + Just vs → vs : curTermData + Nothing → (firstLeafName, []) : curTermData + in case xs of + [] → (reverse rData, charInfoList) + y : ys → addMissingTerminalsToInput (y :| ys) rData inData + + +-- | checkDuplicatedTerminals takes list TermData and checks for repeated terminal names +checkDuplicatedTerminals ∷ [TermData] → (Bool, [T.Text]) +checkDuplicatedTerminals inData = + if null inData + then (False, []) + else + let nameList = L.group $ L.sort $ fmap fst inData + dupList = filter ((> 1) . length) nameList + in if null dupList + then (False, []) + else (True, fmap head dupList) + + +{- | joinSortFileData takes list of list of short text and merges line by line to join leaf states +and sorts the result +-} +joinSortFileData ∷ [[ST.ShortText]] → [String] +joinSortFileData inFileLists = + if null (head inFileLists) + then [] + else + let -- changed sort order (now by data file input, more or less) to reduce time complexity + -- should still be label invariant + -- firstLeaf = L.sort $ ST.toString $ ST.concat $ fmap head inFileLists + firstLeaf = ST.toString $ ST.concat $ fmap head inFileLists + in -- firstLeaf = show $ L.sort $ fmap head inFileLists + + firstLeaf : joinSortFileData (fmap tail inFileLists) + + +{- | createBVNames takes input data, sorts the raw data, hashes, sorts those to create +unique, label invariant (but data related so arbitrary but consistent) +Assumes the rawData come in sorted by the data reconciliation process +These used for vertex labels, caching, left/right DO issues +-} +createBVNames ∷ [RawData] → [(T.Text, BitVector)] +createBVNames inDataList = + let rawDataList = fmap fst inDataList + textNameList = fst <$> head rawDataList + textNameList' = fst <$> last rawDataList + + fileLeafCharList = fmap (fmap snd) rawDataList + fileLeafList = fmap (fmap ST.concat) fileLeafCharList + leafList = reverse $ joinSortFileData fileLeafList + + -- hash not guaranteed to be stable over OS or library version + -- leafHash = fmap H.hash leafList + leafHash = leafList + leafHashPair = L.sortOn fst $ zip leafHash [0 .. (length textNameList - 1)] -- textNameList + (_, leafReoderedList) = unzip leafHashPair + -- leafOrder = sortOn fst $ zip leafReoderedList [0..((length textNameList) - 1)] + -- (nameList, intList) = unzip leafOrder + + -- bv1 = BV.bitVec (length textNameList) (1 :: Integer) + boolList = replicate (length textNameList - 1) False + bv1 = BV.fromBits $ True : boolList + bvList = fmap (shiftL bv1) leafReoderedList -- [0..((length textNameList) - 1)] + in if textNameList /= textNameList' + then error "Taxa are not properly ordered in createBVNames" + else -- trace (show $ fmap BV.toBits bvList) + zip textNameList bvList + + +{- | createNaiveData takes input RawData and transforms to "Naive" data. +these data are organized into blocks (set to input filenames initially) +and are bitvector coded, but are not organized by character type, packed ot +optimized in any other way (prealigned-> nonadd, Sankoff. 2 state sankoff to binary, +constant characters skipped etc) +these processes take place later +these data can be input to any data optimization commands and are useful +for data output as they haven't been reordered or transformed in any way. +the RawData is a list since it is organized by input file +the list accumulator is to avoid Vector snoc/cons O(n) +-} +createNaiveData ∷ GlobalSettings → [RawData] → [(T.Text, BitVector)] → [BlockData] → PhyG ProcessedData +createNaiveData inGS inDataList leafBitVectorNames curBlockData = + -- trace ("CND: " <> (show $ (optimalityCriterion inGS))) $ + if null inDataList + then -- trace ("Naive data with " <> (show $ length curBlockData) <> " blocks and " <> (show $ fmap length $ fmap V.head $ fmap snd3 curBlockData) <> " characters") + + pure + ( V.fromList $ fmap fst leafBitVectorNames + , V.fromList $ fmap snd leafBitVectorNames + , V.fromList $ reverse curBlockData + ) + else + let (firstData, firstCharInfo) = head inDataList + in -- empty file should have been caught earlier, but avoids some head/tail errors + if null firstCharInfo + then -- trace "Empty CharInfo" + createNaiveData inGS (tail inDataList) leafBitVectorNames curBlockData + else -- process data as come in--each of these should be from a single file + -- and initially assigned to a single, unique block + + let thisBlockName = name $ head firstCharInfo + -- thisBlockName = T.append (T.pack "block-") (name $ head firstCharInfo) + thisBlockCharInfo = V.fromList firstCharInfo + maxCharacterLength = maximum $ fmap length $ (fmap snd firstData) + recodedCharacters = recodeRawData (fmap fst firstData) (fmap snd firstData) firstCharInfo maxCharacterLength [] + -- thisBlockGuts = V.zip (V.fromList $ fmap snd leafBitVectorNames) recodedCharacters + previousBlockName = + if not $ null curBlockData + then fst3 $ head curBlockData + else T.empty + thisBlockName' = + if T.takeWhile (/= '#') previousBlockName /= T.takeWhile (/= '#') thisBlockName + then thisBlockName + else + let oldSuffix = T.dropWhile (/= '#') previousBlockName + indexSuffix = + if T.null oldSuffix + then T.pack "#0" + else + let oldIndex = readMaybe (T.unpack $ T.tail oldSuffix) ∷ Maybe Int + newIndex = 1 + fromJust oldIndex + in if isNothing oldIndex + then error "Bad suffix in createNaiveData" + else T.pack ("#" <> show newIndex) + in T.append (T.takeWhile (/= '#') thisBlockName) indexSuffix + + thisBlockCharInfo'' = V.zipWith (resetAddNonAddAlphabets recodedCharacters) thisBlockCharInfo (V.fromList [0 .. (V.length thisBlockCharInfo - 1)]) + + -- create "orginal" character info for later use in outputs after character recoding and transformation etc. + thisBlockCharInfo' = fmap setOrigCharInfo thisBlockCharInfo'' + + -- recode with appropriate missing data codes + recodedCharacters' = fmap (recodeNonAddMissingBlock thisBlockCharInfo') recodedCharacters + + -- reweight characters by NCM (Tuffley and Steel, 1997) factor (does not include root factor--must be separately calculated) + thisBlockCharInfoNCM = + if (optimalityCriterion inGS) == NCM + then fmap reweightNCM thisBlockCharInfo' + else thisBlockCharInfo' + + thisBlockData = (thisBlockName', recodedCharacters', thisBlockCharInfoNCM) + + (prealignedDataEqualLength, nameMinPairList, nameNonMinPairList) = checkPrealignedEqualLength (fmap fst leafBitVectorNames) thisBlockData + in -- trace ("CND:" <> (show $ fmap length $ (fmap snd firstData))) ( + if not prealignedDataEqualLength + then + errorWithoutStackTrace + ( "Error on input of prealigned sequence characters in file " + <> (takeWhile (/= '#') $ T.unpack thisBlockName') + <> "--not equal length [(Taxon, Length)]: \nMinimum length taxa: " + <> (show nameMinPairList) + <> "\nNon Minimum length taxa: " + <> (show nameNonMinPairList) + ) + else + logWith LogInfo ("Recoding input block: " <> T.unpack thisBlockName' <> "\n") + *> createNaiveData inGS (tail inDataList) leafBitVectorNames (thisBlockData : curBlockData) + + +-- | reweightNCM takes character info and reweights via NCM (Tuffley and Steel, 1997) -log_10 1/(alphabet size) +reweightNCM ∷ CharInfo → CharInfo +reweightNCM inCharInfo = + let originalWeight = weight inCharInfo + alphabetSize = length $ alphabet inCharInfo + newWeight = originalWeight * (-1.0) * logBase 10 (1.0 / (fromIntegral alphabetSize)) + in -- trace ("RWNCM: " <> (show (originalWeight, newWeight))) + inCharInfo{weight = newWeight} + + +{- | setOrigCharInfo takes fields from charInfo and sets the initial original charcter infomatin field +as a singleton Vector +-} +setOrigCharInfo ∷ CharInfo → CharInfo +setOrigCharInfo inCharInfo = + let origData = (name inCharInfo, charType inCharInfo, alphabet inCharInfo) + in inCharInfo{origInfo = V.singleton origData} + + +-- | recodeAddNonAddMissing takes Block data and recodes missing for additive and non-additive characters +recodeNonAddMissingBlock ∷ V.Vector CharInfo → V.Vector CharacterData → V.Vector CharacterData +recodeNonAddMissingBlock blockCharInfo singleTaxonBlockData = + V.zipWith recodeNonAddMissingCharacter blockCharInfo singleTaxonBlockData + + +-- | recodeAddNonAddMissingCharacter recodes additive and non-additive missing data +recodeNonAddMissingCharacter ∷ CharInfo → CharacterData → CharacterData +recodeNonAddMissingCharacter charInfo inCharData = + let inCharType = charType charInfo + in if inCharType /= NonAdd + then inCharData + else + let nonAddState = (V.head . snd3 . stateBVPrelim) inCharData + newState = + if nonAddState == (nonAddState `xor` nonAddState) + then V.singleton (complement nonAddState) + else V.singleton nonAddState + in if nonAddState == (nonAddState `xor` nonAddState) + then + inCharData + { stateBVPrelim = (newState, newState, newState) + , stateBVFinal = newState + } + else inCharData + + +{- | getAddNonAddAlphabets takes recoded character data and resets the alphabet +field in charInfo to reflect observed states. This is used to properly set missing and +bit packing values +-} +resetAddNonAddAlphabets ∷ V.Vector (V.Vector CharacterData) → CharInfo → Int → CharInfo +resetAddNonAddAlphabets taxonByCharData charInfo charIndex = + let inCharType = charType charInfo + in if inCharType `notElem` [Add, NonAdd] + then charInfo + else + if inCharType == NonAdd + then + let -- get actual states + inCharV = fmap (V.head . snd3 . stateBVPrelim) $ fmap V.head taxonByCharData + missingVal = BV.fromBits $ L.replicate (fromEnum . BV.dimension . V.head . snd3 . stateBVPrelim . V.head $ V.head taxonByCharData) True + -- missingVal = V.foldl1' (.|.) inCharV + nonMissingBV = V.foldl1' (.|.) $ V.filter (/= missingVal) inCharV + + -- max in case of all missing character + numStates = max 1 $ popCount nonMissingBV + + -- numBits = UV.length . coerce $ (V.head . snd3 . stateBVPrelim) $ (V.head taxonByCharData) V.! charIndex + foundSymbols = ST.fromString . show <$> (0 :| [1 .. pred numStates]) + stateAlphabet = fromSymbols foundSymbols -- fromSymbolsWOGap foundSymbols + in -- trace ("RNA: " <> (show stateAlphabet)) + charInfo{alphabet = stateAlphabet} + else + if inCharType == Add + then + let (minRangeL, maxRangeL) = V.unzip $ fmap (V.head . snd3 . rangePrelim) $ fmap (V.! charIndex) taxonByCharData + + minRange = + if minimum minRangeL < (maxBound ∷ Int) + then minimum minRangeL + else 0 + + maxRange = + if maximum maxRangeL > (minBound ∷ Int) + then maximum maxRangeL + else 0 + + foundSymbols = ST.fromString . show <$> (minRange :| [succ minRange .. maxRange]) + stateAlphabet = fromSymbols foundSymbols -- fromSymbolsWOGap foundSymbols + in if maxRange < minRange + then error ("Error in processing of additive character states " <> (show (minRange, maxRange))) + else -- trace ("RA: " <> (show (minimum minRangeL, maximum maxRangeL)) <> " -> " <> (show (minRange, maxRange)) <> " " <> (show foundSymbols)) -- <> " -> " <> (show stateAlphabet)) + charInfo{alphabet = stateAlphabet} + else error ("Unrecognized character type in resetAddNonAddAlphabets: " <> (show inCharType)) + + +{- | checkPrealignedEqualLength checks prealigned type for equal length +at this stage (called before reblocking) there should only be a single charcter per block +but more general--if not great if > 1 character with naming +-} +checkPrealignedEqualLength + ∷ [T.Text] → (NameText, V.Vector (V.Vector CharacterData), V.Vector CharInfo) → (Bool, [(T.Text, Int)], [(T.Text, Int)]) +checkPrealignedEqualLength nameTextList (_, taxByCharacterDataVV, charInfoV) = + let numCharsIndexList = [0 .. (V.length charInfoV) - 1] + sameLengthPairList = zipWith (verifyPrealignedCharacterLength nameTextList taxByCharacterDataVV) (V.toList charInfoV) numCharsIndexList + badOnes = filter ((== False) . fst3) sameLengthPairList + in if null badOnes + then (True, [], []) + else (False, concat $ fmap snd3 badOnes, concat $ fmap thd3 badOnes) + + +{- | verifyPrealignedCharacterLength takes an index for character and examines theat character--if prealigned checks for +equal length if prealigned then "True" +-} +verifyPrealignedCharacterLength + ∷ [T.Text] → V.Vector (V.Vector CharacterData) → CharInfo → Int → (Bool, [(T.Text, Int)], [(T.Text, Int)]) +verifyPrealignedCharacterLength nameTextList taxByCharacterDataVV charInfo charIndex = + let inCharType = charType charInfo + inCharV = fmap (V.! charIndex) taxByCharacterDataVV + in if inCharType `notElem` prealignedCharacterTypes + then (True, [], []) + else -- True for IA field use but doesn't matter in this case since looking at prealigned fields + + let prealigedDataLengthList = V.toList $ fmap (U.getCharacterLength' True charInfo) inCharV + {- + if inCharType == AlignedSlim then V.toList $ fmap SV.length $ fmap (snd3 . alignedSlimPrelim) inCharV + else if inCharType == AlignedWide then V.toList $ fmap UV.length $ fmap (snd3 . alignedWidePrelim) inCharV + else if inCharType == AlignedHuge then V.toList $ fmap V.length $ fmap (snd3 . alignedHugePrelim) inCharV + else error ("Character type " <> show inCharType <> " unrecongized/not implemented") + -} + + nameLengthPairList = zip nameTextList prealigedDataLengthList + lMinLength = minimum prealigedDataLengthList + haveMinLength = filter ((== lMinLength) . snd) nameLengthPairList + notMinMinLength = filter ((/= lMinLength) . snd) nameLengthPairList + in -- all min length then all same length + -- trace ("VPCL:" <> (show $ (haveMinLength, notMinMinLength))) ( + if null notMinMinLength + then (True, [], []) + else (False, haveMinLength, notMinMinLength) + + +-- ) + +{- | recodeRawData takes the ShortText representation of character states/ranges etc +and recodes the appropriate fields in CharacterData (from Types) +the list accumulator is to avoid Vector cons/snoc O(n) +differentiates between seqeunce type and others with char info +-} +recodeRawData ∷ [NameText] → [[ST.ShortText]] → [CharInfo] → Int → [[CharacterData]] → V.Vector (V.Vector CharacterData) +recodeRawData inTaxNames inData inCharInfo maxCharLength curCharData = + -- trace ("RRD: ") ( + if null inTaxNames + then V.fromList $ reverse $ fmap V.fromList curCharData + else + let firstData = head inData + firstDataRecoded = createLeafCharacter inCharInfo firstData maxCharLength + in -- trace ("RRD:" <> (show firstData)) + -- trace ("Recoding " <> (T.unpack $ head inTaxNames) <> " as " <> (show $ charType $ head inCharInfo) <> "\n\t" <> show firstDataRecoded) + -- trace ((show $ length inData) <> " " <> (show $ length firstData) <> " " <> (show $ length inCharInfo) + recodeRawData (tail inTaxNames) (tail inData) inCharInfo maxCharLength (firstDataRecoded : curCharData) + + +-- ) + +-- | missingNonAdditive is non-additive missing character value, all 1's based on alphabet size +missingNonAdditive ∷ CharInfo → CharacterData +missingNonAdditive inCharInfo = + let missingChar = V.singleton (BV.fromBits $ replicate (length $ alphabet inCharInfo) True) + in emptyCharacter + { stateBVPrelim = (missingChar, missingChar, missingChar) + , stateBVFinal = missingChar + } + + +-- | missingAdditive is additive missing character value, all 1's based on alphabet size +missingAdditive ∷ CharInfo → CharacterData +missingAdditive inCharInfo = + let minState' = readMaybe (ST.toString . head . toList $ alphabet inCharInfo) ∷ Maybe Int + maxState' = readMaybe (ST.toString . last . toList $ alphabet inCharInfo) ∷ Maybe Int + minState = + if isNothing minState' + then 0 + else fromJust minState' + maxState = + if isNothing maxState' + then 0 + else fromJust maxState' + + missingRange = + V.zip + (V.singleton minState) + (V.singleton maxState) + in emptyCharacter + { rangePrelim = (missingRange, missingRange, missingRange) + , rangeFinal = missingRange + } + + +{- | missingMatrix is additive missing character value, all 1's based on alphabet size +setrting stateBVPrelim/Final for approx DO-like costs (lookup) +-} +missingMatrix ∷ CharInfo → CharacterData +missingMatrix inCharInfo = + let numStates = length $ alphabet inCharInfo + missingState = (0 ∷ StateCost, [], []) + in emptyCharacter + { matrixStatesPrelim = V.singleton (V.replicate numStates missingState) + , matrixStatesFinal = V.singleton (V.replicate numStates missingState) + } + + +-- | getMissingValue takes the character type and returns the appropriate missing data value +getMissingValue ∷ [CharInfo] → Int → [CharacterData] +getMissingValue inChar maxCharLength + | null inChar = [] + | charType (head inChar) `elem` nonExactCharacterTypes = [emptyCharacter] -- [] + | charType (head inChar) `elem` prealignedCharacterTypes = [missingAligned (head inChar) maxCharLength] + | charType (head inChar) == NonAdd = missingNonAdditive (head inChar) : getMissingValue (tail inChar) maxCharLength + | charType (head inChar) == Add = missingAdditive (head inChar) : getMissingValue (tail inChar) maxCharLength + | charType (head inChar) == Matrix = missingMatrix (head inChar) : getMissingValue (tail inChar) maxCharLength + | otherwise = error ("Datatype " <> show (charType $ head inChar) <> " not recognized") + + +{- | missingAligned creates missing data (all bits on) for prealigned data +important n ot to sett all bits--then run of in to seg fault land +-} +missingAligned ∷ CharInfo → Int → CharacterData +missingAligned inChar charLength = + let alphabetLen = length $ alphabet inChar + missingVals x = GV.replicate charLength $ setMissingBits x 0 alphabetLen + missingChar x = let v = missingVals x in ( v, v, v ) + in case charType inChar of + AlignedSlim -> emptyCharacter{alignedSlimPrelim = missingChar (0 ∷ SlimState) } + AlignedWide -> emptyCharacter{alignedWidePrelim = missingChar (0 ∷ WideState) } + AlignedHuge -> emptyCharacter{alignedHugePrelim = missingChar (fromNumber (toEnum alphabetLen) 0 ∷ HugeState) } + val -> error $ unwords [ "Datatype", show val, "not recognized" ] + + +-- | setMissingBits sets the first bits by index to '1' rest left as is (0 on input) +setMissingBits ∷ (Show a, FiniteBits a) ⇒ a → Int → Int → a +setMissingBits inVal curIndex alphSize + | curIndex == alphSize = inVal + | otherwise = setMissingBits (setBit inVal curIndex) (curIndex + 1) alphSize + + +{- | getStateBitVectorList takes the alphabet of a character ([ShorText]) +and returns bitvectors (with of size alphabet) for each state in order of states in alphabet +-} +getStateBitVectorList ∷ Alphabet ST.ShortText → V.Vector (ST.ShortText, BitVector) +getStateBitVectorList localStates + | null localStates = error "Character with empty alphabet in getStateBitVectorList" + | otherwise = + let stateCount = toEnum $ length localStates + stateIndexList = V.fromList [0 .. stateCount - 1] + genNum = (2 ^) ∷ Word → Natural + bvList ∷ V.Vector BitVector + bvList = (BV.fromNumber stateCount . genNum) <$> stateIndexList + in + --trace ("GSBVL: " <> (show bvList)) $ + V.zip (alphabetSymbols localStates) bvList + -- V.zip (alphabetStateNames localStates) bvList + + +iupacToBVPairs + ∷ (IsString s, Ord s) + ⇒ Alphabet s + → Bimap (NonEmpty s) (NonEmpty s) + → V.Vector (s, BitVector) +iupacToBVPairs inputAlphabet iupac = V.fromList $ bimap NE.head encoder <$> BM.toAscList iupac + where + constructor = flip BV.fromNumber (0 ∷ Int) + encoder = encodeState inputAlphabet constructor + + +{- | nucleotideBVPairs for recoding DNA sequences +this done to insure not recalculating everything for each base +-} +nucleotideBVPairs ∷ V.Vector (ST.ShortText, BitVector) +nucleotideBVPairs = iupacToBVPairs baseAlphabet iupacToDna + where + baseAlphabet = fromSymbols $ ST.fromString <$> "-" :| ["A", "C", "G", "T"] + + +{- | aminoAcidBVPairs for recoding protein sequences +this done to insure not recalculating everything for each residue +B, Z, X, ? for ambiguities +-} +aminoAcidBVPairs ∷ V.Vector (ST.ShortText, BitVector) +aminoAcidBVPairs = iupacToBVPairs acidAlphabet iupacToAminoAcid + where + acidAlphabet = + fromSymbols $ + fromString + <$> "A" :| ["C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "-"] + + +{- | getBVCode take a Vector of (ShortText, BV) and returns bitvector code for +ShortText state +-} +getBVCode ∷ V.Vector (ST.ShortText, BitVector) → ST.ShortText → BitVector +getBVCode bvCodeVect inState = + let newCode = V.find ((== inState) . fst) bvCodeVect + in maybe (error ("State " <> ST.toString inState <> " not found in bitvect code " <> show bvCodeVect)) snd newCode + + +getNucleotideSequenceChar ∷ Bool → [ST.ShortText] → [CharacterData] +getNucleotideSequenceChar isPrealigned stateList = + let sequenceVect + | null stateList = mempty + | otherwise = + SV.fromList $ + BV.toUnsignedNumber . getBVCode nucleotideBVPairs <$> (fmap ST.pack $ fmap (fmap C.toUpper) $ fmap ST.unpack stateList) + newSequenceChar = + if not isPrealigned + then + emptyCharacter + { slimPrelim = sequenceVect + , slimGapped = (sequenceVect, sequenceVect, sequenceVect) + } + else + emptyCharacter + { alignedSlimPrelim = (sequenceVect, sequenceVect, sequenceVect) + } + in [newSequenceChar] + + +getAminoAcidSequenceChar ∷ Bool → [ST.ShortText] → [CharacterData] +getAminoAcidSequenceChar isPrealigned stateList = + let sequenceVect + | null stateList = mempty + | otherwise = + UV.fromList $ + BV.toUnsignedNumber . getBVCode aminoAcidBVPairs <$> (fmap ST.pack $ fmap (fmap C.toUpper) $ fmap ST.unpack stateList) + newSequenceChar = + if not isPrealigned + then + emptyCharacter + { widePrelim = sequenceVect + , wideGapped = (sequenceVect, sequenceVect, sequenceVect) + } + else + emptyCharacter + { alignedWidePrelim = (sequenceVect, sequenceVect, sequenceVect) + } + in [newSequenceChar] + + +{- | getGeneralBVCode take a Vector of (ShortText, BV) and returns bitvector code for +ShortText state. These states can be ambiguous as in general sequences +so states need to be parsed first +the AND to all states makes ambiguityoes only observed states +-} +getGeneralBVCode ∷ V.Vector (ST.ShortText, BitVector) → ST.ShortText → (SlimState, WideState, HugeState) +getGeneralBVCode bvCodeVect inState = + let inStateString = ST.toString inState + alphabetLen = toEnum $ length bvCodeVect + construct x = (BV.toUnsignedNumber x, BV.toUnsignedNumber x, Elem.fromNumber alphabetLen $ BV.toUnsignedNumber x) + in -- if '[' `notElem` inStateString then --single state + --trace ("GGBVC: " <> (show (alphabetLen, bvCodeVect, inState))) $ + if (head inStateString /= '[') && (last inStateString /= ']') -- single state + then + let newCode = V.find ((== inState) . fst) bvCodeVect + allBVStates = V.foldl1' (.|.) (fmap snd bvCodeVect) + bvDimension = fromEnum . BV.dimension . snd $ V.head bvCodeVect + in case newCode of + Just (_,x) -> construct x + Nothing -> case inState of + + {- + -- B is Aspartic Acid or Asparagine if '-' =0 then states 3 and 12. + "B" -> + let x = BV.fromBits $ (replicate 3 False) <> [True] <> (replicate 8 False) <> [True] <> (replicate (bvDimension - 13) False) + in construct x + -- any amino acid but not '-' + "X" -> + let x = allBVStates .&. BV.fromBits (False : (replicate (bvDimension - 1) True)) + in construct x + -} + -- any state including '-' + "?" -> + let x = allBVStates .&. (BV.fromBits (replicate bvDimension True)) + in construct x + + _ -> error $ unwords [ "State", ST.toString inState, "not found in bitvect code", show bvCodeVect ] + + else + let statesStringList = words $ tail $ init inStateString + stateList = fmap ST.fromString statesStringList + maybeBVList = fmap getBV stateList + stateBVList = fmap (snd . fromJust) maybeBVList + ambiguousBVState = foldr1 (.|.) stateBVList + in if Nothing `elem` maybeBVList + then error ("Ambiguity group " <> inStateString <> " contained states not found in bitvect code " <> show bvCodeVect) + else construct ambiguousBVState + where + getBV s = V.find ((== s) . fst) bvCodeVect + + +{- | getGeneralSequenceChar encode general (ie not nucleotide or amino acid) sequences +as bitvectors. Main difference with getSequenceChar is in dealing with ambiguities +they need to be parsed and "or-ed" differently +need to have all three preliminary fields populated for some reason--prob shouldn't need that +-} +getGeneralSequenceChar ∷ CharInfo → [ST.ShortText] → [CharacterData] +getGeneralSequenceChar inCharInfo stateList = + --trace ("GGSC-1" <> (show (charType inCharInfo, alphabet inCharInfo, stateList, getStateBitVectorList $ alphabet inCharInfo))) $ + let cType = charType inCharInfo + -- isAligned = prealigned inCharInfo + stateBVPairVect :: V.Vector (ST.ShortText, BitVector) + stateBVPairVect = getStateBitVectorList $ alphabet inCharInfo + (slimVec, wideVec, hugeVec) + | not $ null stateList = + (\(x, y, z) → (SV.fromList $ toList x, UV.fromList $ toList y, z)) . V.unzip3 . V.fromList $ + fmap (getGeneralBVCode stateBVPairVect) stateList + | otherwise = (mempty, mempty, mempty) + newSequenceChar = + emptyCharacter + { slimPrelim = if cType `elem` [SlimSeq, NucSeq] then slimVec else mempty + , slimGapped = if cType `elem` [SlimSeq, NucSeq] then (slimVec, slimVec, slimVec) else (mempty, mempty, mempty) + , slimFinal = if cType `elem` [SlimSeq, NucSeq] then slimVec else mempty + , widePrelim = if cType `elem` [WideSeq, AminoSeq] then wideVec else mempty + , wideGapped = if cType `elem` [WideSeq, AminoSeq] then (wideVec, wideVec, wideVec) else (mempty, mempty, mempty) + , wideFinal = if cType `elem` [WideSeq, AminoSeq] then wideVec else mempty + , hugePrelim = if cType == HugeSeq then hugeVec else mempty + , hugeGapped = if cType == HugeSeq then (hugeVec, hugeVec, hugeVec) else (mempty, mempty, mempty) + , hugeFinal = if cType == HugeSeq then hugeVec else mempty + , alignedSlimPrelim = if cType `elem` [AlignedSlim] then (slimVec, slimVec, slimVec) else (mempty, mempty, mempty) + , alignedSlimFinal = if cType `elem` [AlignedSlim] then slimVec else mempty + , alignedWidePrelim = if cType `elem` [AlignedWide] then (wideVec, wideVec, wideVec) else (mempty, mempty, mempty) + , alignedWideFinal = if cType `elem` [AlignedWide] then wideVec else mempty + , alignedHugePrelim = if cType `elem` [AlignedHuge] then (hugeVec, hugeVec, hugeVec) else (mempty, mempty, mempty) + , alignedHugeFinal = if cType `elem` [AlignedHuge] then hugeVec else mempty + } + in --trace ("GGSC-2" <> (show stateBVPairVect)) $ + [newSequenceChar] + + +{- | getStateBitVector takes the alphabet of a character ([ShortText]) +and returns then bitvectorfor that state in order of states in alphabet +-} +getStateBitVector ∷ Alphabet ST.ShortText → ST.ShortText → BitVector +getStateBitVector localAlphabet = encodeState localAlphabet (const constructor) . (: []) + where + constructor :: BitVector + constructor = + let x = bit $ length localAlphabet - 1 + in + x `xor` x + + +-- getMinMaxStates takes list of strings and determines the minimum and maximum integer values +getMinMaxStates ∷ [String] → (Int, Int) → (Int, Int) +getMinMaxStates inStateStringList (curMin, curMax) = + if null inStateStringList + then (curMin, curMax) + else + let firstString = head inStateStringList + in -- missing data + if firstString == "-" || firstString == "?" + then getMinMaxStates (tail inStateStringList) (curMin, curMax) + else -- single state + + if '[' `notElem` firstString + then + let onlyInt = readMaybe firstString ∷ Maybe Int + in if isNothing onlyInt + then error ("State not an integer in getIntRange: " <> firstString) + else + let minVal = + if fromJust onlyInt < curMin + then fromJust onlyInt + else curMin + maxVal = + if fromJust onlyInt > curMax + then fromJust onlyInt + else curMax + in getMinMaxStates (tail inStateStringList) (minVal, maxVal) + else -- range of states + + let statesStringList = words $ tail $ init firstString + stateInts = fmap readMaybe statesStringList ∷ [Maybe Int] + in if Nothing `elem` stateInts + then error ("Non-integer in range " <> firstString) + else + let localMin = minimum $ fmap fromJust stateInts + localMax = maximum $ fmap fromJust stateInts + minVal = + if localMin < curMin + then localMin + else curMin + maxVal = + if localMax > curMax + then localMax + else curMax + in getMinMaxStates (tail inStateStringList) (minVal, maxVal) + + +-- getIntRange takes the local states and returns the Integer range of an additive character +-- in principle allows for > 2 states +getIntRange ∷ ST.ShortText → Alphabet ST.ShortText → (Int, Int) +getIntRange localState totalAlphabet = + let stateString = ST.toString localState + in -- single state + if (stateString == "?") || (stateString == "-") + then getMinMaxStates (ST.toString <$> toList totalAlphabet) (maxBound ∷ Int, minBound ∷ Int) + else + if '[' `notElem` stateString + then + let onlyInt = readMaybe stateString ∷ Maybe Int + in if isNothing onlyInt + then error ("State not an integer in getIntRange: " <> ST.toString localState) + else (fromJust onlyInt, fromJust onlyInt) + else -- Range of states + + let hasDash = ST.any (== '-') localState + statesStringList = + if hasDash + then fmap ST.toString $ fmap (ST.filter (`notElem` ['[', ']'])) $ ST.split (== '-') localState + else fmap (: []) $ ST.toString $ ST.filter (`notElem` ['[', ']']) localState + -- words $ tail $ init stateString + stateInts = fmap readMaybe statesStringList ∷ [Maybe Int] + in -- trace ("GIR:" <> (show localState) <> " -> " <> (show (minimum $ fmap fromJust stateInts, maximum $ fmap fromJust stateInts))) ( + if Nothing `elem` stateInts + then error ("Non-integer in range " <> ST.toString localState) + else (minimum $ fmap fromJust stateInts, maximum $ fmap fromJust stateInts) + + +-- ) + +-- | getTripleList +getTripleList ∷ MatrixTriple → MatrixTriple → [ST.ShortText] → [ST.ShortText] → [MatrixTriple] +getTripleList hasState notHasState localAlphabet stateList = + if null localAlphabet + then [] + else + let firstAlphState = head localAlphabet + in if firstAlphState `elem` stateList + then -- trace ("State " <> show firstAlphState <> " in " <> show localAlphabet) + hasState : getTripleList hasState notHasState (tail localAlphabet) stateList + else notHasState : getTripleList hasState notHasState (tail localAlphabet) stateList + + +-- | getInitialMatrixVector gets matrix vector +getInitialMatrixVector ∷ Alphabet ST.ShortText → ST.ShortText → V.Vector MatrixTriple +getInitialMatrixVector alphabet' localState = + let hasState = (0 ∷ StateCost, [], []) + notHasState = (maxBound ∷ StateCost, [], []) + localAlphabet = toList alphabet' + in let stateString = ST.toString localState + in -- single state + if '[' `notElem` stateString + then -- trace ("GIMV: " <> (show $ V.fromList $ getTripleList hasState notHasState localAlphabet [localState])) + V.fromList $ getTripleList hasState notHasState localAlphabet [localState] + else -- polylorphic/ambiguous + + let statesStringList = words $ tail $ init stateString + stateList = fmap ST.fromString statesStringList + in V.fromList $ getTripleList hasState notHasState localAlphabet stateList + + +{- | getQualitativeCharacters processes non-sequence characters (non-additive, additive, sankoff/matrix) +and recodes returning list of encoded characters +reverses order due to prepending +matrix stateBVPrelim/Final for approx matrix costs +adddded in code for ambiguities for non-additive--somehow got lost--alphabet [robbaly wrong now as well +-} +getQualitativeCharacters ∷ [CharInfo] → [ST.ShortText] → [CharacterData] → [CharacterData] +getQualitativeCharacters inCharInfoList inStateList curCharList = + if null inCharInfoList + then reverse curCharList + else + let firstCharInfo = head inCharInfoList + firstState = head inStateList + firstCharType = charType firstCharInfo + totalAlphabet = alphabet firstCharInfo + in -- single state + if firstCharType == NonAdd + then + let stateBV + | ST.length firstState == 1 = getStateBitVector (alphabet firstCharInfo) firstState + | otherwise = + let ambiguousStateST = ST.filter (`notElem` ['[', ']']) firstState + ambiguousStateString = ST.toString ambiguousStateST + stateSTList = fmap ST.singleton ambiguousStateString + stateBVList = getStateBitVector (alphabet firstCharInfo) <$> stateSTList + in -- showStuff = show (firstState, ambiguousStateST, ambiguousStateString, stateSTList, stateBVList) + + -- trace ("GQC: " <> (show ambiguousStateString) <> " " <> (show stateSTList) <> " " <> (show stateBVList)) + L.foldl1' (.|.) stateBVList + newCharacter = emptyCharacter{stateBVPrelim = (V.singleton stateBV, V.singleton stateBV, V.singleton stateBV)} + in -- trace (" -> " <> (show stateBV) <> " from " <> (show totalAlphabet)) + getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) + else + if firstCharType == Add + then + if firstState == ST.fromString "-1" + then getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (missingAdditive firstCharInfo : curCharList) + else + let (minRange, maxRange) = getIntRange firstState totalAlphabet + newCharacter = + emptyCharacter + { rangePrelim = (V.singleton (minRange, maxRange), V.singleton (minRange, maxRange), V.singleton (minRange, maxRange)) + } + in if ST.length firstState > 1 + then -- trace ("GQC: " <> show firstState) + getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) + else getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) + else + if firstCharType == Matrix + then + if firstState `elem` fmap ST.fromString ["?", "-"] + then getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (missingMatrix firstCharInfo : curCharList) + else + let initialMatrixVector = getInitialMatrixVector (alphabet firstCharInfo) firstState + newCharacter = emptyCharacter{matrixStatesPrelim = V.singleton initialMatrixVector} + in -- trace (show initialMatrixVector) ( + -- trace ((show $ alphabet firstCharInfo) <> " " <> (ST.toString firstState)) ( + -- trace ("GQC " <> (T.unpack $ name firstCharInfo) <> (show $ alphabet firstCharInfo) <> " " <> (show $ costMatrix firstCharInfo)) ( + if null (costMatrix firstCharInfo) + then + errorWithoutStackTrace + ( "\n\nMatrix character input error: No cost matrix has been specified for character " + <> T.unpack (name firstCharInfo) + <> " perhaps file " + <> (takeWhile (/= '#') $ T.unpack (name firstCharInfo)) + <> " character " + <> ((tail $ dropWhile (/= '#') $ T.unpack (name firstCharInfo))) + ) + else getQualitativeCharacters (tail inCharInfoList) (tail inStateList) (newCharacter : curCharList) + else -- ) + + error ("Character type " <> show firstCharType <> " not recongnized/implemented") + + +{- | createLeafCharacter takes rawData and Charinfo and returns CharacterData type +need to add in missing data as well +-} +createLeafCharacter ∷ [CharInfo] → [ST.ShortText] → Int → [CharacterData] +createLeafCharacter inCharInfoList rawDataList maxCharLength + | null inCharInfoList = + error "Null data in charInfoList createLeafCharacter" + | null rawDataList -- missing data + = + getMissingValue inCharInfoList maxCharLength + | otherwise = + let localCharType = charType $ head inCharInfoList + localAlphabet = alphabet $ head inCharInfoList + isNucleotideData = isAlphabetDna localAlphabet + isAminoAcidData = isAlphabetAminoAcid localAlphabet + in --trace ("CLC: " <> (show localCharType) <> " " <> (show rawDataList)) $ + if localCharType `elem` sequenceCharacterTypes + then -- in if length inCharInfoList == 1 then -- should this be `elem` sequenceCharacterTypes + case localCharType of + NucSeq → getNucleotideSequenceChar False rawDataList + AminoSeq → getAminoAcidSequenceChar False rawDataList + -- ambiguities different, and alphabet varies with character (potentially) + AlignedSlim → + if isNucleotideData + then getNucleotideSequenceChar True rawDataList + else getGeneralSequenceChar (head inCharInfoList) rawDataList + AlignedWide → + if isAminoAcidData + then getAminoAcidSequenceChar True rawDataList + else getGeneralSequenceChar (head inCharInfoList) rawDataList + AlignedHuge → getGeneralSequenceChar (head inCharInfoList) rawDataList + SlimSeq → getGeneralSequenceChar (head inCharInfoList) rawDataList + WideSeq → getGeneralSequenceChar (head inCharInfoList) rawDataList + HugeSeq → getGeneralSequenceChar (head inCharInfoList) rawDataList + _ → getQualitativeCharacters inCharInfoList rawDataList [] + else + if length inCharInfoList /= length rawDataList + then error "Mismatch in number of characters and character info" + else getQualitativeCharacters inCharInfoList rawDataList [] + + +-- ) + +exists ∷ (Foldable f) ⇒ f a → Bool +exists = not . null diff --git a/src/Input/FastAC.hs b/src/Input/FastAC.hs new file mode 100644 index 000000000..ab576c375 --- /dev/null +++ b/src/Input/FastAC.hs @@ -0,0 +1,584 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} + +{- | +Module exposing fasta/c sequence parsing and importing functionality. +-} +module Input.FastAC ( + getFastAText, + getFastaCharInfo, + getFastC, + getFastCText, + getFastcCharInfo, + genDiscreteDenseOfDimension, +) where + +import Control.DeepSeq +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Alphabet +import Data.Bits +import Data.Char qualified as C +import Data.Foldable (fold) +import Data.Hashable +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.List.NonEmpty qualified as NE +import Data.MetricRepresentation +import Data.MetricRepresentation qualified as MR +import Data.TCM (TCMDiagnosis (..), TCMStructure (..)) +import Data.TCM qualified as TCM +import Data.TCM.Dense qualified as TCMD +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Debug.Trace +import GeneralUtilities +import Input.DataTransformation qualified as DT +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as S +import Types.Types + + +{- | getAlphabet takse a list of short-text lists and returns alphabet as list of short-text +although with multicharacter alphabets that contain '[' or ']' this would be a problem, +its only used for single character alphabets in fasta formats. +'#' for partitions in fasta sequences +-} +getAlphabet ∷ [String] → [ST.ShortText] → [ST.ShortText] +getAlphabet curList inList = + let notAlphElement = ST.fromString <$> ["?", "[", "]", "#"] + in if null inList + then filter (`notElem` notAlphElement) $ fmap ST.fromString $ L.sort curList `L.union` ["-"] + else + let firstChars = fmap (: []) $ L.nub $ ST.toString $ head inList + in getAlphabet (firstChars `L.union` curList) (tail inList) + + +{- | generateDefaultMatrix takes an alphabet and generates cost matrix (assuming '-' + in already) +-} +generateDefaultMatrix ∷ Alphabet ST.ShortText → Int → Int → Int → [[Int]] +generateDefaultMatrix inAlph rowCount indelCost substitutionCost + | null inAlph = [] + | rowCount == length inAlph = [] + | otherwise = + let firstPart = + if rowCount < (length inAlph - 1) + then replicate rowCount substitutionCost + else replicate rowCount indelCost + thirdPart = + if rowCount < (length inAlph - 1) + then replicate (length inAlph - rowCount - 1 - 1) substitutionCost <> [indelCost] + else [] + in (firstPart <> [0] <> thirdPart) : generateDefaultMatrix inAlph (rowCount + 1) indelCost substitutionCost + + +{- | getFastaCharInfo get alphabet , names etc from processed fasta data +this doesn't separate ambiguities from elements--processed later +need to read in TCM or default +only for single character element sequecnes +-} +getFastaCharInfo ∷ [TermData] → String → String → Bool → ([ST.ShortText], [[Int]], Double) → PhyG (CharInfo, [TermData]) +getFastaCharInfo inData dataName dataType isPrealigned localTCM = + if null inData + then error "Empty inData in getFastaCharInfo" + else + let nucleotideAlphabet = + fmap + ST.fromString $ + ["A", "C", "G", "T", "U", "R", "Y", "S", "W", "K", "M", "B", "D", "H", "V", "N", "?", "-"] + <> (map (fmap C.toLower) ["A", "C", "G", "T", "U", "R", "Y", "S", "W", "K", "M", "B", "D", "H", "V", "N", "?", "-"]) + lAminoAcidAlphabet = + fmap + ST.fromString $ + ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y", "Z", "-", "?"] + <> (map (fmap C.toLower) ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y", "Z", "-", "?"]) + + -- onlyInNucleotides = [ST.fromString "U"] + -- onlyInAminoAcids = fmap ST.fromString ["E","F","I","L","P","Q","X","Z"] + sequenceData = getAlphabet [] $ foldMap snd inData + + seqType + | dataType == "nucleotide" {- trace ("File " <> dataName <> " is nucleotide data.") -} = NucSeq + | dataType == "aminoacid" {- trace ("File " <> dataName <> " is aminoacid data.") -} = AminoSeq + | dataType == "hugeseq" {- trace ("File " <> dataName <> " is large alphabet data.") -} = HugeSeq + | dataType == "custom_alphabet" {- trace ("File " <> dataName <> " is large alphabet data.") -} = HugeSeq + | ( sequenceData `L.intersect` nucleotideAlphabet == sequenceData {- trace ("Assuming file " <> dataName + <> " is nucleotide data. Specify `aminoacid' filetype if this is incorrect.") -} + ) = + NucSeq + | ( sequenceData `L.intersect` lAminoAcidAlphabet == sequenceData {- trace ("Assuming file " <> dataName + <> " is amino acid data. Specify `nucleotide' filetype if this is incorrect.") -} + ) = + AminoSeq + | length sequenceData <= 8 {- trace ("File " <> dataName <> " is small alphabet data.") -} = SlimSeq + | length sequenceData <= 64 {- trace ("File " <> dataName <> " is wide alphabet data.") -} = WideSeq + | otherwise {- trace ("File " <> dataName <> " is large alphabet data.") -} = HugeSeq + + seqAlphabet = fromSymbols seqSymbols + + seqSymbols = + let toSymbols = fmap ST.fromString + in case seqType of + NucSeq → toSymbols $ "A" :| ["C", "G", "T", "-"] + AminoSeq → toSymbols $ "A" :| ["C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "-"] + _ → "-" :| sequenceData + + thisAlphabet = + case fst3 localTCM of + [] → seqAlphabet + a : as → fromSymbols $ a :| as + + -- capitalize input data if NucSeq or AminoSeq + outData = + if seqType `notElem` [NucSeq, AminoSeq] + then inData + else fmap makeUpperCaseTermData inData + in --trace ("GFCI: " <> (show (dataType, sequenceData, seqType))) $ -- <> " " <> (show sequenceData)) $ + do + case seqType of + NucSeq → logWith LogInfo ("File " <> dataName <> " is nucleotide data." <> "\n") + AminoSeq → logWith LogInfo ("File " <> dataName <> " is aminoacid data." <> "\n") + HugeSeq → logWith LogInfo ("File " <> dataName <> " is large alphabet data." <> "\n") + WideSeq → logWith LogInfo ("File " <> dataName <> " is wide alphabet data." <> "\n") + SlimSeq → logWith LogInfo ("File " <> dataName <> " is slim alphabet data." <> "\n") + _ → failWithPhase Parsing ("File " <> dataName <> " is of unknown data type." <> "\n") + + processedCharInfo ← commonFastCharInfo dataName isPrealigned localTCM seqType thisAlphabet + pure (processedCharInfo, outData) + + +-- | makeUpperCaseTermData +makeUpperCaseTermData ∷ TermData → TermData +makeUpperCaseTermData (taxName, dataList) = + let newData = fmap (fmap C.toUpper) $ fmap ST.unpack dataList + in (taxName, fmap ST.pack newData) + + +-- | commonFastCharInfo breaks out common functions between fasta and fastc parsing +commonFastCharInfo ∷ String → Bool → ([ST.ShortText], [[Int]], Double) → CharType → Alphabet ST.ShortText → PhyG CharInfo +commonFastCharInfo dataName isPrealigned localTCM seqType thisAlphabet = + let localCostMatrix ∷ S.Matrix Int + localCostMatrix = + force $ + if null $ fst3 localTCM + then + let (indelCost, substitutionCost) = + if null $ snd3 localTCM + then (1, 1) + else ((head . head . snd3) localTCM, (last . head . snd3) localTCM) + in S.fromLists $ generateDefaultMatrix thisAlphabet 0 indelCost substitutionCost + else S.fromLists $ snd3 localTCM + + localCostMatrixTransformed = transformGapLastToGapFirst' localCostMatrix + + scmForSlim ∷ ∀ {a1} {a2}. (Enum a1, Enum a2) ⇒ a1 → a2 → Word + scmForSlim i j = localCostMatrixTransformed S.! (fromEnum i, fromEnum j) + + tcmWeightFactor = thd3 localTCM + + alignedSeqType + | not isPrealigned = seqType + | seqType `elem` [NucSeq, SlimSeq] = AlignedSlim + | seqType `elem` [WideSeq, AminoSeq] = AlignedWide + | seqType == HugeSeq = AlignedHuge + | otherwise = error "Unrecognozed data type in getFastcCharInfo" + + updateStandardInfo ∷ CharInfo → CharInfo + updateStandardInfo info = + info + { charType = alignedSeqType + , activity = True + , costMatrix = localCostMatrix + , name = T.pack (filter (/= ' ') dataName <> "#0") + , alphabet = thisAlphabet + , prealigned = isPrealigned + , origInfo = V.singleton (T.pack (filter (/= ' ') dataName <> "#0"), alignedSeqType, thisAlphabet) + } + + updateSlimInfo ∷ CharInfo → CharInfo + updateSlimInfo info = + let dimension = force . fromIntegral $ V.length localCostMatrix + slimMetric = TCMD.generateDenseTransitionCostMatrix 0 dimension scmForSlim + in info + { weight = tcmWeightFactor + , slimTCM = slimMetric + } + + updateWideInfo ∷ CharInfo → PhyG CharInfo + updateWideInfo info = do + (wideWeight, wideMemoTCM) ← getTCMMemo (thisAlphabet, localCostMatrixTransformed) + pure + info + { weight = tcmWeightFactor * fromRational wideWeight + , wideTCM = wideMemoTCM + } + + updateHugeInfo ∷ CharInfo → PhyG CharInfo + updateHugeInfo info = do + (hugeWeight, hugeMemoTCM) ← getTCMMemo (thisAlphabet, localCostMatrixTransformed) + pure + info + { weight = tcmWeightFactor * fromRational hugeWeight + , hugeTCM = hugeMemoTCM + } + + updateCharacterTypeInfo ∷ CharInfo → PhyG CharInfo + updateCharacterTypeInfo = case seqType of + NucSeq → pure . updateSlimInfo + SlimSeq → pure . updateSlimInfo + WideSeq → updateWideInfo + AminoSeq → updateWideInfo + HugeSeq → updateHugeInfo + val → + const . failWithPhase Parsing $ + "getFastaCharInfo: Failure proceesing the CharType: '" <> show val <> "'" + + notNullFromTCM f = not . null $ f localTCM + noNeedToEmitWarning = notNullFromTCM fst3 || notNullFromTCM snd3 + + logProcessingOfCharInfo ∷ PhyG () + logProcessingOfCharInfo + | noNeedToEmitWarning = logWith LogInfo $ fold ["Processing TCM data for file : ", dataName, "\n"] + | otherwise = + logWith LogWarn $ + fold + [ "Warning: no tcm file specified for use with fasta/c file : " + , dataName + , ". Using default, all 1 diagonal 0 cost matrix." + , "\n" + ] + in do + logProcessingOfCharInfo + emptyCharInfo >>= updateCharacterTypeInfo . updateStandardInfo + + +-- | getTCMMemo creates the memoized tcm for large alphabet sequences +getTCMMemo + ∷ ( FiniteBits b + , Hashable b + , Integral i + , MonadIO m + , NFData b + ) + ⇒ (a, S.Matrix i) + → m (Rational, MR.MetricRepresentation b) +getTCMMemo (_inAlphabet, inMatrix) = + let (coefficient, tcm) = force . TCM.fromRows $ S.getFullVects inMatrix + in do + metric ← case tcmStructure $ TCM.diagnoseTcm tcm of + NonAdditive → pure discreteMetric + Additive → pure . linearNorm . toEnum $ TCM.size tcm + _ → {-# SCC "tcmStructure_OTHER" #-} metricRepresentation tcm + pure (coefficient, metric) + + +{- | getSequenceAphabet take a list of ShortText with information and accumulatiors +For both nonadditive and additve looks for [] to denote ambiguity and splits states + if splits on spaces if there are spaces (within []) (ala fastc or multicharacter states) + else if no spaces + if non-additive then each symbol is split out as an alphabet element -- as in TNT + if is additive splits on '-' to denote range +rescales (integerizes later) additive characters with decimal places to an integer type rep +for additive charcaters if states are not nummerical then throuws an error +DOES not check for partitin characters +-} +getSequenceAphabet ∷ [ST.ShortText] → [ST.ShortText] → [ST.ShortText] +getSequenceAphabet newAlph inStates = + if null inStates + then -- removes indel gap from alphabet if present and then (re) adds at end + -- (filter (/= (ST.singleton '-')) $ sort $ nub newAlph) <> [ST.singleton '-'] + L.sort (L.nub newAlph) <> [ST.singleton '-'] + else + let firstState = ST.toString $ head inStates + in if head firstState /= '[' + then + if firstState `elem` ["?", "-"] + then getSequenceAphabet newAlph (tail inStates) + else getSequenceAphabet (head inStates : newAlph) (tail inStates) + else -- ambiguity + + let newAmbigStates = fmap ST.fromString $ words $ filter (`notElem` ['[', ']']) firstState + in getSequenceAphabet (newAmbigStates <> newAlph) (tail inStates) + + +{- | getFastcCharInfo get alphabet , names etc from processed fasta data +this doesn't separate ambiguities from elements--processed later +need to read in TCM or default +-} + +-- Not correct with default alphabet and matrix now after tcm recodeing added to low for decmials I htink. +-- only for multi-character element seqeunces +getFastcCharInfo ∷ [TermData] → String → Bool → ([ST.ShortText], [[Int]], Double) → PhyG CharInfo +getFastcCharInfo inData dataName isPrealigned localTCM = + if null inData + then error "Empty inData in getFastcCharInfo" + else -- if null $ fst localTCM then errorWithoutStackTrace ("Must specify a tcm file with fastc data for fie : " <> dataName) + + let symbolsFound + | not $ null $ fst3 localTCM = fst3 localTCM + | otherwise = getSequenceAphabet [] $ concatMap snd inData + + thisAlphabet = fromSymbols $ NE.fromList symbolsFound + + seqType = + case length thisAlphabet of + n | n <= 8 → SlimSeq + n | n <= 64 → WideSeq + _ → HugeSeq + in commonFastCharInfo dataName isPrealigned localTCM seqType thisAlphabet + + +{- | getFastAText processes fasta file +assumes single character alphabet +deletes '-' (unless "prealigned"), and spaces +-} +getFastAText ∷ T.Text → String → Bool → [TermData] +getFastAText fileContents' fileName isPreligned = + if T.null fileContents' + then errorWithoutStackTrace "\n\n'Read' command error: empty file" + else -- removes ';' comments and spaces + + let fileContents = T.unlines $ filter (not . T.null) $ T.takeWhile (/= ';') <$> T.lines fileContents' + in if T.head fileContents /= '>' + then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" + else + let terminalSplits = T.split (== '>') fileContents + pairData = getRawDataPairsFastA isPreligned (tail terminalSplits) + (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData + in -- tail because initial split will an empty text + if hasDupTerminals + then errorWithoutStackTrace ("\tInput file " <> fileName <> " has duplicate terminals: " <> show dupList) + else pairData + + +{- | getRawDataPairsFastA takes splits of Text and returns terminalName, Data pairs--minimal error checking +taxon nmame unitil finds ' ' , '$' or ';' +-} +getRawDataPairsFastA ∷ Bool → [T.Text] → [TermData] +getRawDataPairsFastA isPreligned inTextList = + if null inTextList + then [] + else + let firstText = head inTextList + firstName = + T.strip $ + T.filter (/= '"') $ + T.filter C.isPrint $ + T.takeWhile (/= ' ') $ + T.takeWhile (/= '$') $ + T.takeWhile (/= ';') $ + head $ + T.lines firstText + firstData = T.strip $ T.filter C.isPrint $ T.filter (/= ' ') $ T.concat $ tail $ T.lines firstText + firstDataNoGaps = T.filter (/= '-') firstData + firtDataSTList = fmap (ST.fromText . T.toStrict) (T.chunksOf 1 firstData) + firstDataNoGapsSTList = fmap (ST.fromText . T.toStrict) (T.chunksOf 1 firstDataNoGaps) + in -- trace (T.unpack firstName <> "\n" <> T.unpack firstData) ( + -- trace ("FA " <> (show firtDataSTList)) ( + if isPreligned + then -- trace ("GRDPF: " <> (show isPreligned)) + (firstName, firtDataSTList) : getRawDataPairsFastA isPreligned (tail inTextList) + else (firstName, firstDataNoGapsSTList) : getRawDataPairsFastA isPreligned (tail inTextList) + + +-- ) + +{- | getFastC processes fasta file +assumes spaces between alphabet elements +deletes '-' (unless "prealigned") +-} +getFastC ∷ String → String → Bool → [TermData] +getFastC fileContents' fileName isPreligned = + if null fileContents' + then errorWithoutStackTrace "\n\n'Read' command error: empty file" + else + let fileContentLines = filter (not . null) $ stripString <$> lines fileContents' + in if null fileContentLines + then + errorWithoutStackTrace + ( "File " + <> show fileName + <> " is having problems reading as 'fastc'. If this is a 'fasta' file, " + <> "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'" + ) + else -- ';' comments if in terminal name are removed by getRawDataPairsFastC--otherwise leaves in there--unless its first character of line + -- because of latexIPA encodings using ';'(and '$') + + let fileContents = unlines $ filter ((/= ';') . head) fileContentLines + in if null fileContents + then + errorWithoutStackTrace + ( "File " + <> show fileName + <> " is having problems reading as 'fastc'. If this is a 'fasta' file, " + <> "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'" + ) + else + if head fileContents /= '>' + then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" + else + let terminalSplits = T.split (== '>') $ T.pack fileContents + pairData = recodeFASTCAmbiguities fileName $ getRawDataPairsFastC isPreligned (tail terminalSplits) + (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData + in -- tail because initial split will an empty text + if hasDupTerminals + then errorWithoutStackTrace ("\tInput file " <> fileName <> " has duplicate terminals: " <> show dupList) + else pairData + + +{- | getFastCText processes fasta file +assumes spaces between alphabet elements +deletes '-' (unless "prealigned") +-} +getFastCText ∷ T.Text → String → Bool → [TermData] +getFastCText fileContents' fileName isPreligned = + if T.null fileContents' + then errorWithoutStackTrace "\n\n'Read' command error: empty file" + else + let fileContentLines = filter (not . T.null) $ fmap T.strip (T.lines fileContents') + in if null fileContentLines + then + errorWithoutStackTrace + ( "File " + <> show fileName + <> " is having problems reading as 'fastc'. If this is a 'fasta' file, " + <> "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'" + ) + else -- ';' comments if in terminal name are removed by getRawDataPairsFastC--otherwise leaves in there--unless its first character of line + -- because of latexIPA encodings using ';'(and '$') + + let fileContents = T.unlines $ filter ((/= ';') . T.head) fileContentLines + in if T.null fileContents + then + errorWithoutStackTrace + ( "File " + <> show fileName + <> " is having problems reading as 'fastc'. If this is a 'fasta' file, " + <> "prepend `fasta:' to the file name as in 'fasta:\"bleh.fas\"'" + ) + else + if T.head fileContents /= '>' + then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" + else + let terminalSplits = T.split (== '>') fileContents + pairData = recodeFASTCAmbiguities fileName $ getRawDataPairsFastC isPreligned (tail terminalSplits) + (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals pairData + in -- tail because initial split will an empty text + if hasDupTerminals + then errorWithoutStackTrace ("\tInput file " <> fileName <> " has duplicate terminals: " <> show dupList) + else pairData + + +-- | recodeFASTCAmbiguities take list of TermData and scans for ambiguous groups staring with '['' and ending with '] +recodeFASTCAmbiguities ∷ String → [TermData] → [TermData] +recodeFASTCAmbiguities fileName inData = + if null inData + then [] + else + let (firstName, firstData) = head inData + newData = concatAmbig fileName firstData + in (firstName, newData) : recodeFASTCAmbiguities fileName (tail inData) + + +{- | concatAmbig takes a list of ShortText and concatanates ambiguyous states '['X Y Z...']' into a +single Short Tex for later processing +-} +concatAmbig ∷ String → [ST.ShortText] → [ST.ShortText] +concatAmbig fileName inList = + if null inList + then [] + else + let firstGroup = ST.toString $ head inList + in -- not ambiguity group + -- trace (firstGroup <> show inList) ( + if null firstGroup + then concatAmbig fileName (tail inList) + else + if head firstGroup /= '[' + then head inList : concatAmbig fileName (tail inList) + else + let ambiguityGroup = head inList : getRestAmbiguityGroup fileName (tail inList) + in -- trace (show ambiguityGroup) + ST.concat ambiguityGroup : concatAmbig fileName (drop (length ambiguityGroup) inList) + + +-- ) + +-- | getRestAmbiguityGroup takes a list of ShorText and keeps added them until one is found with ']' +getRestAmbiguityGroup ∷ String → [ST.ShortText] → [ST.ShortText] +getRestAmbiguityGroup fileName inList = + if null inList + then errorWithoutStackTrace ("\n\n'Read' command error: fastc file " <> fileName <> " with unterminated ambiguity specification ']'") + else + let firstGroup = ST.toString $ head inList + in if ']' `notElem` firstGroup + then ST.cons ' ' (head inList) : getRestAmbiguityGroup fileName (tail inList) + else [ST.cons ' ' $ head inList] + + +{- | getRawDataPairsFastC takes splits of Text and returns terminalName, Data pairs--minimal error checking +this splits on spaces in sequences +takes taxon name until encouters '' '. '$', or ';' +-} +getRawDataPairsFastC ∷ Bool → [T.Text] → [TermData] +getRawDataPairsFastC isPreligned inTextList = + if null inTextList + then [] + else + let firstText = head inTextList + firstName = + T.strip $ + T.filter (/= '"') $ + T.filter C.isPrint $ + T.takeWhile (/= ' ') $ + T.takeWhile (/= '$') $ + T.takeWhile (/= ';') $ + head $ + T.lines firstText + firstData = T.split (== ' ') $ T.concat $ tail $ T.lines firstText + firstDataNoGaps = filter (/= "-") firstData + in -- trace (show firstData) ( + -- trace (T.unpack firstName <> "\n" <> (T.unpack $ T.intercalate " " firstData)) ( + if isPreligned + then (firstName, fmap (ST.fromText . T.toStrict) firstData) : getRawDataPairsFastC isPreligned (tail inTextList) + else (firstName, fmap (ST.fromText . T.toStrict) firstDataNoGaps) : getRawDataPairsFastC isPreligned (tail inTextList) + + +-- | add to tnt +genDiscreteDenseOfDimension + ∷ (Enum i) + ⇒ i + → TCMD.DenseTransitionCostMatrix +genDiscreteDenseOfDimension d = + let n = toEnum $ fromEnum d + r = [0 .. n - 1] + m = [[if i == j then 0 else 1 | j ← r] | i ← r] + in TCMD.generateDenseTransitionCostMatrix n n . S.getCost $ V.fromList <$> V.fromList m + + +transformGapLastToGapFirst' ∷ S.Matrix Int → S.Matrix Word +transformGapLastToGapFirst' mat = + let (n, _) = S.dim mat + m = S.getFullVects mat + + f 0 0 = m S.! (n - 1, n - 1) + f 0 j = m S.! (n - 1, j - 1) + f i 0 = m S.! (i - 1, n - 1) + f i j = m S.! (i - 1, j - 1) + in S.fromLists [[fromIntegral $ f i j | j ← [0 .. n - 1]] | i ← [0 .. n - 1]] + +{- +transformGapLastToGapFirst :: TCM.TCM -> TCM.TCM +transformGapLastToGapFirst tcm = + let n = TCM.size tcm + in TCM.generate n $ \case + (0,0) -> tcm TCM.! (n - 1, n - 1) + (0,j) -> tcm TCM.! (n - 1, j - 1) + (i,0) -> tcm TCM.! (i - 1, n - 1) + (i,j) -> tcm TCM.! (i - 1, j - 1) +-} diff --git a/src/Input/ReadInputFiles.hs b/src/Input/ReadInputFiles.hs new file mode 100644 index 000000000..558fe564b --- /dev/null +++ b/src/Input/ReadInputFiles.hs @@ -0,0 +1,798 @@ +{- | +Module exposing functionality for reading input files for phylogenetic analysis. +-} +module Input.ReadInputFiles ( + executeReadCommands, + getReadArgs, + extractInputTuple, + expandReadCommands, +) where + +import Commands.Verify qualified as V +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Char +import Data.Char qualified as C +import Data.Foldable +import Data.Graph.Inductive.Basic qualified as B +import Data.List qualified as L +import Data.Maybe +import Data.Text.Lazy qualified as T +import Data.Text.Lazy.IO qualified as TIO +import Data.Text.Short qualified as ST +import Debug.Trace +import GeneralUtilities qualified as GU +import GraphFormatUtilities qualified as GFU +import Input.FastAC qualified as FAC +import Input.TNTUtilities qualified as TNT +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import System.IO +import System.Path.Glob qualified as SPG +import Text.Read +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +{- | expandReadCommands expands read commands to multiple satisfying wild cards +read command can have multiple file names +-} +expandReadCommands ∷ [Command] → Command → PhyG [Command] +expandReadCommands _newReadList inCommand@(commandType, argList') = + let argList = filter ((`notElem` ["tcm"]) . fst) argList' + tcmArgList = filter ((`elem` ["tcm"]) . fst) argList' + fileNames = fmap snd $ filter ((/= "tcm") . fst) $ filter ((/= "") . snd) argList' + modifierList = fmap fst argList + in case commandType of + Read -> do + globbedFileNames ← liftIO $ mapM SPG.glob fileNames + when (all null globbedFileNames) . failWithPhase Inputting $ unwords + [ "File(s) not found in 'read' command (could be due to incorrect filename or missing closing double quote '\"''):" + , show fileNames + ] + + newArgPairs ← mapM makeNewArgs (zip modifierList globbedFileNames) + + let commandList = replicate (length newArgPairs) commandType + let tcmArgListL = replicate (length newArgPairs) tcmArgList + + pure $ zip commandList (zipWith (<>) newArgPairs tcmArgListL) + + _ -> failWithPhase Inputting $ unwords [ "Incorrect command type in expandReadCommands:", show inCommand ] + + +{- | makeNewArgs takes an argument modifier (first in pair) and replicates is and zips with +globbed file name list to create a list of arguments +-} +makeNewArgs ∷ (String, [String]) → PhyG [(String, String)] +makeNewArgs (modifier, fileNameList) = + if null fileNameList + then do failWithPhase Inputting ("Null filename list in makeNewArgs: " <> show (modifier, fileNameList)) + else + let modList = replicate (length fileNameList) modifier + in do return $ zip modList fileNameList + + +{- +Added strictness on read so could close files after reading to + allow for large number (1000's) of files to be input. + this may not be the way to go, especially if files are big and + closing them is not an issue +-} + +{- | extractInputTuple takes the list of pairs from mapped executeReadCommands +and returns ([RawData], [SimpleGraph]) +-} +extractInputTuple + ∷ [([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)])] + → ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) +extractInputTuple dataGraphList = + let (inDataList, inGraphList, inTerminalsList, inExcludeList, inRenamePairs, inReBlockPairs) = L.unzip6 dataGraphList + rawData = L.sort $ concat inDataList + rawGraphs = concat inGraphList + rawTerminals = concat inTerminalsList + excludeTerminals = concat inExcludeList + renamePairs = concat inRenamePairs + reBlockPairs = concat inReBlockPairs + in (rawData, rawGraphs, rawTerminals, excludeTerminals, renamePairs, reBlockPairs) + + +-- | executeReadCommands wrapper for executeReadCommands' with out all the empty list imput +executeReadCommands + ∷ [Argument] + → PhyG ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) +executeReadCommands = executeReadCommands' [] [] [] [] [] [] False ([], [], 1.0) + + +{- | executeReadCommands' reads input files and returns raw data, input graphs, and terminal taxa to include +assumes that "prealigned" and "tcm:File" are the first arguments if they are specified +so that they can apply to all the files in the command without order depence +-} +executeReadCommands' + ∷ [RawData] + → [SimpleGraph] + → [NameText] + → [NameText] + → [(NameText, NameText)] + → [(NameText, NameText)] + → Bool + → ([ST.ShortText], [[Int]], Double) + → [Argument] + → PhyG ([RawData], [SimpleGraph], [NameText], [NameText], [(NameText, NameText)], [(NameText, NameText)]) +executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs _ tcmPair argList = do + if null argList + then return (curData, curGraphs, curTerminals, curExcludeList, curRenamePairs, curReBlockPairs) + else do + -- logWith LogInfo ("ERC: " <> (show argList) <> " " <> (show tcmPair)) + let isPrealigned' = False -- removed this option and m,asde specific to sequence types + -- | isPrealigned = True + -- | "prealigned" `elem` fmap fst argList = True + -- | otherwise = False + let (firstOption', firstFile) = head argList + let firstOption = fmap C.toLower firstOption' + -- Check for prealigned + -- if firstOption == "prealigned" then + -- executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) + -- else do + do + fileHandle ← + liftIO $ + if ',' `notElem` firstFile + then openFile firstFile ReadMode + else return (stdin ∷ Handle) + canBeReadFrom ← liftIO $ hIsReadable fileHandle + if not canBeReadFrom + then do failWithPhase Inputting ("\n\n'Read' error: file " <> firstFile <> " cannot be read") + else + if not $ null firstOption + then do logWith LogInfo ("Reading " <> firstFile <> " with option " <> firstOption <> "\n") + else do logWith LogInfo ("Reading " <> firstFile <> " with no options" <> "\n") + + -- this is awkward but need to use dot utilities + if firstOption == "dot" + then do + dotGraph ← liftIO $ LG.hGetDotLocal fileHandle + let inputDot = GFU.relabelFGL $ LG.dotToGraph dotGraph + let hasLoops = B.hasLoop inputDot + if hasLoops + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has loops/self-edges" <> "\n") + else do logWith LogInfo "" + let hasCycles = GFU.cyclic inputDot + if hasCycles + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has at least one cycle" <> "\n") + else + executeReadCommands' + curData + (inputDot : curGraphs) + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- not "dot" files + do + -- logWith LogInfo ("FC1: " <> firstFile) + fileContents ← + liftIO $ + if ',' `notElem` firstFile + then TIO.hGetContents fileHandle + else return (T.pack firstFile) + + {--Strict version--don't think necessary--but could getvinot open fikle number issues? + -- destroys lazyness but allows closing right away + -- this so don't have huge numbers of open files for large data sets + fileContents <- hGetContents' fileHandle + hClose fileHandle + -} + + if T.null fileContents + then do failWithPhase Inputting ("Error: Input file " <> firstFile <> " is empty") + else -- try to figure out file type based on first and following characters + + if firstOption == "tcm" + then do + newTCMPair ← processTCMContents (',' `elem` firstFile) (T.unpack fileContents) firstFile + executeReadCommands' + curData + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + newTCMPair + (tail argList) + else + if null firstOption + then + let firstChar = T.head $ T.dropWhile (== ' ') fileContents + in if (toLower firstChar == '/') || (toLower firstChar == 'd') || (toLower firstChar == 'g') + then do + logWith LogInfo ("\tTrying to parse " <> firstFile <> " as dot" <> "\n") + -- destroys lazyness but allows closing right away + -- this so don't have huge numbers of open files for large data sets + fileHandle2 ← liftIO $ openFile firstFile ReadMode + dotGraph ← liftIO $ LG.hGetDotLocal fileHandle2 + liftIO $ hClose fileHandle2 + let inputDot = GFU.relabelFGL $ LG.dotToGraph dotGraph + let hasLoops = B.hasLoop inputDot + if hasLoops + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has loops/self-edges" <> "\n") + else do logWith LogInfo "" + let hasCycles = GFU.cyclic inputDot + if hasCycles + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has at least one cycle" <> "\n") + else + executeReadCommands' + curData + (inputDot : curGraphs) + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + if (toLower firstChar == '<') || (toLower firstChar == '(') + then + let thisGraphList = getFENewickGraphText fileContents + hasCycles = filter id $ fmap GFU.cyclic thisGraphList + hasLoops = filter id $ fmap B.hasLoop thisGraphList + in if not $ null hasLoops + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has loops/self-edges" <> "\n") + else + if not $ null hasCycles + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has at least one cycle" <> "\n") + else + executeReadCommands' + curData + (thisGraphList <> curGraphs) + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + if toLower firstChar == 'x' + then do + tntData ← TNT.getTNTDataText fileContents firstFile + logWith LogInfo ("\tTrying to parse " <> firstFile <> " as TNT" <> "\n") + executeReadCommands' + (tntData : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + let fileContents' = T.unlines $ filter (not . T.null) $ T.takeWhile (/= ';') <$> T.lines fileContents + in if T.null (T.dropWhile (== ' ') fileContents') + then do failWithPhase Parsing ("Null file '" <> firstFile <> "' input") + else + if T.head (T.dropWhile (== ' ') fileContents') == '>' + then + let secondLine = T.lines fileContents' !! 1 + hasSpaces = T.find (== ' ') secondLine + in -- numWords = length $ words secondLine + + -- spaces between alphabet elements suggest fastc + if isJust hasSpaces + then + let fastcData = FAC.getFastCText fileContents firstFile isPrealigned' + in do + fastcCharInfo ← FAC.getFastcCharInfo fastcData firstFile isPrealigned' tcmPair + logWith LogInfo ("\tTrying to parse " <> firstFile <> " as fastc--if it should be fasta specify 'fasta:' on input." <> "\n") + executeReadCommands' + ((fastcData, [fastcCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + let fastaData' = FAC.getFastAText fileContents firstFile isPrealigned' + in do + (fastaCharInfo, fastaData) ← + {-# SCC "getFastaCharInfo_1" #-} FAC.getFastaCharInfo fastaData' firstFile firstOption isPrealigned' tcmPair + logWith LogInfo ("\tTrying to parse " <> firstFile <> " as fasta" <> "\n") + executeReadCommands' + ((fastaData, [fastaCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else do failWithPhase Parsing ("Cannot determine file type for " <> firstFile <> " need to prepend type") + else -- fasta + let firstChar = T.head $ T.dropWhile (== ' ') fileContents + in + if firstOption `elem` ["fasta", "nucleotide", "aminoacid", "hugeseq"] + then + let fastaData' = FAC.getFastAText fileContents firstFile isPrealigned' + in do + (fastaCharInfo, fastaData) ← + {-# SCC "getFastaCharInfo_2" #-} FAC.getFastaCharInfo fastaData' firstFile firstOption isPrealigned' tcmPair + executeReadCommands' + ((fastaData, [fastaCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- fastc + + if firstOption == "fastc" + then + let fastcData = FAC.getFastCText fileContents firstFile isPrealigned' + in do + fastcCharInfo ← FAC.getFastcCharInfo fastcData firstFile isPrealigned' tcmPair + executeReadCommands' + ((fastcData, [fastcCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- prealigned fasta + + if firstOption `elem` ["prefasta", "prenucleotide", "preaminoacid", "prehugeseq"] + then + let fastaData' = FAC.getFastAText fileContents firstFile True + in do + (fastaCharInfo, fastaData) ← + {-# SCC "getFastaCharInfo_3" #-} FAC.getFastaCharInfo fastaData' firstFile firstOption True tcmPair + -- trace ("POSTREAD:" <> (show fastaCharInfo) <> "\n" <> (show fastaData)) + executeReadCommands' + ((fastaData, [fastaCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- prealigned fastc + + if firstOption == "prefastc" + then + let fastcData = FAC.getFastCText fileContents firstFile True + in do + fastcCharInfo ← FAC.getFastcCharInfo fastcData firstFile True tcmPair + executeReadCommands' + ((fastcData, [fastcCharInfo]) : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- tnt + -- tnt + + if firstOption == "tnt" + then do + tntData ← TNT.getTNTDataText fileContents firstFile + executeReadCommands' + (tntData : curData) + curGraphs + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- else if firstOption == "prealigned" then executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) + -- FENEwick + + if firstOption `elem` ["newick", "enewick", "fenewick"] + then + let thisGraphList = getFENewickGraphText fileContents + hasLoops = filter id $ fmap B.hasLoop thisGraphList + hasCycles = filter id $ fmap GFU.cyclic thisGraphList + in if not $ null hasLoops + then do failWithPhase Parsing ("Input graphin " <> firstFile <> " has loops/self-edges") + else + if not $ null hasCycles + then do failWithPhase Parsing ("Input graph in " <> firstFile <> " has at least one cycle") + else + executeReadCommands' + curData + (thisGraphList <> curGraphs) + curTerminals + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else -- reading terminals list to include--must be "new" names if taxa are renamed + + if firstOption == "include" then + + if (toLower firstChar) `elem` ['/','d','g','<','(', 'x'] + then do + failWithPhase Parsing ("Input 'include' file " <> firstFile <> " does not look like one.") + else + let terminalsList = fmap ((T.pack . filter (/= '"')) . filter C.isPrint) (words $ unlines $ U.stripComments (T.unpack <$> T.lines fileContents)) + in executeReadCommands' + curData + curGraphs + (terminalsList <> curTerminals) + curExcludeList + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + if firstOption == "exclude" then + if (toLower firstChar) `elem` ['/','d','g','<','(', 'x'] + then do + failWithPhase Parsing ("Input 'exclude' file " <> firstFile <> " does not look like one.") + else + let excludeList = fmap ((T.pack . filter (/= '"')) . filter C.isPrint) (words $ unlines $ U.stripComments (T.unpack <$> T.lines fileContents)) + in executeReadCommands' + curData + curGraphs + curTerminals + (excludeList <> curExcludeList) + curRenamePairs + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + if firstOption == "rename" then + if (toLower firstChar) `elem` ['/','d','g','<','(', 'x'] + then do + failWithPhase Parsing ("Input 'rename' file " <> firstFile <> " does not look like one.") + else do + let renameLines = U.stripComments (T.unpack <$> T.lines fileContents) + namePairsLL ← mapM (makeNamePairs firstFile) renameLines + let namePairs = concat namePairsLL + let changeNamePairs = filter areDiff (namePairs <> curRenamePairs) + let newChangeNames = fmap fst changeNamePairs + let origChangeNames = fmap snd changeNamePairs + let intersectionChangeNames = L.nub $ L.intersect newChangeNames origChangeNames + + if (not $ null intersectionChangeNames) + then errorWithoutStackTrace ("Error: Renaming of " <> (show intersectionChangeNames) <> " as both a new name and to be renamed") + else + executeReadCommands' + curData + curGraphs + curTerminals + curExcludeList + (namePairs <> curRenamePairs) + curReBlockPairs + isPrealigned' + tcmPair + (tail argList) + else + if firstOption == "block" then + if (toLower firstChar) `elem` ['/','d','g','<','(', 'x'] + then do + failWithPhase Parsing ("Input 'block' file " <> firstFile <> " does not look like one.") + else do + let renameLines = U.stripComments (T.unpack <$> T.lines fileContents) + blockPairsLL ← mapM (makeNamePairs firstFile) renameLines + let blockPairs = concat blockPairsLL + + executeReadCommands' + curData + curGraphs + curTerminals + curExcludeList + curRenamePairs + (blockPairs <> curReBlockPairs) + isPrealigned' + tcmPair + (tail argList) + else -- else if firstOption == "prealigned" then + -- executeReadCommands' curData curGraphs curTerminals curExcludeList curRenamePairs curReBlockPairs isPrealigned' tcmPair (tail argList) + do failWithPhase Parsing ("\n\n'Read' command error: option " <> firstOption <> " not recognized/implemented") + where + areDiff (a, b) = + if a /= b + then True + else False + + +{- | makeNamePairs takes lines of rename or reblock file and returns pairs of names +for renaming or reblocking +-} +makeNamePairs ∷ String → String → PhyG [(T.Text, T.Text)] +makeNamePairs inFileName inLine = + if null inLine + then do return [] + else + let lineWords = fmap T.strip $ T.words $ T.pack $ filter (/= '"') $ filter C.isPrint inLine + in if length lineWords < 2 + then + failWithPhase + Parsing + ("Rename file " <> inFileName <> " line needs at least two Strings to rename the second as the first: " <> inLine) + else + let targetNameList = replicate (length $ tail lineWords) (head lineWords) + renamePairList = zip targetNameList (tail lineWords) + in do + return renamePairList + + +{- | getReadArgs processes arguments ofr the 'read' command +should allow mulitple files and gracefully error check +also allows tcm file specification (limit 1 tcm per command?) +as fasta, fastc, tnt, tcm, prealigned +-} +getReadArgs ∷ String → [(String, String)] → PhyG [Argument] +getReadArgs fullCommand argList = + if null argList + then return [] + else -- trace ("GRA: " <> fullCommand <> " " <> (show argList)) ( + + let (firstPart, secondPart) = head argList + in do + restPart ← getReadArgs fullCommand (tail argList) + + -- command in wrong place like prealigned or rename after file name + if (not . null) firstPart && null secondPart + then + failWithPhase + Parsing + ( "\n\n'Read' command error: possibly incorrect placement of option specification '" + <> firstPart + <> "' should be before filename as in '" + <> firstPart + <> ":filename'\n" + ) + else -- plain file name with no modifier + + if null firstPart + then + if (head secondPart == '"') || (last secondPart == '"') + then do + return $ (firstPart, init $ tail secondPart) : restPart + else failWithPhase Parsing ("\n\n'Read' command error: '" <> secondPart <> "' : Need to specify filename in double quotes\n") + else -- Change to allowed modifiers + + if fmap toLower firstPart `notElem` V.readArgList + then failWithPhase Parsing ("\n\n'Read' command error: " <> fullCommand <> " contains unrecognized option '" <> firstPart <> "'\n") + else + if null secondPart && (firstPart == "prealigned") + then do + return $ (firstPart, []) : restPart + else + if null (tail secondPart) + then return argList + else do + return $ (firstPart, init $ tail secondPart) : restPart + + +-- ) + +{- Not used when migrated to Text input from String + +-- | getFENewickGraph takes graph contents and returns local graph format +-- could be mulitple input graphs +getFENewickGraph :: String -> [LG.Gr T.Text Double] +getFENewickGraph fileString = + getFENewickGraphText (T.pack fileString) +-} + +{- | getFENewickGraphText takes graph contents and returns local graph format +could be mulitple input graphs +-} +getFENewickGraphText ∷ T.Text → [LG.Gr T.Text Double] +getFENewickGraphText fileString = + -- trace (fileString) + LG.getFENLocal (T.filter (/= '\n') $ T.strip fileString) + + +{- | processTCMContents +functionality added to integerize tcm and add weight factor to allow for decimal tcm values +-} +processTCMContents ∷ Bool → String → String → PhyG ([ST.ShortText], [[Int]], Double) +processTCMContents indelGap inContents fileName = + if null inContents + then do failWithPhase Parsing ("\n\n'Read' 'tcm' command error: empty tcmfile `" <> fileName) + else + if indelGap + then + let indelString = L.takeWhile (/= ',') inContents + substString = tail $ L.dropWhile (/= ',') inContents + indelMaybe = readMaybe indelString ∷ Maybe Int + substMaybe = readMaybe substString ∷ Maybe Int + in if isNothing indelMaybe + then do failWithPhase Parsing ("Specification of indel cost must be an Integer (Indel cost, Substitution cost): " <> indelString) + else + if isNothing substMaybe + then do + failWithPhase Parsing ("Specification of substitution cost must be an Integer (Indel cost, Substitution cost): " <> substString) + else do + return ([], [[fromJust indelMaybe, fromJust substMaybe], []], 1.0) + else -- First line is alphabet + do + let tcmLines = lines inContents + let localAlphabet = fmap ST.pack $ words $ head tcmLines + -- to account for '-' + let numElements = 1 + length localAlphabet + let costMatrixStrings = words <$> tail tcmLines + -- localCostMatrix = filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) costMatrixStrings + (scaleFactor, localCostMatrix) ← getCostMatrixAndScaleFactor fileName costMatrixStrings + let numLines = length localCostMatrix + let lengthRowList = fmap length localCostMatrix + let rowsCorrectLength = foldl' (&&) True $ fmap (== numElements) lengthRowList + + if ST.singleton '-' `elem` localAlphabet + then do + failWithPhase + Parsing + "\n\n'Read' 'tcm' file format error: '-' (InDel/Gap) should not be specified as an alphabet element. It is added in automatically and assumed to be the last row and column of tcm matrix" + else + if numLines /= numElements + then do + failWithPhase + Parsing + ( "\n\n'Read' 'tcm' file format error: incorrect number of lines in matrix from " + <> fileName + <> " this could be due to a missing element symbol line at the beginning of the file (e.g. A C G T, '-' is assumed) or there is a mismatch in the dimensions of the tcm (including '-') " + <> show numElements + <> " elements are implied and there are " + <> show numLines + <> "\n" <> (concat $ L.intersperse " " $ words $ head tcmLines) + <> "\n " <> (show $ length $ words $ head tcmLines) + ) + else + if not rowsCorrectLength + then do + failWithPhase + Parsing + ( "\n\n'Read' 'tcm' file format error: incorrect lines length(s) in matrix from " + <> fileName + <> " there should be (including '-') " + <> show numElements + ) + else -- trace (show (scaleFactor, localCostMatrix)) + -- trace (show localAlphabet <> "\n" <> show localCostMatrix) + return (localAlphabet <> [ST.singleton '-'], localCostMatrix, scaleFactor) + + +{- | getCostMatrixAndScaleFactor' takes [[String]] and returns cost matrix as +[[Int]] but if there are decimal values, a scalerFactor is determined and the +cost matrix are integerized by multiplication by 1/scaleFactor +Unlike version above is more flexible with Double format + +Adds precision to 100--fixes an issue when smallest and other values are similar +Not sure how this related to 2**16 (CShort) for ffi for <= 8 alphabets +Also can be integer overflow if the number are large since treated as integers + during DO for DNA--seems like pairwise alignment cost may be limited to 2^31 or 2^32 +-} +getCostMatrixAndScaleFactor ∷ String → [[String]] → PhyG (Double, [[Int]]) +getCostMatrixAndScaleFactor fileName = \case + [] -> failWithPhase Parsing "Empty list in inStringListList" + inStringListList -> + let maxDecimalPlaces = maximum $ getDecimals <$> concat inStringListList + doubleMatrix = filter (/= []) $ fmap (fmap (GU.stringToDouble fileName)) inStringListList + minDouble = minimum $ fmap minimum $ fmap (filter (> 0.0)) doubleMatrix + maxDouble = maximum $ fmap maximum doubleMatrix + + -- set precision based on range of max and min values + -- to deal with some pathplogical situatinos with very high cost of no change and lo cost for change + range = maxDouble / minDouble + precisionFactor = if range < 10.0 then 10.0 + else 1.0 + {- + if range >= 100000.0 then 1.0 + else if range >= 10000.0 then 10.0 + else if range >= 1000.0 then 100.0 + else if range >= 100.0 then 1000.0 + else if range >= 10.0 then 10000.0 + else 100000.0 + -} + + rescaledDoubleMatrix = fmap (fmap (* (precisionFactor / minDouble))) doubleMatrix + integerizedMatrix = fmap (fmap round) rescaledDoubleMatrix + + scaleFactor + | maxDecimalPlaces == 0 = 1.0 + | otherwise = (minDouble / precisionFactor) + + outputMatrix = case maxDecimalPlaces of + 0 -> filter (/= []) $ fmap (GU.stringToInt fileName) <$> inStringListList + _ -> integerizedMatrix + + in --trace ("GCMSC: " <> (show (precisionFactor,minDouble,scaleFactor,integerizedMatrix,rescaledDoubleMatrix) )) $ + pure $ (scaleFactor, outputMatrix) + + +{- +- | diagonalsEqualZero takes an integer matrix [[Int]] +-- and checks if any diagonal values are == 0 +diagonalsEqualZero :: [[Int]] -> Int -> Bool +diagonalsEqualZero inMatrix index = + if null inMatrix then False + else + if (head inMatrix) !! index /= 0 then True + else diagonalsEqualZero (tail inMatrix) (index + 1) +-} + +-- | getDecimals returns the number of decimal places in a string rep of a number +getDecimals ∷ String → Int +getDecimals inString = + if null inString + then 0 + else + let decimalPart = dropWhile (/= '.') inString + in if null decimalPart + then 0 + else length decimalPart - 1 + + +{- Unused code + +{- | getCostMatrixAndScaleFactor takes [[String]] and returns cost matrix as +[[Int]] but if there are decimal values, a scalerFactor is determined and the +cost matrix are integerized by multiplication by 1/scaleFactor +scaleFactor will alway be a factor of 10 to allow for easier +compilation of tcm characters later (weights the same...) +-} +getCostMatrixAndScaleFactor' ∷ String → [[String]] → PhyG (Double, [[Int]]) +getCostMatrixAndScaleFactor' fileName inStringListList = + if null inStringListList + then failWithPhase Parsing "Empty list in inStringListList" + else + let maxDecimalPlaces = maximum $ getDecimals <$> concat inStringListList + scaleFactor = + if maxDecimalPlaces == 0 + then 1.0 + else 0.1 ** fromIntegral maxDecimalPlaces + in if maxDecimalPlaces == 0 + then do + return $ (scaleFactor, filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) inStringListList) + else do + let newStringListList = fmap (fmap (integerizeString maxDecimalPlaces)) inStringListList + return $ (scaleFactor, filter (/= []) $ fmap (fmap (GU.stringToInt fileName)) newStringListList) + +{- | integerizeString integerizes a String by removing the '.' and adding the number of '0's to pad out +adds maxDecimalPlaces - the nymber in the String +-} +integerizeString ∷ Int → String → String +integerizeString maxDecimalPlaces inString = + if null inString + then error "Null string in integerizeString" + else + let decimalPart = dropWhile (/= '.') inString + in if inString == "0" + then inString + else + if null decimalPart + then inString <> replicate maxDecimalPlaces '0' + else filter (/= '.') inString <> replicate (maxDecimalPlaces - (length decimalPart - 1)) '0' +-} \ No newline at end of file diff --git a/src/Input/Reorganize.hs b/src/Input/Reorganize.hs new file mode 100644 index 000000000..6a6d2de61 --- /dev/null +++ b/src/Input/Reorganize.hs @@ -0,0 +1,887 @@ +{-# OPTIONS_GHC -Wno-missed-specialisations #-} + +{- | +Module with functionality to transform phylogenetic data +-} +module Input.Reorganize ( + combineDataByType, + reBlockData, + removeConstantCharactersPrealigned, + removeConstantCharsPrealigned, + optimizePrealignedData, + convert2BV, + getRecodingType, +) where + +import Bio.DynamicCharacter.Element +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Functor ((<&>)) +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.Maybe +import Data.MetricRepresentation qualified as MR +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import Input.BitPack qualified as BP +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import SymMatrix qualified as S +import Text.Read +import Types.Types +import Utilities.Utilities qualified as U + + +{- | optimizePrealignedData convert +prealigned to non-additive or matrix +here +bitPack new non-additive +packNonAdditive +-} +optimizePrealignedData ∷ GlobalSettings → ProcessedData → PhyG ProcessedData +optimizePrealignedData inGS inData@(_, _, blockDataVect) = case U.getNumberPrealignedCharacters blockDataVect of + 0 → pure inData + _ → do + -- don't recode if SI/PMDL since need no change cost + if (optimalityCriterion inGS) `elem` [SI, PMDL] + then pure inData + else do + -- remove constant characters from prealigned + inData' ← removeConstantCharactersPrealigned inData + + -- convert prealigned to nonadditive if all 1 tcms + let inData'' = convertPrealignedToNonAdditive inData' + + -- bit packing for non-additivecharacters + BP.packNonAdditiveData inGS inData'' + + +{- | convertPrealignedToNonAdditive converts prealigned data to non-additive +if homogeneous TCM (all 1's non-diagnoal) +-} +convertPrealignedToNonAdditive ∷ ProcessedData → ProcessedData +convertPrealignedToNonAdditive (nameVect, bvNameVect, blockDataVect) = (nameVect, bvNameVect, fmap convertPrealignedToNonAdditiveBlock blockDataVect) + + +{- | convertPrealignedToNonAdditiveBlock takes a character block and convertes prealigned to non-add if tcms all 1's +this is done taxon by taxon and character by character since can convert with only local infomation +-} +convertPrealignedToNonAdditiveBlock ∷ BlockData → BlockData +convertPrealignedToNonAdditiveBlock (nameBlock, charDataVV, charInfoV) = + let codingTypeV = fmap fst $ fmap getRecodingType (fmap costMatrix charInfoV) + (newCharDataVV, newCharInfoVV) = V.unzip $ fmap (convertTaxonPrealignedToNonAdd charInfoV codingTypeV) charDataVV + in (nameBlock, newCharDataVV, V.head newCharInfoVV) + + +{- | convertTaxonPrealignedToNonAdd takes a vector of character info and vector of charcter data for a taxon +and recodes prealigned as non-additve if tcm's all 1s +-} +convertTaxonPrealignedToNonAdd + ∷ V.Vector CharInfo → V.Vector String → V.Vector CharacterData → (V.Vector CharacterData, V.Vector CharInfo) +convertTaxonPrealignedToNonAdd charInfoV codingTypeV charDataV = + V.unzip $ V.zipWith3 convertTaxonPrealignedToNonAddCharacter charInfoV codingTypeV charDataV + + +{- | convertTaxonPrealignedToNonAddCharacter takes a taxon character and char info and cost matrix type +and transforms to non-additive if all tcms are 1's +-} +convertTaxonPrealignedToNonAddCharacter ∷ CharInfo → String → CharacterData → (CharacterData, CharInfo) +convertTaxonPrealignedToNonAddCharacter charInfo matrixType charData = + -- trace ("CTP: " <> (show $ charType charInfo) <> " " <> (show $ matrixType)) ( + if charType charInfo `notElem` prealignedCharacterTypes + then (charData, charInfo) + else + if matrixType /= "nonAdd" + then (charData, charInfo) + else -- this coefficient do to reducion by integer factors in matrix representation + -- need to be multipled or 2:2 goes to 1:1 and weigh/cost incorrect (non for slim--ffi etc) + + let matrixCoefficient = + if (charType charInfo) == AlignedSlim + then 1 + else + if (charType charInfo) == AlignedWide + then MR.minInDelCost (wideTCM charInfo) + else + if (charType charInfo) == AlignedHuge + then MR.minInDelCost (hugeTCM charInfo) + else error ("Unrecognized--unimplemented character type in convertTaxonPrealignedToNonAddCharacter: " <> (show $ charType charInfo)) + + charWeight = weight charInfo + + newStateBV = + if charType charInfo == AlignedSlim + then convert2BVTriple 32 $ (snd3 . alignedSlimPrelim) charData + else + if charType charInfo == AlignedWide + then convert2BVTriple 64 $ (snd3 . alignedWidePrelim) charData + else -- no point in huge recoding--with not be packed anyway + + if charType charInfo == AlignedHuge + then error "No point in converting huge--can't pack anyway" + else -- this is wrong for some reason--creates incorrect IA + -- (snd3 $ alignedHugePrelim charData, snd3 $ alignedHugePrelim charData, snd3 $ alignedHugePrelim charData) + error ("Unrecognized character type in convertTaxonPrealignedToNonAddCharacter: " <> (show $ charType charInfo)) + in ( emptyCharacter{stateBVPrelim = newStateBV} + , charInfo{charType = NonAdd, weight = charWeight * (fromIntegral $ fromEnum matrixCoefficient)} + ) + + +-- ) + +-- | convert2BVTriple takes SlimState or WideState and converts to Triple Vector of bitvectors +convert2BVTriple + ∷ (Integral a, GV.Vector v a) ⇒ Word → v a → (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) +convert2BVTriple size inM = + let inMList = GV.toList inM + inMBV = fmap (BV.fromNumber size) inMList + in (V.fromList inMBV, V.fromList inMBV, V.fromList inMBV) + + +{- | convert2BV takes SlimState or WideState and converts to Vector of bitvectors +this for leaves so assume M only one needed really +-} +convert2BV + ∷ (Integral a, GV.Vector v a) ⇒ Word → (v a, v a, v a) → (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) +convert2BV size (_, inM, _) = + let inMList = GV.toList inM + inMBV = fmap (BV.fromNumber size) inMList + in (V.fromList inMBV, V.fromList inMBV, V.fromList inMBV) + + +{- | getRecodingType takes a cost matrix and detemines if it can be recodes as non-additive, +non-additive with gap chars, or matrix +assumes indel costs are in first row and column +-} +getRecodingType ∷ S.Matrix Int → (String, Int) +getRecodingType inMatrix = + if S.null inMatrix + then error "Null matrix in getRecodingType" + else + if (not . S.isSymmetric) inMatrix + then ("matrix", 0) + else + let matrixLL = S.toFullLists inMatrix + secondRow = matrixLL L.!! 1 + numUniqueCosts = length $ L.group $ L.sort $ (filter (/= 0) $ concat matrixLL) + in -- trace ("GRT: " <> (show numUniqueCosts)) ( + + -- don't recode huge--no point + if S.rows inMatrix > 32 + then ("matrix", head secondRow) + else -- all same except for 0 + + if numUniqueCosts == 1 + then ("nonAdd", 0) + else ("matrix", head secondRow) + + +{- +-- all same except for gaps +else if numUniqueCosts == 2 then + trace ("NAG: " <> (show $ length $ L.group $ L.sort $ filter (/= 0) lastRow)) ( + if (length $ L.group $ filter (/= 0) lastRow) == 1 then ("nonAddGap", head lastRow) + + -- some no gaps different + else ("matrix", head lastRow) + ) +-- to many types for nonadd coding +else ("matrix", head lastRow) + +-} +-- ) + +{- | reBlockData takes original block assignments--each input file is a block-- +and combines, creates new, deletes empty blocks from user input +reblock pair names may contain wildcards +-} +reBlockData ∷ [(NameText, NameText)] → ProcessedData → PhyG ProcessedData +reBlockData reBlockPairs inData@(leafNames, leafBVs, blockDataV) = + -- trace ("RBD:" <> (show $ fmap fst3 blockDataV )) ( + if null reBlockPairs + then do + logWith LogInfo "Character Blocks as input files\n" + pure inData + else + let -- those block to be reassigned--nub in case repeated names + toBeReblockedNames = fmap (T.filter (/= '"')) $ L.nub $ fmap snd reBlockPairs + unChangedBlocks = V.filter ((`notElemWildcards` toBeReblockedNames) . fst3) blockDataV + blocksToChange = V.filter ((`elemWildcards` toBeReblockedNames) . fst3) blockDataV + newBlocks = makeNewBlocks reBlockPairs blocksToChange [] + reblockedBlocks = unChangedBlocks <> V.fromList newBlocks + in do + logWith + LogInfo + ( "\nReblocking: " + <> show toBeReblockedNames + <> " leaving unchanged: " + <> show (fmap fst3 unChangedBlocks) + <> "\n\tNew blocks: " + <> show (fmap fst3 reblockedBlocks) + <> "\n" + ) + pure (leafNames, leafBVs, reblockedBlocks) + + +-- ) + +-- | makeNewBlocks takes lists of reblock pairs and existing relevant blocks and creates new blocks returned as a list +makeNewBlocks ∷ [(NameText, NameText)] → V.Vector BlockData → [BlockData] → [BlockData] +makeNewBlocks reBlockPairs inBlockV curBlockList + | null reBlockPairs = curBlockList + | V.null inBlockV && null curBlockList = + errorWithoutStackTrace + ( "Reblock pair names do not have a match for any input block--perhaps missing '#0/N'? Blocks: " <> (show $ fmap snd reBlockPairs) + ) + | V.null inBlockV = curBlockList + | otherwise = + let firstBlock = V.head inBlockV + firstName = fst3 firstBlock + newPairList = fst <$> filter (textMatchWildcards firstName . snd) reBlockPairs + in if null newPairList + then + errorWithoutStackTrace + ( "Reblock pair names do not have a match for any input block--perhaps missing '#0'? Specified pairs: " + <> show reBlockPairs + <> " input block name: " + <> T.unpack firstName + ) + else + if length newPairList > 1 + then errorWithoutStackTrace ("Multiple reblock destinations for single input block" <> show newPairList) + else + let newBlockName = head newPairList + existingBlock = filter ((== newBlockName) . fst3) curBlockList + in -- new block to be created + if null existingBlock + then -- trace("NBlocks:" <> (show $ fmap fst3 curBlockList)) + makeNewBlocks reBlockPairs (V.tail inBlockV) ((newBlockName, snd3 firstBlock, thd3 firstBlock) : curBlockList) + else -- existing block to be added to + + if length existingBlock > 1 + then error ("Error: Block to be added to more than one block-should not happen: " <> show reBlockPairs) + else -- need to add character vectors to vertex vectors and add to CharInfo + -- could be multiple 'characteres' if non-exact data in inpuit file (Add, NonAdd, MAtrix etc) + + let blockToAddTo = head existingBlock + newCharData = V.zipWith (<>) (snd3 blockToAddTo) (snd3 firstBlock) + newCharInfo = thd3 blockToAddTo <> thd3 firstBlock + in -- trace("EBlocks:" <> (show $ fmap fst3 curBlockList)) + makeNewBlocks + reBlockPairs + (V.tail inBlockV) + ((newBlockName, newCharData, newCharInfo) : filter ((/= newBlockName) . fst3) curBlockList) + + +{- | combineDataByType combines data of same type for (exact types) into +same vectors in character so have one non-add, one add, one of each packed type, +can have multiple matrix (due to cost matrix differneces) +similar result to groupDataByType, but does not assume single characters. +-} +combineDataByType ∷ GlobalSettings → ProcessedData → PhyG ProcessedData +combineDataByType inGS inData@(taxNames, taxBVNames, _) = + -- recode add to non-add before combine-- takes care wor integer weighting + let (_, _, blockDataV') = recodeAddToNonAddCharacters inGS maxAddStatesToRecode inData + in do + recodedData ← mapM combineData blockDataV' + pure (taxNames, taxBVNames, recodedData) + + +-- | combineData creates for a block) lists of each data type and concats then creating new data and new char info +combineData ∷ BlockData → PhyG BlockData +combineData (blockName, blockDataVV, charInfoV) = + let -- parallel setup + action ∷ V.Vector CharacterData → (V.Vector CharacterData, V.Vector CharInfo) + action = combineBlockData charInfoV + in do + pTraverse ← getParallelChunkMap + let result = pTraverse action (V.toList blockDataVV) + let (newBlockDataLV, newCharInfoLV) = unzip result + -- (newBlockDataLV, newCharInfoLV) = unzip (PU.seqParMap PU.myStrategyRDS (combineBlockData charInfoV) (V.toList blockDataVV)) -- `using` PU.myParListChunkRDS) + pure (blockName, V.fromList newBlockDataLV, head newCharInfoLV) + + +{- | combineBlockData takes a vector of char info and vector or charcater data for a taxon and +combined exact data types into single characters +additive characters should have already been converted (if less that maxAddStatesToRecode) to binary +non-additive characters with integer weights could be repeated before combining-- but this has been disabled +due to memory issues in large data sets +non-integer additive and non-additive are grouped together by weight so they can be combined and bit packed +all other types are grouped by weight for efficiency of optimization and reductionn of multiplies +zero weight characters are filtered out +-} +combineBlockData ∷ V.Vector CharInfo → V.Vector CharacterData → (V.Vector CharacterData, V.Vector CharInfo) +combineBlockData inCharInfoV inCharDataV = + let pairCharsInfo = V.zip inCharInfoV inCharDataV + + -- characters to not be reorganized-- nbasically the sequence characters + sequenceCharacters = V.toList $ V.filter ((> 0) . weight . fst) $ V.filter ((`elem` sequenceCharacterTypes) . charType . fst) pairCharsInfo + + -- matrix characters are more complex--can only join if same matrix + matrixCharsPair = V.filter ((> 0) . weight . fst) $ V.filter ((== Matrix) . charType . fst) pairCharsInfo + (newMatrixCharL, newMatrixCharInfoL) = + if (not . null) matrixCharsPair + then unzip $ organizeMatrixCharsByMatrix (V.toList matrixCharsPair) + else ([], []) + + -- non-additive characters + -- multiple characters by weight, if only 1 weight then all together + nonAddChars = V.filter ((> 0) . weight . fst) $ V.filter ((== NonAdd) . charType . fst) pairCharsInfo + (newNonAddCharInfo, newNonAddChar) = unzip $ V.toList $ groupCharactersByWeight nonAddChars + + -- additive characters + -- multiple characters by weight, if only 1 weight then all together + + addChars = V.filter ((> 0) . weight . fst) $ V.filter ((== Add) . charType . fst) pairCharsInfo + (newAddCharInfo, newAddChar) = unzip $ V.toList $ groupCharactersByWeight addChars + + -- Packed2 characters + packed2Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed2) . charType . fst) pairCharsInfo + (newPacked2CharInfo, newPacked2Char) = unzip $ V.toList $ groupCharactersByWeight packed2Chars + + -- Packed4 characters + packed4Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed4) . charType . fst) pairCharsInfo + (newPacked4CharInfo, newPacked4Char) = unzip $ V.toList $ groupCharactersByWeight packed4Chars + + -- Packed5 characters + packed5Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed5) . charType . fst) pairCharsInfo + (newPacked5CharInfo, newPacked5Char) = unzip $ V.toList $ groupCharactersByWeight packed5Chars + + -- Packed8 characters + packed8Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed8) . charType . fst) pairCharsInfo + (newPacked8CharInfo, newPacked8Char) = unzip $ V.toList $ groupCharactersByWeight packed8Chars + + -- Packed64 characters + packed64Chars = V.filter ((> 0) . weight . fst) $ V.filter ((== Packed64) . charType . fst) pairCharsInfo + (newPacked64CharInfo, newPacked64Char) = unzip $ V.toList $ groupCharactersByWeight packed64Chars + + -- Add together all new characters, seqeunce characters and char info + -- newCharList = newNonAddCharL <> (V.toList nonAddCharsWeightNotInt) <> newAddCharL <> (V.toList addCharsWeightNot1) <> newPacked2CharL <> newPacked4CharL <> newPacked5CharL <> newPacked8CharL <> newPacked64CharL <> newMatrixCharL <> (fmap snd sequenceCharacters) + -- newCharInfoList = newNonAddCharInfoL <> (V.toList nonAddCharsWeightNotIntInfo) <> newAddCharInfoL <> (V.toList addCharsWeightNot1Info) <> newPacked2CharInfoL <> newPacked4CharInfoL <> newPacked5CharInfoL <> newPacked8CharInfoL <> newPacked64CharInfoL <> newMatrixCharInfoL <> (fmap fst sequenceCharacters) + + newCharList = + newNonAddChar + <> newAddChar + <> newPacked2Char + <> newPacked4Char + <> newPacked5Char + <> newPacked8Char + <> newPacked64Char + <> newMatrixCharL + <> (fmap snd sequenceCharacters) + newCharInfoList = + newNonAddCharInfo + <> newAddCharInfo + <> newPacked2CharInfo + <> newPacked4CharInfo + <> newPacked5CharInfo + <> newPacked8CharInfo + <> newPacked64CharInfo + <> newMatrixCharInfoL + <> (fmap fst sequenceCharacters) + in (V.fromList newCharList, V.fromList newCharInfoList) + + +{- | groupCharactersByWeight takes a list of characters and returns a list of lists of charcter with same weight +checked as Double. +NB--Does not check for character type---assuming this is desired it must be assured before input +-} +groupCharactersByWeight ∷ V.Vector (CharInfo, CharacterData) → V.Vector (CharInfo, CharacterData) +groupCharactersByWeight inCharsPairList = + if V.null inCharsPairList + then V.empty + else + let weightList = L.nub $ V.toList $ fmap weight (fmap fst inCharsPairList) + charListByWeight = fmap (getSameWeightChars inCharsPairList) $ V.fromList weightList + in V.concatMap mergeCharacters charListByWeight + + +{- | mergeCharacters merges the data fieed of characters based on type +NB--Does not check all chars are same type or weight--will silently combine mempty values +with whatever weight. +returns list with single member so can concat later +-} +mergeCharacters ∷ V.Vector (CharInfo, CharacterData) → V.Vector (CharInfo, CharacterData) +mergeCharacters inCharsPairList = + if V.null inCharsPairList + then V.empty + else + let thisCharType = (charType . fst . V.head) inCharsPairList + + -- non-add data + dataFieldNonAdd = V.concatMap (snd3 . stateBVPrelim . snd) inCharsPairList + newNonAddChar = + ((snd . V.head) inCharsPairList) + { stateBVPrelim = (dataFieldNonAdd, dataFieldNonAdd, dataFieldNonAdd) + , stateBVFinal = dataFieldNonAdd + } + + -- add data + dataFieldAdd = V.concatMap (snd3 . rangePrelim . snd) inCharsPairList + newAddChar = ((snd . V.head) inCharsPairList){rangePrelim = (dataFieldAdd, dataFieldAdd, dataFieldAdd), rangeFinal = dataFieldAdd} + + -- packed data + dataFieldPacked = UV.concat $ V.toList $ fmap (snd3 . packedNonAddPrelim . snd) inCharsPairList + newPackedChar = + ((snd . V.head) inCharsPairList) + { packedNonAddPrelim = (dataFieldPacked, dataFieldPacked, dataFieldPacked) + , packedNonAddFinal = dataFieldPacked + } + + -- add info of merging to character info + newOrigInfo = V.concatMap (origInfo . fst) inCharsPairList + newCharInfo = ((fst . V.head) inCharsPairList){origInfo = newOrigInfo} + in if thisCharType == NonAdd + then V.singleton (newCharInfo, newNonAddChar) + else + if thisCharType == Add + then V.singleton (newCharInfo, newAddChar) + else + if thisCharType `elem` packedNonAddTypes + then V.singleton (newCharInfo, newPackedChar) + else error ("Error in mergeCharacters: Character type " <> show thisCharType <> " unrecognized/not implemented") + + +-- | getSameWeightChars returns character pairs with same matrix as testMatrix +getSameWeightChars ∷ V.Vector (CharInfo, CharacterData) → Double → V.Vector (CharInfo, CharacterData) +getSameWeightChars inCharsPairList testWeight = + if V.null inCharsPairList + then V.empty + else + let inWeightList = fmap weight (fmap fst inCharsPairList) + weightPairPair = V.zip inWeightList inCharsPairList + matchList = V.filter ((== testWeight) . weight . fst . snd) weightPairPair + in snd <$> matchList + + +{- Currently unused--employed to reduce chrater umbers by replicating characters with integer weight + can cause memory issues with large data sets +-- | replicateCharPairByWeight replicates characters by integer weight +replicateCharPairByWeight :: (CharInfo, CharacterData) -> [(CharInfo, CharacterData)] +replicateCharPairByWeight firstPair = + let charIntWeight = doubleAsInt $ (weight . fst) firstPair + in + if isNothing charIntWeight then error ("Character weight not an integer in replicateCharPair: " <> (show $ (weight . fst) firstPair)) + else + (replicate (fromJust charIntWeight) firstPair) +-} + +-- | organizeMatrixCharsByMatrix combines matrix charcters if they have the same cost matrix +organizeMatrixCharsByMatrix ∷ [(CharInfo, CharacterData)] → [(CharacterData, CharInfo)] +organizeMatrixCharsByMatrix inCharsPairList = + if null inCharsPairList + then [] + else + let costMatrixList = L.nub $ fmap costMatrix (fmap fst inCharsPairList) + charMatrixLL = fmap (getSameMatrixChars inCharsPairList) costMatrixList + newMatrixPairs = fmap combineMatrixCharsByMatrix charMatrixLL + in newMatrixPairs + + +-- | combineMatrixCharsByMatrix combines matrix characters--assumes cost matrices are the same +combineMatrixCharsByMatrix ∷ [(CharInfo, CharacterData)] → (CharacterData, CharInfo) +combineMatrixCharsByMatrix inCharList = + let newMatrixcharData = V.concat $ fmap matrixStatesPrelim $ fmap snd inCharList + newMatrixChar = + ((snd . head) inCharList) + { matrixStatesPrelim = newMatrixcharData + , matrixStatesFinal = newMatrixcharData + } + newMatrixCharInfo = ((fst . head) inCharList){origInfo = V.concat $ fmap (origInfo . fst) inCharList} + in (newMatrixChar, newMatrixCharInfo) + + +-- | getSameMatrixChars returns character pairs with same matrix as testMatrix +getSameMatrixChars ∷ [(CharInfo, CharacterData)] → S.Matrix Int → [(CharInfo, CharacterData)] +getSameMatrixChars inCharsPairList testMatrix = + let inMatrixList = fmap costMatrix (fmap fst inCharsPairList) + matrixPairPair = zip inMatrixList inCharsPairList + matchList = filter ((== testMatrix) . costMatrix . fst . snd) matrixPairPair + in fmap snd matchList + + +{- | removeConstantCharactersPrealigned takes processed data and removes constant characters +from prealignedCharacterTypes +-} +removeConstantCharactersPrealigned ∷ ProcessedData → PhyG ProcessedData +removeConstantCharactersPrealigned inData@(nameVect, bvNameVect, blockDataVect) + | V.null blockDataVect = pure inData + | otherwise = do + newBlockData ← + getParallelChunkMap <&> \pMap → + removeConstantBlockPrealigned `pMap` V.toList blockDataVect + pure (nameVect, bvNameVect, V.fromList newBlockData) + + +-- | removeConstantBlockPrealigned takes block data and removes constant characters +removeConstantBlockPrealigned ∷ BlockData → BlockData +removeConstantBlockPrealigned inBlockData@(blockName, taxVectByCharVect, charInfoV) = + -- check for null data--really reallyu shouldn't happen + if V.null taxVectByCharVect + then -- trace ("Warning: Null block data in removeConstantBlockPrealigned") + inBlockData + else -- check for prealigned data in block + + if U.getNumberPrealignedCharacters (V.singleton inBlockData) == 0 + then inBlockData + else + let numChars = V.length $ V.head taxVectByCharVect + + -- create vector of single characters with vector of taxon data of sngle character each + -- like a standard matrix with a single character + singleCharVect = fmap (BP.getSingleCharacter taxVectByCharVect) (V.fromList [0 .. numChars - 1]) + + -- actually remove constants form chaarcter list + singleCharVect' = V.zipWith removeConstantCharsPrealigned singleCharVect charInfoV + + -- recreate the taxa vext by character vect block data expects + -- should filter out length zero characters + newTaxVectByCharVect = U.glueBackTaxChar singleCharVect' + in (blockName, newTaxVectByCharVect, charInfoV) + + +{- | removeConstantCharsPrealigned takes a single 'character' and if proper type removes if all values are the same +could be done if character has max lenght of 0 as well. +packed types already filtered when created +-} +removeConstantCharsPrealigned ∷ V.Vector CharacterData → CharInfo → V.Vector CharacterData +removeConstantCharsPrealigned singleChar charInfo = case charType charInfo of + cType | cType `elem` prealignedCharacterTypes → getVariableChars cType singleChar + _ → singleChar + + +{- | getVariableChars checks identity of states in a vector positin in all taxa +and returns True if variable, False if constant +bit packed and non-exact should not get in here +-} +getVariableChars ∷ CharType → V.Vector CharacterData → V.Vector CharacterData +getVariableChars inCharType singleChar = + let nonAddV = fmap snd3 $ fmap stateBVPrelim singleChar + addV = fmap snd3 $ fmap rangePrelim singleChar + matrixV = fmap matrixStatesPrelim singleChar + alSlimV = fmap snd3 $ fmap alignedSlimPrelim singleChar + alWideV = fmap snd3 $ fmap alignedWidePrelim singleChar + alHugeV = fmap snd3 $ fmap alignedHugePrelim singleChar + + -- get identity vect + boolVar = case inCharType of + NonAdd → getVarVectBits inCharType nonAddV [] + Add → getVarVectAdd addV [] + Matrix → getVarVectMatrix matrixV [] + AlignedSlim → getVarVectBits inCharType alSlimV [] + AlignedWide → getVarVectBits inCharType alWideV [] + AlignedHuge → getVarVectBits inCharType alHugeV [] + cType → error $ "Char type unrecognized in getVariableChars: " <> show cType + + -- get Variable characters by type + nonAddVariable = fmap (filterConstantsV (V.fromList boolVar)) nonAddV + addVariable = fmap (filterConstantsV (V.fromList boolVar)) addV + matrixVariable = fmap (filterConstantsV (V.fromList boolVar)) matrixV + alSlimVariable = fmap (filterConstantsSV (V.fromList boolVar)) alSlimV + alWideVariable = fmap (filterConstantsUV (V.fromList boolVar)) alWideV + + -- this is a hack-not sure why popCount etc don't work with HugeVector little endian BVs + -- these should be short seqs in general so no ral impact on effeciancy + alHugeVariable = alHugeV -- fmap (filterConstantsV (V.fromList boolVar)) alHugeV + + -- assign to propoer character fields + outCharVect = + V.zipWith + (assignNewField inCharType) + singleChar + (V.zip6 nonAddVariable addVariable matrixVariable alSlimVariable alWideVariable alHugeVariable) + in -- trace ("GVC:" <> (show $ length boolVar) <> " -> " <> (show $ length $ filter (== False) boolVar)) + outCharVect + + +{- | getVarVectAdd takes a vector of a vector additive ranges and returns False if range overlap +True if not (short circuits) +based on range overlap +-} +getVarVectAdd ∷ V.Vector (V.Vector (Int, Int)) → [Bool] → [Bool] +getVarVectAdd stateVV curBoolList = + if V.null (V.head stateVV) + then L.reverse curBoolList + else + let firstChar = fmap V.head stateVV + isVariable = checkIsVariableAdditive (V.head firstChar) (V.tail firstChar) + in getVarVectAdd (fmap V.tail stateVV) (isVariable : curBoolList) + + +{- | getVarVectMatrix takes a generic vector and returns False if values are same +True if not (short circuits) +based on simple identity not max cost zero +-} +getVarVectMatrix ∷ V.Vector (V.Vector (V.Vector MatrixTriple)) → [Bool] → [Bool] +getVarVectMatrix stateVV curBoolList = + if V.null (V.head stateVV) + then L.reverse curBoolList + else + let firstChar = fmap V.head stateVV + isVariable = checkIsVariableMatrix (getMatrixStateList $ V.head firstChar) (V.tail firstChar) + in getVarVectMatrix (fmap V.tail stateVV) (isVariable : curBoolList) + + +{- | +getVarVectBits takes a generic vector and returns False if values are same +True if not (short circuits) +based on simple identity not max cost zero +-} +getVarVectBits ∷ (FiniteBits a, GV.Vector v a) ⇒ CharType → V.Vector (v a) → [Bool] → [Bool] +getVarVectBits inCharType stateVV curBoolList = + if GV.null (V.head stateVV) + then L.reverse curBoolList + else + let firstChar = fmap GV.head stateVV + isVariable = + if inCharType == NonAdd + then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) + else + if inCharType == AlignedSlim + then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) + else + if inCharType == AlignedWide + then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) + else + if inCharType == AlignedHuge + then checkIsVariableBit (GV.head firstChar) (GV.tail firstChar) + else error ("Char type unrecognized in getVariableChars: " <> show inCharType) + in getVarVectBits inCharType (fmap GV.tail stateVV) (isVariable : curBoolList) + + +{- +-- | checkIsVariableIdentity takes a generic vector and sees if all elements are identical +checkIsVariableIdentity :: (Eq a, GV.Vector v a) => a -> v a -> Bool +checkIsVariableIdentity firstElement inVect = + if GV.null inVect then False + else + if firstElement /= GV.head inVect then True + else checkIsVariableIdentity firstElement (GV.tail inVect) +-} + +{- | checkIsVariableAdditive checks if additive charcter is variable +by taking new ranges and range costs of first element with all others +if summed cost > 0 then variable +-} +checkIsVariableAdditive ∷ (Int, Int) → V.Vector (Int, Int) → Bool +checkIsVariableAdditive (ir1, ir2) rangeList = + if V.null rangeList + then False + else + let (nr1, nr2) = V.head rangeList + (newMin, newMax, newCost) = M.getNewRange ir1 ir2 nr1 nr2 + in if newCost > 0 + then True + else checkIsVariableAdditive (newMin, newMax) (V.tail rangeList) + + +-- | getMatrixStateList returns minimum matrix characters states as integers +getMatrixStateList ∷ V.Vector MatrixTriple → [Int] +getMatrixStateList inState = + let statePairList = zip (fmap fst3 $ V.toList inState) [0 .. (V.length inState - 1)] + minCost = minimum $ fmap fst3 $ V.toList inState + stateList = fmap snd $ filter ((== minCost) . fst) statePairList + in stateList + + +{- | checkIsVariableMatrix checks if matrix charcter is variable +by taking min cost states of first element and +checks for overlap--if empty list intersect then variable +-} +checkIsVariableMatrix ∷ [Int] → V.Vector (V.Vector MatrixTriple) → Bool +checkIsVariableMatrix inStateList restStatesV = + if V.null restStatesV + then False + else + let nextStateList = getMatrixStateList $ V.head restStatesV + + newStateList = L.intersect inStateList nextStateList + in if null newStateList + then True + else checkIsVariableMatrix newStateList (V.tail restStatesV) + + +{- | checkIsVariableBit takes a generic vector and checks for +state overlap via bit AND (.&.) +-} +checkIsVariableBit ∷ (FiniteBits a, GV.Vector v a) ⇒ a → v a → Bool +checkIsVariableBit firstElement restVect + | GV.null restVect = False + | otherwise = + let newState = firstElement .&. (GV.head restVect) + in if popCount newState == 0 + then True + else checkIsVariableBit newState (GV.tail restVect) + + +-- | these need to be abstracted but had problems with the bool list -> Generic vector, and SV pair + +{- | filerConstantsV takes the charcter data and filters out teh constants +uses filter to keep O(n) +filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a +-} +filterConstantsV ∷ V.Vector Bool → V.Vector a → V.Vector a +filterConstantsV inVarBoolV charVect + | V.null inVarBoolV = charVect + | otherwise = + let pairVect = V.zip charVect inVarBoolV + variableCharV = V.map fst $ V.filter ((== True) . snd) pairVect + in variableCharV + + +{- | filerConstantsSV takes the charcter data and filters out teh constants +uses filter to keep O(n) +filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a +-} +filterConstantsSV ∷ (SV.Storable a) ⇒ V.Vector Bool → SV.Vector a → SV.Vector a +filterConstantsSV inVarBoolV charVect + | V.null inVarBoolV = charVect + | otherwise = + let varVect = filterConstantsV inVarBoolV . V.fromList $ SV.toList charVect + in SV.fromList $ V.toList varVect + + +{- | filerConstantsUV takes the charcter data and filters out teh constants +uses filter to keep O(n) +filterConstantsV :: (GV.Vector v a) => [Bool] -> v a -> v a +-} +filterConstantsUV ∷ (UV.Unbox a) ⇒ V.Vector Bool → UV.Vector a → UV.Vector a +filterConstantsUV inVarBoolV charVect + | V.null inVarBoolV = charVect + | otherwise = + let varVect = filterConstantsV inVarBoolV (V.fromList $ UV.toList charVect) + in UV.fromList $ V.toList varVect + + +{- | assignNewField takes character type and a 6-tuple of charcter fields and assigns the appropriate +to the correct field +neither bit packed nor nno-exact should het here +-} +assignNewField + ∷ CharType + → CharacterData + → ( V.Vector BV.BitVector + , V.Vector (Int, Int) + , V.Vector (V.Vector MatrixTriple) + , SV.Vector SlimState + , UV.Vector WideState + , V.Vector HugeState + ) + → CharacterData +assignNewField inCharType charData (nonAddData, addData, matrixData, alignedSlimData, alignedWideData, alignedHugeData) = case inCharType of + NonAdd → charData{stateBVPrelim = (nonAddData, nonAddData, nonAddData)} + Add → charData{rangePrelim = (addData, addData, addData)} + Matrix → charData{matrixStatesPrelim = matrixData} + AlignedSlim → charData{alignedSlimPrelim = (alignedSlimData, alignedSlimData, alignedSlimData)} + AlignedWide → charData{alignedWidePrelim = (alignedWideData, alignedWideData, alignedWideData)} + AlignedHuge → charData{alignedHugePrelim = (alignedHugeData, alignedHugeData, alignedHugeData)} + cType → error $ "Char type unrecognized in assignNewField: " <> show cType + + +{- | recodeAddToNonAddCharacters takes an max states number and processsed data +and recodes additive characters with max state < input max (0..input max - 1) +as a series of binary non-additive characters +-} +recodeAddToNonAddCharacters ∷ GlobalSettings → Int → ProcessedData → ProcessedData +recodeAddToNonAddCharacters inGS maxStateToRecode (nameVect, nameBVVect, blockDataVect) = + let newBlockDataVect = fmap (convertAddToNonAddBlock inGS maxStateToRecode) blockDataVect + in (nameVect, nameBVVect, newBlockDataVect) + + +-- | convertAddToNonAddBlock converts additive characters to non-additive in a block +convertAddToNonAddBlock ∷ GlobalSettings → Int → BlockData → BlockData +convertAddToNonAddBlock inGS maxStateToRecode (blockName, taxByCharDataVV, charInfoV) = + let (newTaxByCharDataVV, newCharInfoVV) = V.unzip $ fmap (recodeTaxonData inGS maxStateToRecode charInfoV) taxByCharDataVV + in -- trace ("CNAB: " <> (show (V.length $ V.head newTaxByCharDataVV, V.length $ V.head newCharInfoVV))) + (blockName, newTaxByCharDataVV, V.head newCharInfoVV) + + +-- | recodeTaxonData recodes Add as nonAdd for each taxon in turn +recodeTaxonData + ∷ GlobalSettings → Int → V.Vector CharInfo → V.Vector CharacterData → (V.Vector CharacterData, V.Vector CharInfo) +recodeTaxonData inGS maxStateToRecode charInfoV taxonCharacterDataV = + let (newCharDataVV, newCharInfoVV) = unzip $ zipWith (recodeAddToNonAddCharacter inGS maxStateToRecode) (V.toList taxonCharacterDataV) (V.toList charInfoV) + in -- trace ("RTD: " <> (show (V.length $ V.concat newCharDataVV, V.length $ V.concat newCharInfoVV))) + (V.concat newCharDataVV, V.concat newCharInfoVV) + + +{- | recodeAddToNonAddCharacter takes a single character for single taxon and recodes if non-additive with + fewer than maxStateToRecode states. + assumes states in linear order + replicatee charinfo for multiple new characters after recoding +-} +recodeAddToNonAddCharacter ∷ GlobalSettings → Int → CharacterData → CharInfo → (V.Vector CharacterData, V.Vector CharInfo) +recodeAddToNonAddCharacter inGS maxStateToRecode inCharData inCharInfo = + let inCharType = charType inCharInfo + numStates = 1 + (L.maximum $ fmap makeInt $ alphabet inCharInfo) -- min 2 (1 + (L.last $ L.sort $ fmap makeInt $ alphabetSymbols $ alphabet inCharInfo)) + -- numStates = 1 + (L.last $ L.sort $ fmap makeInt $ alphabetSymbols $ alphabet inCharInfo) + origName = name inCharInfo + in -- if a single state recodes to a single uninfomative binary + -- removed || ((not . doubleIsInt . weight) inCharInfo) to allow for recodding (leaving weight) for non-integer weights + if (inCharType /= Add) + then (V.singleton inCharData, V.singleton inCharInfo) + else -- the limit on recoded states is removed for PMDL/ML since otherwise bit costs will be incorrect + + if (numStates > maxStateToRecode) && ((optimalityCriterion inGS) `notElem` [PMDL, SI, MAPA]) + then (V.singleton inCharData, V.singleton inCharInfo) + else + if numStates < 2 + then (V.empty, V.empty) + else -- create numStates - 1 no-additve chaaracters (V.singleton inCharData, V.singleton inCharInfo) + -- bits ON-- [0.. snd range] + + let minRangeIndex = fst $ V.head $ snd3 $ rangePrelim inCharData + maxRangeIndex = snd $ V.head $ snd3 $ rangePrelim inCharData + inCharOrigData = origInfo inCharInfo + newCharInfo = + inCharInfo + { name = (T.pack $ (T.unpack origName) <> "RecodedToNonAdd") + , charType = NonAdd + , alphabet = fromSymbols . fmap ST.fromString $ "0" :| ["1"] + , origInfo = inCharOrigData + } + + -- create new characters and new character info + newCharList = fmap (makeNewNonAddChar minRangeIndex maxRangeIndex) [0 .. numStates - 2] + + newCharInfoList = replicate (numStates - 1) newCharInfo + in -- trace ("RTNA: Numstates " <> (show numStates) <> " " <> (show $ (snd3 . rangePrelim) inCharData) <> " -> " <> (show $ fmap (snd3 . stateBVPrelim) newCharList)) + -- (show (length newCharList, V.length $ V.replicate (numStates - 1) newCharInfo)) <> "\n" <> (show newCharList) <> "\n" <> (show $ charType newCharInfo)) + (V.fromList newCharList, V.fromList newCharInfoList) + where + makeInt a = + let newA = readMaybe (ST.toString a) ∷ Maybe Int + in if isNothing newA + then error ("State '" <> (ST.toString a) <> "' not recoding to Int") + else fromJust newA + + +{- | makeNewNonAddCharacter takes a stateIndex and charcatear number +and makes a non-additive character with 0 or 1 coding +based on stateIndex versus state number +if stateIndex > charNumber then 1 else 0 (coded as bit 0 for 0, bit 1 for 1) +-} +makeNewNonAddChar ∷ Int → Int → Int → CharacterData +makeNewNonAddChar minStateIndex maxStateIndex charIndex = + let bvMinState = + if minStateIndex <= charIndex + then BV.fromBits [True, False] + else BV.fromBits [False, True] + + bvMaxState = + if maxStateIndex <= charIndex + then BV.fromBits [True, False] + else BV.fromBits [False, True] + + bvState = bvMinState .|. bvMaxState + in emptyCharacter + { stateBVPrelim = (V.singleton bvState, V.singleton bvState, V.singleton bvState) + , stateBVFinal = V.singleton bvState + } diff --git a/src/Input/TNTUtilities.hs b/src/Input/TNTUtilities.hs new file mode 100644 index 000000000..b2e1d0a8b --- /dev/null +++ b/src/Input/TNTUtilities.hs @@ -0,0 +1,1028 @@ +{- +Functions to input TNT file reading for PhyG + This is far from complete wrt TNT functionality + Deals with scope and add/nonadd/sankloff + ccode, ccosts + Ambiguities in "dense" tnt rows (single character states, no spaces) + Ambiguiities and multi-character states (e.g. CYTB, 1.23) + Will limit continuous character reps to 9 sig digits + this to allow 2x32 bit representations ina single Word64 later + +One Big thing-- + For multicharacter states-- + the parsing assumes there is more than one multicharacter character. + Can easily add a char from single-state (with spaces) + But the parsing relies on counting hte number of "words" in a line of data + if 2--single character states + if > 2 --multicharacter states. + A singl multi-character states would be parsed as single charcter states + NEED TO FIX +-} + +{- | +Module to read TNT input files for phylogenetic analysis. +-} +module Input.TNTUtilities ( + getTNTDataText, +) where + +import Control.Monad (replicateM, when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Alphabet +import Data.Char +import Data.Char qualified as C +import Data.Foldable +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.Maybe +import Data.MetricRepresentation +import Data.Set qualified as Set +import Data.TCM qualified as TCM +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Debug.Trace +import Input.DataTransformation qualified as DT +import Input.FastAC qualified as FAC +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as SM +import Text.Read +import Types.Types + + +-- getTNTDataText take file contents and returns raw data and char info form TNT file +getTNTDataText ∷ T.Text → String → PhyG RawData +getTNTDataText inString fileName = + if T.null inString + then errorWithoutStackTrace ("\n\nTNT input file " <> fileName <> " processing error--empty file") + else + let inString' = T.unlines $ filter ((/= '&') . T.head) $ filter (not . T.null) $ fmap T.strip (T.lines inString) + inText' = T.strip inString' + + -- this for leading command like "taxname ujsed in report tnt" + inText = + if (T.head inText') /= 'x' + then T.unlines $ tail $ T.lines inText' + else inText' + in if toLower (T.head inText) /= 'x' + then errorWithoutStackTrace ("\n\nTNT input file " <> fileName <> " processing error--must begin with 'xread'") + else -- look for quoted message + + let singleQuotes = T.count (T.pack "'") inText + quotedMessage + | singleQuotes == 0 = T.pack "No TNT title message" + | singleQuotes > 2 = + errorWithoutStackTrace ("\n\nTNT input file " <> fileName <> " processing error--too many single quotes in title") + | otherwise = T.split (== '\'') inText !! 1 + (firstNum, secondNum, remainderText) = if singleQuotes /= 0 then removeNCharNTax $ T.split (== '\'') inText !! 2 + else removeNCharNTax $ inText + numCharM = readMaybe (T.unpack firstNum) ∷ Maybe Int + numTaxM = readMaybe (T.unpack secondNum) ∷ Maybe Int + restFile = filter ((> 0) . T.length) $ T.lines remainderText + + numChar = fromJust numCharM + numTax = fromJust numTaxM + in -- trace (show quotedMessage <> " " <> (show remainderText) <> "\n" <> show restFile) ( + if T.null inText + then errorWithoutStackTrace ("n\nTNT input file " <> fileName <> " processing error--empty TNT contents") + else + if null restFile + then errorWithoutStackTrace ("n\nTNT input file " <> fileName <> " processing error--empty TNT contents after first line") + else + if isNothing numCharM + then + errorWithoutStackTrace + ("n\nTNT input file " <> fileName <> " processing error--number of characters:" <> show (T.unpack firstNum)) + else + if isNothing numTaxM + then errorWithoutStackTrace ("n\nTNT input file " <> fileName <> " processing error--number of taxa:" <> show (T.unpack secondNum)) + else + let semiColonLineNumber = L.findIndex ((== ';') . T.head) restFile -- (== T.pack ";") restFile + in if isNothing semiColonLineNumber + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " processing error--can't find ';' to end data block" <> show restFile) + else + let dataBlock = filter ((> 0) . T.length) (T.filter printOrSpace <$> take (fromJust semiColonLineNumber) restFile) + -- dataBlock = filter ((>0).T.length) $ fmap (T.filter C.isPrint) $ take (fromJust semiColonLineNumber) restFile + charInfoBlock = filter (/= T.pack ";") $ filter ((> 0) . T.length) $ tail $ drop (fromJust semiColonLineNumber) restFile + numDataLines = length dataBlock + -- (interleaveNumber, interleaveRemainder) = numDataLines `quotRem` numTax + (_, interleaveRemainder) = numDataLines `quotRem` numTax + in -- trace (show dataBlock <> "\n" <> show (interleaveNumber, interleaveRemainder, numDataLines, numTax)) ( + if interleaveRemainder /= 0 + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " processing error--number of taxa mis-specified or interleaved format error ") + else + let sortedData = glueInterleave fileName dataBlock numTax numChar [] + charNumberList = fmap (length . snd) sortedData + nameLengthList = zip (fmap fst sortedData) charNumberList + incorrectLengthList = filter ((/= numChar) . snd) nameLengthList + (hasDupTerminals, dupList) = DT.checkDuplicatedTerminals sortedData + in do + renamedDefaultCharInfo ← renameTNTChars fileName 0 <$> (replicateM numChar defaultTNTCharInfo) + charInfoData ← getTNTCharInfo fileName numChar renamedDefaultCharInfo charInfoBlock + let checkInfo = length charInfoData == numChar + + -- trace ("Sorted data:" <> show sortedData) ( + -- trace ("Alph2 " <> (show $ fmap alphabet charInfoData)) ( + if not checkInfo + then + error + ("Character information number not equal to input character number: " <> show numChar <> " v " <> show (length charInfoData)) + else + if not $ null incorrectLengthList + then + errorWithoutStackTrace + ( "\tInput file " + <> fileName + <> " has terminals with incorrect or varying numbers of characters (should be " + <> show numChar + <> "):" + <> show incorrectLengthList + ) + else + if hasDupTerminals + then errorWithoutStackTrace ("\tInput file " <> fileName <> " has duplicate terminals: " <> show dupList) + else do + -- check non-Additive alphabet to be numbers + -- integerize and reweight additive chars (including in ambiguities) + let curNames = fmap ((T.filter (/= '"') . T.filter C.isPrint) . fst) sortedData + let curData = fmap snd sortedData + let (curData', charInfoData') = checkAndRecodeCharacterAlphabets fileName curData charInfoData [] [] + logWith + LogInfo + ( "\nTNT file file " + <> fileName + <> " message : " + <> T.unpack quotedMessage + <> " with " + <> show numTax + <> " taxa and " + <> show numChar + <> " characters" + <> "\n" + ) + -- logWith LogInfo ("TNTU:" <> (show (fmap length curData', length charInfoData'))) + let (newData, newCharInfoL) = filterInvariant curData' charInfoData' ([], []) + -- logWith LogInfo ("TNTUF:" <> (show (fmap length newData, length newCharInfoL))) + if null newCharInfoL + then failWithPhase Parsing ("TNT file " <> fileName <> " has no variant data") + else pure $ (zip curNames newData, newCharInfoL) + where + -- ) ) + printOrSpace a = (C.isPrint a || C.isSpace a) && (a /= '\r') + + +-- | filterInvariant filters out characters that are identical in all terminals +filterInvariant + ∷ [[ST.ShortText]] → [charInfoData] → ([[ST.ShortText]], [charInfoData]) → ([[ST.ShortText]], [charInfoData]) +filterInvariant inDataLL inCharInfoL newData@(newDataLL, newCharInfoL) = + if null inCharInfoL + then (fmap reverse newDataLL, reverse newCharInfoL) + else + let firstCharData = fmap head inDataLL + firstCharData' = filter (`notElem` [ST.pack "-", ST.pack "?"]) firstCharData + allSameL = fmap ((== (head firstCharData'))) (tail firstCharData') + allSame = if null firstCharData' then True + else foldl' (&&) True allSameL + in -- trace ("FI:" <> (show (allSameL, allSame))) $ + if allSame + then filterInvariant (fmap tail inDataLL) (tail inCharInfoL) (newDataLL, newCharInfoL) + else + if null newCharInfoL + then filterInvariant (fmap tail inDataLL) (tail inCharInfoL) (fmap (: []) firstCharData, head inCharInfoL : newCharInfoL) + else filterInvariant (fmap tail inDataLL) (tail inCharInfoL) (zipWith (:) firstCharData newDataLL, head inCharInfoL : newCharInfoL) + + +{- | removeNCharNTax removes the first two "words" of nchar and ntax, but leaves text with line feeds so can use +lines later +-} +removeNCharNTax ∷ T.Text → (T.Text, T.Text, T.Text) +removeNCharNTax inText = + let noLeadingSpaces = T.dropWhile (not . C.isDigit) inText + nCharacters = T.takeWhile C.isDigit noLeadingSpaces + remainder = T.dropWhile C.isDigit noLeadingSpaces + noLeadingSpaces' = T.dropWhile (not . C.isDigit) remainder + nTaxa = T.takeWhile C.isDigit noLeadingSpaces' + remainder' = T.dropWhile C.isDigit noLeadingSpaces' + in (nCharacters, nTaxa, remainder') + + +{- | glueInterleave takes interleves lines and puts them together with name error checking based on number of taxa +needs to be more robust on detecting multichar blocks--now if only a single multichar in a block would think +its a regular block +-} +glueInterleave ∷ String → [T.Text] → Int → Int → [(T.Text, [String])] → [TermData] +glueInterleave fileName lineList numTax numChars curData + | null lineList = + -- recheck num taxa + -- check chars after process due to ambiguities + if length curData /= numTax + then error ("Error in glueInterleave: final taxon number error: " <> show numTax <> " vs. " <> show (length curData)) + else + let nameList = fmap (T.strip . fst) curData + charShortTextList = fmap (fmap ST.fromString . snd) curData + in -- trace ((show $ length curData) <> " " <> (show $ length $ snd $ head curData)) + zip nameList charShortTextList + | length lineList < numTax = error "Error in glueInterleave: line number error" + | otherwise = + let thisDataBlock = T.words <$> take numTax lineList + blockNames = fmap head thisDataBlock + -- if two words then regular TNT without space, otherwise spaces between states + blockStrings = + if length (head thisDataBlock) == 2 + then fmap (((collectAmbiguities fileName . fmap (: [])) . T.unpack) . last) thisDataBlock + else fmap ((collectMultiCharAmbiguities fileName . fmap T.unpack) . tail) thisDataBlock + canonicalNames = + if not (null curData) + then fmap fst curData + else blockNames + canonicalStrings = fmap snd curData + in -- trace ("GIL: " <> show blockNames <> "\n" <> show canonicalNames <> "\n" <> show blockStrings <> "\n" <> show canonicalStrings) ( + -- check for taxon order + -- trace ("Words:" <> (show $ length $ head thisDataBlock)) ( + if blockNames /= canonicalNames + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> "processing error--interleaved taxon order error or mispecified number of taxa") + else + let newChars = + if not (null curData) + then zipWith (<>) canonicalStrings blockStrings + else blockStrings + in glueInterleave fileName (drop numTax lineList) numTax numChars (zip canonicalNames newChars) + + +-- ) + +{- | collectMultiCharAmbiguities take a list of Strings and collects TNT ambiguities [X Y] into single Strings +this only for multicharacter TNT characters as opposed to collectAmbiguities +-} +collectMultiCharAmbiguities ∷ String → [String] → [String] +collectMultiCharAmbiguities fileName inStringList = + if null inStringList + then [] + else + let firstString = head inStringList + in -- might be better to check for head and last [] for better error processing + if ('[' `elem` firstString) && (']' `elem` firstString) + then + if '-' `elem` firstString + then + let firstPart = takeWhile (/= '-') firstString + secondPart = tail $ dropWhile (/= '-') firstString + in concat [firstPart, " ", secondPart] : collectMultiCharAmbiguities fileName (tail inStringList) + else + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " processing error: ambiguity format problem no ambiguity or range '-' in : " + <> firstString + ) + else + if ']' `elem` firstString + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " processing error: ambiguity format problem naked ']' in : " <> firstString) + else + if '[' `elem` firstString + then + let firstStringList = takeWhile (']' `notElem`) inStringList + ambiguityStringList = firstStringList <> [inStringList !! max 0 (length firstStringList)] + in -- trace (show firstStringList <> show ambiguityStringList <> show (head $ drop (length firstStringList) inStringList)) + unwords ambiguityStringList : collectMultiCharAmbiguities fileName (drop (length ambiguityStringList) inStringList) + else firstString : collectMultiCharAmbiguities fileName (tail inStringList) + + +-- ) + +{- | collectAmbiguities take a list of Strings and collects TNT ambiguities [XY] +into single Strings +this only for single 'block' TNT characters where states are a single character +-} +collectAmbiguities ∷ String → [String] → [String] +collectAmbiguities fileName inStringList = + if null inStringList + then [] + else + let firstString = head inStringList + in -- trace ("CA " <> firstString) ( + if firstString == "]" + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " processing error: ambiguity format problem naked ']' in : " <> firstString) + else + if firstString == "[" + then + let ambiguityStringList = takeWhile (/= "]") inStringList <> ["]"] + in -- trace ("CA:" <> (concat ambiguityStringList)) -- <> " " <> concat (drop (length $ concat ambiguityStringList) inStringList)) + concat ambiguityStringList : collectAmbiguities fileName (drop (length $ concat ambiguityStringList) inStringList) + else firstString : collectAmbiguities fileName (tail inStringList) + + +-- ) + +-- | defaultTNTCharInfo default values for TNT characters +defaultTNTCharInfo ∷ (MonadIO m) ⇒ m CharInfo +defaultTNTCharInfo = + let a = fromSymbols $ ST.fromString "0" :| [] + f info = + info + { charType = NonAdd + , activity = True + , weight = 1.0 + , costMatrix = SM.empty + , name = T.empty + , alphabet = a + , prealigned = True + , origInfo = V.singleton (T.empty, NonAdd, a) + } + in f <$> emptyCharInfo + + +-- | renameTNTChars creates a unique name for each character from fileNamer:Number +renameTNTChars ∷ String → Int → [CharInfo] → [CharInfo] +renameTNTChars fileName charIndex = \case + [] → [] + firstInfo : otherInfo → + let newName = T.pack $ filter (/= ' ') fileName <> "#" <> show charIndex + localInfo = firstInfo{name = newName} + in localInfo : renameTNTChars fileName (charIndex + 1) otherInfo + + +{- | getTNTCharInfo numChar charInfoBlock +bit of but needs to update as it goes along +-} +getTNTCharInfo ∷ String → Int → [CharInfo] → [T.Text] → PhyG [CharInfo] +getTNTCharInfo fileName charNumber curCharInfo inLines = + if null inLines + then do + pure curCharInfo + else + let firstLine' = T.strip $ head inLines + multipleCommandsInLine = fmap ((T.reverse . T.cons ';') . T.reverse) (filter ((> 0) . T.length) $ T.strip <$> T.splitOn (T.pack ";") firstLine') + firstLine = head multipleCommandsInLine + in -- trace ("GTCI:" <> (show multipleCommandsInLine)) $ + if T.null firstLine + then getTNTCharInfo fileName charNumber curCharInfo (tail inLines) + else -- hit 'proc /;' line at end + + if T.head firstLine == 'p' + then do + pure curCharInfo + else + if T.last firstLine /= ';' + then + errorWithoutStackTrace + ( "\n\nTNT input file " <> fileName <> " processing error--ccode/costs lines must end with semicolon ';': " <> T.unpack firstLine + ) + else do + -- have a valid line + let wordList = T.words $ T.init firstLine + let command2 = T.toLower $ T.take 2 $ head wordList + -- localCharInfoResult ← getCCodes fileName charNumber (tail wordList) curCharInfo + localCharInfo ← + if command2 == T.pack "cc" + then getCCodes fileName charNumber (tail wordList) curCharInfo + else pure curCharInfo + let localCharInfo' = + if command2 == T.pack "co" + then getCosts fileName charNumber (tail wordList) localCharInfo + else localCharInfo + + if (command2 /= T.pack "cc") && (command2 /= T.pack "co") + then do + logWith + LogInfo + ("\n\nWarning: TNT input file " <> fileName <> " unrecognized/not implemented command ignored : " <> T.unpack firstLine <> "\n") + getTNTCharInfo fileName charNumber curCharInfo (tail multipleCommandsInLine <> tail inLines) + else getTNTCharInfo fileName charNumber localCharInfo' (tail multipleCommandsInLine <> tail inLines) + + +-- ) + +-- | ccodeChars are the TNT ccode control characters +ccodeChars ∷ [Char] +ccodeChars = ['+', '-', '[', ']', '(', ')', '/'] + + +{- | getCCodes takes a line from TNT and modifies characters according to cc-code option +assumes single command (ccodeChars) per line +could sort and num so only hit each char once--but would be n^2 then. +the singleton stuff for compount things like "+." +-} +getCCodes ∷ String → Int → [T.Text] → [CharInfo] → PhyG [CharInfo] +getCCodes fileName charNumber commandWordList curCharInfo = + -- trace ("getCCodes " <> show commandWordList) $ + if null curCharInfo + then do + pure [] + else + let charStatus = + if T.length (head commandWordList) == 1 + then head commandWordList + else T.singleton $ T.head $ head commandWordList + scopeList = + if T.length (head commandWordList) == 1 + then tail commandWordList + else -- not a weight--weight gets added to scope without special case + + if T.head (head commandWordList) /= '/' + then T.tail (head commandWordList) : tail commandWordList + else -- a weight '/'--weight gets added to scope without special case + + T.unwords (tail $ T.words $ T.tail (head commandWordList)) : tail commandWordList + charIndices = L.nub $ L.sort $ concatMap (scopeToIndex fileName charNumber) scopeList + in do + updatedCharInfo ← getNewCharInfo fileName curCharInfo charStatus (head commandWordList) charIndices 0 [] + -- trace (show charStatus <> " " <> (show scopeList) <> " " <> show charIndices) + -- if T.length charStatus > 1 then errorWithoutStackTrace ("\n\nTNT input file " <> fileName <> " character status processing error: option not recognized/implemented " <> (T.unpack charStatus)) + -- else + pure updatedCharInfo + + +{- | getCosts takes a line from TNT and modifies characters according to cc-code option +command format : costs A.B = X/Y Z U>V Q; +assumes X/Y and U>V have no sapces (= one word) +-} +getCosts ∷ String → Int → [T.Text] → [CharInfo] → [CharInfo] +getCosts fileName charNumber commandWordList curCharInfo = + -- trace ("getCosts " <> show commandWordList) $ + if null curCharInfo + then [] + else + let scopeList = takeWhile (/= T.pack "=") commandWordList + charIndices = L.nub $ L.sort $ concatMap (scopeToIndex fileName charNumber) scopeList + (localAlphabet, localMatrix) = processCostsLine fileName $ tail $ dropWhile (/= T.pack "=") commandWordList + updatedCharInfo = newCharInfoMatrix curCharInfo localAlphabet localMatrix charIndices 0 [] + in trace + ("Alph " <> (show $ fmap alphabet updatedCharInfo)) + updatedCharInfo + + +{- | processCostsLine takes the transformation commands of TNT and creates a TCM matrix from that +does not check for alphabet size or order so sorts states to get matrix +TNT states (alphabet elements) must be single characters +-} +processCostsLine ∷ String → [T.Text] → (NonEmpty ST.ShortText, [[Int]]) +processCostsLine fileName wordList = + if null wordList + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " costs processing error: 'costs' command without transfomation costs specified") + else case L.sort . L.nub $ foldMap getAlphabetElements wordList of + [] → errorWithoutStackTrace ("\n\nTNT input file " <> fileName <> " No symbols found!") + s : ss → + let localAlphabet = s :| ss + transCosts = getTransformationCosts fileName localAlphabet wordList + localMatrix = makeMatrix fileName localAlphabet transCosts + in (localAlphabet, localMatrix) + + +-- trace ("TNT" <> show localAlphabet <> " " <> show transCosts <> "\n\t" <> show localMatrix) + +-- | makeMatrix takes triples and makes a square matrix with 0 diagonals if not specified +makeMatrix ∷ String → NonEmpty ST.ShortText → [(Int, Int, Int)] → [[Int]] +makeMatrix fileName localAlphabet transCosts + | null transCosts = [] + | length transCosts < (length localAlphabet * length localAlphabet) - length localAlphabet = + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " costs processing error: 'costs' command not all pairwise (non-diagnonal) transformation costs specified" + ) + | length transCosts > (length localAlphabet * length localAlphabet) = + errorWithoutStackTrace + ( "\n\nTNT input file " <> fileName <> " costs processing error: 'costs' command too many pairwise transformation costs specified" + ) + | otherwise = + let initialMatrix = replicate (length localAlphabet) $ replicate (length localAlphabet) 0 + newMatrix = SM.toFullLists $ SM.updateMatrix (SM.fromLists initialMatrix) transCosts + in -- check for uninitialized, non-diagnonal cells--maybe metricity as warning + newMatrix + + +{- | getTransformationCosts get state pairs and their costs +retuns as (i,j,k) = i->j costs k +need to make this gerneral to letter states +-} +getTransformationCosts ∷ String → NonEmpty ST.ShortText → [T.Text] → [(Int, Int, Int)] +getTransformationCosts fileName localAlphabet wordList + | null wordList = [] + | length wordList == 1 = + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " ccode processing error: 'costs' command imporperly formated (transformation and costs in pairs)" + ) + | otherwise = + let -- wordlist must be >= 2 due to above tests + transText = head wordList + costText = wordList !! 1 + transCost = readMaybe (T.unpack costText) ∷ Maybe Int + directedOperator = T.find (== '>') transText + symetricalOperator = T.find (== '/') transText + in if isNothing directedOperator && isNothing symetricalOperator + then + errorWithoutStackTrace + ("\n\nTNT input file " <> fileName <> " ccode processing error: 'costs' command requires '/' or '>': " <> T.unpack transText) + else + let fromStateText = + if isNothing symetricalOperator + then T.takeWhile (/= '>') transText + else T.takeWhile (/= '/') transText + toStateText = + if isNothing symetricalOperator + then T.tail $ T.dropWhile (/= '>') transText + else T.tail $ T.dropWhile (/= '/') transText + fromState = readMaybe (T.unpack fromStateText) ∷ Maybe Int + toState = readMaybe (T.unpack toStateText) ∷ Maybe Int + in if isNothing transCost + then + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " ccode processing error: 'costs' command transformation " + <> T.unpack costText + <> " does not appear to be an integer." + ) + else + if isJust fromState && isJust toState + then -- states are numerical + + let newTripleList = + if isNothing symetricalOperator + then [(fromJust fromState, fromJust toState, fromJust transCost)] + else [(fromJust fromState, fromJust toState, fromJust transCost), (fromJust toState, fromJust fromState, fromJust transCost)] + in newTripleList <> getTransformationCosts fileName localAlphabet (drop 2 wordList) + else -- states are characters (or multicharacters) + + let fromStateIndex = L.elemIndex (ST.fromText $ T.toStrict fromStateText) $ toList localAlphabet + toStateIndex = L.elemIndex (ST.fromText $ T.toStrict toStateText) $ toList localAlphabet + newTripleList = + if isNothing symetricalOperator + then [(fromJust fromStateIndex, fromJust toStateIndex, fromJust transCost)] + else + [ (fromJust fromStateIndex, fromJust toStateIndex, fromJust transCost) + , (fromJust toStateIndex, fromJust fromStateIndex, fromJust transCost) + ] + in if isNothing fromStateIndex + then + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " ccode processing error: 'costs' command " + <> show (T.unwords wordList) + <> " transformation state " + <> T.unpack fromStateText + <> " was not found in charcater alphabet " + <> show localAlphabet + ) + else + if isNothing toStateIndex + then + errorWithoutStackTrace + ( "\n\nTNT input file " + <> fileName + <> " ccode processing error: 'costs' command " + <> show (T.unwords wordList) + <> " transformation state " + <> T.unpack toStateText + <> " was not found in charcater alphabet " + <> show localAlphabet + ) + else newTripleList <> getTransformationCosts fileName localAlphabet (drop 2 wordList) + + +{- | getAlphabetElements takes Text and returns non '/' '>' elements +this is for a single "block" as in A1>G2 or C>T +-} +getAlphabetElements ∷ T.Text → [ST.ShortText] +getAlphabetElements inText = + if T.null inText + then [] + else + let hasForwardSlash = T.find (== '/') inText + hasGreaterThan = T.find (== '>') inText + in if isNothing hasForwardSlash && isNothing hasGreaterThan + then [] + else + let firstSymbol = T.takeWhile (`notElem` ['/', '>']) inText + secondSymbol = T.tail $ T.dropWhile (`notElem` ['/', '>']) inText + in [ST.fromText $ T.toStrict firstSymbol, ST.fromText $ T.toStrict secondSymbol] + + +{- +let symbolList = T.filter (/= '/') $ T.filter (/= '>') inText +in +fmap ST.singleton $ T.unpack symbolList +-} + +-- | scopeToIndex takes the number of characters and converts to a list of indices +scopeToIndex ∷ String → Int → T.Text → [Int] +scopeToIndex fileName numChars scopeText = + if T.null scopeText + then [] + else -- stop will include '.' if present` + + let (start, stop) = T.breakOn (T.pack ".") scopeText + in -- trace (show (start, stop)) ( + -- single integer index + if not (T.null start) && (T.head start `elem` ccodeChars) + then + errorWithoutStackTrace + ( "\n\nTNT file " + <> fileName + <> " ccode processing error: ccode '" + <> T.unpack scopeText + <> "' incorrect format. Scope must follow operator " + ) + else + if start == scopeText + then + let scopeSingleton = readMaybe (T.unpack scopeText) ∷ Maybe Int + in if isNothing scopeSingleton + then + errorWithoutStackTrace + ("\n\nTNT file " <> fileName <> " ccode processing error: ccode '" <> T.unpack scopeText <> "' contains non-integer (0)") + else + if fromJust scopeSingleton < numChars + then [fromJust scopeSingleton] + else + errorWithoutStackTrace + ( "\n\nTNT file " + <> fileName + <> " ccode processing error: scope greater than char number " + <> show (fromJust scopeSingleton) + <> " > " + <> show (numChars - 1) + ) + else + let startIndex = + if T.null start + then Just 0 + else readMaybe (T.unpack start) ∷ Maybe Int + stopIndex = + if stop == T.pack "." + then Just (numChars - 1) + else readMaybe (T.unpack $ T.tail stop) ∷ Maybe Int + in if isNothing startIndex || isNothing stopIndex + then + errorWithoutStackTrace + ("\n\nTNT file " <> fileName <> " ccode processing error: ccode '" <> T.unpack scopeText <> "' contains non-integer (1)") + else + if fromJust startIndex >= numChars + then + errorWithoutStackTrace + ( "\n\nTNT file " + <> fileName + <> " ccode processing error: scope greater than char number " + <> show (fromJust startIndex) + <> " > " + <> show (numChars - 1) + ) + else + if fromJust stopIndex >= numChars + then + errorWithoutStackTrace + ( "\n\nTNT file " + <> fileName + <> " ccode processing error: scope greater than char number " + <> show (fromJust stopIndex) + <> " > " + <> show (numChars - 1) + ) + else [(max 0 (fromJust startIndex)) .. (min (fromJust stopIndex) numChars)] + + +-- ) + +{- | getNewCharInfo updates the a specific list character element +if that char is not in index list it is unaffected and added back to create the new list +in a single pass. +if nothing to do (and nothing done so curCharLIst == []) then return original charInfo +othewise return the reverse since new values are prepended +-} +getNewCharInfo ∷ String → [CharInfo] → T.Text → T.Text → [Int] → Int → [CharInfo] → PhyG [CharInfo] +getNewCharInfo fileName inCharList newStatus newStatusFull indexList charIndex curCharList = + -- trace (show charIndex <> " " <> show indexList <> " " <> (show $ length inCharList)) ( + if null inCharList + then do + pure $ reverse curCharList + else + if null indexList + then + if null curCharList + then do + pure inCharList + else do + pure $ reverse curCharList <> inCharList + else + let firstIndex = head indexList + firstInfo = head inCharList + in if charIndex /= firstIndex + then getNewCharInfo fileName (tail inCharList) newStatus newStatusFull indexList (charIndex + 1) (firstInfo : curCharList) + else + let updatedCharInfo + | newStatus == T.pack "-" = firstInfo{charType = NonAdd} + | newStatus == T.pack "+" = firstInfo{charType = Add} + | newStatus == T.pack "[" = firstInfo{activity = True} + | newStatus == T.pack "]" = firstInfo{activity = False} + | newStatus == T.pack "(" = firstInfo{charType = Matrix} + | newStatus == T.pack ")" = firstInfo{charType = NonAdd} + -- \| newStatus == T.pack "/" = firstInfo {weight = 1.0} + | T.head newStatus == '/' = + let newWeight = readMaybe (tail $ T.unpack newStatusFull) ∷ Maybe Double + in if isNothing newWeight + then + errorWithoutStackTrace + ("\n\nTNT file " <> fileName <> " ccode processing error: weight " <> tail (T.unpack newStatusFull) <> " not a double") + else firstInfo{weight = fromJust newWeight} + | otherwise = + -- trace ("Warning: TNT file " <> fileName <> " ccodes command " <> T.unpack newStatus <> " is unrecognized/not implemented--skipping") + firstInfo + in do + when ((T.unpack newStatus `notElem` ["-", "+", "[", "]", "(", ")"]) && (T.head newStatus /= '/')) $ + logWith + LogWarn + ( "Warning: TNT file " + <> fileName + <> " ccodes command " + <> T.unpack newStatus + <> " is unrecognized/not implemented--skipping" + <> "\n" + ) + getNewCharInfo + fileName + (tail inCharList) + newStatus + newStatusFull + (tail indexList) + (charIndex + 1) + (updatedCharInfo : curCharList) + + +{- | newCharInfoMatrix updates alphabet and tcm matrix for characters in indexList +if that character is not in index list it is unaffected and added back to create the new list +in a single pass. +if nothing to do (and nothing done so curCharList == []) then return original charInfo +othewise retiurn the reverse since new values are prepended +-} +newCharInfoMatrix ∷ [CharInfo] → NonEmpty ST.ShortText → [[Int]] → [Int] → Int → [CharInfo] → [CharInfo] +newCharInfoMatrix inCharList localAlphabet localMatrix indexList charIndex curCharList = + -- trace (show charIndex <> " " <> show indexList <> " " <> (show $ length inCharList)) ( + if null inCharList + then reverse curCharList + else + if null indexList + then + if null curCharList + then inCharList + else reverse curCharList <> inCharList + else + let firstIndex = head indexList + firstInfo = head inCharList + in if charIndex /= firstIndex + then newCharInfoMatrix (tail inCharList) localAlphabet localMatrix indexList (charIndex + 1) (firstInfo : curCharList) + else -- let updatedCharInfo = firstInfo {alphabet = fromSymbols localAlphabet, costMatrix = SM.fromLists localMatrix} + + let updatedCharInfo = firstInfo{alphabet = fromSymbols localAlphabet, costMatrix = SM.fromLists localMatrix} + in -- trace ("TNT2" <> (show $ alphabet updatedCharInfo)) + newCharInfoMatrix (tail inCharList) localAlphabet localMatrix (tail indexList) (charIndex + 1) (updatedCharInfo : curCharList) + + +{- | reconcileAlphabetAndCostMatrix trakes the original charcater alphabet created from the cost matrix and compares to the +observed states. If observed states are a subset of the inferred, the inferred are used to replace the original +this could happen if a matrix is specified for arange of characters, some of which do not exhibit all the states +otherwsie an error is thrown since states don't agree with m,artrix specification +this could happen for a DNA character (ACGT-) with a martix specified of numerical values (01234) +-} +reconcileAlphabetAndCostMatrix + ∷ ( Ord s + , Show s + ) + ⇒ String + → String + → Alphabet s + → Alphabet s + → Alphabet s +reconcileAlphabetAndCostMatrix fileName charName observedAlphabet inferredAlphabet + | observedAlphabet `isAlphabetSubsetOf` inferredAlphabet = inferredAlphabet + | otherwise = + errorWithoutStackTrace $ + fold + [ "Error: TNT file " + , fileName + , " character number " + , tail $ dropWhile (/= '#') charName + , " Observed " + , show observedAlphabet + , " is incompatible with matrix specification states " + , show inferredAlphabet + ] + + +-- checks is observed alphabet is sub set of inferred (ie that from cost matrix string) +-- this can happen if a general DNA cost is sspecified for many characters, but some +-- characters may have only a few of the states. +isAlphabetSubsetOf ∷ (Show s, Ord s) ⇒ Alphabet s → Alphabet s → Bool +isAlphabetSubsetOf specialAlphabet queryAlphabet = + let querySet = alphabetSymbols queryAlphabet + specialSet = alphabetSymbols specialAlphabet + in not $ querySet `Set.isSubsetOf` specialSet + + +{- +-- this seems to produce backwards logic +isAlphabetSubsetOf' :: Ord s => Alphabet s -> Alphabet s -> Bool +isAlphabetSubsetOf' specialAlphabet queryAlphabet = + let querySet = alphabetSymbols queryAlphabet + specialSet = alphabetSymbols specialAlphabet + in trace ("RACM: " <> (show $ querySet `Set.isSubsetOf` specialSet)) querySet `Set.isSubsetOf` specialSet +-} + +{- | checkAndRecodeCharacterAlphabets take RawData and checks the data with char info. +verifies that states (including in ambiguity) are Numerical for additive, and checks alphabets and cost matrices +and assigns correct alphabet to all characters +-} +checkAndRecodeCharacterAlphabets + ∷ String → [[ST.ShortText]] → [CharInfo] → [[ST.ShortText]] → [CharInfo] → ([[ST.ShortText]], [CharInfo]) +checkAndRecodeCharacterAlphabets fileName inData inCharInfo newData newCharInfo + | null inCharInfo && null newCharInfo = error "Empty inCharInfo on input in checkAndRecodeCharacterAlphabets" + | null inData = error "Empty inData in checkAndRecodeCharacterAlphabets" + | null inCharInfo = + -- trace (show $ L.transpose newData) + -- (reverse $ fmap reverse newData, reverse newCharInfo) + (L.transpose $ reverse newData, reverse newCharInfo) + | otherwise = + let firstColumn = fmap head inData + firstInfo = head inCharInfo + originalAlphabet = alphabet firstInfo + thisName = name firstInfo + (firstAlphabet, newWeight, newColumn) = getAlphabetFromSTList fileName firstColumn firstInfo + updatedCharInfo = + if (Matrix == charType firstInfo) && (firstAlphabet /= originalAlphabet) + then firstInfo{alphabet = reconcileAlphabetAndCostMatrix fileName (T.unpack thisName) firstAlphabet originalAlphabet} + else firstInfo{alphabet = firstAlphabet, weight = newWeight} + in -- checkAndRecodeCharacterAlphabets fileName (fmap tail inData) (tail inCharInfo) (prependColumn newColumn newData []) (updatedCharInfo : newCharInfo) + checkAndRecodeCharacterAlphabets + fileName + (fmap tail inData) + (tail inCharInfo) + (newColumn : newData) + (updatedCharInfo : newCharInfo) + + +{- | getAlphabetFromSTList take a list of ST.ShortText and returns list of unique alphabet elements, +recodes decimat AB.XYZ to ABXYZ and reweights by that factor 1/1000 for .XYZ 1/10 for .X etc +checks if char is additive for numerical alphabet +-} +getAlphabetFromSTList ∷ String → [ST.ShortText] → CharInfo → (Alphabet ST.ShortText, Double, [ST.ShortText]) +getAlphabetFromSTList fileName inStates inCharInfo = + if null inStates + then error "Empty column data in getAlphabetFromSTList" + else + let thisType = charType inCharInfo + thisWeight = weight inCharInfo + mostDecimals = + if thisType == Add + then maximum $ fmap getDecimals inStates + else 0 + (thisAlphabet', newColumn) = getAlphWithAmbiguity fileName inStates thisType mostDecimals [] [] + + newWeight + | mostDecimals > 0 = thisWeight / (10.0 ** fromIntegral mostDecimals) + | otherwise = thisWeight + + thisAlphabet = case thisAlphabet' of + [] → ST.fromString "0" :| [] + s : ss → s :| ss + in -- trace (show (thisAlphabet, newWeight, newColumn, mostDecimals)) + -- (fromSymbols thisAlphabet, newWeight, newColumn) + -- trace ("getAlph weight: " <> (show thisWeight)) + (fromSymbols thisAlphabet, newWeight, newColumn) + + +-- | getDecimals tkase a state ShortText and return number decimals--if ambiguous then the most of range +getDecimals ∷ ST.ShortText → Int +getDecimals inChar = + if ST.null inChar + then 0 + else + let inCharText = ST.toString inChar + in -- trace (inCharText) ( + if '.' `notElem` inCharText + then 0 + else + if head inCharText /= '[' + then -- no ambiguity + length (dropWhile (/= '.') inCharText) - 1 + else -- has ambiguity (range) + + let rangeStringList = words $ filter (`notElem` ['[', ']']) inCharText + fstChar = + if '.' `elem` head rangeStringList + then dropWhile (/= '.') (head rangeStringList) + else ['.'] + sndChar = + if '.' `elem` last rangeStringList + then dropWhile (/= '.') (last rangeStringList) + else ['.'] + in -- trace (fstChar <> " " <> sndChar) + max (length fstChar) (length sndChar) - 1 + + +-- ) + +{- | getAlphWithAmbiguity take a list of ShortText with information and accumulatiors +For both nonadditive and additive. Searches for [] to denote ambiguity and splits states + if splits on spaces if there are spaces (within []) (ala fastc or multicharacter states) + else if no spaces + if non-additive then each symbol is split out as an alphabet element -- as in TNT + if is additive splits on '-' to denote range +rescales (integerizes later) additive characters with decimal places to an integer type rep +for additive characters if states are not nummerical then throws an error +-} +getAlphWithAmbiguity + ∷ String → [ST.ShortText] → CharType → Int → [ST.ShortText] → [ST.ShortText] → ([ST.ShortText], [ST.ShortText]) +getAlphWithAmbiguity fileName inStates thisType mostDecimals newAlph newStates = + if null inStates + then (L.sort $ L.nub newAlph, reverse newStates) + else + let firstState = ST.toString $ head inStates + in if thisType /= Add + then + if head firstState /= '[' + then + if firstState `elem` ["?", "-"] + then getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals newAlph (head inStates : newStates) + else getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (head inStates : newAlph) (head inStates : newStates) + else -- ambiguity + -- let newAmbigStates = fmap ST.fromString $ words $ filter (`notElem` ['[',']']) firstState + + let newAmbigStates = fmap ST.fromString $ fmap (: []) $ filter (`notElem` ['[', ']']) firstState + in getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (newAmbigStates <> newAlph) (head inStates : newStates) + else -- additive character + + let scaleFactor = 1.0 / (10.0 ** fromIntegral mostDecimals) + in if head firstState /= '[' + then + if mostDecimals == 0 + then getAlphWithAmbiguity fileName (tail inStates) thisType mostDecimals (head inStates : newAlph) (head inStates : newStates) + else + let stateNumber = readMaybe firstState ∷ Maybe Double + newStateNumber = takeWhile (/= '.') $ show (fromJust stateNumber / scaleFactor) + in if isNothing stateNumber + then + if firstState `elem` ["?", "-"] + then + getAlphWithAmbiguity + fileName + (tail inStates) + thisType + mostDecimals + (ST.fromString "-1" : newAlph) + (ST.fromString "-1" : newStates) + else + errorWithoutStackTrace + ("\n\nTNT file " <> fileName <> " ccode processing error: Additive character not a number (Int/Float) " <> firstState) + else + getAlphWithAmbiguity + fileName + (tail inStates) + thisType + mostDecimals + (ST.fromString newStateNumber : newAlph) + (ST.fromString newStateNumber : newStates) + else -- trace ("GAlphAmb: " <> (show firstState)) $ + + let hasDecimalorDash = (elem '.' firstState) || (elem '-' firstState) + gutsList = + if hasDecimalorDash + then fmap (: []) $ filter (`notElem` ['[', ']', '.', '-']) firstState + else (: []) <$> filter (`notElem` ['[', ']']) firstState + newStateNumberList = fmap readMaybe gutsList ∷ [Maybe Double] + newStateNumberStringList = fmap (((takeWhile (`notElem` ['.', '-']) . show) . (/ scaleFactor)) . fromJust) newStateNumberList + in if Nothing `elem` newStateNumberList + then + errorWithoutStackTrace + ("\n\nTNT file " <> fileName <> " ccode processing error: Additive character not a number (Int/Float) " <> firstState) + else + let newAmbigState = + if hasDecimalorDash + then ST.filter (/= ' ') $ ST.fromString $ '[' : unwords newStateNumberStringList <> "]" + else ST.fromString $ '[' : concat newStateNumberStringList <> "]" + in getAlphWithAmbiguity + fileName + (tail inStates) + thisType + mostDecimals + (fmap ST.fromString newStateNumberStringList <> newAlph) + (newAmbigState : newStates) + +-- ) diff --git a/src/Reconciliation/Adams.hs b/src/Reconciliation/Adams.hs new file mode 100644 index 000000000..1ed4a7611 --- /dev/null +++ b/src/Reconciliation/Adams.hs @@ -0,0 +1,648 @@ +{- | +Functions to create Adams II consensus trees (unlabelled internal veritices) +-} +module Reconciliation.Adams (makeAdamsII) where + +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Graph.Inductive.Graph qualified as G +import Data.Graph.Inductive.PatriciaTree qualified as P +import Data.List qualified as L +import Data.Maybe +import Data.Set qualified as Set +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import GraphFormatUtilities qualified as PhyP +import PHANE.Evaluation +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types + + +-- import ParallelUtilities qualified as PU +-- import Debug.Trace + +data VertexType = Root | Internal | Leaf | Network | Tree + deriving stock (Eq, Ord, Read, Show) -- NFData ? + + +type Vertex = (String, [Int], [Int], VertexType) -- name, child vertices, parent vertices, type (a bit redundant) + + +type Edge = (Int, Int, Maybe Double) -- terminal vertices (by numbers) and potential length + + +type PhyloGraphVect = (V.Vector Vertex, V.Vector Edge) + + +type GenPhyNetNode = (String, [String], [String]) -- Make list for both so unresolved network (node name, [descendant name], [ancestor name]) + + +type GenPhyNet = [GenPhyNetNode] + + +-- | null PhyloGraphVect +nullGraphVect ∷ PhyloGraphVect +nullGraphVect = (V.empty, V.empty) + + +-- | getAdamsIIPair inputs 2 PhyloGraphVects and returns AdamsII consensus +getAdamsIIPair ∷ PhyloGraphVect → PhyloGraphVect → PhyloGraphVect +getAdamsIIPair inGraphVectA inGraphVectB = + let inGraphVectList = [inGraphVectA, inGraphVectB] + (sameLeafSet, leafSets) = getAndCheckLeafSets inGraphVectList + curVertexSets = map fst inGraphVectList + rootPairList = map (findRoot 0) inGraphVectList + -- rootIndexList = map fst rootPairList + rootVertexList = map snd rootPairList + rootSplits = map getSecond rootVertexList + rootSplitLeafListList = getSplitLeafListList rootSplits inGraphVectList + rootLUBPre = leastUpperBound rootSplitLeafListList + rootLUB = [L.sort x | x ← rootLUBPre, not (null x)] -- need map sort $ + + -- create nodes based on LUBs + rootNode ∷ (String, [String], [a]) + rootNode = ("root", map lub2TreeRep rootLUB, []) + leavesPlaced = concat [x | x ← rootLUB, length x < 3] + vertexLeafSetList = map (map getLeafSetFromNodeName . V.toList) curVertexSets + potentialVertexSets = map (map getSecond . V.toList) curVertexSets + in if not sameLeafSet + then errorWithoutStackTrace ("Leaf sets of input graphs do not match" <> show leafSets) + else -- return processed when have all nodes + + let allAdamsNodes = makeAdamsNodes [rootNode] "root" rootLUB leavesPlaced (zip potentialVertexSets vertexLeafSetList) -- curVertexSets vertexLeafSetList + in genPhyNet2PhyloGraphVect allAdamsNodes + + +-- | mkGraphPair take LNode list lEdge List pair and reutns fgl graph +mkGraphPair ∷ ([G.LNode a], [G.LEdge b]) → P.Gr a b +mkGraphPair (nodeList, edgeList) = G.mkGraph nodeList edgeList + + +{- | makeAdamsII takes a list of fgl graphs, convertes them to PhyloGraphVect +makes the Adamns consensus and then converts back to fgl for return to EUN code +-} +makeAdamsII ∷ [G.LNode String] → [P.Gr String Double] → PhyG (P.Gr String Double) +makeAdamsII leafNodeList inFGList + | null leafNodeList = error "Null leaf node list in makeAdamsII" + | null inFGList = pure G.empty + | otherwise = + let inGraphNodes = fmap G.labNodes inFGList + inGraphEdges = fmap G.labEdges inFGList + inGraphNonLeafNodes = fmap (drop $ length leafNodeList) inGraphNodes + newNodeListList = fmap (leafNodeList <>) inGraphNonLeafNodes + inFGList' = mkGraphPair <$> zip newNodeListList inGraphEdges + + -- parallel + action ∷ (Ord a) ⇒ P.Gr a b → Bool + action = isTree + in do + actionPar ← getParallelChunkMap + let allTreesList = actionPar action inFGList' + -- let allTreesList = PU.seqParMap PU.myStrategyRDS isTree inFGList' -- `using` PU.myParListChunkRDS + let allTrees = L.foldl1' (&&) allTreesList + + if not allTrees + then errorWithoutStackTrace ("Input graphs are not all trees in makeAdamsII: " <> show allTreesList) + else + if not (leafSetConstant [] inFGList) + then errorWithoutStackTrace "Input leaf sets not constant in makeAdamsII" + else + let inPGVList = fmap fgl2PGV inFGList' -- paralle problem with NFData seqParMap myStrategy fgl2PGV inFGList + adamsPGV = L.foldl1' getAdamsIIPair inPGVList + in -- trace ("\nAdams: " <> show adamsPGV) + pure $ pgv2FGL adamsPGV + + +{- | fgl2PGVEdge takes an fgl Labeled edge (e,u,label) +and returns a PGV edge with no brnach length (e,u,Nothing) +-} +fgl2PGVEdge ∷ G.LEdge b → Edge +fgl2PGVEdge (e, u, _) = (e, u, Nothing) + + +{- | fgl2PGVNode takes a tripple of an fgl labelled node (Int, a), its +child verteces and parent versitices and returns the PGV Vertex including its type +-} +fgl2PGVNode ∷ (Show b) ⇒ P.Gr T.Text b → (G.LNode String, [Int], [Int]) → Vertex +fgl2PGVNode inGraph ((index, inLabel), childList, parentList) = + -- if null label then error "Null node label in fgl2PGVNode" + -- else + let guts = T.init $ PhyP.component2Newick inGraph False False (index, T.pack inLabel) + label = if PhyP.checkIfLeaf guts then T.unpack $ T.tail $ T.init guts else T.unpack guts + in if null parentList + then (label, childList, parentList, Root) + else + if null childList + then (label, childList, parentList, Leaf) + else + if length parentList == 1 + then (label, childList, parentList, Reconciliation.Adams.Tree) + else + if length parentList > 1 + then (label, childList, parentList, Network) + else error "This can't happen in fgl2PGVNode" + + +{- | fgl2PGV takes an fgl (functional graph) and convertes to PhyloGraphVect +to use local (and old) Adams consensus functions +retuns "Nothing" for edge labels ( no need for branch lengths) +-} +fgl2PGV ∷ P.Gr String Double → PhyloGraphVect +fgl2PGV inGraph = + if G.isEmpty inGraph + then nullGraphVect + else + let fglNodeList = G.labNodes inGraph + fglNodeParentList = fmap (G.pre inGraph . fst) fglNodeList + fglNodeChildList = fmap (G.suc inGraph . fst) fglNodeList + fglNodeInfoList = zip3 fglNodeList fglNodeChildList fglNodeParentList + fglEdgeList = G.labEdges inGraph + pgvNodeList = fmap (fgl2PGVNode (PhyP.stringGraph2TextGraph inGraph)) fglNodeInfoList + pgvEdgeList = fmap fgl2PGVEdge fglEdgeList + in (V.fromList pgvNodeList, V.fromList pgvEdgeList) + + +-- | vertex2Node take alist of vertices and returns a list of fgl Labelled nodes +vertex2Node ∷ Int → V.Vector Vertex → [G.LNode String] +vertex2Node counter inVertexVect = + if V.null inVertexVect + then [] + else + let (label, _, _, _) = V.head inVertexVect + in (counter, label) : vertex2Node (counter + 1) (V.tail inVertexVect) + + +{- | edge2FGLEdge take vertex of Int Int Maybe Double and returns +fgl node with type Double +-} +edge2FGLEdge ∷ (Int, Int, Maybe Double) → (Int, Int, Double) +edge2FGLEdge (e, u, _) = (e, u, 1.0 ∷ Double) + + +-- | pgv2FGL take a PhyloGraphVect and converts to an fgl graph +pgv2FGL ∷ PhyloGraphVect → P.Gr String Double +pgv2FGL (inVertexVect, inEdgeVect) = + let fglNodes = vertex2Node 0 inVertexVect + fglEdges = V.map edge2FGLEdge inEdgeVect + in G.mkGraph fglNodes (V.toList fglEdges) + + +-- | getLeafList returns sorted leaf complement of graph fgl +getLeafLabelListFGL ∷ (Ord a) ⇒ P.Gr a b → [a] +getLeafLabelListFGL inGraph = + if G.isEmpty inGraph + then error "Empty graph in getLeafLabelListFGL" + else + let degOutList = G.outdeg inGraph <$> G.nodes inGraph + newNodePair = zip degOutList (G.labNodes inGraph) + leafPairList = filter ((== 0) . fst) newNodePair + (_, leafList) = unzip leafPairList + in L.sort $ snd <$> leafList + + +{- | leafSetConstant takes a series of fgl graphs and checks if leaf sets are the same for +all of them +-} +leafSetConstant ∷ (Ord a) ⇒ [a] → [P.Gr a b] → Bool +leafSetConstant leafList inFGLList + | null inFGLList = True + | null leafList = + -- first graph + let firstGraph = head inFGLList + firstLeaves = getLeafLabelListFGL firstGraph + in leafSetConstant firstLeaves (tail inFGLList) + | otherwise = + let thisGraph = head inFGLList + theseLeaves = getLeafLabelListFGL thisGraph + in theseLeaves == leafList && leafSetConstant leafList (tail inFGLList) + + +{- | isTree takes fgl graph and checks is conected, no self edges, single root (includes connected), no indegree +> 1 nodes, leaf labels appear only once +-} +isTree ∷ (Ord a) ⇒ P.Gr a b → Bool +isTree inGraph = + not (G.isEmpty inGraph) + && ( let nodeIndegrees = G.indeg inGraph <$> G.nodes inGraph + maxIndegree = maximum nodeIndegrees + rootNodes = filter (== 0) nodeIndegrees + leafLabels = getLeafLabelListFGL inGraph + uniqueLeafLabels = L.nub leafLabels + eList = fst <$> G.edges inGraph + uList = snd <$> G.edges inGraph + selfList = filter id $ zipWith (==) eList uList + in not ((((length rootNodes /= 1) || (maxIndegree > 1)) || (length leafLabels /= length uniqueLeafLabels)) || not (null selfList)) + ) + + +{- | getRootNamesFromGenPhyNet extracts non-leaf-non-root +names from vertices in order found +-} +getRootNamesFromGenPhyNet ∷ GenPhyNet → [String] +getRootNamesFromGenPhyNet inNet = + if null inNet + then [] + else + let (name, desc, anc) = head inNet + vertType = getVertType (length desc) (length anc) + in if vertType == Root + then name : getRootNamesFromGenPhyNet (tail inNet) + else getRootNamesFromGenPhyNet (tail inNet) + + +{- | getNonLeafNonRootNamesFromGenPhyNet extracts non-leaf-non-root +names from vertices in order found +-} +getNonLeafNonRootNamesFromGenPhyNet ∷ GenPhyNet → [String] +getNonLeafNonRootNamesFromGenPhyNet inNet = + if null inNet + then [] + else + let (name, desc, anc) = head inNet + vertType = getVertType (length desc) (length anc) + in if (vertType /= Leaf) && (vertType /= Root) + then name : getNonLeafNonRootNamesFromGenPhyNet (tail inNet) + else getNonLeafNonRootNamesFromGenPhyNet (tail inNet) + + +-- | getLeafNamesFromGenPhyNet extracts leaf names from vertices in order found +getLeafNamesFromGenPhyNet ∷ GenPhyNet → [String] +getLeafNamesFromGenPhyNet inNet = + if null inNet + then [] + else + let (name, desc, anc) = head inNet + in if getVertType (length desc) (length anc) == Leaf + then name : getLeafNamesFromGenPhyNet (tail inNet) + else getLeafNamesFromGenPhyNet (tail inNet) + + +-- | getVertType takes list of desc and anc to determine type of vertex +getVertType ∷ Int → Int → VertexType +getVertType nDesc nAnc + | nDesc == 0 && nAnc == 0 = error "Isolated node" + | nAnc == 0 = Root + | nDesc == 0 = Leaf + | nAnc == 1 = Reconciliation.Adams.Tree + | nAnc > 2 = Network + | otherwise = error ("Screwey node: indegree " <> show nDesc <> " outdegree " <> show nAnc) + + +{- | getVertNum takes a list of vertex names and teh complete list and +returns a list of the indices (integers) of the names +-} +getVertNum ∷ [String] → [String] → [Int] +getVertNum nameList vertexNameList = + if null vertexNameList + then [] + else + let firstVertName = head vertexNameList + vertNum = L.elemIndex firstVertName nameList + in if isNothing vertNum + then error ("Error in vertex name index: " <> show firstVertName <> " in " <> show nameList) + else fromJust vertNum : getVertNum nameList (tail vertexNameList) + + +-- | oldIndex2New takes a PhyloGraphVect and creates a list of reorder based ordered name list +oldIndex2New ∷ V.Vector Vertex → [String] → [(Int, Vertex)] +oldIndex2New inVertexVect nameList = + if V.null inVertexVect + then [] + else + let curVert = V.head inVertexVect + (vertName, _, _, _) = curVert + vertNum = L.elemIndex vertName nameList + in (fromJust vertNum, curVert) : oldIndex2New (V.tail inVertexVect) nameList + + +{- | genForestToPhyloGraph converts GenForest to PhyloGraph (so can use legacy +ENewick etc parsers +takes flattened vector of GenPhyNetNodes and builds vertices (leaf, internal, +and root) and edges. Vertices and edges are added to input null +PhyloGraphVect +EDGES SEEM TO BE INCORRECT IN PLACES +-} +genForestToPhyloGraphVect ∷ V.Vector GenPhyNetNode → PhyloGraphVect → [String] → PhyloGraphVect +genForestToPhyloGraphVect inGen inPhyVect nameList = + if V.null inGen + then inPhyVect + else + let (inVertexName, inVertexDescNameList, inVertexAncNameList) = V.head inGen + (curVertVect, curEdgeVect) = inPhyVect + descNumList = getVertNum nameList inVertexDescNameList + ancNumList = getVertNum nameList inVertexAncNameList + vertType = getVertType (length descNumList) (length ancNumList) + newEdgeVect ∷ V.Vector (Int, Int, Maybe a) + newEdgeVect = + V.zip3 + (V.fromList ancNumList) + ( V.replicate + (length ancNumList) + (head $ getVertNum nameList [inVertexName]) + ) + (V.replicate (length ancNumList) Nothing) -- edge from anc to current, no weight info + in genForestToPhyloGraphVect + (V.tail inGen) + ( V.snoc curVertVect (inVertexName, descNumList, ancNumList, vertType) + , curEdgeVect <> newEdgeVect + ) + nameList + + +{- | getNamesFromGenPhyNet extracts names from vertices in order found +leaves are first, then internal, root last +-} +getNamesFromGenPhyNet ∷ GenPhyNet → [String] +getNamesFromGenPhyNet inNet = + L.sort (getLeafNamesFromGenPhyNet inNet) + <> getNonLeafNonRootNamesFromGenPhyNet inNet + <> getRootNamesFromGenPhyNet inNet + + +{- | getShortestList takes list and length and list of lists and return +shortest list +-} +getShortestList ∷ ([a], [b]) → Int → [([a], [b])] → ([a], [b]) +getShortestList bestList lengthBestList inListList = + if null inListList + then bestList + else + let curList = head inListList + lengthCurList = length $ fst curList + in if lengthCurList < lengthBestList + then getShortestList curList lengthCurList (tail inListList) + else getShortestList bestList lengthBestList (tail inListList) + + +{- | getSplitList take an LUB, list of placed taxa, and vector of tree vertices +and returns a list of splits for each input tree (from tree vertices) +also filters out placed taxa +CLEANUP--many more ioperations than needed--should be passed as better +structure +-} +getSplitList ∷ [String] → [String] → ([[Int]], [[String]]) → [[String]] +getSplitList curLUB placedTaxa (potentialVerts, vertLeafSet) = + if null curLUB + then error "Null LUB in getSplitList" + else + let vertList = [(x, y) | (x, y) ← zip vertLeafSet potentialVerts, L.intersect x curLUB == curLUB] + smallestVert = snd $ getShortestList (head vertList) (length $ fst $ head vertList) (tail vertList) + vectVertLeafSet = V.fromList vertLeafSet -- adds factor of "n", could pass another variable? + rawLUBs = map (vectVertLeafSet V.!) smallestVert + newLUBs = map (L.\\ placedTaxa) rawLUBs + in newLUBs + + +-- | replaceChar take set of charcters to be replaced by a char in a String +replaceChar ∷ String → Char → Char → Char +replaceChar inSet2Replace replaceChar2 inChar = + if inChar `elem` inSet2Replace + then replaceChar2 + else inChar + + +{- | getVertsFromIndexList takes a list of vertex vector indices and returns a list of +vertices +-} +getVertsFromIndexList ∷ [Int] → PhyloGraphVect → [Vertex] +getVertsFromIndexList indexList inGraphVect = + if null indexList + then [] + else + let (vertVect, _) = inGraphVect + in (vertVect V.! head indexList) : getVertsFromIndexList (tail indexList) inGraphVect + + +{- | ggenPhyNet2PhyloGraphVect takes as input GenPhyNet and return +PhyloGraphVect with root as last node +-} +genPhyNet2PhyloGraphVect ∷ GenPhyNet → PhyloGraphVect +genPhyNet2PhyloGraphVect inGenPhyNet = + if null inGenPhyNet + then error "Null GenPhyNet in genPhyNet2PhyloGraphVect" + else + let nameList = getNamesFromGenPhyNet inGenPhyNet + (vertVect, edgeVect) = genForestToPhyloGraphVect (V.fromList inGenPhyNet) nullGraphVect nameList + newVertVect = vertVect V.// oldIndex2New vertVect nameList + in (newVertVect, edgeVect) + + +{- | makeAdamsNodes takes root Adams node, rootLUB, vertex sets of input +trees and placed leaf set and constructs each Adams node in turn. +-} +makeAdamsNodes ∷ GenPhyNet → String → [[String]] → [String] → [([[Int]], [[String]])] → GenPhyNet +makeAdamsNodes inAdamsTree parentName inLUBList placedTaxa bothLeafLists = + -- inTreeVertexLists vertexLeafSetList = + if null inLUBList + then inAdamsTree + else + let curLUB = head inLUBList + in if length curLUB == 1 -- make nodes since done + then + let newNode ∷ (String, [a], [String]) + newNode = (head curLUB, [], [parentName]) + in makeAdamsNodes + (newNode : inAdamsTree) + parentName + (tail inLUBList) + (head curLUB : placedTaxa) + bothLeafLists -- inTreeVertexLists vertexLeafSetList + else + if length curLUB == 2 + then + let leftChild = lub2TreeRep [head curLUB] + rightChild = lub2TreeRep [last curLUB] + newNode1, newNode2, newNode3 ∷ (String, [String], [String]) + newNode1 = (lub2TreeRep curLUB, [leftChild, rightChild], [parentName]) + newNode2 = (leftChild, [], [lub2TreeRep curLUB]) + newNode3 = (rightChild, [], [lub2TreeRep curLUB]) + newGenPhyNet = newNode2 : (newNode3 : (newNode1 : inAdamsTree)) + newPlacedTaxa = lub2TreeRep curLUB : (leftChild : (rightChild : placedTaxa)) + in makeAdamsNodes + newGenPhyNet + parentName + (tail inLUBList) + newPlacedTaxa + bothLeafLists -- inTreeVertexLists vertexLeafSetList + else -- core case with LUB creation and taxon placementg + + let splitListList = map (getSplitList curLUB placedTaxa) bothLeafLists -- (zip inTreeVertexLists vertexLeafSetList) + newLUBpre = leastUpperBound splitListList + newLUB = [L.sort x | x ← newLUBpre, not (null x)] -- had "map sort $" was this "sort" necessary? for List intersection? + newNode = (lub2TreeRep curLUB, map lub2TreeRep newLUB, [parentName]) + in -- trace ("New LUBs " <> show newLUB <> " newNode " <> show newNode) + makeAdamsNodes + (newNode : inAdamsTree) + (lub2TreeRep curLUB) + (tail inLUBList) + placedTaxa + bothLeafLists + <> makeAdamsNodes [] (lub2TreeRep curLUB) newLUB placedTaxa bothLeafLists -- inTreeVertexLists vertexLeafSetList) <> + -- inTreeVertexLists vertexLeafSetList) + + +{- | getLeafSetFromNodeName takes String name of node and returns sorted list of leaf +names--ASSUMES node names are not given in input and are assigned as trees +are parsed +-} +getLeafSetFromNodeName ∷ Vertex → [String] +getLeafSetFromNodeName inVertex = + let (nodeName, _, _, _) = inVertex + in if null nodeName + then error "Null node name in getLeafSetFromNodeName" + else + let rawList = map (replaceChar ['(', ')', ','] ' ') nodeName + in L.sort $ words rawList -- this sort required + + +{- | lub2TreeRep takes list of names and makes into unresolved subtree +in parens +-} +lub2TreeRep ∷ [String] → String +lub2TreeRep inStringList + | null inStringList = error "Null input in lub2TreeRep" + | length inStringList == 1 = head inStringList + | otherwise = + let inside = init $ concatMap (<> ",") inStringList + in ('(' : inside) <> ")" + + +{- | getDecendantLeafList iputs a vertex and returns leaf set (as list of +leaf names as strings) descdended from +that vertex, if a leaf, returns that leaf +-} +getDecendantLeafList ∷ [Vertex] → PhyloGraphVect → [String] +getDecendantLeafList inVertexList inGraphVect = + if null inVertexList + then [] + else + let (curVertName, descList, _, vertType) = head inVertexList + descVertList = getVertsFromIndexList descList inGraphVect + in if vertType == Leaf + then curVertName : getDecendantLeafList (tail inVertexList) inGraphVect + else + getDecendantLeafList [head descVertList] inGraphVect + <> getDecendantLeafList (tail descVertList) inGraphVect + <> getDecendantLeafList (tail inVertexList) inGraphVect + + +-- | getSplitLeafList takes a node and returns a list of list of descendent leaves +getSplitLeafList ∷ [Int] → PhyloGraphVect → [[String]] +getSplitLeafList descList inGraphVect = + if null descList + then [] + else + let curDesc = head descList + (vertexVect, _) = inGraphVect + curLeaves = getDecendantLeafList [vertexVect V.! curDesc] inGraphVect + in curLeaves : getSplitLeafList (tail descList) inGraphVect + + +{- | getSplitLeafListList takes list of descenndents for PhyloGraphVect and +returns a list of descendant list for each split of each tree +-} +getSplitLeafListList ∷ [[Int]] → [PhyloGraphVect] → [[[String]]] +getSplitLeafListList descListList inGraphVectList + | null descListList = [] + | null inGraphVectList = error "Diff numbers of descdent lists and graphs" + | otherwise = + let curIntList = head descListList + curGraphVectList = head inGraphVectList + in getSplitLeafList curIntList curGraphVectList + : getSplitLeafListList (tail descListList) (tail inGraphVectList) + + +{- | lub2 takes two lists of lists of names and generates the pairswise set of +intersections +-} +lub2 ∷ [[String]] → [[String]] → [[String]] +lub2 s1 s2 + | null s1 = [] + | null s2 = [] + | otherwise = + let intersectFirst = L.intersect (head s1) (head s2) : lub2 [head s1] (tail s2) + in intersectFirst <> lub2 (tail s1) s2 + + +{- | leastUpperBound takes list of list vertex leaf descendants (as Strings) +and returns LUB of Adams II (1972) consensus +-} +leastUpperBound ∷ [[[String]]] → [[String]] +leastUpperBound inVertexListList + | length inVertexListList < 2 = + error "Too few name lists in leastUpperBound" + | length inVertexListList == 2 = + let x = head inVertexListList + y = last inVertexListList + in lub2 x y + | otherwise = + let x = head inVertexListList + y = head $ tail inVertexListList + z = tail $ tail inVertexListList + t = lub2 x y + in leastUpperBound (t : z) + + +-- | get second retriueves 2nd element of 4 +getSecond ∷ (a, b, c, d) → b +getSecond inTuple = + let (_, b2, _, _) = inTuple + in b2 + + +{- | leafSetFromVertexVect takes vector of veritces and returns set of leaf +names +-} +leafSetFromVertexVect ∷ Set.Set String → V.Vector Vertex → Set.Set String +leafSetFromVertexVect inSet inVerts = + if V.null inVerts + then inSet + else + let (curName, _, _, curType) = V.head inVerts + in if curType == Leaf + then leafSetFromVertexVect (Set.insert curName inSet) (V.tail inVerts) + else leafSetFromVertexVect inSet (V.tail inVerts) + + +{- | getLeafSet tgake a list pf PhyloGraphVect and returns a pair with +True if the leaf sets are identical, and a list of the sets +-} +getLeafSet ∷ PhyloGraphVect → Set.Set String +getLeafSet inGraphVect = + let (inVerts, _) = inGraphVect + in leafSetFromVertexVect Set.empty inVerts + + +{- | setEqual checks for set equality by difference between union and +intersection is empty +-} +setEqual ∷ (Ord a) ⇒ Set.Set a → Set.Set a → Bool +setEqual firstSet secondSet = + let combinedElem = Set.union firstSet secondSet + sameElem = Set.intersection firstSet secondSet + in Set.empty == Set.difference combinedElem sameElem + + +-- | getAndCheckLeafSets take graphs and checks that leaf sets are identical +getAndCheckLeafSets ∷ [PhyloGraphVect] → (Bool, [Set.Set String]) +getAndCheckLeafSets inGraphs = + if null inGraphs + then error "Empty graph list in getAndCheckLeafSets" + else + let leafSetList = map getLeafSet inGraphs + firstSet = head leafSetList + setDiffList = map (setEqual firstSet) (tail leafSetList) + allEmpty = and setDiffList + in (allEmpty, leafSetList) + + +-- | findRoot take PhyloGraphVect and return root index and Vertex +findRoot ∷ Int → PhyloGraphVect → (Int, Vertex) +findRoot index inGraph = + let (vertexVect, _) = inGraph + in if index < V.length vertexVect + then + let (_, _, _, vertexType) = vertexVect V.! index + in if vertexType == Root + then (index, vertexVect V.! index) + else findRoot (index + 1) inGraph + else error "Index exceeeds vertex number in findRoot" diff --git a/src/Reconciliation/Eun.hs b/src/Reconciliation/Eun.hs new file mode 100644 index 000000000..3187f6764 --- /dev/null +++ b/src/Reconciliation/Eun.hs @@ -0,0 +1,1043 @@ +{-# LANGUAGE ScopedTypeVariables #-} + +{- | +Module : Eun.hs +Description : Module to calculate various graph reconciliation methods Wheeler (2021) + input graphviz dot files and newick +Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Reconciliation.Eun ( + reconcile, + makeProcessedGraph, + addGraphLabels, +) where + +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Parallel.Strategies +import Data.BitVector qualified as BV +import Data.Bits qualified as B +import Data.Graph.Inductive.Graph qualified as G +import Data.Graph.Inductive.PatriciaTree qualified as P +import Data.Graph.Inductive.Query.BFS qualified as BFS +import Data.GraphViz as GV +import Data.GraphViz.Printing +import Data.List qualified as L +import Data.Map.Strict qualified as Map +import Data.Maybe +import Data.Set qualified as S +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import GraphFormatUtilities qualified as PhyP +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Reconciliation.Adams qualified as A +import Types.Types +import Utilities.LocalGraph qualified as LG + + +-- import Debug.Trace +-- import ParallelUtilities as PU + +{- +-- | turnOnOutZeroBit turns on the bit 'nleaves" signifying that +-- the node is outdegree 1 +-- this so outdegree one nodes and their child have differnet bit sets +turnOnOutZeroBit :: BV.BV -> Int -> BV.BV +turnOnOutZeroBit inBitVect nLeaves = BV.or [B.bit nLeaves, inBitVect] +-} + +{- +-- | turnOffOutZeroBit turns off the bit 'nleaves" signifying that +-- the node is outdegree /= 1 +-- this so outdegree one nodes and their child have differnet bit sets +turnOffOutZeroBit :: BV.BV -> Int -> BV.BV +turnOffOutZeroBit inBitVect nLeaves = BV.extract (nLeaves - 1) 0 inBitVect +-} + +{- | setOutDegreeOneBits assumes outdegree of vertex = 1, takes the number of leaves, bitvector +repersentation (rawBV) of vertex, a L.sorted list of outdegree=1 vertices and the vertex index +and creates a bitvector to prepend of length the number of outdgree=1 vertices where +the correpsonding vertex index position in ilist is 'On' and the remainder are 'Off' +this insures a unique lablelling for all outdegree=1 vertices +-} +setOutDegreeOneBits ∷ BV.BV → [Int] → Int → BV.BV +setOutDegreeOneBits inBitVect out1VertexList vertexIndex = + if null out1VertexList + then error "Empty outdegree=1 vertex list in setOutDegreeOneBits" + else + let vertListIndex = L.elemIndex vertexIndex out1VertexList + vertexPosition = fromJust vertListIndex + boolList = replicate vertexPosition False <> [True] <> replicate (length out1VertexList - vertexPosition - 1) False + prependBitVect = BV.fromBits boolList + in if isNothing vertListIndex + then error ("Vertex " <> show vertexIndex <> " not found in list " <> show out1VertexList) + else BV.append prependBitVect inBitVect + + +{- | getRoot takes a graph and list of nodes and returns vertex with indegree 0 +so assumes a connected graph--with a single root--not a forest +this does noit include unconnected leaves +-} +getRoots ∷ P.Gr a b → [G.Node] → [Int] +getRoots inGraph nodeList = + if null nodeList + then [] + else + let firstNode = head nodeList + in if (G.indeg inGraph firstNode == 0) && (G.outdeg inGraph firstNode > 0) + then firstNode : getRoots inGraph (tail nodeList) + else getRoots inGraph (tail nodeList) + + +{- | getUnConnectedLeaves takes a graph and list of nodes and returns vertex with indegree 0 +and outdegree == 0 +-} +getUnConnectedLeaves ∷ P.Gr a b → [G.Node] → [Int] +getUnConnectedLeaves inGraph nodeList = + if null nodeList + then [] + else + let firstNode = head nodeList + in if (G.indeg inGraph firstNode == 0) && (G.outdeg inGraph firstNode == 0) + then firstNode : getUnConnectedLeaves inGraph (tail nodeList) + else getUnConnectedLeaves inGraph (tail nodeList) + + +{- | getUnconmnectedNOdes takes a graph and list of nodes and returns vertex with indegree 0 +and outdegeee 0 +-} +getUnConnectedNodes ∷ P.Gr String String → Int → [G.Node] → [G.LNode BV.BV] +getUnConnectedNodes inGraph nLeaves nodeList = + if null nodeList + then [] + else + let firstNode = head nodeList + newNode = (firstNode, B.bit firstNode) + in if G.deg inGraph firstNode == 0 + then newNode : getUnConnectedNodes inGraph nLeaves (tail nodeList) + else getUnConnectedNodes inGraph nLeaves (tail nodeList) + + +{- | makeNodeFromChildren gets bit vectors as union of children in a post order traversal from leaves +the prepending of a single 'On' bit if there is only once child (setOutDegreeOneBit) +is modified to allow for multiple outdegree 1 vertices as parent of single vertex +-} +makeNodeFromChildren ∷ P.Gr String String → Int → V.Vector (G.LNode BV.BV) → [Int] → Int → PhyG [G.LNode BV.BV] +makeNodeFromChildren inGraph nLeaves leafNodes out1VertexList myVertex = + if myVertex < nLeaves + then pure [leafNodes V.! myVertex] + else + let myChildren = G.suc inGraph myVertex + -- parallel + action ∷ Int → PhyG [G.LNode BV.BV] + action = makeNodeFromChildren inGraph nLeaves leafNodes out1VertexList + in do + myChildrenNodes ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` myChildren + -- PU.seqParMap PU.myStrategyRDS (makeNodeFromChildren inGraph nLeaves leafNodes out1VertexList) myChildren -- `using` PU.myParListChunkRDS + + let rawBV = BV.or $ fmap (snd . head) myChildrenNodes + let myBV = + if length myChildren /= 1 + then rawBV + else setOutDegreeOneBits rawBV out1VertexList myVertex + + pure $ (myVertex, myBV) : concat myChildrenNodes + + +{- | getNodesFromARoot follows nodes connected to a root. +can be fmapped over roots to hit all--should be ok if multiple hits on nodes +since all labeled by BV.BVs need to fuse them if multiple roots to make sure nodes are consistent +and only one per root--should be ok for multikple jhists of nodes since BVs are from childre +just wasted work. Should L.nub after to maeksure only unique (by BV) nodes in list at end +-} +getNodesFromARoot ∷ P.Gr String String → Int → [G.LNode BV.BV] → Int → PhyG [G.LNode BV.BV] +getNodesFromARoot inGraph nLeaves leafNodes rootVertex = + if G.isEmpty inGraph + then error "Input graph is empty in getLabelledNodes" + else + let rootChildVerts = G.suc inGraph rootVertex + + -- get outdree = 1 node list for creting prepended bit vectors + out1VertexList = L.sort $ filter ((== 1) . G.outdeg inGraph) $ G.nodes inGraph + + -- parallel + action ∷ Int → PhyG [G.LNode BV.BV] + action = makeNodeFromChildren inGraph nLeaves (V.fromList leafNodes) out1VertexList + in do + -- recurse to children since assume only leaves can be labbeled with BV.BVs + -- fmap becasue could be > 2 (as in at root) + rootChildNewNodes ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` rootChildVerts + -- rootChildNewNodes = PU.seqParMap PU.myStrategyRDS (makeNodeFromChildren inGraph nLeaves (V.fromList leafNodes) out1VertexList) rootChildVerts -- `using` PU.myParListChunkRDS + + -- check if outdegree = 1 + let rawBV = BV.or $ fmap (snd . head) rootChildNewNodes + let rootBV = + if length rootChildVerts /= 1 + then rawBV + else setOutDegreeOneBits rawBV out1VertexList rootVertex + + pure $ (rootVertex, rootBV) : concat rootChildNewNodes + + +{- | getLabelledNodes labels nodes with bit vectors union of subtree leaves via post order traversal +adds nodes to reDoneNodes as they are preocessed +reorder NOdes is n^2 should be figured out how to keep them in order more efficeintly +-} +getLabelledNodes ∷ P.Gr String String → Int → [G.LNode BV.BV] → PhyG [G.LNode BV.BV] +getLabelledNodes inGraph nLeaves leafNodes = + -- trace ("getLabbeled graph with " <> (show $ G.noNodes inGraph) <> " nodes in " <> (showGraph inGraph)) ( + if G.isEmpty inGraph + then error "Input graph is empty in getLabelledNodes" + else + let rootVertexList = getRoots inGraph (G.nodes inGraph) + in do + htuList' ← mapM (getNodesFromARoot inGraph nLeaves leafNodes) rootVertexList + let htuList = L.nub $ concat htuList' + + -- this for adding in missing data + let unConnectedNodeList = getUnConnectedNodes inGraph nLeaves (G.nodes inGraph) + + pure $ reorderLNodes (htuList <> unConnectedNodeList) 0 + + +-- | findLNode takes an index and looks for node with that as vertex and retuirns that node +findLNode ∷ Int → [G.LNode BV.BV] → G.LNode BV.BV +findLNode vertex lNodeList = + if null lNodeList + then error ("Node " <> show vertex <> " not found") + else + let (a, b) = head lNodeList + in if a == vertex + then (a, b) + else findLNode vertex (tail lNodeList) + + +{- | reorderLNodes takes a list of nodes and reorders and order based on node vertex number +n^2 ugh +-} +reorderLNodes ∷ [G.LNode BV.BV] → Int → [G.LNode BV.BV] +reorderLNodes inNodeList inIndex + | null inNodeList = [] + | inIndex == length inNodeList = [] + | otherwise = + let newNode = findLNode inIndex inNodeList + in newNode : reorderLNodes inNodeList (inIndex + 1) + + +-- ) + +-- | relabelEdgs creates (BV.BV, BV.BV) labnels for an edges +relabelEdge ∷ V.Vector (G.LNode BV.BV) → G.LEdge String → G.LEdge (BV.BV, BV.BV) +relabelEdge allNodesVect inLEdge = + let (e, u, _) = inLEdge + eNodeBV = snd (allNodesVect V.! e) + uNodeBV = snd (allNodesVect V.! u) + in (e, u, (eNodeBV, uNodeBV)) + + +{- | changeLabelEdge labels edges by descendent vertex label +assumes leaves first then vertices labeled in order +offset for numLeaves assumes only labeling non-leaves so smaller set +assumes that if an "urroot" has been added via `union` then it is last vertex +this for condition where root has been added to majority consensus tree +-} +changeLabelEdge ∷ Int → V.Vector Double → [G.LEdge b] → [G.LEdge Double] +changeLabelEdge numLeaves freqVect edgeList = + if null edgeList + then [] + else + let (e, u, _) = head edgeList + newLabel + | u < numLeaves = 1 + | (u - numLeaves) >= V.length freqVect = 1 + | otherwise = freqVect V.! (u - numLeaves) + in -- trace (show (e,u) <> " " <> (show (u - numLeaves)) <> " " <> show freqVect) -- <> " " <> show newLabel) + (e, u, newLabel) : changeLabelEdge numLeaves freqVect (tail edgeList) + + +{- | addEdgeFrequenciesToGraph takes a greaph and edge frequencies and relables edges +with node frequencies of discedendet node +-} +addEdgeFrequenciesToGraph ∷ P.Gr a b → Int → [Double] → P.Gr a Double +addEdgeFrequenciesToGraph inGraph numLeaves freqList = + let inNodes = G.labNodes inGraph + inEdges = G.labEdges inGraph + newEdges = changeLabelEdge numLeaves (V.fromList freqList) inEdges + in -- trace (show inEdges) + G.mkGraph inNodes newEdges + + +-- | getLeafNumber take Graph and gets nu,ber of leaves (outdegree = 0) +getLeafNumber ∷ P.Gr BV.BV (BV.BV, BV.BV) → Int +getLeafNumber inGraph = + let degOutList = G.outdeg inGraph <$> G.nodes inGraph + in length $ filter (== 0) degOutList + + +{- +-- | findStrLabel checks Attributes (list f Attribute) from Graphvz to extract the String label of node +-- returns Maybe Text +findStrLabel :: Attributes -> Maybe T.Text +findStrLabel = getFirst . foldMap getStrLabel + +-- | getStrLabel takes an Attribute and reurns Text if StrLabel found, mempty otherwise +getStrLabel :: Attribute -> First T.Text +getStrLabel (Label (StrLabel txt)) = First . Just $ txt +getStrLabel _ = mempty + +-- | getLeafString takes a pairs (node vertex number, graphViz Attributes) +-- and returns String name of leaf of Stringified nude number if unlabbeled +getLeafString :: (Int, Attributes) -> String +getLeafString (nodeIndex, nodeLabel) = + let maybeTextLabel = findStrLabel nodeLabel + in + maybe (show nodeIndex) T.unpack maybeTextLabel + +-- | getLeafList returns leaf complement of graph from DOT file +getLeafList :: P.Gr Attributes Attributes -> [G.LNode String] +getLeafList inGraph = + if G.isEmpty inGraph then [] + else + let degOutList = G.outdeg inGraph <$> G.nodes inGraph + newNodePair = zip degOutList (G.labNodes inGraph) + leafPairList = filter ((==0).fst ) newNodePair + (_, leafList) = unzip leafPairList + (nodeVerts, _) = unzip leafList + newLabels = fmap getLeafString leafList + leafList' = zip nodeVerts newLabels + in + leafList' +-} + +{- | getLeafListNewick returns leaf complement of graph from newick file +difference from above is in the leaf label type +-} +getLeafListNewick ∷ P.Gr a b → [G.LNode a] +getLeafListNewick inGraph = + if G.isEmpty inGraph + then [] + else + let degOutList = G.outdeg inGraph <$> G.nodes inGraph + newNodePair = zip degOutList (G.labNodes inGraph) + leafPairList = filter ((== 0) . fst) newNodePair + (_, leafList) = unzip leafPairList + (nodeVerts, _) = unzip leafList + -- only different line + newLabels = fmap snd leafList + leafList' = zip nodeVerts newLabels + in leafList' + + +{- +-- | checkNodesSequential takes a list of nodes and returns booolean +-- True if nodes are input with sequential numerical indices +-- False if not--screws up reindexing later which assumes they are successive +checkNodesSequential :: G.Node -> [G.Node] -> Bool +checkNodesSequential prevNode inNodeList + | null inNodeList = True + | (head inNodeList - prevNode) /= 1 = trace ("Index or indices missing between " <> (show $ head inNodeList) <> " and " <> (show prevNode)) False + | otherwise = checkNodesSequential (head inNodeList) (tail inNodeList) +-} + +-- | reAnnotateGraphs takes parsed graph input and reformats for EUN +reAnnotateGraphs ∷ P.Gr String String → PhyG (P.Gr BV.BV (BV.BV, BV.BV)) +reAnnotateGraphs inGraph = + -- trace ("Reannotating " <> (showGraph inGraph)) ( + if G.isEmpty inGraph + then error "Input graph is empty in reAnnotateGraphs" + else + let degOutList = G.outdeg inGraph <$> G.nodes inGraph + nLeaves = length $ filter (== 0) degOutList + leafVerts = [0 .. (nLeaves - 1)] + leafIntegers = fmap B.bit leafVerts + leafBitVects = leafIntegers -- fmap (BV.bitVec nLeaves) leafIntegers + leafNodes = Prelude.zip leafVerts leafBitVects + in do + allNodes ← getLabelledNodes inGraph nLeaves leafNodes + let allEdges = fmap (relabelEdge (V.fromList allNodes)) (G.labEdges inGraph) + -- assign HTU BV via postorder pass. + pure $ G.mkGraph allNodes allEdges + + +-- | checkBVs looks at BV.BV of node and retuns FALSE if found True if not +checkBVs ∷ BV.BV → [G.LNode BV.BV] → Bool +checkBVs inBV nodeList = + null nodeList + || ( let (_, bv) = head nodeList + in inBV /= bv && checkBVs inBV (tail nodeList) + ) + + +-- | checkBVs looks at BV.BV of node and retuns FALSE if found True if not +checkEdgeBVs ∷ (BV.BV, BV.BV) → [G.LEdge (BV.BV, BV.BV)] → Bool +checkEdgeBVs (inABV, inBBV) edgeList = + null edgeList + || ( let (_, _, (aBV, bBV)) = head edgeList + in not ((inABV == aBV) && (inBBV == bBV)) && checkEdgeBVs (inABV, inBBV) (tail edgeList) + ) + + +{- | addAndReIndexUniqueNodes takes an inital list of nodes and adds new nodes reindexed +check identity by BV.BV +index bigger than size becasue starting after number of leaves +-} +addAndReIndexUniqueNodes ∷ Int → [G.LNode BV.BV] → [G.LNode BV.BV] → [G.LNode BV.BV] +addAndReIndexUniqueNodes newIndex nodesToExamine uniqueReIndexedNodes = + if null nodesToExamine + then uniqueReIndexedNodes + else + let (_, inBV) = head nodesToExamine + isUnique = checkBVs inBV uniqueReIndexedNodes + in if isUnique + then + let newNode = (newIndex, inBV) + in addAndReIndexUniqueNodes (newIndex + 1) (tail nodesToExamine) (newNode : uniqueReIndexedNodes) + else addAndReIndexUniqueNodes newIndex (tail nodesToExamine) uniqueReIndexedNodes + + +-- | getNodeIndex takes a BV.BV and returns a node with the same BV.BV +getNodeIndex ∷ BV.BV → [G.LNode BV.BV] → Int +getNodeIndex inBV nodeList = + if null nodeList + then error ("Node with BV " <> show inBV <> " not found in getNodeIndex") + else + let (inIndex, bv) = head nodeList + in if bv == inBV + then inIndex + else getNodeIndex inBV (tail nodeList) + + +{- | addAndReIndexEdges takes list of indexed nodes and BV, a list of edges to examine and a list of edges to keep +checks for uniqueness of edges by BV.BVs on (e,u) and reindexes the edge nodes based on the node set with bit vectors +keep method either 'unique" or "all" to keep lists of unique or all edges +-} +addAndReIndexEdges ∷ String → [G.LNode BV.BV] → [G.LEdge (BV.BV, BV.BV)] → [G.LEdge (BV.BV, BV.BV)] → [G.LEdge (BV.BV, BV.BV)] +addAndReIndexEdges keepMethod indexedNodes edgesToExamine uniqueReIndexedEdges = + if null edgesToExamine + then uniqueReIndexedEdges + else + let (_, _, (eBV, uBV)) = head edgesToExamine + isUnique = checkEdgeBVs (eBV, uBV) uniqueReIndexedEdges + in if (keepMethod == "all") || isUnique + then -- Find nodes with BVs of edge + + let eNode = getNodeIndex eBV indexedNodes + uNode = getNodeIndex uBV indexedNodes + newEdge = (eNode, uNode, (eBV, uBV)) + in addAndReIndexEdges keepMethod indexedNodes (tail edgesToExamine) (newEdge : uniqueReIndexedEdges) + else addAndReIndexEdges keepMethod indexedNodes (tail edgesToExamine) uniqueReIndexedEdges + + +{- | testEdge nodeList fullEdgeList) counter +chnage to input graph and delete edge from graph as opposed to making new graphs each time. +should be much faster using P.delLEdge (since only one edge to delete) +-} +testEdge ∷ (Eq b) ⇒ P.Gr a b → G.LEdge b → [G.LEdge b] +testEdge fullGraph candidateEdge@(e, u, _) = + let newGraph = G.delLEdge candidateEdge fullGraph + bfsNodes = BFS.bfs e newGraph + foundU = L.find (== u) bfsNodes + in [candidateEdge | isNothing foundU] + + +{- | makeEUN take list of nodes and edges, deletes each edge (e,u) in turn makes graph, +checks for path between nodes e and u, if there is delete edge otherwise keep edge in list for new graph +-} +makeEUN ∷ (Eq b, NFData b) ⇒ [G.LNode a] → [G.LEdge b] → P.Gr a b → PhyG (P.Gr a b) +makeEUN nodeList fullEdgeList fullGraph = + let -- counterList = [0..(length fullEdgeList - 1)] + -- requiredEdges = concat $ fmap (testEdge nodeList fullEdgeList) counterList + -- parallel + -- action :: G.LEdge b -> [G.LEdge b] + action = testEdge fullGraph + in do + testPar ← getParallelChunkMap + let requiredEdges = testPar action fullEdgeList -- PU.seqParMap PU.myStrategyRDS (testEdge fullGraph) fullEdgeList -- `using` PU.myParListChunkRDS + let newGraph = G.mkGraph nodeList (concat requiredEdges) + pure newGraph + + +{- | getLeafLabelMatches tyakes the total list and looks for elements in the smaller local leaf set +retuns int index of the match or (-1) if not found so that leaf can be added in orginal order +-} +getLeafLabelMatches ∷ [G.LNode String] → G.LNode String → (Int, Int) +getLeafLabelMatches localLeafList totNode = + if null localLeafList + then (-1, fst totNode) + else + let (inIndex, leafString) = head localLeafList + in if snd totNode == leafString + then (inIndex, fst totNode) + else getLeafLabelMatches (tail localLeafList) totNode + + +-- | reIndexEdge takes an (Int, Int) map, labelled edge, and returns a new labelled edge with new e,u vertices +reIndexLEdge ∷ Map.Map Int Int → G.LEdge a → G.LEdge String +reIndexLEdge vertexMap inEdge = + if Map.null vertexMap + then error "Null vertex map" + else + let (e, u, _) = inEdge + newE = Map.lookup e vertexMap + newU = Map.lookup u vertexMap + in if isNothing newE + then error ("Error looking up vertex " <> show e <> " in " <> show (e, u)) + else + if isNothing newU + then error ("Error looking up vertex " <> show u <> " in " <> show (e, u)) + else (fromJust newE, fromJust newU, "") + + +{- | reIndexAndAddLeaves takes rawGraphs and total input leaf sets and reindexes node, and edges, and adds +in leaves (with out edges) so that later processing can get bit vectors correct and match from +graph to graph. +new node set in teh total leaf set form all graphs plus teh local HTUs renumbered up based on added leaves +the map contains leaf mappings based on label of leaf, the HTUs extend that map with stright integers. +edges are re-indexed based on that map +-} +reIndexAndAddLeavesEdges ∷ [G.LNode String] → ([G.LNode String], P.Gr a b) → PhyG (P.Gr String String) +reIndexAndAddLeavesEdges totallLeafSet (inputLeafList, inGraph) = + if G.isEmpty inGraph + then pure G.empty + else -- reindex nodes and edges and add in new nodes (total leaf set + local HTUs) + -- create a map between inputLeafSet and totalLeafSet which is the canonical enumeration + -- then add in local HTU nodes and for map as well + -- trace ("Original graph: " <> (showGraph inGraph)) ( + + let -- parallel + action ∷ G.LNode String → (Int, Int) + action = getLeafLabelMatches inputLeafList + in do + labelPar ← getParallelChunkMap + let correspondanceList = labelPar action totallLeafSet + -- PU.seqParMap PU.myStrategyRDS (getLeafLabelMatches inputLeafList) totallLeafSet -- `using` PU.myParListChunkRDS + let matchList = filter ((/= (-1)) . fst) correspondanceList + -- remove order dependancey + -- htuList = [(length inputLeafList)..(length inputLeafList + htuNumber - 1)] + let htuList = fmap fst (G.labNodes inGraph) L.\\ fmap fst inputLeafList + let htuNumber = length (G.labNodes inGraph) - length inputLeafList + let newHTUNumbers = [(length totallLeafSet) .. (length totallLeafSet + htuNumber - 1)] + let htuMatchList = zip htuList newHTUNumbers + let vertexMap = Map.fromList (matchList <> htuMatchList) + let reIndexedEdgeList = fmap (reIndexLEdge vertexMap) (G.labEdges inGraph) + + let newNodeNumbers = [0 .. (length totallLeafSet + htuNumber - 1)] + let attributeList = replicate (length totallLeafSet + htuNumber) "" -- origAttribute + let newNodeList = zip newNodeNumbers attributeList + + pure $ G.mkGraph newNodeList reIndexedEdgeList + + +-- | relabelNode takes nofde list and labels leaves with label and HTUs with String of HexCode of BV label +relabelNodes ∷ [G.LNode BV.BV] → [G.LNode String] → [G.LNode String] +relabelNodes inNodes leafLabelledNodes + | null inNodes = [] + | not $ null leafLabelledNodes = head leafLabelledNodes : relabelNodes (tail inNodes) (tail leafLabelledNodes) + | otherwise = + let (vertex, _) = head inNodes + in (vertex, "HTU" <> show vertex) : relabelNodes (tail inNodes) [] + + +-- | addGraphLabels take Graph and changes to add nodes labelled wiyth String, edges as well +addGraphLabels ∷ P.Gr BV.BV (BV.BV, BV.BV) → [G.LNode String] → P.Gr String String +addGraphLabels inGraph totallLeafSet + | G.isEmpty inGraph = error "Empty graph in addGraphLabels" + | null totallLeafSet = error "Empty leaf set in addGraphLabels" + | otherwise = + let newNodes = relabelNodes (G.labNodes inGraph) totallLeafSet + -- newNodes = totallLeafSet <> newHTUList + (eList, uList) = unzip (G.edges inGraph) + + newEdges = zip3 eList uList (replicate (length eList) "") + in -- trace ("Relabelled EUN : " <> (showGraph $ G.mkGraph newNodes newEdges) <> " from " <> (show totallLeafSet)) + G.mkGraph newNodes newEdges + + +{- | intermediateNodeExists takes two node bitvetors and the full bitvector list +and checks to see if there is an intermediate node between first two that would +remove need for the edge between the first two. +This should reduce time complexity of vertex-based reconciliation to O(n^3) from O(n^4) +-} +intermediateNodeExists ∷ BV.BV → BV.BV → [BV.BV] → Bool +intermediateNodeExists aBV cBV fullNodeBVList = + not (null fullNodeBVList) + && let bBV = head fullNodeBVList + leftIntersection = BV.and [aBV, bBV] + rightIntersection = BV.and [bBV, cBV] + in if (bBV == aBV) || (bBV == cBV) + then intermediateNodeExists aBV cBV (tail fullNodeBVList) + else ((leftIntersection == bBV) && (rightIntersection == cBV)) || intermediateNodeExists aBV cBV (tail fullNodeBVList) + + +{- | getIntersectionEdges takes a node A and cretes directed edges to each other edge in [B] +with rulkesLEdge + if A intesect B = empty then no edge + else if A intesect B = B then create edge A->B + else if A intesect B = A then create edge B->A + else --can't happen +Added in check for intermediate (by bitvector) node that shold obviate need for +breadth first search for vertex-based reconciliation + if A > B and B > C and A intersect B = B and B intersect C = C + then edge A->C is redundant and is not added to edge set +-} +getIntersectionEdges ∷ [BV.BV] → [G.LNode BV.BV] → G.LNode BV.BV → [G.LEdge (BV.BV, BV.BV)] +getIntersectionEdges fullNodeBVList bNodeList aNode = + if null bNodeList + then [] + else + let (aIndex, aBV) = aNode + (bIndex, bBV) = head bNodeList + intersection = BV.and [aBV, bBV] + in -- only do the directed 1/2 so no L.nub issues later + if (bBV >= aBV) || (intersection == 0) + then getIntersectionEdges fullNodeBVList (tail bNodeList) aNode + else + if intersection == bBV + then + if intermediateNodeExists aBV bBV fullNodeBVList + then getIntersectionEdges fullNodeBVList (tail bNodeList) aNode + else (aIndex, bIndex, (aBV, bBV)) : getIntersectionEdges fullNodeBVList (tail bNodeList) aNode + else getIntersectionEdges fullNodeBVList (tail bNodeList) aNode + + +{- | combinable tales a list of bitvecotrs and a single bitvector +and checks each of the first to see if combinable +if A and B == A,B, or 0 then True else False +if True return [bitvector] else [] if not +-} +combinable ∷ String → [BV.BV] → BV.BV → [BV.BV] +combinable comparison bvList bvIn + | comparison == "identity" = + if null bvList + then [] + else [bvIn | bvIn `elem` bvList] + | comparison == "combinable" -- combinable sensu Nelson 1979 + = + if null bvList + then [bvIn] + else + let intersectList = fmap (checkBitVectors bvIn) bvList -- took out paralleism here + isCombinable = L.foldl' (&&) True intersectList + in [bvIn | isCombinable] + | otherwise = errorWithoutStackTrace ("Comparison method " <> comparison <> " unrecongnized (combinable/identity)") + where + checkBitVectors a b = + let c = BV.and [a, b] + in c == a || c == b || c == 0 + + +{- | getGraphCompatibleList takes a list of graphs (list of node Bitvectors) +and retuns a list of each graph a bitvector node is compatible with +this isued later for majority rule consensus +each bit vector node will have a list of length 1..number of graphs +-} +getGraphCompatibleList ∷ String → [[BV.BV]] → BV.BV → [BV.BV] +getGraphCompatibleList comparison inBVListList bvToCheck = + if null inBVListList + then error "Null list of list of bitvectors in getGraphCompatibleList" + else + let compatibleList = concatMap (flip (combinable comparison) bvToCheck) inBVListList + in -- trace (show $ length compatibleList) + compatibleList + + +{- | getCompatibleList takes a list of graph node bitvectors as lists +retuns a list of lists of bitvectors where the length of the list of the individual bitvectors +is the number of graphs it is compatible with +-} +getCompatibleList ∷ String → [[BV.BV]] → [[BV.BV]] +getCompatibleList comparison inBVListList = + if null inBVListList + then error "Null list of list of bitvectors in getCompatibleList" + else + let uniqueBVList = L.nub $ concat inBVListList + bvCompatibleListList = fmap (getGraphCompatibleList comparison inBVListList) uniqueBVList + in filter (not . null) bvCompatibleListList + + +{- | getThresholdNodes takes a threshold and keeps those unique objects present in the threshold percent or +higher. L.sorted by frequency (low to high) +urRoot added to make sure there will be a single connected graph +-} +getThresholdNodes ∷ String → Int → Int → [[G.LNode BV.BV]] → ([G.LNode BV.BV], [Double]) +getThresholdNodes comparison thresholdInt numLeaves objectListList + | thresholdInt < 0 || thresholdInt > 100 = errorWithoutStackTrace "Threshold must be in range [0,100]" + | null objectListList = error "Empty list of object lists in getThresholdObjects" + | otherwise = + let numGraphs = fromIntegral $ length objectListList + indexList = [numLeaves .. (numLeaves + length objectGroupList - 1)] + objectGroupList + | comparison == "combinable" = getCompatibleList comparison (fmap (fmap snd) objectListList) + | comparison == "identity" = L.group $ L.sort (snd <$> concat objectListList) + | otherwise = errorWithoutStackTrace ("Comparison method " <> comparison <> " unrecognized (combinable/identity)") + uniqueList = zip indexList (fmap head objectGroupList) + frequencyList = fmap (((/ numGraphs) . fromIntegral) . length) objectGroupList -- removed parallel + fullPairList = zip uniqueList frequencyList + threshold = (fromIntegral thresholdInt / 100.0) ∷ Double + in -- trace ("There are " <> (show $ length objectListList) <> " to filter: " <> (show uniqueList) <> "\n" <> (show objectGroupList) <> " " <> (show frequencyList)) + (fst <$> filter ((>= threshold) . snd) fullPairList, snd <$> fullPairList) + + +{- | getThresholdEdges takes a threshold and number of graphs and keeps those unique edges present in the threshold percent or +higher. L.sorted by frequency (low to high) +modified from getThresholdNodes due to type change in edges +used and number from numleaves so can use BV +-} +getThresholdEdges ∷ (Show a, Ord a) ⇒ Int → Int → [a] → ([a], [Double]) +getThresholdEdges thresholdInt numGraphsInput objectList + | thresholdInt < 0 || thresholdInt > 100 = errorWithoutStackTrace "Threshold must be in range [0,100]" + | null objectList = error "Empty list of object lists in getThresholdEdges" + | otherwise = + let threshold = (fromIntegral thresholdInt / 100.0) ∷ Double + numGraphs = fromIntegral numGraphsInput + objectGroupList = L.group $ L.sort objectList + uniqueList = fmap head objectGroupList + frequencyList = fmap (((/ numGraphs) . fromIntegral) . length) objectGroupList -- removed parallel + fullPairList = zip uniqueList frequencyList + in -- trace ("There are " <> (show numGraphsIn) <> " to filter: " <> (show uniqueList) <> "\n" <> (show $ fmap length objectGroupList) <> " " <> (show frequencyList)) + (fst <$> filter ((>= threshold) . snd) fullPairList, snd <$> fullPairList) + + +{- | getPostOrderVerts takes a vertex and traverses postorder to root places all visirted nodes in a set of found +vertices. Keeps placing new nodes in recursion list until a root is hit. If a node is already in found set +it is not added to list of nodes to recurse +returns set of visited nodes +-} +getPostOrderVerts ∷ P.Gr BV.BV (BV.BV, BV.BV) → S.Set G.Node → [G.Node] → S.Set G.Node +getPostOrderVerts inGraph foundVertSet inVertexList = + if null inVertexList + then foundVertSet + else + let firstVertex = head inVertexList + in if S.member firstVertex foundVertSet + then getPostOrderVerts inGraph foundVertSet (tail inVertexList) + else + let newFoundSet = S.insert firstVertex foundVertSet + parentVerts = G.pre inGraph firstVertex + in getPostOrderVerts inGraph newFoundSet (inVertexList <> parentVerts) + + +{- | verticesByPostorder takes a graph and a leaf set and an initially empty found vertex set +as the postorder pass takes place form each leaf, each visited vertex is placed in foundVertSet +when roots are hit, it recurses back untill all paths are traced to a root. +final final rgaph is created and retuyrned from foundVertSet and input list +could have edges unconnected to leaves if consistent edge leading to a subtree with inconsistent configuration +so are filtered out by making sure each vertex in an edge is in the vertex list +-} +verticesByPostorder ∷ P.Gr BV.BV (BV.BV, BV.BV) → [G.LNode BV.BV] → S.Set G.Node → P.Gr BV.BV (BV.BV, BV.BV) +verticesByPostorder inGraph leafNodes foundVertSet + | G.isEmpty inGraph = error "Empty graph in verticesByPostorder" + | null leafNodes = + let vertexIndexList = S.toList foundVertSet + vertexLabelList = fmap (fromJust . G.lab inGraph) vertexIndexList + vertexList = zip vertexIndexList vertexLabelList + edgeList = fmap (verifyEdge vertexIndexList) (G.labEdges inGraph) -- removed parallel + in G.mkGraph vertexList (concat edgeList) + | otherwise = + let firstLeaf = fst $ head leafNodes + firstVertices = getPostOrderVerts inGraph foundVertSet [firstLeaf] + in verticesByPostorder inGraph (tail leafNodes) (S.union foundVertSet firstVertices) + + +{- | verifyEdge takes a vertex index list and an edge and checks to see if +the subtyending vertices are in the vertex list nad returns teh edge as asingleton list +if yes--else empty list (for mapping purposes) +-} +verifyEdge ∷ [G.Node] → G.LEdge (BV.BV, BV.BV) → [G.LEdge (BV.BV, BV.BV)] +verifyEdge vertIndexList inEdge@(e, u, _) + | e `notElem` vertIndexList = [] + | u `notElem` vertIndexList = [] + | otherwise = [inEdge] + + +{- +-- | sortInputArgs takes a list of arguments (Strings) nd retuns a pair of lists +-- of strings that are newick or graphviz dotFile filenames for later parsing +sortInputArgs :: [String] -> [String] -> ([T.Text],[T.Text],[String],[String],[String]) -> ([T.Text],[T.Text],[String],[String],[String]) +sortInputArgs inContents inArgs (curFEN, curNewick, curDot, curNewFiles, curFENFILES) = + if null inArgs then (curFEN, curNewick, curDot, curNewFiles, curFENFILES) + else + let firstFileName = head inArgs + firstContents = filter (not . isSpace) $ head inContents + in + if head firstContents == '(' then -- Newick/EnhancedNewick + sortInputArgs (tail inContents) (tail inArgs) (curFEN, T.pack firstContents : curNewick, curDot, firstFileName : curNewFiles, curFENFILES) + else if head firstContents == '<' then -- ForestEnhancedNewick + sortInputArgs (tail inContents) (tail inArgs) (T.pack firstContents : curFEN, curNewick, curDot, curNewFiles, firstFileName : curFENFILES) + else if (head firstContents == 's') || (head firstContents == 'g') || (head firstContents == 'd') then --Dot + sortInputArgs (tail inContents) (tail inArgs) (curFEN, curNewick, firstFileName : curDot, curNewFiles, curFENFILES) + else errorWithoutStackTrace("Input file " <> firstFileName <> " does not appear to be Newick, Enhanced Newick, Forest Enhanced Newick or dot format ") + +-- | nodeText2String takes a node with text label and returns a node with String label +nodeText2String :: G.LNode T.Text -> G.LNode String +nodeText2String (inIndex, label) = (inIndex, T.unpack label) + +-- | fglTextA2TextString converts the graph types from Text A to Text String +fglTextB2Text :: P.Gr b Double -> P.Gr b T.Text +fglTextB2Text inGraph = + if G.isEmpty inGraph then G.empty + else + let labNodes = G.labNodes inGraph + labEdges = G.labEdges inGraph + (eList, uList, labelList) = unzip3 labEdges + --- newLabels = fmap toShortest labelList + newLabels = fmap (T.pack . show) labelList + newEdges = zip3 eList uList newLabels + in + G.mkGraph labNodes newEdges +-} + +{- | addUrRootAndEdges creates a single root and adds edges to existing roots +and unconnected leaves +-} +addUrRootAndEdges ∷ P.Gr String Double → P.Gr String Double +addUrRootAndEdges inGraph = + let origLabVerts = G.labNodes inGraph + origLabEdges = G.labEdges inGraph + origRootList = getRoots inGraph (fst <$> origLabVerts) + unconnectedLeafList = getUnConnectedLeaves inGraph (fst <$> origLabVerts) + in -- all ok--no unconnected vertices + if (length origRootList == 1) && null unconnectedLeafList + then inGraph + else -- add edges to unconencted leaves + + if length origRootList == 1 + then + let newEdgeList = + zip3 + (replicate (length unconnectedLeafList) (head origRootList)) + unconnectedLeafList + (replicate (length unconnectedLeafList) 0.0) + in G.mkGraph origLabVerts (origLabEdges <> newEdgeList) + else -- add UR root, edges to existing roots, and edges to unconnected leaves + + let unRootedVertices = origRootList <> unconnectedLeafList + numOrigVerts = length origLabVerts + newRoot = (numOrigVerts, "HTU" <> show numOrigVerts) + newEdgeList = zip3 (replicate (length unRootedVertices) numOrigVerts) unRootedVertices (replicate (length unRootedVertices) 0.0) + in G.mkGraph (origLabVerts <> [newRoot]) (origLabEdges <> newEdgeList) + + +-- | changeVertexEdgeLabels keeps or removes vertex and edge labels +changeVertexEdgeLabels ∷ (Show b) ⇒ Bool → Bool → P.Gr String b → P.Gr String String +changeVertexEdgeLabels keepVertexLabel keepEdgeLabel inGraph = + -- trace ("CVL: " <> (show (keepVertexLabel, keepEdgeLabel))) $ + let inLabNodes = G.labNodes inGraph + degOutList = G.outdeg inGraph <$> G.nodes inGraph + nodeOutList = zip degOutList inLabNodes + leafNodeList = snd <$> filter ((== 0) . fst) nodeOutList + nonLeafNodeList = snd <$> filter ((> 0) . fst) nodeOutList + newNonLeafNodes = + if not keepVertexLabel + then zip (fmap fst nonLeafNodeList) (replicate (length nonLeafNodeList) "") + else fmap checkMakeLabel nonLeafNodeList + inLabEdges = G.labEdges inGraph + inEdges = fmap G.toEdge inLabEdges + newEdges = + if keepEdgeLabel + then fmap showLabel inLabEdges + else fmap (`G.toLEdge` "") inEdges + in -- trace ("CVEL " <> (show (keepVertexLabel, keepEdgeLabel ))) + G.mkGraph (leafNodeList <> newNonLeafNodes) newEdges + where + showLabel (e, u, l) = (e, u, show l) + checkMakeLabel (a, b) = + if head b /= 'H' + then (a, "HTU" <> show a) + else (a, b) + + +-- | reconcile is the overall function to drive all methods +reconcile ∷ (String, String, Int, Bool, Bool, Bool, String, [P.Gr String String]) → PhyG (String, P.Gr String String) +reconcile (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat, inputGraphList) = + let -- parallel + reAnnotate ∷ P.Gr String String → PhyG (P.Gr BV.BV (BV.BV, BV.BV)) + reAnnotate = reAnnotateGraphs + in -- intersectionAction :: G.LNode BV.BV -> [G.LEdge (BV.BV,BV.BV)] + -- intersectionAction = getIntersectionEdges (fmap snd thresholdNodes) thresholdNodes + do + -- Reformat graphs with appropriate annotations, BV.BVs, etc + processedGraphs ← + getParallelChunkTraverse >>= \pTraverse → + reAnnotate `pTraverse` inputGraphList + -- PU.seqParMap PU.myStrategyRDS reAnnotateGraphs inputGraphList -- `using` PU.myParListChunkRDS + + -- Create lists of reindexed unique nodes and edges, identity by BV.BVs + -- The drops to not reexamine leaves repeatedly + -- Assumes leaves are first in list + let numLeaves = getLeafNumber (head processedGraphs) + let leafNodes = take numLeaves (G.labNodes $ head processedGraphs) + let firstNodes = G.labNodes $ head processedGraphs + let numFirstNodes = length firstNodes + let unionNodes = + L.sort $ + leafNodes + <> addAndReIndexUniqueNodes + numFirstNodes + (concatMap (drop numLeaves) (G.labNodes <$> tail processedGraphs)) + (drop numLeaves firstNodes) + -- unionEdges = addAndReIndexEdges "unique" unionNodes (concatMap G.labEdges (tail processedGraphs)) (G.labEdges $ head processedGraphs) + + let totallLeafString = L.foldl' L.union [] (fmap (fmap snd . getLeafListNewick) inputGraphList) + let totallLeafSet = zip [0 .. (length totallLeafString - 1)] totallLeafString + + -- Create Adams II consensus + -- + adamsII ← A.makeAdamsII totallLeafSet (fmap PhyP.relabelFGLEdgesDouble inputGraphList) + -- adamsIIInfo = "There are " <> show (length $ G.nodes adamsII) <> " nodes present in Adams II consensus" + let adamsII' = changeVertexEdgeLabels vertexLabel edgeLabel adamsII + let adamsIIOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams adamsII + let adamsIIOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph $ PhyP.relabelFGLEdgesDouble adamsII'] True True + + -- + -- Create thresholdMajority rule Consensus and dot string + -- vertex-based CUN-> Majority rule ->Strict + -- + let (thresholdNodes', nodeFreqs) = getThresholdNodes compareMethod threshold numLeaves (fmap (drop numLeaves . G.labNodes) processedGraphs) + let thresholdNodes = leafNodes <> thresholdNodes' + + intersectionPar ← getParallelChunkMap + let intersectionAction = getIntersectionEdges (fmap snd thresholdNodes) thresholdNodes + let thresholdEdgesList = intersectionPar intersectionAction thresholdNodes + -- PU.seqParMap PU.myStrategyRDS (getIntersectionEdges (fmap snd thresholdNodes) thresholdNodes) thresholdNodes -- `using` PU.myParListChunkRDS + let thresholdEdges = L.nub $ concat thresholdEdgesList + -- numPossibleEdges = ((length thresholdNodes * length thresholdNodes) - length thresholdNodes) `div` 2 + let thresholdConsensusGraph = G.mkGraph thresholdNodes thresholdEdges -- O(n^3) + + -- thresholdConInfo = "There are " <> show (length thresholdNodes) <> " nodes present in >= " <> (show threshold <> "%") <> " of input graphs and " <> show numPossibleEdges <> " candidate edges" + -- <> " yielding a final graph with " <> show (length (G.labNodes thresholdConsensusGraph)) <> " nodes and " <> show (length (G.labEdges thresholdConsensusGraph)) <> " edges" + + -- add back labels for vertices and "GV.quickParams" for G.Gr String Double or whatever + let labelledTresholdConsensusGraph' = addGraphLabels thresholdConsensusGraph totallLeafSet + let labelledTresholdConsensusGraph'' = addEdgeFrequenciesToGraph labelledTresholdConsensusGraph' (length leafNodes) nodeFreqs + + -- Add urRoot and edges to existing roots if there are unconnected components and connnectComponets is True + let labelledTresholdConsensusGraph = + if not connectComponents + then labelledTresholdConsensusGraph'' + else addUrRootAndEdges labelledTresholdConsensusGraph'' + let gvRelabelledConsensusGraph = + GO.renameSimpleGraphNodesString $ LG.reindexGraph $ changeVertexEdgeLabels vertexLabel edgeLabel labelledTresholdConsensusGraph + let thresholdConsensusOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams gvRelabelledConsensusGraph + let thresholdConsensusOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph labelledTresholdConsensusGraph] edgeLabel True + + -- + -- Create threshold EUN and dot string, orignial EUN is threshold = 0 + -- + let allEdges = addAndReIndexEdges "all" unionNodes (concatMap G.labEdges (tail processedGraphs)) (G.labEdges $ head processedGraphs) + let (thresholdEUNEdges, edgeFreqs) = getThresholdEdges threshold (length processedGraphs) allEdges + thresholdEUNGraph' ← makeEUN unionNodes thresholdEUNEdges (G.mkGraph unionNodes thresholdEUNEdges) + + -- Remove unnconnected HTU nodes via postorder pass from leaves + let thresholdEUNGraph = verticesByPostorder thresholdEUNGraph' leafNodes S.empty + -- thresholdEUNInfo = "\nThreshold EUN deleted " <> show (length unionEdges - length (G.labEdges thresholdEUNGraph) ) <> " of " <> show (length unionEdges) <> " total edges" + -- <> " for a final graph with " <> show (length (G.labNodes thresholdEUNGraph)) <> " nodes and " <> show (length (G.labEdges thresholdEUNGraph)) <> " edges" + + -- add back labels for vertices and "GV.quickParams" for G.Gr String Double or whatever + let thresholdLabelledEUNGraph' = addGraphLabels thresholdEUNGraph totallLeafSet + let thresholdLabelledEUNGraph'' = addEdgeFrequenciesToGraph thresholdLabelledEUNGraph' (length leafNodes) edgeFreqs + + -- Add urRoot and edges to existing roots if there are unconnected components and connnectComponets is True + let thresholdLabelledEUNGraph = + if not connectComponents + then thresholdLabelledEUNGraph'' + else addUrRootAndEdges thresholdLabelledEUNGraph'' + + -- Create EUN Dot String + let gvRelabelledEUNGraph = GO.renameSimpleGraphNodesString $ LG.reindexGraph $ changeVertexEdgeLabels vertexLabel edgeLabel thresholdLabelledEUNGraph + let thresholdEUNOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams gvRelabelledEUNGraph -- eunGraph + let thresholdEUNOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph thresholdLabelledEUNGraph] edgeLabel True + + -- Create Adams II consensus + -- + adamsII ← A.makeAdamsII totallLeafSet (fmap PhyP.relabelFGLEdgesDouble inputGraphList) + -- adamsIIInfo = "There are " <> show (length $ G.nodes adamsII) <> " nodes present in Adams II consensus" + let adamsII' = changeVertexEdgeLabels vertexLabel False adamsII + let adamsIIOutDotString = T.unpack $ renderDot $ toDot $ GV.graphToDot GV.quickParams adamsII' + let adamsIIOutFENString = PhyP.fglList2ForestEnhancedNewickString [PhyP.stringGraph2TextGraph $ PhyP.relabelFGLEdgesDouble adamsII'] False False + + if localMethod == "eun" + then + if outputFormat == "dot" + then pure (thresholdEUNOutDotString, gvRelabelledEUNGraph) + else + if outputFormat == "fenewick" + then pure (thresholdEUNOutFENString, gvRelabelledEUNGraph) + else errorWithoutStackTrace ("Output graph format " <> outputFormat <> " is not implemented") + else + if localMethod == "adams" + then + if outputFormat == "dot" + then pure (adamsIIOutDotString, adamsII') + else + if outputFormat == "fenewick" + then pure (adamsIIOutFENString, adamsII') + else errorWithoutStackTrace ("Output graph format " <> outputFormat <> " is not implemented") + else + if (localMethod == "majority") || (localMethod == "cun") || (localMethod == "strict") + then + if outputFormat == "dot" + then pure (thresholdConsensusOutDotString, gvRelabelledConsensusGraph) + else + if outputFormat == "fenewick" + then pure (thresholdConsensusOutFENString, gvRelabelledConsensusGraph) + else errorWithoutStackTrace ("Output graph format " <> outputFormat <> " is not implemented") + else errorWithoutStackTrace ("Graph combination method " <> localMethod <> " is not implemented") + + +{- | makeProcessedGraph takes a set of graphs and a leaf set and adds teh missing leafws to teh graphs and reindexes +the nodes and edges of the input graphs consistenly +String as oposed to Text due tyo reuse of code in Eun.c +-} +makeProcessedGraph ∷ [LG.LNode T.Text] → SimpleGraph → PhyG SimpleGraph +makeProcessedGraph leafTextList inGraph + | null leafTextList = error "Null leaf list in makeFullLeafSetGraph" + | LG.isEmpty inGraph = error "Empty graph in makeFullLeafSetGraph" + | otherwise = + let (_, graphleafTextList, _, _) = LG.splitVertexList inGraph + leafStringList = fmap nodeToString leafTextList + graphLeafStringList = fmap nodeToString graphleafTextList + in do + reIndexedGraph ← reIndexAndAddLeavesEdges leafStringList (graphLeafStringList, inGraph) + let textNodes = (nodeToText <$> LG.labNodes reIndexedGraph) + let doubleEdges = (edgeToDouble <$> LG.labEdges reIndexedGraph) + pure $ LG.mkGraph textNodes doubleEdges + where + nodeToString (a, b) = (a, T.unpack b) + nodeToText (a, b) = (a, T.pack b) + edgeToDouble (a, b, c) = (a, b, read c ∷ Double) diff --git a/src/Reconciliation/ReconcileGraphs.hs b/src/Reconciliation/ReconcileGraphs.hs new file mode 100644 index 000000000..7b93fc8b1 --- /dev/null +++ b/src/Reconciliation/ReconcileGraphs.hs @@ -0,0 +1,294 @@ +{- | +Module : ReconcileGraphs.hs +Description : Module to call graph reconciliation functions +Copyright : (c) 2021 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Reconciliation.ReconcileGraphs ( + makeReconcileGraph, +) where + +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Data.List qualified as L +import Data.Maybe +import Data.Text.Lazy qualified as T +import GeneralUtilities +import GraphFormatUtilities qualified as GFU +import PHANE.Evaluation +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Reconciliation.Eun qualified as E +import Types.Types +import Utilities.LocalGraph qualified as LG + + +-- import Debug.Trace + +-- | makeReconcileGraph is a wrapper around eun.hs functions to return String of reconciled graph +makeReconcileGraph ∷ [String] → [(String, String)] → [SimpleGraph] → PhyG (String, SimpleGraph) +makeReconcileGraph validCommandList commandPairList inGraphList = + -- trace ("MRG: " <> (concatMap LG.prettyIndices inGraphList)) $ + if null inGraphList + then pure ("Error: No input graphs to reconcile", LG.empty) + else + if length inGraphList == 1 + then pure ("Warning: Only single input graph to reconcile", head inGraphList) + else + let -- convert SimpleGraph to String String from Text Double + stringGraphs = fmap (GFU.modifyVertexEdgeLabels True True . GFU.textGraph2StringGraph) inGraphList + + -- parse arguements + commandList = (mergePair <$> filter (('"' `notElem`) . snd) commandPairList) + (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat) = processReconcileArgs validCommandList commandList + in do + -- call EUN/reconcile functions + (reconcileString, reconcileGraph) ← + E.reconcile (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat, stringGraphs) + + -- convert eun format graph back to SimpleGraph + let reconcileSimpleGraph = GFU.stringGraph2TextGraphDouble reconcileGraph + + -- trace ("MRG :" <> (show (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat)) <> "\n" <> reconcileString + -- <> "\n" <> (LG.prettyIndices reconcileSimpleGraph)) + pure ("", reconcileSimpleGraph) + where + mergePair (a, b) = + if a /= [] && b /= [] + then a <> (':' : b) + else a <> b + + +{- | processReconcileArgs takes a list of strings and returns values of commands for proram execution +including defaults +checks commands for misspellings +-} +processReconcileArgs ∷ [String] → [String] → (String, String, Int, Bool, Bool, Bool, String) +processReconcileArgs validCommandList inList' = + let inList = inList' L.\\ ["overwrite", "append", "reconcile", "dot", "newick", "dotpdf"] + in if null inList + then + let -- default values + localMethod = "eun" + compareMethod = "combinable" + threshold = 0 + connectComponents = True + edgeLabel = True + vertexLabel = True + outputFormat = "dot" + in (localMethod, compareMethod, threshold, connectComponents, edgeLabel, vertexLabel, outputFormat) + else -- trace ("Rec args: " <> (show inList)) $ + + let inTextList = fmap T.pack inList + inTextListLC = fmap T.toLower inTextList + commandList = filter (T.any (== ':')) inTextListLC + stringCommands = fmap (T.unpack . T.takeWhile (/= ':')) commandList + (editCostList, matchList) = unzip $ fmap (getBestMatch (maxBound ∷ Int, "no suggestion") validCommandList) stringCommands + commandMatch = zip3 editCostList stringCommands matchList + notMatchedList = filter ((> 0) . fst3) commandMatch + localMethod = getMethod inTextListLC + compareMethod = getCompareMethod inTextListLC + connect = getConnect inTextListLC + edgeLabel = getEdgeLabel inTextListLC + vertexLabel = getVertexLabel inTextListLC + threshold + | localMethod == "cun" = 0 + | localMethod == "strict" = 100 + | otherwise = getThreshold inTextListLC + outFormat = getOutputFormat inTextListLC + in if null notMatchedList + then (localMethod, compareMethod, threshold, connect, edgeLabel, vertexLabel, outFormat) + else + errorWithoutStackTrace + ("\n\nError(s) in reconcile command specification (case insensitive):\n" <> getCommandErrorString notMatchedList) + + +-- ) + +{- | getMethod returns method value or dedfault otherwise +assumes in lower case +-} +getMethod ∷ [T.Text] → String +getMethod inTextList = + -- default + if null inTextList + then "eun" + else + let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in if isNothing (T.find (== ':') (head inTextList)) + then getMethod (tail inTextList) + else + if firstCommand == T.pack "method" + then + let option = T.unpack firstOption + in if option == "eun" + then "eun" + else + if option == "cun" + then "cun" + else + if option == "majority" + then "majority" + else + if option == "strict" + then "strict" + else + if option == "adams" + then "adams" + else errorWithoutStackTrace ("Reconcile option \'" <> option <> "\' not recognized (eun|cun|majority|strict)") + else getMethod (tail inTextList) + + +{- | getCompareMethod returns compareMethod value or default otherwise +assumes in lower case +-} +getCompareMethod ∷ [T.Text] → String +getCompareMethod inTextList = + -- default + if null inTextList + then "combinable" + else + let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in if isNothing (T.find (== ':') (head inTextList)) + then getCompareMethod (tail inTextList) + else + if firstCommand == T.pack "compare" + then + let option = T.unpack firstOption + in if option == "combinable" + then "combinable" + else + if option == "identity" + then "identity" + else errorWithoutStackTrace ("Compare option \'" <> option <> "\' not recognized (combinable|identity)") + else getCompareMethod (tail inTextList) + + +{- | getConect returns connect value or default otherwise (True|False) +assumes in lower case +-} +getConnect ∷ [T.Text] → Bool +getConnect inTextList = + -- default + not (null inTextList) + && ( let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in if isNothing (T.find (== ':') (head inTextList)) + then getConnect (tail inTextList) + else + if firstCommand == T.pack "connect" + then + let option = T.unpack firstOption + in (option == "true") + || (option /= "false" && errorWithoutStackTrace ("Connect option \'" <> option <> "\' not recognized (True|False)")) + else getConnect (tail inTextList) + ) + + +{- | getEdgeLabel returns edgeLabel value or default otherwise (True|False) +assumes in lower case +-} +getEdgeLabel ∷ [T.Text] → Bool +getEdgeLabel inTextList = True + + +{- +-- default +null inTextList || (let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in + if isNothing (T.find (== ':') (head inTextList)) then getEdgeLabel (tail inTextList) + else if firstCommand == T.pack "edgelabel" then + let option = T.unpack firstOption + in + (option == "true") || (option /= "false" && errorWithoutStackTrace ("EdgeLAbel option \'" <> option <> "\' not recognized (True|False)")) + else getEdgeLabel (tail inTextList)) +-} + +{- | chanaged to True and modified later if need be +| getVertexLabel returns edgeLabel value or default otherwise (True|False) +assumes in lower case +-} +getVertexLabel ∷ [T.Text] → Bool +getVertexLabel inTextList = True + + +{- +-- default +not (null inTextList) && (let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in + if isNothing (T.find (== ':') (head inTextList)) then getVertexLabel (tail inTextList) + else if firstCommand == T.pack "vertexlabel" then + let option = T.unpack firstOption + in + (option == "true") || (option /= "false" && errorWithoutStackTrace ("VertexLabel option \'" <> option <> "\' not recognized (True|False)")) + else getVertexLabel (tail inTextList)) +-} + +{- | getThreshold returns threshold value or default otherwise +assumes in lower case +-} +getThreshold ∷ [T.Text] → Int +getThreshold inTextList = + -- default + if null inTextList + then 0 ∷ Int + else + let firstCommand = T.takeWhile (/= ':') $ head inTextList + firstOption = T.tail $ T.dropWhile (/= ':') $ head inTextList + in if isNothing (T.find (== ':') (head inTextList)) + then getThreshold (tail inTextList) + else + if firstCommand == T.pack "threshold" + then read (T.unpack firstOption) ∷ Int + else getThreshold (tail inTextList) + + +{- | getOutputFormat returns output file format or default otherwise +assumes in lower case +-} +getOutputFormat ∷ [T.Text] → String +getOutputFormat inTextList = + -- default + if null inTextList + then "dot" + else -- removed prefix for output graph format + + let firstOption = head inTextList + outFormat = T.unpack firstOption + in if outFormat == "dot" + then "dot" + else + if outFormat == "dotpdf" + then "dot" + else (if (outFormat == "fenewick") || (outFormat == "newick") then "fenewick" else getOutputFormat (tail inTextList)) diff --git a/src/Search/Build.hs b/src/Search/Build.hs new file mode 100644 index 000000000..1e18c0bd1 --- /dev/null +++ b/src/Search/Build.hs @@ -0,0 +1,605 @@ +{- | +Module specifying graph building functions + +Distance builds (@Wagner@, @NJ@, @WPGMA@) are imported from @Wag2020@: + +https://www.github.com/wardwheeler/wag2020 +-} +module Search.Build ( + buildGraph, +) where + +import Commands.Verify qualified as VER +import Control.Monad (replicateM, when) +import Control.Monad.Random.Class +import Data.Char +import Data.Foldable (fold) +import Data.Functor (($>), (<&>)) +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.Maybe +import Data.Text.Lazy qualified as TL +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Reconciliation.ReconcileGraphs qualified as R +import Search.DistanceMethods qualified as DM +import Search.DistanceWagner qualified as DW +import Search.WagnerBuild qualified as WB +import SymMatrix qualified as M +import Text.Read +import Types.DistanceTypes +import Types.Types +import Utilities.DistanceUtilities qualified as DU +import Utilities.Distances qualified as D +import Utilities.Distances qualified as DD +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +{- | buildGraph wraps around build tree--build trees and adds network edges after build if network +with appropriate options +transforms graph type to Tree for builds then back to initial graph type + +/Note:/ The returned graphs are /ALWAYS/ fully evaluated! +-} +buildGraph ∷ [Argument] → GlobalSettings → ProcessedData → PhyG [ReducedPhylogeneticGraph] +buildGraph inArgs inGS inData = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "build" fstArgList VER.buildArgList + failure = failWithPhase Parsing . unwords + in -- check for valid command options + if not checkCommandList + then failure ["Unrecognized command in 'build':", show inArgs] + else + let -- block build options including number of display trees to return + buildBlock = filter ((== "block") . fst) lcArgList + displayBlock = filter ((== "displaytrees") . fst) lcArgList + numDisplayTrees + | length displayBlock > 1 = + errorWithoutStackTrace ("Multiple displayTree number specifications in command--can have only one: " <> show inArgs) + | null displayBlock = Just 10 + | null (snd $ head displayBlock) = Just 10 + | otherwise = readMaybe (snd $ head displayBlock) ∷ Maybe Int + + returnList = filter ((== "return") . fst) lcArgList + numReturnTrees + | length returnList > 1 = + errorWithoutStackTrace ("Multiple 'return' number specifications in command--can have only one: " <> show inArgs) + | null returnList = Just (maxBound ∷ Int) + | null (snd $ head returnList) = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head returnList) ∷ Maybe Int + + doEUN' = any ((== "eun") . fst) lcArgList + doCUN' = any ((== "cun") . fst) lcArgList + doEUN + | not doEUN' && not doCUN' = True + | otherwise = doEUN' + + returnTrees' = any ((== "displaytrees") . fst) lcArgList + returnGraph' = any ((== "graph") . fst) lcArgList + returnRandomDisplayTrees' = any ((== "atrandom") . fst) lcArgList + returnFirst' = any ((== "first") . fst) lcArgList + buildDistance = any ((== "distance") . fst) lcArgList + + -- temprary change (if needed) to buyild tree structures + inputGraphType = graphType inGS + treeGS = inGS{graphType = Tree} + + -- really only trees now--but maybe later if can ensure phylogenetic graph from recocnile + (returnGraph, returnTrees) = case graphType inGS of + Tree → (False, True) + _ | returnGraph' || returnTrees' → (returnGraph', returnTrees') + _ → (False, True) + + -- default to return reandom and overrides if both specified + (returnRandomDisplayTrees, _) = + if returnRandomDisplayTrees' || returnFirst' + then (returnRandomDisplayTrees', returnFirst') + else (True, False) + + processedDataList = U.getProcessDataByBlock True inData + + -- parallel setup + pairwiseAction ∷ ProcessedData → PhyG [[VertexCost]] + pairwiseAction = DD.getPairwiseDistances + + buildAction ∷ (ProcessedData, [[VertexCost]]) → PhyG [ReducedPhylogeneticGraph] + buildAction = uncurry $ buildTree' True inArgs treeGS + + traverseAction ∷ Bool → Bool → Maybe Int → SimpleGraph → PhyG ReducedPhylogeneticGraph + traverseAction = T.multiTraverseFullyLabelGraphReduced inGS inData -- False False Nothing + in do + when (not $ null buildBlock) $ logWith LogInfo "Block building initial graph(s)\n" + -- initial build of trees from combined data--or by blocks + -- distance calculation moved here to get out of main scope + firstGraphs' ← + if null buildBlock + then + let simpleTreeOnly = False + in do + pairwiseDistances' ← D.getPairwiseDistances inData + buildTreeList ← buildTree simpleTreeOnly inArgs treeGS inData pairwiseDistances' + -- logWith LogTech $ fold [ "BL:\t", show $ length buildTreeList, "\n" ] + pure buildTreeList + else do + -- removing taxa with missing data for block + -- trace ("Block building initial graph(s)") $ + distanceMatrixList ← + if buildDistance + then getParallelChunkTraverse >>= \pTraverse → pairwiseAction `pTraverse` processedDataList + else pure $ replicate (length processedDataList) [] + blockList ← + getParallelChunkTraverse >>= \pTraverse → + buildAction `pTraverse` zip processedDataList distanceMatrixList + let blockTrees = concat blockList + + returnGraphs ← + reconcileBlockTrees blockTrees (fromJust numDisplayTrees) returnTrees returnGraph returnRandomDisplayTrees doEUN + + getParallelChunkTraverse >>= \pTraverse → traverseAction True True Nothing `pTraverse` returnGraphs + + -- this to allow 'best' to return more trees then later 'returned' and contains memory by letting other graphs go out of scope + firstGraphs ← case buildBlock of + [] → GO.selectGraphs Unique (fromJust numReturnTrees) 0.0 firstGraphs' + _ → pure firstGraphs' + + -- reporting info + let (costString, returnString) = case firstGraphs of + [] → ("\t\tBlock build returned 0 graphs", "\t\tReturning 0 graphs") + g : gs → + let gCosts = snd5 <$> firstGraphs + hiCost = maximum gCosts + loCost = minimum gCosts + suffix = + unwords + [ show $ length firstGraphs + , "graphs at cost range" + , fold ["(", show loCost, ", ", show hiCost, ")"] + ] + in ("\tBlock build yielded" <> suffix, "\tReturning " <> suffix) + + case numDisplayTrees of + Nothing → failure ["DisplayTrees specification in build not an integer:", show . snd $ head displayBlock] + Just nDistplayTrees → case numReturnTrees of + Nothing → failure ["Return number specifications in build not an integer:", show . snd $ head returnList] + Just nReturnTrees → do + logWith LogInfo $ returnString <> "\n" + case (inputGraphType, buildBlock) of + (Tree, []) → pure firstGraphs + (_, x : xs) → logWith LogInfo (costString <> "\n") $> firstGraphs + (_, []) → do + logWith LogInfo $ unwords ["\tRediagnosing as", show $ graphType inGS, "\n"] + getParallelChunkTraverse >>= \pTraverse → + pTraverse (traverseAction False False Nothing) $ fst5 <$> firstGraphs + + +{- | reconcileBlockTrees takes a lists of trees (with potentially varying leave complement) and reconciled them +as per the arguments producing a set of displayTrees (ordered or resolved random), and/or the reconciled graph +all outputs are re-optimzed and ready to go +-} +reconcileBlockTrees ∷ [ReducedPhylogeneticGraph] → Int → Bool → Bool → Bool → Bool → PhyG [SimpleGraph] +reconcileBlockTrees blockTrees numDisplayTrees returnTrees returnGraph returnRandomDisplayTrees doEUN = + -- trace ("Reconcile producing " <> (show numDisplayTrees)) ( + let -- numLeaves = V.length $ fst3 inData + -- fullLeafSet = zip [0..(numLeaves - 1)] (V.toList $ fst3 inData) + simpleGraphList = fmap fst5 blockTrees + -- fullLeafGraphList = fmap (E.makeProcessedGraph fullLeafSet) simpleGraphList + + reconcileArgList ∷ ∀ {a}. [(String, [a])] + reconcileArgList = + if doEUN + then [("eun", []), ("vertexLabel:true", []), ("connect:True", [])] + else [("cun", []), ("vertexLabel:true", []), ("connect:True", [])] + + -- parallel setup + convertAction ∷ SimpleGraph → PhyG SimpleGraph + convertAction = GO.convertGeneralGraphToPhylogeneticGraph True + in do + -- create reconciled graph--NB may NOT be phylogenetic graph--time violations etc. + reconciledGraphInitial' ← R.makeReconcileGraph VER.reconcileArgList reconcileArgList simpleGraphList + let reconciledGraphInitial = snd reconciledGraphInitial' + + -- ladderize, time consistent-ized, removed chained network edges, removed treenodes with all network edge children + reconciledGraph ← GO.convertGeneralGraphToPhylogeneticGraph True reconciledGraphInitial + + -- this for non-convertable graphs + let reconciledGraph' + | not $ LG.isEmpty reconciledGraph = reconciledGraph + | otherwise = reconciledGraphInitial + + displayGraphs' ← + if not returnRandomDisplayTrees + then pure $ take numDisplayTrees $ LG.generateDisplayTrees True reconciledGraph' + else LG.generateDisplayTreesRandom numDisplayTrees reconciledGraph' + + -- need this to fix up some graphs after other stuff changed + -- displayGraphs <- mapM (GO.convertGeneralGraphToPhylogeneticGraph True) displayGraphs' + displayGraphs ← + getParallelChunkTraverse >>= \pTraverse → + convertAction `pTraverse` displayGraphs' + + -- displayGraphs = fmap GO.ladderizeGraph $ fmap GO.renameSimpleGraphNodes displayGraphs' + let numNetNodes = length $ fth4 (LG.splitVertexList reconciledGraph) + let strNetNodes = + fold + [ "Reconciled graph has " + , show numNetNodes + , " network nodes hence up to 2^" + , show numNetNodes + , " display trees" + ] + + when (LG.isEmpty reconciledGraph && not returnTrees) $ + error + "\n\n\tError--reconciled graph could not be converted to phylogenetic graph. Consider modifying block tree search options or returning display trees." + + if not (LG.isEmpty reconciledGraph) && not returnTrees + then logWith LogMore (strNetNodes <> " for softwired network\n") $> [reconciledGraph] + else + if not returnGraph && returnTrees + then pure displayGraphs + else + if not (LG.isEmpty reconciledGraph) + then logWith LogMore ("\n\t" <> strNetNodes) $> reconciledGraph : displayGraphs + else + logWith + LogWarn + "\n\tWarning--reconciled graph could not be converted to phylogenetic graph. Consider modifying block tree search options or performing standard builds.\n" + $> displayGraphs + + +{- | +'buildTree'' wraps build tree and changes order of arguments for mapping. +-} +buildTree' ∷ Bool → [Argument] → GlobalSettings → ProcessedData → [[VertexCost]] → PhyG [ReducedPhylogeneticGraph] +buildTree' simpleTreeOnly inArgs inGS inData pairwiseDistances = + buildTree simpleTreeOnly inArgs inGS inData pairwiseDistances + + +{- | +'buildTree' takes build options and returns constructed graphList +simpleTreeOnly (for block build) returns a single best tree to reduce edges in +reconcile step. +-} +buildTree ∷ Bool → [Argument] → GlobalSettings → ProcessedData → [[VertexCost]] → PhyG [ReducedPhylogeneticGraph] +buildTree simpleTreeOnly inArgs inGS inData@(nameTextVect, _, _) pairwiseDistances = + let getKeyBy ∷ (Eq a) ⇒ (((a, b) → Bool) → [([Char], [Char])] → t) → a → t + getKeyBy f key = f ((== key) . fst) lcArgList + hasKey = getKeyBy any + filterKey = fmap snd . getKeyBy filter + + fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "build" fstArgList VER.buildArgList + + buildDistance = hasKey "distance" + buildCharacter = hasKey "character" + + {- + -- character build + performBuildCharacter ∷ Int → PhyG [ReducedPhylogeneticGraph] + performBuildCharacter numReplicates = + let treeList = WB.rasWagnerBuild inGS inData numReplicates + treeList' + | simpleTreeOnly = GO.selectGraphs Best 1 0.0 (-1) treeList + | otherwise = treeList + in do + logWith LogMore $ getBuildLogMessage "Character" "yielded" "trees" treeList' + pure treeList' + -} + + -- distance build + performBuildDistance ∷ Int → Int → PhyG [ReducedPhylogeneticGraph] + performBuildDistance numReplicates numToSave = + -- do all options in line and add together for return tree list + let outgroupElem = outgroupIndex inGS + nameStringVect = fmap TL.unpack nameTextVect + distMatrix = M.fromLists pairwiseDistances + + whenKey ∷ (Monoid p) ⇒ [Char] → p → p + whenKey key value + | hasKey key = value + | otherwise = mempty + + refinement + | hasKey "tbr" = "tbr" + | hasKey "spr" = "spr" + | hasKey "otu" = "otu" + | otherwise = "none" + in {- + treeList1 = if hasKey "rdwag" then randomizedDistanceWagner simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem numReplicates numToSave refinement + else pure [] + treeList2 = if hasKey "dwag" then distanceWagner simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + treeList3 = if hasKey "nj" then neighborJoin simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + treeList4 = if hasKey "wpgma" then wPGMA simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + + -- treeListFull = fold [treeList1, treeList2, treeList3, treeList4] + treeListFull = treeList1 <> treeList2 <> treeList3 <> treeList4 + -} + + do + -- logWith LogInfo ("L455: " <> (show (numReplicates,numToSave))) + treeList1 ← + {-# SCC buildTree_treeList1 #-} + if hasKey "rdwag" + then + randomizedDistanceWagner + simpleTreeOnly + inGS + inData + nameStringVect + distMatrix + outgroupElem + numReplicates + numToSave + refinement + else pure [] + treeList2 ← + if hasKey "dwag" + then distanceWagner simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + treeList3 ← + if hasKey "nj" + then neighborJoin simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + treeList4 ← + if hasKey "wpgma" + then wPGMA simpleTreeOnly inGS inData nameStringVect distMatrix outgroupElem refinement + else pure [] + + -- let treeListFull = fold [treeList1, treeList2, treeList3, treeList4] + let treeListFull = treeList1 <> treeList2 <> treeList3 <> treeList4 + logWith LogInfo "\tBuilding Distance Tree\n" + case treeListFull of + [] → errorWithoutStackTrace $ "Distance build is specified, but without any method: " <> show inArgs + xs → do + logWith LogMore $ (getBuildLogMessage "Distance" "yielded" "trees" $ xs) <> "\n" + if not simpleTreeOnly + then pure xs + else GO.selectGraphs Best 1 0 xs + in do + failWhen (not checkCommandList) $ "Unrecognized command in 'build': " <> show inArgs + failWhen (buildDistance && buildCharacter) $ + "Cannot specify both 'character' and 'distance' builds in same build command" <> show inArgs + numReplicates ← case filterKey "replicates" of + [] → pure 10 + [x] → case readMaybe x ∷ Maybe Int of + Just i → pure i + Nothing → failParseKeyInteger "replicates" x + _ → failParseKeyDuplicates "replicate" inArgs + numToSave ← case filterKey "best" of + [] → pure numReplicates + [x] → case readMaybe x ∷ Maybe Int of + Just i → pure i + Nothing → failParseKeyInteger "best" x + _ → failParseKeyDuplicates "best" inArgs + + if buildDistance + then performBuildDistance numReplicates numToSave + else -- else performBuildCharacter numReplicates + do + -- character build + treeList ← WB.rasWagnerBuild inGS inData numReplicates + treeList' ← GO.selectGraphs Best 1 0 treeList + if simpleTreeOnly + then do + logWith LogMore $ (getBuildLogMessage "Character" "yielded" "trees" treeList') <> "\n" + pure treeList' + else do + logWith LogMore $ (getBuildLogMessage "Character" "yielded" "trees" treeList) <> "\n" + pure treeList + + +{- | distanceWagner takes Processed data and pairwise distance matrix and returns +'best' addition sequence Wagner (defined in Farris, 1972) as fully decorated tree (as Graph) +-} +distanceWagner + ∷ Bool → GlobalSettings → ProcessedData → V.Vector String → M.Matrix Double → Int → String → PhyG [ReducedPhylogeneticGraph] +distanceWagner simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = + do + distWagTreeList ← DM.doWagnerS inGS leafNames distMatrix "closest" outgroupValue "best" 1 [] + let distWagTree = head distWagTreeList + dWagRefined ← DW.performRefinement refinement "best:1" "first" leafNames outgroupValue distWagTree + let distWagTree' = head dWagRefined + let distWagTreeSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 distWagTree') + let charInfoVV = V.map thd3 $ thd3 inData + if not simpleTreeOnly + then do + result ← + T.multiTraverseFullyLabelGraphReduced + inGS + inData + False + False + Nothing + (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) distWagTreeSimpleGraph) + return $ [result] + else + let simpleWag = GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) distWagTreeSimpleGraph + in return $ [(simpleWag, 0.0, LG.empty, V.empty, charInfoVV)] + + +{- | randomizedDistanceWagner takes Processed data and pairwise distance matrix and returns +random addition sequence Wagner trees fully decorated tree (as Graph) +-} +randomizedDistanceWagner + ∷ Bool + → GlobalSettings + → ProcessedData + → V.Vector String + → M.Matrix Double + → Int + → Int + → Int + → String + → PhyG [ReducedPhylogeneticGraph] +randomizedDistanceWagner simpleTreeOnly inGS inData leafNames distMatrix outgroupValue numReplicates numToKeep refinement = + {-# SCC randomizedDistanceWagner_TOP_DEF #-} + -- set up parallel structures + let refineAction ∷ TreeWithData → PhyG [TreeWithData] + refineAction = DW.performRefinement refinement "best:1" "first" leafNames outgroupValue + + traverseGraphAction ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + traverseGraphAction g0 = + {-# SCC randomizedDistanceWagner_traverseGraphAction #-} + do + let -- debugger :: (Logger m, Show a, Show b, Show c) => a -> LG.Gr b c -> m () + -- debugger n g = logWith LogTech $ "In: 'randomizedDistanceWagner.traverseGraphAction'\n Graph [ G_" <> show n <> " ]:\n" <> LG.prettify g + -- debugger 0 g0 + let g1 = LG.switchRootTree (length leafNames) g0 + -- debugger 1 g1 + let g2 = GO.dichotomizeRoot outgroupValue g1 + -- debugger 2 g2 + result@(g3, _, _, _, _) ← T.multiTraverseFullyLabelGraphReduced inGS inData False False Nothing g2 + -- debugger 3 g3 + pure result + + dichotomizeAction ∷ SimpleGraph → SimpleGraph + dichotomizeAction = GO.dichotomizeRoot outgroupValue . (LG.switchRootTree (length leafNames)) + + directedGraphAction ∷ TreeWithData → SimpleGraph + directedGraphAction = DU.convertToDirectedGraphText leafNames outgroupValue . snd4 + + leafIndexVec ∷ V.Vector Int + leafIndexVec = V.generate (V.length leafNames) id + + charInfoVV ∷ V.Vector (V.Vector CharInfo) + charInfoVV = V.map thd3 $ thd3 inData + in do + randomizedAdditionSequences ← replicateM numReplicates $ shuffleList leafIndexVec + randomizedAdditionWagnerTreeList ← + DM.doWagnerS inGS leafNames distMatrix "random" outgroupValue "random" numToKeep randomizedAdditionSequences + + let randomizedAdditionWagnerTreeList' = take numToKeep $ L.sortOn thd4 randomizedAdditionWagnerTreeList + + randomizedAdditionWagnerTreeList'' ∷ [TreeWithData] ← case refinement of + "none" → pure randomizedAdditionWagnerTreeList' + _ → + getParallelChunkTraverse >>= \pTraverse → + fmap fold $ + refineAction `pTraverse` randomizedAdditionWagnerTreeList' + + randomizedAdditionWagnerSimpleGraphList ← + getParallelChunkMap <&> \pMap → + directedGraphAction `pMap` randomizedAdditionWagnerTreeList'' + + let resultingRandomizedGraphs + | not simpleTreeOnly = + getParallelChunkTraverse >>= \pTraverse → traverseGraphAction `pTraverse` randomizedAdditionWagnerSimpleGraphList + | otherwise = + let numTrees = length randomizedAdditionWagnerSimpleGraphList + in do + simpleRDWagList ← + getParallelChunkMap <&> \pMap → + dichotomizeAction `pMap` randomizedAdditionWagnerSimpleGraphList + pure $ + L.zip5 + simpleRDWagList + (replicate numTrees 0.0) + (replicate numTrees LG.empty) + (replicate numTrees V.empty) + (replicate numTrees charInfoVV) + + resultingRandomizedGraphs + + +{- | neighborJoin takes Processed data and pairwise distance matrix and returns +Neighbor-Joining tree as fully decorated tree (as Graph) +-} +neighborJoin + ∷ Bool → GlobalSettings → ProcessedData → V.Vector String → M.Matrix Double → Int → String → PhyG [ReducedPhylogeneticGraph] +neighborJoin simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = + do + njTree ← DM.neighborJoining leafNames distMatrix outgroupValue + njRefined ← DW.performRefinement refinement "best:1" "first" leafNames outgroupValue njTree + let njTree' = head njRefined + let njSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 njTree') + let charInfoVV = V.map thd3 $ thd3 inData + if not simpleTreeOnly + then do + result ← + T.multiTraverseFullyLabelGraphReduced + inGS + inData + False + False + Nothing + (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) njSimpleGraph) + pure [result] + else do + let simpleNJ = GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) njSimpleGraph + pure [(simpleNJ, 0.0, LG.empty, V.empty, charInfoVV)] + + +{- | wPGMA takes Processed data and pairwise distance matrix and returns +WPGMA tree as fully decorated tree (as Graph) +since root index not nOTUs as with other tres--chanegd as with dWag and NJ to make consistent. +-} +wPGMA + ∷ Bool → GlobalSettings → ProcessedData → V.Vector String → M.Matrix Double → Int → String → PhyG [ReducedPhylogeneticGraph] +wPGMA simpleTreeOnly inGS inData leafNames distMatrix outgroupValue refinement = + do + wpgmaTree ← DM.wPGMA leafNames distMatrix outgroupValue + wpgmaRefined ← DW.performRefinement refinement "best:1" "first" leafNames outgroupValue wpgmaTree + let wpgmaTree' = head wpgmaRefined + let wpgmaSimpleGraph = DU.convertToDirectedGraphText leafNames outgroupValue (snd4 wpgmaTree') + let charInfoVV = V.map thd3 $ thd3 inData + if not simpleTreeOnly + then do + result ← + T.multiTraverseFullyLabelGraphReduced + inGS + inData + False + False + Nothing + (GO.renameSimpleGraphNodes $ GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) wpgmaSimpleGraph) + return $ [result] + else + let simpleWPGMA = GO.dichotomizeRoot outgroupValue $ LG.switchRootTree (length leafNames) wpgmaSimpleGraph + in return [(simpleWPGMA, 0.0, LG.empty, V.empty, charInfoVV)] + + +{- +### ### ### +### Log Helper functions +### ### ### +-} + +getBuildLogMessage ∷ (Show b, Ord b) ⇒ String → String → String → [(a, b, c, d, e)] → String +getBuildLogMessage pref verb obj list = + let listLength = show $ length list + costRangeStr = case list of + [] → mempty + x : xs → + let costValues = snd5 <$> x :| xs + in "at cost range " <> show (minimum costValues, maximum costValues) <> "\n" + in unwords ["\t" <> pref, "build", verb, listLength, obj, costRangeStr] + + +failParse ∷ String → Evaluation env a +failParse = failWithPhase Parsing + + +failParseKeyDuplicates ∷ (Show a) ⇒ String → a → PhyG b +failParseKeyDuplicates key args = failParse $ fold ["Multiple '", key, "' number specifications in command--can have only one: ", show args] + + +failParseKeyInteger ∷ (Show a) ⇒ String → a → PhyG b +failParseKeyInteger key val = failParse $ fold ["Key '", key, "' specification in 'build' not an integer: ", show val] + + +failWhen ∷ Bool → String → PhyG () +failWhen cond = when cond . failParse diff --git a/src/Search/DistanceMethods.hs b/src/Search/DistanceMethods.hs new file mode 100644 index 000000000..21774703a --- /dev/null +++ b/src/Search/DistanceMethods.hs @@ -0,0 +1,394 @@ +{-# LANGUAGE BangPatterns #-} + +{- | +Module to calculate distance tree construction methods Neightbor-Joining, WPGMA, +and WPGMAcbut with added refinement based on 4-point metric. +-} +module Search.DistanceMethods (neighborJoining, wPGMA, doWagnerS, performWagnerRefinement) where + +import Control.Monad (when) +import Data.Number.Transfinite as NT +import Data.Vector qualified as V +import GeneralUtilities +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Search.DistanceWagner qualified as W +import SymMatrix qualified as M +import Types.DistanceTypes +import Types.Types +import Utilities.DistanceUtilities + + +{- | wPGMA takes a list of leaves and a distance matrixx and returns +an WGPGMA tree +WPGMA not UPGMA since the linkages are from the two descendent linkages and not weighted by +the number of taxa in each group as in https://en.wikipedia.org/wiki/WPGMA +-} +wPGMA ∷ V.Vector String → M.Matrix Double → Int → PhyG TreeWithData +wPGMA leafNames distMatrix outgroup = + if M.null distMatrix + then error "Null matrix in WPGMA" + else + let numLeaves = V.length leafNames + leafVertexVect = V.fromList [0 .. (numLeaves - 1)] + in do + ((vertexVect, edgeVect), finalMatrix) ← addTaxaWPGMA distMatrix numLeaves (leafVertexVect, V.empty) [] + let wPGMATree' = convertLinkagesToEdgeWeights vertexVect (V.toList edgeVect) (V.toList edgeVect) [] numLeaves + -- linkage leves are not same as edgeweights so need to be converted + let newickString = convertToNewick leafNames outgroup wPGMATree' + let treeCost = getTreeCost wPGMATree' + + logWith LogInfo "\n\tBuilding WPGMA tree\n" + -- trace (show wPGMATree <> "\n" <> show wPGMATree') + return $ (take (length newickString - 3) newickString <> "[" <> show treeCost <> "];\n", wPGMATree', treeCost, finalMatrix) + + +-- | pulls dWagner function from module Wagner +doWagnerS + ∷ GlobalSettings → V.Vector String → M.Matrix Double → String → Int → String → Int → [V.Vector Int] → PhyG [TreeWithData] +doWagnerS inGS leafNames distMatrix firstPairMethod outgroup addSequence numToKeep replicateSequences = + do + logWith LogInfo ("\tBuilding " <> show (length replicateSequences) <> " Wagner tree(s)" <> "\n") + W.doWagnerS inGS leafNames distMatrix firstPairMethod outgroup addSequence numToKeep replicateSequences + + +-- | pulls Wagner refinement from Wagner module +performWagnerRefinement ∷ String → String → String → V.Vector String → Int → TreeWithData → PhyG [TreeWithData] +performWagnerRefinement = W.performRefinement + + +{- | neighborJoining takes a list of leaves and a distance matrixx and returns +an NJ tree +-} +neighborJoining ∷ V.Vector String → M.Matrix Double → Int → PhyG TreeWithData +neighborJoining leafNames distMatrix outgroup = + if M.null distMatrix + then error "Null matrix in neighborJoining" + else do + logWith LogInfo "\n\tBuilding NJ tree\n" + -- get intial matrices + let numLeaves = V.length leafNames + let leafVertexVect = V.fromList [0 .. (numLeaves - 1)] + (nJTree, finalLittleDMatrix) ← addTaxaNJ distMatrix numLeaves (leafVertexVect, V.empty) [] + let newickString = convertToNewick leafNames outgroup nJTree + let treeCost = getTreeCost nJTree + + return $ (take (length newickString - 3) newickString <> "[" <> show treeCost <> "];\n", nJTree, treeCost, finalLittleDMatrix) + + +-- | sumAvail sums the row only thos values not already added to tree +sumAvail ∷ [Int] → Int → [Double] → Double +sumAvail vertInList index distList + | null distList = 0.0 + | index `elem` vertInList = sumAvail vertInList (index + 1) (tail distList) + | otherwise = + let firstDist = head distList + in firstDist + sumAvail vertInList (index + 1) (tail distList) + + +-- | makeDMatrixRow make a single row of the bif D matrix +makeDMatrixRow ∷ M.Matrix Double → [Int] → Int → Int → V.Vector Double -- V.Seq Double +makeDMatrixRow inObsMatrix vertInList column row + | M.null inObsMatrix = error "Null matrix in makeInitialDMatrix" + | row `elem` vertInList = V.replicate (V.length (inObsMatrix V.! row)) NT.infinity + | column == V.length (inObsMatrix V.! row) = V.empty + | column == row = V.cons 0.0 (makeDMatrixRow inObsMatrix vertInList (column + 1) row) + | column `notElem` vertInList = + let dij = inObsMatrix M.! (row, column) + divisor = (fromIntegral (M.rows inObsMatrix) - 2) - fromIntegral (length vertInList) + ri = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix row) + rj = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix column) + bigDij = dij - ((ri + rj) / divisor) + in V.cons bigDij (makeDMatrixRow inObsMatrix vertInList (column + 1) row) + | otherwise = V.cons NT.infinity (makeDMatrixRow inObsMatrix vertInList (column + 1) row) + + +{- | makeIDMatrix makes adjusted matrix (D) from observed (d) values +assumes matrix is square and symmetrical +makes values Infinity if already added +adjust ri and rj to bew based on on values not in termInList +does by row so can be parallelized call with column = 0 update list [] +makes DMatrix direclty not via M.updateMatrix +-} +makeDMatrix' ∷ M.Matrix Double → [Int] → Int → Int → [(Int, Int, Double)] → M.Matrix Double +makeDMatrix' inObsMatrix vertInList row column updateList + | M.null inObsMatrix = error "Null matrix in makeInitialDMatrix" + | row == M.rows inObsMatrix = M.updateMatrix inObsMatrix updateList + | column == M.cols inObsMatrix = makeDMatrix' inObsMatrix vertInList (row + 1) 0 updateList + | column == row = makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, 0.0) : updateList) + | (column `elem` vertInList) || (row `elem` vertInList) = + makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, NT.infinity) : updateList) + | otherwise = + let dij = inObsMatrix M.! (row, column) + divisor = (fromIntegral (M.rows inObsMatrix) - 2) - fromIntegral (length vertInList) + ri = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix row) + rj = (sumAvail vertInList 0 $ M.getFullRow inObsMatrix column) + bigDij = dij - ((ri + rj) / divisor) + in makeDMatrix' inObsMatrix vertInList row (column + 1) ((row, column, bigDij) : updateList) + + +{- | makeIDMatrix makes adjusted matrix (D) from observed (d) values +assumes matrix is square and symmetrical +makes values Infinity if already added +adjust ri and rj to be based on on values not in termInList +does by row so can be parallelized call with column = 0 update list [] +makes DMatrix direclty not via M.updateMatrix +-} +makeDMatrix ∷ M.Matrix Double → [Int] → PhyG (M.Matrix Double) +makeDMatrix inObsMatrix vertInList = + if M.null inObsMatrix + then error "Null matrix in makeInitialDMatrix" + else + let makeRowAction ∷ Int → V.Vector Double + makeRowAction = makeDMatrixRow inObsMatrix vertInList 0 + in do + makeDRowPar ← getParallelChunkMap + let newMatrix = makeDRowPar makeRowAction [0 .. (M.rows inObsMatrix - 1)] + + pure $ V.fromList newMatrix + + +{- | pickNearestUpdateMatrix takes d and D matrices, pickes nearest based on D +then updates d and D to reflect new node and distances created +updates the column/row for vertices that are joined to be infinity so +won't be chosen to join again +-} +pickNearestUpdateMatrixNJ ∷ M.Matrix Double → [Int] → PhyG (M.Matrix Double, Vertex, Edge, Edge, [Int]) +pickNearestUpdateMatrixNJ littleDMatrix vertInList = + if M.null littleDMatrix + then error "Null d matrix in pickNearestUpdateMatrix" + else do + newMatrix ← makeDMatrix littleDMatrix vertInList + minPairResult ← getMatrixMinPairTabu newMatrix vertInList + let (iMin, jMin, distIJ) = minPairResult + -- (iMin, jMin, distIJ) = getMatrixMinPairTabu (makeDMatrix' littleDMatrix vertInList 0 0 []) vertInList + + -- trace ("First pair " <> show (iMin, jMin, distIJ) <> " matrix: " <> (show (makeDMatrix littleDMatrix vertInList)) + -- <> "\nVertsIn: " <> show vertInList <> " matrix': " <> show (makeDMatrix' littleDMatrix vertInList 0 0 [])) ( + if distIJ == NT.infinity + then error "No minimum found in pickNearestUpdateMatrix" + else + let -- new vertex is size of distance matrix (0 indexed) + newVertIndex = M.rows littleDMatrix + dij = littleDMatrix M.! (iMin, jMin) + divisor = fromIntegral (M.rows littleDMatrix) - 2 - fromIntegral (length vertInList) + -- only count values of those in + ri = (sumAvail vertInList 0 $ M.getFullRow littleDMatrix iMin) + rj = (sumAvail vertInList 0 $ M.getFullRow littleDMatrix jMin) + -- seem reversed compared to examples, seems arbitrary to me (for leaf pairs at least) + -- diMinNewVert = (dij / 2.0) - ((ri - rj) / (2.0 * divisor)) + -- djMinNewVert = dij - diMinNewVert + djMinNewVert = (dij / 2.0) - ((ri - rj) / (2.0 * divisor)) + diMinNewVert = dij - djMinNewVert + + newVertInList = vertInList <> [iMin, jMin] + + -- get distances to existing vertices + otherVertList = [0 .. (M.rows littleDMatrix - 1)] + + getNewDistAction ∷ Int → Double + getNewDistAction = getNewDist littleDMatrix dij iMin jMin diMinNewVert djMinNewVert + in do + -- NOTE: This may be a problem for parallel calls but looks OK + newDistPar ← getParallelChunkMap + let newLittleDRow = newDistPar getNewDistAction otherVertList + -- newLittleDRow = PU.seqParMap PU.myStrategyR0 (getNewDist littleDMatrix dij iMin jMin diMinNewVert djMinNewVert) otherVertList -- `using` myParListChunkRDS + let newLittleDMatrix = M.addMatrixRow littleDMatrix (V.fromList $ newLittleDRow <> [0.0]) + -- recalculate whole D matrix since new row affects all the original ones (except those merged) + -- included vertex values set to infinity so won't be chosen later + -- newBigDMatrix = makeDMatrix newLittleDMatrix newVertInList -- 0 0 [] + + -- create new edges + let newEdgeI = (newVertIndex, iMin, diMinNewVert) + let newEdgeJ = (newVertIndex, jMin, djMinNewVert) + + pure (newLittleDMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) + + +-- ) + +-- | getNewDist get ditance of new vertex to existing vertices +getNewDist ∷ M.Matrix Double → Double → Int → Int → Double → Double → Int → Double +getNewDist littleDMatrix dij iMin jMin diMinNewVert djMinNewVert otherVert + | otherVert == iMin = diMinNewVert + | otherVert == jMin = djMinNewVert + | otherwise = + let dik = littleDMatrix M.! (iMin, otherVert) + djk = littleDMatrix M.! (jMin, otherVert) + in (dik + djk - dij) / 2.0 + + +{- | addTaxaNJ recursively calls pickNearestUpdateMatrix untill all internal nodes are created +recursively called until all (n - 2) internal vertices are created. +-} +addTaxaNJ ∷ M.Matrix Double → Int → Tree → [Int] → PhyG (Tree, M.Matrix Double) +addTaxaNJ littleDMatrix numLeaves (vertexVect, edgeVect) vertInList = + if V.length vertexVect == (2 * numLeaves) - 2 + then + let -- (iMin, jMin, _) = getMatrixMinPairTabu (makeDMatrix littleDMatrix vertInList) vertInList + last2 = subtractVector (V.fromList vertInList) vertexVect + iMin = last2 V.! 0 + jMin = last2 V.! 1 + {-This is wrong--not last two + iMin = vertexVect V.! ((V.length vertexVect) - 1) + jMin = vertexVect V.! ((V.length vertexVect) - 2) + -} + lastEdge = (iMin, jMin, littleDMatrix M.! (iMin, jMin)) + in {- + trace (show last2 <> " from " <> show vertexVect <> "\nlast edge: " <> " size " <> (show $ V.length vertexVect) <> " matrix: " <> (show littleDMatrix) + <> " edge: " + <> (show lastEdge) <> " vertInList: " <> show vertInList) + -} + return ((vertexVect, edgeVect `V.snoc` lastEdge), littleDMatrix) + else -- more to add + do + pickNearestResult ← pickNearestUpdateMatrixNJ littleDMatrix vertInList + let (newLittleDMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) = pickNearestResult + let newVertexVect = vertexVect `V.snoc` newVertIndex + let newEdgeVect = edgeVect <> V.fromList [newEdgeI, newEdgeJ] + + -- trace (M.showMatrixNicely newLittleDMatrix <> "\n" <> M.showMatrixNicely bigDMatrix) + let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (V.length vertexVect - numLeaves))/fromIntegral (numLeaves - 2)) :: Double) + let (percentAdded, _) = divMod (100 * (V.length vertexVect - numLeaves)) (numLeaves - 2) + let (decileNumber, decileRemainder) = divMod percentAdded 10 + let (_, oddRemainder) = divMod (V.length vertexVect - numLeaves) 2 + + if decileRemainder == 0 && oddRemainder == 0 + then do + logWith LogInfo ("\t" <> show (10 * decileNumber) <> "%") + addTaxaNJ newLittleDMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList + else addTaxaNJ newLittleDMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList + + +-- | addTaxaWPGMA perfomrs recursive reduction of distance matrix until all internal vertices are created +addTaxaWPGMA ∷ M.Matrix Double → Int → Tree → [Int] → PhyG (Tree, M.Matrix Double) +addTaxaWPGMA distMatrix numLeaves (vertexVect, edgeVect) vertInList = + if V.length vertexVect == (2 * numLeaves) - 2 + then + let -- (iMin, jMin, _) = getMatrixMinPairTabu distMatrix vertInList -- (-1, -1, NT.infinity) 0 0 + last2 = subtractVector (V.fromList vertInList) vertexVect + iMin = last2 V.! 0 + jMin = last2 V.! 1 + lastEdge = (iMin, jMin, distMatrix M.! (iMin, jMin)) + in {- + trace ((show last2) <> " from " <> show vertexVect <> "\nlast edge: " <> " size " <> (show $ V.length vertexVect) <> " matrix: " <> (show distMatrix) + <> " last edge: " + <> (show lastEdge) <> " vertInList: " <> (show vertInList) + <> " all edges " <> show (edgeVect `V.snoc` lastEdge)) + -} + pure ((vertexVect, edgeVect `V.snoc` lastEdge), distMatrix) + else do + -- building + upfdateResult ← pickUpdateMatrixWPGMA distMatrix vertInList + let (newDistMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) = upfdateResult + let newVertexVect = vertexVect `V.snoc` newVertIndex + let newEdgeVect = edgeVect <> V.fromList [newEdgeI, newEdgeJ] + + -- trace (M.showMatrixNicely distMatrix) + let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (V.length vertexVect - numLeaves))/fromIntegral (numLeaves - 2)) :: Double) + let (percentAdded, _) = divMod (100 * (V.length vertexVect - numLeaves)) (numLeaves - 2) + let (decileNumber, decileRemainder) = divMod percentAdded 10 + let (_, oddRemainder) = divMod (V.length vertexVect - numLeaves) 2 + + if decileRemainder == 0 && oddRemainder == 0 + then do + logWith LogInfo ("\t" <> show (10 * decileNumber) <> "%") + addTaxaWPGMA newDistMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList + else do + addTaxaWPGMA newDistMatrix numLeaves (newVertexVect, newEdgeVect) newVertInList + + +{- | pickUpdateMatrixWPGMA takes d matrix, pickes closesst based on d +then updates d to reflect new node and distances created +updates the column/row for vertices that are joined to be infinity so +won't be chosen to join again +-} +pickUpdateMatrixWPGMA ∷ M.Matrix Double → [Int] → PhyG (M.Matrix Double, Vertex, Edge, Edge, [Int]) +pickUpdateMatrixWPGMA distMatrix vertInList = + if M.null distMatrix + then error "Null d matrix in pickNearestUpdateMatrix" + else do + minPairResult ← getMatrixMinPairTabu distMatrix vertInList -- (-1, -1, NT.infinity) 0 0 + let (iMin, jMin, dij) = minPairResult + + -- trace ("First pair " <> show (iMin, jMin, dij)) ( + if dij == NT.infinity + then error "No minimum found in pickNearestUpdateMatrix" + else + let -- new vertex is size of distance matrix (0 indexed) + newVertIndex = M.rows distMatrix + + diMinNewVert = dij / 2.0 + djMinNewVert = dij / 2.0 + + newVertInList = vertInList <> [iMin, jMin] + + -- get distances to existing vertices + otherVertList = [0 .. (M.rows distMatrix - 1)] + + wpgmaAction ∷ Int → Double + wpgmaAction = getNewDistWPGMA distMatrix iMin jMin diMinNewVert djMinNewVert + in do + wpgmaPar ← getParallelChunkMap + let newDistRow = wpgmaPar wpgmaAction otherVertList + -- newDistRow = PU.seqParMap PU.myStrategyR0 (getNewDistWPGMA distMatrix iMin jMin diMinNewVert djMinNewVert) otherVertList -- `using` myParListChunkRDS + let newDistMatrix = M.addMatrixRow distMatrix (V.fromList $ newDistRow <> [0.0]) + + -- create new edges + let newEdgeI = (newVertIndex, iMin, diMinNewVert) + let newEdgeJ = (newVertIndex, jMin, djMinNewVert) + + pure (newDistMatrix, newVertIndex, newEdgeI, newEdgeJ, newVertInList) + + +-- | getNewDistWPGMA get ditance of new vertex to existing vertices WPGMA--cluster levels +getNewDistWPGMA ∷ M.Matrix Double → Int → Int → Double → Double → Int → Double +getNewDistWPGMA distMatrix iMin jMin diMinNewVert djMinNewVert otherVert + | otherVert == iMin = diMinNewVert + | otherVert == jMin = djMinNewVert + | otherwise = + let dik = distMatrix M.! (iMin, otherVert) + djk = distMatrix M.! (jMin, otherVert) + in (dik + djk) / 2.0 + + +{- | convertLinkagesToEdgeWeights converts linake leevs to branch lengths +by subtracting descnedent linakge from edge weight +edges are created in linakege order so can use that direction, leaves are +always 2nd element in edge +-} +convertLinkagesToEdgeWeights ∷ V.Vector Vertex → [Edge] → [Edge] → [Edge] → Int → Tree +convertLinkagesToEdgeWeights vertexVect fullEdgeList inEdgeList curEdgeList numLeaves + | null fullEdgeList = error "Null edge set in convertLinkagesToEdgeWeights" + | V.null vertexVect = error "Null vertex set in convertLinkagesToEdgeWeights" + | null inEdgeList = (vertexVect, V.fromList curEdgeList) + | null curEdgeList = + -- first case, take last edge (highest linkage) special case need + -- to subtract both descendent linkages + let (eVert, uVert, linkage) = last inEdgeList + eDescLinkage = if eVert >= numLeaves then getWeightDescLink eVert fullEdgeList else 0.0 + uDescLinkage = if uVert >= numLeaves then getWeightDescLink uVert fullEdgeList else 0.0 + newEdgeList = [(eVert, uVert, linkage - eDescLinkage - uDescLinkage)] + in convertLinkagesToEdgeWeights vertexVect fullEdgeList (init inEdgeList) newEdgeList numLeaves + | otherwise -- not first but still some edegs to go + = + let (eVert, uVert, linkage) = head inEdgeList + uDescLinkage = if uVert >= numLeaves then getWeightDescLink uVert fullEdgeList else 0.0 + newEdgeList = (eVert, uVert, linkage - uDescLinkage) : curEdgeList + in convertLinkagesToEdgeWeights vertexVect fullEdgeList (tail inEdgeList) newEdgeList numLeaves + + +{- | getWeightDescLink takes the m,pore derived (smaller linkage, 2nd vertex of edge) and returns +the weight of that edge (assumes not leaf--checked earlier) +-} +getWeightDescLink ∷ Int → [Edge] → Double +getWeightDescLink uVert fullEdgeList = + if null fullEdgeList + then error "Edge not found in getWeightDescLink" + else + let (eVert, _, weight) = head fullEdgeList + in if eVert == uVert + then weight + else getWeightDescLink uVert (tail fullEdgeList) diff --git a/src/Search/DistanceWagner.hs b/src/Search/DistanceWagner.hs new file mode 100644 index 000000000..5d5b7c2a9 --- /dev/null +++ b/src/Search/DistanceWagner.hs @@ -0,0 +1,1165 @@ +{- +Need to integerize costs for swapping very slow on Double values + do due spurious precision +-} + +{- | +Module with distance tree construction methods Distance Wagner Farris 1972 +but with added refinement based on 4-point metric. +-} +module Search.DistanceWagner (doWagnerS, performRefinement) where + +import Control.DeepSeq (force) +import Control.Monad (when) +import Control.Parallel.Strategies +import Data.Functor ((<&>)) +import Data.List qualified as L +import Data.Maybe +import Data.Number.Transfinite qualified as NT +import Data.Vector qualified as V +import GeneralUtilities +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as M +import Types.DistanceTypes +import Types.Types +import Utilities.DistanceUtilities +import Utilities.Utilities (listApplying, strict3of4) + + +{- | getStartingPair returns starying pair for Wagner build + closts mnimal cost pair + furthest maximal cost pair + random chooses uniformly at random from leaf set +-} +getStartingPair ∷ String → M.Matrix Double → Edge +getStartingPair choiceOpt distMatrix + | choiceOpt == "closest" = getMatrixMinPair distMatrix (-1, -1, NT.infinity) 0 0 + | choiceOpt == "furthest" = getMatrixMaxPair distMatrix (-1, -1, 0 ∷ Double) 0 0 + | choiceOpt == "random" = errorWithoutStackTrace "Initial pair option 'random' not yet implemented" + | otherwise = + errorWithoutStackTrace ("Initial pair option " <> choiceOpt <> " unrecognized. Must be 'closest', 'furthest', or 'random'") + + +-- | getBestEdgeTree take list of edge tuples and return trhe one with best addition cost +getBestEdgeTree + ∷ V.Vector (Double, Tree, M.Matrix Double) → Double → (Double, Tree, M.Matrix Double) → (Double, Tree, M.Matrix Double) +getBestEdgeTree edgeTreeList curBestCost curBestResult = + if V.null edgeTreeList + then curBestResult + else + let (firstAddCost, _, _) = V.head edgeTreeList + in if firstAddCost < curBestCost + then getBestEdgeTree (V.tail edgeTreeList) firstAddCost (V.head edgeTreeList) + else getBestEdgeTree (V.tail edgeTreeList) curBestCost curBestResult + + +{- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix +but this for swap so returns entire new3-edge cost so not Farris triangle it is sum of three diveded by 2 +-} +addToEdgeSwap ∷ M.Matrix Double → Int → Tree → Int → Edge → (Double, Tree, M.Matrix Double) +addToEdgeSwap distMatrix leaf initialTree newLeafIndex inEdge = + let (eVertex, uVertex, inWeight) = inEdge + (initialVertexVect, initialEdgeVect) = initialTree + addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 + eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost + uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost + newVertexVect = V.snoc initialVertexVect leaf + newEdges = V.fromList [(leaf, newLeafIndex, addCost), (eVertex, newLeafIndex, eVertLeafDist), (uVertex, newLeafIndex, uVertLeafDist)] + cleanupEdges = V.filter (/= inEdge) initialEdgeVect + newEdgeVect = cleanupEdges <> newEdges + newTree = (newVertexVect, newEdgeVect) + -- add new costs from added vertex to each reamaining leaf + augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf + in (addCost + eVertLeafDist + uVertLeafDist - inWeight, newTree, augmentedDistMatrix) + + +-- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix +addToEdge ∷ M.Matrix Double → Int → Tree → Int → Edge → (Double, Tree, M.Matrix Double) +addToEdge distMatrix leaf initialTree newLeafIndex inEdge = + -- trace ("In addToEdge with " <> (show (leaf, initialTree, newLeafIndex, (M.rows distMatrix), inEdge))) ( + let (eVertex, uVertex, _) = inEdge + (initialVertexVect, initialEdgeVect) = initialTree + addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 + eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost + uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost + newVertexVect = V.snoc initialVertexVect leaf + newEdges = V.fromList [(leaf, newLeafIndex, addCost), (eVertex, newLeafIndex, eVertLeafDist), (uVertex, newLeafIndex, uVertLeafDist)] + cleanupEdges = V.filter (/= inEdge) initialEdgeVect + newEdgeVect = V.map orderEdge $ cleanupEdges <> newEdges + newTree = (newVertexVect, newEdgeVect) + -- add new costs from added vertex to each reamaining leaf + augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf + in (addCost, newTree, augmentedDistMatrix) + + +{- | addTaxonToTree takes distMatrix, an initialTree, Vector of leavesToAdd, and leaf index to add +and retursn a tuple wiht the addition cost, the new tree, the new leaves to add, and new distance matrix (enhanced) +-} +addTaxonToTree ∷ M.Matrix Double → Tree → V.Vector Int → Int → Int → (Double, Tree, V.Vector Int, M.Matrix Double) +addTaxonToTree distMatrix initialTree leavesToAdd newVertexIndex leaf = + if V.null leavesToAdd + then (0.0, initialTree, leavesToAdd, distMatrix) + else + let leavesRemaining = V.filter (/= leaf) leavesToAdd + (_, edgesInitial) = initialTree + + -- Parallelize heretoo much and destroys lazy matrix update + addEdgeList = V.map (addToEdge distMatrix leaf initialTree newVertexIndex) edgesInitial + (firstAddCost, _, _) = V.head addEdgeList -- this to initialize getBestEdge below + in -- filter for best addition point + let (addCost, newTree, augmentedDistMatrix) = getBestEdgeTree (V.tail addEdgeList) firstAddCost (V.head addEdgeList) + in (addCost, newTree, leavesRemaining, augmentedDistMatrix) + + +-- | getBestLeafAdd chooses best leaf to add based on cost field +getBestLeafAdd + ∷ V.Vector (Double, Tree, V.Vector Int, M.Matrix Double) + → Double + → (Double, Tree, V.Vector Int, M.Matrix Double) + → (Double, Tree, V.Vector Int, M.Matrix Double) +getBestLeafAdd addPosVect curBestCost curBestLeaf = + if V.null addPosVect + then curBestLeaf + else + let (thisCost, _, _, _) = V.head addPosVect + in if thisCost < curBestCost + then getBestLeafAdd (V.tail addPosVect) thisCost (V.head addPosVect) + else getBestLeafAdd (V.tail addPosVect) curBestCost curBestLeaf + + +{- | wagBest takes distMatrix, and intial tree of two leaves, a vector of leavesToAdd, the input nuber of leaves +and returns the Farris 1972 distance Wagner adding the "closest" leaf at each iteration +-} +wagBest + ∷ M.Matrix Double → Tree → V.Vector Int → Int → Int → V.Vector Int → String → PhyG (Tree, V.Vector Int, M.Matrix Double) +wagBest distMatrix inTree leavesToAdd nOTUs newVertexIndex leavesToMap choiceOpt + | length leavesToAdd == nOTUs = + let (eVertex, uVertex, edgeWeight) = orderEdge $ getStartingPair choiceOpt distMatrix + initialTree = (V.fromList [eVertex, uVertex], V.fromList [(eVertex, uVertex, edgeWeight)]) + leavesToAdd' = V.filter (/= eVertex) $ V.filter (/= uVertex) leavesToAdd + in wagBest distMatrix initialTree leavesToAdd' nOTUs nOTUs leavesToAdd' choiceOpt + | V.null leavesToAdd = return (inTree, leavesToAdd, distMatrix) + | otherwise = + let addPosVect = V.map (addTaxonToTree distMatrix inTree leavesToAdd newVertexIndex) leavesToMap + (firstLeafCost, _, _, _) = V.head addPosVect -- To initialize below + (_, newTree, newLeavesToAdd, augmentedDistMatrix) = getBestLeafAdd (V.tail addPosVect) firstLeafCost (V.head addPosVect) + in let -- progress = takeWhile (/='.') $ show ((fromIntegral (100 * (newVertexIndex - nOTUs))/fromIntegral (nOTUs - 2)) :: Double) + (percentAdded, _) = divMod (100 * (newVertexIndex - nOTUs)) (nOTUs - 2) + (decileNumber, decileRemainder) = divMod percentAdded 10 + (_, oddRemainder) = divMod (newVertexIndex - nOTUs) 2 + in do + if decileRemainder == 0 && oddRemainder == 0 + then do + logWith LogInfo ("\t" <> show (10 * decileNumber) <> "%") + wagBest augmentedDistMatrix newTree newLeavesToAdd nOTUs (newVertexIndex + 1) newLeavesToAdd choiceOpt + else do + wagBest augmentedDistMatrix newTree newLeavesToAdd nOTUs (newVertexIndex + 1) newLeavesToAdd choiceOpt + + +-- ) + +{- | calculateWagnerTrees takes an input distance matrix (and options later) and returns +a tree (V,E) discription of Wagner tree with labelled internal veritices and branch lengths +-} +calculateWagnerTrees ∷ M.Matrix Double → String → PhyG (Tree, M.Matrix Double) +calculateWagnerTrees distMatrix choiceOpt = + if M.dim distMatrix == (0, 0) + then errorWithoutStackTrace "Null distance matrix" + else -- get initial pair of leaves and create initial tree + + let nOTUs = M.cols distMatrix + allLeaves = V.fromList [0 .. (nOTUs - 1)] + in do + (newTree, _, endMatrix) ← wagBest distMatrix (V.empty, V.empty) allLeaves nOTUs nOTUs allLeaves choiceOpt + return (newTree, endMatrix) + + +{- | makeTreeFromOrder takes an input order and other arguemnts and cretes tree using a single additoin +seqeunce, best plaecment for the leaf each round +-} +makeTreeFromOrder ∷ M.Matrix Double → Tree → Int → Int → V.Vector Int → (Tree, M.Matrix Double) +makeTreeFromOrder distMatrix initialTree nOTUs vertexIndex leavesToAdd = + if null leavesToAdd + then (initialTree, distMatrix) + else + let leaf = V.head leavesToAdd + (_, newTree, _, augmentedDistMatrix) = addTaxonToTree distMatrix initialTree leavesToAdd vertexIndex leaf + in {-Too much output + let (percentAdded, _) = divMod (100 * (nOTUs - (V.length leavesToAdd))) (nOTUs - 2) + (decileNumber, decileRemainder) = divMod percentAdded 10 + in + if decileRemainder == 0 then + trace ("\t\t" <> (show $ 10 * decileNumber) <> "%") + makeTreeFromOrder augmentedDistMatrix newTree nOTUs (vertexIndex + 1) (V.tail leavesToAdd) + else + -} + makeTreeFromOrder augmentedDistMatrix newTree nOTUs (vertexIndex + 1) (V.tail leavesToAdd) + + +-- | getRandomAdditionSequence initializes based on input sequence and adds in order from there +getRandomAdditionSequence ∷ V.Vector String → M.Matrix Double → Int → V.Vector Int → TreeWithData +getRandomAdditionSequence leafNames distMatrix outgroup initiaLeavesToAdd = + let nOTUs = V.length leafNames + in let eVertex = initiaLeavesToAdd V.! 0 + uVertex = initiaLeavesToAdd V.! 1 + edgeWeight = distMatrix M.! (eVertex, uVertex) + initialTree = (V.fromList [eVertex, uVertex], V.fromList [(eVertex, uVertex, edgeWeight)]) + leavesToAdd = V.filter (/= eVertex) $ V.filter (/= uVertex) initiaLeavesToAdd + in let thisTree = makeTreeFromOrder distMatrix initialTree nOTUs nOTUs leavesToAdd + -- (_, edgeVect) = fst thisTree + treeCost = getTreeCost $ fst thisTree -- V.sum $ V.map getEdgeCost edgeVect + newickTree = convertToNewick leafNames outgroup (fst thisTree) + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision treeCost <> "]" <> ";" + in (newickTree', fst thisTree, treeCost, snd thisTree) + + +{- | doWagnerS takes user options and produces the Wagner tree methods desired (best, asis, or random) +outputs newick rep list +-} +doWagnerS + ∷ GlobalSettings → V.Vector String → M.Matrix Double → String → Int → String → Int → [V.Vector Int] → PhyG [TreeWithData] +doWagnerS inGS leafNames distMatrix firstPairMethod outgroup addSequence numToKeep replicateSequences = + let nOTUs = V.length leafNames + rasAction ∷ V.Vector Int → TreeWithData + rasAction = getRandomAdditionSequence leafNames distMatrix outgroup + in if addSequence == "best" + then do + wagnerResult ← calculateWagnerTrees distMatrix firstPairMethod + -- (_, edgeVect) = fst wagnerResult + let treeCost = getTreeCost $ fst wagnerResult --- V.sum $ V.map getEdgeCost edgeVect + let newickTree = convertToNewick leafNames outgroup (fst wagnerResult) + let newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision treeCost <> "]" <> ";" + + return $ [(newickTree', fst wagnerResult, treeCost, snd wagnerResult)] + else + if addSequence == "asis" + then + let initialTree = (V.fromList [0, 1], V.fromList [(0, 1, distMatrix M.! (0, 1))]) + leavesToAdd = V.fromList [2 .. (nOTUs - 1)] + asIsResult = makeTreeFromOrder distMatrix initialTree nOTUs nOTUs leavesToAdd + treeCost = getTreeCost $ fst asIsResult -- V.sum $ V.map getEdgeCost asIsEdges + newickTree = convertToNewick leafNames outgroup (fst asIsResult) + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision treeCost <> "]" <> ";" + in return $ [(newickTree', fst asIsResult, treeCost, snd asIsResult)] + else + if head addSequence == 'r' + then + if null replicateSequences + then errorWithoutStackTrace "Zero replicate additions specified--could be error in configuration file" + else + if (length replicateSequences > 10000) + then doWagnerRASProgressive inGS leafNames distMatrix outgroup numToKeep [] replicateSequences + else -- else take numToKeep $ L.sortOn thd4 (fmap (getRandomAdditionSequence leafNames distMatrix outgroup) replicateSequences `using` PU.myParListChunkRDS) + -- else take numToKeep $ L.sortOn thd4 (PU.seqParMap rseq (getRandomAdditionSequence leafNames distMatrix outgroup) replicateSequences) + do + rasResult ← + getParallelChunkMapBy strict3of4 <&> \pMap → + rasAction `pMap` replicateSequences + pure $ take numToKeep $ L.sortOn thd4 rasResult -- (PU.seqParMap rdeepseq (getRandomAdditionSequence leafNames distMatrix outgroup) replicateSequences) + else errorWithoutStackTrace ("Addition sequence " <> addSequence <> " not implemented") + + +{- | doWagnerRASProgressive performs RAS distance Wagner in chunks to reduce memory footprint +for large numner of replicates +-} +doWagnerRASProgressive + ∷ GlobalSettings → V.Vector String → M.Matrix Double → Int → Int → [TreeWithData] → [V.Vector Int] → PhyG [TreeWithData] +doWagnerRASProgressive inGS leafNames distMatrix outgroup numToKeep curBestTreeList replicateSequences = + if null replicateSequences + then pure curBestTreeList + else + let threadNumber = graphsSteepest inGS -- PU.getNumThreads + -- this number can be tweaked to incrase or reduce memory usage and efficiency + jobFactor = 20 + numToDo = max 1000 (jobFactor * threadNumber) + replist = take numToDo replicateSequences + + rasAction ∷ V.Vector Int → TreeWithData + rasAction = getRandomAdditionSequence leafNames distMatrix outgroup + in do + rasTrees ← + getParallelChunkMapBy strict3of4 <&> \pMap → + rasAction `pMap` replist + + -- rasTrees = fmap (getRandomAdditionSequence leafNames distMatrix outgroup) replist `using` PU.myParListChunkRDS + -- rasTrees = PU.seqParMap rpar (getRandomAdditionSequence leafNames distMatrix outgroup) replist + let newBestList = take numToKeep $ L.sortOn thd4 (rasTrees ++ curBestTreeList) + + doWagnerRASProgressive inGS leafNames distMatrix outgroup numToKeep newBestList (drop numToDo replicateSequences) + + +{- | edgeHasVertex takes an vertex and an edge and returns Maybe Int +of other vertex +-} +edgeHasVertex ∷ Vertex → Edge → Maybe (Vertex, Edge) +edgeHasVertex inVert inEdge = + let (a, b, _) = inEdge + in if a == inVert + then Just (b, inEdge) + else + if b == inVert + then Just (a, inEdge) + else Nothing + + +{- | getSubEdges take a Vector of edges and retuns a list of edges connected to input vertex +uses nOTUs to know when to stop recursing +-} +getSubEdges ∷ [Vertex] → Int → V.Vector Edge → V.Vector Edge → String → V.Vector Edge +getSubEdges inVertexList nOTUs edgeVect subEdgeVect howMany + | V.null edgeVect = subEdgeVect + | null inVertexList = subEdgeVect + | otherwise = + let startVertex = head inVertexList + foundVect = V.filter (/= Nothing) $ V.map (edgeHasVertex startVertex) edgeVect + in if V.null foundVect + then getSubEdges (tail inVertexList) nOTUs edgeVect subEdgeVect howMany -- only terminals left + else + if V.length foundVect /= 2 + then + error + ( "Index (" + <> howMany + <> ")" + <> show startVertex + <> "->found " + <> show (V.length foundVect) + <> " but should be two edges in " + <> show foundVect + <> " in " + <> show edgeVect + ) + else + let thingsFound = V.map fromJust foundVect + verticesFound = V.toList $ V.map fst thingsFound + edgesFound = V.map snd thingsFound + + -- add found edges to subEdgeSet + newSubEdgeVect = subEdgeVect <> edgesFound + -- delete delete edges from those to be searched + newEdgeVect = V.filter (/= V.last edgesFound) $ V.filter (/= V.head edgesFound) edgeVect + in -- recurse on vertices that were found + if howMany == "first2" + then edgesFound + else getSubEdges (verticesFound <> tail inVertexList) nOTUs newEdgeVect newSubEdgeVect howMany + + +{- | adjustInternalEdgeVertex adjusts vertex of an internal (ie non-pendent, none 2 terminal edge) +assumes hv > lv +-} +adjustInternalEdgeVertex ∷ Vertex → Vertex → Vertex → Int → Int → Vertex +adjustInternalEdgeVertex inV hV lV maxOffSet nOTUs + | inV <= max nOTUs lV = inV + | inV > hV = inV - maxOffSet + | inV > lV = inV - 1 + | otherwise = error ("This can't happen " <> show (inV, lV, hV)) + + +-- ) + +{- | adjustVertex reduces vertex (hV,lV,_) is the dge that was deleted +index by 2,1, or 0 if greater than hVert, lVert, or neither +asumes hv > lv; and inE > inU +if selfe edge its a termin and only reduce by 1 +assumes edge is ordered e > u +-} +adjustVertex ∷ Edge → Vertex → Vertex → Int → Edge +adjustVertex (inE, inU, w) hV lV nOTUs + | (inE <= max lV nOTUs) && (inU <= max lV nOTUs) = (inE, inU, w) + | lV < nOTUs -- update pendant edge was deleted since hV > lV; hV must be > nOTUs + = + if (inE > max hV nOTUs) && (inU > max hV nOTUs) + then (inE - 1, inU - 1, w) + else + if (inE <= max hV nOTUs) && (inU > max hV nOTUs) + then (inE, inU - 1, w) + else + if (inE > max hV nOTUs) && (inU <= max hV nOTUs) + then (inE - 1, inU, w) + else (inE, inU, w) + | otherwise -- internal edge was deleted -- both deleted verteces internal + = + let newE = adjustInternalEdgeVertex inE hV lV 2 nOTUs + newU = adjustInternalEdgeVertex inU hV lV 2 nOTUs + in (newE, newU, w) + + +{- | updateVertexNUmbersOnEdges taked vertex numbers and updates HTU indices to reflect +the deletion of those vertices +ASSUMES edges are ordered (a,b,weight) a > b +subtracts 2 from index if > the bigger of teh two, subtract 1 if bigger than lower, +otherwise leaves unchanged +-} +updateVertexNUmbersOnEdges ∷ Vertex → Vertex → V.Vector Edge → Int → V.Vector Edge +updateVertexNUmbersOnEdges eVert uVert edgeList nOTUs = + -- trace ("In updateVertexNUmbersOnEdges") ( + if V.null edgeList + then V.empty + else + let hVert = max eVert uVert + lVert = min eVert uVert + newVertex = adjustVertex (V.head edgeList) hVert lVert nOTUs + in V.cons newVertex (updateVertexNUmbersOnEdges eVert uVert (V.tail edgeList) nOTUs) + + +{- | updateDistMatrix updates distance matrix to remove eVertex and uVertex columns and rows as in updateVertexNUmbersOnEdges +update costs for two contracte edges c1Edge and c2Edge +the distance update takes place first and row/coumn removal second +this to keep the proper indices (since the non-deleted rows and columns indices are changed) +-} +updateDistMatrix ∷ Vertex → Vertex → M.Matrix Double → Int → Edge → Edge → M.Matrix Double +updateDistMatrix eVert uVert distMatrix nOTUs c1Edge c2Edge = + let validEdgeList = filter ((>= 0) . fst3) [c1Edge, c2Edge] + newMatrix = M.unsafeUpdateMatrix distMatrix validEdgeList + newMatrix' = M.deleteRowsAndColumns newMatrix (filter (> (nOTUs - 1)) [eVert, uVert]) + in newMatrix' + + +-- | getEndVertices takes a pair of edges and returns the non-index vertices +getEndVertices ∷ V.Vector Edge → Vertex → (Vertex, Vertex) +getEndVertices inEdges index = + if V.length inEdges /= 2 + then error ("Edge number should be 2 not " <> show (V.length inEdges) <> " in getEndVertices") + else + let (fEVertex, fUVertex, _) = V.head inEdges + (sEVertex, sUVertex, _) = V.last inEdges + in if fEVertex == index + then + if sEVertex == index + then (fUVertex, sUVertex) + else (fUVertex, sEVertex) + else + if fUVertex == index + then + if sEVertex == index + then (fEVertex, sUVertex) + else (fEVertex, sEVertex) + else error ("Error finding ends of " <> show (V.head inEdges) <> " and " <> show (V.last inEdges)) + + +{- | getContractedWeight gets edge contractd edge weights ither form distMatrix for OTUs or 1 OTU and 1 +internal wertex or by 4 point metric (maximum of two estimations) +-} +getContractedWeight ∷ Vertex → Vertex → M.Matrix Double → Int → V.Vector Edge → Double +getContractedWeight aVert bVert distMatrix nOTUs edgeVect + | aVert < nOTUs && bVert < nOTUs = distMatrix M.! (aVert, bVert) + | aVert < nOTUs = distMatrix M.! (aVert, bVert) + | bVert < nOTUs = distMatrix M.! (aVert, bVert) + | otherwise = + -- both internal 4-point mertic estimation + -- get edges connected to the contracted edge + let aEdgeVect = getSubEdges [aVert] nOTUs edgeVect V.empty "first2" + bEdgeVect = getSubEdges [bVert] nOTUs edgeVect V.empty "first2" + (a, b) = getEndVertices aEdgeVect aVert + (c, d) = getEndVertices bEdgeVect bVert + firstEstimate = (distMatrix M.! (a, c)) + (distMatrix M.! (b, d)) - (distMatrix M.! (a, b)) - (distMatrix M.! (c, d)) + secondEstimate = (distMatrix M.! (a, b)) + (distMatrix M.! (b, c)) - (distMatrix M.! (a, b)) - (distMatrix M.! (c, d)) + in max firstEstimate secondEstimate / 2.0 + + +{- | contractEdges takes two edges that share a vertex and fuses them +If terminal then return selfdge +update contracted edge weight +dummy edge (-1,-1,-1) is returned when no edge to contract so won't be updated in distance Matrix +-} +contractEdges ∷ M.Matrix Double → Int → V.Vector Edge → Vertex → V.Vector Edge → Edge +contractEdges distMatrix nOTUs edgeVect index allEdges + | V.null edgeVect = (-1, -1, 0) + | V.length edgeVect == 1 = + let (a, b, w) = V.head edgeVect + in if (a == index) || (b == index) + then (index, index, w) + else error ("Contacting single edge: " <> show (V.head edgeVect) <> " with vertex " <> show index <> " not found") + | otherwise = + let (aVert, bVert) = getEndVertices edgeVect index + newWeight = getContractedWeight aVert bVert distMatrix nOTUs (subtractVector edgeVect allEdges) + in orderEdge (aVert, bVert, newWeight) + + +{- | getDistance creates distances from remaing OTUs to new vertex (rowID) via Farris 1972 +rowID is the row (or column) of distance Matrix +-} +getDistance ∷ M.Matrix Double → Double → Double → Double → Int → Int → Int → Int → Double +getDistance origDist addCost eVertLeafDist uVertLeafDist leafIndex eVertex uVertex rowID = + let first = (origDist M.! (rowID, leafIndex)) - addCost + second = (origDist M.! (rowID, eVertex)) - eVertLeafDist + third = (origDist M.! (rowID, uVertex)) - uVertLeafDist + in maximum [first, second, third] + + +{- | getNewDistMatrix takes distMatrix and adds Cost for new eVertLeafDist uVertLeafDist +created in build process (new HTUs) +should be complete (on input) for leaves already added (initail paiwise distances and HTUs added) +adds a single new row (minus last 0.0 as new row at end) which is appended +-} +getNewDistMatrix ∷ M.Matrix Double → Double → Double → Double → Int → Int → Int → M.Matrix Double +getNewDistMatrix origDist addCost eVertLeafDist uVertLeafDist eVertex uVertex leafIndex = + let columnHolder = V.fromList [0 .. (M.rows origDist - 1)] -- List of HTU and OTU indices in pairwise dist matrix + newDistRow = V.map (getDistance origDist addCost eVertLeafDist uVertLeafDist leafIndex eVertex uVertex) columnHolder + newDistRow' = newDistRow `V.snoc` (0.0 ∷ Double) + in M.addMatrixRow origDist newDistRow' + + +{- | enterNewEdgeCost cretes new row of costs from edge vectors +infty NT.infinity if not there +this makes update n^2 which is dumb +-} +enterNewEdgeCost ∷ Int → V.Vector Edge → Int → Double +enterNewEdgeCost columnNumber edgeVect rowNumber = + if V.null edgeVect + then NT.infinity -- not found so Infty + else + let (a, b, weight) = V.head edgeVect + in if (columnNumber == a && rowNumber == b) || (columnNumber == b && rowNumber == a) + then weight + else enterNewEdgeCost columnNumber (V.tail edgeVect) rowNumber + + +{- | addEdgesToDistMatrix adds new edges from internal node join to existing matrix +0 if not created so only the two new vertices and the edges they touch (3 each) +so need to add two columns and two rows, for new vertices I, and II (here numIn and numIn + 1) +-} +getNewDistMatrixInternal ∷ M.Matrix Double → V.Vector Edge → M.Matrix Double +getNewDistMatrixInternal inMatrix newEdgeVect = + -- trace ("In getNewDistMatrixInternal") ( + if V.length newEdgeVect /= 5 + then error ("Wrong size edgeVector shoud be 5 and is " <> show (V.length newEdgeVect)) + else + let numIn = M.rows inMatrix + columnHolder = [0 .. (numIn - 1)] + newDistColumnI = fmap (enterNewEdgeCost numIn newEdgeVect) columnHolder <> [0.0] + newDistColumnII = fmap (enterNewEdgeCost (numIn + 1) newEdgeVect) columnHolder <> [enterNewEdgeCost numIn newEdgeVect (numIn + 1), 0.0] + in M.addMatrices inMatrix (V.fromList [V.fromList newDistColumnI, V.fromList newDistColumnII]) + + +{- | connectEdges takes two vectors of edges and adds and edge between the two in edgesToConnect +this deletes the two old edges from the edge Vectors and creates a new tree with the five new edges +added in; the addition cost is for total of created edges minus edges that were destroyed +-} +connectEdges ∷ M.Matrix Double → V.Vector Edge → V.Vector Edge → V.Vector Edge → (Double, Tree, M.Matrix Double) +connectEdges distMatrix eEdges uEdges edgesToConnect + | V.null eEdges = error "Empty e Edge vector in connectEdges" + | V.null uEdges = error "Empty u Edge vector in connectEdges" + | V.length edgesToConnect /= 2 = error ("There shoule be 2 edges to connect and ther are " <> show (V.length edgesToConnect)) + | otherwise = + let edgesKept = subtractVector edgesToConnect eEdges <> subtractVector edgesToConnect uEdges + numInTree = M.rows distMatrix + -- order new edges + (a, b, wAB) = V.head edgesToConnect + (c, d, wCD) = V.last edgesToConnect + firstEstimate = (distMatrix M.! (a, c)) + (distMatrix M.! (b, d)) - (distMatrix M.! (a, b)) - (distMatrix M.! (c, d)) + secondEstimate = (distMatrix M.! (a, b)) + (distMatrix M.! (b, c)) - (distMatrix M.! (a, b)) - (distMatrix M.! (c, d)) + centralEdgeCost = max firstEstimate secondEstimate / 2.0 + centralEdge = orderEdge (numInTree, numInTree + 1, centralEdgeCost) + aCost = ((distMatrix M.! (c, a)) - (distMatrix M.! (c, b)) + (distMatrix M.! (a, b))) / 2.0 + bCost = (distMatrix M.! (a, b)) - aCost + cCost = ((distMatrix M.! (a, c)) - (distMatrix M.! (a, d)) + (distMatrix M.! (c, d))) / 2.0 + dCost = (distMatrix M.! (c, d)) - cCost + newAEdge = orderEdge (a, numInTree, aCost) -- edge cost not unique either (a or b, numInTree) could be max or min + newBEdge = orderEdge (b, numInTree, bCost) -- edge cost not unique either (a or b, numInTree) could be max or min + newCEdge = orderEdge (c, numInTree + 1, cCost) -- edge cost not unique either (a or b, numInTree + 1) could be max or min + newDEdge = orderEdge (d, numInTree + 1, dCost) -- edge cost not unique either (a or b, numInTree + 1) could be max or min + newEdgeVect = V.fromList [centralEdge, newAEdge, newBEdge, newCEdge, newDEdge] + newDistMatrix = getNewDistMatrixInternal distMatrix newEdgeVect + in (centralEdgeCost + aCost + bCost + cCost + dCost - wAB - wCD, (V.empty, newEdgeVect <> edgesKept), newDistMatrix) + + +{- | addEdgeToSplit adds a new edge to a specifc pair of input edges detemined addition cost +and creted new edge set with appropriate weights +-} +addEdgeToSplit + ∷ V.Vector Edge → V.Vector Edge → Edge → Edge → M.Matrix Double → V.Vector Edge → (Double, Tree, M.Matrix Double) +addEdgeToSplit eEdges uEdges eTerminal uTerminal distMatrix edgesToConnect + | V.null eEdges && V.null uEdges = error "Empty e/u Edge vectors in addEdgeToSplit" + | V.null edgesToConnect = error "Empty eEdge vector in addEdgeToSplit" + | fst3 (V.head eEdges) == (-1) -- adding eOTU, V.last since the (-1,-1,0) edge should always be first + = + addToEdgeSwap distMatrix (fst3 eTerminal) (V.empty, uEdges) (M.rows distMatrix) (V.last edgesToConnect) + | fst3 (V.head uEdges) == (-1) -- adding uOTU + = + addToEdgeSwap distMatrix (fst3 uTerminal) (V.empty, eEdges) (M.rows distMatrix) (V.last edgesToConnect) + | otherwise -- both internal edges + = + connectEdges distMatrix eEdges uEdges edgesToConnect + + +{- | splitTree takes a tree description and its edgeList and return pairs of edge list +split at input edge in tree with "repaird"/contracted edges, delta, and original +edge (pairs of vertices and weight) for each split +-} +splitTree ∷ M.Matrix Double → Tree → Double → Edge → SplitTreeData +splitTree distMatrix inTree inTreeCost edgeToRemove = + -- check if proper tree--remove later + let (_, edgeVect) = inTree + (eVertex, uVertex, _) = edgeToRemove + + -- newEdgeSet = subtractVector (V.cons edgeToRemove $ eEdges <> uEdges) edgeVect + newEdgeSet = V.filter (/= edgeToRemove) edgeVect + nOTUs = div (3 + V.length edgeVect) 2 + eSubEdges = getSubEdges [eVertex] nOTUs newEdgeSet V.empty "all" + uSubEdges = getSubEdges [uVertex] nOTUs newEdgeSet V.empty "all" + + -- get edges that need to be contracted and re-estimate weights + eEdges = getSubEdges [eVertex] nOTUs eSubEdges V.empty "first2" + uEdges = getSubEdges [uVertex] nOTUs uSubEdges V.empty "first2" + eMergedEdge = contractEdges distMatrix nOTUs eEdges eVertex edgeVect + uMergedEdge = contractEdges distMatrix nOTUs uEdges uVertex edgeVect + + -- remove non-contracted edges and add in contracted edges + eSubEdges' = V.cons eMergedEdge (subtractVector eEdges eSubEdges) + uSubEdges' = V.cons uMergedEdge (subtractVector uEdges uSubEdges) + + -- need to know this order for SPR/TBR so which edge was in which set + previousEdges = V.fromList [eMergedEdge, uMergedEdge] + + -- map new HTU indices in edges and create new distance matrix + -- HTU indices are updated first--then values updated in distMatrix as rows/columns are + -- deleted to remove info from delted edge + eSubEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge eSubEdges') nOTUs + uSubEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge uSubEdges') nOTUs + previousEdges'' = updateVertexNUmbersOnEdges eVertex uVertex (V.map orderEdge previousEdges) nOTUs + + -- Update with deleted node and reestimated contracted edges + -- update matrix the costs (2 ij x 2 ji) of contracted edges + distMatrix'' = updateDistMatrix eVertex uVertex distMatrix nOTUs eMergedEdge uMergedEdge -- (V.head previousEdges'') (V.last previousEdges'') + + -- Delta calcualted by differene in original tree cost and split and readdition to split edges (then tail splits later + -- so not remake the original tree) + splitCost = getTreeCost (V.empty, eSubEdges'') + getTreeCost (V.empty, uSubEdges'') + + -- Delta of tree length will be weights of the the edges removed - the weights of the two edges contracted and reestimated + -- delta = weight + (V.sum $ V.map thd3 eEdges) + (V.sum $ V.map thd3 uEdges) - (V.sum $ V.map thd3 previousEdges'') + delta = inTreeCost - splitCost -- readditionCost + in if eVertex < nOTUs + then (V.singleton (eVertex, eVertex, 0.0), uSubEdges'', delta, previousEdges'', distMatrix'') -- this so know a pendant edge + else + if uVertex < nOTUs + then (V.singleton (uVertex, uVertex, 0.0), eSubEdges'', delta, previousEdges'', distMatrix'') -- this so know a pendant edge + else -- neded to have e first then u for SPR/TBR so can coordinate with previous edges + (eSubEdges'', uSubEdges'', delta, previousEdges'', distMatrix'') + + +{- | sieveTrees takes a list of (addition cost, Tree, distnce matrix) and returns list +of better or equal trees to input delta +-} +sieveTrees + ∷ Double → Double → V.Vector (Double, Tree, M.Matrix Double) → V.Vector String → Int → [TreeWithData] → [TreeWithData] +sieveTrees inDelta curBestCost inAddList leafNames outgroup savedTrees = + if null inAddList + then savedTrees + else + let firstTuple = V.head inAddList + (firstDelta, firstTree, firstMatrix) = firstTuple + newCost = curBestCost - inDelta + firstDelta + -- checkCost = getTreeCost firstTree + newickTree = convertToNewick leafNames outgroup firstTree + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision newCost <> "]" <> ";" + newTuple = (newickTree', firstTree, newCost, firstMatrix) + in if firstDelta > inDelta + then sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup savedTrees + else + if newCost < curBestCost + then sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup [newTuple] + else + if withinEpsilon newCost curBestCost + then sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup (newTuple : savedTrees) + else sieveTrees inDelta curBestCost (V.tail inAddList) leafNames outgroup savedTrees + + +{- | reAddTerminals checks to see if split on pendant edge--if so reads terminal to each edge but saves equal cost if found +and saveMethod specifies it. Idenitical to the wagner addition process with saveing equal as option +could use addEdgeToSplit for consistancey with SPR/TBR +-} +reAddTerminals ∷ String → Double → V.Vector String → Int → SplitTreeData → [TreeWithData] +reAddTerminals rejoinType curBestCost leafNames outGroup split = + if rejoinType /= "otu" + then error ("Incorrect swap function in reAddTerminals: " <> rejoinType) + else + let (eEdgeVect, uEdgeVect, delta, _, distMatrix) = split + nOTUs = V.length leafNames + in if (V.length eEdgeVect > 1) + || ((fst3 (V.head eEdgeVect) /= snd3 (V.head eEdgeVect)) && (fst3 (V.head eEdgeVect) < nOTUs) && (snd3 (V.head eEdgeVect) < nOTUs)) + then [] + else + ( if M.rows distMatrix /= ((2 * nOTUs) - 3) + then error ("Dist Matrix incorrect size " <> show (M.dim distMatrix) <> " should be " <> show ((2 * nOTUs) - 3, (2 * nOTUs) - 3)) + else + let newLeafIndex = M.rows distMatrix + -- take tail of uEdgeVect so not regerate input tree + additionList = V.map (addToEdgeSwap distMatrix (fst3 $ V.head eEdgeVect) (V.empty, uEdgeVect) newLeafIndex) uEdgeVect -- (V.tail uEdgeVect) -- tail so not hit original tree, leave all to reestimate if necesary + minAdditionCost = V.minimum (V.map fst3 additionList) + in if minAdditionCost > delta + then [] + else sieveTrees delta curBestCost additionList leafNames outGroup [] + ) + + +{- | add leaf to an edge creating new tree with distances and add cost, also augmented distance matrix +but this for swap so returns entire new3-edge cost so not Farris triangle it is sum of three diveded by 2 +-} +addToEdgeSwapRecurse ∷ Double → M.Matrix Double → Int → Tree → Int → V.Vector Edge → (Double, Tree, M.Matrix Double) +addToEdgeSwapRecurse inDelta distMatrix leaf initialTree newLeafIndex inEdgeVect = + if V.null inEdgeVect + then (inDelta, initialTree, distMatrix) + else + let inEdge@(eVertex, uVertex, inWeight) = V.head inEdgeVect + (initialVertexVect, initialEdgeVect) = initialTree + addCost = ((distMatrix M.! (leaf, eVertex)) + (distMatrix M.! (leaf, uVertex)) - (distMatrix M.! (eVertex, uVertex))) / 2.0 + eVertLeafDist = (distMatrix M.! (leaf, eVertex)) - addCost + uVertLeafDist = (distMatrix M.! (leaf, uVertex)) - addCost + newVertexVect = V.snoc initialVertexVect leaf + newEdges = V.fromList [(leaf, newLeafIndex, addCost), (eVertex, newLeafIndex, eVertLeafDist), (uVertex, newLeafIndex, uVertLeafDist)] + cleanupEdges = V.filter (/= inEdge) initialEdgeVect + newEdgeVect = cleanupEdges <> newEdges + newTree = (newVertexVect, newEdgeVect) + -- add new costs from added vertex to each reamaining leaf + augmentedDistMatrix = getNewDistMatrix distMatrix addCost eVertLeafDist uVertLeafDist eVertex uVertex leaf + newDelta = addCost + eVertLeafDist + uVertLeafDist - inWeight + in if newDelta < inDelta + then (newDelta, newTree, augmentedDistMatrix) + else addToEdgeSwapRecurse inDelta distMatrix leaf initialTree newLeafIndex (V.tail inEdgeVect) + + +{- | getVectorAllVectorPairs takes two vectors and creates a vector of avector of two elements each for each +pairwise combinatrion of elements +-} +getVectorAllVectorPairs ∷ V.Vector a → V.Vector a → V.Vector (V.Vector a) +getVectorAllVectorPairs firstVect secondVect = + if V.null firstVect + then V.empty + else + let firstElement = V.head firstVect + firstPairs = V.map (V.cons firstElement) $ V.map V.singleton secondVect + in firstPairs <> getVectorAllVectorPairs (V.tail firstVect) secondVect + + +{- | createVectorEdgePairs creates teh Vector of Vectors of edges (2 in each case) to connect +if SPR then takes the initial (previous e edge) and pairs with all in u edge vect +if TBR then all conbinations of pairs +-} +createVectorEdgePairs ∷ String → V.Vector Edge → V.Vector Edge → V.Vector Edge → V.Vector (V.Vector Edge) +createVectorEdgePairs pairSet previousEdges eEdgeVect uEdgeVect + | pairSet == "spr" = + let eEdgePrev = V.head previousEdges + in V.map (V.cons eEdgePrev) $ V.map V.singleton uEdgeVect + | pairSet == "tbr" = getVectorAllVectorPairs eEdgeVect uEdgeVect + | otherwise = errorWithoutStackTrace ("Pair set option " <> pairSet <> " not implemented") + + +-- | addEdgeToSplitRecurse like addToEdgeSplit but recursiblye yeilds a single best tree +addEdgeToSplitRecurse + ∷ V.Vector Edge + → V.Vector Edge + → Edge + → Edge + → M.Matrix Double + → V.Vector (V.Vector Edge) + → (Double, Tree, M.Matrix Double) + → (Double, Tree, M.Matrix Double) +addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix edgesToConnectVect origTriple@(inDelta, _, _) = + if V.null edgesToConnectVect + then origTriple + else + let edgesToConnect = V.head edgesToConnectVect + in if V.null eEdges && V.null uEdges + then error "Empty e/u Edge vectors in addEdgeToSplit" + else + if V.null edgesToConnect + then error "Empty eEdge vector in addEdgeToSplit" + else + if fst3 (V.head eEdges) == (-1) -- adding eOTU, V.last since the (-1,-1,0) edge should always be first + then + let (newDelta, newTree, newMatrix) = addToEdgeSwap distMatrix (fst3 eTerminal) (V.empty, uEdges) (M.rows distMatrix) (V.last edgesToConnect) + in if newDelta < inDelta + then (newDelta, newTree, newMatrix) + else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple + else + if fst3 (V.head uEdges) == (-1) -- adding uOTU + then + let (newDelta, newTree, newMatrix) = addToEdgeSwap distMatrix (fst3 uTerminal) (V.empty, eEdges) (M.rows distMatrix) (V.last edgesToConnect) + in if newDelta < inDelta + then (newDelta, newTree, newMatrix) + else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple + else -- both internal edges + + let (newDelta, newTree, newMatrix) = connectEdges distMatrix eEdges uEdges edgesToConnect + in if newDelta < inDelta + then (newDelta, newTree, newMatrix) + else addEdgeToSplitRecurse eEdges uEdges eTerminal uTerminal distMatrix (V.tail edgesToConnectVect) origTriple + + +{- | doSPRTBR takes split tree and rejoins by creating edges from a single one of the first edge set to each of the mebers of the second +important that e and u edges come in correct order and previous edges are e and u sets in order as well +-} +doSPRTBR ∷ String → Double → V.Vector String → Int → SplitTreeData → [TreeWithData] +doSPRTBR rejoinType curBestCost leafNames outGroup split = + -- trace ("In doSPRTBR") ( + let (eEdgeVect, uEdgeVect, delta, previousEdges, distMatrix) = split + in if V.null eEdgeVect || V.null uEdgeVect + then error "Empty edge vectors in doSPRTBR" + else + if fst3 (V.head eEdgeVect) == snd3 (V.head eEdgeVect) -- if an OTU call readdTerminal + then + let newLeafIndex = M.rows distMatrix + -- keep whole uEdgeVect so recheck input tree edges + additionList = V.map (addToEdgeSwap distMatrix (fst3 $ V.head eEdgeVect) (V.empty, uEdgeVect) newLeafIndex) uEdgeVect + minAdditionCost = V.minimum (V.map fst3 additionList) + in if minAdditionCost > delta + then [] + else sieveTrees delta curBestCost additionList leafNames outGroup [] + else -- internal edge or edge with two OTUs as vertices + -- check to make sure edges are where they should be can remove later + -- fix e edge and join to each u edge for SPR (n^2) + + let edgesToConnect = createVectorEdgePairs rejoinType previousEdges eEdgeVect uEdgeVect + -- e and u terminal should not be used here since OTUs are shortcircuited above + eTerminal = (-1, -1, 0) + uTerminal = (-1, -1, 0) + additionList = V.map (addEdgeToSplit eEdgeVect uEdgeVect eTerminal uTerminal distMatrix) edgesToConnect + minAdditionCost = V.minimum (V.map fst3 additionList) + in if minAdditionCost > delta + then [] + else sieveTrees delta curBestCost additionList leafNames outGroup [] + + +-- | doSPRTBRSteep like doSPRTBR but only saves a sinlge (and better) tree +doSPRTBRSteep ∷ String → Double → V.Vector String → Int → SplitTreeData → TreeWithData → TreeWithData +doSPRTBRSteep rejoinType curBestCost leafNames outGroup split origTree@(_, inTree, _, inMatrix) = + -- trace ("In doSPRTBR") ( + let (eEdgeVect, uEdgeVect, delta, previousEdges, distMatrix) = split + in if V.null eEdgeVect || V.null uEdgeVect + then error "Empty edge vectors in doSPRTBR" + else + if fst3 (V.head eEdgeVect) == snd3 (V.head eEdgeVect) -- if an OTU call readdTerminal + then + let newLeafIndex = M.rows distMatrix + -- keep whole uEdgeVect so recheck input tree edges + (newDelta, newTree, newMatrix) = addToEdgeSwapRecurse delta distMatrix (fst3 $ V.head eEdgeVect) (V.empty, uEdgeVect) newLeafIndex uEdgeVect + newCost = curBestCost - delta + newDelta + newickTree = convertToNewick leafNames outGroup newTree + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision newCost <> "]" <> ";" + in if newCost < curBestCost + then (newickTree', newTree, newCost, newMatrix) + else origTree + else -- internal edge or edge with two OTUs as vertices + -- check to make sure edges are where they should be can remove later + -- fix e edge and join to each u edge for SPR (n^2) + + let edgesToConnect = createVectorEdgePairs rejoinType previousEdges eEdgeVect uEdgeVect + -- e and u terminal should not be used here since OTUs are shortcircuited above + eTerminal = (-1, -1, 0) + uTerminal = (-1, -1, 0) + (newDelta, newTree, newMatrix) = addEdgeToSplitRecurse eEdgeVect uEdgeVect eTerminal uTerminal distMatrix edgesToConnect (delta, inTree, inMatrix) + newCost = curBestCost - delta + newDelta + newickTree = convertToNewick leafNames outGroup newTree + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision newCost <> "]" <> ";" + in if newCost < curBestCost + then -- trace ("->" <> show newCost) + (newickTree', newTree, newCost, newMatrix) + else origTree + + +-- | reAddTerminalsSteep like readdTerminals but only returns one tree keeping better +reAddTerminalsSteep ∷ String → Double → V.Vector String → Int → SplitTreeData → TreeWithData → TreeWithData +reAddTerminalsSteep rejoinType curBestCost leafNames outGroup split origTree = + if rejoinType /= "otu" + then error ("Incorrect swap function in reAddTerminals: " <> rejoinType) + else + let (eEdgeVect, uEdgeVect, delta, _, distMatrix) = split + nOTUs = V.length leafNames + in if (V.length eEdgeVect > 1) + || ((fst3 (V.head eEdgeVect) /= snd3 (V.head eEdgeVect)) && (fst3 (V.head eEdgeVect) < nOTUs) && (snd3 (V.head eEdgeVect) < nOTUs)) + then origTree + else + if M.rows distMatrix /= ((2 * nOTUs) - 3) + then error ("Dist Matrix incorrect size " <> show (M.dim distMatrix) <> " should be " <> show ((2 * nOTUs) - 3, (2 * nOTUs) - 3)) + else + let newLeafIndex = M.rows distMatrix + -- take tail of uEdgeVect so not regerate input tree + (newDelta, newTree, newMatrix) = addToEdgeSwapRecurse delta distMatrix (fst3 $ V.head eEdgeVect) (V.empty, uEdgeVect) newLeafIndex uEdgeVect -- (V.tail uEdgeVect) -- tail so not hit original tree, leave all to reestimate if necesary + newCost = curBestCost - delta + newDelta + newickTree = convertToNewick leafNames outGroup newTree + newickTree' = take (length newickTree - 3) newickTree <> "[" <> showDouble precision newCost <> "]" <> ";" + in if newCost < curBestCost + then -- trace ("->" <> show newCost) + (newickTree', newTree, newCost, newMatrix) + else origTree + + +{- | filterNewTreesOnCost returns list of all unique new best cost trees from list +assumes curBestCost = cost of sabed trees +-} +filterNewTreesOnCost ∷ Double → [TreeWithData] → [TreeWithData] → [TreeWithData] +filterNewTreesOnCost curBestCost firstTreeList savedTrees = + if null firstTreeList + then savedTrees + else + let firstTree = head firstTreeList + (_, _, firstCost, _) = firstTree + in if firstCost < curBestCost + then filterNewTreesOnCost firstCost (tail firstTreeList) [firstTree] + else + if firstCost > curBestCost + then filterNewTreesOnCost curBestCost (tail firstTreeList) savedTrees + else + let uniqueTree = filterNewTrees savedTrees firstTree + in if isNothing uniqueTree + then filterNewTreesOnCost curBestCost (tail firstTreeList) savedTrees + else filterNewTreesOnCost curBestCost (tail firstTreeList) (fromJust uniqueTree : savedTrees) + + +-- | filterNewTrees takes the first tree and checks if in the second list +filterNewTrees ∷ [TreeWithData] → TreeWithData → Maybe TreeWithData +filterNewTrees secondTreeList firstTree = + if null secondTreeList + then Just firstTree + else + let (firstNewick, _, _, _) = firstTree + (secondNewick, _, _, _) = head secondTreeList + in if firstNewick == secondNewick + then Nothing + else filterNewTrees (tail secondTreeList) firstTree + + +-- | getSaveNumber returns number ot save or Infty +getSaveNumber ∷ String → Int +getSaveNumber inString = + if length inString == 4 + then maxBound ∷ Int + else (read $ drop 5 inString) ∷ Int + + +{- | splitJoin does both split and rejoin operations in a fashion that if a better (shorter) tree is found is shortcircuits and +begins again on the new tree, else proceeds untill all splits and joins are completed, but only on a single tree +-} +splitJoin + ∷ (String → Double → V.Vector String → Int → SplitTreeData → TreeWithData → TreeWithData) + → String + → V.Vector String + → Int + → V.Vector Edge + → TreeWithData + → TreeWithData +splitJoin swapFunction refineType leafNames outGroup edgeVect curTreeWithData@(_, curTree, curTreeCost, curTreeMatrix) = + if V.null edgeVect + then curTreeWithData -- All splits tested, nothing better found + else + let firstEdge = V.head edgeVect + firstSplit = splitTree curTreeMatrix curTree curTreeCost firstEdge + firstTree@(_, firstNewTree, firstTreeCost, _) = swapFunction refineType curTreeCost leafNames outGroup firstSplit curTreeWithData + in if firstTreeCost < curTreeCost + then splitJoin swapFunction refineType leafNames outGroup (snd firstNewTree) firstTree + else splitJoin swapFunction refineType leafNames outGroup (V.tail edgeVect) curTreeWithData + + +{- | splitJoinWrapper wraps around splitJoin to allow parallel execution +the reason is to allow the consumption of "edgeVect" recursively within the same tree +-} +splitJoinWrapper + ∷ (String → Double → V.Vector String → Int → SplitTreeData → TreeWithData → TreeWithData) + → String + → V.Vector String + → Int + → TreeWithData + → TreeWithData +splitJoinWrapper swapFunction refineType leafNames outGroup curTreeWithData@(_, curTree, _, _) = + let edgeVect = snd curTree + in splitJoin swapFunction refineType leafNames outGroup edgeVect curTreeWithData + + +{- | getGeneralSwapSteepestOne performs refinement as in getGeneralSwap but saves on a single tree (per split/swap) and +immediately accepts a Better (shorter) tree and resumes the search on that new tree +relies heavily on laziness of splitTree so not parallel at this level +-} +getGeneralSwapSteepestOne + ∷ String + → (String → Double → V.Vector String → Int → SplitTreeData → TreeWithData → TreeWithData) + → V.Vector String + → Int + → [TreeWithData] + → [TreeWithData] + → PhyG [TreeWithData] +getGeneralSwapSteepestOne refineType swapFunction leafNames outGroup inTreeList savedTrees = case inTreeList of + [] → pure savedTrees + t : ts → + let splitAction ∷ TreeWithData → TreeWithData + splitAction = splitJoinWrapper swapFunction refineType leafNames outGroup + in do + steepTreeList ← + getParallelChunkMapBy strict3of4 <&> \pMap → + splitAction `pMap` inTreeList + + let steepCost = minimum $ fmap thd4 steepTreeList + -- this to maintain the trajectories untill final swap--otherwise could converge down to single tree prematurely + pure $ keepTrees steepTreeList "unique" "first" steepCost + + +{- | getGeneralSwap performs a "re-add" of terminal identical to wagner build addition to available edges +performed on all splits recursively until no more better/equal cost trees found +this won't work to save "all" just unique and best and unique of best +add "steep-est" descent +-} +getGeneralSwap + ∷ String + → (String → Double → V.Vector String → Int → SplitTreeData → [TreeWithData]) + → String + → String + → V.Vector String + → Int + → [TreeWithData] + → [TreeWithData] + → PhyG [TreeWithData] +getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup inTreeList savedTrees = case inTreeList of + [] → pure savedTrees + curFullTree : otherTrees → + let maxNumSave = getSaveNumber saveMethod + splitAction ∷ Edge → SplitTreeData + splitAction = splitTree curTreeMatrix curTree curTreeCost + + splitExtract ∷ SplitTreeData → SplitTreeData + splitExtract ~val@(a, b, c, _, _) = force a `seq` force b `seq` force c `seq` val + + swapAction ∷ SplitTreeData → [TreeWithData] + swapAction = swapFunction refineType curTreeCost leafNames outGroup + + overallBestCost = minimum $ fmap thd4 savedTrees + (_, curTree, curTreeCost, curTreeMatrix) = curFullTree + in do + -- parallelize here + splitTreeList ← + getParallelChunkMapBy splitExtract <&> \pMap → + splitAction `pMap` (V.toList $ snd curTree) + firstTreeList ← + getParallelChunkMapBy (listApplying strict3of4) <&> \pMap → + swapAction `pMap` splitTreeList + let firstTreeList' = filterNewTreesOnCost overallBestCost (curFullTree : concat firstTreeList) savedTrees + + -- Work around for negative NT.infinity tree costs (could be dst matrix issue) + if NT.isInfinite curTreeCost || null firstTreeList' + then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup otherTrees savedTrees + else + let (_, _, costOfFoundTrees, _) = head firstTreeList' + in -- workaround for negatrive NT.infinity trees + if NT.isInfinite costOfFoundTrees + then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup otherTrees savedTrees + else + if costOfFoundTrees < overallBestCost + then + let uniqueTreesToAdd = fmap fromJust $ filter (/= Nothing) $ fmap (filterNewTrees inTreeList) firstTreeList' + treesToSwap = keepTrees (otherTrees <> uniqueTreesToAdd) saveMethod keepMethod costOfFoundTrees + in getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup treesToSwap (take maxNumSave firstTreeList') + else + if costOfFoundTrees == overallBestCost + then + if length savedTrees >= maxNumSave + then getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup otherTrees savedTrees + else + let uniqueTreesToAdd = fmap fromJust $ filter (/= Nothing) $ fmap (filterNewTrees inTreeList) firstTreeList' + treesToSwap = keepTrees (otherTrees <> uniqueTreesToAdd) saveMethod keepMethod costOfFoundTrees + in getGeneralSwap + refineType + swapFunction + saveMethod + keepMethod + leafNames + outGroup + treesToSwap + (take maxNumSave $ savedTrees <> firstTreeList') + else getGeneralSwap refineType swapFunction saveMethod keepMethod leafNames outGroup otherTrees savedTrees + + +{- | performRefinement takes input trees in TRE and Newick format and performs different forms of tree refinement +at present just OTU (remove leaves and re-add), SPR and TBR +-} +performRefinement ∷ String → String → String → V.Vector String → Int → TreeWithData → PhyG [TreeWithData] +performRefinement refinement saveMethod keepMethod leafNames outGroup inTree = + if refinement == "none" + then pure [inTree] + else + if refinement == "otu" + then do + newTrees ← + getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees' ← + getGeneralSwap + "otu" + reAddTerminals + saveMethod + keepMethod + leafNames + outGroup + newTrees + [([], (V.empty, V.empty), NT.infinity, M.empty)] + + if refinement == "best:1" + then + if not $ null newTrees + then pure newTrees + else pure [inTree] + else + if not (null newTrees') + then pure newTrees' + else -- trace "OTU swap did not find any new trees" + pure [inTree] + else + if refinement == "spr" + then do + newTrees ← + getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees' ← + getGeneralSwapSteepestOne "spr" doSPRTBRSteep leafNames outGroup newTrees [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees'' ← + getGeneralSwap + "spr" + doSPRTBR + saveMethod + keepMethod + leafNames + outGroup + newTrees' + [([], (V.empty, V.empty), NT.infinity, M.empty)] + + if saveMethod == "best:1" + then do + if not $ null newTrees' + then pure newTrees' + else pure [inTree] + else + if not (null newTrees'') + then pure newTrees'' + else -- trace "SPR swap did not find any new trees" + pure [inTree] + else + if refinement == "tbr" + then do + newTrees ← + getGeneralSwapSteepestOne "otu" reAddTerminalsSteep leafNames outGroup [inTree] [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees' ← + getGeneralSwapSteepestOne "spr" doSPRTBRSteep leafNames outGroup newTrees [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees'' ← + getGeneralSwapSteepestOne "tbr" doSPRTBRSteep leafNames outGroup newTrees' [([], (V.empty, V.empty), NT.infinity, M.empty)] + newTrees''' ← + getGeneralSwap + "tbr" + doSPRTBR + saveMethod + keepMethod + leafNames + outGroup + newTrees'' + [([], (V.empty, V.empty), NT.infinity, M.empty)] + + if saveMethod == "best:1" + then do + if not $ null newTrees' + then pure newTrees'' + else pure [inTree] + else + if not (null newTrees''') + then pure newTrees''' + else -- trace "TBR swap did not find any new trees" + pure [inTree] + else errorWithoutStackTrace ("Unrecognized refinement method: " <> refinement) diff --git a/src/Search/Fuse.hs b/src/Search/Fuse.hs new file mode 100644 index 000000000..0b7c2f9db --- /dev/null +++ b/src/Search/Fuse.hs @@ -0,0 +1,786 @@ +-- Used defaultParStrat for fusing operations--hopefully reduce memory footprint + +{- | +Module specifying graph fusing recombination functions. +-} +module Search.Fuse ( + fuseAllGraphs, +) where + +import Control.Monad (filterM, when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Foldable (fold) +import Data.Functor ((<&>)) +import Data.InfList qualified as IL +import Data.List qualified as L +import Data.Map qualified as MAP +import Data.Maybe +import Data.Text.Lazy qualified as TL +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.PostOrderSoftWiredFunctions qualified as POSW +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Search.Swap qualified as S +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities as U + + +-- In general, needs simplification and refactoring + +{- | fuseAllGraphs takes a list of phylogenetic graphs and performs all pairwise fuses +later--could limit by options making random choices for fusing +keeps results according to options (best, unique, etc) +unique is unique of "best" from individual fusings +singleRound short circuits recursive continuation on newly found graphs +-} +fuseAllGraphs + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → Bool + → Bool + → Bool + → Maybe Int + → Bool + → Bool + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +fuseAllGraphs swapParams inGS inData counter returnBest returnUnique singleRound fusePairs randomPairs reciprocal inGraphList = case inGraphList of + [] → return ([], counter) + [x] → return (inGraphList, counter) + _ → + let -- getting values to be passed for graph diagnorsis later + numLeaves = V.length $ fst3 inData + + curBest = minimum $ fmap snd5 inGraphList + + curBestGraph = head $ filter ((== curBest) . snd5) inGraphList + in do + -- get net penalty estimate from optimal graph for delta recombine later + -- Nothing here so starts at overall root + inGraphNetPenalty ← T.getPenaltyFactor inGS inData Nothing $ GO.convertReduced2PhylogeneticGraphSimple curBestGraph + + let inGraphNetPenaltyFactor = inGraphNetPenalty / curBest + + -- could be fusePairRecursive to save on memory + let action ∷ (ReducedPhylogeneticGraph, ReducedPhylogeneticGraph) → PhyG [ReducedPhylogeneticGraph] + action = fusePair swapParams inGS inData numLeaves inGraphNetPenaltyFactor curBest reciprocal + + -- get fuse pairs + let graphPairList' = getListPairs inGraphList + (graphPairList, randString) ← case fusePairs of + Nothing → pure (graphPairList', "") + Just count | randomPairs → do + selectedGraphs ← take count <$> shuffleList graphPairList' + pure (selectedGraphs, " randomized") + Just index → pure (takeNth index graphPairList', "") + + newGraphList ← + getParallelChunkTraverseBy (fmap U.strict2of5) >>= \pTraverse → + fold <$> pTraverse action graphPairList + + let fuseBest = + if not (null newGraphList) + then minimum $ fmap snd5 newGraphList + else infinity + + let swapTypeString = + if swapType swapParams == NoSwap + then "out" + else " " <> (show $ swapType swapParams) + + logWith + LogInfo + ( "\tFusing " + <> (show $ length graphPairList) + <> randString + <> " graph pairs with" + <> swapTypeString + <> " swapping at minimum cost " + <> (show $ min fuseBest curBest) + <> "\n" + ) + if null newGraphList + then return (inGraphList, counter + 1) + else + if returnUnique + then do + uniqueList ← GO.selectGraphs Unique (keepNum swapParams) 0 $ inGraphList <> newGraphList + if fuseBest < curBest -- trace ("\t->" <> (show fuseBest)) -- <> "\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph $ thd5 $ head bestSwapGraphList)) + then + fuseAllGraphs + swapParams + inGS + inData + (counter + 1) + returnBest + returnUnique + singleRound + fusePairs + randomPairs + reciprocal + uniqueList + else pure (uniqueList, counter + 1) + else -- return best + -- only do one round of fusing + + if singleRound + then GO.selectGraphs Best (keepNum swapParams) 0.0 (inGraphList <> newGraphList) <&> \x → (x, counter + 1) + else -- recursive rounds + do + -- need unique list to keep going + + allBestList ← GO.selectGraphs Unique (keepNum swapParams) 0 $ inGraphList <> newGraphList + -- found better + if fuseBest < curBest + then do + -- logWith LogInfo ("\n") + fuseAllGraphs + swapParams + inGS + inData + (counter + 1) + returnBest + returnUnique + singleRound + fusePairs + randomPairs + reciprocal + allBestList + else -- equal or worse cost just return--could keep finding equal + return (allBestList, counter + 1) + + +{- | fusePairRecursive wraps around fusePair recursively traversing through fuse pairs as oppose +to parMapping at once creating a large memory footprint +needs to be refactored (left right are same) + +-- this logic is wrong and needs to be fixed +-} +fusePairRecursive + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → VertexCost + → VertexCost + → Bool + → [ReducedPhylogeneticGraph] + → [(ReducedPhylogeneticGraph, ReducedPhylogeneticGraph)] + → PhyG [ReducedPhylogeneticGraph] +fusePairRecursive swapParams inGS inData numLeaves netPenalty curBestScore reciprocal resultList leftRightList = + if null leftRightList + then return resultList + else + let -- parallel here blows up memory + numPairsToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + + -- parallel setup + action ∷ (ReducedPhylogeneticGraph, ReducedPhylogeneticGraph) → PhyG [ReducedPhylogeneticGraph] + action = fusePair swapParams inGS inData numLeaves netPenalty curBestScore reciprocal + in do + -- paralleized high level + fusePairResult' ← + getParallelChunkTraverseBy (fmap U.strict2of5) >>= \pTraverse → + action `pTraverse` take numPairsToExamine leftRightList + let fusePairResult = concat fusePairResult' + + bestResultList ← + if graphType inGS == Tree + then GO.selectGraphs Best (keepNum swapParams) 0 fusePairResult + else do + -- check didn't make weird network + goodGraphList ← filterM (LG.isPhylogeneticGraph . fst5) fusePairResult + GO.selectGraphs Best (keepNum swapParams) 0 goodGraphList + + let pairScore = + if (not . null) bestResultList + then snd5 $ head bestResultList + else infinity + + let newCurBestScore = min curBestScore pairScore + + bestResultList' ← + fusePairRecursive + swapParams + inGS + inData + numLeaves + netPenalty + newCurBestScore + reciprocal + resultList + (drop numPairsToExamine leftRightList) + + let bestResultList'' = + if pairScore <= curBestScore + then bestResultList' + else [] + + pure bestResultList'' + + +-- bestResultList' <> fusePairRecursive swapParams inGS inData numLeaves netPenalty newCurBestScore reciprocal resultList (tail leftRightList) + +{- | fusePair recombines a single pair of graphs +this is done by coopting the split and readd functinos from the Swap.Swap functions and exchanging +pruned subgraphs with the same leaf complement (as recorded by the subtree root node bit vector field) +spr-like and tbr-like readds can be performed as with options +needs simolification and refactoring +-} +fusePair + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → VertexCost + → VertexCost + → Bool + → (ReducedPhylogeneticGraph, ReducedPhylogeneticGraph) + → PhyG [ReducedPhylogeneticGraph] +fusePair swapParams inGS inData numLeaves netPenalty curBestScore reciprocal (leftGraph, rightGraph) = + if (LG.isEmpty $ fst5 leftGraph) || (LG.isEmpty $ fst5 rightGraph) + then error "Empty graph in fusePair" + else + if (fst5 leftGraph) == (fst5 rightGraph) + then return [] + else -- split graphs at all bridge edges (all edges for Tree) + + let -- left graph splits on all edges + leftDecoratedGraph = thd5 leftGraph + (leftRootIndex, _) = head $ LG.getRoots leftDecoratedGraph + leftBreakEdgeList = + if (graphType inGS) == Tree + then filter ((/= leftRootIndex) . fst3) $ LG.labEdges leftDecoratedGraph + else filter ((/= leftRootIndex) . fst3) $ LG.getEdgeSplitList leftDecoratedGraph + + -- right graph splits on all edges + rightDecoratedGraph = thd5 rightGraph + (rightRootIndex, _) = head $ LG.getRoots rightDecoratedGraph + rightBreakEdgeList = + if (graphType inGS) == Tree + then filter ((/= rightRootIndex) . fst3) $ LG.labEdges rightDecoratedGraph + else filter ((/= rightRootIndex) . fst3) $ LG.getEdgeSplitList rightDecoratedGraph + + -- parallel stuff + -- splitLeftAction :: (Show b) => Gr a b -> LEdge b -> (Gr a b, Node, Node, Node, LEdge b, [Edge]) + splitLeftAction = LG.splitGraphOnEdge' leftDecoratedGraph + + -- splitRightAction :: (Show b) => Gr a b -> LEdge b -> (Gr a b, Node, Node, Node, LEdge b, [Edge]) + splitRightAction = LG.splitGraphOnEdge' rightDecoratedGraph + + exchangeAction + ∷ ((DecoratedGraph, LG.Node, LG.Node, LG.Node), (DecoratedGraph, LG.Node, LG.Node, LG.Node), LG.Node) + → (DecoratedGraph, Int, Int, Int, Int) + exchangeAction = exchangePrunedGraphs numLeaves + + reoptimizeAction ∷ (DecoratedGraph, Int, Int) → PhyG (DecoratedGraph, VertexCost) + reoptimizeAction = S.reoptimizeSplitGraphFromVertexTuple inGS inData False netPenalty + in do + splitLeftPar ← getParallelChunkMap + let leftSplitTupleList = splitLeftPar splitLeftAction leftBreakEdgeList + + let (_, _, leftPrunedGraphRootIndexList, leftOriginalConnectionOfPrunedList, leftOriginalEdgeList, _) = L.unzip6 leftSplitTupleList + + let leftPrunedGraphBVList = fmap bvLabel $ fmap fromJust $ fmap (LG.lab leftDecoratedGraph) leftPrunedGraphRootIndexList + + splitRightPar ← getParallelChunkMap + let rightSplitTupleList = splitRightPar splitRightAction rightBreakEdgeList + + let (_, _, rightPrunedGraphRootIndexList, rightOriginalConnectionOfPrunedList, rightOriginalEdgeList, _) = L.unzip6 rightSplitTupleList + + let rightPrunedGraphBVList = fmap bvLabel $ fmap fromJust $ fmap (LG.lab rightDecoratedGraph) rightPrunedGraphRootIndexList + + -- get all pairs of split graphs + let (leftSplitTupleList', rightSplitTupleList') = unzip $ cartProd (fmap first4of6 leftSplitTupleList) (fmap first4of6 rightSplitTupleList) + let (leftPrunedGraphBVList', rightPrunedGraphBVList') = unzip $ cartProd leftPrunedGraphBVList rightPrunedGraphBVList + + -- get compatible split pairs via checking bv of root index of pruned subgraphs + let leftRightMatchList = zipWith (==) leftPrunedGraphBVList' rightPrunedGraphBVList' + + -- only take compatible, non-identical pairs with > 2 terminal--otherwise basically SPR move or nothing (if identical) + -- also checks that prune and splits don't match between the graphs to be recombined--ie exchanging the same sub-graph + let recombinablePairList = L.zipWith (getCompatibleNonIdenticalSplits numLeaves) leftRightMatchList leftPrunedGraphBVList' + let (leftValidTupleList, rightValidTupleList, _) = L.unzip3 $ filter ((== True) . thd3) $ zip3 leftSplitTupleList' rightSplitTupleList' recombinablePairList + + if null leftValidTupleList + then pure [] + else do + -- create new "splitgraphs" by replacing nodes and edges of pruned subgraph in reciprocal graphs + -- returns reindexed list of base graph root, pruned component root, parent of pruned component root, original graph break edge + + -- leftRight first then rightLeft if reciprocal + + exchangeLeftPar ← getParallelChunkMap + let exchangeLeftResult = exchangeLeftPar exchangeAction (zip3 leftValidTupleList rightValidTupleList leftOriginalConnectionOfPrunedList) + let ( leftBaseRightPrunedSplitGraphList + , leftRightGraphRootIndexList + , leftRightPrunedParentRootIndexList + , leftRightPrunedRootIndexList + , leftRightOriginalConnectionOfPrunedList + ) = + L.unzip5 exchangeLeftResult + + leftRightOptimizedSplitGraphCostList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse reoptimizeAction $ zip3 leftBaseRightPrunedSplitGraphList leftRightGraphRootIndexList leftRightPrunedRootIndexList + + let baseGraphDifferentList = L.replicate (length leftRightOptimizedSplitGraphCostList) True + + let ( _ + , leftRightOptimizedSplitGraphCostList' + , _ + , leftRightPrunedRootIndexList' + , leftRightPrunedParentRootIndexList' + , leftRightOriginalConnectionOfPrunedList' + ) = + L.unzip6 $ + filter ((== True) . fst6) $ + L.zip6 + baseGraphDifferentList + leftRightOptimizedSplitGraphCostList + leftRightGraphRootIndexList + leftRightPrunedRootIndexList + leftRightPrunedParentRootIndexList + leftRightOriginalConnectionOfPrunedList + + -- re-add pruned component to base component left-right and right-left + -- need curent best cost + let curBetterCost = min (snd5 leftGraph) (snd5 rightGraph) + + -- get network penalty factors to pass on + leftPenalty ← getNetworkPentaltyFactor inGS inData (snd5 leftGraph) leftGraph + rightPenalty ← getNetworkPentaltyFactor inGS inData (snd5 rightGraph) rightGraph + let networkCostFactor = min leftPenalty rightPenalty + + -- left and right root indices should be the same + leftRightFusedGraphList ← + recombineComponents + swapParams + inGS + inData + curBetterCost + curBestScore + leftRightOptimizedSplitGraphCostList' + leftRightPrunedRootIndexList' + leftRightPrunedParentRootIndexList' + leftRightOriginalConnectionOfPrunedList' + leftRootIndex + networkCostFactor + leftOriginalEdgeList + + rightLeftFusedGraphList ← + if not reciprocal + then pure [] + else do + exchangeRightPar ← getParallelChunkMap + let exchangeRightResult = exchangeRightPar exchangeAction (zip3 rightValidTupleList leftValidTupleList rightOriginalConnectionOfPrunedList) + let ( rightBaseLeftPrunedSplitGraphList + , rightLeftGraphRootIndexList + , rightLeftPrunedParentRootIndexList + , rightLeftPrunedRootIndexList + , rightLeftOriginalConnectionOfPrunedList + ) = + L.unzip5 exchangeRightResult + + rightLeftOptimizedSplitGraphCostList ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse reoptimizeAction $ zip3 rightBaseLeftPrunedSplitGraphList rightLeftGraphRootIndexList rightLeftPrunedRootIndexList + + let ( _ + , rightLeftOptimizedSplitGraphCostList' + , _ + , rightLeftPrunedRootIndexList' + , rightLeftPrunedParentRootIndexList' + , rightLeftOriginalConnectionOfPrunedList' + ) = + L.unzip6 $ + filter ((== True) . fst6) $ + L.zip6 + baseGraphDifferentList + rightLeftOptimizedSplitGraphCostList + rightLeftGraphRootIndexList + rightLeftPrunedRootIndexList + rightLeftPrunedParentRootIndexList + rightLeftOriginalConnectionOfPrunedList + recombineComponents + swapParams + inGS + inData + curBetterCost + curBestScore + rightLeftOptimizedSplitGraphCostList' + rightLeftPrunedRootIndexList' + rightLeftPrunedParentRootIndexList' + rightLeftOriginalConnectionOfPrunedList' + rightRootIndex + networkCostFactor + rightOriginalEdgeList + + -- get "best" fused graphs from leftRight and rightLeft + bestFusedGraphs ← GO.selectGraphs Best (keepNum swapParams) 0 $ leftRightFusedGraphList <> rightLeftFusedGraphList + + pure bestFusedGraphs + where + first4of6 (a, b, c, d, _, _) = (a, b, c, d) + + +{- | recombineComponents takes readdition arguments (swap, steepest etc) and wraps the swap-stype rejoining of components +ignores doSteepeast for now--doesn't seem to have meaning in rejoining since not then taking that graph for fusion and shortcircuiting +original connection done first left/rightOriginalEdgeList--spo can do "none" in swap +"curBetterCost" is of pain of inputs to make sure keep all better than their inputs, "overallBestScore" is for progress info +-} +recombineComponents + ∷ SwapParams + → GlobalSettings + → ProcessedData + → VertexCost + → VertexCost + → [(DecoratedGraph, VertexCost)] + → [Int] + → [Int] + → [Int] + → LG.Node + → VertexCost + → [LG.LEdge EdgeInfo] + → PhyG [ReducedPhylogeneticGraph] +recombineComponents swapParams inGS inData curBetterCost overallBestCost inSplitGraphCostPairList prunedRootIndexList prunedParentRootIndexList _ graphRoot networkCostFactor originalSplitEdgeList = + -- check and see if any reconnecting to do + -- trace ("RecombineComponents " <> (show $ length splitGraphCostPairList)) ( + let splitGraphCostPairList = filter ((not . LG.isEmpty) . fst) inSplitGraphCostPairList + in if null splitGraphCostPairList + then pure [] + else -- top line to cover SPR HarWired bug + + let -- since splits not created together, IA won't be consistent between components + -- steepest = False -- should look at all better, now option + + -- network costs--using an input value that is minimum of inputs + netPenaltyFactorList = L.replicate (length splitGraphCostPairList) networkCostFactor + + -- no simulated annealling functionality infuse + inSimAnnealParams = Nothing + + -- get edges in pruned (to be exchanged) graphs + edgesInPrunedList = fmap LG.getEdgeListAfter $ zip (fmap fst splitGraphCostPairList) prunedParentRootIndexList + + -- get edges in base (not to be exchanged) graphs and put original split edge first + rejoinEdgesList = fmap (getBaseGraphEdges graphRoot) $ zip3 (fmap fst splitGraphCostPairList) edgesInPrunedList originalSplitEdgeList + + -- huge zip to fit arguments into revised join function + graphDataList = + zip9 + (fmap fst splitGraphCostPairList) + (fmap GO.convertDecoratedToSimpleGraph $ fmap fst splitGraphCostPairList) + (fmap snd splitGraphCostPairList) + (L.replicate (length splitGraphCostPairList) graphRoot) + prunedRootIndexList + prunedParentRootIndexList + rejoinEdgesList + edgesInPrunedList + netPenaltyFactorList + + -- parallel setup + action + ∷ (DecoratedGraph, SimpleGraph, VertexCost, LG.Node, LG.Node, LG.Node, [LG.LEdge EdgeInfo], [LG.LEdge EdgeInfo], VertexCost) + → PhyG [ReducedPhylogeneticGraph] + action = S.rejoinGraphTuple swapParams inGS inData overallBestCost [] inSimAnnealParams + in -- alternate -- rejoinGraphTupleRecursive swapParams inGS inData curBetterCost overallBestCost inSimAnnealParams graphDataList + do + -- do "all additions" - + recombinedGraphList' ← getParallelChunkTraverseBy (fmap U.strict2of5) >>= \pTraverse → pTraverse action graphDataList + let recombinedGraphList = concat recombinedGraphList' + + -- this based on heuristic deltas + let bestFuseCost = + if null recombinedGraphList + then infinity + else minimum $ fmap snd5 recombinedGraphList + if null recombinedGraphList + then pure [] + else + if bestFuseCost <= curBetterCost + then GO.selectGraphs Best (keepNum swapParams) 0 recombinedGraphList + else pure [] + + +{- | rejoinGraphTupleRecursive is a wrapper for S.rejoinGraphTuple that recursively goes through list as opposd to parMapping +this to save on memory footprint since there would be many calls generated +the rejoin operation is parallelized itself +recursive best cost so can keep all better than input but can have progress info +-} +rejoinGraphTupleRecursive + ∷ SwapParams + → GlobalSettings + → ProcessedData + → VertexCost + → VertexCost + → Maybe SAParams + → [(DecoratedGraph, SimpleGraph, VertexCost, LG.Node, LG.Node, LG.Node, [LG.LEdge EdgeInfo], [LG.LEdge EdgeInfo], VertexCost)] + → PhyG [ReducedPhylogeneticGraph] +rejoinGraphTupleRecursive swapParams inGS inData curBestCost recursiveBestCost inSimAnnealParams graphDataList = + if null graphDataList + then return [] + else + let firstGraphData = head graphDataList + -- update with unions for rejoining + -- using best cost for differentiate since there was no single graph to get original deltas + -- add randomize edges option? + + {- Turned off join prune option here + charInfoVV = fmap thd3 $ thd3 inData + (splitGraphDec, splitGraphSimple, splitCost, baseGraphRootIndex, prunedGraphRootIndex, prunedParent~RootIndex, _, edgesInPrunedList, netPenaltyFactor) = firstGraphData + prunedToRejoinUnionData = vertData $ fromJust $ LG.lab splitGraphDec prunedGraphRootIndex + unionEdgeList <- S.getUnionRejoinEdgeList inGS splitGraphDec charInfoVV [baseGraphRootIndex] (curBestCost - splitCost) prunedToRejoinUnionData [] + -} + + firstGraphData' = firstGraphData + in {-Turned off for now since doesn't alternate + if joinType swapParams /= JoinAll then + (splitGraphDec, splitGraphSimple, splitCost, baseGraphRootIndex, prunedGraphRootIndex, prunedParentRootIndex, unionEdgeList, edgesInPrunedList, netPenaltyFactor) + else firstGraphData + -} + + -- Unconditional printing, conditional output payload. + do + firstRejoinResult ← S.rejoinGraphTuple swapParams inGS inData curBestCost [] inSimAnnealParams firstGraphData' + let firstBestCost = + if (not . null) firstRejoinResult + then minimum $ fmap snd5 firstRejoinResult + else infinity + + let newRecursiveBestCost = min recursiveBestCost firstBestCost + + when (firstBestCost < recursiveBestCost) $ + logWith LogInfo ("\t->" <> show newRecursiveBestCost) + + rejoinResult ← + rejoinGraphTupleRecursive swapParams inGS inData curBestCost newRecursiveBestCost inSimAnnealParams (tail graphDataList) + let result = firstRejoinResult <> rejoinResult + + pure result + + +{- + -- Doing a conditional print like this still results in a <> exception + -- This is a really confision and cryptic error condition, + -- however it is 100% related to unsafe printing and parallelism. + -- So we gotta refactor to do logging properly and then refactor to correctly + -- enable parallism in order to fully address this class of issues! + + in if firstBestCost < recursiveBestCost + then trace ("\t->" <> show newRecursiveBestCost) result + else result +-} + +-- | getNetworkPentaltyFactor get scale network penalty for graph +getNetworkPentaltyFactor ∷ GlobalSettings → ProcessedData → VertexCost → ReducedPhylogeneticGraph → PhyG VertexCost +getNetworkPentaltyFactor inGS inData graphCost inGraph = + if LG.isEmpty $ thd5 inGraph + then pure 0.0 + else do + inGraphNetPenalty ← + if (graphType inGS == Tree) + then pure 0.0 + else -- else if (graphType inGS == HardWired) then 0.0 + + if (graphFactor inGS) == NoNetworkPenalty + then pure 0.0 + else + if (graphFactor inGS) == Wheeler2015Network + then POSW.getW15NetPenaltyFull Nothing inGS inData Nothing (GO.convertReduced2PhylogeneticGraphSimple inGraph) + else + if (graphFactor inGS) == Wheeler2023Network + then pure $ POSW.getW23NetPenaltyReduced inGraph + else error ("Network penalty type " <> (show $ graphFactor inGS) <> " is not yet implemented") + + pure $ inGraphNetPenalty / graphCost + + +{- | getBaseGraphEdges gets the edges in the base graph the the exchanged sub graphs can be rejoined +basically all edges except at root and those in the subgraph +adds original edge connection edges (those with nodes in original edge) at front (for use in "none" swap later), removing if there to +prevent redundancy if swap not "none" +-} +getBaseGraphEdges ∷ (Eq b) ⇒ LG.Node → (LG.Gr a b, [LG.LEdge b], LG.LEdge b) → [LG.LEdge b] +getBaseGraphEdges graphRoot (inGraph, edgesInSubGraph, origSiteEdge) = + if LG.isEmpty inGraph + then [] + else + let baseGraphEdges = filter ((/= graphRoot) . fst3) $ (LG.labEdges inGraph) L.\\ edgesInSubGraph + baseMatchList = filter (edgeMatch origSiteEdge) baseGraphEdges + in -- origSiteEdge : (filter ((/= graphRoot) . fst3) $ (LG.labEdges inGraph) L.\\ (origSiteEdge : edgesInSubGraph)) + + -- trace ("GBGE: " <> (show $ length baseMatchList)) + -- trace ("GBGE sub: " <> (show $ origEdge `elem` (fmap LG.toEdge edgesInSubGraph)) <> " base: " <> (show $ origEdge `elem` (fmap LG.toEdge baseGraphEdges)) <> " total: " <> (show $ origEdge `elem` (fmap LG.toEdge $ LG.labEdges inGraph)) <> "\n" <> (show origEdge) <> " sub " <> (show $ fmap LG.toEdge edgesInSubGraph) <> " base " <> (show $ fmap LG.toEdge baseGraphEdges) <> "\nTotal " <> (show $ fmap LG.toEdge $ LG.labEdges inGraph)) + + baseMatchList <> (baseGraphEdges L.\\ baseMatchList) + where + edgeMatch (a, b, _) (e, v, _) = + if e == a + then True + else + if e == b + then True + else + if v == a + then True + else + if v == b + then True + else False + + +{- | getCompatibleNonIdenticalSplits takes the number of leaves, splitGraph of the left graph, the splitGraph if the right graph, +the bitVector equality list of pruned roots, the bitvector of the root of the pruned graph on left +(this could be either since filter for identity--just to check leaf numbers) +checks that the leaf sets of the pruned subgraphs are equal, greater than 1 leaf, fewer thanm nleaves - 2, and non-identical +removed identity check fo now--so much time to do that (O(n)) may not be worth it +-} +getCompatibleNonIdenticalSplits + ∷ Int + → Bool + → BV.BitVector + → Bool +getCompatibleNonIdenticalSplits numLeaves leftRightMatch leftPrunedGraphBV + | not leftRightMatch = False + | popCount leftPrunedGraphBV < 3 = False + | popCount leftPrunedGraphBV > numLeaves - 3 = False + | otherwise = True + + +{- | exchangePrunedGraphs creates a new "splitGraph" containing both first (base) and second (pruned) graph components +both components need to have HTU and edges reindexed to be in sync, oringal edge terminal node is also reindexed and returned for limit readd distance +-} +exchangePrunedGraphs + ∷ Int + → ((DecoratedGraph, LG.Node, LG.Node, LG.Node), (DecoratedGraph, LG.Node, LG.Node, LG.Node), LG.Node) + → (DecoratedGraph, Int, Int, Int, Int) +exchangePrunedGraphs numLeaves (firstGraphTuple, secondGraphTuple, breakEdgeNode) = + if LG.isEmpty (fst4 firstGraphTuple) || LG.isEmpty (fst4 secondGraphTuple) + then + error + ("Empty graph input in exchangePrunedGraphs" <> (show (LG.isEmpty (fst4 firstGraphTuple), LG.isEmpty (fst4 secondGraphTuple)))) + else + let (firstSplitGraph, firstGraphRootIndex, _, _) = firstGraphTuple + (secondSplitGraph, _, secondPrunedGraphRootIndex, _) = secondGraphTuple + + -- get nodes and edges of firstBase + firstGraphRootLabel = fromJust $ LG.lab firstSplitGraph firstGraphRootIndex + firstGraphRootNode = (firstGraphRootIndex, firstGraphRootLabel) + (firstBaseGraphNodeList', firstBaseGraphEdgeList) = LG.nodesAndEdgesAfter firstSplitGraph [firstGraphRootNode] + + -- add in root nodes of partitions since not included in "nodesAfter" function + firstBaseGraphNodeList = firstGraphRootNode : firstBaseGraphNodeList' + + -- get nodes and edges of second pruned + secondPrunedGraphRootLabel = fromJust $ LG.lab secondSplitGraph secondPrunedGraphRootIndex + secondPrunedGraphRootNode = (secondPrunedGraphRootIndex, secondPrunedGraphRootLabel) + secondPrunedParentNode = head $ LG.labParents secondSplitGraph secondPrunedGraphRootIndex + (secondPrunedGraphNodeList', secondPrunedGraphEdgeList') = LG.nodesAndEdgesAfter secondSplitGraph [secondPrunedGraphRootNode] + + -- add root node of second pruned since not included in "nodesAfter" function + -- add in gandparent nodes of pruned and its edges to pruned graphs + secondPrunedGraphNodeList = [secondPrunedGraphRootNode, secondPrunedParentNode] <> secondPrunedGraphNodeList' + secondPrunedGraphEdgeList = (head $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) : secondPrunedGraphEdgeList' + + -- reindex base and pruned partitions (HTUs and edges) to get in sync and make combinable + -- 0 is dummy since won't be in base split + (baseGraphNodes, baseGraphEdges, numBaseHTUs, reindexedBreakEdgeNodeBase) = reindexSubGraph numLeaves 0 firstBaseGraphNodeList firstBaseGraphEdgeList breakEdgeNode + (prunedGraphNodes, prunedGraphEdges, _, _) = reindexSubGraph numLeaves numBaseHTUs secondPrunedGraphNodeList secondPrunedGraphEdgeList breakEdgeNode + + -- should always be in base graph--should be in first (base) component--if not use original node + reindexedBreakEdgeNode = + if (reindexedBreakEdgeNodeBase /= Nothing) + then fromJust reindexedBreakEdgeNodeBase + else breakEdgeNode + + -- create and reindex new split graph + newSplitGraph = LG.mkGraph (baseGraphNodes <> prunedGraphNodes) (baseGraphEdges <> prunedGraphEdges) + + -- get graph root Index, pruned root index, pruned root parent index + -- firstGraphRootIndex should not have changed in reindexing--same as numLeaves + prunedParentRootIndex = fst $ head $ (LG.getRoots newSplitGraph) L.\\ [firstGraphRootNode] + prunedRootIndex = head $ LG.descendants newSplitGraph prunedParentRootIndex + in if (length $ LG.getRoots newSplitGraph) /= 2 + then error ("Not 2 components in split graph: " <> "\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph newSplitGraph)) + else + if (length $ LG.descendants newSplitGraph prunedParentRootIndex) /= 1 + then error ("Too many children of parentPrunedNode: " <> "\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph newSplitGraph)) + else + if (length $ LG.parents secondSplitGraph secondPrunedGraphRootIndex) /= 1 + then + error + ( "Parent number not equal to 1 in node " + <> (show secondPrunedGraphRootIndex) + <> " of second graph\n" + <> (LG.prettify $ GO.convertDecoratedToSimpleGraph secondSplitGraph) + ) + else + if (length $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) /= 1 + then + error + ( "Edge incedent tor pruned graph not equal to 1 in node " + <> (show $ fmap LG.toEdge $ LG.inn secondSplitGraph secondPrunedGraphRootIndex) + <> " of second graph\n" + <> (LG.prettify $ GO.convertDecoratedToSimpleGraph secondSplitGraph) + ) + else (newSplitGraph, firstGraphRootIndex, prunedParentRootIndex, prunedRootIndex, reindexedBreakEdgeNode) + + +{- | reindexSubGraph reindexes the non-leaf nodes and edges of a subgraph to allow topological combination of subgraphs +the leaf indices are unchanges but HTUs are changes ot in order enumeration statting with an input offset +new BreakEdge is returned as a Maybe becuase may be either in base or pruned subgraphs +-} +reindexSubGraph + ∷ Int → Int → [LG.LNode VertexInfo] → [LG.LEdge b] → LG.Node → ([LG.LNode VertexInfo], [LG.LEdge b], Int, Maybe LG.Node) +reindexSubGraph numLeaves offset nodeList edgeList origBreakEdge = + if null nodeList || null edgeList + then ([], [], offset, Nothing) + else -- create map of node indices from list + + let (newNodeList, indexList) = unzip $ getPairList numLeaves offset nodeList + indexMap = MAP.fromList indexList + newEdgeList = fmap (reIndexEdge indexMap) edgeList + newBreakEdge = MAP.lookup origBreakEdge indexMap + in {- + if newBreakEdge == Nothing then error ("Map index for break edge node not found: " <> (show origBreakEdge) <> " in Map " <> (show $ MAP.toList indexMap)) + else + -} + -- trace ("RISG:" <> (show (fmap fst nodeList, fmap fst newNodeList, numLeaves)) <> " map " <> (show $ MAP.toList indexMap)) + (newNodeList, newEdgeList, 1 + (maximum $ fmap fst newNodeList) - numLeaves, newBreakEdge) + + +-- | reIndexEdge takes a map and a labelled edge and returns new indices same label edge based on map +reIndexEdge ∷ MAP.Map Int Int → LG.LEdge b → LG.LEdge b +reIndexEdge indexMap (u, v, l) = + let u' = MAP.lookup u indexMap + v' = MAP.lookup v indexMap + in if u' == Nothing || v' == Nothing + then error ("Error in map lookup in reindexEdge: " <> show (u, v)) + else (fromJust u', fromJust v', l) + + +{- | getPairList returns an original index new index lits of pairs +assumes leaf nmodes are first numleaves +-} +getPairList ∷ Int → Int → [LG.LNode VertexInfo] → [(LG.LNode VertexInfo, (Int, Int))] +getPairList numLeaves counter nodeList = + if null nodeList + then [] + else + let (firstIndex, firstLabel) = head nodeList + newLabel = firstLabel{vertName = TL.pack ("HTU" <> (show $ counter + numLeaves))} + in if firstIndex < numLeaves + then (head nodeList, (firstIndex, firstIndex)) : getPairList numLeaves counter (tail nodeList) + else ((counter + numLeaves, newLabel), (firstIndex, (counter + numLeaves))) : getPairList numLeaves (counter + 1) (tail nodeList) diff --git a/src/Search/GeneticAlgorithm.hs b/src/Search/GeneticAlgorithm.hs new file mode 100644 index 000000000..1e5f6a44f --- /dev/null +++ b/src/Search/GeneticAlgorithm.hs @@ -0,0 +1,351 @@ +{- | +Module specifying graph sGeneticAlgorithm functions +-} +module Search.GeneticAlgorithm ( + geneticAlgorithm, +) where + +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.Foldable (fold) +import GeneralUtilities +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Search.Fuse qualified as F +import Search.NetworkAddDelete qualified as N +import Search.Swap qualified as S +import Types.Types +import Utilities.LocalGraph qualified as LG + + +{- | geneticAlgorithm takes arguments and performs genetic algorithm on input graphs +the process follows several steps +1) input graphs are mutated + this step is uncharacteristically first so that is can operate on + graphs that have been "fused" (recombined) already + mutated graphs are added up to popsize + if input graphs are already at the population size, an equal number of mutants are added (exceeding input popsize) +2) graph are recombined using fusing operations +3) population undergoes selection to population size (unique graphs) + selection based on delta with best graph and severity factor on (0,Inf) 1 pure cost delta < 1 more severe, > 1 less severe + if "elitist" (default) 'best' graphs are always selected to ensure no worse. +4) operation repearts for number of generations +-} +geneticAlgorithm + ∷ GlobalSettings + → ProcessedData + → Bool + → Int + → Int + → Int + → Int + → Int + → Double + → Int + → Int + → Int + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +geneticAlgorithm inGS inData doElitist maxNetEdges keepNum popSize generations generationCounter severity recombinations stopCount stopNum inGraphList = + if null inGraphList + then return ([], 0) + else + if generationCounter == generations + then return (inGraphList, generationCounter) + else + if stopCount >= stopNum + then return (inGraphList, generationCounter) + else do + -- get elite list of best solutions + initialEliteList ← GO.selectGraphs Best (maxBound ∷ Int) 0 inGraphList + + logWith LogInfo ("Genetic algorithm generation: " <> (show generationCounter) <> "\n") + + seedList ← getRandoms + -- mutate input graphs, produces number input, limited to popsize + let action = mutateGraph inGS inData maxNetEdges + mutatedGraphList' ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` takeRandom (seedList !! 1) popSize inGraphList + + let numShort = popSize - (length mutatedGraphList') + let randList = (randomIntList $ seedList !! 2) + let graphList = takeRandom (seedList !! 3) numShort inGraphList + + -- adjust to correct populationsize if input number < popSize + mutatedGraphList ← case length mutatedGraphList' `compare` popSize of + LT → do + additionalMutated ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` graphList + pure $ mutatedGraphList' <> additionalMutated + _ → pure mutatedGraphList' + + -- get unique graphs, no point in recombining repetitions + uniqueMutatedGraphList ← GO.selectGraphs Unique (maxBound ∷ Int) 0 $ mutatedGraphList <> inGraphList + + -- recombine elite with mutated and mutated with mutated + let recombineSwap = getRandomElement (seedList !! 4) [NoSwap, NNI, SPR] -- these take too long, "tbr", "alternate"] + + -- options to join via union choices or all in fuse + -- this is ignored for now in fuse--JoinAll is what it does + let joinType = getRandomElement (seedList !! 6) [JoinAlternate, JoinAll] + + let doSteepest = True + let returnBest = False + let returnUnique = True + let singleRound = False + let fusePairs = Just recombinations + let randomPairs = True + let reciprocal = False + + -- populate SwapParams structure + let swapParams = + SwapParams + { swapType = recombineSwap + , joinType = joinType + , atRandom = True -- randomize swap order + , keepNum = (2 * popSize) + , maxMoveEdgeDist = (maxBound ∷ Int) + , steepest = doSteepest + , joinAlternate = False -- not working now + , doIA = False + , returnMutated = False + } + + (recombinedGraphList, _) ← + F.fuseAllGraphs + swapParams + inGS + inData + 0 + returnBest + returnUnique + singleRound + fusePairs + randomPairs + reciprocal + uniqueMutatedGraphList + + -- selection of graphs population + -- unique sorted on cost so getting unique with lowest cost + selectedGraphs ← GO.selectGraphs Unique popSize 0 recombinedGraphList + let newCost = snd5 $ head selectedGraphs + + -- if new graphs better cost then take those + if newCost < (snd5 $ head initialEliteList) + then + geneticAlgorithm + inGS + inData + doElitist + maxNetEdges + keepNum + popSize + generations + (generationCounter + 1) + severity + recombinations + 0 + stopNum + selectedGraphs + else do + -- if new graphs not better then add in elites to ensure monotonic decrease in cost + newGraphList ← GO.selectGraphs Unique keepNum 0 $ initialEliteList <> selectedGraphs + geneticAlgorithm + inGS + inData + doElitist + maxNetEdges + keepNum + popSize + generations + (generationCounter + 1) + severity + recombinations + (stopCount + 1) + stopNum + newGraphList + + +-- | mutateGraph mutates a graph using drift functionality +mutateGraph ∷ GlobalSettings → ProcessedData → Int → ReducedPhylogeneticGraph → PhyG ReducedPhylogeneticGraph +mutateGraph inGS inData maxNetEdges inGraph + | LG.isEmpty (fst5 inGraph) = failWithPhase Computing "Empty graph in mutateGraph" + | otherwise = + let getRandomFrom es = (`getRandomElement` es) <$> getRandom + + -- static parameter values + valAlternate = False + valDoIA = False + valDoRandomOrder = True + valJoinType = JoinAll -- keep selection of rejoins based on all possibilities + valMaxMoveEdgeDist = 10000 + valNumToKeep = 5 + valReturnMutated = True + valSteepest = True + + -- randomize simulated annealing parameters + getRandomSAParams = do + rDrift ← getRandomFrom [5, 10, 20] + rSteps ← getRandomFrom [5, 10, 20] + rMethod ← getRandomFrom [Drift, SimAnneal] + pure . Just $ + SAParams + { method = rMethod + , numberSteps = rSteps + , currentStep = 0 + , rounds = 1 + , driftAcceptEqual = 0.5 + , driftAcceptWorse = 2.0 + , -- this could be an important factor don't want too severe, but significant + driftMaxChanges = rDrift + , driftChanges = 0 + } + + -- randomize 'swap' parameters + getRandomSwapParams = do + swapType ← getRandomFrom [SPR, Alternate] + -- randomize network edit parameters + let doRandomOrder = True + -- populate SwapParams structure + pure $ + SwapParams + { swapType = swapType + , joinType = valJoinType + , atRandom = True -- randomize split and rejoin edge orders + , keepNum = valNumToKeep + , maxMoveEdgeDist = valMaxMoveEdgeDist + , steepest = valSteepest + , joinAlternate = False + , doIA = valDoIA + , returnMutated = valReturnMutated + } + + firstOrOldIfNoneExists = + pure . \case + ([], _) → inGraph + (x : _, _) → x + + mutateOption1 = do + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists =<< S.swapDriver rSwapParams inGS inData 0 [inGraph] [(rSAParams, inGraph)] + + mutateOption2 = do + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists =<< S.swapDriver rSwapParams inGS inData 0 [inGraph] [(rSAParams, inGraph)] + + mutateOption3 = do + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists + =<< N.moveAllNetEdges + inGS + inData + maxNetEdges + valNumToKeep + 0 + valReturnMutated + valSteepest + valDoRandomOrder + ([], infinity) + (rSAParams, [inGraph]) + + mutateOption4 = do + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists + =<< N.moveAllNetEdges + inGS + inData + maxNetEdges + valNumToKeep + 0 + valReturnMutated + valSteepest + valDoRandomOrder + ([], infinity) + (rSAParams, [inGraph]) + + mutateOption5 = do + rMaxRounds ← getRandomFrom [1 .. 5] + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists + =<< N.insertAllNetEdges + inGS + inData + maxNetEdges + valNumToKeep + rMaxRounds + 0 + valReturnMutated + valSteepest + valDoRandomOrder + ([], infinity) + (rSAParams, [inGraph]) + + mutateOption6 = do + rMaxRounds ← getRandomFrom [1 .. 5] + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists + =<< N.addDeleteNetEdges + inGS + inData + maxNetEdges + valNumToKeep + rMaxRounds + 0 + valReturnMutated + valSteepest + valDoRandomOrder + ([], infinity) + (rSAParams, [inGraph]) + + mutateOption7 = do + rSAParams ← getRandomSAParams + rSwapParams ← getRandomSwapParams + firstOrOldIfNoneExists + =<< N.deleteAllNetEdges + inGS + inData + maxNetEdges + valNumToKeep + 0 + valReturnMutated + valSteepest + valDoRandomOrder + ([], infinity) + (rSAParams, [inGraph]) + in do + -- randomize edit type + editType ← getRandomFrom ["swap", "netEdge"] + netEditType ← getRandomFrom ["netAdd", "netDelete", "netAddDelete"] -- , "netMove"] + case (graphType inGS, editType, netEditType) of + -- only swap mutation stuff for tree + -- (Tree, _, nEdit) | nEdit `notElem` ["netAdd", "netDelete", "netAddDelete", "netMove"] → mutateOption1 + (Tree, _, _) → mutateOption1 + -- graphs choose what type of mutation at random + (_, "swap", _) → mutateOption2 + -- move only for Hardwired + (HardWired, _, _) → mutateOption3 + -- SoftWired + (SoftWired, _, "netMove") → mutateOption4 + (SoftWired, _, "netAdd") → mutateOption5 + (SoftWired, _, "netAddDelete") → mutateOption6 + (SoftWired, _, "netDelete") → mutateOption7 + (SoftWired, _, val) → + failWithPhase Parsing $ + fold + ["Unrecognized edit type '", val, "' for sofwired network"] + otherwise → + failWithPhase Parsing $ + fold + ["Unrecognized situation " <> (show (graphType inGS, editType, netEditType))] diff --git a/src/Search/NetworkAddDelete.hs b/src/Search/NetworkAddDelete.hs new file mode 100644 index 000000000..9af853f9a --- /dev/null +++ b/src/Search/NetworkAddDelete.hs @@ -0,0 +1,2549 @@ +{- | +Module specifying graph egde adding and deleting functions +-} +module Search.NetworkAddDelete ( + deleteAllNetEdges, + insertAllNetEdges, + moveAllNetEdges, + deltaPenaltyAdjustment, + deleteNetEdge, + deleteOneNetAddAll, + addDeleteNetEdges, + getCharacterDelta, + getBlockDelta, + -- these are not used but to quiet warnings + heuristicDeleteDelta, + heuristicAddDelta, + heuristicAddDelta', +) where + +import Control.Arrow ((&&&)) +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.Bits +import Data.Foldable (fold) +import Data.Functor ((<&>)) +import Data.InfList qualified as IL +import Data.Maybe +import Data.Text.Lazy qualified as TL +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import GraphOptimization.PostOrderSoftWiredFunctions qualified as POSW +import GraphOptimization.PostOrderSoftWiredFunctionsNew qualified as NEW +import GraphOptimization.PreOrderFunctions qualified as PRE +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +{- | +'addDeleteNetEdges' is a wrapper for 'addDeleteNetEdges'' allowing for multiple simulated annealing rounds. +-} +addDeleteNetEdges + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +addDeleteNetEdges inGS inData maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = case inSimAnnealParams of + Nothing → + addDeleteNetEdges' + inGS + inData + maxNetEdges + numToKeep + maxRounds + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + Nothing + inPhyloGraphList + Just simAnneal → + let -- create list of params with unique list of random values for rounds of annealing + annealingRounds = rounds simAnneal + saParamList = replicate annealingRounds inSimAnnealParams + + -- parallel setup + action ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + action = + addDeleteNetEdges'' + inGS + inData + maxNetEdges + numToKeep + maxRounds + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + in do + addDeleteResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse action . zip saParamList $ replicate annealingRounds inPhyloGraphList + let (annealRoundsList, counterList) = unzip addDeleteResult + GO.selectGraphs Best numToKeep 0 (fold annealRoundsList) <&> \x → (x, sum counterList) + + +-- | addDeleteNetEdges'' is wrapper around addDeleteNetEdges' to use parmap +addDeleteNetEdges'' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +addDeleteNetEdges'' inGS inData maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + addDeleteNetEdges' + inGS + inData + maxNetEdges + numToKeep + maxRounds + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + +{- | addDeleteNetEdges' removes each edge and adds an edge to all possible places (or steepest) each round +until no better or additional graphs are found (or max rounds met) +call with ([], infinity) [single input graph] +doesn't have to be random, but likely to converge quickly if not +-} +addDeleteNetEdges' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +addDeleteNetEdges' inGS inData maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams = \case + [] → pure (take numToKeep curBestGraphList, counter) + -- if hit maxmimum rounds then return + inPhyloGraphList → case counter `compare` maxRounds of + EQ → pure (take numToKeep curBestGraphList, counter) + -- other wise add/delete + _ → do + -- insert edges first + (insertGraphList, _) ← + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + -- this to update randlists in SAPArams for subsequent calls + let updatedSAParamList = case inSimAnnealParams of + Nothing → [Nothing, Nothing] + _ → replicate 2 inSimAnnealParams + + -- if no better--take input for delte phase + (insertGraphList', insertGraphCost, toDeleteList) ← case insertGraphList of + [] → pure (curBestGraphList, curBestGraphCost, inPhyloGraphList) + gs → do + newList ← GO.selectGraphs Best (maxBound ∷ Int) 0 gs + pure (newList, snd5 $ head newList, newList) + + -- delete edges + (deleteGraphList, _) ← + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (insertGraphList', insertGraphCost) + (head updatedSAParamList) + toDeleteList + + -- gather beter if any + (newBestGraphList, newBestGraphCost, graphsToDoNext) ← case deleteGraphList of + [] → pure (curBestGraphList, curBestGraphCost, inPhyloGraphList) + gs → do + newDeleteGraphs ← GO.selectGraphs Best (maxBound ∷ Int) 0 gs + pure (newDeleteGraphs, snd5 $ head newDeleteGraphs, newDeleteGraphs) + + -- check is same then return + case newBestGraphCost `compare` curBestGraphCost of + EQ → pure (take numToKeep curBestGraphList, counter) + -- if better (or nothing) keep going + _ → + addDeleteNetEdges' + inGS + inData + maxNetEdges + numToKeep + maxRounds + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newBestGraphList, newBestGraphCost) + (last updatedSAParamList) + graphsToDoNext + + +-- | moveAllNetEdges is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds +moveAllNetEdges + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +moveAllNetEdges inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = case inSimAnnealParams of + Nothing → + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + Nothing + inPhyloGraphList + Just simAnneal → + let -- create list of params with unique list of random values for rounds of annealing + annealingRounds = rounds simAnneal + saParamList = replicate annealingRounds inSimAnnealParams + + -- parallel setup + action ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + action = + moveAllNetEdges'' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + in do + moveResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse action . zip saParamList $ replicate annealingRounds inPhyloGraphList + let (annealRoundsList, counterList) = unzip moveResult + GO.selectGraphs Best numToKeep 0 (fold annealRoundsList) <&> \x → (x, sum counterList) + + +-- | moveAllNetEdges'' is wrapper around moveAllNetEdges' to use parmap +moveAllNetEdges'' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +moveAllNetEdges'' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + +{- | moveAllNetEdges' removes each edge and adds an edge to all possible places (or steepest) each round +until no better or additional graphs are found +call with ([], infinity) [single input graph] +-} +moveAllNetEdges' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +moveAllNetEdges' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams = \case + [] → pure (take numToKeep curBestGraphList, counter) + firstPhyloGraph : otherPhyloGraphs + | LG.isEmpty $ fst5 firstPhyloGraph → + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + otherPhyloGraphs + firstPhyloGraph : otherPhyloGraphs → + let currentCost = min curBestGraphCost $ snd5 firstPhyloGraph + -- parallel setup + action ∷ LG.Edge → PhyG [ReducedPhylogeneticGraph] + action = deleteOneNetAddAll' inGS inData maxNetEdges numToKeep doSteepest doRandomOrder firstPhyloGraph inSimAnnealParams + in do + -- randomize order of edges to try moving + netEdgeList ← + let edges = LG.labNetEdges $ thd5 firstPhyloGraph + permutationOf + | doRandomOrder = shuffleList + | otherwise = pure + in permutationOf edges + + deleteResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse (action . LG.toEdge) netEdgeList + + let newGraphList' = fold deleteResult + newGraphList ← GO.selectGraphs Best numToKeep 0 newGraphList' + let newGraphCost = case newGraphList' of + [] → infinity + (_, c, _, _, _) : _ → c + + -- if graph is a tree no edges to delete + case netEdgeList of + [] → (firstPhyloGraph : otherPhyloGraphs, counter) <$ logWith LogInfo "\t\tGraph in move has no network edges to move--skipping\n" + e : es → case inSimAnnealParams of + Nothing → case newGraphCost `compare` currentCost of + GT → + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (firstPhyloGraph : curBestGraphList, currentCost) + inSimAnnealParams + otherPhyloGraphs + LT + | doSteepest → + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + newGraphList + LT → + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + (newGraphList <> otherPhyloGraphs) + EQ → do + -- new graph list contains the input graph if equal and filterd unique already in moveAllNetEdges + newCurSameBestList ← GO.selectGraphs Unique numToKeep 0 $ curBestGraphList <> newGraphList + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newCurSameBestList, currentCost) + inSimAnnealParams + otherPhyloGraphs + + -- sim anneal choice + Just simAnneal → + -- not sure why this was here--seems to work + let -- abstract stopping criterion to continue + (numDone, numMax) = case method simAnneal of + SimAnneal → currentStep &&& numberSteps $ simAnneal + _ → driftChanges &&& driftMaxChanges $ simAnneal + in do + uniqueGraphList ← GO.selectGraphs Unique numToKeep 0 newGraphList' + + let (annealBestCost, nextUniqueList) = case uniqueGraphList of + [] → (curBestGraphCost, []) + (_, cost, _, _, _) : more → (min curBestGraphCost cost, more) + + (acceptFirstGraph, newSAParams) ← case uniqueGraphList of + [] → pure (False, U.incrementSimAnnealParams inSimAnnealParams) + (_, c, _, _, _) : _ → U.simAnnealAccept inSimAnnealParams annealBestCost c + + case numDone `compare` numMax of + LT | acceptFirstGraph → do + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + ((head uniqueGraphList) : curBestGraphList, annealBestCost) + newSAParams + (nextUniqueList <> otherPhyloGraphs) + LT → do + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, annealBestCost) + newSAParams + (nextUniqueList <> otherPhyloGraphs) + + -- if want non-optimized list for GA or whatever + _ | returnMutated → pure (take numToKeep curBestGraphList, counter) + -- optimize list and return + _ → do + (bestMoveList', counter') ← + moveAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + False + doSteepest + doRandomOrder + ([], annealBestCost) + Nothing + $ take numToKeep curBestGraphList + bestMoveList ← GO.selectGraphs Best numToKeep 0 bestMoveList' + pure (take numToKeep bestMoveList, counter') + + +-- | (curBestGraphList, annealBestCost) is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds +insertAllNetEdges + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +insertAllNetEdges inGS inData maxNetEdges numToKeep maxRounds counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + let -- parallel setup + randAction ∷ [Int] → PhyG ([ReducedPhylogeneticGraph], Int) + randAction = + insertAllNetEdgesRand + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + (curBestGraphList, curBestGraphCost) + Nothing + inPhyloGraphList + + action ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + action = + insertAllNetEdges'' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + in if isNothing inSimAnnealParams + then -- check for multiple rounds of addition--if > 1 then need to randomize order + + if maxRounds == 1 + then + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + Nothing + inPhyloGraphList + else do + -- this odd contruction is to ensure that the correct number of annealing rounds are + -- done even though the values in the lists are ignored. Should be refactored. + -- this happened during migration to monadic getRandom + let intList = replicate maxRounds (0 ∷ Int) + let intListList = replicate maxRounds intList + insertGraphResult ← + getParallelChunkTraverse >>= \pTraverse → + randAction `pTraverse` intListList + + let (insertGraphList, counterList) = unzip insertGraphResult + -- insert functions take care of returning "better" or empty + -- should be empty if nothing better + GO.selectGraphs Best numToKeep 0 (fold insertGraphList) <&> \x → (x, sum counterList) + else + let -- create list of params with unique list of random values for rounds of annealing + annealingRounds = rounds $ fromJust inSimAnnealParams + annealParamGraphList = replicate annealingRounds inSimAnnealParams + in do + insertGraphResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse action . zip annealParamGraphList $ replicate annealingRounds inPhyloGraphList + let (annealRoundsList, counterList) = unzip insertGraphResult + let selectionType + | not returnMutated || isNothing inSimAnnealParams = Best + | otherwise = Unique + + GO.selectGraphs selectionType numToKeep 0 (fold annealRoundsList) <&> \x → (x, sum counterList) + + +-- | insertAllNetEdgesRand is a wrapper around insertAllNetEdges'' to pass unique randomLists to insertAllNetEdges' +insertAllNetEdgesRand + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → [Int] + → PhyG ([ReducedPhylogeneticGraph], Int) +insertAllNetEdgesRand inGS inData maxNetEdges numToKeep counter returnMutated doSteepest (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList _ = + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + True + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + +-- | insertAllNetEdges'' is a wrapper around insertAllNetEdges' to allow for seqParMap +insertAllNetEdges'' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +insertAllNetEdges'' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + +{- | insertAllNetEdges' adds network edges one each each round until no better or additional +graphs are found +call with ([], infinity) [single input graph] +-} +insertAllNetEdges' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +insertAllNetEdges' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams = \case + -- this logic so don't return mutated if finish insertion before hitting other stopping points + -- and don't want mutated--straight deletion on current best graphs + [] → case inSimAnnealParams of + Just _ + | not returnMutated → + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + False + doSteepest + doRandomOrder + ([], curBestGraphCost) + Nothing + (take numToKeep curBestGraphList) + _ → pure (take numToKeep curBestGraphList, counter) + firstPhyloGraph : otherPhyloGraphs → + let currentCost = min curBestGraphCost $ snd5 firstPhyloGraph + + -- check for max net edges + (_, _, _, netNodes) = LG.splitVertexList $ thd5 firstPhyloGraph + in do + logWith LogInfo ("\t\tNumber of network edges: " <> (show $ length netNodes) <> "\n") + + (newGraphList, _, newSAParams) ← + insertEachNetEdge + inGS + inData + maxNetEdges + numToKeep + doSteepest + doRandomOrder + Nothing + inSimAnnealParams + firstPhyloGraph + + bestNewGraphList ← GO.selectGraphs Best numToKeep 0 newGraphList + let newGraphCost = case bestNewGraphList of + [] → infinity + (_, c, _, _, _) : _ → c + + -- logWith LogInfo ("\t\tNumber of network edges: " <> (show $ length netNodes) <> "\n") + + case length netNodes `compare` maxNetEdges of + LT → case newGraphList of + [] → do + logWith LogInfo $ unwords ["\t\tNumber of network edges:", show $ length netNodes, "\n"] + pure (take numToKeep curBestGraphList, counter) + -- regular insert keeping best + g : gs → case inSimAnnealParams of + Nothing → + postProcessNetworkAdd + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + (newGraphList, newGraphCost) + inSimAnnealParams + netNodes + currentCost + otherPhyloGraphs + -- simulated annealing--needs new SAParams + Just _ → do + (saGraphs, saCounter) ← + postProcessNetworkAddSA + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + (newGraphList, newGraphCost) + newSAParams + netNodes + currentCost + otherPhyloGraphs + + -- if want mutated then return + case returnMutated of + True → pure (saGraphs, saCounter) + -- delete non-minimal edges if any + -- not sim anneal/drift regular optimal searching + _ → do + let annealBestCost = minimum $ fmap snd5 saGraphs + (bestList', counter') ← + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + saCounter + False + doSteepest + doRandomOrder + (saGraphs, annealBestCost) + Nothing + saGraphs + bestList ← GO.selectGraphs Best numToKeep 0 bestList' + pure (bestList, counter') + _ → do + logWith LogInfo $ unwords ["Maximum number of network edges reached:", show $ length netNodes, "\n"] + pure (take numToKeep curBestGraphList, counter) + + +{- | postProcessNetworkAddSA processes simaneal/drift +assumes SAParams are updated during return of graph list above +-} +postProcessNetworkAddSA + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [LG.LNode VertexInfo] + → VertexCost + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +postProcessNetworkAddSA inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (newGraphList, newGraphCost) inSimAnnealParams _ _ inPhyloGraphList = + -- trace ("\t\tNumber of network edges: " <> (show $ length netNodes)) ( + -- this to deal with empty list issues if nothing found + let (nextNewGraphList, firstNewGraphList) = + if (not . null) newGraphList + then (tail newGraphList, [head newGraphList]) + else ([], []) + graphsToInsert = + if doSteepest + then newGraphList + else take numToKeep $ newGraphList <> inPhyloGraphList + in -- always accept if found better + if newGraphCost < curBestGraphCost + then do + logWith LogInfo ("\t-> " <> (show newGraphCost)) + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + graphsToInsert + else -- check if hit max change/ cooling steps + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then GO.selectGraphs Unique numToKeep 0 (newGraphList <> curBestGraphList) <&> \x → (x, counter) + else -- more to do + + let annealBestCost = min curBestGraphCost newGraphCost + in do + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (firstNewGraphList <> curBestGraphList, annealBestCost) + inSimAnnealParams + (nextNewGraphList <> inPhyloGraphList) + + +-- ) + +-- | postProcessNetworkAdd prcesses non-simaneal/drift--so no updating of SAParams +postProcessNetworkAdd + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [LG.LNode VertexInfo] + → VertexCost + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +postProcessNetworkAdd inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, _) (newGraphList, newGraphCost) inSimAnnealParams _ currentCost inPhyloGraphList = case newGraphCost `compare` currentCost of + -- "steepest style descent" abandons existing list if better cost found + + -- check if graph OK--done in insert function + LT → + let graphsToInsert + | doSteepest = newGraphList + | otherwise = take numToKeep $ newGraphList <> inPhyloGraphList + in do + logWith LogInfo ("\t-> " <> show newGraphCost) + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + graphsToInsert + + -- worse graphs found--go on + GT → + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, currentCost) + inSimAnnealParams + inPhyloGraphList + -- equal cost + -- not sure if should add new graphs to queue to do edge deletion again + EQ → do + -- new graph list contains the input graph if equal and filterd unique already in insertAllNetEdges + newCurSameBestList ← GO.selectGraphs Unique numToKeep 0 $ curBestGraphList <> newGraphList + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newCurSameBestList, currentCost) + inSimAnnealParams + inPhyloGraphList + + +{- | insertEachNetEdge takes a phylogenetic graph and inserts all permissible network edges one at time +and returns unique list of new Phylogenetic Graphs and cost +even if worse--could be used for simulated annealing later +if equal returns unique graph list +-} +insertEachNetEdge + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Bool + → Bool + → Maybe VertexCost + → Maybe SAParams + → ReducedPhylogeneticGraph + → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) +insertEachNetEdge inGS inData maxNetEdges numToKeep doSteepest doRandomOrder preDeleteCost inSimAnnealParams inPhyloGraph = + if LG.isEmpty $ fst5 inPhyloGraph + then error "Empty input insertEachNetEdge graph in deleteAllNetEdges" + else + let currentCost = + if isNothing preDeleteCost + then snd5 inPhyloGraph + else fromJust preDeleteCost + + (_, _, _, netNodes) = LG.splitVertexList (thd5 inPhyloGraph) + + -- parallel stuff + action ∷ (LG.LEdge b, LG.LEdge b) → PhyG ReducedPhylogeneticGraph + action = insertNetEdge inGS inData inPhyloGraph preDeleteCost + in do + candidateNetworkEdgeList' ← getPermissibleEdgePairs inGS (thd5 inPhyloGraph) + + -- radomize pair list + candidateNetworkEdgeList ← + if doRandomOrder + then shuffleList candidateNetworkEdgeList' + else pure candidateNetworkEdgeList' + + inNetEdRList ← + insertNetEdgeRecursive + inGS + inData + maxNetEdges + doSteepest + doRandomOrder + inPhyloGraph + preDeleteCost + inSimAnnealParams + candidateNetworkEdgeList + + inNetEdRListMAP ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` candidateNetworkEdgeList + + let (newGraphList, newSAParams) = + if not doSteepest + then + let genNewSimAnnealParams = + if isNothing inSimAnnealParams + then Nothing + else U.incrementSimAnnealParams inSimAnnealParams + in -- TODO + (filter (/= emptyReducedPhylogeneticGraph) inNetEdRListMAP, genNewSimAnnealParams) + else inNetEdRList + let minCost = + if null candidateNetworkEdgeList || null newGraphList + then infinity + else minimum $ fmap snd5 newGraphList + + logWith LogInfo ("\tExamining at most " <> (show $ length candidateNetworkEdgeList) <> " candidate edge pairs" <> "\n") + + -- no network edges to insert + -- trace ("IENE: " <> (show minCost)) ( + if (length netNodes >= maxNetEdges) + then do + logWith LogInfo ("Maximum number of network edges reached: " <> (show $ length netNodes) <> "\n") + pure ([inPhyloGraph], snd5 inPhyloGraph, inSimAnnealParams) + else -- no edges to add + + if null candidateNetworkEdgeList + then do + -- trace ("IENE num cand edges:" <> (show $ length candidateNetworkEdgeList)) + pure ([inPhyloGraph], currentCost, newSAParams) + else -- single if steepest so no need to unique + + if doSteepest + then GO.selectGraphs Best numToKeep 0 newGraphList <&> \x → (x, minCost, newSAParams) + else -- "all" option needs to recurse since does all available edges at each step + -- logic is here since not in the deleteNetEdge function + + if isNothing inSimAnnealParams + then + let -- parallel stuff + insertAction + ∷ (Maybe SAParams, ReducedPhylogeneticGraph) → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) + insertAction = insertEachNetEdge' inGS inData maxNetEdges numToKeep doSteepest doRandomOrder preDeleteCost + in if minCost < currentCost + then do + let annealParamList = replicate (length newGraphList) newSAParams + -- + insertResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse insertAction $ zip annealParamList newGraphList + let (allGraphListList, costList, allSAParamList) = unzip3 insertResult + (allMinCost, allMinCostGraphs) ← case fold allGraphListList of + [] → pure (infinity, []) + gs → GO.selectGraphs Unique numToKeep 0.0 gs <&> \x → (minimum costList, x) + + pure (allMinCostGraphs, allMinCost, U.incrementSimAnnealParams $ head allSAParamList) + else GO.selectGraphs Unique numToKeep 0 newGraphList <&> \x → (x, minCost, newSAParams) + else -- SA anneal/Drift + + -- always take better + + if minCost < currentCost + then do + pure (newGraphList, minCost, newSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then do + pure ([inPhyloGraph], snd5 inPhyloGraph, inSimAnnealParams) + else -- otherwise do the anneal/Drift accept, or keep going on input graph + + do + (acceptGraph, nextSAParams) ← U.simAnnealAccept inSimAnnealParams currentCost minCost + case acceptGraph of + True → pure (newGraphList, minCost, newSAParams) + _ → + insertEachNetEdge + inGS + inData + maxNetEdges + numToKeep + doSteepest + doRandomOrder + preDeleteCost + nextSAParams + inPhyloGraph + + +-- | insertEachNetEdge' is a wrapper around insertEachNetEdge to allow for parmapping with multiple parameters +insertEachNetEdge' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Bool + → Bool + → Maybe VertexCost + → (Maybe SAParams, ReducedPhylogeneticGraph) + → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) +insertEachNetEdge' inGS inData maxNetEdges numToKeep doSteepest doRandomOrder preDeleteCost (inSimAnnealParams, inPhyloGraph) = + insertEachNetEdge inGS inData maxNetEdges numToKeep doSteepest doRandomOrder preDeleteCost inSimAnnealParams inPhyloGraph + + +{- | insertNetEdgeRecursive recursively inserts edges and returns new graph only if better +if parallel evaluated numThreads each time (steepest scenario) +-} +insertNetEdgeRecursive + ∷ GlobalSettings + → ProcessedData + → Int + → Bool + → Bool + → ReducedPhylogeneticGraph + → Maybe VertexCost + → Maybe SAParams + → [(LG.LEdge EdgeInfo, LG.LEdge EdgeInfo)] + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +insertNetEdgeRecursive inGS inData maxNetEdges doSteepest doRandomOrder inPhyloGraph preDeleteCost inSimAnnealParams inEdgePairList = + -- trace ("Edges pairs to go : " <> (show $ length edgePairList)) ( + if null inEdgePairList + then do + pure ([inPhyloGraph], inSimAnnealParams) + else -- don't want to over saturate the parallel thread system + + let {-saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + + numGraphsToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + -- firstEdgePair = head edgePairList + edgePairList = take numGraphsToExamine inEdgePairList + + -- check for max net edges + (_, _, _, netNodes) = LG.splitVertexList (thd5 inPhyloGraph) + + -- parallel seup + action ∷ (LG.LEdge b, LG.LEdge b) → PhyG ReducedPhylogeneticGraph + action = insertNetEdge inGS inData inPhyloGraph preDeleteCost + in do + -- need to check display/character trees not conical graph + -- these graph costs are "exact" or at least non-heuristic--needs to be updated when get a good heuristic + newGraphList'' ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` edgePairList + + let newGraphList' = filter (/= emptyReducedPhylogeneticGraph) newGraphList'' + newGraphList ← GO.selectGraphs Best (maxBound ∷ Int) 0 newGraphList' + let newGraphCost = snd5 $ head newGraphList + + if length netNodes >= maxNetEdges + then do + logWith LogInfo $ unwords ["Maximum number of network edges reached:", show $ length netNodes, "\n"] + pure ([inPhyloGraph], inSimAnnealParams) + else -- malformed graph--returns nothing for either regular or simAnneal/drift + + if null newGraphList' + then do + -- trace ("INER: Empty more to go : " <> (show $ length $ tail edgePairList)) + insertNetEdgeRecursive + inGS + inData + maxNetEdges + doSteepest + doRandomOrder + inPhyloGraph + preDeleteCost + inSimAnnealParams + (drop numGraphsToExamine inEdgePairList) + else -- "regular" insert, within steepest + + if isNothing inSimAnnealParams + then -- better cost + + if newGraphCost < snd5 inPhyloGraph + then do + -- cyclic check in insert edge function + -- trace ("INER: Better -> " <> (show $ snd5 newGraph)) + pure (newGraphList, inSimAnnealParams) + else -- not better + do + -- trace ("INER: Really Not Better") + insertNetEdgeRecursive + inGS + inData + maxNetEdges + doSteepest + doRandomOrder + inPhyloGraph + preDeleteCost + inSimAnnealParams + (drop numGraphsToExamine inEdgePairList) + else -- sim annealing/drift + + -- trace ("IENR:" <> (show (newGraphCost, snd5 inPhyloGraph)) <> " params: " <> (show (currentStep $ fromJust inSimAnnealParams, numberSteps $ fromJust inSimAnnealParams, driftChanges $ fromJust inSimAnnealParams, driftMaxChanges $ fromJust inSimAnnealParams))) ( + -- if better always accept + + if newGraphCost < snd5 inPhyloGraph + then do + -- cyclic check in insert edge function + -- these graph costs are "exact" or at least non-heuristic--needs to be updated when get a good heuristic + (_, nextSAParams) ← U.simAnnealAccept inSimAnnealParams (snd5 inPhyloGraph) newGraphCost + pure (newGraphList, nextSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then do + pure ([inPhyloGraph], inSimAnnealParams) + else do + -- otherwise do the anneal/Drift accept + + (acceptGraph, nextSAParams) ← U.simAnnealAccept inSimAnnealParams (snd5 inPhyloGraph) newGraphCost + case acceptGraph of + True → pure (newGraphList, nextSAParams) + _ → + insertNetEdgeRecursive + inGS + inData + maxNetEdges + doSteepest + doRandomOrder + inPhyloGraph + preDeleteCost + nextSAParams + (drop numGraphsToExamine inEdgePairList) + + +{- | insertNetEdge inserts an edge between two other edges, creating 2 new nodes and rediagnoses graph +contacts deletes 2 orginal edges and adds 2 nodes and 5 new edges +does not check any edge reasonable-ness properties +new edge directed from first to second edge +naive for now +predeletecost of edge move +no choice of graph--just makes and returns +-} +insertNetEdge + ∷ GlobalSettings + → ProcessedData + → ReducedPhylogeneticGraph + → Maybe VertexCost + → (LG.LEdge b, LG.LEdge b) + → PhyG ReducedPhylogeneticGraph +insertNetEdge inGS inData inPhyloGraph _ edgePair@((u, v, _), (u', v', _)) = + if LG.isEmpty $ thd5 inPhyloGraph + then error "Empty input phylogenetic graph in insNetEdge" + else + let inSimple = fst5 inPhyloGraph + + -- get children of u' to make sure no net children--moved to permissiable edges + -- u'ChildrenNetNodes = filter (== True) $ fmap (LG.isNetworkNode inSimple) $ LG.descendants inSimple u' + + numNodes = length $ LG.nodes inSimple + newNodeOne = (numNodes, TL.pack ("HTU" <> (show numNodes))) + newNodeTwo = (numNodes + 1, TL.pack ("HTU" <> (show $ numNodes + 1))) + newEdgeList = + [ (u, fst newNodeOne, 0.0) + , (fst newNodeOne, v, 0.0) + , (u', fst newNodeTwo, 0.0) + , (fst newNodeTwo, v', 0.0) + , (fst newNodeOne, fst newNodeTwo, 0.0) + ] + edgesToDelete = [(u, v), (u', v')] + newSimple = LG.delEdges edgesToDelete $ LG.insEdges newEdgeList $ LG.insNodes [newNodeOne, newNodeTwo] inSimple + + -- do not prune other edges if now unused + pruneEdges = False + + -- don't warn that edges are being pruned + warnPruneEdges = False + + -- graph optimization from root + startVertex = Nothing + + -- full two-pass optimization + leafGraph = LG.extractLeafGraph $ thd5 inPhyloGraph + in do + newPhyloGraph ← T.multiTraverseFullyLabelSoftWiredReduced inGS inData pruneEdges warnPruneEdges leafGraph startVertex newSimple + + -- calculates heursitic graph delta + -- (heuristicDelta, _, _, _, _) = heuristicAddDelta inGS inPhyloGraph edgePair (fst newNodeOne) (fst newNodeTwo) + heuristicDelta' <- heuristicAddDelta' inGS inPhyloGraph edgePair + + let edgeAddDelta = deltaPenaltyAdjustment inGS inPhyloGraph "add" + + let heuristicFactor = (heuristicDelta' + edgeAddDelta) / edgeAddDelta + + -- use or not Net add heuristics + let metHeuristicThreshold = not (useNetAddHeuristic inGS) || heuristicFactor < (2 / 3) + + -- remove these checks when working + isPhyloGraph ← LG.isPhylogeneticGraph newSimple + if not isPhyloGraph + then do + pure emptyReducedPhylogeneticGraph + else + if metHeuristicThreshold + then -- if (GO.parentsInChainGraph . thd5) newPhyloGraph then emptyPhylogeneticGraph + -- else + + if (snd5 newPhyloGraph <= snd5 inPhyloGraph) + then do + pure newPhyloGraph + else do + pure emptyReducedPhylogeneticGraph + else do + pure emptyReducedPhylogeneticGraph + + +-- | (curBestGraphList, annealBestCost) is a wrapper for moveAllNetEdges' allowing for multiple simulated annealing rounds +deleteAllNetEdges + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +deleteAllNetEdges inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + if isNothing inSimAnnealParams + then + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + else + let -- create list of params with unique list of random values for rounds of annealing + annealingRounds = rounds $ fromJust inSimAnnealParams + annealParamGraphList = replicate annealingRounds inSimAnnealParams + + -- parallel + action ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + action = + deleteAllNetEdges'' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + in do + deleteResult ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse action . zip annealParamGraphList $ replicate annealingRounds inPhyloGraphList + + let (annealRoundsList, counterList) = unzip deleteResult + GO.selectGraphs Best numToKeep 0 (fold annealRoundsList) <&> \x → (x, sum counterList) + + +-- | deleteAllNetEdges'' is a wrapper around deleteAllNetEdges' to allow use of seqParMap +deleteAllNetEdges'' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → (Maybe SAParams, [ReducedPhylogeneticGraph]) + → PhyG ([ReducedPhylogeneticGraph], Int) +deleteAllNetEdges'' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) (inSimAnnealParams, inPhyloGraphList) = + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inPhyloGraphList + + +{- | deleteAllNetEdges deletes network edges one each each round until no better or additional +graphs are found +call with ([], infinity) [single input graph] +-} +deleteAllNetEdges' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], Int) +deleteAllNetEdges' inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams = \case + [] → pure (take numToKeep curBestGraphList, counter) + firstPhyloGraph : otherPhyloGraphs + | LG.isEmpty $ fst5 firstPhyloGraph → + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + otherPhyloGraphs + inputPhyloGraphs@(firstPhyloGraph : otherPhyloGraphs) → do + let currentCost = min curBestGraphCost $ snd5 firstPhyloGraph + + (newGraphList', _, newSAParams) ← + deleteEachNetEdge + inGS + inData + numToKeep + doSteepest + doRandomOrder + False + inSimAnnealParams + firstPhyloGraph + + newGraphList ← GO.selectGraphs Best numToKeep 0 newGraphList' + let newGraphCost = case newGraphList of + [] → infinity + (_, c, _, _, _) : _ → c + + -- if graph is a tree no edges to delete + case LG.isTree $ fst5 firstPhyloGraph of + True → do + logWith LogInfo "\tGraph in delete network edges is tree--skipping\n" + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (firstPhyloGraph : curBestGraphList, currentCost) + inSimAnnealParams + otherPhyloGraphs + + -- is this an issue for SA? + _ → case newGraphList of + [] → pure (take numToKeep curBestGraphList, counter + 1) + g : gs → case inSimAnnealParams of + -- regular delete wihtout simulated annealing + Nothing → + postProcessNetworkDelete + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + inSimAnnealParams + inputPhyloGraphs + newGraphList + newGraphCost + currentCost + -- simulated annealing + Just simAnneal → do + (saGraphs, saCounter) ← + postProcessNetworkDeleteSA + inGS + inData + maxNetEdges + numToKeep + counter + returnMutated + doSteepest + doRandomOrder + (curBestGraphList, curBestGraphCost) + newSAParams + inputPhyloGraphs + newGraphList + newGraphCost + currentCost + + case returnMutated of + -- if want mutated then return + True → pure (saGraphs, saCounter) + -- insert non-minimal edges if any + -- not sim anneal/drift regular optimal searching + False → do + let annealBestCost = minimum $ fmap snd5 saGraphs + insertedGraphs ← + insertAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + saCounter + False + doSteepest + doRandomOrder + (saGraphs, annealBestCost) + Nothing + saGraphs + + let (bestList', counter') = insertedGraphs + bestList ← GO.selectGraphs Best numToKeep 0 bestList' + pure (bestList, counter') + + +{- | postProcessNetworkDeleteSA postprocesses results from delete actions for non-annealing/Drift network delete operations +assumes SAParams are updated during return of graph list above +-} +postProcessNetworkDeleteSA + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → VertexCost + → VertexCost + → PhyG ([ReducedPhylogeneticGraph], Int) +postProcessNetworkDeleteSA inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, curBestGraphCost) inSimAnnealParams inPhyloGraphList newGraphList newGraphCost currentCost = + -- this to deal with empty list issues if nothing found + let (nextNewGraphList, firstNewGraphList) = case newGraphList of + [] → ([], []) + g : gs → (gs, [g]) + + graphsToDelete + | doSteepest = newGraphList + | otherwise = take numToKeep $ newGraphList <> inPhyloGraphList + in -- always accept if found better + if newGraphCost < currentCost + then do + logWith LogInfo ("\t-> " <> (show newGraphCost)) + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + graphsToDelete + else -- check if hit max change/ cooling steps + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then GO.selectGraphs Unique numToKeep 0 (newGraphList <> curBestGraphList) <&> \x → (x, counter) + else -- more to do + + let annealBestCost = min curBestGraphCost newGraphCost + in do + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (firstNewGraphList <> curBestGraphList, annealBestCost) + inSimAnnealParams + (nextNewGraphList <> inPhyloGraphList) + + +-- | postProcessNetworkDelete postprocesses results from delete actions for "regular" ie non-annealing/Drift network delete operations +postProcessNetworkDelete + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Int + → Bool + → Bool + → Bool + → ([ReducedPhylogeneticGraph], VertexCost) + → Maybe SAParams + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → VertexCost + → VertexCost + → PhyG ([ReducedPhylogeneticGraph], Int) +postProcessNetworkDelete inGS inData maxNetEdges numToKeep counter returnMutated doSteepest doRandomOrder (curBestGraphList, _) inSimAnnealParams inPhyloGraphList newGraphList newGraphCost currentCost = + -- worse graphs found--go on + if newGraphCost > currentCost + then do + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + ((head inPhyloGraphList) : curBestGraphList, currentCost) + inSimAnnealParams + (tail inPhyloGraphList) + else -- "steepest style descent" abandons existing list if better cost found + + if newGraphCost < currentCost + then do + logWith LogInfo ("\t-> " <> (show newGraphCost)) + if doSteepest + then do + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + newGraphList + else do + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newGraphList, newGraphCost) + inSimAnnealParams + (newGraphList <> (tail inPhyloGraphList)) + else -- equal cost + -- not sure if should add new graphs to queue to do edge deletion again + + -- new graph list contains the input graph if equal and filterd unique already in deleteEachNetEdge + do + newCurSameBestList ← GO.selectGraphs Unique numToKeep 0.0 $ curBestGraphList <> newGraphList + deleteAllNetEdges' + inGS + inData + maxNetEdges + numToKeep + (counter + 1) + returnMutated + doSteepest + doRandomOrder + (newCurSameBestList, currentCost) + inSimAnnealParams + (tail inPhyloGraphList) + + +-- | deleteOneNetAddAll' wrapper on deleteOneNetAddAll to allow for parmap +deleteOneNetAddAll' + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Bool + → Bool + → ReducedPhylogeneticGraph + → Maybe SAParams + → LG.Edge + → PhyG [ReducedPhylogeneticGraph] +deleteOneNetAddAll' inGS inData maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph inSimAnnealParams edgeToDelete = + deleteOneNetAddAll + inGS + inData + maxNetEdges + numToKeep + doSteepest + doRandomOrder + inPhyloGraph + [edgeToDelete] + inSimAnnealParams + + +{- | deleteOneNetAddAll version deletes net edges in turn and readds-based on original cost +but this cost in graph (really not correct) but allows logic of insert edge to function better +unlike deleteOneNetAddAll' only deals with single edge deletion at a time +-} +deleteOneNetAddAll + ∷ GlobalSettings + → ProcessedData + → Int + → Int + → Bool + → Bool + → ReducedPhylogeneticGraph + → [LG.Edge] + → Maybe SAParams + → PhyG [ReducedPhylogeneticGraph] +deleteOneNetAddAll inGS inData maxNetEdges numToKeep doSteepest doRandomOrder inPhyloGraph edgeToDeleteList inSimAnnealParams = + if null edgeToDeleteList + then do + -- trace ("\tGraph has no edges to move---skipping") + pure [inPhyloGraph] + else + if LG.isEmpty $ thd5 inPhyloGraph + then error "Empty graph in deleteOneNetAddAll" + else do + -- trace ("DONAA-New: " <> (show $ snd5 inPhyloGraph) <> " Steepest:" <> (show doSteepest)) ( + logWith + LogInfo + ("Moving " <> (show $ length edgeToDeleteList) <> " network edges, current best cost: " <> (show $ snd5 inPhyloGraph) <> "\n") + -- start with initial graph cost + let inGraphCost = snd5 inPhyloGraph + + -- get deleted simple graphs and bool for changed + delGraphBoolPair ← deleteNetworkEdge (fst5 inPhyloGraph) (head edgeToDeleteList) + + -- no change in network structure + if snd delGraphBoolPair == False + then do + deleteOneNetAddAll + inGS + inData + maxNetEdges + numToKeep + doSteepest + doRandomOrder + inPhyloGraph + (tail edgeToDeleteList) + inSimAnnealParams + else + let simpleGraphToInsert = fst delGraphBoolPair + + (_, _, _, curNetNodes) = LG.splitVertexList simpleGraphToInsert + curNumNetNodes = length curNetNodes + + -- optimize deleted graph and update cost with input cost + leafGraph = LG.extractLeafGraph $ thd5 inPhyloGraph + in do + graphToInsert ← T.multiTraverseFullyLabelSoftWiredReduced inGS inData False False leafGraph Nothing simpleGraphToInsert -- `using` PU.myParListChunkRDS + + -- keep same cost and just keep better--check if better than original later + let graphToInsert' = T.updatePhylogeneticGraphCostReduced graphToInsert inGraphCost + + insertedGraphTripleList ← + insertEachNetEdge + inGS + inData + (curNumNetNodes + 1) + numToKeep + doSteepest + doRandomOrder + Nothing + inSimAnnealParams + graphToInsert' + + let newMinimumCost = snd3 insertedGraphTripleList + + let newBestGraphs = filter ((== newMinimumCost) . snd5) $ fst3 insertedGraphTripleList + + -- trace ("DONAA-New: " <> (show (inGraphCost, fmap snd5 graphsToInsert, fmap snd5 graphsToInsert', newMinimumCost))) ( + if newMinimumCost < inGraphCost + then do + -- trace ("DONA-> ") + pure newBestGraphs + else do + deleteOneNetAddAll + inGS + inData + maxNetEdges + numToKeep + doSteepest + doRandomOrder + inPhyloGraph + (tail edgeToDeleteList) + inSimAnnealParams + + +{- | getPermissibleEdgePairs takes a DecoratedGraph and returns the list of all pairs +of edges that can be joined by a network edge and meet all necessary conditions +-} + +-- add in other conditions +-- reproducable--ie not tree noide with two net node children--other stuff +getPermissibleEdgePairs ∷ GlobalSettings → DecoratedGraph → PhyG [(LG.LEdge EdgeInfo, LG.LEdge EdgeInfo)] +getPermissibleEdgePairs inGS inGraph = + if LG.isEmpty inGraph + then error "Empty input graph in isEdgePairPermissible" + else + let edgeList = LG.labEdges inGraph + + -- edges to potentially conenct + edgePairs = cartProd edgeList edgeList + + -- get coeval node pairs in existing grap + coevalNodeConstraintList = LG.coevalNodePairs inGraph + + -- parallel + -- action :: (LNode a, LNode a) -> (LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]) + action = LG.addBeforeAfterToPair inGraph + in do + actionPar ← getParallelChunkMap + let coevalNodeConstraintList' = actionPar action coevalNodeConstraintList + let edgeAction ∷ (LG.LEdge EdgeInfo, LG.LEdge EdgeInfo) → Bool + edgeAction = isEdgePairPermissible inGraph coevalNodeConstraintList' + + edgePar ← getParallelChunkMap + let edgeTestList = edgePar edgeAction edgePairs + -- PU.seqParMap (parStrategy $ lazyParStrat inGS) (isEdgePairPermissible inGraph coevalNodeConstraintList') edgePairs -- `using` PU.myParListChunkRDS + + let pairList = fmap fst $ filter ((== True) . snd) $ zip edgePairs edgeTestList + + -- trace ("Edge Pair list :" <> (show $ fmap f pairList) <> "\n" + -- <> "GPEP\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph)) + pure pairList + + +-- where f (a, b) = (LG.toEdge a, LG.toEdge b) + +{- | isEdgePairPermissible takes a graph and two edges, coeval contraints, and tests whether a +pair of edges can be linked by a new edge and satify three consitions: + 1) neither edge is a network edge + 2) one edge cannot be "before" while the other is "after" in any of the constraint pairs + 3) neither edge is an ancestor or descendent edge of the other (tested via bv of nodes) +the result should apply to a new edge in either direction +new edge to be creted is edge1 -> ege2 +Could change to LG.isPhylogeneticGraph +-} +isEdgePairPermissible + ∷ DecoratedGraph + → [(LG.LNode a, LG.LNode a, [LG.LNode a], [LG.LNode a], [LG.LNode a], [LG.LNode a])] + → (LG.LEdge EdgeInfo, LG.LEdge EdgeInfo) + → Bool +isEdgePairPermissible inGraph constraintList (edge1@(u, v, _), edge2@(u', v', _)) = + if LG.isEmpty inGraph + then error "Empty input graph in isEdgePairPermissible" + else + if u == u' + then False + else + if v == v' + then False + else -- equality implied in above two + -- else if LG.toEdge edge1 == LG.toEdge edge2 then False + + if (LG.isNetworkNode inGraph u) || (LG.isNetworkNode inGraph u') + then False + else + if (LG.isNetworkLabEdge inGraph edge1) || (LG.isNetworkLabEdge inGraph edge2) + then False + else + if not (LG.meetsAllCoevalConstraintsNodes (fmap removeNodeLabels constraintList) edge1 edge2) + then False + else + if (isAncDescEdge inGraph edge1 edge2) + then False + else -- get children of u' to make sure no net children + + if (not . null) $ filter (== True) $ fmap (LG.isNetworkNode inGraph) $ LG.descendants inGraph u' + then False + else True + where + removeNodeLabels (a, b, c, d, e, f) = (LG.toNode a, LG.toNode b, fmap LG.toNode c, fmap LG.toNode d, fmap LG.toNode e, fmap LG.toNode f) + + +{- | isAncDescEdge takes a graph and two edges and examines whethe either edge is the ancestor or descendent of the other +this is done via examination of teh bitvector fields of the node +-} +isAncDescEdge ∷ DecoratedGraph → LG.LEdge EdgeInfo → LG.LEdge EdgeInfo → Bool +isAncDescEdge inGraph (a, _, _) (b, _, _) = + if LG.isEmpty inGraph + then error "Empty input graph in isAncDescEdge" + else + let aBV = bvLabel $ fromJust $ LG.lab inGraph a + bBV = bvLabel $ fromJust $ LG.lab inGraph b + in -- trace ("IADE: " <> (show (a, aBV, b, bBV, aBV .&. bBV))) ( + if aBV .&. bBV == aBV + then True + else + if aBV .&. bBV == bBV + then True + else False + +{- These heuristics do not seem tom work well at all-} + +{- | heuristic add delta' based on new display tree and delta from existing costs by block--assumming < 0 +original edges subtree1 ((u,l),(u,v)) and subtree2 ((u',v'),(u',l')) create a directed edge from +subtree 1 to subtree 2 via +1) Add node x and y, delete edges (u,v) and (u'v') and create edges (u,x), (x,v), (u',y), and (y,v') +2) real cost is the sum of block costs that are lower for new graph versus older +3) heuristic is when new subtree is lower than existing block by block + so calculate d(u,v) + d(u',v') [existing display tree cost estimate] compared to + d((union u,v), v') - d(u'.v') [New display tree cost estimate] over blocks + so blockDelta = if d((union u,v), v') - d(u'.v') < d(u,v) + d(u',v') then d((union u,v), v') - d(u'.v') + else 0 [existing better] + graphDelta = egdeAddDelta (separately calculated) + sum [blockDelta] + Compare to real delta to check behavior +original subtrees u -> (a,v) and u' -> (v',b) +-} +heuristicAddDelta' ∷ GlobalSettings → ReducedPhylogeneticGraph → (LG.LEdge b, LG.LEdge b) → PhyG VertexCost +heuristicAddDelta' _ inPhyloGraph ((u, v, _), (u', v', _)) = + if LG.isEmpty (fst5 inPhyloGraph) + then error "Empty graph in heuristicAddDelta" + else + let a = head $ filter (/= v) $ LG.descendants (fst5 inPhyloGraph) u + b = head $ filter (/= v') $ LG.descendants (fst5 inPhyloGraph) u' + blockTrees = + fmap V.fromList $ + fmap (GO.getDecoratedDisplayTreeList (thd5 inPhyloGraph)) $ + V.zip (fth5 inPhyloGraph) $ + V.fromList [0 .. (V.length (fft5 inPhyloGraph) - 1)] + action :: (V.Vector DecoratedGraph, V.Vector CharInfo) → PhyG VertexCost + action = getBlockDelta (u, v, u', v', a, b) + in do + actionPar <- getParallelChunkTraverse + -- blockDeltaV <- V.zipWith (getBlockDelta (u, v, u', v', a, b)) (zip blockTrees (fft5 inPhyloGraph)) + blockDeltaL <- actionPar action (V.toList $ V.zip blockTrees (fft5 inPhyloGraph)) + pure $ sum blockDeltaL + + +{- | getBlockDelta determines the network add delta for each block (vector of characters) +if existing is lower then zero, else (existing - new) +-} +getBlockDelta + ∷ (LG.Node, LG.Node, LG.Node, LG.Node, LG.Node, LG.Node) → (V.Vector DecoratedGraph, V.Vector CharInfo) → PhyG VertexCost +getBlockDelta (u, v, u', v', a, b) (inCharV, charInfoV) = + if V.null inCharV + then error "Empty charcter tree vector in getBlockDelta" + else + let action :: (DecoratedGraph, CharInfo) → (VertexCost, VertexCost) + action = getCharacterDelta (u, v, u', v', a, b) + in do + actionPar ← getParallelChunkMap + let result = actionPar action (V.toList $ V.zip inCharV charInfoV) + let (charNewV, charExistingV) = unzip result + let newCost = sum charNewV + let existingCost = sum charExistingV + if (newCost < existingCost) + then pure $ newCost - existingCost + else pure $ 0.0 + +{- | getCharacterDelta determines the network add delta for each block (vector of characters) +if existing is lower then zero, else (existing - new) + calculate d(u,v) + d(u',v') [existing display tree cost estimate] compared to + d((union u,v), v') - d(u'.v') +need to use final assignemnts--so set prelim to final first +Since a distance--no need for No chanage cost adjustment +-} +getCharacterDelta + ∷ (LG.Node, LG.Node, LG.Node, LG.Node, LG.Node, LG.Node) → (DecoratedGraph, CharInfo) → (VertexCost, VertexCost) +getCharacterDelta (_, v, _, v', a, b) (inCharTree, charInfo) = + -- getCharacterDelta (u,v,u',v',a,b) inCharTree charInfo = + let doIA = False + noChangeCostAdjust = False --this since want a distance not a median + -- filterGaps = True + -- uData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree u + vData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree v + vFinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree v + -- u'Data = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree u' + v'Data = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree v' + v'FinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree v' + aData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree a + aFinalData = V.head $ V.head $ PRE.setPreliminaryToFinalStates $ vertData $ fromJust $ LG.lab inCharTree a + bData = V.head $ V.head $ vertData $ fromJust $ LG.lab inCharTree b + + -- unionUV = M.union2Single doIA filterGaps uData vData charInfo + -- (_,dUV) = M.median2Single doIA uData vData charInfo + -- dUV = vertexCost $ fromJust $ LG.lab inCharTree u + -- dU'V' = vertexCost $ fromJust $ LG.lab inCharTree u' + -- (_, dUnionUVV') = M.median2Single doIA unionUV v'Data charInfo + + (newX, dVV') = M.median2Single noChangeCostAdjust doIA vFinalData v'FinalData charInfo + (_, dAX) = M.median2Single noChangeCostAdjust doIA aFinalData newX charInfo + (_, dAV) = M.median2Single noChangeCostAdjust doIA aData vData charInfo + (_, dV'B) = M.median2Single noChangeCostAdjust doIA v'Data bData charInfo + in -- trace ("GCD: " <> (show (dVV' + dAX, dAV + dV'B))) ( + (dVV' + dAX, dAV + dV'B) + + +-- if dUnionUVV' - dU'V' < dU'V' then dUnionUVV' - dU'V' +-- else 0.0 +-- ) + +{- | heuristicAddDelta takes the existing graph, edge pair, and new nodes to create and makes +the new nodes and reoptimizes starting nodes of two edges. Returns cost delta based on +previous and new node resolution caches +returns cost delta and the reoptimized nodes for use in incremental optimization +original edges (to be deleted) (u,v) and (u',v'), n1 inserted in (u,v) and n2 inserted into (u',v') +creates (n1, n2), (u,n1), (n1,v), (u',n2), (n2, v') +-} +heuristicAddDelta + ∷ GlobalSettings + → ReducedPhylogeneticGraph + → (LG.LEdge b, LG.LEdge b) + → LG.Node + → LG.Node + → PhyG (VertexCost, LG.LNode VertexInfo, LG.LNode VertexInfo, LG.LNode VertexInfo, LG.LNode VertexInfo) +heuristicAddDelta inGS inPhyloGraph ((u, v, _), (u', v', _)) n1 n2 = + if LG.isEmpty (fst5 inPhyloGraph) + then error "Empty graph in heuristicAddDelta" + else + if graphType inGS == HardWired + then do + uvVertData <- M.makeEdgeDataM False True (thd5 inPhyloGraph) (fft5 inPhyloGraph) (u, v, dummyEdge) + uvPrimeData <- M.makeEdgeDataM False True (thd5 inPhyloGraph) (fft5 inPhyloGraph) (u', v', dummyEdge) + postOrderVertex <- POSW.createVertexDataOverBlocks inGS uvVertData uvPrimeData (fft5 inPhyloGraph) [] + let hardDelta = V.sum $ fmap V.sum $ fmap (fmap snd) postOrderVertex + + pure (hardDelta, dummyNode, dummyNode, dummyNode, dummyNode) + else -- softwired + + let uLab = fromJust $ LG.lab (thd5 inPhyloGraph) u + uPrimeLab = fromJust $ LG.lab (thd5 inPhyloGraph) u' + vLab = fromJust $ LG.lab (thd5 inPhyloGraph) v + vPrimeLab = fromJust $ LG.lab (thd5 inPhyloGraph) v' + uPrimeOtherChild = head $ filter ((/= v') . fst) $ LG.labDescendants (thd5 inPhyloGraph) (u', uPrimeLab) + uOtherChild = head $ filter ((/= v) . fst) $ LG.labDescendants (thd5 inPhyloGraph) (u, uLab) + in do + -- direction first edge to second so n2 is outdegree 1 to v' + let n2Lab = NEW.getOutDegree1VertexSoftWired n2 vPrimeLab (thd5 inPhyloGraph) [n2] + uPrimeLabAfter ← NEW.getOutDegree2VertexSoftWired inGS (fft5 inPhyloGraph) u' (n2, n2Lab) uPrimeOtherChild (thd5 inPhyloGraph) + n1Lab ← NEW.getOutDegree2VertexSoftWired inGS (fft5 inPhyloGraph) n1 (v, vLab) (n2, n2Lab) (thd5 inPhyloGraph) + uLabAfter ← NEW.getOutDegree2VertexSoftWired inGS (fft5 inPhyloGraph) u uOtherChild (n1, n1Lab) (thd5 inPhyloGraph) + + -- cost of resolutions + let (_, uCostBefore) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLab) + let (_, uPrimeCostBefore) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLab) + let (_, uCostAfter) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLabAfter) + let (_, uPrimeCostAfter) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLabAfter) + + let addNetDelta = (uCostAfter - uCostBefore) + (uPrimeCostAfter - uPrimeCostBefore) + -- trace ("HAD: " <> (show (uCostAfter, uCostBefore, uPrimeCostAfter, uPrimeCostBefore)) <> " -> " <> (show addNetDelta)) $ + if null (filter ((/= v') . fst) $ LG.labDescendants (thd5 inPhyloGraph) (u', uPrimeLab)) + || null (filter ((/= v) . fst) $ LG.labDescendants (thd5 inPhyloGraph) (u, uLab)) + then pure (infinity, dummyNode, dummyNode, dummyNode, dummyNode) + else -- this should not happen--should try to create new edges from children of net edges + + if (length $ LG.descendants (thd5 inPhyloGraph) u) < 2 || (length $ LG.descendants (thd5 inPhyloGraph) u') < 2 + then error ("Outdegree 1 nodes in heuristicAddDelta") + else pure (addNetDelta, (u, uLabAfter), (u', uPrimeLabAfter), (n1, n1Lab), (n2, n2Lab)) + + +{- | deltaPenaltyAdjustment takes number of leaves and Phylogenetic graph and returns a heuristic graph penalty for adding a single network edge +if Wheeler2015Network, this is based on all changes affecting a single block (most permissive) and Wheeler 2015 calculation of penalty +if PMDLGraph -- graph complexity +if NoNetworkPenalty then 0 +modification "add" or subtrct to calculate delta +always delta is positive--whether neg or pos is deltermined when used +-} +deltaPenaltyAdjustment + ∷ GlobalSettings + → ReducedPhylogeneticGraph + → String + → VertexCost +deltaPenaltyAdjustment inGS inGraph modification = + -- trace ("DPA: entering: " <> (show $ graphFactor inGS)) ( + let numLeaves = numDataLeaves inGS + edgeCostModel = graphFactor inGS + (_, _, _, networkNodeList) = LG.splitVertexList (fst5 inGraph) + in if edgeCostModel == NoNetworkPenalty + then -- trace ("DPA: No penalty") + 0.0 + else -- else if length networkNodeList == 0 then + -- trace ("DPA: No cost") + -- 0.0 + + if edgeCostModel == Wheeler2015Network + then (snd5 inGraph) / (fromIntegral $ 2 * ((2 * numLeaves) - 2) + (2 * (length networkNodeList))) + else + if (optimalityCriterion inGS) `elem` [PMDL, SI] + then -- trace ("DPW: In PMDLGraph") ( + + if graphType inGS == Tree + then fst $ IL.head (graphComplexityList inGS) + else + if graphType inGS == SoftWired + then + let currentComplexity = fst $ (graphComplexityList inGS) IL.!!! (length networkNodeList) + nextComplexity = + if modification == "add" + then fst $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) + 1) + else + if modification == "delete" + then fst $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) - 1) + else error ("SoftWired deltaPenaltyAdjustment modification not recognized: " <> modification) + in abs (currentComplexity - nextComplexity) + else + if graphType inGS == HardWired + then + let currentComplexity = snd $ (graphComplexityList inGS) IL.!!! (length networkNodeList) + nextComplexity = + if modification == "add" + then snd $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) + 1) + else + if modification == "delete" + then snd $ (graphComplexityList inGS) IL.!!! ((length networkNodeList) - 1) + else error ("HardWired deltaPenaltyAdjustment modification not recognized: " <> modification) + in abs (currentComplexity - nextComplexity) + else error ("Graph type not yet implemented: " <> (show $ graphType inGS)) + else -- ) + + if edgeCostModel == Wheeler2023Network + then -- same as W15 for heuristic penalty for single edge + (snd5 inGraph) / (fromIntegral $ 2 * ((2 * numLeaves) - 2) + (2 * (length networkNodeList))) + else error ("Network edge cost model not yet implemented: " <> (show edgeCostModel)) + + +-- ) + +{- | deleteEachNetEdge takes a phylogenetic graph and deletes all network edges one at time +and returns best list of new Phylogenetic Graphs and cost +even if worse--could be used for simulated annealing later +if equal returns unique graph list +-} +deleteEachNetEdge + ∷ GlobalSettings + → ProcessedData + → Int + → Bool + → Bool + → Bool + → Maybe SAParams + → ReducedPhylogeneticGraph + → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) +deleteEachNetEdge inGS inData numToKeep doSteepest doRandomOrder force inSimAnnealParams inPhyloGraph = + -- trace ("DENE start") ( + if LG.isEmpty $ thd5 inPhyloGraph + then do + pure ([], infinity, inSimAnnealParams) -- error "Empty input phylogenetic graph in deleteAllNetEdges" + else + let currentCost = snd5 inPhyloGraph + + -- potentially randomize order of list + networkEdgeList' = LG.netEdges $ thd5 inPhyloGraph + + --- parallel + action ∷ LG.Edge → PhyG ReducedPhylogeneticGraph + action = deleteNetEdge inGS inData inPhyloGraph force + in do + networkEdgeList ← + if not doRandomOrder + then pure networkEdgeList' + else shuffleList networkEdgeList' + + delNetEdgeList ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` networkEdgeList + + deleteNetEdgeRecursiveList ← deleteNetEdgeRecursive inGS inData inPhyloGraph force inSimAnnealParams networkEdgeList + let (newGraphList, newSAParams) = + if not doSteepest + then (delNetEdgeList, U.incrementSimAnnealParams inSimAnnealParams) + else deleteNetEdgeRecursiveList + + bestCostGraphList ← filter ((/= infinity) . snd5) <$> GO.selectGraphs Best numToKeep 0 newGraphList + let minCost = + if null bestCostGraphList + then infinity + else minimum $ fmap snd5 bestCostGraphList + + -- no network edges to delete + if null networkEdgeList + then do + logWith LogInfo ("\tNo network edges to delete" <> "\n") + pure ([inPhyloGraph], currentCost, inSimAnnealParams) + else -- single if steepest so no neeed to unique--and have run through all options (including SA stuff) via recursive call + + if doSteepest + then do + pure (newGraphList, minCost, newSAParams) + else -- "all" option needs to recurse since does all available edges at each step + -- logic is here since not in the deleteNetEdge function + + if isNothing inSimAnnealParams + then + if minCost < currentCost + then -- trace ("DENE--Delete net edge return:" <> (show (minCost,length uniqueCostGraphList))) ( + + let annealParamList = replicate (length bestCostGraphList) newSAParams + + -- parallel + deleteAction ∷ (Maybe SAParams, ReducedPhylogeneticGraph) → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) + deleteAction = deleteEachNetEdge' inGS inData numToKeep doSteepest doRandomOrder force + in do + -- TODO + nextGraphDoubleList ← + getParallelChunkTraverse >>= \pTraverse → + deleteAction `pTraverse` zip annealParamList bestCostGraphList + + let newMinCost = minimum $ fmap snd3 nextGraphDoubleList + let newGraphListBetter = filter ((== newMinCost) . snd5) $ concatMap fst3 nextGraphDoubleList + + GO.selectGraphs Unique numToKeep 0 newGraphListBetter <&> \x → (x, newMinCost, newSAParams) + else do + pure (bestCostGraphList, currentCost, newSAParams) + else -- SA anneal/Drift + + -- always take better + + if minCost < currentCost + then do + pure (bestCostGraphList, minCost, newSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then do + pure ([inPhyloGraph], snd5 inPhyloGraph, inSimAnnealParams) + else do + -- otherwise do the anneal/Drift accept, or keep going on input graph + (acceptGraph, nextSAParams) ← U.simAnnealAccept inSimAnnealParams currentCost minCost + case acceptGraph of + True → pure (bestCostGraphList, minCost, newSAParams) + _ → deleteEachNetEdge inGS inData numToKeep doSteepest doRandomOrder force nextSAParams inPhyloGraph + + +{- | deleteEachNetEdge' is a wrapper around deleteEachNetEdge to allow for zipping new random seeds for each +replicate +-} +deleteEachNetEdge' + ∷ GlobalSettings + → ProcessedData + → Int + → Bool + → Bool + → Bool + → (Maybe SAParams, ReducedPhylogeneticGraph) + → PhyG ([ReducedPhylogeneticGraph], VertexCost, Maybe SAParams) +deleteEachNetEdge' inGS inData numToKeep doSteepest doRandomOrder force (inSimAnnealParams, inPhyloGraph) = + deleteEachNetEdge inGS inData numToKeep doSteepest doRandomOrder force inSimAnnealParams inPhyloGraph + + +{- | deleteNetEdgeRecursive like deleteEdge, deletes an edge (checking if network) and rediagnoses graph +contacts in=out=1 edges and removes node, reindexing nodes and edges +except returns on first better (as opposed to do all deletes first) +or sim annleal/drift +-} +deleteNetEdgeRecursive + ∷ GlobalSettings + → ProcessedData + → ReducedPhylogeneticGraph + → Bool + → Maybe SAParams + → [LG.Edge] + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +deleteNetEdgeRecursive inGS inData inPhyloGraph force inSimAnnealParams inEdgeToDeleteList = + if null inEdgeToDeleteList + then do + pure ([], inSimAnnealParams) + else + let {- Unclear if should adjust to number of rounds if already limiting to graphsSteepest value + saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + numGraphsToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + -- edgeToDelete = head inEdgeToDeleteList + edgeToDeleteList = take numGraphsToExamine inEdgeToDeleteList + + leafGraph = LG.extractLeafGraph $ thd5 inPhyloGraph + + -- prune other edges if now unused + pruneEdges = False + + -- don't warn that edges are being pruned + warnPruneEdges = False + + -- graph optimization from root + startVertex = Nothing + + -- parallel + deleteAction ∷ LG.Edge → PhyG (SimpleGraph, Bool) + deleteAction = deleteNetworkEdge (fst5 inPhyloGraph) + + softTraverse ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + softTraverse = T.multiTraverseFullyLabelSoftWiredReduced inGS inData pruneEdges warnPruneEdges leafGraph startVertex + + hardTraverse ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + hardTraverse = T.multiTraverseFullyLabelHardWiredReduced inGS inData leafGraph startVertex + in do + -- calls general funtion to remove network graph edge + simpleGraphList' ← + getParallelChunkTraverse >>= \pTraverse → + deleteAction `pTraverse` edgeToDeleteList + + let simpleGraphList = fmap fst $ filter ((== True) . snd) simpleGraphList' + -- \$ PU.seqParMap (parStrategy $ lazyParStrat inGS) (deleteNetworkEdge (fst5 inPhyloGraph)) edgeToDeleteList + + -- delSimple = GO.contractIn1Out1EdgesRename $ LG.delEdge edgeToDelete $ fst5 inPhyloGraph + -- full two-pass optimization + let leafGraph = LG.extractLeafGraph $ thd5 inPhyloGraph + + -- (heuristicDelta, _, _) = heuristicDeleteDelta inGS inPhyloGraph edgeToDelete + -- heuristicDelta = 0.0 + + -- can treat as negative for delete + -- edgeAddDelta = deltaPenaltyAdjustment inGS inPhyloGraph "delete" + + newPhyloGraphList' ← case graphType inGS of + SoftWired → + getParallelChunkTraverse >>= \pTraverse → + softTraverse `pTraverse` simpleGraphList + HardWired → + getParallelChunkTraverse >>= \pTraverse → + hardTraverse `pTraverse` simpleGraphList + val → + failWithPhase Computing $ + fold + ["Unsupported graph type '", show val, "' in deleteNetEdge. Must be soft- or hard-wired"] + + newPhyloGraphList ← GO.selectGraphs Best (maxBound ∷ Int) 0 newPhyloGraphList' + + -- if not modified return original graph + -- This check seems to be issue with delete not functioning properly + if null simpleGraphList + then do + pure ([inPhyloGraph], inSimAnnealParams) + else -- forcing delete for move + + if force + then do + -- trace ("DNERec forced") + pure (newPhyloGraphList, inSimAnnealParams) + else -- regular search not sim anneal/drift + + if (isNothing inSimAnnealParams) + then -- return if better + + if (snd5 $ head newPhyloGraphList) < (snd5 inPhyloGraph) + then do + -- trace ("DNERec Better -> " <> (show $ snd5 newPhyloGraph)) + pure (newPhyloGraphList, inSimAnnealParams) + else do + -- need to update edge list for new graph + -- potentially randomize order of list + deleteNetEdgeRecursive inGS inData inPhyloGraph force inSimAnnealParams (drop numGraphsToExamine inEdgeToDeleteList) + else -- sim anneal/drift + + -- if better always accept + + if (snd5 $ head newPhyloGraphList) < (snd5 inPhyloGraph) + then do + -- these graph costs are "exact" or at least non-heuristic--needs to be updated when get a good heuristic + (_, nextSAParams) ← U.simAnnealAccept inSimAnnealParams (snd5 inPhyloGraph) . snd5 $ head newPhyloGraphList + pure (newPhyloGraphList, nextSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then do + pure ([inPhyloGraph], inSimAnnealParams) + else do + -- otherwise do the anneal/Drift accept + + (acceptGraph, nextSAParams) ← U.simAnnealAccept inSimAnnealParams (snd5 inPhyloGraph) . snd5 $ head newPhyloGraphList + case acceptGraph of + True → pure (newPhyloGraphList, nextSAParams) + _ → deleteNetEdgeRecursive inGS inData inPhyloGraph force nextSAParams (drop numGraphsToExamine inEdgeToDeleteList) + + +{- | deleteEdge deletes an edge (checking if network) and rediagnoses graph +contacts in=out=1 edgfes and removes node, reindexing nodes and edges +naive for now +force requires reoptimization no matter what--used for net move +skipping heuristics for now--awful +calls deleteNetworkEdge that has various graph checks +-} +deleteNetEdge + ∷ GlobalSettings + → ProcessedData + → ReducedPhylogeneticGraph + → Bool + → LG.Edge + → PhyG ReducedPhylogeneticGraph +deleteNetEdge inGS inData inPhyloGraph force edgeToDelete = + if LG.isEmpty $ thd5 inPhyloGraph + then error "Empty input phylogenetic graph in deleteNetEdge" + else + if not (LG.isNetworkEdge (fst5 inPhyloGraph) edgeToDelete) + then error ("Edge to delete: " <> (show edgeToDelete) <> " not in graph:\n" <> (LG.prettify $ fst5 inPhyloGraph)) + else do + -- trace ("DNE: " <> (show edgeToDelete)) ( + (delSimple, wasModified) ← deleteNetworkEdge (fst5 inPhyloGraph) edgeToDelete + + -- delSimple = GO.contractIn1Out1EdgesRename $ LG.delEdge edgeToDelete $ fst5 inPhyloGraph + + -- prune other edges if now unused + let pruneEdges = False + + -- don't warn that edges are being pruned + let warnPruneEdges = False + + -- graph optimization from root + let startVertex = Nothing + + -- (heuristicDelta, _, _) = heuristicDeleteDelta inGS inPhyloGraph edgeToDelete + + -- edgeAddDelta = deltaPenaltyAdjustment inGS inPhyloGraph "delete" + + -- full two-pass optimization--cycles checked in edge deletion function + let leafGraph = LG.extractLeafGraph $ thd5 inPhyloGraph + + newPhyloGraph ← + if (graphType inGS == SoftWired) + then T.multiTraverseFullyLabelSoftWiredReduced inGS inData pruneEdges warnPruneEdges leafGraph startVertex delSimple + else + if (graphType inGS == HardWired) + then T.multiTraverseFullyLabelHardWiredReduced inGS inData leafGraph startVertex delSimple + else error "Unsupported graph type in deleteNetEdge. Must be soft or hard wired" + -- check if deletino modified graph + if not wasModified + then do + pure inPhyloGraph + else -- else if force || (graphType inGS) == HardWired then + + if force + then do + -- trace ("DNE forced") + pure newPhyloGraph + else -- if (heuristicDelta / (dynamicEpsilon inGS)) - edgeAddDelta < 0 then newPhyloGraph + + if (snd5 newPhyloGraph) < (snd5 inPhyloGraph) + then do + -- trace ("DNE Better: " <> (show $ snd5 newPhyloGraph)) + pure newPhyloGraph + else do + -- trace ("DNE Not Better: " <> (show $ snd5 newPhyloGraph)) + pure inPhyloGraph + + +-- ) + +{- | deleteNetworkEdge deletes a network edges from a simple graph +retuns newGraph if can be modified or input graph with Boolean to tell if modified +and contracts, reindexes/names internaledges/veritices around deletion +can't raise to general graph level due to vertex info +in edges (b,a) (c,a) (a,d), deleting (a,b) deletes node a, inserts edge (b,d) +contacts node c since now in1out1 vertex +checks for chained network edges--can be created by progressive deletion +checks for cycles now +shouldn't need for check for creating a node with children that are both network nodes +since that would require that condition coming in and shodl be there--ie checked earlier in addition and input +-} +deleteNetworkEdge ∷ SimpleGraph → LG.Edge → PhyG (SimpleGraph, Bool) +deleteNetworkEdge inGraph inEdge@(p1, nodeToDelete) = + if LG.isEmpty inGraph + then error ("Cannot delete edge from empty graph") + else + let childrenNodeToDelete = LG.descendants inGraph nodeToDelete + parentsNodeToDelete = LG.parents inGraph nodeToDelete + -- parentNodeToKeep = head $ filter (/= p1) parentsNodeToDelete + -- newEdge = (parentNodeToKeep, head childrenNodeToDelete, 0.0) + -- newGraph = LG.insEdge newEdge $ LG.delNode nodeToDelete inGraph + newGraph = LG.delEdge inEdge inGraph + -- newGraph' = GO.contractIn1Out1EdgesRename newGraph + + -- conversion as if input--see if affects length + -- newGraph'' = GO.convertGeneralGraphToPhylogeneticGraph False newGraph + newGraph'' = GO.contractIn1Out1EdgesRename newGraph + in -- error conditions and creation of chained network edges (forbidden in phylogenetic graph--causes resolutoin cache issues) + if length childrenNodeToDelete /= 1 + then error ("Cannot delete non-network edge in deleteNetworkEdge: (1)" <> (show inEdge) <> "\n" <> (LG.prettyIndices inGraph)) + else + if length parentsNodeToDelete /= 2 + then error ("Cannot delete non-network edge in deleteNetworkEdge (2): " <> (show inEdge) <> "\n" <> (LG.prettyIndices inGraph)) + else -- warning if chained on input, skip if chained net edges in output + + if (LG.isNetworkNode inGraph p1) + then do + -- error ("Error: Chained network nodes in deleteNetworkEdge : " <> (show inEdge) <> "\n" <> (LG.prettyIndices inGraph) <> " skipping") + logWith LogWarn ("\tWarning: Chained network nodes in deleteNetworkEdge skipping deletion" <> "\n") + pure (LG.empty, False) + else + if LG.hasChainedNetworkNodes newGraph'' + then do + logWith LogWarn ("\tWarning: Chained network nodes in deleteNetworkEdge skipping deletion (2)" <> "\n") + pure (LG.empty, False) + else + if LG.isEmpty newGraph'' + then do + pure (LG.empty, False) + else do + {-trace ("DNE: Edge to delete " <> (show inEdge) <> " cnd " <> (show childrenNodeToDelete) <> " pnd " <> (show parentsNodeToDelete) <> " pntk " <> (show parentNodeToKeep) + <> " ne " <> (show newEdge) <> "\nInGraph: " <> (LG.prettyIndices inGraph) <> "\nNewGraph: " <> (LG.prettyIndices newGraph) <> "\nNewNewGraph: " + <> (LG.prettyIndices newGraph')) -} + pure (newGraph'', True) + + +{- | heuristicDeleteDelta takes the existing graph, edge to delete, +reoptimizes starting nodes of two created edges. Returns cost delta based on +previous and new node resolution caches +delete n1 -> n2, create u -> v, u' -> v' +assumes original is edge n1 -> n2, u' -> (n2, X), n1 -> (n2,v), u (n1,Y) +-} +heuristicDeleteDelta + ∷ GlobalSettings + → ReducedPhylogeneticGraph + → LG.Edge + → PhyG (VertexCost, LG.LNode VertexInfo, LG.LNode VertexInfo) +heuristicDeleteDelta inGS inPhyloGraph (n1, n2) = + if LG.isEmpty (fst5 inPhyloGraph) + then error "Empty graph in heuristicDeleteDelta" + else + if graphType inGS == HardWired + then -- ensures delete--will always be lower or equakl cost if delete edge from HardWired + pure (-1, dummyNode, dummyNode) + else + let inGraph = thd5 inPhyloGraph + u = head $ LG.parents inGraph n1 + u' = head $ filter (/= n1) $ LG.parents inGraph n2 + v' = head $ LG.descendants inGraph n2 + v = head $ filter (/= n2) $ LG.descendants inGraph n1 + + uLab = fromJust $ LG.lab inGraph u + uPrimeLab = fromJust $ LG.lab inGraph u' + vLab = fromJust $ LG.lab inGraph v + vPrimeLab = fromJust $ LG.lab inGraph v' + + uOtherChild = head $ filter ((/= n1) . fst) $ LG.labDescendants inGraph (u, uLab) + uPrimeOtherChild = head $ filter ((/= n2) . fst) $ LG.labDescendants inGraph (u', uPrimeLab) + in do + -- skip over netnodes + uLabAfter ← NEW.getOutDegree2VertexSoftWired inGS (fft5 inPhyloGraph) u (v, vLab) uOtherChild inGraph + uPrimeLabAfter ← NEW.getOutDegree2VertexSoftWired inGS (fft5 inPhyloGraph) u' (v', vPrimeLab) uPrimeOtherChild inGraph + + -- cost of resolutions + let (_, uCostBefore) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLab) + let (_, uPrimeCostBefore) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLab) + let (_, uCostAfter) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uLabAfter) + let (_, uPrimeCostAfter) = NEW.extractDisplayTrees (Just (-1)) False (vertexResolutionData uPrimeLabAfter) + + let addNetDelta = uCostAfter - uCostBefore + uPrimeCostAfter - uPrimeCostBefore + + -- this should not happen--should try to crete new edges from children of net edges + if null (LG.parents inGraph n1) + || null (filter (/= n1) $ LG.parents inGraph n2) + || null (LG.descendants inGraph n2) + || null (filter (/= n2) $ LG.descendants inGraph n1) + || null (filter ((/= n2) . fst) $ LG.labDescendants inGraph (u', uPrimeLab)) + || null (filter ((/= n1) . fst) $ LG.labDescendants inGraph (u, uLab)) + then pure (infinity, dummyNode, dummyNode) + else -- this should not happen--should try to crete new edges from children of net edges + + if (length (LG.parents inGraph n1) /= 1) + || (length (LG.parents inGraph n2) /= 2) + || (length (LG.descendants inGraph n2) /= 1) + || (length (LG.descendants inGraph n1) /= 2) + then error ("Graph malformation in numbers of parents and children in heuristicDeleteDelta") + else pure (addNetDelta, (u, uLabAfter), (u', uPrimeLabAfter)) + +{- +-- | insertNetEdgeBothDirections calls insertNetEdge for both u -> v and v -> u new edge orientations +insertNetEdgeBothDirections :: GlobalSettings -> ProcessedData -> ReducedPhylogeneticGraph -> Maybe VertexCost -> (LG.LEdge b, LG.LEdge b) -> [ReducedPhylogeneticGraph] +insertNetEdgeBothDirections inGS inData inPhyloGraph preDeleteCost (u,v) = fmap (insertNetEdge inGS inData inPhyloGraph preDeleteCost) [(u,v), (v,u)] +-} diff --git a/src/Search/Refinement.hs b/src/Search/Refinement.hs new file mode 100644 index 000000000..d30df895b --- /dev/null +++ b/src/Search/Refinement.hs @@ -0,0 +1,902 @@ +{-- +This is a memory pig and rather slow. +Probably because it doens't just take the first "better" graph. + +For now--graphsteepest set to 1 to reduce memory footprint. +--} + +{- | +Module controlling graph refinement functions +-} +module Search.Refinement ( + refineGraph, + netEdgeMaster, + fuseGraphs, + swapMaster, + geneticAlgorithmMaster, +) where + +import Commands.Verify qualified as VER +import Control.Monad (when) +import Control.Monad.Random.Class +import Data.Char +import Data.Foldable (fold) +import Data.Functor (($>), (<$), (<&>)) +import Data.Maybe +import GeneralUtilities +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Search.Fuse qualified as F +import Search.GeneticAlgorithm qualified as GA +import Search.NetworkAddDelete qualified as N +import Search.SwapMaster qualified as SM +import Text.Read +import Types.Types +import Utilities.Utilities as U + + +{- | swapMaster moved to Search.SwapMaster due to very long (>20') compile times +with --enalble-profinling +-} +swapMaster + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +swapMaster = SM.swapMaster + + +-- | driver for overall refinement +refineGraph + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +refineGraph inArgs inGS inData inGraphList = + if null inGraphList + then do + logWith LogInfo "No graphs input to refine\n" + pure [] + else + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "refineGraph" fstArgList VER.refineArgList + in -- check for valid command options + if not checkCommandList + then failWithPhase Parsing ("Unrecognized command in 'GeneticAlgorithm': " <> show inArgs) + else + let doNetAdd = any ((== "netadd") . fst) lcArgList + doNetDel = any ((== "netdel") . fst) lcArgList || any ((== "netdelete") . fst) lcArgList + doNetAddDel = any ((== "netadddel") . fst) lcArgList || any ((== "netadddelete") . fst) lcArgList + doNetMov = any ((== "netmove") . fst) lcArgList + doGenAlg = any ((== "ga") . fst) lcArgList || any ((== "geneticalgorithm") . fst) lcArgList + in -- network edge edits + if doNetAdd || doNetDel || doNetAddDel || doNetMov + then netEdgeMaster inArgs inGS inData inGraphList + else -- genetic algorithm + geneticAlgorithmMaster inArgs inGS inData inGraphList + + +-- error "No refinement operation specified" + +{- | geneticAlgorithmMaster takes arguments and performs genetic algorithm on input graphs +the process follows several steps +1) input graphs are mutated + this step is uncharacteristically first so that is can operate on + graphs that have been "fused" (recombined) already + mutated graphs are added up to popsize + if input graphs are already at the population size, an equal number of mutants are added (exceeding input popsize) +2) graph are recombined using fusing operations +3) population undergoes selection to population size (unique graphs) + selection based on delta with best graph and severity factor on (0,Inf) 1 pure cost delta < 1 more severe, > 1 less severe + if "elitist" (default) 'best' graphs are always selected to ensure no worse. +4) operation repearts for number of generations +-} +geneticAlgorithmMaster + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +geneticAlgorithmMaster inArgs inGS inData inGraphList + | null inGraphList = logWith LogInfo "No graphs to undergo Genetic Algorithm\n" $> [] + | otherwise = do + logWith LogInfo $ + fold + [ "Genetic Algorithm operating on population of " + , show $ length inGraphList + , " input graph(s) with cost range (" + , show . minimum $ fmap snd5 inGraphList + , "," + , show . maximum $ fmap snd5 inGraphList + , ")" + , "\n" + ] + + -- process args + let (doElitist, keepNum, popSize, generations, severity, recombinations, maxNetEdges, stopNum) = getGeneticAlgParams inArgs + + (newGraphList, generationCounter) ← + GA.geneticAlgorithm + inGS + inData + doElitist + (fromJust maxNetEdges) + (fromJust keepNum) + (fromJust popSize) + (fromJust generations) + 0 + (fromJust severity) + (fromJust recombinations) + 0 + stopNum + inGraphList + + logWith LogInfo $ + fold + [ "\tGenetic Algorithm: " + , show $ length newGraphList + , " resulting graphs with cost range (" + , show . minimum $ fmap snd5 newGraphList + , "," + , show . maximum $ fmap snd5 newGraphList + , ")" + , " after " + , show generationCounter + , " generation(s)" + , "\n" + ] + + pure newGraphList + + +-- | getGeneticAlgParams returns paramlist from arglist +getGeneticAlgParams + ∷ [Argument] + → (Bool, Maybe Int, Maybe Int, Maybe Int, Maybe Double, Maybe Int, Maybe Int, Int) +getGeneticAlgParams inArgs = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "geneticalgorithm" fstArgList VER.geneticAlgorithmArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'GeneticAlgorithm': " <> show inArgs) + else + let keepList = filter ((== "keep") . fst) lcArgList + keepNum + | length keepList > 1 = + errorWithoutStackTrace + ("Multiple 'keep' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null keepList = Just 10 + | otherwise = readMaybe (snd $ head keepList) ∷ Maybe Int + + popSizeList = filter ((== "popsize") . fst) lcArgList + popSize + | length popSizeList > 1 = + errorWithoutStackTrace + ("Multiple 'popsize' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null popSizeList = Just 20 + | otherwise = readMaybe (snd $ head popSizeList) ∷ Maybe Int + + generationsList = filter ((== "generations") . fst) lcArgList + generations + | length generationsList > 1 = + errorWithoutStackTrace + ("Multiple 'generations' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null generationsList = Just 5 + | otherwise = readMaybe (snd $ head generationsList) ∷ Maybe Int + + severityList = filter ((== "severity") . fst) lcArgList + severity + | length severityList > 1 = + errorWithoutStackTrace + ("Multiple 'severity' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null severityList = Just 1.0 + | otherwise = readMaybe (snd $ head severityList) ∷ Maybe Double + + recombinationsList = filter ((== "recombinations") . fst) lcArgList + recombinations + | length recombinationsList > 1 = + errorWithoutStackTrace + ("Multiple 'recombinations' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null recombinationsList = Just 100 + | otherwise = readMaybe (snd $ head recombinationsList) ∷ Maybe Int + + maxNetEdgesList = filter ((== "maxnetedges") . fst) lcArgList + maxNetEdges + | length maxNetEdgesList > 1 = + errorWithoutStackTrace + ("Multiple 'maxNetEdges' number specifications in ccommand--can have only one: " <> show inArgs) + | null maxNetEdgesList = Just 5 + | otherwise = readMaybe (snd $ head maxNetEdgesList) ∷ Maybe Int + + stopList = filter ((== "stop") . fst) lcArgList + stopNum + | length stopList > 1 = + errorWithoutStackTrace + ("Multiple 'stop' number specifications in genetic algorithm command--can have only one: " <> show inArgs) + | null stopList = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head stopList) ∷ Maybe Int + + -- in case want to make it an option + -- doElitist' = any ((== "nni") . fst) lcArgList + doElitist = True + in -- check arguments + if isNothing keepNum + then errorWithoutStackTrace ("Keep specification not an integer in Genetic Algorithm: " <> show (head keepList)) + else + if isNothing popSize + then errorWithoutStackTrace ("PopSize specification not an integer in Genetic Algorithm: " <> show (head popSizeList)) + else + if isNothing generations + then errorWithoutStackTrace ("Generations specification not an integer in Genetic Algorithm: " <> show (head generationsList)) + else + if isNothing severity + then errorWithoutStackTrace ("Severity factor specification not an integer in Genetic Algorithm: " <> show (head severityList)) + else + if isNothing recombinations + then + errorWithoutStackTrace + ("Severity factor specification not an integer in Genetic Algorithm: " <> show (head recombinationsList)) + else + if isNothing stopNum + then + errorWithoutStackTrace + ("Stop specification not an integer or not found in Genetic Algorithm (e.g. stop:10) " <> show (head stopList)) + else (doElitist, keepNum, popSize, generations, severity, recombinations, maxNetEdges, fromJust stopNum) + + +{- | fuseGraphs is a wrapper for graph recombination +the functions make heavy use of branch swapping functions in Search.Swap +-} +fuseGraphs + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +fuseGraphs inArgs inGS inData inGraphList + | null inGraphList = logWith LogMore "Fusing--skipped: No graphs to fuse\n" $> [] + | length inGraphList == 1 = logWith LogMore "Fusing--skipped: Need > 1 graphs to fuse\n" $> inGraphList + -- \| graphType inGS == HardWired = trace "Fusing hardwired graphs is currenty not implemented" inGraphList + | otherwise = do + -- process args for fuse placement + (keepNum, maxMoveEdgeDist, fusePairs, lcArgList) ← getFuseGraphParams inArgs + + -- steepest off by default due to wanteing to check all addition points + let doSteepest' = any ((== "steepest") . fst) lcArgList + let doAll = any ((== "all") . fst) lcArgList + + let doSteepest + | (not doSteepest' && not doAll) = True + | (doSteepest' && doAll) = True + | doAll = False + | otherwise = doSteepest' + + -- readdition options, specified as swap types + -- no alternate or nni for fuse--not really meaningful + + let swapType + | any ((== "tbr") . fst) lcArgList = TBR + | any ((== "spr") . fst) lcArgList = SPR + | otherwise = NoSwap + + -- turn off union selection of rejoin--default to do both, union first + let joinType + | any ((== "joinall") . fst) lcArgList = JoinAll + | any ((== "joinpruned") . fst) lcArgList = JoinPruned + | otherwise = JoinAlternate + + -- set implied alignment swapping + let doIA' = any ((== "ia") . fst) lcArgList + let getDoIA + | (graphType inGS /= Tree) && doIA' = logWith LogWarn "\tIgnoring 'IA' swap option for non-Tree\n" $> False + | otherwise = pure doIA' + + let returnBest = any ((== "best") . fst) lcArgList + let returnUnique = (not returnBest) || (any ((== "unique") . fst) lcArgList) + let doSingleRound = any ((== "once") . fst) lcArgList + let randomPairs = any ((== "atrandom") . fst) lcArgList + let fusePairs' + | fusePairs == Just (maxBound ∷ Int) = Nothing + | otherwise = fusePairs + + -- this for exchange or one dirction transfer of sub-graph--one half time for noreciprocal + let reciprocal' = any ((== "reciprocal") . fst) lcArgList + let notReciprocal = any ((== "notreciprocal") . fst) lcArgList + let reciprocal + | not reciprocal' = False + | notReciprocal = False + | otherwise = True + + -- populate SwapParams structure + let swapParams withIA = + SwapParams + { swapType = swapType + , joinType = joinType + , atRandom = randomPairs -- really same as swapping at random not so important here + , keepNum = (fromJust keepNum) + , maxMoveEdgeDist = (2 * fromJust maxMoveEdgeDist) + , steepest = doSteepest + , joinAlternate = False -- join prune alternates--turned off for now + , doIA = withIA + , returnMutated = False -- no SA/Drift swapping in Fuse + } + logWith LogInfo $ + unwords + [ "Fusing" + , show $ length inGraphList + , "input graph(s) with minimum cost" + , show . minimum $ fmap snd5 inGraphList + , "\n" + ] + + withIA ← getDoIA + + -- perform graph fuse operations + -- sets graphsSteepest to 1 to reduce memory footprintt + (newGraphList, counterFuse) ← + F.fuseAllGraphs + (swapParams withIA) + inGS -- (inGS{graphsSteepest = 1}) + inData + 0 + returnBest + returnUnique + doSingleRound + fusePairs' + randomPairs + reciprocal + inGraphList + + logWith LogMore $ + unwords + [ "\tAfter fusing:" + , show $ length newGraphList + , "resulting graphs with minimum cost" + , show . minimum $ fmap snd5 newGraphList + , " after fuse rounds (total): " + , show counterFuse + , "\n" + ] + pure newGraphList + + +-- | getFuseGraphParams returns fuse parameters from arglist +getFuseGraphParams + ∷ [Argument] + → PhyG (Maybe Int, Maybe Int, Maybe Int, [(String, String)]) +getFuseGraphParams inArgs = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "fuse" fstArgList VER.fuseArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'fuse': " <> show inArgs) + else + let keepList = filter ((== "keep") . fst) lcArgList + keepNum + | null keepList = Just 10 + | otherwise = readMaybe (snd $ head keepList) ∷ Maybe Int + + -- this is awkward syntax but works to check fpor multiple swapping limit commands + moveLimitList = filter (not . null) (snd <$> filter ((`notElem` ["keep", "pairs"]) . fst) lcArgList) + maxMoveEdgeDist + | null moveLimitList = Just ((maxBound ∷ Int) `div` 3) + | otherwise = readMaybe (head moveLimitList) ∷ Maybe Int + + pairList = filter ((== "pairs") . fst) lcArgList + fusePairs + | null pairList = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head pairList) ∷ Maybe Int + in -- check arguments + if length keepList > 1 + then failWithPhase Parsing ("Multiple 'keep' number specifications in fuse command--can have only one: " <> show inArgs) + else + if length moveLimitList > 1 + then + failWithPhase + Parsing + ("Multiple maximum edge distance number specifications in fuse command--can have only one (e.g. spr:2): " <> show inArgs) + else + if length pairList > 1 + then failWithPhase Parsing ("Multiple 'pair' number specifications in fuse command--can have only one: " <> show inArgs) + else + if isNothing keepNum + then failWithPhase Parsing ("Keep specification not an integer in swap: " <> show (head keepList)) + else + if isNothing maxMoveEdgeDist + then + failWithPhase + Parsing + ("Maximum edge move distance specification in fuse command not an integer (e.g. spr:2): " <> show (head moveLimitList)) + else + if isNothing fusePairs + then failWithPhase Parsing ("fusePairs specification not an integer in fuse: " <> show (head pairList)) + else pure (keepNum, maxMoveEdgeDist, fusePairs, lcArgList) + + +-- | netEdgeMaster overall master for add/delete net edges +netEdgeMaster + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +netEdgeMaster inArgs inGS inData inGraphList + | null inGraphList = [] <$ logWith LogInfo "No graphs to edit network edges\n" + | otherwise = case graphType inGS of + Tree → + inGraphList + <$ logWith + LogWarn + "\tCannot perform network edge operations on graphtype tree--set graphtype to SoftWired or HardWired\n" + -- process args for netEdgeMaster + _ → + let ( keepNum + , steps' + , annealingRounds' + , driftRounds' + , acceptEqualProb + , acceptWorseFactor + , maxChanges + , maxNetEdges + , lcArgList + , maxRounds + ) = getNetEdgeParams inArgs + doNetAdd = any ((== "netadd") . fst) lcArgList + doNetDelete = any ((== "netdel") . fst) lcArgList || any ((== "netdelete") . fst) lcArgList + doAddDelete = any ((== "netadddel") . fst) lcArgList || any ((== "netadddelete") . fst) lcArgList + doMove = any ((== "netmove") . fst) lcArgList + doSteepest' = any ((== "steepest") . fst) lcArgList + doAll = any ((== "all") . fst) lcArgList + + -- do steepest default + doSteepest + | (not doSteepest' && not doAll) = True + | doSteepest' && doAll = True + | otherwise = doSteepest' + + -- randomized order default + doRandomOrder' + | any ((== "atrandom") . fst) lcArgList = True + | any ((== "inorder") . fst) lcArgList = False + | otherwise = True + + -- simulated annealing parameters + -- returnMutated to return annealed Graphs before swapping fir use in Genetic Algorithm + doAnnealing = any ((== "annealing") . fst) lcArgList + + doDrift = any ((== "drift") . fst) lcArgList + + -- ensures random edge order for drift/annealing + doRandomOrder = doRandomOrder' || doDrift || doAnnealing + + returnMutated = any ((== "returnmutated") . fst) lcArgList + + getSimAnnealParams + | not doAnnealing && not doDrift = Nothing + | otherwise = + let steps = max 3 (fromJust steps') + annealingRounds = case annealingRounds' of + Just v | v >= 1 → v + _ → 1 + + driftRounds = case driftRounds' of + Just v | v >= 1 → v + _ → 1 + + saMethod + | doDrift && doAnnealing = Drift + | doDrift = Drift + | otherwise = SimAnneal + + equalProb = case acceptEqualProb of + Nothing → 0 + Just v | v > 1 → 1 + Just v | v < 0 → 0 + Just v → v + + worseFactor = max (fromJust acceptWorseFactor) 0.0 + + changes = case maxChanges of + Just v | v >= 0 → v + _ → 15 + in Just $ + SAParams + { method = saMethod + , numberSteps = steps + , currentStep = 0 + , rounds = max annealingRounds driftRounds + , driftAcceptEqual = equalProb + , driftAcceptWorse = worseFactor + , driftMaxChanges = changes + , driftChanges = 0 + } + + -- parallel stuff + insertAction ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + insertAction = + N.insertAllNetEdges + inGS + inData + (fromJust maxNetEdges) + (fromJust keepNum) + (fromJust maxRounds) + 0 + returnMutated + doSteepest + doRandomOrder + ([], infinity) + + deleteAction ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + deleteAction = + N.deleteAllNetEdges + inGS + inData + (fromJust maxNetEdges) + (fromJust keepNum) + 0 + returnMutated + doSteepest + doRandomOrder + ([], infinity) + + moveAction ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + moveAction = + N.moveAllNetEdges + inGS + inData + (fromJust maxNetEdges) + (fromJust keepNum) + 0 + returnMutated + doSteepest + doRandomOrder + ([], infinity) + + addDeleteAction ∷ (Maybe SAParams, [ReducedPhylogeneticGraph]) → PhyG ([ReducedPhylogeneticGraph], Int) + addDeleteAction = + N.addDeleteNetEdges + inGS + inData + (fromJust maxNetEdges) + (fromJust keepNum) + (fromJust maxRounds) + 0 + returnMutated + doSteepest + doRandomOrder + ([], infinity) + + simAnnealParams = getSimAnnealParams + -- create simulated annealing random lists uniquely for each fmap + newSimAnnealParamList = U.generateUniqueRandList (length inGraphList) simAnnealParams + in do + -- perform add/delete/move operations + {- + bannerText + | isJust simAnnealParams = + let editString + | doNetAdd = " add) " + | doNetDelete = " delete) " + | doAddDelete = " add/delete) " + | otherwise = " move " + in if method (fromJust simAnnealParams) == SimAnneal + then + "Simulated Annealing (Network edge" + <> editString + <> show (rounds $ fromJust simAnnealParams) + <> " rounds " + <> show (length inGraphList) + <> " with " + <> show (numberSteps $ fromJust simAnnealParams) + <> " cooling steps " + <> show (length inGraphList) + <> " input graph(s) at minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " keeping maximum of " + <> show (fromJust keepNum) + <> " graphs" + else + "Drifting (Network edge" + <> editString + <> show (rounds $ fromJust simAnnealParams) + <> " rounds " + <> show (length inGraphList) + <> " with " + <> show (numberSteps $ fromJust simAnnealParams) + <> " cooling steps " + <> show (length inGraphList) + <> " input graph(s) at minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " keeping maximum of " + <> show (fromJust keepNum) + <> " graphs" + | doNetDelete = + ( "Network edge delete on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + ) + | doNetAdd = + ( "Network edge add on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " and maximum " + <> show (fromJust maxRounds) + <> " rounds" + ) + | doAddDelete = + ( "Network edge add/delete on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " and maximum " + <> show (fromJust maxRounds) + <> " rounds" + ) + | doMove = + ( "Network edge move on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + ) + | otherwise = "" + -} + when (doDrift && doAnnealing) $ + logWith + LogWarn + "\tSpecified both Simulated Annealing (with temperature steps) and Drifting (without)--defaulting to drifting.\n" + when (graphType inGS == HardWired && doNetDelete) $ + logWith + LogInfo + "Deleting edges from hardwired graphs will trivially remove all network edges to a tree, skipping\n" + when (graphType inGS == HardWired && doAddDelete) $ + logWith + LogInfo + "Adding and Deleting edges to/from hardwired graphs will trivially remove all network edges to a tree, skipping\n" + -- logWith LogInfo $ bannerText <> "\n" + (newGraphList, counterAdd) ← do + if doNetAdd + then + if graphType inGS == HardWired + then -- logWith LogWarn "Adding edges to hardwired graphs will always increase cost, skipping" + pure (inGraphList, 0) + else do + logWith LogInfo ( "Network edge add on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " limiting maximum number of network edge additions to " + <> show (fromJust maxNetEdges) + <> " and maximum " + <> show (fromJust maxRounds) + <> " rounds" + <> "\n" + ) + graphPairList1 ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse insertAction . zip newSimAnnealParamList $ (: []) <$> inGraphList + + let (graphListList, counterList) = unzip graphPairList1 + GO.selectGraphs Unique (fromJust keepNum) 0 (fold graphListList) <&> \x → (x, sum counterList) + else pure (inGraphList, 0) + + (newGraphList', counterDelete) ← + if doNetDelete + then + if graphType inGS == HardWired + then -- logWith LogWarn ("Deleting edges from hardwired graphs will trivially remove all network edges to a tree, skipping") + pure (newGraphList, 0) + else do + logWith LogInfo ( "Network edge delete on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> "\n" + ) + graphPairList2 ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse deleteAction . zip newSimAnnealParamList $ (: []) <$> newGraphList + + let (graphListList, counterList) = unzip graphPairList2 + GO.selectGraphs Unique (fromJust keepNum) 0 (fold graphListList) <&> \x → (x, sum counterList) + else -- ) + pure (newGraphList, 0) + + (newGraphList'', counterMove) ← + if doMove + then do + logWith LogInfo ( "Network edge move on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> "\n" + ) + graphPairList3 ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse moveAction . zip newSimAnnealParamList $ pure <$> newGraphList' + + let (graphListList, counterList) = unzip graphPairList3 + GO.selectGraphs Unique (fromJust keepNum) 0 (fold graphListList) <&> \x → (x, sum counterList) + else pure (newGraphList', 0) + + (newGraphList''', counterAddDelete) ← + if doAddDelete + then + if graphType inGS == HardWired + then -- logWith LogInfo "Adding and Deleting edges to/from hardwired graphs will trivially remove all network edges to a tree, skipping" + pure (newGraphList'', 0) + else do + logWith LogInfo ( "Network edge delete on " + <> show (length inGraphList) + <> " input graph(s) with minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " limiting maximum number of network edge additions to " + <> show (fromJust maxNetEdges) + <> "\n" + ) + graphPairList4 ← + getParallelChunkTraverse >>= \pTraverse → + pTraverse addDeleteAction $ zip newSimAnnealParamList $ (: []) <$> newGraphList'' + + let (graphListList, counterList) = unzip graphPairList4 + GO.selectGraphs Unique (fromJust keepNum) 0 (fold graphListList) <&> \x → (x, sum counterList) + else pure (newGraphList'', 0) + + resultGraphList ← case newGraphList''' of + [] → pure inGraphList + _ → GO.selectGraphs Unique (fromJust keepNum) 0 newGraphList''' + + logWith + LogInfo + ( "\tAfter network edge add/delete/move: " + <> show (length resultGraphList) + <> " resulting graphs at cost " + <> show (minimum $ fmap snd5 resultGraphList) + <> " with add/delete/move rounds (total): " + <> show counterAdd + <> " Add, " + <> show counterDelete + <> " Delete, " + <> show counterMove + <> " Move, " + <> show counterAddDelete + <> " AddDelete" + <> "\n" + ) + pure resultGraphList + + +-- | getNetEdgeParams returns net edge cparameters from argument list +getNetEdgeParams + ∷ [Argument] + → (Maybe Int, Maybe Int, Maybe Int, Maybe Int, Maybe Double, Maybe Double, Maybe Int, Maybe Int, [(String, String)], Maybe Int) +getNetEdgeParams inArgs = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "netEdgeMaster" fstArgList VER.netEdgeArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'netEdge': " <> show inArgs) + else + let keepList = filter ((== "keep") . fst) lcArgList + keepNum + | length keepList > 1 = + errorWithoutStackTrace ("Multiple 'keep' number specifications in netEdge command--can have only one: " <> show inArgs) + | null keepList = Just 10 + | otherwise = readMaybe (snd $ head keepList) ∷ Maybe Int + + -- simulated anealing options + stepsList = filter ((== "steps") . fst) lcArgList + steps' + | length stepsList > 1 = + errorWithoutStackTrace + ("Multiple annealing steps value specifications in netEdge command--can have only one (e.g. steps:10): " <> show inArgs) + | null stepsList = Just 10 + | otherwise = readMaybe (snd $ head stepsList) ∷ Maybe Int + + annealingList = filter ((== "annealing") . fst) lcArgList + annealingRounds' + | length annealingList > 1 = + errorWithoutStackTrace + ("Multiple 'annealing' rounds number specifications in netEdge command--can have only one: " <> show inArgs) + | null annealingList = Just 1 + | otherwise = readMaybe (snd $ head annealingList) ∷ Maybe Int + + -- drift options + driftList = filter ((== "drift") . fst) lcArgList + driftRounds' + | length driftList > 1 = + errorWithoutStackTrace ("Multiple 'drift' rounds number specifications in netEdge command--can have only one: " <> show inArgs) + | null driftList = Just 1 + | otherwise = readMaybe (snd $ head driftList) ∷ Maybe Int + + acceptEqualList = filter ((== "acceptequal") . fst) lcArgList + acceptEqualProb + | length acceptEqualList > 1 = + errorWithoutStackTrace ("Multiple 'drift' acceptEqual specifications in netEdge command--can have only one: " <> show inArgs) + | null acceptEqualList = Just 0.5 + | otherwise = readMaybe (snd $ head acceptEqualList) ∷ Maybe Double + + acceptWorseList = filter ((== "acceptworse") . fst) lcArgList + acceptWorseFactor + | length acceptWorseList > 1 = + errorWithoutStackTrace ("Multiple 'drift' acceptWorse specifications in netEdge command--can have only one: " <> show inArgs) + | null acceptWorseList = Just 20.0 + | otherwise = readMaybe (snd $ head acceptWorseList) ∷ Maybe Double + + maxChangesList = filter ((== "maxchanges") . fst) lcArgList + maxChanges + | length maxChangesList > 1 = + errorWithoutStackTrace ("Multiple 'drift' maxChanges number specifications in swap command--can have only one: " <> show inArgs) + | null maxChangesList = Just 15 + | otherwise = readMaybe (snd $ head maxChangesList) ∷ Maybe Int + + maxNetEdgesList = filter ((== "maxnetedges") . fst) lcArgList + maxNetEdges + | length maxNetEdgesList > 1 = + errorWithoutStackTrace ("Multiple 'maxNetEdges' number specifications in netEdge command--can have only one: " <> show inArgs) + | null maxNetEdgesList = Just 5 + | otherwise = readMaybe (snd $ head maxNetEdgesList) ∷ Maybe Int + + maxRoundsList = filter ((== "rounds") . fst) lcArgList + maxRounds + | length maxRoundsList > 1 = + errorWithoutStackTrace ("Multiple 'rounds' number specifications in netEdge command--can have only one: " <> show inArgs) + | null maxRoundsList = Just 1 + | otherwise = readMaybe (snd $ head maxRoundsList) ∷ Maybe Int + in -- check inputs + if isNothing keepNum + then errorWithoutStackTrace ("Keep specification not an integer in netEdge: " <> show (head keepList)) + else + if isNothing steps' + then errorWithoutStackTrace ("Annealing steps specification not an integer (e.g. steps:10): " <> show (snd $ head stepsList)) + else + if isNothing acceptEqualProb + then + errorWithoutStackTrace + ("Drift 'acceptEqual' specification not a float (e.g. acceptEqual:0.75): " <> show (snd $ head acceptEqualList)) + else + if isNothing acceptWorseFactor + then + errorWithoutStackTrace + ("Drift 'acceptWorse' specification not a float (e.g. acceptWorse:1.0): " <> show (snd $ head acceptWorseList)) + else + if isNothing maxChanges + then + errorWithoutStackTrace + ("Drift 'maxChanges' specification not an integer (e.g. maxChanges:10): " <> show (snd $ head maxChangesList)) + else + if isNothing maxNetEdges + then + errorWithoutStackTrace + ("Network edit 'maxNetEdges' specification not an integer (e.g. maxNetEdges:10): " <> show (snd $ head maxNetEdgesList)) + else + if isNothing maxRounds + then + errorWithoutStackTrace + ("Network edit 'rounds' specification not an integer (e.g. rounds:10): " <> show (snd $ head maxRoundsList)) + else + ( keepNum + , steps' + , annealingRounds' + , driftRounds' + , acceptEqualProb + , acceptWorseFactor + , maxChanges + , maxNetEdges + , lcArgList + , maxRounds + ) diff --git a/src/Search/Search.hs b/src/Search/Search.hs new file mode 100644 index 000000000..56c8567f7 --- /dev/null +++ b/src/Search/Search.hs @@ -0,0 +1,1129 @@ +{-# OPTIONS_GHC -fmax-pmcheck-models=50 #-} + +{- | +Module controlling timed randomized search functions +-} +module Search.Search ( + search, +) where + +import Commands.Transform qualified as TRANS +import Commands.Verify qualified as VER +import Control.Arrow ((&&&)) +import Control.Concurrent.Timeout (timeout) +import Control.DeepSeq +import Control.Monad (when) +import Control.Monad.IO.Unlift +import Control.Monad.Random.Class qualified as Sample +import Data.Bifunctor (bimap) +import Data.Char +import Data.Foldable +--import Data.Foldable1 qualified as F1 +import Data.Functor (($>), (<&>)) +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.List.Split qualified as LS +import Data.Maybe +import Data.Vector qualified as V +import GeneralUtilities +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import Search.Build qualified as B +import Search.Refinement qualified as R +import System.Timing +import Text.Read +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +-- Bandit lists are concateenated for each of use andd update--first 3 are graph evaluation +-- and remainder are sesrch bandits. +-- they are updated separately + +-- | treeBanditList is list of search types to be chosen from if graphType is tree +treeBanditList ∷ [String] +treeBanditList = + [ "buildCharacter" + , "swapSPR" + , "swapAlternate" + , "fuse" + , "fuseSPR" + , "fuseTBR" + , "driftSPR" + , "driftAlternate" + , "annealSPR" + , "annealAlternate" + , "geneticAlgorithm" + -- , "buildDistance" -- "buildSPR", "buildAlternate", distance only up front to reduce memory footprint + ] + + +-- | netWorkBanditList is list of search types unique to graphType network +netWorkBanditList ∷ [String] +netWorkBanditList = ["networkAdd", "networkDelete", "networkAddDelete", "driftNetwork", "annealNetwork", "networkMove"] + + +-- | fullBanditList is list of search types to be chosen from if Network +fullBanditList ∷ [String] +fullBanditList = treeBanditList <> netWorkBanditList + + +-- | graphBanditList list for more rapid, or more thorough methods +graphBanditList ∷ [String] +graphBanditList = fmap show [MultiTraverse, SingleTraverse, StaticApproximation] + + +-- | A strict, three-way version of 'uncurry'. +uncurry' ∷ (Functor f, NFData d) ⇒ (a → b → f d) → (a, b) → f d +uncurry' f (a, b) = force <$> f a b + + +{- +-- | A strict, three-way version of 'uncurry'. +uncurry3' ∷ (Functor f, NFData d) ⇒ (a → b → c → f d) → (a, b, c) → f d +uncurry3' f (a, b, c) = force <$> f a b c +-} + +-- | search timed randomized search returns graph list and comment list with info String for each search instance +search + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG ([ReducedPhylogeneticGraph], [[String]]) +search inArgs inGS inData inGraphList' = + -- flatThetaList is the initial prior list (flat) of search (bandit) choices + -- can also be used in search for non-Thomspon search + let flattenList :: Fractional b => [a] -> [(a, b)] + flattenList xs = + let count = length xs + limit = 1 / fromIntegral count + in zip xs $ L.replicate count limit + + flatGraphBanditList = flattenList graphBanditList + + flatThetaList = flattenList $ case graphType inGS of + Tree → treeBanditList + _ → fullBanditList + + totalFlatTheta = flatGraphBanditList <> flatThetaList + in do + (searchTime, keepNum, instances, thompsonSample, mFactor, mFunction, maxNetEdges, stopNum) ← getSearchParams inArgs + + let threshold = fromSeconds . fromIntegral $ (100 * searchTime) `div` 100 + let initialSeconds = fromSeconds . fromIntegral $ (0 ∷ Int) + let searchTimed ∷ (Int, ([ReducedPhylogeneticGraph], [String])) → Evaluation () ([ReducedPhylogeneticGraph], [String]) + searchTimed = + uncurry' $ + searchForDuration + inGS + inData + [[]] + keepNum + thompsonSample + mFactor + mFunction + totalFlatTheta + 1 + maxNetEdges + initialSeconds + threshold + 0 + stopNum + + logWith + LogInfo + ( "Randomized seach for " + <> (show searchTime) + <> " seconds with " + <> (show instances) + <> " instances keeping at most " + <> (show keepNum) + <> " graphs" + <> "\n" + ) + -- if initial graph list is empty make some + + inGraphList ← case length inGraphList' `compare` keepNum of + LT → do + dWagGraphList ← + B.buildGraph + [("distance", ""), ("replicates", show (1000)), ("rdwag", ""), ("best", show keepNum), ("return", show keepNum)] + inGS + inData + + fmap (take keepNum) . GO.selectGraphs Unique (maxBound ∷ Int) 0 $ dWagGraphList <> inGraphList' + _ → pure inGraphList' + + let threadCount = instances -- <- (max 1) <$> getNumCapabilities + let startGraphs = replicate threadCount (inGraphList, mempty) + let threadInits ∷ [(Int, ([ReducedPhylogeneticGraph], [String]))] + threadInits = zip [1 ..] startGraphs + -- If there are no input graphs--make some via distance + -- resultList ← pooledMapConcurrently searchTimed threadInits + resultList ← + getParallelChunkTraverse >>= \pTraverse → + searchTimed `pTraverse` threadInits + let (newGraphList, commentList) = unzip resultList + let newCostList = L.group $ L.sort $ fmap getMinGraphListCost newGraphList + + let iterationHitString = + ( "Hit minimum cost " + <> (show $ minimum $ fmap snd5 $ concat newGraphList) + <> " in " + <> (show $ length $ head newCostList) + <> " of " + <> (show $ length newGraphList) + <> " iterations" + <> "\n" + ) + let completeGraphList = inGraphList <> fold newGraphList + filteredGraphList ← GO.selectGraphs Unique (maxBound ∷ Int) 0 completeGraphList + let selectedGraphList = take keepNum filteredGraphList + + logWith LogInfo iterationHitString + + pure (selectedGraphList, commentList <> [[iterationHitString]]) + where + getMinGraphListCost ∷ (Foldable t, Functor t) ⇒ t (a, Double, c, d, e) → Double + getMinGraphListCost a + | not $ null a = minimum $ fmap snd5 a + | otherwise = infinity + + +-- unneeded +-- instance NFData (IO (Maybe ([ReducedPhylogeneticGraph], [String]))) where rnf x = seq x () + +{- $ toMicroseconds allotedSeconds +this CPUtime is total over all threads--not wall clock +so changed to crappier getCurrentTime in System.Timing to +get wall clock-like ellapsed time +addied time out to terminate when exceeeded time remaining. +never teminates due to time +-} + + +searchForDuration + ∷ ∀ r + . (Fractional r, Real r) + ⇒ GlobalSettings + → ProcessedData + → [[VertexCost]] + → Int + → Bool + → Int + → String + → [(String, r)] + → Int + → Int + → CPUTime + → CPUTime + → Int + → Int + → Int + → ([ReducedPhylogeneticGraph], [String]) + → PhyG ([ReducedPhylogeneticGraph], [String]) +searchForDuration inGS inData pairwiseDistances keepNum thompsonSample mFactor mFunction totalThetaList counter maxNetEdges inTotalSeconds allotedSeconds stopCount stopNum refIndex (inGraphList, infoStringList) = + let timeLimit = fromIntegral $ toMicroseconds allotedSeconds + + logWarning ∷ b → [String] → PhyG b + logWarning val tokens = logWith LogWarn ((unwords $ "Thread" : show refIndex : tokens) <> "\n") $> val + + runForDuration ∷ PhyG a → PhyG (Maybe a) + runForDuration = liftIOOp (timeout timeLimit) + in do + -- this line to keep control of graph number + inGraphList' ← take keepNum <$> GO.selectGraphs Unique (maxBound ∷ Int) 0.0 inGraphList + + -- searchingInnerOp ∷ PhyG ([ReducedPhylogeneticGraph], [String]) + let searchingInnerOp = + force $ + performSearch + inGS + inData + pairwiseDistances + keepNum + totalThetaList + maxNetEdges + inTotalSeconds + (inGraphList', infoStringList) + + -- searchingForDuration ∷ PhyG ([ReducedPhylogeneticGraph], [String]) + let searchingForDuration = do + -- result = force $ performSearch inGS inData pairwiseDistances keepNum thompsonSample thetaList maxNetEdges (head seedList) inTotalSeconds (inGraphList', infoStringList) + result ← runForDuration searchingInnerOp + case result of + Nothing → logWarning (inGraphList, []) ["terminated due to time"] + Just gs → logWarning gs ["is OK", show allotedSeconds, "->", show . fromIntegral $ toMicroseconds allotedSeconds] + + -- (elapsedSeconds, output) <- timeOpUT $ + (elapsedSeconds, elapsedSecondsCPU, output) ← timeOpCPUWall searchingForDuration + + -- update theta list based on performance + let outTotalSeconds = timeSum inTotalSeconds elapsedSecondsCPU + let bestCost = + if (not $ null $ fst output) + then minimum $ fmap snd5 $ fst output + else infinity + let finalTimeString = ",Final Values,," <> (show bestCost) <> "," <> (show $ toSeconds outTotalSeconds) + case snd output of + [] → pure output + x : xs → do + -- passing time as CPU time not wall clock so parallel timings change to elapsedSeconds for wall clock + (updatedThetaList, newStopCount) ← + updateTheta + SearchBandit + thompsonSample + mFactor + mFunction + counter + (x :| xs) + (drop 3 totalThetaList) + elapsedSecondsCPU + outTotalSeconds + stopCount + stopNum + (updatedGraphTheta, _) ← + updateTheta + GraphBandit + thompsonSample + mFactor + mFunction + counter + (x :| xs) + (take 3 totalThetaList) + elapsedSecondsCPU + outTotalSeconds + stopCount + stopNum + -- add lists together properly so first three are the graph bandits + let combinedThetaList = updatedGraphTheta <> updatedThetaList + let thetaString = L.intercalate "," . fmap (showRealValue . snd) $ case snd output of + [] → totalThetaList + _ → combinedThetaList + + let remainingTime = allotedSeconds `timeLeft` elapsedSeconds + logWith LogMore $ + unlines + [ "Thread \t" <> show refIndex + , "Alloted \t" <> show allotedSeconds + , "Ellapsed \t" <> show elapsedSeconds + , "Remaining\t" <> show remainingTime + , "\n" + ] + + if (toPicoseconds remainingTime) == 0 || newStopCount >= stopNum || (null $ snd output) + then pure (fst output, infoStringList <> (snd output) <> [finalTimeString <> "," <> thetaString <> "," <> "*"]) -- output with strings correctly added together + else + searchForDuration + inGS + inData + pairwiseDistances + keepNum + thompsonSample + mFactor + mFunction + combinedThetaList + (counter + 1) + maxNetEdges + outTotalSeconds + remainingTime + newStopCount + stopNum + refIndex + $ bimap (inGraphList <>) (infoStringList <>) output + + +-- | updateTheta updates the expected success parameters for the bandit search list +updateTheta + ∷ ∀ r + . (Fractional r, Real r) + ⇒ BanditType + → Bool + → Int + → String + → Int + → NonEmpty String + → [(String, r)] + → CPUTime + → CPUTime + → Int + → Int + → PhyG ([(String, r)], Int) +updateTheta thisBandit thompsonSample mFactor mFunction counter (infoString :| infoStringList) inPairList elapsedSeconds totalSeconds stopCount stopNum = case inPairList of + [] → pure ([], stopCount) + _ → + let searchBandit = case thisBandit of + SearchBandit → takeWhile (/= ',') $ tail infoString + -- GraphBandit + _ → case LS.splitOn "," infoString of + toks | "StaticApprox" `elem` toks → "StaticApproximation" + toks | "MultiTraverse:False" `elem` toks → "SingleTraverse" + _ → "MultiTraverse" + + searchDeltaString = takeWhile (/= ',') $ tail $ dropWhile (/= ',') (tail infoString) + searchDelta ∷ r + searchDelta = fromRational . toRational $ (read searchDeltaString ∷ Double) + in if not thompsonSample + then + let newStopCount + | searchDelta <= 0.0 = stopCount + 1 + | otherwise = 0 + + stopString + | newStopCount >= stopNum = + "\n\tSearch iterations have not improved for " <> (show newStopCount) <> " iterations--terminating this search command" <> "\n" + | otherwise = "" + in do + when (thisBandit == SearchBandit) $ logWith LogInfo stopString + pure (inPairList, newStopCount) + else -- update via results, previous history, memory \factor and type of memory "loss" + + let -- get timing and benefit accounting for 0's + -- average time ratio in time factor for benefit adjustment + totalTime = (fromIntegral $ toSeconds totalSeconds) / (fromIntegral counter) ∷ r + + timeFactor ∷ r + timeFactor + | counter == 1 = 1.0 + | toSeconds elapsedSeconds == 0 = 0.1 + | otherwise = (fromIntegral $ toSeconds elapsedSeconds) / totalTime + + durationTime = + if toSeconds elapsedSeconds <= 1 + then 1 + else toSeconds elapsedSeconds + + -- get bandit index and orignial theta value from pair list + indexBandit' = L.elemIndex searchBandit $ fmap fst inPairList + indexBandit = fromJust indexBandit' + + inThetaBandit = snd $ inPairList !! indexBandit + + newStopCount = + if searchDelta <= 0.0 + then stopCount + 1 + else 0 + + stopString = + if newStopCount >= stopNum + then + ( "\n\tSearch iterations have not improved for " <> (show newStopCount) <> " iterations--terminating this search command" <> "\n" + ) + else "" + in -- check error + if isNothing indexBandit' + then + error + ("Bandit index not found: " <> searchBandit <> " in " <> showRealValuedPairs inPairList <> (show $ tail $ head infoStringList)) + else -- "simple" for testing, sucess=1, no time factor, full average overall interations + + if mFunction == "simple" + then -- first Simple update based on 0 = no better. 1 == any better irrespective of magnitude or time, full memory + -- all thetas (second field) sum to one. + -- if didn't do anything but increment everyone else with 1/(num bandits - 1), then renormalize + -- dowenweights unsiucessful bandit + -- need to add more downweight if search long + + let previousSuccessList = fmap (* (fromIntegral counter)) $ fmap snd inPairList + + benefit = + if searchDelta <= 0.0 + then 0.0 + else searchDelta / (fromIntegral durationTime) + + -- see if bandit was sucessful and if so set increment + incrementBandit = if benefit == 0.0 then 0.0 else 1.0 + newBanditVal ∷ r + newBanditVal = incrementBandit + ((fromIntegral counter) * inThetaBandit) + + -- for nonBandit if not successful increment them (basically to downweight bandit), other wise not + incrementNonBanditVals = + if benefit == 0.0 + then 1.0 / ((fromIntegral $ length inPairList) - 1.0) + else 0.0 + updatedSuccessList = fmap (+ incrementNonBanditVals) previousSuccessList + + -- uopdate the bandit from list by splitting and rejoining + firstBanditPart = take indexBandit updatedSuccessList + thirdBanditPart = drop (indexBandit + 1) updatedSuccessList + newSuccessList = firstBanditPart <> (newBanditVal : thirdBanditPart) + totalTheta = sum newSuccessList + + newThetaList = (/ totalTheta) <$> newSuccessList + in do + logWith LogInfo stopString + pure (zip (fmap fst inPairList) newThetaList, newStopCount) + else -- more complex 'recency' options + + if mFunction `elem` ["linear", "exponential"] + then + let -- weight factors for previous theta (wN-1)* \theta + -- maxed ot counter so averaging not wierd for early iterations + mFactor' ∷ r + mFactor' = min (fromIntegral counter) $ fromIntegral mFactor + (wN_1, wN) = case mFunction of + "linear" → (mFactor' / (mFactor' + 1), 1.0 / (mFactor' + 1)) + "exponential" → + let f ∷ r → Double + f = fromRational . toRational + g ∷ Double → r + g = fromRational . toRational + e ∷ Double + e = f mFactor' + v ∷ r + v = 1.0 / g (2.0 ** e) + in (1.0 - v, v) + _ → + error $ + unwords + [ "Thompson search option" + , mFunction + , "not recognized:" + , show ["simple", "linear", "exponential"] + ] + + -- simple "success-based" benefit, scaled to average time of search iteration + searchBenefit ∷ r + searchBenefit + | searchDelta <= 0.0 = 0.0 + | otherwise = searchDelta / timeFactor + + previousSuccessList = fmap (* (fromIntegral counter)) $ fmap snd inPairList + + -- average of new am]nd previous if m=1, no memory if m=0, longer memory with larger m + -- m should be limited to counter + -- linear ((m * previous value) + new value) / m+1 + -- exponential ((2^(-m) * previous value) + new value) / (2^(-m) + 1) + newBanditVal ∷ r + newBanditVal + | searchBenefit > inThetaBandit = (wN * searchBenefit) + (wN_1 * inThetaBandit) + | otherwise = (wN_1 * inThetaBandit) + (wN * (inThetaBandit + searchBenefit)) + + -- for nonBandit if not successful increment them (basically to downweight bandit), other wise not + incrementNonBanditVals + | searchDelta <= 0.0 = 1.0 / ((fromIntegral $ length inPairList) - 1.0) + | otherwise = 0.0 + + updatedSuccessList = fmap (+ incrementNonBanditVals) previousSuccessList + + -- uopdate the bandit from list by splitting and rejoining, then normalizing to 1.0 + firstBanditPart ∷ [r] + firstBanditPart = take indexBandit updatedSuccessList + thirdBanditPart ∷ [r] + thirdBanditPart = drop (indexBandit + 1) updatedSuccessList + newSuccessList ∷ [r] + newSuccessList = firstBanditPart <> (newBanditVal : thirdBanditPart) + totalTheta = sum newSuccessList + + newThetaList = (/ totalTheta) <$> newSuccessList + in do + -- trace ("Update : \n" <> (show $ fmap snd inPairList) <> "\n" <> (show previousSuccessList) <> "\n" <> (show updatedSuccessList) <> "\n" <> (show newThetaList) <> "\n") $ + -- trace ("Not simple: " <> mFunction <> " search benefit " <> (show searchBenefit) <> " " <> searchBandit <> " index " <> (show indexBandit) <> " total time: " <> (show $ toSeconds totalSeconds) <> " elapsed time: " <> (show $ toSeconds elapsedSeconds) <> " -> " <> (show (searchDelta, toSeconds elapsedSeconds)) <> "\n" <> (show $ fmap snd inPairList) <> "\n" <> (show newThetaList) <> "\n" <> (head infoStringList)) ( + logWith LogInfo stopString + pure (zip (fmap fst inPairList) newThetaList, newStopCount) + else -- ) + + errorWithoutStackTrace + ("Thompson search option " <> mFunction <> " not recognized " <> (show ["simple", "linear", "exponential"])) + + +-- | This exponentiation functionn from http://www.haskell.org/haskellwiki/Generic_number_type#squareRoot +(^!) ∷ (Num a) ⇒ a → Int → a +(^!) x n = x ^ n + + +-- | squareRoot integer square root from http://www.haskell.org/haskellwiki/Generic_number_type#squareRoot +squareRoot ∷ Integer → Integer +squareRoot 0 = 0 +squareRoot 1 = 1 +squareRoot n = + let twopows = iterate (^! 2) 2 + (lowerRoot, lowerN) = + last $ takeWhile ((n >=) . snd) $ zip (1 : twopows) twopows + newtonStep x = div (x + div n x) 2 + iters = iterate newtonStep (squareRoot (div n lowerN) * lowerRoot) + isRoot r = r ^! 2 <= n && n < (r + 1) ^! 2 + in head $ dropWhile (not . isRoot) iters + + +{- | performSearch takes input graphs and performs randomized build and search with time limit +Thompson sampling and mFactor to pick strategy from updated theta success values +the random calls return the tail of the input list to avoid long list access--can do vector since infinite +if no input graphs then do a unitary distance build to get a quick start +-} +performSearch + ∷ ∀ r + . (Real r) + ⇒ GlobalSettings + → ProcessedData + → [[VertexCost]] + → Int + → [(String, r)] + → Int + → CPUTime + → ([ReducedPhylogeneticGraph], [String]) + → PhyG ([ReducedPhylogeneticGraph], [String]) +performSearch inGS' inData' _pairwiseDistances keepNum totalThetaList maxNetEdges inTime (inGraphList', _) = + -- set up basic parameters for search/refine methods + let thetaList = drop 3 totalThetaList + numLeaves = V.length $ fst3 inData' + -- set up log for sample + thompsonString = "," <> showRealValuedPairs totalThetaList + + getRandomTheta = sampleRandomChoices thetaList + + -- choose search type from list with frequencies as input from searchForDuration + -- adjust if all trees and networks chose to ensure some net stuff tried + getSearchBandit + | null inGraphList' = pure "buildDistance" + | otherwise = do + rTheta ← getRandomTheta + case graphType inGS' of + Tree → pure rTheta + _ → + let thoseValues = + ["networkMove", "networkDelete", "driftNetwork", "annealNetwork"] + in case filter (== False) $ LG.isTree . fst5 <$> inGraphList' of + _ : _ + | rTheta `notElem` thoseValues → + sampleRandomChoices + [("networkAdd", 0.5), ("networkAddDelete", 0.5)] + _ → pure rTheta + + numToCharBuild = fromInteger $ squareRoot $ toInteger numLeaves + numToDistBuild = min 1000 (numLeaves * numLeaves) + numDistToKeep = keepNum + + showArg a = fst a <> ":" <> snd a + + -- common swap arguments + getSwapKeep = min keepNum <$> sampleRandomChoices [(1, 0.50), (2, 0.33), (4, 0.17)] + + -- common drift arguments + getDriftArgs = do + maxChanges ← sampleRandomChoices [("5", 0.33), ("10", 0.34), ("20", 0.33)] + acceptEqual ← sampleRandomChoices [("0.1", 0.5), ("0.5", 0.5)] + acceptWorse ← sampleRandomChoices [("10.0", 0.33), ("20.0", 0.34), ("40", 0.33)] + pure + [ ("drift", "") + , ("maxChanges", maxChanges) + , ("acceptEqual", acceptEqual) + , ("acceptWorse", acceptWorse) + ] + + -- common annealing arguments + getAnnealArgs = do + tempSteps ← sampleRandomChoices [("5", 0.33), ("10", 0.34), ("20", 0.33)] + pure [("annealing", ""), ("steps", tempSteps)] + + -- common fuse options + getFusePairs = sampleRandomChoices [("5", 0.45), ("10", 0.45), ("20", 0.1)] + fuseKeep = 2 * keepNum + + -- network edit options + netGeneralArgs ∷ [(String, String)] + netGeneralArgs = [("keep", show keepNum), ("steepest", ""), ("atRandom", ""), ("maxnetedges", show maxNetEdges)] + netMoveArgs = ("netMove", "") : netGeneralArgs + netAddArgs = ("netAdd", "") : netGeneralArgs + netDelArgs = ("netDel", "") : netGeneralArgs + netAddDelArgs = ("netAddDel", "") : netGeneralArgs + getNetDriftAnnealMethod ∷ PhyG [(String, String)] + getNetDriftAnnealMethod = pure . (id &&& const "") <$> sampleRandomChoices [("netAdd", 0.5), ("netDel", 0.5)] -- ,("netMove", 0.25), ("netAddDel", 0.25),] + + -- Genetic Algorithm Arguments + -- stops after 2 rounds with no improvement (if generations > 2) + getGeneticAlgArgs = do + popSize ← sampleRandomChoices [("10", 0.50), ("20", 0.25), ("40", 0.25)] + generations ← sampleRandomChoices [("1", 1.0)] -- , "2" , "4"] + severity ← sampleRandomChoices [("0.0", 0.33), ("1.0", 0.34), ("2.0", 0.33)] + recombinations ← sampleRandomChoices [("10", 0.45), ("20", 0.45), ("40", 0.1)] + pure + [ ("popsize", popSize) + , ("generations", generations) + , ("severity", severity) + , ("recombinations", recombinations) + , ("stop", "2") + , ("maxnetedges", show maxNetEdges) + ] + in do + searchBandit ← getSearchBandit + -- unless fuse or genetic algorithm, only operate on "best" input graphs + -- this to reduce memory footrpint when have multiple iterations + inGraphList'' ← + if searchBandit `elem` ["fuse", "fuseSPR", "fuseTBR", "geneticAlgorithm"] + then pure inGraphList' + else GO.selectGraphs Best keepNum 0 inGraphList' + + -- Can't do both static approx and multitraverse:False + let transformBy xs = TRANS.transform xs inGS' inData' inData' inGraphList'' + newDataMTF ← transformBy [("multitraverse", "false")] + newDataSA ← transformBy [("staticapprox", [])] + + -- set graph valuation bandit + graphEvaluationBandit ← sampleRandomChoices $ take 3 totalThetaList + -- apply graph evaluation bandit + let transformToStaticApproximation = + U.getNumberNonExactCharacters (thd3 inData') /= 0 + && graphEvaluationBandit == "StaticApproximation" + + let transformMultiTraverse = graphEvaluationBandit == "SingleTraverse" + + let ((inGS, origData, inData, inGraphList), transformString) + | transformToStaticApproximation && (useIA inGS') = (newDataSA, ",StaticApprox") + | transformMultiTraverse = (newDataMTF, ",MultiTraverse:False") + | otherwise = ((inGS', inData', inData', inGraphList''), "") + + buildType ← case searchBandit of + "buildCharacter" → pure "character" + "buildDistance" → pure "distance" + _ → sampleRandomChoices [("distance", 0.5), ("character", 0.5)] + + -- common build arguments including block and distance + -- Tree does not use block--doesn't work very well for tree building + buildMethod ← case inGraphList' of + [] → pure "unitary" + _ : _ → case graphType inGS' of + Tree → pure "unitary" + _ → sampleRandomChoices [("unitary", 0.8), ("block", 0.2)] + + let wagnerOptions = case (buildType, buildMethod) of + ("distance", "block") → + [ ("replicates", show numToCharBuild) + , ("rdwag", "") + , ("best", show (1 ∷ Int)) + ] + ("distance", _) → + [ ("replicates", show numToDistBuild) + , ("rdwag", "") + , ("best", show numDistToKeep) + , ("return", show numToCharBuild) + ] + ("character", "block") → + [("replicates", show (1 ∷ Int))] + ("character", _) → [] + + blockOptions ← case buildMethod of + -- to resolve block build graphs + "block" → + sampleRandomChoices [("eun", 0.5), ("cun", 0.5)] >>= \reconciliationMethod → + pure + [ ("block", "") + , ("atRandom", "") + , ("displaytrees", show numToCharBuild) + , (reconciliationMethod, "") + ] + _ → pure [] + + let builder bArgs = B.buildGraph bArgs inGS' inData' + let attach :: b -> a -> (a, b) + attach = flip (,) + let selectUniqueGraphs = GO.selectGraphs Unique (maxBound ∷ Int) 0.0 + + -- bandit list with search arguments set + -- primes (') for build to start with untransformed data + (searchGraphs, searchArgs) ← case searchBandit of + "buildCharacter" → + let buildArgs = [(buildType, "")] <> wagnerOptions <> blockOptions + in attach buildArgs <$> builder buildArgs + "buildDistance" → + -- search for dist builds 1000, keeps 10 best distance then selects 10 best after rediagnosis + -- this line in here to allow for returning lots of rediagnosed distance trees, then + -- reducing to unique best cost trees--but is a memory pig + let buildArgs = [(buildType, "")] <> wagnerOptions <> blockOptions + in attach buildArgs <$> builder buildArgs + "buildSPR" → + let -- build part + buildArgs = [(buildType, "")] <> wagnerOptions <> blockOptions + -- swap options + swapType = "spr" + in -- search + do + buildGraphs ← builder buildArgs + buildGraphs' ← selectUniqueGraphs buildGraphs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + swapList ← R.swapMaster swapArgs inGS inData buildGraphs' + pure (swapList, buildArgs <> swapArgs) + "buildAlternate" → + let -- build part + buildArgs = [(buildType, "")] <> wagnerOptions <> blockOptions + -- swap options + swapType = "alternate" -- default anyway + in -- search + do + buildGraphs ← builder buildArgs + buildGraphs' ← selectUniqueGraphs buildGraphs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + swapList ← R.swapMaster swapArgs inGS inData buildGraphs' + pure (swapList, buildArgs <> swapArgs) + "swapSPR" → + let -- swap options + swapType = "spr" + in -- search + do + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + swapList ← R.swapMaster swapArgs inGS inData inGraphList + pure (swapList, swapArgs) + "swapAlternate" → + let -- swap options + swapType = "alternate" -- default anyway + in -- search + do + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + swapList ← R.swapMaster swapArgs inGS inData inGraphList + pure (swapList, swapArgs) + -- drift only best graphs + "driftSPR" → + let -- swap args + swapType = "spr" + in -- perform search + do + driftArgs ← getDriftArgs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + + -- swap with drift (common) arguments + let swapDriftArgs = swapArgs <> driftArgs + swapList ← R.swapMaster swapDriftArgs inGS inData inGraphList + pure (swapList, swapArgs) + -- drift only best graphs + "driftAlternate" → + let -- swap args + swapType = "alternate" + in -- perform search + do + driftArgs ← getDriftArgs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + -- swap with drift (common) arguments + let swapDriftArgs = swapArgs <> driftArgs + swapList ← R.swapMaster swapDriftArgs inGS inData inGraphList + pure (swapList, swapDriftArgs) + -- anneal only best graphs + "annealSPR" → + let -- swap args + swapType = "spr" + in -- perform search + do + annealArgs ← getAnnealArgs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + -- swap with anneal (common) arguments + let swapAnnealArgs = swapArgs <> annealArgs + swapList ← R.swapMaster swapAnnealArgs inGS inData inGraphList + pure (swapList, swapAnnealArgs) + -- anneal only best graphs + "annealAlternate" → + let -- swap args + swapType = "alternate" + in -- perform search + do + annealArgs ← getAnnealArgs + swapKeep ← getSwapKeep + let swapArgs = [(swapType, ""), ("steepest", ""), ("keep", show swapKeep), ("atrandom", "")] + -- swap with anneal (common) arguments + let swapAnnealArgs = swapArgs <> annealArgs + swapList ← R.swapMaster swapAnnealArgs inGS inData inGraphList + pure (swapList, swapAnnealArgs) + "geneticAlgorithm" → + do + -- args from above + -- perform search + gaArgs ← getGeneticAlgArgs + gaReturn ← R.geneticAlgorithmMaster gaArgs inGS inData inGraphList + pure (gaReturn, gaArgs) + "fuse" → do + -- should more graphs be added if only one? Would downweight fuse perhpas too much + fusePairs ← getFusePairs + -- fuse arguments + -- this to limit memory footprint of fuse during search + -- gsNum = min (graphsSteepest inGS) 5 + -- inGSgs1 = inGS{graphsSteepest = gsNum} + let fuseArgs = + [ ("none", "") + , ("all", "") + , ("unique", "") + , ("atrandom", "") + , ("pairs", fusePairs) + , ("keep", show fuseKeep) + , ("noreciprocal", "") + ] + -- perform search + R.fuseGraphs fuseArgs inGS inData inGraphList <&> (\x → (x, fuseArgs)) + "fuseSPR" → do + -- fuse arguments + -- inGSgs1 = inGS{graphsSteepest = 1} + fusePairs ← getFusePairs + let fuseArgs = + [ ("spr", "") + , ("all", "") + , ("unique", "") + , ("atrandom", "") + , ("pairs", fusePairs) + , ("keep", show fuseKeep) + , ("noreciprocal", "") + ] + -- perform search + R.fuseGraphs fuseArgs inGS inData inGraphList <&> (\x → (x, fuseArgs)) + "fuseTBR" → do + -- fuse arguments + -- inGSgs1 = inGS{graphsSteepest = 1} + fusePairs ← getFusePairs + let fuseArgs = + [ ("tbr", "") + , ("all", "") + , ("unique", "") + , ("atrandom", "") + , ("pairs", fusePairs) + , ("keep", show fuseKeep) + , ("noreciprocal", "") + ] + -- perform search + R.fuseGraphs fuseArgs inGS inData inGraphList <&> (\x → (x, fuseArgs)) + "networkAdd" → + let -- network add args + netEditArgs = netAddArgs + in -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + "networkDelete" → + let -- network delete args + netEditArgs = netDelArgs + in -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + "networkAddDelete" → + let -- network add/delete args + netEditArgs = netAddDelArgs + in -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + "networkMove" → + let -- network move args + netEditArgs = netMoveArgs + in -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + "driftNetwork" → do + driftArgs ← getDriftArgs + netDriftAnnealArgs ← getNetDriftAnnealMethod + -- network add/delete + drift args + let netEditArgs = fold [netDriftAnnealArgs, netGeneralArgs, driftArgs] + -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + "annealNetwork" → do + annealArgs ← getAnnealArgs + netDriftAnnealArgs ← getNetDriftAnnealMethod + -- network add/delete + annealing args + let netEditArgs = fold [netDriftAnnealArgs, netGeneralArgs, annealArgs] + -- perform search + R.netEdgeMaster netEditArgs inGS inData inGraphList <&> (\x → (x, netEditArgs)) + _ → error ("Unknown/unimplemented method in search: " <> searchBandit) + + -- process + uniqueGraphs' ← fmap (take keepNum) . selectUniqueGraphs $ searchGraphs <> inGraphList + let transformBy' xs = TRANS.transform xs inGS origData inData uniqueGraphs' + newDataMT ← transformBy' [("multiTraverse", "true")] + newDataT ← transformBy' [("dynamic", [])] + let (uniqueGraphs, transString) + | (not transformToStaticApproximation && not transformMultiTraverse) = (uniqueGraphs', "") + | transformToStaticApproximation = (fth4 newDataT, ",Dynamic") + | otherwise = (fth4 newDataMT, ",MultiTraverse:True") + + -- string of delta and cost of graphs + let extract :: (Foldable f, Functor f, Ord b) => f (a, b, c, d, e) -> b + extract = minimum . fmap snd5 + + let deltaString + | null inGraphList' = "10.0," + | otherwise = show $ extract inGraphList' - extract uniqueGraphs + + let currentBestString + | not $ null uniqueGraphs = show $ extract uniqueGraphs + | otherwise = show infinity + + -- create string for search stats + let searchString = + "," + <> searchBandit + <> "," + <> deltaString + <> "," + <> currentBestString + <> "," + <> (show $ toSeconds inTime) + <> "," + <> (L.intercalate "," $ fmap showArg searchArgs) + <> transformString + <> transString + pure (uniqueGraphs, [searchString <> thompsonString]) + + +{- | +'getSearchParams' takes arguments and returns search params. +-} +getSearchParams ∷ [Argument] → PhyG (Int, Int, Int, Bool, Int, String, Int, Int) +getSearchParams inArgs = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "search" fstArgList VER.searchArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'search': " <> show inArgs) + else + let instancesList = filter ((== "instances") . fst) lcArgList + instances + | length instancesList > 1 = + errorWithoutStackTrace ("Multiple 'keep' number specifications in search command--can have only one: " <> show inArgs) + | null instancesList = Just 1 + | otherwise = readMaybe (snd $ head instancesList) ∷ Maybe Int + + keepList = filter ((== "keep") . fst) lcArgList + keepNum + | length keepList > 1 = + errorWithoutStackTrace ("Multiple 'keep' number specifications in search command--can have only one: " <> show inArgs) + | null keepList = Just 10 + | otherwise = readMaybe (snd $ head keepList) ∷ Maybe Int + + daysList = filter ((== "days") . fst) lcArgList + days + | length daysList > 1 = + errorWithoutStackTrace ("Multiple 'days' number specifications in search command--can have only one: " <> show inArgs) + | null daysList = Just 0 + | otherwise = readMaybe (snd $ head daysList) ∷ Maybe Int + + hoursList = filter ((== "hours") . fst) lcArgList + hours + | length hoursList > 1 = + errorWithoutStackTrace ("Multiple 'hours' number specifications in search command--can have only one: " <> show inArgs) + | null hoursList = Just 0 + | otherwise = readMaybe (snd $ head hoursList) ∷ Maybe Int + + minutesList = filter ((== "minutes") . fst) lcArgList + minutes + | length minutesList > 1 = + errorWithoutStackTrace ("Multiple 'minutes' number specifications in search command--can have only one: " <> show inArgs) + | null minutesList = Just 1 + | otherwise = readMaybe (snd $ head minutesList) ∷ Maybe Int + + secondsList = filter ((== "seconds") . fst) lcArgList + seconds + | length secondsList > 1 = + errorWithoutStackTrace ("Multiple 'seconds' number specifications in search command--can have only one: " <> show inArgs) + | null secondsList = Just 0 + | otherwise = readMaybe (snd $ head secondsList) ∷ Maybe Int + + maxNetEdgesList = filter ((== "maxnetedges") . fst) lcArgList + maxNetEdges + | length maxNetEdgesList > 1 = + errorWithoutStackTrace ("Multiple 'maxNetEdges' number specifications in netEdge command--can have only one: " <> show inArgs) + | null maxNetEdgesList = Just 10 + | otherwise = readMaybe (snd $ head maxNetEdgesList) ∷ Maybe Int + + thompsonList = filter ((== "thompson") . fst) lcArgList + mFactor + | length thompsonList > 1 = + errorWithoutStackTrace ("Multiple 'Thompson' number specifications in search command--can have only one: " <> show inArgs) + | null thompsonList = Just 1 + | otherwise = readMaybe (snd $ head thompsonList) ∷ Maybe Int + + stopList = filter ((== "stop") . fst) lcArgList + stopNum + | length stopList > 1 = + errorWithoutStackTrace ("Multiple 'stop' number specifications in search command--can have only one: " <> show inArgs) + | null stopList = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head stopList) ∷ Maybe Int + + thompson = any ((== "thompson") . fst) lcArgList + mLinear = any ((== "linear") . fst) lcArgList + mExponential = any ((== "exponential") . fst) lcArgList + mSimple = any ((== "simple") . fst) lcArgList + + mFunction = + if mLinear && mExponential + then "linear" + else + if mLinear + then "linear" + else + if mExponential + then "exponential" + else + if mSimple + then "simple" + else "linear" + in if isNothing keepNum + then errorWithoutStackTrace ("Keep specification not an integer in search: " <> show (head keepList)) + else + if isNothing instances + then errorWithoutStackTrace ("Instances specification not an integer in search: " <> show (head instancesList)) + else + if isNothing days + then errorWithoutStackTrace ("Days specification not an integer in search: " <> show (head daysList)) + else + if isNothing hours + then errorWithoutStackTrace ("Hours specification not an integer in search: " <> show (head hoursList)) + else + if isNothing minutes + then errorWithoutStackTrace ("Minutes specification not an integer in search: " <> show (head minutesList)) + else + if isNothing seconds + then errorWithoutStackTrace ("Seconds factor specification not an integer in search: " <> show (head secondsList)) + else + if isNothing mFactor + then + errorWithoutStackTrace + ("Thompson mFactor specification not an integer or not found in search (e.g. Thompson:1) " <> show (head thompsonList)) + else + if isNothing maxNetEdges + then + errorWithoutStackTrace + ("Search 'maxNetEdges' specification not an integer or not found (e.g. maxNetEdges:8): " <> show (snd $ head maxNetEdgesList)) + else + if isNothing stopNum + then + errorWithoutStackTrace + ("Search stop specification not an integer or not found in search (e.g. stop:10) " <> show (head stopList)) + else + let seconds' = + if ((fromJust minutes > 0) || (fromJust hours > 0) || (fromJust days > 0)) && (null secondsList) + then Just 0 + else seconds + searchTime = (fromJust seconds') + (60 * (fromJust minutes)) + (3600 * (fromJust hours)) + in do + when (mLinear && mExponential) $ + logWith LogWarn ("Thompson recency function specification has both 'linear' and 'exponential', defaulting to 'linear'\n") + pure $ + ( searchTime + , fromJust keepNum + , fromJust instances + , thompson + , fromJust mFactor + , mFunction + , fromJust maxNetEdges + , fromJust stopNum + ) + + +showRealValue ∷ ∀ r. (Real r) ⇒ r → String +showRealValue = + let convert ∷ r → Double + convert = fromRational . toRational + in show . convert + + +showRealValuedPairs ∷ ∀ r. (Real r) ⇒ [(String, r)] → String +showRealValuedPairs = + let asTuple ∷ (String, r) → String + asTuple (x, y) = fold ["( ", show x, ", ", showRealValue y, " )"] + + enclose ∷ String → String + enclose = ("[ " <>) . (<> " ]") + in enclose . L.intercalate ", " . fmap asTuple + + +sampleRandomChoices ∷ ∀ a r. (Real r) ⇒ [(a, r)] → PhyG a +sampleRandomChoices = Sample.fromList . fmap (fmap toRational) diff --git a/src/Search/Swap.hs b/src/Search/Swap.hs new file mode 100644 index 000000000..ec813ef61 --- /dev/null +++ b/src/Search/Swap.hs @@ -0,0 +1,2267 @@ +{- | +Module specifying graph swapping rearrangement functions +-} +module Search.Swap ( + getUnionRejoinEdgeList, + rejoinGraphTuple, + reoptimizeSplitGraphFromVertexTuple, + swapDriver, +) where + +import Control.Monad (filterM) +import Control.Monad.IO.Class +import Control.Monad.Random.Class +import Data.Foldable (fold, toList) +import Data.Foldable1 (Foldable1) +import Data.Foldable1 qualified as F1 +import Data.Functor (($>), (<&>)) +import Data.List qualified as L +import Data.Maybe +import Data.Ord (comparing) +import Data.Vector qualified as V +import GHC.Real qualified as Real (infinity) +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import GraphOptimization.PostOrderSoftWiredFunctions qualified as POSW +import GraphOptimization.PreOrderFunctions qualified as PRE +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities as U + + +{- | SwapDriver + Uses compnent functions but with alternate high-level logic + Generates new simmanneal params for recursive rounds +-} +swapDriver + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → [ReducedPhylogeneticGraph] + → [(Maybe SAParams, ReducedPhylogeneticGraph)] + → PhyG ([ReducedPhylogeneticGraph], Int) +swapDriver swapParams inGS inData inCounter curBestGraphList inSimAnnealParams = + if null curBestGraphList + then pure ([], inCounter) + else do + let inBestCost = minimum $ fmap snd5 curBestGraphList + -- liftIO $ putStr ("SD: " <> (show $ swapType swapParams)) -- <> " " <> (show $ fmap snd5 curBestGraphList)) -- <> "\n" <> (LG.prettyIndices $ head $ fmap fst5 curBestGraphList)) + + (newGraphList', newCounter, newSAParams) ← + swapMaster swapParams inGS inData inCounter curBestGraphList inSimAnnealParams + + newGraphList ← GO.selectGraphs Best (keepNum swapParams) 0.0 newGraphList' + -- liftIO $ putStr ("SD-After: " <> (show $ fmap snd5 newGraphList)) + -- found no better + + if null newGraphList + then pure (curBestGraphList, newCounter) + else + let newCost = minimum $ fmap snd5 newGraphList + in -- found worse for some reason + if newCost > inBestCost + then pure (curBestGraphList, newCounter) + else -- found same (ie additional) + + if newCost == inBestCost + then do + allGraphs <- GO.selectGraphs Best (keepNum swapParams) 0.0 (newGraphList <> curBestGraphList) + pure (allGraphs, newCounter) + else -- found better-- go around again + do + let newSimAnnealParamList = zip (U.generateUniqueRandList (length newGraphList) newSAParams) newGraphList + swapDriver swapParams inGS inData newCounter newGraphList newSimAnnealParamList + + +{- swapMAster + Working through the complex logic of swapSPRTBR +-} +swapMaster + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → [ReducedPhylogeneticGraph] + → [(Maybe SAParams, ReducedPhylogeneticGraph)] + → PhyG ([ReducedPhylogeneticGraph], Int, Maybe SAParams) +swapMaster swapParams inGS inData@(leafNames, _, _) inCounter curBestGraphList inSimAnnealParams = + -- swapSPRTBR swapParams inGS inData inCounter curBestGraphList inSimAnnealParams + let curBestCost = minimum $ fmap snd5 curBestGraphList + numLeaves = V.length leafNames + in do + inGraphNetPenalty ← T.getPenaltyFactor inGS inData Nothing $ GO.convertReduced2PhylogeneticGraph (head curBestGraphList) + let inGraphNetPenaltyFactor = inGraphNetPenalty / curBestCost + + -- (a,b,c) <- swapAll swapParams inGS inData inCounter curBestCost curBestGraphList curBestGraphList numLeaves netPenaltyFactor (fst $ head inSimAnnealParams) + -- a' <- GO.selectGraphs Best (keepNum swapParams) 0.0 a + -- pure (a,b,c) + + -- THis seems to screw up + case (fst $ head inSimAnnealParams) of + Nothing → do + -- steepest takes immediate best--does not keep equall cost-- for now--disabled not working correctly so goes to "all" + (swappedGraphs, counter, swapSAPArams) ← + swapAll + swapParams + inGS + inData + inCounter + curBestCost + curBestGraphList + curBestGraphList + numLeaves + inGraphNetPenaltyFactor + Nothing + pure $ case swappedGraphs of + [] → (curBestGraphList, counter, Nothing) + gs → (gs, counter, swapSAPArams) + -- simulated annealing/drifting acceptance does a steepest with SA acceptance + Just simAnneal → + -- then a swap steepest and all on annealed graph + -- same at this level method (SA, Drift) choice occurs at lower level + -- annealed should only yield a single graph + let -- create list of params with unique list of random values for rounds of annealing + annealDriftRounds = rounds simAnneal + newSimAnnealParamList = U.generateUniqueRandList annealDriftRounds (fst $ head inSimAnnealParams) + -- parallel setup + action ∷ Maybe SAParams → PhyG ([ReducedPhylogeneticGraph], Int, Maybe SAParams) + action = swapAll swapParams inGS inData 0 curBestCost curBestGraphList curBestGraphList numLeaves inGraphNetPenaltyFactor + + extractGraphTopoCost + ∷ ([ReducedPhylogeneticGraph], Int, Maybe SAParams) + → ([ReducedPhylogeneticGraph], Int, Maybe SAParams) + extractGraphTopoCost = applyOver1of3 (listApplying strict1and2of5) + in -- this to ensure current step set to 0 + do + swapPar ← getParallelChunkTraverseBy extractGraphTopoCost + (annealDriftGraphs', anealDriftCounterList, annealDriftParams) ← unzip3 <$> swapPar action newSimAnnealParamList + + -- annealed/Drifted 'mutated' graphs + annealDriftGraphs ← GO.selectGraphs Unique (keepNum swapParams) 0.0 $ concat annealDriftGraphs' + + -- swap back "normally" if desired for full drifting/annealing + (swappedGraphs, counter, _) ← + swapAll + swapParams + inGS + inData + (sum anealDriftCounterList) + (min curBestCost (minimum $ snd5 <$> annealDriftGraphs)) + curBestGraphList + annealDriftGraphs + numLeaves + inGraphNetPenaltyFactor + Nothing + + bestGraphs ← GO.selectGraphs Best (keepNum swapParams) 0.0 $ curBestGraphList <> swappedGraphs + -- this Bool for Genetic Algorithm mutation step + pure $ + if not $ returnMutated swapParams + then (bestGraphs, counter, head annealDriftParams) + else (annealDriftGraphs, sum anealDriftCounterList, head annealDriftParams) + + +{- | swapAll is a high level function that basically deals with portioning out swap-type swaps +and performs the high level options for Alternate where SPR is perfomred first, then TBR, +but whenever a better (or additional) graph is found during TBR, an SPR swap of that graph +is performed before returning to TBR again. THis contibues until no new graphs are found in the +SPR + TBR swap. +each call to swapAll' sets break edge number to 0 +this swaps untill none found better +-} +swapAll + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → VertexCost + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → Int + → VertexCost + → Maybe SAParams + → PhyG ([ReducedPhylogeneticGraph], Int, Maybe SAParams) +swapAll swapParams inGS inData counter curBestCost curSameBetterList [] numLeaves netPenaltyFactor inSimAnnealParams = pure (curSameBetterList, counter, inSimAnnealParams) +swapAll swapParams inGS inData counter curBestCost curSameBetterList inGraphList@(firstGraph : otherGraphs) numLeaves netPenaltyFactor inSimAnnealParams = + {-# SCC swapAll_TOP_DEF #-} + -- nni, spr, tbr + case swapType swapParams of + Alternate → do + -- logWith LogInfo "In swapAll-Alt" + (sprGraphs, sprCounter, sprSAPArams) ← + swapAll' + (swapParams{swapType = SPR}) + inGS + inData + counter + curBestCost + curSameBetterList + inGraphList + numLeaves + netPenaltyFactor + 0 + inSimAnnealParams + graphsToTBR ← GO.selectGraphs Best (keepNum swapParams) 0.0 $ sprGraphs <> inGraphList + let sprBestCost = getGraphCost graphsToTBR + + -- tbr until find better or novel equal + (tbrGraphs, tbrCounter, tbrSAPArams) ← + swapAll' + (swapParams{swapType = TBRAlternate}) + inGS + inData + sprCounter + sprBestCost + graphsToTBR + graphsToTBR + numLeaves + netPenaltyFactor + 0 + sprSAPArams + let tbrBestCost = case tbrGraphs of + [] → infinity + (_, c, _, _, _) : _ → c + + {- This isn't improving performance so turned off in SwapSPRTBR-} + -- if found better and alternating union pruning then return so can go back to start union pruning again + case joinType swapParams of + JoinAlternate | sprBestCost < curBestCost → pure (sprGraphs, sprCounter, sprSAPArams) + JoinAlternate | tbrBestCost < curBestCost → pure (tbrGraphs, tbrCounter, tbrSAPArams) + _ → case tbrBestCost `compare` sprBestCost of + -- found nothing better or equal + GT → pure (graphsToTBR, tbrCounter, tbrSAPArams) + -- if TBR found better go around again with SPR first--since returned if found better during TBR rejoin + LT → + swapAll + swapParams + inGS + inData + tbrCounter + tbrBestCost + tbrGraphs + tbrGraphs + numLeaves + netPenaltyFactor + tbrSAPArams + -- check if found additional + EQ → do + bestTBRGraphs ← GO.selectGraphs Best (keepNum swapParams) 0.0 tbrGraphs + let newTBRGraphs = GO.reducedphylogeneticGraphListMinus bestTBRGraphs $ sprGraphs <> curSameBetterList + case newTBRGraphs of + g : gs + | length bestTBRGraphs < keepNum swapParams → + swapAll + swapParams + inGS + inData + tbrCounter + tbrBestCost + tbrGraphs + newTBRGraphs + numLeaves + netPenaltyFactor + tbrSAPArams + -- found nothing new + _ → pure (tbrGraphs, tbrCounter, tbrSAPArams) + + -- 0 is for list of edges so move through list past stable edges + _ → + swapAll' + swapParams + inGS + inData + counter + curBestCost + curSameBetterList + inGraphList + numLeaves + netPenaltyFactor + 0 + inSimAnnealParams + + +{- | swapAll' performs branch swapping on all 'break' edges and all readditions +this not a "map" version to reduce memory footprint to a more mangeable level +"steepest" a passable option to short circuit readdition action to return immediately +if better graph found +1) takes first graph +2) if steepeast checks to make sure <= current best cost +3) gets list of "break-able" edges + all non-root edges if tree + all non-root bridge edges if network +4) send list (and other info) to split-join function + goes on if empty list returned or > current best + add graphs todo list if == current best cost +5) returns all of minimum cost found +if Alternate then when found better do SPR first then TBR +assumes SPR done before Alternate entering so can star with TBR and iff get better +go back to SPR. NBest for "steepest" descent +For drift and anneal need to randomize order of splits and rejoins +-} +swapAll' + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → VertexCost + → [ReducedPhylogeneticGraph] + → [ReducedPhylogeneticGraph] + → Int + → VertexCost + → Int + → Maybe SAParams + → PhyG ([ReducedPhylogeneticGraph], Int, Maybe SAParams) +swapAll' swapParams inGS inData counter curBestCost curSameBetterList inGraphList numLeaves netPenaltyFactor breakEdgeNumber inSimAnnealParams = + let selectionOf x = GO.selectGraphs x (keepNum swapParams) 0.0 + in -- don't need to check for mutated here since checked above + case inGraphList of + [] → + let tag x = (x, counter, inSimAnnealParams) + in tag <$> Unique `selectionOf` curSameBetterList + firstGraph : tailGraphs + | LG.isEmpty $ thd5 firstGraph → + swapAll' + swapParams + inGS + inData + counter + curBestCost + curSameBetterList + tailGraphs + numLeaves + netPenaltyFactor + breakEdgeNumber + inSimAnnealParams + firstGraph : tailGraphs → + let firstDecoratedGraph = thd5 firstGraph + (firstRootIndex, _) = head $ LG.getRoots firstDecoratedGraph + + -- determine edges to break on--'bridge' edges only for network + -- filter out edges from root since no use--would just rejoin + -- sort longest edge to shortest--option to speeed up steepest and conditions for all as well + -- this edge sort from Varon and Wheeler 2013 + breakEdgeList' = + let filtration = filter ((/= firstRootIndex) . fst3) + extractor + | graphType inGS == Tree || LG.isTree firstDecoratedGraph = LG.labEdges + | otherwise = LG.getEdgeSplitList + sorting + | not $ atRandom swapParams = GO.sortEdgeListByLength + | otherwise = id + in sorting . filtration $ extractor firstDecoratedGraph + in do + -- logWith LogInfo "In swapAll'" + -- randomize edges list order for anneal and drift + breakEdgeList'' ← + if atRandom swapParams + then shuffleList breakEdgeList' + else pure breakEdgeList' + + -- move first "breakEdgeFactor" edges in split list to end + -- since breakEdgeFactor can get incremented past number of edges the integer remainder is determined + -- to move to end + -- this to reduces the revisiting of stable edges (by moving them to the end of the list) + -- yet still insures that all edges will be visited in final (or ay time needed) split. + -- used in POY v 1-3, Came from Steve Farris pers. com. + let breakEdgeFactor = snd $ divMod breakEdgeNumber (length breakEdgeList'') + let breakEdgeList = + let (prefix, suffix) = splitAt breakEdgeFactor breakEdgeList'' + in suffix <> prefix + + -- perform intial split and rejoin on each edge in first graph + splitJoinResult ← + {-# SCC splitJoinResult #-} + splitJoinGraph + swapParams + inGS + inData + curBestCost + curSameBetterList + numLeaves + netPenaltyFactor + inSimAnnealParams + firstGraph + breakEdgeNumber + breakEdgeList + breakEdgeList + let (newGraphList', newSAParams, newBreakEdgeNumber) = splitJoinResult + + -- get best return graph list-can be empty if nothing better ort smame cost + newGraphList ← Best `selectionOf` newGraphList' + + -- get unique return graph list-can be empty if nothing better ort same cost + let newMinCost = getGraphCost `orInfinity` newGraphList + + -- logic for returning normal swap operations (better only) + -- versus simulated annealin/Drifing returning potentially sub-optimal + case inSimAnnealParams of + Nothing → case newMinCost `compare` curBestCost of + -- found better cost graph + LT → do + logWith LogInfo $ "\t->" <> (show newMinCost) + -- for alternate do SPR first then TBR + -- for alternate in TBR or prune union alternate if found better return immediately + if swapType swapParams == TBRAlternate || joinType swapParams == JoinAlternate || steepest swapParams + then pure (newGraphList, counter, newSAParams) + else -- regular swap--keep going with better graphs + + swapAll' + swapParams + inGS + inData + (counter + 1) + newMinCost + newGraphList + newGraphList + numLeaves + netPenaltyFactor + newBreakEdgeNumber + newSAParams + + -- found only worse graphs--never happens due to the way splitjoin returns only better or equal + -- but could change + GT → + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + curSameBetterList + (tail inGraphList) + numLeaves + netPenaltyFactor + 0 -- breakEdgeNumber set to zero for new graph to look at + newSAParams + -- found same cost graphs + EQ → + -- Important to not limit curSameBest since may rediscover graphs via swapping on equal when limiting the number to keep + -- can be a cause of infinite running issues. + -- let newCurSameBetterList = GO.selectGraphs Best (keepNum swapParams) 0.0 (-1) (curSameBetterList <> newGraphList) + let graphChoices = (tailGraphs <> newGraphList) `GO.reducedphylogeneticGraphListMinus` curSameBetterList + in do + newCurSameBetterList ← (Best `selectionOf`) $ curSameBetterList <> newGraphList + graphsToDo ← Best `selectionOf` graphChoices + + -- these conditions help to prevent recswapping endlessly on new graphs thatare not in buffers, + -- but have same cost + let graphsToDo' + | keepNum swapParams - 1 <= length graphsToDo = tailGraphs + -- found nothing better that is new + | keepNum swapParams - 1 <= length newCurSameBetterList = tailGraphs + | otherwise = graphsToDo + + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + newCurSameBetterList + graphsToDo' + numLeaves + netPenaltyFactor + newBreakEdgeNumber + newSAParams + + -- simulated annealing/Drift post processing + Just simAnneal → case newMinCost `compare` curBestCost of + -- found better cost graph + LT → logWith LogInfo ("\t->" <> show newMinCost) $> (newGraphList, counter, newSAParams) + -- not better so check for drift changes or annealing steps and return if reached maximum number + _ + | currentStep simAnneal >= numberSteps simAnneal || driftChanges simAnneal >= driftMaxChanges simAnneal → do + g ← selectionOf Unique $ newGraphList <> curSameBetterList + pure (g, counter, newSAParams) + + -- didn't hit stopping numbers so continuing--but based on current best cost not whatever was found + _ → do + newBestGraph ← selectionOf Unique $ newGraphList <> curSameBetterList + graphsToDo ← selectionOf Unique $ (newGraphList <> tailGraphs) `GO.reducedphylogeneticGraphListMinus` curSameBetterList + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + newBestGraph + graphsToDo + numLeaves + netPenaltyFactor + newBreakEdgeNumber + newSAParams + + -- found only worse graphs--never happens due to the way splitjoin returns only better or equal + -- but could change + Nothing + | newMinCost > curBestCost → + -- breakEdgeNumber set to zero for new graph to look at + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + curSameBetterList + tailGraphs + numLeaves + netPenaltyFactor + 0 + newSAParams + -- found same cost graphs + -- Important to not limit curSameBest since may rediscover graphs via swapping on equal when limiting the number to keep + -- can be a cause of infinite running issues. + Nothing → + let graphChoices = (tailGraphs <> newGraphList) `GO.reducedphylogeneticGraphListMinus` curSameBetterList + in do + newCurSameBetterList ← (Best `selectionOf`) $ curSameBetterList <> newGraphList + graphsToDo ← Best `selectionOf` graphChoices + + -- these conditions help to prevent recswapping endlessly on new graphs thatare not in buffers, + -- but have same cost + let graphsToDo' + | keepNum swapParams - 1 <= length graphsToDo = tailGraphs + -- found nothing better that is new + | keepNum swapParams - 1 <= length newCurSameBetterList = tailGraphs + | otherwise = graphsToDo + + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + newCurSameBetterList + graphsToDo' + numLeaves + netPenaltyFactor + newBreakEdgeNumber + newSAParams + + -- simulated annealing/Drift post processing + + -- found better cost graph + Just simAnneal | newMinCost < curBestCost → do + logWith LogInfo $ "\t->" <> show newMinCost + pure (newGraphList, counter, newSAParams) + + -- not better so check for drift changes or annealing steps and return if reached maximum number + Just simAnneal | currentStep simAnneal >= numberSteps simAnneal || driftChanges simAnneal >= driftMaxChanges simAnneal → do + g ← selectionOf Unique $ newGraphList <> curSameBetterList + pure (g, counter, newSAParams) + + -- didn't hit stopping numbers so continuing--but based on current best cost not whatever was found + Just simAnneal → do + newBestGraph ← selectionOf Unique $ newGraphList <> curSameBetterList + graphsToDo ← selectionOf Unique $ (newGraphList <> tailGraphs) `GO.reducedphylogeneticGraphListMinus` curSameBetterList + swapAll' + swapParams + inGS + inData + (counter + 1) + curBestCost + newBestGraph + graphsToDo + numLeaves + netPenaltyFactor + breakEdgeNumber + newSAParams + + +{- | splitJoinGraph splits a graph on a single input edge (recursively though edge list) and rejoins to all possible other edges +if steepest == True then returns on finding a better graph (lower cost) +this will traverse entire SPR neighbohood if nothing better found (or steepest == False) +different from swapALL (original) in that it doesn't build up split list so lower memory footprint +breakEdgeList Complete keeps original edge list so can create readdition edge lists more easily +parallel map on rejoin if not steepest, if steepest do number of parallel threads so can reurn if any one is better +NB -- need to verify NNI/SPR/TBR rearrangement numbers +assumes break edges are bridge edges +graph split into two peices "base" graph with original root and "pruned" graph that was split off. +the edge connecting the two is (originalConnectionOfPruned -> prunedGraphRootIndex) +the edges in pruned graph do not contaiun that edge since are enumerated via preorder pass from prunedGraphRootIndex +this is the edge that is reconnected when graphs are joined, it is often delted and rejoined to update info and to deal with +conditions where the pruned graph is a single terminal +returns teh "breakEdgeNumber" so that in steepest, the edge breaking can continue in where it left off so to speak. +this can speed up SPR/TBR by a contant factor by not revisiting stable edges (not used in SA/drifting) +used in POY v1-3 +-} +splitJoinGraph + ∷ SwapParams + → GlobalSettings + → ProcessedData + → VertexCost + → [ReducedPhylogeneticGraph] + → Int + → VertexCost + → Maybe SAParams + → ReducedPhylogeneticGraph + → Int + → [LG.LEdge EdgeInfo] + → [LG.LEdge EdgeInfo] + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams, Int) +splitJoinGraph swapParams inGS inData curBestCost curSameBetterList numLeaves netPenaltyFactor inSimAnnealParams firstGraph breakEdgeNumber' breakEdgeListComplete = \case + [] → pure (curSameBetterList, inSimAnnealParams, 0) + -- split on first input edge + edgeToBreakOn : otherEdges → + let -- this so breaking edges can contnue where current left off + -- since "rotates" edges all will be done. + breakEdgeNumber = breakEdgeNumber' + 1 + + -- split input graph into a part with the original root ("base") and the "pruned" graph -- the piece split off w/o original root + (splitGraph, graphRoot, prunedGraphRootIndex, originalConnectionOfPruned) = LG.splitGraphOnEdge (thd5 firstGraph) edgeToBreakOn + in do + -- reoptimize split graph for re-addition heuristics + (reoptimizedSplitGraph, splitCost) ← + reoptimizeSplitGraphFromVertex inGS inData (doIA swapParams) netPenaltyFactor splitGraph graphRoot prunedGraphRootIndex + + -- check for malformed network split--do nothing if malformed + if splitCost == infinity + then pure ([], inSimAnnealParams, breakEdgeNumber) + else do + -- regular swap + -- get root in base (for readdition) and edges in pruned section for rerooting during TBR readdition + let (_, edgesInPrunedGraph) = LG.nodesAndEdgesAfter splitGraph [(originalConnectionOfPruned, fromJust $ LG.lab splitGraph originalConnectionOfPruned)] + + let edgesInBaseGraph = breakEdgeListComplete L.\\ (edgeToBreakOn : edgesInPrunedGraph) + + -- insert here union calcuations based on Varon and Wheeler (2013) + -- basically--rebuild edge to rejoin list based on critical value, totalCost - splitCost, and + -- edge union distance + -- build edges pre-order and add to rejoin list if + -- 1) not network (but still recurse to children) + -- 2) union delta below threshold and recurse to children + -- if > threshold then stop, no add, no recurse since children can only get hihger ubnion distance + -- use split graph (with reoptimized nodes) and overall graph root to get avialbel edges in base graph for rejoin + + let prunedToRejoinUnionData = vertData $ fromJust $ LG.lab reoptimizedSplitGraph prunedGraphRootIndex + -- prunedToRejoinUnionData = vertData $ fromJust $ LG.lab (thd5 firstGraph) prunedGraphRootIndex + let charInfoVV = fft5 firstGraph + unionEdgeList ← + getUnionRejoinEdgeList + inGS + reoptimizedSplitGraph + charInfoVV + [graphRoot] + ((snd5 firstGraph) - splitCost) + prunedToRejoinUnionData + [] + + -- builds graph edge list with unions--need to be able to turn off and just used edges in base graph for some sort + -- of "no-union" swap + -- determine those edges within distance of original if limited (ie NNI etc) + let veryBigDist = (maxMoveEdgeDist swapParams) >= ((maxBound ∷ Int) `div` 3) + -- NOTE: There might be a strictness issue here + let ~candidateEdges = take (maxMoveEdgeDist swapParams) $ LG.sortEdgeListByDistance splitGraph [graphRoot] [graphRoot] + let rejoinEdges' = case (veryBigDist, joinType swapParams) of + (True, JoinAll) → edgesInBaseGraph + (True, _) → unionEdgeList + (False, JoinAll) → candidateEdges + (False, _) → L.intersect candidateEdges unionEdgeList + + -- randomize edges list order for anneal and drift + rejoinEdges ← + if atRandom swapParams + then shuffleList rejoinEdges' + else pure rejoinEdges' + + -- rejoin graph to all possible edges in base graph + rejoinResult ← + {-# SCC rejoinResult #-} + rejoinGraph + swapParams + inGS + inData + curBestCost + [] + netPenaltyFactor + reoptimizedSplitGraph + (GO.convertDecoratedToSimpleGraph splitGraph) + splitCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + rejoinEdges + edgesInPrunedGraph + inSimAnnealParams + let (newGraphList, newSAParams) = rejoinResult + {- + trace ("Edge to break on:" <> (show $ LG.toEdge edgeToBreakOn) + <> "\nBase graph edges: " <> (show $ fmap LG.toEdge edgesInBaseGraph) + <> "\nPruned graph edges: " <> (show $ fmap LG.toEdge edgesInPrunedGraph) + <> "\nTarget edges to rejoin: " <> (show $ fmap LG.toEdge rejoinEdges) + <> "\nFull edgelist: " <> (show $ fmap LG.toEdge breakEdgeListComplete)) + -} + + newGraphList' ← GO.selectGraphs Best (keepNum swapParams) 0.0 newGraphList + + case inSimAnnealParams of + -- only returns graphs if same of better else empty + -- adds null o\r better graphs to reurn list + Nothing + | (not . null) newGraphList && (steepest swapParams) → + pure (newGraphList', inSimAnnealParams, breakEdgeNumber) + Nothing → do + splitJoinResult ← + splitJoinGraph + swapParams + inGS + inData + curBestCost + curSameBetterList + numLeaves + netPenaltyFactor + inSimAnnealParams + firstGraph + breakEdgeNumber + breakEdgeListComplete + otherEdges + let (recurseGraphList, _, newEdgeBreakNumber) = splitJoinResult + pure (newGraphList' <> recurseGraphList, inSimAnnealParams, newEdgeBreakNumber) + -- Annealing/Drift swap + -- return if better + -- return if other chosen probabalistically + -- recurse if nothing returned + + -- if better than current graph return + Just simAnnealParams → + let newMinCost + | null newGraphList = infinity + | otherwise = (snd5 . head) newGraphList' + + result + | newMinCost < curBestCost = pure (newGraphList', newSAParams, breakEdgeNumber) + -- if SA returned graphs--return them + | (not . null) newGraphList = pure (newGraphList, newSAParams, breakEdgeNumber) + -- keep going if nothing + | otherwise = + splitJoinGraph + swapParams + inGS + inData + -- (tail randomIntListSwap) + curBestCost + curSameBetterList + numLeaves + netPenaltyFactor + newSAParams + firstGraph + breakEdgeNumber + breakEdgeListComplete + otherEdges + in result + + +{- | getUnionRejoinEdgeList takes a graph (split and reoptimized usually), the overall root index (of split), +split cost, and union threshold value and returns list of edges that have union distance <= threshold factor +assumes that root edges are not network edges (an invariant) +checks node then recurses to children +-} +getUnionRejoinEdgeList + ∷ GlobalSettings + → DecoratedGraph + → V.Vector (V.Vector CharInfo) + → [LG.Node] + → Double + → VertexBlockData + → [LG.LEdge EdgeInfo] + → PhyG [LG.LEdge EdgeInfo] +getUnionRejoinEdgeList inGS inGraph charInfoVV nodeIndexList splitDiffCost nodeToJoinUnionData curEdgeList = + -- returns edges in post order since prepending--could reverse if want pre-order edges + -- might be better post order, unclear + if null nodeIndexList + then pure curEdgeList + else + let nodeIndex = head nodeIndexList + childEdges = LG.out inGraph nodeIndex + childNodeIndexList = fmap snd3 childEdges + + -- node data and union distance + nodeData = vertData $ fromJust $ LG.lab inGraph nodeIndex + in -- nodeDataString = U.getUnionFieldsNode nodeData + -- toJoinString = U.getUnionFieldsNode nodeToJoinUnionData + + -- traceNoLF ("GURE: " <> (show nodeIndex)) ( + -- trace ("GUREL: " <> (show (unionDistance, splitDiffCost, unionThreshold * splitDiffCost))) ( + if (length childEdges) `notElem` [0, 1, 2] + then error ("Node has improper number of children : " <> (show $ length childEdges)) + else do + -- add non-outgroup root edge to list for rejoin after checking for acceptable union distance + -- if edge is not within union distance factor then stop--no recursion + -- this since distance cannot get lower further upt the graph given union creation + + unionDistance ← getUnionDistanceM nodeToJoinUnionData nodeData charInfoVV + let metThreshold = unionDistance < splitDiffCost * (unionThreshold inGS) + + if LG.isRoot inGraph nodeIndex + then + if metThreshold + then + if (null $ LG.out inGraph (head childNodeIndexList)) + then + getUnionRejoinEdgeList + inGS + inGraph + charInfoVV + [(snd3 $ last childEdges)] + splitDiffCost + nodeToJoinUnionData + ((last childEdges) : curEdgeList) + else + getUnionRejoinEdgeList + inGS + inGraph + charInfoVV + [(snd3 $ head childEdges)] + splitDiffCost + nodeToJoinUnionData + ((head childEdges) : curEdgeList) + else pure curEdgeList + else do + -- non-root node--process childre 1/2 + -- recurses to their children if union condition met--but doesn't add network edges + -- check current node--then recurse to children + + -- let -- first and second child data child + newCurEdgeListChild ← + if metThreshold + then getUnionRejoinEdgeList inGS inGraph charInfoVV childNodeIndexList splitDiffCost nodeToJoinUnionData (childEdges <> curEdgeList) + else pure curEdgeList + -- in -- recurse remaining nodes + getUnionRejoinEdgeList inGS inGraph charInfoVV (tail nodeIndexList) splitDiffCost nodeToJoinUnionData newCurEdgeListChild + + +{- | getUnionDistanceM gets distance between two the union fields of two characters +since its a distance no need for no change cost adjustment +-} +getUnionDistanceM ∷ VertexBlockData → VertexBlockData → V.Vector (V.Vector CharInfo) → PhyG Double +getUnionDistanceM union1 union2 charInfoVV = + let noChangeCostAdjut = False + in M.distance2UnionsM noChangeCostAdjut union1 union2 charInfoVV + + +-- | rejoinGraphTuple is a wrapper around rejoinGraph for fmapping--only returns graph list not simulated annealing params +rejoinGraphTuple + ∷ SwapParams + → GlobalSettings + → ProcessedData + → VertexCost + → [ReducedPhylogeneticGraph] + → Maybe SAParams + → (DecoratedGraph, SimpleGraph, VertexCost, LG.Node, LG.Node, LG.Node, [LG.LEdge EdgeInfo], [LG.LEdge EdgeInfo], VertexCost) + → PhyG [ReducedPhylogeneticGraph] +rejoinGraphTuple + swapParams + inGS + inData + curBestCost + curBestGraphs + inSimAnnealParams + ( reoptimizedSplitGraph + , splitGraphSimple + , splitGraphCost + , graphRoot + , prunedGraphRootIndex + , originalConnectionOfPruned + , rejoinEdges + , edgesInPrunedGraph + , netPenaltyFactor + ) = do + result ← + rejoinGraph + swapParams + inGS + inData + curBestCost + curBestGraphs + netPenaltyFactor + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + rejoinEdges + edgesInPrunedGraph + inSimAnnealParams + pure $ fst result + + +{- | rejoinGraph rejoins a split graph at all edges (if not steepest and found better) +in "base" graph. +if not steepest then do all as map, else recursive on base graph edge list +nni doesn't apper to be correct here--maybe loose it--doing nothing +-} +rejoinGraph + ∷ SwapParams + → GlobalSettings + → ProcessedData + → VertexCost + → [ReducedPhylogeneticGraph] + → VertexCost + → DecoratedGraph + → SimpleGraph + → VertexCost + → LG.Node + → LG.Node + → LG.Node + → [LG.LEdge EdgeInfo] + → [LG.LEdge EdgeInfo] + → Maybe SAParams + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +rejoinGraph swapParams inGS inData curBestCost curBestGraphs netPenaltyFactor reoptimizedSplitGraph splitGraphSimple splitGraphCost graphRoot prunedGraphRootIndex originalConnectionOfPruned rejoinEdges' edgesInPrunedGraph inSimAnnealParams = + {-# SCC rejoinGraph_TOP_DEF #-} + -- found no better--but return equal cost graphs + -- trace ("In rejoinGraph with num rejoining edges: " <> (show $ length rejoinEdges')) ( + if null rejoinEdges' + then pure (curBestGraphs, inSimAnnealParams) + else -- this is for no swapping option in fuse and genetic algorithm-fuse + + let rejoinEdges = + if (swapType swapParams) == NoSwap + then take 6 rejoinEdges' + else rejoinEdges' + in -- regular swapping + if isNothing inSimAnnealParams + then -- check if split graph cost same as graph then return since graph can only get longer on readdition + + if splitGraphCost >= curBestCost + then pure ([], inSimAnnealParams) + else -- fmap over all edges in base graph + + if not (steepest swapParams) + then + let {- + TODO: Add back safe parallelism here + * Old implementation (unsafe paralel): + rejoinGraphList = concat $ fmap fst $ PU.seqParMap (parStrategy $ lazyParStrat inGS) (singleJoin swapParams inGS inData reoptimizedSplitGraph splitGraphSimple splitGraphCost prunedGraphRootIndex originalConnectionOfPruned curBestCost edgesInPrunedGraph inSimAnnealParams) rejoinEdges + * New implementation (safe sequential): + -} + -- parallel stuff + action ∷ LG.LEdge EdgeInfo → PhyG [ReducedPhylogeneticGraph] + action = + {-# SCC rejoinGraph_action_of_singleJoin_1 #-} + fmap fst + . singleJoin + swapParams + inGS + inData + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph + inSimAnnealParams + in do + -- logWith LogInfo "In rejoinGraph-not-steepest" + rejoinGraphList ← + getParallelChunkTraverseBy (listApplying strict2of5) >>= \pTraverse → + fold <$> pTraverse action rejoinEdges + {- + rejoinOperation = fst . singleJoin + swapParams + inGS + inData + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph + inSimAnnealParams + rejoinGraphList = foldMap rejoinOperation rejoinEdges + -} + + {-Checking only min but seems to make slower + newMinCost = if null rejoinGraphList then infinity + else minimum $ fmap snd rejoinGraphList + (minEstCostNewGraphList, _) = unzip $ filter ((== newMinCost) . snd) rejoinGraphList + -} + + -- newGraphList = fmap (T.multiTraverseFullyLabelGraphReduced inGS inData False False Nothing) (fmap fst rejoinGraphList) `using` PU.myParListChunkRDS + newGraphList ← GO.selectGraphs Best (keepNum swapParams) 0 rejoinGraphList + + -- will only return graph if <= curBest cost + case rejoinGraphList of + [] → pure ([], inSimAnnealParams) + xs → case getGraphCost newGraphList `compare` curBestCost of + LT → pure (newGraphList, inSimAnnealParams) + _ → pure ([], inSimAnnealParams) + else -- famp over number of threads edges in base graph + -- then recurse + + let -- this could be made a little parallel--but if lots of threads basically can do all + -- to not overload paralle threads + {- This not so efficient is swapping in single graphs so leaving it be + saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + numGraphsToExamine = (graphsSteepest inGS) -- min (graphsSteepest inGS) PU.getNumThreads + rejoinEdgeList = take numGraphsToExamine rejoinEdges + + -- parallel stuff + action ∷ LG.LEdge EdgeInfo → PhyG [ReducedPhylogeneticGraph] + action = + {-# SCC rejoinGraph_action_of_singleJoin_2 #-} + fmap fst + . singleJoin + swapParams + inGS + inData + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph + inSimAnnealParams + in do + -- logWith LogInfo $ "In rejoinGraph-steepest " <> (show $ length rejoinEdgeList) + rejoinGraphList ← + getParallelChunkTraverseBy (listApplying strict1and2of5) >>= \pTraverse → + fold <$> pTraverse action rejoinEdgeList + + newGraphList' ← GO.selectGraphs Best (keepNum swapParams) 0.0 rejoinGraphList + + -- found nothing better or equal + if null rejoinGraphList + then -- trace ("In steepest worse: " <> (show $ length (drop PU.getNumThreads rejoinEdges))) + + rejoinGraph + swapParams + inGS + inData + curBestCost + curBestGraphs + netPenaltyFactor + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + (drop numGraphsToExamine rejoinEdges) + edgesInPrunedGraph + inSimAnnealParams + else -- found better graph + + if (snd5 . head) newGraphList' < curBestCost + then -- trace ("Steepest better") + pure (newGraphList', inSimAnnealParams) + else -- found equal cost graph + + if (snd5 . head) newGraphList' == curBestCost + then do + newBestList ← GO.selectGraphs Best (keepNum swapParams) 0 $ curBestGraphs <> newGraphList' + rejoinGraph + swapParams + inGS + inData + curBestCost + newBestList + netPenaltyFactor + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + (drop numGraphsToExamine rejoinEdges) + edgesInPrunedGraph + inSimAnnealParams + else -- found worse graphs only + + -- trace ("In steepest worse (after recalculation): " <> (show $ length (drop PU.getNumThreads rejoinEdges))) + + rejoinGraph + swapParams + inGS + inData + curBestCost + curBestGraphs + netPenaltyFactor + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + (drop numGraphsToExamine rejoinEdges) + edgesInPrunedGraph + inSimAnnealParams + else -- Drifting/Simulated annealing + -- basically if it accepted a graph (better or probabalistically) then pass up + -- otherwise move to next rejoin, changes in graphs counted at higher level + + let -- based on "steepest" + -- to not overload paralle threads + {- This not so efficient is swapping in single graphs so leaving it be + saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + numGraphsToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + rejoinEdgeList = take numGraphsToExamine rejoinEdges + simAnnealParamList = U.generateUniqueRandList numGraphsToExamine inSimAnnealParams + + -- parallel stuff + action ∷ (Maybe SAParams, LG.LEdge EdgeInfo) → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) + action = + {-# SCC rejoinGraph_action_of_singleJoinPrime #-} + singleJoin' + swapParams + inGS + inData + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph + in do + rejoinGraphPairList ← + getParallelChunkTraverseBy (applyOver1of2 (listApplying strict2of5)) >>= \pTraverse → + pTraverse action $ zip simAnnealParamList rejoinEdgeList + + -- mechanics to see if trhere is a better graph in return set + -- only taking first of each list--so can keep sa params with them--really all should have length == 1 anyway + -- making sure remove all null lists that nothing was found + let nonEmptyPairList = filter (not . null . fst) rejoinGraphPairList + + let rejoinGraphList = + if (not . null) nonEmptyPairList + then fmap (head . fst) nonEmptyPairList + else [] + + let newMinCost = + if (not . null) rejoinGraphList + then minimum $ fmap snd5 rejoinGraphList + else infinity + + -- head should only be called when non-empty--so should never get runtime head error + let (newMinGraph, newMinGraphSAParams) = head $ filter ((== newMinCost) . snd5 . fst) (zip rejoinGraphList (fmap snd nonEmptyPairList)) + + -- if better than current--pass up and on + if newMinCost < curBestCost + then pure ([newMinGraph], newMinGraphSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep $ fromJust inSimAnnealParams) >= (numberSteps $ fromJust inSimAnnealParams)) + || ((driftChanges $ fromJust inSimAnnealParams) >= (driftMaxChanges $ fromJust inSimAnnealParams)) + then pure (curBestGraphs, inSimAnnealParams) + else -- not better so go to SA results + + -- return first non-empty result + + if (not . null) nonEmptyPairList + then pure $ head nonEmptyPairList + else -- if nothing returned (no better or probabalistically chosen) go on with updated SA params + + let newSAParams = (snd . head) rejoinGraphPairList + in rejoinGraph + swapParams + inGS + inData + curBestCost + curBestGraphs + netPenaltyFactor + reoptimizedSplitGraph + splitGraphSimple + splitGraphCost + graphRoot + prunedGraphRootIndex + originalConnectionOfPruned + (drop numGraphsToExamine rejoinEdges) + edgesInPrunedGraph + newSAParams + + +-- | singleJoin' is a wrapper arounds singleJoin to allow parMap with individual SAParams +singleJoin' + ∷ SwapParams + → GlobalSettings + → ProcessedData + → DecoratedGraph + → SimpleGraph + → VertexCost + → LG.Node + → LG.Node + → VertexCost + → [LG.LEdge EdgeInfo] + → (Maybe SAParams, LG.LEdge EdgeInfo) + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +singleJoin' swapParams inGS inData splitGraph splitGraphSimple splitCost prunedGraphRootIndex originalConnectionOfPruned curBestCost edgesInPrunedGraph (inSimAnnealParams, targetEdge) = + singleJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph + inSimAnnealParams + targetEdge + + +{- | singleJoin takes optimized split graph, split cost, target edge, swap type (ie TBR/SPR/NNI) +and "rejoins" the split graph to a single graph--creates joined graph and calculates a heuristic graph cost +based on the union assignment of the edge and its distance to the root vertex of the pruned graph +if TBR checks all edges in pruned graph with readdition edge (shortcircuits if steepest == True) +always deletes connecting edge to pruned part and readds--this because sometimes it is there and sometimes not (depending on +if SPR for terminal etc) and can create parallel edges with different weights (0.0 or not) so just remove to be sure. +TBR uses dynamic epsilon even in SPR moves--SPR does not +-} +singleJoin + ∷ SwapParams + → GlobalSettings + → ProcessedData + → DecoratedGraph + → SimpleGraph + → VertexCost + → LG.Node + → LG.Node + → VertexCost + → [LG.LEdge EdgeInfo] + → Maybe SAParams + → LG.LEdge EdgeInfo + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +singleJoin swapParams inGS inData splitGraph splitGraphSimple splitCost prunedGraphRootIndex originalConnectionOfPruned curBestCost edgesInPrunedGraph inSimAnnealParams targetEdge@(u, v, _) + | LG.isEmpty splitGraphSimple = pure ([], inSimAnnealParams) + -- do redo orginal graph join + | originalConnectionOfPruned `elem` [u, v] = pure ([], inSimAnnealParams) + | otherwise = case LG.lab splitGraph prunedGraphRootIndex of + Nothing → pure ([], inSimAnnealParams) + -- regular swap + Just label → + let defaultResult = (mempty, inSimAnnealParams) + + newEdgeList = + [ (u, originalConnectionOfPruned, 0.0) + , (originalConnectionOfPruned, v, 0.0) + , (originalConnectionOfPruned, prunedGraphRootIndex, 0.0) + ] + + charInfoVV = fmap thd3 $ thd3 inData + + -- Filter for bridge edges for TBR when needed + edgesInPrunedGraph' + | (graphType inGS == Tree) || LG.isTree splitGraphSimple = edgesInPrunedGraph + | otherwise = fmap fst . filter snd . zip edgesInPrunedGraph $ LG.isBridge splitGraphSimple . LG.toEdge <$> edgesInPrunedGraph + + sprNewGraph = LG.insEdges newEdgeList $ LG.delEdges [(u, v), (originalConnectionOfPruned, prunedGraphRootIndex)] splitGraphSimple + + -- here when needed--correct graph is issue in network + -- swap can screw up time consistency and other issues + getCheckedGraphNewSPR ∷ PhyG (LG.Gr NameText VertexCost) + getCheckedGraphNewSPR = do + isPhyloGraph ← LG.isPhylogeneticGraph sprNewGraph + let result + | graphType inGS == Tree = sprNewGraph + | isPhyloGraph = sprNewGraph + | otherwise = LG.empty + pure result + + decide ∷ ReducedPhylogeneticGraph → ([ReducedPhylogeneticGraph], Maybe SAParams) + decide input@(_, newCost, _, _, _) + | newCost <= curBestCost = ([input], inSimAnnealParams) + | otherwise = defaultResult + + action ∷ PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) + action = do + sprNewGraphChecked ← getCheckedGraphNewSPR + if LG.isEmpty sprNewGraphChecked + then pure defaultResult + else decide <$> T.multiTraverseFullyLabelGraphReduced inGS inData False False Nothing sprNewGraphChecked + -- graphType with IA field + -- only uswe wqhere they exist + + (makeEdgeDataFunction, edgeJoinFunction) = + if graphType inGS == HardWired + then (M.makeEdgeDataM False True, edgeJoinDelta inGS False) + else + if not (useIA inGS) + then (M.makeEdgeDataM False True, edgeJoinDelta inGS False) + else (M.makeEdgeDataM True True, edgeJoinDelta inGS True) + in do + -- set edge union creation type to IA-based, filtering gaps (should be linear) + -- hence True True + targetEdgeData ← makeEdgeDataFunction splitGraph charInfoVV targetEdge + -- this for using DO for edge O(n^2) + -- targetEdgeData = M.makeEdgeData doIA (not doIA) splitGraph charInfoVV targetEdge + + -- this for SPR/NNI only + let prunedRootVertexData = vertData $ fromJust $ LG.lab splitGraph prunedGraphRootIndex + + -- rejoin should always be DO based on edge and pruned root but can be different lengths (unless Static Approx) + sprReJoinCost ← edgeJoinFunction charInfoVV prunedRootVertexData targetEdgeData + + case inSimAnnealParams of + -- SPR or no TBR rearrangements + -- wierdness here with first case should have been taking longer--but is quicker for prealigned + Nothing → case swapType swapParams of + -- _ | length edgesInPrunedGraph < 4 → action + SPR | sprReJoinCost + splitCost <= curBestCost → action + SPR → pure defaultResult + -- do TBR stuff returning SPR results if heuristic better + TBR | (length edgesInPrunedGraph < 4) && (sprReJoinCost + splitCost <= curBestCost) → action + TBR | (length edgesInPrunedGraph < 4) → pure defaultResult + TBR → do + -- check if spr better always return if so + sprResult ← + if sprReJoinCost + splitCost <= curBestCost + (sprReJoinCost * (dynamicEpsilon inGS)) + then fst <$> action + else pure mempty + case toList sprResult of + _ : _ → pure (sprResult, inSimAnnealParams) + [] → do + tbrResult' ← + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph' + inSimAnnealParams + targetEdge + let (tbrResult, _) = tbrResult' + pure (tbrResult, inSimAnnealParams) + + -- TBRAlternate can skip SPR moves since done already in alternate scenario + _ → do + tbrResult' ← + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph' + inSimAnnealParams + targetEdge + let (tbrResult, _) = tbrResult' + pure (tbrResult, inSimAnnealParams) + + -- simulated annealing/Drift swap + Just _simAnnealParams → do + -- check if spr better always return if so + sprNewGraphChecked ← getCheckedGraphNewSPR + rediagnosedSPRGraph@(_, newCost, _, _, _) ← + T.multiTraverseFullyLabelGraphReduced inGS inData False False Nothing sprNewGraphChecked + + let sprResult + | sprReJoinCost + splitCost <= curBestCost + (sprReJoinCost * (dynamicEpsilon inGS)) && newCost < curBestCost = + [rediagnosedSPRGraph] + | otherwise = mempty + case toList sprResult of + -- spr better than current + (_, cost, _, _, _) : _ → do + (_, newSAParams) ← U.simAnnealAccept inSimAnnealParams curBestCost cost + pure (sprResult, newSAParams) + -- do simAnneal/Drift for SPR and on to tbr if not accept + [] → do + acceptance ← U.simAnnealAccept inSimAnnealParams curBestCost (sprReJoinCost + splitCost) + case acceptance of + -- if accepted (better or random) then return with updated annealing/Drift parameters + (True, newSAParams) → pure ([rediagnosedSPRGraph], newSAParams) + -- rejected-recurse with updated SA params + (False, newSAParams) → case swapType swapParams of + SPR → pure (mempty, newSAParams) + _ | length edgesInPrunedGraph < 4 → pure (mempty, newSAParams) + _ → + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + edgesInPrunedGraph' + newSAParams + targetEdge + + +{- | edgeJoinDelta calculates heuristic cost for joining pair edges + IA field is faster--but has to be there and not for harwired +-} +edgeJoinDelta ∷ GlobalSettings → Bool → V.Vector (V.Vector CharInfo) → VertexBlockData → VertexBlockData → PhyG VertexCost +edgeJoinDelta inGS useIA charInfoVV edgeA edgeB = + if (not useIA) + then do + vertexStuff ← POSW.createVertexDataOverBlocks inGS edgeA edgeB charInfoVV [] + pure $ V.sum $ fmap V.sum $ fmap (fmap snd) vertexStuff + else do + vertexStuff ← POSW.createVertexDataOverBlocksStaticIA inGS edgeA edgeB charInfoVV [] + pure $ V.sum $ fmap V.sum $ fmap (fmap snd) vertexStuff + + +{- | tbrJoin performs TBR rearrangements on pruned graph component +"reroots" pruned graph on each bridge edge and tries join to +target edge as in SPR +each edge is tried in turn (except for original root edge covered by singleJoin SPR function) +if heuristic edge join cost is below current best cost then the component is rerooted, joined to +target edge and graph fully diagnosed to verify cost +"steepest" short circuits checking edges to return better verified cost graph immediately +otherwise ("all") returns all graphs better than or equal to current better cost +if nothing equal or better found returns empty list +tests if reroot edges are bridge edges +uses dynamic epsilon--seems the delta estimate is high +-} +tbrJoin + ∷ SwapParams + → GlobalSettings + → ProcessedData + → DecoratedGraph + → SimpleGraph + → VertexCost + → LG.Node + → LG.Node + → VertexCost + → [LG.LEdge EdgeInfo] + → Maybe SAParams + → LG.LEdge EdgeInfo + → PhyG ([ReducedPhylogeneticGraph], Maybe SAParams) +tbrJoin swapParams inGS inData splitGraph splitGraphSimple splitCost prunedGraphRootIndex originalConnectionOfPruned curBestCost edgesInPrunedGraph inSimAnnealParams targetEdge = + -- this is for networks stopping TBR rearrangements with there are network edegs involved in pruned part of graph + let hasNetEdges + | graphType inGS == Tree = False + | otherwise = null $ filter ((== True) . LG.isNetworkLabEdge splitGraph) edgesInPrunedGraph + in case edgesInPrunedGraph of + [] → pure ([], inSimAnnealParams) + _ | hasNetEdges → pure ([], inSimAnnealParams) + _ → + -- get target edge data\ + -- always using IA for union, but filtering out gaps (doIA (not doIA)) + let charInfoVV = fmap thd3 $ thd3 inData + + -- graphType with IA field + -- only uswe wqhere they exist\ + (makeEdgeDataFunction, edgeJoinFunction) + | graphType inGS == HardWired = (M.makeEdgeDataM False True, edgeJoinDelta inGS False) + | not (useIA inGS) = (M.makeEdgeDataM False True, edgeJoinDelta inGS False) + | otherwise = (M.makeEdgeDataM True True, edgeJoinDelta inGS True) + in do + targetEdgeData ← makeEdgeDataFunction splitGraph charInfoVV targetEdge + + -- parallell stuff + let makeEdgeAction ∷ LG.LEdge b → PhyG VertexBlockData + makeEdgeAction = makeEdgeDataFunction splitGraph charInfoVV + + let joinAction ∷ VertexBlockData → PhyG VertexCost + joinAction = edgeJoinFunction charInfoVV targetEdgeData + + let rerootAction ∷ LG.LEdge EdgeInfo → SimpleGraph + rerootAction = rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned targetEdge + + -- Debugging info + -- debugger ∷ (Logger m, Show a, Show b, Show c) ⇒ String -> a → LG.Gr b c → m () + -- debugger str n g = logWith LogTech $ fold + -- [ "In: 'tbrJoin' with '", str, "'\n Graph [ G_", show n, " ]:\n", LG.prettify g ] + + -- reoptimizeAction ∷ SimpleGraph → PhyG ReducedPhylogeneticGraph + let reoptimizeAction g0 = do + -- debugger "reoptimizeAction" 0 g0 + result@(g1, _, _, _, _) ← T.multiTraverseFullyLabelGraphReduced inGS inData False False Nothing g0 + -- debugger "reoptimizeAction" 1 g1 + pure result + -- logic for annealing/Drift regular swap first + case inSimAnnealParams of + -- get heuristic delta joins for edges in pruned graph + Nothing + | not (steepest swapParams) → + let rerootEdgeList = filter ((/= prunedGraphRootIndex) . fst3) $ filter ((/= originalConnectionOfPruned) . fst3) edgesInPrunedGraph + in do + -- debugger "CASE OF -> Nothing( 1 )" 0 splitGraphSimple + -- True True to use IA fields and filter gaps + rerootEdgeDataList ← + getParallelChunkTraverse >>= \pTraverse → + makeEdgeAction `pTraverse` rerootEdgeList + + rerootEdgeDeltaList' ← + getParallelChunkTraverse >>= \pTraverse → + joinAction `pTraverse` rerootEdgeDataList + + let rerootEdgeDeltaList = fmap (+ splitCost) rerootEdgeDeltaList' + + let joinBestCost = minimum $ curBestCost : rerootEdgeDeltaList + -- logWith LogInfo $ "TBRJoin JBC CBC->" <> show (curBestCost, joinBestCost) + + -- check for possible better/equal graphs and verify + let deltaAdjustmentJoinCost = (curBestCost - splitCost) * (dynamicEpsilon inGS) + let candidateEdgeList = fmap fst $ filter ((<= (curBestCost + deltaAdjustmentJoinCost)) . snd) (zip rerootEdgeList rerootEdgeDeltaList) + + -- /NOTE:/ All returned fields are required by 'reoptimizeAction', so evaluate strictly + candidateJoinedGraphList' ← + getParallelChunkMap <&> \pMap → + rerootAction `pMap` candidateEdgeList + + -- check for graph wierdness if network + candidateJoinedGraphList ← + if graphType inGS == Tree + then pure candidateJoinedGraphList' + else filterM LG.isPhylogeneticGraph candidateJoinedGraphList' + + -- check for graph wierdness + rediagnosedGraphList' ← + getParallelChunkTraverseBy strict2of5 >>= \pTraverse → + reoptimizeAction `pTraverse` candidateJoinedGraphList + let rediagnosedGraphList = filter ((<= curBestCost) . snd5) rediagnosedGraphList' + + let result + | null candidateEdgeList = [] + | null rediagnosedGraphList = [] + | otherwise = rediagnosedGraphList + pure (result, Nothing) + + -- get steepest edges + Nothing → + let -- to not overload paralle threads + {- This not so efficient is swapping in single graphs so leaving it be + saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + numEdgesToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + firstSetEdges = take numEdgesToExamine edgesInPrunedGraph + + -- get heuristic delta joins for steepest edge set + rerootEdgeList = filter ((/= prunedGraphRootIndex) . fst3) $ filter ((/= originalConnectionOfPruned) . fst3) firstSetEdges + in do + -- debugger "CASE OF -> Nothing( 2 )" 0 splitGraphSimple + -- True True to use IA fields and filter gaps + + rerootEdgeDataList ← + getParallelChunkTraverse >>= \pTraverse → + makeEdgeAction `pTraverse` rerootEdgeList + + rerootEdgeDeltaList' ← + getParallelChunkTraverse >>= \pTraverse → + joinAction `pTraverse` rerootEdgeDataList + + let rerootEdgeDeltaList = fmap (+ splitCost) rerootEdgeDeltaList' + + -- check for possible better/equal graphs and verify + let deltaAdjustmentJoinCost = (curBestCost - splitCost) * (dynamicEpsilon inGS) + let candidateEdgeList = fmap fst $ filter ((<= (curBestCost + deltaAdjustmentJoinCost)) . snd) (zip rerootEdgeList rerootEdgeDeltaList) + + -- /NOTE:/ All returned fields are required by 'reoptimizeAction', so evaluate strictly + candidateJoinedGraphList ← + getParallelChunkMap <&> \pMap → + rerootAction `pMap` candidateEdgeList + + rediagnosedGraphList' ← + getParallelChunkTraverseBy strict2of5 >>= \pTraverse → + reoptimizeAction `pTraverse` candidateJoinedGraphList + let rediagnosedGraphList = filter ((<= curBestCost) . snd5) rediagnosedGraphList' + + -- trace ("TBR steepest: " <> (show $ length rerootEdgeList) <> " edges to go " <> (show $ length $ (drop numEdgesToExamine edgesInPrunedGraph))) ( + if null candidateEdgeList + then + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + (drop numEdgesToExamine edgesInPrunedGraph) + inSimAnnealParams + targetEdge + else + if null rediagnosedGraphList + then + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + (drop numEdgesToExamine edgesInPrunedGraph) + inSimAnnealParams + targetEdge + else -- trace ("TBR: " <> (show $ minimum $ fmap snd5 rediagnosedGraphList)) + pure (rediagnosedGraphList, Nothing) + + -- simulated annealing/Drift stuff + -- based on steepest type swapping + Just simAnnealParams → + let -- to not overload paralle threads + {- This not so efficient is swapping in single graphs so leaving it be + saRounds = if isNothing inSimAnnealParams then 1 + else rounds $ fromJust inSimAnnealParams + + (numGraphsToExamine, _) = divMod PU.getNumThreads saRounds -- this may not "drift" if finds alot better, but that's how its supposed to work + -} + numEdgesToExamine = graphsSteepest inGS -- min (graphsSteepest inGS) PU.getNumThreads + firstSetEdges = take numEdgesToExamine edgesInPrunedGraph + + -- get heuristic delta joins for steepest edge set + rerootEdgeList = filter ((/= prunedGraphRootIndex) . fst3) $ filter ((/= originalConnectionOfPruned) . fst3) firstSetEdges + in do + -- debugger "CASE OF -> Just" 0 splitGraphSimple + -- True True to use IA fields and filter gaps + rerootEdgeDataList ← + getParallelChunkTraverse >>= \pTraverse → + makeEdgeAction `pTraverse` rerootEdgeList + + rerootEdgeDeltaList' ← + getParallelChunkTraverse >>= \pTraverse → + joinAction `pTraverse` rerootEdgeDataList + + let rerootEdgeDeltaList = fmap (+ splitCost) rerootEdgeDeltaList' + -- PU.seqParMap (parStrategy $ lazyParStrat inGS) (edgeJoinFunction charInfoVV targetEdgeData) rerootEdgeDataList + + let minDelta = + if (not . null) rerootEdgeDeltaList + then minimum $ rerootEdgeDeltaList + else infinity + let minEdgeList = + if (not . null) rerootEdgeDeltaList + then fmap fst $ filter ((== minDelta) . snd) (zip rerootEdgeList rerootEdgeDeltaList) + else [] + + -- check for possible better/equal graphs and verify + rediagnosedGraphList ← case minEdgeList of + [] → pure [] + xs → + getParallelChunkTraverseBy strict2of5 >>= \pTraverse → + (reoptimizeAction . rerootAction) `pTraverse` xs + + let newMinCost = + if (not . null) minEdgeList + then minimum $ fmap snd5 rediagnosedGraphList + else infinity + + -- only taking one for SA/Drift check + let newMinGraph = + if newMinCost /= infinity + then head $ filter ((== newMinCost) . snd5) rediagnosedGraphList + else emptyReducedPhylogeneticGraph + + -- if better always return it--hope this conditions short circuits so don't fully diagnose graph all the time + if minDelta < curBestCost && newMinCost < curBestCost + then do + (_, newSAParams) ← U.simAnnealAccept inSimAnnealParams curBestCost newMinCost + pure ([newMinGraph], newSAParams) + else -- check if hit step limit--more for SA than drift + + if ((currentStep simAnnealParams) >= (numberSteps simAnnealParams)) + || ((driftChanges simAnnealParams) >= (driftMaxChanges simAnnealParams)) + then pure ([], inSimAnnealParams) + else do + (acceptGraph, newSAParams) ← U.simAnnealAccept inSimAnnealParams curBestCost minDelta + -- banner = if newMinCost < curBestCost then "TBR heur better" + -- else "TBR Accepted not better" + -- if accepted (better or random) then return with updated annealing/Drift parameters + if acceptGraph + then -- trace (banner <> (show $ driftChanges $ fromJust newSAParams)) + pure ([newMinGraph], newSAParams) + else -- rejected--recurse wirth updated params + + -- trace ("TBR SA Drift Reject: " <> (show $ driftChanges $ fromJust newSAParams)) + + tbrJoin + swapParams + inGS + inData + splitGraph + splitGraphSimple + splitCost + prunedGraphRootIndex + originalConnectionOfPruned + curBestCost + (drop numEdgesToExamine edgesInPrunedGraph) + newSAParams + targetEdge + + +-- | rerootPrunedAndMakeGraph reroots the pruned graph component on the rerootEdge and joins to base gaph at target edge +rerootPrunedAndMakeGraph ∷ SimpleGraph → LG.Node → LG.Node → LG.LEdge EdgeInfo → LG.LEdge EdgeInfo → SimpleGraph +rerootPrunedAndMakeGraph splitGraphSimple prunedGraphRootIndex originalConnectionOfPruned (u, v, _) rerootEdge = + -- get edges to delete and edges to add + let (prunedEdgesToAdd, prunedEdgesToDelete) = getTBREdgeEditsSimple splitGraphSimple prunedGraphRootIndex rerootEdge + + -- edges to connect rerooted pruned component and base graph + connectingEdges = + [ (u, originalConnectionOfPruned, 0.0) + , (originalConnectionOfPruned, v, 0.0) + , (originalConnectionOfPruned, prunedGraphRootIndex, 0.0) + ] + + tbrNewGraph = + LG.insEdges (connectingEdges <> prunedEdgesToAdd) $ + LG.delEdges ([(u, v), (originalConnectionOfPruned, prunedGraphRootIndex)] <> prunedEdgesToDelete) splitGraphSimple + in tbrNewGraph + + +{- | getTBREdgeEditsSimple takes and edge and returns the list of edit to pruned subgraph +as a pair of edges to add and those to delete +since reroot edge is directed (e,v), edges away from v will have correct +orientation. Edges between 'e' and the root will have to be flipped +original root edges and reroort edge are deleted and new root and edge spanning orginal root created +delete original connection edge and creates a new one--like SPR +returns ([add], [delete]) +-} +getTBREdgeEditsSimple ∷ SimpleGraph → LG.Node → LG.LEdge a → ([LG.LEdge Double], [LG.Edge]) +getTBREdgeEditsSimple inGraph prunedGraphRootIndex rerootEdge = + -- trace ("Getting TBR Edits for " <> (show rerootEdge)) ( + let -- originalRootEdgeNodes = LG.descendants inGraph prunedGraphRootIndex + originalRootEdges = LG.out inGraph prunedGraphRootIndex + + -- get path from new root edge fst vertex to orginal root and flip those edges + -- since (u,v) is u -> v u "closer" to root + closerToPrunedRootEdgeNode = (fst3 rerootEdge, fromJust $ LG.lab inGraph $ fst3 rerootEdge) + (nodesInPath, edgesinPath) = + LG.postOrderPathToNode inGraph closerToPrunedRootEdgeNode (prunedGraphRootIndex, fromJust $ LG.lab inGraph prunedGraphRootIndex) + + -- don't want original root edges to be flipped since later deleted + edgesToFlip = edgesinPath L.\\ originalRootEdges + flippedEdges = fmap LG.flipLEdge edgesToFlip + + -- new edges on new root position and spanning old root + -- add in closer vertex to root to make sure direction of edge is correct + newEdgeOnOldRoot = + if (snd3 $ head originalRootEdges) `elem` ((fst3 rerootEdge) : (fmap fst nodesInPath)) + then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, 0.0) + else (snd3 $ last originalRootEdges, snd3 $ head originalRootEdges, 0.0) + newRootEdges = [(prunedGraphRootIndex, fst3 rerootEdge, 0.0), (prunedGraphRootIndex, snd3 rerootEdge, 0.0)] + in -- assumes we are not checking original root + -- rerooted + -- delete orignal root edges and rerootEdge + -- add new root edges + -- and new edge on old root--but need orientation + -- flip edges from new root to old (delete and add list) + {- + trace ("\n\nIn Graph:\n" <> (LG.prettyIndices inGraph) <> "\nTBR Edits: " <> (show (LG.toEdge rerootEdge, prunedGraphRootIndex)) + <> " NewEdgeOldRoot: " <> (show $ LG.toEdge newEdgeOnOldRoot) + <> " New rootEdges: " <> (show $ fmap LG.toEdge newRootEdges) + ) + -} + -- <> "\nEdges to add: " <> (show $ fmap LG.toEdge $ newEdgeOnOldRoot : (flippedEdges <> newRootEdges)) <> "\nEdges to delete: " <> (show $ rerootEdge : (fmap LG.toEdge (edgesToFlip <> originalRootEdges)))) + (newEdgeOnOldRoot : (flippedEdges <> newRootEdges), LG.toEdge rerootEdge : (fmap LG.toEdge (edgesToFlip <> originalRootEdges))) + + +-- ) + +{- | reoptimizeSplitGraphFromVertex fully labels the component graph that is connected to the specified vertex +retuning that graph with 2 optimized components and their cost +both components goo through multi-traversal optimizations +doIA option to only do IA optimization as opposed to full thing--should be enormously faster--but yet more approximate +creates final for both base graph and priunned component due to rerooting non-concordance of preorder and post order assignments +terminology bse graph is the component with the original root, pruned that which has been removed form the original +graph to be readded to edge set +The function +1) optimizes two components seprately from their "root" +2) takes nodes and edges for each and cretes new graph +3) returns graph and summed cost of two components +4) adds in root and netPenalty factor estimates since net penalty can only be calculated on full graph +part of this is turning off net penalty cost when optimizing base and pruned graph components +if doIA is TRUE then call function that onl;y optimizes the IA assignments on the "original graph" after split. +this keeps teh IA chracters in sync across the two graphs +NB uses PhylogeneticGraph internally +This should return infinity for split graph cost if either component is emptyGraph +-} +reoptimizeSplitGraphFromVertex + ∷ GlobalSettings + → ProcessedData + → Bool + → VertexCost + → DecoratedGraph + → Int + → Int + → PhyG (DecoratedGraph, VertexCost) +reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex = + -- trace ("RSGFV: " <> (show startVertex)) ( + if doIA + then -- only reoptimize the IA states for dynamic characters + reoptimizeSplitGraphFromVertexIA inGS inData netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex + else -- perform full optimizations of nodes + -- these required for full optimization + + let nonExactCharacters = U.getNumberSequenceCharacters (thd3 inData) + origGraph = inSplitGraph -- thd5 origPhyloGraph + leafGraph = + if graphType inGS == SoftWired + then LG.extractLeafGraph origGraph -- POSW.makeLeafGraphSoftWired inGS inData -- LG.extractLeafGraph origGraph + else LG.extractLeafGraph origGraph + calcBranchLengths = False + + -- this for multitravers in swap for softwired to turn off + multiTraverse = + if graphType inGS /= HardWired + then multiTraverseCharacters inGS + else False + + -- create simple graph version of split for post order pass + splitGraphSimple = GO.convertDecoratedToSimpleGraph inSplitGraph + in do + -- create optimized base graph + -- False for staticIA + (postOrderBaseGraph, _) ← + T.generalizedGraphPostOrderTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + nonExactCharacters + inData + leafGraph + False + (Just startVertex) + splitGraphSimple + + fullBaseGraph ← + PRE.preOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + (finalAssignment inGS) + False + calcBranchLengths + (nonExactCharacters > 0) + startVertex + True + postOrderBaseGraph + + -- create fully optimized pruned graph. Post order then preorder + + -- get root node of pruned graph--parent since that is the full pruned piece (keeping that node for addition to base graph and edge creation) + let startPrunedNode = (prunedSubGraphRootVertex, fromJust $ LG.lab origGraph prunedSubGraphRootVertex) + let startPrunedParentNode = head $ LG.labParents origGraph prunedSubGraphRootVertex + let startPrunedParentEdge = (fst startPrunedParentNode, prunedSubGraphRootVertex, dummyEdge) + + -- False for staticIA + (postOrderPrunedGraph, _) ← + T.generalizedGraphPostOrderTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + nonExactCharacters + inData + leafGraph + False + (Just prunedSubGraphRootVertex) + splitGraphSimple + + -- False for staticIA + fullPrunedGraph ← + PRE.preOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + (finalAssignment inGS) + False + calcBranchLengths + (nonExactCharacters > 0) + prunedSubGraphRootVertex + True + postOrderPrunedGraph + + -- get root node of base graph + let startBaseNode = (startVertex, fromJust $ LG.lab (thd6 fullBaseGraph) startVertex) + + -- get nodes and edges in base and pruned graph (both PhylogeneticGrapgs so thd5) + let (baseGraphNonRootNodes, baseGraphEdges) = LG.nodesAndEdgesAfter (thd6 fullBaseGraph) [startBaseNode] + + let (prunedGraphNonRootNodes, prunedGraphEdges) = + if LG.isLeaf origGraph prunedSubGraphRootVertex + then ([], []) + else LG.nodesAndEdgesAfter (thd6 fullPrunedGraph) [startPrunedNode] + + -- make fully optimized graph from base and split components + let fullSplitGraph = + LG.mkGraph + ([startBaseNode, startPrunedNode, startPrunedParentNode] <> baseGraphNonRootNodes <> prunedGraphNonRootNodes) + (startPrunedParentEdge : (baseGraphEdges <> prunedGraphEdges)) + + -- cost of split graph to be later combined with re-addition delta for heuristic graph cost + let prunedCost = + if LG.isLeaf origGraph prunedSubGraphRootVertex + then 0 + else snd6 fullPrunedGraph + let splitGraphCost = ((1.0 + netPenaltyFactor) * ((snd6 fullBaseGraph) + prunedCost)) + + {- + -- check fo unlabbeld nodes + coninicalNodes = LG.labNodes fullSplitGraph + nodeLabels = fmap (LG.lab fullSplitGraph) (fmap fst coninicalNodes) + unlabelledNodes = filter ((== Nothing) .snd) $ (zip (fmap fst coninicalNodes) nodeLabels) + -} + + if prunedCost == infinity || (snd6 fullBaseGraph) == infinity + then pure (LG.empty, infinity) + else pure (fullSplitGraph, splitGraphCost) + + +-- ) + +-- | reoptimizeSplitGraphFromVertexTuple wrapper for reoptimizeSplitGraphFromVertex with last 3 args as tuple +reoptimizeSplitGraphFromVertexTuple + ∷ GlobalSettings + → ProcessedData + → Bool + → VertexCost + → (DecoratedGraph, Int, Int) + → PhyG (DecoratedGraph, VertexCost) +reoptimizeSplitGraphFromVertexTuple inGS inData doIA netPenaltyFactor (inSplitGraph, startVertex, prunedSubGraphRootVertex) = + reoptimizeSplitGraphFromVertex inGS inData doIA netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex + + +{- | reoptimizeSplitGraphFromVertexIA performs operations of reoptimizeSplitGraphFromVertex for static charcaters +but dynamic characters--only update IA assignments and initialized from origPhylo graph (at leaves) to keep IA characters in sync +since all "static" only need single traversal post order pass +uses PhylogenetiGraph internally +-} +reoptimizeSplitGraphFromVertexIA + ∷ GlobalSettings + → ProcessedData + → VertexCost + → DecoratedGraph + → Int + → Int + → PhyG (DecoratedGraph, VertexCost) +reoptimizeSplitGraphFromVertexIA inGS inData netPenaltyFactor inSplitGraph startVertex prunedSubGraphRootVertex = + -- if graphType inGS /= Tree then error "Networks not yet implemented in reoptimizeSplitGraphFromVertexIA" + -- else + let nonExactCharacters = U.getNumberSequenceCharacters (thd3 inData) + origGraph = inSplitGraph -- thd5 origPhyloGraph + + -- create leaf graphs--but copy IA final to prelim + leafGraph = + if graphType inGS == SoftWired + then GO.copyIAFinalToPrelim $ LG.extractLeafGraph origGraph -- POSW.makeLeafGraphSoftWired inGS inData -- LG.extractLeafGraph origGraph + else GO.copyIAFinalToPrelim $ LG.extractLeafGraph origGraph + calcBranchLengths = False + + -- this for multitravers in swap for softwired to turn off + multiTraverse = + if graphType inGS == Tree + then multiTraverseCharacters inGS + else False + + -- create simple graph version of split for post order pass + splitGraphSimple = GO.convertDecoratedToSimpleGraph inSplitGraph + in do + -- Create base graph + -- create postorder assignment--but only from single traversal + -- True flag fior staticIA + postOrderBaseGraph ← + POSW.postOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + inData + leafGraph + True + (Just startVertex) + splitGraphSimple + let baseGraphCost = snd6 postOrderBaseGraph + + -- True flag fior staticIA + fullBaseGraph ← + PRE.preOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + (finalAssignment inGS) + True + calcBranchLengths + (nonExactCharacters > 0) + startVertex + True + postOrderBaseGraph + + {- + localRootCost = if (rootCost inGS) == NoRootCost then 0.0 + else error ("Root cost type " <> (show $ rootCost inGS) <> " is not yet implemented") + -} + + -- get root node of base graph + let startBaseNode = (startVertex, fromJust $ LG.lab (thd6 fullBaseGraph) startVertex) + + -- Create pruned graph + -- get root node of pruned graph--parent since that is the full pruned piece (keeping that node for addition to base graph and edge creation) + let startPrunedNode = GO.makeIAPrelimFromFinal (prunedSubGraphRootVertex, fromJust $ LG.lab origGraph prunedSubGraphRootVertex) + let startPrunedParentNode = head $ LG.labParents origGraph prunedSubGraphRootVertex + let startPrunedParentEdge = (fst startPrunedParentNode, prunedSubGraphRootVertex, dummyEdge) + + -- True flag fior staticIA + postOrderPrunedGraph ← + POSW.postOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + inData + leafGraph + True + (Just prunedSubGraphRootVertex) + splitGraphSimple + let prunedGraphCost = snd6 postOrderPrunedGraph + + -- True flag fior staticIA + fullPrunedGraph ← + PRE.preOrderTreeTraversal + (inGS{graphFactor = NoNetworkPenalty, multiTraverseCharacters = multiTraverse}) + (finalAssignment inGS) + True + calcBranchLengths + (nonExactCharacters > 0) + prunedSubGraphRootVertex + True + postOrderPrunedGraph + + -- get nodes and edges in base and pruned graph (both PhylogeneticGrapgs so thd5) + let (baseGraphNonRootNodes, baseGraphEdges) = LG.nodesAndEdgesAfter (thd6 fullBaseGraph) [startBaseNode] + + let (prunedGraphNonRootNodes, prunedGraphEdges) = + if LG.isLeaf origGraph prunedSubGraphRootVertex + then ([], []) + else LG.nodesAndEdgesAfter (thd6 fullPrunedGraph) [startPrunedNode] + + -- make fully optimized graph from base and split components + let fullSplitGraph = + LG.mkGraph + ([startBaseNode, startPrunedNode, startPrunedParentNode] <> baseGraphNonRootNodes <> prunedGraphNonRootNodes) + (startPrunedParentEdge : (baseGraphEdges <> prunedGraphEdges)) + + let splitGraphCost = ((1.0 + netPenaltyFactor) * (baseGraphCost + prunedGraphCost)) + + -- remove when working + -- trace ("ROGFVIA split costs:" <> (show (baseGraphCost, prunedGraphCost, localRootCost)) <> " -> " <> (show splitGraphCost)) ( + if prunedGraphCost == infinity || baseGraphCost == infinity + then pure (LG.empty, infinity) + else + if splitGraphCost == 0 + then + error + ( "Split costs:" + <> (show (baseGraphCost, prunedGraphCost)) + <> " -> " + <> (show splitGraphCost) + <> " Split graph simple:\n" + <> (LG.prettify splitGraphSimple) + <> "\nFull:\n" + <> (show inSplitGraph) + <> "\nOriginal Graph:\n" + <> (show origGraph) + ) + else pure (fullSplitGraph, splitGraphCost) + + +getGraphCost ∷ [ReducedPhylogeneticGraph] → VertexCost +getGraphCost = snd5 . head + + +orInfinity ∷ ∀ a r t. (Foldable t, Fractional r) ⇒ (t a → r) → t a → r +orInfinity f xs + | null xs = fromRational Real.infinity + | otherwise = f xs + +{- Old swap driving code that got overly complex and + was not working properly--very slow +-} + +{- +{- | swapSPRTBR performs SPR or TBR branch (edge) swapping on graphs +runs both SPR and TBR depending on argument since so much duplicated functionality +'steepest' abandons swap graph and switches to found graph as soon as anyhting 'better' +is found. The alternative (all) examines the entire neighborhood and retuns the best result +the return is a list of better graphs and the number of swapping rounds were required to ge there +if joinType == JoinAll is specified a single round is performed--otherwise a union rounds +alternate between joinPruned and joinAll. This to be rapid but complete. +joinType = JoinAll for annealing/drifting +-} +swapSPRTBR + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → [ReducedPhylogeneticGraph] + → [(Maybe SAParams, ReducedPhylogeneticGraph)] + → PhyG ([ReducedPhylogeneticGraph], Int) +swapSPRTBR swapParams inGS inData inCounter currBestGraphs = \case + [] → pure (currBestGraphs, inCounter) + firstPair@(inSimAnnealParams, _) : _ + | joinType swapParams == JoinAll || isJust inSimAnnealParams → + swapSPRTBR' (swapParams{joinType = JoinAll}) inGS inData inCounter firstPair + + firstPair@(inSimAnnealParams, inGraph) : morePairs → do + -- join with union pruing first then followed by joinAll, but joinAlternate will return on better gaphs to return to join prune + (firstList, firstCounter) ← + swapSPRTBR' (swapParams{joinType = JoinPruned}) inGS inData inCounter firstPair + -- the + 5 is to allow for extra buffer room with input graph and multiple equally costly solutions, can help + bestFirstList ← GO.selectGraphs Best (keepNum swapParams) 0.0 $ inGraph : firstList + (alternateList, alternateCounter') ← + swapSPRTBRList (swapParams{joinType = JoinAlternate}) inGS inData firstCounter bestFirstList $ + zip + (U.generateUniqueRandList (length bestFirstList) inSimAnnealParams) + bestFirstList + (bestAlternateList, alternateCounter) ← case joinType swapParams of + JoinAlternate → + let graphs = inGraph : (alternateList <> bestFirstList) + tag x = (x, alternateCounter') + in tag <$> GO.selectGraphs Best (keepNum swapParams) 0.0 graphs + -- JoinPruned-don't alternate during search + _ → pure (bestFirstList, firstCounter) + + -- recursive list version as opposed ot parMap version + -- should reduce memory footprint at cost of less parallelism--but random replicates etc should take care of that + (afterSecondList, afterSecondCounter) ← + swapSPRTBRList (swapParams{joinType = JoinAll}) inGS inData alternateCounter bestAlternateList $ + zip + (U.generateUniqueRandList (length bestAlternateList) inSimAnnealParams) + bestAlternateList + + bestSecondList ← GO.selectGraphs Best (keepNum swapParams) 0.0 afterSecondList + + -- add final joinall if buffer full? + let nextBestGraphs = case comparing getGraphCost bestSecondList currBestGraphs of + LT → bestSecondList + _ → currBestGraphs + --liftIO $ putStr ("SSPR: " <> (show $ fmap snd5 nextBestGraphs) ) -- <> " " <> (show $ fmap (snd5 .snd) morePairs)) + swapSPRTBR swapParams inGS inData (afterSecondCounter + inCounter) nextBestGraphs morePairs + +{- | swapSPRTBRList is a wrapper around swapSPRTBR' allowing for a list of graphs and a current best cost +reduce time of swap +-} +swapSPRTBRList + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → [ReducedPhylogeneticGraph] + → [(Maybe SAParams, ReducedPhylogeneticGraph)] + → PhyG ([ReducedPhylogeneticGraph], Int) +swapSPRTBRList swapParams inGS inData inCounter curBestGraphs = \case + [] → pure (curBestGraphs, inCounter) + -- currrent graph worse than best saved + (_, inGraph) : otherDoubles + | snd5 inGraph > getGraphCost curBestGraphs → + swapSPRTBRList swapParams inGS inData inCounter curBestGraphs otherDoubles + + firstPair@(inSimAnnealParams, inGraph) : otherDoubles → do + --logWith LogInfo "In SwapSPRTBRList" + + (graphList, swapCounter) ← swapSPRTBR' swapParams inGS inData inCounter firstPair + bestNewGraphList ← GO.selectGraphs Best (keepNum swapParams) 0.0 graphList + --liftIO $ putStr (" SSPRTBRList: " <> (show $ fmap snd5 (inGraph : bestNewGraphList))) + let recurse = swapSPRTBRList swapParams inGS inData swapCounter + let recurse' = flip recurse otherDoubles + case comparing getGraphCost bestNewGraphList curBestGraphs of + -- found equal + EQ → recurse' =<< GO.selectGraphs Unique (keepNum swapParams) 0.0 (curBestGraphs <> bestNewGraphList) + -- found worse + GT → recurse' curBestGraphs + -- found better + LT → + pure (bestNewGraphList, swapCounter) + {- + let betterResult = + let doubleToSwap = zip (U.generateUniqueRandList (length bestNewGraphList) inSimAnnealParams) bestNewGraphList + in recurse bestNewGraphList $ doubleToSwap <> otherDoubles + in case joinType swapParams of + JoinAlternate → betterResult + JoinPruned → betterResult + _ → recurse' bestNewGraphList + -} + +{- | swapSPRTBR' is the central functionality of swapping allowing for repeated calls with alternate +options such as joinType to ensure complete swap but with an edge unions pass to +reduce time of swap +also manages SA/Drifting versus greedy swap +-} +swapSPRTBR' + ∷ SwapParams + → GlobalSettings + → ProcessedData + → Int + → (Maybe SAParams, ReducedPhylogeneticGraph) + → PhyG ([ReducedPhylogeneticGraph], Int) +swapSPRTBR' swapParams inGS inData@(leafNames, _, _) inCounter (inSimAnnealParams, inGraph@(simpleG, costG, _, _, _)) + | LG.isEmpty simpleG = pure ([], 0) + | otherwise = do + --liftIO $ putStr (" SSPR': " <> (show costG)) + --logWith LogInfo "In SwapSPRTBR'" + -- inGraphNetPenalty <- POSW.getNetPenaltyReduced inGS inData inGraph + inGraphNetPenalty ← T.getPenaltyFactor inGS inData Nothing $ GO.convertReduced2PhylogeneticGraph inGraph + let inGraphNetPenaltyFactor = inGraphNetPenalty / costG + let numLeaves = V.length leafNames + case inSimAnnealParams of + Nothing → do + -- steepest takes immediate best--does not keep equall cost-- for now--disabled not working correctly so goes to "all" + (swappedGraphs, counter, _) ← + swapAll + swapParams + inGS + inData + inCounter + costG + [] + [inGraph] + numLeaves + inGraphNetPenaltyFactor + Nothing + pure $ case swappedGraphs of + [] → ([inGraph], counter) + gs → (gs, counter) + -- simulated annealing/drifting acceptance does a steepest with SA acceptance + Just simAnneal → + -- then a swap steepest and all on annealed graph + -- same at this level method (SA, Drift) choice occurs at lower level + -- annealed should only yield a single graph + let -- create list of params with unique list of random values for rounds of annealing + annealDriftRounds = rounds simAnneal + newSimAnnealParamList = U.generateUniqueRandList annealDriftRounds inSimAnnealParams + -- parallel setup + action ∷ Maybe SAParams → PhyG ([ReducedPhylogeneticGraph], Int, Maybe SAParams) + action = swapAll swapParams inGS inData 0 costG [] [inGraph] numLeaves inGraphNetPenaltyFactor + in -- this to ensure current step set to 0 + do + swapPar ← getParallelChunkTraverse + (annealDriftGraphs', anealDriftCounterList, _) ← unzip3 <$> swapPar action newSimAnnealParamList + + -- annealed/Drifted 'mutated' graphs + annealDriftGraphs ← GO.selectGraphs Unique (keepNum swapParams) 0.0 $ concat annealDriftGraphs' + + -- swap back "normally" if desired for full drifting/annealing + (swappedGraphs, counter, _) ← + swapAll + swapParams + inGS + inData + (sum anealDriftCounterList) + (min costG (minimum $ snd5 <$> annealDriftGraphs)) + [] + annealDriftGraphs + numLeaves + inGraphNetPenaltyFactor + Nothing + + bestGraphs ← GO.selectGraphs Best (keepNum swapParams) 0.0 $ inGraph : swappedGraphs + -- this Bool for Genetic Algorithm mutation step + pure $ + if not $ returnMutated swapParams + then (bestGraphs, counter) + else (annealDriftGraphs, sum anealDriftCounterList) +-} diff --git a/src/Search/SwapMaster.hs b/src/Search/SwapMaster.hs new file mode 100644 index 000000000..008a2f1b5 --- /dev/null +++ b/src/Search/SwapMaster.hs @@ -0,0 +1,433 @@ +{- | +Module exposing the functionality to swap sub-graphs of a phylogenetic graph. +-} +module Search.SwapMaster ( + swapMaster, +) where + +import Commands.Verify qualified as VER +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.Bifunctor (first) +import Data.Char +import Data.Foldable (fold) +import Data.Functor ((<&>)) +import Data.Maybe +import GeneralUtilities +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Search.Swap qualified as S +import Text.Read +import Types.Types +import Utilities.Utilities as U + + +{- | swapMaster processes and spawns the swap functions +the 2 x maxMoveDist since distance either side to list 2* dist on sorted edges +-} +swapMaster + ∷ [Argument] + → GlobalSettings + → ProcessedData + → [ReducedPhylogeneticGraph] + → PhyG [ReducedPhylogeneticGraph] +swapMaster inArgs inGS inData inGraphListInput = + {-# SCC swapMaster_TOP_DEF #-} + if null inGraphListInput + then do + logWith LogInfo "No graphs to swap\n" + pure [] + else -- else if graphType inGS == HardWired then trace ("Swapping hardwired graphs is currenty not implemented") inGraphList + + let -- process args for swap + ( keepNum + , maxMoveEdgeDist' + , steps' + , annealingRounds' + , doDrift + , driftRounds' + , acceptEqualProb + , acceptWorseFactor + , maxChanges + , replicateNumber + , lcArgList + ) = getSwapParams inArgs + + swapType + | any ((== "nni") . fst) lcArgList = NNI + | any ((== "spr") . fst) lcArgList = SPR + | any ((== "tbr") . fst) lcArgList = TBR + | any ((== "alternate") . fst) lcArgList = Alternate + | otherwise = Alternate + + maxMoveEdgeDist = + if swapType == NNI + then 2 + else fromJust maxMoveEdgeDist' + + -- randomized orders of split and join-- not implemented + -- doRandomized = any ((=="randomized").fst) lcArgList + + -- set implied alignment swapping + doIA' = any ((== "ia") . fst) lcArgList + doIA'' = doIA' + + --- steepest/all options + doSteepest' = any ((== "steepest") . fst) lcArgList + doAll = any ((== "all") . fst) lcArgList + + -- steepest default + doSteepest = ((not doSteepest' && not doAll) || doSteepest') + + -- simulated annealing parameters + -- returnMutated to return annealed Graphs before swapping fir use in Genetic Algorithm + doAnnealing = any ((== "annealing") . fst) lcArgList + + returnMutated = any ((== "returnmutated") . fst) lcArgList + + -- turn off union selection of rejoin--default to do both, union first + joinType + | graphType inGS == HardWired = JoinAll + | any ((== "joinall") . fst) lcArgList = JoinAll + | any ((== "joinpruned") . fst) lcArgList = JoinPruned + | any ((== "joinalternate") . fst) lcArgList = JoinAlternate + | otherwise = JoinAlternate + + -- randomize split graph and rejoin edges, defualt to randomize + atRandom + | any ((== "atrandom") . fst) lcArgList = True + | any ((== "inOrder") . fst) lcArgList = False + | otherwise = True + + -- populate SwapParams structure + localSwapParams = + SwapParams + { swapType = swapType + , joinType = joinType + , atRandom = atRandom + , keepNum = (fromJust keepNum) + , maxMoveEdgeDist = maxMoveEdgeDist + , steepest = doSteepest + , joinAlternate = False -- join prune alternates--turned off for now + , doIA = doIA'' + , returnMutated = returnMutated + } + + -- swap replicates is meant to allow multiple randomized swap trajectories + -- set to 1 if not randomized swap or SA/Drifting (set by their own options) + replicates + | not atRandom = 1 + | doAnnealing = 1 + | otherwise = fromJust replicateNumber + + -- replicate inGraphList based on 'replicates' for randomized trajectories + inGraphList = concat $ replicate replicates inGraphListInput + numGraphs = length inGraphList + + -- parallel setup + action ∷ [(Maybe SAParams, ReducedPhylogeneticGraph)] → PhyG ([ReducedPhylogeneticGraph], Int) + -- action = {-# SCC swapMaster_action_swapSPRTBR #-} S.swapSPRTBR localSwapParams inGS inData 0 inGraphList + action = {-# SCC swapMaster_action_swapSPRTBR #-} S.swapDriver localSwapParams inGS inData 0 inGraphList + in do + simAnnealParams ← + getSimAnnealParams doAnnealing doDrift steps' annealingRounds' driftRounds' acceptEqualProb acceptWorseFactor maxChanges + + -- create simulated annealing random lists uniquely for each fmap + let newSimAnnealParamList = replicate numGraphs simAnnealParams + + let progressString + | (not doAnnealing && not doDrift) = + ( "Swapping " + <> show (length inGraphListInput) + <> " input graph(s) with " + <> show replicates + <> " trajectories at minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " keeping maximum of " + <> show (fromJust keepNum) + <> " graphs per input graph" + <> "\n" + ) + | method (fromJust simAnnealParams) == SimAnneal = + ( "Simulated Annealing (Swapping) " + <> show (rounds $ fromJust simAnnealParams) + <> " rounds with " + <> show (numberSteps $ fromJust simAnnealParams) + <> " cooling steps " + <> show (length inGraphList) + <> " input graph(s) at minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " keeping maximum of " + <> show (fromJust keepNum) + <> " graphs" + <> "\n" + ) + | otherwise = + "Drifting (Swapping) " + <> show (rounds $ fromJust simAnnealParams) + <> " rounds with " + <> show (driftMaxChanges $ fromJust simAnnealParams) + <> " maximum changes per round on " + <> show (length inGraphList) + <> " input graph(s) at minimum cost " + <> show (minimum $ fmap snd5 inGraphList) + <> " keeping maximum of " + <> show (fromJust keepNum) + <> " graphs" + <> "\n" + + logWith LogInfo progressString + + let simAnnealList = (: []) <$> zip newSimAnnealParamList inGraphList + graphPairList ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` simAnnealList + + let (graphListList, counterList) = first fold $ unzip graphPairList + (newGraphList, counter) ← GO.selectGraphs Best (fromJust keepNum) 0 graphListList <&> \x → (x, sum counterList) + + let finalGraphList = case newGraphList of + [] → inGraphList + _ → newGraphList + + let fullBuffWarning = + if length newGraphList >= (fromJust keepNum) + then + "\n\tWarning--Swap returned as many minimum cost graphs as the 'keep' number. \n\tThis may have limited the effectiveness of the swap. \n\tConsider increasing the 'keep' value or adding an additional swap." + else "" + + let endString + | (not doAnnealing && not doDrift) = + ( "\n\tAfter swap: " + <> show (length finalGraphList) + <> " resulting graphs with minimum cost " + <> show (minimum $ fmap snd5 finalGraphList) + <> " with swap rounds (total): " + <> show counter + <> " " + <> show swapType + ) + | method (fromJust simAnnealParams) == SimAnneal = + ( "\n\tAfter Simulated Annealing: " + <> show (length finalGraphList) + <> " resulting graphs with minimum cost " + <> show (minimum $ fmap snd5 finalGraphList) + <> " with swap rounds (total): " + <> show counter + <> " " + <> show swapType + ) + | otherwise = + "\n\tAfter Drifting: " + <> show (length finalGraphList) + <> " resulting graphs with minimum cost " + <> show (minimum $ fmap snd5 finalGraphList) + <> " with swap rounds (total): " + <> show counter + <> " " + <> show swapType + + logWith LogInfo (endString <> fullBuffWarning <> "\n") + pure finalGraphList + + +-- | getSimumlatedAnnealingParams returns SA parameters +getSimAnnealParams + ∷ Bool + → Bool + → Maybe Int + → Maybe Int + → Maybe Int + → Maybe Double + → Maybe Double + → Maybe Int + → PhyG (Maybe SAParams) +getSimAnnealParams doAnnealing doDrift steps' annealingRounds' driftRounds' acceptEqualProb acceptWorseFactor maxChanges + | not doAnnealing && not doDrift = pure Nothing + | otherwise = + let steps = max 3 (fromJust steps') + + annealingRounds = case annealingRounds' of + Just v | 1 <= v → v + _ → 1 + + driftRounds = case driftRounds' of + Just v | 1 <= v → v + _ → 1 + + saMethod + | doDrift && doAnnealing = Drift + | doDrift = Drift + | otherwise = SimAnneal + + equalProb = case acceptEqualProb of + Nothing → 0 + Just v | v > 1 → 1 + Just v | v < 0 → 0 + Just v → v + + worseFactor = max (fromJust acceptWorseFactor) 0.0 + + changes = case maxChanges of + Just num | num >= 0 → num + _ → 15 + + getResult = + Just $ + SAParams + { method = saMethod + , numberSteps = steps + , currentStep = 0 + , rounds = max annealingRounds driftRounds + , driftAcceptEqual = equalProb + , driftAcceptWorse = worseFactor + , driftMaxChanges = changes + , driftChanges = 0 + } + in do + when (doDrift && doAnnealing) $ + logWith + LogWarn + "\tSpecified both Simulated Annealing (with temperature steps) and Drifting (without)--defaulting to drifting.\n" + pure $ getResult + + +-- | getSwapParams takes areg list and preocesses returning parameter values +getSwapParams + ∷ [Argument] + → ( Maybe Int + , Maybe Int + , Maybe Int + , Maybe Int + , Bool + , Maybe Int + , Maybe Double + , Maybe Double + , Maybe Int + , Maybe Int + , [(String, String)] + ) +getSwapParams inArgs = + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "swap" fstArgList VER.swapArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'swap': " <> show inArgs) + else + let keepList = filter ((== "keep") . fst) lcArgList + keepNum + | length keepList > 1 = + errorWithoutStackTrace ("Multiple 'keep' number specifications in swap command--can have only one: " <> show inArgs) + | null keepList = Just 10 + | otherwise = readMaybe (snd $ head keepList) ∷ Maybe Int + + moveLimitList = filter (not . null) (snd <$> filter ((`elem` ["alternate", "spr", "tbr", "nni"]) . fst) lcArgList) + maxMoveEdgeDist' + | length moveLimitList > 1 = + errorWithoutStackTrace + ("Multiple maximum edge distance number specifications in swap command--can have only one (e.g. spr:2): " <> show inArgs) + | null moveLimitList = Just ((maxBound ∷ Int) `div` 3) + | otherwise = readMaybe (head moveLimitList) ∷ Maybe Int + + -- simulated anealing options + stepsList = filter ((== "steps") . fst) lcArgList + steps' + | length stepsList > 1 = + errorWithoutStackTrace + ("Multiple annealing steps value specifications in swap command--can have only one (e.g. steps:10): " <> show inArgs) + | null stepsList = Just 10 + | otherwise = readMaybe (snd $ head stepsList) ∷ Maybe Int + + annealingList = filter ((== "annealing") . fst) lcArgList + annealingRounds' + | length annealingList > 1 = + errorWithoutStackTrace ("Multiple 'annealing' rounds number specifications in swap command--can have only one: " <> show inArgs) + | null annealingList = Just 1 + | otherwise = readMaybe (snd $ head annealingList) ∷ Maybe Int + + -- drift options + doDrift = any ((== "drift") . fst) lcArgList + + driftList = filter ((== "drift") . fst) lcArgList + driftRounds' + | length driftList > 1 = + errorWithoutStackTrace ("Multiple 'drift' rounds number specifications in swap command--can have only one: " <> show inArgs) + | null driftList = Just 1 + | otherwise = readMaybe (snd $ head driftList) ∷ Maybe Int + + acceptEqualList = filter ((== "acceptequal") . fst) lcArgList + acceptEqualProb + | length acceptEqualList > 1 = + errorWithoutStackTrace ("Multiple 'drift' acceptEqual specifications in swap command--can have only one: " <> show inArgs) + | null acceptEqualList = Just 0.5 + | otherwise = readMaybe (snd $ head acceptEqualList) ∷ Maybe Double + + acceptWorseList = filter ((== "acceptworse") . fst) lcArgList + acceptWorseFactor + | length acceptWorseList > 1 = + errorWithoutStackTrace ("Multiple 'drift' acceptWorse specifications in swap command--can have only one: " <> show inArgs) + | null acceptWorseList = Just 20.0 + | otherwise = readMaybe (snd $ head acceptWorseList) ∷ Maybe Double + + maxChangesList = filter ((== "maxchanges") . fst) lcArgList + maxChanges + | length maxChangesList > 1 = + errorWithoutStackTrace ("Multiple 'drift' maxChanges number specifications in swap command--can have only one: " <> show inArgs) + | null maxChangesList = Just 15 + | otherwise = readMaybe (snd $ head maxChangesList) ∷ Maybe Int + + replicatesList = filter ((== "replicates") . fst) lcArgList + replicates + | length replicatesList > 1 = + errorWithoutStackTrace ("Multiple 'swap' replicates number specifications in swap command--can have only one: " <> show inArgs) + | null replicatesList = Just 1 + | otherwise = readMaybe (snd $ head replicatesList) ∷ Maybe Int + in -- check inputs + if isNothing keepNum + then errorWithoutStackTrace ("Keep specification not an integer in swap: " <> show (head keepList)) + else + if isNothing maxMoveEdgeDist' + then errorWithoutStackTrace ("Maximum edge move distance specification not an integer (e.g. spr:2): " <> show (head moveLimitList)) + else + if isNothing steps' + then errorWithoutStackTrace ("Annealing steps specification not an integer (e.g. steps:10): " <> show (snd $ head stepsList)) + else + if isNothing acceptEqualProb + then + errorWithoutStackTrace + ("Drift 'acceptEqual' specification not a float (e.g. acceptEqual:0.75): " <> show (snd $ head acceptEqualList)) + else + if isNothing acceptWorseFactor + then + errorWithoutStackTrace + ("Drift 'acceptWorse' specification not a float (e.g. acceptWorse:1.0): " <> show (snd $ head acceptWorseList)) + else + if isNothing maxChanges + then + errorWithoutStackTrace + ("Drift 'maxChanges' specification not an integer (e.g. maxChanges:10): " <> show (snd $ head maxChangesList)) + else + if isNothing replicates + then + errorWithoutStackTrace + ("Swap 'replicates' specification not an integer (e.g. replicates:5): " <> show (snd $ head replicatesList)) + else -- trace ("GSP: " <> (show inArgs) <> " " <> (show )(keepNum, maxMoveEdgeDist', steps', annealingRounds', doDrift, driftRounds', acceptEqualProb, acceptWorseFactor, maxChanges, lcArgList)) + + ( keepNum + , maxMoveEdgeDist' + , steps' + , annealingRounds' + , doDrift + , driftRounds' + , acceptEqualProb + , acceptWorseFactor + , maxChanges + , replicates + , lcArgList + ) diff --git a/src/Search/WagnerBuild.hs b/src/Search/WagnerBuild.hs new file mode 100644 index 000000000..d309fb6be --- /dev/null +++ b/src/Search/WagnerBuild.hs @@ -0,0 +1,266 @@ +{- | +Module specifying charcter-based Wagner tree building functions. +-} +module Search.WagnerBuild ( + wagnerTreeBuild, + wagnerTreeBuild', + rasWagnerBuild, +) where + +import Control.Monad (replicateM) +import Data.Maybe +import Data.Text.Lazy qualified as TL +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import GraphOptimization.PostOrderSoftWiredFunctions qualified as POSW +import GraphOptimization.PreOrderFunctions qualified as PRE +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import Types.Types +import Utilities.LocalGraph qualified as LG +import Utilities.Utilities qualified as U + + +-- import Debug.Trace + +-- Haven't added in unions--but prob should + +{- | rasWagnerBuild generates a series of random addition sequences and then calls wagnerTreeBuild to construct them. +Does not filter by best, unique etc. That happens with the select() command specified separately. +-} +rasWagnerBuild ∷ GlobalSettings → ProcessedData → Int → PhyG [ReducedPhylogeneticGraph] +rasWagnerBuild inGS inData numReplicates = + if numReplicates == 0 + then do + pure [] + else + let numLeaves = V.length $ fst3 inData + + -- "graph" of leaf nodes without any edges + leafGraph = GO.makeSimpleLeafGraph inData + leafDecGraph = GO.makeLeafGraph inData + leafIndexVec = V.generate numLeaves id + + hasNonExactChars = U.getNumberSequenceCharacters (thd3 inData) > 0 + + wagnerTreeAction ∷ (V.Vector Int, Int) → PhyG ReducedPhylogeneticGraph + wagnerTreeAction = wagnerTreeBuild' inGS inData leafGraph leafDecGraph numLeaves hasNonExactChars + in do + randomizedAdditionSequences ← replicateM numReplicates $ shuffleList leafIndexVec + logWith LogInfo ("\t\tBuilding " <> show numReplicates <> " character Wagner replicates" <> "\n") + getParallelChunkTraverseBy U.strict2of5 >>= \pTraverse → + pTraverse wagnerTreeAction $ zip randomizedAdditionSequences [0 .. numReplicates - 1] + + +-- | wagnerTreeBuild' is a wrapper around wagnerTreeBuild to allow for better parallelization--(zipWith not doing so well?) +wagnerTreeBuild' + ∷ GlobalSettings + → ProcessedData + → SimpleGraph + → DecoratedGraph + → Int + → Bool + → (V.Vector Int, Int) + → PhyG ReducedPhylogeneticGraph +wagnerTreeBuild' inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars (additionSequence, replicateIndex) = do + wagResult ← + wagnerTreeBuild inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars additionSequence replicateIndex + pure $ GO.convertPhylogeneticGraph2Reduced $ wagResult + + +{- | wagnerTreeBuild builds a wagner tree (Farris 1970--but using random addition seqeuces--not "best" addition) +from a leaf addition sequence. Always produces a tree that can be converted to a soft/hard wired network +afterwards +basic procs is to add edges to unresolved tree +currently naive wrt candidate tree costs +-} +wagnerTreeBuild + ∷ GlobalSettings → ProcessedData → SimpleGraph → DecoratedGraph → Int → Bool → V.Vector Int → Int → PhyG PhylogeneticGraph +wagnerTreeBuild inGS inData leafSimpleGraph leafDecGraph numLeaves hasNonExactChars additionSequence replicateIndex = + let rootHTU = (numLeaves, TL.pack $ "HTU" <> show numLeaves) + nextHTU = (numLeaves + 1, TL.pack $ "HTU" <> show (numLeaves + 1)) + + edge0 = (numLeaves, additionSequence V.! 0, 0.0) + edge1 = (numLeaves, numLeaves + 1, 0.0) + edge2 = (numLeaves + 1, additionSequence V.! 1, 0.0) + edge3 = (numLeaves + 1, additionSequence V.! 2, 0.0) + + initialTree = LG.insEdges [edge0, edge1, edge2, edge3] $ LG.insNodes [rootHTU, nextHTU] leafSimpleGraph + + blockCharInfo = V.map thd3 $ thd3 inData + + -- initialFullyDecoratedTree = T.multiTraverseFullyLabelTree inGS inData initialTree + -- False flag for staticIA--can't be done in build + calculateBranchLengths = False -- must be True for delata using existing edge + in do + initialPostOrderTree ← POSW.postDecorateTree inGS False initialTree leafDecGraph blockCharInfo numLeaves numLeaves + initialFullyDecoratedTree ← + PRE.preOrderTreeTraversal + inGS + (finalAssignment inGS) + False + calculateBranchLengths + hasNonExactChars + numLeaves + False + initialPostOrderTree + -- this used for missing data adjustments during build + maxDistance ← U.getMaxNumberObservations (thd3 inData) + + logWith LogInfo ("\tBuilding Wagner replicate " <> show replicateIndex <> "\n") + wagnerTree ← + recursiveAddEdgesWagner + maxDistance + (useIA inGS) + (V.drop 3 additionSequence) + numLeaves + (numLeaves + 2) + inGS + inData + hasNonExactChars + leafDecGraph + initialFullyDecoratedTree + pure wagnerTree + + +{- | recursiveAddEdgesWagner adds edges until 2n -1 (n leaves) vertices in graph +this tested by null additin sequence list +interface will change with correct final states--using post-order pass for now +-} +recursiveAddEdgesWagner + ∷ VertexCost + → Bool + → V.Vector Int + → Int + → Int + → GlobalSettings + → ProcessedData + → Bool + → DecoratedGraph + → PhylogeneticGraph + → PhyG PhylogeneticGraph +recursiveAddEdgesWagner maxDistance useIA additionSequence numLeaves numVerts inGS inData hasNonExactChars leafDecGraph inGraph@(inSimple, _, inDecGraph, _, _, charInfoVV) = + -- all edges/ taxa in graph + -- trace ("To go " <> (show additionSequence) <> " verts " <> (show numVerts)) ( + if null additionSequence + then pure inGraph + else -- trace ("RAEW-In: " <> (show $ length additionSequence)) ( + -- edges/taxa to add, but not the edges that leads to outgroup--redundant with its sister edge + + let -- outgroupEdges = filter ((< numLeaves) . snd3) $ LG.out inDecGraph numLeaves + edgesToInvade = LG.labEdges inDecGraph -- L.\\ outgroupEdges + leafToAdd = V.head additionSequence + leafToAddVertData = vertData $ fromJust $ LG.lab inDecGraph leafToAdd + + addTaxonAction ∷ LG.LEdge EdgeInfo → PhyG (VertexCost, LG.LNode TL.Text, [LG.LEdge Double], LG.Edge) + addTaxonAction = addTaxonWagner maxDistance useIA numVerts inGraph leafToAddVertData leafToAdd + in do + --- TODO + addTaxonWagnerPar ← getParallelChunkTraverseBy U.strict1of4 + candidateEditList ← addTaxonWagnerPar addTaxonAction edgesToInvade + -- let candidateEditList = PU.seqParMap (parStrategy $ lazyParStrat inGS) (addTaxonWagner maxDistance useIA numVerts inGraph leafToAddVertData leafToAdd) edgesToInvade + + let minDelta = minimum $ fmap fst4 candidateEditList + let (_, nodeToAdd, edgesToAdd, edgeToDelete) = head $ filter ((== minDelta) . fst4) candidateEditList + + -- create new tree + let newSimple = LG.insEdges edgesToAdd $ LG.insNode nodeToAdd $ LG.delEdge edgeToDelete inSimple + + -- this reroot since could add taxon sister to outgroup + let newSimple' = LG.rerootTree (outgroupIndex inGS) newSimple + + -- create fully labelled tree, if all taxa in do full multi-labelled for correct graph type + -- False flag for static IA--can't do when adding in new leaves + let calculateBranchLengths = False -- must be True for delata using existing edge + postOrderStuff ← POSW.postDecorateTree inGS False newSimple' leafDecGraph charInfoVV numLeaves numLeaves + newPhyloGraph ← -- T.multiTraverseFullyLabelTree inGS inData leafDecGraph (Just numLeaves) newSimple' + if V.length additionSequence > 1 + then + PRE.preOrderTreeTraversal + inGS + (finalAssignment inGS) + False + calculateBranchLengths + hasNonExactChars + numLeaves + False + postOrderStuff + else T.multiTraverseFullyLabelTree inGS inData leafDecGraph (Just numLeaves) newSimple' + + if isNothing (LG.lab inDecGraph leafToAdd) + then error "Missing label data for vertices" + else + recursiveAddEdgesWagner + maxDistance + useIA + (V.tail additionSequence) + numLeaves + (numVerts + 1) + inGS + inData + hasNonExactChars + leafDecGraph + newPhyloGraph + + +-- ) + +{- | addTaxonWagner adds a taxon (really edges) by 'invading' and edge, deleting that adege and creteing 3 more +to existing tree and gets cost (for now by postorder traversal--so wasteful but will be by final states later) +returns a tuple of the cost, node to add, edges to add, edge to delete +-} +addTaxonWagner + ∷ VertexCost + → Bool + → Int + → PhylogeneticGraph + → VertexBlockData + → Int + → LG.LEdge EdgeInfo + → PhyG (VertexCost, LG.LNode TL.Text, [LG.LEdge Double], LG.Edge) +addTaxonWagner maxDistance useIA numVerts (_, _, inDecGraph, _, _, charInfoVV) leafToAddVertData leafToAdd targetEdge = + let edge0 = (numVerts, leafToAdd, 0.0) + edge1 = (fst3 targetEdge, numVerts, 0.0) + edge2 = (numVerts, snd3 targetEdge, 0.0) + newNode = (numVerts, TL.pack ("HTU" <> show numVerts)) + in -- full post order + -- newSimpleGraph = LG.insEdges [edge0, edge1, edge2] $ LG.insNode newNode $ LG.delEdge (LG.toEdge targetEdge) inSimple + -- newCost = snd6 $ T.postDecorateTree newSimpleGraph leafDecGraph charInfoVV numLeaves + do + -- heuristic delta + (delta, edgeUnionVertData) ← getDelta useIA leafToAddVertData targetEdge inDecGraph charInfoVV + + -- modification for missing data + let nonMissingDistance = U.getPairwiseObservationsGraph leafToAddVertData edgeUnionVertData + + let deltaNormalized = delta * maxDistance / (max 1.0 nonMissingDistance) + + pure (deltaNormalized, newNode, [edge0, edge1, edge2], LG.toEdge targetEdge) + + +-- (newCost, newNode, [edge0, edge1, edge2], LG.toEdge targetEdge) + +{- | getDelta estimates the delta in tree cost by adding a leaf taxon in Wagner build +must be DO for this--isolated leaves won't have IA +-} +getDelta + ∷ Bool → VertexBlockData → LG.LEdge EdgeInfo → DecoratedGraph → V.Vector (V.Vector CharInfo) → PhyG (VertexCost, VertexBlockData) +getDelta useIA leafToAddVertData (eNode, vNode, _) inDecGraph charInfoVV = + let eNodeVertData = vertData $ fromJust $ LG.lab inDecGraph eNode + vNodeVertData = vertData $ fromJust $ LG.lab inDecGraph vNode + in -- create edge union 'character' blockData + -- filters gaps (True argument) because using DOm (as must) to add taxa not in IA framework + -- edge union based on final IA assignments filtering gaps (True True) + + if isNothing (LG.lab inDecGraph eNode) || isNothing (LG.lab inDecGraph vNode) + then error "Missing label data for vertices" + else do + edgeUnionVertData ← M.createEdgeUnionOverBlocksM useIA True eNodeVertData vNodeVertData charInfoVV + -- Use edge union data for delta to edge data + + let dLeafEdgeUnionCost = sum (fst <$> V.zipWith3 (PRE.getBlockCostPairsFinal DirectOptimization) leafToAddVertData edgeUnionVertData charInfoVV) + + pure (dLeafEdgeUnionCost, edgeUnionVertData) diff --git a/src/Support/Support.hs b/src/Support/Support.hs new file mode 100644 index 000000000..ea102c8c0 --- /dev/null +++ b/src/Support/Support.hs @@ -0,0 +1,1057 @@ +{- | +Module containing support functions +-} +module Support.Support ( + supportGraph, +) where + +import Commands.Verify qualified as VER +import Control.Monad (when) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.Char +import Data.List qualified as L +import Data.Maybe +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Primitive (convert) +import Data.Vector.Unboxed qualified as UV +import GeneralUtilities +import GraphOptimization.Traversals qualified as T +import Graphs.GraphOperations qualified as GO +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import Reconciliation.ReconcileGraphs qualified as REC +import Search.Build qualified as B +import Search.NetworkAddDelete qualified as N +import Search.Refinement qualified as R +import Text.Read +import Types.Types +import Utilities.Distances qualified as DD +import Utilities.LocalGraph qualified as LG + + +-- | driver for overall support +supportGraph ∷ [Argument] → GlobalSettings → ProcessedData → [ReducedPhylogeneticGraph] → PhyG [ReducedPhylogeneticGraph] +supportGraph inArgs inGS inData inGraphList = + if null inGraphList + then error "No graphs input to calculate support" + else + let fstArgList = fmap (fmap toLower . fst) inArgs + sndArgList = fmap (fmap toLower . snd) inArgs + lcArgList = zip fstArgList sndArgList + checkCommandList = checkCommandArgs "support" fstArgList VER.supportArgList + in -- check for valid command options + if not checkCommandList + then errorWithoutStackTrace ("Unrecognized command in 'support': " <> show inArgs) + else + let supportMeasure + | any ((== "bootstrap") . fst) lcArgList = Bootstrap + | any ((== "jackknife") . fst) lcArgList = Jackknife + | any ((== "goodmanbremer") . fst) lcArgList = GoodmanBremer + | otherwise = GoodmanBremer + + useSPR = any ((== "spr") . fst) lcArgList + useTBR = any ((== "spr") . fst) lcArgList + + onlyBuild = any ((== "buildonly") . fst) lcArgList + + jackList = filter ((== "jackknife") . fst) lcArgList + jackFreq' + | length jackList > 1 = + errorWithoutStackTrace + ( "Multiple jackknife sampling frequency specifications in support command--can have only one (e.g. jackknife:0.62): " + <> show inArgs + ) + | null jackList = Just 0.6321 -- 1- 1/e + | null (snd $ head jackList) = Just 0.6321 + | otherwise = readMaybe (snd $ head jackList) ∷ Maybe Double + + replicateList = filter ((== "replicates") . fst) lcArgList + replicates' + | length replicateList > 1 = + errorWithoutStackTrace + ("Multiple resampling replicate specifications in support command--can have only one (e.g. replicates:100): " <> show inArgs) + | null replicateList = Just 100 + | otherwise = readMaybe (snd $ head replicateList) ∷ Maybe Int + + goodBremList = filter ((`elem` ["goodmanbremer", "gb"]) . fst) lcArgList + {- + goodBremMethod + | length goodBremList > 1 = + errorWithoutStackTrace ("Multiple Goodman-Bremer method specifications in support command--can have only one (e.g. gb:tbr): " <> show inArgs) + | null (snd $ head goodBremList) = Just "tbr" + | otherwise = Just $ snd $ head goodBremList + -} + + goodBremSampleList = filter ((`elem` ["gbsample"]) . fst) lcArgList + goodBremSample + | length goodBremSampleList > 1 = + errorWithoutStackTrace + ("Multiple Goodman-Bremer sample specifications in support command--can have only one (e.g. gbsample:1000): " <> show inArgs) + | null goodBremSampleList = Just (maxBound ∷ Int) + | otherwise = readMaybe (snd $ head goodBremSampleList) ∷ Maybe Int + in if isNothing jackFreq' + then errorWithoutStackTrace ("Jacknife frequency not a float (e.g. jackknife:0.5) in support: " <> show (snd $ head jackList)) + else + if isNothing replicates' + then + errorWithoutStackTrace + ("Resampling replicates specification not a string (e.g. replicates:100) in support: " <> show (snd $ head replicateList)) + else -- else if isNothing goodBremMethod then errorWithoutStackTrace ("Goodman-Bremer method specification not a string (e.g. goodmanBremer:SPR) in support: " <> (show (snd $ head goodBremList)) <> (show lcArgList)) + + if isNothing goodBremSample + then + errorWithoutStackTrace + ("Goodman-Bremer sample specification not an integer (e.g. gbsample:1000) in support: " <> show (snd $ head goodBremSampleList)) + else + let thisMethod + | (supportMeasure == Bootstrap) && ((not . null) jackList && null goodBremList) = + -- trace + -- "Bootstrap and Jackknife specified--defaulting to Jackknife" + Jackknife + | (supportMeasure == Bootstrap) || ((not . null) jackList && (not . null) goodBremList) = + -- trace + -- "Resampling (Bootstrap or Jackknife) and Goodman-Bremer specified--defaulting to Goodman-Bremer" + GoodmanBremer + | supportMeasure == Bootstrap = Bootstrap + | (not . null) jackList = Jackknife + | otherwise = GoodmanBremer + + gbSampleSize = + if goodBremSample == Just (maxBound ∷ Int) + then Nothing + else goodBremSample + + -- sample trees uniformly at random--or "nth" + gbRandomSample = + if isJust gbSampleSize + then True -- any (( == "atrandom").fst) lcArgList + else False + + replicates = + if fromJust replicates' < 0 + then -- logWith LogWarn "Negative replicates number--defaulting to 100" + 100 + else fromJust replicates' + jackFreq = + if fromJust jackFreq' <= 0 || fromJust jackFreq' >= 1.0 + then -- trace "Jackknife frequency must be on (0.0, 1.0) defaulting to 0.6321" + 0.6321 + else fromJust jackFreq' + + buildOptions = [("distance", ""), ("replicates", show (100 ∷ Int)), ("best", show (1 ∷ Int)), ("rdwag", "")] -- [("replicates", show 10), ("best", show 1)] + swapOptions = + if onlyBuild + then [] + else [("tbr", ""), ("steepest", ""), ("keep", show (1 ∷ Int))] + supportGraphList = + if thisMethod == Bootstrap || thisMethod == Jackknife + then + let extraString = + if thisMethod == Jackknife + then " with delete fraction " <> show (1 - jackFreq) + else "" + in do + g ← getResampleGraph inGS inData thisMethod replicates buildOptions swapOptions jackFreq + logWith LogTech $ + unwords + [ "Generating" + , show thisMethod + , "resampling support with" + , show replicates + , "replicates" + , extraString + ] + pure [g] + else + let neighborhood = + if useTBR + then "tbr" + else + if useSPR + then "spr" + else "tbr" + extraString = + if isJust gbSampleSize + then " using " <> neighborhood <> " based on " <> show (fromJust gbSampleSize) <> " samples at random" + else " using " <> neighborhood + in do + logWith LogTech $ "Generating Goodman-Bremer support" <> extraString <> "\n" + -- TODO + mapM (getGoodBremGraphs inGS inData neighborhood gbSampleSize gbRandomSample) inGraphList + in do + -- Option warnings + when ((supportMeasure == Bootstrap) && ((not . null) jackList && null goodBremList)) $ + logWith LogWarn "Bootstrap and Jackknife specified--defaulting to Jackknife" + when ((supportMeasure == Bootstrap) || ((not . null) jackList && (not . null) goodBremList)) $ + logWith LogWarn "Resampling (Bootstrap or Jackknife) and Goodman-Bremer specified--defaulting to Goodman-Bremer" + when (fromJust replicates' < 0) $ + logWith LogWarn "Negative replicates number--defaulting to 100" + when (fromJust jackFreq' <= 0 || fromJust jackFreq' >= 1.0) $ + logWith LogWarn "Jackknife frequency must be on (0.0, 1.0) defaulting to 0.6321" + + supportGraphList + + +-- | getResampledGraphs performs resampling and search for Bootstrap and jackknife support +getResampleGraph + ∷ GlobalSettings + → ProcessedData + → SupportMethod + → Int + → [(String, String)] + → [(String, String)] + → Double + → PhyG ReducedPhylogeneticGraph +getResampleGraph inGS inData resampleType replicates buildOptions swapOptions jackFreq = + let -- create appropriate support graph >50% ? + -- need to add args + reconcileArgs = case graphType inGS of + Tree → + [ ("method", "majority") + , ("compare", "identity") + , ("edgelabel", "true") + , ("vertexlabel", "true") + , ("connect", "true") + , ("threshold", "51") + , ("outformat", "dot") + ] + _ → + [ ("method", "eun") + , ("compare", "identity") + , ("edgelabel", "true") + , ("vertexlabel", "true") + , ("connect", "true") + , ("threshold", "51") + , ("outformat", "dot") + ] + -- parallel stuff + action ∷ PhyG ReducedPhylogeneticGraph + action = makeResampledDataAndGraph inGS inData resampleType buildOptions swapOptions jackFreq + in -- majority ruke consensus if no args + do + -- the replicate to performs number replicates + resampledGraphList ← + getParallelChunkTraverse >>= \pTraverse → + const action `pTraverse` replicate replicates () + recResult ← REC.makeReconcileGraph VER.reconcileArgList reconcileArgs $ fst5 <$> resampledGraphList + let (_, reconciledGraph) = recResult + + -- generate resampled graph + -- can't really relabel easily wihtout bv and maybe not necessary anyway--node numebrs inconsistent + pure (reconciledGraph, infinity, LG.empty, V.empty, V.empty) + + +{- | makeResampledDataAndGraph takes paramters, resmaples data and find a graph based on search parameters +returning the resampled graph +-} +makeResampledDataAndGraph + ∷ GlobalSettings + → ProcessedData + → SupportMethod + → [(String, String)] + → [(String, String)] + → Double + → PhyG ReducedPhylogeneticGraph +makeResampledDataAndGraph inGS inData resampleType buildOptions swapOptions jackFreq = do + newData ← resampleData resampleType jackFreq inData + -- pairwise distances for distance analysis + -- pairwiseDistances ← DD.getPairwiseDistances newData + let buildGraphs = B.buildGraph buildOptions inGS newData + + -- if not a tree then try to add net edges + let netAddArgs = [("netadd", ""), ("keep", show (1 ∷ Int)), ("steepest", ""), ("atrandom", ""), ("maxnetedges", "5")] + + -- simple swap refinement + if V.null $ thd3 newData + then pure emptyReducedPhylogeneticGraph + else do + -- build graphs + buildGraphs ← B.buildGraph buildOptions inGS newData + bestBuildGraphList ← GO.selectGraphs Best (maxBound ∷ Int) 0.0 buildGraphs + + edgeGraphList ← R.netEdgeMaster netAddArgs inGS newData bestBuildGraphList + let netGraphList = case graphType inGS of + Tree → bestBuildGraphList + _ → edgeGraphList + swapGraphs ← R.swapMaster swapOptions inGS newData netGraphList + let swapGraphList + | null swapOptions = netGraphList + | otherwise = swapGraphs + pure $ head swapGraphList + + +{- | resampleData perfoms a single randomized data resampling +based on either with replacement (bootstrp) or without (jackknife) +jackknife moves through processed data and creates a new data set + based on simple prob +Bootstrap draws chars from input directly copying--currently disabled +if a block of data end up with zero resampled characters it is deleted +-} +resampleData ∷ SupportMethod → Double → ProcessedData → PhyG ProcessedData +resampleData resampleType sampleFreq (nameVect, nameBVVect, blockDataVect) + | V.null blockDataVect = error "Null input data in resampleData" + | otherwise -- Bootstrap or Jackknife resampling + = + let resampler = case resampleType of + Bootstrap → resampleBlockBootstrap + _ → resampleBlockJackknife sampleFreq + in do + newBlockDataVect' ← traverse resampler blockDataVect + -- filter any zero length blocks + let newBlockDataVect = V.filter ((not . V.null) . thd3) newBlockDataVect' + pure $ (nameVect, nameBVVect, newBlockDataVect) + + +{- | +Takes BlockData and a seed and creates a Bootstrap resampled BlockData +-} +resampleBlockBootstrap ∷ BlockData → PhyG BlockData +resampleBlockBootstrap (nameText, charDataVV, charInfoV) = do + -- maps over taxa in data bLock + (newCharDataVV, newCharInfoV) ← V.unzip <$> traverse (makeSampledPairVectBootstrap charInfoV) charDataVV + pure (nameText, newCharDataVV, V.head newCharInfoV) + + +{- | makeSampledPairVectBootstrap takes a list of Int and a vectors of charinfo and char data +and returns new vectors of chardata and charinfo based on randomly sampled character indices +this to create a Bootstrap replicate of equal size +this for a single taxon hecen pass teh random ints so same for each one +-} +makeSampledPairVectBootstrap + ∷ V.Vector CharInfo → V.Vector CharacterData → PhyG (V.Vector CharacterData, V.Vector CharInfo) +makeSampledPairVectBootstrap inCharInfoVect inCharDataVect = + -- get character numbers and set resampling indices for dynamic characters--statric happen within those characters + let -- zip so can filter static and dynamic characters + dataInfoPairV = V.zip inCharDataVect inCharInfoVect + + -- filter static + (staticCharsV, staticCharsInfoV) = V.unzip $ V.filter ((`elem` exactCharacterTypes) . charType . snd) dataInfoPairV + + -- filter dynamic + (dynamicCharsV, dynamicCharsInfoV) = V.unzip $ V.filter ((`notElem` exactCharacterTypes) . charType . snd) dataInfoPairV + + numDynamicChars = V.length dynamicCharsV + in do + dynCharIndices ← V.replicateM numDynamicChars $ getRandomR (0, numDynamicChars - 1) + let resampleDynamicChars = V.map (dynamicCharsV V.!) dynCharIndices + let resampleDynamicCharInfo = V.map (dynamicCharsInfoV V.!) dynCharIndices + + -- static chars do each one mapping random choices within the character type + -- but keeping each one--hence char info is staticCharsInfoV + resampleStaticChars ← V.zipWithM subSampleStatic staticCharsV staticCharsInfoV + -- cons the vectors for chrater data and character info + pure (resampleStaticChars <> resampleDynamicChars, staticCharsInfoV <> resampleDynamicCharInfo) + + +{- | subSampleStatic takes a random int list and a static charcter +Bootstrap resamples that character based on ransom it list and number of "subcharacters" in character +-} +subSampleStatic ∷ CharacterData → CharInfo → PhyG CharacterData +subSampleStatic inCharData inCharInfo = + let (a1, a2, a3) = rangePrelim inCharData + (na1, na2, na3) = stateBVPrelim inCharData + (pa1, pa2, pa3) = packedNonAddPrelim inCharData + m1 = matrixStatesPrelim inCharData + inCharType = charType inCharInfo + + charLength + | inCharType == Add = V.length a2 + | inCharType == NonAdd = V.length na2 + | inCharType == Matrix = V.length m1 + | otherwise = error ("Dynamic character in subSampleStatic: " <> show inCharType) + in do + -- get character indices based on number "subcharacters" + staticCharIndices ← V.generateM charLength . const $ randIndex charLength <$> getRandom + let staticCharIndicesUV ∷ UV.Vector Int + staticCharIndicesUV = convert staticCharIndices + + -- trace ("SSS:" <> (show $ V.length staticCharIndices) <> " " <> (show staticCharIndices)) ( + case inCharType of + Add → + pure $ + inCharData + { rangePrelim = (V.map (a1 V.!) staticCharIndices, V.map (a2 V.!) staticCharIndices, V.map (a3 V.!) staticCharIndices) + } + NonAdd → + pure $ + inCharData + { stateBVPrelim = (V.map (na1 V.!) staticCharIndices, V.map (na2 V.!) staticCharIndices, V.map (na3 V.!) staticCharIndices) + } + val + | val `elem` packedNonAddTypes → + pure $ + inCharData + { packedNonAddPrelim = + (UV.map (pa1 UV.!) staticCharIndicesUV, UV.map (pa2 UV.!) staticCharIndicesUV, UV.map (pa3 UV.!) staticCharIndicesUV) + } + Matrix → pure $ inCharData{matrixStatesPrelim = V.map (m1 V.!) staticCharIndices} + _ → error ("Incorrect character type in subSampleStatic: " <> show inCharType) + where + randIndex ∷ ∀ {b}. (Integral b) ⇒ b → b → b + randIndex a b = snd $ divMod (abs b) a + + +{- | makeSampledCharCharInfoVect takes a vector of Int and a vector of charData and a vector of charinfo +if teh data type is not static--the character is returns if Bool is True not otherwise +if the char is static (add, non add, matrix) then the bool array is applied +across the vetor of those characters (since they re vectors of charcters themselves +returned as a pair of vectors (reversed--but shouldn't matter for resampling purposes) +does not check if equal in length +-} +makeSampledVect ∷ (GV.Vector v a) ⇒ [Bool] → [a] → v a → v a +makeSampledVect boolList accumList inVect = + if GV.null inVect + then GV.fromList accumList + else + if head boolList + then makeSampledVect (tail boolList) (GV.head inVect : accumList) (GV.tail inVect) + else makeSampledVect (tail boolList) accumList (GV.tail inVect) + + +{- | makeSampledVect takes a list of Bool and avector and returns those values +with True as a vector (reversed--but shouldn't matter for resampling purposes) +does not check if equal in length +-} +makeSampledPairVect + ∷ [Bool] + → [Bool] + → [CharacterData] + → [CharInfo] + → V.Vector CharInfo + → V.Vector CharacterData + → (V.Vector CharacterData, V.Vector CharInfo) +makeSampledPairVect fullBoolList boolList accumCharDataList accumCharInfoList inCharInfoVect inCharDataVect = + if V.null inCharInfoVect + then (V.fromList accumCharDataList, V.fromList accumCharInfoList) + else + let firstCharInfo = V.head inCharInfoVect + firstCharData = V.head inCharDataVect + firstCharType = charType firstCharInfo + in -- straight resample if dynamic + if firstCharType `notElem` exactCharacterTypes + then + if head boolList + then + makeSampledPairVect + fullBoolList + (tail boolList) + (firstCharData : accumCharDataList) + (firstCharInfo : accumCharInfoList) + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else + makeSampledPairVect + fullBoolList + (tail boolList) + accumCharDataList + accumCharInfoList + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else -- static character--keep in sample, but need to sample in the vector + + let (a1, a2, a3) = rangePrelim firstCharData + (na1, na2, na3) = stateBVPrelim firstCharData + (pa1, pa2, pa3) = packedNonAddPrelim firstCharData + m1 = matrixStatesPrelim firstCharData + in if firstCharType == Add + then + let newCharData = + firstCharData + { rangePrelim = (makeSampledVect fullBoolList [] a1, makeSampledVect fullBoolList [] a2, makeSampledVect fullBoolList [] a3) + } + in -- trace ("Length Add: " <> (show $ V.length $ snd3 $ rangePrelim newCharData)) ( + if V.null (makeSampledVect fullBoolList [] a2) + then + makeSampledPairVect + fullBoolList + (tail boolList) + accumCharDataList + accumCharInfoList + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else + makeSampledPairVect + fullBoolList + (tail boolList) + (newCharData : accumCharDataList) + (firstCharInfo : accumCharInfoList) + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else -- ) + + if firstCharType == NonAdd + then + let newCharData = + firstCharData + { stateBVPrelim = (makeSampledVect fullBoolList [] na1, makeSampledVect fullBoolList [] na2, makeSampledVect fullBoolList [] na3) + } + in -- trace ("Length NonAdd: " <> (show $ V.length $ snd3 $ stateBVPrelim newCharData)) ( + if V.null (makeSampledVect fullBoolList [] na2) + then + makeSampledPairVect + fullBoolList + (tail boolList) + accumCharDataList + accumCharInfoList + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else + makeSampledPairVect + fullBoolList + (tail boolList) + (newCharData : accumCharDataList) + (firstCharInfo : accumCharInfoList) + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else -- ) + + if firstCharType `elem` packedNonAddTypes + then + let newCharData = + firstCharData + { packedNonAddPrelim = + (makeSampledVect fullBoolList [] pa1, makeSampledVect fullBoolList [] pa2, makeSampledVect fullBoolList [] pa3) + } + in -- trace ("Length NonAdd: " <> (show $ V.length $ snd3 $ stateBVPrelim newCharData)) ( + if GV.null (makeSampledVect fullBoolList [] pa2) + then + makeSampledPairVect + fullBoolList + (tail boolList) + accumCharDataList + accumCharInfoList + (GV.tail inCharInfoVect) + (GV.tail inCharDataVect) + else + makeSampledPairVect + fullBoolList + (tail boolList) + (newCharData : accumCharDataList) + (firstCharInfo : accumCharInfoList) + (GV.tail inCharInfoVect) + (GV.tail inCharDataVect) + else -- ) + + if firstCharType == Matrix + then + let newCharData = firstCharData{matrixStatesPrelim = makeSampledVect fullBoolList [] m1} + in if V.null (makeSampledVect fullBoolList [] m1) + then + makeSampledPairVect + fullBoolList + (tail boolList) + accumCharDataList + accumCharInfoList + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else + makeSampledPairVect + fullBoolList + (tail boolList) + (newCharData : accumCharDataList) + (firstCharInfo : accumCharInfoList) + (V.tail inCharInfoVect) + (V.tail inCharDataVect) + else error ("Incorrect character type in makeSampledPairVect: " <> show firstCharType) + + +-- | resampleBlockJackknife takes BlockData and a seed and creates a jackknife resampled BlockData +resampleBlockJackknife ∷ Double → BlockData → PhyG BlockData +resampleBlockJackknife sampleFreq inData@(nameText, charDataVV, charInfoV) = + let getRandomAcceptances ∷ PhyG [Bool] + getRandomAcceptances = fmap (randAccept sampleFreq) <$> getRandoms + + randAccept ∷ Double → Word → Bool + randAccept b a = + let (_, randVal) = divMod (abs a) 1000 + critVal = floor (1000 * b) + in -- trace ("RA : " <> (show (b,a, randVal, critVal, randVal < critVal))) + randVal < critVal + + jackknifeSampling ∷ PhyG (V.Vector (V.Vector CharacterData), V.Vector (V.Vector CharInfo)) + jackknifeSampling = do + accepts1 ← getRandomAcceptances + accepts2 ← getRandomAcceptances + pure . V.unzip $ makeSampledPairVect accepts1 accepts2 [] [] charInfoV <$> charDataVV + in do + (newCharDataVV, newCharInfoVV) ← jackknifeSampling + let newCharInfoV ∷ V.Vector CharInfo + newCharInfoV = V.head newCharInfoVV + case V.length newCharInfoV of + 0 → resampleBlockJackknife sampleFreq inData + _ → pure (nameText, newCharDataVV, newCharInfoV) + + +{- | getGoodBremGraphs performs Goodman-Bremer support +examines complete SPR or TBR swap neighborhood chekcing for presence/absence of edges in input Graph List +can do sample of trees either "nth" or at random if specified +sample based on SPR-- 4n^2 - 26n - 42 for TBR 8n^3 for now +this will only examine bridge edges for networks, networkedge values willl be doen via net delete +MAPs for each graph? +-} +getGoodBremGraphs + ∷ GlobalSettings → ProcessedData → String → Maybe Int → Bool → ReducedPhylogeneticGraph → PhyG ReducedPhylogeneticGraph +getGoodBremGraphs inGS inData swapType sampleSize sampleAtRandom inGraph = + if LG.isEmpty (fst5 inGraph) + then error "Null graph in getGoodBremGraphs" -- maybe should be error? + else do + -- create list of edges for input graph and a structure with egde node indices and bitvector values + -- requires index BV of each node + + let tupleList = getGraphTupleList inGraph + + -- traverse neighborhood (and net edge removal) keeping min cost without edges + supportEdgeTupleList ← getGBTuples inGS inData swapType sampleSize sampleAtRandom tupleList inGraph + + let simpleGBGraph = LG.mkGraph (LG.labNodes $ fst5 inGraph) (fmap (tupleToSimpleEdge (snd5 inGraph)) supportEdgeTupleList) + -- trace ("GGBG: " <> (show $ length tupleList) <> " -> " <> (show $ length supportEdgeTupleList)) + pure (simpleGBGraph, snd5 inGraph, thd5 inGraph, fth5 inGraph, fft5 inGraph) + where + tupleToSimpleEdge + ∷ ∀ {c1} {a} {b} {c2} {d} + . (Num c1) + ⇒ c1 + → (a, b, c2, d, c1) + → (a, b, c1) + tupleToSimpleEdge d (a, b, _, _, c) = (a, b, c - d) + + +-- | getGraphTupleList takes a graph and cost (maybe initialized to infinity) returns tuple list +getGraphTupleList ∷ ReducedPhylogeneticGraph → [(Int, Int, NameBV, NameBV, VertexCost)] +getGraphTupleList inGraph = + if LG.isEmpty (fst5 inGraph) + then error "Null graph in getGraphTupleList" + else + let egdeList = LG.edges (fst5 inGraph) + + -- graph node list + nodeList = LG.labNodes (thd5 inGraph) + nodeIndexBVPairList = fmap makeindexBVPair nodeList + + -- list of vectors for contant time access via index = fst (a, bv) + nodeIndexBVPairVect = V.fromList nodeIndexBVPairList + + -- make tuple for each edge in each graph + -- (uIndex,vINdex,uBV, vBV, graph cost) + tupleList = makeGraphEdgeTuples nodeIndexBVPairVect infinity egdeList + in tupleList + where + makeindexBVPair ∷ ∀ {a}. (a, VertexInfo) → (a, NameBV) + makeindexBVPair (a, b) = (a, bvLabel b) + + +{- | getGBTuples takes a tuple list from graph containing initialized values and update those values based +on each graph in the inGraph neigborhood +first does this via swap--for network does edge net edge in turn by removing using netDel +-} +getGBTuples + ∷ GlobalSettings + → ProcessedData + → String + → Maybe Int + → Bool + → [(Int, Int, NameBV, NameBV, VertexCost)] + → ReducedPhylogeneticGraph + → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] +getGBTuples inGS inData swapType sampleSize sampleAtRandom inTupleList inGraph = do + -- traverse swap (SPR/TBR) neighborhood optimizing each graph fully + swapTuples ← performGBSwap inGS inData swapType sampleSize sampleAtRandom inTupleList inGraph + case graphType inGS of + -- swap only for Tree-do nothing + Tree → pure swapTuples + _ | LG.isTree (fst5 inGraph) → pure swapTuples + -- network edge support if not Tree + -- SoftWired => delete edge -- could add net move if needed + SoftWired → + let deleteAction ∷ (Int, Int, NameBV, NameBV, VertexCost) → PhyG (Int, Int, NameBV, NameBV, VertexCost) + deleteAction = updateDeleteTuple inGS inData inGraph + in getParallelChunkTraverse >>= \pTraverse → + deleteAction `pTraverse` swapTuples + -- HardWired => move edge + _ → + let moveAction ∷ (Int, Int, NameBV, NameBV, VertexCost) → PhyG (Int, Int, NameBV, NameBV, VertexCost) + moveAction = updateMoveTuple inGS inData inGraph + in getParallelChunkTraverse >>= \pTraverse → + moveAction `pTraverse` swapTuples + + +{- | updateDeleteTuple take a graph and and edge and delete a network edge (or retunrs tuple if not network) +if this were a HardWWired graph--cost would always go down, so only applied to softwired graphs +-} +updateDeleteTuple + ∷ GlobalSettings + → ProcessedData + → ReducedPhylogeneticGraph + → (Int, Int, NameBV, NameBV, VertexCost) + → PhyG (Int, Int, NameBV, NameBV, VertexCost) +updateDeleteTuple inGS inData inGraph inTuple@(inE, inV, inEBV, inVBV, inCost) = + let isNetworkEdge = LG.isNetworkEdge (fst5 inGraph) (inE, inV) + in if not isNetworkEdge + then do + pure inTuple + else do + -- True to force full evalutation + deleteRedgeResults ← N.deleteNetEdge inGS inData inGraph True (inE, inV) + let deleteCost = snd5 deleteRedgeResults + pure (inE, inV, inEBV, inVBV, min inCost deleteCost) + + +{- | updateMoveTuple take a graph and and edge and moves a network edge (or returns tuple if not network) +if this were a HardWWired graph--cost would always go down, so only applied to softwired graphs +max bound because its a place holder for max num net edges +-} +updateMoveTuple + ∷ GlobalSettings + → ProcessedData + → ReducedPhylogeneticGraph + → (Int, Int, NameBV, NameBV, VertexCost) + → PhyG (Int, Int, NameBV, NameBV, VertexCost) +updateMoveTuple inGS inData inGraph inTuple@(inE, inV, inEBV, inVBV, inCost) = + let isNetworkEdge = LG.isNetworkEdge (fst5 inGraph) (inE, inV) + in if not isNetworkEdge + then do + pure inTuple + else -- True to force full evalutation + + let steepest = False + randomOrder = False + keepNum = 10 -- really could be one since sorted by cost, but just to make sure)Order + saParams ∷ ∀ {a}. Maybe a + saParams = Nothing + in do + deleteAddGraphs ← + N.deleteOneNetAddAll inGS inData (maxBound ∷ Int) keepNum steepest randomOrder inGraph [(inE, inV)] saParams + let moveCost = minimum (snd5 <$> deleteAddGraphs) + + pure (inE, inV, inEBV, inVBV, min inCost moveCost) + + +{- | performGBSwap takes parameters and graphs and traverses swap neighborhood +examining each (or nth, or random) Graphs examining each ecah in each graph for Goodman-Bremer +optimality support +-} +performGBSwap + ∷ GlobalSettings + → ProcessedData + → String + → Maybe Int + → Bool + → [(Int, Int, NameBV, NameBV, VertexCost)] + → ReducedPhylogeneticGraph + → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] +performGBSwap inGS inData swapType sampleSize sampleAtRandom inTupleList inGraph + | LG.isEmpty (fst5 inGraph) = error "Null graph in performGBSwap" + | otherwise = + let -- work with simple graph + inSimple = fst5 inGraph + (firstRootIndex, _) = head $ LG.getRoots inSimple + + -- determine edges to break on--'bridge' edges only for network + -- filter out edges from root since no use--would just rejoin + breakEdgeList = case graphType inGS of + Tree → filter ((/= firstRootIndex) . fst3) $ LG.labEdges inSimple + _ → filter ((/= firstRootIndex) . fst3) $ LG.getEdgeSplitList inSimple + in do + -- integerized critical value for prob accept + -- based on approx (leaves - netnodes)^2 or (leaves - netnodes)^3 + let (_, leafList, _, netVertList) = LG.splitVertexList (fst5 inGraph) + let intProbAccept = case swapType of + "spr" → + floor + ((1000.0 * fromIntegral (fromJust sampleSize)) / ((2.0 * fromIntegral (length leafList - length netVertList)) ** 2) ∷ Double) + _ → + floor + ((1000.0 * fromIntegral (fromJust sampleSize)) / ((2.0 * fromIntegral (length leafList - length netVertList)) ** 3) ∷ Double) + + -- splitRejoinAction ∷ ([Int], LG.LEdge Double) → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] + let splitRejoinAction = splitRejoinGB inGS inData swapType intProbAccept sampleAtRandom inTupleList inSimple breakEdgeList + + -- generate tuple lists for each break edge parallelized at this level + tupleListList ← + getParallelChunkTraverse >>= \pTraverse → + splitRejoinAction `pTraverse` breakEdgeList + + -- merge tuple lists--should all be in same order + let newTupleList = mergeTupleLists (filter (not . null) tupleListList) [] + -- trace ("PGBS:" <> (show $ fmap length tupleListList) <> " -> " <> (show $ length newTupleList)) + pure newTupleList + + +{- | splitRejoinGB take parameters and splits input graph at specified edge and rejoins at all available edge +(reroots the pruned subgraph if TBR) and creates and gets cost of graph (lazy takes care of post order only) +with optimized graph, tuple list is creted and compared to input graph tuple list. +original edge was (parentPrunedGraphRoot, prunedGraphRootIndex) +working with SimpleGraph +-} +splitRejoinGB + ∷ GlobalSettings + → ProcessedData + → String + → Int + → Bool + → [(Int, Int, NameBV, NameBV, VertexCost)] + → SimpleGraph + → [LG.LEdge Double] + → LG.LEdge Double + → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] +splitRejoinGB inGS inData swapType intProbAccept sampleAtRandom inTupleList inGraph originalBreakEdgeList breakEdge = + let -- split graph on breakEdge + (splitGraph, _, prunedGraphRootIndex, _, _, edgeDeleteList) = LG.splitGraphOnEdge' inGraph breakEdge + + -- get edges in base graph to be invaded (ie not in pruned graph) + prunedGraphRootNode = (prunedGraphRootIndex, fromJust $ LG.lab splitGraph prunedGraphRootIndex) + (prunedSubTreeNodes, prunedSubTreeEdges) = LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] + edgesNotToInvade = (LG.toEdge breakEdge : edgeDeleteList) <> fmap LG.toEdge prunedSubTreeEdges + edgesToInvade = filter (LG.notMatchEdgeIndices edgesNotToInvade) originalBreakEdgeList + + -- rejoin, evaluate, get better tuple + -- check if there are tbr-type rearrangements to do (rerooting pruned graph) + -- create TBR rerooot split graphs if required + splitGraphList = + if (length prunedSubTreeNodes < 3) || swapType == "spr" + then [splitGraph] + else -- generate "tbr" rerootings in split graph + getTBRSplitGraphs inGS splitGraph breakEdge + + action ∷ LG.LEdge Double → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] + action = rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList splitGraphList breakEdge + in do + -- parallel at break level above + rejoinTupleListList ← + getParallelChunkTraverse >>= \pTraverse → + action `pTraverse` edgesToInvade + + -- merge tuples + pure $ mergeTupleLists rejoinTupleListList [] + + +{- | rejoinGB rejoins split graph at specific edge, id SPR then that's it, if TBR reroot pruned subgraph +splitGraph is SimpleGraph +the rejoin is SPR type relying on teh list lengt of split graph to present the TBR reroots +-} +rejoinGB + ∷ GlobalSettings + → ProcessedData + → Int + → Bool + → [(Int, Int, NameBV, NameBV, VertexCost)] + → [SimpleGraph] + → LG.LEdge Double + → LG.LEdge Double + → PhyG [(Int, Int, NameBV, NameBV, VertexCost)] +rejoinGB inGS inData intProbAccept sampleAtRandom inTupleList splitGraphList originalBreakEdge@(eBreak, _, _) edgeToInvade = case splitGraphList of + [] → pure inTupleList + splitGraph : otherGraphs → + let proceedWithSampling + | not sampleAtRandom = pure False + | otherwise = getRandomR (0, 999) >>= \rVal → pure $ rVal >= intProbAccept + + rejoinUsingTuples givenTuples = + rejoinGB + inGS + inData + intProbAccept + sampleAtRandom + givenTuples + otherGraphs + originalBreakEdge + edgeToInvade + + resultOfSampling = rejoinUsingTuples inTupleList + + resultWithoutSampling = + let newGraph = LG.joinGraphOnEdge splitGraph edgeToInvade eBreak + pruneEdges = False + warnPruneEdges = False + + startVertex ∷ ∀ {a}. Maybe a + startVertex = Nothing + + generatedResult = T.multiTraverseFullyLabelGraphReduced inGS inData pruneEdges warnPruneEdges startVertex newGraph + + generaterNewGraph + | graphType inGS == Tree || LG.isTree newGraph || ((not . LG.cyclic) newGraph && (not . LG.parentInChain) newGraph) = + generatedResult + | otherwise = pure emptyReducedPhylogeneticGraph + in do + newPhylogeneticGraph ← generaterNewGraph + let tupleList + | newPhylogeneticGraph == emptyReducedPhylogeneticGraph = inTupleList + -- update tuple list based on new graph + | otherwise = getLowerGBEdgeCost inTupleList newPhylogeneticGraph -- ((2 * numTaxa) -1) + rejoinUsingTuples tupleList + in do + shouldSampleRandomly ← proceedWithSampling + if shouldSampleRandomly + then resultOfSampling + else resultWithoutSampling + + +-- | mergeTupleLists takes a list of list of tuples and merges them choosing the better each recursive round +mergeTupleLists + ∷ [[(Int, Int, NameBV, NameBV, VertexCost)]] + → [(Int, Int, NameBV, NameBV, VertexCost)] + → [(Int, Int, NameBV, NameBV, VertexCost)] +mergeTupleLists inTupleListList accumList + | null inTupleListList = accumList + | null accumList = mergeTupleLists (tail inTupleListList) (head inTupleListList) + | otherwise = + let firstTupleList = head inTupleListList + newTupleList = zipWith chooseBetterTuple firstTupleList accumList + in mergeTupleLists (tail inTupleListList) newTupleList + + +-- | chooseBetterTuple takes two (Int, Int, NameBV, NameBV, VertexCost) and returns better cost +chooseBetterTuple + ∷ (Int, Int, NameBV, NameBV, VertexCost) → (Int, Int, NameBV, NameBV, VertexCost) → (Int, Int, NameBV, NameBV, VertexCost) +chooseBetterTuple (aE, aV, aEBV, aVBV, aCost) (_, _, _, _, bCost) = (aE, aV, aEBV, aVBV, min aCost bCost) + + +{- | makeGraphEdgeTuples take node and edge,cost tuples from a graph and returns a list of tuples of the form +(uIndex,vINdex,uBV, vBV, graph cost) +this for edge comparisons for Goodman-Bremer and other optimality-type support +-} +makeGraphEdgeTuples ∷ V.Vector (Int, NameBV) → VertexCost → [(Int, Int)] → [(Int, Int, NameBV, NameBV, VertexCost)] +makeGraphEdgeTuples nodeBVVect graphCost edgeList = + -- trace ("MET: " <> (show $ V.length nodeBVVect)) + fmap (make5Tuple nodeBVVect graphCost) edgeList + where + make5Tuple + ∷ ∀ {a} {d} {e} + . V.Vector (a, d) + → e + → (Int, Int) + → (Int, Int, d, d, e) + make5Tuple nv c (a, b) = (a, b, snd (nv V.! a), snd (nv V.! b), c) + + +{- | getLowerGBEdgeCost take a list of edge tuples of (uIndex,vINdex,uBV, vBV, graph cost) from the graph +whose supports are being calculated and a new graph and updates the edge cost (GB value) if that edge +is NOT present in the graph taking the minimum of the original GB value and the new graph cost +-} +getLowerGBEdgeCost + ∷ [(Int, Int, NameBV, NameBV, VertexCost)] → ReducedPhylogeneticGraph → [(Int, Int, NameBV, NameBV, VertexCost)] +getLowerGBEdgeCost edgeTupleList inGraph = + if LG.isEmpty (fst5 inGraph) || null edgeTupleList + then error "Empty graph or null edge tuple list in getLowerGBEdgeCost" + else + let inGraphTupleList = getGraphTupleList inGraph + in fmap (updateEdgeTuple (snd5 inGraph) inGraphTupleList) edgeTupleList + + +{- | updateEdgeTuple checks is edge is NOT in input graph edge tuple list and if not takes minimum +of edge cost GB value and in graph cost, else returns unchanged +-} +updateEdgeTuple + ∷ VertexCost + → [(Int, Int, NameBV, NameBV, VertexCost)] + → (Int, Int, NameBV, NameBV, VertexCost) + → (Int, Int, NameBV, NameBV, VertexCost) +updateEdgeTuple inGraphCost inGraphTupleList (uIndex, vIndex, uBV, vBV, edgeGBValue) = + let edgeNotFoundCost = getNotFoundCost uBV vBV inGraphCost inGraphTupleList + in if isNothing edgeNotFoundCost + then (uIndex, vIndex, uBV, vBV, edgeGBValue) + else (uIndex, vIndex, uBV, vBV, min edgeGBValue (fromJust edgeNotFoundCost)) + + +{- | getNotFoundCost take a pair of BitVectors (of vertices in graph) from an edge +and a list of (Int, Int, NameBV, NameBV, VertexCost) tuples and returns +Nothing is the BVs of the two match (= signifying edge in graph) or +Just graph cost if not present for Goodman-Bremer calculations +-} +getNotFoundCost ∷ NameBV → NameBV → VertexCost → [(Int, Int, NameBV, NameBV, VertexCost)] → Maybe VertexCost +getNotFoundCost uBV vBV inTupleCost inTupleList = + if null inTupleList + then Just inTupleCost + else + let (_, _, uInBV, vInBV, _) = head inTupleList + in if uBV == uInBV && vBV == vInBV + then Nothing + else getNotFoundCost uBV vBV inTupleCost (tail inTupleList) + + +{- | getTBRSplitGraphs takes a split gaph and the original split edge and +returns a list of rerooted subgrahs split graphs suitable for rejoining +via SPR-type rejoin each to generate TBR neighborhood +much of this is modified from Swap.hs but removing data and delta portions +-} +getTBRSplitGraphs ∷ GlobalSettings → SimpleGraph → LG.LEdge Double → [SimpleGraph] +getTBRSplitGraphs inGS splitGraph splitEdge = + if LG.isEmpty splitGraph + then error "Empty graph in getTBRSplitGraphs" + else -- get edges in pruned graph and reroot on those edges that are 1) not from original "root" of prune + -- and 2) not network edges + + let prunedGraphRootNode = (snd3 splitEdge, fromJust $ LG.lab splitGraph $ snd3 splitEdge) + edgesInPrunedSubGraph = snd $ LG.nodesAndEdgesAfter splitGraph [prunedGraphRootNode] + + nonNetWorkEdgeList = + if graphType inGS /= Tree + then filter (not . LG.isNetworkLabEdge splitGraph) edgesInPrunedSubGraph + else edgesInPrunedSubGraph + + -- original pruned root edges + prunedRootEdges = LG.out splitGraph $ fst prunedGraphRootNode + + -- edges available for rerooting + edgeAfterList = nonNetWorkEdgeList L.\\ prunedRootEdges + + -- get edges to add and delete for TBR rerooting + tbrEdits = fmap (getTBREdits splitGraph prunedGraphRootNode edgesInPrunedSubGraph . LG.toEdge) edgeAfterList + + -- TBR split graph list + tbrGraphList = fmap (LG.insertDeleteEdges splitGraph) tbrEdits + in splitGraph : tbrGraphList + + +{- | getTBREdits takes and edge and returns the list of edits to pruned subgraph +as a pair of edges to add and those to delete +since reroot edge is directed (e,v), edges away from v will have correct +orientation. Edges between 'e' and the root will have to be flipped +original root edges and reroort edge are deleted and new root and edge spanning orginal root created +returns ([add], [delete]) +modified from function in swap to be more general and operate on SimpleGraphs as are used here +-} +getTBREdits ∷ (Eq a, Eq b) ⇒ LG.Gr a b → LG.LNode a → [LG.LEdge b] → LG.Edge → ([LG.LEdge b], [LG.Edge]) +getTBREdits inGraph prunedGraphRootNode edgesInPrunedSubGraph rerootEdge = + -- trace ("Gettiung TBR Edits for " <> (show rerootEdge)) ( + let prunedGraphRootIndex = fst prunedGraphRootNode + originalRootEdgeNodes = LG.descendants inGraph prunedGraphRootIndex + originalRootEdges = LG.out inGraph prunedGraphRootIndex + + -- get path from new root edge fst vertex to orginal root and flip those edges + closerToPrunedRootEdgeNode = (fst rerootEdge, fromJust $ LG.lab inGraph $ fst rerootEdge) + (nodesInPath, edgesinPath) = LG.postOrderPathToNode inGraph closerToPrunedRootEdgeNode prunedGraphRootNode + + -- don't want original root edges to be flipped since deleted + edgesToFlip = edgesinPath L.\\ originalRootEdges + flippedEdges = fmap LG.flipLEdge edgesToFlip + + -- dummyEdgeLabel so can be type "b" + dummyEdgeLabel = thd3 $ head edgesInPrunedSubGraph + + -- new edges on new root position and spanning old root + -- add in closer vertex to root to make sure direction of edge is correct + newEdgeOnOldRoot = + if snd3 (head originalRootEdges) `elem` (fst rerootEdge : fmap fst nodesInPath) + then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, dummyEdgeLabel) + else (snd3 $ last originalRootEdges, snd3 $ head originalRootEdges, dummyEdgeLabel) + newRootEdges = [(prunedGraphRootIndex, fst rerootEdge, dummyEdgeLabel), (prunedGraphRootIndex, snd rerootEdge, dummyEdgeLabel)] + in -- original root edge so no change + if fst rerootEdge `elem` originalRootEdgeNodes && snd rerootEdge `elem` originalRootEdgeNodes + then ([], []) + else -- rerooted + + -- delete orignal root edges and rerootEdge + -- add new root edges + -- and new edge on old root--but need orientation + -- flip edges from new root to old (delete and add list) + -- trace ("\n\nIn Graph:\n" <> (LG.prettify $ GO.convertDecoratedToSimpleGraph inGraph) <> "\nTBR Edits: " <> (show (rerootEdge, prunedGraphRootIndex, fmap LG.toEdge flippedEdges)) + -- <> "\nEdges to add: " <> (show $ fmap LG.toEdge $ newEdgeOnOldRoot : (flippedEdges <> newRootEdges)) <> "\nEdges to delete: " <> (show $ rerootEdge : (fmap LG.toEdge (edgesToFlip <> originalRootEdges)))) + (newEdgeOnOldRoot : (flippedEdges <> newRootEdges), rerootEdge : fmap LG.toEdge (edgesToFlip <> originalRootEdges)) + +-- ) diff --git a/src/System/Timing.hs b/src/System/Timing.hs new file mode 100644 index 000000000..f36f40536 --- /dev/null +++ b/src/System/Timing.hs @@ -0,0 +1,155 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE Strict #-} + +module System.Timing ( + CPUTime (), + fromPicoseconds, + fromMilliseconds, + fromMicroseconds, + fromSeconds, + toPicoseconds, + toMicroseconds, + toMilliseconds, + toSeconds, + timeDifference, + timeLeft, + timeOp, + timeOpUT, + -- , timeOpThread + timeOpCPUWall, + timeSum, +) where + +import Control.DeepSeq +import Control.Monad.IO.Class +import Data.Foldable +import Data.Time.Clock +import Numeric.Natural +import System.CPUTime + + +-- import qualified ParallelUtilities as PU + +-- | CPU time with picosecond resolution +newtype CPUTime = CPUTime Natural + deriving (Eq, Ord) + + +instance NFData CPUTime where + rnf (CPUTime !_) = () + + +instance Show CPUTime where + show (CPUTime x) + | x < nSecond = let (q, _) = x `quotRem` 1 in fold [show q, ".", "???", "ps"] + | x < μSecond = let (q, r) = x `quotRem` nSecond in fold [show q, ".", zeroPad 3 (r `div` 1), "ns"] + | x < mSecond = let (q, r) = x `quotRem` μSecond in fold [show q, ".", zeroPad 3 (r `div` nSecond), "μs"] + | x < second = let (q, r) = x `quotRem` mSecond in fold [show q, ".", zeroPad 3 (r `div` μSecond), "ms"] + | x < minute = let (q, r) = x `quotRem` second in fold [show q, ".", zeroPad 3 (r `div` mSecond), "s "] + | x < hour = let (q, r) = x `quotRem` minute in fold [show q, "m", zeroPad 2 (r `div` second), "sec"] + | x < day = let (q, r) = x `quotRem` hour in fold [show q, "h", zeroPad 2 (r `div` minute), "min"] + | otherwise = let (q, r) = x `quotRem` day in fold [show q, "d", zeroPad 2 (r `div` hour), "hrs"] + where + nSecond = 1000 + μSecond = 1000 * nSecond + mSecond = 1000 * μSecond + second = 1000 * mSecond + minute = 60 * second + hour = 60 * minute + day = 24 * hour + + +zeroPad ∷ Int → Natural → String +zeroPad k i = replicate (k - length shown) '0' <> shown + where + shown = show i + + +timeOp ∷ (MonadIO m, NFData a) ⇒ m a → m (CPUTime, a) +timeOp ioa = do + t1 ← liftIO getCPUTime + a ← force <$> ioa + t2 ← liftIO getCPUTime + let t = CPUTime . fromIntegral $ t2 - t1 + pure (t, a) + + +{- Commenting out due to unsafe call to getNumThreads +timeOpThread :: (MonadIO m, NFData a) => m a -> m (CPUTime, a) +timeOpThread ioa = do + t1 <- liftIO getCPUTime + a <- force <$> ioa + t2 <- liftIO getCPUTime + let t = CPUTime . fromIntegral $ fst $ divMod (t2 - t1) (fromIntegral PU.getNumThreads) + pure (t, a) +-} + +-- unit in pico second or something so not what I want in seconds +timeOpUT ∷ (MonadIO m, NFData a) ⇒ m a → m (CPUTime, a) +timeOpUT ioa = do + t1 ← liftIO getCurrentTime + a ← force <$> ioa + t2 ← liftIO getCurrentTime + let picoMagnitude = 1000000000000 ∷ Integer + let t = CPUTime . fromIntegral $ picoMagnitude * (floor (nominalDiffTimeToSeconds (diffUTCTime t2 t1))) + pure (t, a) + + +-- reports both CPUTime (ie total over parallel) and wall clock duration +timeOpCPUWall ∷ (MonadIO m, NFData a) ⇒ m a → m (CPUTime, CPUTime, a) +timeOpCPUWall ioa = do + wt1 ← liftIO getCurrentTime + ct1 ← liftIO getCPUTime + a ← force <$> ioa + ct2 ← liftIO getCPUTime + wt2 ← liftIO getCurrentTime + let wt = (CPUTime . fromIntegral) (1000000000000 * (floor (nominalDiffTimeToSeconds (diffUTCTime wt2 wt1))) ∷ Integer) + let ct = CPUTime . fromIntegral $ ct2 - ct1 + pure (wt, ct, a) + + +timeDifference ∷ CPUTime → CPUTime → CPUTime +timeDifference (CPUTime a) (CPUTime b) = CPUTime $ max a b - min a b + + +timeLeft ∷ CPUTime → CPUTime → CPUTime +timeLeft (CPUTime a) (CPUTime b) = + if b > a + then CPUTime 0 + else CPUTime $ a - b + + +timeSum ∷ CPUTime → CPUTime → CPUTime +timeSum (CPUTime a) (CPUTime b) = CPUTime $ a + b + + +fromPicoseconds ∷ Natural → CPUTime +fromPicoseconds = CPUTime + + +fromMicroseconds ∷ Natural → CPUTime +fromMicroseconds = CPUTime . (* 1000000) + + +fromMilliseconds ∷ Natural → CPUTime +fromMilliseconds = CPUTime . (* 1000000000) + + +fromSeconds ∷ Natural → CPUTime +fromSeconds = CPUTime . (* 1000000000000) + + +toPicoseconds ∷ CPUTime → Natural +toPicoseconds (CPUTime x) = x + + +toMicroseconds ∷ CPUTime → Natural +toMicroseconds (CPUTime x) = x `div` 1000000 + + +toMilliseconds ∷ CPUTime → Natural +toMilliseconds (CPUTime x) = x `div` 1000000000 + + +toSeconds ∷ CPUTime → Natural +toSeconds (CPUTime x) = x `div` 1000000000000 diff --git a/pkg/PhyGraph/Types/DistanceTypes.hs b/src/Types/DistanceTypes.hs similarity index 88% rename from pkg/PhyGraph/Types/DistanceTypes.hs rename to src/Types/DistanceTypes.hs index 578103562..ab388a9ea 100644 --- a/pkg/PhyGraph/Types/DistanceTypes.hs +++ b/src/Types/DistanceTypes.hs @@ -32,36 +32,46 @@ either expressed or implied, of the FreeBSD Project. Maintainer : Ward Wheeler Stability : unstable Portability : portable (I hope) - -} - module Types.DistanceTypes where -import qualified Data.Vector as V -import qualified SymMatrix as M +import Data.Vector qualified as V +import SymMatrix qualified as M + type Vertex = Int + + type Weight = Double + + type Edge = (Vertex, Vertex, Weight) -type Tree = (V.Vector Vertex,V.Vector Edge) + + +type Tree = (V.Vector Vertex, V.Vector Edge) + type TreeWithData = (String, Tree, Double, M.Matrix Double) -type SplitTreeData = (V.Vector Edge,V.Vector Edge, Double, V.Vector Edge, M.Matrix Double) + + +type SplitTreeData = (V.Vector Edge, V.Vector Edge, Double, V.Vector Edge, M.Matrix Double) + -- | emptyTree -emptyTree :: Tree +emptyTree ∷ Tree emptyTree = (V.empty, V.empty) -- | emptyTreeWithData -emptyTreeWithData :: TreeWithData +emptyTreeWithData ∷ TreeWithData emptyTreeWithData = ("()[];", (V.empty, V.empty), 0.0, M.empty) + -- | used for comparing tree costs that are Double -epsilon :: Double +epsilon ∷ Double epsilon = 0.000000000000001 + -- | precision for branch lengths display -precision :: Int +precision ∷ Int precision = 8 - diff --git a/src/Types/Types.hs b/src/Types/Types.hs new file mode 100644 index 000000000..ac7601d68 --- /dev/null +++ b/src/Types/Types.hs @@ -0,0 +1,929 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE OverloadedStrings #-} + +{- | +Module specifying data types +-} +module Types.Types ( + module Types.Types, +) where + +import Bio.DynamicCharacter (HugeDynamicCharacter, OpenDynamicCharacter, SlimDynamicCharacter, WideDynamicCharacter) +import Bio.DynamicCharacter.Element (HugeState, SlimState, WideState) +import Control.DeepSeq +import Control.Monad.IO.Class (MonadIO) +import Control.Parallel.Strategies +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.InfList qualified as IL +import Data.List.NonEmpty (NonEmpty (..)) +import Data.MetricRepresentation as MR +import Data.TCM qualified as TCM +import Data.TCM.Dense qualified as TCMD +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import Data.Word (Word64) +import GHC.Generics +import PHANE.Evaluation +import SymMatrix qualified as S +import Utilities.LocalGraph qualified as LG + + +-- | Debug Flag +isDebug ∷ Bool +isDebug = False + + +-- | Program Version +pgVersion ∷ String +pgVersion = "0.1" + + +-- | used for comparing graph costs and edge lengths that are Double +epsilon ∷ Double +epsilon = 0.0001 + + +-- | infinity is a large Double for use with Graph costs +infinity ∷ Double +infinity = read "Infinity" ∷ Double + + +{- | maxAddStatesToRecode maximum size of addditive character to recode into +non-additive characters 65 can fit in 4 WideState since nstates - 1 binaries +prob could be bigger based on cost of optimizing additive versus but this +seems a reasonable number (prob should be timed to verify) +-} +maxAddStatesToRecode ∷ Int +maxAddStatesToRecode = 129 + + +{- | +Core monad transformer stack for evaluating computations within the application PhyG. +-} +type PhyG = Evaluation () + + +-- | Types for timed searches +type Days = Int + + +type Hours = Int + + +type Minutes = Int + + +type Seconds = Int + + +type Time = (Days, Hours, Minutes, Seconds) + + +{- | Command types +data Argument = String | Double | Int | Bool | Time + deriving stock (Show, Eq) +-} +type Argument = (String, String) + + +-- For rename format rename:(a,b,c,...,y,z) => a-y renamed to z +data Instruction + = NotACommand + | Build + | Fuse + | Read + | Reblock + | Refine + | Rename + | Report + | Run + | Select + | Set + | Swap + | Search + | Support + | Transform + deriving stock (Show, Eq, Ord) + + +-- | Node variety +data NodeType = RootNode | LeafNode | TreeNode | NetworkNode | In1Out1 + deriving stock (Show, Eq) + + +-- | Edge types +data EdgeType = NetworkEdge | TreeEdge | PendantEdge + deriving stock (Show, Eq, Ord) + + +-- | Command type structure +type Command = (Instruction, [Argument]) + + +-- | CharType data type for input characters +data CharType + = Add + | NonAdd + | Matrix + | SlimSeq + | WideSeq + | HugeSeq + | NucSeq + | AminoSeq + | AlignedSlim + | AlignedWide + | AlignedHuge + | Packed2 + | Packed4 + | Packed5 + | Packed8 + | Packed64 + deriving stock (Read, Show, Eq) + + +{- | Evaluation strategries for a graph, full muti-traversal, single-trwraversal based on outpgroup rooting, +static apporximation of non-exact (unaligned sequence) charcaters. +-} +data GraphEvaluation = MultiTraverse | SingleTraverse | StaticApproximation + deriving stock (Read, Show, Eq) + + +-- | bandit types +data BanditType = SearchBandit | GraphBandit + deriving stock (Read, Show, Eq) + + +-- non additive bit packed types (64 not really 'packed' but treated as if were) +-- these are not entered but are created by transforming existing non-additive characters +packedNonAddTypes ∷ [CharType] +packedNonAddTypes = [Packed2, Packed4, Packed5, Packed8, Packed64] + + +-- aligned not in here because they are not reorganized, and would screw up reroot optimization +exactCharacterTypes ∷ [CharType] +exactCharacterTypes = [Add, NonAdd, Matrix] <> packedNonAddTypes + + +-- | types for character classes +nonExactCharacterTypes ∷ [CharType] +nonExactCharacterTypes = [SlimSeq, WideSeq, HugeSeq, NucSeq, AminoSeq] -- , AlignedSlim, AlignedWide, AlignedHuge] + + +-- prealigned types +prealignedCharacterTypes ∷ [CharType] +prealignedCharacterTypes = [AlignedSlim, AlignedWide, AlignedHuge] + + +-- sequence types +sequenceCharacterTypes ∷ [CharType] +sequenceCharacterTypes = nonExactCharacterTypes <> prealignedCharacterTypes + + +{- | Graph types for searching etc. Can be modified by 'Set command +HardWired and SoftWired are network types +'Tree' would be a single tree in the sense as produced by typical phylogentic +seqrch programs--no forests +-} +data GraphType = Tree | HardWired | SoftWired + deriving stock (Show, Eq) + + +{- | Optimality criterion sets the cost function for graphs and potentially models +likelihood form is the "Self Information" in context of Kolmogorov complexity/MDL/PMDL +MAPA is Integrate Likelihood/dBaysian stuff via TCM modification +-} +data OptimalityCriterion = Parsimony | PMDL | SI | MAPA | NCM + deriving stock (Show, Eq) + + +data GraphFactor = NoNetworkPenalty | Wheeler2015Network | Wheeler2023Network | PMDLGraph + deriving stock (Show, Eq) + + +data RootCost = NoRootCost | MAPARoot | NCMRoot | PMDLRoot | SIRoot | Wheeler2015Root + deriving stock (Show, Eq) + + +data SoftWiredAlgorithm = Naive | ResolutionCache + deriving stock (Show, Eq) + + +data ParallelStrategy = R0 | RSeq | RPar | RDeepSeq + deriving stock (Show, Eq) + + +{- | Method for makeing final seqeujnce charactert states assignment +do an DO-based method--more exact but higher time complexity--single preorder +pass but worst cae O(n^2) in seqeunce length +or assign based on Implied alignment --requires additional post/pre order +traversal but is linear in sequence length +-} +data AssignmentMethod = DirectOptimization | ImpliedAlignment + deriving stock (Show, Eq) + + +data SearchData = SearchData + { instruction ∷ Instruction + , arguments ∷ [Argument] + , minGraphCostIn ∷ VertexCost + , maxGraphCostIn ∷ VertexCost + , numGraphsIn ∷ Int + , minGraphCostOut ∷ VertexCost + , maxGraphCostOut ∷ VertexCost + , numGraphsOut ∷ Int + , commentString ∷ String + , duration ∷ Int + } + deriving stock (Show, Eq) + + +{- | maxSimultaneousGraphsSteepest is the maximum number of graphs that are evaluated +at a step in "steepest" algorithms of swap and fuse. Set becasue can increase +run time of these procedurs by delaying finding "better" solutins to move to. +-} +maxSimultaneousGraphsSteepest ∷ Int +maxSimultaneousGraphsSteepest = 10 + + +-- | SwapType types for swapping, TBRAlternate for special casing in Swap +data SwapType = NoSwap | NNI | SPR | TBR | Alternate | TBRAlternate + deriving stock (Show, Eq) + + +-- | JoinType types for join methods +data JoinType = JoinPruned | JoinAll | JoinAlternate + deriving stock (Show, Eq) + + +-- | SelectGraphType types to select gaphs +data SelectGraphType = Best | Unique | AtRandom | All + deriving stock (Show, Eq) + + +-- | Support method types +data SupportMethod = Jackknife | Bootstrap | GoodmanBremer + deriving stock (Show, Eq) + + +-- | return parallel Strategy +parStrategy ∷ (NFData b) ⇒ ParallelStrategy → Strategy b +parStrategy parStrat + | parStrat == R0 = r0 + | parStrat == RPar = rpar + | parStrat == RSeq = rseq + | parStrat == RDeepSeq = rdeepseq + | otherwise = rdeepseq + + +data GlobalSettings = GlobalSettings + { bc2 ∷ (Double, Double) -- PMDL bitCost for 2 states of no-change and change as pair + , bc4 ∷ (Double, Double) -- PMDL bitCost for 4 states of no-change and change as pair + , bc5 ∷ (Double, Double) -- PMDL bitCost for 5 states of no-change and change as pair + , bc8 ∷ (Double, Double) -- PMDL bitCost for 8 states of no-change and change as pair + , bc64 ∷ (Double, Double) -- PMDL bitCost for 64 states of no-change and change as pair + , bcgt64 ∷ (Double, Double) -- PMDL bitCost for > 64 states of no-change and change as pair + , compressResolutions ∷ Bool -- "nub" resolutions in softwired graph + , defaultParStrat ∷ ParallelStrategy -- default parallel strategy + , dynamicEpsilon ∷ Double -- factor of dynamic heuristics overestimating graph deltas determind by fraction of data is dynamic and user value + , finalAssignment ∷ AssignmentMethod + , fractionDynamic ∷ Double -- estimated fraction of character length that are dynamic (actually seqeunce) for setting dynamicEpsilon + , graphComplexityList ∷ IL.InfList (VertexCost, VertexCost) -- complexity of graphs in bits, index for number of network nodes (0= tree etc0 lazy so only evaluate each once when needed O(n) but needlazyness and permanence + , graphsSteepest ∷ Int -- he maximum number of graphs that are evaluated + -- at a step in "steepest" algorithms of swap and network add/delete. Set because can increase + -- run time of these procedures by delaying finding "better" solutions to move to. + -- also increases memory footprint + , graphType ∷ GraphType + , graphFactor ∷ GraphFactor -- net penalty/graph complexity + , lazyParStrat ∷ ParallelStrategy -- default parallel strategy to WHNF + , missingThreshold ∷ Int -- default threshold of maximum missing data to keep in data set 100 (keep all, 0 would be no missing data) + , modelComplexity ∷ Double -- model cost for PMDL, 0.0 for other criteria + , multiTraverseCharacters ∷ Bool -- If true "reroot" charcter trees to get best cost for (only affects) dynamic characters, if False then no + , numDataLeaves ∷ Int -- number of leaves set after data processing--for conveniance really + , optimalityCriterion ∷ OptimalityCriterion + , outgroupIndex ∷ Int -- Outgroup terminal index, default 0 (first input leaf) + , outGroupName ∷ T.Text -- Outgroup name + , partitionCharacter ∷ String -- 'character' for mparitioning seqeunce data into homologous sections'--checks for length == 1 later + , reportNaiveData ∷ Bool -- reports using Naive data so preserves character order and codings. This comes at a cost in memory footprint. If False, + -- packed characters are reported--and are somewhat inscrutable. But perhaps 3% of data footprint--useful for large + -- add/non add dat asets liker SNP genomic data + , rootComplexity ∷ VertexCost -- complexity of root in bits per root for PMDL/ML calculations + , rootCost ∷ RootCost + , searchData ∷ [SearchData] + , seed ∷ Int -- random seed + , softWiredMethod ∷ SoftWiredAlgorithm -- algorithm to optimize softwired graphs + , strictParStrat ∷ ParallelStrategy -- default parallel strategy to Fully evaluate + , unionThreshold ∷ Double -- this is the edge union cost threshold for rejoing edges during SPR and TBR, and (perhas) character Wagner build + -- as described by Varon and Wheeler (2013) and set to 1.17 experimentally + , useIA ∷ Bool -- turn on/off IA everywhere can (mainly for internal testing) + , useNetAddHeuristic ∷ Bool -- Netowrk addition heuristic--very coarse currently + } + deriving stock (Show, Eq) + + +instance NFData GlobalSettings where rnf x = seq x () + + +{- | CharInfo information about characters +null values for these are in Input.FastAC.hs + TCMD.DenseTransitionCostMatrix => genDiscreteDenseOfDimension (length alphabet) + MR.MetricRepresentation WideState => metricRepresentation <$> TCM.fromRows [[0::Word]] + MR.MetricRepresentation BV.BitVector => metricRepresentation <$> TCM.fromRows [[0::Word]] +changeCost and noChange costs are for PMDL costs for packed/non-additive character for +other character types the cost matrix holds this information in comvcert with weight since the matrix values are integers +-} +data CharInfo = CharInfo + { name ∷ NameText + , charType ∷ CharType + , activity ∷ Bool + , weight ∷ Double + , costMatrix ∷ S.Matrix Int + , slimTCM ∷ TCMD.DenseTransitionCostMatrix + , wideTCM ∷ MR.MetricRepresentation WideState + , hugeTCM ∷ MR.MetricRepresentation HugeState + , changeCost ∷ Double + , noChangeCost ∷ Double + , alphabet ∷ Alphabet ST.ShortText + , prealigned ∷ Bool + , origInfo ∷ V.Vector (NameText, CharType, Alphabet ST.ShortText) + } + deriving stock (Show, Eq) + + +instance NFData CharInfo where rnf x = seq x () + + +instance Ord CharInfo where x `compare` y = show x `compare` show y + + +-- | Types for vertex information +type VertexCost = Double + + +type StateCost = Int + + +type VertexIndex = Int + + +-- | index of child vertices +type ChildStateIndex = Int + + +{- | unique bitvector labelling for vertex based on descednent labellings +these labels are used for caching, left/right DO optimizaiton +thery should be label invariant +a hash of sorted input data for leaves +will need a map from NameBV to T.Text name (or not) +-} +type NameBV = BV.BitVector + + +-- | Human legibale name for vertices, characters, and Blocks +type NameText = T.Text + + +{- | TYpes for Matrix/Sankoff characters +Triple contains info from left and right child--could be only one +use fst then +finals state also vector of triple--but need to keep min cost +for final assignments +filter by local costVect to get 'best' states an each node +-} +type MatrixTriple = (StateCost, [ChildStateIndex], [ChildStateIndex]) + + +-- Only date here that varies by vertex, rest inglobal character info +-- vectors so all data of single type can be grouped together +-- will need to add masks for bit-packing non-additive chars +-- may have to add single assignment for hardwired and IP optimization +-- for approximate sakoff (DO-like) costs can use stateBVPrelim/stateBVFinal +-- for matrix/Saknoff characters-- Vector of vector of States +-- BUT all with same cost matrix/tcm +-- triples (add, no-add, sequence) are to keep children of vertex states for pre-order pass +-- order is always (left parent median, median, right parent median) +-- do not need for matrix since up pass is a traceback from parent +-- sequence characters are a vector of bitvectors--so only a single seqeunce character +-- per "charctaer" this is so the multi-traversal can take place independently for each +-- sequence character, creating a properly "rooted" tree/graph for each non-exact seqeunce character +-- prelim is created from gapped, final from (via 3-way minimization) parent final and child alignment (2nd and 3rd fields). +-- the 'alignment' fields hold the implied alignment data +-- the 'union' fields hold post-order unions of subgraph charcaters (ia for sequence) fir use in uinion threshold +-- during branch addintion/readdition (e.g swapping) +data CharacterData = CharacterData + { -- for Non-additive + stateBVPrelim ∷ (V.Vector BV.BitVector, V.Vector BV.BitVector, V.Vector BV.BitVector) -- HugeDynamicCharacter -- preliminary for Non-additive chars, Sankoff Approx + , stateBVFinal ∷ V.Vector BV.BitVector + , stateBVUnion ∷ V.Vector BV.BitVector + , -- for Additive + rangePrelim ∷ (V.Vector (Int, Int), V.Vector (Int, Int), V.Vector (Int, Int)) + , rangeFinal ∷ V.Vector (Int, Int) + , rangeUnion ∷ V.Vector (Int, Int) + , -- for multiple Sankoff/Matrix with slim tcm + matrixStatesPrelim ∷ V.Vector (V.Vector MatrixTriple) + , matrixStatesFinal ∷ V.Vector (V.Vector MatrixTriple) + , matrixStatesUnion ∷ V.Vector (V.Vector MatrixTriple) + , -- preliminary for m,ultiple seqeunce chars with same TCM + slimPrelim ∷ SV.Vector SlimState + , -- gapped medians of left, right, and preliminary used in preorder pass + slimGapped ∷ SlimDynamicCharacter + , slimAlignment ∷ SlimDynamicCharacter + , slimFinal ∷ SV.Vector SlimState + , slimIAPrelim ∷ SlimDynamicCharacter + , slimIAFinal ∷ SV.Vector SlimState + , slimIAUnion ∷ SV.Vector SlimState + , -- vector of individual character costs (Can be used in reweighting-ratchet) + widePrelim ∷ UV.Vector WideState + , -- gapped median of left, right, and preliminary used in preorder pass + wideGapped ∷ WideDynamicCharacter + , wideAlignment ∷ WideDynamicCharacter + , wideFinal ∷ UV.Vector WideState + , wideIAPrelim ∷ WideDynamicCharacter + , wideIAFinal ∷ UV.Vector WideState + , wideIAUnion ∷ UV.Vector WideState + , -- vector of individual character costs (Can be used in reweighting-ratchet) + hugePrelim ∷ V.Vector HugeState + , -- gapped medians of left, right, and preliminary used in preorder pass + hugeGapped ∷ HugeDynamicCharacter + , hugeAlignment ∷ HugeDynamicCharacter + , hugeFinal ∷ V.Vector HugeState + , hugeIAPrelim ∷ HugeDynamicCharacter + , hugeIAFinal ∷ V.Vector HugeState + , hugeIAUnion ∷ V.Vector HugeState + , -- vectors for pre-aligned sequences also used in static approx + alignedSlimPrelim ∷ SlimDynamicCharacter + , alignedSlimFinal ∷ SV.Vector SlimState + , alignedSlimUnion ∷ SV.Vector SlimState + , alignedWidePrelim ∷ WideDynamicCharacter + , alignedWideFinal ∷ UV.Vector WideState + , alignedWideUnion ∷ UV.Vector WideState + , alignedHugePrelim ∷ HugeDynamicCharacter + , alignedHugeFinal ∷ V.Vector HugeState + , alignedHugeUnion ∷ V.Vector HugeState + , -- coiuld be made Storable later is using C or GPU/Accelerate + packedNonAddPrelim ∷ OpenDynamicCharacter UV.Vector Word64 + , packedNonAddFinal ∷ UV.Vector Word64 + , packedNonAddUnion ∷ UV.Vector Word64 + , -- vector of individual character costs (Can be used in reweighting-ratchet) + localCostVect ∷ V.Vector StateCost + , -- weight * V.sum localCostVect + localCost ∷ VertexCost + , -- unclear if need vector version + globalCost ∷ VertexCost + } + deriving stock (Show, Eq, Generic) + + +instance NFData CharacterData where rnf x = seq x () + + +{- | type TermData type contains termnal name and list of characters +characters as ShortText to save space on input +-} +type TermData = (NameText, [ST.ShortText]) + + +type LeafData = (NameText, V.Vector CharacterData) + + +{- | VertexBlockData vector over blocks of character data in block (Vector) +blocks of character data for a given vertex +-} +type VertexBlockData = V.Vector (V.Vector CharacterData) + + +{- | VertexBlockDataMaybe vector over maybe blocks of character data in block (Vector) +blocks of character data for a given vertex +-} +type VertexBlockDataMaybe = V.Vector (V.Vector (Maybe CharacterData)) + + +{- | ResolutionData contains vertex information for soft-wired network components +these are used in the idenitification of minimal cost display trees for a block of +data that follow the same display tree +-} +type ResolutionVertexData = V.Vector ResolutionBlockData + + +{- | ResolutionBlockData contains a list of ResolutionData +this list contains all the potential resolutions of a softwired +networtk vertex +-} +type ResolutionBlockData = V.Vector ResolutionData + + +{- | ResolutionData contains individual block information for a given resoluton of soft-wired network components +these are used in the idenitification of minimal cost display trees for a block of +data that follow the same display tree +nodes are VertexInfo for ease of conversion--but nthe info is largely bogus and not to be trusted, same with EdgeInfo +-} +data ResolutionData = ResolutionData + { displaySubGraph ∷ ([LG.LNode VertexInfo], [LG.LEdge EdgeInfo]) -- holds the post-order display sub-tree for the block + , displayBVLabel ∷ NameBV -- For comparison of vertices subtrees, left/right, anmd root leaf inclusion + , displayData ∷ V.Vector CharacterData -- data for characters in block + -- left and right indices of child resolution Data for traceback and preliminary state assignment + , childResolutionIndices ∷ (Maybe Int, Maybe Int) + , resolutionCost ∷ VertexCost -- cost of creating the resolution + , displayCost ∷ VertexCost -- cost of that display subtree + } + deriving stock (Show, Eq) + + +instance NFData ResolutionData where rnf x = seq x () + + +-- | VertexInfo type -- vertex information for Decorated Graph +data VertexInfo = VertexInfo + { index ∷ Int -- For accessing + , bvLabel ∷ NameBV -- For comparison of vertices subtrees, left/right + , parents ∷ V.Vector Int -- indegree indices + , children ∷ V.Vector Int -- outdegree indices + , nodeType ∷ NodeType -- root, leaf, network, tree + , vertName ∷ NameText -- Text name of vertex either input or HTU# + , vertData ∷ VertexBlockData -- data as vector of blocks (each a vector of characters) + , vertexResolutionData ∷ V.Vector ResolutionBlockData -- soft-wired network component resolution information for Blocks + , vertexCost ∷ VertexCost -- local cost of vertex + , subGraphCost ∷ VertexCost -- cost of graph to leaves from the vertex + } + deriving stock (Generic, Show, Eq) + + +instance NFData VertexInfo where rnf x = seq x () + + +-- | type edge data, source and sink node indices are fst3 and snd3 fields. +data EdgeInfo = EdgeInfo + { minLength ∷ VertexCost + , maxLength ∷ VertexCost + , midRangeLength ∷ VertexCost + , edgeType ∷ EdgeType + } + deriving stock (Show, Eq, Ord) + + +instance NFData EdgeInfo where rnf x = seq x () + + +{- | DecortatedGraph is the canonical graph contining all final information +from preorder traversal trees +and post-order info usually from an initial root-based traversal +-} +type DecoratedGraph = LG.Gr VertexInfo EdgeInfo + + +{- | Type BLockDisplayTree is a Forest of tree components (indegree, outdegree) = (0,1|2),(1,2),(1,0) +these are "resolved" from more general graphs +will have to allow for indegre=outdegree=1 for dispaly tree generation and reconciliation +the vertData field will always have a single Bloclk--teh vecor of blocks will be a vector of +DecoratedGraphs. These woulod better have a single Vector of cChracter info as +opposed to the Decorated Tree type, but the record naming and use gets screwed up. +type BlockDisplayForest = LG.Gr VertexInfo EdgeInfo +-} + +{- | DecoratedGraph is a forest of tree compnents for a single character +this is used for non-exact character traversal trees +there will always be only a single block and single character even though +expresed as Vector fo Vector of Chaarcters. Would be better as a single character as +opposed to the Decorated Tree type, but the record naming and use gets screwed up. +type DecoratedGraph = LG.Gr VertexInfo EdgeInfo +-} + +-- | type RawGraph is input graphs with leaf and edge labels +type SimpleGraph = LG.Gr NameText VertexCost + + +{- | Type phylogentic Graph is a graph with +cost, optimality value, +block display trees, character traversal "foci" (could have multiple) +Data optimizations exist in Processed Data +Question of wheterh traversal foci shold be in Graph or Data section +for now in Graph but could easiily be moved to Processed data +May need "EdgeData" like VertexData for heuristics. UNclear if local scope during SPR/TBR will do it. + Fields: + 1) "Simple" graph with fileds useful for outputting graphs + 2) Graph optimality value or cost + 3) Decorated Graph with optimized vertex/Node data + 4) Vector of display trees for each data Block + root and vertex costs not updated in rerooting so cannot be trusted + Each block can have multiple display trees so extra list there + 5) Vector of traversal foci for each character (Vector of Blocks -> Vector of Characters, a single tree for each character) + vector is over blocks, then characters (could have have multiple for each character, but only single tracked here) + only important for dynamic (ie non-exact) characters whose costs depend on traversal focus + one graph per character + 6) Vector of Block Character Information (whihc is a Vector itself) required to properly optimize characters +-} +type PhylogeneticGraph = + ( SimpleGraph + , VertexCost + , DecoratedGraph + , V.Vector [DecoratedGraph] + , V.Vector (V.Vector DecoratedGraph) + , V.Vector (V.Vector CharInfo) + ) + + +-- | Type phylogentic Graph is a graph with general types +type GenPhyloGraph a b = + (SimpleGraph, VertexCost, DecoratedGraph, V.Vector [LG.Gr a b], V.Vector (V.Vector (LG.Gr a b)), V.Vector (V.Vector CharInfo)) + + +{- | Type ReducedPhylogenticGraph is a graph +that has most of the information of a PhylogeneticGraph but does not have repeated +decorations. The only lacking information is the traversal topologies of the charcter graphs (5th field of phylogenetic graph) +the display tree field (4th) does not have decorations but only toplogy infomatin in form of SimpleGraph +the purpose of the type is to remove the redundant decorations to 1/3 of what they are in PhylogeneticGraph + Fields: + 1) "Simple" graph with fileds useful for outputting graphs + 2) Graph optimality value or cost + 3) Decorated Graph with optimized vertex/Node data + 4) Vector of display trees for each data Block as Simple Graphs + Each block can have multiple display trees so extra list there + 5) Vector of Block Character Information (whihc is a Vector itself) required to properly optimize characters +-} +type ReducedPhylogeneticGraph = (SimpleGraph, VertexCost, DecoratedGraph, V.Vector [SimpleGraph], V.Vector (V.Vector CharInfo)) + + +{- | RawData type processed from input to be passed to characterData +to recode into usable form +the format is tuple of a list of taxon-data list tuples and charinfo list. +the data list and charinfo list must have the same length +-} +type RawData = ([TermData], [CharInfo]) + + +{- | Processed data is the basic data structire for analysis +can be input to functions +based on "blocks" that follow same display tree (soft-wired network) +each block has a set of characters (for each vertex eventually) and character info +the vector of T.Text are the names--leaves form input, internal HTU <> (show index) +ablock are initialy set bu input file, and can be later changed by "set(block,...)" +command +"Naive" "Optimized" and "Transformed" darta are this type after different processing steps +the first and second vectors are size number of leaves, teh third is number of blocks +-} +type ProcessedData = (V.Vector NameText, V.Vector NameBV, V.Vector BlockData) + + +{- | Block data is the basic data unit that is optimized on a display tree + Block data contain data fo all leaves and all characters in the block +it is row, ie vertex dominant +it has a bitvector name derived from leaf bitvector labels (union of children) +the bitvector name can vary among blocks (for non-leaves) due to alternate display trees +a vector of characterer data where assignments and costs reside, and a vector of character info +leaves will alwasy be first (indices 0..n-1) for simpler updating of data during graph optimization +NameText is the block label used for assignment and reporting output +Initially set to input filename of character + Fields: + 1) name of the block--intially taken from input filenames + 2) vector of vertex/leaf data with vector of character data for each leaf + 3) Vector of character information for characters in the block +-} +type BlockData = (NameText, V.Vector (V.Vector CharacterData), V.Vector CharInfo) + + +-- | type SAParams parameter structure for simulated alnnealing and Drifting +data SimulatedAnnealingMethod = SimAnneal | Drift + deriving stock (Read, Show, Eq) + + +-- | Simulated Annealing parameters +data SAParams = SAParams + { method ∷ SimulatedAnnealingMethod + , numberSteps ∷ Int + , currentStep ∷ Int + , rounds ∷ Int + , driftAcceptEqual ∷ Double + , driftAcceptWorse ∷ Double + , driftMaxChanges ∷ Int + , driftChanges ∷ Int + } + deriving stock (Show, Eq) + + +instance NFData SAParams where rnf x = seq x () + + +-- | SwapParam type for swap parameers +data SwapParams = SwapParams + { swapType ∷ SwapType -- NNI/SPR/TBR/Alternate + , joinType ∷ JoinType -- Union priuning on or off + , atRandom ∷ Bool -- randomized spluting and rejoining + , keepNum ∷ Int -- number equally costly solutoins to keep + , maxMoveEdgeDist ∷ Int -- maximum rejoin distance from initial mplacement + , steepest ∷ Bool -- steepest descent versus "all" + , joinAlternate ∷ Bool -- in alternate swapping for TBR + , doIA ∷ Bool -- use Implied alignment fields for rearragement costs + , returnMutated ∷ Bool -- return changed graphs for simlated annealing, genetic algorithm + } + deriving stock (Show, Eq) + + +instance NFData SwapParams where rnf x = seq x () + + +-- | empty structures for convenient use + +-- | emptyProcessedData empty processsed data dfor memory saving with large qualitative data sets(e.g. SNPS) +emptyProcessedData ∷ ProcessedData +emptyProcessedData = (V.empty, V.empty, V.empty) + + +-- | emptySearchData for use in getting basic procesin input data +emptySearchData ∷ SearchData +emptySearchData = + SearchData + { instruction = NotACommand + , arguments = [] + , minGraphCostIn = infinity + , maxGraphCostIn = infinity + , numGraphsIn = 0 + , minGraphCostOut = infinity + , maxGraphCostOut = infinity + , numGraphsOut = 0 + , commentString = [] + , duration = 0 + } + + +-- | emptyGlobalSettings for early use in parition charcter. Can't have full due to data dependency of outpogroup name +emptyGlobalSettings ∷ GlobalSettings +emptyGlobalSettings = + GlobalSettings + { outgroupIndex = 0 + , outGroupName = "NoOutgroupSet" + , optimalityCriterion = Parsimony + , graphType = Tree + , compressResolutions = False + , finalAssignment = DirectOptimization + , graphFactor = Wheeler2015Network + , rootCost = NoRootCost + , rootComplexity = 0.0 + , graphComplexityList = IL.repeat (0.0, 0.0) + , modelComplexity = 0.0 + , seed = 0 + , searchData = [] + , partitionCharacter = "#" + , numDataLeaves = 0 + , bc2 = (0.0, 1.0) + , bc4 = (0.0, 1.0) + , bc5 = (0.0, 1.0) + , bc8 = (0.0, 1.0) + , bc64 = (0.0, 1.0) + , bcgt64 = (0.0, 1.0) + , fractionDynamic = 1.0 + , dynamicEpsilon = 1.00 + , graphsSteepest = 10 + , softWiredMethod = ResolutionCache + , multiTraverseCharacters = True + , reportNaiveData = True + , unionThreshold = 1.17 + , defaultParStrat = RSeq + , lazyParStrat = RPar -- default parallel strategy + , strictParStrat = RDeepSeq -- high level--basically srtict evaluation + , useNetAddHeuristic = True + , useIA = True + , missingThreshold = 100 + } + + +{- | emptyPhylogeneticGraph specifies and empty phylogenetic graph +important cost is infinity for filtering operations +-} +emptyPhylogeneticGraph ∷ PhylogeneticGraph +emptyPhylogeneticGraph = (LG.empty, infinity, LG.empty, V.empty, V.empty, V.empty) + + +{- | emptyReducedPhylogeneticGraph specifies and empty phylogenetic graph +important cost is infinity for filtering operations +-} +emptyReducedPhylogeneticGraph ∷ ReducedPhylogeneticGraph +emptyReducedPhylogeneticGraph = (LG.empty, infinity, LG.empty, V.empty, V.empty) + + +-- | emptycharacter useful for intialization and missing data +emptyCharacter ∷ CharacterData +emptyCharacter = + CharacterData -- for non-additive + { stateBVPrelim = (mempty, mempty, mempty) -- preliminary for Non-additive chars, Sankoff Approx + , stateBVFinal = mempty + , stateBVUnion = mempty + , -- for Additive + rangePrelim = (mempty, mempty, mempty) + , rangeFinal = mempty + , rangeUnion = mempty + , -- for multiple Sankoff/Matrix with sme tcm + matrixStatesPrelim = mempty + , matrixStatesFinal = mempty + , matrixStatesUnion = mempty + , -- preliminary for m,ultiple seqeunce cahrs with same TCM + slimPrelim = mempty + , -- gapped medians of left, right, and preliminary used in preorder pass + slimGapped = (mempty, mempty, mempty) + , slimAlignment = (mempty, mempty, mempty) + , slimFinal = mempty + , slimIAPrelim = (mempty, mempty, mempty) + , slimIAFinal = mempty + , slimIAUnion = mempty + , -- gapped median of left, right, and preliminary used in preorder pass + widePrelim = mempty + , -- gapped median of left, right, and preliminary used in preorder pass + wideGapped = (mempty, mempty, mempty) + , wideAlignment = (mempty, mempty, mempty) + , wideFinal = mempty + , wideIAPrelim = (mempty, mempty, mempty) + , wideIAFinal = mempty + , wideIAUnion = mempty + , -- vector of individual character costs (Can be used in reweighting-ratchet) + hugePrelim = mempty + , -- gapped mediasn of left, right, and preliminary used in preorder pass + hugeGapped = (mempty, mempty, mempty) + , hugeAlignment = (mempty, mempty, mempty) + , hugeFinal = mempty + , hugeIAPrelim = (mempty, mempty, mempty) + , hugeIAFinal = mempty + , hugeIAUnion = mempty + , -- vectors for pre-aligned sequences also used in static approx + alignedSlimPrelim = (mempty, mempty, mempty) + , alignedSlimFinal = mempty + , alignedSlimUnion = mempty + , alignedWidePrelim = (mempty, mempty, mempty) + , alignedWideFinal = mempty + , alignedWideUnion = mempty + , alignedHugePrelim = (mempty, mempty, mempty) + , alignedHugeFinal = mempty + , alignedHugeUnion = mempty + , packedNonAddPrelim = (mempty, mempty, mempty) + , packedNonAddFinal = mempty + , packedNonAddUnion = mempty + , -- vector of individual character costs (Can be used in reweighting-ratchet) + localCostVect = V.singleton 0 + , -- weight * V.sum localCostVect + localCost = 0 + , -- unclear if need vector version + globalCost = 0 + } + + +-- | emptyVertex useful for graph rearrangements +emptyVertexInfo ∷ VertexInfo +emptyVertexInfo = + VertexInfo + { index = -1 + , bvLabel = BV.fromBits [False] + , parents = mempty + , children = mempty + , nodeType = TreeNode -- root, leaf, network, tree + , vertName = "EmptyVertex" + , vertData = mempty + , vertexResolutionData = mempty + , vertexCost = 0.0 + , subGraphCost = 0.0 + } + + +-- | usefule in some cases +dummyNode ∷ LG.LNode VertexInfo +dummyNode = (-1, emptyVertexInfo) + + +-- | dummyEdge for convenience +dummyEdge ∷ EdgeInfo +dummyEdge = + EdgeInfo + { minLength = 0 + , maxLength = 0 + , midRangeLength = 0 + , edgeType = TreeEdge + } + + +-- emptyCharInfo for convenience +emptyCharInfo ∷ (MonadIO m) ⇒ m CharInfo +emptyCharInfo = + let minimalMatrix ∷ [[Int]] + minimalMatrix = [[0, 1], [1, 0]] + (_, tcm) = TCM.fromRows minimalMatrix + sTCM = TCMD.generateDenseTransitionCostMatrix 2 2 . S.getCost $ V.fromList <$> V.fromList minimalMatrix + in do + wTCM ← MR.metricRepresentation tcm + hTCM ← MR.metricRepresentation tcm + pure + CharInfo + { name = "EmptyCharName" + , charType = NonAdd + , activity = True + , weight = 1.0 + , costMatrix = S.empty + , slimTCM = sTCM + , wideTCM = wTCM + , hugeTCM = hTCM + , changeCost = 1.0 + , noChangeCost = 0.0 + , alphabet = fromSymbols $ "0" :| ["1"] + , prealigned = False + , origInfo = V.empty + } diff --git a/src/Utilities/DistanceUtilities.hs b/src/Utilities/DistanceUtilities.hs new file mode 100644 index 000000000..6402595c8 --- /dev/null +++ b/src/Utilities/DistanceUtilities.hs @@ -0,0 +1,631 @@ +{- | +Module : Utilities.hs +Description : Module with useful functionsfor distance tree construction methods dWag, Neightbor-Joining, UPGMA, and WPGMA + -- but with added refinement based on 4-point metric +Copyright : (c) 2020 Ward C. Wheeler, Division of Invertebrate Zoology, AMNH. All rights reserved. +License : + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. + +Maintainer : Ward Wheeler +Stability : unstable +Portability : portable (I hope) +-} +module Utilities.DistanceUtilities where + +import Control.Monad.IO.Class (MonadIO (..)) +import Data.Graph.Inductive.Graph qualified as G +import Data.Graph.Inductive.PatriciaTree qualified as P +import Data.List qualified as L +import Data.Maybe +import Data.Number.Transfinite qualified as NT +import Data.Set qualified as Set +import Data.Text.Lazy qualified as T +import Data.Vector qualified as LS +import Data.Vector qualified as V +import GeneralUtilities +import GraphFormatUtilities qualified as PP +import PHANE.Evaluation +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as M +import System.IO.Unsafe +import System.Random qualified as Rand +import System.Random.Shuffle qualified as RandS +import Types.DistanceTypes +import Types.Types + + +{- | localRoundtakes a double multiplies by 10^precisoin, rounds to integer then divides +by precision +-} +localRound ∷ Double → Int → Double +localRound val places = + let factor = 10.0 ^^ places + newVal = val * factor + roundedVal = round newVal ∷ Int + in fromIntegral roundedVal / factor + + +{- | showDouble integer number of string postion )including sign and decimal) and returns string +of that length--basically truncation +-} +showDouble ∷ Int → Double → String +showDouble places val = show $ localRound val places -- reverse $ dropWhile (== '0') $ reverse $ take places $ show val + + +-- | test for equality with epsilon +withinEpsilon ∷ Double → Double → Bool +withinEpsilon a b = a == b + + +{- +if abs (a - b) < epsilon then True +else False +-} + +{- | vertex2FGLNode take vertex of Int and a Vector of Strings (leaf names) and returns +fgl node with type T.Text +-} +vertex2FGLNode ∷ V.Vector String → Vertex → (Int, T.Text) +vertex2FGLNode leafVect vertIndex = + if vertIndex < V.length leafVect + then (vertIndex, T.pack (leafVect V.! vertIndex)) + else + let vertexName = "HTU" <> show (vertIndex - V.length leafVect) + in (vertIndex, T.pack vertexName) + + +{- | treeFGL take a Treee type and converts to an fgl graph +to be used with PhylParsers module hence Text +-} +tree2FGL ∷ Tree → V.Vector String → P.Gr T.Text Double +tree2FGL inTree@(inVertexVect, inEdgeVect) leafNameVect = + if inTree == emptyTree + then error "Empty tree in tree2FGL" + else + let fglNodes = V.map (vertex2FGLNode leafNameVect) inVertexVect + in -- trace ("tree2FGL orig, vertices, edges = " <> (show $ length inVertexVect ) <> " " <> (show $ length fglNodes ) <> " " <> (show $ length inEdgeVect )) + G.mkGraph (V.toList fglNodes) (V.toList inEdgeVect) + + +-- reIndexEdges take a list of vertices as integer indices and an edges (Int, Int, Double) +-- and retuns a new vertex with the indives of the vertex labels +reIndexEdges ∷ [Int] → Edge → Edge +reIndexEdges inVertexList (e, u, w) = + if null inVertexList + then error "Null inVertexList in reIndexEdges" + else + let newE = L.elemIndex e inVertexList + newU = L.elemIndex u inVertexList + in -- un safe not testing but list is created from edges first + (fromJust newE, fromJust newU, w) + + +{- | makeVertexNames takes vertgex indices and returns leaf name if < nOTUs and "HTU" <> show Index +if not +-} +makeVertexNames ∷ [Vertex] → Int → V.Vector String → Bool → [String] +makeVertexNames vertList nOTUs leafNames nameHTUs = + if null vertList + then [] + else + let firstVert = head vertList + in if firstVert < nOTUs + then (leafNames V.! firstVert) : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs + else + if nameHTUs + then ("HTU" <> show firstVert) : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs + else "" : makeVertexNames (tail vertList) nOTUs leafNames nameHTUs + + +-- | directSingleEdge takes an Int and makes that 'e' and otehr vertex as 'u' in edge (e->u) +directSingleEdge ∷ Int → Edge → Edge +directSingleEdge index (a, b, w) + | a == index = (a, b, w) + | b == index = (b, a, w) + | otherwise = error ("Index " <> show index <> " doesn't match edge " <> show (a, b, w) <> " in directSingleEdge") + + +-- | getChildEdges returns the two edges that are childre of a vertex +getChildEdges ∷ Int → Int → V.Vector Edge → V.Vector Edge +getChildEdges vertIndex nLeaves inEdgeVect + | V.null inEdgeVect = V.empty + | vertIndex < nLeaves = error ("Looking for child of leaf " <> show (vertIndex, nLeaves)) + | otherwise = + let (a, b, w) = V.head inEdgeVect + in if (a == vertIndex) || (b == vertIndex) + then V.cons (a, b, w) (getChildEdges vertIndex nLeaves (V.tail inEdgeVect)) + else getChildEdges vertIndex nLeaves (V.tail inEdgeVect) + + +-- | directexEdges takes a vector of edges and outgrop index and directs the edges (parent -> child vertices) based on that +directEdges ∷ Int → Int → Bool → V.Vector Edge → V.Vector Edge +directEdges vertIndex nLeaves isFirst inEdgeVect + | V.null inEdgeVect = V.empty + | isFirst -- to find out group edge order larger to smaller will have outgroup index second + = + let outgroupEdge = getEdgeRoot vertIndex inEdgeVect + remainingEdgeVect = subtractVector (V.singleton outgroupEdge) inEdgeVect + (a, b, w) = orderEdge outgroupEdge + in V.cons (a, b, w) (directEdges a nLeaves False remainingEdgeVect) + | vertIndex < nLeaves = V.empty + | otherwise -- not outgroup but regular node, get two child edges + = + let descdendantEdges = getChildEdges vertIndex nLeaves inEdgeVect + remainingEdgeVect = subtractVector descdendantEdges inEdgeVect + newDescEdges = V.map (directSingleEdge vertIndex) descdendantEdges + in if V.length newDescEdges /= 2 + then + error + ( "There should be 2 child edges for index " + <> show vertIndex + <> " and there are(is) " + <> show (V.length newDescEdges) + <> " " + <> show newDescEdges + ) + else + let (_, bf, _) = V.head newDescEdges + (_, bs, _) = V.last newDescEdges + firstSubEdges = directEdges bf nLeaves False remainingEdgeVect + remainingEdgeVect' = subtractVector firstSubEdges remainingEdgeVect + secondSubEdges = directEdges bs nLeaves False remainingEdgeVect' + in (newDescEdges <> (firstSubEdges <> secondSubEdges)) + + +-- | convertToGraph takes Vertex of names and a tree and return inductive Graph format +convertToDirectedGraph ∷ V.Vector String → Int → Tree → P.Gr String Double +convertToDirectedGraph leafList outgroupIndex inTree = + let (_, edgeVect) = inTree + nOTUs = length leafList + -- should be stright 0->n-1 but in case some vertex number if missing + vertexList = L.sort $ Set.toList $ getVertexSet edgeVect + vertexNames = makeVertexNames vertexList nOTUs leafList True + labelledVertexList = L.zip vertexList vertexNames + edgeList = V.toList $ directEdges outgroupIndex nOTUs True edgeVect + in G.mkGraph labelledVertexList edgeList + + +-- | convertToGraphText takes Vertex of names and a tree and return inductive Graph format +convertToDirectedGraphText ∷ V.Vector String → Int → Tree → P.Gr T.Text Double +convertToDirectedGraphText leafList outgroupIndex inTree = + let (_, edgeVect) = inTree + nOTUs = length leafList + -- should be stright 0->n-1 but in case some vertex number if missing + vertexList = L.sort $ Set.toList $ getVertexSet edgeVect + vertexNames = makeVertexNames vertexList nOTUs leafList False + labelledVertexList = L.zip vertexList (fmap T.pack vertexNames) + edgeList = V.toList $ directEdges outgroupIndex nOTUs True edgeVect + in G.mkGraph labelledVertexList edgeList + + +{- | convertToNewick generates a newick file by converting Tree type to FGL Graph, +adds a root and two new edges (deleting root edge) +and calls convert function from PhyloParsers +-} +convertToNewick ∷ V.Vector String → Int → Tree → String +convertToNewick leafNames outGroup inTree + | inTree == emptyTree = error "Empty tree in convertToNewick" + | V.null leafNames = error "Empty leaf names in convertToNewick" + | otherwise = + let fglTree = convertToDirectedGraphText leafNames outGroup inTree + in -- trace ("ConverttoNewick in-vertices in-edges" <> (show $ length inVertexVect ) <> " " <> (show $ V.toList inVertexVect ) <> "\n" <> (show $ length vertexVect ) <> " " <> (show vertexVect ) <> "\n" <> (show $ length edgeVect ) <> " " <> show edgeVect <> "\n" <> show fglTree) + PP.fglList2ForestEnhancedNewickString [fglTree] True False + + +-- | getEdgeRootIndex takes edge Vect, Index, and determines edges from root +getEdgeRootIndex ∷ Int → Int → V.Vector Edge → (Int, Edge) +getEdgeRootIndex edgeIndex outgroup edgeVect = + if V.null edgeVect + then error "Root edge not found" + else + let (eVect, uVect, _) = V.head edgeVect + in if (eVect == outgroup) || (uVect == outgroup) + then (edgeIndex, V.head edgeVect) + else getEdgeRootIndex (edgeIndex + 1) outgroup (V.tail edgeVect) + + +-- | convertToNewick wrapper to remove double commas +convertToNewick' ∷ V.Vector String → Int → Tree → String +convertToNewick' leafNames outGroup wagTree = removeCrap $ convertToNewickGuts leafNames outGroup wagTree + + +{- | removeDoubleCommas removes second of double comas ",," -> "," +this a hack to fix problem in convertToNewick +-} +removeCrap ∷ String → String +removeCrap inString = + if length inString == 1 + then inString + else + let firstChar = head inString + secondChar = inString !! 1 + in if firstChar == ',' && secondChar == ',' + then ',' : removeCrap (drop 2 inString) + else + if firstChar == ',' && secondChar == ')' + then ')' : removeCrap (drop 2 inString) + else firstChar : removeCrap (tail inString) + + +{- | convertToNewick converts Tree rep to Newick String +includes edge cost--splits root edge cost into halves to help +tree viewers like FigTree (so input is unrooted) +NEED TO ADD smaller group left larger group right for more legible trees +-} +convertToNewickGuts ∷ V.Vector String → Int → Tree → String +convertToNewickGuts leafNames outGroup wagTree = + let (inLeaves, inEdges) = wagTree + newEdges = fmap orderEdge inEdges + (_, edgeVect) = orderTree (inLeaves, newEdges) + foundEdge = getEdgeRoot outGroup edgeVect + in let (firstVert, secondVert, weight) = foundEdge + remainderEdges = V.filter (/= foundEdge) edgeVect + in -- this is embarassing bullshit -- converting ",," to "," + if firstVert == outGroup + then + "(" + <> (leafNames V.! outGroup) + <> ":" + <> showDouble 8 (weight / 2.0) + <> "," + <> getEdgesNonRoot secondVert remainderEdges (V.length leafNames) leafNames + <> ":" + <> showDouble 8 (weight / 2.0) + <> ")" + else + "(" + <> (leafNames V.! outGroup) + <> ":" + <> showDouble 8 (weight / 2.0) + <> "," + <> getEdgesNonRoot firstVert remainderEdges (V.length leafNames) leafNames + <> ":" + <> showDouble 8 (weight / 2.0) + <> ")" + + +-- | orderEdge takes an Edge and puts high index first then lower +orderEdge ∷ Edge → Edge +orderEdge (a, b, w) = + if a > b + then (a, b, w) + else (b, a, w) + + +-- | orderTree puts Tree edges in order based on edges +orderTree ∷ Tree → Tree +orderTree (leaves, edges) = + let edgeList = L.sort $ V.toList edges + in (leaves, V.fromList edgeList) + + +-- | getEdges takes root Index and determines edges from root +getEdgeRoot ∷ Int → V.Vector Edge → Edge +getEdgeRoot edgeIndex edgeVect = + if V.null edgeVect + then (-1, -1, -1.0) + else + let (eVect, uVect, _) = V.head edgeVect + in if (eVect == edgeIndex) || (uVect == edgeIndex) then V.head edgeVect else getEdgeRoot edgeIndex (V.tail edgeVect) + + +{- | getEdgesNonRoot takes root Index and determines edges from root returns String of Taxa +alwasy getting ordered edges and ordered tree +so eVect > uVect always; eVect can never be < nOTUs +Need to add smaller tree left, bigger right +-} +getEdgesNonRoot ∷ Int → V.Vector Edge → Int → V.Vector String → String +getEdgesNonRoot edgeIndex edgeVect nOTUs leafNames = + -- trace (show edgeIndex) ( + let terminal = (-1, -1, -1.0) + in if V.null edgeVect + then "END" + else + let thisEdge = getEdgeRoot edgeIndex edgeVect + in if thisEdge == terminal + then "ERROR" + else + let (eVect, uVect, weight) = thisEdge + remainderEdges = V.filter (/= thisEdge) edgeVect + eDesc = getEdgeRoot eVect remainderEdges + uDesc = getEdgeRoot uVect remainderEdges + eSubTree = getEdgesNonRoot eVect remainderEdges nOTUs leafNames + uSubTree = getEdgesNonRoot uVect remainderEdges nOTUs leafNames + in if V.null remainderEdges + then (leafNames V.! uVect) <> ":" <> showDouble precision weight <> "," + else + if eVect == edgeIndex + then + if eVect < nOTUs + then + if uDesc /= terminal + then "(" <> (leafNames V.! eVect) <> ":" <> showDouble precision weight <> "," <> uSubTree <> ")" + else (leafNames V.! eVect) <> ":" <> showDouble precision weight <> "," + else + if uVect < nOTUs + then + if eDesc /= terminal + then "(" <> (leafNames V.! uVect) <> ":" <> showDouble precision weight <> "," <> eSubTree <> ")" + else (leafNames V.! uVect) <> ":" <> showDouble precision weight <> "," + else + if (eDesc /= terminal) && (uDesc == terminal) + then eSubTree <> ":" <> showDouble precision weight <> "," + else + if (eDesc == terminal) && (uDesc /= terminal) + then uSubTree <> ":" <> showDouble precision weight <> "," + else + if length eSubTree < length uSubTree + then "(" <> eSubTree <> "," <> uSubTree <> ":" <> showDouble precision weight <> ")" + else "(" <> uSubTree <> "," <> eSubTree <> ":" <> showDouble precision weight <> ")" + else + if uVect == edgeIndex + then + if uVect < nOTUs + then + if eDesc /= terminal + then "(" <> (leafNames V.! uVect) <> ":" <> showDouble precision weight <> "," <> eSubTree <> ")" + else (leafNames V.! uVect) <> ":" <> showDouble precision weight <> "," + else + if eVect < nOTUs + then + if uDesc /= terminal + then "(" <> (leafNames V.! eVect) <> ":" <> showDouble precision weight <> "," <> uSubTree <> ")" + else (leafNames V.! eVect) <> ":" <> showDouble precision weight <> "," + else + if (eDesc /= terminal) && (uDesc == terminal) + then eSubTree <> ":" <> showDouble precision weight <> "," + else + if (eDesc == terminal) && (uDesc /= terminal) + then uSubTree <> ":" <> showDouble precision weight <> "," + else + if length eSubTree < length uSubTree + then "(" <> eSubTree <> "," <> uSubTree <> ":" <> showDouble precision weight <> ")" + else "(" <> uSubTree <> ":" <> showDouble precision weight <> "," <> eSubTree <> ")" + else getEdgesNonRoot edgeIndex remainderEdges nOTUs leafNames <> ":" <> showDouble precision weight <> "," + + +-- getBestTrees takes newick and L.sorts on comment at end with cost +getBestTrees ∷ String → Int → [TreeWithData] → Double → [TreeWithData] → [TreeWithData] +getBestTrees keepMethod number inList curBestCost curBestTrees = + if null inList + then -- apply keep method if not keeping all + + if length curBestTrees <= number + then curBestTrees + else + if keepMethod == "first" + then take number curBestTrees + else + if keepMethod == "last" + then reverse $ take number $ reverse curBestTrees + else + if keepMethod == "random" + then -- not Fitch but shuffles elements and takes first n + + let randIntList = betterRandomList (length curBestTrees) (length curBestTrees - 1) + newList = RandS.shuffle curBestTrees randIntList + in take number newList + else error ("Keep method " <> keepMethod <> " not implemented") + else + let firstTree = head inList + (_, _, firstCost, _) = firstTree + in if firstCost < curBestCost + then getBestTrees keepMethod number (tail inList) firstCost [firstTree] + else + if withinEpsilon firstCost curBestCost + then getBestTrees keepMethod number (tail inList) firstCost (firstTree : curBestTrees) + else getBestTrees keepMethod number (tail inList) curBestCost curBestTrees + + +{- | getUniqueTrees saves uniqe newick trees +different paths--ehnce different distMatrices coud result in same newick tree +-} +getUniqueTrees ∷ [TreeWithData] → [TreeWithData] → [TreeWithData] +getUniqueTrees inList uniqueList = + if null inList + then uniqueList + else + let firstTree = head inList + fstNewick = fst4 firstTree + in if fstNewick `notElem` fmap fst4 uniqueList + then getUniqueTrees (tail inList) (firstTree : uniqueList) + else getUniqueTrees (tail inList) uniqueList + + +{- | keepTrees filters newick trees based on options +all keep all +best shortest (and unique) allows number of max to save +unique unique representations irespective of length +keep metyhod for save first | last | atRandom if buffer full +-} +keepTrees ∷ [TreeWithData] → String → String → Double → [TreeWithData] +keepTrees inList saveMethod keepMethod curBestCost + | null inList = [] + | saveMethod == "all" = inList + | take 6 saveMethod == "unique" = + if length saveMethod == 6 + then getUniqueTrees inList [] + else + let number = read (drop 7 saveMethod) ∷ Int + in take number $ L.sortOn thd4 $ getUniqueTrees inList [] + | take 4 saveMethod == "best" = + if length saveMethod == 4 + then getUniqueTrees (getBestTrees keepMethod (maxBound ∷ Int) inList curBestCost []) [] + else + if (saveMethod !! 4) == ':' + then + let number = read (drop 5 saveMethod) ∷ Int + saveTrees = take number $ L.sortOn thd4 $ getUniqueTrees inList [] + (_, _, bestCost, _) = head saveTrees + in getBestTrees keepMethod number saveTrees bestCost [] + else error ("Save method " <> saveMethod <> " improperly formatted") + | otherwise = error ("Save method " <> saveMethod <> " not implemented") + + +-- \| ranList generates random list of positive integers +ranList ∷ Rand.StdGen → Int → Int → [Int] +ranList sg n maxValue = take n $ Rand.randomRs (0, maxValue) sg + + +-- | driver function to generate list of positive integers +{-# NOINLINE betterRandomList #-} +betterRandomList ∷ Int → Int → [Int] +betterRandomList n maxValue = unsafePerformIO $ do + sg ← Rand.getStdGen + return $ ranList sg n maxValue + + +{- | Subtrace vector subtracts elements of vector a from vector b +is thins n^2 ? +edges are directed +-} +subtractVector ∷ (Eq a) ⇒ V.Vector a → V.Vector a → V.Vector a +subtractVector a b + | V.null a = b + | V.null b = V.empty + | otherwise = + let firstB = V.head b + notFound = V.notElem firstB a + in if notFound + then V.cons firstB (subtractVector a (V.tail b)) + else subtractVector a (V.tail b) + + +-- | getVertexSet take a vector of edges and creates the set of vertex numbers +getVertexSet ∷ V.Vector Edge → Set.Set Vertex +getVertexSet edgeVect = + if V.null edgeVect + then Set.empty + else + let (a, b, _) = V.head edgeVect + thisSet = Set.fromList [a, b] + in Set.union thisSet (getVertexSet $ V.tail edgeVect) + + +-- | getMinRowDistMatrix distMatrix tabuList +getMinRowDistMatrix ∷ M.Matrix Double → [Int] → (Int, Double) → Int → Int → (Int, Int, Double) +getMinRowDistMatrix distMatrix tabuList minPair@(minCol, minVal) curColumn row + | curColumn == LS.length (distMatrix LS.! row) = (row, minCol, minVal) + | row `elem` tabuList = (-1, -1, NT.infinity) + | curColumn == row = getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row + | curColumn `elem` tabuList = getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row + | otherwise = + let firstVal = distMatrix M.! (row, curColumn) + in if firstVal < minVal + then getMinRowDistMatrix distMatrix tabuList (curColumn, firstVal) (curColumn + 1) row + else getMinRowDistMatrix distMatrix tabuList minPair (curColumn + 1) row + + +-- | compareTriples take two triples and orders them based on the smaller thrid element +minTriples ∷ (Ord c) ⇒ (a, b, c) → (a, b, c) → Ordering +minTriples (_, __, c) (_, _, d) + | c < d = LT + | c > d = GT + | otherwise = EQ + + +{- | getMatrixMinPairTabu takes distMatrix initial integer pair and value +traverses the matrix (skipping rows and columns in tabuList and return minimum distance and index pair +if tie takes first +gets minimum by row and parallelizes ovcer rows +call with (-1, -1, NT.infinity) 0 0 +-} +getMatrixMinPairTabu ∷ M.Matrix Double → [Int] → PhyG (Int, Int, Double) +getMatrixMinPairTabu distMatrix tabuList = + if M.null distMatrix + then error "Empty matrix in getMatrixPairTabu" + else + let -- minValueList = seqParMap myStrategy (getMinRowDistMatrix distMatrix tabuList (-1, NT.infinity) 0) [0..(M.rows distMatrix - 1)] + minRowAction ∷ Int → (Int, Int, Double) + minRowAction = getMinRowDistMatrix distMatrix tabuList (-1, NT.infinity) 0 + in do + minRowPar ← getParallelChunkMap + let minValueList = minRowPar minRowAction [0 .. (M.rows distMatrix - 1)] + pure $ L.minimumBy minTriples minValueList + + +{- +-- | getMatrixMinPairTabu takes distMatrix initial integer pair and value +-- traverses the matrix (skippiong rows and columns in tabuList and return minimum distance and index pair +-- if tie takes first + -- call with (-1, -1, NT.infinity) 0 0 +getMatrixMinPairTabu' :: M.Matrix Double -> [Int] -> (Int, Int, Double) -> Int -> Int -> (Int, Int, Double) +getMatrixMinPairTabu' distMatrix tabuList curBest curRow curColumn + | curRow == M.rows distMatrix = curBest + | curColumn == M.cols distMatrix = getMatrixMinPairTabu' distMatrix tabuList curBest (curRow + 1) 0 + | curColumn == curRow = getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) + | (curColumn `elem` tabuList) || (curRow `elem` tabuList) = getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) + | otherwise = + let (_, _, currentBestDistance) = curBest + in + if distMatrix M.! (curRow, curColumn) < currentBestDistance then + getMatrixMinPairTabu' distMatrix tabuList (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) + else getMatrixMinPairTabu' distMatrix tabuList curBest curRow (curColumn + 1) +-} + +{- | getMatrixMinPair takes distMatrix initla pinteger pair and value +traverses teh matrix and return minimum distance and index pair +if tie takes first +call with (-1, -1, NT.infinity) 0 0 +-} +getMatrixMinPair ∷ M.Matrix Double → (Int, Int, Double) → Int → Int → (Int, Int, Double) +getMatrixMinPair distMatrix curBest curRow curColumn + | curRow == M.rows distMatrix = curBest + | curColumn == M.cols distMatrix = getMatrixMinPair distMatrix curBest (curRow + 1) 0 + | curColumn == curRow = getMatrixMinPair distMatrix curBest curRow (curColumn + 1) + | otherwise = + let (_, _, currentBestDistance) = curBest + in if distMatrix M.! (curRow, curColumn) < currentBestDistance + then getMatrixMinPair distMatrix (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) + else getMatrixMinPair distMatrix curBest curRow (curColumn + 1) + + +{- | getMatrixMaxPair takes distMatrix initla pinteger pair and value +traverses teh matrix and return maximum distance and index pair +if tie takes first +call with (-1 , -1, 0 :: Double) 0 0 +-} +getMatrixMaxPair ∷ M.Matrix Double → (Int, Int, Double) → Int → Int → (Int, Int, Double) +getMatrixMaxPair distMatrix curBest curRow curColumn + | curRow == M.rows distMatrix = curBest + | curColumn == M.cols distMatrix = getMatrixMaxPair distMatrix curBest (curRow + 1) 0 + | curColumn == curRow = getMatrixMaxPair distMatrix curBest curRow (curColumn + 1) + | otherwise = + let (_, _, currentBestDistance) = curBest + in if distMatrix M.! (curRow, curColumn) > currentBestDistance + then getMatrixMaxPair distMatrix (curRow, curColumn, distMatrix M.! (curRow, curColumn)) curRow (curColumn + 1) + else getMatrixMaxPair distMatrix curBest curRow (curColumn + 1) + + +-- | getTreeCost takes Tree and returns cost based on sum of edge weights +getTreeCost ∷ Tree → Double +getTreeCost inTree = + V.sum $ V.map getEdgeCost $ snd inTree + + +-- | getEdgeCost returns weight form edge tuple +getEdgeCost ∷ (Vertex, Vertex, Weight) → Double +getEdgeCost (_, _, edgeWeight) = edgeWeight diff --git a/src/Utilities/Distances.hs b/src/Utilities/Distances.hs new file mode 100644 index 000000000..401ba2942 --- /dev/null +++ b/src/Utilities/Distances.hs @@ -0,0 +1,117 @@ +{- | +Module specifying data types +-} +module Utilities.Distances ( + getPairwiseDistances, + getBlockDistance, + getPairwiseBlockDistance, +) where + +import Data.List qualified as L +import Data.Vector qualified as V +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import PHANE.Evaluation +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import SymMatrix qualified as S +import Types.Types +import Utilities.Utilities qualified as U + + +{- | +getPairwiseDistances takes Processed data +and retuns a matrix (list of lists of Double) of pairwise +distances among vertices in data set over blocks ans all character types +sums over blocks +-} +getPairwiseDistances ∷ ProcessedData → PhyG [[VertexCost]] +getPairwiseDistances (nameVect, _, blockDataVect) + | V.null nameVect = error "Null name vector in getPairwiseDistances" + | V.null blockDataVect = error "Null Block Data vector in getPairwiseDistances" + | otherwise = + let -- get maximum observations and pairwise max observations to normalize distances + pairList = makeIndexPairs True (V.length nameVect) (V.length nameVect) 0 0 + -- pairListCosts = fmap (U.getPairwiseObservations blockDataVect) pairList `using` P.myParListChunkRDS + -- TODO + -- pairListCosts = P.seqParMap rdeepseq (U.getPairwiseObservations blockDataVect) pairList + -- pairListCosts = fmap (U.getPairwiseObservations blockDataVect) pairList + + -- parallel setup + pairwiseAction ∷ (Int, Int) → VertexCost + pairwiseAction = U.getPairwiseObservations blockDataVect + + blockAction ∷ BlockData → PhyG (S.Matrix VertexCost) + blockAction = getPairwiseBlockDistance (V.length nameVect) + in do + maxDistance ← U.getMaxNumberObservations blockDataVect + + pTraverse ← getParallelChunkMap + let pairListCosts = pTraverse pairwiseAction pairList + + let normFactorList = fmap (maxDistance /) $ fmap (max 1.0) pairListCosts + let initialFactorMatrix = S.fromLists $ replicate (V.length nameVect) $ replicate (V.length nameVect) 0.0 + let (iLst, jList) = unzip pairList + let threeList = zip3 iLst jList normFactorList + let factorMatrix = S.updateMatrix initialFactorMatrix threeList + + -- get pairwise distances + + -- let blockDistancesList = V.toList $ V.map (getPairwiseBlockDistance (V.length nameVect)) blockDataVect + -- blockDistancesList' <- mapM (getPairwiseBlockDistance (V.length nameVect)) blockDataVect + blockTraverse ← getParallelChunkTraverse + blockDistancesList ← blockTraverse blockAction (V.toList blockDataVect) + + -- let blockDistancesList = V.toList blockDistancesList' + + let summedBlock = L.foldl1' (S.zipWith (+)) blockDistancesList + + -- rescaled pairwsie distances + let rescaledDistanceMatrix = S.zipWith (*) factorMatrix summedBlock + + -- trace ("Factor:" <> show maxDistance <> " : " <> (show normFactorList)) + logWith LogInfo ("\tGenerating pairwise distances for " <> show (V.length blockDataVect) <> " character blocks\n") + pure $ S.toFullLists rescaledDistanceMatrix -- summedBlock + + +{- | getBlockDistance takes Block data and returns distance between +vertices based on block data +this can be done for leaves only or all via the input processed +data leaves are first--then HTUs follow +No change adjust is False since this is a distance +-} +getBlockDistance ∷ BlockData → (Int, Int) → PhyG VertexCost +getBlockDistance (_, localVertData, blockCharInfo) (firstIndex, secondIndex) = + if V.null localVertData + then pure 0.0 + else + let isMedian = False + in do + pairCostPairList <- M.median2M isMedian (localVertData V.! firstIndex) (localVertData V.! secondIndex) blockCharInfo + let pairCost = sum $ fmap snd pairCostPairList + pure pairCost + +{- | getPairwiseBlocDistance returns pairwisee distances among vertices for +a block of data +this can be done for ;leaves only or all via the input processed +data leaves are first--then HTUs follow +-} +getPairwiseBlockDistance ∷ Int → BlockData → PhyG (S.Matrix VertexCost) +getPairwiseBlockDistance numVerts inData = + let pairList = makeIndexPairs True numVerts numVerts 0 0 + initialPairMatrix = S.fromLists $ replicate numVerts $ replicate numVerts 0.0 + -- pairListCosts = fmap (getBlockDistance inData) pairList `using` P.myParListChunkRDS + -- pairListCosts = P.seqParMap rdeepseq (getBlockDistance inData) pairList + -- TODO + -- pairListCosts = fmap (getBlockDistance inData) pairList + action ∷ (Int, Int) → PhyG VertexCost + action = getBlockDistance inData + in do + pTraverse ← getParallelChunkTraverse + pairListCosts <- pTraverse action pairList + + let (iLst, jList) = unzip pairList + let threeList = zip3 iLst jList pairListCosts + let newMatrix = S.updateMatrix initialPairMatrix threeList + + -- trace ("NM:\n" <> (show threeList) <> "\n" <>(show $ S.toFullLists newMatrix)) + pure newMatrix diff --git a/src/Utilities/LocalGraph.hs b/src/Utilities/LocalGraph.hs new file mode 100644 index 000000000..da1a1bb03 --- /dev/null +++ b/src/Utilities/LocalGraph.hs @@ -0,0 +1,2416 @@ +{-# LANGUAGE ScopedTypeVariables #-} + +{- | +Module specifying graph types and functionality. +This is for indirection so can change underlying graph library without polutting the rest of the code. +-} +module Utilities.LocalGraph where + +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Control.Parallel.Strategies +import Cyclic qualified as C +import Data.Graph.Inductive.Basic qualified as B +import Data.Graph.Inductive.Graph qualified as G +import Data.Graph.Inductive.PatriciaTree qualified as P +import Data.Graph.Inductive.Query.ArtPoint qualified as AP +import Data.Graph.Inductive.Query.BCC qualified as BCC +import Data.Graph.Inductive.Query.BFS qualified as BFS +import Data.Graph.Inductive.Query.DFS qualified as DFS +import Data.GraphViz as GV +import Data.GraphViz.Commands.IO as GVIO +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.List.NonEmpty qualified as NE +import Data.Map qualified as MAP +import Data.Maybe +import Data.Text.Lazy qualified as T +import Data.Vector qualified as V +import GeneralUtilities +import GraphFormatUtilities qualified as GFU +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (Computing)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import System.IO + + +-- import ParallelUtilities qualified as PU +-- import Debug.Trace + +{- | to avoid circular dependency with Types.hs +Core monad transformer stack for evaluating computations within the application PhyG. +-} +type PhyG = Evaluation () + + +-- | Gr local graph definition using FGL +type Gr a b = P.Gr a b + + +type Node = G.Node + + +type LNode a = G.LNode a + + +type DotGraph = GV.DotGraph + + +type Edge = G.Edge + + +type LEdge b = G.LEdge b + + +{- | getFENLocal maps to forestEnhancedNewickStringList2FGLList in GraphFormatUtilities +to allow for potnetial swapping FGL graph backend +requires leading and trailing space and newlines to be removed +-} +getFENLocal ∷ T.Text → [Utilities.LocalGraph.Gr T.Text Double] +getFENLocal = GFU.forestEnhancedNewickStringList2FGLList + + +-- | readDotLocal calls GrapvViz function to allow for substitution later +readDotLocal ∷ String → IO (Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node) +readDotLocal = GVIO.readDotFile + + +-- | dotToGraph local mapo dor +dotToGraph ∷ Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node → Utilities.LocalGraph.Gr Attributes Attributes +dotToGraph = GV.dotToGraph + + +-- | hGetDotLocal calls hGetDot from GraphVoz +hGetDotLocal ∷ Handle → IO (Utilities.LocalGraph.DotGraph Utilities.LocalGraph.Node) +hGetDotLocal = GVIO.hGetDot + + +-- | fglToPrettyString calls prettify from FGL +fglToPrettyString ∷ (Show a, Show b) ⇒ P.Gr a b → String +fglToPrettyString inGraph = + if G.isEmpty inGraph + then "Empty Graph" + else G.prettify inGraph + + +-- | pretty prints graph to String +prettify ∷ (Show a, Show b) ⇒ Gr a b → String +prettify inGraph = + if G.isEmpty inGraph + then "Empty Graph" + else G.prettify inGraph + + +-- | prettyIndices prints graph to String only using indices +prettyIndices ∷ Gr a b → String +prettyIndices inGraph = + if G.isEmpty inGraph + then "Empty Graph" + else + let nodeList = concat $ fmap (<> ",") $ fmap show $ nodes inGraph + edgeList = concat $ fmap (<> ",") $ fmap show $ edges inGraph + in nodeList <> "\n" <> edgeList + + +-- | prettyDot prints generic graph to generic dot format +prettyDot ∷ Gr a b → String +prettyDot inGraph = + if G.isEmpty inGraph + then error ("Empty Graph") + else + let topPart = "digraph G {\n\trankdir = LR; edge [colorscheme=spectral11]; node [shape = rect];\n" + nodeList = concatMap ("\t" <>) $ fmap (<> ";\n") $ fmap show $ nodes inGraph + edgeList = concatMap ("\t" <>) $ fmap makeEdgeString $ edges inGraph + endPart = "}\n" + in topPart <> nodeList <> edgeList <> endPart + where + makeEdgeString (a, b) = (show a) <> " -> " <> (show b) <> ";\n" + + +-- these duplicate edge functions should be O(n log n) based on sort--rest linear + +-- hasDuplicateEdgesNub hasDuplicateEdge but based on nub +hasDuplicateEdgesNub ∷ Gr a b → Bool +hasDuplicateEdgesNub inGraph = + if isEmpty inGraph + then False + else + let edgeList = fmap toEdge $ labEdges inGraph + uniqueEdgeList = L.nub edgeList + in not (length edgeList == length uniqueEdgeList) + + +-- hasDuplicateEdge checked for duplicate edges based on indices (not label) +hasDuplicateEdge ∷ Gr a b → Bool +hasDuplicateEdge inGraph = + if isEmpty inGraph + then False + else + let sortedEdges = L.sort $ fmap toEdge $ labEdges inGraph + groupedEdges = L.group sortedEdges + dupEdges = filter ((> 1) . length) groupedEdges + in -- dupEdges = (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) + + (not . null) dupEdges + + +{- | getDuplicateEdges retuns a list of edges that are duplicated +by indeices--no label comparison +can be used to delete the extra +-} +getDuplicateEdges ∷ Gr a b → [Edge] +getDuplicateEdges inGraph = + if isEmpty inGraph + then [] + else + let sortedEdges = L.sort $ fmap toEdge $ labEdges inGraph + groupedEdges = L.group sortedEdges + dupEdges = filter ((> 1) . length) groupedEdges + in -- dupEdges = (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) + + fmap head dupEdges + + +-- (fmap toEdge $ labEdges inGraph) L.\\ (L.nub $ fmap toEdge $ labEdges inGraph) + +-- | removeDuplicateEdges removes duplicate edges from graph +removeDuplicateEdges ∷ Gr a b → Gr a b +removeDuplicateEdges inGraph = + if isEmpty inGraph + then inGraph + else + let dupEdges = getDuplicateEdges inGraph + in if null dupEdges + then inGraph + else delEdges dupEdges inGraph + + +{- | hasTreeNodeWithAllNetworkChildren checks treenodes for all (should be 2) children that +are netork nodes +-} +hasTreeNodeWithAllNetworkChildren ∷ Gr a b → (Bool, [Node]) +hasTreeNodeWithAllNetworkChildren inGraph = + if isEmpty inGraph + then (False, []) + else + let (_, _, treeNodeList, _) = splitVertexList inGraph + hasAllNetChildrenList = fmap (hasAllNetChildren inGraph) (fmap fst treeNodeList) + nodesWithAllNetChildren = fmap (fst . fst) $ filter ((== True) . snd) $ zip treeNodeList hasAllNetChildrenList + in ((not . null) nodesWithAllNetChildren, nodesWithAllNetChildren) + + +-- | hasAllNetChildren checks whether all (usually 2) childrenb of a vertex are network nodes +hasAllNetChildren ∷ Gr a b → Node → Bool +hasAllNetChildren inGraph inNode = + let children = descendants inGraph inNode + childVertNodes = filter (== True) $ fmap (isNetworkNode inGraph) children + in length children == length childVertNodes + + +{- | removeTreeEdgeFromTreeNodeWithAllNetworkChildren takes a graph and removes the first edge (head) +from each tree node with all network children, then contracts those edges and nodes, +then reindexes -- but does not rename graph nodes +-} +removeTreeEdgeFromTreeNodeWithAllNetworkChildren ∷ Gr a b → Gr a b +removeTreeEdgeFromTreeNodeWithAllNetworkChildren inGraph = + let (toDo, nodesWithEdgesToDelete) = hasTreeNodeWithAllNetworkChildren inGraph + outEdgesToDeleteList = fmap toEdge $ fmap head $ fmap (out inGraph) nodesWithEdgesToDelete + newGraph = delEdges outEdgesToDeleteList inGraph + newGraph' = reindexGraph $ contractIn1Out1Edges newGraph + in if not toDo + then inGraph + else newGraph' + + +-- | hasChainedNetworkNodes checks if a graph has network nodes with at least one parent that is also a network node +hasChainedNetworkNodes ∷ Gr a b → Bool +hasChainedNetworkNodes inGraph = + if isEmpty inGraph + then False + else + let (_, _, _, netVertexList) = splitVertexList inGraph + chainedNodeList = filter (== True) $ fmap (hasNetParent inGraph) $ fmap fst netVertexList + netWithChildNetList = filter (== True) $ fmap (hasAllNetChildren inGraph) $ fmap fst netVertexList + in if null netVertexList + then False + else (not . null) (chainedNodeList <> netWithChildNetList) + + +-- | hasNetParent checks parent of node and retuens True if one or both are network nodes +hasNetParent ∷ Gr a b → Node → Bool +hasNetParent inGraph inNode = + let parentList = parents inGraph inNode + parentNetList = filter (== True) $ fmap (isNetworkNode inGraph) parentList + in (not . null) parentNetList + + +{- | removeChainedNetworkNodes detectes and fixes (if possible) chained network edges +if 1 parent of network edge is tree node can be fixed by delete and contracting that node/edge +else if both parent are netowrks--cannot be fixed and errors out +does NOT rename nodes since need vertex info on that--but are reindexed +-} +removeChainedNetworkNodes ∷ (Show a, Show b) ⇒ Bool → Gr a b → Maybe (Gr a b) +removeChainedNetworkNodes showWarning inGraph = + if isEmpty inGraph + then Nothing + else + let (_, _, _, netVertexList) = splitVertexList inGraph + parentNetNodeList = fmap (hasNetParent inGraph) $ fmap fst netVertexList + chainedNodeList = fmap fst $ filter ((== True) . snd) $ zip netVertexList parentNetNodeList + fixableChainedEdgeList = concatMap (getTreeEdgeParent inGraph) (fmap fst chainedNodeList) + newGraph = delEdges fixableChainedEdgeList inGraph + newGraph' = reindexGraph $ contractIn1Out1Edges newGraph + in if null netVertexList + then Just inGraph + else + if null chainedNodeList + then Just inGraph + else + if null fixableChainedEdgeList + then + let warningString = + if showWarning + then "Warning: Unfixable chained network nodes (both parent and child nodes are indegree > 1). Skipping graph" + else "" + in traceNoLF + warningString + Nothing + else + let warningString = + if showWarning + then + "Warning: Chained network nodes (both parent and child nodes are indegree > 1), removing edges to tree node parents (this may affect graph cost)\n" + else "" + in traceNoLF + warningString + -- : " <> (show fixableChainedEdgeList))-- <> "\n" <> (prettyIndices inGraph)) + Just + newGraph' + + +{- | getTreeEdgeParent gets the tree edge (as list) into a network node as opposed to the edge from a network parent +if both parents are network nodes then returns [] +-} +getTreeEdgeParent ∷ Gr a b → Node → [Edge] +getTreeEdgeParent inGraph inNode = + let parentList = parents inGraph inNode + parentTreeList = fmap fst $ filter ((== False) . snd) $ zip parentList (fmap (isNetworkNode inGraph) parentList) + in if null parentTreeList + then [] + else [(head parentTreeList, inNode)] + + +-- Wrapper functions for fgl so could swap out later if want to + +-- | maps to isEmpty +isEmpty ∷ Gr a b → Bool +isEmpty = G.isEmpty + + +-- | maps to empty +empty ∷ Gr a b +empty = G.empty + + +-- | maps to equal +equal ∷ (Eq a, Eq b) ⇒ Gr a b → Gr a b → Bool +equal x y = G.equal x y + + +-- | gelem is a node in a graph +gelem ∷ Node → Gr a b → Bool +gelem = G.gelem + + +-- | maps to labNodes-- I beleive in vertex index order +labNodes ∷ Gr a b → [LNode a] +labNodes = G.labNodes + + +-- | maps to labEdges +labEdges ∷ Gr a b → [LEdge b] +labEdges = G.labEdges + + +-- | toEdge removes label from edge +toEdge ∷ LEdge b → Edge +toEdge = G.toEdge + + +-- | toNode returns fst of LNode a +toNode ∷ LNode a → Node +toNode inNode = fst inNode + + +-- | toLEdge adds a label to an edge +toLEdge ∷ Edge → b → LEdge b +toLEdge = G.toLEdge + + +-- | toLEdge' flipped version of toLEdge +toLEdge' ∷ b → Edge → LEdge b +toLEdge' inLabel inEdge = G.toLEdge inEdge inLabel + + +-- | deg mapes to fgl deg +deg ∷ Gr a b → Node → Int +deg inGraph inNode = G.deg inGraph inNode + + +-- | maps to indeg +indeg ∷ Gr a b → LNode a → Int +indeg inGraph inLNode = G.indeg inGraph $ fst inLNode + + +-- | maps to outdeg +outdeg ∷ Gr a b → LNode a → Int +outdeg inGraph inLNode = G.outdeg inGraph $ fst inLNode + + +{- | getInOut takes a node and a graph and returns +a triple (LNode a, indegree, outDegree) +-} +getInOutDeg ∷ Gr a b → LNode a → (LNode a, Int, Int) +getInOutDeg inGraph inLNode = + if isEmpty inGraph + then error "Empty graph in getInOut" + else + let inDeg = indeg inGraph inLNode + outDeg = outdeg inGraph inLNode + in (inLNode, inDeg, outDeg) + + +{- | getInOutDegNoLabel takes a node and a graph and returns +a pair (indegree, outDegree) +-} +getInOutDegNoLabel ∷ Gr a b → Node → (Int, Int) +getInOutDegNoLabel inGraph inNode = + if isEmpty inGraph + then error "Empty graph in getInOut" + else + let inDeg = length $ inn inGraph inNode + outDeg = length $ out inGraph inNode + in (inDeg, outDeg) + + +-- | in-bound edge list to node, maps to inn +inn ∷ Gr a b → Node → [LEdge b] +inn = G.inn + + +-- | lab returns label of node as Maybe +lab ∷ Gr a b → Node → Maybe a +lab = G.lab + + +-- | out-bound edge list from node, maps to out +out ∷ Gr a b → Node → [LEdge b] +out = G.out + + +-- | hasEdge maps to fgl function returns True if graphs has directed edge between nodes +hasEdge ∷ Gr a b → Edge → Bool +hasEdge = G.hasEdge + + +{- | updateNodeLabel updated teh label infomatino on a node +this is done by deleting that nmode and addingl back in to graph +when a node is deleted all edges inceident on it are also deleted +so they must be saved and added back +-} +updateNodeLabel ∷ Gr a b → Node → a → Gr a b +updateNodeLabel inGraph inNodeIndex newLabel = + if isEmpty inGraph + then error "Empty graph in sisterLabNodes" + else + let incidentEdges = (inn inGraph inNodeIndex) <> (out inGraph inNodeIndex) + in insEdges incidentEdges $ insNode (inNodeIndex, newLabel) $ delNode inNodeIndex inGraph + + +{- | sisterLabNodes returns list of nodes that are "sister" ie share same parent +as input node +-} +sisterLabNodes ∷ (Show a, Eq a) ⇒ Gr a b → LNode a → [LNode a] +sisterLabNodes inGraph inNode = + if isEmpty inGraph + then error "Empty graph in sisterLabNodes" + else + let parentNodeList = labParents inGraph (fst inNode) + otherChildrenOfParentsList = (concatMap (labDescendants inGraph) parentNodeList) L.\\ [inNode] + in -- shoudl not need nub for phylogenetic graphs but whatever + L.nub otherChildrenOfParentsList + + +-- | parents of unlabelled node +parents ∷ Gr a b → Node → [Node] +parents inGraph inNode = fst3 <$> G.inn inGraph inNode + + +-- | grandParents of unlabelled node +grandParents ∷ Gr a b → Node → [Node] +grandParents inGraph inNode = + let nodeParents = parents inGraph inNode + in concatMap (parents inGraph) nodeParents + + +-- | sharedGrandParents sees if a node has common grandparents +sharedGrandParents ∷ Gr a b → Node → Bool +sharedGrandParents inGraph inNode = + if isEmpty inGraph + then error "Empty gaph in sharedGrandParents" + else + if isRoot inGraph inNode + then False + else + let parentList = parents inGraph inNode + grandParentLists = fmap (parents inGraph) parentList + intersection = L.foldl1' L.intersect grandParentLists + in -- True + if null intersection then False else True + + +-- | labParents returns the labelled parents of a node +labParents ∷ (Eq a) ⇒ Gr a b → Node → [LNode a] +labParents inGraph inNode = + let parentNodeList = parents inGraph inNode + parentLabelList = fmap (lab inGraph) parentNodeList + hasNothing = Nothing `elem` parentLabelList + parentLabelList' = fmap fromJust parentLabelList + in if hasNothing + then error "Unlabeled nodes in labParents" + else zip parentNodeList parentLabelList' + + +{- | isPhylogeneticGraph checks various issues to see if +there is wierdness in graph +-} +isPhylogeneticGraph ∷ (Show a, Eq a, NFData a, Show b, Eq b) ⇒ Gr a b → PhyG Bool +isPhylogeneticGraph inGraph = + if isEmpty inGraph + then pure False + else + let nodeList = fmap fst $ labNodes inGraph + indegreeList = fmap (inn inGraph) nodeList + outdegreeList = fmap (out inGraph) nodeList + in if hasDuplicateEdgesNub inGraph + then pure False + else + if length (getRoots inGraph) /= 1 + then pure False + else + if outdeg inGraph (head $ getRoots inGraph) /= 2 + then pure False + else + if (not . null) (getIsolatedNodes inGraph) + then pure False + else + if (not . null) (filter ((> 2) . length) indegreeList) + then pure False + else + if (not . null) (filter ((> 2) . length) outdegreeList) + then pure False + else + if parentsInChain inGraph + then pure False + else do + consistent ← isGraphTimeConsistent inGraph + if not consistent + then pure False + else pure True + + +-- | removeParentsInChain checks the parents of each netowrk node are not anc/desc of each other +parentsInChain ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → Bool +parentsInChain inGraph = + if isEmpty inGraph + then False + else + let (_, _, _, netVertexList) = splitVertexList inGraph + parentNetVertList = fmap (labParents inGraph . fst) netVertexList + + -- get list of nodes that are transitively equal in age + concurrentList = mergeConcurrentNodeLists parentNetVertList [] + concurrentPairList = concatMap getListPairs concurrentList + + -- get pairs that violate concurrency + violatingConcurrentPairs = concatMap (concurrentViolatePair inGraph) concurrentPairList + + -- get network nodes with violations + parentNodeViolateList = concatMap pairToList violatingConcurrentPairs + childNodeViolateList = concatMap (descendants inGraph) parentNodeViolateList + netNodeViolateList = filter (isNetworkNode inGraph) childNodeViolateList + + netEdgesThatViolate = fmap toEdge $ inn inGraph $ head netNodeViolateList + in if null violatingConcurrentPairs + then False + else + if null netNodeViolateList + then True + else + if null netEdgesThatViolate + then True + else False + where + pairToList (a, b) = [fst a, fst b] + + +-- | descendants of unlabelled node +descendants ∷ Gr a b → Node → [Node] +descendants inGraph inNode = snd3 <$> G.out inGraph inNode + + +-- | labDescendants labelled descendents of labelled node +labDescendants ∷ (Show a, Eq a) ⇒ Gr a b → LNode a → [LNode a] +labDescendants inGraph inNode = + let nodeList = snd3 <$> G.out inGraph (fst inNode) + maybeLabelList = fmap (lab inGraph) nodeList + hasNothing = Nothing `elem` maybeLabelList + labelList = fmap fromJust maybeLabelList + labNodeList = zip nodeList labelList + in -- if null labelList then + -- trace ("Warning: Null labelled descendants in labDescendants" <> " " <> (show $ zip nodeList maybeLabelList)) + -- labNodeList + -- else + if hasNothing + then error ("Unlabeled nodes in labDescendants" <> "\n" <> (show $ zip nodeList maybeLabelList)) + else -- labNodeList + -- error ("Unlabeled nodes in labDescendants" <> "\n" <> (show $ zip nodeList maybeLabelList)) + + labNodeList + + +-- | takes a graph and node and returns pair of inbound and noutbound labelled edges +getInOutEdges ∷ Gr a b → Node → ([LEdge b], [LEdge b]) +getInOutEdges inGraph inNode = (inn inGraph inNode, out inGraph inNode) + + +-- | nodes returns list of unlabbeled nodes, maps to nodes +nodes ∷ Gr a b → [Node] +nodes = G.nodes + + +-- | edges returns list of unlabeled nodes, maps to nodes +edges ∷ Gr a b → [Edge] +edges = G.edges + + +-- | insEdges inserts a list of labelled edges into a graph +insEdges ∷ [LEdge b] → Gr a b → Gr a b +insEdges = G.insEdges + + +-- | insEdge inserts a labelled edge into a graph +insEdge ∷ LEdge b → Gr a b → Gr a b +insEdge = G.insEdge + + +{- | delLEdges delete a labelled edge from a graph +wrapps around delEdge +-} +delLEdge ∷ LEdge b → Gr a b → Gr a b +delLEdge inEdge = G.delEdge (G.toEdge inEdge) + + +{- | delEdges delete an unlabelled edge from a graph +wrapps around delEdge +-} +delEdge ∷ Edge → Gr a b → Gr a b +delEdge inEdge = G.delEdge inEdge + + +{- | delLEdge deletes a list of unlabelled edges from a graph +wrapps around delEdges +-} +delEdges ∷ [Edge] → Gr a b → Gr a b +delEdges inEdgeList = G.delEdges inEdgeList + + +{- | delLEdge deletes a list of labelled edges from a graph +wrapps around delEdges +-} +delLEdges ∷ [LEdge b] → Gr a b → Gr a b +delLEdges inEdgeList = G.delEdges (fmap G.toEdge inEdgeList) + + +-- | insNode inserts a labelled node into a graph +insNode ∷ LNode a → Gr a b → Gr a b +insNode = G.insNode + + +-- | insNodes inserts multiple labelled nodes into a graph +insNodes ∷ [LNode a] → Gr a b → Gr a b +insNodes = G.insNodes + + +{- | delLNode deletes a labelled node from a graph +NB Removes any edges involving this node +-} +delLNode ∷ LNode a → Gr a b → Gr a b +delLNode inNode = G.delNode (fst inNode) + + +{- | delNode deletes an unlabelled node from a graph +NB Removes any edges involving this node +-} +delNode ∷ Node → Gr a b → Gr a b +delNode = G.delNode + + +{- | delNodes deletes a list of unlabelled nodes from a graph +NB I beleive removes any edges involving these nodes +-} +delNodes ∷ [Node] → Gr a b → Gr a b +delNodes = G.delNodes + + +-- | mkGraph creates a graph from list of nodes and list of edges +mkGraph ∷ [LNode a] → [LEdge b] → Gr a b +mkGraph = G.mkGraph + + +-- | mkGraphPair creates a graph from pair of list of nodes and list of edges +mkGraphPair ∷ ([LNode a], [LEdge b]) → Gr a b +mkGraphPair (nodeList, edgeList) = G.mkGraph nodeList edgeList + + +-- | components list of list of nodes (G.Graphalyze can return graph list) +components ∷ Gr a b → [[Node]] +components = DFS.components + + +-- | componentGraphs takes a graph and returns its compnent gaphs +componentGraphs ∷ Gr a b → [Gr a b] +componentGraphs inGraph = + if isEmpty inGraph + then [] + else + let componentNodeListList = components inGraph + labComponentNodeListList = fmap (fmap (labelNode inGraph)) componentNodeListList + edgeListListList = fmap (fmap (inn inGraph)) componentNodeListList + componentEdgeList = fmap concat edgeListListList + componentGraphList = zipWith mkGraph labComponentNodeListList componentEdgeList + in if length componentNodeListList == 1 + then [inGraph] + else componentGraphList + + +-- | labelNode uses lab but checks for Nothing and returns labelled node +labelNode ∷ Gr a b → Node → LNode a +labelNode inGraph inNode = + if isEmpty inGraph + then error "Empty graph for label source" + else + let label = lab inGraph inNode + in if isNothing label + then error ("No label for node " <> show inNode) + else (inNode, fromJust label) + + +-- | labelNodeFlip uses lab but checks for Nothing and returns labelled node +labelNodeFlip ∷ Node → Gr a b → LNode a +labelNodeFlip = flip labelNode + + +-- | noComponents returns number of components +noComponents ∷ Gr a b → Int +noComponents = DFS.noComponents + + +-- | isLeaf checks if node is root +isLeaf ∷ Gr a b → Node → Bool +isLeaf inGraph inNode = G.outdeg inGraph inNode == 0 + + +-- | isOutDeg1Node checks if node has a single child +isOutDeg1Node ∷ Gr a b → Node → Bool +isOutDeg1Node inGraph inNode = G.outdeg inGraph inNode == 1 + + +-- | isNetworkNode checks if node is network node +isNetworkNode ∷ Gr a b → Node → Bool +isNetworkNode inGraph inNode = (G.indeg inGraph inNode > 1) && (G.outdeg inGraph inNode > 0) + + +-- | isNetworkLeaf checks if node is network node and a leaf--usually an error condition in phylogenetic networks +isNetworkLeaf ∷ Gr a b → Node → Bool +isNetworkLeaf inGraph inNode = (G.indeg inGraph inNode > 1) && (G.outdeg inGraph inNode == 0) + + +-- | isNetworkEdge checks if edge is network edge +isNetworkEdge ∷ Gr a b → Edge → Bool +isNetworkEdge inGraph inEdge = (G.indeg inGraph (snd inEdge) > 1) && (G.outdeg inGraph (snd inEdge) > 0) + + +{- | hasNetworkEdgeList checks edge lisyt to see if any are network--short cicuuits +so faster than using filter and null +-} +hasNetworkEdgeList ∷ Gr a b → [LEdge b] → Bool +hasNetworkEdgeList inGraph edgeList = + if null edgeList + then False + else + if isNetworkLabEdge inGraph (head edgeList) + then True + else hasNetworkEdgeList inGraph (drop 1edgeList) + + +-- | isNetworkLabEdge checks if edge is network edge +isNetworkLabEdge ∷ Gr a b → LEdge b → Bool +isNetworkLabEdge inGraph inEdge = (G.indeg inGraph (snd3 inEdge) > 1) + + +-- | isIn1Out1 checks if node has indegree = 1 outdegree = 1 +isIn1Out1 ∷ Gr a b → Node → Bool +isIn1Out1 inGraph inNode = (G.indeg inGraph inNode == 1) && (G.outdeg inGraph inNode == 1) + + +-- | labNetEdges takes a graph and returns list of network labelled edges +labNetEdges ∷ Gr a b → [LEdge b] +labNetEdges inGraph = + if isEmpty inGraph + then error "Empty graph in labNetEdges" + else filter (isNetworkLabEdge inGraph) $ labEdges inGraph + + +-- | netEdges takes a graph and returns list of network edges +netEdges ∷ Gr a b → [Edge] +netEdges inGraph = + if isEmpty inGraph + then error "Empty graph in labNetEdges" + else filter (isNetworkEdge inGraph) $ edges inGraph + + +-- | isTreeNode checks if node is network node +isTreeNode ∷ Gr a b → Node → Bool +isTreeNode inGraph inNode = (G.indeg inGraph inNode == 1) && (G.outdeg inGraph inNode > 0) + + +-- getRoots returns list of graph roots (labelled) +getRoots ∷ Gr a b → [LNode a] +getRoots inGraph = + if isEmpty inGraph + then [] + else + let nodeList = labNodes inGraph + -- rootBoolList = fmap (isRoot inGraph . fst) nodeList + rootBoolList = fmap ((== 0) . length) $ fmap (inn inGraph) $ fmap fst nodeList + pairList = zip rootBoolList nodeList + rootPairList = filter ((== True) . fst) pairList + rootList = fmap snd rootPairList + in rootList + + +{- | getIsolatedNodes returns list of labelled nodes with indegree=outdegree=0 +should change to use G.deg == 0 +-} +getIsolatedNodes ∷ Gr a b → [LNode a] +getIsolatedNodes inGraph = + if isEmpty inGraph + then [] + else + let nodeList = labNodes inGraph + -- rootBoolList = fmap (isRoot inGraph . fst) nodeList + in0BoolList = fmap ((== 0) . length) $ fmap (inn inGraph) $ fmap fst nodeList + out0BoolList = fmap ((== 0) . length) $ fmap (out inGraph) $ fmap fst nodeList + isolateBoolList = zipWith (&&) in0BoolList out0BoolList + + pairList = zip isolateBoolList nodeList + isolatePairList = filter ((== True) . fst) pairList + isolateList = fmap snd isolatePairList + in isolateList + + +-- | isRoot checks if node is root +isRoot ∷ Gr a b → Node → Bool +isRoot inGraph inNode = + G.gelem inNode inGraph && (G.indeg inGraph inNode == 0) + + +-- | pre returns list of nodes linking to a node +pre ∷ Gr a b → Node → [Node] +pre = G.pre + + +-- | suc returns list of nodes linking from a node +suc ∷ Gr a b → Node → [Node] +suc = G.suc + + +-- | edgeLabel returns label of edge +edgeLabel ∷ LEdge b → b +edgeLabel = G.edgeLabel + + +-- | getOtherVertex retuns the edge vertex /= index +getOtherVertex ∷ LEdge b → Int → Int +getOtherVertex (u, v, _) index = if u == index then v else u + + +-- | flipEdge flips orientation of unlabelled edge +flipEdge ∷ Edge → Edge +flipEdge (u, v) = (v, u) + + +-- | flipLEdge flips orientation of labelled edge +flipLEdge ∷ LEdge b → LEdge b +flipLEdge (u, v, w) = (v, u, w) + + +-- | isTree takes a graph and checks if there are anmy network nodes--if not returns True +isTree ∷ Gr a b → Bool +isTree inGraph = + if G.isEmpty inGraph + then error "Empty graph in isTree" + else + let (_, _, _, netNodes) = splitVertexList inGraph + in null netNodes + + +-- \| splitVertexList splits the vertices of a graph into ([root], [leaf], [tree], [network]) +splitVertexList ∷ Gr a b → ([LNode a], [LNode a], [LNode a], [LNode a]) +splitVertexList inGraph = + if G.isEmpty inGraph + then ([], [], [], []) + else + let -- leaves + degOutList = outdeg inGraph <$> labNodes inGraph + newNodePair = zip degOutList (labNodes inGraph) + leafPairList = filter ((== 0) . fst) newNodePair + (_, leafList) = unzip leafPairList + + -- roots + degInList = indeg inGraph <$> labNodes inGraph + newRootPair = zip degInList (labNodes inGraph) + rootPairList = filter ((== 0) . fst) newRootPair + (_, rootList) = unzip rootPairList + + -- tree nodes + nodeTripleList = zip3 degInList degOutList (labNodes inGraph) + treeTripleList = filter ((== 1) . fst3) $ filter ((> 0) . snd3) nodeTripleList + (_, _, treeVertexList) = unzip3 treeTripleList + + -- network nodes + networkTripleList = filter ((> 1) . fst3) $ filter ((> 0) . snd3) nodeTripleList + (_, _, networkVertexList) = unzip3 networkTripleList + in (rootList, leafList, treeVertexList, networkVertexList) + + +{- | pathToRoot takes a graph and a vertex and returns a pair of lists +of vertices and edges to root(s) in order of encountering them to root +if a tree--not necessarily if network--should work +-} +pathToRoot ∷ (Eq a, Eq b) ⇒ Gr a b → LNode a → ([LNode a], [LEdge b]) +pathToRoot inGraph inNode = + if G.isEmpty inGraph + then error "Empty graph in pathToRoot" + else pathToRoot' inGraph [inNode] [] [] + + +{- | pathToRoot' with accumulators +filter operators basically for networks so not retrace paths +includes roots as nodes +-} +pathToRoot' ∷ (Eq a, Eq b) ⇒ Gr a b → [LNode a] → [LNode a] → [LEdge b] → ([LNode a], [LEdge b]) +pathToRoot' inGraph inNodeList curNodeList curEdgeList = + if null inNodeList + then (reverse curNodeList, reverse curEdgeList) + else + let inNode = head inNodeList + in -- root would already be inlist of nodes visited + if isRoot inGraph (fst inNode) + then pathToRoot' inGraph (drop 1inNodeList) curNodeList curEdgeList + else + let inLEdges = filter (`notElem` curEdgeList) $ inn inGraph (fst inNode) + inNodes = filter (`notElem` (fmap fst curNodeList)) $ fmap fst3 inLEdges + -- inLabNodes = concatMap (labParents inGraph) (fmap fst3 inLEdges) + inLabNodes = zip inNodes (fmap (fromJust . lab inGraph) inNodes) + in pathToRoot' inGraph (inLabNodes <> drop 1inNodeList) (inLabNodes <> curNodeList) (inLEdges <> curEdgeList) + + +{- | postOrderPathToNode takes a graph and two vertices nd returns a pair of lists +of vertices and edges to beteween them in order of encountering them from first to second +the path is post order to root so if second vertex is leaf-side of first node will hit root and fail +-} +postOrderPathToNode ∷ (Eq a, Eq b) ⇒ Gr a b → LNode a → LNode a → ([LNode a], [LEdge b]) +postOrderPathToNode inGraph startNode endNode = + if G.isEmpty inGraph + then error "Empty graph in pathToRoot" + else postOrderPathToNode' inGraph endNode [startNode] [] [] + + +{- | postOrderPathToNode' with accumulators +filter operators basically for networks so not retrace paths +-} +postOrderPathToNode' ∷ (Eq a, Eq b) ⇒ Gr a b → LNode a → [LNode a] → [LNode a] → [LEdge b] → ([LNode a], [LEdge b]) +postOrderPathToNode' inGraph endNode inNodeList curNodeList curEdgeList = + if null inNodeList + then (reverse curNodeList, reverse curEdgeList) + else + let inNode = head inNodeList + in -- root would already be inlist of nodes visited + if (fst inNode) == (fst endNode) + then postOrderPathToNode' inGraph endNode (drop 1inNodeList) curNodeList curEdgeList + else + if isRoot inGraph (fst inNode) + then + error + ( "postOrderPathToNode hit root before end node. Root index " + <> (show $ fst inNode) + <> " edges " + <> (show $ fmap toEdge curEdgeList) + ) + else + let inLEdges = filter (`notElem` curEdgeList) $ inn inGraph (fst inNode) + inNodes = filter (`notElem` (fmap fst curNodeList)) $ fmap fst3 inLEdges + inLabNodes = zip inNodes (fmap (fromJust . lab inGraph) inNodes) + in postOrderPathToNode' inGraph endNode (inLabNodes <> drop 1inNodeList) (inLabNodes <> curNodeList) (inLEdges <> curEdgeList) + + +{- | nodesAndEdgesBefore takes a graph and list of nodes to get list of nodes +and edges 'before' in the sense of leading to--ie between root and +(not including)) that node +call with ([], []) +filter operators basically for networks so not retrace paths +-} +nodesAndEdgesBefore' ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → ([LNode a], [LEdge b]) → [LNode a] → ([LNode a], [LEdge b]) +nodesAndEdgesBefore' inGraph curResults@(curNodes, curEdges) inNodeList + | G.isEmpty inGraph = error "Input Graph is empty in nodesAndEdgesBefore" + | null inNodeList = curResults + | otherwise = + let intoEdgeList = filter (`notElem` curEdges) $ inn inGraph (fst $ head inNodeList) + intoNodeList = filter (`notElem` (fmap fst curNodes)) $ fmap fst3 intoEdgeList + labelMaybeList = fmap (lab inGraph) intoNodeList + labelList = fmap fromJust labelMaybeList + intoLabNodeList = zip intoNodeList labelList + in if Nothing `elem` labelMaybeList + then error ("Empty node label in nodesAndEdgesBefore" <> show intoLabNodeList) + else nodesAndEdgesBefore' inGraph (intoLabNodeList <> curNodes, intoEdgeList <> curEdges) (intoLabNodeList <> drop 1inNodeList) + + +{- | nodesAndEdgesBefore takes a graph and list of nodes to get list of nodes +and edges 'before' in the sense of leading to--ie between root and +(not including)) that node +filter operators basically for networks so not retrace paths +wrapper without accuulator +Does NOT Contain starting nodes +-} +nodesAndEdgesBefore ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → [LNode a] → ([LNode a], [LEdge b]) +nodesAndEdgesBefore inGraph inNodeList = nodesAndEdgesBefore' inGraph ([], []) inNodeList + + +-- | getEdgeListAfter wrapper around nodesAndEdgesAfter to return only edges +getEdgeListAfter ∷ (Eq a, Eq b, Show a) ⇒ (Gr a b, Node) → [LEdge b] +getEdgeListAfter (inGraph, inNode) = + if isEmpty inGraph + then error "Input Graph is empty in getEdgeListAfter" + else snd $ nodesAndEdgesAfter inGraph [(inNode, fromJust $ lab inGraph inNode)] + + +{- | nodesAndEdgesAfter' takes a graph and list of nodes to get list of nodes +and edges 'after' in the sense of leading from-ie between (not including)) that node +and all the way to any leaves is connects to. +Does NOT Contain starting nodes +call with ([], []) +filter operators basically for networks so not retrace paths +-} +nodesAndEdgesAfter' ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → ([LNode a], [LEdge b]) → [LNode a] → ([LNode a], [LEdge b]) +nodesAndEdgesAfter' inGraph curResults@(curNodes, curEdges) inNodeList + | G.isEmpty inGraph = error "Input Graph is empty in nodesAndEdgesAfter'" + | null inNodeList = curResults + | otherwise = + let fromEdgeList = filter (`notElem` curEdges) $ out inGraph (fst $ head inNodeList) + fromNodeList = filter (`notElem` (fmap fst curNodes)) $ fmap snd3 fromEdgeList + labelMaybeList = fmap (lab inGraph) fromNodeList + in if Nothing `elem` labelMaybeList + then + error + ( "Empty node label in nodesAndEdgesAfter" + <> show (filter ((== Nothing) . snd) $ zip fromNodeList labelMaybeList) + <> " " + <> (show fromNodeList) + <> "\n" + <> (prettyIndices inGraph) + ) + else + let labelList = fmap fromJust labelMaybeList + fromLabNodeList = zip fromNodeList labelList + in nodesAndEdgesAfter' inGraph (fromLabNodeList <> curNodes, fromEdgeList <> curEdges) (fromLabNodeList <> drop 1inNodeList) + + +{- | nodesAndEdgesAfter takes a graph and list of nodes to get list of nodes +and edges 'after' in the sense of leading from-ie between (not including)) that node +and all the way to any leaves is connects to. +wrapper wihtout accumulator +Does NOT Contain starting nodes +-} +nodesAndEdgesAfter ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → [LNode a] → ([LNode a], [LEdge b]) +nodesAndEdgesAfter inGraph inNodeList = nodesAndEdgesAfter' inGraph ([], []) inNodeList + + +-- | indexMatchNode returns True if two labelled nodes have same index +indexMatchNode ∷ LNode a → LNode a → Bool +indexMatchNode (a, _) (b, _) = if a == b then True else False + + +{- | coevalNodePairs generatres a list of pairs of nodes that must be potentially equal in +age (ie parents of networkNode) +-} +coevalNodePairs ∷ (Eq a) ⇒ Gr a b → [(LNode a, LNode a)] +coevalNodePairs inGraph = + if G.isEmpty inGraph + then [] + else + let (_, _, _, netVertexList) = splitVertexList inGraph + pairListList = fmap (labParents inGraph) $ fmap fst netVertexList + in if null netVertexList + then [] + else fmap makePairs pairListList + where + makePairs a = + if length a /= 2 + then error ("Not two parents for coevalNodePairs") + else (head a, a !! 1) + + +-- | indexMatchEdge returns True if two labelled edges have same node indices +indexMatchEdge ∷ LEdge b → LEdge b → Bool +indexMatchEdge (a, b, _) (c, d, _) = if a == c && b == d then True else False + + +{- | contractRootOut1Edge contracts indegree 0, outdegree 1, edges and removes the node in the middle +does one at a time and makes a graph and recurses +removes "tail" edges (single) from root to single child +-} +contractRootOut1Edge ∷ (Show a, Show b) ⇒ Gr a b → Gr a b +contractRootOut1Edge inGraph = + if G.isEmpty inGraph + then G.empty + else + let inOutDeg = getInOutDeg inGraph <$> labNodes inGraph + out1RootList = filter ((== 1) . thd3) $ filter ((== 0) . snd3) inOutDeg + out2RootList = filter ((> 1) . thd3) $ filter ((== 0) . snd3) inOutDeg + in -- trace ("CRO1E :" <> (show (inOutDeg, out1RootList, out2RootList))) ( + if null out1RootList + then inGraph + else + if length out2RootList > 1 + then error ("Multiple roots in graph in contractRootOut1Edge: " <> (show $ length out2RootList)) + else + if null out2RootList + then + let -- get root with out = 1 and its child, that childs' children, and edges + rootVertex = head out1RootList + childOfRoot = snd3 $ head $ out inGraph ((fst . fst3) rootVertex) + childOfRootEdges = out inGraph childOfRoot + + newEdgeToAdd0 = ((fst . fst3) rootVertex, snd3 $ head childOfRootEdges, thd3 $ head childOfRootEdges) + newEdgeToAdd1 = ((fst . fst3) rootVertex, snd3 $ last childOfRootEdges, thd3 $ last childOfRootEdges) + + -- create new Graph, deleting child node deltes three edges around it + newGraph = insEdges [newEdgeToAdd0, newEdgeToAdd1] $ delNode childOfRoot inGraph + in -- trace ("Removing drop 1edge root :" <> (show $ snd3 $ head $ out inGraph ((fst . fst3) rootVertex))) + contractRootOut1Edge $ reindexGraph newGraph + else -- case where mupltiple roots--combine by deleting in0out1 node and creting edge to its child from regular root. + + let rootVertex = head out1RootList + in0out1RootIndex = (fst . fst3) rootVertex + + out2RootVertex = (fst . fst3 . head) out2RootList + + -- removes one of root out edges and inserts teh in0 out1 node adding two oew edges + root02EdgeToDelete = last $ out inGraph out2RootVertex + newEdgeFrom02Root = (out2RootVertex, in0out1RootIndex, thd3 root02EdgeToDelete) + newEdgeFrom01Root = (in0out1RootIndex, snd3 root02EdgeToDelete, thd3 root02EdgeToDelete) + + -- this relies on (a) root in first HTU + nonOTUOut0Nodes = fmap (fst . fst3) $ filter ((>= out2RootVertex) . (fst . fst3)) $ filter ((== 0) . thd3) $ filter ((> 0) . snd3) inOutDeg + + newGraph = insEdges [newEdgeFrom02Root, newEdgeFrom01Root] $ delEdge (toEdge root02EdgeToDelete) $ delNodes nonOTUOut0Nodes inGraph + in -- trace ("Removing extra edge root :" <> (show $ (root02EdgeToDelete, newEdgeFrom02Root, newEdgeFrom01Root))) + contractRootOut1Edge $ reindexGraph $ contractIn1Out1Edges $ newGraph + + +-- ) + +{- | contractIn1Out1Edges contracts indegree 1, outdegree 1, edges and removes the node in the middle +does one at a time and makes a graph and recurses +-} +contractIn1Out1Edges ∷ Gr a b → Gr a b +contractIn1Out1Edges inGraph = + if G.isEmpty inGraph + then G.empty + else + let inOutDeg = getInOutDeg inGraph <$> labNodes inGraph + degree11VertexList = filter ((== 1) . thd3) $ filter ((== 1) . snd3) inOutDeg + in -- trace ("vertex 11:" <> show degree11VertexList) ( + if null degree11VertexList + then inGraph + else + let nodeToDelete = fst3 $ head degree11VertexList + inEdgeToDelete = head $ inn inGraph $ fst nodeToDelete + outEdgeToDelete = head $ out inGraph $ fst nodeToDelete + newEdgeToAdd = (fst3 inEdgeToDelete, snd3 outEdgeToDelete, thd3 inEdgeToDelete) + reindexedNodes = reindexNodes (fst nodeToDelete) [] $ labNodes inGraph + reindexedEdges = reindexEdges (fst nodeToDelete) [] (newEdgeToAdd : (labEdges inGraph)) + + newGraph = mkGraph reindexedNodes reindexedEdges + in -- newGraph = insEdge newEdgeToAdd $ delLNode nodeToDelete inGraph + + -- trace ("Deleting Node " <> show (fst nodeToDelete) <> " " <> show (inEdgeToDelete, outEdgeToDelete) <> " inserting " <> show newEdgeToAdd) + contractIn1Out1Edges newGraph + + +-- ) + +{- | reindexNodes takes a node (assumes index and fst of node are the same) and a list of + nodes deleting the input node and reindexing all the other nodes with indices > than the input are reduced by 1 +-} +reindexNodes ∷ Int → [LNode a] → [LNode a] → [LNode a] +reindexNodes inNodeIndex curList nodeList = + if null nodeList + then reverse curList + else + let firstNode@(index, label) = head nodeList + in if index < inNodeIndex + then reindexNodes inNodeIndex (firstNode : curList) (drop 1nodeList) + else + if index == inNodeIndex + then reindexNodes inNodeIndex curList (drop 1nodeList) + else reindexNodes inNodeIndex ((index - 1, label) : curList) (drop 1nodeList) + + +{- | reindexEdges takes the index of a node that has/is being delted and reindexes indices +that are not incident on the node and deleted if incedent on node index +-} +reindexEdges ∷ Int → [LEdge b] → [LEdge b] → [LEdge b] +reindexEdges inNodeIndex curList edgeList = + if null edgeList + then curList + else + let (a, b, c) = head edgeList + a' = + if a < inNodeIndex + then a + else a - 1 + b' = + if b < inNodeIndex + then b + else b - 1 + in -- incident on node to be deleted + if a == inNodeIndex || b == inNodeIndex + then reindexEdges inNodeIndex curList (drop 1edgeList) + else -- reindexed edge added in + reindexEdges inNodeIndex ((a', b', c) : curList) (drop 1edgeList) + + +-- | artPoint calls ap to get articulation points of graph +artPoint ∷ (Eq b) ⇒ Gr a b → [Node] +artPoint inGraph = AP.ap $ B.undir inGraph + + +{- | makeNodeEdgePair takes a graph and extracts list of nodes and edges +returning a pair +-} +makeNodeEdgePair ∷ Gr a b → ([LNode a], [LEdge b]) +makeNodeEdgePair inGraph = + if isEmpty inGraph + then ([], []) + else (labNodes inGraph, labEdges inGraph) + + +{- | makeNodeEdgePairVect takes a graph and extracts list of nodes and edges +returning a pair of a vector of nodes and Vector of edges +-} +makeNodeEdgePairVect ∷ Gr a b → (V.Vector (LNode a), V.Vector (LEdge b)) +makeNodeEdgePairVect inGraph = + if isEmpty inGraph + then (V.empty, V.empty) + else (V.fromList $ labNodes inGraph, V.fromList $ labEdges inGraph) + + +-- | extractLeafGraph takes a Decorated graphs and cretes a graph from the leaves and no edges +extractLeafGraph ∷ Gr a b → Gr a b +extractLeafGraph inGraph = + if isEmpty inGraph + then G.empty + else + let (_, leafNodes, _, _) = splitVertexList inGraph + in mkGraph leafNodes [] + + +-- | makes graph undirected +undir ∷ (Eq b) ⇒ Gr a b → Gr a b +undir inGraph = B.undir inGraph + + +-- | finds bi connected components of a graph +bcc ∷ Gr a b → [Gr a b] +bcc inGraph = BCC.bcc inGraph + + +{- | removeNonLeafOut0NodesAfterRoot' removed nodes (and edges attached) that are ourtdegree = zero +but have index > root +differnet way to do same as removeNonLeafOut0NodesAfterRoot +-} +removeNonLeafOut0NodesAfterRoot' ∷ (Eq a) ⇒ Gr a b → Gr a b +removeNonLeafOut0NodesAfterRoot' inGraph = + if isEmpty inGraph + then empty + else + let (rootNodeList, putativeLeafNodeList, _, _) = splitVertexList inGraph + rootIndex = (fst . head) rootNodeList + leafList = filter ((< rootIndex) . fst) putativeLeafNodeList + in removeNonLeafOut0Nodes leafList inGraph + + +{- | removeNonLeafOut0NodesAfterRoot removed nodes (and edges attached) that are ourtdegree = zero +but have index > root +this should be faster than removeNonLeafOut0NodesAfterRoot' by a factor of n +-} +removeNonLeafOut0NodesAfterRoot ∷ (Eq a) ⇒ Gr a b → Gr a b +removeNonLeafOut0NodesAfterRoot inGraph = + if isEmpty inGraph + then empty + else + let (rootNodeList, putativeLeafNodeList, _, _) = splitVertexList inGraph + rootIndex = (fst . head) rootNodeList + + -- this should catch anything out = 0 + zeroOutNodeList = filter ((> rootIndex) . fst) putativeLeafNodeList + in -- trace ("RNLOoN: " <> (show (rootIndex, fmap fst zeroOutNodeList)) ) -- <> "\n" <> (prettyIndices inGraph)) ( + if null zeroOutNodeList + then inGraph + else + let newGraph = delNodes (fmap fst zeroOutNodeList) inGraph + in removeNonLeafOut0NodesAfterRoot $ reindexGraph newGraph + + +-- ) + +{- | removeNonLeafOut0Nodes removed nodes (and edges attached) that are ourtdegree = zero +but not in the leaf node list +does not reindex graph +-} +removeNonLeafOut0Nodes ∷ (Eq a) ⇒ [LNode a] → Gr a b → Gr a b +removeNonLeafOut0Nodes leafList inGraph = + if null leafList + then error "Null leaf list in removeNonLeafOut0Nodes" + else + if isEmpty inGraph + then empty + else + let nonLeafList = (labNodes inGraph) L.\\ leafList + outdegreePairList = zip nonLeafList (fmap (outdeg inGraph) nonLeafList) + (zeroOutNodeList, _) = unzip $ filter ((== 0) . snd) outdegreePairList + in if null zeroOutNodeList + then inGraph + else + let newGraph = delNodes (fmap fst zeroOutNodeList) inGraph + in removeNonLeafOut0Nodes leafList newGraph + + +{- | reindexGraph takes a graph and reindexes nodes and edges such that nodes +are sequential and the firt field matches their node index +-} +reindexGraph ∷ Gr a b → Gr a b +reindexGraph inGraph = + if isEmpty inGraph + then empty + else + let nodeList = labNodes inGraph + newIndexList = [0 .. (length nodeList - 1)] + nodeIndexPair = zip (fmap fst nodeList) newIndexList + nodeIndexMap = MAP.fromList nodeIndexPair + newNodeList = fmap (makeNewNode nodeIndexMap) nodeList + newEdgeList = fmap (makeNewEdge nodeIndexMap) (labEdges inGraph) + in mkGraph newNodeList newEdgeList + where + makeNewNode indexMap (a, b) = (fromJust $ MAP.lookup a indexMap, b) + makeNewEdge indexMap (a, b, c) = (fromJust $ MAP.lookup a indexMap, fromJust $ MAP.lookup b indexMap, c) + + +-- | isBridge uses naive (component number) procedure to determine if edge is a bridge O(n) +isBridge ∷ Gr a b → Edge → Bool +isBridge inGraph inNode = + if isEmpty inGraph + then error ("Empty graph in isBridge") + else + let numComponents = noComponents inGraph + numComponents' = noComponents $ delEdge inNode inGraph + in numComponents' > numComponents + + +-- FGL articulation point code--could be modified to get brisge edges in linear time} +------------------------------------------------------------------------------ +-- Tree for storing the DFS numbers and back edges for each node in the graph. +-- Each node in this tree is of the form (v,n,b) where v is the vertex number, +-- n is its DFS number and b is the list of nodes (and their DFS numbers) that +-- lead to back back edges for that vertex v. +------------------------------------------------------------------------------ +data DFSTree a = B (a, a, [(a, a)]) [DFSTree a] + deriving (Eq, Show, Read) + + +------------------------------------------------------------------------------ +-- Tree for storing the DFS and low numbers for each node in the graph. +-- Each node in this tree is of the form (v,n,l) where v is the vertex number, +-- n is its DFS number and l is its low number. +------------------------------------------------------------------------------ +data LOWTree a = Brc (a, a, a) [LOWTree a] + deriving (Eq, Show, Read) + + +------------------------------------------------------------------------------ +-- Finds the back edges for a given node. +------------------------------------------------------------------------------ +getBackEdges ∷ Node → [[(Node, Int)]] → [(Node, Int)] +getBackEdges _ [] = [] +getBackEdges v ls = map head (filter (elem (v, 0)) (drop 1ls)) + + +------------------------------------------------------------------------------ +-- Builds a DFS tree for a given graph. Each element (v,n,b) in the tree +-- contains: the node number v, the DFS number n, and a list of backedges b. +------------------------------------------------------------------------------ +dfsTree ∷ Int → Node → [Node] → [[(Node, Int)]] → Gr a b → ([DFSTree Int], Gr a b, Int) +dfsTree n _ [] _ g = ([], g, n) +dfsTree n _ _ _ g | isEmpty g = ([], g, n) +dfsTree n u (v : vs) ls g = case G.match v g of + (Nothing, g1) → dfsTree n u vs ls g1 + (Just c, g1) → (B (v, n + 1, bck) ts : ts', g3, k) + where + bck = getBackEdges v ls + (ts, g2, m) = dfsTree (n + 1) v sc ls' g1 + (ts', g3, k) = dfsTree m v vs ls g2 + ls' = ((v, n + 1) : sc') : ls + sc' = map (\x → (x, 0)) sc + sc = G.suc' c + + +------------------------------------------------------------------------------ +-- Finds the minimum between a dfs number and a list of back edges' dfs +-- numbers. +------------------------------------------------------------------------------ +minbckEdge ∷ Int → [(Node, Int)] → Int +minbckEdge n [] = n +minbckEdge n bs = min n (minimum (map snd bs)) + + +------------------------------------------------------------------------------ +-- Returns the low number for a node in a subtree. +------------------------------------------------------------------------------ +getLow ∷ LOWTree Int → Int +getLow (Brc (_, _, l) _) = l + + +------------------------------------------------------------------------------ +-- Builds a low tree from a DFS tree. Each element (v,n,low) in the tree +-- contains: the node number v, the DFS number n, and the low number low. +------------------------------------------------------------------------------ +lowTree ∷ DFSTree Int → LOWTree Int +lowTree (B (v, n, []) []) = Brc (v, n, n) [] +lowTree (B (v, n, bcks) []) = Brc (v, n, minbckEdge n bcks) [] +lowTree (B (v, n, bcks) trs) = Brc (v, n, lowv) ts + where + lowv = min (minbckEdge n bcks) lowChild + lowChild = minimum (map getLow ts) + ts = map lowTree trs + + +------------------------------------------------------------------------------ +-- Builds a low tree for a given graph. Each element (v,n,low) in the tree +-- contains: the node number v, the DFS number n, and the low number low. +------------------------------------------------------------------------------ +getLowTree ∷ Gr a b → Node → LOWTree Int +getLowTree g v = lowTree (head dfsf) + where + (dfsf, _, _) = dfsTree 0 0 [v] [] g + + +------------------------------------------------------------------------------ +-- Tests if a node in a subtree is an articulation point. An non-root node v +-- is an articulation point iff there exists at least one child w of v such +-- that lowNumber(w) >= dfsNumber(v). The root node is an articulation point +-- iff it has two or more children. +------------------------------------------------------------------------------ +isap ∷ LOWTree Int → Bool +isap (Brc (_, _, _) []) = False +isap (Brc (_, 1, _) ts) = length ts > 1 +isap (Brc (_, n, _) ts) = not (null ch) + where + -- modify for bridges + -- where ch = filter ( >= n) (map getLow ts) + ch = filter (>= n) (map getLow ts) + + +------------------------------------------------------------------------------ +-- Finds the articulation points by traversing the low tree. +------------------------------------------------------------------------------ +arp ∷ LOWTree Int → [Node] +arp (Brc (v, 1, _) ts) + | length ts > 1 = v : concatMap arp ts + | otherwise = concatMap arp ts +arp (Brc (v, n, l) ts) + | isap (Brc (v, n, l) ts) = v : concatMap arp ts + | otherwise = concatMap arp ts + + +------------------------------------------------------------------------------ +-- Finds the articulation points of a graph starting at a given node. +------------------------------------------------------------------------------ +artpoints ∷ Gr a b → Node → [Node] +artpoints g v = arp (getLowTree g v) + + +{- | + Finds the articulation points for a connected undirected graph, + by using the low numbers criteria: + + a) The root node is an articulation point iff it has two or more children. + + b) An non-root node v is an articulation point iff there exists at least + one child w of v such that lowNumber(w) >= dfsNumber(v). +-} +ap' ∷ Gr a b → [Node] +ap' g = artpoints g v where ((_, v, _, _), _) = G.matchAny g + + +-- | cyclic maps to cyclic function in module Cyclic.hs +cyclic ∷ Gr a b → Bool +cyclic inGraph = + -- trace ("Cyclic:" <> (show $ C.cyclic inGraph)) + C.cyclic inGraph + + +{- | testEdge nodeList fullEdgeList) counter +chnage to input graph and delete edge from graph as opposed to making new graphs each time. +should be much faster using P.delLEdge (since only one edge to delete) +-} +testEdge ∷ (Eq b) ⇒ P.Gr a b → G.LEdge b → [G.LEdge b] +testEdge fullGraph candidateEdge@(e, u, _) = + let newGraph = G.delLEdge candidateEdge fullGraph + bfsNodes = BFS.bfs e newGraph + foundU = L.find (== u) bfsNodes + in [candidateEdge | isNothing foundU] + + +{- | transitiveReduceGraph take list of nodes and edges, deletes each edge (e,u) in turn makes graph, +checks for path between nodes e and u, if there is delete edge otherwise keep edge in list for new graph +transitive reduction Aho et al. 1972 +this not iterative with new graphs--shold it be? +-} +transitiveReduceGraph ∷ (Eq b) ⇒ Gr a b → Gr a b +transitiveReduceGraph fullGraph = + let requiredEdges = fmap (testEdge fullGraph) (labEdges fullGraph) + newGraph = G.mkGraph (labNodes fullGraph) (concat requiredEdges) + in newGraph + + +{- | getCoevalConstraintEdges takes a graph and a network node and creates two lists: one of edges +"before" (ie towards root) and a second "after (ie away from root) +this defines a coeval constraint. No network edge can be added that would be directed +from the before group to the after +-} +getCoevalConstraintEdges ∷ (Eq a, Eq b, Show a) ⇒ Gr a b → LNode a → ([LEdge b], [LEdge b]) +getCoevalConstraintEdges inGraph inNode = + if isEmpty inGraph + then error "Empty input graph in getCoevalConstraintEdges" + else + let (_, edgeBeforeList) = nodesAndEdgesBefore inGraph [inNode] + (_, edgeAfterList) = nodesAndEdgesAfter inGraph [inNode] + in (edgeBeforeList, edgeAfterList) + + +-- | getGraphCoevalConstraints takes a graph and returns coeval constraints based on network nodes +getGraphCoevalConstraints ∷ (Eq a, Eq b, Show a, NFData b) ⇒ Gr a b → PhyG [([LEdge b], [LEdge b])] +getGraphCoevalConstraints inGraph = + if isEmpty inGraph + then error "Empty input graph in getGraphCoevalConstraints" + else + let (_, _, _, networkNodeList) = splitVertexList inGraph + in if null networkNodeList + then pure [] + else + let -- coevalAction ∷ (Eq a, Eq b, Show a) => LNode a -> ([LEdge b],[LEdge b]) + coevalAction = getCoevalConstraintEdges inGraph + in do + pTraverse ← getParallelChunkMap + let coevalResult = pTraverse coevalAction networkNodeList + pure coevalResult + + +-- PU.seqParMap PU.myStrategy (getCoevalConstraintEdges inGraph) networkNodeList -- `using` PU.myParListChunkRDS + +{- | getGraphCoevalConstraintsNodes takes a graph and returns coeval constraints based on network nodes +and nodes as a triple +-} +getGraphCoevalConstraintsNodes ∷ (Eq a, Eq b, Show a, NFData b) ⇒ Gr a b → PhyG [(LNode a, [LEdge b], [LEdge b])] +getGraphCoevalConstraintsNodes inGraph = + if isEmpty inGraph + then error "Empty input graph in getGraphCoevalConstraints" + else + let (_, _, _, networkNodeList) = splitVertexList inGraph + in if null networkNodeList + then pure [] + else + let -- coevalAction ∷ (Eq a, Eq b, Show a) => LNode a -> ([LEdge b],[LEdge b]) + coevalAction = getCoevalConstraintEdges inGraph + in do + pTraverse ← getParallelChunkMap + let coevalResult = pTraverse coevalAction networkNodeList + let (edgeBeforeList, edgeAfterList) = unzip coevalResult + -- let (edgeBeforeList, edgeAfterList) = unzip (PU.seqParMap PU.myStrategy (getCoevalConstraintEdges inGraph) networkNodeList) -- `using` PU.myParListChunkRDS) + pure $ zip3 networkNodeList edgeBeforeList edgeAfterList + + +{- | meetsAllCoevalConstraintsNodes checks constraint pair list and examines +whether one edge is from before and one after--if so fails False +else True if all pass +new edghe that woudl be creatated in edge1 -> edge2 +checks if starting and ending vertices of edges to be linked are on same side of each +coeval contraint +-} +meetsAllCoevalConstraintsNodes ∷ (Eq b) ⇒ [(Node, Node, [Node], [Node], [Node], [Node])] → LEdge b → LEdge b → Bool +meetsAllCoevalConstraintsNodes constraintList edge1@(u, v, _) edge2@(u', v', _) = + if null constraintList + then True + else + let (_, _, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter) = head constraintList + in -- checks if edge nodes would stradle existing coeveal nodes + if (u `elem` aNodesAfter) && (u' `elem` bNodesBefore) + then False + else + if (u' `elem` aNodesAfter) && (u `elem` bNodesBefore) + then False + else + if (v `elem` aNodesAfter) && (v' `elem` bNodesBefore) + then False + else + if (v' `elem` aNodesAfter) && (v `elem` bNodesBefore) + then False + else + if (u `elem` aNodesBefore) && (u' `elem` bNodesAfter) + then False + else + if (u' `elem` aNodesBefore) && (u `elem` bNodesAfter) + then False + else + if (v `elem` aNodesBefore) && (v' `elem` bNodesAfter) + then False + else + if (v' `elem` aNodesBefore) && (v `elem` bNodesAfter) + then False + else meetsAllCoevalConstraintsNodes (drop 1constraintList) edge1 edge2 + + +{- | meetsAllCoevalConstraintsEdges checks constraint pair list and examines +whether one edge is from before and one after--if so fails False +else True if all pass +not correct I don't htink +-} +meetsAllCoevalConstraintsEdges ∷ (Eq b) ⇒ [([LEdge b], [LEdge b])] → LEdge b → LEdge b → Bool +meetsAllCoevalConstraintsEdges constraintList edge1 edge2 = + if null constraintList + then True + else + let (beforeList, afterList) = head constraintList + in if edge1 `elem` beforeList && edge2 `elem` afterList + then False + else + if edge2 `elem` beforeList && edge1 `elem` afterList + then False + else meetsAllCoevalConstraintsEdges (drop 1constraintList) edge1 edge2 + + +-- | insertDeleteEdges takes a graphs and list of nodes and edges to add and delete and creates new graph +insertDeleteEdges ∷ (Show a, Show b) ⇒ Gr a b → ([LEdge b], [Edge]) → Gr a b +insertDeleteEdges inGraph (edgesToAdd, edgesToDelete) = + let editedGraph = insEdges edgesToAdd $ delEdges edgesToDelete inGraph + in -- trace ("AGE: " <> (show editStuff) <> "\nIn graph:\n" <> (prettify inGraph) <> "New Graph:\n" <> (prettify editedGraph)) + editedGraph + + +-- | notMatchEdgeIndices reutnr True if not in edge list but doesn't compare label only indices +notMatchEdgeIndices ∷ [Edge] → LEdge b → Bool +notMatchEdgeIndices unlabeledEdegList labelledEdge = + if toEdge labelledEdge `elem` unlabeledEdegList + then False + else True + + +-- | isGraphTimeConsistent retuns False if graph fails time consistency +isGraphTimeConsistent ∷ (Show a, Eq a, Eq b, NFData a) ⇒ Gr a b → PhyG Bool +isGraphTimeConsistent inGraph = + if isEmpty inGraph + then pure True + else + if isTree inGraph + then pure True + else + let coevalNodeConstraintList = coevalNodePairs inGraph + -- addAction :: (LNode a, LNode a) -> (LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]) + addAction = addBeforeAfterToPair inGraph + in do + pMap ← getParallelChunkMap + let coevalNodeConstraintList' = pMap addAction coevalNodeConstraintList + -- coevalNodeConstraintList' = PU.seqParMap PU.myStrategy (addBeforeAfterToPair inGraph) coevalNodeConstraintList -- `using` PU.myParListChunkRDS + let coevalPairsToCompareList = getListPairs coevalNodeConstraintList' + let timeOffendingEdgeList = getEdgesToRemoveForTime inGraph coevalPairsToCompareList + + pure $ null timeOffendingEdgeList + + +{- | addBeforeAfterToPair adds before and after node list to pari of nodes for later use +in time contraint edge removal +-} +addBeforeAfterToPair + ∷ (Show a, Eq a, Eq b) + ⇒ Gr a b + → (LNode a, LNode a) + → (LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]) +addBeforeAfterToPair inGraph (a, b) = + if isEmpty inGraph + then error "Empty graph in addBeforeAfterToPair" + else + let (aNodesBefore, _) = nodesAndEdgesBefore inGraph [a] + (bNodesBefore, _) = nodesAndEdgesBefore inGraph [b] + (aNodesAfter, _) = nodesAndEdgesAfter inGraph [a] + (bNodesAfter, _) = nodesAndEdgesAfter inGraph [b] + in (a, b, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter) + + +{- | getEdgesToRemoveForTime recursive looks at each pair of nodes that should +be potentially coeval based on network vertices. And see if it conflicts with +other pairs that should also be coeval. +two pairs of nodes (a,b) and (a',b'), one for each net node tio be compared +if a "before" a' then b must be before b' +if a "after" a' then b must be after b' +otherwise its a time violation +returns net edge to delte in second pair +-} +getEdgesToRemoveForTime + ∷ (Show a, Eq a, Eq b) + ⇒ Gr a b + → [((LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]), (LNode a, LNode a, [LNode a], [LNode a], [LNode a], [LNode a]))] + → [Edge] +getEdgesToRemoveForTime inGraph inNodePairList = + if isEmpty inGraph + then [] + else + if null inNodePairList + then [] + else + let ((_, _, aNodesBefore, bNodesBefore, aNodesAfter, bNodesAfter), (a', b', _, _, _, _)) = head inNodePairList + in -- trace ("GETRT: " <> (show inNodePairList)) ( + -- condition if a before a' then b before b' to be ok + if (a' `elem` aNodesAfter) && (b' `elem` bNodesBefore) + then + let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') + in -- trace ("\tRemoving network edge due to time consistancy: " <> (show edgeToRemove)) + -- trace ("GERT Edges0:" <> (show edgeToRemove) <> " " <> (show $ fmap toEdge $ out inGraph $ fst b') <> " Net: " <> (show $ fmap (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b')) + edgeToRemove : getEdgesToRemoveForTime inGraph (drop 1inNodePairList) + else + if (a' `elem` aNodesBefore) && (b' `elem` bNodesAfter) + then + let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') + in -- trace ("GERT Edges1:" <> (show edgeToRemove) <> " " <> (show $ fmap toEdge $ out inGraph $ fst b')) + -- trace ("\tRemoving network edge due to time consistancy : " <> (show edgeToRemove)) + edgeToRemove : getEdgesToRemoveForTime inGraph (drop 1inNodePairList) + else + if (b' `elem` aNodesAfter) && (a' `elem` bNodesBefore) + then + let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') + in -- trace ("\tRemoving network edge due to time consistancy: " <> (show edgeToRemove)) + -- trace ("GERT Edges0:" <> (show edgeToRemove) <> " " <> (show $ fmap toEdge $ out inGraph $ fst b') <> " Net: " <> (show $ fmap (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b')) + edgeToRemove : getEdgesToRemoveForTime inGraph (drop 1inNodePairList) + else + if (b' `elem` aNodesBefore) && (a' `elem` bNodesAfter) + then + let edgeToRemove = (head $ filter (isNetworkEdge inGraph) $ fmap toEdge $ out inGraph $ fst b') + in -- trace ("GERT Edges1:" <> (show edgeToRemove) <> " " <> (show $ fmap toEdge $ out inGraph $ fst b')) + -- trace ("\tRemoving network edge due to time consistancy : " <> (show edgeToRemove)) + edgeToRemove : getEdgesToRemoveForTime inGraph (drop 1inNodePairList) + else getEdgesToRemoveForTime inGraph (drop 1inNodePairList) + + +-- ) + +{- | getEdgeSplitList takes a graph and returns list of edges +that split a graph increasing the number of components by 1 +this is quadratic +should change to Tarjan's algorithm (linear) +everything else in there is O(n^2-3) so maybe doesn't matter +filters out edges with parent nodes that are out degree 1 and root edges +-} +getEdgeSplitList ∷ (Show a, Show b, Eq b) ⇒ Gr a b → [LEdge b] +getEdgeSplitList inGraph = + if isEmpty inGraph + then error ("Empty graph in getEdgeSplitList") + else + let origNumComponents = noComponents inGraph + origEdgeList = labEdges inGraph + edgeDeleteComponentNumberList = fmap noComponents $ fmap (flip delEdge inGraph) (fmap toEdge origEdgeList) + bridgeList = fmap snd $ filter ((> origNumComponents) . fst) $ zip edgeDeleteComponentNumberList origEdgeList + + -- filter out edges starting in an outdegree 1 node (network or in out 1) node + -- this would promote an HTU to a leaf later. Its a bridge, but not what need + -- bridgeList' = filter ((not . isRoot inGraph) .fst3 ) $ filter ((not. isNetworkNode inGraph) . snd3) $ filter ((not . isOutDeg1Node inGraph) . fst3) bridgeList + bridgeList' = + filter ((not . isRoot inGraph) . fst3) $ + filter ((not . isNetworkEdge inGraph) . toEdge) $ + filter ((not . isOutDeg1Node inGraph) . fst3) bridgeList + in -- trace ("BridgeList" <> (show $ fmap toEdge bridgeList') <> "\nGraph\n" <> (prettyIndices inGraph)) + bridgeList' + + +{- | splitGraphOnEdge takes a graph and an edge and returns a single graph but with two components +the roots of each component are returned with two graphs, with broken edge contracted, and 'naked' +node returned (this is teh original connection vertex on base graph connected to pruned graph). +The naked node is used for rejoining the two components during rearrangement +(SplitGraph, root of component that has original root, root of component that was cut off, naked node left over) +this function does not check whether edge is a 'bridge' +-} +splitGraphOnEdge ∷ (Show b) ⇒ Gr a b → LEdge b → (Gr a b, Node, Node, Node) +splitGraphOnEdge inGraph (e, v, l) = + if isEmpty inGraph + then error "Empty graph in splitGraphOnEdge" + else + if (length $ getRoots inGraph) /= 1 + then error ("Incorrect number roots in splitGraphOnEdge--must be 1: " <> (show $ fmap fst $ getRoots inGraph)) + else + let childrenENode = (descendants inGraph e) L.\\ [v] + parentsENode = parents inGraph e + newEdge = (head parentsENode, head childrenENode, l) + edgesToDelete = [(head parentsENode, e), (e, head childrenENode)] -- (e,v) + + -- make new graph + splitGraph = insEdge newEdge $ delEdges edgesToDelete inGraph + in if length childrenENode /= 1 + then error ("Incorrect number of children of edge to split--must be 1: " <> (show ((e, v), childrenENode))) + else + if length parentsENode /= 1 + then error ("Incorrect number of parents of edge to split--must be 1: " <> (show ((e, v), parentsENode))) + else -- trace ("SGE:" <> (show (childrenENode, parentsENode, newEdge, edgesToDelete))) + (splitGraph, fst $ head $ getRoots inGraph, v, e) + + +{- | splitGraphOnEdge' like splitGraphOnEdge above but returns edges created and destroyed as well +used in Goodman-Bermer and could make swap more efficient as well. +-} +splitGraphOnEdge' ∷ (Show b) ⇒ Gr a b → LEdge b → (Gr a b, Node, Node, Node, LEdge b, [Edge]) +splitGraphOnEdge' inGraph (e, v, l) = + if isEmpty inGraph + then error "Empty graph in splitGraphOnEdge" + else + if (length $ getRoots inGraph) /= 1 + then error ("Incorrect number roots in splitGraphOnEdge--must be 1: " <> (show $ fmap fst $ getRoots inGraph)) + else + let childrenENode = (descendants inGraph e) L.\\ [v] + parentsENode = parents inGraph e + newEdge = (head parentsENode, head childrenENode, l) + edgesToDelete = [(head parentsENode, e), (e, head childrenENode)] -- (e,v) + + -- make new graph + splitGraph = insEdge newEdge $ delEdges edgesToDelete inGraph + in if length childrenENode /= 1 + then error ("Incorrect number of children of edge to split--must be 1: " <> (show ((e, v), childrenENode))) + else + if length parentsENode /= 1 + then error ("Incorrect number of parents of edge to split--must be 1: " <> (show ((e, v), parentsENode))) + else -- trace ("SGE:" <> (show (childrenENode, parentsENode, newEdge, edgesToDelete))) + (splitGraph, fst $ head $ getRoots inGraph, v, e, newEdge, edgesToDelete) + + +{- | joinGraphOnEdge takes a graph and adds an edge reducing the component number +expected ot be two components to one in SPR/TBR +assumes that first node of edge (e,v,l) is 'naked' ie avaiable to make edges but is in graph +created from splitGraphOnEdge +-} +joinGraphOnEdge ∷ (Show a, Show b) ⇒ Gr a b → LEdge b → Node → Gr a b +joinGraphOnEdge inGraph (x, y, l) parentofPrunedSubGraph = + if isEmpty inGraph + then error ("Empty graph in joinGraphOnEdge") + else + let edgeToCreate0 = (x, parentofPrunedSubGraph, l) + edgeToCreate1 = (parentofPrunedSubGraph, y, l) + in -- edgeToCreate2 = (parentofPrunedSubGraph, graphToJoinRoot, l) + + -- make new graph + -- trace ("JGE:" <> (show edgeToInvade) <> " " <> (show (parentofPrunedSubGraph, graphToJoinRoot))) -- <> "\n" <> (prettify inGraph)) + insEdges [edgeToCreate0, edgeToCreate1] $ delEdge (x, y) inGraph + + +{- | parentsInChain checks for parents in chain ie network edges +that implies a network event between nodes where one is the ancestor of the other +a time violation +-} +parentInChain ∷ (Show a, Eq a, Eq b) ⇒ Gr a b → Bool +parentInChain inGraph = + if isEmpty inGraph + then error "Null graph in parentInChain" + else + let (_, _, _, netVertexList) = splitVertexList inGraph + parentNetVertList = fmap (labParents inGraph) $ fmap fst netVertexList + + -- get list of nodes that are transitively equal in age + concurrentList = mergeConcurrentNodeLists parentNetVertList [] + concurrentPairList = concatMap getListPairs concurrentList + + -- get pairs that violate concurrency + violatingConcurrentPairs = concatMap (concurrentViolatePair inGraph) concurrentPairList + in if null violatingConcurrentPairs + then False + else True + + +{- | getSisterSisterEdgeList take a graph and returns list of edges where two network nodes +have the same two parents +-} +getSisterSisterEdgeList ∷ Gr a b → [Edge] +getSisterSisterEdgeList inGraph = + if isEmpty inGraph + then [] + else + let (_, _, _, netVertexList) = splitVertexList inGraph + in if null netVertexList + then [] + else getSisterSisterEdgeByNetVertex inGraph netVertexList + + +-- | getSisterSisterEdgeByNetVertex takes a list of vertices and recursively generate a list of edges to delete +getSisterSisterEdgeByNetVertex ∷ Gr a b → [LNode a] → [Edge] +getSisterSisterEdgeByNetVertex inGraph netNodeList = + if null netNodeList + then [] + else + let firstNode = fst $ head netNodeList + parentsList = parents inGraph firstNode + grandParentsList = fmap (parents inGraph) parentsList + sameGrandParentList = L.foldl1' L.intersect grandParentsList + in if null sameGrandParentList + then getSisterSisterEdgeByNetVertex inGraph (drop 1netNodeList) + else -- trace ("Found sister-sister") + (toEdge $ head $ inn inGraph firstNode) : getSisterSisterEdgeByNetVertex inGraph (drop 1netNodeList) + + +{- | concurrentViolatePair takes a pair of nodes and sees if either is ancestral to the other--if so returns pair +as list otherwise null list +-} +concurrentViolatePair ∷ (Eq a, Show a, Eq b) ⇒ Gr a b → (LNode a, LNode a) → [(LNode a, LNode a)] +concurrentViolatePair inGraph (node1, node2) = + if isEmpty inGraph + then error "Empty graph in concurrentViolatePair" + else + let (nodesBeforeFirst, _) = nodesAndEdgesBefore inGraph [node1] + (nodesBeforeSecond, _) = nodesAndEdgesBefore inGraph [node2] + in if node2 `elem` nodesBeforeFirst + then [(node1, node2)] + else + if node1 `elem` nodesBeforeSecond + then [(node1, node2)] + else [] + + +{- | mergeConcurrentNodeLists takes a list os lists and returns a list os lists of merged lists +lists are merged if they share any elements +-} +mergeConcurrentNodeLists ∷ (Eq a) ⇒ [[LNode a]] → [[LNode a]] → [[LNode a]] +mergeConcurrentNodeLists inListList currentListList = + if null inListList + then -- trace ("MCNL:" <> (show $ fmap (fmap fst) currentListList)) + currentListList + else -- first case + + if null currentListList + then mergeConcurrentNodeLists (drop 1inListList) [head inListList] + else + let firstList = head inListList + (intersectList, _) = unzip $ filter ((== True) . snd) $ zip (currentListList) (fmap (not . null) $ fmap (L.intersect firstList) currentListList) + + (noIntersectLists, _) = unzip $ filter ((== False) . snd) $ zip (currentListList) (fmap (not . null) $ fmap (L.intersect firstList) currentListList) + + mergedList = + if null intersectList + then firstList + else L.foldl' L.union firstList intersectList + in -- trace ("MCL-F:" <> (show $ fmap fst firstList) <> " inter " <> (show $ fmap (fmap fst) intersectList) <> + -- " noInter " <> (show $ fmap (fmap fst) noIntersectLists) <> " curList " <> (show $ fmap (fmap fst) currentListList)) + mergeConcurrentNodeLists (drop 1inListList) (mergedList : noIntersectLists) + + +{- | sortEdgeListByDistance sorts edges by distance (in edges) from edge pair of vertices +cretes a list of edges into (but traveling away from) an initial eNOde and away from +an initial vNode adding new nodes to those lists as encountered by traversing edges. +the eidea is theat the nodes from a directed edge (eNode, vNode) +the list is creted at each round from the "in" and "out" edge lists +so they are in order of 1 edge 2 edges etc. +-} +sortEdgeListByDistance ∷ Gr a b → [Node] → [Node] → [LEdge b] +sortEdgeListByDistance inGraph eNodeList vNodeList = + if isEmpty inGraph + then error ("Empty graph in edgeListByDistance") + else + if (null eNodeList && null vNodeList) + then [] + else -- get edges 'in' to eNodeList + + let inEdgeList = concatMap (inn inGraph) eNodeList + newENodeList = fmap fst3 inEdgeList + + -- get edges 'out' from vNodeList + outEdgeList = concatMap (out inGraph) vNodeList + newVNodeList = fmap snd3 outEdgeList + in inEdgeList <> outEdgeList <> (sortEdgeListByDistance inGraph newENodeList newVNodeList) + + +{- | switchRootTree takes a new root vertex index of a tree and switches the existing root (and all relevent edges) +to new index +-} +switchRootTree ∷ (Show a) ⇒ Int → Gr a b → Gr a b +switchRootTree newRootIndex inGraph = + if isEmpty inGraph + then empty + else + let rootList = getRoots inGraph + (newRootCurInEdges, newRootCurOutEdges) = getInOutEdges inGraph newRootIndex + oldRootEdges = out inGraph $ fst $ head rootList + in -- not a directed tree + if length rootList /= 1 + then error ("Graph input to switchRootTree is not a tree--not single root:" <> (show rootList)) + else -- same root + + if newRootIndex == (fst $ head rootList) + then inGraph + else -- create new edges and delete the old ones + + let newEdgesToAdd = fmap (flipVertices (fst $ head rootList) newRootIndex) (newRootCurInEdges <> newRootCurOutEdges <> oldRootEdges) + in insEdges newEdgesToAdd $ delLEdges (newRootCurInEdges <> newRootCurOutEdges <> oldRootEdges) inGraph + + +{- | flipVertices takes an old vertex index and a new vertex index and inserts one for the other +in a labelled edge +-} +flipVertices ∷ Node → Node → LEdge b → LEdge b +flipVertices a b (u, v, l) = + let newU = + if u == a + then b + else + if u == b + then a + else u + newV = + if v == a + then b + else + if v == b + then a + else v + in -- trace (show (u,v,l) <> "->" <> show (newU, newV, l)) + (newU, newV, l) + + +{- | rerootTree takes a graph and reroots based on a vertex index (usually leaf outgroup) + if input is a forest then only roots the component that contains the vertex wil be rerooted + unclear how will effect network edges--will need to verify that does not create cycles + multi-rooted components (as opposed to forests) are unaffected with trace warning thrown + after checking for existing root and multiroots, should be O(n) where 'n is the length + of the path between the old and new root +-} +rerootTree ∷ (Show a, Show b, Eq b) ⇒ Node → Gr a b → Gr a b +rerootTree rerootIndex inGraph = + -- trace ("In reroot Graph: " <> show rerootIndex) ( + if isEmpty inGraph + then inGraph + else + let componentList = components inGraph + parentNewRootList = pre inGraph rerootIndex + newRootOrigEdge = head $ inn inGraph rerootIndex + parentRootList = fmap (isRoot inGraph) parentNewRootList + outgroupInComponent = fmap (rerootIndex `elem`) componentList + componentWithOutgroup = filter ((== True) . fst) $ zip outgroupInComponent componentList + in -- (_, inNewRoot, outNewRoot) = getInOutDeg inGraph (labelNode inGraph rerootIndex) + + -- rerooting on root so no indegree edges + -- this for wagner build reroots where can try to reroot on leaf not yet added + if null $ inn inGraph rerootIndex + then inGraph -- error ("Rerooting on indegree 0 node " <> (show rerootIndex) <> "\n" <> prettyIndices inGraph) -- empty + else + if null componentWithOutgroup + then inGraph -- error ("Error rooting wierdness in rerootTree " <> (show rerootIndex) <> "\n" <> prettyIndices inGraph) -- empty + else -- check if new outtaxon has a parent--shouldn't happen-but could if its an internal node reroot + + if null parentNewRootList || (True `elem` parentRootList) + then inGraph + else + ( if null componentWithOutgroup + then error ("Outgroup index " <> show rerootIndex <> " not found in graph") + else -- trace ("RRT: " <> (show (rerootIndex, inNewRoot, outNewRoot))) ( + -- reroot component with new outtaxon + + let componentWithNewOutgroup = snd $ head componentWithOutgroup + (_, originalRootList) = unzip $ filter ((== True) . fst) $ zip (fmap (isRoot inGraph) componentWithNewOutgroup) componentWithNewOutgroup + numRoots = length originalRootList + orginalRoot = head originalRootList + originalRootEdges = out inGraph orginalRoot + in if numRoots == 0 + then error ("No root in rerootTree: Attempting to reroot on edge to node " <> (show rerootIndex) <> "\n" <> prettyIndices inGraph) -- empty + else -- check if outgroup in a multirooted component + -- if wagner build this is ok + + if numRoots > 1 + then inGraph -- error ("Error: Attempting to reroot multi-rooted component") -- inGraph + else -- reroot graph safely automatically will only affect the component with the outgroup + -- delete old root edge and create two new edges from oringal root node. + -- keep orignl root node and delte/crete new edges when they are encounterd + -- trace ("Moving root from " <> (show orginalRoot) <> " to " <> (show rerootIndex)) ( + + let leftChildEdge = (orginalRoot, rerootIndex, edgeLabel $ head originalRootEdges) + rightChildEdge = (orginalRoot, fst3 newRootOrigEdge, edgeLabel $ last originalRootEdges) + + -- this assumes 2 children of old root -- shouled be correct as Phylogenetic Graph + newEdgeOnOldRoot = + if (length originalRootEdges) /= 2 + then + error + ( "Number of root out edges /= 2 in rerootGraph: " + <> (show $ length originalRootEdges) + <> " root index: " + <> (show (orginalRoot, rerootIndex)) + <> "\nGraph:\n" + <> (prettyIndices inGraph) + ) + else (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, thd3 $ head originalRootEdges) + + newRootEdges = [leftChildEdge, rightChildEdge, newEdgeOnOldRoot] + newGraph = insEdges newRootEdges $ delLEdges (newRootOrigEdge : originalRootEdges) inGraph + + -- get edges that need reversing + newGraph' = preTraverseAndFlipEdges [leftChildEdge, rightChildEdge] newGraph + in -- trace ("=") + -- trace ("Deleting " <> (show (newRootOrigEdge : originalRootEdges)) <> "\nInserting " <> (show newRootEdges)) + -- trace ("In " <> (GFU.showGraph inGraph) <> "\nNew " <> (GFU.showGraph newGraph) <> "\nNewNew " <> (GFU.showGraph newGraph')) + newGraph' + ) + + +-- ) -- ) + +{- | rerootDisplayTree like reroot but inputs original root position instead of figuring it out. +assumes graph is tree--not useable fo Wagner builds since they have multiple components while building +-} +rerootDisplayTree ∷ (NFData a, Show a, Show b, Eq a, Eq b) ⇒ Node → Node → Gr a b → Gr a b +rerootDisplayTree orginalRootIndex rerootIndex inGraph = + -- trace ("In reroot Graph: " <> show rerootIndex) ( + if isEmpty inGraph + then inGraph + else + let -- componentList = components inGraph' + + parentNewRootList = pre inGraph rerootIndex + newRootOrigEdge = head $ inn inGraph rerootIndex + parentRootList = fmap (isRoot inGraph) parentNewRootList + -- outgroupInComponent = fmap (rerootIndex `elem`) componentList + -- componentWithOutgroup = filter ((== True).fst) $ zip outgroupInComponent componentList + (_, inNewRoot, outNewRoot) = getInOutDeg inGraph (labelNode inGraph rerootIndex) + in -- these here for checking in1out1 edges on reroot + -- parentNewRootIn1Out1 = isIn1Out1 inGraph (head $ parents inGraph rerootIndex) + -- childNewRootIn1Out1 = if null $ descendants inGraph rerootIndex then False + -- else isIn1Out1 inGraph (head $ descendants inGraph rerootIndex) + + -- don't reroot on in=out=1 since same as it descendent edge + if (inNewRoot == 1) && (outNewRoot == 1) + then inGraph + else -- else if parentNewRootIn1Out1 || childNewRootIn1Out1 then inGraph + + -- rerooting on root so no indegree edges + -- this for wagner build reroots where can try to reroot on leaf not yet added + + if null $ inn inGraph rerootIndex + then inGraph -- error ("Rerooting on indegree 0 node " <> (show rerootIndex) <> "\n" <> prettyIndices inGraph) -- empty + else -- check if new outtaxon has a parent--shouldn't happen-but could if its an internal node reroot + + if null parentNewRootList || (True `elem` parentRootList) + then inGraph + else -- else if null componentWithOutgroup then error ("Outgroup index " <> show rerootIndex <> " not found in graph") + + -- trace ("RRT: " <> (show (rerootIndex, inNewRoot, outNewRoot))) ( + -- reroot component with new outtaxon + + let -- componentWithNewOutgroup = snd $ head componentWithOutgroup + -- (_, originalRootList) = unzip $ filter ((==True).fst) $ zip (fmap (isRoot inGraph) componentWithNewOutgroup) componentWithNewOutgroup + -- numRoots = 1 -- length originalRootList + orginalRoot = orginalRootIndex -- head originalRootList + originalRootEdges = (out inGraph orginalRoot) + in -- reroot graph safely automatically will only affect the component with the outgroup + -- delete old root edge and create two new edges from oringal root node. + -- keep orignl root node and delte/crete new edges when they are encounterd + -- trace ("Moving root from " <> (show orginalRoot) <> " to " <> (show rerootIndex)) ( + let leftChildEdge = (orginalRoot, rerootIndex, edgeLabel $ head originalRootEdges) + rightChildEdge = (orginalRoot, fst3 newRootOrigEdge, edgeLabel $ last originalRootEdges) + + -- this assumes 2 children of old root -- shouled be correct as Phylogenetic Graph + -- same result for both conditions to allow for check below and return original root if not correct + newEdgeOnOldRoot = + if (length originalRootEdges) /= 2 + then (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, thd3 $ head originalRootEdges) + else {- + error ("Number of root out edges /= 2 in rerootGraph: " <> (show $ length originalRootEdges) + <> " root index: " <> (show (orginalRoot, rerootIndex, fst $ head $ getRoots inGraph)) <> "\nGraph:\n" <> (prettyIndices inGraph)) + -} + (snd3 $ head originalRootEdges, snd3 $ last originalRootEdges, thd3 $ head originalRootEdges) + + newRootEdges = [leftChildEdge, rightChildEdge, newEdgeOnOldRoot] + newGraph = insEdges newRootEdges $ delLEdges (newRootOrigEdge : originalRootEdges) inGraph + + -- get edges that need reversing + newGraph' = preTraverseAndFlipEdgesTree orginalRootIndex [leftChildEdge, rightChildEdge] newGraph + in -- this erros for malformed newEdge on old root + if (length originalRootEdges) /= 2 + then inGraph + else -- checks that new root has two edges + + if (length $ out newGraph' orginalRootIndex) /= 2 + then inGraph + else newGraph' + + +{- | preTraverseAndFlipEdgesTree traverses a tree from starting edge flipping in-edges since they should +be out-edges +when recursion its edges that don't need to be fliped then stops +assumes input edge is directed correctly +follows traversal out "pre" order updating graph as edges flipped +-} +preTraverseAndFlipEdgesTree ∷ (Eq b) ⇒ Node → [LEdge b] → Gr a b → Gr a b +preTraverseAndFlipEdgesTree rootIndex inEdgeList inGraph = + if null inEdgeList + then inGraph + else + let -- first edge directled correctly + inEdge@(_, v, _) = head inEdgeList + + -- edges "in" to child node of first edge--these should be out and need to be flipped + childEdges = filter ((/= rootIndex) . fst3) $ filter (/= inEdge) $ inn inGraph v + + -- flip to "in" to "out" edges + flippedEdges = fmap flipLEdge childEdges + + -- -- modify graph accordingly + newGraph = insEdges flippedEdges $ delLEdges childEdges inGraph + in -- trace ("PTFE: flipped " <> (show $ fmap toEdge flippedEdges)) ( + -- edge terminates in leaf or edges in correct orientation + if null childEdges + then preTraverseAndFlipEdgesTree rootIndex (drop 1inEdgeList) inGraph + else -- edge needs to be reversed to follow through its children from a new graph + preTraverseAndFlipEdgesTree rootIndex (flippedEdges <> (drop 1inEdgeList)) newGraph + + +-- ) + +{- | preTraverseAndFlipEdges traverses graph from starting edge flipping edges as needed +when recursion its edges that don't need to be fliped then stops +assumes input edge is directed correctly +follows traversal out "pre" order ish from edges +have to check edge orientatins--make sure thay haven't changed as graph was updated earlier +-} +preTraverseAndFlipEdges ∷ (Eq b) ⇒ [LEdge b] → Gr a b → Gr a b +preTraverseAndFlipEdges inEdgelist inGraph = + if null inEdgelist + then inGraph + else + let inEdge@(_, v, _) = head inEdgelist + childEdges = (out inGraph v) <> (filter (/= inEdge) $ inn inGraph v) + + -- returns list of edges that had to be flipped + edgesToFlip = getToFlipEdges v childEdges + flippedEdges = fmap flipLEdge edgesToFlip + newGraph = insEdges flippedEdges $ delLEdges edgesToFlip inGraph + in -- trace ("PTFE: flipped " <> (show $ fmap toEdge flippedEdges)) ( + -- edge terminates in leaf or edges in correct orientation + if null childEdges || null edgesToFlip + then preTraverseAndFlipEdges (drop 1inEdgelist) inGraph + else -- edge needs to be reversed to follow through its children from a new graph + preTraverseAndFlipEdges (flippedEdges <> (drop 1inEdgelist)) newGraph + + +-- ) + +{- | getToFlipEdges takes an index and check edge list +and creates new list of edges that need to be flipped +-} +getToFlipEdges ∷ Node → [LEdge b] → [LEdge b] +getToFlipEdges parentNodeIndex inEdgeList = + if null inEdgeList + then [] + else + let firstEdge@(u, _, _) = head inEdgeList + in if parentNodeIndex /= u + then firstEdge : getToFlipEdges parentNodeIndex (drop 1inEdgeList) + else getToFlipEdges parentNodeIndex (drop 1inEdgeList) + + +{- | Random generates display trees up to input number by choosing +to keep indegree nodes > 1 unifomaly at random +-} +generateDisplayTreesRandom ∷ ∀ a b. (Show a, Show b, Eq a, Eq b, NFData a, NFData b) ⇒ Int → Gr a b → PhyG [Gr a b] +generateDisplayTreesRandom numDisplayTrees inGraph + | isEmpty inGraph = failWithPhase Computing "Empty graph in generateDisplayTreesRandom" + | isTree inGraph = pure [inGraph] + | otherwise = + let clonedGraphs ∷ [Gr a b] + clonedGraphs = replicate numDisplayTrees inGraph + in getParallelChunkTraverse >>= \pTraverse → + randomlyResolveGraphToTree `pTraverse` clonedGraphs + + +{- | +Resolves a single graph to a tree by choosing single indegree edges +uniformly at random and deleting all others from graph +in=out=1 nodes are contracted, HTU's withn outdegree 0 removed, graph reindexed +but not renamed--edges from root are left alone. +-} +randomlyResolveGraphToTree ∷ (Show a, Show b, Eq a, Eq b) ⇒ Gr a b → PhyG (Gr a b) +randomlyResolveGraphToTree inGraph + | isEmpty inGraph = pure inGraph + | otherwise = + let (_, leafList, _, _) = splitVertexList inGraph + inEdgeListByVertex = inn inGraph <$> nodes inGraph + in do + edgesToDelete ← fmap concat . traverse chooseOneDumpRest $ fmap toEdge <$> inEdgeListByVertex + let newTree = delEdges edgesToDelete inGraph + let newTree' = removeNonLeafOut0Nodes leafList newTree + let newTree'' = contractIn1Out1Edges newTree' + let reindexTree = reindexGraph newTree'' + -- trace ("RRGT\n" <> (prettify inGraph) <> "\n to delete " <> (show edgesToDelete) <> "\nNew graph:\n" <> (prettify newTree) + -- <> "\nnewTree'\n" <> (prettify newTree') <> "\nnewTree''\n" <> (prettify newTree'') <> "\reindex\n" <> (prettify reindexTree)) + pure reindexTree + + +-- | chooseOneDumpRest takes random val and chooses to keep the edge in list returning list of edges to delete +chooseOneDumpRest ∷ [Edge] → PhyG [Edge] +chooseOneDumpRest = \case + [] → pure [] + x : [] → pure [] + x : xs → fmap NE.tail . shuffleList $ x :| xs + + +-- | generateDisplayTrees nice wrapper around generateDisplayTrees' with clean interface +generateDisplayTrees ∷ (Eq a) ⇒ Bool → Gr a b → [Gr a b] +generateDisplayTrees contractEdges inGraph = + if isTree inGraph + then [inGraph] + else + let (_, leafList, _, _) = splitVertexList inGraph + in generateDisplayTrees' contractEdges leafList [inGraph] [] + + +{- | generateDisplayTrees' takes a graph list and recursively generates +a list of trees created by progresively resolving each network vertex into a tree vertex +in each input graph +creating up to 2**m (m network vertices) trees. +call -> generateDisplayTrees' [startGraph] [] +the second and third args contain graphs that need more work and graphs that are done (ie trees) +-} +generateDisplayTrees' ∷ (Eq a) ⇒ Bool → [LNode a] → [Gr a b] → [Gr a b] → [Gr a b] +generateDisplayTrees' contractEdges leafList curGraphList treeList = + if null curGraphList + then + let treeList' = fmap (removeNonLeafOut0Nodes leafList) treeList + treeList'' = + if contractEdges + then fmap contractIn1Out1Edges treeList' + else treeList' + reindexedTreeList = fmap reindexGraph treeList'' + in reindexedTreeList + else + let firstGraph = head curGraphList + in if isEmpty firstGraph + then [] + else + let nodeList = labNodes firstGraph + inNetEdgeList = filter ((> 1) . length) $ fmap (inn firstGraph) $ fmap fst nodeList + in if null inNetEdgeList + then generateDisplayTrees' contractEdges leafList (drop 1curGraphList) (firstGraph : treeList) + else + let newGraphList = splitGraphListFromNode inNetEdgeList [firstGraph] + in generateDisplayTrees' contractEdges leafList (newGraphList <> (drop 1curGraphList)) treeList + + +{- | splitGraphListFromNode take a graph and a list of edges for indegree > 1 node +removes each in edge in turn to create a new graph and maintains any in 1 out 1 nodes +if these edges are to be contracted out use 'contractOneOneEdges' +should be a single traversal 'splitting' graph each time. removing edges anmd recursing +to do it again untill all edges are indegree 1. +the edges in list for recurvise deleteion should always be in all graphs uuntil +the end +-} +splitGraphListFromNode ∷ [[LEdge b]] → [Gr a b] → [Gr a b] +splitGraphListFromNode inEdgeListList inGraphList = + if null inEdgeListList + then inGraphList + else + if null inGraphList + then error "Empty graph lits in splitGraphListFromNode" + else + let firstNetEdgeList = head inEdgeListList + indexList = [0 .. (length firstNetEdgeList - 1)] + repeatedEdgeList = replicate (length firstNetEdgeList) firstNetEdgeList + netEdgeIndexPairList = zip repeatedEdgeList indexList + newGraphList = concat $ fmap (deleteEdgesCreateGraphs netEdgeIndexPairList 0) inGraphList + in splitGraphListFromNode (drop 1inEdgeListList) newGraphList + + +{- | deleteEdgesCreateGraphs takes a list of edges and an index list and a graph, +recursively keeps the index-th edge and deletes the others creating a new graph list +-} +deleteEdgesCreateGraphs ∷ [([LEdge b], Int)] → Int → Gr a b → [Gr a b] +deleteEdgesCreateGraphs netEdgeIndexPairList counter inGraph = + if isEmpty inGraph + then error "Empty graph in " deleteEdgesCreateGraphs + else + if null netEdgeIndexPairList + then [] + else + let (edgeList, lIndex) = head netEdgeIndexPairList + -- edgeToKeep = edgeList !! index + edgesToDelete = (take lIndex edgeList) <> (drop (lIndex + 1) edgeList) + newGraph = delLEdges edgesToDelete inGraph + in newGraph : deleteEdgesCreateGraphs (drop 1netEdgeIndexPairList) (counter + 1) inGraph + + +-- | undirectedEdgeEquality checks edgse for equality irrespective of direction +undirectedEdgeEquality ∷ Edge → Edge → Bool +undirectedEdgeEquality (a, b) (c, d) = + if a == c && b == d + then True + else + if a == d && b == c + then True + else False + + +{- | undirectedEdgeMinus subtracts edges in the second list from those in the first using +undirected matching +-} +undirectedEdgeMinus ∷ [Edge] → [Edge] → [Edge] +undirectedEdgeMinus firstList secondList = + if null firstList + then [] + else + let firstEdge@(a, b) = head firstList + in if firstEdge `L.elem` secondList + then undirectedEdgeMinus (drop 1firstList) secondList + else + if (b, a) `L.elem` secondList + then undirectedEdgeMinus (drop 1firstList) secondList + else firstEdge : undirectedEdgeMinus (drop 1firstList) secondList + + +{- | remakeGraph takes an existing graph and returns amkgraph version +this to test whether multiple graph edits are creating large memory footprints +-} +remakeGraph ∷ Gr a b → Gr a b +remakeGraph inGraph = + if isEmpty inGraph + then inGraph + else mkGraph (labNodes inGraph) (labEdges inGraph) diff --git a/pkg/PhyGraph/Utilities/LocalSequence.hs b/src/Utilities/LocalSequence.hs similarity index 67% rename from pkg/PhyGraph/Utilities/LocalSequence.hs rename to src/Utilities/LocalSequence.hs index 26dd89fec..a0b0df1ec 100644 --- a/pkg/PhyGraph/Utilities/LocalSequence.hs +++ b/src/Utilities/LocalSequence.hs @@ -31,143 +31,171 @@ either expressed or implied, of the FreeBSD Project. Maintainer : Ward Wheeler Stability : unstable Portability : portable (I hope) - -} - module Utilities.LocalSequence where -- import Debug.Trace -import Data.Sequence ((<|), (><), (|>)) -import qualified Data.Sequence as S + -- import qualified Data.Foldable as Foldable -import qualified Data.Foldable as F -import Data.Maybe -import qualified Data.Vector as V +import Data.Foldable qualified as F +import Data.Maybe +import Data.Sequence ((<|), (><), (|>)) +import Data.Sequence qualified as S +import Data.Vector qualified as V + -- | sequjence type for exporting type Seq = S.Seq + -- | head maps to Take -head :: Seq a -> a +head ∷ Seq a → a head inSeq = S.index inSeq 0 + -- | tail maps to drop Seq -tail :: Seq a -> Seq a +tail ∷ Seq a → Seq a tail = S.drop 1 + -- | (!!) index -(!) :: Seq a -> Int -> a +(!) ∷ Seq a → Int → a (!) = S.index + -- | cons maps to (<|) -cons :: a -> Seq a -> Seq a +cons ∷ a → Seq a → Seq a cons newElem inSeq = newElem <| inSeq + -- | snoc maps to (|>) -snoc ::Seq a -> a -> Seq a +snoc ∷ Seq a → a → Seq a snoc inSeq newElem = inSeq |> newElem + -- | snoc with args reversed -snocFlip :: a -> Seq a -> Seq a +snocFlip ∷ a → Seq a → Seq a snocFlip newElem inSeq = inSeq |> newElem + -- | empty to empty -empty :: Seq a +empty ∷ Seq a empty = S.empty + -- | null equal to empty -null :: (Eq a) => Seq a -> Bool +null ∷ (Eq a) ⇒ Seq a → Bool null inSeq = inSeq == S.empty + -- | singleton to singleton -singleton :: a -> Seq a +singleton ∷ a → Seq a singleton = S.singleton --- | ++ maps to >< -(++) :: Seq a -> Seq a -> Seq a -(++) inSeqA inSeqB = inSeqA >< inSeqB + +-- | <> maps to >< +(<>) ∷ Seq a → Seq a → Seq a +(<>) inSeqA inSeqB = inSeqA >< inSeqB + -- | concat fold over >< -concat :: (Eq a) => Seq (Seq a) -> Seq a +concat ∷ (Eq a) ⇒ Seq (Seq a) → Seq a concat inSeqSeq = concatInternal inSeqSeq Utilities.LocalSequence.empty + -- | concatInternal internal concat function with accumulator -concatInternal :: (Eq a) => Seq (Seq a) -> Seq a -> Seq a +concatInternal ∷ (Eq a) ⇒ Seq (Seq a) → Seq a → Seq a concatInternal inSeqSeq newSeq = - if Utilities.LocalSequence.null inSeqSeq then newSeq + if Utilities.LocalSequence.null inSeqSeq + then newSeq else - let firstSeq = Utilities.LocalSequence.head inSeqSeq - in - concatInternal (Utilities.LocalSequence.tail inSeqSeq) (firstSeq >< newSeq) + let firstSeq = Utilities.LocalSequence.head inSeqSeq + in concatInternal (Utilities.LocalSequence.tail inSeqSeq) (firstSeq >< newSeq) + -- | zip maps to zip -zip :: Seq a -> Seq b -> Seq (a,b) +zip ∷ Seq a → Seq b → Seq (a, b) zip = S.zip + -- | length maps to length -length :: Seq a -> Int +length ∷ Seq a → Int length = S.length + -- | toList from Foldable -toList :: Seq a -> [a] +toList ∷ Seq a → [a] toList = F.toList + -- | fromList from fromList -fromList :: [a] -> Seq a +fromList ∷ [a] → Seq a fromList = S.fromList + -- | toVector via intemediate List (alas) -toVector :: Seq a -> V.Vector a +toVector ∷ Seq a → V.Vector a toVector inSeq = V.fromList $ toList inSeq + -- | toVector via intemediate List (alas) -fromVector :: V.Vector a -> Seq a +fromVector ∷ V.Vector a → Seq a fromVector inVect = S.fromList $ V.toList inVect + -- | reverse to reverse -reverse :: Seq a -> Seq a +reverse ∷ Seq a → Seq a reverse = S.reverse + -- | last should be connstant time -last :: Seq a -> a +last ∷ Seq a → a last inSeq = S.index inSeq $ S.length inSeq - 1 + -- | map to fmap for ease of migrating libraries -map :: Traversable t => (a->b) -> t a -> t b +map ∷ (Traversable t) ⇒ (a → b) → t a → t b map = fmap + -- | drop maps to drop -drop :: Int -> Seq a -> Seq a +drop ∷ Int → Seq a → Seq a drop = S.drop + -- | take maps to take -take :: Int -> Seq a -> Seq a +take ∷ Int → Seq a → Seq a take = S.take + -- | unsafeTake maps to take -unsafeTake :: Int -> Seq a -> Seq a +unsafeTake ∷ Int → Seq a → Seq a unsafeTake = S.take + -- | unsafeDrop maps to drop -unsafeDrop :: Int -> Seq a -> Seq a +unsafeDrop ∷ Int → Seq a → Seq a unsafeDrop = S.drop + -- | replicate maps to replicate -replicate :: Int -> a -> Seq a +replicate ∷ Int → a → Seq a replicate = S.replicate --- | elem returns True if element present in Sequence --- False otherwise -elem :: (Eq a) => a -> Seq a -> Bool + +{- | elem returns True if element present in Sequence +False otherwise +-} +elem ∷ (Eq a) ⇒ a → Seq a → Bool elem element fullSequence = - let index = S.elemIndexL element fullSequence - in - isJust index + let index = S.elemIndexL element fullSequence + in isJust index --- | notElem returns False if element present in Sequence --- True otherwise -notElem :: (Eq a) => a -> Seq a -> Bool + +{- | notElem returns False if element present in Sequence +True otherwise +-} +notElem ∷ (Eq a) ⇒ a → Seq a → Bool notElem element fullSequence = - let index = S.elemIndexL element fullSequence - in - isNothing index + let index = S.elemIndexL element fullSequence + in isNothing index diff --git a/pkg/PhyGraph/Utilities/TcmHash.hs b/src/Utilities/TcmHash.hs similarity index 79% rename from pkg/PhyGraph/Utilities/TcmHash.hs rename to src/Utilities/TcmHash.hs index b42562bd0..36cdc8acc 100644 --- a/pkg/PhyGraph/Utilities/TcmHash.hs +++ b/src/Utilities/TcmHash.hs @@ -31,25 +31,21 @@ either expressed or implied, of the FreeBSD Project. Maintainer : Ward Wheeler Stability : unstable Portability : portable (I hope) - -} - - module Utilities.TcmHash where -- import Data.HashMap -- import Types.Types -import qualified Data.BitVector.LittleEndian as BV +import Data.BitVector.LittleEndian qualified as BV -- import qualified Data.Vector as V -import Data.Bits ((.&.)) -import qualified SymMatrix as S - --- | getMedianPair takes a tcm (single states) and two states and --- retuns the best median as superimposed bitvectors and the cost of that median -getMedianPair :: S.Matrix Int -> BV.BitVector -> BV.BitVector-> (BV.BitVector, Int) -getMedianPair tcm stateI stateJ = - let cost = tcm S.! (BV.toSignedNumber stateI, BV.toSignedNumber stateJ) - in - (stateI .&. stateJ, cost) +import Data.Bits ((.&.)) +import SymMatrix qualified as S +{- | getMedianPair takes a tcm (single states) and two states and +retuns the best median as superimposed bitvectors and the cost of that median +-} +getMedianPair ∷ S.Matrix Int → BV.BitVector → BV.BitVector → (BV.BitVector, Int) +getMedianPair tcm stateI stateJ = + let cost = tcm S.! (BV.toSignedNumber stateI, BV.toSignedNumber stateJ) + in (stateI .&. stateJ, cost) diff --git a/src/Utilities/ThreeWayFunctions.hs b/src/Utilities/ThreeWayFunctions.hs new file mode 100644 index 000000000..9719e9499 --- /dev/null +++ b/src/Utilities/ThreeWayFunctions.hs @@ -0,0 +1,540 @@ +{- +ToDo: + Add parallel optimization overblocks and characters? +-} + +{- | +Module specifying three way optimization functions for use in pre-order of +HardWired graphs and iterative pass-type optimization for Trees +-} +module Utilities.ThreeWayFunctions ( + threeMedianFinal, + addGapsToChildren, + threeWayGeneric, +) where + +import Bio.DynamicCharacter (extractMedians) +import Bio.DynamicCharacter.Element (SlimState, WideState, HugeState) +import Data.Alphabet +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.List qualified as L +import Data.MetricRepresentation qualified as MR +import Data.TCM.Dense qualified as TCMD +import Data.Vector qualified as V +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import GeneralUtilities +import GraphOptimization.Medians qualified as M +import Input.BitPack qualified as BP +import SymMatrix qualified as S +import Types.Types + + +{- | threeMedianFinal calculates a 3-median for data types in a single character +for dynamic characters this is done by 3 min-trees +taking best result. Later true 3-way via ffi can be incorporated +first type of operation two parents and current node-- since prelim +has (left child preliminary, node preliminary, right child preliminary) +that information can be used if needed +since assumes in 2 out 1 only the node preliminary field is used +need to remove naked gaps from medians for dynamic characters --hence the extract medians call +-} +threeMedianFinal ∷ CharInfo → CharacterData → CharacterData → CharacterData → CharacterData +threeMedianFinal charInfo parent1 parent2 curNode = + let localCharType = charType charInfo + in if localCharType == Add + then + let threeFinal = V.zipWith3 threeWayAdditive (rangeFinal parent1) (rangeFinal parent2) (snd3 $ rangePrelim curNode) + in curNode{rangeFinal = threeFinal} + else + if localCharType == NonAdd + then + let threeFinal = V.zipWith3 threeWayNonAdditive (stateBVFinal parent1) (stateBVFinal parent2) (snd3 $ stateBVPrelim curNode) + in curNode{stateBVFinal = threeFinal} + else + if localCharType `elem` packedNonAddTypes + then + let threeFinal = BP.threeWayPacked localCharType (packedNonAddFinal parent1) (packedNonAddFinal parent2) (snd3 $ packedNonAddPrelim curNode) + in curNode{packedNonAddFinal = threeFinal} + else + if localCharType == Matrix + then + let threeFinal = + V.zipWith3 + (threeWayMatrix (costMatrix charInfo)) + (matrixStatesFinal parent1) + (matrixStatesFinal parent2) + (matrixStatesPrelim curNode) + in curNode{matrixStatesFinal = threeFinal} + else + if localCharType == AlignedSlim + then + let threeFinal = + M.getFinal3WaySlim (slimTCM charInfo) (alignedSlimFinal parent1) (alignedSlimFinal parent2) (snd3 $ alignedSlimPrelim curNode) + in curNode{alignedSlimFinal = threeFinal} + else + if localCharType == AlignedWide + then + let threeFinal = + M.getFinal3WayWideHuge + (wideTCM charInfo) + (alignedWideFinal parent1) + (alignedWideFinal parent2) + (snd3 $ alignedWidePrelim curNode) + in curNode{alignedWideFinal = threeFinal} + else + if localCharType == AlignedHuge + then + let threeFinal = + M.getFinal3WayWideHuge + (hugeTCM charInfo) + (alignedHugeFinal parent1) + (alignedHugeFinal parent2) + (snd3 $ alignedHugePrelim curNode) + in curNode{alignedHugeFinal = threeFinal} + else + if (localCharType == SlimSeq) || (localCharType == NucSeq) + then + let threeFinal = threeWaySlim charInfo parent1 parent2 curNode + in curNode + { slimFinal = extractMedians $ M.makeDynamicCharacterFromSingleVector threeFinal + } + else + if (localCharType == WideSeq) || (localCharType == AminoSeq) + then + let threeFinal = threeWayWide charInfo parent1 parent2 curNode + in curNode + { wideFinal = extractMedians $ M.makeDynamicCharacterFromSingleVector threeFinal + } + else + if localCharType == HugeSeq + then + let threeFinal = threeWayHuge charInfo parent1 parent2 curNode + in curNode + { hugeFinal = extractMedians $ M.makeDynamicCharacterFromSingleVector threeFinal + } + else error ("Unrecognized/implemented character type: " <> show localCharType) + + +-- | threeWayNonAdditive takes the union/intersection operation over 3 non additive states +threeWayNonAdditive ∷ BV.BitVector → BV.BitVector → BV.BitVector → BV.BitVector +threeWayNonAdditive inA inB inC = + let intersection3 = (inA .&. inB) .&. inC + intersectionAB = inA .&. inB + intersectionAC = inA .&. inC + intersectionBC = inB .&. inC + union3 = (inA .|. inB) .|. inC + in if not (BV.isZeroVector intersection3) + then intersection3 + else + if not (BV.isZeroVector intersectionAB) + then intersectionAB .|. inC + else + if not (BV.isZeroVector intersectionAC) + then intersectionAC .|. inB + else + if not (BV.isZeroVector intersectionBC) + then intersectionBC .|. inA + else union3 + + +{- | threeWayAdditive take three additive states and returns median +the idea is the interval between the minimum of all three maximum (min maxA, maxB, maxC) +and the maximum of all three minima (max minA, minB, minC) +ordered such that the fst of pair not greater than second +-} +threeWayAdditive ∷ (Int, Int) → (Int, Int) → (Int, Int) → (Int, Int) +threeWayAdditive (minA, maxA) (minB, maxB) (minC, maxC) = + let minOfMaxs = minimum [maxA, maxB, maxC] + maxOfMins = maximum [minA, minB, minC] + in if maxOfMins > minOfMaxs + then (maxOfMins, minOfMaxs) + else (minOfMaxs, maxOfMins) + + +{- | threeWayMatrix creates median best state vector from a traceback, since parents could conflict +on traceback does a minimization. +The final states of parents will have non-maximum costs and these compared +to the the child states with pointers to their children are set for +traceback from current node to child(ren) from preliminary assignment +since type assumes two children--they are both set to same value so if either left or right +is set later the process will be correct +-} +threeWayMatrix + ∷ S.Matrix Int → V.Vector MatrixTriple → V.Vector MatrixTriple → V.Vector MatrixTriple → V.Vector MatrixTriple +threeWayMatrix inCostMatrix parent1 parent2 curNode = + let numStates = S.rows inCostMatrix + + -- get the costs of each state for each node, for prents non-maximal cost will be final states + parent1StatesCost = fmap fst3 parent1 + parent2StatesCost = fmap fst3 parent2 + curNodeStatesCost = fmap fst3 curNode + + -- get the minimum cost for each state given combinations of all three nodes and the min cost child state + minCost3States = getMinStatePair inCostMatrix (maxBound ∷ StateCost) numStates parent1StatesCost parent2StatesCost curNodeStatesCost + in -- minStateCost = V.minimum $ fmap fst minCost3States + -- finalStatesTriple = fmap (assignMinMaxCost minStateCost (maxBound :: StateCost)) minCost3States + + minCost3States + + +{- | getMinStatePair takes cost matrix and state costs (vector of Int) and returns best median cost state of child for that best cost +if either parent or child has maxbound cost then that state get max bound cost +-} +getMinStatePair + ∷ S.Matrix Int + → StateCost + → Int + → V.Vector StateCost + → V.Vector StateCost + → V.Vector StateCost + → V.Vector (StateCost, [ChildStateIndex], [ChildStateIndex]) +getMinStatePair inCostMatrix maxCost numStates p1CostV p2CostV curCostV = + let f ∷ (Num a) ⇒ a → a → a → a + f a b c = a + b + c + + range = [0 .. numStates - 1] + + bestMedianCost vec = getBestPairCost inCostMatrix maxCost numStates vec <$> range + + -- get costs to parents-- will assume parent costs are 0 or max + bestMedianCostP1 = bestMedianCost p1CostV + bestMedianCostP2 = bestMedianCost p2CostV + + -- get costs to single child via preliminary states + medianChildCostPairVect = + getBestPairCostAndState inCostMatrix maxCost numStates curCostV <$> range + + -- get 3 sum costs and best state value + threeWayStateCostList = zipWith3 f bestMedianCostP1 bestMedianCostP2 (fmap fst medianChildCostPairVect) + minThreeWayCost = minimum threeWayStateCostList + + finalStateCostL = zipWith (assignBestMax minThreeWayCost maxCost) threeWayStateCostList medianChildCostPairVect + in V.fromList finalStateCostL + + +{- | +assignBestMax checks 3-way median state cost and if minimum sets to that otherwise sets to max +double 2nd field for 2-child type asumption +-} +assignBestMax + ∷ StateCost → StateCost → StateCost → (StateCost, [ChildStateIndex]) → (StateCost, [ChildStateIndex], [ChildStateIndex]) +assignBestMax minCost maxCost stateCost (_, stateChildList) + | stateCost == minCost = (minCost, stateChildList, stateChildList) + | otherwise = (maxCost, stateChildList, stateChildList) + + +{- | +getBestPairCost gets the baest cost for a state to each of parent states--does not keep parent state +-} +getBestPairCost ∷ S.Matrix Int → StateCost → Int → V.Vector StateCost → Int → StateCost +getBestPairCost inCostMatrix maxCost numStates parentStateCostV medianStateIndex = + let stateCost = V.minimum $ V.zipWith (g inCostMatrix maxCost medianStateIndex) parentStateCostV (V.fromList [0 .. (numStates - 1)]) + + g ∷ (Eq a) ⇒ S.Matrix a → a → Int → a → Int → a + g cM mC mS pC pS + | pC == mC = mC + | otherwise = cM S.! (mS, pS) + in stateCost + + +{- | +getBestPairCostAndState gets best pair of median state and chikd states based on preliminarr states of node +-} +getBestPairCostAndState ∷ S.Matrix Int → StateCost → Int → V.Vector StateCost → Int → (StateCost, [ChildStateIndex]) +getBestPairCostAndState inCostMatrix maxCost numStates childStateCostV medianStateIndex = + let g ∷ (Eq a) ⇒ S.Matrix a → a → Int → a → Int → (a, Int) + g cM mC mS pC pS + | pC == mC = (mC, pS) + | otherwise = (cM S.! (mS, pS), pS) + + statecostV = V.zipWith (g inCostMatrix maxCost medianStateIndex) childStateCostV (V.fromList [0 .. (numStates - 1)]) + minStateCost = V.minimum $ fmap fst statecostV + bestPairs = V.filter ((== minStateCost) . fst) statecostV + bestChildStates = V.toList $ fmap snd bestPairs + in (minStateCost, L.sort bestChildStates) + + +{- | threeWaySlim take charInfo, 2 parents, and curNOde and creates 3 median via +1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) +2) inserting gaps to make all 3 line up +3) creating 3-medians +4) choosing lowest cost median +No change adjust is False since the 3-way lookup shold include that factor when it returns cost +-} +threeWaySlim ∷ CharInfo → CharacterData → CharacterData → CharacterData → SV.Vector SlimState +threeWaySlim charInfo parent1 parent2 curNode = + -- trace ("3WSlim: ") ( + let isMedian = False --since we want distances + -- pairwise median structures + p1p2 = M.getDOMedianCharInfo isMedian charInfo parent1 parent2 + p1cN = M.getDOMedianCharInfo isMedian charInfo parent1 curNode + p2cN = M.getDOMedianCharInfo isMedian charInfo parent2 curNode + + -- get 3rd to pairwise + p1p2cN = M.getDOMedianCharInfo isMedian charInfo p1p2 curNode + p1cNp2 = M.getDOMedianCharInfo isMedian charInfo p1cN parent2 + p2cNp1 = M.getDOMedianCharInfo isMedian charInfo p2cN parent1 + + (a1, b1, c1) = addGapsToChildren (slimGapped p1p2cN) (slimGapped p1p2) + (median1, cost1) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a1 b1 c1 + + (a2, b2, c2) = addGapsToChildren (slimGapped p1cNp2) (slimGapped p1cN) + (median2, cost2) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a2 b2 c2 + + (a3, b3, c3) = addGapsToChildren (slimGapped p2cNp1) (slimGapped p2cN) + (median3, cost3) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a3 b3 c3 + + minCost = minimum [cost1, cost2, cost3] + in if cost1 == minCost + then median1 + else + if cost2 == minCost + then median2 + else median3 + + +-- ) + +{- | threeWayWide take charInfo, 2 parents, and curNOde and creates 3 median via +1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) +2) inserting gaps to make all 3 line up +3) creating 3-medians +4) choosing lowest cost median +No change adjust is False since the 3-way lookup shold include that factor when it returns cost +-} +threeWayWide ∷ CharInfo → CharacterData → CharacterData → CharacterData → UV.Vector WideState +threeWayWide charInfo parent1 parent2 curNode = + let isMedian = False --since we want distances + -- pairwise median structures + p1p2 = M.getDOMedianCharInfo isMedian charInfo parent1 parent2 + p1cN = M.getDOMedianCharInfo isMedian charInfo parent1 curNode + p2cN = M.getDOMedianCharInfo isMedian charInfo parent2 curNode + + -- get 3rd to pairwise + p1p2cN = M.getDOMedianCharInfo isMedian charInfo p1p2 curNode + p1cNp2 = M.getDOMedianCharInfo isMedian charInfo p1cN parent2 + p2cNp1 = M.getDOMedianCharInfo isMedian charInfo p2cN parent1 + + (a1, b1, c1) = addGapsToChildren (wideGapped p1p2cN) (wideGapped p1p2) + (median1, cost1) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a1 b1 c1 + + (a2, b2, c2) = addGapsToChildren (wideGapped p1cNp2) (wideGapped p1cN) + (median2, cost2) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a2 b2 c2 + + (a3, b3, c3) = addGapsToChildren (wideGapped p2cNp1) (wideGapped p2cN) + (median3, cost3) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a3 b3 c3 + + minCost = minimum [cost1, cost2, cost3] + in if cost1 == minCost + then median1 + else + if cost2 == minCost + then median2 + else median3 + + +{- | threeWayHuge take charInfo, 2 parents, and curNOde and creates 3 median via +1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) +2) inserting gaps to make all 3 line up +3) creating 3-medians +4) choosing lowest cost median +No change adjust is False since the 3-way lookup shold include that factor when it returns cost +-} +threeWayHuge ∷ CharInfo → CharacterData → CharacterData → CharacterData → V.Vector HugeState +threeWayHuge charInfo parent1 parent2 curNode = + let isMedian = False --since want distances only + -- pairwise median structures + p1p2 = M.getDOMedianCharInfo isMedian charInfo parent1 parent2 + p1cN = M.getDOMedianCharInfo isMedian charInfo parent1 curNode + p2cN = M.getDOMedianCharInfo isMedian charInfo parent2 curNode + + -- get 3rd to pairwise + p1p2cN = M.getDOMedianCharInfo isMedian charInfo p1p2 curNode + p1cNp2 = M.getDOMedianCharInfo isMedian charInfo p1cN parent2 + p2cNp1 = M.getDOMedianCharInfo isMedian charInfo p2cN parent1 + + (a1, b1, c1) = addGapsToChildren (hugeGapped p1p2cN) (hugeGapped p1p2) + (median1, cost1) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a1 b1 c1 + + (a2, b2, c2) = addGapsToChildren (hugeGapped p1cNp2) (hugeGapped p1cN) + (median2, cost2) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a2 b2 c2 + + (a3, b3, c3) = addGapsToChildren (hugeGapped p2cNp1) (hugeGapped p2cN) + (median3, cost3) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a3 b3 c3 + + minCost = minimum [cost1, cost2, cost3] + in if cost1 == minCost + then median1 + else + if cost2 == minCost + then median2 + else median3 + + +{- | addGapsToChildren pads out "new" gaps based on identity--if not identical--adds a gap based on cost matrix size +importand node filed orders correct--has moved around +-} +addGapsToChildren ∷ (FiniteBits a, GV.Vector v a) ⇒ (v a, v a, v a) → (v a, v a, v a) → (v a, v a, v a) +addGapsToChildren (reGappedParentFinal, _, reGappedNodePrelim) (gappedLeftChild, gappedNodePrelim, gappedRightChild) = + -- trace ("AG2C:") ( + let (reGappedLeft, reGappedRight) = slideRegap reGappedNodePrelim gappedNodePrelim gappedLeftChild gappedRightChild mempty mempty + in if (GV.length reGappedParentFinal /= GV.length reGappedLeft) || (GV.length reGappedParentFinal /= GV.length reGappedRight) + then + error + ( "Vectors not same length " + <> show (GV.length reGappedParentFinal, GV.length reGappedLeft, GV.length reGappedRight) + ) + else (reGappedParentFinal, reGappedLeft, reGappedRight) + + +-- ) + +{- | slideRegap takes two version of same vectors (1st and snd) one with additional gaps and if the two aren't equal then adds gaps +to the 3rd and 4th input vectors +-} +slideRegap ∷ (FiniteBits a, GV.Vector v a) ⇒ v a → v a → v a → v a → [a] → [a] → (v a, v a) +slideRegap reGappedNode gappedNode gappedLeft gappedRight newLeftList newRightList = + let finalize = GV.fromList . reverse + in case GV.uncons reGappedNode of + Nothing → (finalize newLeftList, finalize newRightList) + Just (headRGN, tailRGN) → + let gapState = (headRGN `xor` headRGN) `setBit` fromEnum gapIndex + nextCall = slideRegap tailRGN + in -- gap in reGappedNode, null gappedNode is gap at end of reGappedNode + -- can copmplete the remainder of the slide as gaps only + case GV.uncons gappedNode of + Nothing → + let gapList = replicate (GV.length reGappedNode) gapState + gapApplication = finalize . (gapList <>) + in (gapApplication newLeftList, gapApplication newRightList) + Just (headGN, tailGN) → + if headRGN /= headGN + then nextCall gappedNode gappedLeft gappedRight (gapState : newLeftList) (gapState : newRightList) + else -- no "new gap" + + nextCall + tailGN + (GV.tail gappedLeft) + (GV.tail gappedRight) + (GV.head gappedLeft : newLeftList) + (GV.head gappedRight : newRightList) + + +-- | get3WayGeneric takes thee vectors and produces a (median, cost) pair +get3WayGeneric ∷ (GV.Vector v e) ⇒ (e → e → e → (e, Word)) → v e → v e → v e → (v e, Word) +get3WayGeneric tcm in1 in2 in3 = + let len = GV.length in1 + vt = V.generate len $ \i → tcm (in1 GV.! i) (in2 GV.! i) (in3 GV.! i) + + gen ∷ (GV.Vector v a) ⇒ V.Vector (a, b) → v a + gen v = let med i = fst $ v V.! i in GV.generate len med + + add ∷ (Num b) ⇒ V.Vector (a, b) → b + add = V.foldl' (\x e → x + snd e) 0 + in (,) <$> gen <*> add $ vt + + +{-Not using this now--but could. Would need to add Aligned Types-} + +{- | threeWayGeneric take charInfo, 2 parents, and curNOde and creates 3 median via +1) 3 DO medians (choosing lowest cost median) ((p1,p2), cn), ((cn,p1), p2), and ((cn,p2), p1) +2) inserting gaps to make all 3 line up +3) creating 3-medians +4) choosing lowest cost median +No change adjust is False since the 3-way lookup shold include that factor when it returns cost +-} +threeWayGeneric ∷ CharInfo → CharacterData → CharacterData → CharacterData → CharacterData +threeWayGeneric charInfo parent1 parent2 curNode = + let isMedian = False --since want distances + localCharType = charType charInfo + -- pairwise medina structures + p1p2 = M.getDOMedianCharInfo isMedian charInfo parent1 parent2 + p1cN = M.getDOMedianCharInfo isMedian charInfo parent1 curNode + p2cN = M.getDOMedianCharInfo isMedian charInfo parent2 curNode + + -- get 3rd to pairwise + p1p2cN = M.getDOMedianCharInfo isMedian charInfo p1p2 curNode + p1cNp2 = M.getDOMedianCharInfo isMedian charInfo p1cN parent2 + p2cNp1 = M.getDOMedianCharInfo isMedian charInfo p2cN parent1 + + (median1Slim, median1Wide, median1Huge, cost1) = + if localCharType `elem` [SlimSeq, NucSeq] + then + let (a, b, c) = addGapsToChildren (slimGapped p1p2cN) (slimGapped p1p2) + (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c + in (median, mempty, mempty, cost) + else + if localCharType `elem` [AminoSeq, WideSeq] + then + let (a, b, c) = addGapsToChildren (wideGapped p1p2cN) (wideGapped p1p2) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c + in (mempty, median, mempty, cost) + else + if localCharType == HugeSeq + then + let (a, b, c) = addGapsToChildren (hugeGapped p1p2cN) (hugeGapped p1p2) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c + in (mempty, mempty, median, cost) + else error ("Unrecognized character type: " <> show localCharType) + + (median2Slim, median2Wide, median2Huge, cost2) = + if localCharType `elem` [SlimSeq, NucSeq] + then + let (a, b, c) = addGapsToChildren (slimGapped p1cNp2) (slimGapped p1cN) + (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c + in (median, mempty, mempty, cost) + else + if localCharType `elem` [AminoSeq, WideSeq] + then + let (a, b, c) = addGapsToChildren (wideGapped p1cNp2) (wideGapped p1cN) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c + in (mempty, median, mempty, cost) + else + if localCharType == HugeSeq + then + let (a, b, c) = addGapsToChildren (hugeGapped p1cNp2) (hugeGapped p1cN) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c + in (mempty, mempty, median, cost) + else error ("Unrecognized character type: " <> show localCharType) + + (median3Slim, median3Wide, median3Huge, cost3) = + if localCharType `elem` [SlimSeq, NucSeq] + then + let (a, b, c) = addGapsToChildren (slimGapped p2cNp1) (slimGapped p2cN) + (median, cost) = get3WayGeneric (TCMD.lookupThreeway (slimTCM charInfo)) a b c + in (median, mempty, mempty, cost) + else + if localCharType `elem` [AminoSeq, WideSeq] + then + let (a, b, c) = addGapsToChildren (wideGapped p2cNp1) (wideGapped p2cN) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (wideTCM charInfo)) a b c + in (mempty, median, mempty, cost) + else + if localCharType == HugeSeq + then + let (a, b, c) = addGapsToChildren (hugeGapped p2cNp1) (hugeGapped p2cN) + (median, cost) = get3WayGeneric (MR.retreiveThreewayTCM (hugeTCM charInfo)) a b c + in (mempty, mempty, median, cost) + else error ("Unrecognized character type: " <> show localCharType) + + minCost = minimum [cost1, cost2, cost3] + (medianBestSlim, medianBestWide, medianBestHuge) = + if cost1 == minCost + then (median1Slim, median1Wide, median1Huge) + else + if cost2 == minCost + then (median2Slim, median2Wide, median2Huge) + else (median3Slim, median3Wide, median3Huge) + in -- set for correct data type + if localCharType `elem` [SlimSeq, NucSeq] + then emptyCharacter{slimFinal = medianBestSlim} + else + if localCharType `elem` [AminoSeq, WideSeq] + then emptyCharacter{wideFinal = medianBestWide} + else + if localCharType == HugeSeq + then emptyCharacter{hugeFinal = medianBestHuge} + else error ("Unrecognized character type: " <> show localCharType) diff --git a/src/Utilities/Utilities.hs b/src/Utilities/Utilities.hs new file mode 100644 index 000000000..b82f5bd1a --- /dev/null +++ b/src/Utilities/Utilities.hs @@ -0,0 +1,1801 @@ +{- | +Module specifying utility functions for use with PhyGraph +-} +module Utilities.Utilities where + +import Bio.DynamicCharacter.Element (HugeState, SlimState, WideState) +import Complexity.Graphs qualified as GC +import Complexity.Utilities qualified as GCU +import Control.DeepSeq (NFData (..), force) +import Control.Monad (replicateM) +import Control.Monad.IO.Class (MonadIO (..)) +import Control.Monad.Random.Class +import Data.Alphabet +import Data.Alphabet.IUPAC +import Data.Alphabet.Special +-- import Data.Alphabet.Special +import Data.Bimap qualified as BM +import Data.BitVector.LittleEndian qualified as BV +import Data.Bits +import Data.Foldable +import Data.Functor ((<&>)) +import Data.InfList qualified as IL +import Data.List qualified as L +import Data.List.NonEmpty (NonEmpty (..)) +import Data.List.NonEmpty qualified as NE +import Data.List.Split qualified as SL +import Data.Maybe +import Data.Set qualified as SET +import Data.Text.Lazy qualified as T +import Data.Text.Short qualified as ST +import Data.Vector qualified as V +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import Debug.Trace +import GeneralUtilities +import GeneralUtilities qualified as GU +import GraphOptimization.Medians (get2WaySlim, get2WayWideHuge) +import PHANE.Evaluation +import PHANE.Evaluation.ErrorPhase (ErrorPhase (..)) +import PHANE.Evaluation.Logging (LogLevel (..), Logger (..)) +import PHANE.Evaluation.Verbosity (Verbosity (..)) +import SymMatrix qualified as S +import Types.Types +import Utilities.LocalGraph qualified as LG + + +listApplying ∷ ∀ a b. (a → a) → [a] → [a] +listApplying strictnessSpec ~val = + let go ∷ [a] → [a] + go [] = [] + go (x : xs) = + let moreVals = go xs + in (strictnessSpec x `seq` moreVals `seq` x) : moreVals + in go val `seq` val + + +applyOver1of2 ∷ (a → a) → (a, b) → (a, b) +applyOver1of2 strictnessSpec ~val@(x, _) = + strictnessSpec x `seq` val + + +applyOver1of3 ∷ (a → a) → (a, b, c) → (a, b, c) +applyOver1of3 strictnessSpec ~val@(x, _, _) = + strictnessSpec x `seq` val + + +{- | strict1of4 ensures parallelism and demands strict return of 1st of 4 tuple elements + this is used in lazy-ish parallel evalution functions in PHANE evaluation +-} +strict1of4 ∷ (NFData a) ⇒ (a, b, c, d) → (a, b, c, d) +strict1of4 ~val@(!x, _, _, _) = + force x `seq` val + + +{- | strict2of2 ensures parallelism and demands strict return of 2nd of 2 tuple elements + this is used in lazy-ish parallel evalution functions in PHANE evaluation +-} +strict2of2 ∷ (NFData b) ⇒ (a, b) → (a, b) +strict2of2 ~val@(_, !x) = + force x `seq` val + + +{- | strict3of4 ensures parallelism and demands strict return of 3rd of 4 tuple elements + this is used in lazy-ish parallel evalution functions in PHANE evaluation +-} +strict3of4 ∷ (NFData c) ⇒ (a, b, c, d) → (a, b, c, d) +strict3of4 ~val@(_, _, !x, _) = + force x `seq` val + + +{- | strict2of5 ensures parallelism and demands strict return of 2nd of 5 tuple elements + this is used in lazy-ish parallel evalution functions in PHANE evaluation +-} +strict2of5 ∷ (NFData b) ⇒ (a, b, c, d, e) → (a, b, c, d, e) +strict2of5 ~val@(_, !x, _, _, _) = + force x `seq` val + + +{- | strict1and2of5 ensures parallelism and demands strict return of 2nd of 5 tuple elements + this is used in lazy-ish parallel evalution functions in PHANE evaluation +-} +strict1and2of5 ∷ (NFData a, NFData b) ⇒ (a, b, c, d, e) → (a, b, c, d, e) +strict1and2of5 ~val@(!x, !y, _, _, _) = + force x `seq` force y `seq` val + + +{- getEdgeComplexityFactors determines complexity of terminal vertex of edge and compares to + length of that edge (in bits) to detemrine complexity factor K(Child)/K(child-max) + [(edgeVertexComplexity, edgeMaxLength, edgeComplexityFactor)] + returns all 0.0 valiuues if not PMDL/SI + 4th field is GraphViz Color scheme Spectral11 1=Red, 11=violet +-} +getEdgeComplexityFactors + ∷ GlobalSettings → ProcessedData → [LG.LNode VertexInfo] → [LG.LEdge EdgeInfo] → [(VertexCost, VertexCost, Double)] +getEdgeComplexityFactors inGS inData vertexList edgeList = + if optimalityCriterion inGS `notElem` [PMDL, SI] + then (replicate (length edgeList) (0.0, 0.0, 0.0)) + else + let edgeInfoList = fmap (getEdgeInfo (V.length $ fst3 inData)) edgeList + endVertexList = fmap (Just . snd3) edgeList + edgeVertexComplexity = fmap (calculatePMDLVertexComplexity False (Just $ V.fromList vertexList) inData) endVertexList + edgeMaxLength = fmap (maxLength . thd3) edgeList + edgeComplexityFactor = zipWith (/) edgeMaxLength edgeVertexComplexity + in zip3 edgeVertexComplexity edgeMaxLength edgeComplexityFactor + + +{- getEdgeColor creates an integer list for GraphViz color schemes + takes max (all seem to start at 1) and if red=1 (or whatever) + and violet(blue) = max then assignes blue to lowest of + Double input and 1 highest (if all <1.0 then 1.0 is highest). +-} +getEdgeColor ∷ Int → [Double] → [Int] +getEdgeColor maxColorInt valueList = + if null valueList + then [] + else + let maxListVal = maximum valueList + maxVal = + if maxListVal < 1.0 + then 1.0 -- this for bootstraps, info index etc + else maxListVal + categories = reverse [1 .. maxColorInt] + thresholds = fmap (maxVal / (fromIntegral maxColorInt) *) (fmap fromIntegral categories) + in -- trace ("GEC: " <> (show maxColorInt) <> " " <> (show maxVal) <> "\n" <> (show valueList) <> "\n" <> (show categories) <> "\n" <> + -- (show thresholds) <> "\n" <> (show $ fmap (getThresholdNumber thresholds 0) valueList)) $ + fmap (getThresholdNumber thresholds 0) valueList + where + getThresholdNumber l i a = + if null l + then max i 1 + else + if a >= head l + then max i 1 + else getThresholdNumber (tail l) (i + 1) a + + +-- | getEdgeInfo returns a list of Strings of edge infomation +getEdgeInfo ∷ Int -> LG.LEdge EdgeInfo -> [String] +getEdgeInfo numLeaves inEdge = + let edgeTypeAdjust = if (snd3 inEdge < numLeaves) then PendantEdge + else edgeType (thd3 inEdge) + in + [ " " + , show $ fst3 inEdge + , show $ snd3 inEdge + , show $ edgeTypeAdjust + , show $ minLength (thd3 inEdge) + , show $ maxLength (thd3 inEdge) + , show $ midRangeLength (thd3 inEdge) + ] + + +{- | needTwoEdgeNoCostAdjust checks global data for PMDL or SI +and whether the required median is a distance (ie single edge) +or two edge median (as in creating a vertex for post order traversal) +and returns a boolean is adjustment is required. +this to add in the extra "noChange" costs when required +-} +needTwoEdgeNoCostAdjust ∷ GlobalSettings → Bool → Bool +needTwoEdgeNoCostAdjust inGS isTwoEdgeMedian = + if not isTwoEdgeMedian + then False + else + if optimalityCriterion inGS `notElem` [SI, PMDL, MAPA] + then False + else True + + +{- | collapseGraph collapses zero-length edges in 3rd field of a phylogenetic graph +does not affect display trees or character graphs +fst6 and thd6 (both) are modified since this is used for output +Collapsed frst can no longer be used for graph optimization since non-binary +this is done by removing the end vertex 'v' of min length 0 edge (u->v) +this removes (u,v) and the two (v, x) and (v, y) out edges from v. New edges are created +(u, x) and (u,y) with length labels of (v,x) and (v,y) +assumes all indexing is the same between the simple and decorated graph +done recusively until no minLength == zero edges so edges renumbered properly +network edges, pendant edges and root edges, are not collapsed +this wierd thype is to allow for polymorphism in graph type--basically a general phylogenetic graph +-} +collapseGraph + ∷ GenPhyloGraph a b + → GenPhyloGraph a b +collapseGraph inPhylograph@(inSimple, inC, inDecorated, inD, inE, inF) = + if LG.isEmpty inSimple + then inPhylograph + else + let inDecTreeEdgeList = filter (not . LG.isNetworkLabEdge inDecorated) $ LG.labEdges inDecorated + zeroEdgeList' = filter ((== 0.0) . minLength . thd3) inDecTreeEdgeList + + -- remove cases of pendent edges--don't remove those either + leafIndexList = fmap fst $ snd4 $ LG.splitVertexList inSimple + rootChildIndexList = fmap snd3 $ concatMap (LG.out inSimple) $ fmap fst $ fst4 $ LG.splitVertexList inSimple + zeroEdgeList = filter ((`notElem` (leafIndexList <> rootChildIndexList)) . snd3) zeroEdgeList' + in if null zeroEdgeList + then inPhylograph + else -- get node to be deleted and its out edges + + let nodeToDelete = snd3 $ head zeroEdgeList + sourceEdgeToDelete = fst3 $ head zeroEdgeList + + -- new dec edges + firstOutEdgeDec = head $ LG.out inDecorated nodeToDelete + secondOutEdgeDec = last $ LG.out inDecorated nodeToDelete + + newFirstEdgeDec = (sourceEdgeToDelete, snd3 firstOutEdgeDec, thd3 firstOutEdgeDec) + newSecondEdgeDec = (sourceEdgeToDelete, snd3 secondOutEdgeDec, thd3 secondOutEdgeDec) + + -- new simple edges + firstOutEdgeSimple = head $ LG.out inSimple nodeToDelete + secondOutEdgeSimple = last $ LG.out inSimple nodeToDelete + + newFirstEdgeSimple = (sourceEdgeToDelete, snd3 firstOutEdgeSimple, thd3 firstOutEdgeSimple) + newSecondEdgeSimple = (sourceEdgeToDelete, snd3 secondOutEdgeSimple, thd3 secondOutEdgeSimple) + + -- make new decorated--deleting node removes all incident edges + newDecGraph = LG.insEdges [newFirstEdgeDec, newSecondEdgeDec] $ LG.delNode nodeToDelete inDecorated + + -- make new simple--deleting node removes all incident edges + newSimpleGraph = LG.insEdges [newFirstEdgeSimple, newSecondEdgeSimple] $ LG.delNode nodeToDelete inSimple + in (newSimpleGraph, inC, newDecGraph, inD, inE, inF) + + +-- | collapseReducedGraph is a wrpper to collapseGraph +collapseReducedGraph ∷ ReducedPhylogeneticGraph → ReducedPhylogeneticGraph +collapseReducedGraph (inSimple, inC, inDecorated, inD, inF) = + let (newSimpleGraph, _, newDecGraph, _, _, _) = collapseGraph (inSimple, inC, inDecorated, mempty, mempty, inF) + in (newSimpleGraph, inC, newDecGraph, inD, inF) + + +{- | calculateGraphComplexity returns an infinite list of graph complexities indexed by +number of network nodes-assumes for now--single component graph not forest +first in pair is softwired complexity, second hardwired complexity +could coppy to vector for say 100 or so and offset infinite list after +to reduce lisyt access facvtor +-} +calculateGraphComplexity ∷ ProcessedData → IL.InfList (VertexCost, VertexCost) +calculateGraphComplexity (nameVect, _, blockDatVect) = + let numNetNodesList = IL.fromList [(0 ∷ Int) ..] + numRoots = 1 + numBlocks = V.length blockDatVect + graphComplexity = IL.map (getGraphComplexity (V.length nameVect) numRoots numBlocks) numNetNodesList + in graphComplexity + + +{- | old version with direct bit calcualtions +| getGraphComplexity' takes the number of leaves and number of +network nodes and calculates the graph complexity in bits +tree num edges (2n-2) n leaves * 2 nodes for each edge * (log 2n -1 vertices-- min specify) +-} +getGraphComplexity' ∷ Int → Int → Int → (VertexCost, VertexCost) +getGraphComplexity' numLeaves numRoots numNetNodes = + -- place holder for now + let nodeComplexity = logBase 2.0 (fromIntegral $ (2 * numLeaves) - 1 + numNetNodes) -- bits to specify each vertex + treeEdges = (2 * numLeaves) - 2 + extraRootEdges = 2 * (numRoots - 1) + baseTreeComplexity = nodeComplexity * fromIntegral (2 * (treeEdges - extraRootEdges)) + numDisplayTrees = 2.0 ** fromIntegral numNetNodes + harWiredEdges = (treeEdges - extraRootEdges) + (3 * numNetNodes) + hardwiredAddComplexity = nodeComplexity * fromIntegral (2 * harWiredEdges) + in -- maybe softwired is numDisplatTrees * harWired since have those edges in input + (baseTreeComplexity * numDisplayTrees, hardwiredAddComplexity) + + +{- | getGraphComplexity takes the number of leaves and number of +network nodes and calculates the algorithmic graph complexity in bits +-} +getGraphComplexity ∷ Int → Int → Int → Int → (VertexCost, VertexCost) +getGraphComplexity numLeaves numRoots numBlocks numNetNodes = + -- place holder for now + let graphProgram = GC.makeProgramStringGraph numLeaves 0 numRoots numNetNodes + (_, _, _, gzipGraph) = GCU.getInformationContent graphProgram + + graphDisplayProgram = GC.makeDisplayGraphString numLeaves 0 numRoots numNetNodes + (_, _, _, gzipDisplay) = GCU.getInformationContent graphDisplayProgram + + displayTreeSwitchingComplexity = fromIntegral numNetNodes + marginalDisplayComplexity = gzipDisplay - gzipGraph -- graphDisplayShannonBits - graphShannonBits + + -- cost of swithing (speciying) 1 bit per netNode then minimum of blocks as duspolay tree number since only have a few block usually + softWiredFactor = displayTreeSwitchingComplexity + ((min (2 ** fromIntegral numNetNodes) (fromIntegral numBlocks)) * marginalDisplayComplexity) + in (gzipGraph + softWiredFactor, gzipGraph) + + +{- | calculateMAPARootCost-- for now used NCM--but better to reflect empirical Pi (frequency) values +won't affect the search choice since a constant factor +-} +calculateMAPARootCost ∷ ProcessedData → VertexCost +calculateMAPARootCost = calculateNCMRootCost + + +{- | calculateNCMRootCost calcuates the contant fact of SUM -log 10 1/r over all characters +approximate for packed data (based on alphabet size for packed) +-} +calculateNCMRootCost ∷ ProcessedData → VertexCost +calculateNCMRootCost (_, _, blockDataV) = + if V.null blockDataV + then 0.0 + else V.sum $ fmap getBlockNCMRootCost blockDataV + + +-- | getBlockNCMRootCost gets NCM root cost for character block +getBlockNCMRootCost ∷ BlockData → VertexCost +getBlockNCMRootCost (_, charDataVV, charInfoV) = + if V.null charDataVV || V.null charInfoV + then 0 + else -- get length of each characters + -- this for prealigned and non-aligned sequences mainly + -- but if data are reorganized and packed--all data + + let numChars = V.length charInfoV + leafCharListV = fmap (charDataVV V.!) [0 .. numChars - 1] + -- False fo not use IA field + maxCharLengthList = zipWith (getMaxCharacterLength False) (V.toList charInfoV) (fmap V.toList leafCharListV) + weightList = fmap weight (V.toList charInfoV) + rootCostList = zipWith (*) weightList (fmap fromIntegral maxCharLengthList) + in -- trace ("GNCMR: " <> (show (numChars, maxCharLengthList, weightList, rootCostList))) $ + sum rootCostList + + +{- | calculatePMDLVertexComplexity creates a vertex cost as either the 'insertion' of character data +or as probbaility of seqeunce in bit based on element frequencies. +-} +calculatePMDLVertexComplexity ∷ Bool → Maybe (V.Vector (LG.LNode VertexInfo)) → ProcessedData → Maybe Int → VertexCost +calculatePMDLVertexComplexity useLogPiValues nodeV (nameVect, _, blockDataV) index = + -- trace ("In CPMDLRC") $ + if useLogPiValues + then -- this is based on average of leaves + -- root complexity base on log2 Pis + -- trace ("New way: " <> (show (getLogPiRootCost blockDataV))) $ + getLogPiRootCost blockDataV + else + if isNothing nodeV -- False so insert--if index /= Nothing + -- average of leaves + -- use insert based (but pi of '-' prob way underestimated) + then + let numLeaves = V.length nameVect + insertDataCost = V.sum $ fmap getblockInsertDataCost blockDataV + in insertDataCost / fromIntegral numLeaves + else -- optimized graph specific vertex cost-- not average + -- pull data from specific vertgex but make a singleton vector + + let nodeInfoVV = vertData $ snd $ (fromJust nodeV) V.! (fromJust index) + charInfoVV = fmap thd3 blockDataV + in -- trace ("U-PMDL-V: vertex " <> (show $ fromJust index)) $ + V.sum $ V.zipWith getVertexInsertBlock nodeInfoVV charInfoVV + + +{- getVertexInsertBlock returns block insert cost for a single vertex +-} +getVertexInsertBlock ∷ (V.Vector CharacterData) → V.Vector CharInfo → VertexCost +getVertexInsertBlock charDataV charInfoV = + V.sum $ V.zipWith getCharacterInsertCost charDataV charInfoV + + +{- | calculatePMDLRootCost creates a root cost as either the 'insertion' of character data +or as probbaility of seqeunce in bit based on element frequencies. +For sequence data averaged over leaf taxa +so root independent + +Wrapper around more flexible calculatePMDLVertexComplexity +-} +calculatePMDLRootCost ∷ Bool → ProcessedData → VertexCost +calculatePMDLRootCost useLogPiValues inData = + calculatePMDLVertexComplexity useLogPiValues Nothing inData Nothing + + +{- | getblockInsertDataCost gets the total cost of 'inserting' the data in a block +this most easily done before bit packing since won't vary anyway. +then store value in Global Settings +-} +getblockInsertDataCost ∷ BlockData → Double +getblockInsertDataCost (_, characterDataVV, charInfoV) = + V.sum $ fmap (getVerticesInsertCost charInfoV) characterDataVV + + +{- | getVerticesInsertCost is the cost or originating or 'inserting' all + vertex data in vector (usually leaf data, but could be singleton of vertex) + for all characters in a block +-} +getVerticesInsertCost ∷ V.Vector CharInfo → V.Vector CharacterData → Double +getVerticesInsertCost charInfoV charDataV = + V.sum $ V.zipWith getCharacterInsertCost charDataV charInfoV + + +-- | getLogPiRootCost gets log 2 ofelement dreqnecies over all data blocks +getLogPiRootCost ∷ V.Vector BlockData → Double +getLogPiRootCost inBlockDataV = + --trace ("In GLPRC Blocks: " <> (show $ (V.length inBlockDataV, fmap (fmap V.length) $ fmap GU.snd3 inBlockDataV, fmap (V.length . GU.thd3) inBlockDataV))) $ + V.sum $ fmap getBlockLogPiCost inBlockDataV + + +{- | getBlockLogPiCost sums up the character root complexity of a block +relies on pulling charcters out of block + +NOT CORRECT num chars and num tax confused +-} +getBlockLogPiCost ∷ BlockData → Double +getBlockLogPiCost (_, leafCharVV, charInfoV) = + if V.null charInfoV + then 0.0 + else -- create leaf by single character structure to be mapped over + + let charNumList = [0 .. (V.length $ V.head leafCharVV) - 1] + leafCharVV' = transposeVector leafCharVV + charVect = fmap (leafCharVV' V.!) (V.fromList charNumList) + in -- trace ("In GBLPC: " <> (show ((length charNumList, V.length leafCharVV'), V.length charInfoV))) $ + V.sum $ V.zipWith getCharacterLogPiCost charVect charInfoV + + +{- | getCharacterLogPiCost takes a character data (vector of terminals for single character) +and returns root complexity cost based on frequency of +sequence character elements and state number for no-sequence +-} +getCharacterLogPiCost ∷ V.Vector CharacterData → CharInfo → Double +getCharacterLogPiCost charDataV charInfo = + --trace ("In GCLPC: " <> (show $ V.length charDataV)) $ + if V.null charDataV + then 0.0 + else + let nonSequenceCharRootCost = getNonSequenceCharacterLogPiCost charInfo (V.head charDataV) + sequenceCharRootCost = getSequenceCharacterLogPiCost charDataV charInfo + in -- trace ("In GCLPC: " <> (show (nonSequenceCharRootCost, sequenceCharRootCost))) $ + nonSequenceCharRootCost + sequenceCharRootCost + +{- | getNonSequenceCharacterLogPiCost creates root cost for all add/nonAdd/Matrix characters +based number of character states and the log2 of that same for each leaf (since static) +so no need to average--can use a single leaf--hence head +-} +getNonSequenceCharacterLogPiCost ∷ CharInfo → CharacterData → Double +getNonSequenceCharacterLogPiCost charInfo inChar = + --trace ("In GCIC: " <> (show (charType charInfo, stateBVPrelim inChar))) $ + let localCharType = charType charInfo + + numStates = + if localCharType == NonAdd + then BV.dimension $ (V.head $ GU.snd3 $ stateBVPrelim inChar) + else + if localCharType == Add + then + let (a, b) = V.head $ GU.snd3 $ rangePrelim inChar + in toEnum (b - a) + else + if localCharType == Matrix + then toEnum $ V.length $ V.head $ matrixStatesPrelim inChar + else 0 ∷ Word + alphabetWeight = logBase 2.0 $ (fromIntegral $ numStates ∷ Double) + + insertCost + | localCharType == Add = alphabetWeight * fromIntegral (V.length $ GU.snd3 $ rangePrelim inChar) + | localCharType == NonAdd = alphabetWeight * fromIntegral (V.length $ GU.snd3 $ stateBVPrelim inChar) + | localCharType `elem` packedNonAddTypes = fromIntegral (UV.length $ GU.snd3 $ packedNonAddPrelim inChar) + | localCharType == Matrix = alphabetWeight * fromIntegral (V.length $ matrixStatesPrelim inChar) + | otherwise = error ("Non static character type: " <> (show localCharType)) + in + if localCharType `notElem` ([Add, NonAdd, Matrix] <> packedNonAddTypes) + then 0.0 + else -- trace ("GCIC: " <> (show (numStates, insertCost))) $ + insertCost + + +{- | getSequenceCharacterLogPiCost creates root cost for all characters as with add/nonAdd +based on the frequencies of character states and the log2 of that frequency for each state or sequence element +average cost over leaves so root placement independent +-} +getSequenceCharacterLogPiCost ∷ V.Vector CharacterData → CharInfo → Double +getSequenceCharacterLogPiCost charDataV charInfo = + --trace ("GSCLP") $ + if V.null charDataV + then error "No data to calculate root cost" + else + if charType charInfo `notElem` sequenceCharacterTypes + then 0.0 + else + let isNeyman = checkNeyman (costMatrix charInfo) + numLeaves = V.length charDataV + leafList = V.toList $ fmap (getSequenceLeafCharStateNumbers charInfo) charDataV + elementList = fmap fst $ head leafList + numberList = fmap (getElementNumberList (concat leafList)) elementList + elementNumList = zip elementList numberList + totalElements = fromIntegral $ sum $ fmap snd elementNumList + elemFreqList = fmap (/ totalElements) $ fmap fromIntegral $ fmap snd elementNumList + bitList = + if isNeyman + then replicate (length $ alphabet charInfo) $ logBase 2.0 $ (fromIntegral $ (length $ alphabet charInfo) ∷ Double) + else fmap (hereLogBase2) elemFreqList + in -- trace ("GSCLPC: " <> (show (isNeyman,elemFreqList, numberList, bitList))) $ + abs $ (sum $ zipWith (*) (fmap fromIntegral numberList) bitList) / (fromIntegral numLeaves) + where + hereLogBase2 a = + if a < epsilon + then 0.0 + else logBase 2.0 a + + +{- | checkNeyman take a matrix (type from PHANE) and returns True if + the matrix has all non-diagonal values the same + otherwise False +-} +checkNeyman ∷ S.Matrix Int → Bool +checkNeyman inMatrix = + if S.null inMatrix + then False + else + if S.rows inMatrix < 2 + then False + else + let nonDiagVal = inMatrix S.! (0, 1) + in checkMatrixVals inMatrix nonDiagVal 0 0 + + +-- | checkMatrixVals checks of all non-adiagnonal values equal input value +checkMatrixVals ∷ S.Matrix Int → Int → Int → Int → Bool +checkMatrixVals inMatrix nonDiagVal rowIndex columnIndex = + -- trace ("Check Matrix: " <> (show (rowIndex, columnIndex, S.rows inMatrix, S.cols inMatrix))) $ + if rowIndex >= S.rows inMatrix + then True + else + if columnIndex >= S.cols inMatrix + then checkMatrixVals inMatrix nonDiagVal (rowIndex + 1) 0 + else + if rowIndex == columnIndex + then checkMatrixVals inMatrix nonDiagVal rowIndex (columnIndex + 1) + else + if inMatrix S.! (rowIndex, columnIndex) /= nonDiagVal + then False + else checkMatrixVals inMatrix nonDiagVal rowIndex (columnIndex + 1) + + +{- | getCharacterElementFreq gets the alphabet elements of sequence characters in a +specific character over all leaves +-} +getCharacterElementFreq ∷ V.Vector CharacterData → CharInfo → [(String, Double)] +getCharacterElementFreq charDataV charInfo = + if V.null charDataV + then [] + else + let leafList = V.toList $ fmap (getSequenceLeafCharStateNumbers charInfo) charDataV + elementList = fmap fst $ head leafList + elementNumList = zip elementList (fmap (getElementNumberList (concat leafList)) elementList) + totalElements = fromIntegral $ sum $ fmap snd elementNumList + elemFreqList = fmap (/ totalElements) $ fmap fromIntegral $ fmap snd elementNumList + in zip elementList elemFreqList + + +{- | getElementNumberList takes a list of (String, Int) and returns +the summed Ints based on lisyt of String +-} +getElementNumberList ∷ [(String, Int)] → String → Int +getElementNumberList pairList matchSymbol = + if null pairList + then 0 + else + if (fst $ head pairList) == matchSymbol + then (snd $ head pairList) + getElementNumberList (tail pairList) matchSymbol + else getElementNumberList (tail pairList) matchSymbol + + +-- | getSequenceLeafCharStateNumbers gets the numebr of each charcter state as in alphabet +getSequenceLeafCharStateNumbers ∷ CharInfo → CharacterData → [(String, Int)] +getSequenceLeafCharStateNumbers charInfo blockDatum = + let localType = charType charInfo + localAlphabet = (ST.toString <$> alphabet charInfo) + + -- this to avoid recalculations and list access issues + lANES = (fromJust $ NE.nonEmpty $ alphabetSymbols localAlphabet) + lAList = NE.toList $ lANES + lAVect = V.fromList $ NE.toList $ lANES + + getCharState + ∷ ∀ {b} + . (Show b, Bits b) + ⇒ b + → String + getCharState a = (bitVectToCharState localAlphabet lANES lAVect a) <> "," + + -- get strings (with ",") of elements + stringStates + | localType `elem` sequenceCharacterTypes = case localType of + x | x `elem` [NucSeq] → (SV.foldMap getCharState (slimPrelim blockDatum)) + x | x `elem` [SlimSeq] → (SV.foldMap getCharState (slimPrelim blockDatum)) + x | x `elem` [WideSeq] → (UV.foldMap getCharState (widePrelim blockDatum)) + x | x `elem` [AminoSeq] → (UV.foldMap getCharState (widePrelim blockDatum)) + x | x `elem` [HugeSeq] → (foldMap getCharState (hugePrelim blockDatum)) + x | x `elem` [AlignedSlim] → (SV.foldMap getCharState $ snd3 $ alignedSlimPrelim blockDatum) + x | x `elem` [AlignedWide] → (UV.foldMap getCharState $ snd3 $ alignedWidePrelim blockDatum) + x | x `elem` [AlignedHuge] → (foldMap getCharState $ snd3 $ alignedHugePrelim blockDatum) + _ → error ("Un-implemented sequence data type " <> show localType) + | otherwise = error ("Non-sequence data type " <> show localType) + + -- filter out ambiguities (ie not in element list) + elementListList = filter (`elem` lAList) $ SL.splitOn "," stringStates + elementList = L.group $ L.sort elementListList + elementNumberList = fmap (getElementNumber elementList) lAList + in zip lAList elementNumberList + + +{- | getElementNumber takes a list of list of elements that has been grouped and checkes +if matches String and if such returns length else goes on to next input String +-} +getElementNumber ∷ [[String]] → String → Int +getElementNumber symbolLL matchSymbol = + if null symbolLL + then 0 + else + if (head $ head symbolLL) == matchSymbol + then length $ head symbolLL + else getElementNumber (tail symbolLL) matchSymbol + + +{- | getCharacterInsertCost takes a character and characterInfo and returns origination/insert cost for the character +for PMDL of add, non add matrix log2 of alphabet size +This uses actual inserts of different elements as oposedd to average, although averaged over leaves so not +root dependant +-} +getCharacterInsertCost ∷ CharacterData → CharInfo → Double +getCharacterInsertCost inChar charInfo = + -- trace ("In GCIC: " <> (show $ charType charInfo)) $ + let localCharType = charType charInfo + thisWeight = weight charInfo + + numStates = + if localCharType == NonAdd + then BV.dimension $ (V.head $ GU.snd3 $ stateBVPrelim inChar) + else + if localCharType == Add + then + let (a, b) = V.head $ GU.snd3 $ rangePrelim inChar + in toEnum (b - a) + else + if localCharType == Matrix + then toEnum $ V.length $ V.head $ matrixStatesPrelim inChar + else 0 ∷ Word + alphabetWeight = logBase 2.0 $ (fromIntegral $ numStates ∷ Double) + + slimLength = max (SV.length $ slimPrelim inChar) (SV.length $ snd3 $ alignedSlimPrelim inChar) + wideLength = max (UV.length $ widePrelim inChar) (UV.length $ snd3 $ alignedWidePrelim inChar) + + allGapSlim = SV.replicate slimLength $ (0 ∷ SlimState) `setBit` fromEnum gapIndex + allGapWide = UV.replicate wideLength $ (0 ∷ WideState) `setBit` fromEnum gapIndex + + hugeGapChar = ((V.head $ hugePrelim inChar) `xor` (V.head $ hugePrelim inChar)) `setBit` fromEnum gapIndex + allGapHuge = V.replicate (V.length $ hugePrelim inChar) hugeGapChar + + hugeGapCharAligned = ((V.head $ snd3 $ alignedHugePrelim inChar) `xor` (V.head $ snd3 $ alignedHugePrelim inChar)) `setBit` fromEnum gapIndex + allGapHugeAligned = V.replicate (V.length $ snd3 $ alignedHugePrelim inChar) hugeGapCharAligned + + -- don't need to worry about noCost Adjustment and medioan issue since "-" -> "-" is zero + -- otherwise would hahve to subtract off extra no-change cost (gap to gap) + insertCost + | localCharType == Add = alphabetWeight * fromIntegral (V.length $ GU.snd3 $ rangePrelim inChar) + | localCharType == NonAdd = alphabetWeight * fromIntegral (V.length $ GU.snd3 $ stateBVPrelim inChar) + | localCharType `elem` packedNonAddTypes = fromIntegral (UV.length $ GU.snd3 $ packedNonAddPrelim inChar) + | localCharType == Matrix = alphabetWeight * fromIntegral (V.length $ matrixStatesPrelim inChar) + | localCharType == SlimSeq = fromIntegral $ snd $ get2WaySlim (slimTCM charInfo) allGapSlim (slimPrelim inChar) + | localCharType == NucSeq = fromIntegral $ snd $ get2WaySlim (slimTCM charInfo) allGapSlim (slimPrelim inChar) + | localCharType == WideSeq = + fromIntegral $ snd $ get2WayWideHuge (wideTCM charInfo) allGapWide (widePrelim inChar) + | localCharType == AminoSeq = + fromIntegral $ snd $ get2WayWideHuge (wideTCM charInfo) allGapWide (widePrelim inChar) + | localCharType == HugeSeq = + fromIntegral $ snd $ get2WayWideHuge (hugeTCM charInfo) allGapHuge (hugePrelim inChar) + | localCharType == AlignedSlim = + fromIntegral $ snd $ get2WaySlim (slimTCM charInfo) allGapSlim (snd3 $ alignedSlimPrelim inChar) + | localCharType == AlignedWide = + fromIntegral $ snd $ get2WayWideHuge (wideTCM charInfo) allGapWide (snd3 $ alignedWidePrelim inChar) + | localCharType == AlignedHuge = + fromIntegral $ snd $ get2WayWideHuge (hugeTCM charInfo) allGapHugeAligned (snd3 $ alignedHugePrelim inChar) + | otherwise = error ("Character type unimplemented : " <> show localCharType) + in -- trace ("GCIC: " <> (show (insertCost, thisWeight * insertCost, allGapSlim, snd3 $ alignedSlimPrelim inChar))) $ + thisWeight * insertCost + + +{- | splitSequence takes a ShortText divider and splits a list of ShortText on +that ShortText divider consuming it akin to Text.splitOn +-} +splitSequence ∷ ST.ShortText → [ST.ShortText] → [[ST.ShortText]] +splitSequence partitionST stList = + if null stList + then [] + else + let firstPart = takeWhile (/= partitionST) stList + restList = dropWhile (/= partitionST) stList + in if restList == [partitionST] + then firstPart : [[ST.fromString "#"]] + else + if not $ null restList + then firstPart : splitSequence partitionST (tail restList) + else [firstPart] + + +-- See Bio.DynamicCharacter.decodeState for a better implementation for dynamic character elements +bitVectToCharStateQual ∷ (Show b, FiniteBits b, Bits b) ⇒ Alphabet String → b → String +bitVectToCharStateQual localAlphabet bitValue = + let charString = L.intercalate "," $ foldr pollSymbol mempty indices + in if popCount bitValue == finiteBitSize bitValue + then "?" + else + if popCount bitValue > 1 + then "[" <> charString <> "]" + else charString + where + indices = [0 .. len - 1] + len = length vec + -- this is a hack--the alphabets for non-additive charcaters gets truncated to binary at some point earlier + vec = V.fromList $ fmap show [0 .. finiteBitSize bitValue - 1] + pollSymbol i polled + | bitValue `testBit` i = (vec V.! i) : polled + | otherwise = polled + + +-- See Bio.DynamicCharacter.decodeState for a better implementation for dynamic character elements +-- this for TNT output of qualitative characters +bitVectToCharStateNonAdd ∷ (Show b, FiniteBits b, Bits b) ⇒ Alphabet String → b → String +bitVectToCharStateNonAdd localAlphabet bitValue = + let stateList = [0 .. (finiteBitSize bitValue) - 1] + stateCharList = fmap (: []) $ ['0' .. '9'] <> ['A' .. 'Z'] <> ['a' .. 'z'] + bitOnList = fmap (testBit bitValue) stateList + statesON = fmap fst $ filter ((== True) . snd) $ zip stateCharList bitOnList + charString = concat statesON + in -- trace ("BVNA: " <> (show (bitValue, bitOnList, charString))) $ + if popCount bitValue == finiteBitSize bitValue + then "?" + else + if popCount bitValue > 1 + then "[" <> charString <> "]" + else charString + + +-- See Bio.DynamicCharacter.decodeState for a better implementation for dynamic character elements +bitVectToCharState' ∷ (FiniteBits b, Bits b) ⇒ Alphabet String → b → String +bitVectToCharState' localAlphabet bitValue = + -- check for symbol length > 1 then add space (since sorted last element longest) + let -- maxSymbolLength = maximum (length <$> SET.toList (alphabetSymbols localAlphabet)) + charString = foldr pollSymbol mempty indices + charString' = L.intercalate "," $ filter (/= "\8220") charString + in -- trace ("BV2CSA:" <> (show (maxSymbolLength, SET.toList (alphabetSymbols localAlphabet) ))) ( + if popCount bitValue == finiteBitSize bitValue + then "?" + else + if popCount bitValue > 1 + then "[" <> charString' <> "]" <> " " + else charString' <> " " + where + -- ) + + indices = [0 .. len - 1] + len = length vec + vec = alphabetSymbols localAlphabet + pollSymbol i polled + | bitValue `testBit` i = (vec V.! i) : polled + | otherwise = polled + + +bitVectToCharState ∷ (Show b, Bits b) ⇒ Alphabet String → NonEmpty String → V.Vector String → b → String +bitVectToCharState localAlphabet localAlphabetNEString localAlphabetVect bitValue = + -- trace ("BVCA': " <> (show $ isAlphabetAminoAcid localAlphabet) <> " " <> (show $ isAlphabetDna localAlphabet) <> " " <> (show localAlphabet)) ( + let stringVal' = foldr pollSymbol mempty indices + stringVal = concat stringVal' + in {- + if length stringVal == 1 then L.intercalate "," stringVal' <> " " + else + + -- if isAlphabetAminoAcid localAlphabet then + if SET.size (alphabetSymbols localAlphabet) > 5 then + + if stringVal == "DN" then "B" <> " " + else if stringVal == "EQ" then "Z" <> " " + else if stringVal == "ACDEFGHIKLMNPQRSTVWY" then "X" <> " " + else if stringVal == "-ACDEFGHIKLMNPQRSTVWY" then "?" <> " " + -- amino acid polymorphisms without ambiguity codes + else "[" <> stringVal <> "]" <> " " + + -- Nucleotide IUPAC + -- hack until fix isDNA and RNA alphabet + -- else if ((show localAlphabet) == ("Alphabet: {\"-\", \"A\", \"C\", \"G\", \"T\"}")) || ((show localAlphabet) == ("Alphabet: {\"-\", \"A\", \"C\", \"G\", \"U\"}")) then + -} + + if (isAlphabetDna localAlphabet || isAlphabetRna localAlphabet) && (SET.size (alphabetSymbols localAlphabet) == 5) + then + if stringVal `elem` ["", "-"] + then "-" + else + if length stringVal == 1 + then stringVal + else + if stringVal == "AG" + then "R" + else + if stringVal == "CT" + then "Y" + else + if stringVal == "CG" + then "S" + else + if stringVal == "AT" + then "W" + else + if stringVal == "GT" + then "K" + else + if stringVal == "AC" + then "M" + else + if stringVal == "CGT" + then "B" + else + if stringVal == "AGT" + then "D" + else + if stringVal == "ACT" + then "H" + else + if stringVal == "ACG" + then "V" + else + if stringVal == "ACGT" + then "N" + else + if stringVal == "-ACGT" + then "?" + else -- ours for gap chars and nuc + + if stringVal == "-A" + then "a" + else + if stringVal == "-C" + then "c" + else + if stringVal == "-G" + then "g" + else + if stringVal == "-T" + then "t" + else + if stringVal == "-AG" + then "r" + else + if stringVal == "-CT" + then "y" + else + if stringVal == "-CG" + then "s" + else + if stringVal == "-AT" + then "w" + else + if stringVal == "-GT" + then "k" + else + if stringVal == "-AC" + then "m" + else + if stringVal == "-CGT" + then "b" + else + if stringVal == "-AGT" + then "d" + else + if stringVal == "-ACT" + then "h" + else + if stringVal == "-ACG" + then "v" + else "Unrecognized nucleic acid ambiguity code : " <> "|" <> stringVal <> "|" + else -- AA IUPAC + + if isAlphabetAminoAcid localAlphabet && (SET.size (alphabetSymbols localAlphabet) > 5) + then + if length stringVal == 1 + then stringVal <> " " + else + if stringVal == "DN" + then "B" + else + if stringVal == "EQ" + then "Z" + else + if stringVal == "ACDEFGHIKLMNPQRSTVWY" + then "X" + else + if stringVal == "-ACDEFGHIKLMNPQRSTVWY" + then "?" + else -- amino acid polymorphisms without ambiguity codes + "[" <> stringVal <> "]" <> " " + else (bitVectToCharState'' localAlphabetNEString localAlphabetVect bitValue) <> " " + where + indices = [0 .. len - 1] + len = length vec + vec = localAlphabetVect -- alphabetSymbols localAlphabet + pollSymbol i polled + | bitValue `testBit` i = (vec V.! i) : polled + | otherwise = polled + + +-- bitVectToCharState'' takes a bit vector representation and returns a list states as integers +bitVectToCharState'' ∷ (Bits b) ⇒ NonEmpty String → V.Vector String → b → String +bitVectToCharState'' localAlphabet localAlphabetVect bitValue + | isAlphabetDna hereAlphabet = fold $ iupacToDna BM.!> (NE.fromList observedSymbols) + | isAlphabetAminoAcid hereAlphabet = fold $ iupacToAminoAcid BM.!> (NE.fromList observedSymbols) + | otherwise -- L.intercalate "," $ toList observedSymbols + = + let symbolList = toList observedSymbols + symbolComma = L.intercalate "," observedSymbols + in if length observedSymbols == 1 + then symbolComma + else + if length observedSymbols == 0 + then "-" + else ('[' : symbolComma) <> "]" + where + hereAlphabet = fromSymbols localAlphabet + symbolCountH = length localAlphabet + observedSymbols = + -- NE.fromList $ + foldMap + -- (\ i -> [localAlphabet NE.!! i | bitValue `testBit` i]) + (\i → [localAlphabetVect V.! i | bitValue `testBit` i]) + [0 .. symbolCountH - 1] + + +-- | matrixStateToStringtakes a matrix state and returns a string representation +matrixStateToString ∷ V.Vector MatrixTriple → String +matrixStateToString inStateVect = + let minCost = V.minimum $ fmap fst3 inStateVect + minCostStates = V.toList $ V.filter ((== minCost) . fst3) inStateVect + statesStringList = fmap show minCostStates + in if length statesStringList == 1 + then head statesStringList + else "[" <> unwords statesStringList <> "]" + + +{- | additivStateToString take an additive range and prints single state if range equal or +[ab] if not +[a-b] causes problems with TNT +-} +additivStateToString ∷ V.Vector String → (Int, Int) → String +additivStateToString localAlphabet (a, b) = + if a == b + then show a + else + if (show a == V.head localAlphabet) && (show b == V.last localAlphabet) + then "?" + else "[" <> show a <> show b <> "]" + + +{- | filledDataFields takes rawData and checks taxon to see what percent +"characters" are found. +call with (0,0) +-} +filledDataFields ∷ (Int, Int) → TermData → (NameText, Int, Int) +filledDataFields (hasData, totalData) (taxName, taxData) + | null taxData = (taxName, hasData, totalData) + | ST.length (head taxData) == 0 = filledDataFields (hasData, 1 + totalData) (taxName, tail taxData) + | otherwise = filledDataFields (1 + hasData, 1 + totalData) (taxName, tail taxData) + + +{- | stripComments removes all lines that being with "--" haskell stype comments +needs to be reversed on return to maintain order. +-} +stripComments ∷ [String] → [String] +stripComments inStringList = + if null inStringList + then [] + else + let strippedLine = GU.stripString $ head inStringList + in if null strippedLine + then stripComments $ tail inStringList + else + if length strippedLine < 2 + then strippedLine : stripComments (tail inStringList) + else + if "--" == take 2 strippedLine + then stripComments $ tail inStringList + else strippedLine : stripComments (tail inStringList) + + +-- | getDecoratedGraphBlockCharInformation takes decorated graph and reports number of blosk and size of each +getDecoratedGraphBlockCharInformation ∷ DecoratedGraph → ((Int, Int), [V.Vector Int]) +getDecoratedGraphBlockCharInformation inGraph = + if LG.isEmpty inGraph + then ((0, 0), []) + else -- get a vertices from graph and take their information + + let inVertDataList = fmap (vertData . snd) (LG.labNodes inGraph) + blockNumMax = maximum $ fmap length inVertDataList + blocknumMin = minimum $ fmap length inVertDataList + blockLengthList = fmap (fmap length) inVertDataList + in ((blockNumMax, blocknumMin), blockLengthList) + + +{- | vectMaybeHead takes a vector and returns JUst V.head if not V.empty +Nothing otherwise +-} +vectMaybeHead ∷ V.Vector a → Maybe a +vectMaybeHead inVect = + if V.null inVect + then Nothing + else Just (V.head inVect) + + +-- vectResolveMaybe takes a Vector of Maybe a +-- and returns Just a or V.empty +vectResolveMaybe ∷ V.Vector (Maybe a) → V.Vector a +vectResolveMaybe inVect = + -- trace ("VRM " <> show (length inVect)) $ + if isNothing (V.head inVect) + then V.empty + else V.singleton $ fromJust $ V.head inVect + + +{- | getNumberPrealignedCharacters takes processed data and returns the number of prealigned sequence characters +used to special case procedurs with prealigned sequences +-} +getNumberPrealignedCharacters ∷ V.Vector BlockData → Int +getNumberPrealignedCharacters blockDataVect = + if V.null blockDataVect + then 0 + else + let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + sequenceChars = length $ V.filter id $ V.map (`elem` prealignedCharacterTypes) characterTypes + in sequenceChars + getNumberPrealignedCharacters (V.tail blockDataVect) + + +{- | getNumberNonExactCharacters takes processed data and returns the number of non-exact (unaligned seqeunce) characters +used to special case procedures with unaligned sequences +-} +getNumberNonExactCharacters ∷ V.Vector BlockData → Int +getNumberNonExactCharacters blockDataVect = + if V.null blockDataVect + then 0 + else + let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + sequenceChars = length $ V.filter id $ V.map (`elem` nonExactCharacterTypes) characterTypes + in sequenceChars + getNumberNonExactCharacters (V.tail blockDataVect) + + +{- | getNumberSequenceCharacters takes processed data and returns the number of non-exact (= sequence) characters +utilized to special case datasets with limited non-exact characters +-} +getNumberSequenceCharacters ∷ V.Vector BlockData → Int +getNumberSequenceCharacters blockDataVect = + if V.null blockDataVect + then 0 + else + let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + sequenceChars = length $ V.filter id $ V.map (`elem` sequenceCharacterTypes) characterTypes + in sequenceChars + getNumberSequenceCharacters (V.tail blockDataVect) + + +{- | getNumber4864PackedChars takes processed data and returns the number of +packed characters with states in 4, 8, and 64 +this for NCM since weightiong may be apperoximate and needs to be rediagnosed +-} +getNumber4864PackedChars ∷ V.Vector BlockData → Int +getNumber4864PackedChars blockDataVect = + if V.null blockDataVect + then 0 + else + let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + packedChars = length $ V.filter id $ V.map (`elem` [Packed4, Packed8, Packed64]) characterTypes + in packedChars + getNumber4864PackedChars (V.tail blockDataVect) + + +{- | has4864PackedChars takes processed data and if has +packed characters with states in 4, 8, and 64 +this for NCM since weightiong may be apperoximate and needs to be rediagnosed +-} +has4864PackedChars ∷ V.Vector BlockData → Bool +has4864PackedChars blockDataVect = + not (V.null blockDataVect) + && ( let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + packedChars = length $ V.filter id $ V.map (`elem` [Packed4, Packed8, Packed64]) characterTypes + in ((packedChars > 0) || has4864PackedChars (V.tail blockDataVect)) + ) + + +{- | getLengthSequenceCharacters takes processed data and returns the total length (maximum) of non-exact (= sequence) characters +utilised to get rough estimate of fraction of non-exact characters for +dynamic epsilon adjustment to data types +maximum since that ius th eminimum length of optimized HTU sequences +-} +getLengthSequenceCharacters ∷ V.Vector BlockData → Int +getLengthSequenceCharacters blockDataVect = + if V.null blockDataVect + then 0 + else + let -- get character info + firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + + -- get sequences in block + firstBlockCharacters = GU.snd3 $ V.head blockDataVect + (sequenceCharVect, _) = V.unzip $ V.filter snd (V.zip firstBlockCharacters (V.map (`elem` sequenceCharacterTypes) characterTypes)) + + -- get max length sequence data + sequenceCharsLength = V.sum $ fmap (V.maximum . fmap getMaxCharLength) sequenceCharVect + in -- trace ("GLSC: " <> (show (sequenceCharsLength, V.length firstBlockCharacters, fmap V.length firstBlockCharacters))) + sequenceCharsLength + getLengthSequenceCharacters (V.tail blockDataVect) + + +-- | getMaxCharLength takes characterData and returns the length of the longest character field from preliminary +getMaxCharLength ∷ CharacterData → Int +getMaxCharLength inChardata = + let nonAdd = (V.length . snd3) $ stateBVPrelim inChardata + add = (V.length . snd3) $ rangePrelim inChardata + matrix = V.length $ matrixStatesPrelim inChardata + slim = (SV.length . snd3) $ slimGapped inChardata + wide = (UV.length . snd3) $ wideGapped inChardata + huge = (V.length . snd3) $ hugeGapped inChardata + aSlim = (SV.length . snd3) $ alignedSlimPrelim inChardata + aWide = (UV.length . snd3) $ alignedWidePrelim inChardata + aHuge = (V.length . snd3) $ alignedHugePrelim inChardata + packed = 32 * ((UV.length . snd3) $ packedNonAddPrelim inChardata) + aBitChar = (UV.head . snd3) $ packedNonAddPrelim inChardata + missingBitChar = complement $ aBitChar `xor` aBitChar + packedNotMissingChars = 32 * (UV.length $ UV.filter (/= missingBitChar) $ snd3 $ packedNonAddPrelim inChardata) + in -- trace ("GMCL: " <> (show [nonAdd, add, matrix, slim, wide, huge, aSlim, aWide, aHuge, packed])) + -- trace ("GMCL: " <> show (nonAdd, add, matrix, packed, packedNotMissingChars)) $ + maximum [nonAdd, add, matrix, slim, wide, huge, aSlim, aWide, aHuge, packedNotMissingChars] + + +{- | getNumberExactCharacters takes processed data and returns the number of non-exact characters +ised to special case datasets with limited non-exact characters +-} +getNumberExactCharacters ∷ V.Vector BlockData → Int +getNumberExactCharacters blockDataVect = + if V.null blockDataVect + then 0 + else + let firstBlock = GU.thd3 $ V.head blockDataVect + characterTypes = V.map charType firstBlock + exactChars = length $ V.filter id $ V.map (`elem` exactCharacterTypes) characterTypes + in exactChars + getNumberExactCharacters (V.tail blockDataVect) + + +{- | getPairwiseObservationsGraph gets the observations between a pairs of vertices on a graph +that are non-missing +used to normalize Wagner deltas +-} +getPairwiseObservationsGraph ∷ VertexBlockData → VertexBlockData → VertexCost +getPairwiseObservationsGraph vertexBlockDataI vertexBlockDataJ = + let blockObsV = V.zipWith getPairedBlockObs vertexBlockDataI vertexBlockDataJ + in if V.null vertexBlockDataI || V.null vertexBlockDataJ + then 0.0 + else fromIntegral $ V.sum blockObsV + + +-- | getPairedBlockObs takes zipped block of chardata vectors and returns block paired observaiton that are non-missing +getPairedBlockObs ∷ V.Vector CharacterData → V.Vector CharacterData → Int +getPairedBlockObs charVectI charVectJ = + if V.null charVectI || V.null charVectJ + then 0 + else V.sum $ V.zipWith getPairCharNonMissing charVectI charVectJ + + +-- | getPairCharNonMissing gets non-missing character numbers from a pair of characters +getPairCharNonMissing ∷ CharacterData → CharacterData → Int +getPairCharNonMissing iTaxon jTaxon = + let obsI = getMaxCharLength iTaxon + obsJ = getMaxCharLength jTaxon + in -- trace ("GPCL: " <> (show (iIndex, jIndex, V.length charDataV))) $ + if obsI == 0 || obsJ == 0 + then 0 + else max obsI obsJ + + +{- | getPairwiseObservations gets the observations between a pairs of leaves that are non-missing +used to normalize distances +-} +getPairwiseObservations ∷ V.Vector BlockData → (Int, Int) → VertexCost +getPairwiseObservations blocKDataV pairTax = + if V.null blocKDataV + then 0 + else fromIntegral $ V.sum (fmap (getPairBlockObs pairTax) blocKDataV) + + +-- | getMaxBlockObs gets the supremum over taxa number of characters in a block of data +getPairBlockObs ∷ (Int, Int) → BlockData → Int +getPairBlockObs pairTax (_, charDataVV, _) = + if V.null charDataVV + then 0 + else + let newListList = L.transpose $ V.toList $ fmap V.toList charDataVV + charTaxVect = V.fromList $ fmap V.fromList newListList + in -- trace ("GPBO: " <> (show (V.length charDataVV, V.length charTaxVect, fmap V.length charTaxVect))) $ + V.sum (fmap (getPairCharLength pairTax) charTaxVect) + + +{- | getPairBlockObs get non-missing observations between taxa +NB--does not go into qualitative or packed charcters and check for missing values +other than "all missing" packed +-} +getPairCharLength ∷ (Int, Int) → V.Vector CharacterData → Int +getPairCharLength (iIndex, jIndex) charDataV = + if V.null charDataV + then 0 + else + let iTaxon = charDataV V.! iIndex + jTaxon = charDataV V.! jIndex + obsI = getMaxCharLength iTaxon + obsJ = getMaxCharLength jTaxon + in -- trace ("GPCL: " <> (show (iIndex, jIndex, V.length charDataV))) $ + if obsI == 0 || obsJ == 0 + then 0 + else max obsI obsJ + + +{- | getMaxNumberObservations takes data set and returns the supremum of character numbers from all +taxa over all charcaters (sequence and qiualitative) +used for various normalizations +-} +getMaxNumberObservations ∷ V.Vector BlockData → PhyG VertexCost +getMaxNumberObservations blocKDataV + | V.null blocKDataV = pure 0 + | otherwise = + getParallelChunkTraverse >>= \pTraverse → + fmap (fromIntegral . sum) . pTraverse getMaxBlockObs $ V.toList blocKDataV + + +-- | getMaxBlockObs gets the supremum over taxa number of characters in a block of data +getMaxBlockObs ∷ BlockData → PhyG Int +getMaxBlockObs (_, charDataVV, _) + | V.null charDataVV = pure 0 + | otherwise = + let newListList = L.transpose $ V.toList $ fmap V.toList charDataVV + in getParallelChunkTraverse >>= \pTraverse → + sum <$> pTraverse getSupCharLength newListList + + +{- | getMaxCharLength takes a vector of charcters and returns the supremum of observations for that character +over all taxa +-} +getSupCharLength ∷ [CharacterData] → PhyG Int +getSupCharLength charDataV + | null charDataV = pure 0 + | otherwise = + getParallelChunkMap <&> \pMap → + maximum $ getMaxCharLength `pMap` charDataV + + +-- getFractionDynamic returns fraction (really of length) of dynamic charcters for adjustment to dynamicEpsilon +getFractionDynamic ∷ ProcessedData → Double +getFractionDynamic inData = + let numStaticCharacters = getNumberExactCharacters $ thd3 inData + lengthDynamicCharacters = getLengthSequenceCharacters $ thd3 inData + in fromIntegral lengthDynamicCharacters / fromIntegral (lengthDynamicCharacters + numStaticCharacters) + + +{- | splitBlockCharacters takes a block of characters (vector) and splits into two partitions of exact (Add, NonAdd, Matrix) and sequence characters +(= nonExact) using accumulators +-} +splitBlockCharacters + ∷ V.Vector (V.Vector CharacterData) + → V.Vector CharInfo + → Int + → [([CharacterData], CharInfo)] + → [([CharacterData], CharInfo)] + → (BlockData, BlockData) +splitBlockCharacters inDataVV inCharInfoV localIndex exactCharPairList seqCharPairList = + if localIndex == V.length inCharInfoV + then + let (exactDataList, exactCharInfoList) = unzip exactCharPairList + (sequenceDataList, sequenceCharInfoList) = unzip seqCharPairList + newExactCharInfoVect = V.fromList $ reverse exactCharInfoList + newSeqCharCharInfoVect = V.fromList $ reverse sequenceCharInfoList + newExactData = V.fromList $ fmap (V.fromList . reverse) (L.transpose exactDataList) + newSeqCharData = V.fromList $ fmap (V.fromList . reverse) (L.transpose sequenceDataList) + in ( (T.pack "ExactCharacters", newExactData, newExactCharInfoVect) + , (T.pack "Non-ExactCharacters", newSeqCharData, newSeqCharCharInfoVect) + ) + else + let localCharacterType = charType (inCharInfoV V.! localIndex) + thisCharacterData = V.toList $ fmap (V.! localIndex) inDataVV + newPair = (thisCharacterData, inCharInfoV V.! localIndex) + in if localCharacterType `elem` exactCharacterTypes + then splitBlockCharacters inDataVV inCharInfoV (localIndex + 1) (newPair : exactCharPairList) seqCharPairList + else + if localCharacterType `elem` sequenceCharacterTypes + then splitBlockCharacters inDataVV inCharInfoV (localIndex + 1) exactCharPairList (newPair : seqCharPairList) + else error ("Unrecongized/implemented character type: " <> show localCharacterType) + + +-- | safeVectorHead safe vector head--throws error if null +safeVectorHead ∷ V.Vector a → a +safeVectorHead inVect = + if V.null inVect + then error "Empty vector in safeVectorHead" + else V.head inVect + + +{- | get leftRightChildLabelBV takes a pair of vertex labels and returns left and right +based on their bitvector representation. This ensures left/right consistancey in +pre and postoder passes, and with bitvectors of leaves determined by data hash, +ensures label invariance with repect to leaves +larger bitvector goes second (bigge) +-} +leftRightChildLabelBV ∷ (VertexInfo, VertexInfo) → (VertexInfo, VertexInfo) +leftRightChildLabelBV inPair@(firstNode, secondNode) = + let firstLabel = bvLabel firstNode + secondLabel = bvLabel secondNode + in if firstLabel > secondLabel + then (secondNode, firstNode) + else inPair + + +{- | get leftRightChildLabelBVNode takes a pair ofnodes and returns left and right +based on their bitvector representation. This ensures left/right consistancey in +pre and postoder passes, and with bitvectors of leaves determined by data hash, +ensures label invariance with repect to leaves +larger bitvector goes second (bigge) +-} +leftRightChildLabelBVNode ∷ (LG.LNode VertexInfo, LG.LNode VertexInfo) → (LG.LNode VertexInfo, LG.LNode VertexInfo) +leftRightChildLabelBVNode inPair@(firstNode, secondNode) = + let firstLabel = bvLabel $ snd firstNode + secondLabel = bvLabel $ snd secondNode + in if firstLabel > secondLabel + then (secondNode, firstNode) + else inPair + + +{- | prettyPrintVertexInfo returns a string with formated version of +vertex info +-} +prettyPrintVertexInfo ∷ VertexInfo → String +prettyPrintVertexInfo inVertData = + let zerothPart = "Vertex name " <> T.unpack (vertName inVertData) <> " Index " <> show (index inVertData) + firstPart = "\n\tBitVector (as number) " <> show ((BV.toUnsignedNumber $ bvLabel inVertData) ∷ Int) + secondPart = "\n\tParents " <> show (parents inVertData) <> " Children " <> show (children inVertData) + thirdPart = + "\n\tType " + <> show (nodeType inVertData) + <> " Local Cost " + <> show (vertexCost inVertData) + <> " SubGraph Cost " + <> show (subGraphCost inVertData) + fourthPart = + "\n\tData Blocks: " + <> show (V.length $ vertData inVertData) + <> " Characters (by block) " + <> show (V.length <$> vertData inVertData) + fifthPart = "\n\t" <> show (vertData inVertData) + in zerothPart <> firstPart <> secondPart <> thirdPart <> fourthPart <> fifthPart + + +-- | add3 adds three values +add3 ∷ (Num a) ⇒ a → a → a → a +add3 x y z = x + y + z + + +{- | getProcessDataByBlock takes ProcessData and returns a list of Processed data with one block +per processed data element +argument to filter terminals with missing taxa +wraps around getProcessDataByBlock' with counter +-} +getProcessDataByBlock ∷ Bool → ProcessedData → [ProcessedData] +getProcessDataByBlock filterMissing (nameVect, nameBVVect, blockDataVect) = reverse $ getProcessDataByBlock' filterMissing 0 (nameVect, nameBVVect, blockDataVect) + + +{- | getProcessDataByBlock' called by getProcessDataByBlock with counter +and later reversed +-} +getProcessDataByBlock' ∷ Bool → Int → ProcessedData → [ProcessedData] +getProcessDataByBlock' filterMissing counter (nameVect, nameBVVect, blockDataVect) + | V.null blockDataVect = [] + | counter == V.length blockDataVect = [] + | otherwise = + let thisBlockData = blockDataVect V.! counter + in if not filterMissing + then + (nameVect, nameBVVect, V.singleton thisBlockData) + : getProcessDataByBlock' filterMissing (counter + 1) (nameVect, nameBVVect, blockDataVect) + else + let (blockName, charDataLeafVect, blockCharInfo) = thisBlockData + isMissingVect = V.map V.null charDataLeafVect + (nonMissingNameVect, nonMissingBVVect, nonMissingLeafData, _) = V.unzip4 $ V.filter (not . GU.fth4) (V.zip4 nameVect nameBVVect charDataLeafVect isMissingVect) + nonMissingBlockData = (blockName, nonMissingLeafData, blockCharInfo) + in (nonMissingNameVect, nonMissingBVVect, V.singleton nonMissingBlockData) + : getProcessDataByBlock' filterMissing (counter + 1) (nameVect, nameBVVect, blockDataVect) + + +{- | copyToNothing takes VertexBlockData and copies to VertexBlockDataMaybe +data as nothing +-} +copyToNothing ∷ VertexBlockData → VertexBlockDataMaybe +copyToNothing = fmap setNothing + where + setNothing a = V.replicate (V.length a) Nothing + + +{- | copyToJust takes VertexBlockData and copies to VertexBlockDataMaybe +data as Just CharacterData +-} +copyToJust ∷ VertexBlockData → VertexBlockDataMaybe +copyToJust = fmap (fmap Just) + + +{- | simAnnealAccept takes simulated annealing parameters, current best graph (e) cost, +candidate graph cost (e') and a uniform random integer and returns a Bool to accept or reject +the candidate solution +the basic method is + 1) accepts if current is better + 2) Otherwise prob accept = exp(-(e' -e)/T) +where T is a step from max to min +maxT and minT can probbaly be set to 100 and 1 or something but leaving some flexibility +curStep == 0 random walk (always accept) +curStep == (numSteps -1) greedy False is not better +-} +simAnnealAccept ∷ Maybe SAParams → VertexCost → VertexCost → PhyG (Bool, Maybe SAParams) +simAnnealAccept inParams curBestCost candCost = case inParams of + Nothing → error "simAnnealAccept Simulated anneling parameters = Nothing" + Just simAnealVals → case method simAnealVals of + -- drifting probs + Drift → driftAccept inParams curBestCost candCost + _ → + -- simulated annealing probs + let numSteps = numberSteps simAnealVals + curStep = currentStep simAnealVals + + -- stepFactor = (fromIntegral $ numSteps - curStep) / (fromIntegral numSteps) + -- tempFactor = curBestCost * stepFactor + + candCost' + | curBestCost == candCost = candCost + 1 + | otherwise = candCost + + -- factors here for tweaking + energyFactor = 10.0 * (100 * (curBestCost - candCost') / curBestCost) + tempFactor' = 10.0 * fromIntegral (numSteps - curStep) / fromIntegral numSteps + + -- flipped order - (e' -e) + -- probAcceptance = exp ((curBestCost - candCost) / ((maxTemp - minTemp) * tempFactor)) + -- probAcceptance' = exp ( (fromIntegral (curStep + 1)) * (curBestCost - candCost') / tempFactor) + + probAcceptance = exp (energyFactor / tempFactor') + + -- multiplier for resolution 1000, 100 prob be ok + randMultiplier ∷ Word + randMultiplier = 1000 + intAccept = floor $ fromIntegral randMultiplier * probAcceptance + + nextSAParams = simAnealVals{currentStep = curStep + 1} + + withUpdatedParams ∷ Bool → (Bool, Maybe SAParams) + withUpdatedParams b = (b, Just nextSAParams) + + costCheck + -- lowest cost-- greedy + -- but increment this if using heuristic costs + | candCost < curBestCost = pure True + -- not better and at lowest temp + | curStep >= numSteps - 1 = pure False + -- test for non-lowest temp conditions + | otherwise = + -- use remainder for testing + (< intAccept) . snd . (`divMod` randMultiplier) . abs <$> getRandom + in withUpdatedParams <$> costCheck + + +-- | incrementSimAnnealParams increments the step number by 1 but returns all other the same +incrementSimAnnealParams ∷ Maybe SAParams → Maybe SAParams +incrementSimAnnealParams = + let incrementor params = case method params of + SimAnneal → params{currentStep = currentStep params + 1} + _ → params{driftChanges = driftChanges params + 1} + in fmap incrementor + + +-- | generateRandLists generates n random lists from seed +generateRandIntLists ∷ Int → PhyG [[Int]] +generateRandIntLists count = replicateM count getRandoms + + +{- | generateUniqueRandList take a int and simulated anealing parameters list and creates +a list of SA paramter values with unique rnandomInt lists +sets current step to 0 +-} +generateUniqueRandList ∷ Int → Maybe SAParams → [Maybe SAParams] +generateUniqueRandList number inParams = replicate number inParams + + +{- | driftAccept takes SAParams, currrent best cost, and candidate cost +and returns a Boolean and an incremented set of params +this based on a percentage of diffference in graph cost +-} +driftAccept ∷ Maybe SAParams → VertexCost → VertexCost → PhyG (Bool, Maybe SAParams) +driftAccept simAnealVals curBestCost candCost = case simAnealVals of + Nothing → error "Nothing value in driftAccept" + Just params → + let -- prob acceptance for better, same, and worse costs + probAcceptance + | candCost < curBestCost = 1.0 + | candCost == curBestCost = driftAcceptEqual params + | otherwise = 1.0 / (driftAcceptWorse params + (100.0 * (candCost - curBestCost) / curBestCost)) + + -- multiplier for resolution 1000, 100 prob be ok + randMultiplier ∷ Word + randMultiplier = 1000 + intAccept = floor $ fromIntegral randMultiplier * probAcceptance + + -- not always incrementing becasue may not result in changes + nextSAParams = Just $ params{driftChanges = driftChanges params + 1} + nextSAPAramsNoChange = simAnealVals + + resultParams + -- only increment numberof changes for True values + -- but increment this if using heuristic costs + | candCost < curBestCost = pure (True, nextSAParams) + | otherwise = do + -- use remainder for testing--passing infinite list and take head + intRandVal ← snd . (`divMod` randMultiplier) . abs <$> getRandom + pure $ + if intRandVal < intAccept + then -- trace ("Drift T: " <> (show (curNumChanges, candCost, curBestCost, probAcceptance, intAccept, intRandVal)) <> " True") + (True, nextSAParams) + else -- trace ("Drift F: " <> (show (curNumChanges, candCost, curBestCost, probAcceptance, intAccept, intRandVal)) <> " False") + (False, nextSAPAramsNoChange) + in resultParams + + +-- | getTraversalCosts takes a Phylogenetic Graph and returns costs of traversal trees +getTraversalCosts ∷ PhylogeneticGraph → [VertexCost] +getTraversalCosts inGraph = + let traversalTrees = V.toList (V.toList <$> fft6 inGraph) + traversalRoots = fmap (head . LG.getRoots) (concat traversalTrees) + traversalRootCosts = fmap (subGraphCost . snd) traversalRoots + in traversalRootCosts + + +{- SAme as getCharacterLength +-- | getSequenceCharacterLengths returns a the length of block characters +getSequenceCharacterLengths :: CharacterData -> CharInfo -> Int +getSequenceCharacterLengths inCharData inCharInfo = + let inCharType = charType inCharInfo + in + -- trace ("GCL:" <> (show inCharType) <> " " <> (show $ snd3 $ stateBVPrelim inCharData)) ( + case inCharType of + x | x == NonAdd -> 0 -- V.length $ snd3 $ stateBVPrelim inCharData + x | x `elem` packedNonAddTypes -> 0 -- UV.length $ snd3 $ packedNonAddPrelim inCharData + x | x == Add -> 0 -- V.length $ snd3 $ rangePrelim inCharData + x | x == Matrix -> 0 -- V.length $ matrixStatesPrelim inCharData + x | x `elem` [SlimSeq, NucSeq ] -> SV.length $ snd3 $ slimAlignment inCharData + x | x `elem` [WideSeq, AminoSeq] -> UV.length $ snd3 $ wideAlignment inCharData + x | x == HugeSeq -> V.length $ snd3 $ hugeAlignment inCharData + x | x == AlignedSlim -> SV.length $ snd3 $ alignedSlimPrelim inCharData + x | x == AlignedWide -> UV.length $ snd3 $ alignedWidePrelim inCharData + x | x == AlignedHuge -> V.length $ snd3 $ alignedHugePrelim inCharData + _ -> error ("Un-implemented data type " <> show inCharType) + -- ) +-} + +-- | getCharacterLengths returns a the length of block characters +getCharacterLength ∷ Bool → CharacterData → CharInfo → Int +getCharacterLength useIA inCharData inCharInfo = + let inCharType = charType inCharInfo + in -- trace ("GCL:" <> (show inCharType) <> " " <> (show $ snd3 $ stateBVPrelim inCharData)) ( + case inCharType of + x | x == NonAdd → V.length $ snd3 $ stateBVPrelim inCharData + x | x `elem` packedNonAddTypes → UV.length $ snd3 $ packedNonAddPrelim inCharData + x | x == Add → V.length $ snd3 $ rangePrelim inCharData + x | x == Matrix → V.length $ matrixStatesPrelim inCharData + x | x `elem` [SlimSeq, NucSeq] && useIA → SV.length $ snd3 $ slimAlignment inCharData -- slimAlignment inCharData + x | x `elem` [SlimSeq, NucSeq] → SV.length $ snd3 $ slimGapped inCharData -- slimAlignment inCharData + x | x `elem` [WideSeq, AminoSeq] && useIA → UV.length $ snd3 $ wideAlignment inCharData -- wideAlignment inCharData + x | x `elem` [WideSeq, AminoSeq] → UV.length $ snd3 $ wideGapped inCharData -- wideAlignment inCharData + x | x == HugeSeq && useIA → V.length $ snd3 $ hugeAlignment inCharData -- hugeAlignment inCharData + x | x == HugeSeq → V.length $ snd3 $ hugeGapped inCharData -- hugeAlignment inCharData + x | x == AlignedSlim → SV.length $ snd3 $ alignedSlimPrelim inCharData + x | x == AlignedWide → UV.length $ snd3 $ alignedWidePrelim inCharData + x | x == AlignedHuge → V.length $ snd3 $ alignedHugePrelim inCharData + _ → error ("Un-implemented data type " <> show inCharType) + + +-- ) + +-- | getCharacterLengths' flipped arg version of getCharacterLength +getCharacterLength' ∷ Bool → CharInfo → CharacterData → Int +getCharacterLength' useIA inCharInfo inCharData = getCharacterLength useIA inCharData inCharInfo + + +-- | getMaxCharacterLengths get maximum charcter legnth from a list +getMaxCharacterLength ∷ Bool → CharInfo → [CharacterData] → Int +getMaxCharacterLength useIA inCharInfo inCharDataList = maximum $ fmap (getCharacterLength' useIA inCharInfo) inCharDataList + + +-- | getSingleTaxon takes a taxa x characters block and an index and returns the character vector for that index +getSingleTaxon ∷ V.Vector (V.Vector CharacterData) → Int → V.Vector CharacterData +getSingleTaxon singleCharVect taxonIndex = fmap (V.! taxonIndex) singleCharVect + + +{- | glueBackTaxChar takes single chartacter taxon vectors and glues them back inot multiple characters for each +taxon as expected in Blockdata. Like a transpose. FIlters out zero length characters +-} +glueBackTaxChar ∷ V.Vector (V.Vector CharacterData) → V.Vector (V.Vector CharacterData) +glueBackTaxChar singleCharVect = + let numTaxa = V.length $ V.head singleCharVect + multiCharVect = fmap (getSingleTaxon singleCharVect) (V.fromList [0 .. numTaxa - 1]) + in multiCharVect + + +{- | concatFastas is used by "report(ia)" to create a single concatenated fasta +for use by programs such as RAxML andf TNT +takes a single string of multiple fasta output, each entity to be concated (by taxon name) +separate by a line starting with "Seqeunce character" from ia output +there must be the same number of taxa and names in each element to be concatenated +-} +concatFastas ∷ String → String +concatFastas inMultFastaString = + if null inMultFastaString + then [] + else -- split on "Sequence character" + + let fastaFileStringList = filter (not . null) $ spitIntoFastas inMultFastaString + + -- make pairs from each "file" + fastaTaxDataPairLL = fmap fasta2PairList fastaFileStringList + + fastTaxNameList = (fst <$> head fastaTaxDataPairLL) + + -- merge sequences with (<>) + fastaDataLL = fmap (fmap snd) fastaTaxDataPairLL + mergedData = mergeFastaData fastaDataLL + + -- create new tuples + newFastaPairs = zipWith (<>) fastTaxNameList mergedData + in -- trace ("CF:" <> (show $ length fastaFileStringList) <> " " <> (show $ fmap length fastaFileStringList)) + concat newFastaPairs + + +{- | spitIntoFastas takes String generated by reprot ia functions, +splits on "Sequence character" line, creating separate fastas +-} +spitIntoFastas ∷ String → [String] +spitIntoFastas inString = + if null inString + then [] + else + let linesList = splitFastaLines [] [] $ filter (not . null) $ lines inString + fastaStringList = fmap unlines linesList + in fastaStringList + + +-- | splitFastaLines splits a list of lines into lists based on the line "Sequence character" +splitFastaLines ∷ [String] → [[String]] → [String] → [[String]] +splitFastaLines curFasta curFastaList inLineList = + if null inLineList + then reverse (reverse curFasta : curFastaList) + else + let firstLine = head inLineList + firstWord = + if (not $ null $ words firstLine) + then head $ words firstLine + else "" + in if null firstLine + then splitFastaLines curFasta curFastaList (tail inLineList) + else + if firstWord == "Sequence" + then splitFastaLines [] (reverse curFasta : curFastaList) (tail inLineList) + else splitFastaLines (firstLine : curFasta) curFastaList (tail inLineList) + + +-- | mergeFastaData takes list of list of Strings and merges into a single list of Strings +mergeFastaData ∷ [[String]] → [String] +mergeFastaData inDataLL = + if null $ head inDataLL + then [] + else + let firstData = concatMap head inDataLL + formattedData = unlines $ SL.chunksOf 50 firstData + in formattedData : mergeFastaData (fmap tail inDataLL) + + +{- | fasta2PairList takes an individual fasta file as single string and returns +pairs data of taxon name and data +assumes single character alphabet +deletes '-' (unless "prealigned"), and spaces +-} +fasta2PairList ∷ String → [(String, String)] +fasta2PairList fileContents' = + if null fileContents' + then [] + else + let fileContents = unlines $ filter (not . null) $ lines fileContents' + in if null fileContents + then [] + else + if head fileContents /= '>' + then errorWithoutStackTrace "\n\n'Read' command error: fasta file must start with '>'" + else + let terminalSplits = SL.splitWhen (== '>') fileContents + + -- tail because initial split will an empty text + pairData = getRawDataPairs (tail terminalSplits) + in -- trace ("FSPL: " <> (show $ length pairData) <> " " <> (show $ fmap fst pairData)) + pairData + + +-- | getRawDataPairstakes splits of Text and returns terminalName, Data pairs--minimal error checking +getRawDataPairs ∷ [String] → [(String, String)] +getRawDataPairs inList = + if null inList + then [] + else + let firstText = head inList + firstName = head $ lines firstText + firstData = concat $ tail $ lines firstText + in ('>' : firstName <> "\n", firstData) : getRawDataPairs (tail inList) + + +{- | hasResolutionDuplicateEdges checks resolution subtree in verteexInfo for duplicate d edges in softwired +subtree field +-} +hasResolutionDuplicateEdges ∷ ResolutionData → Bool +hasResolutionDuplicateEdges inResData = + let edgeList = snd $ displaySubGraph inResData + edgeDupList = length $ fmap LG.toEdge edgeList L.\\ L.nub (fmap LG.toEdge edgeList) + in edgeDupList > 0 + + +-- | getUnionFieldsNode reutnrs String of union fields nicely +getUnionFieldsNode ∷ VertexBlockData → String +getUnionFieldsNode inVertData = + "UnionFields " <> show inVertData + + +-- | transposeVector transposes a vector via conversion to lists transposing those and converting back +transposeVector ∷ V.Vector (V.Vector a) → V.Vector (V.Vector a) +transposeVector inVect = + if V.null inVect + then V.empty + else + let newListList = L.transpose $ V.toList $ fmap V.toList inVect + in V.fromList $ fmap V.fromList newListList + + +{- | combineMatrices takes two matrices [[a]] and applied function to be ziped and cretes new matrix +better be small becasue of list access would be n^3 +-} +combineMatrices ∷ (a → a → a) → [[a]] → [[a]] → [[a]] +combineMatrices f m1 m2 + | null m1 = error "Null matrix 1 in combineMatrices" + | null m2 = error "Null matrix 2 in combineMatrices" + | (fmap length m1) /= (fmap length m2) = + error ("Cannot combine matrices with unequal dimensions " <> (show (fmap length m1) <> " " <> show (fmap length m2))) + | otherwise = zipWith (zipWith f) m1 m2 + + +-- | normalizeMatrix takes a [[Int]] and returns normalized, symmetrical frequencies +normalizeMatrix ∷ [[Int]] → [[Double]] +normalizeMatrix inMatrix = + if null inMatrix + then [] + else + let totalNumber = (sum $ (fmap sum) inMatrix) ∷ Int + transposeMatrix = L.transpose inMatrix + symMatrix = zipWith (zipWith (+)) inMatrix transposeMatrix + newMatrix = fmap (fmap (/ (fromIntegral totalNumber))) $ fmap (fmap fromIntegral) symMatrix + in newMatrix + + +{- | divideList takes a [a] and returns [[a]] + by dividing according to lengths in input list to lengths +-} +divideList ∷ [Int] → [a] → [[a]] +divideList lengthList inList = + if null lengthList + then [] + else + if null inList + then [] + else + if sum lengthList /= length inList + then error ("List division and list length do not match: " <> (show (sum lengthList)) <> " versus " <> (show $ length inList)) + else + let firstLength = head lengthList + in (take firstLength inList) : divideList (tail lengthList) (drop firstLength inList) diff --git a/testData/archea-test.pg b/testData/archea-test.pg new file mode 100644 index 000000000..2a7acbbcb --- /dev/null +++ b/testData/archea-test.pg @@ -0,0 +1,30 @@ +-- script to amke Archea graphs +set(seed:1683323754) + +-- Read data +--read("../data/Archea/Archea-10.fas") +read("archea_ssu_585_aln.fas") + +-- set optimization parameters +--report("../data/Archea/Achea.csv", data, overwrite) +--set(graphssteepest:100) +--build(distance, replicates:1, rdwag) + +--build(replicates:1) + +--transform(staticApprox) + +--report("archea-test-10.dot", graphs, dotpdf, overwrite, nocollapse) +--read("../graphs/Archea-test-build.dot") +read("archea-test-10.dot") +--search(hours:200, instances:10) + +--fuse(tbr) +--swap(drift:10) +--refine(ga) +--swap() +--select(best) +--transform(multitraverse:false) +--transform(staticApprox) +report("archea-test-out.dot", graphs, dotpdf, overwrite, nocollapse) +report("archea-test-search.csv", search, overwrite) diff --git a/testData/bitDNA.tcm b/testData/bitDNA.tcm new file mode 100644 index 000000000..ac434f692 --- /dev/null +++ b/testData/bitDNA.tcm @@ -0,0 +1,6 @@ +A C G T +5.2962283643832556e-2 6.398622011854378 6.476191864174507 6.893750194729596 7.779862832183442 +6.398622011854378 8.22426029723005e-2 6.084345858542064 6.09395637587381 6.139852874400422 +6.476191864174507 6.084345858542064 0.10007333808859915 5.629371701252091 5.584324138621469 +6.893750194729596 6.09395637587381 5.629371701252091 0.10562564454071662 5.19247565086378 +7.779862832183442 6.139852874400422 5.584324138621469 5.19247565086378 -0.0 diff --git a/testData/charWag-segfault.pg b/testData/charWag-segfault.pg new file mode 100644 index 000000000..ebbaeecc3 --- /dev/null +++ b/testData/charWag-segfault.pg @@ -0,0 +1,91 @@ +--read(fasta:"metazoa-aa-1.fas") +read(fasta:"chel.fas") +read(fasta:"chel-16.fas") +--set(criterion:NCM) +--read(prefasta:"chel-prealigned.fas") +--read(prefasta:"chel-prealigned-16.fas") +--read(prefasta:"chel-prealigned.fas") + +-- this line can cause seg fault--not sure why +--read(prefasta:"chel-prealigned.fas", tcm:(1,1)) + +--read(prefasta:"chel-prealigned.fas", tcm:"sg1t1.mat") +--read(fasta:"chel.fas", tcm:"sg200t100-d1.mat") +--read(fasta:"chel.fas", tcm:"sg2t1.mat") +--read(fasta:"chel-16.fas", tcm:"sg1t1.mat") +read(fasta:"chel-16.fas", tcm:(2,1)) +read(fasta:"chel-16.fas", tcm:(2,2)) +--read(fasta:"chel-16.fas", tcm:"sg1t1.mat") +--read("chel.ss") +--read("chel-ques.ss") +--read("chel-3.hen") +--transform(weight:0.01, name:"chel.fas#0*") + +set(seed:1666899539) + +report("test0-data.csv", data, overwrite) + + +--set(partitionCharacter:%) +--set(finalassignment:ia) +--set(bc2:(0.1,1.1)) +--set(bc4:(0.2, 1.2)) +--set(bc5:(0.3, 1.3)) +--set(bc8:(0.4, 1.4)) +--set(bc64:(0.5,1.5)) +--set(bcgt64:(0.6,1.6)) + +--set(dynamicEpsilon:0.0) + +--set(useIA:False) + +-- For network testing +--set (outgroup:"one") +--set(graphtype:hardwired) +--set(graphtype:softwired) +--Set(Compressresolutions:True) +--set(graphFactor:NoPenalty) +--set(rootcost:ml) +--set(criterion:ml) + +-- set(seed:1) +build(replicates:1) +--build(distance, dwag) +--build(distance, rdwag, replicates:20, best:20) + +--build(distance, rdwag, replicates:10, best:1, block, displayTrees:10) +--build(replicates:10, block, displayTrees:10, cun) + +--set(multitraverse:false) + +--fuse(tbr) + +--search(minutes:1) +--build(replicates:10) +--refine(netadd, atrandom, maxnetedges:5) +--fuse(tbr) + +transform(staticApprox) +swap() +transform(dynamic) + + +--search(hours:0, minutes:1, instances:2, Thompson:1, linear, maxnetedges:5) +--search(minutes:5, instances:1, Thompson:1, simple) +--search(minutes:5, instances:1, stop:50) + +-- transform(dynamic) +select(best) +report("test0-crossrefs.csv", crossrefs, overwrite) +report("test0-search.csv", search, overwrite) +--report("test0-wag.txt", overwrite, graphs, ascii) +report("test0-wag.dot", overwrite, graphs, dotpdf, nocollapse) +--report("test0-wag.tre", overwrite, graphs, newick, nohtulabels, nobraNCHLENGTHS, collapse) +--report("test0-wag.tre", overwrite, graphs, newick) +--report("test0-dist-mat.csv", pairdist, overwrite) +report("test0-diag.csv", diagnosis, overwrite) +--report("test0-supp.dot", support,dotPDF, overwrite) +report("test-ia.fas", impliedalignment, includeMissing, overwrite) +--report("test-ia-concat.fas", impliedalignment, concatenate, overwrite) +report("test-tnt.txt", TNT, overwrite) +--report("test0-reconcile.dot", reconcile, method:eun, outformat:dot, dotpdf, threshold:0) diff --git a/testData/charWag.pg b/testData/charWag.pg index 2b66a18b1..3c51cd15c 100644 --- a/testData/charWag.pg +++ b/testData/charWag.pg @@ -1,20 +1,48 @@ + +set(criterion:PMDL) +--set(graphfactor:nopenalty) +set(modelComplexity:3512) +--set(graphtype:softwired) +set(outgroup:"Artemia") + + --read(fasta:"metazoa-aa-1.fas") --read(fasta:"chel.fas") +--read(fasta:"chel.fas", tcm:"test-1-Ney5.bit.tcm") +--read(fasta:"chel.fas", tcm:"test-1-Ney5.bit.tcm") --read(fasta:"chel-16.fas") -read(prefasta:"chel-prealigned.fas") + + +--read(prefasta:"chel-prealigned.fas") --read(prefasta:"chel-prealigned-16.fas") --read(prefasta:"chel-prealigned.fas") ---read(prefasta:"chel-prealigned.fas", tcm:(2,1)) ---read(prefasta:"chel-prealigned.fas", tcm:"sg2t2.mat") ---read(fasta:"chel.fas", tcm:"sg200t100-d1.mat") -read(fasta:"chel.fas", tcm:"sg2t1.mat") ---read(fasta:"chel-16.fas", tcm:"sg2t2.mat") ---read("chel.hen") ---read("chel-3.hen") + +-- this line can cause seg fault--not sure why +--read(prefasta:"chel-prealigned.fas") + +--read(prefasta:"chel-prealigned.fas", tcm:"sg2t1.mat") +read(fasta:"chel.fas", tcm:"bitDNA.tcm") +--read(fasta:"chel.fas", tcm:"neyman5.bit.tcm") +--read(fasta:"chel.fas", tcm:"sg2t1.mat") +--read(fasta:"chel.fas", tcm:(755360,1)) + +--read(fasta:"chel.fas", tcm:"sg2t2.mat", gapOpening:10) + +--read(fasta:"chel-16.fas", tcm:"sg2t1.mat") +--read("chel.ss") +--read("chel-ques.ss") +--read("chel-3.ss") --transform(weight:0.01, name:"chel.fas#0*") +set(seed:1701287416) +--set(reportNaive:False) +--set(missingThreshold:10) + +--report("test0-data.csv", data, overwrite) + + --set(partitionCharacter:%) -set(finalassignment:ia) +--set(finalassignment:ia) --set(bc2:(0.1,1.1)) --set(bc4:(0.2, 1.2)) --set(bc5:(0.3, 1.3)) @@ -22,60 +50,92 @@ set(finalassignment:ia) --set(bc64:(0.5,1.5)) --set(bcgt64:(0.6,1.6)) -set(dynamicEpsilon:0.0) +--set(dynamicEpsilon:0.0) + +--set(useIA:False) -- For network testing --set (outgroup:"one") --- set(graphtype:softwired) +--set(graphtype:hardwired) +--set(graphtype:softwired) --Set(Compressresolutions:True) --set(graphFactor:NoPenalty) --set(rootcost:ml) --set(criterion:ml) --- set(seed:1) -build(replicates:10) ---build(distance, dwag) ---build(nj, wpgma, dwag, distance, rdwag, replicates:10, best:10) ---build(distance, rdwag, replicates:10, best:10) --- report("new-401-distance.csv", pairdist, overwrite) --- set(graphType:softwired) -select(unique) -fuse(tbr) ---report("test-build.tre",overwrite, graphs, newick) --- swap(spr, steepest) ---swap(spr, all:5) ---swap(spr, ia) ---refine(netadd) ---select(unique) ---swap (tbr, annealing:4, steps:20) ---swap(tbr, steepest) ---select(unique) ---refine (ga, popsize:20, generations:4, severity:0, recombinations:50) ---select(unique) ---swap(tbr, all) ---search(seconds:90, instances:4) ---select(best) ---support(gb:tbr, gbsample:1000) --- transform(staticApprox) ---swap(tbr, steepest) -select(unique) -swap(keep:10) - ---transform(weight:0.1, name:"chel.fas#0*") +--set(multitraverse:false) + +--fuse(tbr) + +--transform(staticApprox) + +--search(minutes:1) +--build(replicates:10) +--fuse(tbr) +--swap() + +--build(distance, wpgma) + +--transform(staticApprox) +--read("chel-335.tre") + +--build(dwag, distance) +build(distance, rdwag, dwag, wpgma,nj, replicates:100, best:5) +--build(distance,rdwag, replicates:100, best:100) +--build(rdwag, replicates:100) +--build(distance, rdwag, replicates:20, best:5) +--search(hours:0, minutes:1, instances:4, Thompson:1, linear, maxnetedges:5) +--search(minutes:5, instances:1, Thompson:1, simple) + +--search(minutes:5, instances:4, stop:50,Thompson:2) + + + +--select(unique:10) +select() + +--refine(netadd, atrandom, maxnetedges:5) +--refine(netdelete, atrandom, maxnetedges:5) + +--search(hours:0, minutes:3, instances:4, Thompson:1, linear, maxnetedges:5) -- transform(dynamic) + +--swap() + +--swap(replicates:5, drift, maxChanges:5) +--swap(replicates:5) + +--fuse() + +--refine (ga) + +-- transform(dynamic) + select(best) -report("test0-data.csv", data, overwrite) -report("test0-crossrefs.csv", crossrefs, overwrite) +--transform (staticApprox) + +--report("test0-data.csv", data, overwrite) + +--report("test0-search.csv", search, overwrite) --report("test0-wag.txt", overwrite, graphs, ascii) -report("test0-wag.dot", overwrite, graphs, dotpdf, nocollapse) -report("test0-wag.tre", overwrite, graphs, newick, nohtulabels, nobraNCHLENGTHS, collapse) +--report("test0-wag.dot", overwrite, graphs, dotpdf, nocollapse, color) +--report("test0-wag-complexity.dot", overwrite, complexity, dotpdf, nocollapse, color) +report("test0.dot", graphs, dotpdf, overwrite,color, branchlengths:mid) + +--report("test0-wag.tre", overwrite, graphs, newick, nobraNCHLENGTHS, noHTULabels, collapse) --report("test0-wag.tre", overwrite, graphs, newick) --report("test0-dist-mat.csv", pairdist, overwrite) --report("test0-diag.csv", diagnosis, overwrite) ---report("test-search-data", search, overwrite) + +--report("test0-metadata.csv", metaData, overwrite) +--report("test0-paramEst.csv", parameterEstimation, overwrite) + --report("test0-supp.dot", support,dotPDF, overwrite) --report("test-ia.fas", impliedalignment, includeMissing, overwrite) --report("test-ia-concat.fas", impliedalignment, concatenate, overwrite) --report("test-tnt.txt", TNT, overwrite) ---report("test0-reconcile.dot", reconcile, method:eun, outformat:dot, dotpdf, threshold:0) +--report("test0-reconcile-strict.tre", reconcile, method:strict, newick, threshold:51, overwrite) +--report("test0-reconcile-adams.dot", reconcile, graphs, method:adams, dotpdf, nobranchlengths, overwrite) +--report("test0-reconcile.dot", reconcile) +--report("test0-reconcile-majority.dot", reconcile, method:majority, dotpdf, threshold:51, overwrite) diff --git a/testData/chel-10.fas b/testData/chel-10.fas new file mode 100644 index 000000000..f3678f626 --- /dev/null +++ b/testData/chel-10.fas @@ -0,0 +1,50 @@ +>Rhiphicephalus + TCCAGACGAG TAGTGCATCT ACCCGATGCT ACGGCTCGGA CTGAACATCA TGCCGGTTCT + TTCTTGGTGC ACTTCATTGT GTGCCTCGAG ATGGCCGGTG CTTTTACTTT GAAAAAATTA + GAGTGCTCAA CGCAGGCGAG TCGCC + +>Vonones + TCGAGGCTGG CGGTCCGCCT ACAGGCGGTC ACTGCCAGTA CTCAACATCC TGCCGGTTTT + CCCTTGGTGC TCTTCGCTGA GTGTCTCGGG TGGCCGGCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGGCGCGT AGCC + +>Centruoides + TCCAGACAAG CGGTCCACCC GCGGTGGTTA CTGTTTGGAC TGGACGTTTG GCCGGATTCC + TTTGATGCTC TTTGCCGAGT GTCTTGGGTG TCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGTACCGC C + +>Limulus + TCTAGACTGG CGGTCCGCTT CCGGCGGTTA CTGCCTGGCC TAAACATCTG CCGGTTTTCC + CTCGGTGCCC TTGATTGAGT GTCTTGGGTG GCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGCAACGC C + +>Aportus + TCCAGACTGA CGGTCCACCG CTCGGGCGAC TGTCAGGCCT GAACATTCCG TCGGTTTCGA + CAGATTTTTC CCGCGGTGCT CTTCGGTGAG TGTCGCGGAG GCCGACAAGT TTACTTTGAA + AAAATTAGAG TGCTCAAAGC AGGCGGTCAC C + +>Alentus + TCCAGACTGA CGGTCCACCG AATCGGGCGA CTGTCAGGCC TGAACATCCG GTCGGTTTCT + CTCCCTTTTC CCGCGGTGCT CTTCGTTGAG TGTCGCGGGA GGCCGACAAG TTTACTTTGA + AAAAATTAGA GTGCTCAAAG CAGGCGGTCA CC + +>Artemia + CTCGGTCGGG TGGTGCCGCC TCACGGTGGT CACTGCCTCG ATCGGACAAT TCATTGGATC + GTTCGGGGTG CTCTTAACCG AGTGTCCTGG GTGGCCGATA CGTTTACTTT GAACAAATTA + GAGTGCTTAA AGCAGGTGCA CCGCGCC + +>Gea + TCCGGCCGGA CGGGTCCGCC TACCGGTGGT TACTGTTCGC TGCCGAGCTT CAGGGGGCCG + CTGTCGATGA TCTTCATCGG TTATCTTCCG TAACCCTCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGCGCGAC GCC + +>Hypochilus + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAACC AGCCGGTTTC + CCTAGATGAT CTTCATTGAT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGCGTGACG CC + +>Thelichoris + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAGCC AGCCGGTTTC + CCTAGATGAT CTTTACCGGT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGGCTGACG CC + diff --git a/testData/chel-3-10.dot b/testData/chel-3-10.dot new file mode 100644 index 000000000..db000a330 --- /dev/null +++ b/testData/chel-3-10.dot @@ -0,0 +1,70 @@ +digraph G { + rankdir = LR; node [ shape = rect]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 0 [label=23.5]; + 17 -> 30 [label=16.0]; + 18 -> 14 [label=49.0]; + 18 -> 25 [label=11.5]; + 19 -> 1 [label=59.5]; + 19 -> 28 [label=25.0]; + 20 -> 19 [label=18.0]; + 20 -> 22 [label=27.5]; + 21 -> 18 [label=36.5]; + 21 -> 24 [label=26.5]; + 22 -> 16 [label=68.5]; + 22 -> 26 [label=70.0]; + 23 -> 15 [label=68.0]; + 23 -> 20 [label=27.5]; + 24 -> 11 [label=47.5]; + 24 -> 27 [label=34.0]; + 25 -> 7 [label=138.5]; + 25 -> 9 [label=44.5]; + 26 -> 2 [label=140.5]; + 26 -> 4 [label=107.5]; + 27 -> 5 [label=58.5]; + 27 -> 29 [label=46.0]; + 28 -> 21 [label=19.0]; + 28 -> 31 [label=22.0]; + 29 -> 8 [label=62.0]; + 29 -> 12 [label=33.0]; + 30 -> 3 [label=56.0]; + 30 -> 32 [label=71.0]; + 31 -> 6 [label=61.5]; + 31 -> 10 [label=58.5]; + 32 -> 13 [label=113.0]; + 32 -> 23 [label=26.5]; +} +//1242.0 diff --git a/testData/chel-3.ss b/testData/chel-3.ss index 2694fd9f9..1422e005f 100644 --- a/testData/chel-3.ss +++ b/testData/chel-3.ss @@ -1,23 +1,23 @@ xread 'TNT data for Graph 0 generated by PhylogeneticGraph (PhyG)' 787 17 -Alentus -TCCAG--ACTGA-C-GGT-CCACC-G---AA-TCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCGGTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G--AATCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCG-GTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G--AATCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCG-GTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC--TCCAG--ACTGA--C-GGT-CCA-CCGAA-TC-GG-G-C-G--A-CTG-TCAGGCCTGA-A---C--ATCCGGTCGGT--TTC--TCTC-C-C-TTT-TCCCGC-GGTGCTCTT--CGTTGAG-TG-T-CGC-G-GGAGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G----G-T--CACC -Amblypygid -TCCAG--ACTGG-C-GGT-CCGCC-T---AG-CGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-TGGCGCCGG--TT--T-T-CC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC-TCCAG--ACTGG-C-GGT-CCGCC-T--AGCGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-T-GGCGCCGG---T--T-TTCC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC-TCCAG--ACTGG-C-GGT-CCGCC-T--AGCGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-T-GGCGCCGG---T--T-TTCC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC--TCCAG--ACTGG--C-GGT-CCG-CCT-A-GC-GGCGAG-T--A-CTG-TCAGGCCTGA-A---C--ATGGCGCCGG---TT---T-T--C---CT--T------GGTTCTCTT--TACTGAG-TG-T-CTT-G-GGCGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-T----G-T--CGCC +Alentus -TCCAG--ACTGA-C-GGT-CCACC-G---AA-TCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCGGTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G--AATCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCG-GTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G--AATCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TCCG-GTCGG-TTTC-TCTCCCTTTTCCCGCG-G-TGCTCTTCGT-TGAG-TGTCGCGGG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC--TCCAG--ACTGA--C-GGT-CCA-CCGAA-TC-GG-G-C-G--A-CTG-TCAGGCCTGA-A---C--ATCCGGTCGGT--TTC--TCTC-C-C-TTT-TCCCGC-GGTGCTCTT--CGTTGAG-TG-T-CGC-G-GGAGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G----G-T--CACC +Amblypygid -TCCAG--ACTGG-C-GGT-CCGCC-T---AG-CGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-TGGCGCCGG--TT--T-T-CC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC-TCCAG--ACTGG-C-GGT-CCGCC-T--AGCGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-T-GGCGCCGG---T--T-TTCC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC-TCCAG--ACTGG-C-GGT-CCGCC-T--AGCGGC-G-AGT-A-C-T-G---T-CAGGCCTGAAC---A-T-GGCGCCGG---T--T-TTCC--TT-----G-G-TTCTCTTTAC-TGAG-TGTCTTGGG-CGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGTCGCC--TCCAG--ACTGG--C-GGT-CCG-CCT-A-GC-GGCGAG-T--A-CTG-TCAGGCCTGA-A---C--ATGGCGCCGG---TT---T-T--C---CT--T------GGTTCTCTT--TACTGAG-TG-T-CTT-G-GGCGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-T----G-T--CGCC Americhernus -TCGAGCCTC-CAAT-GAT-ACGTT-G-AAAG-GCG--T--TT-ATC-G-T---T-G-GGGCCG-AC---A-GCGCGTC-G---------T---GGG--CTCG-GTTGGCCTT-AA-AAAGCTG-ATCGGGTTCTCCGGCAATTTTACTTTGAAAAAATTAGGGTGCTCAAAGCA----G-G--C-CTGGTGCC-TCGAGCCTC-CAAT-GAT-ACGTT-GAAAGGCG--T--TT-ATC-G-T---T-G-GGGCCG-AC---A-GCGC-GTC-G---------T---GGG--CTCG-GTTGGCCTT-AA-AAAGCTG-ATCGGGTTCTCCGGCAATTTTACTTTGAAAAAATTAGGGTGCTCAAAGCA----G-G--C-CTGGTGCC-TCGAGCCTC-CAAT-GAT-ACGTT-GAAAGGCG--T--TT-ATC-G-T---T-G-GGGCCG-AC---A-GCGC-GTC-G---------T---GGG--CTCG-GTTGGCCTT-AA-AAAGCTG-ATCGGGTTCTCCGGCAATTTTACTTTGAAAAAATTAGGGTGCTCAAAGCA----G-G--C-CTGGTGCC--TCGAGCCTC-CAAT--GAT-ACG-TTG-A-AAGGC-G-T-TT-A-TCGTTGGGGCC-GACAGCGC--GTCGTG--GG--------C-T--CGG-TT----------G-GC-CTTAA-AAA--GCTGAT--CG-G-GTTCTCCGGCAATTTTACTTTGAAAAAATTAGGGTGCTCAAAGCA--G-GCCT---G----G-T---GCC -Aportus -TCCAG--ACTGA-C-GGT-CCACC-G----C-TCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCCGTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G---CTCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCC-GTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G---CTCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCC-GTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC--TCCAG--ACTGA--C-GGT-CCA-CCG-C-TC-GG-G-C-G--A-CTG-TCAGGCCTGA-A---C--ATTCCGTCGGT--TTCG-ACAG-A-T-TTT-TCCCGC-GGTGCTCTT--CGGTGAG-TG-T-CGC---GGAGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G----G-T--CACC -Artemia CTCG-G--TC-GGGT-GGTGCCGCC-T--CAC-G-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT-T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCCTCG-G--TC-GGGT-GGTGCCGCC-T-CACG-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT--T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCCTCG-G--TC-GGGT-GGTGCCGCC-T-CACG-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT--T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCC-TCG-G--TC-GGGT--GGTGCCGCCT--C-ACGGT-G-G-TC-ACTGCCTCGATCG-GACA---A--TTCATT--GG--------A-T--C-G-TTC-G----G-GGTGCTCTTAACCGA--G-TG-T-CCT-G-GGTGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA--G-G--T---GCACCG-C---GCC -Centruoides -TCCAG--ACAAG-C-GGT-CCACC------CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-TTTGGCCGG--AT--T-C-CT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC-TCCAG--ACAAG-C-GGT-CCACC----CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-T-TTGGCCGG---A--T-TCCT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC-TCCAG--ACAAG-C-GGT-CCACC----CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-T-TTGGCCGG---A--T-TCCT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC--TCCAG--ACAAG--C-GGT-CCA-CC--C-GC-GGTG-GTT--A-CTG-TTTGGACTGG-A---C--GTTTGGCCGG-A-TT--CC-T-------T--T------GATGCTCTT--TGCCGAG-TG-T-CTT-G-GGTGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-TACCGCC -Chanbria -TCTAG--ACTGG-T-GGT-CCGCC-T----C-TGGT-G-GTT-A-C-T-A---C-CTGGCCTAAACA--A-TT-TGCCGGT-TT--T-C-CC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC-TCTAG--ACTGG-T-GGT-CCGCC----TCTGGT-G-GTT-A-C-T-A---C-CTGGCCTAAAC-A-A-T-TT-GCCGGT--T--T-TCCC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC-TCTAG--ACTGG-T-GGT-CCGCC----TCTGGT-G-GTT-A-C-T-A---C-CTGGCCTAAAC-A-A-T-TT-GCCGGT--T--T-TCCC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC--TCTAG--ACTGG--T-GGT-CCG-CC--T-CT-GGTG-GTT--A-CTA-CCTGGCCTAA-A---C-AATTT-GCCGG---TT---T-T-CC---CT--T------GGTGCTCTT--CACCGAG-TG-T-CTT-G-GGGGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA---GG----C------G-TAACGCC -Gea -TCCGG--CCGGA-CGGGT-CCGCC-TA---C-CGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC---T-TC-AGGGGG--CC--G-C-TG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC-TCCGG--CCGGA-CGGGT-CCGCCT---ACCGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC-----T-TCAGGGGG---C--C-GCTG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC-TCCGG--CCGGA-CGGGT-CCGCCT---ACCGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC-----T-TCAGGGGG---C--C-GCTG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC--TCCGG--CCGGA--CGGGT-CCG-CC--TACC-GGTG-GTT--A-CTG--TTCG-CTGC-C---G--AGCT-TCAGG----G---G-G-CC--GCT-GT-----CGATGATCTT--CATCGGT-TA-T-CTT-C-CGTAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-CGACGCC -Hadrurus -TCCGG--ACTGT-C-GGT-CCGC-------C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-TCTAGCCGG--AC--T-C-TC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC-TCCGG--ACTGT-C-GGT-CCGC-----C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---A--C-TCTC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC-TCCGG--ACTGT-C-GGT-CCGC-----C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---A--C-TCTC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC--TCCGG--ACTGT--C-GGT-CCG-C---C--G-CAAG-CTT--A-CTG-GCAGGACCGG-A---C--GTCTAGCCGG-ACTC--TC-T------CG--T-------ATCCTCTT--CACCGGG-TG-T-CTT-G-GGTGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC--G----C------GAT-CCGCC -Hypochilus -TCCAG--ACGGG-C-GGT-CCGCC-TA---A-CGGT-G-GTT-A-C-T-G---C-CTGGCCTGAACA--A-CC-AGCCGG--TT--T-C-CC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-A-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-A-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC--TCCAG--ACGGG--C-GGT-CCG-CC--TAAC-GGTG-GTT--A-CTG-CCTGGCCTGA-A---C--AACCAGCCGG----T---T-T-CC---CT--A------GATGATCTT--CATTGAT-TG-T-CTT-G-GGTGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-TGACGCC -Limulus -TCTAG--ACTGG-C-GGT-CCGCT-T----C-CGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-TC-TGCCGGT-TT--T-C-CC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC-TCTAG--ACTGG-C-GGT-CCGCT----TCCGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-T-CT-GCCGGT--T--T-TCCC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC-TCTAG--ACTGG-C-GGT-CCGCT----TCCGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-T-CT-GCCGGT--T--T-TCCC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC--TCTAG--ACTGG--C-GGT-CCG-CT--T-CC-GGCG-GTT--A-CTG-CCTGGCCTAA-A---C--ATCT-GCCGG---TT---T-T-CC---CT--C------GGTGCCCTT--GATTGAG-TG-T-CTT-G-GGTGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-CAACGCC +Aportus -TCCAG--ACTGA-C-GGT-CCACC-G----C-TCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCCGTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G---CTCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCC-GTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC-TCCAG--ACTGA-C-GGT-CCACC-G---CTCG--G-GCG-A-C-T-G---T-CAGGCCTGAAC---A-TTCC-GTCGG-TTTCGACAGATTTTTCCCGCG-G-TGCTCTTCGG-TGAG-TGTCGC-GG-AGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-G-GTCACC--TCCAG--ACTGA--C-GGT-CCA-CCG-C-TC-GG-G-C-G--A-CTG-TCAGGCCTGA-A---C--ATTCCGTCGGT--TTCG-ACAG-A-T-TTT-TCCCGC-GGTGCTCTT--CGGTGAG-TG-T-CGC---GGAGGCCGACAAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G----G-T--CACC +Artemia CTCG-G--TC-GGGT-GGTGCCGCC-T--CAC-G-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT-T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCCTCG-G--TC-GGGT-GGTGCCGCC-T-CACG-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT--T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCCTCG-G--TC-GGGT-GGTGCCGCC-T-CACG-G--TGGTC-A-CTGCC---T-C-GATCGG-AC--AATTCAT--T-GG--------AT-C-GTT--CGGG-G-TGCTCTT-AACCGAG-TGTCCTGGG-TGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA---GGTG--C-ACCGCGCCC-TCG-G--TC-GGGT--GGTGCCGCCT--C-ACGGT-G-G-TC-ACTGCCTCGATCG-GACA---A--TTCATT--GG--------A-T--C-G-TTC-G----G-GGTGCTCTTAACCGA--G-TG-T-CCT-G-GGTGGCCGATACGTTTACTTTGAACAAATTAGAGTGCTTAAAGCA--G-G--T---GCACCG-C---GCC +Centruoides -TCCAG--ACAAG-C-GGT-CCACC------CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-TTTGGCCGG--AT--T-C-CT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC-TCCAG--ACAAG-C-GGT-CCACC----CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-T-TTGGCCGG---A--T-TCCT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC-TCCAG--ACAAG-C-GGT-CCACC----CGCGGT-G-GTT-A-C-T-G---T-TTGGACTGGAC---G-T-TTGGCCGG---A--T-TCCT--TT-----G-A-TGCTCTTTGC-CGAG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTACCGCC--TCCAG--ACAAG--C-GGT-CCA-CC--C-GC-GGTG-GTT--A-CTG-TTTGGACTGG-A---C--GTTTGGCCGG-A-TT--CC-T-------T--T------GATGCTCTT--TGCCGAG-TG-T-CTT-G-GGTGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-TACCGCC +Chanbria -TCTAG--ACTGG-T-GGT-CCGCC-T----C-TGGT-G-GTT-A-C-T-A---C-CTGGCCTAAACA--A-TT-TGCCGGT-TT--T-C-CC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC-TCTAG--ACTGG-T-GGT-CCGCC----TCTGGT-G-GTT-A-C-T-A---C-CTGGCCTAAAC-A-A-T-TT-GCCGGT--T--T-TCCC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC-TCTAG--ACTGG-T-GGT-CCGCC----TCTGGT-G-GTT-A-C-T-A---C-CTGGCCTAAAC-A-A-T-TT-GCCGGT--T--T-TCCC--TT-----G-G-TGCTCTTCAC-CGAG-TGTCTTGGG-GGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA--G---G--C-GTAACGCC--TCTAG--ACTGG--T-GGT-CCG-CC--T-CT-GGTG-GTT--A-CTA-CCTGGCCTAA-A---C-AATTT-GCCGG---TT---T-T-CC---CT--T------GGTGCTCTT--CACCGAG-TG-T-CTT-G-GGGGACTGGTACGTTTACTTTGAAGAAACTAGAGTGCTCAAAGCA---GG----C------G-TAACGCC +Gea -TCCGG--CCGGA-CGGGT-CCGCC-TA---C-CGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC---T-TC-AGGGGG--CC--G-C-TG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC-TCCGG--CCGGA-CGGGT-CCGCCT---ACCGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC-----T-TCAGGGGG---C--C-GCTG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC-TCCGG--CCGGA-CGGGT-CCGCCT---ACCGGT-G-GTT-A-C-T-GTT-CGCT-GCC-GAGC-----T-TCAGGGGG---C--C-GCTG--TC-----G-A-TGATCTTCAT-CGGT-TATCTTCCG-TAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCGACGCC--TCCGG--CCGGA--CGGGT-CCG-CC--TACC-GGTG-GTT--A-CTG--TTCG-CTGC-C---G--AGCT-TCAGG----G---G-G-CC--GCT-GT-----CGATGATCTT--CATCGGT-TA-T-CTT-C-CGTAACCCTCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-CGACGCC +Hadrurus -TCCGG--ACTGT-C-GGT-CCGC-------C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-TCTAGCCGG--AC--T-C-TC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC-TCCGG--ACTGT-C-GGT-CCGC-----C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---A--C-TCTC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC-TCCGG--ACTGT-C-GGT-CCGC-----C-GCAA-G-CTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---A--C-TCTC--TC-----GTA-TCCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC----G--C-GATCCGCC--TCCGG--ACTGT--C-GGT-CCG-C---C--G-CAAG-CTT--A-CTG-GCAGGACCGG-A---C--GTCTAGCCGG-ACTC--TC-T------CG--T-------ATCCTCTT--CACCGGG-TG-T-CTT-G-GGTGTCCGGCAATTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGC--G----C------GAT-CCGCC +Hypochilus -TCCAG--ACGGG-C-GGT-CCGCC-TA---A-CGGT-G-GTT-A-C-T-G---C-CTGGCCTGAACA--A-CC-AGCCGG--TT--T-C-CC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-A-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-A-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTCAT-TGAT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GTGACGCC--TCCAG--ACGGG--C-GGT-CCG-CC--TAAC-GGTG-GTT--A-CTG-CCTGGCCTGA-A---C--AACCAGCCGG----T---T-T-CC---CT--A------GATGATCTT--CATTGAT-TG-T-CTT-G-GGTGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-TGACGCC +Limulus -TCTAG--ACTGG-C-GGT-CCGCT-T----C-CGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-TC-TGCCGGT-TT--T-C-CC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC-TCTAG--ACTGG-C-GGT-CCGCT----TCCGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-T-CT-GCCGGT--T--T-TCCC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC-TCTAG--ACTGG-C-GGT-CCGCT----TCCGGC-G-GTT-A-C-T-G---C-CTGGCCTAAAC---A-T-CT-GCCGGT--T--T-TCCC--TC-----G-G-TGCCCTTGAT-TGAG-TGTCTTGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GCAACGCC--TCTAG--ACTGG--C-GGT-CCG-CT--T-CC-GGCG-GTT--A-CTG-CCTGGCCTAA-A---C--ATCT-GCCGG---TT---T-T-CC---CT--C------GGTGCCCTT--GATTGAG-TG-T-CTT-G-GGTGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------G-CAACGCC Mastigoproctus -TCCAG--ACGGG-T-GGT-CCACCGC----C-CGGT-G-GCT-A-C-T-G---C-CCGGCCTGAAC-A-A-TCTCGCCGG--TT--T-T-CC--TT-----G-A-TTCTCTTCAC-CGAG-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTTAAAGCA------G--C-GTAACGCC-TCCAG--ACGGG-T-GGT-CCACCG---CCCGGT-G-GCT-A-C-T-G---C-CCGGCCTGAACA--A-T-CTCGCCGG---T--T-TTCC--TT-----G-A-TTCTCTTCAC-CGAG-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTTAAAGCA------G--C-GTAACGCC-TCCAG--ACGGG-T-GGT-CCACCG---CCCGGT-G-GCT-A-C-T-G---C-CCGGCCTGAACA--A-T-CTCGCCGG---T--T-TTCC--TT-----G-A-TTCTCTTCAC-CGAG-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTTAAAGCA------G--C-GTAACGCC--TCCAG--ACGGG--T-GGT-CCA-CCG-C-CC-GGTG-GCT--A-CTG-CCCGGCCTGA-A---CA-ATCTCGCCGG---TT---T-T--C---CT--T------GATTCTCTT--CACCGAG-TG-T-CTT-G-GGTGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTTAAAGCA----G----C------G-TAACGCC -Pauroctonus -TCCAG--ACTGT-C-GGT-CCGCA------C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-TCTAGCCGG--CC--T-C-CC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC-TCCAG--ACTGT-C-GGT-CCGCA----C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---C--C-TCCC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC-TCCAG--ACTGT-C-GGT-CCGCA----C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---C--C-TCCC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC--TCCAG--ACTGT--C-GGT-CCG-CA--C--C-GGAG-GTT--A-CTG-GCAGGACCGG-A---C--GTCTAGCCGG-CCTC--CC-T------CG--T--------TGCTCTT--CACCGGG-TG-T-CTT-G-GGTGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------GAC-CCGCC +Pauroctonus -TCCAG--ACTGT-C-GGT-CCGCA------C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-TCTAGCCGG--CC--T-C-CC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC-TCCAG--ACTGT-C-GGT-CCGCA----C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---C--C-TCCC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC-TCCAG--ACTGT-C-GGT-CCGCA----C-CGGA-G-GTT-A-C-T-G---G-CAGGACCGGAC---G-T-CTAGCCGG---C--C-TCCC--TC-----GT--TGCTCTTCAC-CGGG-TGTCTTGGG-TGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--C-GACCCGCC--TCCAG--ACTGT--C-GGT-CCG-CA--C--C-GGAG-GTT--A-CTG-GCAGGACCGG-A---C--GTCTAGCCGG-CCTC--CC-T------CG--T--------TGCTCTT--CACCGGG-TG-T-CTT-G-GGTGTCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----C------GAC-CCGCC Rhiphicephalus -TCCAG--AC-GAGT-AGT-GCATC-T---AC-CCG-AT-GCT-A-C-G-G--CT-C-GGACTGAAC---A-TCATGCCGG--------TT-C-TTT--CTTG-G-TGCACTTCAT-TGTG-TGCCTCGAGATGGCCGGTGCTTTTACTTTGAAAAAATTAGAGTGCTCAACGCA----G-G--C-GAGTCGCC-TCCAG--AC-GAGT-AGT-GCATC-T--ACCCG-AT-GCT-A-C-G-G--CT-C-GGACTGAAC---A-TCAT-GCCGG--------TT-C-TTT--CTTG-G-TGCACTTCAT-TGTG-TGCCTCGAGATGGCCGGTGCTTTTACTTTGAAAAAATTAGAGTGCTCAACGCA----G-G--C-GAGTCGCC-TCCAG--AC-GAGT-AGT-GCATC-T--ACCCG-AT-GCT-A-C-G-G--CT-C-GGACTGAAC---A-TCAT-GCCGG--------TT-C-TTT--CTTG-G-TGCACTTCAT-TGTG-TGCCTCGAGATGGCCGGTGCTTTTACTTTGAAAAAATTAGAGTGCTCAACGCA----G-G--C-GAGTCGCC--TCCAG--AC-GAGT--AGTGCAT-CTA-C-CCGAT-G-C-T--A-CGGCTCGGACT-GA-A---C--ATCATGCCGG--------T-T--C-T-TTC-T----T-GGTGCACTT--CATT--G-TG-TGCCTCGAGATGGCCGGTGCTTTTACTTTGAAAAAATTAGAGTGCTCAACGCA--G-G--C---G-A--G-T--CGCC -Thelichoris -TCCAG--ACGGG-C-GGT-CCGCC-TA---A-CGGT-G-GTT-A-C-T-G---C-CTGGCCTGAACA--G-CC-AGCCGG--TT--T-C-CC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-G-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-G-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC--TCCAG--ACGGG--C-GGT-CCG-CC--TAAC-GGTG-GTT--A-CTG-CCTGGCCTGA-A---C--AGCCAGCCGG----T---T-T-CC---CT--A------GATGATCTT--TACCGGT-TG-T-CTT-G-GGTGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----G------C-TGACGCC -Thermobius -CTCGG--AC-GATC-GGT-TCGCC-G----C-CCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCTGCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCGG--AC-GATC-GGT-TCGCC-G---CCCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCT-GCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCGG--AC-GATC-GGT-TCGCC-G---CCCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCT-GCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCG-G--AC-GA-TC-GGT-TCG-CCG-C-CC-GT-G-T-TA-A-CTGATCGTTCC-GG-A---C--GTCCTGCCGGT--TT---T-TC-C-T-TTC-T----C-GGTGCTCTT--CATTGAG-TG-T-CTT-G-ATTGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA--G-G--TCCAG-T--G-T--CGCC -Vonones -TCGAG--GCTGG-C-GGT-CCGCC-T---AC-AGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCTGCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC-TCGAG--GCTGG-C-GGT-CCGCC-T--ACAGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCT-GCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC-TCGAG--GCTGG-C-GGT-CCGCC-T--ACAGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCT-GCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC--TCGAG--GCTGG--C-GGT-CCG-CCT-A-CA-GGCG-G-T-CA-CTG-CCAGTACTCA-A---C--ATCCTGCCGG---TT---T-TC-C---CT--T------GGTGCTCTT--CGCTGAG-TG-T-CTC-G-GGTGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G-C--G-T--AGCC +Thelichoris -TCCAG--ACGGG-C-GGT-CCGCC-TA---A-CGGT-G-GTT-A-C-T-G---C-CTGGCCTGAACA--G-CC-AGCCGG--TT--T-C-CC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-G-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC-TCCAG--ACGGG-C-GGT-CCGCCT---AACGGT-G-GTT-A-C-T-G---C-CTGGCCTGAAC---A-G-CCAGCCGG---T--T-TCCC--TA-----G-A-TGATCTTTAC-CGGT-TGTCTTGGG-TGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA------G--G-CTGACGCC--TCCAG--ACGGG--C-GGT-CCG-CC--TAAC-GGTG-GTT--A-CTG-CCTGGCCTGA-A---C--AGCCAGCCGG----T---T-T-CC---CT--A------GATGATCTT--TACCGGT-TG-T-CTT-G-GGTGACCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G----G------C-TGACGCC +Thermobius -CTCGG--AC-GATC-GGT-TCGCC-G----C-CCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCTGCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCGG--AC-GATC-GGT-TCGCC-G---CCCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCT-GCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCGG--AC-GATC-GGT-TCGCC-G---CCCG--T-GTTAA-C-T-G--AT-C-GTTCCGGAC---G-TCCT-GCCGG--TT--T-TTCC-TTT--CTCG-G-TGCTCTTCAT-TGAG-TGTCTTGAT-TGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA----G-GTCCAGTGTCGCC-CTCG-G--AC-GA-TC-GGT-TCG-CCG-C-CC-GT-G-T-TA-A-CTGATCGTTCC-GG-A---C--GTCCTGCCGGT--TT---T-TC-C-T-TTC-T----C-GGTGCTCTT--CATTGAG-TG-T-CTT-G-ATTGGCCGGCACGTTTACTTTGAACAAATTAGAGTGCTCAAAGCA--G-G--TCCAG-T--G-T--CGCC +Vonones -TCGAG--GCTGG-C-GGT-CCGCC-T---AC-AGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCTGCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC-TCGAG--GCTGG-C-GGT-CCGCC-T--ACAGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCT-GCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC-TCGAG--GCTGG-C-GGT-CCGCC-T--ACAGGC-G-GTC-A-C-T-G---C-CAGTACTCAAC---A-TCCT-GCCGG--TT--T-TCCC--TT-----G-G-TGCTCTTCGC-TGAG-TGTCTCGGG-TGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA----G-G--C-GCGTAGCC--TCGAG--GCTGG--C-GGT-CCG-CCT-A-CA-GGCG-G-T-CA-CTG-CCAGTACTCA-A---C--ATCCTGCCGG---TT---T-TC-C---CT--T------GGTGCTCTT--CGCTGAG-TG-T-CTC-G-GGTGGCCGGCACGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCA--G-G----C-G-C--G-T--AGCC ; cc - 0.786; proc/; diff --git a/testData/chel-334.dot b/testData/chel-334.dot new file mode 100644 index 000000000..dfac6e4b7 --- /dev/null +++ b/testData/chel-334.dot @@ -0,0 +1,497 @@ +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 32 [label=38.0]; + 18 -> 11 [label=15.0]; + 18 -> 26 [label=7.0]; + 19 -> 2 [label=47.0]; + 19 -> 13 [label=17.5]; + 20 -> 4 [label=30.5]; + 20 -> 15 [label=29.5]; + 21 -> 18 [label=2.0]; + 21 -> 24 [label=7.0]; + 22 -> 6 [label=18.0]; + 22 -> 10 [label=13.0]; + 23 -> 19 [label=9.5]; + 23 -> 29 [label=4.5]; + 24 -> 5 [label=16.0]; + 24 -> 27 [label=8.0]; + 25 -> 20 [label=12.5]; + 25 -> 28 [label=5.0]; + 26 -> 7 [label=31.0]; + 26 -> 30 [label=3.0]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 21 [label=3.0]; + 28 -> 22 [label=3.0]; + 29 -> 1 [label=10.5]; + 29 -> 31 [label=5.0]; + 30 -> 9 [label=12.0]; + 30 -> 14 [label=12.0]; + 31 -> 16 [label=16.0]; + 31 -> 25 [label=4.0]; + 32 -> 3 [label=17.0]; + 32 -> 23 [label=12.0]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 32 [label=38.0]; + 18 -> 21 [label=1.0]; + 18 -> 24 [label=6.5]; + 19 -> 16 [label=16.0]; + 19 -> 22 [label=3.5]; + 20 -> 4 [label=30.5]; + 20 -> 15 [label=29.5]; + 21 -> 11 [label=14.0]; + 21 -> 26 [label=6.0]; + 22 -> 20 [label=12.0]; + 22 -> 28 [label=4.5]; + 23 -> 1 [label=10.5]; + 23 -> 19 [label=5.0]; + 24 -> 5 [label=15.5]; + 24 -> 27 [label=8.5]; + 25 -> 2 [label=47.0]; + 25 -> 13 [label=17.5]; + 26 -> 14 [label=10.0]; + 26 -> 30 [label=4.0]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 18 [label=2.5]; + 28 -> 31 [label=2.5]; + 29 -> 23 [label=4.5]; + 29 -> 25 [label=9.5]; + 30 -> 7 [label=32.0]; + 30 -> 9 [label=12.0]; + 31 -> 6 [label=17.5]; + 31 -> 10 [label=13.5]; + 32 -> 3 [label=17.0]; + 32 -> 29 [label=12.0]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 32 [label=38.0]; + 18 -> 21 [label=2.0]; + 18 -> 24 [label=7.0]; + 19 -> 16 [label=16.0]; + 19 -> 22 [label=4.0]; + 20 -> 4 [label=30.5]; + 20 -> 15 [label=29.5]; + 21 -> 11 [label=15.0]; + 21 -> 30 [label=7.0]; + 22 -> 20 [label=12.5]; + 22 -> 28 [label=5.0]; + 23 -> 1 [label=10.5]; + 23 -> 19 [label=5.0]; + 24 -> 5 [label=16.0]; + 24 -> 27 [label=8.0]; + 25 -> 2 [label=47.0]; + 25 -> 13 [label=17.5]; + 26 -> 9 [label=12.0]; + 26 -> 14 [label=12.0]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 18 [label=3.0]; + 28 -> 31 [label=3.0]; + 29 -> 23 [label=4.5]; + 29 -> 25 [label=9.5]; + 30 -> 7 [label=31.0]; + 30 -> 26 [label=3.0]; + 31 -> 6 [label=18.0]; + 31 -> 10 [label=13.0]; + 32 -> 3 [label=17.0]; + 32 -> 29 [label=12.0]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 32 [label=38.0]; + 18 -> 21 [label=1.0]; + 18 -> 24 [label=6.5]; + 19 -> 16 [label=16.0]; + 19 -> 22 [label=3.5]; + 20 -> 4 [label=30.5]; + 20 -> 15 [label=29.5]; + 21 -> 11 [label=14.0]; + 21 -> 26 [label=6.0]; + 22 -> 20 [label=12.0]; + 22 -> 28 [label=4.5]; + 23 -> 1 [label=10.5]; + 23 -> 19 [label=5.0]; + 24 -> 5 [label=15.5]; + 24 -> 27 [label=8.5]; + 25 -> 2 [label=47.0]; + 25 -> 13 [label=17.5]; + 26 -> 9 [label=10.0]; + 26 -> 30 [label=5.5]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 18 [label=2.5]; + 28 -> 31 [label=2.5]; + 29 -> 23 [label=4.5]; + 29 -> 25 [label=9.5]; + 30 -> 7 [label=31.5]; + 30 -> 14 [label=13.5]; + 31 -> 6 [label=17.5]; + 31 -> 10 [label=13.5]; + 32 -> 3 [label=17.0]; + 32 -> 29 [label=12.0]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 23 [label=38.0]; + 18 -> 4 [label=30.5]; + 18 -> 15 [label=29.5]; + 19 -> 20 [label=4.5]; + 19 -> 22 [label=9.5]; + 20 -> 1 [label=10.5]; + 20 -> 21 [label=5.0]; + 21 -> 16 [label=16.0]; + 21 -> 25 [label=4.0]; + 22 -> 2 [label=47.0]; + 22 -> 13 [label=17.5]; + 23 -> 3 [label=17.0]; + 23 -> 19 [label=12.0]; + 24 -> 31 [label=3.0]; + 24 -> 32 [label=3.0]; + 25 -> 18 [label=12.5]; + 25 -> 24 [label=5.0]; + 26 -> 9 [label=12.0]; + 26 -> 14 [label=12.0]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 5 [label=16.0]; + 28 -> 27 [label=8.0]; + 29 -> 11 [label=15.0]; + 29 -> 30 [label=7.0]; + 30 -> 7 [label=31.0]; + 30 -> 26 [label=3.0]; + 31 -> 28 [label=7.0]; + 31 -> 29 [label=2.0]; + 32 -> 6 [label=18.0]; + 32 -> 10 [label=13.0]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 23 [label=38.0]; + 18 -> 4 [label=30.5]; + 18 -> 15 [label=29.5]; + 19 -> 20 [label=4.5]; + 19 -> 22 [label=9.5]; + 20 -> 1 [label=10.5]; + 20 -> 21 [label=5.0]; + 21 -> 16 [label=16.0]; + 21 -> 25 [label=3.5]; + 22 -> 2 [label=47.0]; + 22 -> 13 [label=17.5]; + 23 -> 3 [label=17.0]; + 23 -> 19 [label=12.0]; + 24 -> 31 [label=2.5]; + 24 -> 32 [label=2.5]; + 25 -> 18 [label=12.0]; + 25 -> 24 [label=4.5]; + 26 -> 14 [label=10.0]; + 26 -> 30 [label=4.0]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 5 [label=15.5]; + 28 -> 27 [label=8.5]; + 29 -> 11 [label=14.0]; + 29 -> 26 [label=6.0]; + 30 -> 7 [label=32.0]; + 30 -> 9 [label=12.0]; + 31 -> 28 [label=6.5]; + 31 -> 29 [label=1.0]; + 32 -> 6 [label=17.5]; + 32 -> 10 [label=13.5]; +} + +//334.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 []; + 18 []; + 19 []; + 20 []; + 21 []; + 22 []; + 23 []; + 24 []; + 25 []; + 26 []; + 27 []; + 28 []; + 29 []; + 30 []; + 31 []; + 32 []; + 17 -> 0 [label=48.5]; + 17 -> 23 [label=38.0]; + 18 -> 4 [label=30.5]; + 18 -> 15 [label=29.5]; + 19 -> 20 [label=4.5]; + 19 -> 22 [label=9.5]; + 20 -> 1 [label=10.5]; + 20 -> 21 [label=5.0]; + 21 -> 16 [label=16.0]; + 21 -> 25 [label=3.5]; + 22 -> 2 [label=47.0]; + 22 -> 13 [label=17.5]; + 23 -> 3 [label=17.0]; + 23 -> 19 [label=12.0]; + 24 -> 31 [label=2.5]; + 24 -> 32 [label=2.5]; + 25 -> 18 [label=12.0]; + 25 -> 24 [label=4.5]; + 26 -> 7 [label=31.5]; + 26 -> 14 [label=13.5]; + 27 -> 8 [label=17.0]; + 27 -> 12 [label=9.0]; + 28 -> 5 [label=15.5]; + 28 -> 27 [label=8.5]; + 29 -> 11 [label=14.0]; + 29 -> 30 [label=6.0]; + 30 -> 9 [label=10.0]; + 30 -> 26 [label=5.5]; + 31 -> 28 [label=6.5]; + 31 -> 29 [label=1.0]; + 32 -> 6 [label=17.5]; + 32 -> 10 [label=13.5]; +} + +//334.0 diff --git a/testData/chel-dash.ss b/testData/chel-dash.ss new file mode 100644 index 000000000..3aa4f6dc1 --- /dev/null +++ b/testData/chel-dash.ss @@ -0,0 +1,23 @@ +xread +'TNT data for Graph 0 generated by PhylogeneticGraph (PhyG)' +2 17 +Alentus 00 +Amblypygid 0- +Americhernus 01 +Aportus 12 +Artemia 11 +Centruoides 1- +Chanbria 12 +Gea 10 +Hadrurus 11 +Hypochilus 11 +Limulus 12 +Mastigoproctus 13 +Pauroctonus 11 +Rhiphicephalus 11 +Thelichoris 11 +Thermobius 11 +Vonones 11 +; +cc + 0.; +proc/; diff --git a/testData/chel-ia.fas b/testData/chel-ia.fas new file mode 100644 index 000000000..8f7509273 --- /dev/null +++ b/testData/chel-ia.fas @@ -0,0 +1,51 @@ +>Alentus +-TCCAGACTGAC-GGT-CCACCG--AATCGG--GCGACTG--T-CAGGCC +T-GAAC-A-TCCGGT-CGGTTTC-TCTCCCTTTTCCCGCGGTGCTCTTCG +TTGAGTGTCGCGG-GA-GGCCGACAAGTTTACTTTGAAAAAATTAGAGTG +CTCAAAGCA-G-GCG-GTCACC +>Aportus +-TCCAGACTGAC-GGT-CCACCG---CTCGG--GCGACTG--T-CAGGCC +T-GAAC-ATTCC-GT-CGGTTTCGACAGATTTTTCCCGCGGTGCTCTTCG +GTGAGTGTCGC-G-GA-GGCCGACAAGTTTACTTTGAAAAAATTAGAGTG +CTCAAAGCA-G-GCG-GTCACC +>Artemia +-CTCG-GTCGGGT-GG-TGCCGC---CT-CACGGTGGTCACTG--C-CTC +G-A-TC-G-G-A--CA--A---T--T-C--ATTGGATCGTTCGGGGTGCT +CTTAACCGAGT-G-TCCTGGGTGGCCGATACGTTTACTTTGAACAAATTA +GAGTGCTTA---AAGCAGGTGC +>Centruoides +--TCCAGACAAGC-GG-T-CCAC---C--CGCGGTGGTTACTG--T-TTG +G-ACT--G-G-A--C---G---T--T-T-GGCCGG--ATTCCTTTGATGC +TCTTTGCCGAG-T-GTCTTGGGTGTCCGGCACGTTTACTTTGAAAAAATT +AGAGTGCTC---AAAGC---AG +>Gea +--TCCGGCCGGACGGG-T-CCGC---CT-ACCGGTGGTTACTGTTCGCT- +G-CC---G-A-G--C-------T--T-C-AGGGGG--CCGCTGTCGATGA +TCTTCATCGGT-T-ATCTTCCGTAACCCTCACGTTTACTTTGAAAAAATT +AGAGTGCTC---AAAGC---AG +>Hypochilus +--TCCAGACGGGC-GG-T-CCGC---CT-AACGGTGGTTACTG--C-CTG +G-CCT--G-A-A--C---A---A--C-C-AGCCGG--TTTCCCTAGATGA +TCTTCATTGAT-T-GTCTTGGGTGACCGGCACGTTTACTTTGAAAAAATT +AGAGTGCTC---AAAGC---AG +>Limulus +--TCTAGACTGGC-GG-T-CCGC---TT-C-CGGCGGTTACTG--C-CTG +G-CCT--A-A-A--C---A---T--C-T--GCCGG-TTTTCCCTCGGTGC +CCTTGATTGAG-T-GTCTTGGGTGGCCGGCACGTTTACTTTGAAAAAATT +AGAGTGCTC---AAAGC---AG +>Rhiphicephalus +-TCCAGACGAGT-AGT-GCATCT-A-CCCGAT-GCTACGG--C-TCGGAC +T-GAAC-A-T-C-AT---G---C--C-G-GTTCTTTCTTGGTGCACTTCA +TTGTGTGCCTC-GAGATGGCCGGTGCTTTTACTTTGAAAAAATTAGAGTG +CTCAACGCA-G-GCGAGTCGCC +>Thelichoris +--TCCAGACGGGC-GG-T-CCGC---CT-AACGGTGGTTACTG--C-CTG +G-CCT--G-A-A--C---A---G--C-C-AGCCGG--TTTCCCTAGATGA +TCTTTACCGGT-T-GTCTTGGGTGACCGGCACGTTTACTTTGAAAAAATT +AGAGTGCTC---AAAGC---AG +>Vonones +--TCGAGGCTGGC-GG-T-CCGC---CTACA-GGCGGTCACTG--C-CAG +T-ACTC---A-A--C---A---T--C-C-TGCCGGTTTTCCCTTGGTGCT +CTTCGCTGAGT-G-TCTCGGGTGGCCGGCACGTTTACTTTGAAAAAATTA +GAGTGCTCA---AAGCA-G-GC + diff --git a/testData/chel-ques.ss b/testData/chel-ques.ss new file mode 100644 index 000000000..14ec45b7f --- /dev/null +++ b/testData/chel-ques.ss @@ -0,0 +1,23 @@ +xread +'TNT data for Graph 0 generated by PhylogeneticGraph (PhyG)' +2 17 +Alentus 00 +Amblypygid 0? +Americhernus 01 +Aportus 12 +Artemia 11 +Centruoides 1? +Chanbria 12 +Gea 10 +Hadrurus 11 +Hypochilus 11 +Limulus 12 +Mastigoproctus 13 +Pauroctonus 11 +Rhiphicephalus 11 +Thelichoris 11 +Thermobius 11 +Vonones 11 +; +cc - 0.; +proc/; diff --git a/testData/chel-tax-holder.ss b/testData/chel-tax-holder.ss new file mode 100644 index 000000000..96dc48a4c --- /dev/null +++ b/testData/chel-tax-holder.ss @@ -0,0 +1,23 @@ +xread +'TNT data for Graph 0 generated by PhylogeneticGraph (PhyG)' +1 17 +Alentus 0 +Amblypygid 1 +Americhernus 1 +Aportus 1 +Artemia 1 +Centruoides 1 +Chanbria 1 +Gea 1 +Hadrurus 1 +Hypochilus 1 +Limulus 1 +Mastigoproctus 1 +Pauroctonus 1 +Rhiphicephalus 1 +Thelichoris 1 +Thermobius 1 +Vonones 1 +; +cc - 0; +proc/; diff --git a/testData/chel.ss b/testData/chel.ss index 1d4c228a0..b79d9d9b3 100644 --- a/testData/chel.ss +++ b/testData/chel.ss @@ -1,7 +1,7 @@ xread 'TNT data for Graph 0 generated by PhylogeneticGraph (PhyG)' 1 17 -Alentus 0 +Alentus 0 Amblypygid 0 Americhernus 0 Aportus 1 @@ -17,7 +17,8 @@ Pauroctonus 1 Rhiphicephalus 1 Thelichoris 1 Thermobius 1 -Vonones 1 +Vonones - ; -cc - 0; +cc - 0.; +cc /1 0.; proc/; diff --git a/testData/five-taxon-example.pg b/testData/five-taxon-example.pg new file mode 100644 index 000000000..311b02491 --- /dev/null +++ b/testData/five-taxon-example.pg @@ -0,0 +1,19 @@ +--set(criterion:SI) +set(modelComplexity:3758) + +read("five-taxon-example.tnt") +read("five-taxon-example.tre") + +set(seed:1701287416) + +set(bc2:(0.829934760886531, 1.192825403552391)) +set(bc4:(1.0003608090655018, 1.9996392811480421 )) + + +-- build(distance, rdwag) + +select(best) + +report("five-taxon-example..dot", overwrite, graphs, dotpdf, nocollapse, noHTULabels) +report("five-taxon-example-r.tre", overwrite, graphs, newick, nobraNCHLENGTHS, noHTULabels, collapse) +report("five-taxon-example.csv", diagnosis, overwrite) diff --git a/testData/five-taxon-example.tnt b/testData/five-taxon-example.tnt new file mode 100644 index 000000000..6f79f826d --- /dev/null +++ b/testData/five-taxon-example.tnt @@ -0,0 +1,11 @@ +xread +'5 taxon complexity example data' +6 5 +A 000000 +B 110121 +C 110122 +D 101211 +E 101212 +; +cc -.; +proc/; \ No newline at end of file diff --git a/testData/five-taxon-example.tre b/testData/five-taxon-example.tre new file mode 100644 index 000000000..fd125005c --- /dev/null +++ b/testData/five-taxon-example.tre @@ -0,0 +1 @@ +(((E,D),(B,C)), A); \ No newline at end of file diff --git a/testData/het.pg b/testData/het.pg index 469fed1d6..53ededf0c 100644 --- a/testData/het.pg +++ b/testData/het.pg @@ -4,7 +4,7 @@ read("heteroptera-dW-1-poy.tre") build(dwag) -- select(best) -- swap(spr, steepest) --- select(best) +select(best) -- support(jackknife, buildOnly, replicates:10) report("heteroptera-pg.tre", graphs,newick, overwrite) -- report("het-pair-dist.csv", pairdist, overwrite) diff --git a/testData/ia-test.pg b/testData/ia-test.pg new file mode 100644 index 000000000..8de3b9966 --- /dev/null +++ b/testData/ia-test.pg @@ -0,0 +1,27 @@ +-- set(seed:1675872081) +--read("chel-mod.tnt") +--read("chel.tnt") +--read("chel-3-10.dot") +read("chel-3.ss") +--read("chel-tax-holder.ss") +read(fasta:"chel.fas") +read(fasta:"chel-10.fas") +--read("metazoa-aa-1.fas") +--read("metazoa-aa-2.fas") +--read("met.dot") +--read(fasta:"chel-ia.fas") +--build(replicates:1) +--transform(multitraverse:false) +--transform(multitraverse:true) +--select(best) +--transform(staticapprox) +--swap() +--select(threshold:0.1) +--transform(dynamic) +search(minutes:2) +--report("chel-data.csv", data, overwrite) +--report("chel-diag.csv", diagnosis, overwrite) +--report("chel-tnt.ss", tnt, overwrite) +--report("chel-ia-search.dot", graphs, dotpdf, overwrite) +report("chel-search.csv", search, overwrite) +report("chel.gv", graphs, dotpdf, overwrite) diff --git a/testData/issue-may-19.pg b/testData/issue-may-19.pg deleted file mode 100644 index 36125bf77..000000000 --- a/testData/issue-may-19.pg +++ /dev/null @@ -1,10 +0,0 @@ -read(fasta:"chel.fas") - --- this appers to be the issue--"ia" fails" "do" ok --- perhaps same/similar issue as you fixed earlier for wide/huge -set(finalassignment:ia) ---set(finalassignment:do) - -build(distance, dwag) - -report("test0-wag.tre", overwrite, graphs, newick, nohtulabels, nobraNCHLENGTHS) diff --git a/testData/mats/sg16t1.mat b/testData/mats/sg16t1.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg16t2.mat b/testData/mats/sg16t2.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg16t4.mat b/testData/mats/sg16t4.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg16t8.mat b/testData/mats/sg16t8.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg16tI.mat b/testData/mats/sg16tI.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg1t1.mat b/testData/mats/sg1t1.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg1tI.mat b/testData/mats/sg1tI.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg2t1.mat b/testData/mats/sg2t1.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg2t2.mat b/testData/mats/sg2t2.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg2t4.mat b/testData/mats/sg2t4.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg2t8.mat b/testData/mats/sg2t8.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg2tI.mat b/testData/mats/sg2tI.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg4t1.mat b/testData/mats/sg4t1.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg4t2.mat b/testData/mats/sg4t2.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg4t4.mat b/testData/mats/sg4t4.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg4t8.mat b/testData/mats/sg4t8.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg4tI.mat b/testData/mats/sg4tI.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg8t1.mat b/testData/mats/sg8t1.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg8t2.mat b/testData/mats/sg8t2.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg8t4.mat b/testData/mats/sg8t4.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg8t8.mat b/testData/mats/sg8t8.mat old mode 100755 new mode 100644 diff --git a/testData/mats/sg8tI.mat b/testData/mats/sg8tI.mat old mode 100755 new mode 100644 diff --git a/testData/net-hard.pg b/testData/net-hard.pg index ad8b0fffc..d56eefb4d 100644 --- a/testData/net-hard.pg +++ b/testData/net-hard.pg @@ -31,6 +31,7 @@ set(finalassignment:DO) -- select(unique) -- fuse() -- swap(spr, steepest) +--swap() swap(tbr, drift, maxChanges:5, acceptequal:0.0) select(unique) refine(netadd) diff --git a/testData/net-test.dot b/testData/net-test.dot index 44aeb12ae..79e3a4be9 100644 --- a/testData/net-test.dot +++ b/testData/net-test.dot @@ -14,21 +14,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; 9 -> 4 [label=10.0]; 9 -> 10 [label=0.0]; 10 -> 1 [label=10.0]; 10 -> 3 [label=10.0]; - 11 -> 0 [label=10.0]; - 11 -> 5 [label=10.0]; - 12 -> 9 [label=0.0]; + 11 -> 9 [label=1.0]; + 11 -> 13 [label=0.0]; + 12 -> 6 [label=2.0]; 12 -> 11 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 13 -> 0 [label=10.0]; + 13 -> 5 [label=10.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -45,21 +46,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; 9 -> 4 [label=10.0]; - 9 -> 11 [label=0.0]; + 9 -> 10 [label=0.0]; 10 -> 1 [label=10.0]; 10 -> 3 [label=10.0]; 11 -> 0 [label=10.0]; - 11 -> 5 [label=10.0]; - 12 -> 9 [label=0.0]; - 12 -> 10 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 11 -> 9 [label=1.0]; + 12 -> 6 [label=2.0]; + 12 -> 13 [label=0.0]; + 13 -> 5 [label=10.0]; + 13 -> 11 [label=0.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -76,52 +78,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; 9 -> 4 [label=10.0]; - 9 -> 11 [label=0.0]; + 9 -> 10 [label=0.0]; 10 -> 1 [label=10.0]; - 10 -> 9 [label=0.0]; - 11 -> 0 [label=10.0]; - 11 -> 5 [label=10.0]; - 12 -> 3 [label=10.0]; - 12 -> 10 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; - 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; -} -digraph { - 0 [label=five]; - 1 [label=four]; - 2 [label=one]; - 3 [label=seven]; - 4 [label=six]; - 5 [label=three]; - 6 [label=two]; - 7 [label=zero]; - 8 [label=HTU8]; - 9 [label=HTU9]; - 10 [label=HTU10]; - 11 [label=HTU11]; - 12 [label=HTU12]; - 13 [label=HTU13]; - 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 4 [label=10.0]; - 9 -> 11 [label=0.0]; 10 -> 3 [label=10.0]; - 10 -> 9 [label=0.0]; - 11 -> 0 [label=10.0]; 11 -> 5 [label=10.0]; - 12 -> 1 [label=10.0]; - 12 -> 10 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 11 -> 9 [label=1.0]; + 12 -> 6 [label=2.0]; + 12 -> 13 [label=0.0]; + 13 -> 0 [label=10.0]; + 13 -> 11 [label=0.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -138,21 +110,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 4 [label=10.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; 9 -> 10 [label=0.0]; + 9 -> 11 [label=1.0]; 10 -> 1 [label=10.0]; 10 -> 3 [label=10.0]; - 11 -> 0 [label=10.0]; - 11 -> 9 [label=0.0]; - 12 -> 5 [label=10.0]; - 12 -> 11 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 11 -> 4 [label=10.0]; + 11 -> 13 [label=0.0]; + 12 -> 6 [label=2.0]; + 12 -> 9 [label=0.0]; + 13 -> 0 [label=10.0]; + 13 -> 5 [label=10.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -169,21 +142,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 4 [label=10.0]; - 9 -> 10 [label=0.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; + 9 -> 3 [label=10.0]; + 9 -> 11 [label=1.0]; 10 -> 1 [label=10.0]; - 10 -> 3 [label=10.0]; - 11 -> 5 [label=10.0]; - 11 -> 9 [label=0.0]; - 12 -> 0 [label=10.0]; - 12 -> 11 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 10 -> 9 [label=0.0]; + 11 -> 4 [label=10.0]; + 11 -> 13 [label=0.0]; + 12 -> 6 [label=2.0]; + 12 -> 10 [label=0.0]; + 13 -> 0 [label=10.0]; + 13 -> 5 [label=10.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -200,21 +174,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 10 [label=0.0]; - 9 -> 11 [label=0.0]; - 10 -> 1 [label=10.0]; + 8 -> 7 [label=1.0]; + 8 -> 14 [label=1.0]; + 9 -> 1 [label=10.0]; + 9 -> 11 [label=1.0]; 10 -> 3 [label=10.0]; - 11 -> 0 [label=10.0]; - 11 -> 5 [label=10.0]; - 12 -> 4 [label=10.0]; - 12 -> 9 [label=0.0]; - 13 -> 6 [label=2.0]; - 13 -> 12 [label=0.0]; + 10 -> 9 [label=0.0]; + 11 -> 4 [label=10.0]; + 11 -> 13 [label=0.0]; + 12 -> 6 [label=2.0]; + 12 -> 10 [label=0.0]; + 13 -> 0 [label=10.0]; + 13 -> 5 [label=10.0]; 14 -> 2 [label=0.0]; - 14 -> 13 [label=0.0]; + 14 -> 12 [label=0.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -231,21 +206,22 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 12 [label=0.0]; - 9 -> 6 [label=2.0]; - 9 -> 10 [label=0.0]; + 8 -> 7 [label=1.0]; + 8 -> 12 [label=1.0]; + 9 -> 4 [label=10.0]; + 9 -> 13 [label=0.0]; + 10 -> 6 [label=2.0]; 10 -> 11 [label=0.0]; - 10 -> 13 [label=0.0]; - 11 -> 4 [label=10.0]; + 11 -> 9 [label=1.0]; 11 -> 14 [label=0.0]; 12 -> 2 [label=0.0]; - 12 -> 9 [label=0.0]; + 12 -> 10 [label=0.0]; 13 -> 0 [label=10.0]; 13 -> 5 [label=10.0]; 14 -> 1 [label=10.0]; 14 -> 3 [label=10.0]; } +//20.0 digraph { 0 [label=five]; 1 [label=four]; @@ -262,49 +238,19 @@ digraph { 12 [label=HTU12]; 13 [label=HTU13]; 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 6 [label=2.0]; - 9 -> 11 [label=0.0]; - 10 -> 4 [label=10.0]; + 8 -> 7 [label=1.0]; + 8 -> 12 [label=1.0]; + 9 -> 0 [label=10.0]; + 9 -> 5 [label=10.0]; + 10 -> 6 [label=2.0]; 10 -> 13 [label=0.0]; - 11 -> 1 [label=10.0]; - 11 -> 12 [label=0.0]; - 12 -> 3 [label=10.0]; + 11 -> 4 [label=10.0]; + 11 -> 14 [label=0.0]; + 12 -> 2 [label=0.0]; 12 -> 10 [label=0.0]; - 13 -> 0 [label=10.0]; - 13 -> 5 [label=10.0]; - 14 -> 2 [label=0.0]; - 14 -> 9 [label=0.0]; + 13 -> 9 [label=0.0]; + 13 -> 11 [label=1.0]; + 14 -> 1 [label=10.0]; + 14 -> 3 [label=10.0]; } -digraph { - 0 [label=five]; - 1 [label=four]; - 2 [label=one]; - 3 [label=seven]; - 4 [label=six]; - 5 [label=three]; - 6 [label=two]; - 7 [label=zero]; - 8 [label=HTU8]; - 9 [label=HTU9]; - 10 [label=HTU10]; - 11 [label=HTU11]; - 12 [label=HTU12]; - 13 [label=HTU13]; - 14 [label=HTU14]; - 8 -> 7 [label=0.0]; - 8 -> 14 [label=0.0]; - 9 -> 6 [label=2.0]; - 9 -> 13 [label=0.0]; - 10 -> 5 [label=10.0]; - 10 -> 11 [label=0.0]; - 11 -> 4 [label=10.0]; - 11 -> 12 [label=0.0]; - 12 -> 1 [label=10.0]; - 12 -> 3 [label=10.0]; - 13 -> 0 [label=10.0]; - 13 -> 10 [label=0.0]; - 14 -> 2 [label=0.0]; - 14 -> 9 [label=0.0]; -} \ No newline at end of file +//20.0 \ No newline at end of file diff --git a/testData/net.pg b/testData/net.pg index 7c43a2277..ce2df4479 100644 --- a/testData/net.pg +++ b/testData/net.pg @@ -1,5 +1,8 @@ +--set(criterion:PMDL) + read("net-I.fas") read("net-II.fas") + -- read("net-Ia.fas") -- read("net-IIa.fas") --reblock ("net1", "net-I*.fas*") @@ -11,22 +14,38 @@ read("net-II.fas") --read("net-II.tre") --read("net-II.dot") --read("net-I-II.dot") + set(outgroup:"zero") set(graphtype:softwired) -Set(Compressresolutions:True) -set(graphFactor:W15) -set(rootcost:NoRootCost) -report("net-test-data.csv", data, overwrite) -set(finalassignment:DO) + +--set(softwiredmethod:naive) +--Set(Compressresolutions:True) +--set(graphFactor:W15) +--set(graphFactor:W23) + +set(graphFactor:nopenalty) + +--set(rootcost:NoRootCost) +--set(finalassignment:DO) +--set(softwiredmethod:naive) --read("net-test.dot") -build(replicates:100) ---build(distance, rdwag, replicates:10) -swap(tbr) ---refine(netadd) -select(best) -report("net-test.tre", graphs, newick, overwrite) -report("net-test.dot", graphs, dot, overwrite) +--build(replicates:100) +build(distance, dwag) +--transform(softwiredmethod:resolutioncache) +--swap() + +refine(netadd,maxnetedges:5) +refine(netmove) +refine (netdelete) +refine(netadddelete,maxnetedges:5) + +--select(best) +--fuse() + +report("net-test-out.tre", graphs, newick, overwrite) +report("net-test-out.dot", graphs, dotpdf, overwrite) report("net-test-diag-IA.csv", diagnosis, overwrite) report("net-display.dot", displaytrees, dot, overwrite) report("net-display.tre", displaytrees, newick, overwrite) -report("net-test-reconcile.dot", reconcile, dot, overwrite, method:cun, threshold:51) +report("net-display-search.csv", search, overwrite) +--report("net-test-reconcile.dot", reconcile, dot, overwrite, method:cun, threshold:51) diff --git a/testData/net2.pg b/testData/net2.pg index 162958ccb..e03ceca46 100644 --- a/testData/net2.pg +++ b/testData/net2.pg @@ -23,6 +23,6 @@ report("net-test.tre", graphs, newick, overwrite) report("net-test.dot", graphs, dot, overwrite) report("net-test-data.csv", data, overwrite) report("net-test-diag-IA.csv", diagnosis, overwrite) -report("net-display.dot", displaytrees, dot, overwrite) +report("net-display.dot", displaytrees, dotpdf, overwrite) report("net-display.tre", displaytrees, newick, overwrite) report("net-test-reconcile.dot", reconcile, dot, overwrite, method:cun, threshold:51) diff --git a/testData/neyman5.bit.tcm b/testData/neyman5.bit.tcm new file mode 100644 index 000000000..40848f46e --- /dev/null +++ b/testData/neyman5.bit.tcm @@ -0,0 +1,7 @@ +A C G T +2.2714665893450666e-2 8.000340909132339 8.000340909132339 8.000340909132339 8.000340909132339 +8.000340909132339 2.2714665893450666e-2 8.000340909132339 8.000340909132339 8.000340909132339 +8.000340909132339 8.000340909132339 2.2714665893450666e-2 8.000340909132339 8.000340909132339 +8.000340909132339 8.000340909132339 8.000340909132339 2.2714665893450666e-2 8.000340909132339 +8.000340909132339 8.000340909132339 8.000340909132339 8.000340909132339 0.0 + diff --git a/testData/test4.pg b/testData/test4.pg index 78c449ace..43f98aae2 100644 --- a/testData/test4.pg +++ b/testData/test4.pg @@ -2,6 +2,11 @@ read("metazoa-aa-1.fas") read("metazoa-aa-2.fas") --read("mixed-all.tre") -read("protein.tre") +--read("protein.tre") +--transform(staticapprox) +build(distance,rdwag,replicates:10) +select(best) +swap() +select(best) report("protein.dis.test.tre", graphs, newick, overwrite) report("protein.dis.test.csv", pairdist, overwrite) diff --git a/testData/test5.pg b/testData/test5.pg index dba7287ac..89d1c3eb3 100644 --- a/testData/test5.pg +++ b/testData/test5.pg @@ -2,19 +2,18 @@ read("louse.fastc") read("woman.fastc") ---Alex this is the line to change "ia" to trip error "do" goes fine. -set(finalassignment:ia) ---set(finalassignment:do) - set(outgroup:"ArabicAfroAsiatic") ---read(fastc:"louse.fastc", tcm:"louse-11.tcm") + read("louseWoman.tre") --build(distance, rdwag, replicates:10) --build(replicates:10) --read("louse.dis.csv.tre") --swap(tbr, steepest) + select(best) + report("louseWoman-new.tre", graphs, newick, append) --report("graphs.txt", graphs, ascii, overwrite) --report("louse.dis.csv", pairdist, overwrite) -report("louse.data.csv", data, overwrite) \ No newline at end of file +report("louse.data.csv", data, overwrite) +report("louse.diag.csv", diagnosis, overwrite) \ No newline at end of file diff --git a/testing-1/test-1-Ney2.bit.tcm b/testing-1/test-1-Ney2.bit.tcm new file mode 100644 index 000000000..db6196ff3 --- /dev/null +++ b/testing-1/test-1-Ney2.bit.tcm @@ -0,0 +1,5 @@ +A C +9.019780897157814e-2 5.044394119358453 5.044394119358453 +5.044394119358453 9.019780897157814e-2 5.044394119358453 +5.044394119358453 5.044394119358453 0.0 + diff --git a/testing-1/test-1-Ney5.bit.tcm b/testing-1/test-1-Ney5.bit.tcm new file mode 100644 index 000000000..6d7ee3fc2 --- /dev/null +++ b/testing-1/test-1-Ney5.bit.tcm @@ -0,0 +1,7 @@ +A C G T +0.10893437155316406 5.78135971352466 5.78135971352466 5.78135971352466 5.78135971352466 +5.78135971352466 0.10893437155316406 5.78135971352466 5.78135971352466 5.78135971352466 +5.78135971352466 5.78135971352466 0.10893437155316406 5.78135971352466 5.78135971352466 +5.78135971352466 5.78135971352466 5.78135971352466 0.10893437155316406 5.78135971352466 +5.78135971352466 5.78135971352466 5.78135971352466 5.78135971352466 0.0 + diff --git a/testing-1/test-1-Ney9-aa.bit.tcm b/testing-1/test-1-Ney9-aa.bit.tcm new file mode 100644 index 000000000..47db30bc9 --- /dev/null +++ b/testing-1/test-1-Ney9-aa.bit.tcm @@ -0,0 +1,12 @@ +A C D E F G H I K +0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.0 + diff --git a/testing-1/test-1-Ney9.bit.tcm b/testing-1/test-1-Ney9.bit.tcm new file mode 100644 index 000000000..7d35c88a7 --- /dev/null +++ b/testing-1/test-1-Ney9.bit.tcm @@ -0,0 +1,12 @@ +A C G T Z Q O P F +0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.12314823077286484 6.78135971352466 +6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 6.78135971352466 0.0 + diff --git a/testing-1/test-1-NeyR.bit.tcm b/testing-1/test-1-NeyR.bit.tcm new file mode 100644 index 000000000..bf835f31a --- /dev/null +++ b/testing-1/test-1-NeyR.bit.tcm @@ -0,0 +1,5 @@ +A C +1.019780897157814e-2 5.044394119358453 5.044394119358453 +5.044394119358453 2.019780897157814e-2 5.044394119358453 +5.044394119358453 5.044394119358453 3.019780897157814e-2 + diff --git a/testing-1/test-1.fas b/testing-1/test-1.fas new file mode 100644 index 000000000..5160453fd --- /dev/null +++ b/testing-1/test-1.fas @@ -0,0 +1,10 @@ +>T0 +A +>T1 +A +>T2 +A +>T3 +A +>T4 +A \ No newline at end of file diff --git a/testing-1/test-1.pg b/testing-1/test-1.pg new file mode 100644 index 000000000..610b05009 --- /dev/null +++ b/testing-1/test-1.pg @@ -0,0 +1,28 @@ +-- testing script for PMDL stuff +set(criterion:SI) +--read(prefasta:"test-1.fas", tcm:"test-1-Ney2.bit.tcm") + +-- this has no changes (all A) +-- uses DO +--read("test-1.fas", tcm:"test-1-Ney2.bit.tcm") +-- prealigned +--read(prefasta:"test-1.fas", tcm:"test-1-Ney2.bit.tcm") + +-- this has one single change (A->C) +read("test-2.fas", tcm:"test-1-Ney2.bit.tcm") +--read(prefasta:"test-2.fas", tcm:"test-1-Ney2.bit.tcm") + +--wide +--read("test-2-wide.fas", tcm:"test-1-Ney9.bit.tcm") +--read("test-2-aa.fas", tcm:"test-1-Ney9-aa.bit.tcm") + + +--read("test-1.fas", tcm:"test-1-Ney2.bit.tcm") +--read("test-1.fas", tcm:(11, 11)) +--read("test-1.fas") +--read("test-1.dot") +read("test-1.tre") +report("test-data.csv", data, overwrite) + +report("test-diag.csv", diagnosis, overwrite) +report("test-out.dot", dotpdf, graphs, overwrite) diff --git a/testing-1/test-2-aa.fas b/testing-1/test-2-aa.fas new file mode 100644 index 000000000..a73017e6b --- /dev/null +++ b/testing-1/test-2-aa.fas @@ -0,0 +1,11 @@ +>T0 +ACDEFGHIK +>T1 +ACDEFGHIK +>T2 +ACDEFGHIK +>T3 +ACDEFGHIK +>T4 +ACDEFGHIK + diff --git a/testing-1/test-2-wide.fas b/testing-1/test-2-wide.fas new file mode 100644 index 000000000..fae25368e --- /dev/null +++ b/testing-1/test-2-wide.fas @@ -0,0 +1,10 @@ +>T0 +ACGTZQOPF +>T1 +ACGTZQOPF +>T2 +ACGTZQOPF +>T3 +ACGTZQOPF +>T4 +ACGTZQOPF \ No newline at end of file diff --git a/testing-1/test-2.fas b/testing-1/test-2.fas new file mode 100644 index 000000000..f16c28466 --- /dev/null +++ b/testing-1/test-2.fas @@ -0,0 +1,10 @@ +>T0 +A +>T1 +A +>T2 +A +>T3 +A +>T4 +C \ No newline at end of file diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_16S.fas b/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_16S.fas new file mode 100644 index 000000000..d4b04dcde --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_16S.fas @@ -0,0 +1,81 @@ +>Americhernus + TCGAGCCTCC AATGATACGT TGAAAGGCGT TTATCGTTGG GGCCGACAGC GCGTCGTGGG + CTCGGTTGGC CTTAAAAAGC TGATCGGGTT CTCCGGCAAT TTTACTTTGA AAAAATTAGG + GTGCTCAAAG CAGGCCTGGT GCC + +>Gea + TCCGGCCGGA CGGGTCCGCC TACCGGTGGT TACTGTTCGC TGCCGAGCTT CAGGGGGCCG + CTGTCGATGA TCTTCATCGG TTATCTTCCG TAACCCTCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGCGCGAC GCC + +>Hypochilus + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAACC AGCCGGTTTC + CCTAGATGAT CTTCATTGAT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGCGTGACG CC + +>Thelichoris + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAGCC AGCCGGTTTC + CCTAGATGAT CTTTACCGGT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGGCTGACG CC + +>Amblypygid + TCCAGACTGG CGGTCCGCCT AGCGGCGAGT ACTGTCAGGC CTGAACATGG CGCCGGTTTT + CCTTGGTTCT CTTTACTGAG TGTCTTGGGC GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGGCTGTCG CC + +>Mastigoproctus + TCCAGACGGG TGGTCCACCG CCCGGTGGCT ACTGCCCGGC CTGAACAATC TCGCCGGTTT + TCCTTGATTC TCTTCACCGA GTGTCTTGGG TGACCGGCAC GTTTACTTTG AAAAAATTAG + AGTGCTTAAA GCAGCGTAAC GCC + +>Rhiphicephalus + TCCAGACGAG TAGTGCATCT ACCCGATGCT ACGGCTCGGA CTGAACATCA TGCCGGTTCT + TTCTTGGTGC ACTTCATTGT GTGCCTCGAG ATGGCCGGTG CTTTTACTTT GAAAAAATTA + GAGTGCTCAA CGCAGGCGAG TCGCC + +>Vonones + TCGAGGCTGG CGGTCCGCCT ACAGGCGGTC ACTGCCAGTA CTCAACATCC TGCCGGTTTT + CCCTTGGTGC TCTTCGCTGA GTGTCTCGGG TGGCCGGCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGGCGCGT AGCC + +>Centruoides + TCCAGACAAG CGGTCCACCC GCGGTGGTTA CTGTTTGGAC TGGACGTTTG GCCGGATTCC + TTTGATGCTC TTTGCCGAGT GTCTTGGGTG TCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGTACCGC C + +>Hadrurus + TCCGGACTGT CGGTCCGCCG CAAGCTTACT GGCAGGACCG GACGTCTAGC CGGACTCTCT + CGTATCCTCT TCACCGGGTG TCTTGGGTGT CCGGCAATTT TACTTTGAAA AAATTAGAGT + GCTCAAAGCA GCGCGATCCG CC + +>Pauroctonus + TCCAGACTGT CGGTCCGCAC CGGAGGTTAC TGGCAGGACC GGACGTCTAG CCGGCCTCCC + TCGTTGCTCT TCACCGGGTG TCTTGGGTGT CCGGCACGTT TACTTTGAAA AAATTAGAGT + GCTCAAAGCA GCGACCCGCC + +>Limulus + TCTAGACTGG CGGTCCGCTT CCGGCGGTTA CTGCCTGGCC TAAACATCTG CCGGTTTTCC + CTCGGTGCCC TTGATTGAGT GTCTTGGGTG GCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGCAACGC C + +>Aportus + TCCAGACTGA CGGTCCACCG CTCGGGCGAC TGTCAGGCCT GAACATTCCG TCGGTTTCGA + CAGATTTTTC CCGCGGTGCT CTTCGGTGAG TGTCGCGGAG GCCGACAAGT TTACTTTGAA + AAAATTAGAG TGCTCAAAGC AGGCGGTCAC C + +>Alentus + TCCAGACTGA CGGTCCACCG AATCGGGCGA CTGTCAGGCC TGAACATCCG GTCGGTTTCT + CTCCCTTTTC CCGCGGTGCT CTTCGTTGAG TGTCGCGGGA GGCCGACAAG TTTACTTTGA + AAAAATTAGA GTGCTCAAAG CAGGCGGTCA CC + +>Artemia + CTCGGTCGGG TGGTGCCGCC TCACGGTGGT CACTGCCTCG ATCGGACAAT TCATTGGATC + GTTCGGGGTG CTCTTAACCG AGTGTCCTGG GTGGCCGATA CGTTTACTTT GAACAAATTA + GAGTGCTTAA AGCAGGTGCA CCGCGCC + +>Thermobius + CTCGGACGAT CGGTTCGCCG CCCGTGTTAA CTGATCGTTC CGGACGTCCT GCCGGTTTTT + CCTTTCTCGG TGCTCTTCAT TGAGTGTCTT GATTGGCCGG CACGTTTACT TTGAACAAAT + TAGAGTGCTC AAAGCAGGTC CAGTGTCGCC + + diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_cox1.fas b/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_cox1.fas new file mode 100755 index 000000000..f4612e20b --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/run1/chel_cox1.fas @@ -0,0 +1,119 @@ +>Alentus +TTATATTAGGAGCACCAGATATAGCTTTCCCTCGAATAAATAATATAAGATTTTGACTTC +TTCCACCATCTTTAACTTTACTCCTCTCTAGAGGAATAGTAGAAAGTGGAGTGGGAACAG +GCTGAACTGTCTATCCTCCTTTATCTGCTGGAATTGCACATGCTGGTGCCTCAGTAGATT +TAGGAATCTTTTCTTTACACCTAGCAGGAGTTTCATCCATCTTAGGGGCCGTAAACTTTA +TAACTACCGTTATCAATATACGAAGAACAGGTATAACAATAGACCGAATTCCTCTTTTTG +TATGATCTGTATTTATTACAGCCCTTCTTCTCCTTCTTTCCCTCCCAGTC +>Amblypygid +TAATATTAGGAGCCCCAGATATGGCTTTTCCCCGTATAAATAATATAAGTTTTTGACTTT +TACCCCCCTCTCTAACACTACTCCTCTCTAGAGGGATAGTGGAAAGAGGTGTAGGTACAG +GATGAACTGTTTACCCCCCTCTATCGGCTAGCATCGCACACGCCGGTGCTTCTGTTGATA +TGGGAATTTTCTCGCTCCATTTGGCCGGGGTTTCTTCAATTTTAGGGGCCGTTAATTTTA +TAACTACTGTTATCAATATACGAAGAACCGGAATAACTATAGACCGAATTCCCCTTTTTG +TTTGATCTGTCTTTATTACCGCCCTCCTTCTTCTTCTATCATTACCTGTT +>Americhernus +TAATACTTGGGGCCCCTGATATGGCTTTCCCCCGTATAAATAATATGAGATTCTGACTCT +TACCTCCCTCATTAACTCTCCTATTATCGAGAGGGATAGTTGAAACAGGGGTTGGTACTG +GATGGACAGTATACCCCCCCTTATCGGCCGGGATCGCCCACGCAGGGGCCTCAGTTGACC +TGGCAATCTTTTCTCTCCACTTAGCTGGGGTTTCTTCTATTCTAGGAGCAGTTAATTTTA +TAACAACCGTTATCAATATACGAAGTGTAGGGATGACAATAGACCGAGTACCTCTTTTTG +TATGGTCTATTTTCATCACAACTCTCCTTCTACTACTTTCTCTACCAGTC +>Aportus +TAATATTAGGGGCCCCTGATATGGCCTTTCCCCGGATAAATAATATAAGTTTCTGACTAT +TACCCCCCTCTTTAACCCTTCTTTTATCTAGTGGTATGGTTGAAACAGGAGTTGGGACTG +GTTGAACAGTTTACCCCCCTTTAGCCGCCGGAATTGCTCACGCAGGAGCTTCGGTAGATC +TTGCTATTTTCTCACTCCACTTAGCCGGGGTTTCTTCGATTTTAGGAGCAGTAAACTTTA +TAACTACCGTTATTAATATACGAAGAACAGGGATAACAATGGACCGGATCCCTCTTTTCG +TCTGGTCGGTTTTTATTACTGCCCTTCTTTTACTTTTATCTCTCCCAGTT +>Artemia +TAATATTAGGAGCTCCAGATATAGCTTTCCCTCGAATAAATAATATAAGATTCTGACTCC +TCCCTCCCTCTTTAACTCTTTTACTTTCAAGAGGAATAGTAGAAAGAGGAGTAGGAACGG +GATGAACCGTTTATCCTCCTCTTGCAGCCGGAATTGCTCATGCCGGAGCTTCAGTTGATA +TAGGAATCTTCTCTCTACATTTAGCAGGAGTATCCTCTATTTTAGGGGCCGTAAATTTTA +TAACAACAGTGATTAACATACGGGCATCTGGAATAACATTAGACCGAATACCTTTATTCG +TGTGATCTGTATTTATTACTGCACTATTATTACTTTTATCACTACCAGTT +>Centruoides +TAATATTAGGGGCCCCTGATATGGCTTTCCCTCGAATAAATAATATAAGATTCTGATTGC +TTCCCCCTTCCCTAACTCTTTTACTTTCGAGTGGAATGGTAGAAAGAGGGGTAGGTACAG +GATGAACTGTTTATCCCCCTCTTGCAGCCGGAATTGCTCATGCTGGAGCTTCCGTTGACA +TAGGAATTTTCTCTCTACATTTAGCAGGAGTGTCTTCTATTTTAGGGGCCGTAAACTTCA +TAACTACAGTAATTAATATACGAGCATCTGGAATAACAATAGACCGAATACCTTTATTTG +TTTGATCTGTCTTTATTACAGCACTACTTTTACTCTTGTCATTACCAGTT +>Chanbria +TTATATTAGGAGCACCTGATATAGCATTTCCACGTATAAACAATATAAGATTCTGACTTT +TACCTCCATCTCTTACACTACTCCTATCAAGAGGAATGGTAGAAAGAGGAGTCGGTACAG +GATGAACTGTGTATCCCCCGCTAGCAGCAGGAATTGCCCACGCTGGAGCCTCAGTTGACA +TAGGAATTTTTTCACTACACCTCGCCGGAGTCTCATCAATTCTAGGTGCAGTTAATTTTA +TAACAACTGTTATTAACATACGCCCTGCGGGAATAACTATAGACCGTATACCACTTTTCG +TGTGAGCAGTATTTATTACAGCCCTACTCCTTCTTTTATCTCTACCAGTC +>Gea +TTATACTTGGTGCCCCAGATATGGCATTCCCACGAATAAATAATATAAGATTCTGACTTC +TCCCCCCTTCTCTAACTCTATTACTTTCAAGAGGAATAGTAGAAAGAGGAGTAGGAACAG +GATGAACAGTTTACCCCCCTCTAGCAGCAGGAATTGCTCATGCTGGAGCTTCAGTTGATA +TAGGAATTTTCTCGCTACACCTTGCAGGGGTCTCATCAATCTTAGGAGCAGTTAATTTCA +TGACAACAGTTATTAATATGCGCCCTGCAGGAATAACTATGGACCGTATACCACTCTTCG +TGTGAGCTGTCTTTATTACAGCCTTGCTACTATTACTATCCCTCCCAGTC +>Hadrurus +TTATATTAGGGGCCCCAGATATAGCTTTTCCCCGAATAAATAATATAAGCTTCTGATTAC +TTCCCCCCTCCCTAACTCTCCTCCTTTTAAGAGGAATAGTTGAAAGAGGAGTAGGGACCG +GCTGAACTGTCTATCCTCCTTTAGCTAGAGGAATTGCCCATGCAGGAGCCTCGGTAGACA +TAGGAATTTTTTCTCTACACTTAGCTGGAGTTTCATCCATTTTAGGGGCCGTAAACTTTA +TAACAACAGTGATTAATATACGATCATCAGGAATAACAATAGATCGAATACCTTTATTTG +TTTGATCAGTCTTTATTACCGCTTTATTGTTACTCCTTTCACTCCCAGTT +>Hypochilus +TAATACTCGGGGCCCCAGATATAGCTTTCCCACGAATAAATAACATAAGATTCTGACTTC +TTCCTCCCTCTCTAACTCTACTTCTCTCAAGCGGAATAGTCGAAAGAGGGGTAGGAACCG +GCTGGACAGTATACCCTCCTCTAGCCAGAGGAATCGCTCATGCAGGTGCTTCTGTAGATA +TAGGAATCTTCTCGCTTCACCTAGCAGGAGTCTCATCAATCCTAGGGGCCGTAAACTTTA +TAACAACAGTAATCAATATACGATCCTCAGGAATATCTATAGACCGTATACCTTTATTTG +TATGATCAGTATTCATTACCGCTCTCCTCCTTCTTCTCTCTTTACCAGTC +>Limulus +TAATACTAGGTGCCCCGGATATGGCTTTCCCTCGAATAAATAACATAAGTTTCTGACTTT +TACCCCCATCTCTCACTCTCCTCTTATCTAGAGGAATAGTAGAAAGAGGTGTAGGAACTG +GATGAACTGTATATCCTCCTTTATCTGCAAGTATTGCCCATGCGGGAGCTTCCGTTGATC +TCGGAATCTTCTCACTACACTTAGCAGGTGTATCATCAATTTTAGGTGCCGTAAATTTTA +TAACAACTGTTATCAATATACGATCTACCGGAATAACTATGGACCGGATACCTCTTTTTG +TTTGAGCAGTATTTATTACTGCTTTACTTTTACTACTGTCCTTACCAGTT +>Mastigoproctus +TGATATTAGGTGCTCCTGATATAGCTTTTCCTCGTATAAACAATATAAGATTCTGACTTT +TACCTCCTTCGTTAACGTTACTTCTTTCCAGAGGTATAGTTGAAAGAGGGGTAGGAACAG +GATGAACAGTCTATCCTCCTTTATCAGCTAGAATTGCTCATGCCGGAGCTTCAGTAGACT +TAGGTATTTTCTCGCTGCATCTAGCTGGTGTCTCGTCTATCTTAGGAGCTGTCAATTTTA +TAACAACTGTTATCAATATACGATCTACAGGTATAACTATAGACCGTATACCTCTTTTCG +TATGGGCAGTATTTATTACCGCCCTACTACTTTTATTATCTTTACCAGTA +>Pauroctonus +TAATGTTAGGTGCTCCTGATATGGCTTTTCCACGAATAAACAATATGAGTTTCTGGCTCC +TACCTCCTTCACTAACACTACTTCTTTCTAGAGGTATAGTTGAAAGAGGAGTAGGAACAG +GATGAACTGTTTACCCTCCTTTATCAGCCAGTATTGCTCATGCTGGGGCTTCGGTAGATT +TAGGAATTTTCTCCCTACACTTGGCAGGTGTTTCTTCAATTTTAGGAGCTGTAAATTTTA +TGACAACTGTTATTAACATACGATCTACGGGAATAACTATAGACCGAATACCGCTTTTCG +TTTGAGCAGTATTTATTACAGCATTACTTCTCTTATTATCTCTACCGGTA +>Rhiphicephalus +TAATATTAGGAGCACCTGATATAGCATTCCCACGAATAAATAATATAAGCTTTTGACTGC +TACCTCCTTCTCTCACCCTTCTTCTATCTAGAGGAATAGTTGAAAGAGGAGTTGGAACAG +GATGAACTGTATATCCTCCCCTATCAGCTAGAATCGCACATGCAGGGGCCTCTGTAGATT +TGGGAATTTTTTCATTGCATTTAGCAGGAGTTTCATCAATTCTAGGGGCCGTAAATTTCA +TAACAACTGTAATTAATATACGATCTACAGGTATAACTATAGACCGAATACCGTTATTCG +TCTGAGCAGTTTTTATTACAGCGCTCTTACTCCTTTTATCTTTACCCGTT +>Thelichoris +TAATATTAGGAGCACCTGATATAGCATTCCCACGAATAAATAATATAAGCTTTTGACTGC +TACCTCCTTCTCTCACCCTTCTTCTATCTAGAGGAATAGTTGAAAGAGGAGTTGGAACAG +GATGAACTGTATATCCTCCCCTATCAGCTAGAATCGCACATGCAGGGGCCTCTGTAGATT +TGGGAATTTTTTCATTGCATTTAGCAGGAGTTTCATCAATTCTAGGGGCCGTAAATTTCA +TAACAACTGTAATTAATATACGATCTACAGGTATAACTATAGACCGAATACCGTTATTCG +TCTGAGCAGTTTTTATTACAGCGCTCTTACTCCTTTTATCTTTACCCGTT +>Thermobius +TAATATTAGGTGCTCCCGATATAGCTTTCCCTCGAATAAATAATATGAGTTTCTGACTTT +TACCTCCCTCGCTAACTCTACTTCTTTCTAGAGGTATAGTCGAAAGAGGGGTAGGAACAG +GATGAACTGTTTACCCTCCTCTATCTGCCAGCATTGCCCATGCAGGAGCATCTGTAGATC +TAGGAATCTTCTCATTACATTTAGCAGGGGTTTCTTCAATTCTAGGTGCTGTAAATTTTA +TAACTACCGTTATCAATATACGATCAACAGGAATAACTATAGACCGAATACCTCTTTTCG +TCTGAGCGGTATTTATTACAGCCTTACTACTTTTACTATCATTACCGGTT +>Vonones +TAATGTTAGGTGCTCCAGATATGGCATTTCCCCGAATAAATAATATAAGTTTCTGACTTT +TACCCCCTTCGCTAACTTTACTTTTATCTAGAGGTATAGTCGAAAGAGGAGTGGGAACTG +GATGAACAGTATACCCTCCTTTATCAGCCAGAATTGCTCACGCAGGTGCTTCAGTTGATC +TAGGTATTTTTTCATTACATTTAGCAGGGGTCTCATCAATCTTAGGAGCTGTAAACTTTA +TAACGACCGTTATCAATATACGATCTACAGGGATAACTATAGACCGAATACCACTTTTTG +TTTGAGCAGTATTTATTACAGCTCTACTCCTACTGTTATCTTTACCAGTC diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_16S.fas b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_16S.fas new file mode 100644 index 000000000..d4b04dcde --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_16S.fas @@ -0,0 +1,81 @@ +>Americhernus + TCGAGCCTCC AATGATACGT TGAAAGGCGT TTATCGTTGG GGCCGACAGC GCGTCGTGGG + CTCGGTTGGC CTTAAAAAGC TGATCGGGTT CTCCGGCAAT TTTACTTTGA AAAAATTAGG + GTGCTCAAAG CAGGCCTGGT GCC + +>Gea + TCCGGCCGGA CGGGTCCGCC TACCGGTGGT TACTGTTCGC TGCCGAGCTT CAGGGGGCCG + CTGTCGATGA TCTTCATCGG TTATCTTCCG TAACCCTCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGCGCGAC GCC + +>Hypochilus + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAACC AGCCGGTTTC + CCTAGATGAT CTTCATTGAT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGCGTGACG CC + +>Thelichoris + TCCAGACGGG CGGTCCGCCT AACGGTGGTT ACTGCCTGGC CTGAACAGCC AGCCGGTTTC + CCTAGATGAT CTTTACCGGT TGTCTTGGGT GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGGCTGACG CC + +>Amblypygid + TCCAGACTGG CGGTCCGCCT AGCGGCGAGT ACTGTCAGGC CTGAACATGG CGCCGGTTTT + CCTTGGTTCT CTTTACTGAG TGTCTTGGGC GACCGGCACG TTTACTTTGA AAAAATTAGA + GTGCTCAAAG CAGGCTGTCG CC + +>Mastigoproctus + TCCAGACGGG TGGTCCACCG CCCGGTGGCT ACTGCCCGGC CTGAACAATC TCGCCGGTTT + TCCTTGATTC TCTTCACCGA GTGTCTTGGG TGACCGGCAC GTTTACTTTG AAAAAATTAG + AGTGCTTAAA GCAGCGTAAC GCC + +>Rhiphicephalus + TCCAGACGAG TAGTGCATCT ACCCGATGCT ACGGCTCGGA CTGAACATCA TGCCGGTTCT + TTCTTGGTGC ACTTCATTGT GTGCCTCGAG ATGGCCGGTG CTTTTACTTT GAAAAAATTA + GAGTGCTCAA CGCAGGCGAG TCGCC + +>Vonones + TCGAGGCTGG CGGTCCGCCT ACAGGCGGTC ACTGCCAGTA CTCAACATCC TGCCGGTTTT + CCCTTGGTGC TCTTCGCTGA GTGTCTCGGG TGGCCGGCAC GTTTACTTTG AAAAAATTAG + AGTGCTCAAA GCAGGCGCGT AGCC + +>Centruoides + TCCAGACAAG CGGTCCACCC GCGGTGGTTA CTGTTTGGAC TGGACGTTTG GCCGGATTCC + TTTGATGCTC TTTGCCGAGT GTCTTGGGTG TCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGTACCGC C + +>Hadrurus + TCCGGACTGT CGGTCCGCCG CAAGCTTACT GGCAGGACCG GACGTCTAGC CGGACTCTCT + CGTATCCTCT TCACCGGGTG TCTTGGGTGT CCGGCAATTT TACTTTGAAA AAATTAGAGT + GCTCAAAGCA GCGCGATCCG CC + +>Pauroctonus + TCCAGACTGT CGGTCCGCAC CGGAGGTTAC TGGCAGGACC GGACGTCTAG CCGGCCTCCC + TCGTTGCTCT TCACCGGGTG TCTTGGGTGT CCGGCACGTT TACTTTGAAA AAATTAGAGT + GCTCAAAGCA GCGACCCGCC + +>Limulus + TCTAGACTGG CGGTCCGCTT CCGGCGGTTA CTGCCTGGCC TAAACATCTG CCGGTTTTCC + CTCGGTGCCC TTGATTGAGT GTCTTGGGTG GCCGGCACGT TTACTTTGAA AAAATTAGAG + TGCTCAAAGC AGCGCAACGC C + +>Aportus + TCCAGACTGA CGGTCCACCG CTCGGGCGAC TGTCAGGCCT GAACATTCCG TCGGTTTCGA + CAGATTTTTC CCGCGGTGCT CTTCGGTGAG TGTCGCGGAG GCCGACAAGT TTACTTTGAA + AAAATTAGAG TGCTCAAAGC AGGCGGTCAC C + +>Alentus + TCCAGACTGA CGGTCCACCG AATCGGGCGA CTGTCAGGCC TGAACATCCG GTCGGTTTCT + CTCCCTTTTC CCGCGGTGCT CTTCGTTGAG TGTCGCGGGA GGCCGACAAG TTTACTTTGA + AAAAATTAGA GTGCTCAAAG CAGGCGGTCA CC + +>Artemia + CTCGGTCGGG TGGTGCCGCC TCACGGTGGT CACTGCCTCG ATCGGACAAT TCATTGGATC + GTTCGGGGTG CTCTTAACCG AGTGTCCTGG GTGGCCGATA CGTTTACTTT GAACAAATTA + GAGTGCTTAA AGCAGGTGCA CCGCGCC + +>Thermobius + CTCGGACGAT CGGTTCGCCG CCCGTGTTAA CTGATCGTTC CGGACGTCCT GCCGGTTTTT + CCTTTCTCGG TGCTCTTCAT TGAGTGTCTT GATTGGCCGG CACGTTTACT TTGAACAAAT + TAGAGTGCTC AAAGCAGGTC CAGTGTCGCC + + diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_cox1.fas b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_cox1.fas new file mode 100755 index 000000000..f4612e20b --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_cox1.fas @@ -0,0 +1,119 @@ +>Alentus +TTATATTAGGAGCACCAGATATAGCTTTCCCTCGAATAAATAATATAAGATTTTGACTTC +TTCCACCATCTTTAACTTTACTCCTCTCTAGAGGAATAGTAGAAAGTGGAGTGGGAACAG +GCTGAACTGTCTATCCTCCTTTATCTGCTGGAATTGCACATGCTGGTGCCTCAGTAGATT +TAGGAATCTTTTCTTTACACCTAGCAGGAGTTTCATCCATCTTAGGGGCCGTAAACTTTA +TAACTACCGTTATCAATATACGAAGAACAGGTATAACAATAGACCGAATTCCTCTTTTTG +TATGATCTGTATTTATTACAGCCCTTCTTCTCCTTCTTTCCCTCCCAGTC +>Amblypygid +TAATATTAGGAGCCCCAGATATGGCTTTTCCCCGTATAAATAATATAAGTTTTTGACTTT +TACCCCCCTCTCTAACACTACTCCTCTCTAGAGGGATAGTGGAAAGAGGTGTAGGTACAG +GATGAACTGTTTACCCCCCTCTATCGGCTAGCATCGCACACGCCGGTGCTTCTGTTGATA +TGGGAATTTTCTCGCTCCATTTGGCCGGGGTTTCTTCAATTTTAGGGGCCGTTAATTTTA +TAACTACTGTTATCAATATACGAAGAACCGGAATAACTATAGACCGAATTCCCCTTTTTG +TTTGATCTGTCTTTATTACCGCCCTCCTTCTTCTTCTATCATTACCTGTT +>Americhernus +TAATACTTGGGGCCCCTGATATGGCTTTCCCCCGTATAAATAATATGAGATTCTGACTCT +TACCTCCCTCATTAACTCTCCTATTATCGAGAGGGATAGTTGAAACAGGGGTTGGTACTG +GATGGACAGTATACCCCCCCTTATCGGCCGGGATCGCCCACGCAGGGGCCTCAGTTGACC +TGGCAATCTTTTCTCTCCACTTAGCTGGGGTTTCTTCTATTCTAGGAGCAGTTAATTTTA +TAACAACCGTTATCAATATACGAAGTGTAGGGATGACAATAGACCGAGTACCTCTTTTTG +TATGGTCTATTTTCATCACAACTCTCCTTCTACTACTTTCTCTACCAGTC +>Aportus +TAATATTAGGGGCCCCTGATATGGCCTTTCCCCGGATAAATAATATAAGTTTCTGACTAT +TACCCCCCTCTTTAACCCTTCTTTTATCTAGTGGTATGGTTGAAACAGGAGTTGGGACTG +GTTGAACAGTTTACCCCCCTTTAGCCGCCGGAATTGCTCACGCAGGAGCTTCGGTAGATC +TTGCTATTTTCTCACTCCACTTAGCCGGGGTTTCTTCGATTTTAGGAGCAGTAAACTTTA +TAACTACCGTTATTAATATACGAAGAACAGGGATAACAATGGACCGGATCCCTCTTTTCG +TCTGGTCGGTTTTTATTACTGCCCTTCTTTTACTTTTATCTCTCCCAGTT +>Artemia +TAATATTAGGAGCTCCAGATATAGCTTTCCCTCGAATAAATAATATAAGATTCTGACTCC +TCCCTCCCTCTTTAACTCTTTTACTTTCAAGAGGAATAGTAGAAAGAGGAGTAGGAACGG +GATGAACCGTTTATCCTCCTCTTGCAGCCGGAATTGCTCATGCCGGAGCTTCAGTTGATA +TAGGAATCTTCTCTCTACATTTAGCAGGAGTATCCTCTATTTTAGGGGCCGTAAATTTTA +TAACAACAGTGATTAACATACGGGCATCTGGAATAACATTAGACCGAATACCTTTATTCG +TGTGATCTGTATTTATTACTGCACTATTATTACTTTTATCACTACCAGTT +>Centruoides +TAATATTAGGGGCCCCTGATATGGCTTTCCCTCGAATAAATAATATAAGATTCTGATTGC +TTCCCCCTTCCCTAACTCTTTTACTTTCGAGTGGAATGGTAGAAAGAGGGGTAGGTACAG +GATGAACTGTTTATCCCCCTCTTGCAGCCGGAATTGCTCATGCTGGAGCTTCCGTTGACA +TAGGAATTTTCTCTCTACATTTAGCAGGAGTGTCTTCTATTTTAGGGGCCGTAAACTTCA +TAACTACAGTAATTAATATACGAGCATCTGGAATAACAATAGACCGAATACCTTTATTTG +TTTGATCTGTCTTTATTACAGCACTACTTTTACTCTTGTCATTACCAGTT +>Chanbria +TTATATTAGGAGCACCTGATATAGCATTTCCACGTATAAACAATATAAGATTCTGACTTT +TACCTCCATCTCTTACACTACTCCTATCAAGAGGAATGGTAGAAAGAGGAGTCGGTACAG +GATGAACTGTGTATCCCCCGCTAGCAGCAGGAATTGCCCACGCTGGAGCCTCAGTTGACA +TAGGAATTTTTTCACTACACCTCGCCGGAGTCTCATCAATTCTAGGTGCAGTTAATTTTA +TAACAACTGTTATTAACATACGCCCTGCGGGAATAACTATAGACCGTATACCACTTTTCG +TGTGAGCAGTATTTATTACAGCCCTACTCCTTCTTTTATCTCTACCAGTC +>Gea +TTATACTTGGTGCCCCAGATATGGCATTCCCACGAATAAATAATATAAGATTCTGACTTC +TCCCCCCTTCTCTAACTCTATTACTTTCAAGAGGAATAGTAGAAAGAGGAGTAGGAACAG +GATGAACAGTTTACCCCCCTCTAGCAGCAGGAATTGCTCATGCTGGAGCTTCAGTTGATA +TAGGAATTTTCTCGCTACACCTTGCAGGGGTCTCATCAATCTTAGGAGCAGTTAATTTCA +TGACAACAGTTATTAATATGCGCCCTGCAGGAATAACTATGGACCGTATACCACTCTTCG +TGTGAGCTGTCTTTATTACAGCCTTGCTACTATTACTATCCCTCCCAGTC +>Hadrurus +TTATATTAGGGGCCCCAGATATAGCTTTTCCCCGAATAAATAATATAAGCTTCTGATTAC +TTCCCCCCTCCCTAACTCTCCTCCTTTTAAGAGGAATAGTTGAAAGAGGAGTAGGGACCG +GCTGAACTGTCTATCCTCCTTTAGCTAGAGGAATTGCCCATGCAGGAGCCTCGGTAGACA +TAGGAATTTTTTCTCTACACTTAGCTGGAGTTTCATCCATTTTAGGGGCCGTAAACTTTA +TAACAACAGTGATTAATATACGATCATCAGGAATAACAATAGATCGAATACCTTTATTTG +TTTGATCAGTCTTTATTACCGCTTTATTGTTACTCCTTTCACTCCCAGTT +>Hypochilus +TAATACTCGGGGCCCCAGATATAGCTTTCCCACGAATAAATAACATAAGATTCTGACTTC +TTCCTCCCTCTCTAACTCTACTTCTCTCAAGCGGAATAGTCGAAAGAGGGGTAGGAACCG +GCTGGACAGTATACCCTCCTCTAGCCAGAGGAATCGCTCATGCAGGTGCTTCTGTAGATA +TAGGAATCTTCTCGCTTCACCTAGCAGGAGTCTCATCAATCCTAGGGGCCGTAAACTTTA +TAACAACAGTAATCAATATACGATCCTCAGGAATATCTATAGACCGTATACCTTTATTTG +TATGATCAGTATTCATTACCGCTCTCCTCCTTCTTCTCTCTTTACCAGTC +>Limulus +TAATACTAGGTGCCCCGGATATGGCTTTCCCTCGAATAAATAACATAAGTTTCTGACTTT +TACCCCCATCTCTCACTCTCCTCTTATCTAGAGGAATAGTAGAAAGAGGTGTAGGAACTG +GATGAACTGTATATCCTCCTTTATCTGCAAGTATTGCCCATGCGGGAGCTTCCGTTGATC +TCGGAATCTTCTCACTACACTTAGCAGGTGTATCATCAATTTTAGGTGCCGTAAATTTTA +TAACAACTGTTATCAATATACGATCTACCGGAATAACTATGGACCGGATACCTCTTTTTG +TTTGAGCAGTATTTATTACTGCTTTACTTTTACTACTGTCCTTACCAGTT +>Mastigoproctus +TGATATTAGGTGCTCCTGATATAGCTTTTCCTCGTATAAACAATATAAGATTCTGACTTT +TACCTCCTTCGTTAACGTTACTTCTTTCCAGAGGTATAGTTGAAAGAGGGGTAGGAACAG +GATGAACAGTCTATCCTCCTTTATCAGCTAGAATTGCTCATGCCGGAGCTTCAGTAGACT +TAGGTATTTTCTCGCTGCATCTAGCTGGTGTCTCGTCTATCTTAGGAGCTGTCAATTTTA +TAACAACTGTTATCAATATACGATCTACAGGTATAACTATAGACCGTATACCTCTTTTCG +TATGGGCAGTATTTATTACCGCCCTACTACTTTTATTATCTTTACCAGTA +>Pauroctonus +TAATGTTAGGTGCTCCTGATATGGCTTTTCCACGAATAAACAATATGAGTTTCTGGCTCC +TACCTCCTTCACTAACACTACTTCTTTCTAGAGGTATAGTTGAAAGAGGAGTAGGAACAG +GATGAACTGTTTACCCTCCTTTATCAGCCAGTATTGCTCATGCTGGGGCTTCGGTAGATT +TAGGAATTTTCTCCCTACACTTGGCAGGTGTTTCTTCAATTTTAGGAGCTGTAAATTTTA +TGACAACTGTTATTAACATACGATCTACGGGAATAACTATAGACCGAATACCGCTTTTCG +TTTGAGCAGTATTTATTACAGCATTACTTCTCTTATTATCTCTACCGGTA +>Rhiphicephalus +TAATATTAGGAGCACCTGATATAGCATTCCCACGAATAAATAATATAAGCTTTTGACTGC +TACCTCCTTCTCTCACCCTTCTTCTATCTAGAGGAATAGTTGAAAGAGGAGTTGGAACAG +GATGAACTGTATATCCTCCCCTATCAGCTAGAATCGCACATGCAGGGGCCTCTGTAGATT +TGGGAATTTTTTCATTGCATTTAGCAGGAGTTTCATCAATTCTAGGGGCCGTAAATTTCA +TAACAACTGTAATTAATATACGATCTACAGGTATAACTATAGACCGAATACCGTTATTCG +TCTGAGCAGTTTTTATTACAGCGCTCTTACTCCTTTTATCTTTACCCGTT +>Thelichoris +TAATATTAGGAGCACCTGATATAGCATTCCCACGAATAAATAATATAAGCTTTTGACTGC +TACCTCCTTCTCTCACCCTTCTTCTATCTAGAGGAATAGTTGAAAGAGGAGTTGGAACAG +GATGAACTGTATATCCTCCCCTATCAGCTAGAATCGCACATGCAGGGGCCTCTGTAGATT +TGGGAATTTTTTCATTGCATTTAGCAGGAGTTTCATCAATTCTAGGGGCCGTAAATTTCA +TAACAACTGTAATTAATATACGATCTACAGGTATAACTATAGACCGAATACCGTTATTCG +TCTGAGCAGTTTTTATTACAGCGCTCTTACTCCTTTTATCTTTACCCGTT +>Thermobius +TAATATTAGGTGCTCCCGATATAGCTTTCCCTCGAATAAATAATATGAGTTTCTGACTTT +TACCTCCCTCGCTAACTCTACTTCTTTCTAGAGGTATAGTCGAAAGAGGGGTAGGAACAG +GATGAACTGTTTACCCTCCTCTATCTGCCAGCATTGCCCATGCAGGAGCATCTGTAGATC +TAGGAATCTTCTCATTACATTTAGCAGGGGTTTCTTCAATTCTAGGTGCTGTAAATTTTA +TAACTACCGTTATCAATATACGATCAACAGGAATAACTATAGACCGAATACCTCTTTTCG +TCTGAGCGGTATTTATTACAGCCTTACTACTTTTACTATCATTACCGGTT +>Vonones +TAATGTTAGGTGCTCCAGATATGGCATTTCCCCGAATAAATAATATAAGTTTCTGACTTT +TACCCCCTTCGCTAACTTTACTTTTATCTAGAGGTATAGTCGAAAGAGGAGTGGGAACTG +GATGAACAGTATACCCTCCTTTATCAGCCAGAATTGCTCACGCAGGTGCTTCAGTTGATC +TAGGTATTTTTTCATTACATTTAGCAGGGGTCTCATCAATCTTAGGAGCTGTAAACTTTA +TAACGACCGTTATCAATATACGATCTACAGGGATAACTATAGACCGAATACCACTTTTTG +TTTGAGCAGTATTTATTACAGCTCTACTCCTACTGTTATCTTTACCAGTC diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia1.ss b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia1.ss new file mode 100644 index 000000000..3da24fa3a --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia1.ss @@ -0,0 +1,4320 @@ +Implied Alignments for Graph 0 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 1 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 2 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 3 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 4 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 5 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 6 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 7 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 8 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 9 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia2.ss b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia2.ss new file mode 100644 index 000000000..3da24fa3a --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_ia2.ss @@ -0,0 +1,4320 @@ +Implied Alignments for Graph 0 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 1 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 2 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 3 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 4 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 5 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T C - T +G - C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A - G A A C A +G G T A T A A C A A T A G A C C G A A T T C C T C +T T T T T G T A T G A T C T G T A T T T A T T A C +A G C C C T T C T T C T C C T T C T T T C C C T C +C C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T C - G +G - C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A - G A A C C +G G A A T A A C T A T A G A C C G A A T T C C C C +T T T T T G T T T G A T C T G T C T T T A T T A C +C G C C C T C C T T C T T C T T C T A T C A T T A +C C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T C - G +G - C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A - G T G T A +G G G A T G A C A A T A G A C C G A G T A C C T C +T T T T T G T A T G G T C T A T T T T C A T C A C +A A C T C T C C T T C T A C T A C T T T C T C T A +C C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G C - C +G - C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A - G A A C A +G G G A T A A C A A T G G A C C G G A T C C C T C +T T T T C G T C T G G T C G G T T T T T A T T A C +T G C C C T T C T T T T A C T T T T A T C T C T C +C C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G C - A +G C - C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T - C T +G G A A T A A C A T T A G A C C G A A T A C C T T +T A T T C G T G T G A T C T G T A T T T A T T A C +T G C A C T A T T A T T A C T T T T A T C A C T A +C C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G C - A +G C - C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T - C T +G G A A T A A C A A T A G A C C G A A T A C C T T +T A T T T G T T T G A T C T G T C T T T A T T A C +A G C A C T A C T T T T A C T C T T G T C A T T A +C C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G C - A +G - C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G - C C C T G C G +G G A A T A A C T A T A G A C C G T A T A C C A C +T T T T C G T G T G A G C A G T A T T T A T T A C +A G C C C T A C T C C T T C T T T T A T C T C T A +C C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G C - A +G - C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G - C C C T G C A +G G A A T A A C T A T G G A C C G T A T A C C A C +T C T T C G T G T G A G C T G T C T T T A T T A C +A G C C T T G C T A C T A T T A C T A T C C C T C +C C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G C T A +G - - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T - C A +G G A A T A A C A A T A G A T C G A A T A C C T T +T A T T T G T T T G A T C A G T C T T T A T T A C +C G C T T T A T T G T T A C T C C T T T C A C T C +C C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C A +G - - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T - C A +G G A A T A T C T A T A G A C C G T A T A C C T T +T A T T T G T A T G A T C A G T A T T C A T T A C +C G C T C T C C T C C T T C T T C T C T C T T T A +C C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T C - T +G - C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C C +G G A A T A A C T A T G G A C C G G A T A C C T C +T T T T T G T T T G A G C A G T A T T T A T T A C +T G C T T T A C T T T T A C T A C T G T C C T T A +C C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T C - A +G - C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G T A T A C C T C +T T T T C G T A T G G G C A G T A T T T A T T A C +C G C C C T A C T A C T T T T A T T A T C T T T A +C C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T C - A +G - C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T - C T A C G +G G A A T A A C T A T A G A C C G A A T A C C G C +T T T T C G T T T G A G C A G T A T T T A T T A C +A G C A T T A C T T C T C T T A T T A T C T C T A +C C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T C - A +G - C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T - C T A C A +G G T A T A A C T A T A G A C C G A A T A C C G T +T A T T C G T C T G A G C A G T T T T T A T T A C +A G C G C T C T T A C T C C T T T T A T C T T T A +C C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T C - T +G - C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T - C A A C A +G G A A T A A C T A T A G A C C G A A T A C C T C +T T T T C G T C T G A G C G G T A T T T A T T A C +A G C C T T A C T A C T T T T A C T A T C A T T A +C C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T C - A +G - C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T - C T A C A +G G G A T A A C T A T A G A C C G A A T A C C A C +T T T T T G T T T G A G C A G T A T T T A T T A C +A G C T C T A C T C C T A C T G T T A T C T T T A +C C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G A A T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T C C G G T C G G T T T C - T C +- T C C C T T T T C C C G C - - G - G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- - A - G C G G C G A G T A C T G - - - T C A G G +C C T G A A C - A T G G C G C C G G - T T - - T - +- T - - C - - - - - - C T T - - G - G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- - A - A A G G C G T T T A T C G - - - T T G G G +G C C G - A C - A G C G C G T C - G - - T - - G - +- G - - G - - - - - - C T C - - G - G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- - G - C T C G - G G C G A C T G - - - T C A G G +C C T G A A C - A T T C C G T C G G T T T C G A C +- A G A T T T T T C C C G C - - G - G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +- T C - A C G G T G G T C A C T G - - - C C T C G +A T C G G A C - A A T T C A T T G G - - A - - - - +- T - C G - - - - - - T T C G G G - G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- - C - G C G G T G G T T A C T G - - - T T T G G +A C T G G A C - G T T T G G C C G G - - A - - T - +- T - C C - - - - - - T T T - - G - A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T - A - C C G G T G G T T A C T G - T T C G C T G +C C - G A G C - - T T C A G G G G G - - C - - C - +- G - C T - - - - - - G T C - - G - A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G C C - +- - - - G C - A A G C T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - A - - C - +- T - C T - - - - - - C T C - - G T A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A A C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - - C C G G C G G T T A C T G - - - C C T G G +C C T A A A C - A T C - T G C C G G - T T - - T - +- T - C C - - - - - - C T C - - G - G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G - C - C C G G T G G C T A C T G - - - C C C G G +C C T G A A C A A T C T C G C C G G - - T - - T - +- T - T C - - - - - - C T T - - G - A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- - A - C C G G A G G T T A C T G - - - G C A G G +A C C G G A C - G T C T A G C C G G - - C - - C - +- T - C C - - - - - - C T C - - G - T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T - A - C C C G A T G C T A C G G - - - C T C G G +A C T G A A C - A T C A T G C C G G - - T - - T - +C T - T T - - - - - - C T T - - G - G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T - A - A C G G T G G T T A C T G - - - C C T G G +C C T G A A C - A G C C A G C C G G - - T - - T - +- T - C C - - - - - - C T A - - G - A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- - G - C C C G T G T T A A C T G A - - T C - G T +T C C G G A C - G T C C T G C C G G T T T - - T - +- T - C C - T T T - - C T C - - G - G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- - A - C A G G C G G T C A C T G - - - C C A G T +A C T C A A C - A T C C T G C C G G - T T - - T - +- T - C C - - - - - - C T T - - G - G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 6 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 7 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 8 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 9 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run1.dot b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run1.dot new file mode 100644 index 000000000..d7251f859 --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run1.dot @@ -0,0 +1,700 @@ +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 32 [label=57.5]; + 18 -> 24 [label=21.0]; + 18 -> 29 [label=35.5]; + 19 -> 8 [label=49.0]; + 19 -> 23 [label=25.5]; + 20 -> 11 [label=42.5]; + 20 -> 18 [label=18.5]; + 21 -> 9 [label=40.5]; + 21 -> 19 [label=27.0]; + 22 -> 16 [label=51.5]; + 22 -> 25 [label=10.5]; + 23 -> 4 [label=58.5]; + 23 -> 5 [label=33.5]; + 24 -> 21 [label=21.0]; + 24 -> 31 [label=152.5]; + 25 -> 27 [label=16.0]; + 25 -> 28 [label=15.0]; + 26 -> 1 [label=52.5]; + 26 -> 2 [label=90.5]; + 27 -> 12 [label=39.0]; + 27 -> 20 [label=18.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=30.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 0 [label=64.0]; + 30 -> 3 [label=61.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 22 [label=11.5]; + 32 -> 26 [label=27.0]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 20 [label=57.5]; + 18 -> 4 [label=58.5]; + 18 -> 5 [label=33.5]; + 19 -> 21 [label=21.0]; + 19 -> 26 [label=35.5]; + 20 -> 31 [label=27.0]; + 20 -> 32 [label=11.5]; + 21 -> 23 [label=21.0]; + 21 -> 24 [label=152.5]; + 22 -> 11 [label=42.5]; + 22 -> 19 [label=18.5]; + 23 -> 9 [label=40.5]; + 23 -> 27 [label=27.0]; + 24 -> 6 [label=32.5]; + 24 -> 7 [label=174.5]; + 25 -> 12 [label=39.0]; + 25 -> 22 [label=18.5]; + 26 -> 13 [label=40.0]; + 26 -> 14 [label=15.0]; + 27 -> 8 [label=49.0]; + 27 -> 18 [label=25.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=30.5]; + 29 -> 25 [label=16.0]; + 29 -> 28 [label=15.0]; + 30 -> 0 [label=64.0]; + 30 -> 3 [label=61.0]; + 31 -> 1 [label=52.5]; + 31 -> 2 [label=90.5]; + 32 -> 16 [label=51.5]; + 32 -> 29 [label=10.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 24 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 12 [label=39.0]; + 19 -> 26 [label=18.5]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=30.5]; + 21 -> 19 [label=16.0]; + 21 -> 20 [label=15.0]; + 22 -> 16 [label=51.5]; + 22 -> 21 [label=10.5]; + 23 -> 1 [label=52.5]; + 23 -> 2 [label=90.5]; + 24 -> 22 [label=11.5]; + 24 -> 23 [label=27.0]; + 25 -> 29 [label=35.5]; + 25 -> 32 [label=21.0]; + 26 -> 11 [label=42.5]; + 26 -> 25 [label=18.5]; + 27 -> 4 [label=58.5]; + 27 -> 5 [label=33.5]; + 28 -> 8 [label=49.0]; + 28 -> 27 [label=25.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 9 [label=40.5]; + 30 -> 28 [label=27.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 30 [label=21.0]; + 32 -> 31 [label=152.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 24 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 12 [label=39.0]; + 19 -> 26 [label=18.5]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=30.5]; + 21 -> 19 [label=16.0]; + 21 -> 20 [label=15.0]; + 22 -> 1 [label=52.5]; + 22 -> 2 [label=90.5]; + 23 -> 16 [label=51.5]; + 23 -> 21 [label=10.5]; + 24 -> 22 [label=27.0]; + 24 -> 23 [label=11.5]; + 25 -> 29 [label=35.5]; + 25 -> 32 [label=21.0]; + 26 -> 11 [label=42.5]; + 26 -> 25 [label=18.5]; + 27 -> 4 [label=58.5]; + 27 -> 5 [label=33.5]; + 28 -> 8 [label=49.0]; + 28 -> 27 [label=25.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 9 [label=40.5]; + 30 -> 28 [label=27.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 30 [label=21.0]; + 32 -> 31 [label=152.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 22 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 1 [label=52.5]; + 20 -> 2 [label=90.5]; + 21 -> 16 [label=51.5]; + 21 -> 24 [label=10.5]; + 22 -> 20 [label=27.0]; + 22 -> 21 [label=11.5]; + 23 -> 12 [label=39.0]; + 23 -> 32 [label=18.5]; + 24 -> 19 [label=15.0]; + 24 -> 23 [label=16.0]; + 25 -> 4 [label=58.5]; + 25 -> 5 [label=33.5]; + 26 -> 8 [label=49.0]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=40.5]; + 27 -> 26 [label=27.0]; + 28 -> 6 [label=32.5]; + 28 -> 7 [label=174.5]; + 29 -> 27 [label=21.0]; + 29 -> 28 [label=152.5]; + 30 -> 13 [label=40.0]; + 30 -> 14 [label=15.0]; + 31 -> 29 [label=21.0]; + 31 -> 30 [label=35.5]; + 32 -> 11 [label=42.5]; + 32 -> 31 [label=18.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 22 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 16 [label=51.5]; + 20 -> 24 [label=10.5]; + 21 -> 1 [label=52.5]; + 21 -> 2 [label=90.5]; + 22 -> 20 [label=11.5]; + 22 -> 21 [label=27.0]; + 23 -> 12 [label=39.0]; + 23 -> 32 [label=18.5]; + 24 -> 19 [label=15.0]; + 24 -> 23 [label=16.0]; + 25 -> 4 [label=58.5]; + 25 -> 5 [label=33.5]; + 26 -> 8 [label=49.0]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=40.5]; + 27 -> 26 [label=27.0]; + 28 -> 6 [label=32.5]; + 28 -> 7 [label=174.5]; + 29 -> 27 [label=21.0]; + 29 -> 28 [label=152.5]; + 30 -> 13 [label=40.0]; + 30 -> 14 [label=15.0]; + 31 -> 29 [label=21.0]; + 31 -> 30 [label=35.5]; + 32 -> 11 [label=42.5]; + 32 -> 31 [label=18.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 20 [label=52.5]; + 18 -> 5 [label=32.5]; + 18 -> 8 [label=57.0]; + 19 -> 23 [label=19.0]; + 19 -> 26 [label=30.5]; + 20 -> 31 [label=27.5]; + 20 -> 32 [label=10.5]; + 21 -> 24 [label=150.5]; + 21 -> 27 [label=27.0]; + 22 -> 11 [label=40.5]; + 22 -> 19 [label=20.5]; + 23 -> 9 [label=46.5]; + 23 -> 21 [label=20.0]; + 24 -> 6 [label=33.5]; + 24 -> 7 [label=168.5]; + 25 -> 12 [label=40.0]; + 25 -> 22 [label=20.0]; + 26 -> 13 [label=39.5]; + 26 -> 14 [label=14.5]; + 27 -> 4 [label=45.5]; + 27 -> 18 [label=25.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=31.5]; + 29 -> 25 [label=18.0]; + 29 -> 28 [label=17.5]; + 30 -> 0 [label=65.0]; + 30 -> 3 [label=59.0]; + 31 -> 1 [label=51.0]; + 31 -> 2 [label=88.0]; + 32 -> 16 [label=51.5]; + 32 -> 29 [label=8.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 24 [label=52.5]; + 18 -> 0 [label=65.0]; + 18 -> 3 [label=59.0]; + 19 -> 12 [label=40.0]; + 19 -> 26 [label=20.0]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=31.5]; + 21 -> 19 [label=18.0]; + 21 -> 20 [label=17.5]; + 22 -> 1 [label=51.0]; + 22 -> 2 [label=88.0]; + 23 -> 16 [label=51.5]; + 23 -> 21 [label=8.0]; + 24 -> 22 [label=27.5]; + 24 -> 23 [label=10.5]; + 25 -> 29 [label=30.5]; + 25 -> 30 [label=19.0]; + 26 -> 11 [label=40.5]; + 26 -> 25 [label=20.5]; + 27 -> 4 [label=45.5]; + 27 -> 28 [label=25.5]; + 28 -> 5 [label=32.5]; + 28 -> 8 [label=57.0]; + 29 -> 13 [label=39.5]; + 29 -> 14 [label=14.5]; + 30 -> 9 [label=46.5]; + 30 -> 32 [label=20.0]; + 31 -> 6 [label=33.5]; + 31 -> 7 [label=168.5]; + 32 -> 27 [label=27.0]; + 32 -> 31 [label=150.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 22 [label=52.5]; + 18 -> 0 [label=65.0]; + 18 -> 3 [label=59.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=31.5]; + 20 -> 16 [label=51.5]; + 20 -> 24 [label=8.0]; + 21 -> 1 [label=51.0]; + 21 -> 2 [label=88.0]; + 22 -> 20 [label=10.5]; + 22 -> 21 [label=27.5]; + 23 -> 12 [label=40.0]; + 23 -> 32 [label=20.0]; + 24 -> 19 [label=17.5]; + 24 -> 23 [label=18.0]; + 25 -> 5 [label=32.5]; + 25 -> 8 [label=57.0]; + 26 -> 4 [label=45.5]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=46.5]; + 27 -> 29 [label=20.0]; + 28 -> 6 [label=33.5]; + 28 -> 7 [label=168.5]; + 29 -> 26 [label=27.0]; + 29 -> 28 [label=150.5]; + 30 -> 13 [label=39.5]; + 30 -> 14 [label=14.5]; + 31 -> 27 [label=19.0]; + 31 -> 30 [label=30.5]; + 32 -> 11 [label=40.5]; + 32 -> 31 [label=20.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 29 [label=52.5]; + 18 -> 6 [label=33.5]; + 18 -> 7 [label=168.5]; + 19 -> 9 [label=46.5]; + 19 -> 31 [label=20.0]; + 20 -> 13 [label=39.5]; + 20 -> 14 [label=14.5]; + 21 -> 19 [label=19.0]; + 21 -> 20 [label=30.5]; + 22 -> 11 [label=40.5]; + 22 -> 21 [label=20.5]; + 23 -> 0 [label=65.0]; + 23 -> 3 [label=59.0]; + 24 -> 15 [label=48.5]; + 24 -> 23 [label=31.5]; + 25 -> 12 [label=40.0]; + 25 -> 22 [label=20.0]; + 26 -> 24 [label=17.5]; + 26 -> 25 [label=18.0]; + 27 -> 1 [label=51.0]; + 27 -> 2 [label=88.0]; + 28 -> 16 [label=51.5]; + 28 -> 26 [label=8.0]; + 29 -> 27 [label=27.5]; + 29 -> 28 [label=10.5]; + 30 -> 4 [label=45.5]; + 30 -> 32 [label=25.5]; + 31 -> 18 [label=150.5]; + 31 -> 30 [label=27.0]; + 32 -> 5 [label=32.5]; + 32 -> 8 [label=57.0]; +} +//998.0 diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run2.dot b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run2.dot new file mode 100644 index 000000000..d7251f859 --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_run2.dot @@ -0,0 +1,700 @@ +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 32 [label=57.5]; + 18 -> 24 [label=21.0]; + 18 -> 29 [label=35.5]; + 19 -> 8 [label=49.0]; + 19 -> 23 [label=25.5]; + 20 -> 11 [label=42.5]; + 20 -> 18 [label=18.5]; + 21 -> 9 [label=40.5]; + 21 -> 19 [label=27.0]; + 22 -> 16 [label=51.5]; + 22 -> 25 [label=10.5]; + 23 -> 4 [label=58.5]; + 23 -> 5 [label=33.5]; + 24 -> 21 [label=21.0]; + 24 -> 31 [label=152.5]; + 25 -> 27 [label=16.0]; + 25 -> 28 [label=15.0]; + 26 -> 1 [label=52.5]; + 26 -> 2 [label=90.5]; + 27 -> 12 [label=39.0]; + 27 -> 20 [label=18.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=30.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 0 [label=64.0]; + 30 -> 3 [label=61.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 22 [label=11.5]; + 32 -> 26 [label=27.0]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 20 [label=57.5]; + 18 -> 4 [label=58.5]; + 18 -> 5 [label=33.5]; + 19 -> 21 [label=21.0]; + 19 -> 26 [label=35.5]; + 20 -> 31 [label=27.0]; + 20 -> 32 [label=11.5]; + 21 -> 23 [label=21.0]; + 21 -> 24 [label=152.5]; + 22 -> 11 [label=42.5]; + 22 -> 19 [label=18.5]; + 23 -> 9 [label=40.5]; + 23 -> 27 [label=27.0]; + 24 -> 6 [label=32.5]; + 24 -> 7 [label=174.5]; + 25 -> 12 [label=39.0]; + 25 -> 22 [label=18.5]; + 26 -> 13 [label=40.0]; + 26 -> 14 [label=15.0]; + 27 -> 8 [label=49.0]; + 27 -> 18 [label=25.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=30.5]; + 29 -> 25 [label=16.0]; + 29 -> 28 [label=15.0]; + 30 -> 0 [label=64.0]; + 30 -> 3 [label=61.0]; + 31 -> 1 [label=52.5]; + 31 -> 2 [label=90.5]; + 32 -> 16 [label=51.5]; + 32 -> 29 [label=10.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 24 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 12 [label=39.0]; + 19 -> 26 [label=18.5]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=30.5]; + 21 -> 19 [label=16.0]; + 21 -> 20 [label=15.0]; + 22 -> 16 [label=51.5]; + 22 -> 21 [label=10.5]; + 23 -> 1 [label=52.5]; + 23 -> 2 [label=90.5]; + 24 -> 22 [label=11.5]; + 24 -> 23 [label=27.0]; + 25 -> 29 [label=35.5]; + 25 -> 32 [label=21.0]; + 26 -> 11 [label=42.5]; + 26 -> 25 [label=18.5]; + 27 -> 4 [label=58.5]; + 27 -> 5 [label=33.5]; + 28 -> 8 [label=49.0]; + 28 -> 27 [label=25.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 9 [label=40.5]; + 30 -> 28 [label=27.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 30 [label=21.0]; + 32 -> 31 [label=152.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 24 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 12 [label=39.0]; + 19 -> 26 [label=18.5]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=30.5]; + 21 -> 19 [label=16.0]; + 21 -> 20 [label=15.0]; + 22 -> 1 [label=52.5]; + 22 -> 2 [label=90.5]; + 23 -> 16 [label=51.5]; + 23 -> 21 [label=10.5]; + 24 -> 22 [label=27.0]; + 24 -> 23 [label=11.5]; + 25 -> 29 [label=35.5]; + 25 -> 32 [label=21.0]; + 26 -> 11 [label=42.5]; + 26 -> 25 [label=18.5]; + 27 -> 4 [label=58.5]; + 27 -> 5 [label=33.5]; + 28 -> 8 [label=49.0]; + 28 -> 27 [label=25.5]; + 29 -> 13 [label=40.0]; + 29 -> 14 [label=15.0]; + 30 -> 9 [label=40.5]; + 30 -> 28 [label=27.0]; + 31 -> 6 [label=32.5]; + 31 -> 7 [label=174.5]; + 32 -> 30 [label=21.0]; + 32 -> 31 [label=152.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 22 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 1 [label=52.5]; + 20 -> 2 [label=90.5]; + 21 -> 16 [label=51.5]; + 21 -> 24 [label=10.5]; + 22 -> 20 [label=27.0]; + 22 -> 21 [label=11.5]; + 23 -> 12 [label=39.0]; + 23 -> 32 [label=18.5]; + 24 -> 19 [label=15.0]; + 24 -> 23 [label=16.0]; + 25 -> 4 [label=58.5]; + 25 -> 5 [label=33.5]; + 26 -> 8 [label=49.0]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=40.5]; + 27 -> 26 [label=27.0]; + 28 -> 6 [label=32.5]; + 28 -> 7 [label=174.5]; + 29 -> 27 [label=21.0]; + 29 -> 28 [label=152.5]; + 30 -> 13 [label=40.0]; + 30 -> 14 [label=15.0]; + 31 -> 29 [label=21.0]; + 31 -> 30 [label=35.5]; + 32 -> 11 [label=42.5]; + 32 -> 31 [label=18.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=79.0]; + 17 -> 22 [label=57.5]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=61.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 16 [label=51.5]; + 20 -> 24 [label=10.5]; + 21 -> 1 [label=52.5]; + 21 -> 2 [label=90.5]; + 22 -> 20 [label=11.5]; + 22 -> 21 [label=27.0]; + 23 -> 12 [label=39.0]; + 23 -> 32 [label=18.5]; + 24 -> 19 [label=15.0]; + 24 -> 23 [label=16.0]; + 25 -> 4 [label=58.5]; + 25 -> 5 [label=33.5]; + 26 -> 8 [label=49.0]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=40.5]; + 27 -> 26 [label=27.0]; + 28 -> 6 [label=32.5]; + 28 -> 7 [label=174.5]; + 29 -> 27 [label=21.0]; + 29 -> 28 [label=152.5]; + 30 -> 13 [label=40.0]; + 30 -> 14 [label=15.0]; + 31 -> 29 [label=21.0]; + 31 -> 30 [label=35.5]; + 32 -> 11 [label=42.5]; + 32 -> 31 [label=18.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 20 [label=52.5]; + 18 -> 5 [label=32.5]; + 18 -> 8 [label=57.0]; + 19 -> 23 [label=19.0]; + 19 -> 26 [label=30.5]; + 20 -> 31 [label=27.5]; + 20 -> 32 [label=10.5]; + 21 -> 24 [label=150.5]; + 21 -> 27 [label=27.0]; + 22 -> 11 [label=40.5]; + 22 -> 19 [label=20.5]; + 23 -> 9 [label=46.5]; + 23 -> 21 [label=20.0]; + 24 -> 6 [label=33.5]; + 24 -> 7 [label=168.5]; + 25 -> 12 [label=40.0]; + 25 -> 22 [label=20.0]; + 26 -> 13 [label=39.5]; + 26 -> 14 [label=14.5]; + 27 -> 4 [label=45.5]; + 27 -> 18 [label=25.5]; + 28 -> 15 [label=48.5]; + 28 -> 30 [label=31.5]; + 29 -> 25 [label=18.0]; + 29 -> 28 [label=17.5]; + 30 -> 0 [label=65.0]; + 30 -> 3 [label=59.0]; + 31 -> 1 [label=51.0]; + 31 -> 2 [label=88.0]; + 32 -> 16 [label=51.5]; + 32 -> 29 [label=8.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 24 [label=52.5]; + 18 -> 0 [label=65.0]; + 18 -> 3 [label=59.0]; + 19 -> 12 [label=40.0]; + 19 -> 26 [label=20.0]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=31.5]; + 21 -> 19 [label=18.0]; + 21 -> 20 [label=17.5]; + 22 -> 1 [label=51.0]; + 22 -> 2 [label=88.0]; + 23 -> 16 [label=51.5]; + 23 -> 21 [label=8.0]; + 24 -> 22 [label=27.5]; + 24 -> 23 [label=10.5]; + 25 -> 29 [label=30.5]; + 25 -> 30 [label=19.0]; + 26 -> 11 [label=40.5]; + 26 -> 25 [label=20.5]; + 27 -> 4 [label=45.5]; + 27 -> 28 [label=25.5]; + 28 -> 5 [label=32.5]; + 28 -> 8 [label=57.0]; + 29 -> 13 [label=39.5]; + 29 -> 14 [label=14.5]; + 30 -> 9 [label=46.5]; + 30 -> 32 [label=20.0]; + 31 -> 6 [label=33.5]; + 31 -> 7 [label=168.5]; + 32 -> 27 [label=27.0]; + 32 -> 31 [label=150.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 22 [label=52.5]; + 18 -> 0 [label=65.0]; + 18 -> 3 [label=59.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=31.5]; + 20 -> 16 [label=51.5]; + 20 -> 24 [label=8.0]; + 21 -> 1 [label=51.0]; + 21 -> 2 [label=88.0]; + 22 -> 20 [label=10.5]; + 22 -> 21 [label=27.5]; + 23 -> 12 [label=40.0]; + 23 -> 32 [label=20.0]; + 24 -> 19 [label=17.5]; + 24 -> 23 [label=18.0]; + 25 -> 5 [label=32.5]; + 25 -> 8 [label=57.0]; + 26 -> 4 [label=45.5]; + 26 -> 25 [label=25.5]; + 27 -> 9 [label=46.5]; + 27 -> 29 [label=20.0]; + 28 -> 6 [label=33.5]; + 28 -> 7 [label=168.5]; + 29 -> 26 [label=27.0]; + 29 -> 28 [label=150.5]; + 30 -> 13 [label=39.5]; + 30 -> 14 [label=14.5]; + 31 -> 27 [label=19.0]; + 31 -> 30 [label=30.5]; + 32 -> 11 [label=40.5]; + 32 -> 31 [label=20.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 29 [label=52.5]; + 18 -> 6 [label=33.5]; + 18 -> 7 [label=168.5]; + 19 -> 9 [label=46.5]; + 19 -> 31 [label=20.0]; + 20 -> 13 [label=39.5]; + 20 -> 14 [label=14.5]; + 21 -> 19 [label=19.0]; + 21 -> 20 [label=30.5]; + 22 -> 11 [label=40.5]; + 22 -> 21 [label=20.5]; + 23 -> 0 [label=65.0]; + 23 -> 3 [label=59.0]; + 24 -> 15 [label=48.5]; + 24 -> 23 [label=31.5]; + 25 -> 12 [label=40.0]; + 25 -> 22 [label=20.0]; + 26 -> 24 [label=17.5]; + 26 -> 25 [label=18.0]; + 27 -> 1 [label=51.0]; + 27 -> 2 [label=88.0]; + 28 -> 16 [label=51.5]; + 28 -> 26 [label=8.0]; + 29 -> 27 [label=27.5]; + 29 -> 28 [label=10.5]; + 30 -> 4 [label=45.5]; + 30 -> 32 [label=25.5]; + 31 -> 18 [label=150.5]; + 31 -> 30 [label=27.0]; + 32 -> 5 [label=32.5]; + 32 -> 8 [label=57.0]; +} +//998.0 diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.dot b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.dot new file mode 100644 index 000000000..48f65c959 --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.dot @@ -0,0 +1,700 @@ +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=50.0]; + 17 -> 29 [label=47.0]; + 18 -> 0 [label=60.0]; + 18 -> 3 [label=54.0]; + 19 -> 6 [label=33.0]; + 19 -> 7 [label=172.0]; + 20 -> 13 [label=39.5]; + 20 -> 14 [label=9.5]; + 21 -> 9 [label=44.0]; + 21 -> 19 [label=154.0]; + 22 -> 20 [label=32.5]; + 22 -> 31 [label=16.5]; + 23 -> 11 [label=42.0]; + 23 -> 12 [label=42.0]; + 24 -> 22 [label=18.0]; + 24 -> 23 [label=12.0]; + 25 -> 15 [label=47.0]; + 25 -> 18 [label=33.0]; + 26 -> 24 [label=15.0]; + 26 -> 25 [label=18.5]; + 27 -> 16 [label=35.0]; + 27 -> 26 [label=17.0]; + 28 -> 1 [label=45.0]; + 28 -> 2 [label=90.0]; + 29 -> 27 [label=15.0]; + 29 -> 28 [label=25.5]; + 30 -> 4 [label=51.5]; + 30 -> 32 [label=22.0]; + 31 -> 21 [label=23.5]; + 31 -> 30 [label=23.0]; + 32 -> 5 [label=33.0]; + 32 -> 8 [label=51.0]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=50.0]; + 17 -> 22 [label=47.0]; + 18 -> 0 [label=60.0]; + 18 -> 3 [label=54.0]; + 19 -> 15 [label=47.0]; + 19 -> 18 [label=33.0]; + 20 -> 19 [label=18.5]; + 20 -> 29 [label=15.0]; + 21 -> 1 [label=45.0]; + 21 -> 2 [label=90.0]; + 22 -> 21 [label=25.5]; + 22 -> 24 [label=15.0]; + 23 -> 11 [label=42.0]; + 23 -> 12 [label=42.0]; + 24 -> 16 [label=35.0]; + 24 -> 20 [label=17.0]; + 25 -> 6 [label=33.0]; + 25 -> 7 [label=172.0]; + 26 -> 4 [label=51.5]; + 26 -> 27 [label=22.0]; + 27 -> 5 [label=33.0]; + 27 -> 8 [label=51.0]; + 28 -> 9 [label=44.0]; + 28 -> 25 [label=154.0]; + 29 -> 23 [label=12.0]; + 29 -> 31 [label=18.0]; + 30 -> 13 [label=39.5]; + 30 -> 14 [label=9.5]; + 31 -> 30 [label=32.5]; + 31 -> 32 [label=16.5]; + 32 -> 26 [label=23.0]; + 32 -> 28 [label=23.5]; +} +//999.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 22 [label=53.0]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=60.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 19 [label=16.5]; + 20 -> 23 [label=16.5]; + 21 -> 1 [label=51.5]; + 21 -> 2 [label=90.5]; + 22 -> 21 [label=26.0]; + 22 -> 24 [label=12.5]; + 23 -> 12 [label=39.5]; + 23 -> 29 [label=20.0]; + 24 -> 16 [label=52.5]; + 24 -> 20 [label=8.0]; + 25 -> 6 [label=33.5]; + 25 -> 7 [label=168.5]; + 26 -> 4 [label=45.5]; + 26 -> 27 [label=25.5]; + 27 -> 5 [label=32.5]; + 27 -> 8 [label=57.0]; + 28 -> 9 [label=46.5]; + 28 -> 32 [label=19.0]; + 29 -> 11 [label=42.0]; + 29 -> 31 [label=20.0]; + 30 -> 13 [label=40.5]; + 30 -> 14 [label=14.5]; + 31 -> 28 [label=19.0]; + 31 -> 30 [label=30.5]; + 32 -> 25 [label=150.5]; + 32 -> 26 [label=26.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 29 [label=53.0]; + 18 -> 6 [label=33.5]; + 18 -> 7 [label=168.5]; + 19 -> 5 [label=32.5]; + 19 -> 8 [label=57.0]; + 20 -> 4 [label=45.5]; + 20 -> 19 [label=25.5]; + 21 -> 9 [label=46.5]; + 21 -> 23 [label=19.0]; + 22 -> 13 [label=40.5]; + 22 -> 14 [label=14.5]; + 23 -> 18 [label=150.5]; + 23 -> 20 [label=26.0]; + 24 -> 12 [label=39.5]; + 24 -> 26 [label=20.0]; + 25 -> 21 [label=19.0]; + 25 -> 22 [label=30.5]; + 26 -> 11 [label=42.0]; + 26 -> 25 [label=20.0]; + 27 -> 1 [label=51.5]; + 27 -> 2 [label=90.5]; + 28 -> 16 [label=52.5]; + 28 -> 31 [label=8.0]; + 29 -> 27 [label=26.0]; + 29 -> 28 [label=12.5]; + 30 -> 15 [label=48.5]; + 30 -> 32 [label=30.5]; + 31 -> 24 [label=16.5]; + 31 -> 30 [label=16.5]; + 32 -> 0 [label=64.0]; + 32 -> 3 [label=60.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 26 [label=53.0]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=60.0]; + 19 -> 13 [label=40.5]; + 19 -> 14 [label=14.5]; + 20 -> 11 [label=42.0]; + 20 -> 28 [label=20.0]; + 21 -> 12 [label=39.5]; + 21 -> 20 [label=20.0]; + 22 -> 15 [label=48.5]; + 22 -> 18 [label=30.5]; + 23 -> 21 [label=16.5]; + 23 -> 22 [label=16.5]; + 24 -> 16 [label=52.5]; + 24 -> 23 [label=8.0]; + 25 -> 1 [label=51.5]; + 25 -> 2 [label=90.5]; + 26 -> 24 [label=12.5]; + 26 -> 25 [label=26.0]; + 27 -> 9 [label=46.5]; + 27 -> 32 [label=19.0]; + 28 -> 19 [label=30.5]; + 28 -> 27 [label=19.0]; + 29 -> 5 [label=32.5]; + 29 -> 8 [label=57.0]; + 30 -> 6 [label=33.5]; + 30 -> 7 [label=168.5]; + 31 -> 4 [label=45.5]; + 31 -> 29 [label=25.5]; + 32 -> 30 [label=150.5]; + 32 -> 31 [label=26.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 24 [label=53.0]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=60.0]; + 19 -> 12 [label=39.5]; + 19 -> 26 [label=20.0]; + 20 -> 15 [label=48.5]; + 20 -> 18 [label=30.5]; + 21 -> 19 [label=16.5]; + 21 -> 20 [label=16.5]; + 22 -> 16 [label=52.5]; + 22 -> 21 [label=8.0]; + 23 -> 1 [label=51.5]; + 23 -> 2 [label=90.5]; + 24 -> 22 [label=12.5]; + 24 -> 23 [label=26.0]; + 25 -> 31 [label=19.0]; + 25 -> 32 [label=30.5]; + 26 -> 11 [label=42.0]; + 26 -> 25 [label=20.0]; + 27 -> 5 [label=32.5]; + 27 -> 8 [label=57.0]; + 28 -> 6 [label=33.5]; + 28 -> 7 [label=168.5]; + 29 -> 4 [label=45.5]; + 29 -> 27 [label=25.5]; + 30 -> 28 [label=150.5]; + 30 -> 29 [label=26.0]; + 31 -> 9 [label=46.5]; + 31 -> 30 [label=19.0]; + 32 -> 13 [label=40.5]; + 32 -> 14 [label=14.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 22 [label=53.0]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=60.0]; + 19 -> 15 [label=48.5]; + 19 -> 18 [label=30.5]; + 20 -> 16 [label=52.5]; + 20 -> 24 [label=8.0]; + 21 -> 1 [label=51.5]; + 21 -> 2 [label=90.5]; + 22 -> 20 [label=12.5]; + 22 -> 21 [label=26.0]; + 23 -> 12 [label=39.5]; + 23 -> 32 [label=20.0]; + 24 -> 19 [label=16.5]; + 24 -> 23 [label=16.5]; + 25 -> 5 [label=32.5]; + 25 -> 8 [label=57.0]; + 26 -> 6 [label=33.5]; + 26 -> 7 [label=168.5]; + 27 -> 4 [label=45.5]; + 27 -> 25 [label=25.5]; + 28 -> 26 [label=150.5]; + 28 -> 27 [label=26.0]; + 29 -> 9 [label=46.5]; + 29 -> 28 [label=19.0]; + 30 -> 13 [label=40.5]; + 30 -> 14 [label=14.5]; + 31 -> 29 [label=19.0]; + 31 -> 30 [label=30.5]; + 32 -> 11 [label=42.0]; + 32 -> 31 [label=20.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 26 [label=53.0]; + 18 -> 13 [label=40.5]; + 18 -> 14 [label=14.5]; + 19 -> 11 [label=42.0]; + 19 -> 28 [label=20.0]; + 20 -> 0 [label=64.0]; + 20 -> 3 [label=60.0]; + 21 -> 15 [label=48.5]; + 21 -> 20 [label=30.5]; + 22 -> 12 [label=39.5]; + 22 -> 19 [label=20.0]; + 23 -> 21 [label=16.5]; + 23 -> 22 [label=16.5]; + 24 -> 1 [label=51.5]; + 24 -> 2 [label=90.5]; + 25 -> 16 [label=52.5]; + 25 -> 23 [label=8.0]; + 26 -> 24 [label=26.0]; + 26 -> 25 [label=12.5]; + 27 -> 9 [label=46.5]; + 27 -> 32 [label=19.0]; + 28 -> 18 [label=30.5]; + 28 -> 27 [label=19.0]; + 29 -> 6 [label=33.5]; + 29 -> 7 [label=168.5]; + 30 -> 5 [label=32.5]; + 30 -> 8 [label=57.0]; + 31 -> 4 [label=45.5]; + 31 -> 30 [label=25.5]; + 32 -> 29 [label=150.5]; + 32 -> 31 [label=26.0]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 27 [label=53.0]; + 18 -> 0 [label=64.0]; + 18 -> 3 [label=60.0]; + 19 -> 13 [label=40.5]; + 19 -> 14 [label=14.5]; + 20 -> 19 [label=30.5]; + 20 -> 29 [label=19.0]; + 21 -> 11 [label=42.0]; + 21 -> 20 [label=20.0]; + 22 -> 12 [label=39.5]; + 22 -> 21 [label=20.0]; + 23 -> 15 [label=48.5]; + 23 -> 18 [label=30.5]; + 24 -> 22 [label=16.5]; + 24 -> 23 [label=16.5]; + 25 -> 1 [label=51.5]; + 25 -> 2 [label=90.5]; + 26 -> 16 [label=52.5]; + 26 -> 24 [label=8.0]; + 27 -> 25 [label=26.0]; + 27 -> 26 [label=12.5]; + 28 -> 31 [label=150.5]; + 28 -> 32 [label=26.0]; + 29 -> 9 [label=46.5]; + 29 -> 28 [label=19.0]; + 30 -> 5 [label=32.5]; + 30 -> 8 [label=57.0]; + 31 -> 6 [label=33.5]; + 31 -> 7 [label=168.5]; + 32 -> 4 [label=45.5]; + 32 -> 30 [label=25.5]; +} +//998.0 +digraph G { + rankdir = LR; node [ shape = none]; + + 0 [label=Alentus]; + 1 [label=Amblypygid]; + 2 [label=Americhernus]; + 3 [label=Aportus]; + 4 [label=Artemia]; + 5 [label=Centruoides]; + 6 [label=Chanbria]; + 7 [label=Gea]; + 8 [label=Hadrurus]; + 9 [label=Hypochilus]; + 10 [label=Limulus]; + 11 [label=Mastigoproctus]; + 12 [label=Pauroctonus]; + 13 [label=Rhiphicephalus]; + 14 [label=Thelichoris]; + 15 [label=Thermobius]; + 16 [label=Vonones]; + 17 [label=HTU17]; + 18 [label=HTU18]; + 19 [label=HTU19]; + 20 [label=HTU20]; + 21 [label=HTU21]; + 22 [label=HTU22]; + 23 [label=HTU23]; + 24 [label=HTU24]; + 25 [label=HTU25]; + 26 [label=HTU26]; + 27 [label=HTU27]; + 28 [label=HTU28]; + 29 [label=HTU29]; + 30 [label=HTU30]; + 31 [label=HTU31]; + 32 [label=HTU32]; + 17 -> 10 [label=77.0]; + 17 -> 26 [label=53.0]; + 18 -> 13 [label=40.5]; + 18 -> 14 [label=14.5]; + 19 -> 11 [label=42.0]; + 19 -> 28 [label=20.0]; + 20 -> 0 [label=64.0]; + 20 -> 3 [label=60.0]; + 21 -> 15 [label=48.5]; + 21 -> 20 [label=30.5]; + 22 -> 12 [label=39.5]; + 22 -> 19 [label=20.0]; + 23 -> 21 [label=16.5]; + 23 -> 22 [label=16.5]; + 24 -> 16 [label=52.5]; + 24 -> 23 [label=8.0]; + 25 -> 1 [label=51.5]; + 25 -> 2 [label=90.5]; + 26 -> 24 [label=12.5]; + 26 -> 25 [label=26.0]; + 27 -> 9 [label=46.5]; + 27 -> 31 [label=19.0]; + 28 -> 18 [label=30.5]; + 28 -> 27 [label=19.0]; + 29 -> 5 [label=32.5]; + 29 -> 8 [label=57.0]; + 30 -> 6 [label=33.5]; + 30 -> 7 [label=168.5]; + 31 -> 30 [label=150.5]; + 31 -> 32 [label=26.0]; + 32 -> 4 [label=45.5]; + 32 -> 29 [label=25.5]; +} +//998.0 diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.pg b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.pg new file mode 100644 index 000000000..46ad7056e --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.pg @@ -0,0 +1,10 @@ +--Chel search +set(seed:73412305) +set(outgroup:"Limulus") +read(nucleotide:"chel_16S.fas") +read(nucleotide:"chel_cox1.fas") +read(nucleotide:"chel_12S.fas") +search(minutes:30) +report("chel_search1.tre", newick, graphs, overwrite) +report("chel_search1.ss", ia, overwrite) +report("chel_search1.dot", graphs, dotpdf, overwrite) diff --git a/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.ss b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.ss new file mode 100644 index 000000000..8f212eb13 --- /dev/null +++ b/tutorials/tutorial_1/Tutorial1_phygfiles/search1/chel_search1.ss @@ -0,0 +1,4320 @@ +Implied Alignments for Graph 0 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C T C T A G A G G A A T A +G T A G A A A G T G G A G T G G G A A C A G G C T +G A A C T G T C T A T C C T C C T T T A T - C - T +G C T G G A A T T G C A C A T G C T G G T G C C T +C A G T A G A T T T A G G A A T C T T T T C T T T +A C A C C T A G C A G G A G T T T C A T C C A T C +T T A G G G G C C G T A A A C T T T A T A A C T A +C C G T T A T C A A T A T A C G A A G A A - C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T A C T C C T C T C T A G A G G G A T A +G T G G A A A G A G G T G T A G G T A C A G G A T +G A A C T G T T T A C C C C C C T C T A T - C - G +G C T A G C A T C G C A C A C G C C G G T G C T T +C T G T T G A T A T G G G A A T T T T C T C G C T +C C A T T T G G C C G G G G T T T C T T C A A T T +T T A G G G G C C G T T A A T T T T A T A A C T A +C T G T T A T C A A T A T A C G A A G A A - C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T C C T A T T A T C G A G A G G G A T A +G T T G A A A C A G G G G T T G G T A C T G G A T +G G A C A G T A T A C C C C C C C T T A T - C - G +G C C G G G A T C G C C C A C G C A G G G G C C T +C A G T T G A C C T G G C A A T C T T T T C T C T +C C A C T T A G C T G G G G T T T C T T C T A T T +C T A G G A G C A G T T A A T T T T A T A A C A A +C C G T T A T C A A T A T A C G A A G T G - T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A T C T A G T G G T A T G +G T T G A A A C A G G A G T T G G G A C T G G T T +G A A C A G T T T A C C C C C C T T T A G - C - C +G C C G G A A T T G C T C A C G C A G G A G C T T +C G G T A G A T C T T G C T A T T T T C T C A C T +C C A C T T A G C C G G G G T T T C T T C G A T T +T T A G G A G C A G T A A A C T T T A T A A C T A +C C G T T A T T A A T A T A C G A A G A A - C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T T C A A G A G G A A T A +G T A G A A A G A G G A G T A G G A A C G G G A T +G A A C C G T T T A T C C T C C T C T T G - C - A +G C C G G A A T T G C T C A T G C C G G A G C T T +C A G T T G A T A T A G G A A T C T T C T C T C T +A C A T T T A G C A G G A G T A T C C T C T A T T +T T A G G G G C C G T A A A T T T T A T A A C A A +C A G T G A T T A A C A T A C G G G C A T - C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T T C G A G T G G A A T G +G T A G A A A G A G G G G T A G G T A C A G G A T +G A A C T G T T T A T C C C C C T C T T G - C - A +G C C G G A A T T G C T C A T G C T G G A G C T T +C C G T T G A C A T A G G A A T T T T C T C T C T +A C A T T T A G C A G G A G T G T C T T C T A T T +T T A G G G G C C G T A A A C T T C A T A A C T A +C A G T A A T T A A T A T A C G A G C A T - C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A T C A A G A G G A A T G +G T A G A A A G A G G A G T C G G T A C A G G A T +G A A C T G T G T A T C C C C C G C T A G - C - A +G C A G G A A T T G C C C A C G C T G G A G C C T +C A G T T G A C A T A G G A A T T T T T T C A C T +A C A C C T C G C C G G A G T C T C A T C A A T T +C T A G G T G C A G T T A A T T T T A T A A C A A +C T G T T A T T A A C A T A C G - C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T T C A A G A G G A A T A +G T A G A A A G A G G A G T A G G A A C A G G A T +G A A C A G T T T A C C C C C C T C T A G - C - A +G C A G G A A T T G C T C A T G C T G G A G C T T +C A G T T G A T A T A G G A A T T T T C T C G C T +A C A C C T T G C A G G G G T C T C A T C A A T C +T T A G G A G C A G T T A A T T T C A T G A C A A +C A G T T A T T A A T A T G C G - C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T T T A A G A G G A A T A +G T T G A A A G A G G A G T A G G G A C C G G C T +G A A C T G T C T A T C C T C C T T T A G - C T A +G - A G G A A T T G C C C A T G C A G G A G C C T +C G G T A G A C A T A G G A A T T T T T T C T C T +A C A C T T A G C T G G A G T T T C A T C C A T T +T T A G G G G C C G T A A A C T T T A T A A C A A +C A G T G A T T A A T A T A C G A T C A T - C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C T C A A G C G G A A T A +G T C G A A A G A G G G G T A G G A A C C G G C T +G G A C A G T A T A C C C T C C T C T A G C C - A +G - A G G A A T C G C T C A T G C A G G T G C T T +C T G T A G A T A T A G G A A T C T T C T C G C T +T C A C C T A G C A G G A G T C T C A T C A A T C +C T A G G G G C C G T A A A C T T T A T A A C A A +C A G T A A T C A A T A T A C G A T C C T - C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T C C T C T T A T C T A G A G G A A T A +G T A G A A A G A G G T G T A G G A A C T G G A T +G A A C T G T A T A T C C T C C T T T A T - C - T +G C A A G T A T T G C C C A T G C G G G A G C T T +C C G T T G A T C T C G G A A T C T T C T C A C T +A C A C T T A G C A G G T G T A T C A T C A A T T +T T A G G T G C C G T A A A T T T T A T A A C A A +C T G T T A T C A A T A T A C G A T C T A - C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T T C C A G A G G T A T A +G T T G A A A G A G G G G T A G G A A C A G G A T +G A A C A G T C T A T C C T C C T T T A T - C - A +G C T A G A A T T G C T C A T G C C G G A G C T T +C A G T A G A C T T A G G T A T T T T C T C G C T +G C A T C T A G C T G G T G T C T C G T C T A T C +T T A G G A G C T G T C A A T T T T A T A A C A A +C T G T T A T C A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T T C T A G A G G T A T A +G T T G A A A G A G G A G T A G G A A C A G G A T +G A A C T G T T T A C C C T C C T T T A T - C - A +G C C A G T A T T G C T C A T G C T G G G G C T T +C G G T A G A T T T A G G A A T T T T C T C C C T +A C A C T T G G C A G G T G T T T C T T C A A T T +T T A G G A G C T G T A A A T T T T A T G A C A A +C T G T T A T T A A C A T A C G A T C T A - C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A T C T A G A G G A A T A +G T T G A A A G A G G A G T T G G A A C A G G A T +G A A C T G T A T A T C C T C C C C T A T - C - A +G C T A G A A T C G C A C A T G C A G G G G C C T +C T G T A G A T T T G G G A A T T T T T T C A T T +G C A T T T A G C A G G A G T T T C A T C A A T T +C T A G G G G C C G T A A A T T T C A T A A C A A +C T G T A A T T A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A T C T A G A G G A A T A +G T T G A A A G A G G A G T T G G A A C A G G A T +G A A C T G T A T A T C C T C C C C T A T - C - A +G C T A G A A T C G C A C A T G C A G G G G C C T +C T G T A G A T T T G G G A A T T T T T T C A T T +G C A T T T A G C A G G A G T T T C A T C A A T T +C T A G G G G C C G T A A A T T T C A T A A C A A +C T G T A A T T A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T T C T A G A G G T A T A +G T C G A A A G A G G G G T A G G A A C A G G A T +G A A C T G T T T A C C C T C C T C T A T - C - T +G C C A G C A T T G C C C A T G C A G G A G C A T +C T G T A G A T C T A G G A A T C T T C T C A T T +A C A T T T A G C A G G G G T T T C T T C A A T T +C T A G G T G C T G T A A A T T T T A T A A C T A +C C G T T A T C A A T A T A C G A T C A A - C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T T T A T C T A G A G G T A T A +G T C G A A A G A G G A G T G G G A A C T G G A T +G A A C A G T A T A C C C T C C T T T A T - C - A +G C C A G A A T T G C T C A C G C A G G T G C T T +C A G T T G A T C T A G G T A T T T T T T C A T T +A C A T T T A G C A G G G G T C T C A T C A A T C +T T A G G A G C T G T A A A C T T T A T A A C G A +C C G T T A T C A A T A T A C G A T C T A - C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C - A C C +- - G A A - T C G - G G C G A C T G - - - T - C A +G G C C T G A A C - - A T C C G G T C G G T T T C +- T C - T C C - C T T T T C C C G - - C G G T G C +T C T T C G T T G A G - T G - T C G C G - G G A G +G C C G A C A A G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- - - G - - T - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C - G C C +T - A - G - C G G C G A G T A C T G - - - T - C A +G G C C T G A A C - - A T G G C G C C G G - T T - +- T - - T - - - C - - - - - - C T - - T G G T T C +T C T T T A C T G A G - T G - T C T T G - G G C G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - - - G - +- C T G - - T - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C - G T T +G - A - A - A G G C G T T T A T C G - - - T - T G +G G G C C G - A C - - A G C G C G T C - G - - T - +- G - - G - - - G - - - - - - C T - - C G G T T G +G C C T T A A A A A G C T G A T C - G G - G T T C +T C C G G C A A T T T T A C T T T G A A A A A A T +T A G G G T G C T C A A A G C A - - G - - - - G C +- C T G - - G - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C - A C C +- - G - C - T C G - G G C G A C T G - - - T - C A +G G C C T G A A C - - A T T C C G T C G G T T T C +G A C - A G A - T T T T T C C C G - - C G G T G C +T C T T C G G T G A G - T G - T C G C - - G G A G +G C C G A C A A G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- - - G - - T - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C - G C C +- T - - C A C G G T G G T C A C T G - - - C - C T +C G A T C G G A C - A A T T C A - T T G G - - A - +- - - - T - C - G - - - - - - T T C G G G G T G C +T C T T A A C C G A G - T G - T C C T G - G G T G +G C C G A T A C G T T T A C T T T G A A C A A A T +T A G A G T G C T T A A A G C A - G G - - T - G - +- C - A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C C +- - - - G - C G G T G G T T A C T G - - - T - T T +G G A C T G G A C - - G T T T G G C C G G - - A - +- T - - T - C - C - - - - - - T T - - T G A T G C +T C T T T G C C G A G - T G - T C T T G - G G T G +T C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- T - A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C - G C C +- T A - C - C G G T G G T T A C T G - T T C G C T +- G C C - G A G C - - - T T C A G G G G G - - C - +- C - - G - C - T - - - - - - G T - - C G A T G A +T C T T C A T C G G T - T A - T C T T C - C G T A +A C C C T C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- C - G - - A - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C - G C C +- - - - G - C - A A G C T T A C T G - - - G - C A +G G A C C G G A C - - G T C T A G C C G G - - A - +- C - - T - C T C - - - - - - T C - - G T A T C C +T C T T C A C C G G G - T G - T C T T G - G G T G +T C C G G C A A T T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +C G - A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C - G C C +- T A - A - C G G T G G T T A C T G - - - C - C T +G G C C T G A A C - - A A C C A G C C G G - - T - +- T - - T - C - C - - - - - - C T - - A G A T G A +T C T T C A T T G A T - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- T - G - - A - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C - G C T +T - - - C - C G G C G G T T A C T G - - - C - C T +G G C C T A A A C - - A T C - T G C C G G - T T - +- T - - T - C - C - - - - - - C T - - C G G T G C +C C T T G A T T G A G - T G - T C T T G - G G T G +G C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- C - A - - A - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C - A C C +G - C - C - C G G T G G C T A C T G - - - C - C C +G G C C T G A A C A - A T C T C G C C G G - - T - +- T - - T - T - C - - - - - - C T - - T G A T T C +T C T T C A C C G A G - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T T A A A G C A - - G - - C - G - +- T - A - - A - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C - - G C +- - A - C - C G G A G G T T A C T G - - - G - C A +G G A C C G G A C - - G T C T A G C C G G - - C - +- C - - T - C - C - - - - - - C T - - C G T T G C +T C T T C A C C G G G - T G - T C T T G - G G T G +T C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- A - C - - C - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C - A T C +- T A - C - C C G A T G C T A C G G - - - C - T C +G G A C T G A A C - - A T C A T G C C G G - - T - +- T - C T - T - T - - - - - - C T - - T G G T G C +A C T T C A T T G T G - T G - C C T C G A G A T G +G C C G G T G C T T T T A C T T T G A A A A A A T +T A G A G T G C T C A A C G C A G - G - - C - G - +- A - G - - T - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C - G C C +- T A - A - C G G T G G T T A C T G - - - C - C T +G G C C T G A A C - - A G C C A G C C G G - - T - +- T - - T - C - C - - - - - - C T - - A G A T G A +T C T T T A C C G G T - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - - - +- T - G - - A - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C - G C C +- - G - C - C C G T G T T A A C T G A - - T - C - +G T T C C G G A C - - G T C C T G C C G G T T T - +- T - - T - C - C - T T T - - C T - - C G G T G C +T C T T C A T T G A G - T G - T C T T G - A T T G +G C C G G C A C G T T T A C T T T G A A C A A A T +T A G A G T G C T C A A A G C A G - G T C C A G - +- T - G - - T - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C - G C C +T - A - C - A G G C G G T C A C T G - - - C - C A +G T A C T C A A C - - A T C C T G C C G G - T T - +- T - - T - C - C - - - - - - C T - - T G G T G C +T C T T C G C T G A G - T G - T C T C G - G G T G +G C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- C - G - - T - A G C C + +Implied Alignments for Graph 1 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C T C T A G A G G A A T A +G T A G A A A G T G G A G T G G G A A C A G G C T +G A A C T G T C T A T C C T C C T T T A T - C - T +G C T G G A A T T G C A C A T G C T G G T G C C T +C A G T A G A T T T A G G A A T C T T T T C T T T +A C A C C T A G C A G G A G T T T C A T C C A T C +T T A G G G G C C G T A A A C T T T A T A A C T A +C C G T T A T C A A T A T A C G A A G A A - C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T A C T C C T C T C T A G A G G G A T A +G T G G A A A G A G G T G T A G G T A C A G G A T +G A A C T G T T T A C C C C C C T C T A T - C - G +G C T A G C A T C G C A C A C G C C G G T G C T T +C T G T T G A T A T G G G A A T T T T C T C G C T +C C A T T T G G C C G G G G T T T C T T C A A T T +T T A G G G G C C G T T A A T T T T A T A A C T A +C T G T T A T C A A T A T A C G A A G A A - C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T C C T A T T A T C G A G A G G G A T A +G T T G A A A C A G G G G T T G G T A C T G G A T +G G A C A G T A T A C C C C C C C T T A T - C - G +G C C G G G A T C G C C C A C G C A G G G G C C T +C A G T T G A C C T G G C A A T C T T T T C T C T +C C A C T T A G C T G G G G T T T C T T C T A T T +C T A G G A G C A G T T A A T T T T A T A A C A A +C C G T T A T C A A T A T A C G A A G T G - T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A T C T A G T G G T A T G +G T T G A A A C A G G A G T T G G G A C T G G T T +G A A C A G T T T A C C C C C C T T T A G - C - C +G C C G G A A T T G C T C A C G C A G G A G C T T +C G G T A G A T C T T G C T A T T T T C T C A C T +C C A C T T A G C C G G G G T T T C T T C G A T T +T T A G G A G C A G T A A A C T T T A T A A C T A +C C G T T A T T A A T A T A C G A A G A A - C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T T C A A G A G G A A T A +G T A G A A A G A G G A G T A G G A A C G G G A T +G A A C C G T T T A T C C T C C T C T T G - C - A +G C C G G A A T T G C T C A T G C C G G A G C T T +C A G T T G A T A T A G G A A T C T T C T C T C T +A C A T T T A G C A G G A G T A T C C T C T A T T +T T A G G G G C C G T A A A T T T T A T A A C A A +C A G T G A T T A A C A T A C G G G C A T - C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T T C G A G T G G A A T G +G T A G A A A G A G G G G T A G G T A C A G G A T +G A A C T G T T T A T C C C C C T C T T G - C - A +G C C G G A A T T G C T C A T G C T G G A G C T T +C C G T T G A C A T A G G A A T T T T C T C T C T +A C A T T T A G C A G G A G T G T C T T C T A T T +T T A G G G G C C G T A A A C T T C A T A A C T A +C A G T A A T T A A T A T A C G A G C A T - C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A T C A A G A G G A A T G +G T A G A A A G A G G A G T C G G T A C A G G A T +G A A C T G T G T A T C C C C C G C T A G - C - A +G C A G G A A T T G C C C A C G C T G G A G C C T +C A G T T G A C A T A G G A A T T T T T T C A C T +A C A C C T C G C C G G A G T C T C A T C A A T T +C T A G G T G C A G T T A A T T T T A T A A C A A +C T G T T A T T A A C A T A C G - C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T T C A A G A G G A A T A +G T A G A A A G A G G A G T A G G A A C A G G A T +G A A C A G T T T A C C C C C C T C T A G - C - A +G C A G G A A T T G C T C A T G C T G G A G C T T +C A G T T G A T A T A G G A A T T T T C T C G C T +A C A C C T T G C A G G G G T C T C A T C A A T C +T T A G G A G C A G T T A A T T T C A T G A C A A +C A G T T A T T A A T A T G C G - C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T T T A A G A G G A A T A +G T T G A A A G A G G A G T A G G G A C C G G C T +G A A C T G T C T A T C C T C C T T T A G - C T A +G - A G G A A T T G C C C A T G C A G G A G C C T +C G G T A G A C A T A G G A A T T T T T T C T C T +A C A C T T A G C T G G A G T T T C A T C C A T T +T T A G G G G C C G T A A A C T T T A T A A C A A +C A G T G A T T A A T A T A C G A T C A T - C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C T C A A G C G G A A T A +G T C G A A A G A G G G G T A G G A A C C G G C T +G G A C A G T A T A C C C T C C T C T A G C C - A +G - A G G A A T C G C T C A T G C A G G T G C T T +C T G T A G A T A T A G G A A T C T T C T C G C T +T C A C C T A G C A G G A G T C T C A T C A A T C +C T A G G G G C C G T A A A C T T T A T A A C A A +C A G T A A T C A A T A T A C G A T C C T - C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T C C T C T T A T C T A G A G G A A T A +G T A G A A A G A G G T G T A G G A A C T G G A T +G A A C T G T A T A T C C T C C T T T A T - C - T +G C A A G T A T T G C C C A T G C G G G A G C T T +C C G T T G A T C T C G G A A T C T T C T C A C T +A C A C T T A G C A G G T G T A T C A T C A A T T +T T A G G T G C C G T A A A T T T T A T A A C A A +C T G T T A T C A A T A T A C G A T C T A - C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T T C C A G A G G T A T A +G T T G A A A G A G G G G T A G G A A C A G G A T +G A A C A G T C T A T C C T C C T T T A T - C - A +G C T A G A A T T G C T C A T G C C G G A G C T T +C A G T A G A C T T A G G T A T T T T C T C G C T +G C A T C T A G C T G G T G T C T C G T C T A T C +T T A G G A G C T G T C A A T T T T A T A A C A A +C T G T T A T C A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T T C T A G A G G T A T A +G T T G A A A G A G G A G T A G G A A C A G G A T +G A A C T G T T T A C C C T C C T T T A T - C - A +G C C A G T A T T G C T C A T G C T G G G G C T T +C G G T A G A T T T A G G A A T T T T C T C C C T +A C A C T T G G C A G G T G T T T C T T C A A T T +T T A G G A G C T G T A A A T T T T A T G A C A A +C T G T T A T T A A C A T A C G A T C T A - C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A T C T A G A G G A A T A +G T T G A A A G A G G A G T T G G A A C A G G A T +G A A C T G T A T A T C C T C C C C T A T - C - A +G C T A G A A T C G C A C A T G C A G G G G C C T +C T G T A G A T T T G G G A A T T T T T T C A T T +G C A T T T A G C A G G A G T T T C A T C A A T T +C T A G G G G C C G T A A A T T T C A T A A C A A +C T G T A A T T A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A T C T A G A G G A A T A +G T T G A A A G A G G A G T T G G A A C A G G A T +G A A C T G T A T A T C C T C C C C T A T - C - A +G C T A G A A T C G C A C A T G C A G G G G C C T +C T G T A G A T T T G G G A A T T T T T T C A T T +G C A T T T A G C A G G A G T T T C A T C A A T T +C T A G G G G C C G T A A A T T T C A T A A C A A +C T G T A A T T A A T A T A C G A T C T A - C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T T C T A G A G G T A T A +G T C G A A A G A G G G G T A G G A A C A G G A T +G A A C T G T T T A C C C T C C T C T A T - C - T +G C C A G C A T T G C C C A T G C A G G A G C A T +C T G T A G A T C T A G G A A T C T T C T C A T T +A C A T T T A G C A G G G G T T T C T T C A A T T +C T A G G T G C T G T A A A T T T T A T A A C T A +C C G T T A T C A A T A T A C G A T C A A - C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T T T A T C T A G A G G T A T A +G T C G A A A G A G G A G T G G G A A C T G G A T +G A A C A G T A T A C C C T C C T T T A T - C - A +G C C A G A A T T G C T C A C G C A G G T G C T T +C A G T T G A T C T A G G T A T T T T T T C A T T +A C A T T T A G C A G G G G T C T C A T C A A T C +T T A G G A G C T G T A A A C T T T A T A A C G A +C C G T T A T C A A T A T A C G A T C T A - C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C - A C C +- - G A A - T C G - G G C G A C T G - - - T - C A +G G C C T G A A C - - A T C C G G T C G G T T T C +- T C - T C C - C T T T T C C C G - - C G G T G C +T C T T C G T T G A G - T G - T C G C G - G G A G +G C C G A C A A G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- - - G - - T - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C - G C C +T - A - G - C G G C G A G T A C T G - - - T - C A +G G C C T G A A C - - A T G G C G C C G G - T T - +- T - - T - - - C - - - - - - C T - - T G G T T C +T C T T T A C T G A G - T G - T C T T G - G G C G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - - - G - +- C T G - - T - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C - G T T +G - A - A - A G G C G T T T A T C G - - - T - T G +G G G C C G - A C - - A G C G C G T C - G - - T - +- G - - G - - - G - - - - - - C T - - C G G T T G +G C C T T A A A A A G C T G A T C - G G - G T T C +T C C G G C A A T T T T A C T T T G A A A A A A T +T A G G G T G C T C A A A G C A - - G - - - - G C +- C T G - - G - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C - A C C +- - G - C - T C G - G G C G A C T G - - - T - C A +G G C C T G A A C - - A T T C C G T C G G T T T C +G A C - A G A - T T T T T C C C G - - C G G T G C +T C T T C G G T G A G - T G - T C G C - - G G A G +G C C G A C A A G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- - - G - - T - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C - G C C +- T - - C A C G G T G G T C A C T G - - - C - C T +C G A T C G G A C - A A T T C A - T T G G - - A - +- - - - T - C - G - - - - - - T T C G G G G T G C +T C T T A A C C G A G - T G - T C C T G - G G T G +G C C G A T A C G T T T A C T T T G A A C A A A T +T A G A G T G C T T A A A G C A - G G - - T - G - +- C - A - C C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C C +- - - - G - C G G T G G T T A C T G - - - T - T T +G G A C T G G A C - - G T T T G G C C G G - - A - +- T - - T - C - C - - - - - - T T - - T G A T G C +T C T T T G C C G A G - T G - T C T T G - G G T G +T C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- T - A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C - G C C +- T A - C - C G G T G G T T A C T G - T T C G C T +- G C C - G A G C - - - T T C A G G G G G - - C - +- C - - G - C - T - - - - - - G T - - C G A T G A +T C T T C A T C G G T - T A - T C T T C - C G T A +A C C C T C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- C - G - - A - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C - G C C +- - - - G - C - A A G C T T A C T G - - - G - C A +G G A C C G G A C - - G T C T A G C C G G - - A - +- C - - T - C T C - - - - - - T C - - G T A T C C +T C T T C A C C G G G - T G - T C T T G - G G T G +T C C G G C A A T T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +C G - A T - C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C - G C C +- T A - A - C G G T G G T T A C T G - - - C - C T +G G C C T G A A C - - A A C C A G C C G G - - T - +- T - - T - C - C - - - - - - C T - - A G A T G A +T C T T C A T T G A T - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- T - G - - A - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C - G C T +T - - - C - C G G C G G T T A C T G - - - C - C T +G G C C T A A A C - - A T C - T G C C G G - T T - +- T - - T - C - C - - - - - - C T - - C G G T G C +C C T T G A T T G A G - T G - T C T T G - G G T G +G C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- C - A - - A - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C - A C C +G - C - C - C G G T G G C T A C T G - - - C - C C +G G C C T G A A C A - A T C T C G C C G G - - T - +- T - - T - T - C - - - - - - C T - - T G A T T C +T C T T C A C C G A G - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T T A A A G C A - - G - - C - G - +- T - A - - A - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C - - G C +- - A - C - C G G A G G T T A C T G - - - G - C A +G G A C C G G A C - - G T C T A G C C G G - - C - +- C - - T - C - C - - - - - - C T - - C G T T G C +T C T T C A C C G G G - T G - T C T T G - G G T G +T C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A - - G - - C - G - +- A - C - - C - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C - A T C +- T A - C - C C G A T G C T A C G G - - - C - T C +G G A C T G A A C - - A T C A T G C C G G - - T - +- T - C T - T - T - - - - - - C T - - T G G T G C +A C T T C A T T G T G - T G - C C T C G A G A T G +G C C G G T G C T T T T A C T T T G A A A A A A T +T A G A G T G C T C A A C G C A G - G - - C - G - +- A - G - - T - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C - G C C +- T A - A - C G G T G G T T A C T G - - - C - C T +G G C C T G A A C - - A G C C A G C C G G - - T - +- T - - T - C - C - - - - - - C T - - A G A T G A +T C T T T A C C G G T - T G - T C T T G - G G T G +A C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - - - +- T - G - - A - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C - G C C +- - G - C - C C G T G T T A A C T G A - - T - C - +G T T C C G G A C - - G T C C T G C C G G T T T - +- T - - T - C - C - T T T - - C T - - C G G T G C +T C T T C A T T G A G - T G - T C T T G - A T T G +G C C G G C A C G T T T A C T T T G A A C A A A T +T A G A G T G C T C A A A G C A G - G T C C A G - +- T - G - - T - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C - G C C +T - A - C - A G G C G G T C A C T G - - - C - C A +G T A C T C A A C - - A T C C T G C C G G - T T - +- T - - T - C - C - - - - - - C T - - T G G T G C +T C T T C G C T G A G - T G - T C T C G - G G T G +G C C G G C A C G T T T A C T T T G A A A A A A T +T A G A G T G C T C A A A G C A G - G - - C - G - +- C - G - - T - A G C C + +Implied Alignments for Graph 2 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 3 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 4 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 5 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 6 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 7 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 8 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + +Implied Alignments for Graph 9 + +Sequence character chel_cox1.fas#0 +>Alentus +T T A T A T T A G G A G C A C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T T T G A C T T C T T C C A C C A T C T T T +A A C T T T A C T C C T C - T C T A G A G G A A T +A G T A G A A A G T G G A G T G G G A A C A G G C +T G A A C T G T C T A T C C T C C T T T A T - C - +T G C T G G A A T T G C A C A T G C T G G T G C C +T C A G T A G A T T T A G G A A T C T T T T C T T +T A C A C C T A G C A G G A G T T T C A T C C A T +C T T A G G G G C C G T A A A C T T T A T A A C T +A C C G T T A T C A A T A T A C G A A G A A C A G +G T A T A A C A A T A G A C C G A A T T C C T C T +T T T T G T A T G A T C T G T A T T T A T T A C A +G C C C T T C T T C T C C T T C T T T C C C T C C +C A G T C +>Amblypygid +T A A T A T T A G G A G C C C C A G A T A T G G - +C - T T T T C C C C G T A T A A A T A A T A T A A +G T T T T T G A C T T T T A C C C C C C T C T C T +A A C A C T - A C T C C T C T C T A G A G G G A T +A G T G G A A A G A G G T G T A G G T A C A G G A +T G A A C T G T T T A C C C C C C T C T A T - C - +G G C T A G C A T C G C A C A C G C C G G T G C T +T C T G T T G A T A T G G G A A T T T T C T C G C +T C C A T T T G G C C G G G G T T T C T T C A A T +T T T A G G G G C C G T T A A T T T T A T A A C T +A C T G T T A T C A A T A T A C G A A G A A C C G +G A A T A A C T A T A G A C C G A A T T C C C C T +T T T T G T T T G A T C T G T C T T T A T T A C C +G C C C T C C T T C T T C T T C T A T C A T T A C +C T G T T +>Americhernus +T A A T A C T T G G G G C C C C T G A T A T G G - +C - T T T C C C C C G T A T A A A T A A T A T G A +G A T T C T G A C T C T T A C C T C C C T C A T T +A A C T C T - C C T A T T A T C G A G A G G G A T +A G T T G A A A C A G G G G T T G G T A C T G G A +T G G A C A G T A T A C C C C C C C T T A T - C - +G G C C G G G A T C G C C C A C G C A G G G G C C +T C A G T T G A C C T G G C A A T C T T T T C T C +T C C A C T T A G C T G G G G T T T C T T C T A T +T C T A G G A G C A G T T A A T T T T A T A A C A +A C C G T T A T C A A T A T A C G A A G T G T A G +G G A T G A C A A T A G A C C G A G T A C C T C T +T T T T G T A T G G T C T A T T T T C A T C A C A +A C T C T C C T T C T A C T A C T T T C T C T A C +C A G T C +>Aportus +T A A T A T T A G G G G C C C C T G A T A T G G C +C - T T T C C C - C G G A T A A A T A A T A T A A +G T T T C T G A C T A T T A C C C C C C T C T T T +A A C C C T T C T T T T A - T C T A G T G G T A T +G G T T G A A A C A G G A G T T G G G A C T G G T +T G A A C A G T T T A C C C C C C T T T A G - C - +C G C C G G A A T T G C T C A C G C A G G A G C T +T C G G T A G A T C T T G C T A T T T T C T C A C +T C C A C T T A G C C G G G G T T T C T T C G A T +T T T A G G A G C A G T A A A C T T T A T A A C T +A C C G T T A T T A A T A T A C G A A G A A C A G +G G A T A A C A A T G G A C C G G A T C C C T C T +T T T C G T C T G G T C G G T T T T T A T T A C T +G C C C T T C T T T T A C T T T T A T C T C T C C +C A G T T +>Artemia +T A A T A T T A G G A G C T C C A G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A C T C C T C C C T C C C T C T T T +A A C T C T T T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C G G G A +T G A A C C G T T T A T C C T C C T C T T G - C - +A G C C G G A A T T G C T C A T G C C G G A G C T +T C A G T T G A T A T A G G A A T C T T C T C T C +T A C A T T T A G C A G G A G T A T C C T C T A T +T T T A G G G G C C G T A A A T T T T A T A A C A +A C A G T G A T T A A C A T A C G G G C A T C T G +G A A T A A C A T T A G A C C G A A T A C C T T T +A T T C G T G T G A T C T G T A T T T A T T A C T +G C A C T A T T A T T A C T T T T A T C A C T A C +C A G T T +>Centruoides +T A A T A T T A G G G G C C C C T G A T A T G G - +C - T T T C C C T C G A A T A A A T A A T A T A A +G A T T C T G A T T G C T T C C C C C T T C C C T +A A C T C T T T T A C T T - T C G A G T G G A A T +G G T A G A A A G A G G G G T A G G T A C A G G A +T G A A C T G T T T A T C C C C C T C T T G - C - +A G C C G G A A T T G C T C A T G C T G G A G C T +T C C G T T G A C A T A G G A A T T T T C T C T C +T A C A T T T A G C A G G A G T G T C T T C T A T +T T T A G G G G C C G T A A A C T T C A T A A C T +A C A G T A A T T A A T A T A C G A G C A T C T G +G A A T A A C A A T A G A C C G A A T A C C T T T +A T T T G T T T G A T C T G T C T T T A T T A C A +G C A C T A C T T T T A C T C T T G T C A T T A C +C A G T T +>Chanbria +T T A T A T T A G G A G C A C C T G A T A T A G - +C - A T T T C C A C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C A T C T C T +T A C A C T A C T C C T A - T C A A G A G G A A T +G G T A G A A A G A G G A G T C G G T A C A G G A +T G A A C T G T G T A T C C C C C G C T A G - C - +A G C A G G A A T T G C C C A C G C T G G A G C C +T C A G T T G A C A T A G G A A T T T T T T C A C +T A C A C C T C G C C G G A G T C T C A T C A A T +T C T A G G T G C A G T T A A T T T T A T A A C A +A C T G T T A T T A A C A T A C G C C C T G C G G +G A A T A A C T A T A G A C C G T A T A C C A C T +T T T C G T G T G A G C A G T A T T T A T T A C A +G C C C T A C T C C T T C T T T T A T C T C T A C +C A G T C +>Gea +T T A T A C T T G G T G C C C C A G A T A T G G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G A T T C T G A C T T C T C C C C C C T T C T C T +A A C T C T A T T A C T T - T C A A G A G G A A T +A G T A G A A A G A G G A G T A G G A A C A G G A +T G A A C A G T T T A C C C C C C T C T A G - C - +A G C A G G A A T T G C T C A T G C T G G A G C T +T C A G T T G A T A T A G G A A T T T T C T C G C +T A C A C C T T G C A G G G G T C T C A T C A A T +C T T A G G A G C A G T T A A T T T C A T G A C A +A C A G T T A T T A A T A T G C G C C C T G C A G +G A A T A A C T A T G G A C C G T A T A C C A C T +C T T C G T G T G A G C T G T C T T T A T T A C A +G C C T T G C T A C T A T T A C T A T C C C T C C +C A G T C +>Hadrurus +T T A T A T T A G G G G C C C C A G A T A T A G - +C - T T T T C C C C G A A T A A A T A A T A T A A +G C T T C T G A T T A C T T C C C C C C T C C C T +A A C T C T C C T C C T T - T T A A G A G G A A T +A G T T G A A A G A G G A G T A G G G A C C G G C +T G A A C T G T C T A T C C T C C T T T A G - C T +A G - A G G A A T T G C C C A T G C A G G A G C C +T C G G T A G A C A T A G G A A T T T T T T C T C +T A C A C T T A G C T G G A G T T T C A T C C A T +T T T A G G G G C C G T A A A C T T T A T A A C A +A C A G T G A T T A A T A T A C G A T C A T C A G +G A A T A A C A A T A G A T C G A A T A C C T T T +A T T T G T T T G A T C A G T C T T T A T T A C C +G C T T T A T T G T T A C T C C T T T C A C T C C +C A G T T +>Hypochilus +T A A T A C T C G G G G C C C C A G A T A T A G - +C - T T T C C C A C G A A T A A A T A A C A T A A +G A T T C T G A C T T C T T C C T C C C T C T C T +A A C T C T A C T T C T C - T C A A G C G G A A T +A G T C G A A A G A G G G G T A G G A A C C G G C +T G G A C A G T A T A C C C T C C T C T A G C C - +A G - A G G A A T C G C T C A T G C A G G T G C T +T C T G T A G A T A T A G G A A T C T T C T C G C +T T C A C C T A G C A G G A G T C T C A T C A A T +C C T A G G G G C C G T A A A C T T T A T A A C A +A C A G T A A T C A A T A T A C G A T C C T C A G +G A A T A T C T A T A G A C C G T A T A C C T T T +A T T T G T A T G A T C A G T A T T C A T T A C C +G C T C T C C T C C T T C T T C T C T C T T T A C +C A G T C +>Limulus +T A A T A C T A G G T G C C C C G G A T A T G G - +C - T T T C C C T C G A A T A A A T A A C A T A A +G T T T C T G A C T T T T A C C C C C A T C T C T +C A C T C T - C C T C T T A T C T A G A G G A A T +A G T A G A A A G A G G T G T A G G A A C T G G A +T G A A C T G T A T A T C C T C C T T T A T - C - +T G C A A G T A T T G C C C A T G C G G G A G C T +T C C G T T G A T C T C G G A A T C T T C T C A C +T A C A C T T A G C A G G T G T A T C A T C A A T +T T T A G G T G C C G T A A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C C G +G A A T A A C T A T G G A C C G G A T A C C T C T +T T T T G T T T G A G C A G T A T T T A T T A C T +G C T T T A C T T T T A C T A C T G T C C T T A C +C A G T T +>Mastigoproctus +T G A T A T T A G G T G C T C C T G A T A T A G - +C - T T T T C C T C G T A T A A A C A A T A T A A +G A T T C T G A C T T T T A C C T C C T T C G T T +A A C G T T A C T T C T T - T C C A G A G G T A T +A G T T G A A A G A G G G G T A G G A A C A G G A +T G A A C A G T C T A T C C T C C T T T A T - C - +A G C T A G A A T T G C T C A T G C C G G A G C T +T C A G T A G A C T T A G G T A T T T T C T C G C +T G C A T C T A G C T G G T G T C T C G T C T A T +C T T A G G A G C T G T C A A T T T T A T A A C A +A C T G T T A T C A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G T A T A C C T C T +T T T C G T A T G G G C A G T A T T T A T T A C C +G C C C T A C T A C T T T T A T T A T C T T T A C +C A G T A +>Pauroctonus +T A A T G T T A G G T G C T C C T G A T A T G G - +C - T T T T C C A C G A A T A A A C A A T A T G A +G T T T C T G G C T C C T A C C T C C T T C A C T +A A C A C T A C T T C T T - T C T A G A G G T A T +A G T T G A A A G A G G A G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T T T A T - C - +A G C C A G T A T T G C T C A T G C T G G G G C T +T C G G T A G A T T T A G G A A T T T T C T C C C +T A C A C T T G G C A G G T G T T T C T T C A A T +T T T A G G A G C T G T A A A T T T T A T G A C A +A C T G T T A T T A A C A T A C G A T C T A C G G +G A A T A A C T A T A G A C C G A A T A C C G C T +T T T C G T T T G A G C A G T A T T T A T T A C A +G C A T T A C T T C T C T T A T T A T C T C T A C +C G G T A +>Rhiphicephalus +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thelichoris +T A A T A T T A G G A G C A C C T G A T A T A G - +C - A T T C C C A C G A A T A A A T A A T A T A A +G C T T T T G A C T G C T A C C T C C T T C T C T +C A C C C T T C T T C T A - T C T A G A G G A A T +A G T T G A A A G A G G A G T T G G A A C A G G A +T G A A C T G T A T A T C C T C C C C T A T - C - +A G C T A G A A T C G C A C A T G C A G G G G C C +T C T G T A G A T T T G G G A A T T T T T T C A T +T G C A T T T A G C A G G A G T T T C A T C A A T +T C T A G G G G C C G T A A A T T T C A T A A C A +A C T G T A A T T A A T A T A C G A T C T A C A G +G T A T A A C T A T A G A C C G A A T A C C G T T +A T T C G T C T G A G C A G T T T T T A T T A C A +G C G C T C T T A C T C C T T T T A T C T T T A C +C C G T T +>Thermobius +T A A T A T T A G G T G C T C C C G A T A T A G - +C - T T T C C C T C G A A T A A A T A A T A T G A +G T T T C T G A C T T T T A C C T C C C T C G C T +A A C T C T A C T T C T T - T C T A G A G G T A T +A G T C G A A A G A G G G G T A G G A A C A G G A +T G A A C T G T T T A C C C T C C T C T A T - C - +T G C C A G C A T T G C C C A T G C A G G A G C A +T C T G T A G A T C T A G G A A T C T T C T C A T +T A C A T T T A G C A G G G G T T T C T T C A A T +T C T A G G T G C T G T A A A T T T T A T A A C T +A C C G T T A T C A A T A T A C G A T C A A C A G +G A A T A A C T A T A G A C C G A A T A C C T C T +T T T C G T C T G A G C G G T A T T T A T T A C A +G C C T T A C T A C T T T T A C T A T C A T T A C +C G G T T +>Vonones +T A A T G T T A G G T G C T C C A G A T A T G G - +C A T T T C C C - C G A A T A A A T A A T A T A A +G T T T C T G A C T T T T A C C C C C T T C G C T +A A C T T T A C T T - T T A T C T A G A G G T A T +A G T C G A A A G A G G A G T G G G A A C T G G A +T G A A C A G T A T A C C C T C C T T T A T - C - +A G C C A G A A T T G C T C A C G C A G G T G C T +T C A G T T G A T C T A G G T A T T T T T T C A T +T A C A T T T A G C A G G G G T C T C A T C A A T +C T T A G G A G C T G T A A A C T T T A T A A C G +A C C G T T A T C A A T A T A C G A T C T A C A G +G G A T A A C T A T A G A C C G A A T A C C A C T +T T T T G T T T G A G C A G T A T T T A T T A C A +G C T C T A C T C C T A C T G T T A T C T T T A C +C A G T C + + +Sequence character chel_16S.fas#0 +>Alentus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G A A T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T C C G G - T C G G T T - T C +- T C - T C C C T T T T C C C G C G G T G C T C T +T C G T T G A G - T G - T C G C G - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Amblypygid +T C C A G A C T - - G G - C - G G T - C C G C C T +- A - G C G G C G A G T A C T G - - - T C - A G G +C C T G A A C - - A T G G C G - C C G G - T - T - +- T - - T - - C - - - - - - C T T G G T T C T C T +T T A C T G A G - T G - T C T T G - G G C G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - - - G - C T G +- T - - - - C G C C +>Americhernus +T C G A G C C T C C A A - T - G A T - A C G T T G +- A - A A G G C G T T T A T C G - - - T T - G G G +G C C G - A C - - A G C G C G - T C - G - - - T - +- G - - G - - G - - - - - - C T C G G T T G G C C +T T A A A A A G C T G A T C - G G - G T T C T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +G G T G C T C A A A G C A - - G - - - - G C C T G +- G - - - - T G C C +>Aportus +T C C A G A C T - - G A - C - G G T - C C A C C - +- G - C T C G - G G C G A C T G - - - T C - A G G +C C T G A A C - - A T T C C G - T C G G T T - T C +G A C - A G A T T T T T C C C G C G G T G C T C T +T C G G T G A G - T G - T C G C - - G G A G G C C +G A C A A G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - - - G +- T - - - - C A C C +>Artemia +C T C G G T C G - - G G - T - G G T G C C G C C - +T C - A C G G T G G T C A C T G - - - C C T C G A +T C - G G A C - A A T T C A T - T G G A - - - T - +- C - - G - T T - - - - - - C G G G G T G C T C T +T A A C C G A G - T G - T C C T G - G G T G G C C +G A T A C G T T T A C T T T G A A C A A A T T A G +A G T G C T T A A A G C A - - G - - - - G - T - G +C A C - C G C G C C +>Centruoides +T C C A G A C A - - A G - C - G G T - C C A C C - +- C - G C G G T G G T T A C T G - - - T T T G G A +C T - G G A C - - G T T T G G C C G G A - - - T - +- T - - C - C T - - - - - - - T T G A T G C T C T +T T G C C G A G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - - +- A - - C - C G C C +>Chanbria +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - - - - - - - - - - - - - - - - +- - - - - - - - - - +>Gea +T C C G G C C G - - G A - C G G G T - C C G C C - +T A - C C G G T G G T T A C T G - T T C G - C T G +C C - G A G C - - - T T C A G - G G G G - - - C - +- C - - G - C T - - - - - - G T C G A T G A T C T +T C A T C G G T - T A - T C T T C - C G T A A C C +C T C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - - - - C G C C +>Hadrurus +T C C G G A C T - - G T - C - G G T - C C G - C - +- C - G C - A A G C T T A C T G - - - G C A G G A +C C - G G A C - - G T C T A G C C G G A - - C T - +- C - - T - C T - - - - - - C G T - A T C C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A A T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - G +- A - T C - C G C C +>Hypochilus +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A A C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T C A T T G A T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - T - G +- A - - - - C G C C +>Limulus +T C T A G A C T - - G G - C - G G T - C C G C T T +- - - C C G G C G G T T A C T G - - - C C - T G G +C C T A A A C - - A T C - T G - C C G G - T - T - +- T - - T - C C - - - - - - C T C G G T G C C C T +T G A T T G A G - T G - T C T T G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - C - A +- A - - - - C G C C +>Mastigoproctus +T C C A G A C G - - G G - T - G G T - C C A C C - +G C - C C G G T G G C T A C T G - - - C C - C G G +C C T G A A C A - A T C T C G - C C G G - - - T - +- T - - T - T C - - - - - - C T T G A T T C T C T +T C A C C G A G - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T T A A A G C A - - G - - C - G - T - A +- A - - - - C G C C +>Pauroctonus +T C C A G A C T - - G T - C - G G T - C C G - C - +- A - C C G G A G G T T A C T G - - - G C - A G G +A C C G G A C - - G T C T A G - C C G G - - - C - +- C - - T - C C - - - - - - C T C G T T G C T C T +T C A C C G G G - T G - T C T T G - G G T G T C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - - G - - C - G - A - C +- C - - - - C G C C +>Rhiphicephalus +T C C A G A C G - - A G - T - A G T - G C A T C - +T A - C C C G A T G C T A C G G - - - C T - C G G +A C T G A A C - - A T C A T G - C C G G - - - T - +- T - C T - T T - - - - - - C T T G G T G C A C T +T C A T T G T G - T G - C C T C G A G A T G G C C +G G T G C T T T T A C T T T G A A A A A A T T A G +A G T G C T C A A C G C A - G G - - C - G - A - G +- T - - - - C G C C +>Thelichoris +T C C A G A C G - - G G - C - G G T - C C G C C - +T A - A C G G T G G T T A C T G - - - C C - T G G +C C T G A A C - - A G C C A G - C C G G - - - T - +- T - - T - C C - - - - - - C T A G A T G A T C T +T T A C C G G T - T G - T C T T G - G G T G A C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A - G G - - C - - - T - G +- A - - - - C G C C +>Thermobius +C T C G G A C - - - G A T C - G G T - T C G C C - +- G - C C C G T G T T A A C T G A - - T C - - G T +T C C G G A C - - G T C C T G - C C G G T T - T - +- T - - T - C C - T T T - - C T C G G T G C T C T +T C A T T G A G - T G - T C T T G - A T T G G C C +G G C A C G T T T A C T T T G A A C A A A T T A G +A G T G C T C A A A G C A G - G T C C A G - T - G +- T - - - - C G C C +>Vonones +T C G A G G C T - - G G - C - G G T - C C G C C T +- A - C A G G C G G T C A C T G - - - C C - A G T +A C T C A A C - - A T C C T G - C C G G - T - T - +- T - - T - C C - - - - - - C T T G G T G C T C T +T C G C T G A G - T G - T C T C G - G G T G G C C +G G C A C G T T T A C T T T G A A A A A A T T A G +A G T G C T C A A A G C A G - G - - C - G - C - G +- T - - - - A G C C + diff --git a/tutorials/tutorial_1/Tutorial_1_doc/crossrefs1.png b/tutorials/tutorial_1/Tutorial_1_doc/crossrefs1.png new file mode 100644 index 000000000..8ad9b95e1 Binary files /dev/null and b/tutorials/tutorial_1/Tutorial_1_doc/crossrefs1.png differ diff --git a/tutorials/tutorial_1/Tutorial_1_doc/crossrefs2.png b/tutorials/tutorial_1/Tutorial_1_doc/crossrefs2.png new file mode 100644 index 000000000..51019e8f3 Binary files /dev/null and b/tutorials/tutorial_1/Tutorial_1_doc/crossrefs2.png differ diff --git a/tutorials/tutorial_1/Tutorial_1_doc/output1.png b/tutorials/tutorial_1/Tutorial_1_doc/output1.png new file mode 100644 index 000000000..2f0455c56 Binary files /dev/null and b/tutorials/tutorial_1/Tutorial_1_doc/output1.png differ diff --git a/tutorials/tutorial_1/Tutorial_1_doc/output2.png b/tutorials/tutorial_1/Tutorial_1_doc/output2.png new file mode 100644 index 000000000..9bfa5463e Binary files /dev/null and b/tutorials/tutorial_1/Tutorial_1_doc/output2.png differ diff --git a/tutorials/tutorial_1/Tutorial_1_doc/phyg_tutorial_1.tex b/tutorials/tutorial_1/Tutorial_1_doc/phyg_tutorial_1.tex new file mode 100644 index 000000000..1edd1015a --- /dev/null +++ b/tutorials/tutorial_1/Tutorial_1_doc/phyg_tutorial_1.tex @@ -0,0 +1,760 @@ +\documentclass[]{article} +\usepackage{longtable} +\usepackage{color} +\usepackage{tabu} +\usepackage{setspace} +\usepackage{pdflscape} +\usepackage{graphicx} +\usepackage {float} +%\usepackage{subfigure} +\usepackage{caption} +\usepackage{subcaption} +\usepackage{natbib} +%\usepackage{fullpage} +\bibliographystyle{plain} +\usepackage{fancyhdr} +%\bibliographystyle{cbe} +\usepackage{algorithmic} +\usepackage[vlined,ruled]{algorithm2e} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage[T1]{fontenc} +\usepackage{url} + +\usepackage[dvipsnames]{xcolor} +\usepackage{color, soul} +\usepackage[colorlinks=true, linkcolor=blue, citecolor=DarkOrchid, urlcolor=TealBlue ]{hyperref} +%\usepackage[nottoc,numbib]{tocbibind} +\usepackage{tocloft} + + +\setlength\itemindent{0.25cm} + +\newcommand{\phyg}{\texttt{PhyG} } +\newcommand{\BigO}[1]{\ensuremath{\mathcal{O}\left(\,#1\,\right)}\xspace} + +\title{PhyG 0.1 Tutorials} + +\author{Louise M. Crowley} +\makeindex +\begin{document} +\maketitle + +\pagestyle{fancy} + +%Maybe add:\\ +%\begin{enumerate} +% \item{reports of diagnosis} +% \item{Supports--Goodman-Bremer, bootstrap, jackknife--maybe on chel or +% something preassigned and small} +% \item{Consensus etc outputs} +%\end{enumerate} + +\section{\phyg Tutorials} + +These tutorials are intended to provide guidance for using the phylogenetic +program \texttt{PhyG}. Each tutorial contains a \phyg script that includes +detailed commentaries explaining the rationale behind each step of the analysis. +The command arguments will differ substantially depending on type, complexity, +and size of the data set. The values of arguments within this tutorial have been +chosen such that the analysis can complete within the timeframe of this session. +Therefore, the values used here should not be taken to be optimal parameters. + +The tutorials use sample datasets that can also be downloaded from the \texttt{PhyG} +\href{https://github.com/amnh/PhyGraph}{GitHub} website. The minimally required +items to run the tutorial analyses are the \phyg application and sample data files. +Running these analyses requires some familiarity with the \phyg command structure +that can be found below (see Section \ref{subsec:Scripts}). A complete guide to the +commands and associated arguments of \phyg can be found +\href{https://github.com/amnh/PhyGraph}{here}. + +%------------------------------------------------------------------------------------------------------- +\subsection{Obtaining and Installing \phyg} +\label{subsec:Installation} + +In this tutorial, you will learn how to obtain and install the \phyg binaries. +Precompiled \phyg binaries, test data, and documentation in pdf format are available +from the \phyg \href{https://github.com/amnh/PhyGraph}{GitHub} website. + +\begin{enumerate} +\item Open a web browser and navigate to the \phyg \href{https://github.com/amnh/PhyGraph} +{GitHub} website. In the \textit{bin} directory, binaries are available for macOS +computers with either Intel or M1 processors, and Linux (Intel) machines (see information +relating to Windows machine below). + +\item Click on the appropriate link for the binary. On most systems this will +download to either your Desktop or Downloads folder. + +\item The user is advised to change the permissions of the binary to ensure +that it is executable. In a terminal window, navigate to the directory where the \phyg +binary is located. Type the following: + + \begin{quote} + chmod +x phyg + \end{quote} + +\item With macOS machines, right click on the binary, and select `open with +\textit{Terminal}', and click `Open'. The macOS has now marked the binary as `safe' +to open. In Linux machines, it is not necessary to mark executables as `safe'. + +\item The binary should either be moved into your \$PATH or referred to its +absolute PATH when executing a script. Your \$PATH is a colon-separated list +of directories, the contents of which you can execute without specifying the +absolute path. To find out what's in your \$PATH, open a \textit{Terminal} window +(located in your Applications folder) and type: + + \begin{quote} + echo \$PATH + \end{quote} + +If the directories \texttt{/usr/local} or \texttt{/usr/local/bin} do not exist yet, type the +following (\texttt{sudo} will require Admin password): + + \begin{quote} + sudo mkdir -p /usr/local/bin + \end{quote} + +\item For users without administrative privileges for their computer, save +this binary at your preferred location (for instance in `Applications') and refer to its +absolute PATH when executing a script: + + \begin{quote} + /Applications/phyg ~/Desktop/phygfiles/run1.pg + \end{quote} + +See also below in Subsection \ref{subsec:executing}. + +\item Move the \phyg binary from its current location to this location. For example +typing the following will move the binary from its Desktop location into the user's +\$PATH: + + \begin{quote} + sudo mv ~/Desktop/phyg /usr/local/bin + \end{quote} + +\item For those users with Windows machines, a Windows Subsystem for Linux +(WSL) can be installed. This system will allow you to run the Linux binary directly +on your machine, without having to install a virtual machine or dual-boot setup. +The WSL, along with directions for installation, can be found +\href{https://learn.microsoft.com/en-us/windows/wsl/}{here}. +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Making a \phyg script} +\label{subsec:Scripts} + +\phyg analyses are conducted using a script and cannot be performed interactively. +A script is a simple text file containing a list of commands to be performed. \phyg +scripts can be created and saved using any conventional text editor (such as +\textit{TextEdit}, \textit{TextWrangler}, \textit{BBEdit}, \textit{Emacs}, or \textit{NotePad}). +Do not use a word processing application like \textit{Microsoft Word} or \textit{Apple +Pages}, as these programs can introduce hidden characters in the file, which will be +interpreted by \phyg and can cause unpredictable parsing of the data. In this tutorial, +you will learn about the format of \phyg scripts and how to generate one for execution +by the program. + +\begin{enumerate} + +\item Open your text editor of choice. + +\item Type the following: + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + \end{quote} + +Note: the commands \texttt{read}, \texttt{rename}, \texttt{reblock}, and \texttt{set} +are executed at the beginning of program execution, irrespective of where they +appear in the command script. All other commands are executed in the order they +are specified. + +\item In this example, the script begins with a comment that describes the +contents of the file. Comments are prepended with `-{}-' and can span multiple +lines, provided each line begins with `-{}-'. \phyg will ignore any commented +lines. Comments can provide useful information to the reader to understand +the purpose of the script. They can also be useful for testing purposes. + +\item Each command consists of a name, followed by a list of arguments or options +separated by commas and enclosed in parentheses. Commands and arguments are +case insensitive, with the exception of filename specifications, which are always in +double quotes (\textbf{"fileName"}). There are defaults for all options except +input graphs. The command, followed by open and closed parentheses `\texttt{()}', +denotes default options, e.g. \texttt{build()} is the equivalent of \texttt{build(character, +replicates:10)}. + +\item In this script, the seed for the random number generator is \texttt{set} to +the integer value 73412305. By setting this value, you are guaranteed to reproduce +a given search strategy each time the script is run. This value can be any number of +digits in length. This seed value should not be mistaken for the initial random +seed value, as chosen by the application (see Figure \ref{output1}). + +Note: \phyg is a multi-threading application and will, by default, use all available cores. +To guarantee the reproducibility of a given search each time the scripts is run, the user +should specify the same number of processors used by \phyg for each run. +This is achieved by including the options `+RTS -NX' where `X' is the number +of processors offered to the program. + +\item Next \texttt{set} the outgroup for the analysis to \textit{Limulus}. If the outgroup +is not set, \phyg will sort the taxa alphabetically and choose the first taxon as the +outgroup. Only a single taxon can be set as the outgroup of the analysis. + +Note: taxon names cannot have spaces, otherwise the names can be incorrectly +interpreted by the program. + +\item The command \texttt{read} imports file-based information, including data files, +tree and graph files. \texttt{read} commands must contain an input file. Supported +data formats include FASTA, FASTC and TNT files, and graph formats include Dot, +Enewick, and Newick. Filenames must include the suffix (e.g. +.fas, .fasta, .fastc, .ss, .tre). Failure to include these suffices will result in the error +"File(s) not found in `read' command". The filename must match \textit{exactly}, +including capitalization.\\ +\\ +\texttt{read} also accepts wild cards. The user can use an asterisk (*) to represent +zero or more characters. For example, to read all the filenames with the extension .fas, +the user can simply type: + + \begin{quote} + read(nucleotide:"*.fas")\\ + \end{quote} + +Moreover, to capture all the filetypes ending with fas, the user can type: + + \begin{quote} + read(nucleotide:"*.fas*")\\ + \end{quote} + +This will read files ending in .fas, .fasta, and .fastc. \phyg will attempt to recognize +the type of input and parse appropriately. Otherwise, the type of file can be +indicated. In this example, we indicate that the files "chel\_16S.fas" and +"chel\_cox1.fas" contain IUPAC \texttt{nucleotide} sequence data in fasta format. + +\item Having read in our data, it is advisable to verify that the files were properly +parsed by checking the characters and terminals in the cross-references and data +files. We will examine these files below (Section \ref{subsec:Inspecting}). + +\item Save this file with the name \textbf{run1.pg} in a directory \textbf{phygfiles} +located on your Desktop. + +Note: the tutorial data files should be moved to this +location also. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Executing scripts} +\label{subsec:executing} + +In this tutorial you will learn how to execute a script using \texttt{PhyG}. + +\begin{enumerate} + +\item Before we read the data, we will make sure that \phyg is working in the directory +containing the data files. The working directory tells the application where to look +for the files. In this way, whenever we tell \phyg to read a file, we don't need to +specify where it is located in the file system, we can simply use its name. To +begin, open a \textit{Terminal} window. + +\item Change the directory to where the script \textbf{run1.pg} is located by using the +\texttt{cd} command, as in: + + \begin{quote} + cd ~/Desktop/phygfiles + \end{quote} + +\item By typing \texttt{ls} you will see that this directory contains the script +\textbf{run1.pg}. + +\item The program is invoked from the command-line, as in: + + \begin{quote} + phyg commandFile + \end{quote} + +To run the script \textbf{run1.pg} using four processors type the following: + + \begin{quote} + phyg run1.pg +RTS -N4 + \end{quote} + +This is equivalent to typing the following from any location on your computer: + + \begin{quote} + phyg ~/Desktop/phygfiles/run1.pg +RTS -N4 + \end{quote} + +Note: should you get the error ``phyg: command not found'' this indicated that +\texttt{Phyg} is not in your path. + +\item To interrupt the analysis, press control-c. Interrupting the analysis cancels +the execution of the last command requested by the user. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Inspecting the data} +\label{subsec:Inspecting} + +In this tutorial you will learn how to inspect data in both the terminal window and +in reported output files. Running the script will automatically generate an extensive, +detailed output in the terminal window. It may also be desirable to inspect the +imported data in greater detail to ensure that the format and contents of the files +have been interpreted correctly. This practice helps avoid common errors, such +as inconsistently spelled terminal names, which may result in bogus results, +produce error messages, and aborted jobs. + +\begin{enumerate} + +\item Having run the script, examine the output in the terminal window +(Figure \ref{output1}). \phyg will continue to output information to the screen +until the completion of the analysis. It is possible to scroll through this output, +even as the analysis continues to run. The output includes the landing page, +the name of the script, the random seed, whether the files were +correctly specified, the name of the input file(s), whether the terminals were +renamed, whether trees were selected, and a brief description of the content +for each loaded file. The current state of the analysis can be viewed here. Any +warnings and errors will also appear here. Upon completion of the analysis, the +results, including costs, the number of returned graphs, and information relating +wall-clock time, CPU time, and CPU usage are also provided. + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{output1.png} +\caption{The \phyg output display in the \textit{Terminal} window at the beginning +of an analysis.} +\label{output1} +\end{figure} + +\item A good practice is to confirm that \phyg parsed the input files correctly. +By examining the output in the terminal, you can see that both files were +interpreted and read in as nucleotide sequence data files by the program +(Figure \ref{output1}). + +\item Let's reexamine our script: + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + \end{quote} + +Having imported our data files, we next \texttt{report} a crossrefs and data file. +The command \texttt{report} outputs the results of the current analysis or loaded +data by directing it to a file. To redirect the output to a file, the file name (in quotes), +followed by a comma, must be included in the argument list of report. All arguments +for \texttt{report} are optional. This command allows the user to output information +concerning the characters and terminals, diagnosis, export static homology data, +implied alignments, trees, graphs, dot files, as well as other miscellaneous arguments. +By default, new information printed to a file is appended to the file. The option +\texttt{overwrite} overrides the default and rewrites the file rather than appending +to the existing information. Many of the report options can be output in csv format, +which can subsequently be imported into spreadsheet applications like \textit{Excel} +or \textit{Numbers} for easy viewing. + +\item Examine the reported file \textbf{"chel\_cr1.csv"}. The \texttt{crossrefs} +argument of the \texttt{report} command is a useful tool for the visual representation +of the inputted data. This argument displays whether data are present or absent +for each terminal in each of the imported data files. This provides a comprehensive +visual overview of the completeness of the data. It will highlight missing data, as +well as inconsistencies in the spelling of taxon names in different data files (see +Figure \ref{crossrefs}). The argument will report a file with the terminals represented +in rows, and the data files in columns. A plus sign ``+'' indicates that data for a given +terminal is present in the corresponding file; a minus sign ``-'' indicates that it is not. + +\item Open the csv file using your spreadsheet program of choice (Figure \ref{crossrefs}). +The user is encouraged to report a crossrefs file having imported the data +into \texttt{PhyG}, especially when working with new datasets. + +\begin{figure} +\centering +\includegraphics[width=0.45\textwidth]{crossrefs1.png}\hfill +\includegraphics[width=0.45\textwidth]{crossrefs2.png} +\caption{Inspecting imported data. The figure shows two crossrefs files, which have +been imported into \textit{Excel}. The image on the left illustrates an inconsistency in the +naming of the taxon \textit{Thelichoris}. This was corrected in the image on the right.} +\label{crossrefs} +\end{figure} + +\item Examine the reported file \textbf{"chel\_data1.csv"}. This file is a summary +of different aspects of the input data and terminals. This file summarizes +information relating to the input data (number of terminals, number of input files, +number of character blocks and the total number of characters). It also provides +information relating to the terminal taxa included in the analysis, including the +names of the taxa, a list of the excluded taxa (if any), and whether any terminals +were renamed. In this file you will also see information relating to +``Index'', ``Block'', ``Name'', ``Type'', ``Activity'', ``Weight'', ``Prealigned'', ``Alphabet'', +``TCM''. ``Index'' reports the character number in the overall dataset; ``Name'' the +name of the character (by default based on its source data file); ``Type'' is the type +of character (e.g. Non-Additive, Matrix, Nucleotide Sequence), ``Activity'' whether +the character is active (included in the analysis) or not (excluded), ``Weight'' is the +weight of the character, ``Prealigned'' denotes whether a sequence character +(e.g. amino acids) is to be treated as prealigned or not, ``Alphabet'' the elements of +a sequence character, ``TCM'' is the transition cost matrix specifying costs among +sequence elements and ``gap'' or insertion-deletion. +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Tree building with nucleotide sequence characters} +\label{subsec:Building} + +Having imported and inspected our data, we are now ready to build the initial trees +or graphs. In this tutorial you will build a distance tree using nucleotide sequence +characters. The \texttt{build} command builds initial graphs. The arguments of build +specify the number of graphs to be generated, and whether the build is based on +distance or character methods. Distance methods are considerably faster, but +approximate in terms of character-based methods. + +\begin{enumerate} + +\item Modify the script to include \texttt{build(distance, rdwag, replicates:100)}. +Specifying \texttt{distance} causes a pairwise distance matrix to be calculated +($\BigO n^2$) and is used as a basis for distance tree construction. The tree is +then constructed by performing random addition sequence distance Wagner +builds, yielding multiple trees determined by the argument \texttt{replicates:n}. +This method has a time complexity of $\mathcal{O} \left( m*n^2 \right)$. + +\item We now want to examine the resulting trees. Trees are not reported as +output in the terminal window and must be directed to a file. Modify the script +to output tree files with \texttt{report("chel\_run1.tre", newick, graphs, overwrite)}. + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + build(distance, rdwag, replicates:50)\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + report("chel\_run1.tre", newick, graphs, overwrite)\\ + \end{quote} + +\item Examine the reported file \textbf{"chel\_run1.tre"} in your preferred text editor. +\texttt{report(graph)} outputs a graph in a format specified by other arguments in the +command, in this case \texttt{newick}. \phyg will \texttt{overwrite} any existing trees in +this file. This newick tree file, in parenthetical notation, can be viewed in other programs +like \textit{FigTree} or \textit{TreeView}. The values associated with the taxon names and +HTUs are the branch lengths. The cost of the tree(s) can be found in square brackets, at +the end of each tree. Notice that the trees appear in order of cost in the file, with the most +optimal tree appearing last. In this analysis, \phyg returned 50 trees ranging in cost from +1003 to 1040 with the cost of the most optimal tree being 1003. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Performing a local search} +\label{subsec:localsearch} + +Now that trees have been generated and stored in memory, a local search can be +performed in order to refine and improve the initial trees by examining additional +topologies of potentially better cost. The command \texttt{swap} is the basic local +search function of \phyg and performs branch-swapping rearrangement on graphs. +This command proceeds by clipping parts of the given tree and attaching them in +different positions. These algorithms employed in this branch-swapping include +`NNI', `SPR' and `TBR' refinement. The default of \texttt{swap} performs alternating +rounds of `SPR' and `TBR' refinement, swapping iteratively until a local optimum +is found, keeping 10 graphs per input graph. + +\begin{enumerate} + +\item Modify the script to include \texttt{swap(alternate, keep:10)}. This command +specifies that alternating rounds of \texttt{spr} and \texttt{tbr} refinement are performed. +After each round \phyg will \texttt{keep} 10 graphs per input graph. + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + build(distance, rdwag, replicates:50)\\ + swap(alternate, keep:10)\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + report("chel\_run1.tre", newick, graphs, overwrite)\\ + \end{quote} + +\item Now reexamine the reported file \textbf{"chel\_run1.tre"}. Notice that this +simple local search has reduced the cost of the initial best tree from 1003 to 998. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +%\subsection{Selecting trees} +%\label{subsec:Selecting} +% +%So far we have performed the basic steps of importing character data, building +%initial trees, and conducting a simple local search. In this tutorial we will select +%trees at various stages of the analysis. +% +%\begin{enumerate} +% +%\item It may be useful to \texttt{select} all topologically unique trees during the +%analysis. Contra \texttt{select()}, which selects topologically unique and optimal +%trees, \texttt{select(unique)} selects all unique trees (after collapsing zero-length +%branches), regardless of cost. This is a useful command that ensures that a +%larger tree space is explored. Modify the script to \texttt{select(unique)} tree(s) +%following the distance \texttt{build}. If this is used as an option during the +%search, the user should remember to \texttt{select()} at the end of the run, +%prior to reporting the results. The remaining trees are deleted from memory. +% +%\item Generally, users will want to \texttt{select} only those trees that are both +%optimal \textit{and} topologically unique at the end of an analysis. The default +%setting of the \texttt{select()} does exactly that. Add \texttt{select()} to our script, +%ensuring that \phyg will select the topologically unique trees of best cost upon +%completion. For now we will comment out this command as we will only include +%a pool of topologically unique trees for the next stage of the analysis (Section +%\ref{subsec:Fusing}). +% +% \begin{quote} +% -\/-Chel first analysis\\ +% set(seed:73412305)\\ +% set(outgroup:"Limulus")\\ +% read(nucleotide:"chel\_16S.fas")\\ +% read(nucleotide:"chel\_cox1.fas")\\ +% build(distance, rdwag, replicates:50)\\ +% select(unique)\\ +% swap(alternate, keep:10)\\ +% -\/-select()\\ +% report("chel\_cr1.csv", crossrefs, overwrite)\\ +% report("chel\_data1.csv", data, overwrite)\\ +% report("chel\_run1.tre", newick, graphs, overwrite)\\ +% \end{quote} +% +%\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Performing tree recombination} +\label{subsec:Fusing} + +The command \texttt{fuse} performs tree fusing on the trees in memory. Tree +fusing can be used to escape local optima. \texttt{fuse} operates on a collection +of graphs performing reciprocal graph recombination between pairs of graphs. +Non-identical subgraphs with identical leaf sets are exchanged between graphs +and the results evaluated. This process of exchange and evaluation continues +until no new graphs are found. + +Note: more than one tree must be stored in memory in order to perform this operation. + +\begin{enumerate} + +\item Modify the script to include \texttt{fuse(tbr:10, keep:1)}. This command +causes the exchanged subgraphs to be tried at multiple positions (up to 9 +edges away from their initial positions. The number of returned graphs is +limited to 1. + +\item Uncomment the command \texttt{select()}. + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + build(distance, rdwag, replicates:50)\\ +% select(unique)\\ + swap(alternate, keep:10)\\ + fuse(tbr:10, keep:1)\\ +% select()\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + report("chel\_run1.tre", newick, graphs, overwrite)\\ + \end{quote} + +\item Rerun the analysis. + +\item Reopen the reported tree file \texttt{"chel\_run1.tre"} using your text editor. +You will see that the cost of the most optimal tree has not decreased any further +than 998. While the cost of the tree has not decreased any further, this gives us +some confidence that the large tree space that we explored did not find any +other more optimal trees. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Reporting an implied alignment} +\label{subsec:ia} + +Another useful way to view the data is to \texttt{report} the implied alignment of the +molecular data currently loaded. An implied alignment is a representation of the +insertion, deletion and substitution events that take place on a \textit{given} tree, +represented as an alignment. Outputted implied alignments can be imported into +visualization program such as \href{https://ormbunkar.se/aliview/#DOWNLOAD}{AliView}, +\href{https://thalljiscience.github.io/}{BioEdit} and \href{https://www.geneious.com}{Geneious}, +as well as other phylogenetic programs such as \textit{TNT}---this will be done during +another lab session. + +\begin{enumerate} + +\item Modify the script to include \texttt{report("fileName", ia)}. + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + build(distance, rdwag, replicates:50)\\ +% select(unique)\\ + swap(alternate, keep:10)\\ + fuse(tbr:10, keep:1)\\ +% select()\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + report("chel\_run1.tre", newick, graphs, overwrite)\\ + report("chel\_ia1.fas", ia, overwrite)\\ + \end{quote} + +An implied alignment is output for each reported tree, as well as for each block of imported +data. + +\item Should the user wish to output an implied alignment that is readable by the +program \textit{TNT}, they should adjust the command replacing \texttt{ia} with the +argument \texttt{tnt}. + + \begin{quote} + report("chel\_ia1.ss", tnt, overwrite)\\ + \end{quote} + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Reporting publication quality trees} +\label{subsec:dotpdf} + +Publication quality trees can be reported with \texttt{PhyG}. These pdf files can +subsequently be displayed, edited, and printed using graphics software. + +\begin{enumerate} + +\item In order to output pdf files the application \textit{dot} must be installed from +the \href{https://graphviz.org/download/}{Graphviz} website. \textit{dot} is a graph +description language and Graphviz an open-source graph visualization software. +This program is well suited to representing graphs and networks. + +\item The command \texttt{report("filename.dot", graphs, dotpdf, overwrite)} will +produce a file that can be read in \textit{Adobe Illustrator}, \textit{Apple Preview} +or any vectorial image edition program. Modify the script to include this command: + + \begin{quote} + -\/-Chel first analysis\\ + set(seed:73412305)\\ + set(outgroup:"Limulus")\\ + read(nucleotide:"chel\_16S.fas")\\ + read(nucleotide:"chel\_cox1.fas")\\ + build(distance, rdwag, replicates:50)\\ +% select(unique)\\ + swap(alternate, keep:10)\\ + fuse(tbr:10, keep:1)\\ +% select()\\ + report("chel\_cr1.csv", crossrefs, overwrite)\\ + report("chel\_data1.csv", data, overwrite)\\ + report("chel\_run1.tre", newick, graphs, overwrite)\\ + report("chel\_ia1.fas", ia, overwrite)\\ + report("chel\_run1.dot", graphs, dotpdf, overwrite)\\ + \end{quote} + +\item Notice that two files were outputted from using this command. \phyg has +output an eps (on macOS) or pdf (on linux) file that can be viewed in a vector graphics +program and a dot file, which can be viewed (and modified) in \textit{Graphviz}. +\end{enumerate} + +%\subsection{Using Search} +%\label{subsec:search} +% +%This command implements a default search strategy, performing a timed randomized +%series of graph optimization methods including building, swapping, recombination +%(fusing), simulated annealing and drifting, network edge addition/deletion/moving, +%and Genetic Algorithm. The parameters and their order of implementation are +%randomized for this search. +% +%\begin{enumerate} +%\item Create a new file using your text editor of choice. +% +%\item Type the following: +% +% \begin{quote} +% -\/-Chel analysis using search\\ +% set(seed:73412305)\\ +% set(outgroup:"Limulus")\\ +% read(nucleotide:"chel\_16S.fas")\\ +% read(nucleotide:"chel\_cox1.fas")\\ +% search(minutes:20)\\ +% report("chel\_search2.csv", search, overwrite)\\ +% report("chel\_data2.csv", data, overwrite)\\ +% report("chel\_search2.tre", newick, graphs, overwrite)\\ +% \end{quote} +% +% \item +% \ phyg{search} should be the first analysis option for both new and +% expert users. Let's start with a very small data set, so that we can get +% some meaningful results with a very small amount of time: +% +% \item +% After this command finishes, you should see a message on screen telling +% you how many trees where built, how many fusing generations where +% performed, how many times the best tree was found, and what its score +% is. +% +%\item Save this file with the name \textbf{run1\_search.pg} in a directory \texttt{phygfiles} +%located on your Desktop. + +%\end{enumerate} + +\subsection{Calculating Supports: Goodman-Bremer} + +This method calculates Goodman-Bremer support values for input graphs. The +method traverses the SPR or TBR neighborhood, to determine an upper bound +on the NP-hard values. The arguments of \texttt{goodmanbremer} (\texttt{spr} or +\texttt{tbr}) are optional, but \phyg will, by default, traverse the TBR neighborhood. +The command \texttt{report(support)} labels the edges with Goodman-Bremer support +values. + +\begin{enumerate} + + \item Using your text editor of choice, make a new script to generate the + Goodman-Bremer support values for our newick tree from \textbf{run1.pg}. + + \begin{quote} + -\/-Goodman-Bremers for chel\_run1\\ + set(seed:73412305) \\ + set(outgroup:"Limulus") \\ + read(nucleotide:"chel\_16S.fas") \\ + read(nucleotide:"chel\_cox1.fas") \\ + read(newick:"chel\_run1.tre") \\ + support(gb:spr, gbsample:10000)\\ + report("chel\_run1\_GB.dot", support, dotpdf, overwrite) + \end{quote} + +\item Save this file in the same directory as your tree file, i.e. \texttt{run1}. +We will name this file \texttt{run1\_GBs.pg}. + +\item Execute the script: + + \begin{quote} + phyg run1\_GBs.pg + \end{quote} + +\item Examine the reported files. Goodman-Bremer values are present +over the branches. + +\end{enumerate} + +%\subsection{Calculating Supports: Bootstrap} + +%\subsection{Calculating Supports: Jackknife} + + +%\printindex + +\end{document} diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/crossrefs.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/crossrefs.png new file mode 100644 index 000000000..ee531d462 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/crossrefs.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/data.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/data.png new file mode 100644 index 000000000..3796d4dcc Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/data.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps1.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps1.png new file mode 100644 index 000000000..f0cc25884 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps1.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps2.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps2.png new file mode 100644 index 000000000..bb65b0f20 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps2.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps3.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps3.png new file mode 100644 index 000000000..b91728903 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps3.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps4.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps4.png new file mode 100644 index 000000000..576f6f24c Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps4.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps5.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps5.png new file mode 100644 index 000000000..698bc3793 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps5.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps6.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps6.png new file mode 100644 index 000000000..2167571ed Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/eps6.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output1.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output1.png new file mode 100644 index 000000000..73f478610 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output1.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output2.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output2.png new file mode 100644 index 000000000..15ad43857 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output2.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output3.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output3.png new file mode 100644 index 000000000..ebcd03230 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output3.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output4.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output4.png new file mode 100644 index 000000000..b6b8760c4 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output4.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output5.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output5.png new file mode 100644 index 000000000..9985a5b93 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/output5.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/phyg_tutorial_2.tex b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/phyg_tutorial_2.tex new file mode 100644 index 000000000..ff86d40a0 --- /dev/null +++ b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/phyg_tutorial_2.tex @@ -0,0 +1,834 @@ +\documentclass[11pt]{article} +\usepackage{longtable} +\usepackage{color} +\usepackage{tabu} +\usepackage{setspace} +\usepackage{pdflscape} +\usepackage{graphicx} +\usepackage {float} +%\usepackage{subfigure} +\usepackage{caption} +\usepackage{subcaption} +\usepackage{natbib} +\usepackage{fullpage} +\bibliographystyle{plain} +%\bibliographystyle{cbe} +\usepackage{algorithmic} +\usepackage[vlined,ruled]{algorithm2e} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage[T1]{fontenc} +\usepackage{url} + +\usepackage[dvipsnames]{xcolor} +\usepackage{color, soul} +\usepackage[colorlinks=true, linkcolor=blue, citecolor=DarkOrchid, urlcolor=TealBlue ]{hyperref} +%\usepackage[nottoc,numbib]{tocbibind} +\usepackage{tocloft} + + +\setlength\itemindent{0.25cm} + +\newcommand{\phyg}{\texttt{PhyG} } +\newcommand{\BigO}[1]{\ensuremath{\mathcal{O}\left(\,#1\,\right)}\xspace} + +\title{PhyG 0.3 Tutorials} + +\author{Louise M. Crowley} +\makeindex +\begin{document} +\maketitle + +\section{\phyg Tutorials} + +These tutorials provide guidance for performing network analyses using the +phylogenetic program \texttt{PhyG}. In addition to illustrating the vertical transfer +of information between ancestor-descendant lineages, phylogenetic networks +can convey information about reticulation events such as horizontal gene transfer, +hybridization and introgression between lineages. There are two main types of +networks---`softwired' and `hardwired'. When displayed, these networks main +appear similar, however, they represent different interpretations of the meaning +of phylogenetic edges. + +A `softwired' network is a summary of a set of individual `display' trees or `tree-based +networks', which have been generated by removing edges from the network. +In `softwired' networks, individual characters have single parents, as opposed +to those in `hardwired' networks, where characters can have multiple parents +\citep{KannanandWheeler2012}. + +The user is advised to work through Tutorial 0.1 prior to attempting this tutorial, +as this document provides instructions on obtaining and installing \texttt{PhyG}, +as well as making and executing scripts. As in previous tutorials, each tutorial +contains a \phyg script that includes detailed commentaries explaining the +rationale behind each step of the analysis. The values of arguments herein have +been chosen such that the analysis can complete within the timeframe of this +session. Therefore, the values used here should not be taken to be optimal +parameters. + +These tutorials use sample datasets that can also be downloaded from the +\texttt{PhyG} \href{https://github.com/amnh/PhyGraph}{GitHub} website. Move +these data files to a directory on your Desktop called \textbf{phygfiles}. The +minimally required items to run the tutorial analyses are the \phyg application +and sample data files. Running these analyses requires some familiarity with +the \phyg command structure, to which a complete guide can be found +\href{https://github.com/amnh/PhyGraph}{here}. + +%------------------------------------------------------------------------------------------------------- +\subsection{Making a script and inspecting the data} +\label{subsec:networkscript} + +In this tutorial, you will generate the initial script for a `softwired' network analysis +and inspect the inputted data. + +\begin{enumerate} + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-building networks for softwired tests using flu datasets\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:nopenalty)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + report("flu\_net1\_data.csv", data, overwrite)\\ + report("flu\_net1\_cr.csv", crossrefs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this analysis. +Recall that comments are prepended with `-{}-' and can span multiple lines, provided +each line begins with `-{}-'. \phyg will ignore any commented lines.\\ + +In the next four lines, we change the settings of \texttt{PhyG}. All \texttt{set} +commands are executed at the start of a run, irrespective of where they appear +in the script. + +\item First, we \texttt{set} the seed for the random number generator to the +integer value 1634561640. By setting this value, we are guaranteed to reproduce +a given search trajectory each time the script is run. This is the case even when the +operations are randomized, as is the case when using the argument \texttt{atrandom}. + +\item Next, the outgroup for the analysis is \texttt{set} to +\emph{1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002}. If the outgroup is not +\texttt{set}, the default outgroup is the taxon whose name is lexically first after any +renaming of taxa, and/or if taxa were specified by using the arguments \texttt{include} +or \texttt{exclude}. Only a single taxon can be \texttt{set} as the outgroup of the analysis. +Recall that taxon names cannot have spaces, otherwise the names can be incorrectly +interpreted by the program. + +\item We next \texttt{set} the \texttt{graphtype} of this analysis. \phyg allows for +the input, analysis of and output of a broader class of phylogenetic graphs. These +include trees, forests and both `softwired' and `hardwired' networks. The current +choices are \texttt{tree} (the default), \texttt{hardwired} and \texttt{softwired}. We +\texttt{set} the \texttt{graphtype} to \texttt{softwired}. + +\item When conducting a network analysis, a penalty can be ascribed, so that +`softwired' phylogenetic networks can compete equally with phylogenetic trees on +a parsimony optimality basis. A network penalty takes into account the change in +cost as edges are added to the graph. Current choices include \texttt{nopenalty}, +\texttt{w15} and \texttt{w23}. In general, \texttt{w23} is a more severe penalty than +\texttt{w15}. For the generation of an initial network for further analysis, we \texttt{set} +the \texttt{graphfactor} to \texttt{nopenalty}. Note: assigning \texttt{nopenalty} +is useful for the generation of graphs for further refinement, however it is unlikely +to be a reasonable penalty for `real' analyses. + +\item Change to the directory where the downloaded data files are located by using +the \texttt{cd} command, as in: + + \begin{quote} + cd ~/Desktop/phygfiles + \end{quote} + +By typing \texttt{ls} you will see that this directory contains twelve files in fasta format. +A primary requirement of this type of analysis is to have a minimum of two blocks of +data. The command \texttt{read} imports our data files. Rather than type in the name +of each of these files in our script, we will use wildcards (*) to capture all these files: + + \begin{quote} + read(nucleotide:"flu*.fas*", tcm:(2,1)\\ + \end{quote} + +Filenames must include the suffix (e.g. .fas, .fasta, .fastc, .ss, .tre). Note: in this case, +the wildcards capture files ending in .fasta and .fas. Failure to include these suffices +will result in the error ``File(s) not found in `read' command''. The filename must match +\textit{exactly}, including capitalization. We indicate that the files contain IUPAC +\texttt{nucleotide} sequence data in fasta format. The argument \texttt{tcm} refers to +the transformation cost matrix. The first integer specifies the substitution cost and the +second integer value defines the indel cost. By default, the cost of both are set to 1. + +\item Having read in our data, it is advisable to verify that the files were properly +parsed. We \texttt{report} a \texttt{crossrefs} and \texttt{data} files, which allows +us to output information concerning the characters and terminals in our data files. + +\item Save this file with the name \textbf{``flu\_net1.pg''} in a directory \texttt{phygfiles} +located on your Desktop. + +\item Run the script by typing the following: + + \begin{quote} + phyg flu\_net1.pg + \end{quote} + +Notice the output in the \textit{Terminal} window. + +\item Examine the reported files \textbf{``flu\_net1\_data.csv''} and +\textbf{``flu\_net1\_cr.csv''}. The \texttt{data} file summarizes information relating +to the input data (number of terminals, number of input files, number of character +blocks and the total number of characters). The \texttt{crossrefs} file provides a +comprehensive visual overview of the completeness of the data. Together these +files indicate that data has been input for nine terminal taxa, across 12 data +blocks. The \texttt{crossrefs} file highlights that data in two of the files is missing +for three taxa. + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Generating a network from trees} +\label{subsec:makinganetwork} + +The first step in the analysis of networks is to generate one. Having imported and +inspected our data, we are now ready to build the initial graphs. In this tutorial, you +will learn how to generate a network from trees using \texttt{PhyG}. + +\begin{enumerate} + +\item Modify the script to include \texttt{build(distance, rdwag, block, graph, eun, +replicates: 1000)}. By specifying \texttt{block}, \phyg performs independent builds +for each ``block'' of data. If this option is not specified, the builds are performed +combining all the data. This command causes a pairwise distance matrix to be +calculated ($\BigO n^2$) that is subsequently used as a basis for distance tree +construction. \texttt{distance} trees are constructed on the \texttt{block}s of data +by performing random addition sequence distance Wagner builds (\texttt{rdWag}), +yielding multiple trees (\texttt{n}), as determined by the argument \texttt{replicates}. +The resulting graphs are reconciled using the \texttt{eun} argument, turning them +into a network. \texttt{eun} reconciles block trees into a Edge-Union-Network +\citep{MiyagiandWheeler2019, Wheeler2022}. + +\item We will want to examine the resulting graphs. Graphs are not reported as +output in the \textit{Terminal} window and must be directed to a file. Modify the +script to output graph files with \texttt{report("flu\_net1.tre", graphs, newick, +overwrite)} and \texttt{report("flu\_net1\_gv.tre", dotpdf, graphs, overwrite)}. + + \begin{quote} + -\/-building networks for softwired tests using flu datasets\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:nopenalty)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + build(distance, rdwag, block, graph, eun, replicates:1000)\\ + report("flu\_net1\_data.csv", data, overwrite)\\ + report("flu\_net1\_cr.csv", crossrefs, overwrite)\\ + report("flu\_net1.tre", graphs, newick, overwrite)\\ + report("flu\_net1\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item Run the script. + +\item Notice the output in the \textit{Terminal} window. Following the \texttt{build} +you'll see the warning: ``Chained network nodes (both parent and child nodes are +indegree > 1), removing edges to tree node parents (this may affect graph cost)''. +This warning indicates that a pair of network vertices have violated the requirement +\citep{Moretetal2004} that network vertices cannot be connected by a single edge. +The graph has been modified or corrected prior to further refinement or examination. + +\item Examine the reported graph files. Because the file \textbf{``flu\_net1.tre''} +represents a phylogenetic network (as opposed to tree), this file can not be viewed +using tree viewing programs such as \textit{FigTree} or \textit{TreeView}. Using +your preferred text editor, open this file (Figure \ref{tre1}). This looks like a standard +newick tree file, in parenthetical notation, however, notice the addition of four +\textbf{\#}'s, which represent reticulation events. This is an ENewick file +\citep{Cardonaetal2008}. The values associated with the taxon names and HTUs +are the branch lengths. The cost of the graph(s) can be found in square brackets, +at the end of each graph. In this analysis, \phyg returned a single network with a +cost of 13976. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{tre1.png} +\caption{Output file \textbf{``flu\_net1.tre''} in ENewick graph format.} +\label{tre1} +\end{figure} + +\item The command \texttt{report("filename.tre", dotpdf, graphs, overwrite)} will +produce a file that can be read in \textit{Adobe Illustrator}, \textit{Apple Preview} +or any vectorial image editor program. Notice that two files were outputted from +using this command. \phyg has output an eps (on OSX) or pdf (on linux) file that +can be viewed in a vector graphics program and a dot file, which can be viewed +(and modified) in \textit{Graphviz}. Note: in order to output pdf files the application +\textit{dot} must be installed from the \href{https://graphviz.org/download/}{Graphviz} +website. \textit{dot} is a graph description language and Graphviz an open-source +graph visualization software. This program is well suited to representing graphs +and networks. Open the reported file \textbf{``flu\_net1\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps1}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see two +networks (one leading into HTU14 and HTU15). + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps1.png} +\caption{Output file \textbf{``flu\_net1\_gv.tre.eps''} in eps format.} +\label{eps1} +\end{figure} + +\end{enumerate} +%------------------------------------------------------------------------------------------------------- +\subsection{Refining a softwired network: adding network edges} +\label{subsec:netadd} + +Now that we have a network to work with, we can do some refinement operations +on the network edges. In the next five tutorials we will learn how to perform +manipulation of network edges to existing network graphs. This tutorial will +focus on adding network edges. + +\begin {enumerate} + +\item Rather than modify the previous script (possible to do with comments and +additional text), we will generate a new script to perform the refinements of our +first network. + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-Refinements of softwired networks using flu data sets using netadd\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:w15)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + read(newick:"flu\_net1.tre")\\ + refine(netadd, maxnetedges:5, atrandom, steepest)\\ + report("flu\_net2.tre", graphs, newick, overwrite)\\ + report("flu\_net2\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this +analysis. + +\item The next three lines are identical to that of \textbf{``flu\_net1.pg''}, so we +will not comment about them any further. + +\item Unlike the previous script, here, we \texttt{set} the \texttt{graphfactor} to +\texttt{w15}. This penalty involves the calculation of the most parsimonious +display tree. `Softwired' networks with no defined penalty, monotonically +reduce the network cost with each additional network edge. A display tree is +a tree created from a `softwired' graph by deleting one of two of the network +edges incident on each network vertex. Because this penalty has a higher +time complexity to calculate the total network costs (than \texttt{w23} and +\texttt{nopenalty}), this may take significantly more time for larger datasets. + +\item In addition to importing the nucleotide sequence data files, we also +read in the \texttt{newick} graph file \textbf{``flu\_net1.tre''} generated from the +previous tutorial (Section \ref{subsec:makinganetwork}). Reading in tree or graph +files, rather than building them from scratch, can be a useful starting point and +can speed up analyses. + +\item Having \texttt{read} in our data and graph files, we can now perform refinements +of the network. The command \texttt{refine} performs edit operations on network +edges. The network specific refinement operations include \texttt{netadd}, +\texttt{netdel}, \texttt{netadddel} and \texttt{netmove}. These refinements are +only applicable to network edges, hence they can only be applied to `softwired' +and `hardwired' graphs (as opposed to trees). \\ + +The argument \texttt{netadd} adds network edges to input graphs at all possible +positions until no better cost graph is found. Though no need to specify here as +they are the default arguments of \texttt{refine}, we specify \texttt{atrandom} and +\texttt{steepest}. \texttt{atrandom} randomizes the evaluation of the networks and +the order by which they are generated. \texttt{steepest} specifies that if a better +network is found during the network refinement operations then the refinements +will switch greedily to this graph and abandon the previous network. The argument +\texttt{maxnetedges:n} specifies that network edges can only be added to networks +with the number of edges less that the specified integer \texttt{n}. This can only be +used in conjunction with \texttt{netadd} and \texttt{netadddel}. + +\item Save this file with the name \textbf{``flu\_net2.pg''} in the same directory as the +data files and previously reported graph files. + +\item Run the script. + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{output1.png} +\caption{Output of the \textit{Terminal} window from running the script +\textbf{``flu\_net2.pg''}.} +\label{output1} +\end{figure} + +\item Scroll through the output in the \textit{Terminal} window (Figure \ref{output1}). +Having \texttt{read} in the data files, they were then processed or transformed using +the \texttt{tcm} information. The blocks of characters (in this case the data files) were +recoded in that \phyg is processing the data. Then the settings of \phyg were \texttt{set}. +Note: the cost of the input graph \textbf{``flu\_net1.tre''} has increased from 13976 to +14276.8125. Recall that this graph was generated under \texttt{nopenalty} costs and +here the penalty is \texttt{set} to \texttt{w15}. Therefore, when this graph is read in, it +is rediagnosed with the new penalty cost. \phyg then performed 3 rounds of adding +network edges, examining a decreasing number of candidate edge pairs after each +round. As you add network edges the number of possible new edges will go down +(in most cases) because of the constrains of phylogenetic networks. In each round +\phyg added a network edge. \phyg stopped after three rounds as we specified +\texttt{maxnetedges:5}. Notice the reduction in network cost after each round. Note: +the message ``Warning: Time consistency error'' is an internal processing error and +can be ignored.\\ + +Let's examine the reported graph files. + +\item Opening the file \textbf{``flu\_net2.tre''} in your preferred text editor (Figure +\ref{tre2}). This ENewick file has ten \textbf{\#}'s, representing the reticulation events. +Comparing this graph to \textbf{``flu\_net1.tre''} (our input graph), we can see that +the addition of three additional network edges, reduced the cost of the network from +14276.8125 to 13528.25. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{tre2.png} +\caption{Output file \textbf{``flu\_net2.tre''} in ENewick graph format.} +\label{tre2} +\end{figure} + +\item Open the reported file \textbf{``flu\_net2\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps2}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see five +networks (one leading into HTU19, HTU24, HTU26, HTU22 and HTU16). A single +graph was reported. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps2.png} +\caption{Output file \textbf{``flu\_net2\_gv.tre.eps''} in eps format.} +\label{eps2} +\end{figure} + +\item We can apply some of the commands that we learned in Tutorial 0.1 to +these tutorials. As we learned previously, another useful way to visualize the data +is to \texttt{report} the implied alignment of the molecular data currently loaded. +The implied alignment is a visual representation (alignment) of the indel and +substitution events that take place on a \textit{given} graph. \\ + +Modify the script to include the following: + +\begin{quote} +report("flu\_net2\_ia.txt", ia, overwrite) +\end{quote} + +This command will output an implied alignment for each reported graph, as well as +for each block of imported data. Should you wish to output an implied alignment +that is readable by the program \textit{TNT}, you should modify the command +replacing \texttt{ia} with the argument \texttt{tnt}. + +\begin{quote} +report("flu\_net2.ss", tnt, overwrite) +\end{quote} + +\end{enumerate} + +%------------------------------------------------------------------------------------------------------- +\subsection{Refining a softwired network: deleting network edges} +\label{subsec:netdel} + +This tutorial will focus on deleting network edges. + +\begin {enumerate} + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-Refinements of softwired networks using flu data sets using netdel\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:w15)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + read(newick:"flu\_net1.tre")\\ + refine(netdel, atrandom, steepest)\\ + report("flu\_net3.tre", graphs, newick, overwrite)\\ + report("flu\_net3\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this +analysis. + +\item The next six lines are identical to that of \textbf{``flu\_net1.pg''}, so we +will not comment about them any further. + +\item Having \texttt{read} in our data and graph files, we can now perform +refinements of the network. The argument \texttt{netdel} deletes network edges +from input graphs one at a time until no better cost graph is found. We also +specify that this edit operation be performed using \texttt{atrandom} and +\texttt{steepest}. + +\item Save this file with the name \textbf{``flu\_net3.pg''} in the same directory +as the data files and previously reported graph files. + +\item Run the script. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{output2.png} +\caption{Output of the \textit{Terminal} window from running the script +\textbf{``flu\_net3.pg''}.} +\label{output2} +\end{figure} + +\item Scroll through the output in the \textit{Terminal} window (Figure \ref{output2}). +Having \texttt{read} in the data files, they were then processed and the settings of +\phyg were \texttt{set}. The output for \texttt{netdel} looks different to that of +\texttt{netadd}---the recursive output is not shown---as \texttt{netdel} is not dependent +on the maximum number of edges, as is the case with \texttt{netadd}. \phyg performed +2 rounds of deleting network edges, but it could only delete one network edge.\\ + +Let's examine the reported graph files. + +\item Opening the file \textbf{``flu\_net3.tre''} in your preferred text editor. This +ENewick file has two \textbf{\#}'s, representing the reticulation events (Figure +\ref{tre3}). Comparing this graph to \textbf{``flu\_net1.tre''} (our input graph), we can +see that the deletion of one network edge, reduced the cost of the network from +14276.8125 to 14197.15625. A single graph was reported. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{tre3.png} +\caption{Output file \textbf{``flu\_net3.tre''} in ENewick graph format.} +\label{tre3} +\end{figure} + +\item Open the reported file \textbf{``flu\_net3\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps3}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see a single +network leading to HTU15. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps3.png} +\caption{Output file \textbf{"flu\_net3\_gv.tre.eps"} in eps format.} +\label{eps3} +\end{figure} + +\item In this tutorial we read in the input graph \textbf{``flu\_net1.tre''}. What +happens if instead we read in \textbf{``flu\_net2.tre''}, our graph from our +\texttt{netadd} operation? \\ + +Modify the script using comments and new output file names. + +\item Rerun the script. + +\item Answer the following questions: +\subitem Does this change the number of resulting networks? +\subitem How does changing the input graph affected the cost of the graph? + +\end{enumerate} +%------------------------------------------------------------------------------------------------------- +\subsection{Refining a softwired network: adding and deleting network edges} +\label{subsec:netadddel} + +This tutorial will focus on consecutively adding and deleting network edges. + +\begin {enumerate} + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-Refinements of softwired networks using flu data sets using netadddel\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:w15)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + read(newick:"flu\_net1.tre")\\ + refine(netadddel, maxnetedges:5, atrandom, steepest)\\ + report("flu\_net4.tre", graphs, newick, overwrite)\\ + report("flu\_net4\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this +analysis. + +\item The next six lines are identical to that of \textbf{``flu\_net1.pg''}, so we +will not comment about them any further. + +\item Having \texttt{read} in our data and graph files, we can now perform +refinements of the network. The argument \texttt{netadddel} consecutively +adds and then deletes network edges from input graphs until certain conditions +are met. In this case, until either no improvement (graph cost) is found +in a round or until the number of rounds of addition and deletions (in \texttt{rounds:n}) +is reached. The maximum number of edges is still specified by \texttt{maxnetedges:n} +within each round. We also specify that this edit operation be performed using +\texttt{atrandom} +and \texttt{steepest}. + +\item Save this file with the name \textbf{``flu\_net4.pg''} in the same directory +as the data files and previously reported graph files. + +\item Run the script. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{output3.png} +\caption{Output of the \textit{Terminal} window from running the script +\textbf{``flu\_net4.pg''}.} +\label{output3} +\end{figure} + +\item Scroll through the output in the \textit{Terminal} window (Figure \ref{output3}). +Having \texttt{read} in the data files, they were then processed and the settings of +\phyg were \texttt{set}. The recursive output is shown. \phyg performed a single +round of addition, followed by deletion operations of network edges. The cost of +the graph continues to decrease with each edit operation a network edge. A single +round is the default for \texttt{netadddel}.\\ + +Let's examine the reported graph files. + +\item Opening the file \textbf{``flu\_net4.tre''} in your preferred text editor. This +ENewick file has eight \textbf{\#}'s, representing the reticulation events (Figure +\ref{tre4}). Comparing this graph to \textbf{``flu\_net1.tre''} (our input graph from +Section \ref{subsec:makinganetwork}), we can see that the deletion of one network +edge, reduced the cost of the network from 14276.8125 to 13293.187. A single graph +was reported. + +\begin{figure}[H] +\centering +\includegraphics[width=0.8\textwidth]{tre4.png} +\caption{Output file \textbf{``flu\_net4.tre''} in ENewick graph format.} +\label{tre4} +\end{figure} + +\item Open the reported file \textbf{``flu\_net4\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps4}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see a four +networks leading to HTU22, HTU20, HTU14 and HTU24. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps4.png} +\caption{Output file \textbf{"flu\_net4\_gv.tre.eps"} in eps format.} +\label{eps4} +\end{figure} + +\item In this tutorial we used the default setting for \texttt{netadddel}, which +included performing a single round of edit operations. What would happen +if we increase the number of rounds of \texttt{netadddel} operations that \phyg +performs? \\ + +Modify the script as follows: + + \begin{quote} + refine(netadddel, maxnetedges:5, atrandom, steepest, rounds:2) + \end{quote} + +\item Rerun the script. + +\item Answer the following questions: +\subitem Does this change the number of resulting networks? +\subitem How does increasing the number of operation rounds affected the +cost of the tree? +\subitem How does this cost compare to that of the graphs from the previous +tutorial (Section \ref{subsec:netdel})? + +\end{enumerate} +%------------------------------------------------------------------------------------------------------- +\subsection{Refining a softwired network: moving network edges} +\label{subsec:softnetmove} + +This tutorial focuses on moving network edges. + +\begin {enumerate} + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-Refinements of softwired networks using flu data sets using netmove\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:softwired)\\ + set(graphfactor:w15)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + read(newick:"flu\_net1.tre")\\ + refine(netmove, atrandom, steepest)\\ + report("flu\_net5.tre", graphs, newick, overwrite)\\ + report("flu\_net5\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this +analysis. + +\item The next six lines are identical to that of \textbf{``flu\_net1.pg''}, so we +will not comment about them any further. + +\item Having \texttt{read} in our data and graph files, we can now perform +refinements of the network. The argument \texttt{netmove} moves existing +network edges in input graphs one at a time to new positions until there +are no more improvements in the cost of the graph. We also specify that +this edit operation be performed using \texttt{atrandom} and \texttt{steepest}. + +\item Save this file with the name \textbf{``flu\_net5.pg''} in the same directory +as the data files and previously reported graph files. + +\item Run the script. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{output4.png} +\caption{Output of the \textit{Terminal} window from running the script +\textbf{``flu\_net5.pg''}.} +\label{output4} +\end{figure} + +\item Scroll through the output in the \textit{Terminal} window (Figure \ref{output4}). +Having \texttt{read} in the data files, they were then processed and the settings of +\phyg were \texttt{set}. \phyg performed three round of moves of network edges. +We can see that \phyg did three full rounds of moves, with the cost decreasing +with each move.\\ + +Let's examine the reported graph files. + +\item Opening the file \textbf{``flu\_net5.tre''} in your preferred text editor. This +ENewick file has four \textbf{\#}'s, representing the vertices (Figure \ref{tre5}). +Comparing this graph to \textbf{``flu\_net1.tre''} (our input graph), we can see +that \phyg the moves reduced the cost of the network from 14276.8125 to +13135.46875. A single graph was reported. With \texttt{netmove} the number +of network edges/vertices is maintained between the input and output graphs. + +\begin{figure}[H] +\centering +\includegraphics[width=0.8\textwidth]{tre5.png} +\caption{Output file \textbf{``flu\_net5.tre''} in ENewick graph format.} +\label{tre5} +\end{figure} + +\item Open the reported file \textbf{``flu\_net5\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps5}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see four +network edges leading to HTU18 and HTU20. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps5.png} +\caption{Output file \textbf{"flu\_net5\_gv.tre.eps"} in eps format.} +\label{eps5} +\end{figure} + +\item One of the output option for `softwired' networks is to report the `display' +trees. Recall that a `display' tree or `tree-based network' is a tree created from +a `softwired' graph by deleting one of two of the network edges/vertices. Modify +the script to include \texttt{report("flu\_net5\_dt.tre", displaytrees, newick, +overwrite)}. + +\item Run the updated script. + +\item Examine the reported file. For now `display' trees can only be reported +in newick format (as opposed to dotpdf and eps format---the error lies with \textit{dot} +on OSX machines). This newick tree file, in parenthetical notation, can be viewed in +other programs like \textit{FigTree} or \textit{TreeView}. + +%\item In Tutorial 0.1 we learned how to perform a local search on the graphs stored +%in memory. Let's do that here. Modify the script to include \texttt{swap(spr:3)}. +%This command specifies that SPR refinement \citep{Dayhoff1969} is performed. +%Because we specify the optional argument \texttt{n}, the readdition of pruned graphs +%will be within $2 * N$ edges of its original placement. +% +%\item Run the modified script. +% +%\item Now reexamine the reported file \textbf{``flu\_net5.tre''}. Notice that this +%simple local search has reduced the cost of the initial best tree from 13135.46875 +%to \hl{xxx}. + +\end{enumerate} +%------------------------------------------------------------------------------------------------------- +\subsection{Hardwired network: moving network edges} +\label{subsec:hardnetmove} + +So far, the tutorials have centered on `softwired' graphs. In this tutorial, we will +focus of `hardwired' graphs. + +\begin {enumerate} + +\item Open your text editor of choice and type the following: + + \begin{quote} + -\/-Refinements of hardwired networks using flu data sets using netmove\\ + set(seed:1634561640)\\ + set(outgroup:"1466\_H7N2\_Avian\_chicken\_pa\_1490921\_2002")\\ + set(graphtype:hardwired)\\ + read(nucleotide:"flu*.fas*", tcm:(2,1))\\ + read(newick:"flu\_net1.tre")\\ + refine(netmove, atrandom, steepest)\\ + report("flu\_net6.tre", graphs, newick, overwrite)\\ + report("flu\_net6\_gv.tre", dotpdf, graphs, overwrite) + \end{quote} + +\item The script begins with a comment that describes the purpose of this +analysis. + +\item The next two lines are identical to that of \textbf{``flu\_net1.pg''}, so we +will not comment about them any further. + +\item We next \texttt{set} the \texttt{graphtype} of this analysis to \texttt{hardwired}. + +\item When conducting a `hardwired' network analysis, there is no need to \texttt{set} +a \texttt{graphfactor} or penalty cost. Unlike `softwired networks', which discussed, +monotonically reduces the network cost with each additional network edge (when +there's no ascribed penalty), +the cost of `hardwired' networks monotonically increases with the addition of each +additional network edge. By extension, the number of network edges is kept constant +with \texttt{netmove}, so there is no need to ascribe a penalty cost. + +\item In addition to importing the nucleotide sequence data files, we also read in +the \texttt{newick} graph file \textbf{``flu\_net1.tre''}. + +\item The argument \texttt{netmove} moves network edges on the input graphs +to all possible positions until no better cost graph is found. We also specify +\texttt{atrandom} and \texttt{steepest}. + +\item Rerun the script. + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{output5.png} +\caption{Output of the \textit{Terminal} window from running the script +\textbf{``flu\_net6.pg''}.} +\label{output5} +\end{figure} + +\item Scroll through the output in the \textit{Terminal} window (Figure \ref{output5}). +Having \texttt{read} in the data files, they were then processed and the settings of +\phyg were \texttt{set}. Notice the cost of the input graph \textbf{``flu\_net1.tre''} is +different from when it is analyzed as a `softwired' network. The cost of the graph +changes when it is diagnosed as a `hardwired' network. \phyg performed three +round of moves of network edges. We can see that \phyg did four full rounds of +moves, reduced the cost of the network from 18540 to 15611. The cost of the graph +decreased with each move. Comparing the cost of the output graph to that found in +(Section \ref{subsec:softnetmove}), we can see that cost of the `softwired' graph is +14276.8125 while that of the `hardwired' graph is 15611.\\ + +Let's examine the reported graph files. + +\item Opening the file \textbf{``flu\_net6.tre''} in your preferred text editor. This +ENewick file has four \textbf{\#}'s, representing the vertices (Figure \ref{tre6}). +A single graph was reported. With \texttt{netmove} the number of network +edges/vertices is maintained between the input and output graphs. + +\begin{figure}[H] +\centering +\includegraphics[width=0.8\textwidth]{tre6.png} +\caption{Output file \textbf{``flu\_net6.tre''} in ENewick graph format.} +\label{tre6} +\end{figure} + +\item Open the reported file \textbf{``flu\_net6\_gv.tre.eps''} in your preferred +visualization program (Figure \ref{eps6}). The values associated with the taxon +names and HTUs are the branch lengths. Examining this file, we can see four +network edges leading to HTU20 and HTU18. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{eps6.png} +\caption{Output file \textbf{"flu\_net6\_gv.tre.eps"} in eps format.} +\label{eps6} +\end{figure} + +\end{enumerate} +%------------------------------------------------------------------------------------------------------- + +%\printindex +\bibliography{/Users/louise/DropboxAMNH/big-refs-3.bib} +\end{document} diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre1.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre1.png new file mode 100644 index 000000000..38d1703db Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre1.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre2.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre2.png new file mode 100644 index 000000000..82d1522dc Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre2.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre3.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre3.png new file mode 100644 index 000000000..60804959c Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre3.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre4.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre4.png new file mode 100644 index 000000000..6d8ea695c Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre4.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre5.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre5.png new file mode 100644 index 000000000..844bf2055 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre5.png differ diff --git a/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre6.png b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre6.png new file mode 100644 index 000000000..f6c62a952 Binary files /dev/null and b/tutorials/tutorial_2/Tutorial_2_doc/Tutorial_doc/tre6.png differ