diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a619f4bb..9a5354027 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.12) project(HyPhy) cmake_policy(VERSION 3.0.0) -#cmake_policy(SET CMP0026 OLD) -#cmake_policy(SET CMP0004 OLD) enable_testing() @@ -14,6 +12,7 @@ set(CMAKE_CONFIGURATION_TYPES Release) option(NOAVX OFF) option(NOSSE3 OFF) +option(NONEON OFF) #------------------------------------------------------------------------------- # SSE MACROS @@ -44,6 +43,34 @@ macro(PCL_CHECK_FOR_SSE3) HAVE_SSE3_EXTENSIONS) endmacro(PCL_CHECK_FOR_SSE3) +#------------------------------------------------------------------------------- +# NEON MACROS +#------------------------------------------------------------------------------- + +macro(PCL_CHECK_FOR_NEON) + include(CheckCXXSourceRuns) + set(CMAKE_REQUIRED_FLAGS) + + #if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) + set(CMAKE_REQUIRED_FLAGS "-msse3") + #endif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANG) + + check_cxx_source_runs(" + #include + #include + int main() { + double array[4] = {4.,2.,3.,4}; + float64x2_t a1 = vld1q_f64 (array), + a2 = vld1q_f64 (array+2); + a1 = vmulq_f64 (a1,a2); + vst1q_lane_f64 (array, a1, 0); + printf (\"%g\", array[0]); + return 0; + }" + HAVE_NEON_EXTENSIONS) +endmacro(PCL_CHECK_FOR_NEON) + + #------------------------------------------------------------------------------- # SSE MACROS #------------------------------------------------------------------------------- @@ -219,6 +246,13 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") endif(${HAVE_SSE3_EXTENSIONS}) endif (${HAVE_AVX_EXTENSIONS}) endif(NOAVX) + + if(NOT NONEON) + PCL_CHECK_FOR_NEON() + if(${HAVE_NEON_EXTENSIONS}) + add_definitions (-D_SLKP_USE_ARM_NEON) + endif(${HAVE_NEON_EXTENSIONS}) + endif(NOT NONEON) set_property( SOURCE ${SRC_CORE} ${SRC_NEW} ${SRC_UTILS} ${SRC_UNIXMAIN} @@ -503,11 +537,13 @@ set_target_properties( HYPHYDEBUG PROPERTIES if(${OPENMP_FOUND}) - COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} ${OpenMP_CXX_FLAGS} ${MPI_COMPILE_FLAGS} -g -fprofile-arcs -ftest-coverage -fsanitize=address -fsanitize=leak -O0 " - LINK_FLAGS "${DEFAULT_COMPILE_FLAGS} ${OpenMP_CXX_FLAGS} ${MPI_LINK_FLAGS} -g -fprofile-arcs -ftest-coverage -fsanitize=address -O0 " + #COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} ${OpenMP_CXX_FLAGS} ${MPI_COMPILE_FLAGS} -g -fsanitize=address -fsanitize=leak -O0 " + COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} ${OpenMP_CXX_FLAGS} ${MPI_COMPILE_FLAGS} -g -fsanitize=address -O0 " + LINK_FLAGS "${DEFAULT_COMPILE_FLAGS} ${OpenMP_CXX_FLAGS} ${MPI_LINK_FLAGS} -g -fsanitize=address -O0 " else(${OPENMP_FOUND}) - COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} -g -fprofile-arcs -ftest-coverage -fsanitize=address -fsanitize=leak -O0 " - LINK_FLAGS "${DEFAULT_COMPILE_FLAGS} -g -fprofile-arcs -ftest-coverage -fsanitize=address -O0 " + #COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} -g -fsanitize=address -fsanitize=leak -O0 " + COMPILE_FLAGS "${DEFAULT_COMPILE_FLAGS} ${DEFAULT_DEBUG_WARNING_FLAGS} -g -fsanitize=address -O0 " + LINK_FLAGS "${DEFAULT_COMPILE_FLAGS} -g -fsanitize=address -O0 " endif(${OPENMP_FOUND}) ) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf index 8eb11a654..cd3772a41 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf @@ -429,34 +429,42 @@ if (busted.do_srv_hmm) { //parameters.SetConstraint (((busted.test.bsrel_model[terms.parameters])[terms.global])[terms.rate_variation.hmm_lambda],"1e-6", ""); } - -parameters.DeclareGlobalWithRanges ("busted.bl.scaler", 1, 0, 1000); -busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, { - "retain-lf-object": TRUE, - terms.run_options.proportional_branch_length_scaler : - {"0" : "busted.bl.scaler"}, +debug.checkpoint = utility.GetEnvVariable ("DEBUG_CHECKPOINT"); + +if (Type (debug.checkpoint) != "String") { + parameters.DeclareGlobalWithRanges ("busted.bl.scaler", 1, 0, 1000); + busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, { + "retain-lf-object": TRUE, + terms.run_options.proportional_branch_length_scaler : + {"0" : "busted.bl.scaler"}, - terms.run_options.optimization_settings : - { - "OPTIMIZATION_METHOD" : "nedler-mead", - "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, - "OPTIMIZATION_PRECISION" : busted.nm.precision - } , + terms.run_options.optimization_settings : + { + "OPTIMIZATION_METHOD" : "nedler-mead", + "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, + "OPTIMIZATION_PRECISION" : busted.nm.precision + } , - terms.search_grid : busted.initial_grid, - terms.search_restarts : busted.N.initial_guesses -}); + terms.search_grid : busted.initial_grid, + terms.search_restarts : busted.N.initial_guesses + }); -busted.full_model = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.grid_search.results, busted.model_object_map, { - "retain-lf-object": TRUE, - terms.run_options.optimization_settings : - { - "OPTIMIZATION_METHOD" : "hybrid" - } + busted.full_model = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.grid_search.results, busted.model_object_map, { + "retain-lf-object": TRUE, + terms.run_options.optimization_settings : + { + "OPTIMIZATION_METHOD" : "hybrid", + //"OPTIMIZATION_PRECISION" : 1. + } - }); - + }); +} else { + ExecuteAFile (debug.checkpoint); + GetString (lf, LikelihoodFunction, 0); + busted.full_model = estimators.ExtractMLEs (lf, busted.model_object_map); + busted.full_model[terms.likelihood_function] = lf; +} KeywordArgument ("save-fit", "Save BUSTED model fit to this file (default is not to save)", "/dev/null"); @@ -464,6 +472,7 @@ io.SpoolLFToPath(busted.full_model[terms.likelihood_function], io.PromptUserForF io.ReportProgressMessageMD("BUSTED", "main", "* " + selection.io.report_fit (busted.full_model, 9, busted.codon_data_info[terms.data.sample_size])); +//VERBOSITY_LEVEL = 101; io.ReportProgressMessageMD("BUSTED", "main", "* For *test* branches, the following rate distribution for branch-site combinations was inferred"); diff --git a/res/TemplateBatchFiles/SelectionAnalyses/PRIME.bf b/res/TemplateBatchFiles/SelectionAnalyses/PRIME.bf index be6b8fad8..a64812e56 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/PRIME.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/PRIME.bf @@ -21,6 +21,9 @@ LoadFunctionLibrary("modules/io_functions.ibf"); LoadFunctionLibrary("modules/selection_lib.ibf"); + + + debug.checkpoint = utility.GetEnvVariable ("DEBUG_CHECKPOINT"); /*------------------------------------------------------------------------------ Display analysis information @@ -103,7 +106,11 @@ This table is meant for HTML rendering in the results web-app; can use HTML char is 'pop-over' explanation of terms. This is ONLY saved to the JSON file. For Markdown screen output see the next set of variables. */ -prime.table_screen_output = {{"Codon", "Partition", "alpha", "beta", "Property Description", "Importance", "p-value"}}; + +prime.site.composition.string = ""; + + +prime.table_screen_output = {{"Codon", "Partition", "alpha", "beta", "Property Description", "Importance", "p-value","Most common codon substitutions at this site"}}; prime.table_output_options = { terms.table_options.header : TRUE, terms.table_options.minimum_column_width: 12, terms.table_options.align : "center", @@ -112,9 +119,10 @@ prime.table_output_options = { terms.table_options.header : TRUE, "1" : 12, "2" : 12, "3" : 12, - "4" : 50, + "4" : 40, "5" : 12, - "6" : 12} + "6" : 12, + "7" : 70} }; @@ -129,6 +137,7 @@ prime.property_set = io.SelectAnOption ( { "Atchley":"Use the five properties derived from a factor analysis of 500 amino-acid properties [Table 2 in PNAS (2005) 102(18) 6395-6400 doi: 10.1073/pnas.0408677102]", "LCAP":"Use the five properties defined in the Conant and Stadler LCAP model [Mol Biol Evol (2009) 26 (5): 1155-1161. doi: 10.1093/molbev/msp031]", + "Random" : "Random properties (for null hypothesis testing)", "Custom":"Load the set of properties from a file" }, "The set of properties to use in the model"); @@ -155,6 +164,7 @@ for (_partition_, _selection_; in; prime.selected_branches) { } prime.pairwise_counts = genetic_code.ComputePairwiseDifferencesAndExpectedSites(prime.codon_data_info[terms.code], None); +prime.codon_table = genetic_code.DefineCodonToAAMapping (prime.codon_data_info[terms.code]); selection.io.startTimer (prime.json [terms.json.timers], "Model fitting",1); @@ -296,6 +306,9 @@ prime.lambda_range = { terms.upper_bound: "15" }; + + + prime.table_headers = { 6 + 3*prime.properties.N, 2}; prime.table_headers[0][0] = "alpha;"; prime.table_headers[0][1] = "Synonymous substitution rate at a site"; @@ -375,7 +388,8 @@ prime.report.significant_site = {{"" + (1+((prime.filter_specification[prime.rep Format(prime.report.row[1],7,3), // beta prime.property_report_name, // property name prime.rate_to_screen ( prime.property_report_name,,prime.report_rate), // property importance - Format(prime.report.row[prime.print_index],7,3) // property p-value + Format(prime.report.row[prime.print_index],7,3), // property p-value + Join ("|", prime.site.composition.string[prime.property_report_name]) }}; @@ -659,6 +673,9 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa } + ancestral_info = ancestral.build (lf_prop,0,FALSE); + branch_substitution_information = (ancestral.ComputeSubstitutionBySite (ancestral_info,0,None))[^"terms.substitutions"]; + DeleteObject (ancestral_info); alternative = estimators.ExtractMLEsOptions (lf_prop, model_mapping, {}); alternative [utility.getGlobalValue("terms.fit.log_likelihood")] = results[1][0]; @@ -691,6 +708,7 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa utility.getGlobalValue("terms.alternative") : alternative, utility.getGlobalValue("terms.Null"): Null, utility.getGlobalValue("terms.model.residue_properties") : constrained_models, + utility.getGlobalValue("terms.branch_selection_attributes") : branch_substitution_information, utility.getGlobalValue("terms.prime_imputed_states") : character_map }; } @@ -738,6 +756,71 @@ function prime.report.echo (prime.report.site, prime.report.partition, prime.rep lfunction prime.store_results (node, result, arguments) { + sub_profile = result[utility.getGlobalValue("terms.branch_selection_attributes")]; + + + + if (None != sub_profile) { + + total_sub_count = 0; + + + + sub_by_count = {"Overall" : {}}; + + for (p,v; in; ^"prime.properties") { + sub_by_count[p] = {}; + } + + for (i, v; in; sub_profile) { + for (j, b; in; v) { + c = Abs (b); + total_sub_count += c; + ai = (^"prime.codon_table")[i]; + aj = (^"prime.codon_table")[j]; + if (Abs (ai) == 0) {ai = "?";} + if (Abs (aj) == 0) {aj = "?";} + + + for (p, counts; in; sub_by_count) { + if ((counts/c)==0) { + counts [c] = {}; + } + + delta = ""; + if (p != "Overall") { + delta = "("+Format (Abs (((^"prime.properties")[p])[ai]-((^"prime.properties")[p])[aj]),0,2) + ")"; + } + + if (ai != aj) { + counts[c] + (((^"prime.codon_table")[i] + ">" + (^"prime.codon_table")[j]) + delta); + } else { + counts[c] + ai; + } + } + } + } + + sub_profile = {}; + for (p,count; in; sub_by_count) { + + sorted_subs = {Abs (count), 1}; + j = 0; + for (i, v; in; count) { + sorted_subs[j] = -(+i); + j += 1; + } + sub_profile1 = {}; + for (i; in; sorted_subs % 0) { + sub_profile1 + ("[" + (-i) + "]" + Join (",",count [-i])); + } + + sub_profile [p] = sub_profile1; + } + ^"prime.site.composition.string" = sub_profile; + } + + partition_index = arguments [3]; pattern_info = arguments [4]; diff --git a/res/TemplateBatchFiles/SelectionAnalyses/contrast-fel.bf b/res/TemplateBatchFiles/SelectionAnalyses/contrast-fel.bf index 152f7f866..6a6bf332e 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/contrast-fel.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/contrast-fel.bf @@ -666,12 +666,34 @@ function fel.apply_proportional_site_constraint (tree_name, node_name, alpha_par //---------------------------------------------------------------------------------------- lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, model_mapping, branch_sets, parameter_names, permutation) { - utility.ForEach (utility.Keys (branch_sets), "_node_", + + lengths_by_class = {}; + + for (_node_; in; utility.Keys (branch_sets)) { + _node_class_ = branch_sets[_node_]; + _beta_scaler = parameter_names[_node_class_]; + lengths_by_class[_node_class_] += fel.apply_proportional_site_constraint ("fel.site_tree", _node_, fel.alpha, fel.beta, fel.alpha.scaler, _beta_scaler, + (( (^"fel.final_partitioned_mg_results")[^"terms.branch_length"])[partition_index])[_node_]); + } + + /*utility.ForEach (utility.Keys (branch_sets), "_node_", '_node_class_ = (`&branch_sets`)[_node_]; _beta_scaler = (`¶meter_names`)[_node_class_]; fel.apply_proportional_site_constraint ("fel.site_tree", _node_, fel.alpha, fel.beta, fel.alpha.scaler, _beta_scaler, (( fel.final_partitioned_mg_results[terms.branch_length])[partition_index])[_node_]); - '); + ');*/ + + ignorable = {}; + for (fel.t; in; utility.Values (branch_sets)) { + if ( lengths_by_class[fel.t] == 0) { + ignorable [parameter_names[fel.t]] = 1; + } else { + ignorable [parameter_names[fel.t]] = 0; + } + } + + //console.log (lengths_by_class); + //console.log (ignorable); GetString (lfInfo, ^lf,-1); ExecuteCommands (filter_data); @@ -765,13 +787,15 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod ^ref_parameter = sum /denominator; + + if (testable == 1) { parameters.SetConstraint ((^"fel.scaler_parameter_names")[^"terms.tree_attributes.background"],ref_parameter, ""); } else { for (_gname_; in; ^"fel.branches.testable") { _pname_ = (^"fel.scaler_parameter_names")[_gname_]; - if (_pname_ != ref_parameter && (^"fel.ignorable")[_pname_] == 0) { + if (_pname_ != ref_parameter && (ignorable)[_pname_] == 0) { parameters.SetConstraint (_pname_,ref_parameter, ""); } } @@ -788,8 +812,9 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod v1n = (^"fel.branches.testable")[v]; v2n = (^"fel.branches.testable")[v2]; estimators.RestoreLFStateFromSnapshot (lf_id, snapshot); + - if ((^"fel.ignorable")[(^"fel.scaler_parameter_names")[v1n]] || (^"fel.ignorable")[(^"fel.scaler_parameter_names")[v2n]]) { + if ((ignorable)[(^"fel.scaler_parameter_names")[v1n]] || (ignorable)[(^"fel.scaler_parameter_names")[v2n]]) { //console.log (v1n + "|" + v2n + " is ignorable"); (pairwise[v1n + "|" + v2n]) = alternative; } else { @@ -811,6 +836,9 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod p_values = {}; p_values ["overall"] = lrt[^'terms.p_value']; test_keys = {}; + + //estimators.RestoreLFStateFromSnapshot (lf_id, snapshot); + if (^'fel.report.test_count' > 1) { @@ -826,7 +854,6 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod } } - p_values = math.HolmBonferroniCorrection (p_values); if (permutation == FALSE && (Min (p_values,0))["value"] <= ^'fel.p_value') { diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index 7749410de..0ecce1638 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -121,7 +121,7 @@ namespace hy_global { kErrorStringDatasetRefIndexError ("Dataset index reference out of range"), kErrorStringMatrixExportError ("Export matrix called with a non-polynomial matrix argument"), kErrorStringNullOperand ("Attempting to operate on an undefined value; this is probably the result of an earlier 'soft' error condition"), - kHyPhyVersion = _String ("2.5.23"), + kHyPhyVersion = _String ("2.5.24"), kNoneToken = "None", kNullToken = "null", diff --git a/src/core/include/defines.h b/src/core/include/defines.h index 27b00b482..83b2acf3b 100644 --- a/src/core/include/defines.h +++ b/src/core/include/defines.h @@ -336,9 +336,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. //TODO 20170413: not sure where to put the conditional includes below +#ifdef _SLKP_USE_ARM_NEON + #include +#endif #ifdef _SLKP_USE_SSE_INTRINSICS -#include + //#include "sse2neon.h" + //#else + #include + //#endif #endif #ifdef _SLKP_USE_AVX_INTRINSICS diff --git a/src/core/include/likefunc.h b/src/core/include/likefunc.h index 94bc2f350..8e878617e 100644 --- a/src/core/include/likefunc.h +++ b/src/core/include/likefunc.h @@ -614,7 +614,8 @@ class _LikelihoodFunction: public BaseObj { hyFloat smoothingTerm, smoothingReduction, - smoothingPenalty; + smoothingPenalty, + errorTolerance; #ifdef _SLKP_LFENGINE_REWRITE_ /* diff --git a/src/core/likefunc.cpp b/src/core/likefunc.cpp index e307b6174..08cfbf962 100644 --- a/src/core/likefunc.cpp +++ b/src/core/likefunc.cpp @@ -559,8 +559,7 @@ _LikelihoodFunction::_LikelihoodFunction (void) { //_______________________________________________________________________________________ -void _LikelihoodFunction::Init (void) -{ +void _LikelihoodFunction::Init (void) { siteResults = nil; bySiteResults = nil; hasBeenSetUp = 0; @@ -571,6 +570,7 @@ void _LikelihoodFunction::Init (void) evalsSinceLastSetup = 0; siteArrayPopulated = false; smoothingTerm = 0.; + errorTolerance = 1.; smoothingPenalty = 0.; conditionalInternalNodeLikelihoodCaches = nil; @@ -3638,7 +3638,8 @@ void _LikelihoodFunction::SetupLFCaches (void) { #endif evalsSinceLastSetup = 0L; - + unsigned long maxFilterSize = 1; + for (unsigned long i=0UL; iGetSiteCountInUnits()); if (leafCount > 1UL) { conditionalInternalNodeLikelihoodCaches[i] = (hyFloat*)MemAllocate (sizeof(hyFloat)*patternCount*stateSpaceDim*iNodeCount*cT->categoryCount, false, 64); branchCaches[i] = (hyFloat*)MemAllocate (sizeof(hyFloat)*2*patternCount*stateSpaceDim*cT->categoryCount, false, 64); @@ -3726,7 +3728,7 @@ void _LikelihoodFunction::SetupLFCaches (void) { } free (columnBlock); free (translationCache); conditionalTerminalNodeLikelihoodCaches < ambigs; - + errorTolerance = MAX (1.,round (log (1.+maxFilterSize)/log (10))); #ifdef MDSOCL OCLEval[i].init(patternCount, theFilter->GetDimension(), conditionalInternalNodeLikelihoodCaches[i]); #endif @@ -4110,8 +4112,12 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) BenchmarkThreads (this); #ifdef __HYPHYMPI__ } + + #endif + //ObjectToConsole(variables_changed_during_last_compute); + //NLToConsole(); bool skipCG = ! CheckEqual (get_optimization_setting (kSkipConjugateGradient, 0.0), 0.0), keepStartingPoint = ! CheckEqual (get_optimization_setting (kUseLastResults, 0.0), 0.0), @@ -4128,14 +4134,23 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) } } - + for (unsigned long i=0UL; ivarFlags & HY_VARIABLE_CHANGED) { + variables_changed_during_last_compute->InsertNumber (GetIthIndependentVar(i)->get_index()); + //if (variables_changed_during_last_compute->InsertNumber (GetIthIndependentVar(i)->get_index()) >= 0) + // printf ("Insert [before] %s\n",GetIthIndependentName(i)->get_str()); + } + } + _SimpleList untag; if (keepStartingPoint) { indexInd.Each ([this, &untag] (long v, unsigned long i) -> void { _Variable *iv = GetIthIndependentVar (i); + //printf ("%s = %g, global %d, initialized %d, changed %d\n",GetIthIndependentName(i)->get_str(), GetIthIndependent(i), iv->IsGlobal(), iv->HasBeenInitialized(), iv->IsModified()); if (!iv->IsGlobal() && iv->HasBeenInitialized()) { if (!iv->IsModified()) { iv->MarkModified(); + //printf ("TAG [before] %s\n",GetIthIndependentName(i)->get_str()); untag << i; } } @@ -4148,12 +4163,11 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) if (keepStartingPoint) { untag.Each ([this] (long v, unsigned long i) -> void { - _Variable *iv = GetIthIndependentVar (i); + _Variable *iv = GetIthIndependentVar (v); + //printf ("UNTAG [before] %s\n",GetIthIndependentName(v)->get_str()); iv->ClearModified(); }); - } - - if (!keepStartingPoint) { + } else { hyFloat global_starting_point = get_optimization_setting (kGlobalStartingPoint, 0.1); if (CheckEqual (get_optimization_setting (kRandomStartingPerturbations, 0.0), 0.0)) { @@ -4182,6 +4196,8 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) for (unsigned long i=0UL; ivarFlags & HY_VARIABLE_CHANGED) { variables_changed_during_last_compute->InsertNumber (GetIthIndependentVar(i)->get_index()); + //if (variables_changed_during_last_compute->InsertNumber (GetIthIndependentVar(i)->get_index()) >= 0) + // printf ("Insert [after] %s\n",GetIthIndependentName(i)->get_str()); } } @@ -5629,7 +5645,7 @@ long _LikelihoodFunction::Bracket (long index, hyFloat& left, hyFloat& middle (or involves a paremeter that has very little effect on the LF), recomputation could be within numerical error **/ - if (rightValue - middleValue > 1e-9 || leftValue - middleValue > 1e-9) { + if (rightValue - middleValue > 1e-12*errorTolerance || leftValue - middleValue > 1e-12*errorTolerance) { char buf[256], buf2[512]; snprintf (buf, 256, " \n\tERROR: [_LikelihoodFunction::Bracket (index %ld) recomputed the value to midpoint: L(%g) = %g [@%g -> %g:@%g -> %g]]", index, middle, middleValue, left, leftValue,right, rightValue); snprintf (buf2, 512, "\n\t[_LikelihoodFunction::Bracket (index %ld) BRACKET %s: %20.16g <= %20.16g >= %20.16g. steps, L=%g, R=%g, values %15.12g : %15.12g - %15.12g]", index, successful ? "SUCCESSFUL" : "FAILED", left,middle,right, leftStep, rightStep, leftValue - middleValue, middleValue, rightValue - middleValue); @@ -6500,7 +6516,7 @@ hyFloat _LikelihoodFunction::ConjugateGradientDescent (hyFloat step_precision } SetAllIndependent (&bestVal); - if (maxSoFar < initial_value && CheckEqual(maxSoFar, initial_value, kMachineEpsilon * 100.) == false) { + if (maxSoFar < initial_value && CheckEqual(maxSoFar, initial_value, kMachineEpsilon * errorTolerance) == false) { HandleApplicationError (_String("Internal optimization error in _LikelihoodFunction::ConjugateGradientDescent. Worsened likelihood score from ") & initial_value & " to " & maxSoFar); } @@ -6874,7 +6890,7 @@ void _LikelihoodFunction::GradientDescent (hyFloat& gPrecision, _Matrix& best bestVal = current_best_vector; } - if (maxSoFar < initialValue && !CheckEqual (maxSoFar, initialValue, 100. * kMachineEpsilon)) { + if (maxSoFar < initialValue && !CheckEqual (maxSoFar, initialValue, kMachineEpsilon*errorTolerance)) { _TerminateAndDump(_String (" _LikelihoodFunction::GradientLocateTheBump: in the Brent loop iteration ") & long(outcome) & ". " & _String (maxSoFar, "%18.16g") & " / " & _String (initialValue,"%18.16g") & ".\n Optimization direction: \n" & _String ((_String*)gradient.toStr()) ); return 0.; @@ -7081,7 +7097,7 @@ void _LikelihoodFunction::LocateTheBump (long index,hyFloat gPrecision, hyFlo snprintf (buf, 256, "\n\t[_LikelihoodFunction::LocateTheBump (index %ld) RESETTING THE VALUE (worse log likelihood obtained; current value %20.16g, best value %20.16g) ]\n\n", index, GetIthIndependent(index), bestVal); BufferToConsole (buf); } - if (CheckEqual(GetIthIndependent(index), bestVal) && fabs (middleValue-maxSoFar) > 1e-9) { + if (CheckEqual(GetIthIndependent(index), bestVal) && fabs (middleValue-maxSoFar) > 1e-12*errorTolerance) { char buf[256]; snprintf (buf, 256, " \n\tERROR: [_LikelihoodFunction::LocateTheBump (index %ld) current value %20.16g (parameter = %20.16g), best value %20.16g (parameter = %20.16g)); delta = %20.16g ]\n\n", index, middleValue, GetIthIndependent(index), maxSoFar, bestVal, maxSoFar - middleValue); _TerminateAndDump (_String (buf) & "\n" & "\nParameter name " & *GetIthIndependentName(index)); diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 5a50e7984..61323e1ff 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -1895,7 +1895,7 @@ bool _Matrix::AmISparseFast (_Matrix& whereTo) { long * non_zero_index = (long*)alloca (threshold*sizeof(long)); -#ifdef _SLKP_USE_AVX_INTRINSICS +#if defined _SLKP_USE_AVX_INTRINSICS __m256d zeros = _mm256_setzero_pd(); long lDimMOD4 = lDim >> 2 << 2; for (long i = 0; i < lDimMOD4; i+=4) { @@ -1917,6 +1917,27 @@ bool _Matrix::AmISparseFast (_Matrix& whereTo) { } } } +#elif _SLKP_USE_ARM_NEON + float64x2_t zeros = vdupq_n_f64 (0.); + long lDimMOD2 = lDim >> 1 << 1; + for (long i = 0; i < lDimMOD2; i+=2) { + + uint64x2_t res = vceqq_f64 (vld1q_f64 (theData+i), zeros); + if (vaddvq_u64 (res)) { // something is different + if (vgetq_lane_u64 (res,0)) { non_zero_index[k++] = i; if (k == threshold) break; }; + if (vgetq_lane_u64 (res,1)) { non_zero_index[k++] = i+1; if (k == threshold) break; }; + } + } + + if (k < threshold) + for (long i = lDimMOD2; i < lDim; i++) { + if (theData[i] != 0.0) { + non_zero_index[k++] = i; + if (k == threshold) { + return false; + } + } + } #else for (long i = 0; i < lDim; i++) { if (theData[i] != 0.0) { @@ -3357,8 +3378,8 @@ void _Matrix::AddMatrix (_Matrix& storage, _Matrix& secondArg, bool subtract if (subtract) { #ifdef _SLKP_USE_AVX_INTRINSICS - #define CELL_OP1(x,y) __m256d y = _mm256_sub_pd (_mm256_loadu_pd (stData+x), _mm256_loadu_pd (argData+x)) -#define CELL_OP2(x,y) _mm256_storeu_pd (stData+x,y) + #define CELL_OP1(x,y) __m256d y = _mm256_sub_pd (_mm256_loadu_pd (stData+x), _mm256_loadu_pd (argData+x)) + #define CELL_OP2(x,y) _mm256_storeu_pd (stData+x,y) #pragma GCC unroll 4 #pragma clang loop vectorize(enable) @@ -3376,6 +3397,26 @@ void _Matrix::AddMatrix (_Matrix& storage, _Matrix& secondArg, bool subtract CELL_OP2 (idx+8,r3); CELL_OP2 (idx+12,r4); } +#elif defined _SLKP_USE_ARM_NEON + #define CELL_OP1(x,y) float64x2_t y = vsubq_f64 (vld1q_f64 (stData+x), vld1q_f64 (argData+x)) + #define CELL_OP2(x,y) vst1q_f64 (stData+x,y) + #pragma GCC unroll 4 + #pragma clang loop vectorize(enable) + #pragma clang loop interleave(enable) + #pragma clang loop unroll(enable) + #pragma GCC ivdep + #pragma ivdep + for (long idx = 0; idx < upto; idx+=8) { + CELL_OP1 (idx,r1); + CELL_OP1 (idx+2,r2); + CELL_OP1 (idx+4,r3); + CELL_OP1 (idx+6,r4); + CELL_OP2 (idx,r1); + CELL_OP2 (idx+2,r2); + CELL_OP2 (idx+4,r3); + CELL_OP2 (idx+6,r4); + } + #else for (long idx = 0; idx < upto; idx+=4) { stData[idx]-=argData[idx]; @@ -3400,7 +3441,27 @@ void _Matrix::AddMatrix (_Matrix& storage, _Matrix& secondArg, bool subtract CELL_OP (idx+12); } - #else +#elif defined _SLKP_USE_ARM_NEON + #define CELL_OP1(x,y) float64x2_t y = vaddq_f64 (vld1q_f64 (stData+x), vld1q_f64 (argData+x)) + #define CELL_OP2(x,y) vst1q_f64 (stData+x,y) + #pragma GCC unroll 4 + #pragma clang loop vectorize(enable) + #pragma clang loop interleave(enable) + #pragma clang loop unroll(enable) + #pragma GCC ivdep + #pragma ivdep + for (long idx = 0; idx < upto; idx+=8) { + CELL_OP1 (idx,r1); + CELL_OP1 (idx+2,r2); + CELL_OP1 (idx+4,r3); + CELL_OP1 (idx+6,r4); + CELL_OP2 (idx,r1); + CELL_OP2 (idx+2,r2); + CELL_OP2 (idx+4,r3); + CELL_OP2 (idx+6,r4); + } + +#else for (long idx = 0; idx < upto; idx+=4) { stData[idx]+=argData[idx]; stData[idx+1]+=argData[idx+1]; @@ -3630,6 +3691,21 @@ void _Matrix::Multiply (_Matrix& storage, hyFloat c) for (; k < lDim; k++) { destination[k] = source[k]*c; } + #elif defined _SLKP_USE_ARM_NEON + #define CELL_OP(k) vst1q_f64 (destination + k, vmulq_f64(value_op, vld1q_f64 (source+k))) + long lDimM8 = lDim >> 3 << 3, + k = 0; + + float64x2_t value_op = vdupq_n_f64 (c); + for (k = 0L; k < lDimM8; k+=8) { + CELL_OP (k); + CELL_OP (k+2); + CELL_OP (k+4); + CELL_OP (k+6); + } + for (; k < lDim; k++) { + destination[k] = source[k]*c; + } #else for (long k = 0L; k < lDim; k++) { destination[k] = source[k]*c; @@ -3730,11 +3806,14 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const #elif _SLKP_USE_SSE_INTRINSICS #define DO_GROUP_OP1(X,Y,k) X = _mm_loadu_pd(dest + row_offset + k); Y = _mm_loadu_pd(secondArg.theData + col_offset + k); X = _mm_add_pd (X, _mm_mul_pd(A4, Y)); #define DO_GROUP_OP2(X,k) _mm_storeu_pd (dest + row_offset + k,X); + +#elif _SLKP_USE_ARM_NEON + #define DO_GROUP_OP1(X,Y,k) X = vld1q_f64 (dest + row_offset + k); Y = vld1q_f64 (secondArg.theData + col_offset + k); X = vfmaq_f64 (X, A4, Y); + #define DO_GROUP_OP2(X,k) vst1q_f64 (dest + row_offset + k,X); #endif #ifndef _SLKP_SSE_VECTORIZATION_ - if (dimm4 == vDim) { #if defined _SLKP_USE_AVX_INTRINSICS @@ -3818,7 +3897,7 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const } return; #elif _SLKP_USE_SSE_INTRINSICS - memset (dest, 0, lDim * sizeof (hyFloat)); + memset (dest, 0, lDim * sizeof (hyFloat)); long ti = 0L, row_offset = 0L; @@ -3869,6 +3948,58 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const } } return; + #elif _SLKP_USE_ARM_NEON + memset (dest, 0, lDim * sizeof (hyFloat)); + long ti = 0L, + row_offset = 0L; + + if (vDim == 20UL) { + for (long r = 0; r < 20; r++, row_offset += 20) { + long col_offset = 0L; + for (long c = 0; c < 20L; c++, ti++, col_offset += 20L) { + float64x2_t A4 = vdupq_n_f64(theData[ti]); + //for (long k = 0; k < 20L; k+=4) { + float64x2_t D4, B4, D4_1, B4_1, D4_2, B4_2, D4_3, B4_3, D4_4, B4_4; + DO_GROUP_OP1 (D4, B4, 0); + DO_GROUP_OP1 (D4_1, B4_1, 2); + DO_GROUP_OP1 (D4_2, B4_2, 4); + DO_GROUP_OP1 (D4_3, B4_3, 6); + DO_GROUP_OP1 (D4_4, B4_4, 8); + DO_GROUP_OP2 (D4, 0); + DO_GROUP_OP2 (D4_1, 2); + DO_GROUP_OP2 (D4_2, 4); + DO_GROUP_OP2 (D4_3, 6); + DO_GROUP_OP2 (D4_4, 8); + DO_GROUP_OP1 (D4, B4, 10); + DO_GROUP_OP1 (D4_1, B4_1, 12); + DO_GROUP_OP1 (D4_2, B4_2, 14); + DO_GROUP_OP1 (D4_3, B4_3, 16); + DO_GROUP_OP1 (D4_4, B4_4, 18); + DO_GROUP_OP2 (D4, 10); + DO_GROUP_OP2 (D4_1, 12); + DO_GROUP_OP2 (D4_2, 14); + DO_GROUP_OP2 (D4_3, 16); + DO_GROUP_OP2 (D4_4, 18); + //} + } + } + } else + for (long r = 0; r < vDim; r++, row_offset += vDim) { + long col_offset = 0L; + for (long c = 0; c < vDim; c++, ti++, col_offset += vDim) { + float64x2_t A4 = vdupq_n_f64(theData[ti]); + #pragma GCC unroll 4 + #pragma clang loop vectorize(enable) + #pragma clang loop interleave(enable) + #pragma clang loop unroll(enable) + for (long k = 0; k < vDim; k+=2) { + float64x2_t D4, B4; + DO_GROUP_OP1 (D4, B4, k); + DO_GROUP_OP2 (D4, k); + } + } + } + return; #endif memset (dest, 0, lDim * sizeof (hyFloat)); @@ -3970,7 +4101,6 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const if (dimm4 == 60UL) { // codons - for (long r = 0; r < hDim; r++, row_offset += vDim) { long col_offset = 0L; for (long c = 0; c < hDim; c++, ti++, col_offset += vDim) { @@ -4147,7 +4277,116 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const } } } - #else + #elif _SLKP_USE_ARM_NEON + long ti = 0L, + row_offset = 0L; + + if (dimm4 == 60UL) { // codons + + + for (long r = 0; r < hDim; r++, row_offset += vDim) { + long col_offset = 0L; + for (long c = 0; c < hDim; c++, ti++, col_offset += vDim) { + float64x2_t A4 = vdupq_n_f64(theData[ti]); + + float64x2_t D4_1, D4_2, D4_3, D4_4; + float64x2_t B4_1, B4_2, B4_3, B4_4; + + DO_GROUP_OP1 (D4_1,B4_1,0); + DO_GROUP_OP1 (D4_2,B4_2,2); + DO_GROUP_OP1 (D4_3,B4_3,4); + DO_GROUP_OP1 (D4_4,B4_4,6); + DO_GROUP_OP2 (D4_1,0); + DO_GROUP_OP2 (D4_2,2); + DO_GROUP_OP2 (D4_3,4); + DO_GROUP_OP2 (D4_4,6); + + DO_GROUP_OP1 (D4_1,B4_1,8); + DO_GROUP_OP1 (D4_2,B4_2,10); + DO_GROUP_OP1 (D4_3,B4_3,12); + DO_GROUP_OP1 (D4_4,B4_4,14); + DO_GROUP_OP2 (D4_1,8); + DO_GROUP_OP2 (D4_2,10); + DO_GROUP_OP2 (D4_3,12); + DO_GROUP_OP2 (D4_4,14); + + DO_GROUP_OP1 (D4_1,B4_1,16); + DO_GROUP_OP1 (D4_2,B4_2,18); + DO_GROUP_OP1 (D4_3,B4_3,20); + DO_GROUP_OP1 (D4_4,B4_4,22); + DO_GROUP_OP2 (D4_1,16); + DO_GROUP_OP2 (D4_2,18); + DO_GROUP_OP2 (D4_3,20); + DO_GROUP_OP2 (D4_4,22); + + DO_GROUP_OP1 (D4_1,B4_1,24); + DO_GROUP_OP1 (D4_2,B4_2,26); + DO_GROUP_OP1 (D4_3,B4_3,28); + DO_GROUP_OP1 (D4_4,B4_4,30); + DO_GROUP_OP2 (D4_1,24); + DO_GROUP_OP2 (D4_2,26); + DO_GROUP_OP2 (D4_3,28); + DO_GROUP_OP2 (D4_4,30); + + DO_GROUP_OP1 (D4_1,B4_1,32); + DO_GROUP_OP1 (D4_2,B4_2,34); + DO_GROUP_OP1 (D4_3,B4_3,36); + DO_GROUP_OP1 (D4_4,B4_4,38); + DO_GROUP_OP2 (D4_1,32); + DO_GROUP_OP2 (D4_2,34); + DO_GROUP_OP2 (D4_3,36); + DO_GROUP_OP2 (D4_4,38); + + DO_GROUP_OP1 (D4_1,B4_1,40); + DO_GROUP_OP1 (D4_2,B4_2,42); + DO_GROUP_OP1 (D4_3,B4_3,44); + DO_GROUP_OP1 (D4_4,B4_4,46); + DO_GROUP_OP2 (D4_1,40); + DO_GROUP_OP2 (D4_2,42); + DO_GROUP_OP2 (D4_3,44); + DO_GROUP_OP2 (D4_4,46); + + DO_GROUP_OP1 (D4_1,B4_1,48); + DO_GROUP_OP1 (D4_2,B4_2,50); + DO_GROUP_OP1 (D4_3,B4_3,52); + DO_GROUP_OP1 (D4_4,B4_4,54); + DO_GROUP_OP2 (D4_1,48); + DO_GROUP_OP2 (D4_2,50); + DO_GROUP_OP2 (D4_3,52); + DO_GROUP_OP2 (D4_4,54); + + DO_GROUP_OP1 (D4_1,B4_1,56); + DO_GROUP_OP1 (D4_2,B4_2,58); + DO_GROUP_OP2 (D4_1,56); + DO_GROUP_OP2 (D4_2,58); + + for (long k = dimm4; k < vDim; k++) { + dest[row_offset + k] += theData[ti] * secondArg.theData[col_offset + k]; + } + } + } + } else { // something else + for (long r = 0; r < hDim; r++, row_offset += vDim) { + long col_offset = 0L; + for (long c = 0; c < hDim; c++, ti++, col_offset += vDim) { + float64x2_t A4 = vdupq_n_f64(theData[ti]); + #pragma GCC unroll 4 + #pragma clang loop vectorize(enable) + #pragma clang loop interleave(enable) + #pragma clang loop unroll(enable) + for (long k = 0; k < dimm4; k+=2) { + float64x2_t D4, B4; + DO_GROUP_OP1 (D4, B4, k); + DO_GROUP_OP2 (D4, k); + } + + for (long k = dimm4; k < vDim; k++) { + dest[row_offset + k] += theData[ti] * secondArg.theData[col_offset + k]; + } + } + } + } + #else const unsigned long column_shift2 = secondArg.vDim << 1, column_shift3 = (secondArg.vDim << 1) + secondArg.vDim, @@ -4185,7 +4424,7 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const #endif } -#else + #else secondArg.Transpose(); for (long i=0; i= 0L) { @@ -4380,13 +4632,26 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const CELL_OP(16);CELL_OP(20);CELL_OP(24);CELL_OP(28); CELL_OP(32);CELL_OP(36);CELL_OP(40);CELL_OP(44); CELL_OP(48);CELL_OP(52);CELL_OP(56); + #elif _SLKP_USE_ARM_NEON + float64x2_t value_op = vdupq_n_f64 (value); + #define CELL_OP(x) vst1q_f64 (res+x, vfmaq_f64 (vld1q_f64(res+x), value_op, vld1q_f64 (secArg+x))) + + + CELL_OP(0);CELL_OP(2);CELL_OP(4);CELL_OP(6); + CELL_OP(8);CELL_OP(10);CELL_OP(12);CELL_OP(14); + CELL_OP(16);CELL_OP(18);CELL_OP(20);CELL_OP(22); + CELL_OP(24);CELL_OP(26);CELL_OP(28); + CELL_OP(30);CELL_OP(32);CELL_OP(34);CELL_OP(36);CELL_OP(38); + CELL_OP(40);CELL_OP(42);CELL_OP(44);CELL_OP(46); + CELL_OP(48);CELL_OP(50);CELL_OP(52);CELL_OP(54); + CELL_OP(56);CELL_OP(58); #else - for (unsigned long i = 0UL; i < 60UL; i+=4UL) { - res[i] += value * secArg[i]; - res[i+1] += value * secArg[i+1]; - res[i+2] += value * secArg[i+2]; - res[i+3] += value * secArg[i+3]; - } + for (unsigned long i = 0UL; i < 60UL; i+=4UL) { + res[i] += value * secArg[i]; + res[i+1] += value * secArg[i+1]; + res[i+2] += value * secArg[i+2]; + res[i+3] += value * secArg[i+3]; + } #endif res[60] += value * secArg[60]; } @@ -4410,13 +4675,19 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const #ifdef _SLKP_USE_AVX_INTRINSICS __m256d value_op = _mm256_set1_pd (value); #endif + #ifdef _SLKP_USE_ARM_NEON + float64x2_t value_op = vdupq_n_f64 (value); + #endif for (unsigned long i = 0UL; i < loopBound; i+=4) { #ifdef _SLKP_USE_AVX_INTRINSICS - #ifdef _SLKP_USE_FMA3_INTRINSICS - _mm256_storeu_pd (res+i, _mm256_fmadd_pd (value_op, _mm256_loadu_pd (secArg+i),_mm256_loadu_pd(res+i))); - #else - _mm256_storeu_pd (res+i, _mm256_add_pd (_mm256_loadu_pd(res+i), _mm256_mul_pd(value_op, _mm256_loadu_pd (secArg+i)))); - #endif + #ifdef _SLKP_USE_FMA3_INTRINSICS + _mm256_storeu_pd (res+i, _mm256_fmadd_pd (value_op, _mm256_loadu_pd (secArg+i),_mm256_loadu_pd(res+i))); + #else + _mm256_storeu_pd (res+i, _mm256_add_pd (_mm256_loadu_pd(res+i), _mm256_mul_pd(value_op, _mm256_loadu_pd (secArg+i)))); + #endif + #elif defined _SLKP_USE_ARM_NEON + vst1q_f64 (res+i, vfmaq_f64 (vld1q_f64(res+i), value_op, vld1q_f64 (secArg+i))); + vst1q_f64 (res+i+2, vfmaq_f64 (vld1q_f64(res+i+2), value_op, vld1q_f64 (secArg+i+2))); #else res[i] += value * secArg[i]; res[i+1] += value * secArg[i+1]; @@ -4446,6 +4717,9 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const #ifdef _SLKP_USE_AVX_INTRINSICS __m256d value_op = _mm256_set1_pd (value); #endif + #ifdef _SLKP_USE_ARM_NEON + float64x2_t value_op = vdupq_n_f64 (value); + #endif for (unsigned long i = 0UL; i < loopBound; i+=4) { #ifdef _SLKP_USE_AVX_INTRINSICS @@ -4454,12 +4728,16 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const #else _mm256_storeu_pd (res+i, _mm256_add_pd (_mm256_loadu_pd(res+i), _mm256_mul_pd(value_op, _mm256_loadu_pd (secArg+i)))); #endif + #elif defined _SLKP_USE_ARM_NEON + vst1q_f64 (res+i, vfmaq_f64 (vld1q_f64(res+i), value_op, vld1q_f64 (secArg+i))); + vst1q_f64 (res+i+2, vfmaq_f64 (vld1q_f64(res+i+2), value_op, vld1q_f64 (secArg+i+2))); #else res[i] += value * secArg[i]; res[i+1] += value * secArg[i+1]; res[i+2] += value * secArg[i+2]; res[i+3] += value * secArg[i+3]; #endif + } for (unsigned long i = loopBound; i < vDim; i++) { res[i] += value * secArg[i]; @@ -6547,9 +6825,8 @@ hyFloat _Matrix::Sqr (hyFloat* _hprestrict_ stash) { Swap(temp); return DBL_EPSILON * 1.e4; } else { - if (hDim==4) + if (hDim==4) { // special case for nucleotides - { for (unsigned long i=0UL, k = 0UL; i<16; i+=4) { for (unsigned long j=0UL; j<4UL; j++, k++) { hyFloat p1 = theData[i] * theData [j]; @@ -6564,6 +6841,123 @@ hyFloat _Matrix::Sqr (hyFloat* _hprestrict_ stash) { //vDim - vDim % 4; // loop interchange rocks! +#ifdef _SLKP_USE_ARM_NEON + #define DO_GROUP_OP0(X,Y,k) Y = vld1q_f64(theData + col_offset + k); X = vmulq_f64(A4, Y); + #define DO_GROUP_OP1(X,Y,k) X = vld1q_f64(dest + row_offset + k); Y = vld1q_f64(theData + col_offset + k); X = vfmaq_f64 (X,A4,Y); + #define DO_GROUP_OP2(X,k) vst1q_f64 (dest + row_offset + k,X); + + if (true) { + hyFloat * _hprestrict_ dest = stash; + + long ti = 0L, + row_offset = 0L; + + if (loopBound == 60UL) { + for (long r = 0; r < hDim; r++, row_offset += vDim) { + long col_offset = 0L; + { // row 1 + float64x2_t A4 = vdupq_n_f64 (theData[ti]); + float64x2_t X1, X2, X3, X4, + Y1, Y2, Y3, Y4; + + DO_GROUP_OP0 (X1, Y1, 0); DO_GROUP_OP0 (X2, Y2, 2); DO_GROUP_OP0 (X3, Y3, 4); DO_GROUP_OP0 (X4, Y4, 6); + DO_GROUP_OP2 (X1,0); DO_GROUP_OP2 (X2,2); DO_GROUP_OP2 (X3,4); DO_GROUP_OP2 (X4,6); + DO_GROUP_OP0 (X1, Y1, 8); DO_GROUP_OP0 (X2, Y2, 10); DO_GROUP_OP0 (X3, Y3, 12); DO_GROUP_OP0 (X4, Y4, 14); + DO_GROUP_OP2 (X1,8); DO_GROUP_OP2 (X2,10); DO_GROUP_OP2 (X3,12); DO_GROUP_OP2 (X4,14); + DO_GROUP_OP0 (X1, Y1, 16); DO_GROUP_OP0 (X2, Y2, 18); DO_GROUP_OP0 (X3, Y3, 20); DO_GROUP_OP0 (X4, Y4, 22); + DO_GROUP_OP2 (X1,16); DO_GROUP_OP2 (X2,18); DO_GROUP_OP2 (X3,20); DO_GROUP_OP2 (X4,22); + DO_GROUP_OP0 (X1, Y1, 24); DO_GROUP_OP0 (X2, Y2, 26); DO_GROUP_OP0 (X3, Y3, 28); DO_GROUP_OP0 (X4, Y4, 30); + DO_GROUP_OP2 (X1,24); DO_GROUP_OP2 (X2,26); DO_GROUP_OP2 (X3,28); DO_GROUP_OP2 (X4,30); + DO_GROUP_OP0 (X1, Y1, 32); DO_GROUP_OP0 (X2, Y2, 34); DO_GROUP_OP0 (X3, Y3, 36); DO_GROUP_OP0 (X4, Y4, 38); + DO_GROUP_OP2 (X1,32); DO_GROUP_OP2 (X2,34); DO_GROUP_OP2 (X3,36); DO_GROUP_OP2 (X4,38); + DO_GROUP_OP0 (X1, Y1, 40); DO_GROUP_OP0 (X2, Y2, 42); DO_GROUP_OP0 (X3, Y3, 44); DO_GROUP_OP0 (X4, Y4, 46); + DO_GROUP_OP2 (X1,40); DO_GROUP_OP2 (X2,42); DO_GROUP_OP2 (X3,44); DO_GROUP_OP2 (X4,46); + DO_GROUP_OP0 (X1, Y1, 48); DO_GROUP_OP0 (X2, Y2, 50); DO_GROUP_OP0 (X3, Y3, 52); DO_GROUP_OP0 (X4, Y4, 54); + DO_GROUP_OP2 (X1,48); DO_GROUP_OP2 (X2,50); DO_GROUP_OP2 (X3,52); DO_GROUP_OP2 (X4,54); + DO_GROUP_OP0 (X1, Y1, 56); DO_GROUP_OP0 (X2, Y2, 58); + DO_GROUP_OP2 (X1,56); DO_GROUP_OP2 (X2,58); + + + for (long k = 60; k < vDim; k++) { + dest[row_offset + k] = theData[ti] * theData[k]; + } + col_offset = vDim; + ti++; + } + + for (long c = 1; c < hDim; c++, ti++, col_offset += vDim) { + float64x2_t A4 = vdupq_n_f64 (theData[ti]); + float64x2_t X1, X2, X3, X4, + Y1, Y2, Y3, Y4; + + DO_GROUP_OP1 (X1, Y1, 0); DO_GROUP_OP1 (X2, Y2, 2); DO_GROUP_OP1 (X3, Y3, 4); DO_GROUP_OP1 (X4, Y4, 6); + DO_GROUP_OP2 (X1,0); DO_GROUP_OP2 (X2,2); DO_GROUP_OP2 (X3,4); DO_GROUP_OP2 (X4,6); + DO_GROUP_OP1 (X1, Y1, 8); DO_GROUP_OP1 (X2, Y2, 10); DO_GROUP_OP1 (X3, Y3, 12); DO_GROUP_OP1 (X4, Y4, 14); + DO_GROUP_OP2 (X1,8); DO_GROUP_OP2 (X2,10); DO_GROUP_OP2 (X3,12); DO_GROUP_OP2 (X4,14); + DO_GROUP_OP1 (X1, Y1, 16); DO_GROUP_OP1 (X2, Y2, 18); DO_GROUP_OP1 (X3, Y3, 20); DO_GROUP_OP1 (X4, Y4, 22); + DO_GROUP_OP2 (X1,16); DO_GROUP_OP2 (X2,18); DO_GROUP_OP2 (X3,20); DO_GROUP_OP2 (X4,22); + DO_GROUP_OP1 (X1, Y1, 24); DO_GROUP_OP1 (X2, Y2, 26); DO_GROUP_OP1 (X3, Y3, 28); DO_GROUP_OP1 (X4, Y4, 30); + DO_GROUP_OP2 (X1,24); DO_GROUP_OP2 (X2,26); DO_GROUP_OP2 (X3,28); DO_GROUP_OP2 (X4,30); + DO_GROUP_OP1 (X1, Y1, 32); DO_GROUP_OP1 (X2, Y2, 34); DO_GROUP_OP1 (X3, Y3, 36); DO_GROUP_OP1 (X4, Y4, 38); + DO_GROUP_OP2 (X1,32); DO_GROUP_OP2 (X2,34); DO_GROUP_OP2 (X3,36); DO_GROUP_OP2 (X4,38); + DO_GROUP_OP1 (X1, Y1, 40); DO_GROUP_OP1 (X2, Y2, 42); DO_GROUP_OP1 (X3, Y3, 44); DO_GROUP_OP1 (X4, Y4, 46); + DO_GROUP_OP2 (X1,40); DO_GROUP_OP2 (X2,42); DO_GROUP_OP2 (X3,44); DO_GROUP_OP2 (X4,46); + DO_GROUP_OP1 (X1, Y1, 48); DO_GROUP_OP1 (X2, Y2, 50); DO_GROUP_OP1 (X3, Y3, 52); DO_GROUP_OP1 (X4, Y4, 54); + DO_GROUP_OP2 (X1,48); DO_GROUP_OP2 (X2,50); DO_GROUP_OP2 (X3,52); DO_GROUP_OP2 (X4,54); + DO_GROUP_OP1 (X1, Y1, 56); DO_GROUP_OP1 (X2, Y2, 58); + DO_GROUP_OP2 (X1,56); DO_GROUP_OP2 (X2,58); + + for (long k = 60; k < vDim; k++) { + dest[row_offset + k] += theData[ti] * theData[col_offset + k]; + } + } + } + } else { + for (long r = 0; r < hDim; r++, row_offset += vDim) { + long col_offset = 0L; + { // row 1 + float64x2_t A4 = vdupq_n_f64 (theData[ti]); + #pragma GCC unroll 4 + #pragma clang loop vectorize(enable) + #pragma clang loop interleave(enable) + #pragma clang loop unroll(enable) + #pragma GCC ivdep + #pragma ivdep + for (long k = 0; k < loopBound; k+=4) { + float64x2_t D4, B4, X1, X2; + DO_GROUP_OP0 (D4, B4, k); + DO_GROUP_OP0 (X1, X2, k+2); + DO_GROUP_OP2 (D4,k); + DO_GROUP_OP2 (X1,k+2); + } + + for (long k = loopBound; k < vDim; k++) { + dest[row_offset + k] = theData[ti] * theData[k]; + } + col_offset = vDim; + ti++; + } + + for (long c = 1; c < hDim; c++, ti++, col_offset += vDim) { + float64x2_t A4 = vdupq_n_f64 (theData[ti]); + for (long k = 0; k < loopBound; k+=4) { + float64x2_t D4, B4, X1, X2; + DO_GROUP_OP1 (D4, B4, k); + DO_GROUP_OP1 (X1, X2, k+2); + DO_GROUP_OP2 (D4,k); + DO_GROUP_OP2 (X1,k+2); + } + + for (long k = loopBound; k < vDim; k++) { + dest[row_offset + k] += theData[ti] * theData[col_offset + k]; + } + } + } + } + } else { + +#endif + #ifdef _SLKP_USE_AVX_INTRINSICS #define DO_GROUP_OP0(X,Y,k) Y = _mm256_loadu_pd(theData + col_offset + k); X = _mm256_mul_pd(A4, Y); #ifdef _SLKP_USE_FMA3_INTRINSICS @@ -6906,7 +7300,9 @@ hyFloat _Matrix::Sqr (hyFloat* _hprestrict_ stash) { #endif } } - +#ifdef _SLKP_USE_ARM_NEON + } +#endif long lDimmod4 = (lDim >> 2) << 2; hyFloat diffs[4] = {0.0,0.0,0.0,0.0}; diff --git a/src/core/tree_evaluator.cpp b/src/core/tree_evaluator.cpp index 25667d7ec..160d39f10 100644 --- a/src/core/tree_evaluator.cpp +++ b/src/core/tree_evaluator.cpp @@ -122,6 +122,11 @@ inline double _sse_sum_2 (__m128d const & x) { } #endif +#ifdef _SLKP_USE_ARM_NEON +inline double _neon_sum_2 (float64x2_t const & x) { + return vgetq_lane_f64 (x,0) + vgetq_lane_f64 (x,1); +} +#endif template inline void __ll_handle_matrix_transpose (hyFloat const * __restrict transitionMatrix, hyFloat * __restrict tMatrixT) { long i = 0L; @@ -268,6 +273,40 @@ template inline void __ll_handle_block10_product_sum_linear (hyF } #endif +#ifdef _SLKP_USE_ARM_NEON +template inline void __ll_handle_block10_product_sum_linear (hyFloat const* _hprestrict_ tT, hyFloat * _hprestrict_ childVector, float64x2_t * lastTotal) { + + float64x2_t T [5] = { + vld1q_f64 (tT+S), + vld1q_f64 (tT+S+2), + vld1q_f64 (tT+S+4), + vld1q_f64 (tT+S+6), + vld1q_f64 (tT+S+8) + }; + + float64x2_t C [5] = { + vld1q_f64 (childVector+S), + vld1q_f64 (childVector+S+2), + vld1q_f64 (childVector+S+4), + vld1q_f64 (childVector+S+6), + vld1q_f64 (childVector+S+8) + }; + + if (S) { + T[0] = vfmaq_f64 (vmulq_f64 (T[1], C[1]), T[0], C[0]); //0+1 + T[2] = vfmaq_f64 (vmulq_f64 (T[3], C[3]), T[2], C[2]); //2+3 + T[3] = vfmaq_f64 (T[0], T[4], C[4]); //0+1+4 + T[0] = vaddq_f64 (T[3], T[2]); // 0+1+2+3+4 + *lastTotal = vaddq_f64 (*lastTotal, T[0]); + } else { + T[0] = vfmaq_f64 (vmulq_f64 (T[1], C[1]), T[0], C[0]); //0+1 + T[2] = vfmaq_f64 (vmulq_f64 (T[3], C[3]), T[2], C[2]); //2+3 + T[3] = vfmaq_f64 (T[0], T[4], C[4]); //0+1+4 + *lastTotal = vaddq_f64 (T[3], T[2]); // 0+1+2+3+4 + } +} +#endif + #ifdef _SLKP_USE_AVX_INTRINSICS template inline void __ll_handle_block20_product_sum_linear (hyFloat const* _hprestrict_ tT, hyFloat * _hprestrict_ childVector, __m256d * lastTotal) { @@ -374,9 +413,9 @@ template inline __m128d __ll_handle_block10_product_sum (hyFloat _mm_storeu_pd(parentConditionals+S+8, P[4]); P[0] =_mm_add_pd (P[0],P[1]); - P[1] =_mm_add_pd (P[2],P[3]); + P[2] =_mm_add_pd (P[2],P[3]); P[0] =_mm_add_pd (P[0],P[4]); - P[2] =_mm_add_pd (P[0],P[1]); + P[2] =_mm_add_pd (P[0],P[2]); if (grandTotal) { *grandTotal = _mm_add_pd (P[2],*grandTotal); return *grandTotal; @@ -386,6 +425,57 @@ template inline __m128d __ll_handle_block10_product_sum (hyFloat } #endif +#ifdef _SLKP_USE_ARM_NEON +template inline float64x2_t __ll_handle_block10_product_sum (hyFloat const* _hprestrict_ transposedMatrix, hyFloat * _hprestrict_ childVector, hyFloat * _hprestrict_ parentConditionals, float64x2_t * grandTotal) { + + float64x2_t P [5] = { + vdupq_n_f64(0.), + vdupq_n_f64(0.), + vdupq_n_f64(0.), + vdupq_n_f64(0.), + vdupq_n_f64(0.) + }; + + hyFloat const * __restrict tT = transposedMatrix; + + for (long col_idx = 0; col_idx < D; col_idx++, tT+=D) { + if (childVector[col_idx] > 0.) { + float64x2_t C = vdupq_n_f64(childVector[col_idx]); + + P[0] = vfmaq_f64 (P[0], vld1q_f64 (tT+S), C); + P[1] = vfmaq_f64 (P[1], vld1q_f64 (tT+S+2), C); + P[2] = vfmaq_f64 (P[2], vld1q_f64 (tT+S+4), C); + P[3] = vfmaq_f64 (P[3], vld1q_f64 (tT+S+6), C); + P[4] = vfmaq_f64 (P[4], vld1q_f64 (tT+S+8), C); + } + } + + + P[0] = vmulq_f64(vld1q_f64(parentConditionals+S), P[0]); + P[1] = vmulq_f64(vld1q_f64(parentConditionals+S+2), P[1]); + P[2] = vmulq_f64(vld1q_f64(parentConditionals+S+4), P[2]); + P[3] = vmulq_f64(vld1q_f64(parentConditionals+S+6), P[3]); + P[4] = vmulq_f64(vld1q_f64(parentConditionals+S+8), P[4]); + + vst1q_f64(parentConditionals+S, P[0]); + vst1q_f64(parentConditionals+S+2, P[1]); + vst1q_f64(parentConditionals+S+4, P[2]); + vst1q_f64(parentConditionals+S+6, P[3]); + vst1q_f64(parentConditionals+S+8, P[4]); + + P[0] = vaddq_f64 (P[0],P[1]); + P[2] = vaddq_f64 (P[2],P[3]); + P[0] = vaddq_f64 (P[0],P[4]); + P[2] = vaddq_f64 (P[0],P[2]); + if (grandTotal) { + *grandTotal = vaddq_f64 (P[2],*grandTotal); + return *grandTotal; + } + return P[2]; + +} +#endif + #ifdef _SLKP_USE_AVX_INTRINSICS template inline __m256d __ll_handle_block20_product_sum (hyFloat const* _hprestrict_ transposedMatrix, hyFloat * _hprestrict_ childVector, hyFloat * _hprestrict_ parentConditionals, __m256d * grandTotal) { __m256d P [5] = { @@ -728,11 +818,12 @@ inline void _handle4x4_pruning_case (double const* childVector, double const* tM #elif defined _SLKP_USE_AVX_INTRINSICS - __m256d c3 = _mm256_set1_pd(childVector[3]), - c0 = _mm256_set1_pd(childVector[0]), - c1 = _mm256_set1_pd(childVector[1]), - c2 = _mm256_set1_pd(childVector[2]), - t0,t1,t2,t3; + __m256d + c0 = _mm256_set1_pd(childVector[0]), + c1 = _mm256_set1_pd(childVector[1]), + c2 = _mm256_set1_pd(childVector[2]), + c3 = _mm256_set1_pd(childVector[3]), + t0,t1,t2,t3; if (transposed_mx) { t0 = ((__m256d*)transposed_mx)[0]; @@ -761,9 +852,38 @@ inline void _handle4x4_pruning_case (double const* childVector, double const* tM #endif +#elif defined _SLKP_USE_ARM_NEON + float64x2_t c0 = vdupq_n_f64(childVector[0]), + c1 = vdupq_n_f64(childVector[1]), + c2 = vdupq_n_f64(childVector[2]), + c3 = vdupq_n_f64(childVector[3]); + + float64x2x2_t t0, t1, t2, t3; + + if (transposed_mx) { + t0 = ((float64x2x2_t*)transposed_mx)[0]; + t1 = ((float64x2x2_t*)transposed_mx)[1]; + t2 = ((float64x2x2_t*)transposed_mx)[2]; + t3 = ((float64x2x2_t*)transposed_mx)[3]; + } else { + t0 = (float64x2x2_t){tMatrix[0],tMatrix[4],tMatrix[8],tMatrix[12]}; + t1 = (float64x2x2_t){tMatrix[1],tMatrix[5],tMatrix[9],tMatrix[13]}; + t2 = (float64x2x2_t){tMatrix[2],tMatrix[6],tMatrix[10],tMatrix[14]}; + t3 = (float64x2x2_t){tMatrix[3],tMatrix[7],tMatrix[11],tMatrix[15]}; + } + t0.val[0] = vfmaq_f64 (vmulq_f64 (c1,t1.val[0]),c0,t0.val[0]); + t0.val[1] = vfmaq_f64 (vmulq_f64 (c1,t1.val[1]),c0,t0.val[1]); + t2.val[0] = vfmaq_f64 (vmulq_f64 (c3,t3.val[0]),c2,t2.val[0]); + t2.val[1] = vfmaq_f64 (vmulq_f64 (c3,t3.val[1]),c2,t2.val[1]); + vst1q_f64 (parentConditionals, + vmulq_f64 (vld1q_f64 (parentConditionals), vaddq_f64 (t0.val[0], t2.val[0]))); + vst1q_f64 (parentConditionals+2, vmulq_f64 (vld1q_f64 (parentConditionals+2), vaddq_f64 (t0.val[1], t2.val[1]))); + #else + + // 12 multiplications, 16 additions, 3 subtractions /*hyFloat t1 = childVector[0] - childVector[3], @@ -836,7 +956,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList siteTo = siteCount; } - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON hyFloat * tMatrixT = nil; switch (alphabetDimension) { case 20: @@ -940,6 +1060,16 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList (__m256d) {transitionMatrix[3],transitionMatrix[7],transitionMatrix[11],transitionMatrix[15]} }; #endif + + #ifdef _SLKP_USE_ARM_NEON + float64x2x2_t tmatrix_transpose [4] = { + (float64x2x2_t) {transitionMatrix[0],transitionMatrix[4],transitionMatrix[8],transitionMatrix[12]}, + (float64x2x2_t) {transitionMatrix[1],transitionMatrix[5],transitionMatrix[9],transitionMatrix[13]}, + (float64x2x2_t) {transitionMatrix[2],transitionMatrix[6],transitionMatrix[10],transitionMatrix[14]}, + (float64x2x2_t) {transitionMatrix[3],transitionMatrix[7],transitionMatrix[11],transitionMatrix[15]} + }; + #endif + for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 4UL) { __ll_loop_preamble if (__ll_handle_conditional_array_initialization<4> ( @@ -949,7 +1079,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList - #ifdef _SLKP_USE_AVX_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS or defined _SLKP_USE_ARM_NEON _handle4x4_pruning_case (childVector, tMatrix, parentConditionals, tmatrix_transpose); #else _handle4x4_pruning_case (childVector, tMatrix, parentConditionals, nil); @@ -968,7 +1098,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList // START AMINO-ACID CASE else if (alphabetDimension == 20UL) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<20> (transitionMatrix, tMatrixT); #endif @@ -985,6 +1115,11 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList grandTotal = __ll_handle_block10_product_sum<20,0> (tMatrixT, childVector, parentConditionals, nil); grandTotal = __ll_handle_block10_product_sum<20,10> (tMatrixT, childVector, parentConditionals, &grandTotal),grandTotal; sum = _sse_sum_2 (grandTotal); + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal + = __ll_handle_block10_product_sum<20,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<20,10> (tMatrixT, childVector, parentConditionals, &grandTotal),grandTotal; + sum = _neon_sum_2 (grandTotal); #else __ll_product_sum_loop<20L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -999,7 +1134,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList // END AMINO-ACID CASE // START UNIVERSAL CODE else if (alphabetDimension == 60UL) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<60> (transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 60L) { @@ -1025,6 +1160,15 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList grandTotal = __ll_handle_block10_product_sum<60,40> (tMatrixT, childVector, parentConditionals, &grandTotal); grandTotal = __ll_handle_block10_product_sum<60,50> (tMatrixT, childVector, parentConditionals, &grandTotal); sum += _sse_sum_2(grandTotal); + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal + = __ll_handle_block10_product_sum<60,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<60,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + sum += _neon_sum_2(grandTotal); #else __ll_product_sum_loop<60L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1036,7 +1180,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList } } else if (alphabetDimension == 61UL) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<61> (transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 61L) { @@ -1122,6 +1266,26 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList hyFloat s60 = _sse_sum_2(lastTotal) + childVector[60] * tT[60]; parentConditionals[60] *= s60; sum += _sse_sum_2(grandTotal) + s60; + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal; + grandTotal = __ll_handle_block10_product_sum<61,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<61,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 61*60; + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<61,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60]; + parentConditionals[60] *= s60; + sum += _neon_sum_2(grandTotal) + s60; #else __ll_product_sum_loop<61L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1138,7 +1302,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList __ll_loop_epilogue } } else if (alphabetDimension == 62UL) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<62> (transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 62L) { @@ -1206,6 +1370,40 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList parentConditionals[61] *= s61; sum += _sse_sum_2(grandTotal) + s60 + s61; + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal + = __ll_handle_block10_product_sum<62,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<62,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 62*60; + + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<62,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60] + childVector[61] * tT[61]; + parentConditionals[60] *= s60; + + tT += 62; + float64x2_t lastTotal2; + __ll_handle_block10_product_sum_linear<62,0> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,10> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,20> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,30> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,40> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,50> (tT, childVector, &lastTotal2); + + hyFloat s61 = _neon_sum_2(lastTotal2) + childVector[60] * tT[60] + childVector[61] * tT[61]; + parentConditionals[61] *= s61; + + sum += _neon_sum_2(grandTotal) + s60 + s61; #else __ll_product_sum_loop<62L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1216,7 +1414,7 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList __ll_loop_epilogue } } else if (alphabetDimension == 63UL) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<63> (transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 63L) { @@ -1305,6 +1503,51 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList parentConditionals[62] *= s62; sum += _sse_sum_2(grandTotal) + s60 + s61 + s62; + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal + = __ll_handle_block10_product_sum<63,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<63,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 63*60; + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62]; + parentConditionals[60] *= s60; + + tT += 63; + float64x2_t lastTotal2; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal2); + + hyFloat s61 = _neon_sum_2(lastTotal2) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62];; + parentConditionals[61] *= s61; + + tT += 63; + float64x2_t lastTotal3; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal3); + + hyFloat s62 = _neon_sum_2(lastTotal3) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62];; + parentConditionals[62] *= s62; + + sum += _neon_sum_2(grandTotal) + s60 + s61 + s62; #else __ll_product_sum_loop<63L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1568,7 +1811,7 @@ void _TheTree::ComputeBranchCache ( myParent = flatParents.list_data[myParent+flatLeaves.lLength]; } while (myParent >= 0); -#if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS +#if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON hyFloat * tMatrixT = nil; switch (alphabetDimension) { case 20: @@ -1755,11 +1998,20 @@ void _TheTree::ComputeBranchCache ( if (alphabetDimension == 4L) { #ifdef _SLKP_USE_AVX_INTRINSICS - __m256d tmatrix_transpose [4] = { - (__m256d) {transitionMatrix[0],transitionMatrix[4],transitionMatrix[8],transitionMatrix[12]}, - (__m256d) {transitionMatrix[1],transitionMatrix[5],transitionMatrix[9],transitionMatrix[13]}, - (__m256d) {transitionMatrix[2],transitionMatrix[6],transitionMatrix[10],transitionMatrix[14]}, - (__m256d) {transitionMatrix[3],transitionMatrix[7],transitionMatrix[11],transitionMatrix[15]} + __m256d tmatrix_transpose [4] = { + (__m256d) {transitionMatrix[0],transitionMatrix[4],transitionMatrix[8],transitionMatrix[12]}, + (__m256d) {transitionMatrix[1],transitionMatrix[5],transitionMatrix[9],transitionMatrix[13]}, + (__m256d) {transitionMatrix[2],transitionMatrix[6],transitionMatrix[10],transitionMatrix[14]}, + (__m256d) {transitionMatrix[3],transitionMatrix[7],transitionMatrix[11],transitionMatrix[15]} + }; + #endif + + #ifdef _SLKP_USE_ARM_NEON + float64x2x2_t tmatrix_transpose [4] = { + (float64x2x2_t) {transitionMatrix[0],transitionMatrix[4],transitionMatrix[8],transitionMatrix[12]}, + (float64x2x2_t) {transitionMatrix[1],transitionMatrix[5],transitionMatrix[9],transitionMatrix[13]}, + (float64x2x2_t) {transitionMatrix[2],transitionMatrix[6],transitionMatrix[10],transitionMatrix[14]}, + (float64x2x2_t) {transitionMatrix[3],transitionMatrix[7],transitionMatrix[11],transitionMatrix[15]} }; #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 4L) { @@ -1773,7 +2025,7 @@ void _TheTree::ComputeBranchCache ( continue; } long didScale = 0; - #ifdef _SLKP_USE_AVX_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_ARM_NEON _handle4x4_pruning_case (childVector, tMatrix, parentConditionals, tmatrix_transpose); #else _handle4x4_pruning_case (childVector, tMatrix, parentConditionals, nil); @@ -1791,7 +2043,7 @@ void _TheTree::ComputeBranchCache ( __handle_site_corrections(didScale, siteID); } } else if (alphabetDimension == 20L) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<20>(transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 20L) { @@ -1809,6 +2061,11 @@ void _TheTree::ComputeBranchCache ( __m128d s128; s128 = __ll_handle_block10_product_sum<20,0> (tMatrixT, childVector, parentConditionals, nil); s128 = __ll_handle_block10_product_sum<20,10> (tMatrixT, childVector, parentConditionals, &s128); + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal = vdupq_n_f64(0.); + grandTotal = __ll_handle_block10_product_sum<20,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<20,10> (tMatrixT, childVector, parentConditionals, &grandTotal),grandTotal; + sum = _neon_sum_2 (grandTotal); #else __ll_product_sum_loop<20L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1824,7 +2081,7 @@ void _TheTree::ComputeBranchCache ( __handle_site_corrections(didScale, siteID); } } else if (alphabetDimension == 60L) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<60L>(transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 60L) { @@ -1849,6 +2106,14 @@ void _TheTree::ComputeBranchCache ( grandTotal = __ll_handle_block10_product_sum<60,30> (tMatrixT, childVector, parentConditionals, &grandTotal); grandTotal = __ll_handle_block10_product_sum<60,40> (tMatrixT, childVector, parentConditionals, &grandTotal); grandTotal = __ll_handle_block10_product_sum<60,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + #elif defined _SLKP_USE_ARM_NEON + float64x2_t + grandTotal = __ll_handle_block10_product_sum<60,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<60,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<60,50> (tMatrixT, childVector, parentConditionals, &grandTotal); #else __ll_product_sum_loop<60L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1864,7 +2129,7 @@ void _TheTree::ComputeBranchCache ( __handle_site_corrections(didScale, siteID); } } else if (alphabetDimension == 61L) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<61L>(transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 61L) { @@ -1909,6 +2174,25 @@ void _TheTree::ComputeBranchCache ( __ll_handle_block10_product_sum_linear<61,50> (tT, childVector, &lastTotal); hyFloat s60 = _sse_sum_2(lastTotal) + childVector[60] * tT[60]; parentConditionals[60] *= s60; + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal; + grandTotal = __ll_handle_block10_product_sum<61,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<61,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<61,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 61*60; + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<61,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<61,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60]; + parentConditionals[60] *= s60; #else __ll_product_sum_loop<61L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -1925,7 +2209,7 @@ void _TheTree::ComputeBranchCache ( __handle_site_corrections(didScale, siteID); } } else if (alphabetDimension == 62L) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<62L>(transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 62L) { @@ -1990,6 +2274,36 @@ void _TheTree::ComputeBranchCache ( __ll_handle_block10_product_sum_linear<62,50> (tT, childVector, &lastTotal2); hyFloat s61 = _sse_sum_2(lastTotal2) + childVector[60] * tT[60] + childVector[61] * tT[61]; parentConditionals[61] *= s61; + #elif defined _SLKP_USE_ARM_NEON + float64x2_t grandTotal + = __ll_handle_block10_product_sum<62,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<62,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<62,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 62*60; + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<62,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<62,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60] + childVector[61] * tT[61]; + parentConditionals[60] *= s60; + + tT += 62; + float64x2_t lastTotal2; + __ll_handle_block10_product_sum_linear<62,0> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,10> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,20> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,30> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,40> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<62,50> (tT, childVector, &lastTotal2); + hyFloat s61 = _neon_sum_2(lastTotal2) + childVector[60] * tT[60] + childVector[61] * tT[61]; + parentConditionals[61] *= s61; #else __ll_product_sum_loop<62L> (tMatrix, childVector, parentConditionals, sum); #endif @@ -2005,7 +2319,7 @@ void _TheTree::ComputeBranchCache ( __handle_site_corrections(didScale, siteID); } } else if (alphabetDimension == 63L) { - #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS + #if defined _SLKP_USE_AVX_INTRINSICS || defined _SLKP_USE_SSE_INTRINSICS || defined _SLKP_USE_ARM_NEON __ll_handle_matrix_transpose<63L>(transitionMatrix, tMatrixT); #endif for (long siteID = siteFrom; siteID < siteTo; siteID++, parentConditionals += 63L) { @@ -2089,6 +2403,48 @@ void _TheTree::ComputeBranchCache ( __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal3); hyFloat s62 = _sse_sum_2(lastTotal3) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62]; parentConditionals[62] *= s62; + + #elif defined _SLKP_USE_ARM_NEON + float64x2_t + grandTotal = __ll_handle_block10_product_sum<63,0> (tMatrixT, childVector, parentConditionals, nil); + grandTotal = __ll_handle_block10_product_sum<63,10> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,20> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,30> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,40> (tMatrixT, childVector, parentConditionals, &grandTotal); + grandTotal = __ll_handle_block10_product_sum<63,50> (tMatrixT, childVector, parentConditionals, &grandTotal); + + hyFloat const * __restrict tT = tMatrix + 63*60; + float64x2_t lastTotal; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal); + hyFloat s60 = _neon_sum_2(lastTotal) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62]; + parentConditionals[60] *= s60; + + tT += 63; + float64x2_t lastTotal2; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal2); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal2); + hyFloat s61 = _neon_sum_2(lastTotal2) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62]; + parentConditionals[61] *= s61; + + tT += 63; + float64x2_t lastTotal3; + __ll_handle_block10_product_sum_linear<63,0> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,10> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,20> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,30> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,40> (tT, childVector, &lastTotal3); + __ll_handle_block10_product_sum_linear<63,50> (tT, childVector, &lastTotal3); + hyFloat s62 = _neon_sum_2(lastTotal3) + childVector[60] * tT[60] + childVector[61] * tT[61] + childVector[62] * tT[62]; + parentConditionals[62] *= s62; #else __ll_product_sum_loop<63L> (tMatrix, childVector, parentConditionals, sum); #endif diff --git a/src/lib/setup.py b/src/lib/setup.py index 0dad51a3d..8b36da24b 100644 --- a/src/lib/setup.py +++ b/src/lib/setup.py @@ -5,7 +5,6 @@ from glob import glob import sys -from platform import architecture, mac_ver #incdir = get_python_inc(plat_specific=1) #print incdir @@ -48,8 +47,10 @@ define_macros = [('__HYPHY_64__', None)] if '64' in architecture()[0] else [] # openmp on Mac OS X Lion is broken -major, minor = mac_ver()[0].split('.') -openmp = ['-fopenmp'] if int(major) < 10 or (int(major) == 10 and int(minor) < 7) else [] +#major, minor = mac_ver()[0].split('.') +#openmp = ['-fopenmp'] if int(major) < 10 or (int(major) == 10 and int(minor) < 7) else [] + +openmp = [] setup( name = 'HyPhy', @@ -59,7 +60,7 @@ author_email = 'spond@temple.edu', url = 'http://www.hyphy.org/', packages = ['HyPhy'], - package_dir = {'HyPhy': 'LibraryModules/Python/HyPhy'}, + package_dir = {'HyPhy': './'}, # data_files = resFiles, # py_modules = ['HyPhy'], ext_modules = [Extension('_HyPhy', diff --git a/tests/hbltests/Results/HIVSweden.out b/tests/hbltests/Results/HIVSweden.out index 79b8035b5..41addf90d 100644 --- a/tests/hbltests/Results/HIVSweden.out +++ b/tests/hbltests/Results/HIVSweden.out @@ -2,28 +2,28 @@ *** RUNNING SINGLE RATE MODEL *** ################################# ->Done in 5 seconds +>Done in 3 seconds --1137.68842968708 +-1137.68870195817 ----------------------------------- -dN/dS = 0.9051395450601901 +dN/dS = 0.9071719918589423 *** RUNNING MODEL 1 (Neutral) *** ###################################### ->Done in 3 seconds +>Done in 2 seconds --1114.64197979838 +-1114.64190562717 ------------------------------------------------ -dN/dS = 0.5549192972885679 (sample variance = 0.2120268320358276) +dN/dS = 0.5548244466259989 (sample variance = 0.2120544773207587) -Rate[1]= 0.07854090 (weight=0.4830173) -Rate[2]= 1.00000000 (weight=0.5169827) +Rate[1]= 0.07848545 (weight=0.4830912) +Rate[2]= 1.00000000 (weight=0.5169088) ------------------------------------------------ @@ -37,215 +37,215 @@ Import the following part into a data processing program for further analysis -Rate/Site Rate=0.07854089951899505 Rate=1 -1 0.0004154841702208897 0.9995845158297791 -2 0.9494800946553694 0.05051990534463052 -3 0.8527138076557068 0.1472861923442931 -4 0.9772953209605599 0.02270467903944003 -5 0.7089270358876215 0.2910729641123785 -6 0.9706034286518881 0.02939657134811178 -7 0.8411759549187992 0.1588240450812008 -8 0.644850909128687 0.3551490908713131 -9 0.009023903715709264 0.9909760962842907 -10 0.2515932185849544 0.7484067814150457 -11 0.9362971760831602 0.0637028239168398 -12 0.8039911310919982 0.1960088689080018 -13 0.9592990339188422 0.04070096608115781 -14 0.6831216537039989 0.3168783462960011 -15 0.9686853075822512 0.03131469241774875 -16 0.9686853075822512 0.03131469241774875 -17 0.9494800946553694 0.05051990534463052 -18 0.09627709694116433 0.9037229030588357 -19 0.8046571650681456 0.1953428349318545 -20 0.4254688557117581 0.5745311442882419 -21 0.001162113715308391 0.9988378862846916 -22 0.001180617896019509 0.9988193821039805 -23 0.9494800946553694 0.05051990534463052 -24 7.060663427640608e-05 0.9999293933657236 -25 0.8600353102504529 0.1399646897495471 -26 2.239754017472222e-05 0.9999776024598254 -27 0.7157862269098134 0.2842137730901866 -28 1.823968038733901e-08 0.9999999817603197 -29 0.9785832556654707 0.02141674433452926 -30 0.582384830553312 0.417615169446688 -31 0.002379358355164586 0.9976206416448354 -32 0.8411759549187992 0.1588240450812008 -33 0.9362971760831602 0.0637028239168398 -34 0.6979822928927298 0.3020177071072702 -35 0.778392401117516 0.2216075988824839 -36 0.01421053968210576 0.9857894603178943 -37 0.06336691422415654 0.9366330857758434 -38 0.6912831003120626 0.3087168996879373 -39 0.001919852660566921 0.998080147339433 -40 0.003960890204721402 0.9960391097952785 -41 0.5294498773319029 0.470550122668097 -42 0.8383792474897703 0.1616207525102296 -43 0.5921771900482152 0.4078228099517847 -44 0.208194488060306 0.791805511939694 -45 0.4812547391790605 0.5187452608209394 -46 0.01854854305069108 0.9814514569493089 -47 0.7638290038081602 0.2361709961918397 -48 0.1402805111076657 0.8597194888923344 -49 0.07649019370504373 0.9235098062949563 -50 0.5565553788637815 0.4434446211362185 -51 4.401412699981744e-06 0.9999955985873 -52 0.9686853075822512 0.03131469241774875 -53 0.9337995342113474 0.06620046578865253 -54 0.05713111117495385 0.9428688888250462 -55 0.6733962368122044 0.3266037631877956 -56 0.9785832556654707 0.02141674433452926 -57 0.03638999158091847 0.9636100084190816 -58 0.9366001361131814 0.06339986388681852 -59 0.02363153610583355 0.9763684638941664 -60 0.7157862269098134 0.2842137730901866 -61 0.2740628614189302 0.7259371385810699 -62 0.1720628516827533 0.8279371483172466 -63 0.06696027896282203 0.933039721037178 -64 0.1590255451412893 0.8409744548587106 -65 0.1371557240403739 0.8628442759596261 -66 2.83460436728884e-08 0.9999999716539563 -67 0.5390460493463879 0.4609539506536121 -68 0.0006628127730370201 0.999337187226963 -69 2.873441346176776e-05 0.9999712655865381 -70 0.3234733611365553 0.6765266388634448 -71 0.2249283193971887 0.7750716806028114 -72 0.161927560424059 0.8380724395759409 -73 0.1394764462745381 0.8605235537254619 -74 0.9494800946553694 0.05051990534463052 -75 0.02458202885075857 0.9754179711492413 -76 6.411295676480708e-05 0.9999358870432352 -77 0.9715441430178714 0.02845585698212867 -78 0.7862785460776877 0.2137214539223124 -79 0.7328737186780433 0.2671262813219566 -80 0.9706034286518881 0.02939657134811178 -81 0.8611706364190057 0.1388293635809944 -82 0.7668224471831266 0.2331775528168733 -83 5.672677285259407e-05 0.9999432732271474 -84 0.006741759929513022 0.993258240070487 -85 0.96836526712463 0.03163473287536989 -86 0.9686853075822512 0.03131469241774875 -87 1.531106965534881e-05 0.9999846889303446 -88 0.7668224471831266 0.2331775528168733 -89 0.9316117506366601 0.06838824936333981 -90 0.0386175672192274 0.9613824327807726 -91 0.5872082036930426 0.4127917963069574 +Rate/Site Rate=0.07848544796897376 Rate=1 +1 0.0004147478183825302 0.9995852521816174 +2 0.949615551955995 0.05038444804400498 +3 0.8529359368370378 0.1470640631629622 +4 0.9773716102717996 0.02262838972820037 +5 0.7091414936372149 0.290858506362785 +6 0.9706969154415642 0.02930308455843569 +7 0.841382629288338 0.158617370711662 +8 0.6450287771422856 0.3549712228577144 +9 0.009017591385717552 0.9909824086142826 +10 0.2517327512397574 0.7482672487602428 +11 0.9364355050188269 0.06356449498117313 +12 0.8042392149007731 0.1957607850992268 +13 0.9594019831567395 0.04059801684326048 +14 0.6833362390928011 0.316663760907199 +15 0.9687756816361081 0.03122431836389203 +16 0.9687756816361081 0.03122431836389203 +17 0.949615551955995 0.05038444804400498 +18 0.09626486184129626 0.9037351381587037 +19 0.8048657022558181 0.1951342977441818 +20 0.4258332890773215 0.5741667109226785 +21 0.001162544759598141 0.9988374552404018 +22 0.00117934284219372 0.9988206571578063 +23 0.949615551955995 0.05038444804400498 +24 7.05692249317919e-05 0.9999294307750682 +25 0.8602498450519684 0.1397501549480315 +26 2.235787126908494e-05 0.9999776421287309 +27 0.7160075322697251 0.2839924677302748 +28 1.81694382789184e-08 0.9999999818305617 +29 0.978655781308275 0.02134421869172495 +30 0.5825330972325473 0.4174669027674526 +31 0.00237797957064055 0.9976220204293594 +32 0.841382629288338 0.158617370711662 +33 0.9364355050188269 0.06356449498117313 +34 0.6984640898990011 0.3015359101009989 +35 0.7788646013536071 0.2211353986463928 +36 0.01422156141350081 0.9857784385864992 +37 0.06342743744187564 0.9365725625581244 +38 0.6917494525406751 0.3082505474593248 +39 0.00191842922725556 0.9980815707727445 +40 0.003962940766435124 0.9960370592335649 +41 0.5299471303854615 0.4700528696145386 +42 0.8385993998396862 0.1614006001603138 +43 0.5923948163573549 0.407605183642645 +44 0.2085284079556125 0.7914715920443876 +45 0.4816557620682597 0.5183442379317403 +46 0.01854486833821928 0.9814551316617807 +47 0.7640484766915798 0.2359515233084201 +48 0.1404682371700924 0.8595317628299075 +49 0.07657491682608442 0.9234250831739156 +50 0.5570427216616054 0.4429572783383945 +51 4.395262579126055e-06 0.9999956047374209 +52 0.9687756816361081 0.03122431836389203 +53 0.9339655403783996 0.06603445962160043 +54 0.057176351410101 0.942823648589899 +55 0.6738749864466691 0.3261250135533308 +56 0.978655781308275 0.02134421869172495 +57 0.03639242086085162 0.9636075791391484 +58 0.9367572417729919 0.06324275822700812 +59 0.02363022019514807 0.9763697798048518 +60 0.7160075322697251 0.2839924677302748 +61 0.2742154507693895 0.7257845492306105 +62 0.1721183776736239 0.8278816223263761 +63 0.06701858054585133 0.9329814194541487 +64 0.1592881364653803 0.8407118635346197 +65 0.1373340643424358 0.8626659356575642 +66 2.827881136893144e-08 0.9999999717211886 +67 0.5391574881592042 0.4608425118407958 +68 0.0006622339418346806 0.9993377660581653 +69 2.866242595935145e-05 0.9999713375740407 +70 0.3237224918056043 0.6762775081943957 +71 0.2250277291358384 0.7749722708641616 +72 0.1621092944461913 0.8378907055538086 +73 0.1394617511496932 0.8605382488503069 +74 0.949615551955995 0.05038444804400498 +75 0.02458626333717552 0.9754137366628245 +76 6.407045470315398e-05 0.9999359295452969 +77 0.9716263288159281 0.02837367118407193 +78 0.7864931352062936 0.2135068647937064 +79 0.7333948721239267 0.2666051278760733 +80 0.9706969154415642 0.02930308455843569 +81 0.8613585063476906 0.1386414936523094 +82 0.7670412142745399 0.2329587857254601 +83 5.666114976590877e-05 0.9999433388502341 +84 0.006727990372494228 0.9932720096275057 +85 0.9684585560287761 0.03154144397122403 +86 0.9687756816361081 0.03122431836389203 +87 1.52384846143212e-05 0.9999847615153856 +88 0.7670412142745399 0.2329587857254601 +89 0.9317501104144779 0.06824988958552211 +90 0.03862425798799864 0.9613757420120014 +91 0.587359366036023 0.4126406339639771 *** RUNNING MODEL 2 (Selection) *** ###################################### ->Done in 8 seconds +>Done in 6 seconds --1106.44545333916 +-1106.44543364521 ------------------------------------------------ -dN/dS = 1.129068005488161 (sample variance = 1.588067143267372) +dN/dS = 1.126641026596627 (sample variance = 1.581382036629081) -Rate[1]= 0.05854806 (weight=0.3747723) -Rate[2]= 1.00000000 (weight=0.4427396) -Rate[3]= 3.64070986 (weight=0.1824881) +Rate[1]= 0.05768870 (weight=0.3739141) +Rate[2]= 1.00000000 (weight=0.4447790) +Rate[3]= 3.64184458 (weight=0.1813069) ------------------------------------------------ Sites with dN/dS>1 (Posterior cutoff = 0.9) -26 (0.9071235888236596) -28 (0.999208561635355) -66 (0.9984823131311625) -87 (0.985639180825136) +26 (0.905748932682225) +28 (0.9991894014403121) +66 (0.9984584808494847) +87 (0.9854566318670548) ------------------------------------------------ Sites with dN/dS<=1 (Posterior cutoff = 0.9) -1 (0.5052373923971539) -2 (0.0004593492021371499) -3 (0.005917205452138728) -4 (9.663607149571083e-05) -5 (0.03172842746069128) -6 (0.0001511795330813065) -7 (0.006943635909993886) -8 (0.05673891477006113) -9 (0.7226692280451891) -10 (0.09651028335914213) -11 (0.0009207100403048468) -12 (0.01064028334210353) -13 (0.0004934182636844576) -14 (0.03991745163515635) -15 (0.0002225580972989534) -16 (0.0002225580972989534) -17 (0.0004593492021371499) -18 (0.4330406113257519) -19 (0.0116121595707726) -20 (0.0336812685288552) -21 (0.2473118054259542) -22 (0.7984465921549734) -23 (0.0004593492021371499) -24 (0.5842280859182822) -25 (0.005172980464989402) -27 (0.02940476216359821) -29 (7.952947195176931e-05) -30 (0.09146670281887008) -31 (0.5699030448214001) -32 (0.006943635909993886) -33 (0.0009207100403048468) -34 (0.004515364181301996) -35 (0.002050750658840959) -36 (0.1052215418403245) -37 (0.08602792425933428) -38 (0.005398479347200226) -39 (0.6440044083471241) -40 (0.4558311806110331) -41 (0.01185264798035508) -42 (0.006857989522987185) -43 (0.08367248274274289) -44 (0.01183660486503293) -45 (0.01750623812172373) -46 (0.4294127609528174) -47 (0.0183780006215666) -48 (0.02534884977862542) -49 (0.07112745644118525) -50 (0.01063437293207849) -51 (0.8861014582086777) -52 (0.0002225580972989534) -53 (0.0007683604393760527) -54 (0.09835228649247904) -55 (0.006339545564899368) -56 (7.952947195176931e-05) -57 (0.2050735349427733) -58 (0.000727531168723411) -59 (0.3314435529804894) -60 (0.02940476216359821) -61 (0.08200111674080049) -62 (0.193666026549605) -63 (0.0772513986346591) -64 (0.02148421691961612) -65 (0.02745286636506417) -67 (0.1261823647518457) -68 (0.605555060135807) -69 (0.8333573458992974) -70 (0.05534546347214257) -71 (0.1227852244756305) -72 (0.02110507208988902) -73 (0.2776010249671293) -74 (0.0004593492021371499) -75 (0.3073445506286841) -76 (0.6759845714674814) -77 (0.0002305219770671781) -78 (0.01499353952539858) -79 (0.003225300542333516) -80 (0.0001511795330813065) -81 (0.005173815704594973) -82 (0.01777066638393202) -83 (0.8142466349701873) -84 (0.2783391381741632) -85 (0.0001964834081581603) -86 (0.0002225580972989534) -88 (0.01777066638393202) -89 (0.001173394928906147) -90 (0.1864176575163458) -91 (0.08828981163248605) +1 (0.5005649106541394) +2 (0.0004506733110064033) +3 (0.005834191751320087) +4 (9.457380175062416e-05) +5 (0.03138510059998675) +6 (0.0001480393169770257) +7 (0.006847428124619753) +8 (0.05619655457626769) +9 (0.7202616116659697) +10 (0.09532197568044432) +11 (0.0009046734380393297) +12 (0.01050223976511856) +13 (0.0004847586715730755) +14 (0.03950684521589812) +15 (0.0002182155996077391) +16 (0.0002182155996077391) +17 (0.0004506733110064033) +18 (0.4303827103641656) +19 (0.01146247222682962) +20 (0.03318736449456186) +21 (0.2436354993793777) +22 (0.7962699486165546) +23 (0.0004506733110064033) +24 (0.57941118084374) +25 (0.005099728165153801) +27 (0.02908192622556446) +29 (7.778197017495751e-05) +30 (0.09069853965707597) +31 (0.5660401862868557) +32 (0.006847428124619753) +33 (0.0009046734380393297) +34 (0.004442814164845603) +35 (0.002016405422878198) +36 (0.1033806809795843) +37 (0.0845869735850874) +38 (0.005315569117438836) +39 (0.6404963641339237) +40 (0.4513223091835529) +41 (0.01166252323108743) +42 (0.006762560825262604) +43 (0.08294489462806978) +44 (0.01159145309975683) +45 (0.01723403953949377) +46 (0.4258799056869767) +47 (0.01815822194877413) +48 (0.0248595357138091) +49 (0.06985038806310395) +50 (0.01046813156931801) +51 (0.8842009233039249) +52 (0.0002182155996077391) +53 (0.0007544903027667682) +54 (0.09676453858925446) +55 (0.006243972369370006) +56 (7.778197017495751e-05) +57 (0.2024175281590031) +58 (0.0007143331590382112) +59 (0.328082302559163) +60 (0.02908192622556446) +61 (0.08094857058070434) +62 (0.191684602210924) +63 (0.07594221099860735) +64 (0.02106638033100854) +65 (0.02692637587837913) +67 (0.1252291578690442) +68 (0.6013314754839405) +69 (0.8310257634017505) +70 (0.05459581602399919) +71 (0.1213339421503428) +72 (0.02070417777354739) +73 (0.2752230672821743) +74 (0.0004506733110064033) +75 (0.3040808672019115) +76 (0.6716789194025321) +77 (0.0002260604432214562) +78 (0.01480917575963592) +79 (0.003175560377811062) +80 (0.0001480393169770257) +81 (0.005098851302542715) +82 (0.01755686700678069) +83 (0.8117048671851982) +84 (0.2741932110594019) +85 (0.0001925217117709219) +86 (0.0002182155996077391) +88 (0.01755686700678069) +89 (0.001153571766383773) +90 (0.1839063827910584) +91 (0.0875405164290031) ------------------------------------------------ @@ -256,95 +256,95 @@ Import the following part into a data processing program for further analysis -Rate/Site Rate=0.05854805862606752 Rate=1 Rate=3.640709855148721 -1 2.123821587431194e-05 0.4947413693869718 0.5052373923971539 -2 0.8454805150711112 0.1540601357267516 0.0004593492021371499 -3 0.7171925566791651 0.2768902378686959 0.005917205452138728 -4 0.9023145803022071 0.09758878362629728 9.663607149571083e-05 -5 0.5829266831612094 0.3853448893780992 0.03172842746069128 -6 0.8874735538558555 0.112375266611063 0.0001511795330813065 -7 0.701299413788691 0.2917569503013152 0.006943635909993886 -8 0.5258486059883327 0.4174124792416061 0.05673891477006113 -9 0.00105308734360597 0.276277684611205 0.7226692280451891 -10 0.1004481920361048 0.803041524604753 0.09651028335914213 -11 0.8194140603392319 0.1796652296204631 0.0009207100403048468 -12 0.6717772510719154 0.3175824655859811 0.01064028334210353 -13 0.8646165745664404 0.1348900071698753 0.0004934182636844576 -14 0.5624375756998519 0.3976449726649919 0.03991745163515635 -15 0.8822191864215847 0.1175582554811163 0.0002225580972989534 -16 0.8822191864215847 0.1175582554811163 0.0002225580972989534 -17 0.8454805150711112 0.1540601357267516 0.0004593492021371499 -18 0.032983308732931 0.533976079941317 0.4330406113257519 -19 0.6645919677056602 0.3237958727235673 0.0116121595707726 -20 0.1597162578236267 0.806602473647518 0.0336812685288552 -21 6.497561809983018e-05 0.752623218955946 0.2473118054259542 -22 6.147883799728457e-05 0.2014919290070293 0.7984465921549734 -23 0.8454805150711112 0.1540601357267516 0.0004593492021371499 -24 1.810178742010458e-06 0.4157701039029758 0.5842280859182822 -25 0.7259101630303384 0.2689168565046722 0.005172980464989402 -26 1.883724016460931e-07 0.09287622280393867 0.9071235888236596 -27 0.5896752507993037 0.3809199870370982 0.02940476216359821 -28 7.593295246048614e-14 0.0007914383645691474 0.999208561635355 -29 0.9055057132759483 0.09441475725209994 7.952947195176931e-05 -30 0.4715021983865639 0.437031098794566 0.09146670281887008 -31 0.0001979673775266633 0.4298989878010734 0.5699030448214001 -32 0.701299413788691 0.2917569503013152 0.006943635909993886 -33 0.8194140603392319 0.1796652296204631 0.0009207100403048468 -34 0.2960737907520586 0.6994108450666394 0.004515364181301996 -35 0.3559604560803941 0.641988793260765 0.002050750658840959 -36 0.00130942674258278 0.8934690314170927 0.1052215418403245 -37 0.01180978530471919 0.9021622904359465 0.08602792425933428 -38 0.2941041942604504 0.7004973263923493 0.005398479347200226 -39 0.000144144499790302 0.3558514471530856 0.6440044083471241 -40 0.0002529699651473707 0.5439158494238194 0.4558311806110331 -41 0.2120674596489594 0.7760798923706854 0.01185264798035508 -42 0.7007026627516525 0.2924393477253602 0.006857989522987185 -43 0.4841294396612101 0.432198077596047 0.08367248274274289 -44 0.0301228872990572 0.9580405078359099 0.01183660486503293 -45 0.1893248839788894 0.7931688778993868 0.01750623812172373 -46 0.003359794088751816 0.5672274449584306 0.4294127609528174 -47 0.63009891120623 0.3515230881722035 0.0183780006215666 -48 0.02206748041476434 0.9525836698066102 0.02534884977862542 -49 0.01356591438428317 0.9153066291745316 0.07112745644118525 -50 0.2240745276745924 0.7652910993933291 0.01063437293207849 -51 2.492645830161824e-08 0.1138985168648641 0.8861014582086777 -52 0.8822191864215847 0.1175582554811163 0.0002225580972989534 -53 0.8211660693180184 0.1780655702426055 0.0007683604393760527 -54 0.01100295212811388 0.8906447613794071 0.09835228649247904 -55 0.2833340811759865 0.7103263732591142 0.006339545564899368 -56 0.9055057132759483 0.09441475725209994 7.952947195176931e-05 -57 0.007045975624482142 0.7878804894327445 0.2050735349427733 -58 0.8240613622898397 0.1752111065414368 0.000727531168723411 -59 0.004581163540158042 0.6639752834793526 0.3314435529804894 -60 0.5896752507993037 0.3809199870370982 0.02940476216359821 -61 0.107806169745878 0.8101927135133216 0.08200111674080049 -62 0.06861334373069095 0.7377206297197041 0.193666026549605 -63 0.01242704838986371 0.9103215529754771 0.0772513986346591 -64 0.02390636440935269 0.9546094186710312 0.02148421691961612 -65 0.02151934533944069 0.9510277882954953 0.02745286636506417 -66 2.671166083534726e-12 0.001517686866166226 0.9984823131311625 -67 0.4295450384222473 0.4442725968259071 0.1261823647518457 -68 3.079998533959148e-05 0.3944141398788535 0.605555060135807 -69 4.243743084525438e-07 0.1666422297263941 0.8333573458992974 -70 0.1288881665397853 0.8157663699880722 0.05534546347214257 -71 0.08913564852994389 0.7880791269944257 0.1227852244756305 -72 0.02499602234463322 0.9538989055654777 0.02110507208988902 -73 0.05300541149090041 0.6693935635419703 0.2776010249671293 -74 0.8454805150711112 0.1540601357267516 0.0004593492021371499 -75 0.004981355134669857 0.687674094236646 0.3073445506286841 -76 1.275841252079727e-06 0.3240141526912666 0.6759845714674814 -77 0.8883601983297144 0.1114092796932183 0.0002305219770671781 -78 0.6485688357833093 0.3364376246912923 0.01499353952539858 -79 0.3321969903648257 0.6645777090928409 0.003225300542333516 -80 0.8874735538558555 0.112375266611063 0.0001511795330813065 -81 0.7193517468744175 0.2754744374209876 0.005173815704594973 -82 0.6319797015005985 0.3502496321154693 0.01777066638393202 -83 9.451297855017863e-07 0.1857524199000273 0.8142466349701873 -84 9.483309866743794e-05 0.7215660287271692 0.2783391381741632 -85 0.8807566499603399 0.119046866631502 0.0001964834081581603 -86 0.8822191864215847 0.1175582554811163 0.0002225580972989534 -87 1.512672292179054e-08 0.01436080404814105 0.985639180825136 -88 0.6319797015005985 0.3502496321154693 0.01777066638393202 -89 0.8117475469052963 0.1870790581657977 0.001173394928906147 -90 0.007483970311633406 0.8060983721720208 0.1864176575163458 -91 0.4758571278249646 0.4358530605425494 0.08828981163248605 +Rate/Site Rate=0.05768869986246565 Rate=1 Rate=3.641844577696067 +1 2.014182872779183e-05 0.4994149475171328 0.5005649106541394 +2 0.8452194549572273 0.1543298717317663 0.0004506733110064033 +3 0.7164617500753068 0.2777040581733731 0.005834191751320087 +4 0.9022505078903756 0.09765491830787378 9.457380175062416e-05 +5 0.5818848042477683 0.3867300951522449 0.03138510059998675 +6 0.8873636945850181 0.112488266098005 0.0001480393169770257 +7 0.7005076792958981 0.2926448925794821 0.006847428124619753 +8 0.5247883329162988 0.4190151125074336 0.05619655457626769 +9 0.001025124165060435 0.2787132641689699 0.7202616116659697 +10 0.0988666787453516 0.8058113455742041 0.09532197568044432 +11 0.8190555318482564 0.1800397947137043 0.0009046734380393297 +12 0.6708961696125029 0.3186015906223785 0.01050223976511856 +13 0.864433000690832 0.1350822406375951 0.0004847586715730755 +14 0.5613826888688731 0.399110465915229 0.03950684521589812 +15 0.8820922333215474 0.117689551078845 0.0002182155996077391 +16 0.8820922333215474 0.117689551078845 0.0002182155996077391 +17 0.8452194549572273 0.1543298717317663 0.0004506733110064033 +18 0.0324851386885853 0.537132150947249 0.4303827103641656 +19 0.6636954012888419 0.3248421264843285 0.01146247222682962 +20 0.1574210003780902 0.8093916351273479 0.03318736449456186 +21 6.147404749371294e-05 0.7563030265731285 0.2436354993793777 +22 5.912803676264833e-05 0.2036709233466827 0.7962699486165546 +23 0.8452194549572273 0.1543298717317663 0.0004506733110064033 +24 1.697882291310795e-06 0.4205871212739687 0.57941118084374 +25 0.7252007199820658 0.2696995518527805 0.005099728165153801 +26 1.76903184809418e-07 0.09425089041459019 0.905748932682225 +27 0.5886393930203893 0.3822786807540463 0.02908192622556446 +28 7.257830102405714e-14 0.000810598559615294 0.9991894014403121 +29 0.905453981684754 0.09446823634507101 7.778197017495751e-05 +30 0.4704804321322921 0.438821028210632 0.09069853965707597 +31 0.000190267409636111 0.4337695463035081 0.5660401862868557 +32 0.7005076792958981 0.2926448925794821 0.006847428124619753 +33 0.8190555318482564 0.1800397947137043 0.0009046734380393297 +34 0.2927441515249551 0.7028130343101993 0.004442814164845603 +35 0.3524412830671291 0.6455423115099926 0.002016405422878198 +36 0.001253859115858844 0.895365459904557 0.1033806809795843 +37 0.01145549195170351 0.9039575344632091 0.0845869735850874 +38 0.2907971110003181 0.7038873198822431 0.005315569117438836 +39 0.0001386157668289592 0.3593650200992473 0.6404963641339237 +40 0.0002432878779577585 0.5484344029384893 0.4513223091835529 +41 0.2092473563395368 0.7790901204293758 0.01166252323108743 +42 0.6999063654856726 0.2933310736890647 0.006762560825262604 +43 0.4831185953460878 0.4339365100258424 0.08294489462806978 +44 0.0292576800605162 0.9591508668397269 0.01159145309975683 +45 0.1867006668711101 0.796065293589396 0.01723403953949377 +46 0.003266315231745858 0.5708537790812774 0.4258799056869767 +47 0.6291274207544222 0.3527143572968038 0.01815822194877413 +48 0.02141713714322223 0.9537233271429686 0.0248595357138091 +49 0.01317171754156796 0.9169778943953281 0.06985038806310395 +50 0.2211497254419244 0.7683821429887575 0.01046813156931801 +51 2.314690514493323e-08 0.11579905354917 0.8842009233039249 +52 0.8820922333215474 0.117689551078845 0.0002182155996077391 +53 0.8208101360851635 0.1784353736120698 0.0007544903027667682 +54 0.01067202582008938 0.8925634355906562 0.09676453858925446 +55 0.2800657198062314 0.7136903078243987 0.006243972369370006 +56 0.905453981684754 0.09446823634507101 7.778197017495751e-05 +57 0.006838110356898206 0.7907443614840988 0.2024175281590031 +58 0.8237175911135334 0.1755680757274284 0.0007143331590382112 +59 0.004450345634491975 0.6674673518063451 0.328082302559163 +60 0.5886393930203893 0.3822786807540463 0.02908192622556446 +61 0.1061213208343808 0.812930108584915 0.08094857058070434 +62 0.06753143245596548 0.7407839653331104 0.191684602210924 +63 0.01205277332437679 0.9120050156770158 0.07594221099860735 +64 0.02320619483438383 0.9557274248346076 0.02106638033100854 +65 0.02088692244142688 0.952186701680194 0.02692637587837913 +66 2.448383399924293e-12 0.001541519148066872 0.9984584808494847 +67 0.4285837636617972 0.4461870784691586 0.1252291578690442 +68 2.946025360697879e-05 0.3986390642624525 0.6013314754839405 +69 3.98203585590263e-07 0.1689738383946639 0.8310257634017505 +70 0.1269233311499915 0.8184808528260092 0.05459581602399919 +71 0.08772963484839656 0.7909364230012607 0.1213339421503428 +72 0.02426486820361876 0.9550309540228338 0.02070417777354739 +73 0.05217899891051785 0.6725979338073078 0.2752230672821743 +74 0.8452194549572273 0.1543298717317663 0.0004506733110064033 +75 0.004838241899341649 0.6910808908987468 0.3040808672019115 +76 1.198920235239551e-06 0.3283198816772326 0.6716789194025321 +77 0.8882667916649903 0.1115071478917882 0.0002260604432214562 +78 0.6476389301739991 0.337551894066365 0.01480917575963592 +79 0.3286907698848671 0.668133669737322 0.003175560377811062 +80 0.8873636945850181 0.112488266098005 0.0001480393169770257 +81 0.7186192824913594 0.2762818662060978 0.005098851302542715 +82 0.63100855429723 0.3514345786959893 0.01755686700678069 +83 8.934097890872577e-07 0.1882942394050127 0.8117048671851982 +84 9.275585147221103e-05 0.725714033089126 0.2741932110594019 +85 0.8806249498392179 0.1191825284490111 0.0001925217117709219 +86 0.8820922333215474 0.117689551078845 0.0002182155996077391 +87 1.429077450481847e-08 0.0145433538421707 0.9854566318670548 +88 0.63100855429723 0.3514345786959893 0.01755686700678069 +89 0.8113577821477324 0.1874886460858839 0.001153571766383773 +90 0.007262246700933957 0.8088313705080076 0.1839063827910584 +91 0.4748305935897082 0.4376288899812887 0.0875405164290031 diff --git a/tests/hbltests/Results/HIVSweden.out_MODEL_-1.nex b/tests/hbltests/Results/HIVSweden.out_MODEL_-1.nex index f5c923a96..341155122 100644 --- a/tests/hbltests/Results/HIVSweden.out_MODEL_-1.nex +++ b/tests/hbltests/Results/HIVSweden.out_MODEL_-1.nex @@ -37,8 +37,8 @@ END; BEGIN HYPHY; -global c=0.9051395450601901; -global kappa=0.4052422178470169; +global c=0.9071719918589423; +global kappa=0.4054779971121202; modelMatrix={61,61}; modelMatrix[0][1]:=kappa*c*t; modelMatrix[0][2]:=t; @@ -636,33 +636,35 @@ TRY_NUMERIC_SEQUENCE_MATCH=0; ACCEPT_ROOTED_TREES=0; UseModel (theModel); -Tree givenTree=(((U68496,U68497)Node3,(U68501,(U68502,(U68503,((((U68504,U68506)Node15,U68505)Node14,U68507)Node13,U68508)Node12)Node10)Node8)Node6)Node2,U68498,(U68499,U68500)Node6_1); +Tree givenTree=(U68496,U68497,((U68498,(U68499,U68500)Node7)Node5,(U68501,(U68502,(U68503,((((U68504,U68506)Node19,U68505)Node18,U68507)Node17,U68508)Node16)Node14)Node12)Node10)Node4); + +givenTree.U68496.t=0.1930890973257582; +givenTree.U68497.t=0.6424612223788787; +givenTree.U68498.t=0.409121209278724; +givenTree.U68499.t=0.2307021519626118; +givenTree.U68500.t=0.8109768825230198; +givenTree.Node7.t=0.6770911492487591; +givenTree.Node5.t=0.192056403882526; +givenTree.U68501.t=1.068413022713992; +givenTree.U68502.t=1.553405763007259; +givenTree.U68503.t=1.569000404650932; +givenTree.U68504.t=0.1542179615342612; +givenTree.U68506.t=0.5708195775387649; +givenTree.Node19.t=0.4514474490820354; +givenTree.U68505.t=0.2461875505020575; +givenTree.Node18.t=0.2979766094476649; +givenTree.U68507.t=0.4958919610910434; +givenTree.Node17.t=0.2823906966108597; +givenTree.U68508.t=1.611576611031497; +givenTree.Node16.t=0.4108260863152219; +givenTree.Node14.t=0.5198983543648987; +givenTree.Node12.t=0.2732130787806588; +givenTree.Node10.t=0.3908328965048091; +givenTree.Node4.t=1.241058558313147;SetParameter (DEFER_CONSTRAINT_APPLICATION, 1, 0); +SetParameter (DEFER_CONSTRAINT_APPLICATION, 0, 0); -givenTree.U68496.t=0.1932757790297635; -givenTree.U68497.t=0.6442091360697517; -givenTree.Node3.t=1.244091350196699; -givenTree.U68501.t=1.072314145612458; -givenTree.U68502.t=1.555368999549274; -givenTree.U68503.t=1.573795636757062; -givenTree.U68504.t=0.1545017778755618; -givenTree.U68506.t=0.5724438957384254; -givenTree.Node15.t=0.452076415208067; -givenTree.U68505.t=0.2464768693858635; -givenTree.Node14.t=0.2982771493845721; -givenTree.U68507.t=0.4974360955659346; -givenTree.Node13.t=0.2831885574728508; -givenTree.U68508.t=1.613919861191977; -givenTree.Node12.t=0.41156092487671; -givenTree.Node10.t=0.5209728334125625; -givenTree.Node8.t=0.2739993974710216; -givenTree.Node6.t=0.3910850502827137; -givenTree.Node2.t=0.1922077082389846; -givenTree.U68498.t=0.4102499738247144; -givenTree.U68499.t=0.2311741894617957; -givenTree.U68500.t=0.8119659118420136; -givenTree.Node6_1.t=0.678816471820221; DataSet ds = ReadDataFile(USE_NEXUS_FILE_DATA); -DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0,1,5-8,10,9,11,12,2-4","TAA,TAG,TGA"); +DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0-8,10,9,11,12","TAA,TAG,TGA"); ASSUME_REVERSIBLE_MODELS=0; USE_LAST_RESULTS=1; LikelihoodFunction lf = (filteredData,givenTree); diff --git a/tests/hbltests/Results/HIVSweden.out_MODEL_0.nex b/tests/hbltests/Results/HIVSweden.out_MODEL_0.nex index e9838a0e4..15f8c7d83 100644 --- a/tests/hbltests/Results/HIVSweden.out_MODEL_0.nex +++ b/tests/hbltests/Results/HIVSweden.out_MODEL_0.nex @@ -37,11 +37,11 @@ END; BEGIN HYPHY; -global W=0.07854089951899505; +global W=0.07848544796897376; W:<1; -global P=0.483017317295036; +global P=0.4830911811352628; P:<1; -global kappa=0.3863915314909612; +global kappa=0.3862572108595822; c.weights={1,2}; c.weights[0][0]:=P; @@ -652,33 +652,35 @@ TRY_NUMERIC_SEQUENCE_MATCH=0; ACCEPT_ROOTED_TREES=0; UseModel (theModel); -Tree givenTree=(((U68496,U68497)Node3,(U68501,(U68502,(U68503,((((U68504,U68506)Node15,U68505)Node14,U68507)Node13,U68508)Node12)Node10)Node8)Node6)Node2,U68498,(U68499,U68500)Node6_1); +Tree givenTree=(U68496,U68497,((U68498,(U68499,U68500)Node7)Node5,(U68501,(U68502,(U68503,((((U68504,U68506)Node19,U68505)Node18,U68507)Node17,U68508)Node16)Node14)Node12)Node10)Node4); + +givenTree.U68496.t=0.3017206887153597; +givenTree.U68497.t=1.059711785366983; +givenTree.U68498.t=0.6389397418529784; +givenTree.U68499.t=0.3596540096621793; +givenTree.U68500.t=1.321906771217612; +givenTree.Node7.t=1.136554744000134; +givenTree.Node5.t=0.3043513200848355; +givenTree.U68501.t=1.769569624859784; +givenTree.U68502.t=2.675115127943899; +givenTree.U68503.t=2.706643713132597; +givenTree.U68504.t=0.2431327474261901; +givenTree.U68506.t=0.9044122018972757; +givenTree.Node19.t=0.7162910919155517; +givenTree.U68505.t=0.3956882184714828; +givenTree.Node18.t=0.4996707175132051; +givenTree.U68507.t=0.774075369790494; +givenTree.Node17.t=0.2764202496369115; +givenTree.U68508.t=2.852778762374758; +givenTree.Node16.t=0.7482651562390982; +givenTree.Node14.t=0.8229119584835572; +givenTree.Node12.t=0.3096780975715163; +givenTree.Node10.t=0.6408915003259278; +givenTree.Node4.t=2.084190034363844;SetParameter (DEFER_CONSTRAINT_APPLICATION, 1, 0); +SetParameter (DEFER_CONSTRAINT_APPLICATION, 0, 0); -givenTree.U68496.t=0.3014658876783538; -givenTree.U68497.t=1.058937965054783; -givenTree.Node3.t=2.08174984846752; -givenTree.U68501.t=1.764333870286974; -givenTree.U68502.t=2.673570567299918; -givenTree.U68503.t=2.699658069558426; -givenTree.U68504.t=0.2431541588198822; -givenTree.U68506.t=0.9038084521967604; -givenTree.Node15.t=0.7153498168088975; -givenTree.U68505.t=0.3956539001874164; -givenTree.Node14.t=0.4993402949067045; -givenTree.U68507.t=0.7737346315034068; -givenTree.Node13.t=0.277145717534158; -givenTree.U68508.t=2.851718049074558; -givenTree.Node12.t=0.7478267266145866; -givenTree.Node10.t=0.8232048503168657; -givenTree.Node8.t=0.309357412100375; -givenTree.Node6.t=0.6402997546627597; -givenTree.Node2.t=0.3042887273137687; -givenTree.U68498.t=0.6387909451494354; -givenTree.U68499.t=0.3594052835330087; -givenTree.U68500.t=1.319979691408115; -givenTree.Node6_1.t=1.13804766964687; DataSet ds = ReadDataFile(USE_NEXUS_FILE_DATA); -DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0,1,5-8,10,9,11,12,2-4","TAA,TAG,TGA"); +DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0-8,10,9,11,12","TAA,TAG,TGA"); ASSUME_REVERSIBLE_MODELS=0; USE_LAST_RESULTS=1; LikelihoodFunction lf = (filteredData,givenTree); diff --git a/tests/hbltests/Results/HIVSweden.out_MODEL_1.nex b/tests/hbltests/Results/HIVSweden.out_MODEL_1.nex index 27c62b245..7b9e7cabb 100644 --- a/tests/hbltests/Results/HIVSweden.out_MODEL_1.nex +++ b/tests/hbltests/Results/HIVSweden.out_MODEL_1.nex @@ -37,15 +37,15 @@ END; BEGIN HYPHY; -global P2=0.7081253811493516; +global P2=0.7104121522639678; P2:<1; -global W_1=0.05854805862606752; +global W_1=0.05768869986246565; W_1:<1; -global W_2=3.640709855148721; +global W_2=3.641844577696067; W_2:>1; -global P1=0.3747723001916182; +global P1=0.3739141342952348; P1:<1; -global kappa=0.3597040016204291; +global kappa=0.3597183898000834; c.weights={1,3}; c.weights[0][0]:=P1; @@ -658,33 +658,35 @@ TRY_NUMERIC_SEQUENCE_MATCH=0; ACCEPT_ROOTED_TREES=0; UseModel (theModel); -Tree givenTree=(((U68496,U68497)Node3,(U68501,(U68502,(U68503,((((U68504,U68506)Node15,U68505)Node14,U68507)Node13,U68508)Node12)Node10)Node8)Node6)Node2,U68498,(U68499,U68500)Node6_1); +Tree givenTree=(U68496,U68497,((U68498,(U68499,U68500)Node7)Node5,(U68501,(U68502,(U68503,((((U68504,U68506)Node19,U68505)Node18,U68507)Node17,U68508)Node16)Node14)Node12)Node10)Node4); + +givenTree.U68496.t=0.1820765235473329; +givenTree.U68497.t=0.6321073455214453; +givenTree.U68498.t=0.3343564501791256; +givenTree.U68499.t=0.1876640197504523; +givenTree.U68500.t=0.8005801879569905; +givenTree.Node7.t=0.750108304945318; +givenTree.Node5.t=0.189875002788432; +givenTree.U68501.t=1.055571283712865; +givenTree.U68502.t=1.747134065548263; +givenTree.U68503.t=1.779098089271837; +givenTree.U68504.t=0.1466122835681893; +givenTree.U68506.t=0.5212546540008767; +givenTree.Node19.t=0.4171959202637034; +givenTree.U68505.t=0.2414131900046768; +givenTree.Node18.t=0.2762126262689943; +givenTree.U68507.t=0.4784362608407944; +givenTree.Node17.t=0.006818214720109361; +givenTree.U68508.t=1.945808887957573; +givenTree.Node16.t=0.5086054961633685; +givenTree.Node14.t=0.508023542650578; +givenTree.Node12.t=0.1816851776949857; +givenTree.Node10.t=0.394526379685196; +givenTree.Node4.t=1.247106653589127;SetParameter (DEFER_CONSTRAINT_APPLICATION, 1, 0); +SetParameter (DEFER_CONSTRAINT_APPLICATION, 0, 0); -givenTree.U68496.t=0.1819372645612003; -givenTree.U68497.t=0.6305701510272947; -givenTree.Node3.t=1.244260990073252; -givenTree.U68501.t=1.05208595250357; -givenTree.U68502.t=1.746445328113835; -givenTree.U68503.t=1.774938469552289; -givenTree.U68504.t=0.1463724860825859; -givenTree.U68506.t=0.5200057981767465; -givenTree.Node15.t=0.4166914000694648; -givenTree.U68505.t=0.241055361539026; -givenTree.Node14.t=0.2754520695076969; -givenTree.U68507.t=0.4771824473110387; -givenTree.Node13.t=0.006305605103123437; -givenTree.U68508.t=1.944259154790765; -givenTree.Node12.t=0.5081406979260248; -givenTree.Node10.t=0.5074584441922946; -givenTree.Node8.t=0.1808898503811754; -givenTree.Node6.t=0.3939031724767187; -givenTree.Node2.t=0.1896556393922262; -givenTree.U68498.t=0.3337924593752155; -givenTree.U68499.t=0.1875734352334583; -givenTree.U68500.t=0.7998185516002427; -givenTree.Node6_1.t=0.7492860788646263; DataSet ds = ReadDataFile(USE_NEXUS_FILE_DATA); -DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0,1,5-8,10,9,11,12,2-4","TAA,TAG,TGA"); +DataSetFilter filteredData = CreateFilter(ds,3,"0-272","0-8,10,9,11,12","TAA,TAG,TGA"); ASSUME_REVERSIBLE_MODELS=0; USE_LAST_RESULTS=1; LikelihoodFunction lf = (filteredData,givenTree);