diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c33d5a2e..f6ef69c62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,7 +168,7 @@ set_source_files_properties(${SRC_CORE} ${SRC_NEW} {SRC_UTILS} PROPERTIES COMPIL #------------------------------------------------------------------------------- set(DEFAULT_WARNING_FLAGS " -w -Weffc++ -Wextra -Wall ") -set(DEFAULT_DEBUG_WARNING_FLAGS "-Wall -Wno-int-to-pointer-cast -Wno-conversion-null -Wno-sign-compare -Wno-maybe-uninitialized") +set(DEFAULT_DEBUG_WARNING_FLAGS "-Wall -Wno-int-to-pointer-cast -Wno-conversion-null -Wno-sign-compare") diff --git a/res/TemplateBatchFiles/MSS-joint-fitter.bf b/res/TemplateBatchFiles/MSS-joint-fitter.bf new file mode 100644 index 000000000..2c627a644 --- /dev/null +++ b/res/TemplateBatchFiles/MSS-joint-fitter.bf @@ -0,0 +1,332 @@ + +/* 1. SETUP +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +/* 1a. Initial Setup +------------------------------------------------------------------------------*/ +RequireVersion ("2.5.48"); + +LoadFunctionLibrary ("libv3/all-terms.bf"); +LoadFunctionLibrary ("libv3/convenience/regexp.bf"); +LoadFunctionLibrary ("libv3/convenience/math.bf"); +LoadFunctionLibrary ("libv3/IOFunctions.bf"); +LoadFunctionLibrary ("libv3/UtilityFunctions.bf"); +LoadFunctionLibrary ("libv3/tasks/trees.bf"); +LoadFunctionLibrary ("libv3/tasks/alignments.bf"); +LoadFunctionLibrary ("libv3/tasks/estimators.bf"); +LoadFunctionLibrary ("SelectionAnalyses/modules/io_functions.ibf"); +LoadFunctionLibrary ("libv3/tasks/mpi.bf"); +LoadFunctionLibrary ("libv3/convenience/random.bf"); +LoadFunctionLibrary("libv3/models/codon/MSS.bf"); + + + +mss_selector.analysisDescription = {terms.io.info : "mss_joint_fitter : Performs a joint MSS model fit to several genes jointly.", + terms.io.version : "0.0.1", + terms.io.reference : "TBD", + terms.io.authors : "Sergei L Kosakovsky Pond", + terms.io.contact : "spond@temple.edu", + terms.io.requirements : "A collection of coding alignments with trees in the corresponding alignment files " + }; + + +mss.json = { terms.json.analysis: mss_selector.analysisDescription, + terms.json.input: {}, + }; + + +utility.SetEnvVariable ("OPTIMIZE_SUMMATION_ORDER_PARTITION", 100); +// don't spend too much time optimizing column ordering. + +/* 1b. User Input and data load +------------------------------------------------------------------------------*/ +io.DisplayAnalysisBanner (mss_selector.analysisDescription); + +KeywordArgument ("filelist","List of files to include in this analysis"); +mss_selector.file_list = io.get_a_list_of_files(io.PromptUserForFilePathRead ("List of files to include in this analysis")); + +mss_selector.file_count = utility.Array1D (mss_selector.file_list); +io.CheckAssertion("mss_selector.file_count >= 1", "A non-empty file list is required"); + +io.ReportProgressMessageMD("mss", "data" , "* Loaded a list with **" + mss_selector.file_count + "** files"); + +KeywordArgument ("code", "Which genetic code should be used", "Universal"); +mss.genetic_code = alignments.LoadGeneticCode (None); + + +mss.file_records = {}; +mss.file_info = {}; +mss.tree_info = {}; +mss.file_prefix = {}; +mss.fits = {}; +mss.filters = {}; +mss.parameters = 0; +mss.baselineLL = 0; +mss.is_bic = TRUE; + +KeywordArgument ("output", "Write the resulting JSON to this file",None); +mss.json = io.ReadFromOrCreate ("Save the resulting JSON file to", mss.json); + +mss.json - terms.data.value; + +mss_selector.file_list = utility.DictToArray (mss_selector.file_list); + +mss_selector.queue = mpi.CreateQueue ( + { + "Headers" : {{"libv3/UtilityFunctions.bf","libv3/tasks/alignments.bf", + "libv3/tasks/trees.bf","SelectionAnalyses/modules/io_functions.ibf", + "libv3/tasks/estimators.bf","libv3/models/codon/MSS.bf"}}, + "Variables" : {{}} + } + ); + +function mss.handle_initial_fit (filepath, namespace, genetic_code, run_fit) { + + utility.ToggleEnvVariable ("GLOBAL_FPRINTF_REDIRECT", "/dev/null"); + ExecuteCommands (' + `namespace`.json = {}; + namespace `namespace` { + scaler_prefix = "MSS.scaler"; + KeywordArgument ("code", "Which genetic code should be used", "`genetic_code`"); + KeywordArgument ("alignment", "Load this alignment", "`filepath`"); + utility.ToggleEnvVariable ("ALWAYS_RELOAD_FUNCTION_LIBRARIES", TRUE); + LoadFunctionLibrary ("SelectionAnalyses/modules/shared-load-file.bf"); + utility.ToggleEnvVariable ("ALWAYS_RELOAD_FUNCTION_LIBRARIES", None); + + load_file ({utility.getGlobalValue("terms.prefix"): "`namespace`", + utility.getGlobalValue("terms.settings") : {utility.getGlobalValue("terms.settings.branch_selector") : "selection.io.SelectAllBranches"}}); + + if (^"`&run_fit`") { + doGTR ("`namespace`"); + doPartitionedMG ("`namespace`", FALSE); + } + + }; + '); + utility.ToggleEnvVariable ("GLOBAL_FPRINTF_REDIRECT", None); + return { + "path" : filepath, + "pt" : Eval (namespace + ".trees"), + "init" : Eval (namespace + ".partitioned_mg_results") + }; +} + +lfunction mss.store_initial_fit (node, result, arguments) { + + mss_selector.print_row = { + 5, + 1 + }; + + (^"mss.fits") [result["path"]] = result["init"]; + (^"mss.tree_info") [result["path"]] = result ["pt"]; + + ^"mss.parameters" += (result["init"])[^"terms.parameters"]; + ^"mss.baselineLL" += (result["init"])[^"terms.fit.log_likelihood"]; + //^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"]; + mss_selector.print_row [0] = result["path"]; + info = (^"mss.file_records")[result["path"]]; + mss_selector.print_row [1] = info [^"terms.data.sequences"]; + mss_selector.print_row [2] = info [^"terms.data.sites"]; + + mss.T = 0; + for (mss.b,mss.bi; in; ((result["init"])[^"terms.branch_length"])["0"]) { + mss.T += mss.bi [^"terms.fit.MLE"]; + } + mss_selector.print_row [3] = Format (mss.T, 8, 3); + mss_selector.print_row [4] = Format ((result["init"])[^"terms.fit.log_likelihood"], 12, 4); + fprintf(stdout, io.FormatTableRow(mss_selector.print_row, ^"mss_selector.table_output_options")); + return TRUE; +} + + +mss_selector.table_output_options = { + utility.getGlobalValue("terms.table_options.header"): 1, + utility.getGlobalValue("terms.table_options.minimum_column_width"): 16, + utility.getGlobalValue("terms.table_options.align"): "left", + utility.getGlobalValue("terms.table_options.column_widths"): { + "0": 120, + "1" :10, + "2" :10, + "3" :10, + "4" :15 + } + }; + +mss_selector.header = { + {"Filepath"} + {"Sequences"} + {"Sites"} + {"TreeLength"}, + {"Log(L)"} + }; + + +ExecuteCommands ( "mss.codon_classes = model.codon.MSS.prompt_and_define (terms.global, mss.genetic_code[terms.code])", + {"--mss-type" : "SynREV"} + ); + +io.ReportProgressMessageMD("mss", "fit0" , "Individual file statistics and simple model fits\n"); + + +fprintf(stdout, io.FormatTableRow(mss_selector.header, mss_selector.table_output_options)); +mss_selector.table_output_options[utility.getGlobalValue("terms.table_options.header")] = FALSE; + +mss.path_ordering = {}; +mss.filter_names = {}; +mss.trees = {}; +mss.order2path = {}; + + + +for (mss.counter, mss_selector.path; in; mss_selector.file_list) { + //io.ReportProgressBar("", "Loading datafile " + (+mss.counter+1) + "/" + mss_selector.file_count); + + mss.namespace = "mss_" + mss.counter; + mss.handle_initial_fit (mss_selector.path, mss.namespace, mss.genetic_code[terms.id], FALSE); + mss.file_records [mss_selector.path] = ^"`mss.namespace`.codon_data_info"; + mss.sample_size += (^"`mss.namespace`.codon_data_info")[terms.data.sample_size]; + + + mpi.QueueJob (mss_selector.queue, "mss.handle_initial_fit", {"0" : mss_selector.path, + "1" : mss.namespace, + "2" : mss.genetic_code[terms.id], + "3" : TRUE}, + "mss.store_initial_fit"); + + + + mss.file_info [mss_selector.path] = ^"`mss.namespace`.json"; + mss.filters[mss_selector.path] = ^"`mss.namespace`.filter_names"; + mss.file_prefix [mss_selector.path] = mss.namespace; + mss.order2path [Abs (mss.path_ordering)] = mss_selector.path; + mss.path_ordering [mss_selector.path] = Abs (mss.path_ordering); + mss.filter_names + (^"`mss.namespace`.filter_names")[0]; +} + + + +//selection.io.getIC(fit[terms.fit.log_likelihood], fit[terms.parameters] + xp, ss) + +//io.ClearProgressBar(); +mpi.QueueComplete (mss_selector.queue); + +mss.baseline_AIC = mss.getIC (mss.baselineLL, mss.parameters, mss.sample_size, mss.is_bic); + +io.ReportProgressMessageMD("mss", "fit0done" , "Baseline model fit"); +io.ReportProgressMessageMD("mss", "fit0done", "**LogL = " + mss.baselineLL + "**" + ", **AIC-c = " + mss.baseline_AIC + "** \n"); + +mss.json ["baseline"] = {terms.json.log_likelihood : mss.baselineLL, terms.json.AICc : mss.baseline_AIC}; + +function mss.MSS_generator (type) { + model = Call ("models.codon.MSS.ModelDescription",type, mss.genetic_code[terms.code], mss.codon_classes); + return model; +} + +mss.tree_objects = {}; +mss.fit_objects = {}; +mss.lf_components = { + 2*mss_selector.file_count, + 1 + }; + +mss.lf_id = "MSS.COMPOSITE.LF"; +mss.model_map_overall = {}; + + + +for (mss.counter, mss_selector.path; in; mss_selector.file_list) { + mss.prefix = mss.file_prefix [mss_selector.path]; + + + mss.model_name = "`mss.prefix`.model"; + mss.tree_name = "`mss.prefix`.tree"; + mss.lf_components[2*mss.counter] = mss.filter_names[mss.counter]; + mss.lf_components[2*mss.counter+1] = mss.tree_name; + utility.ExecuteInGlobalNamespace ("`mss.model_name ` = 0"); + ^(mss.model_name) = model.generic.DefineModel("mss.MSS_generator", mss.prefix + ".model_object", { + "0": "terms.global" + }, mss.filter_names[mss.counter], None); + + mss.model_map_overall [mss.prefix + ".model_object"] = ^(mss.model_name); + + mss.model_map = { mss.prefix + ".model_object" : ^(mss.model_name)}; + + model.ApplyModelToTree(mss.lf_components[2 * mss.counter + 1], (mss.tree_info[mss_selector.path])[0], { + "default": ^(mss.model_name) + }, None); + + + initial_values = mss.fits[mss_selector.path]; + + for (mss.p; in; estimators.GetGlobalMLE_RegExp( initial_values, terms.parameters.omega_ratio)) { + (initial_values[terms.global])[terms.parameters.omega_ratio] = mss.p; + } + + mss.model_map["estimators.SetCategory"][""]; + estimators.ApplyExistingEstimates.set_globals = {}; + mss.model_map["estimators.SetGlobals"][""]; + mss.constraint_count = estimators.ApplyExistingEstimatesToTree (mss.lf_components[2 * mss.counter + 1], + mss.model_map, + (initial_values [terms.branch_length])[0], + terms.model.branch_length_constrain, + TRUE); + + models.FixParameterSetRegExp (terms.parameters.omega_ratio, ^(mss.model_name)); + models.FixParameterSetRegExp (terms.nucleotideRatePrefix, ^(mss.model_name)); + + + if (mss.counter == 0) { + mss.reference_set = ^(mss.model_name); + mss.mss_rate_list = model.GetParameters_RegExp( ^(mss.model_name),"^" + terms.parameters.synonymous_rate + ""); + mss.model_dimension = utility.Array1D (mss.mss_rate_list); + mss.scaler_prefix = 'mss.scaler_parameter_'; + mss.scaler_mapping = {}; + mss.scaler_index = { mss.model_dimension , 1}; + for (mss.i, mss.v; in; mss.mss_rate_list) { + mss.scaler_mapping [mss.i] = Abs (mss.scaler_mapping); + mss.scaler_index [ mss.scaler_mapping [mss.i]] = mss.v; + } + for (mss.i2 = 0; mss.i2 < mss.model_dimension; mss.i2 += 1) { + parameters.DeclareGlobal (mss.scaler_prefix + mss.i2, None); + } + + mss.json ["mapping"] = mss.scaler_index; + } else { + //utility.getGlobalValue ("terms.parameters.synonymous_rate"); + models.BindGlobalParameters ({"0" : mss.reference_set , "1" : ^(mss.model_name)}, terms.parameters.synonymous_rate + terms.model.MSS.syn_rate_between); + models.BindGlobalParameters ({"0" : mss.reference_set , "1" : ^(mss.model_name)}, terms.parameters.synonymous_rate + terms.model.MSS.syn_rate_within); + } +} + + +utility.ExecuteInGlobalNamespace ("LikelihoodFunction `mss.lf_id` = (`&mss.lf_components`)"); + +VERBOSITY_LEVEL = 1; +USE_LAST_RESULTS = 1; +Optimize (res, ^mss.lf_id); + +mss.joint_AIC = mss.getIC (res[1][0], mss.parameters + res[1][1], mss.sample_size, mss.is_bic); + +mss.json ["joint"] = {terms.json.log_likelihood : res[1][0], terms.json.AICc : mss.joint_AIC}; + + +io.ReportProgressMessageMD("mss", "fitAlldone" , "Joint model fit"); +io.ReportProgressMessageMD("mss", "fitAlldone", "**LogL = " + res[1][0] + "**" + ", **AIC-c = " + mss.joint_AIC + "** \n"); + +mss.json["joint-model"] = estimators.ExtractMLEsOptions (mss.lf_id, mss.model_map_overall, {terms.globals_only : TRUE}); +//estimators.RemoveBranchLengthConstraints (mss.json["joint-model"]); + +io.SpoolJSON (mss.json, mss.json[terms.data.file]); + +KeywordArgument ("save-fit", "Write the resulting model fit file to this (large!) file", "/dev/null"); +io.SpoolLFToPath(mss.lf_id, io.PromptUserForFilePath ("Save model fit to this file ['/dev/null' to skip]")); + +//------------------------------------------------------------------------------------------------------------------------ + +lfunction mss.getIC(logl, params, samples, is_bic) { + if (is_bic) { + return -2 * logl + Log (samples) * params; + } + return -2 * logl + 2 * samples / (samples - params - 1) * params; +} diff --git a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf index aaa06918b..8b571766c 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf @@ -473,7 +473,7 @@ busted.initial_grid = {}; busted.initial_grid_presets = {"0" : 0.1}; busted.initial_ranges = {}; -busted.init_grid_setup (busted.distribution); +busted.init_grid_setup (busted.distribution, busted.error_sink); /** setup parameter optimization groups */ @@ -487,7 +487,7 @@ if (busted.has_background) { busted.model_object_map = { "busted.background" : busted.background.bsrel_model, "busted.test" : busted.test.bsrel_model }; busted.background_distribution = models.codon.BS_REL.ExtractMixtureDistribution(busted.background.bsrel_model); - busted.init_grid_setup (busted.background_distribution); + busted.init_grid_setup (busted.background_distribution, busted.error_sink); //PARAMETER_GROUPING + busted.background_distribution["rates"]; //PARAMETER_GROUPING + busted.background_distribution["weights"]; @@ -529,7 +529,7 @@ if (busted.do_srv) { //PARAMETER_GROUPING + busted.srv_distribution["weights"]; PARAMETER_GROUPING + utility.Concat (busted.srv_distribution["rates"],busted.srv_distribution["weights"]); - busted.init_grid_setup (busted.srv_distribution); + busted.init_grid_setup (busted.srv_distribution, FALSE); } @@ -559,17 +559,16 @@ for (busted.partition_index = 0; busted.partition_index < busted.partition_count busted.initial.test_mean = ((selection.io.extract_global_MLE_re (busted.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+test.+"))["0"])[terms.fit.MLE]; busted.initial_grid = estimators.LHC (busted.initial_ranges,busted.initial_grid.N); - //estimators.CreateInitialGrid (busted.initial_grid, busted.initial_grid.N, busted.initial_grid_presets); busted.initial_grid = utility.Map (busted.initial_grid, "_v_", - 'busted._renormalize (_v_, "busted.distribution", busted.initial.test_mean)' + 'busted._renormalize (_v_, "busted.distribution", busted.initial.test_mean, busted.error_sink)' ); if (busted.has_background) { //GDD rate category busted.initial.background_mean = ((selection.io.extract_global_MLE_re (busted.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+background.+"))["0"])[terms.fit.MLE]; busted.initial_grid = utility.Map (busted.initial_grid, "_v_", - 'busted._renormalize (_v_, "busted.background_distribution", busted.initial.background_mean)' + 'busted._renormalize (_v_, "busted.background_distribution", busted.initial.background_mean, busted.error_sink)' ); } @@ -606,6 +605,7 @@ if (Type (debug.checkpoint) != "String") { // constrain nucleotide rate parameters busted.tmp_fixed = models.FixParameterSetRegExp (terms.nucleotideRatePrefix,busted.test.bsrel_model); + busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, { "retain-lf-object": TRUE, @@ -1227,12 +1227,18 @@ lfunction busted.DistributionGuess (mean) { // renormalize the grid to have the same mean as the initial omega value -lfunction busted._renormalize (v, distro, mean) { +lfunction busted._renormalize (v, distro, mean, skip_first) { parameters.SetValues (v); m = parameters.GetStickBreakingDistribution (^distro); d = Rows (m); - m = +(m[-1][0] $ m[-1][1]); // current mean - for (i = 0; i < d; i+=1) { + if (skip_first) { + m0 = m[0][0]*m[0][1]; + } else { + m0 = 0; + } + m = +(m[-1][0] $ m[-1][1]) -m0; // current mean + + for (i = (skip_first != 0); i < d; i+=1) { (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] = (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] / m * mean; } return v; @@ -1292,65 +1298,68 @@ lfunction busted.mixture_site_BF (ll, p) { //------------------------------------------------------------------------------ -function busted.init_grid_setup (omega_distro) { - utility.ForEachPair (omega_distro[terms.parameters.rates], "_index_", "_name_", - ' - if (_index_[0] < busted.rate_classes - 1) { // not the last rate - busted.initial_grid [_name_] = { - { - 0.01, 0.1, 0.25, 0.75 - } - }["_MATRIX_ELEMENT_VALUE_^(busted.rate_classes-_index_[0]-1)"]; - busted.initial_grid_presets [_name_] = 0; - - if (busted.error_sink && _index_[0] == 0) { - busted.initial_ranges [_name_] = { - terms.lower_bound : 100, - terms.upper_bound : 10000 - }; - } else { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 1 - }; +function busted.init_grid_setup (omega_distro, error_sink) { + for (_index_,_name_; in; omega_distro[terms.parameters.rates]) { + if (_index_ < busted.rate_classes - 1) { // not the last rate + busted.initial_grid [_name_] = { + { + 0.01, 0.1, 0.25, 0.75 } - } else { - busted.initial_grid [_name_] = { - { - 1, 1.5, 2, 4, 10 - } + }["_MATRIX_ELEMENT_VALUE_^(busted.rate_classes-_index_-1)"]; + + busted.initial_grid_presets [_name_] = 0; + + if (error_sink && _index_ == 0) { + busted.initial_grid [_name_] = {{100,500,1000,5000}}; + busted.initial_ranges [_name_] = { + terms.lower_bound : 100, + terms.upper_bound : 10000 }; + } else { busted.initial_ranges [_name_] = { - terms.lower_bound : 1, - terms.upper_bound : 10 + terms.lower_bound : 0, + terms.upper_bound : 1 }; - busted.initial_grid_presets [_name_] = 2; } - ' - ); - - - utility.ForEachPair (omega_distro[terms.parameters.weights], "_index_", "_name_", - ' + } else { busted.initial_grid [_name_] = { { - 0.2, 0.5, 0.7, 0.8, 0.9 + 1, 1.5, 2, 4, 10 } }; - busted.initial_grid_presets [_name_] = 3; - if (busted.error_sink && _index_[0] == 0) { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 0.01, - }; - } else { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 1 - }; + busted.initial_ranges [_name_] = { + terms.lower_bound : 1, + terms.upper_bound : 10 + }; + busted.initial_grid_presets [_name_] = 2; + } + } + + + for (_index_, _name_; in; omega_distro[terms.parameters.weights]) { + busted.initial_grid [_name_] = { + { + 0.2, 0.5, 0.7, 0.8, 0.9 } - ' - ); + }; + busted.initial_grid_presets [_name_] = 3; + if (error_sink && _index_ == 0) { + busted.initial_grid [_name_] = { + { + 0, 0.001, 0.005, 0.1 + } + }; + busted.initial_ranges [_name_] = { + terms.lower_bound : 0, + terms.upper_bound : 0.01, + }; + } else { + busted.initial_ranges [_name_] = { + terms.lower_bound : 0, + terms.upper_bound : 1 + }; + } + } } //------------------------------------------------------------------------------ diff --git a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf index 3a68b4a5d..fcb572573 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf @@ -43,19 +43,21 @@ relax.analysis_description = { Version 3.1 adds LHC + Nedler-Mead initial fit phase and keyword support. Version 3.1.1 adds some bug fixes for better convergence. Version 4.0 adds support for synonymous rate variation. - Version 4.1 adds further support for multiple hit models", + Version 4.1 adds further support for multiple hit models. + Version 4.1.1 adds reporting for convergence diagnostics", terms.io.version : "3.1.1", terms.io.reference : "RELAX: Detecting Relaxed Selection in a Phylogenetic Framework (2015). Mol Biol Evol 32 (3): 820-832", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution g", terms.io.contact : "spond@temple.edu", - terms.io.requirements : "in-frame codon alignment and a phylogenetic tree, with at least two groups of branches defined using the {} notation (one group can be defined as all unlabeled branches)" + terms.io.requirements : "in-frame codon alignment and a phylogenetic tree, with at least two groups of branches defined using the {} notation (one group can be defined as all unlabeled branches)", + terms.settings: {} }; relax.json = { terms.json.analysis: relax.analysis_description, terms.json.input: {}, terms.json.fits : {}, terms.json.timers : {}, - terms.json.test_results : {} + terms.json.test_results : {}, }; relax.relaxation_parameter = "relax.K"; @@ -1001,6 +1003,9 @@ function relax.FitMainTestPair (prompt) { if (Abs (relax.best_samples[1] - relax.best_samples[0]) < 5.) { // could be diagnostic of convergence problems + selection.io.json_store_setting (relax.json, "convergence-flat-surface", relax.best_samples[1] - relax.best_samples[0]); + + io.ReportProgressMessageMD("RELAX", "alt-2", "* Potential convergence issues due to flat likelihood surfaces; checking to see whether K > 1 or K < 1 is robustly inferred"); if (relax.fitted.K > 1) { parameters.SetRange (model.generic.GetGlobalParameter (relax.model_object_map ["relax.test"] , terms.relax.k), terms.range01); @@ -1022,6 +1027,8 @@ function relax.FitMainTestPair (prompt) { if (relax.alternative_model.fit.take2 [terms.fit.log_likelihood] > relax.alternative_model.fit[terms.fit.log_likelihood]) { + relax.stash_fitted_K = relax.fitted.K; + io.ReportProgressMessageMD("RELAX", "alt-2", "\n### Potential for highly unreliable K inference due to multiple local maxima in the likelihood function, treat results with caution "); io.ReportProgressMessageMD("RELAX", "alt-2", "> Relaxation parameter reset to opposite mode of evolution from that obtained in the initial optimization."); @@ -1032,6 +1039,11 @@ function relax.FitMainTestPair (prompt) { relax.inferred_distribution = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.test"])) % 0; selection.io.report_dnds (relax.inferred_distribution); + selection.io.json_store_setting (relax.json, "convergence-unstable", { + {relax.fitted.K, relax.alternative_model.fit.take2 [terms.fit.log_likelihood]} + {relax.stash_fitted_K, relax.alternative_model.fit [terms.fit.log_likelihood]} + + }); io.ReportProgressMessageMD("RELAX", "alt-2", "* The following rate distribution was inferred for **reference** branches"); relax.inferred_distribution_ref = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.reference"])) % 0; @@ -1182,6 +1194,8 @@ do { if (relax.LRT [terms.LRT] < 0) { io.ReportProgressMessageMD("RELAX", "refit", "* Detected convergence issues (negative LRT). Refitting the alterative/null model pair from a new starting point"); + selection.io.json_store_setting (relax.json, "convergence-negative-lrt", relax.loop_passes); + relax.general_descriptive.fit = relax.null_model.fit; // reset initial conditions for (relax.k = 1; relax.k < relax.numbers_of_tested_groups; relax.k += 1) { // remove constraints on K relax.model_nmsp = relax.model_namespaces[relax.k ]; diff --git a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf index 2678cb5b2..12f6beef9 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf @@ -1,4 +1,4 @@ -RequireVersion ("2.5.8"); +RequireVersion ("2.5.57"); LoadFunctionLibrary("libv3/all-terms.bf"); // must be loaded before CF3x4 @@ -10,6 +10,7 @@ LoadFunctionLibrary("libv3/tasks/estimators.bf"); LoadFunctionLibrary("libv3/tasks/alignments.bf"); LoadFunctionLibrary("libv3/tasks/mpi.bf"); LoadFunctionLibrary("libv3/tasks/trees.bf"); +LoadFunctionLibrary("libv3/tasks/ancestral.bf"); LoadFunctionLibrary("libv3/models/rate_variation.bf"); LoadFunctionLibrary("modules/io_functions.ibf"); @@ -68,12 +69,16 @@ absrel.display_orders = {terms.original_name: -1, absrel.analysis_description = {terms.io.info : "aBSREL (Adaptive branch-site random effects likelihood) uses an adaptive random effects branch-site model framework to test whether each branch has evolved under positive selection, - using a procedure which infers an optimal number of rate categories per branch.", - terms.io.version : "2.3", - terms.io.reference : "Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). Mol Biol Evol 32 (5): 1342-1353. v2.2 adds support for multiple-hit models. v2.3 adds support for SRV", + using a procedure which infers an optimal number of rate categories per branch. v2.2 adds support for multiple-hit models. v2.3 adds support for SRV. v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. ", + terms.io.version : "2.5", + terms.io.reference : " + Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). + Mol Biol Evol 32 (5): 1342-1353. + ", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution group", terms.io.contact : "spond@temple.edu", - terms.io.requirements : "in-frame codon alignment and a phylogenetic tree" + terms.io.requirements : "in-frame codon alignment and a phylogenetic tree", + terms.settings : {} }; @@ -119,6 +124,8 @@ absrel.multi_hit = io.SelectAnOption ({ }, "Include support for multiple nucleotide substitutions"); +selection.io.json_store_setting (absrel.json, "multiple-hit", absrel.multi_hit); + KeywordArgument ("srv", "Include synonymous rate variation", "No"); absrel.srv = io.SelectAnOption ( @@ -140,6 +147,7 @@ if (absrel.do_srv) { absrel.synonymous_rate_classes = io.PromptUser ("The number omega rate classes to include in the model", absrel.synonymous_rate_classes, 1, 10, TRUE); } +selection.io.json_store_setting (absrel.json, "srv", absrel.do_srv); KeywordArgument ("output", "Write the resulting JSON to this file (default is to save to the same path as the alignment file + 'ABSREL.json')", absrel.codon_data_info [terms.json.json]); @@ -554,6 +562,19 @@ io.ReportProgressMessageMD("absrel", "Full adaptive model", "* " + selection.io. selection.io.stopTimer (absrel.json [terms.json.timers], "Full adaptive model fitting"); + +absrel.json [terms.substitutions] = {}; + +absrel.ancestral_info = ancestral.build (absrel.likelihood_function_id,0,FALSE); +(absrel.json [terms.substitutions] )[0] = ancestral.ComputeCompressedSubstitutions (absrel.ancestral_info); +DeleteObject (absrel.ancestral_info); + +absrel.json [terms.json.site_log_likelihood] = { + terms.json.unconstrained : absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id), + terms.json.tested : {} +}; + +absrel.distribution_for_json = {}; absrel._report_srv (absrel.full_model.fit, "Full adaptive model", absrel.likelihood_function_id); @@ -594,7 +615,7 @@ selection.io.json_store_lf (absrel.json, absrel.full_model.fit[terms.fit.log_likelihood], absrel.full_model.fit[terms.parameters] + 9 , absrel.codon_data_info[terms.data.sample_size], - {}, + absrel.distribution_for_json, absrel.display_orders[absrel.full_adaptive_model]); KeywordArgument ("save-fit", "Save full adaptive aBSREL model fit to this file (default is not to save)", "/dev/null"); @@ -612,38 +633,41 @@ if (absrel.multi_hit == "Double") { "0": 35, "1": 10, "2": 20, - "3": 12, - "4": 20, + "3": 18, + "4": 12, "5": 20, + "6": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "2x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Sites @ EBF>=100", "2x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } else { if (absrel.multi_hit == "Double+Triple") { absrel.testing_table.settings = {terms.table_options.header : TRUE, terms.table_options.column_widths: { "0": 35, "1": 10, "2": 20, - "3": 12, + "3" :18, "4": 12, - "5": 20, + "5": 12, "6": 20, + "7": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "2x rate","3x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Sites @ EBF>=100", "2x rate","3x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } else { absrel.testing_table.settings = {terms.table_options.header : TRUE, terms.table_options.column_widths: { "0": 35, "1": 10, "2": 20, - "3": 20, + "3": 18, "4": 20, + "5": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Sites @ EBF>=100", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } } @@ -656,6 +680,8 @@ absrel.branch.lrt = {}; absrel.branch.delta = {}; absrel.branch.psi = {}; +absrel.branch_site_level_ER = {}; + for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch_id += 1) { absrel.current_branch = absrel.names_sorted_by_length[absrel.branch_id]; @@ -670,47 +696,102 @@ for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch } else { absrel.report.row [2] = ">1000 (" + Format (absrel.dn_ds.distro[absrel.branch.complexity[absrel.current_branch]-1][1]*100,5,2) + "%)"; } + absrel.report.row [3] = "N/A"; absrel.offset = 0; absrel.final_estimates = absrel.GetBranchEstimates(absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); if (absrel.multi_hit == "Double" || absrel.multi_hit == "Double+Triple") { - absrel.report.row [3] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.multiple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; - absrel.branch.delta [absrel.current_branch] = absrel.report.row [3]; + absrel.report.row [4] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.multiple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; + absrel.branch.delta [absrel.current_branch] = absrel.report.row [4]; absrel.offset = 1; } if (absrel.multi_hit == "Double+Triple") { - absrel.report.row [4] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.triple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; - absrel.branch.psi [absrel.current_branch] = absrel.report.row [4]; + absrel.report.row [5] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.triple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; + absrel.branch.psi [absrel.current_branch] = absrel.report.row [5]; absrel.offset += 1; } if ((absrel.selected_branches[0])[absrel.current_branch] == terms.tree_attributes.test) { if (absrel.dn_ds.distro [absrel.branch.complexity[absrel.current_branch]-1][0] > 1) { + + absrel.branch_mixtures = absrel.branch.GetMixtureParameters (absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); + + // stash the weights + if (None != absrel.branch_mixtures) { + absrel.mixture_stash = {}; + + LFCompute (^absrel.likelihood_function_id,LF_START_COMPUTE); + + absrel.cat_count = utility.Array1D (absrel.branch_mixtures) + 1; + absrel.weights = {1, absrel.cat_count - 1 }; + for (absrel.i, v; in; absrel.branch_mixtures) { + absrel.mixture_stash [v] = ^v; + absrel.weights[absrel.i] = ^v; + ^v = 0; + } + + absrel.weights = parameters.GetStickBreakingDistributionWeigths (absrel.weights); + + absrel.siteLL = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); + absrel.all_siteLL = {absrel.cat_count, Columns (absrel.siteLL)}; + absrel.all_siteLL [absrel.cat_count-1][0] = absrel.siteLL; + + for (absrel.i = 1; absrel.i < absrel.cat_count; absrel.i += 1) { + if (absrel.i > 1) { + ^( absrel.branch_mixtures[absrel.i-1]) = 0; + } + ^( absrel.branch_mixtures[absrel.i-1]) = 1; + absrel.siteLL = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); + absrel.all_siteLL [absrel.i-1][0] = absrel.siteLL; + } + for (p, v; in; absrel.mixture_stash) { + ^p = v; + } + + LFCompute (^absrel.likelihood_function_id,LF_DONE_COMPUTE); + + absrel.branch_site_level_ER [absrel.current_branch] = absrel.mixture_site_logl (absrel.all_siteLL,absrel.weights ); + + absrel.branch_site_level_BF = + ((absrel.compute_mixture_site_BF (absrel.branch_site_level_ER [absrel.current_branch], absrel.weights ))[absrel.cat_count-1][-1])["_MATRIX_ELEMENT_VALUE_>=100"]; + + absrel.report.row [3] = Format (absrel.branch_site_level_BF, 5, 0); + + } + absrel.branch.ConstrainForTesting (absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); Optimize (absrel.null.mles, ^absrel.likelihood_function_id , {"OPTIMIZATION_METHOD" : "coordinate-wise"} ); absrel.branch.test = absrel.ComputeLRT ( absrel.full_model.fit[terms.fit.log_likelihood], absrel.null.mles[1][0]); + ((absrel.json [terms.json.site_log_likelihood])[terms.json.tested])[ absrel.current_branch ] = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); estimators.RestoreLFStateFromSnapshot (absrel.likelihood_function_id, absrel.full_model.mle_set); } else { absrel.branch.test = {terms.LRT : 0, terms.p_value : 1}; } absrel.branch.p_values [ absrel.current_branch ] = absrel.branch.test [terms.p_value]; absrel.branch.lrt [absrel.current_branch] = absrel.branch.test [terms.LRT]; - absrel.report.row [3+absrel.offset] = absrel.branch.test [terms.LRT]; - absrel.report.row [4+absrel.offset] = Format (absrel.branch.test [terms.p_value], 8, 5); + absrel.report.row [4+absrel.offset] = absrel.branch.test [terms.LRT]; + absrel.report.row [5+absrel.offset] = Format (absrel.branch.test [terms.p_value], 8, 5); } else { absrel.branch.lrt [absrel.current_branch] = None; absrel.branch.p_values [absrel.current_branch] = None; - absrel.report.row [3+absrel.offset] = "Not selected"; - absrel.report.row [4+absrel.offset] = "for testing"; + absrel.report.row [4+absrel.offset] = "Not selected"; + absrel.report.row [5+absrel.offset] = "for testing"; } fprintf (stdout, io.FormatTableRow (absrel.report.row, absrel.testing_table.settings)); } +selection.io.json_store_branch_attribute(absrel.json, + terms.posterior, + terms.json.branch_annotations, + -1, + 0, + absrel.branch_site_level_ER); + + if (Abs (absrel.branch.delta)) { selection.io.json_store_branch_attribute(absrel.json, terms.parameters.multiple_hit_rate, terms.json.branch_label, absrel.display_orders[terms.parameters.multiple_hit_rate], 0, @@ -857,8 +938,25 @@ lfunction absrel.branch.ConstrainForTesting (model, tree_id, branch_id) { bv, ''); } +} + +//------------------------------------------------------------------------------------------------------------------------ + +lfunction absrel.branch.GetMixtureParameters (model, tree_id, branch_id) { + component_count = model[utility.getGlobalValue ("terms.model.components")]; + local_parameters = (model[utility.getGlobalValue ("terms.parameters")])[utility.getGlobalValue ("terms.local")]; + if (component_count > 1) { + mp = {component_count - 1, 1}; + for (i = 0; i < component_count-1; i += 1) { + mp[i] = "`tree_id`.`branch_id`.`local_parameters[terms.AddCategory (utility.getGlobalValue('terms.mixture.mixture_aux_weight'), i+1)]`"; + } + return mp; + } else { + return None; + } } + //------------------------------------------------------------------------------------------------------------------------ lfunction absrel.PopulateInitialGrid (model, tree_id, branch_id, current_estimates) { @@ -1222,3 +1320,30 @@ function absrel._report_srv (absrel_model_fit, tag, lf_id) { (absrel.json [absrel.json.srv_posteriors]) = absrel.cmx_weighted $ absrel.column_weights; } } + +//------------------------------------------------------------------------------ +lfunction absrel.ComputeSiteLikelihoods (id) { + ConstructCategoryMatrix (sl, ^id, SITE_LOG_LIKELIHOODS); + return sl; +} + +//------------------------------------------------------------------------------ + +lfunction absrel.mixture_site_logl (ll, p) { + res = ll["0"]; + S = Columns (ll); + norm = {1,S}["Max(ll[-1][_MATRIX_ELEMENT_COLUMN_],0)"]; + scaled_ll = ll["Exp(_MATRIX_ELEMENT_VALUE_-norm[_MATRIX_ELEMENT_COLUMN_])*p[_MATRIX_ELEMENT_ROW_]"]; + + norm = p["1"]*scaled_ll; + ll = scaled_ll["_MATRIX_ELEMENT_VALUE_/norm[_MATRIX_ELEMENT_COLUMN_]"]; + return ll; +} + +//------------------------------------------------------------------------------ + +lfunction absrel.compute_mixture_site_BF (ll, p) { + prior_odds = p["_MATRIX_ELEMENT_VALUE_/(1-_MATRIX_ELEMENT_VALUE_)"]; + ll = ll["(_MATRIX_ELEMENT_VALUE_/(1-_MATRIX_ELEMENT_VALUE_))/prior_odds[_MATRIX_ELEMENT_ROW_]"]; + return ll; +} \ No newline at end of file diff --git a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf new file mode 100644 index 000000000..b72e35469 --- /dev/null +++ b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf @@ -0,0 +1,276 @@ +RequireVersion("2.5.58"); + +/*------------------------------------------------------------------------------ + Load library files +*/ + +LoadFunctionLibrary("libv3/UtilityFunctions.bf"); +LoadFunctionLibrary("libv3/IOFunctions.bf"); +LoadFunctionLibrary("libv3/stats.bf"); +LoadFunctionLibrary("libv3/all-terms.bf"); + +LoadFunctionLibrary("libv3/tasks/ancestral.bf"); +LoadFunctionLibrary("libv3/tasks/alignments.bf"); +LoadFunctionLibrary("libv3/tasks/estimators.bf"); +LoadFunctionLibrary("libv3/tasks/trees.bf"); + +LoadFunctionLibrary("libv3/convenience/math.bf"); + + +/*------------------------------------------------------------------------------ + Display analysis information +*/ + +efilter.analysis_description = { + terms.io.info: " +The error filter analysis reads a BUSTED-E JSON result file, +identifies sites which may be due to alignment or other error, +and masks them according to a user-selected strategy, saving the resulting MSA to a new file. +A machine readable report on the errors filtered (if any) is also created. + ", + terms.io.version: "0.1", + terms.io.reference: "TBD", + terms.io.authors: "Sergei L Kosakovsky Pond", + terms.io.contact: "spond@temple.edu", + terms.io.requirements: "A BUSTED-E .json results file (possibly .gz compressed)", + terms.settings: {} +}; +io.DisplayAnalysisBanner(efilter.analysis_description); + +KeywordArgument ("json", "Load the BUSTED-E .json result file", None); +efilter.path = io.PromptUserForFilePathRead ("Load the BUSTED-E .json result file"); +io.ReportProgressMessageMD ("efilter","loading","Loading .JSON data"); + +KeywordArgument ("threshold", "Empirical Bayes Factor error threshold for masking sites", 100.); +efilter.threshold = io.PromptUser ("Empirical Bayes Factor threshold for masking sites", 100., 1, 1e100, FALSE); + +KeywordArgument ("ratio", "Empirical Bayes Factor for error vs selection ", 20.); +efilter.ratio_threshold = io.PromptUser ("Empirical Bayes Factor for error vs selection", 20., 1, 1e100, FALSE); + + +KeywordArgument ("site-threshold", "Mask the entire site if more than X% sequences are problematic", .4); +efilter.site_threshold = io.PromptUser ("Mask the entire site if more than X% sequences are problematic", 0.4, 0., 1., FALSE); + + +efilter.json_out = { + terms.json.analysis : efilter.analysis_description, + terms.settings : { + terms.empirical_bayes_factor : efilter.threshold, + "BF ratio" : efilter.ratio_threshold, + "site threshold" : efilter.site_threshold + } +}; + +efilter.json = io.ParseJSON(efilter.path); + +io.CheckAssertion ("utility.GetByKey (efilter.json, {{terms.json.analysis, 'settings', 'error-sink'}},'Number')", "There is no error sink data in the JSON file"); + + +efilter.error_sink_proportion = utility.GetByKey (efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test","0",terms.json.proportion}},'Number'); + +io.CheckAssertion ("None!=efilter.error_sink_proportion", "There is no estimate of the error-sink omega proportion"); + +efilter.rates = utility.GetByKey (efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test"}},'AssociativeList'); +efilter.rates_count = utility.Array1D (efilter.rates); + +efilter.fast_omega_proportion = utility.GetByKey (efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test","" + (efilter.rates_count-1),terms.json.proportion}},'Number'); + +efilter.input = utility.GetByKey (efilter.json, {{terms.json.input}}, "AssociativeList"); +io.CheckAssertion ("None!=efilter.input", "There is no information about the alignment"); + +io.ReportProgressMessageMD ("efilter","loading","- Original data file path: \`" + efilter.input[terms.json.file] + "\`"); +io.ReportProgressMessageMD ("efilter","loading","- Partitions: " + efilter.input[terms.json.partition_count]); +io.ReportProgressMessageMD ("efilter","loading","- Sequences: " + efilter.input[terms.json.sequences]); +io.ReportProgressMessageMD ("efilter","loading","- Sites: " + efilter.input[terms.json.sites]); + +efilter.json_out[terms.json.input] = efilter.input; + +io.ReportProgressMessageMD ("efilter","loading","- Estimated proportion of the MSA in the error-sink class = " + Format (efilter.error_sink_proportion *100, 8, 3) + "%"); + +if (efilter.error_sink_proportion == 0) { + efilter.error_sink_prior_odds = 1e100; +} else { + efilter.error_sink_prior_odds = efilter.error_sink_proportion / (1-efilter.error_sink_proportion); +} +efilter.error_sink_prior_odds_ratio = Min (1e25,efilter.error_sink_proportion / efilter.fast_omega_proportion); + +efilter.site_offset = 0; + +utility.ToggleEnvVariable ("LOOP_TREE_ITERATOR_PREORDER", TRUE); + +for (p = 0; p < efilter.input[terms.json.partition_count]; p+=1) { + efilter.sequences = {}; + efilter.masked_sites = {}; + efilter.branch_data = utility.GetByKey (efilter.json, {{terms.json.branch_attributes,p}}, "AssociativeList"); + efilter.tested_branches = utility.GetByKey (efilter.json, {{terms.json.tested,p}}, "AssociativeList"); + efilter.tree = utility.GetByKey (efilter.input, {{terms.json.trees,p}}, "String"); + efilter.subs = utility.GetByKey (efilter.json, {{terms.substitutions,p}}, "AssociativeList"); + Topology efilter.T = efilter.tree; + efilter.sites = utility.Array1D (efilter.subs ); + + if (p == 0) { + efilter.leaves = {}; + for (s;in;TipName (efilter.T,-1)) { + efilter.sequences[s] = ""; + efilter.sequences[s] * efilter.sites; + efilter.masked_sites[s] = {}; + efilter.leaves[s] = 1; + } + + } + efilter.descendants = {}; + efilter.leaf_descendants = {}; + + for (s;in;BranchName (efilter.T,-1)) { + if (efilter.sequences / s) { + continue; + } + efilter.descendants[s] = {}; + efilter.leaf_descendants[s] = {}; + for (s2, v; in; efilter.T [s]) { + if (v == 1) { + (efilter.leaf_descendants[s])[s2] = 1; + } + (efilter.descendants[s])[s2] = 1; + } + if (Abs (efilter.leaf_descendants[s]) * 2 > Abs (efilter.leaves)) { + t = efilter.leaf_descendants[s]; + efilter.leaf_descendants[s] = efilter.leaves; + efilter.leaf_descendants[s] - t; + } + } + + + for (site = 0; site < efilter.sites; site += 1) { + efilter.subs4site = efilter.subs [site]; + efilter.states = {}; + efilter.masked_already = {}; + efilter.write_out = {}; + for (parent, node; in; efilter.T) { + if (parent) { + if (efilter.subs4site/node) { + efilter.states [node] = efilter.subs4site[node]; + } else { + efilter.states [node] = efilter.states [parent] + } + efilter.write_state = efilter.states [node] ; + if (efilter.masked_already [node]) { + continue; + } + if (efilter.branch_data / node) { + efilter.site_prob = ((efilter.branch_data[node])["Posterior prob omega class by site"])[0][site]; + if (None == efilter.site_prob) { + continue; + } + efilter.site_BF2 = ((efilter.branch_data[node])["Posterior prob omega class by site"])[efilter.rates_count-1][site]; + if (efilter.site_prob < 1) { + efilter.site_BF = efilter.site_prob/(1-efilter.site_prob)/efilter.error_sink_prior_odds; + } else { + efilter.site_BF = 1e25; + } + + if (efilter.site_BF2 < 1) { + efilter.site_BF2 = efilter.site_prob / efilter.site_BF2 / efilter.error_sink_prior_odds_ratio; + } else { + efilter.site_BF2 = 1e25; + } + + if (efilter.site_BF >= efilter.threshold && efilter.site_BF2 >= efilter.ratio_threshold) { + if (efilter.masked_sites/node) { // terminal node + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [node] + (site + efilter.site_offset); + } + efilter.write_out[node] = "---"; + efilter.masked_already [node] = 1; + } else { + if (Abs (efilter.leaf_descendants[node]) / Abs (efilter.sequences) >= efilter.site_threshold) { + for (ntm, ignore; in; efilter.sequences) { + efilter.write_out[ntm] = "---"; + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [ntm] + (site + efilter.site_offset); + } + } + //console.log ("Masking everything " + site + " " + node + " " + Abs (efilter.leaf_descendants) / Abs (efilter.sequences)); + break; + } + for (ntm, ignore; in; efilter.leaf_descendants[node]) { + efilter.write_out[ntm] = "---"; + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [ntm] + (site + efilter.site_offset); + } + } + efilter.masked_already * efilter.leaf_descendants[node]; + } + } + } + } else { + efilter.states [node] = efilter.subs4site["root"]; + } + + + if (efilter.sequences / node && efilter.masked_already[node] == 0) { + efilter.write_out[node] = efilter.write_state; + } + } + + + for (s,st; in; efilter.write_out) { + efilter.sequences[s] * st; + } + + } + + efilter.site_offset += efilter.sites; +} + +utility.ToggleEnvVariable ("LOOP_TREE_ITERATOR_PREORDER", None); + +KeywordArgument ("output", "Write a filtered MSA to"); +efilter.path = io.PromptUserForFilePath ("Write a filtered MSA to"); + +io.ReportProgressMessageMD ("efilter","results","Filtering results"); + +efilter.settings = {"header" : TRUE, "column-widths": { + "0": 50, + "1": 20, + "2": 20 + }}; + +fprintf (stdout, "\n", io.FormatTableRow ({{"Sequence", "Filtered Sites", "Filtered Proportion, %"}}, efilter.settings)); +efilter.settings ["header"] = FALSE; +efilter.total_filtered = 0; + +for (s;in;TipName (efilter.T,-1)) { + efilter.row_matrix = {1,3}; + + efilter.row_matrix [0] = s; + efilter.row_matrix [1] = utility.Array1D (efilter.masked_sites[s]); + efilter.row_matrix [2] = Format (efilter.row_matrix [1] / efilter.site_offset*10, 10, 3); + + efilter.total_filtered += efilter.row_matrix [1]; + + fprintf (stdout, io.FormatTableRow (efilter.row_matrix, efilter.settings)); + efilter.sequences[s] * 0; + fprintf (efilter.path, ">", s, "\n", efilter.sequences[s], "\n"); +} + +if (efilter.input[terms.json.partition_count] == 1) { + fprintf (efilter.path, "\n", Format (efilter.T,1,0)); +} + +io.ReportProgressMessageMD ("efilter","results","\nMasked a total of **`efilter.total_filtered`** or `Format (efilter.total_filtered/efilter.input[terms.json.sequences]/efilter.input[terms.json.sites]*100,8,3)`% sites"); + + +fprintf (stdout, "\n"); + +KeywordArgument ("output-json", "Write the resulting JSON to this file"); +efilter.json_path = io.PromptUserForFilePath ("Save the resulting JSON file to"); + +efilter.json_out ["filter"] = efilter.masked_sites ; + +io.SpoolJSON (efilter.json_out,efilter.json_path); + + + + + diff --git a/res/TemplateBatchFiles/files.lst b/res/TemplateBatchFiles/files.lst index e71a7e28c..81a9c7b40 100644 --- a/res/TemplateBatchFiles/files.lst +++ b/res/TemplateBatchFiles/files.lst @@ -8,6 +8,7 @@ "RELAX","[RELAX] Test for relaxation of selection pressure along a specified set of test branches using RELAX (a random effects test of selection relaxation).","SelectionAnalyses/RELAX.bf"; "FADE","[FADE] Test a protein alignment for directional selection towards specific amino acids along a specified set of test branches using FADE (a FUBAR Approach to Directional Evolution).","SelectionAnalyses/FADE.bf"; "PRIME","[PRIME] ","SelectionAnalyses/PRIME.bf"; +"error-filter","[Error-Filter] Use a BUSTED-E model fit to clean-up a sequence alignment","SelectionAnalyses/error-filter.bf"; "","A collection of tools for evolutionary hypothesis testing coding-sequence data.","!Evolutionary Hypothesis Testing"; "FMM", "Fit a model that permits double (and triple) instantaneous nucleotide substitutions","SelectionAnalyses/FitMultiModel.bf"; diff --git a/res/TemplateBatchFiles/libv3/IOFunctions.bf b/res/TemplateBatchFiles/libv3/IOFunctions.bf index 4bed72707..7c1e95c00 100644 --- a/res/TemplateBatchFiles/libv3/IOFunctions.bf +++ b/res/TemplateBatchFiles/libv3/IOFunctions.bf @@ -136,6 +136,25 @@ lfunction io._reportMessageHelper(analysis, text) { } } +/** + * @name io.CompressedObjectSpool + * @param json + * @param file + */ + +lfunction io.CompressedObjectSpool (object, file) { + GetString (have_zip, ZLIB_ENABLED, 0); + if (have_zip && file != "/dev/null") { + if (utility.GetEnvVariable ("GZIP_OUTPUT")) { + utility.ToggleEnvVariable("GZIP_COMPRESSION_LEVEL", 5); + fprintf(file, CLEAR_FILE, object); + utility.ToggleEnvVariable("GZIP_COMPRESSION_LEVEL", None); + return; + } + } + fprintf(file, CLEAR_FILE, object); + } + /** * @name io.SpoolJSON * @param json @@ -144,7 +163,7 @@ lfunction io._reportMessageHelper(analysis, text) { lfunction io.SpoolJSON(json, file) { utility.ToggleEnvVariable("USE_JSON_FOR_MATRIX", 1); if (Type(file) == "String" && Abs (file) > 0) { - fprintf(file, CLEAR_FILE, json); + io.CompressedObjectSpool (json, file); } else { fprintf(stdout, "\n", json, "\n"); } @@ -160,6 +179,7 @@ lfunction io.SpoolJSON(json, file) { lfunction io.ParseJSON(file_path) { fscanf(file_path, REWIND, "Raw", test); parsed_test = Eval(test); + DeleteObject (test); return parsed_test; } @@ -550,7 +570,8 @@ lfunction io.SpoolLF(lf_id, trunk_path, tag) { lfunction io.SpoolLFToPath(lf_id, path) { if (path != "/dev/null") { Export(__lf_spool, ^ lf_id); - fprintf(path, CLEAR_FILE, __lf_spool); + io.CompressedObjectSpool (__lf_spool, path); + DeleteObject (__lf_spool); } } diff --git a/res/TemplateBatchFiles/libv3/UtilityFunctions.bf b/res/TemplateBatchFiles/libv3/UtilityFunctions.bf index 0697c47bc..09db689e2 100644 --- a/res/TemplateBatchFiles/libv3/UtilityFunctions.bf +++ b/res/TemplateBatchFiles/libv3/UtilityFunctions.bf @@ -793,15 +793,15 @@ lfunction utility.Has (d, key, type) { return FALSE; } - if (Type (key) == "Matrix") { - depth = utility.Array1D (key); - current_check = &d; + if (Type (key) == "Matrix") { + depth = utility.Array1D (key); + current_check = d; current_key = key[0]; for (i = 1; i < depth; i += 1) { - if (Eval ("`current_check`/'`current_key`'")) { - current_check = "(" + current_check + ")['`current_key`']"; - if (Eval ("Type(`current_check`)") != "AssociativeList") { + if (current_check / current_key) { + current_check = current_check[current_key]; + if (Type (current_check) != "AssociativeList") { return FALSE; } current_key = key[i]; @@ -809,13 +809,17 @@ lfunction utility.Has (d, key, type) { return FALSE; } } - - if ( Eval ("`current_check`/'`current_key`'") ) { + + + if ( current_check / current_key ) { + return_value = current_check[current_key]; if (type == None) { return TRUE; } - return Eval ("Type((`current_check`)['`current_key`'])") == type; - } + if (Type (return_value) == type) { + return TRUE; + } + } } } return FALSE; @@ -847,24 +851,25 @@ lfunction utility.GetByKey (d, key, type) { } if (Type (key) == "Matrix") { - depth = utility.Array1D (key); - current_check = &d; + depth = utility.Array1D (key); + current_check = d; current_key = key[0]; for (i = 1; i < depth; i += 1) { - if (Eval ("`current_check`/'`current_key`'")) { - current_check = "(" + current_check + ")['`current_key`']"; - if (Eval ("Type(`current_check`)") != "AssociativeList") { - return FALSE; + if (current_check / current_key) { + current_check = current_check[current_key]; + if (Type (current_check) != "AssociativeList") { + return None; } current_key = key[i]; } else { - return FALSE; + return None; } } - - if ( Eval ("`current_check`/'`current_key`'") ) { - return_value = Eval ("(`current_check`)['`current_key`']"); + + + if ( current_check / current_key ) { + return_value = current_check[current_key]; if (type == None) { return return_value; } diff --git a/res/TemplateBatchFiles/libv3/all-terms.bf b/res/TemplateBatchFiles/libv3/all-terms.bf index 91add912c..5d8459762 100644 --- a/res/TemplateBatchFiles/libv3/all-terms.bf +++ b/res/TemplateBatchFiles/libv3/all-terms.bf @@ -333,6 +333,8 @@ namespace terms{ proportion = "proportion"; positive = "positive test results"; imputed_states = "Imputed States"; + site_log_likelihood = "Site Log Likelihood"; + unconstrained = "unconstrained"; } diff --git a/res/TemplateBatchFiles/libv3/tasks/trees.bf b/res/TemplateBatchFiles/libv3/tasks/trees.bf index 9368aa92b..9a2b09fdf 100644 --- a/res/TemplateBatchFiles/libv3/tasks/trees.bf +++ b/res/TemplateBatchFiles/libv3/tasks/trees.bf @@ -4,6 +4,7 @@ LoadFunctionLibrary("../convenience/regexp.bf"); LoadFunctionLibrary("../UtilityFunctions.bf"); LoadFunctionLibrary("TreeTools"); LoadFunctionLibrary("distances.bf"); +LoadFunctionLibrary("../convenience/regexp.bf"); /** @module trees */ @@ -371,39 +372,47 @@ lfunction trees.branch_names(tree, respect_case) { */ lfunction trees.extract_paml_annotation (tree_string) { - - pieces = regexp.SplitWithMatches (tree_string, "\$([0-9])"); + + pieces = regexp.SplitWithMatches (tree_string, "\$[0-9]+"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); has_paml_clades = N > 0; sanitized_string = ""; if (N > 0) { - for (i,s; in; pieces[^("terms.regexp.strings")]) { - sanitized_string += s; - if (+i > 0) { - sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[-1+i], "\$", "PAML_CLADE_") + "}"; + for (i = 0; i <= N; i+=1) { + sanitized_string += (pieces[^("terms.regexp.strings")])[i]; + if (i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[i], "\$", "PAML_CLADE_") + "}"; } } + } else { + sanitized_string = tree_string; } - pieces = regexp.SplitWithMatches (sanitized_string, "\#([0-9])"); + + pieces = regexp.SplitWithMatches (sanitized_string, "\#[0-9]+"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); if (N > 0) { - for (i,s; in; pieces[^("terms.regexp.strings")]) { - sanitized_string += s; - if (+i > 0) { - sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[-1+i], "\#", "PAML_GROUP_") + "}"; + sanitized_string = ""; + for (i = 0; i <= N; i+=1) { + sanitized_string += (pieces[^("terms.regexp.strings")])[i]; + if (i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[i], "\#", "PAML_GROUP_") + "}"; } } } + if (Abs (sanitized_string) == 0) { sanitized_string = tree_string; } + utility.ToggleEnvVariable ("IGNORE_INTERNAL_NODE_LABELS", TRUE); result = trees.ExtractTreeInfo (sanitized_string); - + utility.ToggleEnvVariable ("IGNORE_INTERNAL_NODE_LABELS", None); + + if (has_paml_clades) { additional_annotation = {}; tree_str = result[^"terms.trees.newick"]; @@ -416,6 +425,8 @@ lfunction trees.extract_paml_annotation (tree_string) { } } } + + result[^"terms.trees.model_map"] * additional_annotation; result[^"terms.trees.model_list"] = Columns(result[^"terms.trees.model_map"]); result[^"terms.trees.newick_annotated"] = tree.Annotate (&T, result[^"terms.trees.model_map"], "{}", FALSE); @@ -584,6 +595,7 @@ lfunction trees.ExtractTreeInfo(tree_string) { Topology T = tree_string; res = trees.ExtractTreeInfoFromTopology (&T); res [^"terms.data.file"] = file_name; + DeleteObject (T); return res; } diff --git a/src/core/baseobj.cpp b/src/core/baseobj.cpp index a72aebe9a..2ec2b0751 100644 --- a/src/core/baseobj.cpp +++ b/src/core/baseobj.cpp @@ -64,9 +64,9 @@ BaseRef BaseObj::toStr(unsigned long) { return new _String(kNullToken); } BaseRef BaseObj::toErrStr(void) { return toStr(); } //______________________________________________________________________________ -void BaseObj::toFileStr(FILE *dest, unsigned long padding) { +void BaseObj::toFileStr(hyFile *dest, unsigned long padding) { _String *s = (_String *)toStr(padding); - fwrite(s->get_str(), 1, s->length(), dest); + dest->fwrite(s->get_str(), 1, s->length()); DeleteObject (s); } diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index 215ef249d..3c471bf60 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -2675,7 +2675,7 @@ void _ElementaryCommand::ExecuteCase5 (_ExecutionList& chain) { } SetStatusLine ("Loading Data"); - df = hyFile::openFile (fName.get_str(),"rb"); + df = hyFile::openFile (fName.get_str(),kFileReadBinary); if (df==nil) { // try reading this file as a string formula fName = GetStringFromFormula ((_String*)parameters(1),chain.nameSpacePrefix); @@ -2685,7 +2685,7 @@ void _ElementaryCommand::ExecuteCase5 (_ExecutionList& chain) { return; } - df = hyFile::openFile (fName.get_str(),"rb"); + df = hyFile::openFile (fName.get_str(),kFileReadBinary); if (df==nil) { HandleApplicationError ((_String ("Could not find source dataset file ") & ((_String*)parameters(1))->Enquote('"') & " (resolved to '" & fName & "')\nPath stack:\n\t" & GetPathStack ("\n\t"))); @@ -3208,12 +3208,12 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { _String spool_file; - FILE* main_file = nil; + hyFile* main_file = nil; if (parameters.countitems () > 6) { spool_file = ProcessLiteralArgument (GetIthParameter(6),chain.nameSpacePrefix); ProcessFileName(spool_file); - main_file = doFileOpen (spool_file,"w"); + main_file = doFileOpen (spool_file,kFileWrite); if (!main_file) { throw (_String("Failed to open ") & spool_file.Enquote() & " for writing"); } @@ -3243,7 +3243,6 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { _Variable * category_names_id = CheckReceptacle(&rate_variable_names, __PRETTY_FUNCTION__); _Matrix* category_names = new _Matrix (1,1,false,true); - SetStatusLine ("Simulating Data"); { // lf must be deleted before the referenced datafilters _LikelihoodFunction lf (filter_specification, nil); @@ -3253,7 +3252,6 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { */ lf.Simulate (*sim_dataset, exclusions, category_values, category_names, root_states, do_internals?(main_file?&spool_file:&kEmptyString):nil); - SetStatusLine ("Idle"); } @@ -3565,7 +3563,7 @@ bool _ElementaryCommand::Execute (_ExecutionList& chain) { if (terminate_execution) { return false; } - FILE* theDump = doFileOpen (fName,"rb"); + hyFile* theDump = doFileOpen (fName,kFileReadBinary); if (!theDump) { HandleApplicationError (((_String)("File ")&fName&_String(" couldn't be open for reading."))); return false; @@ -3585,7 +3583,8 @@ bool _ElementaryCommand::Execute (_ExecutionList& chain) { } else { importResult = false; } - fclose (theDump); + theDump->close(); + delete (theDump); return importResult; } break; @@ -5114,7 +5113,7 @@ void ReadBatchFile (_String& fName, _ExecutionList& target) { FetchVar(LocateVarByName (optimizationPrecision))->SetValue(&precvalue); #endif*/ - hyFile *f = hyFile::openFile (fName.get_str (), "rb"); + hyFile *f = hyFile::openFile (fName.get_str (), kFileReadBinary); SetStatusLine ("Parsing File"); if (!f) { HandleApplicationError (_String("Could not read batch file '") & fName & "'.\nPath stack:\n\t" & GetPathStack("\n\t")); diff --git a/src/core/batchlanhelpers.cpp b/src/core/batchlanhelpers.cpp index 35819a0fc..a555bba77 100644 --- a/src/core/batchlanhelpers.cpp +++ b/src/core/batchlanhelpers.cpp @@ -83,7 +83,7 @@ void ReadModelList(void) { if (templateModelList.empty() == false) return; _String modelListFile (GetStandardDirectory (HY_HBL_DIRECTORY_TEMPLATE_MODELS) & "models.lst"), - theData (doFileOpen (modelListFile.get_str(),"rb")); + theData (doFileOpen (modelListFile.get_str(),kFileReadBinary)); if (theData.nonempty()) { _ElementaryCommand::ExtractConditions(theData,0,templateModelList); diff --git a/src/core/batchlanruntime.cpp b/src/core/batchlanruntime.cpp index 9078a9ca5..864c7869c 100644 --- a/src/core/batchlanruntime.cpp +++ b/src/core/batchlanruntime.cpp @@ -1988,7 +1988,7 @@ bool _ElementaryCommand::HandleAdvanceIterator(_ExecutionList& current_prog if (source_object_class == TREE) node_name = new _FString (map_node_to_calcnode (topTraverser)->ContextFreeName()); else - node_name = new _FString (((_TreeTopology*)parameters.GetItem (1))->GetNodeName(topTraverser)); + node_name = new _FString (((_TreeTopology*)parameters.GetItem (reciever_count))->GetNodeName(topTraverser)); if (reciever_count > 1) { ((_Variable*)parameters.GetItem (reciever_count+2))->SetValue (node_name, false, false,NULL); @@ -1999,7 +1999,7 @@ bool _ElementaryCommand::HandleAdvanceIterator(_ExecutionList& current_prog if (source_object_class == TREE) parent_name = new _FString (map_node_to_calcnode (parent_node)->ContextFreeName()); else - parent_name = new _FString (((_TreeTopology*)parameters.GetItem (1))->GetNodeName(parent_node)); + parent_name = new _FString (((_TreeTopology*)parameters.GetItem (reciever_count))->GetNodeName(parent_node)); ((_Variable*)parameters.GetItem (reciever_count+1))->SetValue (parent_name, false, false,NULL); } else { ((_Variable*)parameters.GetItem (reciever_count+1))->SetValue (new _MathObject, false, false,NULL); @@ -2728,7 +2728,7 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { success = true; - FILE* destination_file = nil; + hyFile * destination_file = nil; try { @@ -2775,10 +2775,17 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { do_close = open_handle < 0; if (!do_close) { - destination_file = (FILE*)open_file_handles.GetXtra (open_handle); + destination_file = (hyFile*)open_file_handles.GetXtra (open_handle); } else { - if ((destination_file = doFileOpen (destination.get_str(), "a")) == nil) - throw (_String ("Could not create/open output file at path ") & destination.Enquote() & "."); + #ifdef __ZLIB__ + if ((destination_file = doFileOpen (destination.get_str(), kFileAppend, false, hy_env::EnvVariableGetNumber(hy_env::gzip_compression_level) > 0)) == nil) { + throw (_String ("Could not create/open output file at path ") & destination.Enquote() & "."); + } + #else + if ((destination_file = doFileOpen (destination.get_str(), kFileAppend)) == nil) { + throw (_String ("Could not create/open output file at path ") & destination.Enquote() & "."); + } + #endif } } } @@ -2791,8 +2798,13 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { // handle special cases first if (*current_argument == kFprintfClearFile) { if (!print_to_stdout && destination_file) { - fclose (destination_file); - destination_file = doFileOpen (destination.get_str(), "w"); + destination_file->close(); delete destination_file; +#ifdef __ZLIB__ + destination_file = doFileOpen (destination.get_str(), kFileWrite,false, hy_env::EnvVariableGetNumber(hy_env::gzip_compression_level) > 0); +#else + destination_file = doFileOpen (destination.get_str(), kFileWrite); +#endif + if (!do_close) { _String* destination_copy = new _String (destination); @@ -2860,7 +2872,8 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { } if (destination_file && destination_file != hy_message_log_file && do_close) { - fclose (destination_file); + destination_file->close(); + delete destination_file; } return success; @@ -2901,7 +2914,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current } _String original_path (file_path); - FILE * source_file = nil; + hyFile * source_file = nil; bool reload = hy_env::EnvVariableTrue(hy_env::always_reload_libraries); @@ -2920,7 +2933,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current ReportWarning (_String("Already loaded ") & original_path.Enquote() & " from " & try_path); return true; } - if ((source_file = doFileOpen (try_path.get_str (), "rb"))) { + if ((source_file = doFileOpen (try_path.get_str (), kFileReadBinary))) { file_path = try_path; break; } @@ -2939,7 +2952,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current return true; } - if ((source_file = doFileOpen (file_path.get_str (), "rb")) == nil) { + if ((source_file = doFileOpen (file_path.get_str (), kFileReadBinary)) == nil) { throw (_String("Could not read command file from '") & original_path & "' (expanded to '" & file_path & "')"); } @@ -2952,10 +2965,13 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current source_code = new _StringBuffer (source_file); - if (fclose (source_file) ) { // failed to fclose + if (source_file->close() ) { // failed to fclose DeleteObject (source_code); throw (_String("Internal error: failed in a call to fclose ") & file_path.Enquote()); } + + delete (source_file); + pop_path = true; PushFilePath (file_path); } else { // commands are not loaded from a file @@ -3324,7 +3340,8 @@ bool _ElementaryCommand::HandleGetString (_ExecutionList& current_program) static const _String kVersionString ("HYPHY_VERSION"), kTimeStamp ("TIME_STAMP"), kListLoadedLibraries ("LIST_OF_LOADED_LIBRARIES"), - kGetNumberOfMatrixExp ("MATRIX_EXPONENTIALS_COMPUTED"); + kGetNumberOfMatrixExp ("MATRIX_EXPONENTIALS_COMPUTED"), + kCanCompressFiles ("ZLIB_ENABLED"); _Variable * receptacle = nil; @@ -3358,7 +3375,13 @@ bool _ElementaryCommand::HandleGetString (_ExecutionList& current_program) return_value = new _Matrix (loadedLibraryPaths.Keys()); } else if (*GetIthParameter(1UL) == kGetNumberOfMatrixExp) { return_value = new _Constant (matrix_exp_count); - } + } else if (*GetIthParameter(1UL) == kCanCompressFiles) { +#ifdef __ZLIB__ + return_value = new _Constant (1.); +#else + return_value = new _Constant (0.); +#endif + } if (!return_value) { @@ -3743,11 +3766,11 @@ bool _ElementaryCommand::HandleFscanf (_ExecutionList& current_program, boo return false; } - FILE * input_stream = doFileOpen (file_path.get_str(), "rb"); + hyFile * input_stream = doFileOpen (file_path.get_str(), kFileReadBinary); hy_env::EnvVariableSet(hy_env::file_created, new HY_CONSTANT_FALSE, false); if (!input_stream) { if (has_create) { - input_stream = doFileOpen (file_path.get_str(), "wb"); + input_stream = doFileOpen (file_path.get_str(), kFileWriteBinary); if (input_stream){ while (argument_index < upper_bound) { _Variable * store_here = _ValidateStorageVariable (current_program, argument_index + 1UL); @@ -3772,22 +3795,42 @@ bool _ElementaryCommand::HandleFscanf (_ExecutionList& current_program, boo last_call_stream_position = 0L; } - fseek (input_stream,0,SEEK_END); - current_stream_position = ftell (input_stream); - current_stream_position -= last_call_stream_position; - - if (current_stream_position<=0) { - hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); - fclose(input_stream); - return true; + _String * file_data = nil; + + if (input_stream->is_compressed()) { + /** + 20231113 : when the stream is compressed, SEEK_END is not available. + */ + _StringBuffer * file_data_buffer = new _StringBuffer (input_stream); + if (file_data_buffer->length() == 0L) { + hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); + input_stream->close(); + delete (input_stream); + DeleteObject (file_data_buffer); + return true; + } + file_data = file_data_buffer; + } else { + input_stream->seek (0,SEEK_END); + current_stream_position = input_stream->tell(); + current_stream_position -= last_call_stream_position; + + if (current_stream_position<=0) { + hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); + input_stream->close(); + delete (input_stream); + return true; + } + + input_stream->rewind(); + input_stream->seek (last_call_stream_position, SEEK_SET); + file_data = new _String (input_stream, current_stream_position); } - rewind (input_stream); - fseek (input_stream, last_call_stream_position, SEEK_SET); - _String * file_data = new _String (input_stream, current_stream_position); - fclose (input_stream); - dynamic_reference_manager < file_data; input_data = file_data; + input_stream->close(); + delete (input_stream); + dynamic_reference_manager < file_data; current_stream_position = 0L; } } diff --git a/src/core/dataset.cpp b/src/core/dataset.cpp index a0755eace..ebdd84684 100644 --- a/src/core/dataset.cpp +++ b/src/core/dataset.cpp @@ -82,7 +82,7 @@ _DataSet::_DataSet(long l) //_______________________________________________________________________ -_DataSet::_DataSet(FILE *f) { +_DataSet::_DataSet(hyFile *f) { dsh = nil; useHorizontalRep = false; theTT = &hy_default_translation_table; @@ -141,9 +141,8 @@ BaseRef _DataSet::makeDynamic(void) const { //_______________________________________________________________________ void _DataSet::ResetIHelper(void) { - if (dsh && dsh->characterPositions.lLength == 256) - for (long k = 0; k < 256; k++) { - dsh->characterPositions.list_data[k] = -1; + if (dsh && dsh->characterPositions.lLength == 256) { + InitializeArray (dsh->characterPositions.list_data, 256, -1L); } } @@ -196,16 +195,17 @@ void _DataSet::AddSite(char c) { if (theMap.list_data[0] == 0) { if (theMap.list_data[1] == 0) { if (theNames.lLength) { - fprintf(streamThrough, ">%s\n", ((_String *)theNames(0))->get_str()); + streamThrough->puts ("(_String *)theNames(0))->get_str()"); + streamThrough->fputc ('\n'); } else { - fprintf(streamThrough, ">Sequence 1\n"); + streamThrough->puts (">Sequence 1"); } AppendNewInstance(new _String(kEmptyString)); } theMap.list_data[1]++; theMap.list_data[2]++; - fputc(c, streamThrough); + streamThrough->fputc(c); } else { HandleApplicationError("Can't add more sites to a file based data set, " "when more that one sequence has been written!", @@ -235,10 +235,16 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { theMap.list_data[0]++; if (theNames.lLength > theMap.list_data[0]) { - fprintf(streamThrough, "\n>%s\n", - ((_String *)theNames(theMap.list_data[0]))->get_str()); + streamThrough->puts ("\n>"); + streamThrough->puts (((_String *)theNames(theMap.list_data[0]))->get_str()); + streamThrough->fputc ('\n'); } else { - fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1); + streamThrough->puts ("\n>"); + char buffer[64]; + snprintf (buffer, 64, "%ld", theMap.list_data[0] + 1); + streamThrough->puts (buffer); + streamThrough->fputc ('\n'); + //fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1); } theMap.list_data[1] = 0; @@ -254,7 +260,7 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { } theMap.list_data[1]++; - fputc(c, streamThrough); + streamThrough->fputc(c); } else { if (useHorizontalRep) { long currentWritten = ((_String *)list_data[0])->length(); @@ -386,8 +392,8 @@ void _DataSet::SetTranslationTable(_TranslationTable *newTT) { //_______________________________________________________________________ void _DataSet::Finalize(void) { if (streamThrough) { - fclose(streamThrough); - streamThrough = nil; + streamThrough->close(); + delete (streamThrough); theMap.Clear(); } else { if (useHorizontalRep) { @@ -550,44 +556,27 @@ void _DataSet::Finalize(void) { void _DataSet::Compact(long index) { if (useHorizontalRep) { HandleApplicationError( - "Internal Error: _DataSet::Compact called with compact represntation", + "Internal Error: _DataSet::Compact called on a dataset already using a Compact representation", true); return; } - - _Site *tC = (_Site *)(*(_List *)this)(index); - if (tC->GetRefNo() != -1) + _Site *tC = (_Site *)GetItem (index); + if (tC->GetRefNo() != -1) { // take care of double referencing - { _Site *tCC = tC; - long lastRef, count = 0; + long lastRef, count = 0L; do { lastRef = tCC->GetRefNo(); count++; - tCC = (_Site *)(*(_List *)this)(tCC->GetRefNo()); + tCC = (_Site *)GetItem (tCC->GetRefNo()); } while (tCC->GetRefNo() != -1); - if (count > 1) { + + if (count > 1L) { theFrequencies[lastRef]++; } tC->SetRefNo(lastRef); } - /*if (tC->GetRefNo()==-1) - { - long f = dsh->incompletePatterns->Find(tC); - if (f >= 0) - { - f = dsh->incompletePatterns->GetXtra (f); - if (fClear(); - tC->SetRefNo(f); - } - else - tC->Finalize(); - } - }*/ } //_______________________________________________________________________ @@ -671,12 +660,17 @@ BaseRef _DataSet::toStr(unsigned long) { //___________________________________________________ -void _DataSet::toFileStr(FILE *dest, unsigned long padding) { - fprintf(dest, "%ld species: ", NoOfSpecies()); +void _DataSet::toFileStr(hyFile *dest, unsigned long padding) { + char buffer[512]; + snprintf (buffer, 512, "%ld species: ", NoOfSpecies()); + dest->puts (buffer); + theNames.toFileStr(dest, padding); - - fprintf(dest, ";\nTotal Sites: %ld", GetNoTypes()); - fprintf(dest, ";\nDistinct Sites: %ld", theFrequencies.lLength); + snprintf (buffer, 512, ";\nTotal Sites: %ld", GetNoTypes()); + dest->puts (buffer); + + snprintf (buffer, 512, ";\nDistinct Sites: %ld", theFrequencies.lLength); + dest->puts (buffer); /* fprintf (dest,"\n"); for (long j=0; j fNS && isspace(CurrentLine.char_at (space2+1))) { - _String sequence_name (CurrentLine,fNS, space2); - CurrentLine.Trim(space2+2,-1); // chop out the name + if (space2 < 0 && CurrentLine.length() > 10) { + _String sequence_name (CurrentLine,fNS, CurrentLine.length()); ds.AddName(sequence_name); + CurrentLine.Trim(CurrentLine.length(),-1); } else { - _String sequence_name (CurrentLine,fNS, fNS+9); - CurrentLine.Trim(fNS+10,-1); // chop out the name - ds.AddName(sequence_name); + // hack for PAML support + if (space2 > fNS && isspace(CurrentLine.char_at (space2+1))) { + _String sequence_name (CurrentLine,fNS, space2); + CurrentLine.Trim(space2+2,-1); // chop out the name + ds.AddName(sequence_name); + } else { + _String sequence_name (CurrentLine,fNS, fNS+9); + CurrentLine.Trim(fNS+10,-1); // chop out the name + ds.AddName(sequence_name); + } } } diff --git a/src/core/dataset_filter.cpp b/src/core/dataset_filter.cpp index 5359d09d6..c4ac3b5f2 100644 --- a/src/core/dataset_filter.cpp +++ b/src/core/dataset_filter.cpp @@ -1779,7 +1779,7 @@ _String const _DataSetFilter::GenerateConsensusString (bool resolved, _SimpleLis //_________________________________________________________ -void _DataSetFilter::toFileStr (FILE*dest, unsigned long) { +void _DataSetFilter::toFileStr (hyFile *dest, unsigned long) { // write out the file with this dataset filter if (dest) { internalToStr (dest,nil); @@ -1821,7 +1821,7 @@ void _DataSetFilter::ConvertCodeToLettersBuffered (long code, unsigned char u //_________________________________________________________ -void _DataSetFilter::internalToStr (FILE * file ,_StringBuffer * string_buffer) { +void _DataSetFilter::internalToStr (hyFile * file ,_StringBuffer * string_buffer) { // case 4: // labels, sequential //case 5: // labels, interleaved diff --git a/src/core/fstring.cpp b/src/core/fstring.cpp index 8ebc2ed3c..90553c903 100644 --- a/src/core/fstring.cpp +++ b/src/core/fstring.cpp @@ -912,10 +912,11 @@ HBLObjectRef _FString::FileExists (HBLObjectRef cache) { _String cpy (get_str()); if (ProcessFileName(cpy)) { // TODO use fstat instead - FILE * test = doFileOpen (cpy, "rb"); + hyFile * test = doFileOpen (cpy, kFileReadBinary); if (test) { exists = 1.0; - fclose (test); + test->close (); + delete (test); } } } diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index b79949097..22a9d3046 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -95,7 +95,7 @@ namespace hy_global { ignore_kw_defaults = false, force_verbosity_from_cli = false; - FILE *hy_error_log_file = NULL, + hyFile *hy_error_log_file = NULL, *hy_message_log_file = NULL; hyTreeDefinitionPhase @@ -122,7 +122,7 @@ namespace hy_global { kErrorStringMatrixExportError ("Export matrix called with a non-polynomial matrix argument"), kErrorStringNullOperand ("Attempting to operate on an undefined value; this is probably the result of an earlier 'soft' error condition"), kErrorNumerical ("To treat numerical errors as warnings, please specify \"ENV=TOLERATE_NUMERICAL_ERRORS=1;\" as the command line argument. This often resolves the issue, which is indicative of numerical instability."), - kHyPhyVersion = _String ("2.5.57"), + kHyPhyVersion = _String ("2.5.58"), kNoneToken = "None", kNullToken = "null", @@ -222,14 +222,11 @@ namespace hy_global { //____________________________________________________________________________________ - FILE * doFileOpen (const char * fileName, const char * mode, bool error) { - FILE *daFile = nil; + hyFile * doFileOpen (const char * fileName, hyFileOpenMode mode, bool error, bool compress) { + hyFile *daFile = nil; if (fileName) { - daFile = fopen (fileName,mode); - if (!daFile && error) { - HandleApplicationError (_String("Could not open file '") & *fileName & "' with mode '" & *mode & "'."); - } + daFile = hyFile::openFile (fileName,mode, error, compress); } return daFile; } @@ -350,9 +347,11 @@ namespace hy_global { hy_x_variable = nil; hy_n_variable = nil; pathNames.Clear(); +#ifdef _USE_EMSCRIPTEN_ BuiltInFunctions.Clear(); hyReservedWords.Clear(); UnOps.Clear(); +#endif } hy_scanf_last_file_path = kEmptyString; EnvVariableSet(random_seed, new _Constant (hy_random_seed), false); @@ -404,7 +403,7 @@ namespace hy_global { #ifndef __HEADLESS__ // do not create log files for _HEADLESS_ _String * prefix [2] = {&hy_error_log_name, &hy_messages_log_name}; - FILE ** handle [2] = {&hy_error_log_file, &hy_message_log_file}; + hyFile ** handle [2] = {&hy_error_log_file, &hy_message_log_file}; for (long file_index = 0; file_index < 2; file_index++) { long p = 1L; @@ -419,7 +418,7 @@ namespace hy_global { _String file_name = *prefix[file_index] & ".mpinode" & (long)hy_mpi_node_rank; #endif - *handle[file_index] = doFileOpen (file_name.get_str(),"w+"); + *handle[file_index] = doFileOpen (file_name.get_str(),kFileWrite); while (*handle[file_index] == nil && p<10) { #ifndef __HYPHYMPI__ file_name = *prefix[file_index] & '.' & p; @@ -486,11 +485,13 @@ namespace hy_global { _HY_HBLCommandHelper.Clear(); _HY_ValidHBLExpressions.Clear(); listOfCompiledFormulae.Clear(); +#ifdef _USE_EMSCRIPTEN_ _hy_standard_library_paths.Clear(); _hy_standard_library_extensions.Clear(); availableTemplateFilesAbbreviations.Clear(); availableTemplateFiles.Clear(); availablePostProcessors.Clear(); +#endif #ifdef __HYPHYMPI__ @@ -509,21 +510,24 @@ namespace hy_global { _String * prefix [2] = {&hy_error_log_name, &hy_messages_log_name}; char const * messages [] = {"\nCheck %s for execution error details.\n", "\nCheck %s for diagnostic messages.\n"}; - FILE * handle [2] = {hy_error_log_file, hy_message_log_file}; + hyFile * handle [2] = {hy_error_log_file, hy_message_log_file}; for (long file_index = 0; file_index < 2; file_index++) { if (handle[file_index]) { - fflush (handle[file_index]); - fseek(handle[file_index],0,SEEK_END); - unsigned long fileSize = ftell(handle[file_index]); + //fflush (handle[file_index]); + handle[file_index]->flush (); + handle[file_index]->seek(0, SEEK_END); + unsigned long fileSize = handle[file_index]->tell(); if (fileSize) { fprintf (stderr, messages[file_index], prefix[file_index]->get_str()); if (file_index == 0) { no_errors = false; } - fclose (handle[file_index]); + handle[file_index]->close(); + delete handle[file_index]; } else { - fclose (handle[file_index]); + handle[file_index]->close(); + delete handle[file_index]; remove (prefix[file_index]->get_str()); } } @@ -689,9 +693,9 @@ namespace hy_global { } char str[] = "\n"; - fwrite (str, 1, 1, hy_message_log_file); - fwrite (message.get_str(), 1, message.length(), hy_message_log_file); - fflush (hy_message_log_file); + hy_message_log_file->fwrite (str, 1, 1); + hy_message_log_file->fwrite (message.get_str(), 1, message.length()); + hy_message_log_file->flush(); #endif } @@ -756,13 +760,14 @@ namespace hy_global { char str[] = "\nError:"; if (hy_error_log_file) { - fwrite (str, 1, 7, hy_error_log_file); - fwrite (message.get_str(), 1, message.length(), hy_error_log_file); - fflush (hy_error_log_file); + hy_error_log_file->fwrite (str, 1, 7); + hy_error_log_file->fwrite (message.get_str(), 1, message.length()); + hy_error_log_file->flush (); } if (hy_error_log_file) { - fprintf (hy_error_log_file, "\n%s", (const char*)message); + hy_error_log_file->fputc ('\n'); + hy_error_log_file->puts ((const char*)message); } _String errMsg; @@ -828,6 +833,21 @@ namespace hy_global { theMessage << "MinGW ";// " & __MINGW32_VERSION; #endif #endif +#ifdef _SLKP_USE_ARM_NEON + theMessage << " ARM Neon SIMD"; +#endif +#ifdef _SLKP_USE_AVX_INTRINSICS + theMessage << " x86 AVX SIMD"; +#endif +#ifdef _SLKP_USE_FMA3_INTRINSICS + theMessage << " with FMA3"; +#endif +#ifdef _SLKP_USE_SSE_INTRINSICS + theMessage << " x86 SSE4 SIMD"; +#endif +#ifdef __ZLIB__ + theMessage << " zlib (v" << ZLIB_VERSION << ")"; +#endif return theMessage; } @@ -1007,128 +1027,5 @@ namespace hy_global { return true; } - //____________________________________________________________________________________ - hyFile* hyFile::openFile (const char * file_path, const char * mode , bool error, long buffer) { - hyFile* f = new hyFile; -#ifdef __ZLIB__ - if (file_path) { - f->_fileReference = gzopen (file_path, mode); - if (!f->_fileReference && error) { - HandleApplicationError (_String("Could not open file '") & *file_path & "' with mode '" & *mode & "'."); - } - } -#else - f->_fileReference = doFileOpen(file_path, mode, error); -#endif - if (!f->_fileReference ) { - delete f; - f = nil; - } - return f; - - } - - //____________________________________________________________________________________ - void hyFile::close (void) { - if (valid()) { - #ifdef __ZLIB__ - gzclose (_fileReference); - #else - fclose (_fileReference); - #endif - _fileReference = NULL; - } - } - - //____________________________________________________________________________________ - void hyFile::lock (void) { - if (valid()) { - #ifdef __ZLIB__ - //gzclose (_fileReference); - #else - flockfile (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::unlock (void) { - if (valid()) { - #ifdef __ZLIB__ - //gzclose (_fileReference); - #else - funlockfile (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::rewind (void) { - if (valid()) { - #ifdef __ZLIB__ - gzrewind (_fileReference); - #else - ::rewind (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::seek (long pos, int whence) { - if (valid()) { - #ifdef __ZLIB__ - gzseek (_fileReference, pos, whence); - #else - fseek (_fileReference, pos, whence); - #endif - } - } - //____________________________________________________________________________________ - - size_t hyFile::tell (void) { - if (valid()) { - #ifdef __ZLIB__ - return gztell (_fileReference); - #else - return ftell (_fileReference); - #endif - } - return 0; - } - //____________________________________________________________________________________ - bool hyFile::feof (void) { - if (valid()) { - #ifdef __ZLIB__ - return gzeof (_fileReference); - #else - return feof_unlocked (_fileReference); - #endif - } - return true; - } - //____________________________________________________________________________________ - int hyFile::getc (void) { - if (valid()) { - #ifdef __ZLIB__ - return gzgetc (_fileReference); - #else - return getc_unlocked (_fileReference); - #endif - } - return 0; - } - - //____________________________________________________________________________________ - unsigned long hyFile::read (void* buffer, unsigned long size, unsigned long items) { - if (valid()) { - #ifdef __ZLIB__ - return gzfread (buffer, size, items, _fileReference); - #else - return ::fread (buffer, size, items, _fileReference); - #endif - } - return 0; - } - } // namespace close diff --git a/src/core/hbl_env.cpp b/src/core/hbl_env.cpp index 1db5d11b1..49d05541b 100644 --- a/src/core/hbl_env.cpp +++ b/src/core/hbl_env.cpp @@ -134,6 +134,7 @@ namespace hy_env { .PushPairCopyKey (data_file_gap_width, new _Constant (10.0)) .PushPairCopyKey (accept_branch_lengths, new _Constant (HY_CONSTANT_TRUE)) .PushPairCopyKey (number_threads, new _Constant (0.)) + .PushPairCopyKey (gzip_compression_level, new _Constant (0.)) ; ; } @@ -221,7 +222,11 @@ _String const if set to `harvest_frequencies_gap_options` to TRUE, then N-fold ambigs will add 1/N to each character count in HarvestFrequencies, otherwise N-folds are ignored in counting */ - + gzip_compression_level ("GZIP_COMPRESSION_LEVEL"), + /* + if __ZLIB__ is present, controls whether or not output files created by fprintf will be compressed by ZLIB; + value of 0 means no compresson + */ include_model_spec ("INCLUDE_MODEL_SPECS"), /* controls whether or not export / serialization operations (like toStr) diff --git a/src/core/hyFile.cpp b/src/core/hyFile.cpp new file mode 100644 index 000000000..7f4f39a22 --- /dev/null +++ b/src/core/hyFile.cpp @@ -0,0 +1,336 @@ +/* + HyPhy - Hypothesis Testing Using Phylogenies. + + Copyright (C) 1997-now + Core Developers: + Sergei L Kosakovsky Pond (sergeilkp@icloud.com) + Art FY Poon (apoon42@uwo.ca) + Steven Weaver (sweaver@temple.edu) + + Module Developers: + Lance Hepler (nlhepler@gmail.com) + Martin Smith (martin.audacis@gmail.com) + + Significant contributions from: + Spencer V Muse (muse@stat.ncsu.edu) + Simon DW Frost (sdf22@cam.ac.uk) + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "hy_types.h" +#include "hy_strings.h" +#include "global_things.h" + +//____________________________________________________________________________________ + hyFile* hyFile::openFile (const char * file_path, hyFileOpenMode mode , bool error, bool compress, long buffer) { + + _String str_mode; + auto handle_error = [&](void * ptr)->void { + if (!ptr && error) { + hy_global::HandleApplicationError (_String("Could not open file '") & *file_path & "' with mode '" & str_mode & "'."); + } + }; + + hyFile* f = new hyFile; +#ifdef __ZLIB__ + if (file_path) { + + _String str_mode; + switch (mode) { + case kFileRead: + str_mode = "r"; + break; + case kFileReadBinary: + str_mode = "rb"; + break; + case kFileWrite: + str_mode = compress ? "w" : "wT"; + break; + case kFileWriteBinary: + str_mode = compress ? "wb" : "wbT"; + break; + case kFileAppend: + str_mode = compress ? "a" : "aT"; + break; + case kFileReadWrite: + str_mode = "w+"; + break; + } + + if (mode != kFileReadWrite) { + f->_fileReference = gzopen (file_path, (char const*)str_mode); + if (f->_fileReference) { + if (gzdirect(f->_fileReference)) { + gzclose (f->_fileReference); + f->_fileReference = NULL; + f->_fileReferenceDirect = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReferenceDirect); + } + } else { + handle_error (f->_fileReference); + } + } else { + f->_fileReferenceDirect = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReferenceDirect); + } + if (!f->valid()) { + delete f; + f = nil; + } +} +#else + if (file_path) { + _String str_mode; + switch (mode) { + case kFileRead: + str_mode = "r"; + break; + case kFileReadBinary: + str_mode = "rb"; + break; + case kFileWrite: + str_mode = "w"; + break; + case kFileWriteBinary: + str_mode = "wb"; + break; + case kFileAppend: + str_mode = "a"; + break; + case kFileReadWrite: + str_mode = "w+"; + break; + } + + f->_fileReference = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReference); + } + if (!f->_fileReference ) { + delete f; + f = nil; + } +#endif + + return f; + + } + + //____________________________________________________________________________________ + int hyFile::close (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return fclose (_fileReferenceDirect); + } else { + return gzclose (_fileReference); + } + #else + return fclose (_fileReference); + #endif + _fileReference = NULL; + } + return 0; + } + + //____________________________________________________________________________________ + void hyFile::lock (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + flockfile (_fileReferenceDirect); + } + //gzclose (_fileReference); + #else + flockfile (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::flush (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + fflush (_fileReferenceDirect); + } else { + gzflush (_fileReference, Z_SYNC_FLUSH); + } + #else + fflush(_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::unlock (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + funlockfile (_fileReferenceDirect); + } + #else + funlockfile (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::rewind (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + ::rewind (_fileReferenceDirect); + } else { + gzrewind (_fileReference); + } + #else + ::rewind (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::seek (long pos, int whence) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + fseek (_fileReferenceDirect, pos, whence); + } else { + gzseek (_fileReference, pos, whence); + } + #else + fseek (_fileReference, pos, whence); + #endif + } + } + + //____________________________________________________________________________________ + + size_t hyFile::tell (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ftell (_fileReferenceDirect); + } else { + return gztell (_fileReference); + } + #else + return ftell (_fileReference); + #endif + } + return 0; + } + //____________________________________________________________________________________ + bool hyFile::feof (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return feof_unlocked (_fileReferenceDirect); + } else { + return gzeof (_fileReference); + } + #else + return feof_unlocked (_fileReference); + #endif + } + return true; + } + //____________________________________________________________________________________ + int hyFile::puts( const char* buffer) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fputs (buffer, _fileReferenceDirect); + } else { + return gzputs (_fileReference, buffer); + } + + #else + return ::fputs (buffer, _fileReference); + #endif + } + return -1; + } + + //____________________________________________________________________________________ + int hyFile::fputc( int chr) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fputc (chr, _fileReferenceDirect); + } else { + return gzputc (_fileReference, chr); + } + #else + return ::fputc (chr, _fileReference); + #endif + } + return -1; + } + + //____________________________________________________________________________________ + size_t hyFile::fwrite( const void* buffer, size_t size, size_t count) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fwrite (buffer, size, count, _fileReferenceDirect); + } else { + return gzfwrite (buffer, size, count, _fileReference); + } + #else + return ::fwrite (buffer, size, count, _fileReference); + #endif + } + return -1; + } + //____________________________________________________________________________________ + int hyFile::getc (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return getc_unlocked (_fileReferenceDirect); + } else { + return gzgetc (_fileReference); + } + + #else + return getc_unlocked (_fileReference); + #endif + } + return 0; + } + + //____________________________________________________________________________________ + unsigned long hyFile::read (void* buffer, unsigned long size, unsigned long items) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fread (buffer, size, items, _fileReferenceDirect); + } else { + return gzfread (buffer, size, items, _fileReference); + } + + #else + return ::fread (buffer, size, items, _fileReference); + #endif + } + return 0; + } diff --git a/src/core/include/baseobj.h b/src/core/include/baseobj.h index ee99f26a9..343330bf5 100644 --- a/src/core/include/baseobj.h +++ b/src/core/include/baseobj.h @@ -85,8 +85,8 @@ class BaseObj { (default = same as toStr) */ - virtual void toFileStr (FILE *, unsigned long padding = 0UL) ; - /** file representation for the object + virtual void toFileStr (hyFile *, unsigned long padding = 0UL) ; + /** file representation for the object @param padding is used to allow 'pretty' rendering of nested objects, like dictss */ diff --git a/src/core/include/dataset.h b/src/core/include/dataset.h index 705ceee6b..94be6cd3c 100644 --- a/src/core/include/dataset.h +++ b/src/core/include/dataset.h @@ -72,7 +72,7 @@ class _DataSet : public _List // a complete data set public: _DataSet(void); _DataSet(long); - _DataSet(FILE *); + _DataSet(hyFile *); // with estimated number of sites per file virtual ~_DataSet(void); @@ -109,7 +109,7 @@ class _DataSet : public _List // a complete data set virtual BaseRef toStr(unsigned long = 0UL); // convert to string - virtual void toFileStr(FILE *dest, unsigned long = 0UL); + virtual void toFileStr(hyFile *dest, unsigned long = 0UL); void Compact(long); // release string overhead @@ -194,7 +194,7 @@ class _DataSet : public _List // a complete data set _TranslationTable *theTT; // translation Table, if any _List theNames; // Names of species - FILE *streamThrough; + hyFile *streamThrough; _DSHelper *dsh; bool useHorizontalRep; diff --git a/src/core/include/dataset_filter.h b/src/core/include/dataset_filter.h index bc7c8ae3e..1357899e4 100644 --- a/src/core/include/dataset_filter.h +++ b/src/core/include/dataset_filter.h @@ -68,7 +68,7 @@ class _DataSetFilter : public BaseObj { virtual ~_DataSetFilter(void); virtual BaseRef toStr(unsigned long = 0UL); // convert to string - virtual void toFileStr(FILE *, unsigned long = 0UL); // convert to string + virtual void toFileStr(hyFile *, unsigned long = 0UL); // convert to string virtual BaseRef makeDynamic(void) const; virtual void Duplicate(BaseRefConst); @@ -397,7 +397,7 @@ class _DataSetFilter : public BaseObj { long dimension; private: - void internalToStr(FILE *, _StringBuffer *); + void internalToStr(hyFile *, _StringBuffer *); inline void retrieve_individual_site_from_raw_coordinates (_String & store, unsigned long site, unsigned long sequence) const { diff --git a/src/core/include/global_things.h b/src/core/include/global_things.h index 0366697b2..2792b7bb3 100644 --- a/src/core/include/global_things.h +++ b/src/core/include/global_things.h @@ -57,9 +57,7 @@ #include #endif -#ifdef __ZLIB__ - #include -#endif + class _Variable; // forward decl class _ExecutionList; // forward decl @@ -129,26 +127,7 @@ namespace hy_global { /* pass-through structure for reading / writing from a file that may or may not be compressed */ - class hyFile { - public: - hyFile (void) {_fileReference = NULL;} - static hyFile* openFile (const char * file_path, const char * mode , bool error = false, long buffer = 1024*128); - inline bool valid (void) const {return _fileReference != NULL;} - void lock (void); - void unlock (void); - void rewind (void); - void seek (long, int); - void close (); - bool feof (void); - unsigned long read (void* buffer, unsigned long size, unsigned long items); - size_t tell (); - int getc (); -#ifdef __ZLIB__ - gzFile _fileReference; -#else - FILE* _fileReference; -#endif - }; + /** Open the file located at file_path using mode 'mode' @@ -159,7 +138,7 @@ namespace hy_global { @return the FILE handle or nil (if file does not exist or could not be open with the requested mode) */ - FILE* doFileOpen (const char * file_path, const char * mode , bool error = false); + hyFile* doFileOpen (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress = false); /** The omnibus clean-up function that attempts to deallocate all application memory @@ -300,7 +279,7 @@ namespace hy_global { extern _List _hy_standard_library_extensions, _hy_standard_library_paths; - extern FILE* hy_error_log_file, + extern hyFile* hy_error_log_file, * hy_message_log_file; diff --git a/src/core/include/hbl_env.h b/src/core/include/hbl_env.h index ab82c14b8..7e5965237 100644 --- a/src/core/include/hbl_env.h +++ b/src/core/include/hbl_env.h @@ -206,6 +206,7 @@ namespace hy_env { tolerate_numerical_errors, tolerate_constraint_violation, number_threads, + gzip_compression_level, tree_parser_namespace; ; diff --git a/src/core/include/hy_strings.h b/src/core/include/hy_strings.h index 853ae8f99..d43e9a042 100644 --- a/src/core/include/hy_strings.h +++ b/src/core/include/hy_strings.h @@ -268,6 +268,7 @@ class _String : public BaseObj { specifically); also added a check that the # of chars read was the same as the one requested. */ + _String(hyFile *file, long read_this_many = -1L); _String(FILE *file, long read_this_many = -1L); /** diff --git a/src/core/include/hy_types.h b/src/core/include/hy_types.h index b62a2238d..fca090501 100644 --- a/src/core/include/hy_types.h +++ b/src/core/include/hy_types.h @@ -40,6 +40,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #ifndef __HY_TYPES__ #define __HY_TYPES__ +#ifdef __ZLIB__ + #include +#endif + +#include + /** Put application-wide typedefs and enums here */ @@ -75,6 +81,47 @@ enum hyComparisonType { kCompareUndefined = 0xff }; +enum hyFileOpenMode { + kFileRead, + kFileReadBinary, + kFileReadWrite, + kFileWrite, + kFileWriteBinary, + kFileAppend +}; +class hyFile { + public: + hyFile (void) { + _fileReference = NULL; + #ifdef __ZLIB__ + _fileReferenceDirect = NULL; + #endif + } + static hyFile* openFile (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress = false, long buffer = 1024*128); + void lock (void); + void unlock (void); + void rewind (void); + void seek (long, int); + void flush (void); + int close (); + bool feof (void); + unsigned long read (void* buffer, unsigned long size, unsigned long items); + size_t fwrite( const void* buffer, size_t size, size_t count); + int puts(const char *str); + int fputc(int chr); + size_t tell (); + int getc (); + #ifdef __ZLIB__ + inline bool valid (void) const {return _fileReference != NULL || _fileReferenceDirect != NULL;} + gzFile _fileReference; + FILE * _fileReferenceDirect; + bool is_compressed (void) const { return _fileReference != NULL;} + #else + inline bool valid (void) const {return _fileReference != NULL;} + bool is_compressed (void) const { return false;} + FILE* _fileReference; + #endif +}; #endif diff --git a/src/core/include/list.h b/src/core/include/list.h index 9ce7ffb9e..26b3869d0 100644 --- a/src/core/include/list.h +++ b/src/core/include/list.h @@ -402,7 +402,7 @@ class _List:public _SimpleList { /** */ - virtual void toFileStr(FILE*, unsigned long = 0UL); + virtual void toFileStr(hyFile*, unsigned long = 0UL); /** Generate a string that is not present in the list. diff --git a/src/core/include/matrix.h b/src/core/include/matrix.h index 4a59d2b15..c4a6e9b4e 100644 --- a/src/core/include/matrix.h +++ b/src/core/include/matrix.h @@ -407,7 +407,7 @@ class _Matrix: public _MathObject { virtual BaseRef toStr (unsigned long = 0UL); // convert this matrix to a string - virtual void toFileStr (FILE*dest, unsigned long = 0UL); + virtual void toFileStr (hyFile *dest, unsigned long = 0UL); bool AmISparse (void); @@ -428,8 +428,8 @@ class _Matrix: public _MathObject { bool CompareMatrices (_Matrix const* rhs, hyFloat tolerance = kMachineEpsilon) const; - void ExportMatrixExp (_Matrix*, FILE*); - bool ImportMatrixExp (FILE*); + void ExportMatrixExp (_Matrix*, hyFile*); + bool ImportMatrixExp (hyFile*); hyFloat FisherExact (hyFloat, hyFloat, hyFloat); @@ -627,7 +627,7 @@ class _Matrix: public _MathObject { private: - void internal_to_str (_StringBuffer*, FILE*, unsigned long padding); + void internal_to_str (_StringBuffer*, hyFile*, unsigned long padding); void SetupSparseMatrixAllocations (void); bool is_square_numeric (bool dense = true) const; diff --git a/src/core/include/polynoml.h b/src/core/include/polynoml.h index 415245d84..8341bbccf 100644 --- a/src/core/include/polynoml.h +++ b/src/core/include/polynoml.h @@ -158,7 +158,7 @@ class _Polynomial : public _MathObject { virtual BaseObj* toStr (unsigned long = 0UL); void CheckTerm(void); - virtual void toFileStr (FILE*, unsigned long = 0UL); + virtual void toFileStr (hyFile*, unsigned long = 0UL); long GetNoVariables(void) const { return variableIndex.countitems(); diff --git a/src/core/include/string_file_wrapper.h b/src/core/include/string_file_wrapper.h index 172616f5f..e714ea1f6 100644 --- a/src/core/include/string_file_wrapper.h +++ b/src/core/include/string_file_wrapper.h @@ -57,7 +57,7 @@ class StringFileWrapper { */ public: - StringFileWrapper (_StringBuffer * string, FILE * file); + StringFileWrapper (_StringBuffer * string, hyFile * file); /** Create a wrapper around around a string / file pair If both arguments are null, the wrapper will simply "eat" the bufferring operations (/dev/null equivalent). If both arguments @@ -108,7 +108,7 @@ class StringFileWrapper { private: _StringBuffer * string_buffer; - FILE* file_buffer; + hyFile* file_buffer; }; diff --git a/src/core/include/topology.h b/src/core/include/topology.h index 95ebd53b7..27f367dc2 100644 --- a/src/core/include/topology.h +++ b/src/core/include/topology.h @@ -185,7 +185,7 @@ class _TreeTopology: public _CalcNode { char rooted; - virtual void toFileStr (FILE*, unsigned long); + virtual void toFileStr (hyFile*, unsigned long); virtual BaseRef toStr (unsigned long = 0UL); void RerootTreeInternalTraverser (node* iterator, long, bool,_StringBuffer&, _TreeTopologyParseSettings const& settings, hyTopologyBranchLengthMode branch_length_mode, long variable_ref = -1L, bool = false) const; diff --git a/src/core/include/variable.h b/src/core/include/variable.h index c6349247c..2ca5f24f4 100644 --- a/src/core/include/variable.h +++ b/src/core/include/variable.h @@ -69,7 +69,7 @@ class _Variable : public _Constant { virtual void Duplicate (BaseRefConst); virtual BaseRef makeDynamic(void) const; virtual BaseRef toStr (unsigned long = 0UL); - virtual void toFileStr (FILE*, unsigned long = 0UL); + virtual void toFileStr (hyFile*, unsigned long = 0UL); virtual void MarkDone (void); diff --git a/src/core/likefunc.cpp b/src/core/likefunc.cpp index 951332680..6fa1571a3 100644 --- a/src/core/likefunc.cpp +++ b/src/core/likefunc.cpp @@ -455,7 +455,7 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o static _String userStatusString; static clock_t userTimeStart; - FILE *outFile = fileName?doFileOpen (fileName->get_str(),"w"):nil; + hyFile *outFile = fileName?doFileOpen (fileName->get_str(),kFileWrite):nil; _FString* t; static TimeDifference timer; @@ -463,9 +463,6 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o if (init==0) { lCount = likeFuncEvalCallCount; timer.Start(); -#ifndef _MINGW32_MEGA_ - setvbuf (stdout,nil, _IONBF,1); -#endif lastDone = 0; userTimeStart = clock(); checkParameter (optimizationStringQuantum, update_quantum, 0.0); @@ -516,50 +513,41 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o } if (outFile) { - fprintf (outFile,"%s", reportString.get_str()); + outFile->puts (reportString.get_str()); } else -#ifndef _MINGW32_MEGA_ printf ("\015%s", reportString.get_str()); -#else - SetStatusLine (reportString); -#endif } else { char buffer [1024]; if (optimization) { - if (outFile) - fprintf (outFile,"Likelihood function optimization status\nCurrent Maximum: %-14.8g (%ld %% done)\nLikelihood Function evaluations/second: %-8.4g", (double)max, pdone, - (likeFuncEvalCallCount-lCount)/elapsed_time); - else { - long written = snprintf (buffer,1024,"Current Max: %-14.8g (%ld %% done) LF Evals/Sec: %-8.4g", (double)max, pdone, (likeFuncEvalCallCount-lCount)/elapsed_time); + + long written = snprintf (buffer,1024,"Current Max: %-14.8g (%ld %% done) LF Evals/Sec: %-8.4g", (double)max, pdone, (likeFuncEvalCallCount-lCount)/elapsed_time); - if (elapsed_time) { - snprintf (buffer+written,1024-written, "CPU Load: %-8.4g", (clock()-userTimeStart)/((hyFloat)CLOCKS_PER_SEC*elapsed_time)); - } + if (elapsed_time) { + snprintf (buffer+written,1024-written, "CPU Load: %-8.4g", (clock()-userTimeStart)/((hyFloat)CLOCKS_PER_SEC*elapsed_time)); } } else { snprintf (buffer, 1024, "Sites done: %g (%ld %% done)", (double)max, pdone); } -#ifndef _MINGW32_MEGA_ - printf ("\015%s", buffer); -#else - SetStatusLine (_String(buffer)); -#endif + if (outFile) { + outFile->puts (buffer); + } else { + printf ("\015%s", buffer); + } } } else { if (outFile) { - fprintf (outFile,"DONE"); + outFile->puts ("DONE"); } else { -#ifndef _MINGW32_MEGA_ printf ("\033\015 "); setvbuf (stdout,nil,_IOLBF,1024); -#endif } } if (outFile) { - fclose (outFile); + outFile->close(); + delete (outFile); } } @@ -5168,7 +5156,7 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) void _LikelihoodFunction::_TerminateAndDump(const _String &error, bool sig_term) { - FILE * out = doFileOpen ("/tmp/hyphy.dump", "w"); + hyFile * out = doFileOpen ("/tmp/hyphy.dump", kFileWrite); _String err ("Internal error "); @@ -5178,8 +5166,9 @@ void _LikelihoodFunction::_TerminateAndDump(const _String &error, bool sig_te _StringBuffer sLF (8192L); SerializeLF (sLF,_hyphyLFSerializeModeVanilla); sLF.TrimSpace(); - fwrite ((void*)sLF.get_str(), 1, sLF.length(), out); - fclose (out); + out->fwrite ((void*)sLF.get_str(), 1, sLF.length()); + out->close(); + delete out; } HandleApplicationError (err & '\n' & error, true); @@ -10380,8 +10369,8 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { for (unsigned i = 0; i < this_site_count; i++) { for (unsigned long discrete_category_index = 0UL; - discrete_category_index < discrete_category_variables.lLength; - discrete_category_index++) { + discrete_category_index < discrete_category_variables.lLength; + discrete_category_index++) { _CategoryVariable* discrete_cat = GetIthCategoryVar(discrete_category_variables(discrete_category_index)); @@ -10407,8 +10396,6 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { while (good_sites < this_site_count) { - - if (category_simulation_mode != kLFSimulateCategoriesNone ) { for (unsigned long hmm_category_index =0UL; @@ -10528,9 +10515,11 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { } - target.ResetIHelper(); - for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { - target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + if (!target.InternalStorageMode()) { + target.ResetIHelper(); + for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { + target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + } } if (storeIntermediates ) { @@ -10540,9 +10529,11 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { target.Write2Site (site_offset_raw + leaf_count - sites_per_unit + character_index, simulated_unit (character_index)); } - target.ResetIHelper(); - for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { - target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + if (!target.InternalStorageMode()) { + target.ResetIHelper(); + for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { + target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + } } } } else { @@ -10581,7 +10572,7 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { if (storeIntermediates && !this_tree->IsDegenerate()) { if (storeIntermediates->nonempty()) { - FILE * file_for_ancestral_sequences = doFileOpen (storeIntermediates->get_str(),"w"); + hyFile * file_for_ancestral_sequences = doFileOpen (storeIntermediates->get_str(),kFileWrite); if (!file_for_ancestral_sequences) { HandleApplicationError (_String ("Failed to open ") & storeIntermediates->Enquote() & " for writing."); target.Finalize(); diff --git a/src/core/list.cpp b/src/core/list.cpp index 35645d609..0dbe4cdb3 100644 --- a/src/core/list.cpp +++ b/src/core/list.cpp @@ -577,17 +577,17 @@ void _List::Replace (long index, BaseRef newObj, bool dup) { // Char* conversion //TODO: toFileStr should be ToFileStr to follow convention. -void _List::toFileStr(FILE* dest, unsigned long) { - fprintf (dest,"{"); +void _List::toFileStr(hyFile* dest, unsigned long) { + dest->puts ("{"); if (lLength) { GetItem(0)->toFileStr(dest); for (unsigned long i = 1UL; iputs (", "); GetItem(i)->toFileStr(dest); } } - fprintf (dest,"}"); + dest->puts ("}"); } // Char* conversion diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 5cc12c623..f78744818 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -8228,7 +8228,7 @@ _Matrix _Matrix::operator - (_Matrix& m) } //_________________________________________________________ -void _Matrix::internal_to_str (_StringBuffer* string, FILE * file, unsigned long padding) { +void _Matrix::internal_to_str (_StringBuffer* string, hyFile * file, unsigned long padding) { StringFileWrapper res (string, file); _String padder (" ", padding); @@ -8331,7 +8331,7 @@ void _Matrix::internal_to_str (_StringBuffer* string, FILE * file, unsigned l } } //_________________________________________________________ -void _Matrix::toFileStr (FILE*dest, unsigned long padding){ +void _Matrix::toFileStr (hyFile *dest, unsigned long padding){ internal_to_str(nil, dest, padding); } //_____________________________________________________________________________________________ @@ -8381,14 +8381,14 @@ void _Matrix::InitMxVar (_SimpleList& mxVariables, hyFloat glValue) { }); } //_____________________________________________________________________________________________ -bool _Matrix::ImportMatrixExp (FILE* theSource) { +bool _Matrix::ImportMatrixExp (hyFile* theSource) { // TODO: SLKP 20171027, need to review and possibly deprecate long mDim=0,i,k=0,j,m; char buffer[255],fc=0; buffer[0]=0; while(1) { - buffer[mDim]=fgetc(theSource); - if (feof(theSource)) { + buffer[mDim]=theSource->getc(); + if (theSource->feof()) { return false; } if (buffer[mDim]==',') { @@ -8404,11 +8404,10 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { i = 0; _SimpleList varList,c1,c2; while (fc!=';') { - fc = fgetc (theSource); - if ((fc==',')||(fc==';')) { + fc = theSource->getc(); + if ( fc==',' || fc==';') { buffer [i] = 0; _String varName (buffer); - _Variable * ppv = CheckReceptacle (&varName, kEmptyString, true); varList << ppv->get_index(); i = 0; @@ -8416,13 +8415,13 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { buffer[i]=fc; i++; } - if (feof(theSource)) { + if (theSource->feof()) { return false; } } do { - fc = fgetc (theSource); - if (feof(theSource)) { + fc = theSource->getc(); + if (theSource->feof()) { return false; } } while (fc!=';'); @@ -8432,10 +8431,10 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (kgetc(); buffer[i] = fc; i++; - if (feof(theSource)) { + if (theSource->feof()) { return false; } } @@ -8446,13 +8445,13 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if (theSource->feof()) { DeleteObject (thisCell); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}'); buffer[i]=0; theCoeffs[j]=atof (buffer); j++; @@ -8461,7 +8460,7 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { return false; } } - fc = fgetc(theSource); + fc = theSource->getc(); if (fc != '{') { DeleteObject (thisCell); return false; @@ -8472,18 +8471,18 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if (theSource->feof()) { DeleteObject (thisCell); DeleteObject (pd); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}'); buffer[i]=0; c1<getc(); if (fc != '{') { DeleteObject (thisCell); DeleteObject (pd); @@ -8493,14 +8492,14 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if (theSource->feof()) { DeleteObject (thisCell); DeleteObject (pd); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}' ); buffer[i]=0; c2<puts (buffer); _SimpleList mxVariables; { _AVLList mxA (&mxVariables); @@ -8546,11 +8549,11 @@ void _Matrix::ExportMatrixExp (_Matrix* theBase, FILE* theDump) long k, i=0; hyFloat* varPool = (hyFloat*)MatrixMemAllocate (mxVariables.countitems()*sizeof(hyFloat)); for (k=0; kGetName()->get_str()); + theDump->puts (LocateVar(mxVariables(k))->GetName()->get_str()); if (kfputc (','); } else { - fprintf (theDump,"%c",';'); + theDump->fputc (';'); } varPool[k]=topPolyCap; } @@ -8565,7 +8568,8 @@ void _Matrix::ExportMatrixExp (_Matrix* theBase, FILE* theDump) DeleteObject(dummy); checkParameter (ANAL_MATRIX_TOLERANCE,analMatrixTolerance,1e-6); - fprintf (theDump,"%g,%g;",analMatrixTolerance,topPolyCap); + snprintf (buffer, 256, "%g,%g;",analMatrixTolerance,topPolyCap); + theDump->puts (buffer); // now loop thru the cells and check the precision term by term for (k=0; kputs (buffer); for (i=0; i<=tup; i++) { if (i) { - fprintf(theDump,",%18.16g",coeffHolder[i]); + snprintf (buffer,256, ",%18.16g",coeffHolder[i]); } else { - fprintf(theDump,"%18.16g",coeffHolder[i]); + snprintf (buffer, 256,"%18.16g",coeffHolder[i]); } + theDump->puts (buffer); } - fprintf(theDump,"}%ld",tup); + snprintf (buffer,256,"}%ld",tup); + theDump->puts (buffer); c1.toFileStr(theDump); c2.toFileStr(theDump); MatrixMemFree (coeffHolder); diff --git a/src/core/polynoml.cpp b/src/core/polynoml.cpp index f8f7641de..3e3954397 100644 --- a/src/core/polynoml.cpp +++ b/src/core/polynoml.cpp @@ -2730,36 +2730,37 @@ BaseObj* _Polynomial::toStr (unsigned long padding) { //__________________________________________________________________________________ -void _Polynomial::toFileStr (FILE*f, unsigned long padding) +void _Polynomial::toFileStr (hyFile *f, unsigned long padding) { if (theTerms->NumberOfTerms()&&theTerms->thePowers) { - fprintf(f,"p("); + f->puts ("p("); _List _varNames; long i; for (i=0; iGetName(); - fprintf(f,"%s",((_String*)_varNames(i))->get_str()); + f->puts (((_String*)_varNames(i))->get_str()); if (iputs (","); } } - fprintf(f,")="); + f->puts (")="); for (i=0; iNumberOfTerms(); i++) { char number [100]; snprintf (number, sizeof(number),PRINTF_FORMAT_STRING,theTerms->GetCoeff(i)); if ((i>0)&&(number[0]!='-')) { - fprintf(f,"+"); + f->puts ("+"); } - fprintf(f,"%s",number); + f->puts (number); if ((i>0)||!theTerms->IsFirstANumber()) { - fprintf(f,"*"); + f->puts ("*"); long *cT = theTerms->GetTerm(i); for (long k=0; k0) { - fprintf(f,"%s",((_String*)_varNames(k))->get_str()); + f->puts (((_String*)_varNames(k))->get_str()); if (*cT>1) { - fprintf(f,"^");; - fprintf(f,"%ld",*cT); + f->puts ("^"); + snprintf (number, sizeof(number),"%ld",*cT); + f->puts (number); } } } diff --git a/src/core/string_file_wrapper.cpp b/src/core/string_file_wrapper.cpp index 066adbfd3..09ab7ab8c 100644 --- a/src/core/string_file_wrapper.cpp +++ b/src/core/string_file_wrapper.cpp @@ -39,7 +39,7 @@ #include "string_file_wrapper.h" -StringFileWrapper::StringFileWrapper (_StringBuffer *string, FILE *file) { +StringFileWrapper::StringFileWrapper (_StringBuffer *string, hyFile *file) { string_buffer = string; file_buffer = file; } @@ -48,7 +48,7 @@ StringFileWrapper& StringFileWrapper::operator << (const char* buffer) { if (string_buffer) { *string_buffer << buffer; } else if (file_buffer) { - fprintf (file_buffer, buffer); + file_buffer->puts (buffer); } return *this; } @@ -57,7 +57,7 @@ StringFileWrapper& StringFileWrapper::operator << (const char letter) { if (string_buffer) { *string_buffer << letter; } else if (file_buffer) { - fputc (letter, file_buffer); + file_buffer->fputc (letter); } return *this; } @@ -66,7 +66,7 @@ StringFileWrapper& StringFileWrapper::operator << (const _String& buffer) { if (string_buffer) { *string_buffer << buffer; } else if (file_buffer) { - fprintf (file_buffer, "%s", (const char*)buffer); + file_buffer->puts ((const char*)buffer); } return *this; } @@ -91,14 +91,14 @@ StringFileWrapper& StringFileWrapper::operator << (const StringFileWrapperConsta } else if (file_buffer) { switch (special) { case kStringFileWrapperNewLine: - fputc ('\n', file_buffer); + file_buffer->fputc ('\n'); break; case kStringFileWrapperLinefeed: - fputc ('\r', file_buffer); - break; + file_buffer->fputc ('\r'); + break; case kStringFileWrapperTab: - fputc ('\t', file_buffer); - break; + file_buffer->fputc ('\t'); + break; } } return *this; diff --git a/src/core/strings.cpp b/src/core/strings.cpp index 363684989..3f17bb170 100644 --- a/src/core/strings.cpp +++ b/src/core/strings.cpp @@ -282,6 +282,26 @@ _String::_String(FILE * file, long read_this_many) { } } +//============================================================= +_String::_String(hyFile * file, long read_this_many) { + _String::Initialize (); + if (file) { + if (read_this_many < 0) { + file->seek (0, SEEK_END); + s_length = (unsigned long) file->tell(); + file->rewind(); + } else { + s_length = read_this_many; + } + s_data = (char *)MemAllocate(s_length + 1UL); + unsigned long read_items = file->read(s_data, 1, s_length); + if (read_items < s_length) { + s_data = (char*)MemReallocate(s_data,read_items+1); + s_length = read_items; + } + s_data[s_length] = '\0'; + } +} //============================================================= _String::~_String(void) { if (CanFreeMe()) { diff --git a/src/core/topology.cpp b/src/core/topology.cpp index fb151cbf0..ec6bca60b 100644 --- a/src/core/topology.cpp +++ b/src/core/topology.cpp @@ -957,9 +957,9 @@ BaseRef _TreeTopology::toStr (unsigned long) { } //__________________________________________________________________________________ -void _TreeTopology::toFileStr(FILE* f, unsigned long padding) { +void _TreeTopology::toFileStr(hyFile * f, unsigned long padding) { _String * s = (_String*)toStr(padding); - fprintf (f, "%s", s->get_str()); + f->puts (s->get_str()); DeleteObject(s); } diff --git a/src/core/variable.cpp b/src/core/variable.cpp index 0f5fab881..e1e2285a0 100644 --- a/src/core/variable.cpp +++ b/src/core/variable.cpp @@ -167,14 +167,14 @@ BaseRef _Variable::toStr(unsigned long padding) } //__________________________________________________________________________________ -void _Variable::toFileStr(FILE* f, unsigned long padding) +void _Variable::toFileStr(hyFile* f, unsigned long padding) { if (varValue&&varValue->IsPrintable()) { varValue->toFileStr(f, padding); } else { HBLObjectRef vv = Compute(); if (!vv) { - fprintf(f,"NAN"); + f->puts ("NAN"); } else { vv->toFileStr(f, padding); } diff --git a/src/mains/unix.cpp b/src/mains/unix.cpp index 8eed24af6..19c867ffe 100644 --- a/src/mains/unix.cpp +++ b/src/mains/unix.cpp @@ -283,10 +283,10 @@ void ReadInTemplateFiles(void) { _String dir_sep (get_platform_directory_char()), fileIndex = *((_String*)pathNames(0)) & hy_standard_library_directory & dir_sep & "files.lst"; - FILE* modelList = fopen (fileIndex.get_str(),"r"); + hyFile * modelList = doFileOpen (fileIndex.get_str(),kFileRead, false); if (!modelList) { fileIndex = baseArgDir& hy_standard_library_directory & dir_sep & "files.lst"; - modelList = fopen (fileIndex.get_str(),"r"); + modelList = doFileOpen (fileIndex.get_str(),kFileRead, false); if (!modelList) { return; } @@ -295,7 +295,8 @@ void ReadInTemplateFiles(void) { } _String theData (modelList); - fclose (modelList); + modelList->close(); + delete (modelList); if (theData.length()) { _List extracted_files; diff --git a/src/new/bayesgraph.cpp b/src/new/bayesgraph.cpp index 4e35e961d..48061d997 100644 --- a/src/new/bayesgraph.cpp +++ b/src/new/bayesgraph.cpp @@ -77,7 +77,8 @@ _String #ifdef __UNIX__ void ConsoleBGMStatus (_String const statusLine, hyFloat percentDone, _String const * fileName) { - FILE *outFile = fileName?doFileOpen (fileName->get_str(),"w"):nil; + + hyFile *outFile = fileName?doFileOpen (fileName->get_str(),kFileWrite):nil; _String reportLine (statusLine); if (percentDone >= 0.0) { @@ -85,20 +86,21 @@ void ConsoleBGMStatus (_String const statusLine, hyFloat percentDone, _St } if (outFile) { - fprintf (outFile,"%s", reportLine.get_str()); + outFile->puts(reportLine.get_str()); } else if (verbosity_level == 1) { printf ("\033\015 %s", reportLine.get_str()); + if (percentDone < -1.5) { + printf ("\033\015 "); + setvbuf (stdout,nil,_IOLBF,1024); + } else if (percentDone < -0.5) { + setvbuf (stdout,nil, _IONBF,1); + } } - if (percentDone < -1.5) { - printf ("\033\015 "); - setvbuf (stdout,nil,_IOLBF,1024); - } else if (percentDone < -0.5) { - setvbuf (stdout,nil, _IONBF,1); - } - + if (outFile) { - fclose (outFile); + outFile->close(); + delete (outFile); } }