From 540ab3936cfd09c87f4cb5120780dcf2fc79b550 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 24 Oct 2024 15:55:51 -0700 Subject: [PATCH 01/31] The align_it and align_it_aa methods now belong to the module "Gotoh". --- alignment/gotoh/.gitignore | 5 ++ alignment/gotoh/Makefile | 161 ------------------------------------- alignment/gotoh/extconf.rb | 2 +- alignment/gotoh/gotoh.cpp | 8 +- 4 files changed, 10 insertions(+), 166 deletions(-) create mode 100644 alignment/gotoh/.gitignore delete mode 100644 alignment/gotoh/Makefile diff --git a/alignment/gotoh/.gitignore b/alignment/gotoh/.gitignore new file mode 100644 index 0000000..4ba3832 --- /dev/null +++ b/alignment/gotoh/.gitignore @@ -0,0 +1,5 @@ +Makefile +*.so +*.o +extconf.h +build/* \ No newline at end of file diff --git a/alignment/gotoh/Makefile b/alignment/gotoh/Makefile deleted file mode 100644 index d60bbe7..0000000 --- a/alignment/gotoh/Makefile +++ /dev/null @@ -1,161 +0,0 @@ - -SHELL = /bin/sh - -#### Start of system configuration section. #### - -srcdir = . -topdir = /opt/local/lib/ruby/1.8/i686-darwin13 -hdrdir = $(topdir) -VPATH = $(srcdir):$(topdir):$(hdrdir) -prefix = $(DESTDIR)/opt/local -exec_prefix = $(prefix) -sitearchdir = $(sitelibdir)/$(sitearch) -libexecdir = $(exec_prefix)/libexec -dvidir = $(docdir) -mandir = $(DESTDIR)/opt/local/share/man -datadir = $(datarootdir) -pdfdir = $(docdir) -infodir = $(datarootdir)/info -oldincludedir = $(DESTDIR)/usr/include -sitelibdir = $(sitedir)/$(ruby_version) -bindir = $(exec_prefix)/bin -archdir = $(rubylibdir)/$(arch) -sbindir = $(exec_prefix)/sbin -sitedir = $(libdir)/ruby/site_ruby -localstatedir = $(prefix)/var -localedir = $(datarootdir)/locale -datarootdir = $(prefix)/share -libdir = $(exec_prefix)/lib -sysconfdir = $(prefix)/etc -docdir = $(datarootdir)/doc/$(PACKAGE) -sharedstatedir = $(prefix)/com -includedir = $(prefix)/include -vendorlibdir = $(vendordir)/$(ruby_version) -vendorarchdir = $(vendorlibdir)/$(sitearch) -rubylibdir = $(libdir)/ruby/$(ruby_version) -psdir = $(docdir) -vendordir = $(DESTDIR)/opt/local/lib/ruby/vendor_ruby -htmldir = $(docdir) - -CC = /usr/bin/clang -LIBRUBY = $(LIBRUBY_SO) -LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a -LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME) -LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static - -RUBY_EXTCONF_H = -CFLAGS = -fno-common -pipe -Os -arch x86_64 -fno-common -pipe -fno-common $(cflags) -arch x86_64 -INCFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir) -DEFS = -CPPFLAGS = -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags) -CXXFLAGS = $(CFLAGS) -ldflags = -L. -L/opt/local/lib -Wl,-headerpad_max_install_names -arch x86_64 -dldflags = -archflag = -arch x86_64 -DLDFLAGS = $(ldflags) $(dldflags) $(archflag) -LDSHARED = $(CC) -dynamic -bundle -undefined suppress -flat_namespace -AR = ar -EXEEXT = - -RUBY_INSTALL_NAME = ruby1.8 -RUBY_SO_NAME = ruby -arch = i686-darwin13 -sitearch = i686-darwin13 -ruby_version = 1.8 -ruby = /opt/local/bin/ruby1.8 -RUBY = $(ruby) -RM = rm -f -MAKEDIRS = mkdir -p -INSTALL = /usr/bin/install -c -INSTALL_PROG = $(INSTALL) -m 0755 -INSTALL_DATA = $(INSTALL) -m 644 -COPY = cp - -#### End of system configuration section. #### - -preload = - -libpath = . $(libdir) /opt/local/lib -LIBPATH = -L. -L$(libdir) -L/opt/local/lib -DEFFILE = - -CLEANFILES = mkmf.log -DISTCLEANFILES = - -extout = -extout_prefix = -target_prefix = -LOCAL_LIBS = -LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lobjc -SRCS = alignment.cpp -OBJS = alignment.o -TARGET = alignment -DLLIB = $(TARGET).bundle -EXTSTATIC = -STATIC_LIB = - -BINDIR = $(bindir) -RUBYCOMMONDIR = $(sitedir)$(target_prefix) -RUBYLIBDIR = $(sitelibdir)$(target_prefix) -RUBYARCHDIR = $(sitearchdir)$(target_prefix) - -TARGET_SO = $(DLLIB) -CLEANLIBS = $(TARGET).bundle $(TARGET).il? $(TARGET).tds $(TARGET).map -CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak - -all: $(DLLIB) -static: $(STATIC_LIB) - -clean: - @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) - -distclean: clean - @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log - @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES) - -realclean: distclean -install: install-so install-rb - -install-so: $(RUBYARCHDIR) -install-so: $(RUBYARCHDIR)/$(DLLIB) -$(RUBYARCHDIR)/$(DLLIB): $(DLLIB) - $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR) -install-rb: pre-install-rb install-rb-default -install-rb-default: pre-install-rb-default -pre-install-rb: Makefile -pre-install-rb-default: Makefile -$(RUBYARCHDIR): - $(MAKEDIRS) $@ - -site-install: site-install-so site-install-rb -site-install-so: install-so -site-install-rb: install-rb - -.SUFFIXES: .c .m .cc .cxx .cpp .C .o - -.cc.o: - $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< - -.cxx.o: - $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< - -.cpp.o: - $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< - -.C.o: - $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< - -.c.o: - $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $< - -$(DLLIB): $(OBJS) Makefile - @-$(RM) $@ - $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS) - - - -$(OBJS): ruby.h defines.h - -### USED TO COMPILE .so FOR RUBY APPLICATIONS ### -# alignment: -# g++ gotoh.cpp -shared -DUSERUBY -DUSERUBY2 -I/usr/share/rvm/rubies/ruby-3.0.0/include/ruby-3.0.0 -I/usr/share/rvm/rubies/ruby-3.0.0/include/ruby-3.0.0/x86_64-linux -o alignment.X.r2.so -O3 -fPIC diff --git a/alignment/gotoh/extconf.rb b/alignment/gotoh/extconf.rb index a75b78f..0cde321 100644 --- a/alignment/gotoh/extconf.rb +++ b/alignment/gotoh/extconf.rb @@ -1,4 +1,4 @@ require "mkmf" create_header -create_makefile("alignment") +create_makefile("gotoh") diff --git a/alignment/gotoh/gotoh.cpp b/alignment/gotoh/gotoh.cpp index fb9ffb5..9a3cef6 100644 --- a/alignment/gotoh/gotoh.cpp +++ b/alignment/gotoh/gotoh.cpp @@ -852,11 +852,11 @@ void widen_gaps(string* seq) return ret; } - - extern "C" void Init_alignment() + extern "C" void Init_gotoh() { - rb_define_global_function("align_it", (VALUE(*)(...))align_it, 4); - rb_define_global_function("align_it_aa", (VALUE(*)(...))align_it_aa, 4); + VALUE gotoh = rb_define_module("Gotoh"); + rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4); + rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4); } #endif From 9f69098cea4a6ecc45d6e7a2ce1311e5e72497e6 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Fri, 25 Oct 2024 16:19:18 -0700 Subject: [PATCH 02/31] Added some setup for building a gem containing the Ruby Gotoh bindings. --- README.md | 23 +++++++++++++++++++++-- alignment/gotoh/extconf.rb | 4 ---- ruby/.gitignore | 1 + ruby/build_gem.bash | 5 +++++ ruby/ext/gotoh/extconf.rb | 4 ++++ ruby/gotoh.gemspec | 18 ++++++++++++++++++ ruby/lib/gotoh.rb | 1 + 7 files changed, 50 insertions(+), 6 deletions(-) delete mode 100644 alignment/gotoh/extconf.rb create mode 100644 ruby/.gitignore create mode 100644 ruby/build_gem.bash create mode 100644 ruby/ext/gotoh/extconf.rb create mode 100644 ruby/gotoh.gemspec create mode 100644 ruby/lib/gotoh.rb diff --git a/README.md b/README.md index 56a2c76..45fbd6c 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ to your `requirements.txt` file or similar method. # Ruby Bindings +The `/ruby` directory contains the directory structure required to build a +gem containing Gotoh's Ruby bindings. This gem, named `gotoh`, contains +a module called `Gotoh` that holds the `align_it` and `align_it_aa` functions. + To build Gotoh's Ruby bindings (which are called "alignment" instead of "gotoh" for arbitrary historical reasons), you'll need to have the following installed: @@ -65,8 +69,23 @@ for arbitrary historical reasons), you'll need to have the following installed: [RVM](https://rvm.io/) on Linux) - A C++ compiler -With these installed, you should be able to build the extension module with +The "canonical" environment for building this package is the CfE-internal +`cfe_ubuntu` Ruby image based on Ubuntu 20.04 (this old version is required +to run Ruby 2.2.2). + +In your build environment, you should be able to build the extension module +using the `build_gem.bash` script. By default this will make a gem with the +version number `0.1.0.pre`; set the environment variable `GOTOH_VERSION` before +building to assign a proper version number. + +## Manually building the Ruby bindings + +If you want to manually build the bindings, e.g. for testing/debugging/development, +copy `/ruby/ext/gotoh/extconf.rb` to the `/alignment/gotoh/` directory and +change the `create_makefile('gotoh/gotoh')` line to `create_makefile('gotoh')`; +then run ``` - ruby extconf.rb + ruby extconf.rb # generates a Makefile and other supporting files make ``` +which will build `gotoh.so`, which can be `require`d from Ruby. diff --git a/alignment/gotoh/extconf.rb b/alignment/gotoh/extconf.rb deleted file mode 100644 index 0cde321..0000000 --- a/alignment/gotoh/extconf.rb +++ /dev/null @@ -1,4 +0,0 @@ -require "mkmf" - -create_header -create_makefile("gotoh") diff --git a/ruby/.gitignore b/ruby/.gitignore new file mode 100644 index 0000000..c111b33 --- /dev/null +++ b/ruby/.gitignore @@ -0,0 +1 @@ +*.gem diff --git a/ruby/build_gem.bash b/ruby/build_gem.bash new file mode 100644 index 0000000..4fcf733 --- /dev/null +++ b/ruby/build_gem.bash @@ -0,0 +1,5 @@ +#! /usr/bin/env bash + +cp ../alignment/gotoh/gotoh.cpp ext/gotoh +gem build gotoh.gemspec +rm ext/gotoh/gotoh.cpp diff --git a/ruby/ext/gotoh/extconf.rb b/ruby/ext/gotoh/extconf.rb new file mode 100644 index 0000000..0fe49ba --- /dev/null +++ b/ruby/ext/gotoh/extconf.rb @@ -0,0 +1,4 @@ +require "mkmf" + +create_header +create_makefile('gotoh/gotoh') diff --git a/ruby/gotoh.gemspec b/ruby/gotoh.gemspec new file mode 100644 index 0000000..5de4683 --- /dev/null +++ b/ruby/gotoh.gemspec @@ -0,0 +1,18 @@ +Gem::Specification.new do |s| + s.name = "gotoh" + s.version = ENV['GOTOH_VERSION'] || '0.1.0.pre' + s.summary = "CfE implementation of the Gotoh sequence alignment algorithm" + s.files = [ + 'lib/gotoh.rb', + 'ext/gotoh/gotoh.cpp' + ] + s.extensions = [ + 'ext/gotoh/extconf.rb', + ] + s.authors = [ + "Conan Woods", + "Jamie Kai", + "David Rickett", + "Richard Liang" + ] +end diff --git a/ruby/lib/gotoh.rb b/ruby/lib/gotoh.rb new file mode 100644 index 0000000..31768f8 --- /dev/null +++ b/ruby/lib/gotoh.rb @@ -0,0 +1 @@ +require 'gotoh/gotoh' \ No newline at end of file From c82609117decb94ed57a2d8b292c1d3f493ceca0 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Tue, 29 Oct 2024 11:26:14 -0700 Subject: [PATCH 03/31] WIP: copying in the cfe_scripts version of _alignment.rb as a starting point. This currently doesn't do anything and is here in the interest of maintaining a sensible "history" for this file. --- ruby/_alignment.rb | 544 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 544 insertions(+) create mode 100644 ruby/_alignment.rb diff --git a/ruby/_alignment.rb b/ruby/_alignment.rb new file mode 100644 index 0000000..c328fb9 --- /dev/null +++ b/ruby/_alignment.rb @@ -0,0 +1,544 @@ +#TODO: Scoring algorithm to improve frame_align? + + +if(RUBY_PLATFORM =~ /(win|w)32$/) + if(RUBY_VERSION =~ /^2/) + require_relative 'alignment.windows.r22.so' + else + class String + def ord() #adding an ord method + return self.unpack('c')[0] + end + end + require 'ckwlib/alignment.windows.so' + end +elsif(RUBY_PLATFORM =~ /x86_64-linux/) #Ok, its probably not a mac, soo..... + if(RUBY_VERSION =~ /^2/) + require_relative 'alignment.linux64.r2.so' + else + class String + def ord() #adding an ord method + return self.unpack('c')[0] + end + end + require 'ckwlib/alignment.linux64.so' + end +elsif(RUBY_PLATFORM =~ /i686-darwin10/) + require 'ckwlib/alignment.macosx.so' +else + if(RUBY_VERSION =~ /^2/) + require_relative 'alignment.linux32.r2.so' + else + class String + def ord() #adding an ord method + return self.unpack('c')[0] + end + end + require 'ckwlib/alignment.linux32.so' + end +end + +#init $nucMat +$nucMat = Array.new(127) {Array.new(127) {-1.0} } +['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| + $nucMat[nuc.ord()][nuc.ord()] = 1.0 + $nucMat[nuc.ord()]['X'.ord()]=$nucMat['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N') +end +#bi-mixtures +$nucMat['A'.ord()]['R'.ord()]=$nucMat['R'.ord()]['A'.ord()]=1.0 +$nucMat['G'.ord()]['R'.ord()]=$nucMat['R'.ord()]['G'.ord()]=1.0 +$nucMat['C'.ord()]['Y'.ord()]=$nucMat['Y'.ord()]['C'.ord()]=1.0 +$nucMat['T'.ord()]['Y'.ord()]=$nucMat['Y'.ord()]['T'.ord()]=1.0 +$nucMat['G'.ord()]['K'.ord()]=$nucMat['K'.ord()]['G'.ord()]=1.0 +$nucMat['T'.ord()]['K'.ord()]=$nucMat['K'.ord()]['T'.ord()]=1.0 +$nucMat['C'.ord()]['M'.ord()]=$nucMat['M'.ord()]['C'.ord()]=1.0 +$nucMat['A'.ord()]['M'.ord()]=$nucMat['M'.ord()]['A'.ord()]=1.0 +$nucMat['C'.ord()]['S'.ord()]=$nucMat['S'.ord()]['C'.ord()]=1.0 +$nucMat['G'.ord()]['S'.ord()]=$nucMat['S'.ord()]['G'.ord()]=1.0 +$nucMat['T'.ord()]['W'.ord()]=$nucMat['W'.ord()]['T'.ord()]=1.0 +$nucMat['A'.ord()]['W'.ord()]=$nucMat['W'.ord()]['A'.ord()]=1.0 +#tri-mixtures +$nucMat['C'.ord()]['B'.ord()]=$nucMat['B'.ord()]['C'.ord()]=1.0 +$nucMat['G'.ord()]['B'.ord()]=$nucMat['B'.ord()]['G'.ord()]=1.0 +$nucMat['T'.ord()]['B'.ord()]=$nucMat['B'.ord()]['T'.ord()]=1.0 +$nucMat['A'.ord()]['D'.ord()]=$nucMat['D'.ord()]['A'.ord()]=1.0 +$nucMat['G'.ord()]['D'.ord()]=$nucMat['D'.ord()]['G'.ord()]=1.0 +$nucMat['T'.ord()]['D'.ord()]=$nucMat['D'.ord()]['T'.ord()]=1.0 +$nucMat['A'.ord()]['H'.ord()]=$nucMat['H'.ord()]['A'.ord()]=1.0 +$nucMat['C'.ord()]['H'.ord()]=$nucMat['H'.ord()]['C'.ord()]=1.0 +$nucMat['T'.ord()]['H'.ord()]=$nucMat['H'.ord()]['T'.ord()]=1.0 +$nucMat['A'.ord()]['V'.ord()]=$nucMat['V'.ord()]['A'.ord()]=1.0 +$nucMat['C'.ord()]['V'.ord()]=$nucMat['V'.ord()]['C'.ord()]=1.0 +$nucMat['G'.ord()]['V'.ord()]=$nucMat['V'.ord()]['G'.ord()]=1.0 +#other +$nucMat['$'.ord()]['$'.ord()]=50.0 +$nucMat['T'.ord()]['U'.ord()] = $nucMat['U'.ord()]['T'.ord()] = 1.0 +$nucMat['N'.ord()]['N'.ord()] = 0.0 +$nucMat['X'.ord()]['-'.ord()]=$nucMat['X'.ord()]['-'.ord()]=3.0 +['A','T','G','C'].each do |ch| + $nucMat[ch.ord()]['*'.ord()]=$nucMat['*'.ord()][ch.ord()]=1.0 + $nucMat[ch.ord()]['&'.ord()]=$nucMat['&'.ord()][ch.ord()]=0.7 + $nucMat[ch.ord()]['$'.ord()]=$nucMat['$'.ord()][ch.ord()]=0.0 + $nucMat[ch.ord()]['.'.ord()]=$nucMat['.'.ord()][ch.ord()]=-20.0 + $nucMat[ch.ord()]['N'.ord()]=$nucMat['N'.ord()][ch.ord()]=-3.0 +end + + + +def score_alignment(seqa, seqb) + sc = 0.0 + 0.upto(seqa.size() - 1) do |i| + sc += $nucMat[seqa[i,1].upcase().ord()][seqb[i,1].upcase().ord()] + end + return sc +end + +def make_gap_list(seq) + list = [] + cur_ins = nil + prev_i = nil + 0.upto(seq.size() - 1) do |i| + if(seq[i,1] == '-') + if(prev_i and i == prev_i + 1) + cur_ins << i + prev_i = i + else + list << cur_ins if(cur_ins != nil and cur_ins != []) + cur_ins = [i] + prev_i = i + end + end + end + list << cur_ins if(cur_ins != nil and cur_ins != []) + return list +end + +#common_insert_locations is based on amino acid locations starting at base 0. +#Assumes standard in the first base. +#Prealign lets you run a lot of the corrections and qc on a already aligned sequence. +def frame_align(seqa, seqb, gap_init=3, gap_penalty=1, common_insert_locations=[], trim=false, raise_errors=false, prealigned=false) + elem = nil + if(prealigned) + elem = [seqa, seqb] + else + elem = align_it(seqa, seqb, gap_init, gap_penalty) + end + puts "Wierd sizes Z" if(elem[0].size() != elem[1].size()) + + #Do trimming? + if(trim and elem[0] =~ /^(-+)[^-]/) + elem[0][0,$1.size()] = '' + elem[1][0,$1.size()] = '' + end + if(trim and elem[0] =~ /[^-](-+)$/) + elem[1][(elem[0].size() - $1.size()), $1.size()] = '' + elem[0][(elem[0].size() - $1.size()), $1.size()] = '' + end + + #Start + if(trim and elem[1][0,1] == '-') + #get rid of edges that are the wrong size + elem[1] =~ /^(-+)[^-]/ + #Make sure its a multiple of three + dashes = $1 + if(dashes == nil) + #pass + elsif((dashes.size() % 3) == 1) + elem[1][dashes.size(),1] = '-' + elem[1][dashes.size() + 1,1] = '-' + elsif((dashes.size() % 3) == 2) + elem[1][dashes.size(),1] = '-' + end + end + + #end + if(trim and elem[1][-1,1] == '-') + #get rid of edges that are the wrong size + elem[1] =~ /[^-](-+)$/ + #Make sure its a multiple of three + dashes = $1 + if(dashes == nil) + #pass + elsif((dashes.size() % 3) == 1) + elem[1][(elem[1].size() - dashes.size()) - 1] = '-' + elem[1][(elem[1].size() - dashes.size()) - 2] = '-' + elsif((dashes.size() % 3) == 2) + elem[1][(elem[1].size() - dashes.size()) - 1] = '-' + end + end + + + #try to merge deletions and insertions if things aren't looking well. + #added 16-Nov-2018, helps fix poor insertions near the start. + if(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0) + dex = 0 #Don't start at 0, that way lies madness... Or does it??? + while(dex = elem[0].index(/-/, dex)) + #Now, find an ajacent dash in elem[1] to cancel + if((dex - 1 >= 0) and elem[1][dex - 1] == '-') + elem[0][dex] = '' + elem[1][dex - 1] = '' + dex = 1 + elsif(elem[1][dex + 1] == '-') + elem[1][dex + 1] = '' + elem[0][dex] = '' + dex = 1 + elsif((dex - 2 >= 0) and elem[1][dex - 2] == '-') + elem[1][dex - 2] = '' + elem[0][dex] = '' + dex = 1 + elsif(elem[1][dex + 2] == '-') + elem[1][dex + 2] = '' + elem[0][dex] = '' + dex = 1 + end + + #check to see if we fixed everything + if(!(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0)) + break + end + + dex += 1 + end + + end + + + #I wonder if these should throw exceptions by default, but have an option to ignore. + if(elem[0].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + #puts "Can not frame align" + raise "Can not frame align, #{elem[0].gsub(/[^-]/,'').size()} inserted bases not divisible by 3" + #return elem + end + if(elem[1].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + #puts "Can not frame align" + #puts elem[0] + #puts elem[1] + + raise "Can not frame align, #{elem[1].gsub(/[^-]/,'').size()} deleted bases not divisible by 3" + #return elem + end + + #Build the insert/delete lists. + insert_list = make_gap_list(elem[0]) + delete_list = make_gap_list(elem[1]) + #Now we have a list that looks like [[3,4,5], [9], [11,12]] + + + if(insert_list.size() > 0)#Inserts first + new_ins_list = [] + + #First step is clustering insertions. (v2: 16-Nov-2018) + insert_list.each_with_index do |ins, i| + next if(ins.size() == 0) #we already ate this one. + if(ins.size() % 3 == 0) #this insertion is fine! + new_ins_list << ins + next + end + + #Can I merge with the next insert? + if(insert_list[i + 1] and (ins + insert_list[i + 1]).size() % 3 == 0 and + (insert_list[i + 1].first - ins.last) < 9) + + ins2 = insert_list[i + 1] + if(ins2.size() > ins.size()) + new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 + else + new_ins_list << ins + ((ins.last + 1) .. (ins.last + ins2.size())).to_a() + end + insert_list[i + 1] = [] + #maybe merge with the next two inserts? + elsif(insert_list[i + 1] and insert_list[i + 2] and + (ins + insert_list[i + 1] + insert_list[i + 2]).size() % 3 == 0 and + (insert_list[i + 2].first - ins.last) < 12) + + ins2 = insert_list[i + 1] + ins3 = insert_list[i + 2] + if(true) #Lets just assume that if you need to combine 3 inserts, the middle one is where it goes. + new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 + ((ins2.last + 1) .. (ins2.last + ins3.size())).to_a() + end + + insert_list[i + 1] = [] + insert_list[i + 2] = [] + else #No merge, life sucks and then you die. + raise "Can not frame align insert" if(raise_errors) + end + end + +=begin + #First step is clustering insertions. (v1: old version) + insert_list.each_with_index do |ins, i| + next_ins = insert_list[i + 1] + + if(ins.size() % 3 != 0) #Wrong size! + #Look for next insertions that would make it the right size and are not far apart + if(!outta_frame and next_ins and (((next_ins.size() + ins.size()) % 3) == 0) and next_ins[0] - ins[-1] < 8 ) #within 8 bases + if(next_ins.size() > ins.size()) #scoring would be good here + ins.each do |a| + next_ins.insert(0, next_ins[0] - 1) #Insert at start + end + else + next_ins.each do |a| + ins.insert(-1, ins[0] + 1) #insert at end + end + insert_list[i + 1] = ins #Pushing this problem ahead for convinence. + end + #don't insert into new list, as we've pushed it to the next element. + else + #I wonder if we should try to merge multiple more than two inserts? Eh, nah. + #return elem + outta_frame = true + raise "Can not frame align insert" if(raise_errors) + end + else + new_ins_list << ins #this insert is okay + end + + end +=end + + #puts insert_list.inspect + #puts new_ins_list.inspect + + #second step should be to frame align inserts (prioritizing to common_points) + offset = 0 #offset created by previous insertions. (IMPORTANT) + new_ins_list.each do |ins| + #see if its close to a common_insert(within 3 amino acids?) + min_common = common_insert_locations.min(){|a,b| ((a) * 3 - (ins[0] - offset)).abs() <=> ((b) * 3 - ins[0]).abs()} + if(min_common != nil and ((min_common ) * 3 - (ins[0] - offset)).abs() <= 9) + #Cool, align to this common insert + new_ins = [] + 0.upto(ins.size() - 1) do |i| + new_ins << ((min_common) * 3) + i + offset + end + ins.replace(new_ins) + end + + #B frame align + #scoring would be good here + if(ins[0] % 3 == 1) #set back one base + new_ins = [] + ins.each do |i| + new_ins << i - 1 + end + ins.replace(new_ins) + elsif(ins[0] % 3 == 2) #Set forward one base. + new_ins = [] + ins.each do |i| + new_ins << i + 1 + end + ins.replace(new_ins) + end + + offset += ins.size() + end + + #make the actual modifications + #begin + #orige = "" + elem[0] + elem[0] = elem[0].gsub('-','') + new_ins_list.each do |ins| + ins.each do |i| + if(i > elem[0].size()) + elem[0].insert(-1, '-') + else + elem[0].insert(i, '-') + end + end + end + #rescue + # puts "OOH---------------------------------------------" + # puts orige.inspect + # puts elem.inspect + # puts new_ins_list.inspect() + # raise $! + #end + + end + + + #Deletion--------------------------------------------------------------------------------------------------- + outta_frame = false + if(delete_list.size() > 0)#Deletions second + new_del_list = [] + next_del = nil + + #First step is clustering deletions. (v2: 19-Nov-2018) + delete_list.each_with_index do |del, i| + next if(del.size() == 0) #we already ate this one. + if(del.size() % 3 == 0) #this insertion is fine! + new_del_list << del + next + end + + #Can I merge with the next delete? + if(delete_list[i + 1] and (del + delete_list[i + 1]).size() % 3 == 0 and + (delete_list[i + 1].first - del.last) < 9) + + del2 = delete_list[i + 1] + if(del2.size() > del.size()) + new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + else + new_del_list << del + ((del.last + 1) .. (del.last + del2.size())).to_a() + end + delete_list[i + 1] = [] + #maybe merge with the next two deletes? + elsif(delete_list[i + 1] and delete_list[i + 2] and + (del + delete_list[i + 1] + delete_list[i + 2]).size() % 3 == 0 and + (delete_list[i + 2].first - del.last) < 12) #slightly higher range, since we've already got a higher threshold of confidence here + + del2 = delete_list[i + 1] + del3 = delete_list[i + 2] + if(true) #Lets just assume that if you need to combine 3 deletes, the middle one is where it goes. + new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + ((del2.last + 1) .. (del2.last + del3.size())).to_a() + end + + delete_list[i + 1] = [] + delete_list[i + 2] = [] + else #No merge, life sucks and then you die. + new_del_list << delete_list[i] + end + end + +=begin + #First step is clustering deletions. (v1: old version) + delete_list.each_with_index do |del, i| + next_del = delete_list[i + 1] + + if(del.size() % 3 != 0 and !outta_frame) #Wrong size! + #Look for next deletions that would make it the right size and are not far apart + if(next_del and (((next_del.size() + del.size()) % 3) == 0) and next_del[0] - del[-1] < 6 ) #within 6 bases + if(next_del.size() > del.size()) #scoring would be good here + del.each do |a| + next_del.insert(0, next_del[0] - 1) #delete at start + end + else + next_del.each do |a| + del.insert(-1, del[0] + 1) #delete at end + end + delete_list[i + 1] = del #Pushing this problem ahead for convinence. + end + #don't delete into new list, as we've pushed it to the next element. + else + #I wonder if we should try to merge multiple more than two deletes? Eh, nah. + # + #return elem + new_del_list << del + outta_frame = true + #raise "Can not frame align delete" + end + else + new_del_list << del #this delete is okay + end + end +=end + + + #second step should be to frame align deletes + offset = 0 #offset created by previous deletions. (IMPORTANT) + new_del_list.each do |del| + next if(del.size() % 3 != 0) + #frame align + #scoring would be good here + if(del[0] % 3 == 1) #set back one base + new_del = [] + del.each do |i| + new_del << i - 1 + end + del.replace(new_del) + elsif(del[0] % 3 == 2) #Set forward one base. + new_del = [] + del.each do |i| + new_del << i + 1 + end + del.replace(new_del) + end + + offset += del.size() + end + + #make the actual modifications + elem[1] = elem[1].gsub('-','') + new_del_list.each do |del| + del.each do |i| + if(i > elem[1].size() ) + elem[1].insert(elem[1].size(), '-') + else + elem[1].insert(i, '-') + end + end + end + end + + return elem +end + +#Returns a [seq_sans_inserts, [list of inserts]] +def remove_inserts(elem) + seq = '' + elem[1] + inserts = [] + + insert_list = [] + 0.upto(elem[0].size() - 1) do |i| + insert_list << i if(elem[0][i,1] == '-') + end + + big_insert_list = [] + if(elem[0].include?('-'))#Inserts first + #First step should be to cluster inserts + cur_ins = nil + prev_i = nil + insert_list.each do |i| + if(prev_i and i == prev_i + 1) + cur_ins << i + prev_i = i + else + big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) + cur_ins = [i] + prev_i = i + end + end + big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) + end + + offset = 0 + big_insert_list.each do |ins| + ins_seq = '' + ins.each do |i| + ins_seq += elem[1][i,1] + end + inserts << [((ins[0] - offset) / 3), ins_seq] + offset += ins.size() + ins.each do |i| + seq[i,1] = '.' + end + end + + return [seq.gsub('.',''), inserts] +end + +=begin +seq = 'CCTCAAATCACTCTTTGGCAACGACCCTTAGTCACAGTAAGAATAGGGGGACAGCTAATAGAAGCCCTATTAGACACAGGAGCAGATGATACAGTATTAGAAGAAAAAATAGATTTACCAGGAAAATGGARACCAAAAATGATAGGGGGAATTGGAGGTTTTATTAAAGTAAGGCAATATGATCAGATACTTATGGAAATATGTGARAAGAAGGCCATAGGTACAGTATTAGTAGGACCTACMCCTGTCAACATAATTGGRCGRAATATGTTGACTCAGATTGGTTGTACTTTAAATTTTCCAATTAGTCCTATTGARACTGTGCCAGTAAAATTAAAGCCAGGGATGGATGGCCCAAAAGTTAARCAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAACAGAAATTTGTGCAGAAATGGAAAARGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTGTTTGCCATAAAGAAAAARGATAGTAMTAAATGGAGAAAATTAGTAGATTTCAGAGAACTCAATAAGAGAACTCAAGACTTCTGGGAGGTCCAATTAGGAATTCCTCATCCCGCGGGATTAAAAAAGAAAARATCAGTAACAGTACTAGATRTAGGGGATGCATATTTTTCAGTTCCCTTAGACAAAGAYTTTAGAAAGTATACTGCATTCACTATACCTAGTGTAAATAATGAAACACCAGGRATTAGATATCAGTACAATGTRCTKCCACAGGGATGGAAAGGATCACCAGCAATATTTCARGCAAGCATGACAAAAATCTTAGAGCCCTTTAGAACAAAAAATCCAGAGGTGGTGATCTACCAGTATATGGATGATTTATATGTAGGATCTGACTTAGAGATAGGGCAACATAGAGCAAAAATAGAGGARTTAAGAGAACATCTAYTGARATGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTTCTTTGGATGGGATATGAACTTCATCCTGACAAATGGACAGTCCAGCCTATARTRCTGCCARACAAAGRMRRCTGGACTGTCAATGATATACAGAAATTAGTAGGAAAACTAAATTGGGCCAGTCAAATTTATGCAGGAATTAAAGTAAAGCAACTGTGTAAACTCCTCAGGGGAGCCAAAGCATTAACAGAYATAGTAACAYTAACTGAGGAAGCAGAATTAGAATTGGCAGAGAACAGGGAAATTCTAAAAGAACCTGTACATGGGGTATAYTATGAYCCAGYAAAAGACTTAATAGCAGAAATACAGAAACAAGGGCAAGACCAATGGACATATCAAATATATCAAGARCCATTTAAAAATCTAAARACAGGAAAATATGCAAARAGGAGATCTGCCCACACRAATGATGTAAAACAATTAACAGAGGTAGTGCAAAAAGTGTCTACAGAARGCATAGTAATATGGGGGAAGAYCCCTAAATTTAAGCTGCCCATACAAAAAGAAACATGGGAGGCA' +std = 'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTC' + +elem = frame_align(std, seq, 9, 2, [35,168]) #Seems to be off by one. +puts remove_inserts(elem).inspect() +=end +=begin +#----testing----- +require 'ckwlib/io' + +std = "TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT" + +fasta = Io::read_fasta('motivate_preds_full_NucSeq.fas') + +File.open( + +fasta.each do |fas| + elem = frame_align(std, fas[1].strip(), 3, 1, [10]) + puts fas[0] + puts elem[0] + puts elem[1] + puts remove_inserts(elem).inspect() + puts score_alignment(elem[0], elem[1]) +end +=end \ No newline at end of file From fe4bc9189adb9ebef483b54bdf488ae80aa2e1ad Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Tue, 29 Oct 2024 11:38:37 -0700 Subject: [PATCH 04/31] WIP: getting files moved into their new places. --- ruby/build_gem.bash | 6 +++--- ruby/{gotoh.gemspec => cfe_gotoh.gemspec} | 10 +++++----- ruby/ext/{gotoh => cfe_gotoh}/extconf.rb | 0 ruby/{_alignment.rb => lib/cfe_gotoh.rb} | 2 ++ ruby/lib/gotoh.rb | 1 - 5 files changed, 10 insertions(+), 9 deletions(-) rename ruby/{gotoh.gemspec => cfe_gotoh.gemspec} (59%) rename ruby/ext/{gotoh => cfe_gotoh}/extconf.rb (100%) rename ruby/{_alignment.rb => lib/cfe_gotoh.rb} (99%) delete mode 100644 ruby/lib/gotoh.rb diff --git a/ruby/build_gem.bash b/ruby/build_gem.bash index 4fcf733..d2792bf 100644 --- a/ruby/build_gem.bash +++ b/ruby/build_gem.bash @@ -1,5 +1,5 @@ #! /usr/bin/env bash -cp ../alignment/gotoh/gotoh.cpp ext/gotoh -gem build gotoh.gemspec -rm ext/gotoh/gotoh.cpp +cp ../alignment/gotoh/gotoh.cpp ext/gotoh/cfe_gotoh.cpp +gem build cfe_gotoh.gemspec +rm ext/cfe_gotoh/cfe_gotoh.cpp diff --git a/ruby/gotoh.gemspec b/ruby/cfe_gotoh.gemspec similarity index 59% rename from ruby/gotoh.gemspec rename to ruby/cfe_gotoh.gemspec index 5de4683..0ad5fa3 100644 --- a/ruby/gotoh.gemspec +++ b/ruby/cfe_gotoh.gemspec @@ -1,13 +1,13 @@ Gem::Specification.new do |s| - s.name = "gotoh" - s.version = ENV['GOTOH_VERSION'] || '0.1.0.pre' + s.name = "cfe_gotoh" + s.version = ENV['CFE_GOTOH_VERSION'] || '0.1.0.pre' s.summary = "CfE implementation of the Gotoh sequence alignment algorithm" s.files = [ - 'lib/gotoh.rb', - 'ext/gotoh/gotoh.cpp' + 'lib/cfe_gotoh.rb', + 'ext/cfe_gotoh/cfe_gotoh.cpp' ] s.extensions = [ - 'ext/gotoh/extconf.rb', + 'ext/cfe_gotoh/extconf.rb', ] s.authors = [ "Conan Woods", diff --git a/ruby/ext/gotoh/extconf.rb b/ruby/ext/cfe_gotoh/extconf.rb similarity index 100% rename from ruby/ext/gotoh/extconf.rb rename to ruby/ext/cfe_gotoh/extconf.rb diff --git a/ruby/_alignment.rb b/ruby/lib/cfe_gotoh.rb similarity index 99% rename from ruby/_alignment.rb rename to ruby/lib/cfe_gotoh.rb index c328fb9..8934a16 100644 --- a/ruby/_alignment.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -1,5 +1,7 @@ #TODO: Scoring algorithm to improve frame_align? +require 'cfe_gotoh/cfe_gotoh' + if(RUBY_PLATFORM =~ /(win|w)32$/) if(RUBY_VERSION =~ /^2/) diff --git a/ruby/lib/gotoh.rb b/ruby/lib/gotoh.rb deleted file mode 100644 index 31768f8..0000000 --- a/ruby/lib/gotoh.rb +++ /dev/null @@ -1 +0,0 @@ -require 'gotoh/gotoh' \ No newline at end of file From 5da3994a32ecfc4c1a988efec07d10004be75c44 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Tue, 29 Oct 2024 12:05:33 -0700 Subject: [PATCH 05/31] Some light refactoring before attempting surgery. --- ruby/lib/cfe_gotoh.rb | 898 ++++++++++++++++++++---------------------- 1 file changed, 424 insertions(+), 474 deletions(-) diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 8934a16..9864557 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -3,544 +3,494 @@ require 'cfe_gotoh/cfe_gotoh' -if(RUBY_PLATFORM =~ /(win|w)32$/) - if(RUBY_VERSION =~ /^2/) - require_relative 'alignment.windows.r22.so' - else - class String - def ord() #adding an ord method - return self.unpack('c')[0] - end +module CfeGotoh + def _build_substitution_matrix + sub_matrix = Array.new(127) {Array.new(127) {-1.0} } + ['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| + sub_matrix[nuc.ord()][nuc.ord()] = 1.0 + sub_matrix[nuc.ord()]['X'.ord()]=sub_matrix['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N') end - require 'ckwlib/alignment.windows.so' - end -elsif(RUBY_PLATFORM =~ /x86_64-linux/) #Ok, its probably not a mac, soo..... - if(RUBY_VERSION =~ /^2/) - require_relative 'alignment.linux64.r2.so' - else - class String - def ord() #adding an ord method - return self.unpack('c')[0] - end + #bi-mixtures + sub_matrix['A'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['A'.ord()]=1.0 + sub_matrix['G'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['G'.ord()]=1.0 + sub_matrix['C'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['C'.ord()]=1.0 + sub_matrix['T'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['T'.ord()]=1.0 + sub_matrix['G'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['T'.ord()]=1.0 + sub_matrix['C'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['C'.ord()]=1.0 + sub_matrix['A'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['A'.ord()]=1.0 + #tri-mixtures + sub_matrix['C'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['A'.ord()]=1.0 + sub_matrix['G'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['C'.ord()]=1.0 + sub_matrix['T'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['G'.ord()]=1.0 + #other + sub_matrix['$'.ord()]['$'.ord()]=50.0 + sub_matrix['T'.ord()]['U'.ord()] = sub_matrix['U'.ord()]['T'.ord()] = 1.0 + sub_matrix['N'.ord()]['N'.ord()] = 0.0 + sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['X'.ord()]['-'.ord()]=3.0 + ['A','T','G','C'].each do |ch| + sub_matrix[ch.ord()]['*'.ord()]=sub_matrix['*'.ord()][ch.ord()]=1.0 + sub_matrix[ch.ord()]['&'.ord()]=sub_matrix['&'.ord()][ch.ord()]=0.7 + sub_matrix[ch.ord()]['$'.ord()]=sub_matrix['$'.ord()][ch.ord()]=0.0 + sub_matrix[ch.ord()]['.'.ord()]=sub_matrix['.'.ord()][ch.ord()]=-20.0 + sub_matrix[ch.ord()]['N'.ord()]=sub_matrix['N'.ord()][ch.ord()]=-3.0 end - require 'ckwlib/alignment.linux64.so' end -elsif(RUBY_PLATFORM =~ /i686-darwin10/) - require 'ckwlib/alignment.macosx.so' -else - if(RUBY_VERSION =~ /^2/) - require_relative 'alignment.linux32.r2.so' - else - class String - def ord() #adding an ord method - return self.unpack('c')[0] - end - end - require 'ckwlib/alignment.linux32.so' - end -end - -#init $nucMat -$nucMat = Array.new(127) {Array.new(127) {-1.0} } -['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| - $nucMat[nuc.ord()][nuc.ord()] = 1.0 - $nucMat[nuc.ord()]['X'.ord()]=$nucMat['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N') -end -#bi-mixtures -$nucMat['A'.ord()]['R'.ord()]=$nucMat['R'.ord()]['A'.ord()]=1.0 -$nucMat['G'.ord()]['R'.ord()]=$nucMat['R'.ord()]['G'.ord()]=1.0 -$nucMat['C'.ord()]['Y'.ord()]=$nucMat['Y'.ord()]['C'.ord()]=1.0 -$nucMat['T'.ord()]['Y'.ord()]=$nucMat['Y'.ord()]['T'.ord()]=1.0 -$nucMat['G'.ord()]['K'.ord()]=$nucMat['K'.ord()]['G'.ord()]=1.0 -$nucMat['T'.ord()]['K'.ord()]=$nucMat['K'.ord()]['T'.ord()]=1.0 -$nucMat['C'.ord()]['M'.ord()]=$nucMat['M'.ord()]['C'.ord()]=1.0 -$nucMat['A'.ord()]['M'.ord()]=$nucMat['M'.ord()]['A'.ord()]=1.0 -$nucMat['C'.ord()]['S'.ord()]=$nucMat['S'.ord()]['C'.ord()]=1.0 -$nucMat['G'.ord()]['S'.ord()]=$nucMat['S'.ord()]['G'.ord()]=1.0 -$nucMat['T'.ord()]['W'.ord()]=$nucMat['W'.ord()]['T'.ord()]=1.0 -$nucMat['A'.ord()]['W'.ord()]=$nucMat['W'.ord()]['A'.ord()]=1.0 -#tri-mixtures -$nucMat['C'.ord()]['B'.ord()]=$nucMat['B'.ord()]['C'.ord()]=1.0 -$nucMat['G'.ord()]['B'.ord()]=$nucMat['B'.ord()]['G'.ord()]=1.0 -$nucMat['T'.ord()]['B'.ord()]=$nucMat['B'.ord()]['T'.ord()]=1.0 -$nucMat['A'.ord()]['D'.ord()]=$nucMat['D'.ord()]['A'.ord()]=1.0 -$nucMat['G'.ord()]['D'.ord()]=$nucMat['D'.ord()]['G'.ord()]=1.0 -$nucMat['T'.ord()]['D'.ord()]=$nucMat['D'.ord()]['T'.ord()]=1.0 -$nucMat['A'.ord()]['H'.ord()]=$nucMat['H'.ord()]['A'.ord()]=1.0 -$nucMat['C'.ord()]['H'.ord()]=$nucMat['H'.ord()]['C'.ord()]=1.0 -$nucMat['T'.ord()]['H'.ord()]=$nucMat['H'.ord()]['T'.ord()]=1.0 -$nucMat['A'.ord()]['V'.ord()]=$nucMat['V'.ord()]['A'.ord()]=1.0 -$nucMat['C'.ord()]['V'.ord()]=$nucMat['V'.ord()]['C'.ord()]=1.0 -$nucMat['G'.ord()]['V'.ord()]=$nucMat['V'.ord()]['G'.ord()]=1.0 -#other -$nucMat['$'.ord()]['$'.ord()]=50.0 -$nucMat['T'.ord()]['U'.ord()] = $nucMat['U'.ord()]['T'.ord()] = 1.0 -$nucMat['N'.ord()]['N'.ord()] = 0.0 -$nucMat['X'.ord()]['-'.ord()]=$nucMat['X'.ord()]['-'.ord()]=3.0 -['A','T','G','C'].each do |ch| - $nucMat[ch.ord()]['*'.ord()]=$nucMat['*'.ord()][ch.ord()]=1.0 - $nucMat[ch.ord()]['&'.ord()]=$nucMat['&'.ord()][ch.ord()]=0.7 - $nucMat[ch.ord()]['$'.ord()]=$nucMat['$'.ord()][ch.ord()]=0.0 - $nucMat[ch.ord()]['.'.ord()]=$nucMat['.'.ord()][ch.ord()]=-20.0 - $nucMat[ch.ord()]['N'.ord()]=$nucMat['N'.ord()][ch.ord()]=-3.0 -end - + NUCLEOTIDE_MATRIX = self._build_substitution_matrix() -def score_alignment(seqa, seqb) - sc = 0.0 - 0.upto(seqa.size() - 1) do |i| - sc += $nucMat[seqa[i,1].upcase().ord()][seqb[i,1].upcase().ord()] + def self.score_alignment(seqa, seqb) + sc = 0.0 + 0.upto(seqa.size() - 1) do |i| + sc += NUCLEOTIDE_MATRIX[seqa[i,1].upcase().ord()][seqb[i,1].upcase().ord()] + end + return sc end - return sc -end -def make_gap_list(seq) - list = [] - cur_ins = nil - prev_i = nil - 0.upto(seq.size() - 1) do |i| - if(seq[i,1] == '-') - if(prev_i and i == prev_i + 1) - cur_ins << i - prev_i = i - else - list << cur_ins if(cur_ins != nil and cur_ins != []) - cur_ins = [i] - prev_i = i + + def self.make_gap_list(seq) + list = [] + cur_ins = nil + prev_i = nil + 0.upto(seq.size() - 1) do |i| + if(seq[i,1] == '-') + if(prev_i and i == prev_i + 1) + cur_ins << i + prev_i = i + else + list << cur_ins if(cur_ins != nil and cur_ins != []) + cur_ins = [i] + prev_i = i + end end end + list << cur_ins if(cur_ins != nil and cur_ins != []) + return list end - list << cur_ins if(cur_ins != nil and cur_ins != []) - return list -end -#common_insert_locations is based on amino acid locations starting at base 0. -#Assumes standard in the first base. -#Prealign lets you run a lot of the corrections and qc on a already aligned sequence. -def frame_align(seqa, seqb, gap_init=3, gap_penalty=1, common_insert_locations=[], trim=false, raise_errors=false, prealigned=false) - elem = nil - if(prealigned) - elem = [seqa, seqb] - else - elem = align_it(seqa, seqb, gap_init, gap_penalty) - end - puts "Wierd sizes Z" if(elem[0].size() != elem[1].size()) - - #Do trimming? - if(trim and elem[0] =~ /^(-+)[^-]/) - elem[0][0,$1.size()] = '' - elem[1][0,$1.size()] = '' - end - if(trim and elem[0] =~ /[^-](-+)$/) - elem[1][(elem[0].size() - $1.size()), $1.size()] = '' - elem[0][(elem[0].size() - $1.size()), $1.size()] = '' - end - - #Start - if(trim and elem[1][0,1] == '-') - #get rid of edges that are the wrong size - elem[1] =~ /^(-+)[^-]/ - #Make sure its a multiple of three - dashes = $1 - if(dashes == nil) - #pass - elsif((dashes.size() % 3) == 1) - elem[1][dashes.size(),1] = '-' - elem[1][dashes.size() + 1,1] = '-' - elsif((dashes.size() % 3) == 2) - elem[1][dashes.size(),1] = '-' - end - end - - #end - if(trim and elem[1][-1,1] == '-') - #get rid of edges that are the wrong size - elem[1] =~ /[^-](-+)$/ - #Make sure its a multiple of three - dashes = $1 - if(dashes == nil) - #pass - elsif((dashes.size() % 3) == 1) - elem[1][(elem[1].size() - dashes.size()) - 1] = '-' - elem[1][(elem[1].size() - dashes.size()) - 2] = '-' - elsif((dashes.size() % 3) == 2) - elem[1][(elem[1].size() - dashes.size()) - 1] = '-' - end - end - - - #try to merge deletions and insertions if things aren't looking well. - #added 16-Nov-2018, helps fix poor insertions near the start. - if(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0) - dex = 0 #Don't start at 0, that way lies madness... Or does it??? - while(dex = elem[0].index(/-/, dex)) - #Now, find an ajacent dash in elem[1] to cancel - if((dex - 1 >= 0) and elem[1][dex - 1] == '-') - elem[0][dex] = '' - elem[1][dex - 1] = '' - dex = 1 - elsif(elem[1][dex + 1] == '-') - elem[1][dex + 1] = '' - elem[0][dex] = '' - dex = 1 - elsif((dex - 2 >= 0) and elem[1][dex - 2] == '-') - elem[1][dex - 2] = '' - elem[0][dex] = '' - dex = 1 - elsif(elem[1][dex + 2] == '-') - elem[1][dex + 2] = '' - elem[0][dex] = '' - dex = 1 - end - - #check to see if we fixed everything - if(!(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0)) - break - end + + #common_insert_locations is based on amino acid locations starting at base 0. + #Assumes standard in the first base. + #Prealign lets you run a lot of the corrections and qc on a already aligned sequence. + def self.frame_align( + seqa, + seqb, + gap_init=3, + gap_penalty=1, + common_insert_locations=[], + trim=false, + raise_errors=false, + prealigned=false + ) + elem = nil + if(prealigned) + elem = [seqa, seqb] + else + elem = align_it(seqa, seqb, gap_init, gap_penalty) + end + puts "Wierd sizes Z" if(elem[0].size() != elem[1].size()) - dex += 1 + #Do trimming? + if(trim and elem[0] =~ /^(-+)[^-]/) + elem[0][0,$1.size()] = '' + elem[1][0,$1.size()] = '' + end + if(trim and elem[0] =~ /[^-](-+)$/) + elem[1][(elem[0].size() - $1.size()), $1.size()] = '' + elem[0][(elem[0].size() - $1.size()), $1.size()] = '' end - end - - - #I wonder if these should throw exceptions by default, but have an option to ignore. - if(elem[0].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - #puts "Can not frame align" - raise "Can not frame align, #{elem[0].gsub(/[^-]/,'').size()} inserted bases not divisible by 3" - #return elem - end - if(elem[1].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - #puts "Can not frame align" - #puts elem[0] - #puts elem[1] + #Start + if(trim and elem[1][0,1] == '-') + #get rid of edges that are the wrong size + elem[1] =~ /^(-+)[^-]/ + #Make sure its a multiple of three + dashes = $1 + if(dashes == nil) + #pass + elsif((dashes.size() % 3) == 1) + elem[1][dashes.size(),1] = '-' + elem[1][dashes.size() + 1,1] = '-' + elsif((dashes.size() % 3) == 2) + elem[1][dashes.size(),1] = '-' + end + end - raise "Can not frame align, #{elem[1].gsub(/[^-]/,'').size()} deleted bases not divisible by 3" - #return elem - end - - #Build the insert/delete lists. - insert_list = make_gap_list(elem[0]) - delete_list = make_gap_list(elem[1]) - #Now we have a list that looks like [[3,4,5], [9], [11,12]] - - - if(insert_list.size() > 0)#Inserts first - new_ins_list = [] + #end + if(trim and elem[1][-1,1] == '-') + #get rid of edges that are the wrong size + elem[1] =~ /[^-](-+)$/ + #Make sure its a multiple of three + dashes = $1 + if(dashes == nil) + #pass + elsif((dashes.size() % 3) == 1) + elem[1][(elem[1].size() - dashes.size()) - 1] = '-' + elem[1][(elem[1].size() - dashes.size()) - 2] = '-' + elsif((dashes.size() % 3) == 2) + elem[1][(elem[1].size() - dashes.size()) - 1] = '-' + end + end - #First step is clustering insertions. (v2: 16-Nov-2018) - insert_list.each_with_index do |ins, i| - next if(ins.size() == 0) #we already ate this one. - if(ins.size() % 3 == 0) #this insertion is fine! - new_ins_list << ins - next - end - - #Can I merge with the next insert? - if(insert_list[i + 1] and (ins + insert_list[i + 1]).size() % 3 == 0 and - (insert_list[i + 1].first - ins.last) < 9) - - ins2 = insert_list[i + 1] - if(ins2.size() > ins.size()) - new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 - else - new_ins_list << ins + ((ins.last + 1) .. (ins.last + ins2.size())).to_a() + + #try to merge deletions and insertions if things aren't looking well. + #added 16-Nov-2018, helps fix poor insertions near the start. + if(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0) + dex = 0 #Don't start at 0, that way lies madness... Or does it??? + while(dex = elem[0].index(/-/, dex)) + #Now, find an ajacent dash in elem[1] to cancel + if((dex - 1 >= 0) and elem[1][dex - 1] == '-') + elem[0][dex] = '' + elem[1][dex - 1] = '' + dex = 1 + elsif(elem[1][dex + 1] == '-') + elem[1][dex + 1] = '' + elem[0][dex] = '' + dex = 1 + elsif((dex - 2 >= 0) and elem[1][dex - 2] == '-') + elem[1][dex - 2] = '' + elem[0][dex] = '' + dex = 1 + elsif(elem[1][dex + 2] == '-') + elem[1][dex + 2] = '' + elem[0][dex] = '' + dex = 1 end - insert_list[i + 1] = [] - #maybe merge with the next two inserts? - elsif(insert_list[i + 1] and insert_list[i + 2] and - (ins + insert_list[i + 1] + insert_list[i + 2]).size() % 3 == 0 and - (insert_list[i + 2].first - ins.last) < 12) - ins2 = insert_list[i + 1] - ins3 = insert_list[i + 2] - if(true) #Lets just assume that if you need to combine 3 inserts, the middle one is where it goes. - new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 + ((ins2.last + 1) .. (ins2.last + ins3.size())).to_a() + #check to see if we fixed everything + if(!(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0)) + break end - - insert_list[i + 1] = [] - insert_list[i + 2] = [] - else #No merge, life sucks and then you die. - raise "Can not frame align insert" if(raise_errors) + + dex += 1 end + end + -=begin - #First step is clustering insertions. (v1: old version) - insert_list.each_with_index do |ins, i| - next_ins = insert_list[i + 1] + #I wonder if these should throw exceptions by default, but have an option to ignore. + if(elem[0].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + #puts "Can not frame align" + raise "Can not frame align, #{elem[0].gsub(/[^-]/,'').size()} inserted bases not divisible by 3" + #return elem + end + if(elem[1].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + #puts "Can not frame align" + #puts elem[0] + #puts elem[1] - if(ins.size() % 3 != 0) #Wrong size! - #Look for next insertions that would make it the right size and are not far apart - if(!outta_frame and next_ins and (((next_ins.size() + ins.size()) % 3) == 0) and next_ins[0] - ins[-1] < 8 ) #within 8 bases - if(next_ins.size() > ins.size()) #scoring would be good here - ins.each do |a| - next_ins.insert(0, next_ins[0] - 1) #Insert at start - end + raise "Can not frame align, #{elem[1].gsub(/[^-]/,'').size()} deleted bases not divisible by 3" + #return elem + end + + #Build the insert/delete lists. + insert_list = make_gap_list(elem[0]) + delete_list = make_gap_list(elem[1]) + #Now we have a list that looks like [[3,4,5], [9], [11,12]] + + + if(insert_list.size() > 0)#Inserts first + new_ins_list = [] + + #First step is clustering insertions. (v2: 16-Nov-2018) + insert_list.each_with_index do |ins, i| + next if(ins.size() == 0) #we already ate this one. + if(ins.size() % 3 == 0) #this insertion is fine! + new_ins_list << ins + next + end + + #Can I merge with the next insert? + if(insert_list[i + 1] and (ins + insert_list[i + 1]).size() % 3 == 0 and + (insert_list[i + 1].first - ins.last) < 9) + + ins2 = insert_list[i + 1] + if(ins2.size() > ins.size()) + new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 else - next_ins.each do |a| - ins.insert(-1, ins[0] + 1) #insert at end - end - insert_list[i + 1] = ins #Pushing this problem ahead for convinence. + new_ins_list << ins + ((ins.last + 1) .. (ins.last + ins2.size())).to_a() end - #don't insert into new list, as we've pushed it to the next element. - else - #I wonder if we should try to merge multiple more than two inserts? Eh, nah. - #return elem - outta_frame = true + insert_list[i + 1] = [] + #maybe merge with the next two inserts? + elsif(insert_list[i + 1] and insert_list[i + 2] and + (ins + insert_list[i + 1] + insert_list[i + 2]).size() % 3 == 0 and + (insert_list[i + 2].first - ins.last) < 12) + + ins2 = insert_list[i + 1] + ins3 = insert_list[i + 2] + if(true) #Lets just assume that if you need to combine 3 inserts, the middle one is where it goes. + new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 + ((ins2.last + 1) .. (ins2.last + ins3.size())).to_a() + end + + insert_list[i + 1] = [] + insert_list[i + 2] = [] + else #No merge, life sucks and then you die. raise "Can not frame align insert" if(raise_errors) end - else - new_ins_list << ins #this insert is okay end - end -=end - - #puts insert_list.inspect - #puts new_ins_list.inspect - - #second step should be to frame align inserts (prioritizing to common_points) - offset = 0 #offset created by previous insertions. (IMPORTANT) - new_ins_list.each do |ins| - #see if its close to a common_insert(within 3 amino acids?) - min_common = common_insert_locations.min(){|a,b| ((a) * 3 - (ins[0] - offset)).abs() <=> ((b) * 3 - ins[0]).abs()} - if(min_common != nil and ((min_common ) * 3 - (ins[0] - offset)).abs() <= 9) - #Cool, align to this common insert - new_ins = [] - 0.upto(ins.size() - 1) do |i| - new_ins << ((min_common) * 3) + i + offset + =begin + #First step is clustering insertions. (v1: old version) + insert_list.each_with_index do |ins, i| + next_ins = insert_list[i + 1] + + if(ins.size() % 3 != 0) #Wrong size! + #Look for next insertions that would make it the right size and are not far apart + if(!outta_frame and next_ins and (((next_ins.size() + ins.size()) % 3) == 0) and next_ins[0] - ins[-1] < 8 ) #within 8 bases + if(next_ins.size() > ins.size()) #scoring would be good here + ins.each do |a| + next_ins.insert(0, next_ins[0] - 1) #Insert at start + end + else + next_ins.each do |a| + ins.insert(-1, ins[0] + 1) #insert at end + end + insert_list[i + 1] = ins #Pushing this problem ahead for convinence. + end + #don't insert into new list, as we've pushed it to the next element. + else + #I wonder if we should try to merge multiple more than two inserts? Eh, nah. + #return elem + outta_frame = true + raise "Can not frame align insert" if(raise_errors) + end + else + new_ins_list << ins #this insert is okay end - ins.replace(new_ins) + end + =end - #B frame align - #scoring would be good here - if(ins[0] % 3 == 1) #set back one base - new_ins = [] - ins.each do |i| - new_ins << i - 1 + #puts insert_list.inspect + #puts new_ins_list.inspect + + #second step should be to frame align inserts (prioritizing to common_points) + offset = 0 #offset created by previous insertions. (IMPORTANT) + new_ins_list.each do |ins| + #see if its close to a common_insert(within 3 amino acids?) + min_common = common_insert_locations.min(){|a,b| ((a) * 3 - (ins[0] - offset)).abs() <=> ((b) * 3 - ins[0]).abs()} + if(min_common != nil and ((min_common ) * 3 - (ins[0] - offset)).abs() <= 9) + #Cool, align to this common insert + new_ins = [] + 0.upto(ins.size() - 1) do |i| + new_ins << ((min_common) * 3) + i + offset + end + ins.replace(new_ins) end - ins.replace(new_ins) - elsif(ins[0] % 3 == 2) #Set forward one base. - new_ins = [] - ins.each do |i| - new_ins << i + 1 + + #B frame align + #scoring would be good here + if(ins[0] % 3 == 1) #set back one base + new_ins = [] + ins.each do |i| + new_ins << i - 1 + end + ins.replace(new_ins) + elsif(ins[0] % 3 == 2) #Set forward one base. + new_ins = [] + ins.each do |i| + new_ins << i + 1 + end + ins.replace(new_ins) end - ins.replace(new_ins) + + offset += ins.size() end - - offset += ins.size() - end - #make the actual modifications - #begin - #orige = "" + elem[0] - elem[0] = elem[0].gsub('-','') - new_ins_list.each do |ins| - ins.each do |i| - if(i > elem[0].size()) - elem[0].insert(-1, '-') - else - elem[0].insert(i, '-') + #make the actual modifications + #begin + #orige = "" + elem[0] + elem[0] = elem[0].gsub('-','') + new_ins_list.each do |ins| + ins.each do |i| + if(i > elem[0].size()) + elem[0].insert(-1, '-') + else + elem[0].insert(i, '-') + end end end - end - #rescue - # puts "OOH---------------------------------------------" - # puts orige.inspect - # puts elem.inspect - # puts new_ins_list.inspect() - # raise $! - #end + #rescue + # puts "OOH---------------------------------------------" + # puts orige.inspect + # puts elem.inspect + # puts new_ins_list.inspect() + # raise $! + #end - end - - - #Deletion--------------------------------------------------------------------------------------------------- - outta_frame = false - if(delete_list.size() > 0)#Deletions second - new_del_list = [] - next_del = nil + end - #First step is clustering deletions. (v2: 19-Nov-2018) - delete_list.each_with_index do |del, i| - next if(del.size() == 0) #we already ate this one. - if(del.size() % 3 == 0) #this insertion is fine! - new_del_list << del - next - end + + #Deletion--------------------------------------------------------------------------------------------------- + outta_frame = false + if(delete_list.size() > 0)#Deletions second + new_del_list = [] + next_del = nil - #Can I merge with the next delete? - if(delete_list[i + 1] and (del + delete_list[i + 1]).size() % 3 == 0 and - (delete_list[i + 1].first - del.last) < 9) - - del2 = delete_list[i + 1] - if(del2.size() > del.size()) - new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 - else - new_del_list << del + ((del.last + 1) .. (del.last + del2.size())).to_a() + #First step is clustering deletions. (v2: 19-Nov-2018) + delete_list.each_with_index do |del, i| + next if(del.size() == 0) #we already ate this one. + if(del.size() % 3 == 0) #this insertion is fine! + new_del_list << del + next end - delete_list[i + 1] = [] - #maybe merge with the next two deletes? - elsif(delete_list[i + 1] and delete_list[i + 2] and - (del + delete_list[i + 1] + delete_list[i + 2]).size() % 3 == 0 and - (delete_list[i + 2].first - del.last) < 12) #slightly higher range, since we've already got a higher threshold of confidence here - del2 = delete_list[i + 1] - del3 = delete_list[i + 2] - if(true) #Lets just assume that if you need to combine 3 deletes, the middle one is where it goes. - new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + ((del2.last + 1) .. (del2.last + del3.size())).to_a() + #Can I merge with the next delete? + if(delete_list[i + 1] and (del + delete_list[i + 1]).size() % 3 == 0 and + (delete_list[i + 1].first - del.last) < 9) + + del2 = delete_list[i + 1] + if(del2.size() > del.size()) + new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + else + new_del_list << del + ((del.last + 1) .. (del.last + del2.size())).to_a() + end + delete_list[i + 1] = [] + #maybe merge with the next two deletes? + elsif(delete_list[i + 1] and delete_list[i + 2] and + (del + delete_list[i + 1] + delete_list[i + 2]).size() % 3 == 0 and + (delete_list[i + 2].first - del.last) < 12) #slightly higher range, since we've already got a higher threshold of confidence here + + del2 = delete_list[i + 1] + del3 = delete_list[i + 2] + if(true) #Lets just assume that if you need to combine 3 deletes, the middle one is where it goes. + new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + ((del2.last + 1) .. (del2.last + del3.size())).to_a() + end + + delete_list[i + 1] = [] + delete_list[i + 2] = [] + else #No merge, life sucks and then you die. + new_del_list << delete_list[i] end - - delete_list[i + 1] = [] - delete_list[i + 2] = [] - else #No merge, life sucks and then you die. - new_del_list << delete_list[i] end - end - -=begin - #First step is clustering deletions. (v1: old version) - delete_list.each_with_index do |del, i| - next_del = delete_list[i + 1] - if(del.size() % 3 != 0 and !outta_frame) #Wrong size! - #Look for next deletions that would make it the right size and are not far apart - if(next_del and (((next_del.size() + del.size()) % 3) == 0) and next_del[0] - del[-1] < 6 ) #within 6 bases - if(next_del.size() > del.size()) #scoring would be good here - del.each do |a| - next_del.insert(0, next_del[0] - 1) #delete at start + =begin + #First step is clustering deletions. (v1: old version) + delete_list.each_with_index do |del, i| + next_del = delete_list[i + 1] + + if(del.size() % 3 != 0 and !outta_frame) #Wrong size! + #Look for next deletions that would make it the right size and are not far apart + if(next_del and (((next_del.size() + del.size()) % 3) == 0) and next_del[0] - del[-1] < 6 ) #within 6 bases + if(next_del.size() > del.size()) #scoring would be good here + del.each do |a| + next_del.insert(0, next_del[0] - 1) #delete at start + end + else + next_del.each do |a| + del.insert(-1, del[0] + 1) #delete at end + end + delete_list[i + 1] = del #Pushing this problem ahead for convinence. end + #don't delete into new list, as we've pushed it to the next element. else - next_del.each do |a| - del.insert(-1, del[0] + 1) #delete at end - end - delete_list[i + 1] = del #Pushing this problem ahead for convinence. + #I wonder if we should try to merge multiple more than two deletes? Eh, nah. + # + #return elem + new_del_list << del + outta_frame = true + #raise "Can not frame align delete" end - #don't delete into new list, as we've pushed it to the next element. else - #I wonder if we should try to merge multiple more than two deletes? Eh, nah. - # - #return elem - new_del_list << del - outta_frame = true - #raise "Can not frame align delete" + new_del_list << del #this delete is okay end - else - new_del_list << del #this delete is okay end - end -=end + =end - - #second step should be to frame align deletes - offset = 0 #offset created by previous deletions. (IMPORTANT) - new_del_list.each do |del| - next if(del.size() % 3 != 0) - #frame align - #scoring would be good here - if(del[0] % 3 == 1) #set back one base - new_del = [] - del.each do |i| - new_del << i - 1 + + #second step should be to frame align deletes + offset = 0 #offset created by previous deletions. (IMPORTANT) + new_del_list.each do |del| + next if(del.size() % 3 != 0) + #frame align + #scoring would be good here + if(del[0] % 3 == 1) #set back one base + new_del = [] + del.each do |i| + new_del << i - 1 + end + del.replace(new_del) + elsif(del[0] % 3 == 2) #Set forward one base. + new_del = [] + del.each do |i| + new_del << i + 1 + end + del.replace(new_del) end - del.replace(new_del) - elsif(del[0] % 3 == 2) #Set forward one base. - new_del = [] + + offset += del.size() + end + + #make the actual modifications + elem[1] = elem[1].gsub('-','') + new_del_list.each do |del| del.each do |i| - new_del << i + 1 + if(i > elem[1].size() ) + elem[1].insert(elem[1].size(), '-') + else + elem[1].insert(i, '-') + end end - del.replace(new_del) end - - offset += del.size() + end + + return elem + end + + #Returns a [seq_sans_inserts, [list of inserts]] + def self.remove_inserts(elem) + seq = '' + elem[1] + inserts = [] + + insert_list = [] + 0.upto(elem[0].size() - 1) do |i| + insert_list << i if(elem[0][i,1] == '-') end - #make the actual modifications - elem[1] = elem[1].gsub('-','') - new_del_list.each do |del| - del.each do |i| - if(i > elem[1].size() ) - elem[1].insert(elem[1].size(), '-') + big_insert_list = [] + if(elem[0].include?('-'))#Inserts first + #First step should be to cluster inserts + cur_ins = nil + prev_i = nil + insert_list.each do |i| + if(prev_i and i == prev_i + 1) + cur_ins << i + prev_i = i else - elem[1].insert(i, '-') + big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) + cur_ins = [i] + prev_i = i end end + big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) end - end - - return elem -end - -#Returns a [seq_sans_inserts, [list of inserts]] -def remove_inserts(elem) - seq = '' + elem[1] - inserts = [] - - insert_list = [] - 0.upto(elem[0].size() - 1) do |i| - insert_list << i if(elem[0][i,1] == '-') - end - - big_insert_list = [] - if(elem[0].include?('-'))#Inserts first - #First step should be to cluster inserts - cur_ins = nil - prev_i = nil - insert_list.each do |i| - if(prev_i and i == prev_i + 1) - cur_ins << i - prev_i = i - else - big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) - cur_ins = [i] - prev_i = i + + offset = 0 + big_insert_list.each do |ins| + ins_seq = '' + ins.each do |i| + ins_seq += elem[1][i,1] + end + inserts << [((ins[0] - offset) / 3), ins_seq] + offset += ins.size() + ins.each do |i| + seq[i,1] = '.' end end - big_insert_list << cur_ins if(cur_ins != nil and cur_ins != []) - end - - offset = 0 - big_insert_list.each do |ins| - ins_seq = '' - ins.each do |i| - ins_seq += elem[1][i,1] - end - inserts << [((ins[0] - offset) / 3), ins_seq] - offset += ins.size() - ins.each do |i| - seq[i,1] = '.' - end - end - return [seq.gsub('.',''), inserts] -end - -=begin -seq = 'CCTCAAATCACTCTTTGGCAACGACCCTTAGTCACAGTAAGAATAGGGGGACAGCTAATAGAAGCCCTATTAGACACAGGAGCAGATGATACAGTATTAGAAGAAAAAATAGATTTACCAGGAAAATGGARACCAAAAATGATAGGGGGAATTGGAGGTTTTATTAAAGTAAGGCAATATGATCAGATACTTATGGAAATATGTGARAAGAAGGCCATAGGTACAGTATTAGTAGGACCTACMCCTGTCAACATAATTGGRCGRAATATGTTGACTCAGATTGGTTGTACTTTAAATTTTCCAATTAGTCCTATTGARACTGTGCCAGTAAAATTAAAGCCAGGGATGGATGGCCCAAAAGTTAARCAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAACAGAAATTTGTGCAGAAATGGAAAARGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTGTTTGCCATAAAGAAAAARGATAGTAMTAAATGGAGAAAATTAGTAGATTTCAGAGAACTCAATAAGAGAACTCAAGACTTCTGGGAGGTCCAATTAGGAATTCCTCATCCCGCGGGATTAAAAAAGAAAARATCAGTAACAGTACTAGATRTAGGGGATGCATATTTTTCAGTTCCCTTAGACAAAGAYTTTAGAAAGTATACTGCATTCACTATACCTAGTGTAAATAATGAAACACCAGGRATTAGATATCAGTACAATGTRCTKCCACAGGGATGGAAAGGATCACCAGCAATATTTCARGCAAGCATGACAAAAATCTTAGAGCCCTTTAGAACAAAAAATCCAGAGGTGGTGATCTACCAGTATATGGATGATTTATATGTAGGATCTGACTTAGAGATAGGGCAACATAGAGCAAAAATAGAGGARTTAAGAGAACATCTAYTGARATGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTTCTTTGGATGGGATATGAACTTCATCCTGACAAATGGACAGTCCAGCCTATARTRCTGCCARACAAAGRMRRCTGGACTGTCAATGATATACAGAAATTAGTAGGAAAACTAAATTGGGCCAGTCAAATTTATGCAGGAATTAAAGTAAAGCAACTGTGTAAACTCCTCAGGGGAGCCAAAGCATTAACAGAYATAGTAACAYTAACTGAGGAAGCAGAATTAGAATTGGCAGAGAACAGGGAAATTCTAAAAGAACCTGTACATGGGGTATAYTATGAYCCAGYAAAAGACTTAATAGCAGAAATACAGAAACAAGGGCAAGACCAATGGACATATCAAATATATCAAGARCCATTTAAAAATCTAAARACAGGAAAATATGCAAARAGGAGATCTGCCCACACRAATGATGTAAAACAATTAACAGAGGTAGTGCAAAAAGTGTCTACAGAARGCATAGTAATATGGGGGAAGAYCCCTAAATTTAAGCTGCCCATACAAAAAGAAACATGGGAGGCA' -std = 'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTC' - -elem = frame_align(std, seq, 9, 2, [35,168]) #Seems to be off by one. -puts remove_inserts(elem).inspect() -=end -=begin -#----testing----- -require 'ckwlib/io' - -std = "TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT" - -fasta = Io::read_fasta('motivate_preds_full_NucSeq.fas') - -File.open( - -fasta.each do |fas| - elem = frame_align(std, fas[1].strip(), 3, 1, [10]) - puts fas[0] - puts elem[0] - puts elem[1] - puts remove_inserts(elem).inspect() - puts score_alignment(elem[0], elem[1]) + return [seq.gsub('.',''), inserts] + end end -=end \ No newline at end of file From 14716e30c013df08dd2e947ba971bef9a958aabf Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 30 Oct 2024 10:34:27 -0700 Subject: [PATCH 06/31] WIP: refactoring in progress. --- ruby/lib/cfe_gotoh.rb | 317 ++++++++++++++++-------------------------- 1 file changed, 122 insertions(+), 195 deletions(-) diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 9864557..718a4b6 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -50,12 +50,12 @@ def _build_substitution_matrix end end - NUCLEOTIDE_MATRIX = self._build_substitution_matrix() + NUCLEOTIDE_MATRIX = self._build_substitution_matrix().freeze - def self.score_alignment(seqa, seqb) + def self.score_alignment(standard, query) sc = 0.0 - 0.upto(seqa.size() - 1) do |i| - sc += NUCLEOTIDE_MATRIX[seqa[i,1].upcase().ord()][seqb[i,1].upcase().ord()] + 0.upto(standard.size() - 1) do |i| + sc += NUCLEOTIDE_MATRIX[standard[i,1].upcase().ord()][query[i,1].upcase().ord()] end return sc end @@ -81,13 +81,92 @@ def self.make_gap_list(seq) return list end + def self.trim_leading_dashes(standard, query) + leading_dashes_match = /^(-+)[^-]/.match(standard) + if (leading_dashes_match.nil?) + return + end + leading_dashes = leading_dashes_match[1] + standard[0, leading_dashes.size()] = '' + query[0, leading_dashes.size()] = '' + end + + def self.trim_trailing_dashes(standard, query) + trailing_dashes_match = /[^-](-+)$/.match(standard) + if (trailing_dashes_match.nil?) + return + end + trailing_dashes = trailing_dashes_match[1] + end_of_standard = standard.size() - trailing_dashes.size() + standard[end_of_standard, trailing_dashes.size()] = '' + query[end_of_standard, trailing_dashes.size()] = '' + end + + def self.fix_incomplete_edge_codon(query, side=:leading) + edge_idx = 0 + dash_regex = /^(-+)[^-]/ + incr = 1 + if (side != :leading) # fix the trailing edge + edge_idx = -1 + dash_regex = /[^-](-+)$/ + incr = -1 + end + + if (query[edge_idx] == '-') + dashes = dash_regex.match(query)[0] # we know there will be a match + + # If the length of the dashes aren't a multiple of 3, turn some + # of the query characters into dashes to force it to be a full + # codon of dashes. + if (dashes.size() % 3 >= 1) + first_non_dash_idx = 0 + if (side != :leading) + first_non_dash_idx = query.size() - dashes.size() - 1 + end + query[first_non_dash_idx] = '-' + if (dashes.size() % 3 == 2) + query[first_non_dash_idx + incr] = '-' + end + end + end + end + + def self.merge_insertions_and_deletions_to_fix_oof_sequences( + standard, + query + ) + # Merge deletions and insertions until the sequences have a cogent length + # (i.e. have length divisible by 3). This helps fix poor insertions near + # the start of the sequence. + raise 'Standard and query should be the same length' if standard.size() != query.size() + if(standard.size() % 3 != 0) + dex = 0 + while(dex = standard.index(/-/, dex)) + [-1, 1, -2, 2].each do |offset| # look one base away, then two bases away + if ((dex + offset >= 0) and query[dex + offset] == '-') + standard[dex] = '' + query[dex + offset] = '' + dex = 0 + break + end + end + + # Stop if the sequences are now a cogent length. + if(standard.size() % 3 == 0) + break + end + dex += 1 + end + end + end + #common_insert_locations is based on amino acid locations starting at base 0. #Assumes standard in the first base. #Prealign lets you run a lot of the corrections and qc on a already aligned sequence. def self.frame_align( - seqa, - seqb, + standard, + query, gap_init=3, gap_penalty=1, common_insert_locations=[], @@ -95,113 +174,35 @@ def self.frame_align( raise_errors=false, prealigned=false ) - elem = nil - if(prealigned) - elem = [seqa, seqb] - else - elem = align_it(seqa, seqb, gap_init, gap_penalty) + if(!prealigned) + elem = align_it(standard, query, gap_init, gap_penalty) + standard = elem[0] + query = elem[1] end - puts "Wierd sizes Z" if(elem[0].size() != elem[1].size()) + raise "Standard and query should be the same length" if standard.size() != query.size() - #Do trimming? - if(trim and elem[0] =~ /^(-+)[^-]/) - elem[0][0,$1.size()] = '' - elem[1][0,$1.size()] = '' + # Trim leading and trailing dashes if desired. + if (trim) + trim_leading_dashes(standard, query) + trim_trailing_dashes(standard, query) + fix_incomplete_edge_codon(standard, query, :leading) + fix_incomplete_edge_codon(standard, query, :trailing) end - if(trim and elem[0] =~ /[^-](-+)$/) - elem[1][(elem[0].size() - $1.size()), $1.size()] = '' - elem[0][(elem[0].size() - $1.size()), $1.size()] = '' - end - - #Start - if(trim and elem[1][0,1] == '-') - #get rid of edges that are the wrong size - elem[1] =~ /^(-+)[^-]/ - #Make sure its a multiple of three - dashes = $1 - if(dashes == nil) - #pass - elsif((dashes.size() % 3) == 1) - elem[1][dashes.size(),1] = '-' - elem[1][dashes.size() + 1,1] = '-' - elsif((dashes.size() % 3) == 2) - elem[1][dashes.size(),1] = '-' - end - end - - #end - if(trim and elem[1][-1,1] == '-') - #get rid of edges that are the wrong size - elem[1] =~ /[^-](-+)$/ - #Make sure its a multiple of three - dashes = $1 - if(dashes == nil) - #pass - elsif((dashes.size() % 3) == 1) - elem[1][(elem[1].size() - dashes.size()) - 1] = '-' - elem[1][(elem[1].size() - dashes.size()) - 2] = '-' - elsif((dashes.size() % 3) == 2) - elem[1][(elem[1].size() - dashes.size()) - 1] = '-' - end - end - - #try to merge deletions and insertions if things aren't looking well. - #added 16-Nov-2018, helps fix poor insertions near the start. - if(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0) - dex = 0 #Don't start at 0, that way lies madness... Or does it??? - while(dex = elem[0].index(/-/, dex)) - #Now, find an ajacent dash in elem[1] to cancel - if((dex - 1 >= 0) and elem[1][dex - 1] == '-') - elem[0][dex] = '' - elem[1][dex - 1] = '' - dex = 1 - elsif(elem[1][dex + 1] == '-') - elem[1][dex + 1] = '' - elem[0][dex] = '' - dex = 1 - elsif((dex - 2 >= 0) and elem[1][dex - 2] == '-') - elem[1][dex - 2] = '' - elem[0][dex] = '' - dex = 1 - elsif(elem[1][dex + 2] == '-') - elem[1][dex + 2] = '' - elem[0][dex] = '' - dex = 1 - end - - #check to see if we fixed everything - if(!(elem[0].size() % 3 != 0 or elem[1].size() % 3 != 0)) - break - end - - dex += 1 - end - - end + merge_insertions_and_deletions_to_fix_oof_sequences(standard, query) - - #I wonder if these should throw exceptions by default, but have an option to ignore. - if(elem[0].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - #puts "Can not frame align" - raise "Can not frame align, #{elem[0].gsub(/[^-]/,'').size()} inserted bases not divisible by 3" - #return elem + if(standard.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + raise "Can not frame align, #{standard.gsub(/[^-]/,'').size()} inserted bases not divisible by 3" end - if(elem[1].gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - #puts "Can not frame align" - #puts elem[0] - #puts elem[1] - - raise "Can not frame align, #{elem[1].gsub(/[^-]/,'').size()} deleted bases not divisible by 3" - #return elem + if(query.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) + raise "Can not frame align, #{query.gsub(/[^-]/,'').size()} deleted bases not divisible by 3" end #Build the insert/delete lists. - insert_list = make_gap_list(elem[0]) - delete_list = make_gap_list(elem[1]) + insert_list = make_gap_list(standard) + delete_list = make_gap_list(query) #Now we have a list that looks like [[3,4,5], [9], [11,12]] - if(insert_list.size() > 0)#Inserts first new_ins_list = [] @@ -242,41 +243,6 @@ def self.frame_align( end end - =begin - #First step is clustering insertions. (v1: old version) - insert_list.each_with_index do |ins, i| - next_ins = insert_list[i + 1] - - if(ins.size() % 3 != 0) #Wrong size! - #Look for next insertions that would make it the right size and are not far apart - if(!outta_frame and next_ins and (((next_ins.size() + ins.size()) % 3) == 0) and next_ins[0] - ins[-1] < 8 ) #within 8 bases - if(next_ins.size() > ins.size()) #scoring would be good here - ins.each do |a| - next_ins.insert(0, next_ins[0] - 1) #Insert at start - end - else - next_ins.each do |a| - ins.insert(-1, ins[0] + 1) #insert at end - end - insert_list[i + 1] = ins #Pushing this problem ahead for convinence. - end - #don't insert into new list, as we've pushed it to the next element. - else - #I wonder if we should try to merge multiple more than two inserts? Eh, nah. - #return elem - outta_frame = true - raise "Can not frame align insert" if(raise_errors) - end - else - new_ins_list << ins #this insert is okay - end - - end - =end - - #puts insert_list.inspect - #puts new_ins_list.inspect - #second step should be to frame align inserts (prioritizing to common_points) offset = 0 #offset created by previous insertions. (IMPORTANT) new_ins_list.each do |ins| @@ -312,28 +278,19 @@ def self.frame_align( #make the actual modifications #begin - #orige = "" + elem[0] - elem[0] = elem[0].gsub('-','') + #orige = "" + standard + standard = standard.gsub('-','') new_ins_list.each do |ins| ins.each do |i| - if(i > elem[0].size()) - elem[0].insert(-1, '-') + if(i > standard.size()) + standard.insert(-1, '-') else - elem[0].insert(i, '-') + standard.insert(i, '-') end end end - #rescue - # puts "OOH---------------------------------------------" - # puts orige.inspect - # puts elem.inspect - # puts new_ins_list.inspect() - # raise $! - #end - end - #Deletion--------------------------------------------------------------------------------------------------- outta_frame = false if(delete_list.size() > 0)#Deletions second @@ -377,40 +334,6 @@ def self.frame_align( end end - =begin - #First step is clustering deletions. (v1: old version) - delete_list.each_with_index do |del, i| - next_del = delete_list[i + 1] - - if(del.size() % 3 != 0 and !outta_frame) #Wrong size! - #Look for next deletions that would make it the right size and are not far apart - if(next_del and (((next_del.size() + del.size()) % 3) == 0) and next_del[0] - del[-1] < 6 ) #within 6 bases - if(next_del.size() > del.size()) #scoring would be good here - del.each do |a| - next_del.insert(0, next_del[0] - 1) #delete at start - end - else - next_del.each do |a| - del.insert(-1, del[0] + 1) #delete at end - end - delete_list[i + 1] = del #Pushing this problem ahead for convinence. - end - #don't delete into new list, as we've pushed it to the next element. - else - #I wonder if we should try to merge multiple more than two deletes? Eh, nah. - # - #return elem - new_del_list << del - outta_frame = true - #raise "Can not frame align delete" - end - else - new_del_list << del #this delete is okay - end - end - =end - - #second step should be to frame align deletes offset = 0 #offset created by previous deletions. (IMPORTANT) new_del_list.each do |del| @@ -435,33 +358,37 @@ def self.frame_align( end #make the actual modifications - elem[1] = elem[1].gsub('-','') + query = query.gsub('-','') new_del_list.each do |del| del.each do |i| - if(i > elem[1].size() ) - elem[1].insert(elem[1].size(), '-') + if(i > query.size() ) + query.insert(query.size(), '-') else - elem[1].insert(i, '-') + query.insert(i, '-') end end end end - return elem + return [standard, query] end #Returns a [seq_sans_inserts, [list of inserts]] def self.remove_inserts(elem) - seq = '' + elem[1] + return remove_insertions_from_query(elem[1]) + end + + def self.remove_insertions_from_query(query) + seq = '' + query inserts = [] insert_list = [] - 0.upto(elem[0].size() - 1) do |i| - insert_list << i if(elem[0][i,1] == '-') + 0.upto(standard.size() - 1) do |i| + insert_list << i if(standard[i,1] == '-') end big_insert_list = [] - if(elem[0].include?('-'))#Inserts first + if(standard.include?('-'))#Inserts first #First step should be to cluster inserts cur_ins = nil prev_i = nil @@ -482,7 +409,7 @@ def self.remove_inserts(elem) big_insert_list.each do |ins| ins_seq = '' ins.each do |i| - ins_seq += elem[1][i,1] + ins_seq += query[i,1] end inserts << [((ins[0] - offset) / 3), ins_seq] offset += ins.size() From e2b64daaa49fd33c5f2b5478cc411ecc030cbe7a Mon Sep 17 00:00:00 2001 From: rhliang Date: Wed, 30 Oct 2024 17:11:05 -0700 Subject: [PATCH 07/31] WIP: more refactoring in progress. Some questions have arisen re: the "merge indels to put them into codons" part; this will require some discussion. --- ruby/lib/cfe_gotoh.rb | 144 ++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 76 deletions(-) diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 718a4b6..b8d39a3 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -4,6 +4,12 @@ module CfeGotoh + class Error < RuntimeError + end + + class GapMergeError < Error + end + def _build_substitution_matrix sub_matrix = Array.new(127) {Array.new(127) {-1.0} } ['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| @@ -160,6 +166,54 @@ def self.merge_insertions_and_deletions_to_fix_oof_sequences( end end + def self.cluster_gaps(gaps, raise_errors=false) + # Merge adjacent gaps if they are not a codon-sized gap. + new_gap_list = [] + gaps.each_with_index do |gap, i| + next if(gap.size() == 0) # we already ate this one + if(gap.size() % 3 == 0) # this gap is fine! + new_gap_list << gap + next + end + + gap2 = gaps[i + 1] + gap3 = gaps[i + 2] + # Can I merge with the next gap? + if (gap2 and (gap + gap2).size() % 3 == 0 and (gap2.first - gap.last) < 9) + if(gap2.size() > gap.size()) + new_gap_list << ((gap2.first - gap.size()) .. gap2.first - 1).to_a() + gap2 + else + new_gap_list << gap + ((gap.last + 1) .. (gap.last + gap2.size())).to_a() + end + gaps[i + 1] = [] + # Can I merge with the next two gaps? + elsif( + gap2 and gap3 and + (gap + gap2 + gap3).size() % 3 == 0 and + (gap3.first - gap.last) < 12 + ) + # Place the gap around the middle of the three merging gaps. + new_gap = ( + (gap2.first - gap.size()) .. gap2.first - 1.to_a() + + gap2 + + ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a() + ) + new_gap_list << new_gap + + gaps[i + 1] = [] + gaps[i + 2] = [] + else + # We can't merge the gaps; either raise an error or meekly proceed. + if (raise_errors) + raise GapMergeError + else + new_gap_list << gap # FIXME this behaviour differs between insertions and deletions + end + end + end + return new_gap_list + end + #common_insert_locations is based on amino acid locations starting at base 0. #Assumes standard in the first base. @@ -192,55 +246,26 @@ def self.frame_align( merge_insertions_and_deletions_to_fix_oof_sequences(standard, query) if(standard.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - raise "Can not frame align, #{standard.gsub(/[^-]/,'').size()} inserted bases not divisible by 3" + raise "Cannot frame align, #{standard.gsub(/[^-]/,'').size()} inserted bases not divisible by 3" end if(query.gsub(/[^-]/,'').size() % 3 != 0 and raise_errors) - raise "Can not frame align, #{query.gsub(/[^-]/,'').size()} deleted bases not divisible by 3" + raise "Cannot frame align, #{query.gsub(/[^-]/,'').size()} deleted bases not divisible by 3" end - #Build the insert/delete lists. + # Build the insert/delete lists. These lists look like + # [[3,4,5], [9], [11,12]] insert_list = make_gap_list(standard) delete_list = make_gap_list(query) - #Now we have a list that looks like [[3,4,5], [9], [11,12]] - if(insert_list.size() > 0)#Inserts first + # Process the insertions. + if(insert_list.size() > 0) new_ins_list = [] - #First step is clustering insertions. (v2: 16-Nov-2018) - insert_list.each_with_index do |ins, i| - next if(ins.size() == 0) #we already ate this one. - if(ins.size() % 3 == 0) #this insertion is fine! - new_ins_list << ins - next - end - - #Can I merge with the next insert? - if(insert_list[i + 1] and (ins + insert_list[i + 1]).size() % 3 == 0 and - (insert_list[i + 1].first - ins.last) < 9) - - ins2 = insert_list[i + 1] - if(ins2.size() > ins.size()) - new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 - else - new_ins_list << ins + ((ins.last + 1) .. (ins.last + ins2.size())).to_a() - end - insert_list[i + 1] = [] - #maybe merge with the next two inserts? - elsif(insert_list[i + 1] and insert_list[i + 2] and - (ins + insert_list[i + 1] + insert_list[i + 2]).size() % 3 == 0 and - (insert_list[i + 2].first - ins.last) < 12) - - ins2 = insert_list[i + 1] - ins3 = insert_list[i + 2] - if(true) #Lets just assume that if you need to combine 3 inserts, the middle one is where it goes. - new_ins_list << ((ins2.first - ins.size()) .. ins2.first - 1).to_a() + ins2 + ((ins2.last + 1) .. (ins2.last + ins3.size())).to_a() - end - - insert_list[i + 1] = [] - insert_list[i + 2] = [] - else #No merge, life sucks and then you die. - raise "Can not frame align insert" if(raise_errors) - end + # Step 1: cluster the insertions. + begin + new_ins_list = cluster_gaps(insert_list, raise_errors=raise_errors)) + rescue GapMergeError + raise "Cannot frame align insert" if raise_errors end #second step should be to frame align inserts (prioritizing to common_points) @@ -297,42 +322,9 @@ def self.frame_align( new_del_list = [] next_del = nil - #First step is clustering deletions. (v2: 19-Nov-2018) - delete_list.each_with_index do |del, i| - next if(del.size() == 0) #we already ate this one. - if(del.size() % 3 == 0) #this insertion is fine! - new_del_list << del - next - end - - #Can I merge with the next delete? - if(delete_list[i + 1] and (del + delete_list[i + 1]).size() % 3 == 0 and - (delete_list[i + 1].first - del.last) < 9) - - del2 = delete_list[i + 1] - if(del2.size() > del.size()) - new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 - else - new_del_list << del + ((del.last + 1) .. (del.last + del2.size())).to_a() - end - delete_list[i + 1] = [] - #maybe merge with the next two deletes? - elsif(delete_list[i + 1] and delete_list[i + 2] and - (del + delete_list[i + 1] + delete_list[i + 2]).size() % 3 == 0 and - (delete_list[i + 2].first - del.last) < 12) #slightly higher range, since we've already got a higher threshold of confidence here - - del2 = delete_list[i + 1] - del3 = delete_list[i + 2] - if(true) #Lets just assume that if you need to combine 3 deletes, the middle one is where it goes. - new_del_list << ((del2.first - del.size()) .. del2.first - 1).to_a() + del2 + ((del2.last + 1) .. (del2.last + del3.size())).to_a() - end - - delete_list[i + 1] = [] - delete_list[i + 2] = [] - else #No merge, life sucks and then you die. - new_del_list << delete_list[i] - end - end + # As above, step 1 is to cluster the deletions. Note that this behaviour + # differs from how we handle the insertions! + new_del_list = cluster_gaps(delete_list, raise_errors=false) #second step should be to frame align deletes offset = 0 #offset created by previous deletions. (IMPORTANT) From cd1bd2f8c585e8528cc93fe1dc041fd4d6940727 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Fri, 1 Nov 2024 14:53:02 -0700 Subject: [PATCH 08/31] The gem now builds and can be 'require'd. --- alignment/gotoh/gotoh.cpp | 4 +- ruby/Gemfile | 9 ++ ruby/Gemfile.lock | 39 +++++ ruby/build_gem.bash | 2 +- ruby/ext/cfe_gotoh/extconf.rb | 2 +- ruby/lib/cfe_gotoh.rb | 277 +++++++++++++++++----------------- 6 files changed, 189 insertions(+), 144 deletions(-) create mode 100644 ruby/Gemfile create mode 100644 ruby/Gemfile.lock diff --git a/alignment/gotoh/gotoh.cpp b/alignment/gotoh/gotoh.cpp index 9a3cef6..eab96e9 100644 --- a/alignment/gotoh/gotoh.cpp +++ b/alignment/gotoh/gotoh.cpp @@ -852,9 +852,9 @@ void widen_gaps(string* seq) return ret; } - extern "C" void Init_gotoh() + extern "C" void Init_cfe_gotoh() { - VALUE gotoh = rb_define_module("Gotoh"); + VALUE gotoh = rb_define_module("CfeGotoh"); rb_define_module_function(gotoh, "align_it", (VALUE(*)(...))align_it, 4); rb_define_module_function(gotoh, "align_it_aa", (VALUE(*)(...))align_it_aa, 4); } diff --git a/ruby/Gemfile b/ruby/Gemfile new file mode 100644 index 0000000..db49378 --- /dev/null +++ b/ruby/Gemfile @@ -0,0 +1,9 @@ +source 'https://rubygems.org' +ruby '2.2.2' + +# To install all these gems, run "gem install bundler", then "bundle install". + +gem 'minitest-reporters' +gem 'simplecov', '0.17.1' +gem 'simplecov-lcov', '0.7.0' +gem 'simplecov-cobertura', '1.4.2' diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock new file mode 100644 index 0000000..ce0aa27 --- /dev/null +++ b/ruby/Gemfile.lock @@ -0,0 +1,39 @@ +GEM + remote: https://rubygems.org/ + specs: + ansi (1.5.0) + builder (3.3.0) + docile (1.3.5) + hashie (5.0.0) + json (2.5.1) + minitest (4.7.5) + minitest-reporters (0.14.24) + ansi + builder + minitest (>= 2.12, < 5.0) + powerbar + powerbar (2.0.1) + hashie (>= 1.1.0) + simplecov (0.17.1) + docile (~> 1.1) + json (>= 1.8, < 3) + simplecov-html (~> 0.10.0) + simplecov-cobertura (1.4.2) + simplecov (~> 0.8) + simplecov-html (0.10.2) + simplecov-lcov (0.7.0) + +PLATFORMS + ruby + +DEPENDENCIES + minitest-reporters + simplecov (= 0.17.1) + simplecov-cobertura (= 1.4.2) + simplecov-lcov (= 0.7.0) + +RUBY VERSION + ruby 2.2.2p95 + +BUNDLED WITH + 1.17.3 diff --git a/ruby/build_gem.bash b/ruby/build_gem.bash index d2792bf..e2d4c25 100644 --- a/ruby/build_gem.bash +++ b/ruby/build_gem.bash @@ -1,5 +1,5 @@ #! /usr/bin/env bash -cp ../alignment/gotoh/gotoh.cpp ext/gotoh/cfe_gotoh.cpp +cp ../alignment/gotoh/gotoh.cpp ext/cfe_gotoh/cfe_gotoh.cpp gem build cfe_gotoh.gemspec rm ext/cfe_gotoh/cfe_gotoh.cpp diff --git a/ruby/ext/cfe_gotoh/extconf.rb b/ruby/ext/cfe_gotoh/extconf.rb index 0fe49ba..18945cb 100644 --- a/ruby/ext/cfe_gotoh/extconf.rb +++ b/ruby/ext/cfe_gotoh/extconf.rb @@ -1,4 +1,4 @@ require "mkmf" create_header -create_makefile('gotoh/gotoh') +create_makefile('cfe_gotoh/cfe_gotoh') diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index b8d39a3..e3bb2d7 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -10,53 +10,53 @@ class Error < RuntimeError class GapMergeError < Error end - def _build_substitution_matrix - sub_matrix = Array.new(127) {Array.new(127) {-1.0} } - ['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| - sub_matrix[nuc.ord()][nuc.ord()] = 1.0 - sub_matrix[nuc.ord()]['X'.ord()]=sub_matrix['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N') - end - #bi-mixtures - sub_matrix['A'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['A'.ord()]=1.0 - sub_matrix['G'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['G'.ord()]=1.0 - sub_matrix['C'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['C'.ord()]=1.0 - sub_matrix['T'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['T'.ord()]=1.0 - sub_matrix['G'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['G'.ord()]=1.0 - sub_matrix['T'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['T'.ord()]=1.0 - sub_matrix['C'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['C'.ord()]=1.0 - sub_matrix['A'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['A'.ord()]=1.0 - sub_matrix['C'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['C'.ord()]=1.0 - sub_matrix['G'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['G'.ord()]=1.0 - sub_matrix['T'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['T'.ord()]=1.0 - sub_matrix['A'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['A'.ord()]=1.0 - #tri-mixtures - sub_matrix['C'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['C'.ord()]=1.0 - sub_matrix['G'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['G'.ord()]=1.0 - sub_matrix['T'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['T'.ord()]=1.0 - sub_matrix['A'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['A'.ord()]=1.0 - sub_matrix['G'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['G'.ord()]=1.0 - sub_matrix['T'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['T'.ord()]=1.0 - sub_matrix['A'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['A'.ord()]=1.0 - sub_matrix['C'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['C'.ord()]=1.0 - sub_matrix['T'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['T'.ord()]=1.0 - sub_matrix['A'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['A'.ord()]=1.0 - sub_matrix['C'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['C'.ord()]=1.0 - sub_matrix['G'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['G'.ord()]=1.0 - #other - sub_matrix['$'.ord()]['$'.ord()]=50.0 - sub_matrix['T'.ord()]['U'.ord()] = sub_matrix['U'.ord()]['T'.ord()] = 1.0 - sub_matrix['N'.ord()]['N'.ord()] = 0.0 - sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['X'.ord()]['-'.ord()]=3.0 - ['A','T','G','C'].each do |ch| - sub_matrix[ch.ord()]['*'.ord()]=sub_matrix['*'.ord()][ch.ord()]=1.0 - sub_matrix[ch.ord()]['&'.ord()]=sub_matrix['&'.ord()][ch.ord()]=0.7 - sub_matrix[ch.ord()]['$'.ord()]=sub_matrix['$'.ord()][ch.ord()]=0.0 - sub_matrix[ch.ord()]['.'.ord()]=sub_matrix['.'.ord()][ch.ord()]=-20.0 - sub_matrix[ch.ord()]['N'.ord()]=sub_matrix['N'.ord()][ch.ord()]=-3.0 - end + sub_matrix = Array.new(127) {Array.new(127) {-1.0} } + ['A','T','G','C','R','Y','K','M','B','D','H','V','S','W','N'].each do |nuc| + sub_matrix[nuc.ord()][nuc.ord()] = 1.0 + sub_matrix[nuc.ord()]['X'.ord()]=sub_matrix['X'.ord()][nuc.ord()]=-6.0 if(nuc !='N') + end + #bi-mixtures + sub_matrix['A'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['A'.ord()]=1.0 + sub_matrix['G'.ord()]['R'.ord()]=sub_matrix['R'.ord()]['G'.ord()]=1.0 + sub_matrix['C'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['C'.ord()]=1.0 + sub_matrix['T'.ord()]['Y'.ord()]=sub_matrix['Y'.ord()]['T'.ord()]=1.0 + sub_matrix['G'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['K'.ord()]=sub_matrix['K'.ord()]['T'.ord()]=1.0 + sub_matrix['C'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['C'.ord()]=1.0 + sub_matrix['A'.ord()]['M'.ord()]=sub_matrix['M'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['S'.ord()]=sub_matrix['S'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['W'.ord()]=sub_matrix['W'.ord()]['A'.ord()]=1.0 + #tri-mixtures + sub_matrix['C'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['B'.ord()]=sub_matrix['B'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['A'.ord()]=1.0 + sub_matrix['G'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['G'.ord()]=1.0 + sub_matrix['T'.ord()]['D'.ord()]=sub_matrix['D'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['C'.ord()]=1.0 + sub_matrix['T'.ord()]['H'.ord()]=sub_matrix['H'.ord()]['T'.ord()]=1.0 + sub_matrix['A'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['A'.ord()]=1.0 + sub_matrix['C'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['C'.ord()]=1.0 + sub_matrix['G'.ord()]['V'.ord()]=sub_matrix['V'.ord()]['G'.ord()]=1.0 + #other + sub_matrix['$'.ord()]['$'.ord()]=50.0 + sub_matrix['T'.ord()]['U'.ord()] = sub_matrix['U'.ord()]['T'.ord()] = 1.0 + sub_matrix['N'.ord()]['N'.ord()] = 0.0 + sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['X'.ord()]['-'.ord()]=3.0 + ['A','T','G','C'].each do |ch| + sub_matrix[ch.ord()]['*'.ord()]=sub_matrix['*'.ord()][ch.ord()]=1.0 + sub_matrix[ch.ord()]['&'.ord()]=sub_matrix['&'.ord()][ch.ord()]=0.7 + sub_matrix[ch.ord()]['$'.ord()]=sub_matrix['$'.ord()][ch.ord()]=0.0 + sub_matrix[ch.ord()]['.'.ord()]=sub_matrix['.'.ord()][ch.ord()]=-20.0 + sub_matrix[ch.ord()]['N'.ord()]=sub_matrix['N'.ord()][ch.ord()]=-3.0 end + sub_matrix.each {|column| column.freeze} + sub_matrix.freeze - NUCLEOTIDE_MATRIX = self._build_substitution_matrix().freeze + NUCLEOTIDE_MATRIX = sub_matrix def self.score_alignment(standard, query) sc = 0.0 @@ -66,7 +66,6 @@ def self.score_alignment(standard, query) return sc end - def self.make_gap_list(seq) list = [] cur_ins = nil @@ -173,10 +172,10 @@ def self.cluster_gaps(gaps, raise_errors=false) next if(gap.size() == 0) # we already ate this one if(gap.size() % 3 == 0) # this gap is fine! new_gap_list << gap - next + next end - gap2 = gaps[i + 1] + gap2 = gaps[i + 1] # note: these could be nil, which we test for below gap3 = gaps[i + 2] # Can I merge with the next gap? if (gap2 and (gap + gap2).size() % 3 == 0 and (gap2.first - gap.last) < 9) @@ -194,11 +193,11 @@ def self.cluster_gaps(gaps, raise_errors=false) ) # Place the gap around the middle of the three merging gaps. new_gap = ( - (gap2.first - gap.size()) .. gap2.first - 1.to_a() - + gap2 - + ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a() - ) - new_gap_list << new_gap + (gap2.first - gap.size()) .. gap2.first - 1.to_a() + + gap2 + + ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a() + ) + new_gap_list << new_gap gaps[i + 1] = [] gaps[i + 2] = [] @@ -214,6 +213,65 @@ def self.cluster_gaps(gaps, raise_errors=false) return new_gap_list end + def self.align_gaps_to_frame(gaps, common_gap_locations=nil) + # Align gaps to codon boundaries, giving preference to common + # gap locations if specified. + + offset = 0 # offset created by previous gaps. + gaps.each do |gap| + # See if this gap is close to a common gap location (within 3 amino acids). + if (!common_gap_locations.nil?) + closest_common = common_gap_locations.min() do |a,b| + (3 * a - (gap[0] - offset)).abs() <=> (3 * b - (gap[0] - offset)).abs() + end + if(closest_common != nil and (3 * closest_common - (gap[0] - offset)).abs() <= 9) + # Align the gap to this position. + new_gap = [] + 0.upto(gap.size() - 1) do |i| + new_gap << 3 * closest_common + i + offset + end + gap.replace(new_gap) + end + end + + # Align the gap to the nearest appropriate frame. + # Original comment from Conan: scoring would be good here + if(gap[0] % 3 == 1) # set back one base + new_gap = [] + gap.each do |i| + new_gap << i - 1 + end + gap.replace(new_gap) + elsif(gap[0] % 3 == 2) # set forward one base + new_gap = [] + gap.each do |i| + new_gap << i + 1 + end + gap.replace(new_gap) + end + + offset += gap.size() + end + return gaps + end + + def self.splice_gaps_into_sequence(seq, gaps) + # Place the specified gaps into the sequence. Note that the + # gaps are specified by their positions in an *aligned* sequence, + # and as such include "offsets" introduced by gaps placed earlier + # in the sequence. The gaps must be in left-to-right order. + seq = seq.gsub('-','') + gaps.each do |gap| + gap.each do |i| + if(i > seq.size()) + seq.insert(-1, '-') + else + seq.insert(i, '-') + end + end + end + return seq + end #common_insert_locations is based on amino acid locations starting at base 0. #Assumes standard in the first base. @@ -223,11 +281,14 @@ def self.frame_align( query, gap_init=3, gap_penalty=1, - common_insert_locations=[], + common_insert_locations=nil, trim=false, raise_errors=false, prealigned=false ) + if (common_insert_locations.nil?) + common_insert_locations = [] + end if(!prealigned) elem = align_it(standard, query, gap_init, gap_penalty) standard = elem[0] @@ -263,103 +324,39 @@ def self.frame_align( # Step 1: cluster the insertions. begin - new_ins_list = cluster_gaps(insert_list, raise_errors=raise_errors)) + new_ins_list = cluster_gaps(insert_list, raise_errors=raise_errors) rescue GapMergeError raise "Cannot frame align insert" if raise_errors end - #second step should be to frame align inserts (prioritizing to common_points) - offset = 0 #offset created by previous insertions. (IMPORTANT) - new_ins_list.each do |ins| - #see if its close to a common_insert(within 3 amino acids?) - min_common = common_insert_locations.min(){|a,b| ((a) * 3 - (ins[0] - offset)).abs() <=> ((b) * 3 - ins[0]).abs()} - if(min_common != nil and ((min_common ) * 3 - (ins[0] - offset)).abs() <= 9) - #Cool, align to this common insert - new_ins = [] - 0.upto(ins.size() - 1) do |i| - new_ins << ((min_common) * 3) + i + offset - end - ins.replace(new_ins) - end - - #B frame align - #scoring would be good here - if(ins[0] % 3 == 1) #set back one base - new_ins = [] - ins.each do |i| - new_ins << i - 1 - end - ins.replace(new_ins) - elsif(ins[0] % 3 == 2) #Set forward one base. - new_ins = [] - ins.each do |i| - new_ins << i + 1 - end - ins.replace(new_ins) - end - - offset += ins.size() - end + # Step 2: frame-align the insertions, shifting things to common insertion + # positions where appropriate. + align_gaps_to_frame(new_ins_list, common_gap_locations=common_insert_locations) - #make the actual modifications - #begin - #orige = "" + standard - standard = standard.gsub('-','') - new_ins_list.each do |ins| - ins.each do |i| - if(i > standard.size()) - standard.insert(-1, '-') - else - standard.insert(i, '-') - end - end - end + # Put the insertions back into the standard. + standard = splice_gaps_into_sequence(standard, new_ins_list) end - #Deletion--------------------------------------------------------------------------------------------------- - outta_frame = false - if(delete_list.size() > 0)#Deletions second + # Process the deletions. + if(delete_list.size() > 0) new_del_list = [] - next_del = nil - # As above, step 1 is to cluster the deletions. Note that this behaviour - # differs from how we handle the insertions! - new_del_list = cluster_gaps(delete_list, raise_errors=false) - - #second step should be to frame align deletes - offset = 0 #offset created by previous deletions. (IMPORTANT) - new_del_list.each do |del| - next if(del.size() % 3 != 0) - #frame align - #scoring would be good here - if(del[0] % 3 == 1) #set back one base - new_del = [] - del.each do |i| - new_del << i - 1 - end - del.replace(new_del) - elsif(del[0] % 3 == 2) #Set forward one base. - new_del = [] - del.each do |i| - new_del << i + 1 - end - del.replace(new_del) - end - - offset += del.size() + # As above, step 1 is to cluster the deletions. + # FIXME note that the original code behaved differently between + # insertions and deletions; confirm that this is the right + # way forward. + begin + new_del_list = cluster_gaps(delete_list, raise_errors=raise_errors) + rescue GapMergeError + raise "Cannot frame align deletion" if raise_errors end - #make the actual modifications - query = query.gsub('-','') - new_del_list.each do |del| - del.each do |i| - if(i > query.size() ) - query.insert(query.size(), '-') - else - query.insert(i, '-') - end - end - end + # Again as above, frame-align the deletions; this time + # we don't worry about any common deletion positions. + align_gaps_to_frame(new_del_list) + + # Put the deletions back into the query. + query = splice_gaps_into_sequence(query, new_del_list) end return [standard, query] From 48243b648fde186b1e349aec5c23d2d93fc4a92b Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Fri, 1 Nov 2024 15:07:35 -0700 Subject: [PATCH 09/31] Added a stub test file for filling out. --- ruby/test/cfe_gotoh_test.rb | 6 ++++++ ruby/test/test_helper.rb | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 ruby/test/cfe_gotoh_test.rb create mode 100644 ruby/test/test_helper.rb diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb new file mode 100644 index 0000000..60ee3a2 --- /dev/null +++ b/ruby/test/cfe_gotoh_test.rb @@ -0,0 +1,6 @@ +require_relative 'test_helper' +require_relative '../lib/cfe_gotoh' + + +class CfeGotohTest < Minitest::Test +end diff --git a/ruby/test/test_helper.rb b/ruby/test/test_helper.rb new file mode 100644 index 0000000..024b1b8 --- /dev/null +++ b/ruby/test/test_helper.rb @@ -0,0 +1,29 @@ +require 'simplecov' +require 'simplecov-cobertura' + + +SimpleCov.start do + formatter SimpleCov::Formatter::MultiFormatter.new([ + SimpleCov::Formatter::HTMLFormatter, # Add HTML report for viewing/review. + SimpleCov::Formatter::CoberturaFormatter # For CI + ]) +end + +require 'minitest/autorun' +require 'minitest/reporters' + +if ENV['CI_COMMIT_TAG'] + run_id = ENV['CI_COMMIT_TAG'] +elsif ENV["CI"] + run_id = "#{ENV['CI_PIPELINE_ID']}-#{ENV['CI_COMMIT_SHORT_SHA']}" +else + run_id = "LOCALBUILD" +end + +Minitest::Reporters.use! [ + Minitest::Reporters::SpecReporter.new, + Minitest::Reporters::JUnitReporter.new, + Minitest::Reporters::HtmlReporter.new( + :title => "hivdb_algorithm Test Report #{run_id}", + :erb_template => File.join(File.dirname(__FILE__), "templates/index.html.erb")) +] From 7456478d7ed28d221a0417fd3cbbfc7c8ad0e18b Mon Sep 17 00:00:00 2001 From: rhliang Date: Fri, 1 Nov 2024 21:23:13 -0700 Subject: [PATCH 10/31] Starting to write some of the tests. --- ruby/test/cfe_gotoh_test.rb | 130 ++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 60ee3a2..0919916 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -4,3 +4,133 @@ class CfeGotohTest < Minitest::Test end + + +class ScoreAlignmentTest < CfeGotohTest + REGULAR_BASES = ['A', 'C', 'G', 'T'].freeze + MIXTURES = { + 'R': ['A', 'G'], + 'Y': ['C', 'T'], + 'S': ['G', 'C'], + 'W': ['A', 'T'], + 'K': ['G', 'T'], + 'M': ['A', 'C'], + 'B': ['C', 'G', 'T'], + 'D': ['A', 'G', 'T'], + 'H': ['A', 'C', 'T'], + 'V': ['A', 'C', 'G'], + 'N': ['A', 'C', 'G', 'T'] + }.freeze + + def score_alignment_symmetric_test(expected, base1, base2) + assert_equal expected, CfeGotoh.score_alignment(base1, base2) + assert_equal expected, CfeGotoh.score_alignment(base2, base1) + end + + def test_regular_base_scores + REGULAR_BASES.each do |std_base| + REGULAR_BASES.each do |query_base| + expected = -1.0 + if (std_base == query_base) + expected = 1.0 + end + assert_equal expected, CfeGotoh.score_alignment(std_base, query_base) + end + end + end + + def test_mixture_scores + MIXTURES.each do |std_base, _| + MIXTURES.each do |query_base, __| + expected = -1.0 + if (std_base == query_base) + expected = 1.0 + end + assert_equal expected, CfeGotoh.score_alignment(std_base, query_base) + end + end + end + + def test_mixture_with_regular_scores + REGULAR_BASES.each do |base1| + MIXTURES.each do |base2, possible_bases| + expected = -1.0 + if (base2 == 'N') + expected = -3.0 + elsif (possible_bases.include?(base1)) + expected = 1.0 + end + score_alignment_symmetric_test(expected, base1, base2) + end + end + end + + def test_score_x + (REGULAR_BASES + MIXTURES).each do |base| + expected = -6.0 + if (base == 'N') + expected = -1.0 + end + score_alignment_symmetric_test(expected, base, 'X') + end + end + + def test_special_characters + assert_equal 50.0, CfeGotoh.score_alignment('$', '$') + score_alignment_symmetric_test(50.0, '$', '$') + score_alignment_symmetric_test(1.0, 'T', 'U') + assert_equal 0.0, CfeGotoh.score_alignment('N', 'N') + score_alignment_symmetric_test(3.0, 'X', '-') + REGULAR_BASES.each do |base| + score_alignment_symmetric_test(1.0, base, '*') + score_alignment_symmetric_test(0.7, base, '&') + score_alignment_symmetric_test(0.0, base, '$') + score_alignment_symmetric_test(-20.0, base, '.') + end + end + + def test_multiple_bases + seq1 = 'AGACTCTVC---' + seq2 = 'CGANNCTGCXXX' + expected = -1.0 + 2.0 - 3.0 - 3.0 + 4.0 + 9.0 + assert_equal expected, CfeGotoh.score_alignment(seq1, seq2) + end + + MAKE_GAP_LIST_TEST_CASES = [ + { + name: 'empty_sequence', + seq: '', + expected: [] + }, + { + name: 'no_gaps', + seq: 'ACAGAT', + expected: [] + }, + { + name: 'gap_in_middle', + seq: 'ACA--GATC', + expected: [[3, 4]] + }, + { + name: 'gap_at_start', + seq: '--ACAGATCC', + expected: [[0, 1]] + }, + { + name: 'gap_at_end', + seq: 'ACAGATCC----', + expected: [[8, 9, 10, 11]] + }, + { + name: 'multiple_gaps_in_middle', + seq: 'ACA--GA-TC', + expected: [[3, 4], [7]] + }, + { + name: 'multiple_gaps_throughout', + seq: '-ACA--GA-TC----', + expected: [[0], [4, 5], [8], [11, 12, 13, 14]] + } + ] +end From f9253b611073f3623b1cc402512cea855880dc9d Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 6 Nov 2024 15:09:31 -0800 Subject: [PATCH 11/31] WIP: continuing to add tests. --- ruby/lib/cfe_gotoh.rb | 3 +- ruby/rakefile | 11 + ruby/test/cfe_gotoh_test.rb | 666 ++++++++++++++++++++++++++++++++++++ 3 files changed, 679 insertions(+), 1 deletion(-) create mode 100644 ruby/rakefile diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index e3bb2d7..29dffe6 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -216,12 +216,13 @@ def self.cluster_gaps(gaps, raise_errors=false) def self.align_gaps_to_frame(gaps, common_gap_locations=nil) # Align gaps to codon boundaries, giving preference to common # gap locations if specified. + # Gaps must be listed in ascending order, i.e. from left to right. offset = 0 # offset created by previous gaps. gaps.each do |gap| # See if this gap is close to a common gap location (within 3 amino acids). if (!common_gap_locations.nil?) - closest_common = common_gap_locations.min() do |a,b| + closest_common = common_gap_locations.min() do |a, b| (3 * a - (gap[0] - offset)).abs() <=> (3 * b - (gap[0] - offset)).abs() end if(closest_common != nil and (3 * closest_common - (gap[0] - offset)).abs() <= 9) diff --git a/ruby/rakefile b/ruby/rakefile new file mode 100644 index 0000000..efe4375 --- /dev/null +++ b/ruby/rakefile @@ -0,0 +1,11 @@ +require 'rake' +require 'rake/testtask' + +task :default => :test + +base_dir = File.expand_path(".") + +Rake::TestTask.new do |t| + t.libs = [base_dir] + t.pattern = "test/**/*_test.rb" +end diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 0919916..3c30dc8 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -95,7 +95,10 @@ def test_multiple_bases expected = -1.0 + 2.0 - 3.0 - 3.0 + 4.0 + 9.0 assert_equal expected, CfeGotoh.score_alignment(seq1, seq2) end +end + +class MakeGapListTest < CfeGotohTest MAKE_GAP_LIST_TEST_CASES = [ { name: 'empty_sequence', @@ -133,4 +136,667 @@ def test_multiple_bases expected: [[0], [4, 5], [8], [11, 12, 13, 14]] } ] + + MAKE_GAP_LIST_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + assert_equal test_entry[:expected], CfeGotoh.make_gap_list(test_entry[:seq]) + end + end +end + + +class TrimLeadingDashesTest < CfeGotohTest + TRIM_LEADING_DASHES_TEST_CASES = [ + { + name: 'no_leading_dashes', + std: 'ACAGAT', + query: 'ACACAT', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'one_leading_dash' + std: '-ACAGAT', + query: 'GACACAT', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'several_leading_dashes' + std: '----ACAGAT', + query: 'GGGGACACAT', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'no_leading_dashes_other_dashes_ignored' + std: 'ACA---CATGAT-', + query: 'ACAGGG---CATC', + expected_std: 'ACA---CATGAT-', + expected_query: 'ACAGGG---CATC' + }, + { + name: 'one_leading_dash_other_dashes_ignored' + std: '-ACA---CATGAT-', + query: '-ACAGGG---CATC', + expected_std: 'ACA---CATGAT-', + expected_query: 'ACAGGG---CATC' + }, + { + name: 'several_leading_dashes_other_dashes_ignored' + std: '----ACA---CATGAT-', + query: 'GGGGACAGGG---CATC', + expected_std: 'ACA---CATGAT-', + expected_query: 'ACAGGG---CATC' + } + ] + + TRIM_LEADING_DASHES_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + std = test_entry[:std] + query = test_entry[:query] + CfeGotoh.trim_leading_dashes(std, query) + assert_equal test_entry[:expected_std], std + assert_equal test_entry[:expected_query], query + end + end +end + + +class TrimTrailingDashesTest < CfeGotohTest + TRIM_TRAILING_DASHES_TEST_CASES = [ + { + name: 'no_trailing_dashes', + std: 'ACAGAT', + query: 'ACACAT', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'one_trailing_dash' + std: 'ACAGAT-', + query: 'ACACATG', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'several_trailing_dashes' + std: 'ACAGAT----', + query: 'ACACATGGGG', + expected_std: 'ACAGAT', + expected_query: 'ACACAT' + }, + { + name: 'no_trailing_dashes_other_dashes_ignored' + std: '-ACA---CATGAT', + query: 'CACAGGG---CAT', + expected_std: '-ACA---CATGAT', + expected_query: 'CACAGGG---CAT' + }, + { + name: 'one_trailing_dash_other_dashes_ignored' + std: '-ACA---CATGAT-', + query: 'CACAGGG---CATC', + expected_std: '-ACA---CATGAT', + expected_query: 'CACAGGG---CAT' + }, + { + name: 'several_trailing_dashes_other_dashes_ignored' + std: '-ACA---CATGAT----', + query: 'CACAGGG---CATCGGC', + expected_std: '-ACA---CATGAT', + expected_query: 'CACAGGG---CAT' + } + ] + + TRIM_TRAILING_DASHES_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + std = test_entry[:std] + query = test_entry[:query] + CfeGotoh.trim_trailing_dashes(std, query) + assert_equal test_entry[:expected_std], std + assert_equal test_entry[:expected_query], query + end + end +end + + +class FixIncompleteEdgeCodonTest < CfeGotohTest + FIX_INCOMPLETE_EDGE_CODON_TEST_CASES = [ + { + name: 'no_leading_dashes', + seq: 'ACTAGG', + expected: 'ACTAGG' + }, + { + name: 'one_complete_blank_codon_leading', + seq: '---ACTAGG', + expected: '---ACTAGG' + }, + { + name: 'several_complete_blank_codons_leading', + seq: '---------ACTAGG', + expected: '---------ACTAGG' + }, + { + name: 'no_leading_dashes_other_blanks_ignored', + seq: 'ACT---AGG---', + expected: 'ACT---AGG---' + }, + { + name: 'one_leading_dash', + seq: '-GGACTAGG', + expected: '---ACTAGG' + }, + { + name: 'two_leading_dashes', + seq: '--GACTAGG', + expected: '---ACTAGG' + }, + { + name: 'one_dash_plus_full_codon_leading', + seq: '----GGACTAGG', + expected: '------ACTAGG' + }, + { + name: 'two_dashes_plus_full_codons_leading', + seq: '--------GACTAGG', + expected: '---------ACTAGG' + }, + { + name: 'leading_dashes_other_blanks_ignored', + seq: '-----GACT---AGG---', + expected: '------ACT---AGG---' + }, + { + name: 'no_trailing_dashes', + seq: 'ACTAGG', + expected: 'ACTAGG', + side: :trailing + }, + { + name: 'one_complete_blank_codon_trailing', + seq: 'ACTAGG---', + expected: 'ACTAGG---', + side: :trailing + }, + { + name: 'several_complete_blank_codons_trailing', + seq: 'ACTAGG---------', + expected: 'ACTAGG---------', + side: :trailing + }, + { + name: 'no_trailing_dashes_other_blanks_ignored', + seq: '---ACT---AGG', + expected: '---ACT---AGG', + side: :trailing + }, + { + name: 'one_trailing_dash', + seq: 'ACTAGGTT-', + expected: 'ACTAGG---', + side: :trailing + }, + { + name: 'two_trailing_dashes', + seq: 'ACTAGGT--', + expected: 'ACTAGG---', + side: :trailing + }, + { + name: 'one_trailing_dash_plus_full_codon', + seq: 'ACTAGGTT----', + expected: 'ACTAGG------', + side: :trailing + }, + { + name: 'two_trailing_dashes_plus_full_codons', + seq: 'ACTAGGT--------', + expected: 'ACTAGG---------', + side: :trailing + }, + { + name: 'trailing_dashes_other_blanks_ignored', + seq: '------ACT---AGGT-----', + expected: '------ACT---AGG------', + side: :trailing + }, + ] + + FIX_INCOMPLETE_EDGE_CODON_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + seq = test_entry[:seq] + side = test_entry[:side] + if (side.nil?) + side = :leading + end + CfeGotoh.fix_incomplete_edge_codon(seq, side) + assert_equal test_entry[:expected], seq + end + end +end + + +# class merge_insertions_and_deletions_to_fix_oof_sequences +class MergeInsertionsAndDeletionsToFixOofSequencesTest < CfeGotohTest + def test_standard_and_query_must_be_same_length + assert_raises RuntimeError do + CfeGotoh.merge_insertions_and_deletions_to_fix_oof_sequences('ACT', 'ACTACT') + end + end + + MERGE_INDELS_TEST_CASES = [ + { + name: 'no_need_for_merge', + std: 'ACTAAG', + query: 'ACTCAG', + expected_std: 'ACTAAG', + expected_query: 'ACTCAG' + }, + { + name: 'blanks_but_no_need_for_merge', + std: 'ACT---AGCTTT', + query: 'ACTAGC---TTC', + expected_std: 'ACT---AGCTTT', + expected_query: 'ACTAGC---TTC' + }, + { + name: 'bad_length_but_no_possible_merges', + std: 'ACTAAGC', + query: 'ACTCAGC', + expected_std: 'ACTAAGC', + expected_query: 'ACTCAGC' + }, + { + name: 'merge_one_base_ahead', + std: 'ACT-AAG', + query: 'ACTC-AG', + expected_std: 'ACTAAG', + expected_query: 'ACTCAG' + }, + { + name: 'merge_one_base_behind', + std: 'ACT-AAG', + query: 'AC-TCAG', + expected_std: 'ACTAAG', + expected_query: 'ACTCAG' + }, + { + name: 'merge_two_bases_ahead', + std: 'ACT-AAG', + query: 'ACTCA-G', + expected_std: 'ACTAAG', + expected_query: 'ACTCAG' + }, + { + name: 'merge_two_bases_behind', + std: 'ACT-AAG', + query: 'A-CTCAG', + expected_std: 'ACTAAG', + expected_query: 'ACTCAG' + }, + { + name: 'one_base_behind_preferred', + std: 'ACT-AAG', + query: 'AC-T-AG', + expected_std: 'ACTAAG', + expected_query: 'ACT-AG' + }, + { + name: 'one_base_ahead_preferred_over_two_bases_behind', + std: 'ACT-AAA', + query: 'A-CT-AA', + expected_std: 'ACTAAA', + expected_query: 'A-CTAA' + }, + { + name: 'two_bases_behind_preferred_over_two_bases_ahead', + std: 'ACT-AAA', + query: 'A-CTA-A', + expected_std: 'ACTAAA', + expected_query: 'ACTA-A' + }, + { + name: 'gaps_too_far_to_merge', + std: 'ACT-AAA', + query: 'ACTAAA-', + expected_std: 'ACT-AAA', + query: 'ACTAAA-' + }, + { + name: 'merges_stop_at_cogent_length', + std: 'ACT-AAAG-GG-CC', + query: 'AC-TAAAGGG--CC', + expected_std: 'ACTAAAGGG-CC', + expected_query: 'ACTAAAGGG-CC' + }, + { + name: 'impossible_merge_skipped_but_later_ones_happen', + std: 'ACT-AAAGGG-CC', + query: 'ACTAAA-GG-GCC', + expected_std: 'ACT-AAAGGGCC', + expected_query: 'ACTAAA-GGGCC' + } + ] + + MERGE_INDELS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + std = test_entry[:std] + query = test_entry[:query] + side = test_entry[:side] + CfeGotoh.merge_insertions_and_deletions_to_fix_oof_sequences(std, query) + assert_equal test_entry[:expected_std], std + assert_equal test_entry[:expected_query], query + end + end +end + + +class ClusterGapsTest < CfeGotohTest + CLUSTER_GAPS_TEST_CASES = [ + { + name: 'no_gaps', + gaps: [], + expected: [] + }, + { + name: 'empty_gap_causes_nothing', + gaps: [[]], + expected: [] + }, + { + name: 'good_size_gap', + gaps: [[3, 4, 5]], + expected: [[3, 4, 5]] + }, + { + name: 'bad_size_gap', + gaps: [[3, 4, 5, 6]], + expected: [[3, 4, 5, 6]] + } + { + name: 'merge_two_close_gaps_to_first', + gaps: [[10, 11], [13]], + expected: [[10, 11, 12]] + }, + { + name: 'merge_two_close_gaps_to_second', + gaps: [[7], [10, 11]], + expected: [[9, 10, 11]] + }, + { + name: 'two_gaps_can_merge_edge_case', + gaps: [[2, 3, 4, 5, 6], [14]], + expected: [[2, 3, 4, 5, 6, 7]] + }, + { + name: 'two_gaps_too_far_edge_case', + gaps: [[2, 3, 4, 5, 6], [15]], + expected: [[2, 3, 4, 5, 6], [15]] + }, + { + name: 'two_gaps_too_far', + gaps: [[2, 3, 4, 5, 6], [21]], + expected: [[2, 3, 4, 5, 6], [21]] + }, + { + name: 'three_close_gaps_merge', + gaps: [[8], [12, 13, 14, 15], [18, 19, 20, 21]], + expected: [[11, 12, 13, 14, 15, 16, 17, 18, 19]] + }, + { + name: 'three_gaps_merge_edge_case', + gaps: [[8], [12, 13, 14, 15], [19, 20, 21, 22]], + expected: [[11, 12, 13, 14, 15, 16, 17, 18, 19]] + }, + { + name: 'three_gaps_too_far_edge_case', + gaps: [[8], [12, 13, 14, 15], [20, 21, 22, 23]], + expected: [[11, 12, 13, 14, 15, 16, 17, 18, 19]] + }, + { + name: 'typical_case', + gaps: [[3, 4, 5], [8, 9], [13], [19, 20, 21, 22, 23, 24], [27], [32], [38, 39, 40, 41], [50, 51], [60], [70], [75, 76]], + expected: [[3, 4, 5], [8, 9, 10], [19, 20, 21, 22, 23, 24], [31, 32, 33, 34, 35, 36], [50, 51], [60], [74, 75, 76]]] + } + ] + + CLUSTER_GAPS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + assert_equal test_entry[:expected], CfeGotoh.cluster_gaps(test_entry[:gaps]) + end + end + + def test_bad_gap_causes_error + assert_raises CfeGotoh::GapMergeError do + CfeGotoh.cluster_gaps([[3, 4]]) + end + end + + def test_bad_gap_among_several_gaps_causes_error + assert_raises CfeGotoh::GapMergeError do + CfeGotoh.cluster_gaps([[3, 4, 5], [9, 10, 11, 12, 13, 14], [17]]) + end + end +end + + +class AlignGapsToFrameTest < CfeGotohTest + NO_COMMON_POSITIONS_TEST_CASES = [ + { + name: 'no_gaps', + gaps: [], + expected: [] + }, + { + name: 'already_in_frame', + gaps: [[6, 7, 8]], + expected: [[6, 7, 8]] + }, + { + name: 'shift_toward_beginning', + gaps: [[7, 8, 9]], + expected: [[6, 7, 8]] + }, + { + name: 'shift_toward_end', + gaps: [[5, 6, 7]], + expected: [[6, 7, 8]] + }, + { + name: 'two_in_frame', + gaps: [[6, 7, 8], [15, 16, 17]], + expected: [[6, 7, 8], [15, 16, 17]] + }, + { + name: 'two_needing_shifts', + gaps: [[5, 6, 7, 8, 9, 10], [16, 17, 18]], + expected: [[6, 7, 8, 9, 10, 11], [15, 16, 17]]] + }, + { + name: 'several_in_frame', + gaps: [[6, 7, 8], [15, 16, 17], [24, 25, 26]], + expected: [[6, 7, 8], [15, 16, 17], [24, 25, 26]] + }, + { + name: 'several_needing_shifts', + gaps: [[5, 6, 7, 8, 9, 10], [16, 17, 18], [31, 32, 33]], + expected: [[6, 7, 8, 9, 10, 11], [15, 16, 17], [30, 31, 32]] + } + ] + + NO_COMMON_POSITIONS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + assert_equal test_entry[:expected], CfeGotoh.align_gaps_to_frame(test_entry[:gaps]) + end + end + + WITH_COMMON_POSITIONS_TEST_CASES = [ + { + name: 'no_gaps', + gaps: [], + common: [7, 15], + expected: [] + }, + { + name: 'too_far_from_common', + gaps: [[3, 4, 5]], + common: [7], + expected: [[3, 4, 5]] + }, + { + name: 'too_far_from_common_but_needs_shift', + gaps: [[2, 3, 4]], + common: [7], + expected: [[3, 4, 5]] + }, + { + name: 'too_far_before_common_edge_case', + gaps: [[9, 10, 11]], + common: [7], + expected: [[9, 10, 11]] + }, + { + name: 'within_range_before_common_edge_case', + gaps: [[12, 13, 14]], + common: [7], + expected: [[21, 22, 23]] + }, + { + name: 'within_range_after_common_edge_case', + gaps: [[30, 31, 32]], + common: [7], + expected: [[21, 22, 23]] + }, + { + name: 'too_far_after_common_edge_case', + gaps: [[33, 34, 35]], + common: [7], + expected: [[21, 22, 23]] + }, + { + name: 'too_far_after_common', + gaps: [[45, 46, 47]], + common: [7], + expected: [[45, 46, 47]] + }, + { + name: 'too_far_after_common_but_needs_shift', + gaps: [[46, 47, 48]], + common: [7], + expected: [[45, 46, 47]] + }, + { + name: 'offset_from_first_is_factored_into_second', + # [57, 58, 59] is at codon 20 of the aligned sequence, which would be + # at position 54 of the "raw" sequence; this should be just in the + # "catchment area" of the common insertion at codon 15. + gaps: [[15, 16, 17], [57, 58, 59]], + common: [7, 15], + expected: [[21, 22, 23], [45, 46, 47]] + }, + { + name: 'offset_from_first_without_common_is_factored_into_second', + gaps: [[4, 5, 6], [57, 58, 59]], + common: [15], + expected: [[3, 4, 5], [45, 46, 47]] + }, + { + name: 'offset_from_first_without_shifting_is_factored_into_second', + gaps: [[3, 4, 5], [57, 58, 59]], + common: [15], + expected: [[3, 4, 5], [45, 46, 47]] + }, + { + name: 'offset_from_first_is_factored_into_second', + # Even though [36, 37, 38] is in the "catchment area" of the common + # insertion at codon 15, that's in the coordinates of the aligned + # sequence; when the offset is accounted for, it should not be shifted. + gaps: [[15, 16, 17], [36, 37, 38]], + common: [7, 15], + expected: [[21, 22, 23], [36, 37, 38]] + }, + { + name: 'offset_from_first_without_common_is_factored_into_second', + gaps: [[3, 4, 5], [36, 37, 38]], + common: [7, 15], + expected: [[3, 4, 5], [36, 37, 38]] + }, + { + name: 'offset_from_first_without_shifting_is_factored_into_second', + gaps: [[21, 22, 23], [36, 37, 38]], + common: [7, 15], + expected: [[21, 22, 23], [36, 37, 38]] + }, + { + name: 'offsets_taken_into_account', + gaps: [[3, 4, 5, 6, 7, 8], [36, 37, 38], [111, 112, 113]], + common: [15, 31], + expected: [[3, 4, 5, 6, 7, 8], [36, 37, 38], [93, 94, 95]] + }, + { + name: 'two_gaps_shifted_to_same_common_position', + gaps: [[14, 15, 16], [22, 23, 24]], + common: [7], + expected: [[21, 22, 23], [24, 25, 26]] + }, + { + name: 'typical_case', + gaps: [[3, 4, 5], [17, 18, 19, 20, 21, 22], [31, 32, 33], [36, 37, 38]], + common: [6, 11], + expected: [[3, 4, 5], [18, 19, 20, 21, 22, 23], [30, 31, 32], [45, 46, 47]] + } + ] + + WITH_COMMON_POSITIONS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + assert_equal( + test_entry[:expected], + CfeGotoh.align_gaps_to_frame(test_entry[:gaps], test_entry[:common]) + ) + end + end +end + + +class SpliceGapsIntoSequenceTest < CfeGotohTest + SPLICE_GAPS_TEST_CASES = [ + { + name: 'no_gaps', + seq: 'ACTAAG', + gaps: [], + expected: 'ACTAAG' + }, + { + name: 'no_sequence', + seq: '', + gaps: [], + expected: '' + }, + { + name: 'insert_at_beginning', + seq: '---ACTAAG', + gaps: [[0, 1, 2]], + expected: '---ACTAAG' + }, + { + name: 'insert_at_beginning_removed_from_elsewhere', + seq: 'ACT---AAG', + gaps: [[0, 1, 2]], + expected: '---ACTAAG' + }, + { + name: 'insert_at_end', + seq: 'ACTAAG---', + gaps: [[6, 7, 8]], + expected: 'ACTAAG---' + }, + { + name: 'insert_at_end_removed_from_elsewhere', + seq: 'ACT---AAG', + gaps: [[6, 7, 8]], + expected: 'ACTAAG---' + } + ] end From 6f2bd1bf7998eb6b8ca2725d30505935613b9e52 Mon Sep 17 00:00:00 2001 From: rhliang Date: Wed, 6 Nov 2024 20:07:15 -0800 Subject: [PATCH 12/31] WIP: more tests written. --- ruby/test/cfe_gotoh_test.rb | 126 ++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 3c30dc8..88696fa 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -797,6 +797,132 @@ class SpliceGapsIntoSequenceTest < CfeGotohTest seq: 'ACT---AAG', gaps: [[6, 7, 8]], expected: 'ACTAAG---' + }, + { + name: 'insert_in_middle_retained', + seq: 'ACT---AAG', + gaps: [[3, 4, 5]], + expected: 'ACT---AAG' + }, + { + name: 'insert_in_middle_corrected', + seq: 'AC--TA-AG', + gaps: [[3, 4, 5]], + expected: 'ACT---AAG' + }, + { + name: 'offsets_accounted_for', + seq: 'AACAT---GGG---G', + gaps: [[3, 4, 5], [9, 10, 11]], + expected: 'AAC---ATG---GGG' + }, + { + name: 'typical_case', + seq: '---AACAT---GGG---G------', + gaps: [[0, 1, 2], [3, 4, 5], [9, 10, 11], [15, 16, 17]], + expected: '---AAC---ATG---GGG---' } ] + + SPLICE_GAPS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + assert_equal( + test_entry[:expected], + CfeGotoh.splice_gaps_into_sequence(test_entry[:seq], test_entry[:gaps]) + ) + end + end +end + + +class FrameAlignTest < CfeGotohTest + def test_bad_inserted_bases_error + std = 'ACGTACGT-ACGT' + query = 'ACGTACGTAACGT' + assert_raises RuntimeError do + CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + end) + end + + def test_bad_inserted_bases_error + std = 'ACGTACGTAACGT' + query = 'ACGTACGT-ACGT' + assert_raises RuntimeError do + CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + end) + end + + def test_edges_are_trimmed + std = '------ACGTACGTACGT------' + query = '-------CGTACGTAC--------' + + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + assert_equal 'ACGTACGTACGT', result[0] + assert_equal '---TACGTA---', result[1] + end + + def test_indels_are_merged + std = 'ACGT-ACGTACGT' + query = 'ACGTAC-GTACGT' + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + expected = 'ACGTACGTACGT' + assert_equal expected, result[0] + assert_equal expected, result[1] + end + + def test_insertions_are_clustered + std = 'ACG--TA-CGTACGT' + query = 'ACGGTAAACGTACGT' + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + expected_std = 'ACG---TACGTACGT' + expected_query = 'ACGGTAAACGTACGT' + assert_equal expected_std, result[0] + assert_equal expected_query, result[1] + end + + def test_deletions_are_clustered + std = 'ACGGGTAACGTACGT' + query = 'ACG--T-ACGTACGT' + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + expected_std = 'ACGGTAAACGTACGT' + expected_query = 'ACG---TACGTACGT' + assert_equal expected_std, result[0] + assert_equal expected_query, result[1] + end + + def test_unmerged_inserts_raise_error + std = 'ACG--TACGTACGTAC-GT' + query = 'ACGGGTACGTACGTACCGT' + assert_raises RuntimeError do + CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + end + end + + def test_unmerged_deletions_raise_error + std = 'ACGGGTACGTACGTACCGT' + query = 'ACG--TACGTACGTAC-GT' + assert_raises RuntimeError do + CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + end + end + + def test_insertions_are_frame_aligned + std = 'ACGT---ACGTACGT' + query = 'ACGTTTTACGTACGT' + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + expected_std = 'ACG---TACGTACGT' + expected_query = 'ACGTTTTACGTACGT' + assert_equal expected_std, result[0] + assert_equal expected_query, result[1] + end + + def test_deletions_are_frame_aligned + std = 'ACGTTTTACGTACGT' + query = 'ACGT---ACGTACGT' + result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) + expected_std = 'ACGTTTTACGTACGT' + expected_query = 'ACG---TACGTACGT' + assert_equal expected_std, result[0] + assert_equal expected_query, result[1] + end end From 13356f15e3831d37734012aa4e9be27cbdbb2d1f Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Fri, 8 Nov 2024 19:44:56 -0800 Subject: [PATCH 13/31] WIP: fixing tests. The frame_align tests still don't work. --- ruby/Gemfile | 4 +- ruby/Gemfile.lock | 17 +- ruby/lib/cfe_gotoh.rb | 21 +-- ruby/rakefile | 4 +- ruby/test/cfe_gotoh_test.rb | 329 ++++++++++++++++++++++++------------ ruby/test/test_helper.rb | 5 +- 6 files changed, 249 insertions(+), 131 deletions(-) diff --git a/ruby/Gemfile b/ruby/Gemfile index db49378..5d71a3a 100644 --- a/ruby/Gemfile +++ b/ruby/Gemfile @@ -3,7 +3,7 @@ ruby '2.2.2' # To install all these gems, run "gem install bundler", then "bundle install". -gem 'minitest-reporters' +gem 'minitest', '5.15.0' +gem 'minitest-reporters', '1.7.1' gem 'simplecov', '0.17.1' -gem 'simplecov-lcov', '0.7.0' gem 'simplecov-cobertura', '1.4.2' diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock index ce0aa27..28a58a1 100644 --- a/ruby/Gemfile.lock +++ b/ruby/Gemfile.lock @@ -4,16 +4,14 @@ GEM ansi (1.5.0) builder (3.3.0) docile (1.3.5) - hashie (5.0.0) json (2.5.1) - minitest (4.7.5) - minitest-reporters (0.14.24) + minitest (5.15.0) + minitest-reporters (1.7.1) ansi builder - minitest (>= 2.12, < 5.0) - powerbar - powerbar (2.0.1) - hashie (>= 1.1.0) + minitest (>= 5.0) + ruby-progressbar + ruby-progressbar (1.13.0) simplecov (0.17.1) docile (~> 1.1) json (>= 1.8, < 3) @@ -21,16 +19,15 @@ GEM simplecov-cobertura (1.4.2) simplecov (~> 0.8) simplecov-html (0.10.2) - simplecov-lcov (0.7.0) PLATFORMS ruby DEPENDENCIES - minitest-reporters + minitest (= 5.15.0) + minitest-reporters (= 1.7.1) simplecov (= 0.17.1) simplecov-cobertura (= 1.4.2) - simplecov-lcov (= 0.7.0) RUBY VERSION ruby 2.2.2p95 diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 29dffe6..81b6934 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -1,6 +1,7 @@ #TODO: Scoring algorithm to improve frame_align? -require 'cfe_gotoh/cfe_gotoh' +# require 'cfe_gotoh/cfe_gotoh' +require_relative '../ext/cfe_gotoh/cfe_gotoh' module CfeGotoh @@ -45,7 +46,7 @@ class GapMergeError < Error sub_matrix['$'.ord()]['$'.ord()]=50.0 sub_matrix['T'.ord()]['U'.ord()] = sub_matrix['U'.ord()]['T'.ord()] = 1.0 sub_matrix['N'.ord()]['N'.ord()] = 0.0 - sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['X'.ord()]['-'.ord()]=3.0 + sub_matrix['X'.ord()]['-'.ord()]=sub_matrix['-'.ord()]['X'.ord()]=3.0 ['A','T','G','C'].each do |ch| sub_matrix[ch.ord()]['*'.ord()]=sub_matrix['*'.ord()][ch.ord()]=1.0 sub_matrix[ch.ord()]['&'.ord()]=sub_matrix['&'.ord()][ch.ord()]=0.7 @@ -118,18 +119,18 @@ def self.fix_incomplete_edge_codon(query, side=:leading) end if (query[edge_idx] == '-') - dashes = dash_regex.match(query)[0] # we know there will be a match + dashes = dash_regex.match(query)[1] # we know there will be a match # If the length of the dashes aren't a multiple of 3, turn some # of the query characters into dashes to force it to be a full # codon of dashes. if (dashes.size() % 3 >= 1) - first_non_dash_idx = 0 + first_non_dash_idx = dashes.size() if (side != :leading) first_non_dash_idx = query.size() - dashes.size() - 1 end query[first_non_dash_idx] = '-' - if (dashes.size() % 3 == 2) + if (dashes.size() % 3 == 1) query[first_non_dash_idx + incr] = '-' end end @@ -193,9 +194,9 @@ def self.cluster_gaps(gaps, raise_errors=false) ) # Place the gap around the middle of the three merging gaps. new_gap = ( - (gap2.first - gap.size()) .. gap2.first - 1.to_a() - + gap2 - + ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a() + ((gap2.first - gap.size()) .. gap2.first - 1).to_a() + + gap2 + + ((gap2.last + 1) .. (gap2.last + gap3.size())).to_a() ) new_gap_list << new_gap @@ -301,8 +302,8 @@ def self.frame_align( if (trim) trim_leading_dashes(standard, query) trim_trailing_dashes(standard, query) - fix_incomplete_edge_codon(standard, query, :leading) - fix_incomplete_edge_codon(standard, query, :trailing) + fix_incomplete_edge_codon(query, :leading) + fix_incomplete_edge_codon(query, :trailing) end merge_insertions_and_deletions_to_fix_oof_sequences(standard, query) diff --git a/ruby/rakefile b/ruby/rakefile index efe4375..8b32cf1 100644 --- a/ruby/rakefile +++ b/ruby/rakefile @@ -6,6 +6,6 @@ task :default => :test base_dir = File.expand_path(".") Rake::TestTask.new do |t| - t.libs = [base_dir] - t.pattern = "test/**/*_test.rb" + t.libs = [base_dir] + t.pattern = "test/**/*_test.rb" end diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 88696fa..07190c6 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -9,17 +9,17 @@ class CfeGotohTest < Minitest::Test class ScoreAlignmentTest < CfeGotohTest REGULAR_BASES = ['A', 'C', 'G', 'T'].freeze MIXTURES = { - 'R': ['A', 'G'], - 'Y': ['C', 'T'], - 'S': ['G', 'C'], - 'W': ['A', 'T'], - 'K': ['G', 'T'], - 'M': ['A', 'C'], - 'B': ['C', 'G', 'T'], - 'D': ['A', 'G', 'T'], - 'H': ['A', 'C', 'T'], - 'V': ['A', 'C', 'G'], - 'N': ['A', 'C', 'G', 'T'] + 'R' => ['A', 'G'], + 'Y' => ['C', 'T'], + 'S' => ['G', 'C'], + 'W' => ['A', 'T'], + 'K' => ['G', 'T'], + 'M' => ['A', 'C'], + 'B' => ['C', 'G', 'T'], + 'D' => ['A', 'G', 'T'], + 'H' => ['A', 'C', 'T'], + 'V' => ['A', 'C', 'G'], + 'N' => ['A', 'C', 'G', 'T'] }.freeze def score_alignment_symmetric_test(expected, base1, base2) @@ -34,7 +34,7 @@ def test_regular_base_scores if (std_base == query_base) expected = 1.0 end - assert_equal expected, CfeGotoh.score_alignment(std_base, query_base) + score_alignment_symmetric_test(expected, std_base, query_base) end end end @@ -45,8 +45,11 @@ def test_mixture_scores expected = -1.0 if (std_base == query_base) expected = 1.0 + if (std_base == 'N') + expected = 0.0 + end end - assert_equal expected, CfeGotoh.score_alignment(std_base, query_base) + score_alignment_symmetric_test(expected, std_base, query_base) end end end @@ -66,7 +69,7 @@ def test_mixture_with_regular_scores end def test_score_x - (REGULAR_BASES + MIXTURES).each do |base| + (REGULAR_BASES + MIXTURES.keys()).each do |base| expected = -6.0 if (base == 'N') expected = -1.0 @@ -155,35 +158,35 @@ class TrimLeadingDashesTest < CfeGotohTest expected_query: 'ACACAT' }, { - name: 'one_leading_dash' + name: 'one_leading_dash', std: '-ACAGAT', query: 'GACACAT', expected_std: 'ACAGAT', expected_query: 'ACACAT' }, { - name: 'several_leading_dashes' + name: 'several_leading_dashes', std: '----ACAGAT', query: 'GGGGACACAT', expected_std: 'ACAGAT', expected_query: 'ACACAT' }, { - name: 'no_leading_dashes_other_dashes_ignored' + name: 'no_leading_dashes_other_dashes_ignored', std: 'ACA---CATGAT-', query: 'ACAGGG---CATC', expected_std: 'ACA---CATGAT-', expected_query: 'ACAGGG---CATC' }, { - name: 'one_leading_dash_other_dashes_ignored' + name: 'one_leading_dash_other_dashes_ignored', std: '-ACA---CATGAT-', query: '-ACAGGG---CATC', expected_std: 'ACA---CATGAT-', expected_query: 'ACAGGG---CATC' }, { - name: 'several_leading_dashes_other_dashes_ignored' + name: 'several_leading_dashes_other_dashes_ignored', std: '----ACA---CATGAT-', query: 'GGGGACAGGG---CATC', expected_std: 'ACA---CATGAT-', @@ -213,35 +216,35 @@ class TrimTrailingDashesTest < CfeGotohTest expected_query: 'ACACAT' }, { - name: 'one_trailing_dash' + name: 'one_trailing_dash', std: 'ACAGAT-', query: 'ACACATG', expected_std: 'ACAGAT', expected_query: 'ACACAT' }, { - name: 'several_trailing_dashes' + name: 'several_trailing_dashes', std: 'ACAGAT----', query: 'ACACATGGGG', expected_std: 'ACAGAT', expected_query: 'ACACAT' }, { - name: 'no_trailing_dashes_other_dashes_ignored' + name: 'no_trailing_dashes_other_dashes_ignored', std: '-ACA---CATGAT', query: 'CACAGGG---CAT', expected_std: '-ACA---CATGAT', expected_query: 'CACAGGG---CAT' }, { - name: 'one_trailing_dash_other_dashes_ignored' + name: 'one_trailing_dash_other_dashes_ignored', std: '-ACA---CATGAT-', query: 'CACAGGG---CATC', expected_std: '-ACA---CATGAT', expected_query: 'CACAGGG---CAT' }, { - name: 'several_trailing_dashes_other_dashes_ignored' + name: 'several_trailing_dashes_other_dashes_ignored', std: '-ACA---CATGAT----', query: 'CACAGGG---CATCGGC', expected_std: '-ACA---CATGAT', @@ -361,7 +364,7 @@ class FixIncompleteEdgeCodonTest < CfeGotohTest seq: '------ACT---AGGT-----', expected: '------ACT---AGG------', side: :trailing - }, + } ] FIX_INCOMPLETE_EDGE_CODON_TEST_CASES.each do |test_entry| @@ -462,7 +465,7 @@ def test_standard_and_query_must_be_same_length std: 'ACT-AAA', query: 'ACTAAA-', expected_std: 'ACT-AAA', - query: 'ACTAAA-' + expected_query: 'ACTAAA-' }, { name: 'merges_stop_at_cogent_length', @@ -514,7 +517,7 @@ class ClusterGapsTest < CfeGotohTest name: 'bad_size_gap', gaps: [[3, 4, 5, 6]], expected: [[3, 4, 5, 6]] - } + }, { name: 'merge_two_close_gaps_to_first', gaps: [[10, 11], [13]], @@ -553,12 +556,12 @@ class ClusterGapsTest < CfeGotohTest { name: 'three_gaps_too_far_edge_case', gaps: [[8], [12, 13, 14, 15], [20, 21, 22, 23]], - expected: [[11, 12, 13, 14, 15, 16, 17, 18, 19]] + expected: [[8], [12, 13, 14, 15], [20, 21, 22, 23]] }, { name: 'typical_case', gaps: [[3, 4, 5], [8, 9], [13], [19, 20, 21, 22, 23, 24], [27], [32], [38, 39, 40, 41], [50, 51], [60], [70], [75, 76]], - expected: [[3, 4, 5], [8, 9, 10], [19, 20, 21, 22, 23, 24], [31, 32, 33, 34, 35, 36], [50, 51], [60], [74, 75, 76]]] + expected: [[3, 4, 5], [8, 9, 10], [19, 20, 21, 22, 23, 24], [31, 32, 33, 34, 35, 36], [50, 51], [60], [74, 75, 76]] } ] @@ -570,13 +573,13 @@ class ClusterGapsTest < CfeGotohTest def test_bad_gap_causes_error assert_raises CfeGotoh::GapMergeError do - CfeGotoh.cluster_gaps([[3, 4]]) + CfeGotoh.cluster_gaps([[3, 4]], raise_errors=true) end end def test_bad_gap_among_several_gaps_causes_error assert_raises CfeGotoh::GapMergeError do - CfeGotoh.cluster_gaps([[3, 4, 5], [9, 10, 11, 12, 13, 14], [17]]) + CfeGotoh.cluster_gaps([[3, 4, 5], [9, 10, 11, 12, 13, 14], [17]], raise_errors=true) end end end @@ -612,7 +615,7 @@ class AlignGapsToFrameTest < CfeGotohTest { name: 'two_needing_shifts', gaps: [[5, 6, 7, 8, 9, 10], [16, 17, 18]], - expected: [[6, 7, 8, 9, 10, 11], [15, 16, 17]]] + expected: [[6, 7, 8, 9, 10, 11], [15, 16, 17]] }, { name: 'several_in_frame', @@ -634,7 +637,7 @@ class AlignGapsToFrameTest < CfeGotohTest WITH_COMMON_POSITIONS_TEST_CASES = [ { - name: 'no_gaps', + name: 'no_gaps_with_common', gaps: [], common: [7, 15], expected: [] @@ -673,7 +676,7 @@ class AlignGapsToFrameTest < CfeGotohTest name: 'too_far_after_common_edge_case', gaps: [[33, 34, 35]], common: [7], - expected: [[21, 22, 23]] + expected: [[33, 34, 35]] }, { name: 'too_far_after_common', @@ -688,28 +691,28 @@ class AlignGapsToFrameTest < CfeGotohTest expected: [[45, 46, 47]] }, { - name: 'offset_from_first_is_factored_into_second', + name: 'offset_from_first_shifted_to_common_is_factored_into_second_shifted_to_common', # [57, 58, 59] is at codon 20 of the aligned sequence, which would be # at position 54 of the "raw" sequence; this should be just in the # "catchment area" of the common insertion at codon 15. gaps: [[15, 16, 17], [57, 58, 59]], common: [7, 15], - expected: [[21, 22, 23], [45, 46, 47]] + expected: [[21, 22, 23], [48, 49, 50]] }, { - name: 'offset_from_first_without_common_is_factored_into_second', + name: 'offset_from_first_not_shifted_to_common_is_factored_into_second_shifted_to_common', gaps: [[4, 5, 6], [57, 58, 59]], common: [15], - expected: [[3, 4, 5], [45, 46, 47]] + expected: [[3, 4, 5], [48, 49, 50]] }, { - name: 'offset_from_first_without_shifting_is_factored_into_second', + name: 'offset_from_first_without_shifting_is_factored_into_second_shifted_to_common', gaps: [[3, 4, 5], [57, 58, 59]], common: [15], - expected: [[3, 4, 5], [45, 46, 47]] + expected: [[3, 4, 5], [48, 49, 50]] }, { - name: 'offset_from_first_is_factored_into_second', + name: 'offset_from_first_shifted_to_common_is_factored_into_second', # Even though [36, 37, 38] is in the "catchment area" of the common # insertion at codon 15, that's in the coordinates of the aligned # sequence; when the offset is accounted for, it should not be shifted. @@ -718,22 +721,23 @@ class AlignGapsToFrameTest < CfeGotohTest expected: [[21, 22, 23], [36, 37, 38]] }, { - name: 'offset_from_first_without_common_is_factored_into_second', + name: 'offset_from_first_not_shifted_to_common_is_factored_into_second', gaps: [[3, 4, 5], [36, 37, 38]], common: [7, 15], expected: [[3, 4, 5], [36, 37, 38]] }, { - name: 'offset_from_first_without_shifting_is_factored_into_second', + name: 'offset_from_first_already_at_common_is_factored_into_second', gaps: [[21, 22, 23], [36, 37, 38]], common: [7, 15], expected: [[21, 22, 23], [36, 37, 38]] }, { - name: 'offsets_taken_into_account', + name: 'offsets_taken_into_account_in_shifting_to_common', gaps: [[3, 4, 5, 6, 7, 8], [36, 37, 38], [111, 112, 113]], common: [15, 31], - expected: [[3, 4, 5, 6, 7, 8], [36, 37, 38], [93, 94, 95]] + # [102, 103, 104] is codon 31 after the 9 base offset + expected: [[3, 4, 5, 6, 7, 8], [36, 37, 38], [102, 103, 104]] }, { name: 'two_gaps_shifted_to_same_common_position', @@ -743,9 +747,9 @@ class AlignGapsToFrameTest < CfeGotohTest }, { name: 'typical_case', - gaps: [[3, 4, 5], [17, 18, 19, 20, 21, 22], [31, 32, 33], [36, 37, 38]], - common: [6, 11], - expected: [[3, 4, 5], [18, 19, 20, 21, 22, 23], [30, 31, 32], [45, 46, 47]] + gaps: [[3, 4, 5], [17, 18, 19, 20, 21, 22], [40, 41, 42], [45, 46, 47]], + common: [6, 14], + expected: [[3, 4, 5], [21, 22, 23, 24, 25, 26], [39, 40, 41], [54, 55, 56]] } ] @@ -819,7 +823,7 @@ class SpliceGapsIntoSequenceTest < CfeGotohTest { name: 'typical_case', seq: '---AACAT---GGG---G------', - gaps: [[0, 1, 2], [3, 4, 5], [9, 10, 11], [15, 16, 17]], + gaps: [[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]], expected: '---AAC---ATG---GGG---' } ] @@ -841,7 +845,7 @@ def test_bad_inserted_bases_error query = 'ACGTACGTAACGT' assert_raises RuntimeError do CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - end) + end end def test_bad_inserted_bases_error @@ -849,45 +853,7 @@ def test_bad_inserted_bases_error query = 'ACGTACGT-ACGT' assert_raises RuntimeError do CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - end) - end - - def test_edges_are_trimmed - std = '------ACGTACGTACGT------' - query = '-------CGTACGTAC--------' - - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - assert_equal 'ACGTACGTACGT', result[0] - assert_equal '---TACGTA---', result[1] - end - - def test_indels_are_merged - std = 'ACGT-ACGTACGT' - query = 'ACGTAC-GTACGT' - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - expected = 'ACGTACGTACGT' - assert_equal expected, result[0] - assert_equal expected, result[1] - end - - def test_insertions_are_clustered - std = 'ACG--TA-CGTACGT' - query = 'ACGGTAAACGTACGT' - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - expected_std = 'ACG---TACGTACGT' - expected_query = 'ACGGTAAACGTACGT' - assert_equal expected_std, result[0] - assert_equal expected_query, result[1] - end - - def test_deletions_are_clustered - std = 'ACGGGTAACGTACGT' - query = 'ACG--T-ACGTACGT' - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - expected_std = 'ACGGTAAACGTACGT' - expected_query = 'ACG---TACGTACGT' - assert_equal expected_std, result[0] - assert_equal expected_query, result[1] + end end def test_unmerged_inserts_raise_error @@ -906,23 +872,176 @@ def test_unmerged_deletions_raise_error end end - def test_insertions_are_frame_aligned - std = 'ACGT---ACGTACGT' - query = 'ACGTTTTACGTACGT' - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - expected_std = 'ACG---TACGTACGT' - expected_query = 'ACGTTTTACGTACGT' - assert_equal expected_std, result[0] - assert_equal expected_query, result[1] + PREALIGN_TEST_CASES = [ + { + name: 'edges_are_trimmed', + std: '------ACGTACGTACGT------', + query: 'AAAAAA-CGTACGTAC--AAAAAA', + expected_std: 'ACGTACGTACGT', + expected_query: '---TACGTA---' + }, + { + name: 'edges_not_trimmed_when_not_specified', + std: '------ACGTACGTACGT------', + query: 'AAAAAAACGTACGTACGTAAAAAA', + trim: false, + expected_std: '------ACGTACGTACGT------', + expected_query: 'AAAAAAACGTACGTACGTAAAAAA' + }, + { + name: 'indels_are_merged', + std: 'ACGT-ACGTACGT', + query: 'ACGTAC-GTACGT', + expected_std: 'ACGTACGTACGT', + expected_query: 'ACGTACGTACGT' + }, + { + name: 'insertions_are_clustered', + std: 'ACG--TA-CGTACGT', + query: 'ACGGGTAACGTACGT', + expected_std: 'ACG---TACGTACGT', + expected_query: 'ACGGGTAACGTACGT' + }, + { + name: 'deletions_are_clustered', + std: 'ACGGGTAACGTACGT', + query: 'ACG--T-ACGTACGT', + expected_std: 'ACGGGTAACGTACGT', + expected_query: 'ACG---TACGTACGT' + }, + { + name: 'insertions_are_frame_aligned', + std: 'ACGT---ACGTACGT', + query: 'ACGTTTTACGTACGT', + expected_std: 'ACG---TACGTACGT', + expected_query: 'ACGTTTTACGTACGT' + }, + { + name: 'deletions_are_frame_aligned', + std: 'ACGTTTTACGTACGT', + query: 'ACGT---ACGTACGT', + expected_std: 'ACGTTTTACGTACGT', + expected_query: 'ACG---TACGTACGT' + }, + { + name: 'insertions_moved_to_common_positions', + std: 'ACGTTTTACGTACGT', + query: 'ACG---TACGTACGT', + common_insert_locations: [3], + expected_std: 'ACGTTTTACGTACGT', + expected_query: 'ACGTACGTA---CGT', + } + ] + + PREALIGN_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + trim = test_entry[:trim].nil? ? true : false + raise_errors = test_entry[:raise_errors].nil? ? true : false + result = CfeGotoh.frame_align( + test_entry[:std], + test_entry[:query], + 3, + 1, + test_entry[:common_insert_locations], + trim, + raise_errors, + true + ) + assert_equal(test_entry[:expected_std], result[0]) + assert_equal(test_entry[:expected_query], result[1]) + end end - def test_deletions_are_frame_aligned - std = 'ACGTTTTACGTACGT' - query = 'ACGT---ACGTACGT' - result = CfeGotoh.frame_align(std, query, 3, 1, nil, false, true, true) - expected_std = 'ACGTTTTACGTACGT' - expected_query = 'ACG---TACGTACGT' - assert_equal expected_std, result[0] - assert_equal expected_query, result[1] + ALIGNMENT_TEST_CASES = [ + { + name: 'edges_are_trimmed', + std: 'ACGTACGTACGT', + query: 'CCCCCCACGTACGTACCTAAAAAA', + expected_std: 'ACGTACGTACGT', + expected_query: 'ACGTACGTACGT' + }, + { + name: 'edges_not_trimmed_when_not_specified', + std: 'ACGTACGTACGT', + query: 'AAAAAAACGTACGTACGTAAAAAA', + trim: false, + expected_std: '------ACGTACGTACGT------', + expected_query: 'AAAAAAACGTACGTACGTAAAAAA' + }, + { + # std: ACGTGACGT-ACGT + # qry: ACGT-ACGTGACGT + name: 'indels_are_merged', + std: 'ACGTGACGTACGT', + query: 'ACGTACGTGACGT', + expected_std: 'ACGTACGTACGT', + expected_query: 'ACGTACGTACGT' + }, + { + # std: ACG--TA-CGTACGT + # qry: ACGCCTATCGTACGT + name: 'insertions_are_clustered', + std: 'ACGTACGTACGT', + query: 'ACGCCTATCGTACGT', + expected_std: 'ACG---TACGTACGT', + expected_query: 'ACGCCTATCGTACGT' + }, + { + # std: ACGCCTATCGTACGT + # qry: ACG--TA-CGTACGT + name: 'deletions_are_clustered', + std: 'ACGCCTAGCGTACGT', + query: 'ACGTACGTACGT', + expected_std: 'ACGCCTAGCGTACGT', + expected_query: 'ACG---TACGTACGT' + }, + { + # std: ACGT---ACGTACGT + # qry: ACGTCCCACGTACGT + name: 'insertions_are_frame_aligned', + std: 'ACGTACGTACGT', + query: 'ACGTCCCACGTACGT', + expected_std: 'ACG---TACGTACGT', + expected_query: 'ACGTCCCACGTACGT' + }, + { + # std: ACGTCCCACGTACGT + # qry: ACGT---ACGTACGT + name: 'deletions_are_frame_aligned', + std: 'ACGTCCCACGTACGT', + query: 'ACGTACGTACGT', + expected_std: 'ACGTCCCACGTACGT', + expected_query: 'ACG---TACGTACGT' + }, + { + # std: ACGTCCCACGTACGT + # qry: ACGT---ACGTACGT + name: 'insertions_moved_to_common_positions', + std: 'ACGTCCCACGTACGT', + query: 'ACGTACGTACGT', + common_insert_locations: [3], + expected_std: 'ACGTCCCACGTACGT', + expected_query: 'ACGTACGTA---CGT', + } + ] + + ALIGNMENT_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + trim = test_entry[:trim].nil? ? true : false + raise_errors = test_entry[:raise_errors].nil? ? true : false + result = CfeGotoh.frame_align( + test_entry[:std], + test_entry[:query], + 3, + 1, + test_entry[:common_insert_locations], + trim, + raise_errors, + false + ) + assert_equal(test_entry[:expected_std], result[0]) + assert_equal(test_entry[:expected_query], result[1]) + end end + end diff --git a/ruby/test/test_helper.rb b/ruby/test/test_helper.rb index 024b1b8..95e30ce 100644 --- a/ruby/test/test_helper.rb +++ b/ruby/test/test_helper.rb @@ -24,6 +24,7 @@ Minitest::Reporters::SpecReporter.new, Minitest::Reporters::JUnitReporter.new, Minitest::Reporters::HtmlReporter.new( - :title => "hivdb_algorithm Test Report #{run_id}", - :erb_template => File.join(File.dirname(__FILE__), "templates/index.html.erb")) + :title => "cfe_gotoh Test Report #{run_id}", + :erb_template => File.join(File.dirname(__FILE__), "templates/index.html.erb") + ) ] From 2ed9270042be271580fb2889fea597859a0ffd86 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 13 Nov 2024 11:30:30 -0800 Subject: [PATCH 14/31] First pass of testing is working. --- ruby/test/cfe_gotoh_test.rb | 128 ++++++++++++++++++----------- ruby/test/templates/index.html.erb | 89 ++++++++++++++++++++ 2 files changed, 171 insertions(+), 46 deletions(-) create mode 100644 ruby/test/templates/index.html.erb diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 07190c6..0667c08 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -381,7 +381,6 @@ class FixIncompleteEdgeCodonTest < CfeGotohTest end -# class merge_insertions_and_deletions_to_fix_oof_sequences class MergeInsertionsAndDeletionsToFixOofSequencesTest < CfeGotohTest def test_standard_and_query_must_be_same_length assert_raises RuntimeError do @@ -925,16 +924,32 @@ def test_unmerged_deletions_raise_error }, { name: 'insertions_moved_to_common_positions', - std: 'ACGTTTTACGTACGT', - query: 'ACG---TACGTACGT', + std: 'ACGT---ACGTACGT', + query: 'ACGTTTTACGTACGT', common_insert_locations: [3], - expected_std: 'ACGTTTTACGTACGT', - expected_query: 'ACGTACGTA---CGT', + expected_std: 'ACGTACGTA---CGT', + expected_query: 'ACGTTTTACGTACGT', + }, + { + # The changes that should be made here: + # * edges trimmed + # * the insertions after position 3 and 4 should be merged + # * the deletions at 6-7 and 9 should be merged + # * the insertion after 12 should be merged with the deletion at 13 + # * the insertion after 19 should be frame-aligned by setting back 1 base + # * the deletion at 24-26 should be frame-aligned by setting forward 1 base + # * the insertion after 36 should be shifted over to common insert location 11 + name: 'typical_case', + std: '------ACG--T-ACAAGCTA-TCGTACG---TACGCCCTACGTACGTA---CGT------', + query: 'AAAAAAACGAATCAC--G-TAG-CGTACGCCCTACG---TACGTACGTATTTCGTAAAAAA', + common_insert_locations: [11], + expected_std: 'ACG---TACAAGCTATCGTAC---GTACGCCCTACGTAC---GTACGT', + expected_query: 'ACGAATCAC---GTAGCGTACGCCCTACGT---ACGTACGTATTTCGT' } ] PREALIGN_TEST_CASES.each do |test_entry| - define_method("test_#{test_entry[:name]}") do + define_method("test_without_alignment_#{test_entry[:name]}") do trim = test_entry[:trim].nil? ? true : false raise_errors = test_entry[:raise_errors].nil? ? true : false result = CfeGotoh.frame_align( @@ -956,7 +971,7 @@ def test_unmerged_deletions_raise_error { name: 'edges_are_trimmed', std: 'ACGTACGTACGT', - query: 'CCCCCCACGTACGTACCTAAAAAA', + query: 'CCCCCCACGTACGTACGTAAAAAA', expected_std: 'ACGTACGTACGT', expected_query: 'ACGTACGTACGT' }, @@ -969,64 +984,86 @@ def test_unmerged_deletions_raise_error expected_query: 'AAAAAAACGTACGTACGTAAAAAA' }, { - # std: ACGTGACGT-ACGT - # qry: ACGT-ACGTGACGT + # X preferentially aligns to dashes. + # std: TCTAAACCXC-GGGTTT + # qry: TCTAAACC-CXGGGTTT name: 'indels_are_merged', - std: 'ACGTGACGTACGT', - query: 'ACGTACGTGACGT', - expected_std: 'ACGTACGTACGT', - expected_query: 'ACGTACGTACGT' + std: 'TCTAAACXCGGGTTT', + query: 'TCTAAACCXGGGTTT', + expected_std: 'TCTAAACXCGGGTTT', + expected_query: 'TCTAAACCXGGGTTT' }, { - # std: ACG--TA-CGTACGT - # qry: ACGCCTATCGTACGT + # std: AAACC--CGG-GTTT + # qry: AAACCTTCGGAGTTT name: 'insertions_are_clustered', - std: 'ACGTACGTACGT', - query: 'ACGCCTATCGTACGT', - expected_std: 'ACG---TACGTACGT', - expected_query: 'ACGCCTATCGTACGT' + std: 'AAACCCGGGTTT', + query: 'AAACCTTCGGAGTTT', + expected_std: 'AAACCC---GGGTTT', + expected_query: 'AAACCTTCGGAGTTT' }, { - # std: ACGCCTATCGTACGT - # qry: ACG--TA-CGTACGT + # std: AAACCTTCGGAGTTT + # qry: AAACC--CGG-GTTT name: 'deletions_are_clustered', - std: 'ACGCCTAGCGTACGT', - query: 'ACGTACGTACGT', - expected_std: 'ACGCCTAGCGTACGT', - expected_query: 'ACG---TACGTACGT' + std: 'AAACCTTCGGAGTTT', + query: 'AAACCCGGGTTT', + expected_std: 'AAACCTTCGGAGTTT', + expected_query: 'AAACCC---GGGTTT' }, { - # std: ACGT---ACGTACGT - # qry: ACGTCCCACGTACGT + # std: AAACC---CGGGTTT + # qry: AAACCAGACGGGTTT name: 'insertions_are_frame_aligned', - std: 'ACGTACGTACGT', - query: 'ACGTCCCACGTACGT', - expected_std: 'ACG---TACGTACGT', - expected_query: 'ACGTCCCACGTACGT' + std: 'AAACCCGGGTTT', + query: 'AAACCAGACGGGTTT', + expected_std: 'AAACCC---GGGTTT', + expected_query: 'AAACCAGACGGGTTT' }, { - # std: ACGTCCCACGTACGT - # qry: ACGT---ACGTACGT + # std: AAACCAGACGGGTTT + # qry: AAACC---CGGGTTT name: 'deletions_are_frame_aligned', - std: 'ACGTCCCACGTACGT', - query: 'ACGTACGTACGT', - expected_std: 'ACGTCCCACGTACGT', - expected_query: 'ACG---TACGTACGT' + std: 'AAACCAGACGGGTTT', + query: 'AAACCCGGGTTT', + expected_std: 'AAACCAGACGGGTTT', + expected_query: 'AAACCC---GGGTTT' }, { - # std: ACGTCCCACGTACGT - # qry: ACGT---ACGTACGT + # std: AAACC---CGGGTTT + # qry: AAACCAGACGGGTTT name: 'insertions_moved_to_common_positions', - std: 'ACGTCCCACGTACGT', - query: 'ACGTACGTACGT', + std: 'AAACCCGGGTTT', + query: 'AAACCAGACGGGTTT', common_insert_locations: [3], - expected_std: 'ACGTCCCACGTACGT', - expected_query: 'ACGTACGTA---CGT', - } + expected_std: 'AAACCCGGG---TTT', + expected_query: 'AAACCAGACGGGTTT', + }, + # Nov 13, 2024: having trouble cooking up a case that aligns the way I want it to. + # Skipping this for now as most of this logic is tested in the no-alignment + # tests anyway. + # { + # # The changes that should be made here: + # # * edges trimmed + # # * the insertions after position 3 and 4 should be merged + # # * the deletions at 14-15 and 17 should be merged + # # * the insertion after 12 should be merged with the deletion at 13 + # # * the insertion after 19 should be frame-aligned by setting back 1 base + # # * the deletion at 24-26 should be frame-aligned by setting forward 1 base + # # * the insertion after 36 should be shifted over to common insert location 11 + # # std: ---TCT--A-AACCC-XGGGXXTXTT---AACCCGGGTTTAAACCC---GGG--- + # # qry: XXXTCTXXAXAACCCX-GGG--T-TTGGGAACCCGG---TAAACCCAAAGGGAAA + # name: 'typical_case', + # std: 'TCTAAACCCXGGGXXTXTTAACCCGGGTTTAAACCCGGG', + # query: 'XXXTCTXXAXAACCCXGGG--T-TTGGGAACCCGGTAAACCCAAAGGGAAA', + # common_insert_locations: [11], + # expected_std: 'TCT---AAACCCXGGGXXTX---TTAACCCGGGTTTAAA---CCCGGG', + # expected_query: 'TCTXXAXAACCCXGG---GTTTGGGAACCCGGT---AAACCCAAAGGG' + # } ] ALIGNMENT_TEST_CASES.each do |test_entry| - define_method("test_#{test_entry[:name]}") do + define_method("test_with_alignment_#{test_entry[:name]}") do trim = test_entry[:trim].nil? ? true : false raise_errors = test_entry[:raise_errors].nil? ? true : false result = CfeGotoh.frame_align( @@ -1043,5 +1080,4 @@ def test_unmerged_deletions_raise_error assert_equal(test_entry[:expected_query], result[1]) end end - end diff --git a/ruby/test/templates/index.html.erb b/ruby/test/templates/index.html.erb new file mode 100644 index 0000000..79f32f9 --- /dev/null +++ b/ruby/test/templates/index.html.erb @@ -0,0 +1,89 @@ + + + + <%= title %> + + + + + + + +
+
+

Generated on <%= Time.now.strftime("%b %d, %Y at %H:%M %Z") %>

+

+ <%= title %>

+

+ CI Pipeline ID <%= ENV["CI_PIPELINE_ID"] %> from branch <%= ENV["CI_COMMIT_BRANCH"] %>@<%= ENV["CI_PROJECT_NAMESPACE"] %>/<%= ENV["CI_PROJECT_NAME"] %>
+ Git Commit Hash <%= ENV["CI_COMMIT_SHA"] %> (<%= ENV["CI_COMMIT_SHORT_SHA"] %>) +

+

+ Finished in <%= total_time_to_hms %>, <%= '%.2f tests/s' % (count / total_time) %>, <%= '%.2f assertions/s' % (assertions / total_time) %> +

+

+ + <%= '%d' % count %> tests, + <%= '%d' % assertions %> assertions, + <%= '%d' % failures %> failures, + <%= '%d' % errors %> errors, + <%= '%d' % skips %> skips + +

+ +
+
+ <%= '%d' % percent_passes %>% passed +
+
+ <%= '%d' % percent_errors_failures %>% failed +
+
+ <%= '%d' % percent_skipps %>% skipped +
+
+ + +
+ + <% suites.each do |suite| %> +
+
<%= suite[:name] %> + + <%= '%d' % suite[:test_count] %> tests, + <%= '%d' % suite[:assertion_count] %> assertions, + <%= '%d' % suite[:fail_count] %> failures, + <%= '%d' % suite[:error_count] %> errors, + <%= '%d' % suite[:skip_count] %> skips, + finished in <%= '%.4fs' % suite[:time] %> + +
+
+
+ <% suite[:tests].each do |test| %> +
+
+ <% if result(test) == :pass %> + + <% elsif result(test) == :skip %> + + <% else %> + + <% end %> + <%= friendly_name(test) %> + + Assertions <%= test.assertions %>, time <%= ('%.6fs' % test.time) %> + +
+ <% if !test.passed? %> +
<%= "#{location(test.failure)}\n\n#{test.failure.message}" %>
+ <% end %> +
+ <% end %> +
+
+
+ <% end %> +
+ + From 2d2ce9a16ef0c69f231297a3e03dbf2f80adbf26 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 13 Nov 2024 16:39:14 -0800 Subject: [PATCH 15/31] Changed the require statement to hopefully work for both the gem and the tests. --- ruby/lib/cfe_gotoh.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 81b6934..349e0ba 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -1,7 +1,6 @@ #TODO: Scoring algorithm to improve frame_align? -# require 'cfe_gotoh/cfe_gotoh' -require_relative '../ext/cfe_gotoh/cfe_gotoh' +require_relative 'cfe_gotoh/cfe_gotoh' module CfeGotoh From 358479eb2e361319f504a2dc41171323868a7f06 Mon Sep 17 00:00:00 2001 From: rhliang Date: Wed, 13 Nov 2024 17:11:07 -0800 Subject: [PATCH 16/31] WIP: first pass at updating the rakefile. No idea if it works yet. --- ruby/rakefile | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ruby/rakefile b/ruby/rakefile index 8b32cf1..8767445 100644 --- a/ruby/rakefile +++ b/ruby/rakefile @@ -1,11 +1,41 @@ require 'rake' require 'rake/testtask' +require 'rake/file_utils' +require 'fileutils' task :default => :test base_dir = File.expand_path(".") + +file 'lib/cfe_gotoh/cfe_gotoh.so' => ['ext/cfe_gotoh/cfe_gotoh.so'] do + FileUtils.copy('ext/cfe_gotoh/cfe_gotoh.so', 'lib/cfe_gotoh/cfe_gotoh.so') +end + + +file 'ext/cfe_gotoh/cfe_gotoh.so' => [ + 'ext/cfe_gotoh/Makefile', + 'ext/cfe_gotoh/cfe_gotoh.cpp' +] do + sh 'cd ext/cfe_gotoh && make' +end + + +file 'ext/cfe_gotoh/cfe_gotoh.cpp' => ['../alignment/gotoh/gotoh.cpp'] do + FileUtils.copy('../alignment/gotoh/gotoh.cpp', 'ext/cfe_gotoh/cfe_gotoh.cpp') +end + + +file 'ext/cfe_gotoh/Makefile' => [ + 'ext/cfe_gotoh/extconf.rb', + 'ext/cfe_gotoh/cfe_gotoh.cpp' +] do + ruby 'ext/cfe_gotoh/extconf.rb' +end + + Rake::TestTask.new do |t| + t.deps = ['lib/cfe_gotoh.so'] t.libs = [base_dir] t.pattern = "test/**/*_test.rb" end From 77da1d0e6ae6dcf0612d39294f34f9ca7ebd1580 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 13 Nov 2024 19:10:41 -0800 Subject: [PATCH 17/31] Cleaned up the rakefile so that the extension is built before tests run. --- .gitignore | 1 + ruby/.gitignore | 2 ++ ruby/ext/.gitignore | 1 + ruby/ext/cfe_gotoh/.gitignore | 6 ++++++ ruby/lib/.gitignore | 2 ++ ruby/rakefile | 7 +++++-- ruby/test/.gitignore | 2 ++ 7 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 ruby/ext/.gitignore create mode 100644 ruby/ext/cfe_gotoh/.gitignore create mode 100644 ruby/lib/.gitignore create mode 100644 ruby/test/.gitignore diff --git a/.gitignore b/.gitignore index 6431d81..82d8773 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ env/ __pycache__/ .pytest_cache/ +.DS_Store diff --git a/ruby/.gitignore b/ruby/.gitignore index c111b33..b79747d 100644 --- a/ruby/.gitignore +++ b/ruby/.gitignore @@ -1 +1,3 @@ *.gem +coverage/* +.DS_Store diff --git a/ruby/ext/.gitignore b/ruby/ext/.gitignore new file mode 100644 index 0000000..496ee2c --- /dev/null +++ b/ruby/ext/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/ruby/ext/cfe_gotoh/.gitignore b/ruby/ext/cfe_gotoh/.gitignore new file mode 100644 index 0000000..c16485e --- /dev/null +++ b/ruby/ext/cfe_gotoh/.gitignore @@ -0,0 +1,6 @@ +cfe_gotoh.cpp +cfe_gotoh.o +cfe_gotoh.so +extconf.h +Makefile +mkmf.log diff --git a/ruby/lib/.gitignore b/ruby/lib/.gitignore new file mode 100644 index 0000000..162836e --- /dev/null +++ b/ruby/lib/.gitignore @@ -0,0 +1,2 @@ +cfe_gotoh/* +.DS_Store diff --git a/ruby/rakefile b/ruby/rakefile index 8767445..62a6c8a 100644 --- a/ruby/rakefile +++ b/ruby/rakefile @@ -9,6 +9,7 @@ base_dir = File.expand_path(".") file 'lib/cfe_gotoh/cfe_gotoh.so' => ['ext/cfe_gotoh/cfe_gotoh.so'] do + FileUtils.mkdir_p('lib/cfe_gotoh') FileUtils.copy('ext/cfe_gotoh/cfe_gotoh.so', 'lib/cfe_gotoh/cfe_gotoh.so') end @@ -30,12 +31,14 @@ file 'ext/cfe_gotoh/Makefile' => [ 'ext/cfe_gotoh/extconf.rb', 'ext/cfe_gotoh/cfe_gotoh.cpp' ] do - ruby 'ext/cfe_gotoh/extconf.rb' + Dir.chdir('ext/cfe_gotoh') do + ruby 'extconf.rb' + end end Rake::TestTask.new do |t| - t.deps = ['lib/cfe_gotoh.so'] t.libs = [base_dir] t.pattern = "test/**/*_test.rb" end +task :test => ['lib/cfe_gotoh/cfe_gotoh.so'] \ No newline at end of file diff --git a/ruby/test/.gitignore b/ruby/test/.gitignore new file mode 100644 index 0000000..ced4113 --- /dev/null +++ b/ruby/test/.gitignore @@ -0,0 +1,2 @@ +html_reports/* +reports/* From 1a1342a5deb906c6d5af15090628b5a4ddc7b3b9 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 13 Nov 2024 19:12:09 -0800 Subject: [PATCH 18/31] WIP: copied the .gitlab-ci.yml file from the hivdb_algorithm repo. --- .gitlab-ci.yml | 124 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100755 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100755 index 0000000..6cbb6b7 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,124 @@ +variables: + UBUNTU_VERSION: + value: "20.04" + description: Version of ubuntu to test against + RUBY_VERSION: + value: "2.2.2" + description: Version of ruby to test against + PACKAGE_REGISTRY_URL: "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/generic/${CI_PROJECT_NAME}/${CI_COMMIT_TAG}" + +stages: + - test + - release + +# ----------------------------------------------------- +# Test Stage + +unit-tests: + image: + name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} + pull_policy: if-not-present + stage: test + script: + - cd $CI_PROJECT_DIR + - source /etc/profile.d/rvm.sh + - bundle install + - rake + - cp test/html_reports/index.html ./$(date +%Y%m%d)_${CI_PROJECT_NAME}_test_report.html + artifacts: + when: always + reports: + junit: $CI_PROJECT_DIR/test/reports/*.xml + coverage_report: + coverage_format: cobertura + path: $CI_PROJECT_DIR/coverage/coverage.xml + expire_in: 1 month + paths: + - coverage + - "*_${CI_PROJECT_NAME}_test_report.html" + coverage: /\((\d+\.\d+)\%\) covered\.$/ + rules: + - if: $CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + +# ----------------------------------------------------- +# Release Stage + +.release-rules: + stage: release + rules: + - if: $CI_COMMIT_TAG + variables: + GEMFILE_FILENAME: ${CI_PROJECT_NAME}-${CI_COMMIT_TAG}.gem + +build-gem: + image: + name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} + pull_policy: if-not-present + extends: .release-rules + variables: + HIVDB_ALGORITHM_VERSION: $CI_COMMIT_TAG # the gemspec will look for this in the environment + script: + - cd $CI_PROJECT_DIR + - source /etc/profile.d/rvm.sh + - bundle install + - gem build ${CI_PROJECT_NAME}.gemspec --output build/${GEMFILE_FILENAME} + artifacts: + paths: + - build/${GEMFILE_FILENAME} + +upload-gem: + image: + name: alpine:latest + pull_policy: if-not-present + needs: + - build-gem + extends: .release-rules + variables: + GIT_STRATEGY: none + script: + - apk --no-cache add curl + - 'curl + --fail + --header "JOB-TOKEN: $CI_JOB_TOKEN" + --upload-file build/${GEMFILE_FILENAME} + ${PACKAGE_REGISTRY_URL}/${GEMFILE_FILENAME}' + +push-to-rubygems-int: + image: + name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} + pull_policy: if-not-present + extends: .release-rules + variables: + GIT_STRATEGY: none + # this keyword specifies we will get the artifacts from the stage + needs: + - build-gem + - upload-gem + before_script: + - cd $CI_PROJECT_DIR + script: + - source /etc/profile.d/rvm.sh + # We don't specify a key here because we insert the $GEM_HOST_API_KEY via gitlab secrets + - gem push --host https://rubygems-int.bccfe.ca/private build/${GEMFILE_FILENAME} + environment: + name: release/${CI_COMMIT_TAG} + on_stop: yank-gem-from-rubygems-int + +yank-gem-from-rubygems-int: + image: + name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} + pull_policy: if-not-present + extends: .release-rules + variables: + GIT_STRATEGY: none + before_script: + - cd $CI_PROJECT_DIR + script: + - source /etc/profile.d/rvm.sh + # We don't specify a key here because we insert the $GEM_HOST_API_KEY via gitlab secrets + - gem yank ${CI_PROJECT_NAME} --version ${CI_COMMIT_TAG} + environment: + name: release/${CI_COMMIT_TAG} + action: stop + when: manual From a5b98410ce2e39f61c42ce20e1a7935a5a83a8cc Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 14 Nov 2024 17:22:56 -0800 Subject: [PATCH 19/31] WIP: added the CI configuration from the Phylowatch repo as well. --- .github/workflows/ci.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a61f2c9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: Automated Tests + +on: ["push"] + +jobs: + build: + runs-on: [self-hosted] + steps: + - uses: actions/checkout@v3 + with: + submodules: 'true' + - name: Setup pipenv + run: | + pip install --upgrade pip + python -m venv venv + ./venv/bin/pip install --upgrade pip + ./venv/bin/pip install pipenv + - name: Install dependencies + run: | + PIPENV_VENV_IN_PROJECT=1 ./venv/bin/pipenv sync -d + # - name: Dummy settings file + # run: cp phylowatch/settings_default.py phylowatch/settings.py + - name: Static checks + run: | + ./.venv/bin/flake8 phylowatch --show-source --statistics + ./.venv/bin/flake8 tests --show-source --statistics + continue-on-error: true + - name: Unit tests + run: ./.venv/bin/pytest --cov-config=.coveragerc --cov --cov-report=xml:coverage.xml --mpl tests + - name: Upload coverage reports to Codecov with GitHub Action + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + root_dir: ${{ github.workspace }} + files: ${{ github.workspace }}/coverage.xml From 584a1325f06808414e416180c106de755d4dcf2c Mon Sep 17 00:00:00 2001 From: rhliang Date: Fri, 15 Nov 2024 18:20:10 -0800 Subject: [PATCH 20/31] WIP: added some basic devcontainer config and a simple Dockerfile based on cfe_ubuntu. --- .devcontainer/Dockerfile | 28 ++++++++++++++++++++++++++++ .devcontainer/devcontainer.json | 15 +++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..12af011 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,28 @@ +ARG UBUNTU_VERSION=20.04 + +FROM ubuntu:${UBUNTU_VERSION} AS base + +ARG DEBIAN_FRONTEND=noninteractive + +ENV TZ=America/Vancouver \ + RUBY_VERSION=2.2.2 \ + BUNDLER_VERSION=1.17.3 + +RUN apt-get -y update &&\ + apt-get -y upgrade &&\ + apt-get install -y curl gpg + +RUN echo "deb http://security.ubuntu.com/ubuntu/ bionic-security main" >> /etc/apt/sources.list &&\ + echo "deb http://security.ubuntu.com/ubuntu/ bionic main" >> /etc/apt/sources.list &&\ + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 3B4FE6ACC0B21F32 &&\ + apt update &&\ + apt install -y \ + gawk=1:4.1.4+dfsg-1build1 + +RUN curl -sSL https://rvm.io/mpapis.asc | gpg --import - && \ + curl -sSL https://rvm.io/pkuczynski.asc | gpg --import - && \ + curl -sSL https://get.rvm.io | bash -s stable && \ + usermod -a -G rvm root + +RUN /bin/bash -l -c "source /etc/profile.d/rvm.sh && rvm pkg install openssl" +RUN /bin/bash -l -c "source /etc/profile.d/rvm.sh && rvm requirements && rvm install ${RUBY_VERSION} --with-openssl-dir=/usr/local/rvm/usr && rvm --default use ${RUBY_VERSION}" diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..49fa4fd --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,15 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu +{ + "name": "CfE Ubuntu Ruby", + + "build": { + "dockerfile": "Dockerfile" + } + + "features": { + "ghcr.io/devcontainers/features/ruby:1": { + "version": "2.2" + } + } +} From ef9b8b6f8dc70fa689d8e867c6d2612156e887a0 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Tue, 19 Nov 2024 16:54:23 -0800 Subject: [PATCH 21/31] First pass at CI configuration. --- .devcontainer/Dockerfile | 3 + .devcontainer/devcontainer.json | 8 +-- .github/workflows/ci.yml | 44 ++++++------ .gitlab-ci.yml | 124 -------------------------------- 4 files changed, 27 insertions(+), 152 deletions(-) delete mode 100755 .gitlab-ci.yml diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 12af011..18e5d4c 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -2,6 +2,9 @@ ARG UBUNTU_VERSION=20.04 FROM ubuntu:${UBUNTU_VERSION} AS base +LABEL org.opencontainers.image.source=https://github.com/cfe-lab/gotoh +LABEL org.opencontainers.image.description="Gotoh dev container for development and testing" + ARG DEBIAN_FRONTEND=noninteractive ENV TZ=America/Vancouver \ diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 49fa4fd..7e62f97 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,11 +5,11 @@ "build": { "dockerfile": "Dockerfile" - } + }, + + "updateContentCommand": "cd /workspaces/gotoh/ruby && bundle install", "features": { - "ghcr.io/devcontainers/features/ruby:1": { - "version": "2.2" - } + "ghcr.io/devcontainers/features/git:1": {} } } diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a61f2c9..4064492 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,33 +3,29 @@ name: Automated Tests on: ["push"] jobs: - build: - runs-on: [self-hosted] + test: + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Checkout code from repo + uses: actions/checkout@v3 + + - name: Log into GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build devcontainer and run tests + uses: devcontainers/ci@v0.3 with: - submodules: 'true' - - name: Setup pipenv - run: | - pip install --upgrade pip - python -m venv venv - ./venv/bin/pip install --upgrade pip - ./venv/bin/pip install pipenv - - name: Install dependencies - run: | - PIPENV_VENV_IN_PROJECT=1 ./venv/bin/pipenv sync -d - # - name: Dummy settings file - # run: cp phylowatch/settings_default.py phylowatch/settings.py - - name: Static checks - run: | - ./.venv/bin/flake8 phylowatch --show-source --statistics - ./.venv/bin/flake8 tests --show-source --statistics - continue-on-error: true - - name: Unit tests - run: ./.venv/bin/pytest --cov-config=.coveragerc --cov --cov-report=xml:coverage.xml --mpl tests + imageName: ghcr.io/cfe-lab/gotoh_devcontainer + cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer + runCmd: cd ${{ github.workspace }}/ruby && rake test + - name: Upload coverage reports to Codecov with GitHub Action - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} root_dir: ${{ github.workspace }} - files: ${{ github.workspace }}/coverage.xml + files: ${{ github.workspace }}/ruby/coverage/coverage.xml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100755 index 6cbb6b7..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,124 +0,0 @@ -variables: - UBUNTU_VERSION: - value: "20.04" - description: Version of ubuntu to test against - RUBY_VERSION: - value: "2.2.2" - description: Version of ruby to test against - PACKAGE_REGISTRY_URL: "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/generic/${CI_PROJECT_NAME}/${CI_COMMIT_TAG}" - -stages: - - test - - release - -# ----------------------------------------------------- -# Test Stage - -unit-tests: - image: - name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} - pull_policy: if-not-present - stage: test - script: - - cd $CI_PROJECT_DIR - - source /etc/profile.d/rvm.sh - - bundle install - - rake - - cp test/html_reports/index.html ./$(date +%Y%m%d)_${CI_PROJECT_NAME}_test_report.html - artifacts: - when: always - reports: - junit: $CI_PROJECT_DIR/test/reports/*.xml - coverage_report: - coverage_format: cobertura - path: $CI_PROJECT_DIR/coverage/coverage.xml - expire_in: 1 month - paths: - - coverage - - "*_${CI_PROJECT_NAME}_test_report.html" - coverage: /\((\d+\.\d+)\%\) covered\.$/ - rules: - - if: $CI_DEFAULT_BRANCH == $CI_COMMIT_BRANCH - - if: $CI_PIPELINE_SOURCE == "merge_request_event" - -# ----------------------------------------------------- -# Release Stage - -.release-rules: - stage: release - rules: - - if: $CI_COMMIT_TAG - variables: - GEMFILE_FILENAME: ${CI_PROJECT_NAME}-${CI_COMMIT_TAG}.gem - -build-gem: - image: - name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} - pull_policy: if-not-present - extends: .release-rules - variables: - HIVDB_ALGORITHM_VERSION: $CI_COMMIT_TAG # the gemspec will look for this in the environment - script: - - cd $CI_PROJECT_DIR - - source /etc/profile.d/rvm.sh - - bundle install - - gem build ${CI_PROJECT_NAME}.gemspec --output build/${GEMFILE_FILENAME} - artifacts: - paths: - - build/${GEMFILE_FILENAME} - -upload-gem: - image: - name: alpine:latest - pull_policy: if-not-present - needs: - - build-gem - extends: .release-rules - variables: - GIT_STRATEGY: none - script: - - apk --no-cache add curl - - 'curl - --fail - --header "JOB-TOKEN: $CI_JOB_TOKEN" - --upload-file build/${GEMFILE_FILENAME} - ${PACKAGE_REGISTRY_URL}/${GEMFILE_FILENAME}' - -push-to-rubygems-int: - image: - name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} - pull_policy: if-not-present - extends: .release-rules - variables: - GIT_STRATEGY: none - # this keyword specifies we will get the artifacts from the stage - needs: - - build-gem - - upload-gem - before_script: - - cd $CI_PROJECT_DIR - script: - - source /etc/profile.d/rvm.sh - # We don't specify a key here because we insert the $GEM_HOST_API_KEY via gitlab secrets - - gem push --host https://rubygems-int.bccfe.ca/private build/${GEMFILE_FILENAME} - environment: - name: release/${CI_COMMIT_TAG} - on_stop: yank-gem-from-rubygems-int - -yank-gem-from-rubygems-int: - image: - name: git-int.cfenet.ubc.ca:5005/cfe/cfe_ubuntu/ruby:${UBUNTU_VERSION} - pull_policy: if-not-present - extends: .release-rules - variables: - GIT_STRATEGY: none - before_script: - - cd $CI_PROJECT_DIR - script: - - source /etc/profile.d/rvm.sh - # We don't specify a key here because we insert the $GEM_HOST_API_KEY via gitlab secrets - - gem yank ${CI_PROJECT_NAME} --version ${CI_COMMIT_TAG} - environment: - name: release/${CI_COMMIT_TAG} - action: stop - when: manual From b8782fbc82f5c9ec7b4dfeef0ca7865b0c9c80b1 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Tue, 19 Nov 2024 17:08:58 -0800 Subject: [PATCH 22/31] Tweaking CI configuration. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4064492..6789679 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: with: imageName: ghcr.io/cfe-lab/gotoh_devcontainer cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer - runCmd: cd ${{ github.workspace }}/ruby && rake test + runCmd: cd /workspaces/gotoh/ruby && rake test - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v5 From 365fed5d9ff37e4280e365d3a9ec23ea49de300e Mon Sep 17 00:00:00 2001 From: rhliang Date: Wed, 20 Nov 2024 15:13:30 -0800 Subject: [PATCH 23/31] Added more tests for remove_inserts. These tests were not run on my workstation; they're also an experiment to see whether CI is working. --- ruby/lib/cfe_gotoh.rb | 4 +- ruby/test/cfe_gotoh_test.rb | 112 ++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/ruby/lib/cfe_gotoh.rb b/ruby/lib/cfe_gotoh.rb index 349e0ba..5229e7e 100644 --- a/ruby/lib/cfe_gotoh.rb +++ b/ruby/lib/cfe_gotoh.rb @@ -365,10 +365,10 @@ def self.frame_align( #Returns a [seq_sans_inserts, [list of inserts]] def self.remove_inserts(elem) - return remove_insertions_from_query(elem[1]) + return remove_insertions_from_query(elem[0], elem[1]) end - def self.remove_insertions_from_query(query) + def self.remove_insertions_from_query(standard, query) seq = '' + query inserts = [] diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 0667c08..c661522 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -1081,3 +1081,115 @@ def test_unmerged_deletions_raise_error end end end + + +class RemoveInsertsTest < CfeGotohTest + REMOVE_INSERTS_TEST_CASES = [ + { + name: 'no_insertions', + std: 'AAACCCGGGTTT', + query: 'AAACCCGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [] + }, + { + name: 'no_insertions_deletions_ignored', + std: 'AAACCCGGGTTT', + query: 'AAACCC---TTT', + expected_seq: 'AAACCC---TTT', + expected_inserts: [] + }, + { + name: 'single_base_insertion_in_middle', + std: 'AAACCC-GGGTTT', + query: 'AAACCCTGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[2, 'T']] + }, + { + name: 'single_base_insertion_in_middle_mid_codon', + std: 'AAACCCGG-GTTT', + query: 'AAACCCGGTGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[2, 'T']] + }, + { + name: 'single_base_insertion_at_beginning', + std: '-AAACCCGGGTTT', + query: 'TAAACCCGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[0, 'T']] + }, + { + name: 'single_base_insertion_at_end', + std: 'AAACCCGGGTTT-', + query: 'AAACCCGGGTTTA', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[4, 'A']] + }, + { + name: 'several_single_base_insertions', + std: '-AAACCC-GGGT-TT-', + query: 'TAAACCCAG-GTCTTA', + expected_seq: 'AAACCCG-GTTT', + expected_inserts: [[0, 'T'], [2, 'A'], [3, 'C'], [4, A]] + }, + { + name: 'multiple_base_insertion_in_middle', + std: 'AAACCC--GGGTTT', + query: 'AAACCCCAGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[2, 'CA']] + }, + { + name: 'multiple_base_insertion_in_middle_mid_codon', + std: 'AAACC---CGGGTTT', + query: 'AAACCAGTCGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[1, 'AGT']] + }, + { + name: 'multiple_base_insertion_at_beginning', + std: '---AAACCCGGGTTT', + query: 'CGTAAACCCGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[0, 'CGT']] + }, + { + name: 'multiple_base_insertion_at_end', + std: 'AAACCCGGGTTT----', + query: 'AAACCCGGGTTTACGT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[4, 'ACGT']] + }, + { + name: 'distinct_insertions_at_same_codon', + std: 'AAA---C-CCGGGTTT', + query: 'AAAGTGCTCCGGGTTT', + expected_seq: 'AAACCCGGGTTT', + expected_inserts: [[1, 'GTG'], [1, 'T']] + }, + { + name: 'typical_case', + std: '---AAACC-CGGG---TT-T----', + query: 'CGTAAACGGCG-GACGTTATACGT', + expected_seq: 'AAACGCG-GTTT', + expected_inserts: [[0, 'CGT'], [1, 'C'], [3, 'ACG'], [3, 'A'], [4, 'ACGT']] + } + ] + + REMOVE_INSERTS_TEST_CASES.each do |test_entry| + define_method("test_#{test_entry[:name]}") do + result = CfeGotoh.remove_insertions_from_query( + test_entry[:std], + test_entry[:query] + ) + assert_equal(test_entry[:expected_seq], result[0]) + assert_equal(test_entry[:expected_inserts], result[1]) + + wrapper_result = CfeGotoh.remove_inserts([test_entry[:std], test_entry[:query]]) + assert_equal(test_entry[:expected_seq], wrapper_result[0]) + assert_equal(test_entry[:expected_inserts], wrapper_result[1]) + end + end +end From 916aecde0d0396678fa59c3ca1a42fee5493a3c3 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Wed, 20 Nov 2024 15:18:47 -0800 Subject: [PATCH 24/31] Fixed a broken test. This time the tests *were* run on my workstation; the previous commit showed that the CI pipeline was working correctly. --- ruby/test/cfe_gotoh_test.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index c661522..91dc5d1 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -1132,7 +1132,7 @@ class RemoveInsertsTest < CfeGotohTest std: '-AAACCC-GGGT-TT-', query: 'TAAACCCAG-GTCTTA', expected_seq: 'AAACCCG-GTTT', - expected_inserts: [[0, 'T'], [2, 'A'], [3, 'C'], [4, A]] + expected_inserts: [[0, 'T'], [2, 'A'], [3, 'C'], [4, 'A']] }, { name: 'multiple_base_insertion_in_middle', @@ -1174,7 +1174,7 @@ class RemoveInsertsTest < CfeGotohTest std: '---AAACC-CGGG---TT-T----', query: 'CGTAAACGGCG-GACGTTATACGT', expected_seq: 'AAACGCG-GTTT', - expected_inserts: [[0, 'CGT'], [1, 'C'], [3, 'ACG'], [3, 'A'], [4, 'ACGT']] + expected_inserts: [[0, 'CGT'], [1, 'G'], [3, 'ACG'], [3, 'A'], [4, 'ACGT']] } ] From eec1c927141e8442b958686c5ea5e6f02fe50f4f Mon Sep 17 00:00:00 2001 From: rhliang Date: Wed, 20 Nov 2024 17:33:21 -0800 Subject: [PATCH 25/31] WIP: some more additions to the CI/CD configuration, and some fixes to the things David noticed in his review. --- .github/workflows/cd.yml | 28 ++++++++++++++++++++++++++++ .github/workflows/ci.yml | 7 +++++++ ruby/test/templates/index.html.erb | 4 ++-- ruby/test/test_helper.rb | 9 +++++---- 4 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/cd.yml diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..df3a9c2 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,28 @@ +name: Build the Ruby package + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout code from repo + uses: actions/checkout@v3 + + - name: Log into GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build devcontainer and build the package + uses: devcontainers/ci@v0.3 + with: + imageName: ghcr.io/cfe-lab/gotoh_devcontainer + cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer + runCmd: cd /workspaces/gotoh/ruby && bash build_gem.bash + env: | + CFE_GOTOH_VERSION=${{ github.ref_name }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6789679..16155fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,13 @@ jobs: imageName: ghcr.io/cfe-lab/gotoh_devcontainer cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer runCmd: cd /workspaces/gotoh/ruby && rake test + env: | + CI=1 + CI_COMMIT_REF=${{ github.ref_name }} + CI_COMMIT_SHA=${{ github.sha }} + CI_PIPELINE_ID=${{ github.run_id }} + CI_COMMIT_BRANCH=${{ github.ref_name }} + CI_PROJECT=${{ github.repository }} - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v5 diff --git a/ruby/test/templates/index.html.erb b/ruby/test/templates/index.html.erb index 79f32f9..114bb94 100644 --- a/ruby/test/templates/index.html.erb +++ b/ruby/test/templates/index.html.erb @@ -15,8 +15,8 @@

<%= title %>

- CI Pipeline ID <%= ENV["CI_PIPELINE_ID"] %> from branch <%= ENV["CI_COMMIT_BRANCH"] %>@<%= ENV["CI_PROJECT_NAMESPACE"] %>/<%= ENV["CI_PROJECT_NAME"] %>
- Git Commit Hash <%= ENV["CI_COMMIT_SHA"] %> (<%= ENV["CI_COMMIT_SHORT_SHA"] %>) + CI Pipeline ID <%= ENV["CI_PIPELINE_ID"] %> from branch <%= ENV["CI_COMMIT_BRANCH"] %>@<%= ENV["CI_PROJECT"] %>
+ Git Commit Hash <%= ENV["CI_COMMIT_SHA"] %>

Finished in <%= total_time_to_hms %>, <%= '%.2f tests/s' % (count / total_time) %>, <%= '%.2f assertions/s' % (assertions / total_time) %> diff --git a/ruby/test/test_helper.rb b/ruby/test/test_helper.rb index 95e30ce..a4b9a8b 100644 --- a/ruby/test/test_helper.rb +++ b/ruby/test/test_helper.rb @@ -12,10 +12,11 @@ require 'minitest/autorun' require 'minitest/reporters' -if ENV['CI_COMMIT_TAG'] - run_id = ENV['CI_COMMIT_TAG'] -elsif ENV["CI"] - run_id = "#{ENV['CI_PIPELINE_ID']}-#{ENV['CI_COMMIT_SHORT_SHA']}" +# if ENV['CI_COMMIT_REF'] +# run_id = ENV['CI_COMMIT_REF'] +# elsif ENV["CI"] +if ENV['CI_COMMIT_REF'] + run_id = "#{ENV['CI_COMMIT_REF']} #{ENV['CI_PIPELINE_ID']}-#{ENV['CI_COMMIT_SHA']}" else run_id = "LOCALBUILD" end From 0b5767b7c8665a4a60331f4fbe4e3b65f546494c Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 21 Nov 2024 14:23:13 -0800 Subject: [PATCH 26/31] Addressed some of David's findings, and added a CD workflow. --- .github/workflows/cd.yml | 28 ---------------------------- .github/workflows/ci.yml | 38 -------------------------------------- ruby/cfe_gotoh.gemspec | 5 +++++ ruby/test/test_helper.rb | 3 --- 4 files changed, 5 insertions(+), 69 deletions(-) delete mode 100644 .github/workflows/cd.yml delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml deleted file mode 100644 index df3a9c2..0000000 --- a/.github/workflows/cd.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Build the Ruby package - -on: - release: - types: [published] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout code from repo - uses: actions/checkout@v3 - - - name: Log into GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build devcontainer and build the package - uses: devcontainers/ci@v0.3 - with: - imageName: ghcr.io/cfe-lab/gotoh_devcontainer - cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer - runCmd: cd /workspaces/gotoh/ruby && bash build_gem.bash - env: | - CFE_GOTOH_VERSION=${{ github.ref_name }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 16155fe..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Automated Tests - -on: ["push"] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - name: Checkout code from repo - uses: actions/checkout@v3 - - - name: Log into GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build devcontainer and run tests - uses: devcontainers/ci@v0.3 - with: - imageName: ghcr.io/cfe-lab/gotoh_devcontainer - cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer - runCmd: cd /workspaces/gotoh/ruby && rake test - env: | - CI=1 - CI_COMMIT_REF=${{ github.ref_name }} - CI_COMMIT_SHA=${{ github.sha }} - CI_PIPELINE_ID=${{ github.run_id }} - CI_COMMIT_BRANCH=${{ github.ref_name }} - CI_PROJECT=${{ github.repository }} - - - name: Upload coverage reports to Codecov with GitHub Action - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - root_dir: ${{ github.workspace }} - files: ${{ github.workspace }}/ruby/coverage/coverage.xml diff --git a/ruby/cfe_gotoh.gemspec b/ruby/cfe_gotoh.gemspec index 0ad5fa3..918753e 100644 --- a/ruby/cfe_gotoh.gemspec +++ b/ruby/cfe_gotoh.gemspec @@ -15,4 +15,9 @@ Gem::Specification.new do |s| "David Rickett", "Richard Liang" ] + # Associate this gem with a GitHub repo; this allows the gems to be + # automatically associated with the repo when pushed to GitHub Packages. + s.metadata = { + "github_repo" => "ssh://github.com/cfe-lab/gotoh" + } end diff --git a/ruby/test/test_helper.rb b/ruby/test/test_helper.rb index a4b9a8b..7e79af4 100644 --- a/ruby/test/test_helper.rb +++ b/ruby/test/test_helper.rb @@ -12,9 +12,6 @@ require 'minitest/autorun' require 'minitest/reporters' -# if ENV['CI_COMMIT_REF'] -# run_id = ENV['CI_COMMIT_REF'] -# elsif ENV["CI"] if ENV['CI_COMMIT_REF'] run_id = "#{ENV['CI_COMMIT_REF']} #{ENV['CI_PIPELINE_ID']}-#{ENV['CI_COMMIT_SHA']}" else From b2ada0f775183091ecc7421193dd44f285e3bcab Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 21 Nov 2024 14:29:38 -0800 Subject: [PATCH 27/31] Forgot to re-add the files that were renamed. --- .github/workflows/build_and_publish_gem.yml | 39 +++++++++++++++++++++ .github/workflows/unit_tests.yml | 37 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 .github/workflows/build_and_publish_gem.yml create mode 100644 .github/workflows/unit_tests.yml diff --git a/.github/workflows/build_and_publish_gem.yml b/.github/workflows/build_and_publish_gem.yml new file mode 100644 index 0000000..7e61d9d --- /dev/null +++ b/.github/workflows/build_and_publish_gem.yml @@ -0,0 +1,39 @@ +name: Build and publish the Ruby package + +on: + release: + types: [published] + +jobs: + build_gem: + runs-on: ubuntu-latest + steps: + - name: Checkout code from repo + uses: actions/checkout@v3 + + - name: Log into GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build devcontainer and build the package + uses: devcontainers/ci@v0.3 + with: + imageName: ghcr.io/cfe-lab/gotoh_devcontainer + cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer + runCmd: cd /workspaces/gotoh/ruby && bash build_gem.bash + env: | + CFE_GOTOH_VERSION=${{ github.ref_name }} + + - name: Publish gem to Github Packages + run: | + mkdir -p $HOME/.gem + touch $HOME/.gem/credentials + chmod 0600 $HOME/.gem/credentials + printf -- "---\n:github: Bearer ${GEM_HOST_API_KEY}\n" > $HOME/.gem/credentials + gem push --KEY github --host https://rubygems.pkg.github.com/${OWNER} *.gem + env: + GEM_HOST_API_KEY: ${{ secrets.GITHUB_TOKEN }} + OWNER: ${{ github.repository_owner }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml new file mode 100644 index 0000000..56c955a --- /dev/null +++ b/.github/workflows/unit_tests.yml @@ -0,0 +1,37 @@ +name: Automated Tests + +on: ["push"] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout code from repo + uses: actions/checkout@v3 + + - name: Log into GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build devcontainer and run tests + uses: devcontainers/ci@v0.3 + with: + imageName: ghcr.io/cfe-lab/gotoh_devcontainer + cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer + runCmd: cd /workspaces/gotoh/ruby && rake test + env: | + CI_COMMIT_REF=${{ github.ref_name }} + CI_COMMIT_SHA=${{ github.sha }} + CI_PIPELINE_ID=${{ github.run_id }} + CI_COMMIT_BRANCH=${{ github.ref_name }} + CI_PROJECT=${{ github.repository }} + + - name: Upload coverage reports to Codecov with GitHub Action + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + root_dir: ${{ github.workspace }} + files: ${{ github.workspace }}/ruby/coverage/coverage.xml From 83e9b8e14502d76dca0f2347157cdecce1c235c5 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 21 Nov 2024 14:35:00 -0800 Subject: [PATCH 28/31] Updated the actions to use newer versions after a warning on Github about using a deprecated Node.js. --- .github/workflows/build_and_publish_gem.yml | 4 ++-- .github/workflows/unit_tests.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_publish_gem.yml b/.github/workflows/build_and_publish_gem.yml index 7e61d9d..5309177 100644 --- a/.github/workflows/build_and_publish_gem.yml +++ b/.github/workflows/build_and_publish_gem.yml @@ -9,10 +9,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code from repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Log into GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 56c955a..0a4bb00 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,10 +7,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code from repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Log into GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} From 3d4600c19b87ed45427ece70ab879eb6d5e10abf Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 21 Nov 2024 14:52:02 -0800 Subject: [PATCH 29/31] Fixed a badly-named test, and added a test to check another edge case. --- ruby/test/cfe_gotoh_test.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ruby/test/cfe_gotoh_test.rb b/ruby/test/cfe_gotoh_test.rb index 91dc5d1..ee93b5c 100644 --- a/ruby/test/cfe_gotoh_test.rb +++ b/ruby/test/cfe_gotoh_test.rb @@ -801,6 +801,12 @@ class SpliceGapsIntoSequenceTest < CfeGotohTest gaps: [[6, 7, 8]], expected: 'ACTAAG---' }, + { + name: 'insert_past_end', + seq: 'ACTAAG', + gaps: [[7, 8, 9, 10]], + expected: 'ACTAAG----' + }, { name: 'insert_in_middle_retained', seq: 'ACT---AAG', @@ -847,7 +853,7 @@ def test_bad_inserted_bases_error end end - def test_bad_inserted_bases_error + def test_bad_deleted_bases_error std = 'ACGTACGTAACGT' query = 'ACGTACGT-ACGT' assert_raises RuntimeError do From e75a1d8f796541e8029a1dffdc0fef899a4ce7e0 Mon Sep 17 00:00:00 2001 From: Richard Liang Date: Thu, 21 Nov 2024 15:49:52 -0800 Subject: [PATCH 30/31] Added a step to add the gem as an asset to the release; also streamlined the environment variables. --- .github/workflows/build_and_publish_gem.yml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_publish_gem.yml b/.github/workflows/build_and_publish_gem.yml index 5309177..2d4c1f1 100644 --- a/.github/workflows/build_and_publish_gem.yml +++ b/.github/workflows/build_and_publish_gem.yml @@ -7,6 +7,12 @@ on: jobs: build_gem: runs-on: ubuntu-latest + + env: + CFE_GOTOH_VERSION: ${{ github.ref_name }} + BUILD_PATH: ${{ github.workspace }}/ruby + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: - name: Checkout code from repo uses: actions/checkout@v4 @@ -25,15 +31,19 @@ jobs: cacheFrom: ghcr.io/cfe-lab/gotoh_devcontainer runCmd: cd /workspaces/gotoh/ruby && bash build_gem.bash env: | - CFE_GOTOH_VERSION=${{ github.ref_name }} + CFE_GOTOH_VERSION - name: Publish gem to Github Packages run: | mkdir -p $HOME/.gem touch $HOME/.gem/credentials chmod 0600 $HOME/.gem/credentials - printf -- "---\n:github: Bearer ${GEM_HOST_API_KEY}\n" > $HOME/.gem/credentials - gem push --KEY github --host https://rubygems.pkg.github.com/${OWNER} *.gem + printf -- "---\n:github: Bearer ${GH_TOKEN}\n" > $HOME/.gem/credentials + gem push --KEY github --host https://rubygems.pkg.github.com/${OWNER} ${BUILD_PATH}/*.gem env: - GEM_HOST_API_KEY: ${{ secrets.GITHUB_TOKEN }} OWNER: ${{ github.repository_owner }} + + - name: Add gem as a release asset + run: gh release upload $CFE_GOTOH_VERSION ${BUILD_PATH}/*.gem + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 577f3a267641b0c07caef2406c4df2b486387f24 Mon Sep 17 00:00:00 2001 From: rhliang Date: Fri, 22 Nov 2024 11:01:53 -0800 Subject: [PATCH 31/31] Some updates to the README. --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 45fbd6c..eba85c5 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Gotoh, Osamu. "Optimal alignment between groups of sequences and its application CABIOS 9.3 (1993): 361-370. ``` -# Launching the Docker container +# Launching the webserver Docker container To build the Docker container, use the included Makefile in the root directory. This is currently tested with `make build-base`. This builds the Docker image. @@ -69,14 +69,17 @@ for arbitrary historical reasons), you'll need to have the following installed: [RVM](https://rvm.io/) on Linux) - A C++ compiler -The "canonical" environment for building this package is the CfE-internal -`cfe_ubuntu` Ruby image based on Ubuntu 20.04 (this old version is required -to run Ruby 2.2.2). - -In your build environment, you should be able to build the extension module -using the `build_gem.bash` script. By default this will make a gem with the -version number `0.1.0.pre`; set the environment variable `GOTOH_VERSION` before -building to assign a proper version number. +The "canonical" environment for testing and building this package is the +dev container defined in the `.devcontainer` directory. This system is based +on the `cfe_ubuntu` Ruby image based on Ubuntu 20.04 (this old version is +required to run Ruby 2.2.2), and stripped down to only the parts necessary +for Ruby. This environment is also used by the CI/CD pipeline for testing +and building. + +In this environment (or another suitable environment), you should be able to +build the extension module using the `build_gem.bash` script. By default this +will make a gem with the version number `0.1.0.pre`; set the environment variable +`CFE_GOTOH_VERSION` before building to assign a proper version number. ## Manually building the Ruby bindings