From d221c227aecdc09e134c045d6527d372ccca96c9 Mon Sep 17 00:00:00 2001 From: Joshua Orvis Date: Wed, 6 Sep 2017 10:10:33 -0500 Subject: [PATCH] Old perl modules now on CPAN as 'TIGR' package --- lib/HmmTools.pm | 841 ----------------- lib/TIGR/FASTAgrammar.pm | 361 -------- lib/TIGR/FASTAiterator.pm | 750 --------------- lib/TIGR/FASTAreader.pm | 930 ------------------- lib/TIGR/FASTArecord.pm | 479 ---------- lib/TIGR/FASTAwriter.pm | 408 --------- lib/TIGR/Foundation.pm | 1822 ------------------------------------- 7 files changed, 5591 deletions(-) delete mode 100755 lib/HmmTools.pm delete mode 100755 lib/TIGR/FASTAgrammar.pm delete mode 100755 lib/TIGR/FASTAiterator.pm delete mode 100755 lib/TIGR/FASTAreader.pm delete mode 100755 lib/TIGR/FASTArecord.pm delete mode 100755 lib/TIGR/FASTAwriter.pm delete mode 100755 lib/TIGR/Foundation.pm diff --git a/lib/HmmTools.pm b/lib/HmmTools.pm deleted file mode 100755 index ac82ddd6..00000000 --- a/lib/HmmTools.pm +++ /dev/null @@ -1,841 +0,0 @@ -package HmmTools; -require Exporter; -use strict; -use warnings; -use Carp; -use Data::Dumper; -our @ISA = qw (Exporter); -our @EXPORT = - qw(read_hmmer3_output read_hmmer2_output print_htab hmm_database_info get_cutoffs_for_hmm_accession build_alignment read_hmmer3_output2); -our @EXPORT_OK = qw (); - -sub read_hmmer3_output { - my ($path) = @_; - my $retval = {}; - - my $in_result = 0; - my $in_hit_scores = 0; - my $in_domain_scores = 0; - - if ( $path ne '' ) { - chomp $path; - my @statd = stat $path; - $retval->{'info'}->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 ); - } else { - $retval->{'info'}->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-" - . ( ( localtime )[ 4 ] + 1 ) . "-" - . ( ( localtime )[ 5 ] + 1900 ); - } - - open(my $fh, "< $path") or die("Unable to open $path: $!"); - while( my $line = <$fh> ) { - next if( $line =~ /^\s*$/ || $line =~ /inclusion threshold/ ); - - # The program line - if ( $line =~ /^\#\s*((hmmscan)\.*)/ ) { - $retval->{'info'}->{'program'} = $1; - - # The version - } elsif( $line =~ /^\#\s*HMMER ([\d\.]+)\s+\(([^\)]+)\)/ ) { - ( $retval->{'info'}->{'version'}, $retval->{'info'}->{'release'} ) = ( $1, $2 ); - - # The hmm database searched - } elsif( $line =~ /^\#\s+target HMM database:\s+(\S+)/ ) { - $retval->{'info'}->{'hmm_file'} = $1; - - # The query file - } elsif( $line =~ /^\#\squery sequence (file|database):\s+(.+)/ ) { - $retval->{'info'}->{'sequence_file'} = $2; - - # This indicates we're parsing a hit. - } elsif( $line =~ /^Query\:\s+(\S+)/ ) { - my $data = &_parse_hmmpfam3_hit( $fh ); - $retval->{'queries'}->{$1} = $data; - - } - } - - close($fh); - return $retval; -} - -sub _parse_hmmpfam3_hit { - my ($fh) = @_; - my $data = {}; - - my $in_hit_scores = 0; - my $in_domain_scores = 0; - my $hit_acc; - - while( my $line = <$fh> ) { - chomp($line); - last if( $line =~ m|//| ); - next if( $line =~ /^\s*$/ ); - - if( $line =~ /^Domain annotation/ ) { - $in_hit_scores = 0; - - } elsif( $in_hit_scores && $line !~ /^\s+--/ && $line !~ /inclusion_threshold/ ) { - if( $line =~ /No hits detected/ ) { - $data->{'hits'} = {}; - last; - } - - my @c = split(/\s+/, $line); - my $t_hit_acc = $c[9]; - $data->{'hits'}->{$t_hit_acc}->{total_evalue} = $c[1]; - $data->{'hits'}->{$t_hit_acc}->{total_score} = $c[2]; - $data->{'hits'}->{$t_hit_acc}->{accession} = $c[9]; - $data->{'hits'}->{$t_hit_acc}->{hit_description} = join(" ",@c[10..(@c-1)]); - $data->{'hits'}->{$t_hit_acc}->{domain_count} = $c[8]; - $data->{'hits'}->{$t_hit_acc}->{frame} = ""; - } elsif( $line =~ /^>>\s*(\S+)/ ) { - $in_domain_scores = 1; - $hit_acc = $1; - - } elsif( $in_domain_scores && $line =~ /Alignments for each/ ) { - undef $hit_acc; - $in_domain_scores = 0; - - } elsif( $in_domain_scores && $line !~ /^\s*[>\#-]/ ) { - die("Didn't parse hit accession from header line before getting to domain table") - unless( $hit_acc ); - die("The hit accession [$hit_acc] didn't exist in lookup") unless( exists( $data->{'hits'}->{$hit_acc} ) ); - - if( $line =~ /\[No individual domains that/ ) { - $data->{'hits'}->{$hit_acc}->{'domains'} = {}; - $in_domain_scores = 0; - next; - } - - my @c = split(/\s+/, $line); - - if( $c[1] eq 'targets' || $c[1] eq 'reported' || $c[1] eq 'Fwd') { - print Dumper( $data->{'hits'}->{$hit_acc} ); - print "LINE: $line\n"; - print Dumper( @c ); - die("Issue parsing"); - } - - my $a = $c[1]; - my $b = $a + 0; - if( $b ne $a ) { - print Dumper( $data->{'hits'}->{$hit_acc} ); - print "LINE: $line\n"; - print Dumper( \@c ); - die("c[1] not numeric"); - } - - - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_f'} = $c[10]; - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_t'} = $c[11]; - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_f'} = $c[7]; - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_t'} = $c[8]; - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_score'} = $c[3]; - $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_evalue'} = $c[6]; - } elsif( $line =~ /^\s+E-value\s+score/ ) { - $in_hit_scores = 1; - } - } - - return $data; -} - -## Subroutine to parse hmmscan (Hmmer3.0) output -sub read_hmmer3_output_old { - my $path = shift; - my $data = {}; - my @lines; - if ( $path ne '' ) { - chomp $path; - my @statd = stat $path; - $data->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 ); - open( FH, "$path" ) || die "Can't open $path for reading: $!\n"; - chomp( @lines = ); - close FH; - } else { - chomp( @lines = ); - $data->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-" - . ( ( localtime )[ 4 ] + 1 ) . "-" - . ( ( localtime )[ 5 ] + 1900 ); - } - if ( !@lines ) { - carp "No data read from input $path"; - return undef; - } - my $i = 0; - while ($i < @lines) { - if ( $lines[ $i ] =~ /^#\s*((hmmscan)\.*)/ ) { - $data->{ 'program' } = $1; - my $version_line = $lines[ ++$i ]; - $version_line =~ /^#\s*HMMER (\d+)\.(\S+)/; - ( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 ); - $i +=2; - last; - } - ++$i; - } - until ( $lines[ $i ] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^#\s*target HMM database:\s+(\S+)/ ) { - $data->{ 'hmm_file' } = $1; - } elsif ( $lines[ $i ] =~ /^#\s*query sequence (file|database):\s+(.+)/ ) { - $data->{ 'sequence_file' } = $2; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - if ( $lines[ $i ] =~ /^Query:\s+(\S+)/ ) { - $data->{ 'query' } = $1; - print "Found query: $1\n"; - $i++; - } - until ( $lines[$i] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^Scores for/ ) { - $i++; # this skips the separator row - my $headers = $lines[ ++$i ]; - $i++; # this skips the separator row - } elsif ( $lines[ $i ] !~ /No hits detected that satisfy reporting thresholds/i ) { - $i++ if ($lines[$i] =~ /inclusion threshold/g); - my @c = split /\s+/, $lines[ $i ],11; - my $hit_index = $c[9]; - $data->{hit}{$hit_index}{total_evalue} = $c[1]; - $data->{hit}{$hit_index}{total_score} = $c[2]; - $data->{hit}{$hit_index}{accession} = $c[9]; - $data->{hit}{$hit_index}{hit_description} = $c[10]; - $data->{hit}{$hit_index}{domain_count} = $c[8]; - $data->{hit}{$hit_index}{frame} = ""; - } else { - return $data; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - if ( $lines[ $i ] =~ /^Domain annotation for each model/ ) { - $i++; - } - until ($lines[$i] =~ /^\/\/$/) { - if ( $lines[ $i ] !~ /No targets detected that satisfy reporting thresholds/ ) { - if($lines[$i] =~ />>/) { - my @c = split /\s+/, $lines[ $i ]; - my $hit_index = $c[ 1 ]; - $i += 3; - - if ( !defined $data->{ 'hit' }->{ $hit_index } ) { - warn "Why doesn't '$hit_index' match an existing identifier?"; - } else { - until ($lines[$i] =~ /^\s*$/) { - my @res = split /\s+/, $lines[ $i ]; - $data->{hit}{$hit_index}{domain}{$res[1]}{seq_f} = $res[10]; - $data->{hit}{$hit_index}{domain}{$res[1]}{seq_t} = $res[11]; - $data->{hit}{$hit_index}{domain}{$res[1]}{hmm_f} = $res[7]; - $data->{hit}{$hit_index}{domain}{$res[1]}{hmm_t} = $res[8]; - $data->{hit}{$hit_index}{domain}{$res[1]}{domain_score} = $res[3]; - $data->{hit}{$hit_index}{domain}{$res[1]}{domain_evalue} = $res[6]; - $i++; - } - } - } - } else { - return $data; - } - $i++; - } - return $data; -} -## Use with caution. Or just don't use this method. -sub read_hmmer2_output { - my $path = shift; - my $data = {}; -# die "Bad Parser under reconstruction.\n\n"; -# drink in data - my @lines; - -# drink in the output from file or stdin - if ( $path ne '' ) { - chomp $path; - my @statd = stat $path; - $data->{ 'search_date' } = - ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-" - . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 ); - open( FH, "$path" ) - || die "Can't open $path for reading: $!\n"; - chomp( @lines = ); - close FH; - } - else { - chomp( @lines = ); - $data->{ 'search_date' } = - ( ( localtime )[ 3 ] ) . "-" - . ( ( localtime )[ 4 ] + 1 ) . "-" - . ( ( localtime )[ 5 ] + 1900 ); - } - if ( !@lines ) { - carp "No data read from input $path"; - return undef; - } - my $i = 0; - -# first line grouping is company, package and license info -# warn "Parsing License. Line $i\n"; -# amahurkar:1/15/08 Seems like the current output does not have licesning info, so commenting this -# so commenting out these lines -#until ( $lines[ $i ] eq "" ) -# $data->{ 'header' } .= $lines[ $i ] . "\n"; -# $i++; - -#$i++; - -# next group is program and version -# warn "Parsing Program and version. Line $i\n"; -# amahurkar:1/15/08 the format has changed and now there is no blank space -# after program name, so we are using '- -' as tha match param -# to stop parsing for program name -#until ( $lines[ $i ] eq "" ) -#until ( $lines[ $i ] =~ m/^-\s-/) # It doesn't work with LDhmmpfam v1.5.4 -#until ( $lines[ $i ] =~ m/^-{8}\s+-/) - while ($i < @lines) { - if ( $lines[ $i ] =~ /^((hmmpfam|hmmsearch)\.*)/ ) { - $data->{ 'program' } = $1; - my $version_line = $lines[ ++$i ]; - $version_line =~ /^HMMER (\d+)\.(\S+)/; - ( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 ); - $i +=2; - last; - } - ++$i; - } - -# next group is program parameters -# warn "Parsing Parameters. Line $i\n"; -# amahurkar:1/15/08 the format has changed and now there is no blank space -# after program name, so we are using '- -' as tha match param -# to stop parsing for program parameters - until ( $lines[ $i ] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^HMM file:\s+(\S+)/ ) { - $data->{ 'hmm_file' } = $1; - } elsif ( $lines[ $i ] =~ /^Sequence (file|database):\s+(.+)/ ) { - $data->{ 'sequence_file' } = $2; - } elsif ( $lines[ $i ] =~ /^per-sequence score cutoff:\s+(.+)/ ) { - $data->{ 'total_score_cutoff' } = $1; - } elsif ( $lines[ $i ] =~ /^per-domain score cutoff:\s+(.+)/ ) { - $data->{ 'domain_score_cutoff' } = $1; - } elsif ( $lines[ $i ] =~ /^per-sequence E-value cutoff:\s+(.+)/ ) { - $data->{ 'total_evalue_cutoff' } = $1; - } elsif ( $lines[ $i ] =~ /^per-domain E-value cutoff:\s+(.+)/ ) { - $data->{ 'domain_evalue_cutoff' } = $1; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - -# get query info -# warn "Parsing Query Info. Line $i\n"; -#until ( $lines[ $i ] eq "" ) - until ( $lines[$i] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^Query (?:HMM|sequence):\s+(.+)/ ) { - $data->{ 'query' } = $1; - } - elsif ( $lines[ $i ] =~ /^Accession:\s+(.+)/ ) { - $data->{ 'query_accession' } = $1; - } - elsif ( $lines[ $i ] =~ /^Description:\s+(.+)/ ) { - $data->{ 'query_description' } = $1; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - -# next section is global search results -# warn "Parsing Global Search Results. Line $i\n"; - my $find_frame = 0; # is datbase nucleotide sequence? - my $hit_index; - until ( $lines[$i] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^Scores for/ ) { - my $headers = $lines[ ++$i ]; - if ( $headers =~ /\bFr\b/ ) { - $data->{ 'program' } .= "-frames"; - $find_frame = 1; - } - $i++; # this skips the separator row - } elsif ( $lines[ $i ] !~ /no hits above thresholds/i ) { - my @c = split /\s+/, $lines[ $i ]; - if ( $find_frame ) { - $hit_index = $c[ 0 ] . $c[ $#c ]; - $data->{ 'hit' }->{ $hit_index }->{ 'frame' } = pop @c; - } - else { -# $hit_index = $c[ 0 ]; # AP 20090807 - ($hit_index = $c[0]) =~ s/\.\d+$//; - } - $data->{hit}{$hit_index}{accession} = shift @c; - $data->{hit}{$hit_index}{domain_count} = pop @c; - $data->{hit}{$hit_index}{total_evalue} = pop @c; - $data->{hit}{$hit_index}{total_score} = pop @c; - $data->{hit}{$hit_index}{hit_description} = join " ", @c; - } - else { - return $data; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - -# next section is domain breakdown -# warn "Parsing Domain Breakdown. Line $i\n"; -#until ( $lines[ $i ] eq "" ) - until ($lines[$i] =~ /^\s*$/) { - if ( $lines[ $i ] =~ /^Parsed for domains/ ) { - $i += 2; # to skip header and separator - } elsif ( $lines[ $i ] !~ /no hits above thresholds/ ) { - -# $lines[$i] =~ /^(\w+)\s+(\d+)\/\d+\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(-?[\.\d]+)\s+([\.\-e\d]+)\s*(\-?\d)?/; - my @c = split /\s+/, $lines[ $i ]; - if ( $find_frame ) { - $hit_index = $c[ 0 ] . $c[ $#c ]; - } - else { -#$hit_index = $c[ 0 ]; # AP 20090807 - ($hit_index = $c[0]) =~ s/\.\d+$//; - } - if ( !defined $data->{ 'hit' }->{ $hit_index } ) { - warn - "Why doesn't '$hit_index' match an existing identifier?"; - } - -# $data->{'hit'}->{$hit_index}->{'domain'}->{$2}->{'seq_f'} = $3; - my ( $d, $t ) = split /\//, $c[ 1 ]; - $data->{hit}{$hit_index}{domain}{$d}{seq_f} = $c[2]; - $data->{hit}{$hit_index}{domain}{$d}{seq_t} = $c[3]; - $data->{hit}{$hit_index}{domain}{$d}{hmm_f} = $c[5]; - $data->{hit}{$hit_index}{domain}{$d}{hmm_t} = $c[6]; - $data->{hit}{$hit_index}{domain}{$d}{domain_score} = $c[8]; - $data->{hit}{$hit_index}{domain}{$d}{domain_evalue} = $c[9]; - } - $i++; - die "Failure to parse" if $i > @lines; - } - $i++; - if ( $data->{ 'program' } =~ /hmmsearch/ ) { - -# next section is alignments -# warn "Parsing Alignments. Line $i\n"; - if ( $lines[ $i ] =~ /^Alignments of top-scoring domains/ ) { - $i++; - ## kgalens [11/13/2012] - ## Not sure how this ever worked? $domain isn't in scope here. - ## I've added use strict; and this module wouldn't. So adding the next line - ## just so it will run. - my $domain; - my $hit; - my $hit_index; - ## END kgalens [11/13/2012] - - until ( $lines[$i] =~ /^\s*$/ ) { - if ( $lines[ $i ] =~ /^(\S+): domain (\d+)/ ) { - ($hit_index, $hit, $domain ) = ( $1, $1, $2 ); - $hit =~ s/(.{10}).*/$1/; -# warn " Parsing hit $hit_index. Line $i\n"; - if ( $find_frame ) { - if ( $lines[ $i ] =~ /Fr = ([\-\d]+)/ || $lines[ $i ] =~ /\. frame ([\-\d]+)/ ) { - $hit_index .= $1; - } else { - warn "ERROR: Couldn't find frame from:\n '$lines[$i]'"; - } - } - if ( !defined $data->{ 'hit' }->{ $hit_index } ) { - warn "Why doesn't '$hit_index' match an existing identifier?"; - } - $i++; - } - if ( $lines[ $i ] =~ /\bRF\b/ ) { ## WHY?? - $i++; - } - -# capture aligned hmm consensus - my $hmm_seq = $lines[ $i ]; - $hmm_seq =~ s/\s+//g; - $hmm_seq =~ s/[\*\-\>\<]//g; - - $data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{hmm_seq} .= $hmm_seq; - until ( $lines[ $i ] =~ /^\s+\Q$hit\E/) { ## changed from /\b\Q$hit\E\b/ ) - $i++; - } - my $prot_seq = $lines[ $i ]; - if ( $prot_seq =~ /\w+\s+(\d+|\-)\s+(\S+)\s+(\d+|\-)/ ) { - $data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{prot_seq} .= $2; - } - $i += 2; # skip the blank line and move on to the next - die "Failure to parse" if $i > @lines; - } - } - $i++; - -# next (last) section is statistics -# warn "Parsing Statistics. Line $i\n"; - my @data; - while ( $i < @data ) { - if ( $lines[ $i ] =~ /^\s+mu =\s+(-?\d+)/ ) { - $data->{ 'mu' } = $1; - } elsif ( $lines[ $i ] =~ /^\s+lambda =\s(-?\d+)/ ) { - $data->{ 'lambda' } = $1; - } elsif ( $lines[ $i ] =~ /chi-sq statistic =\s(\d+)/ ) { - $data->{ 'chisq' } = $1; - } elsif ( $lines[ $i ] =~ /Total sequences searched:\s*(\d+)/ ) { - $data->{ 'tot_seq_searched' } = $1; - } elsif ( $lines[ $i ] =~ /Whole sequence top hits/ ) { - $lines[ ++$i ] =~ /(\d+)/; - $data->{ 'total_hits' } = $1; - $lines[ ++$i ] =~ /(\d+)/; - $data->{ 'total_hits_above_evalue_cutoff' } = $1; - } elsif ( $lines[ $i ] =~ /Domain top hits/ ) { - $lines[ ++$i ] =~ /(\d+)/; - $data->{ 'domain_hits' } = $1; - $lines[ ++$i ] =~ /(\d+)/; - $data->{ 'domain_hits_above_evalue_cutoff' } = $1; - } - $i++; - die "Failure to parse" if $i > @lines; - } - } - return $data; -} - -sub hmm_database_info { - my $dbh = shift; - my $hmm_q = - "SELECT hmm_acc, hmm_len, trusted_cutoff, noise_cutoff, hmm_com_name," - . " trusted_cutoff2, noise_cutoff2, gathering_cutoff, gathering_cutoff2" - . " FROM hmm2" - . " WHERE is_current = 1"; - my $HMM = $dbh->selectall_hashref( $hmm_q, 'hmm_acc' ); - return $HMM; -} - -sub print_htab { -#NOTE: This will produce results if the data hash was created through the 'read_hmmer3_output' subroutine -# This is because of a change in the naming of some of the property keys (ex 'hit' to 'hits') - my $data = shift; - my $HMM = shift; - my $output = shift; - foreach my $qry_id ( keys %{$data->{'queries'}} ) { - foreach my $hit ( - sort { - $data->{'queries'}->{$qry_id}->{'hits'}->{ $b }->{ 'total_score' } <=> $data->{'queries'}->{$qry_id}-> - {'hits'}->{ $a }->{ 'total_score' } - } keys %{ $data->{'queries'}->{$qry_id}->{'hits'} } ) - { - my $h = $data->{'queries'}->{$qry_id}->{'hits'}->{ $hit }; - next if (scalar keys %{$h->{'domains'}} == 0); #skip model hits that have no domain hits - foreach my $domain ( sort { $a <=> $b } keys %{ $h->{ 'domains' } } ) - { - # for convenience - my $dh = $h->{ 'domains' }->{ $domain }; - if ( $data->{'info'}->{ 'program' } =~ /hmmsearch/ ) { #hmmer2 is currently deprecated so this will probably error - my $hmm_com_name = - $HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' } - ? $HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' } - : $data->{ 'query_description' }; - print $output "$data->{query}" - . "\t$data->{search_date}" - . "\t$HMM->{$data->{query}}->{hmm_len}" - . "\t$data->{program}" - . "\t$data->{sequence_file}" - . "\t$h->{accession}" - . "\t$dh->{hmm_f}" - . "\t$dh->{hmm_t}" - . "\t$dh->{seq_f}" - . "\t$dh->{seq_t}" - . "\t$h->{frame}" - . "\t$dh->{domain_score}" - . "\t$h->{total_score}" - . "\t$domain" - . "\t$h->{domain_count}" - . "\t$hmm_com_name" - . "\t$h->{hit_description}" - . "\t$HMM->{$data->{query}}->{trusted_cutoff}" - . "\t$HMM->{$data->{query}}->{noise_cutoff}" - . "\t$h->{total_evalue}" - . "\t$dh->{domain_evalue}" - . "\t$HMM->{$data->{query}}->{trusted_cutoff2}" - . "\t$HMM->{$data->{query}}->{noise_cutoff2}" - . "\t$HMM->{$data->{query}}->{gathering_cutoff}" - . "\t$HMM->{$data->{query}}->{gathering_cutoff2}" . "\n"; - } - elsif ( $data->{'info'}->{ 'program' } =~ /hmmscan|hmmpfam/ ) { - my $hmm_com_name = - $HMM->{ $hit }->{ 'hmm_com_name' } - ? $HMM->{ $hit }->{ 'hmm_com_name' } - : $h->{ 'hit_description' }; - print $output "$h->{accession}" - . "\t$data->{'info'}->{search_date}" - . "\t$HMM->{$hit}->{hmm_len}" - . "\t$data->{'info'}->{program}" - . "\t$data->{'info'}->{hmm_file}" - . "\t$qry_id" - . "\t$dh->{hmm_f}" - . "\t$dh->{hmm_t}" - . "\t$dh->{seq_f}" - . "\t$dh->{seq_t}" - . "\t$h->{frame}" - . "\t$dh->{domain_score}" - . "\t$h->{total_score}" - . "\t$domain" - . "\t$h->{domain_count}" - . "\t$hmm_com_name" - . "\t$h->{hit_description}" - . "\t$HMM->{$h->{accession}}->{trusted_cutoff}" - . "\t$HMM->{$h->{accession}}->{noise_cutoff}" - . "\t$h->{total_evalue}" - . "\t$dh->{domain_evalue}" - . "\t$HMM->{$h->{accession}}->{trusted_cutoff2}" - . "\t$HMM->{$h->{accession}}->{noise_cutoff2}" - . "\t$HMM->{$h->{accession}}->{gathering_cutoff}" - . "\t$HMM->{$h->{accession}}->{gathering_cutoff2}" . "\n"; - } - } - } - } -} - -sub build_alignment { - my $data = shift; - my $instructions = shift; - - # build output file name - my $output_file; - $output_file = - $instructions->{file_prefix} . "." . $instructions->{file_format}; - open my $OUT, ">$output_file" - or croak "Can't open '$output_file' as output file: $!\n"; - select $OUT; - - # retrieve aligned sequences - my %screened; - foreach my $hit ( keys %{ $data->{ 'hit' } } ) { - - # screen for total score cutoffs - if ( $data->{hit}->{ $hit }->{total_score} >= - $instructions->{total_bit_cutoff} - && $data->{hit}->{ $hit }->{total_evalue} <= - $instructions->{total_evalue_cutoff} ) - { - foreach my $domain ( keys %{ $data->{hit}->{ $hit }->{domain} } ) - { - if ( $data->{hit}->{ $hit }->{domain}->{ $domain } - ->{domain_score} >= $instructions->{domain_bit_cutoff} - && $data->{hit}->{ $hit }->{domain}->{ $domain } - ->{domain_evalue} <= - $instructions->{domain_evalue_cutoff} ) - { - $screened{ $hit } = $domain; - } - } - } - } - - # Now that we have sequences aligned to hmm sequence, we have to translate - # this into a multiple alignment. Assign each position in each alignment to - # a position on the hmm 'sequence', and keep track of gaps in the hmm alignment - my %DIST; - my $ref; - foreach my $hit ( keys %screened ) { - $ref = - $data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } }; - - # split aligned hmm seq and aligned protein seq into arrays - my @hmma = split / */, $ref->{hmm_seq}; - my @prota = split / */, $ref->{prot_seq}; - - # these should be the same length. If not, there's an error. - if ( @hmma != @prota ) { - croak "Length of hmm alignment (" . @hmma . ")" - . " is not equal to protein alignment (" - . @prota . ")" - . ": $data->{query}/$data->{hit}->{$hit}->{accession}\n" - . "$ref->{hmm_seq}\n@hmma\n$ref->{prot_seq}\n@prota\n"; - } - - # assign each position in the protein alignment to its hmm alignment position, - # if one exists - my $hmm_pos = $ref->{hmm_f}; - my $gap = 0; - for ( my $i = 0 ; $i < @hmma ; $i++ ) { - if ( $hmma[ $i ] ne "." ) { - - # assign position in the protein alignment to its hmm alignment position, - $prota[ $i ] = $hmm_pos; - - # record max gap distance between hmm alignment positions. - $DIST{ $hmm_pos } = $gap - if ( $gap >= $DIST{ $hmm_pos } ); - $gap = 0; - $hmm_pos++; - } - else { - $gap++; - } - } - $ref->{aln_map} = \@prota; - } - - # Now go back through (now that we've fully expanded the hmm alignment - # to include any and all gaps) and make aligned protein sequence - foreach my $hit ( keys %screened ) { - $ref = - $data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } }; - my @prot_seq = split / */, $ref->{prot_seq}; - - # start our aligned protein with any gap resulting from a partial HMM hit. - my $aln_prot = "." x ( $ref->{hmm_f} - 1 ); - my $insert = 0; - for ( my $i = 0 ; $i < @prot_seq ; $i++ ) { - - # grab the hmm alignment position for each protein alignment position - my $pos = $ref->{aln_map}->[ $i ]; - if ( $pos =~ /\d+/ ) { - - # if it maps to a position, first insert any gap from the hmm alignment - $aln_prot .= "." x ( $DIST{ $pos } - $insert ); - - # then add the aa. - $aln_prot .= $prot_seq[ $i ]; - $insert = 0; - } - - # if it is an insertion (ie the hmm alignment shows gap), insert a gap - else { - $aln_prot .= $prot_seq[ $i ]; - $insert++; - } - } - $aln_prot =~ s/\-/\./g; - $ref->{aln_prot} = $aln_prot; - } - - # Now print out in selected format - # Stockholm format - if ( $instructions->{file_format} eq "mul" ) { - print "# STOCKHOLM 1.0\n"; - foreach my $hit ( - sort { - $data->{ 'hit' }->{ $b } - ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a } - ->{ 'total_score' } - } keys %screened - ) - { - my $domain = $screened{ $hit }; - my $hit_ref = $data->{hit}->{ $hit }; - my $domain_ref = $hit_ref->{domain}->{ $domain }; - - # each line should look like: - # prot_acc/coord-coord sequence - printf "%-40s%s\n", - ( - "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}", - $domain_ref->{aln_prot} - ); - } - } - - # FASTA format - elsif ($instructions->{file_format} eq "fasta" - || $instructions->{file_format} eq "fa" ) - { - use TIGR::FASTArecord; - foreach my $hit ( - sort { - $data->{ 'hit' }->{ $b } - ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a } - ->{ 'total_score' } - } keys %screened - ) - { - my $domain = $screened{ $hit }; - my $hit_ref = $data->{hit}->{ $hit }; - my $domain_ref = $hit_ref->{domain}->{ $domain }; - my $aln_prot = $domain_ref->{aln_prot}; - $aln_prot =~ s/(.{60})/$1\n/g; - $aln_prot =~ s/\n+/\n/g; - chomp $aln_prot; - my $header = - ">$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}\n"; - print $header . $aln_prot . "\n"; - } - } - - # MSF format - elsif ( $instructions->{file_format} eq "msf" ) { - my $head_len = 40; - my ( %new_acc, %tmp_seq ); - my $header_line = 0; - my @alignment; - foreach my $hit ( - sort { - $data->{ 'hit' }->{ $b } - ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a } - ->{ 'total_score' } - } keys %screened - ) - { - my $domain = $screened{ $hit }; - my $hit_ref = $data->{hit}->{ $hit }; - my $domain_ref = $hit_ref->{domain}->{ $domain }; - my $len = length( $domain_ref->{aln_prot} ); - if ( $header_line == 0 ) { - - #added DUMMY checksum for HMMER 2.2g hmmbuild. DHH. - print - "PileUp\n\n MSF: $len Type: P Check: 1111 ..\n\n"; - $header_line = 1; - } - - # print accession list at top - printf " Name: %-40s Len: $len Check: 0 Weight: 1.0\n", - "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}"; - - # prepare alignment for bottom - my @tmp_pep = split //, $domain_ref->{aln_prot}; - for ( my $i = 0 ; $i < ( $len / 50 ) ; $i++ ) { - $alignment[ $i ] .= sprintf "%-40s", - "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}"; - for ( my $b = 0 ; $b < 5 ; $b++ ) { - for ( my $a = 0 ; $a < 10 ; $a++ ) { - $alignment[ $i ] .= - $tmp_pep[ $a + ( $b * 10 ) + ( 50 * $i ) ]; - } - $alignment[ $i ] .= " "; - } - $alignment[ $i ] .= "\n"; - } - } - print "\n//\n\n\n"; - foreach my $block ( @alignment ) { - print "$block\n\n"; - } - } - else { - croak - "Don't recognize alignment file format $instructions->{file_format}\n"; - } - select STDOUT; -} - -sub get_cutoffs_for_hmm_accession { - my $dbh = shift; - my $accession = shift; - my $hmm_q = - "select trusted_cutoff, trusted_cutoff2, noise_cutoff from egad..hmm2 where hmm_acc = '$accession'"; - return $dbh->selectrow_hashref( $hmm_q ); -} -1; diff --git a/lib/TIGR/FASTAgrammar.pm b/lib/TIGR/FASTAgrammar.pm deleted file mode 100755 index 884348f2..00000000 --- a/lib/TIGR/FASTAgrammar.pm +++ /dev/null @@ -1,361 +0,0 @@ -# $Id: FASTAgrammar.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - -package TIGR::FASTAgrammar; -{ - -=head1 NAME - -FASTAgrammar - module for validating FASTA format records - -=head1 SYNOPSIS - - use TIGR::FASTAgrammar ':public'; - - $header = FASTA header here... - $data = FASTA data here... - $return_value = isValidFASTARecord($header, $data); - ... - -=head1 DESCRIPTION - -This module provides functions for verifying compliance with TIGR's FASTA -file and record definitions. - -=cut - - BEGIN { - require 5.006_00; - } - - use strict; - require Exporter; - - ## internal variables and identifiers - - our @ISA = qw(Exporter); - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.2'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = (); - - - ## Export methods - - our %EXPORT_TAGS = ( 'public' => [ qw( isValidFASTArecord - isValidFASTAheader - isValidFASTAdata - isValidFASTAlineLength - setValidFASTAlineLength ) ] , - 'private' => [ qw( _headerToIdentifier - _isNucleotideData - _isPeptideData ) ] ); - - our @EXPORT_OK = ( @{ $EXPORT_TAGS{'public'} }, - @{ $EXPORT_TAGS{'private'} } ); - - - ## data structures - - # IUPAC extended codes acceptable for sequence data - # see WU-BLAST format on http://tigrblast.tigr.org/html/fasta.html - our $NA_IUPAC_CODES = 'ATUGCMRWSYKVHDBN\.\-'; - our $AA_IUPAC_CODES = 'ARNDBCQEZGHILKMFPSTUWXYV\*\-'; - - # FASTA parameters - our $FASTA_SEPARATOR = '^>'; - our $UNBOUND_FASTA_SEPARATOR = '>'; - - # note: BLAST can accept 80 bases per line; this just emits a warning - our $OUTPUT_LINE_LENGTH = 60; - my $RECORD_LINE_LENGTH = 0; - - # the TIGR FASTA header parse - our $FASTA_HEADER_PARSE = - '^>([[:graph:]]+)(?: +[ \ca[:graph:]]*$| *$)'; - ## prototypes - - sub isValidFASTArecord(@); - sub isValidFASTAheader($); - sub isValidFASTAdata($); - sub isValidFASTAlineLength($); - sub setValidFASTAlineLength($); - sub _isNucleotideData($); - sub _isPeptideData($); - sub _headerToIdentifier($); - - ## implementation - -=over - -=item $result = isValidFASTArecord(@record_defn); - -This method determines if a FASTA record, C<@record_defn>, fits the TIGR -definition for a FASTA record. C<@record_defn> is an array of lines over -which the record is defined. The first line should be the FASTA header, and -subsequent lines the data definition. This method checks line width, -character composition, and header format. If the record parses correctly, -this method returns 1. Otherwise, this method returns 0. - -=cut - - - sub isValidFASTArecord(@) { - - my $header = shift; - my @data_lines = @_; - my $valid_flag = 0; - my $first_line_flag = 0; - my $first_len = 0; - - # check conformance of header - $valid_flag = isValidFASTAheader($header); - - # check conformance of data - if ( ( $valid_flag != 0 ) && - ( scalar(@data_lines) > 0 ) ) { - my $data_scalar = join "", @data_lines; - $data_scalar =~ s/\n//g; # extract new lines from scalar data - $valid_flag = isValidFASTAdata($data_scalar); - } - - # check conformance of line length - while ( ( $valid_flag != 0 ) && - ( defined ( my $line = shift @data_lines ) ) ) { - chomp $line; - - if($first_line_flag == 0) { - $first_len = setValidFASTAlineLength($line); - $first_line_flag = 1; - } - - if(defined($first_len)) { - my $line_len_flag = isValidFASTAlineLength($line); - if ( ( $line_len_flag > 0 ) || - ( ( $line_len_flag < 0 ) && - ( $#data_lines != -1 ) ) ) { - $valid_flag = 0; - } - } - } - - return $valid_flag; - } - -=item $result = isValidFASTAheader($header); - -This method determines if the FASTA header description, C<$header>, is -a valid description. It checks for a leading carot symbol and trailing non -white space characters. Any number of white space characters -may be interleaved throughout the text portion of the header, with the -exception that there may be no space between the carot and the first word. -If the header is valid, this method returns 1. Otherwise, this method -returns 0. - -=cut - - - sub isValidFASTAheader($) { - my $header = shift; - my $return_val = 0; - my $identifier = undef; - - if ( ( defined ($header) ) && - ( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) ) { - - if((defined $identifier) && ($identifier !~ /\//)) { - $return_val = 1; - } - else { - $return_val = 0; - } - } - else { - $return_val = 0; - } - return $return_val; - } - -=item $result = isValidFASTAdata($data_def); - -This method takes the scalar data definition of a FASTA record, C<$data_def>. -It tests the data and returns 1 if the data conforms to nucleotide data or if -it conforms to peptide data or both. If the data is not recognizable or is -undefined, it returns 0. - -=cut - - - sub isValidFASTAdata($) { - my $data_definition = shift; - my $return_val = undef; - if(($data_definition =~ /^[$NA_IUPAC_CODES]+$/i) || - ($data_definition =~ /^[$AA_IUPAC_CODES]+$/i)) { - $return_val = 1; - } - else { - $return_val = 0; - } - } - -=item $result = isValidFASTAlineLength($line); - -This method returns -1 if the data line, C<$line> is less than -the TIGR definition requirement for line length, 0 if the data -line meets the TIGR definition requirement for line length, and -1 if the data line is greater than the TIGR definition requirement -for line length. - -=cut - - - sub isValidFASTAlineLength($) { - my $line = shift; - my $line_len = undef; - my $return_val = undef; - - if ( defined ($line) ) { - chomp $line; - $line_len = length($line); - if ( $line_len > $RECORD_LINE_LENGTH ) { - $return_val = 1; - } - elsif ( $line_len < $RECORD_LINE_LENGTH ) { - $return_val = -1; - } - else { - $return_val = 0; - } - } - } - -=item $result = setValidFASTAlineLength($); - -This method takes in the first data line in the data portion of a FASTA record. -The function returns the length of this line if it is positive. This length -determines the line length for all the data lines following this first line. -The function returns undefined if unsuccessful. - -=cut - - - sub setValidFASTAlineLength($) { - my $line = shift; - my $line_len = undef; - my $ret_len = undef; - - if(defined ($line)) { - chomp $line; - $line_len = length($line); - - if($line_len > 0) { - $ret_len = $line_len ; - } - } - $RECORD_LINE_LENGTH = $ret_len; - return $ret_len; - } - -# $result = _isNucleotideData($data_def); - -#This method takes the scalar data definition of a FASTA record, C<$data_def>. -#It tests it for conformance to a nucleotide data type. If the data are -#nucleotide IUPAC characters, this method returns 1. If not, this method -#returns 0. This method returns 0 if C<$data_def> is undefined. - - - sub _isNucleotideData($) { - my $data_def = shift; - my $return_val = undef; - - if ( ( defined ( $data_def ) ) && - ( $data_def =~ /^[$NA_IUPAC_CODES]+$/i ) ) { - $return_val = 1; - } - else { - $return_val = 0; - } - - return $return_val; - } - - -# $result = _isPeptideData($data_def); - -#This method takes the scalar data definition of a FASTA record, C<$data_def>. -#It tests it for conformance to a peptide data type. If the data are -#peptide IUPAC characters, this method returns 1. If not, this method returns -#zero. This method returns undefined if C<$data_def> is undefined. - - - sub _isPeptideData($) { - my $data_def = shift; - my $return_val = undef; - - if ( ( defined ( $data_def ) ) && - ( $data_def =~ /^[$AA_IUPAC_CODES]+$/i ) ) { - $return_val = 1; - } - else { - $return_val = 0; - } - - return $return_val; - } - -# $identifier = _headerToIdentifier($header); - -#This function takes a FASTA header as a parameter, returning a parsed -#identifier for the record. If the supplied header is invalid or undefined, -#this method returns undefined. - - - sub _headerToIdentifier($) { - my $header = shift; - my $identifier = undef; - my $return_val = undef; - - if ( ( defined ($header) ) && - ( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) && - ( defined ($identifier) ) ) { - if($identifier =~ /\//) { - $return_val = undef; - } - else { - $return_val = $identifier; - } - } - else { - $return_val = undef; - } - - return $return_val; - } - - -=back - -=head1 USAGE - -This module is not intended for developer use. Instead, use the front -end modules C and C. - -=cut - -} - -1; diff --git a/lib/TIGR/FASTAiterator.pm b/lib/TIGR/FASTAiterator.pm deleted file mode 100755 index 4827d437..00000000 --- a/lib/TIGR/FASTAiterator.pm +++ /dev/null @@ -1,750 +0,0 @@ -# $Id: FASTAiterator.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - -package TIGR::FASTAiterator; -{ - -=head1 NAME - -TIGR::FASTAiterator - TIGR::FASTAiterator class for parsing and navigating -FASTA format files and streams. An object of this class can parse FASTA -records from STDIN and from a pipe. - -=head1 SYNOPSIS - - use TIGR::FASTAiterator; - my $obj_instance = new TIGR::FASTAiterator ($foundation_obj_ref, - $error_array_ref, - $fasta_file_name); - -=head1 DESCRIPTION - -This module iterates over a FASTA formatted file stream. It provides -data extraction and simple analysis routines. This module utilizes -acceptance validation of FASTA formatted files via the TIGR::FASTAgrammar -module. - -=cut - - BEGIN { - require 5.006_00; - } - - use strict; - use IO::File; - use TIGR::Foundation; - use TIGR::FASTAgrammar ':public'; - use TIGR::FASTAgrammar ':private'; - use TIGR::FASTArecord; - - - ## internal variables and identifiers - - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.11'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = - ( - "TIGR::Foundation", - "TIGR::FASTAgrammar", - "TIGR::FASTArecord", - ); - - my $SYS_ERR = 0; # this flag specifies non-user related error - my $USR_ERR = 1; # this flag specifies user related error - - ## external variables - - my $UNBOUND_FASTA_SEPARATOR = $TIGR::FASTAgrammar::UNBOUND_FASTA_SEPARATOR; - - # debugging scheme - # - # Debugging via the TIGR Foundation uses increasing log levels based on - # nesting. 'MAIN' starts at level 1. Every nest increments the level by - # 1. - # Subroutines always start nesting at level 2. As debugging levels - # increase, logging is more verbose. This makes sense as you log at - # greater depth (ie. deeper branching). - # - # The following definitions help emphasize the debugging in the program. - # - my $DEBUG_LEVEL_1 = 1; - my $DEBUG_LEVEL_2 = 2; - my $DEBUG_LEVEL_3 = 3; - my $DEBUG_LEVEL_4 = 4; - my $DEBUG_LEVEL_5 = 5; - my $DEBUG_LEVEL_6 = 6; - my $DEBUG_LEVEL_7 = 7; - my $DEBUG_LEVEL_8 = 8; - my $DEBUG_LEVEL_9 = 9; - - ## prototypes - - sub new(;$$$); - sub open($); - sub close(); - sub hasNext(); - sub next(); - sub get(); - sub _initialize(); - sub _parse(); - sub _nullRecordHandler($); - sub _errorHandler($$$); - - - ## implementation - -=over - -=item $obj_instance = new TIGR::FASTAiterator ($foundation_object, - $error_array_ref, $db_file); - -This method returns a new instance of a TIGR::FASTAiterator object. It takes -three optional parameters: a TIGR::Foundation object (C<$foundation_object>), -a reference to an array for logging user error messages (C<$error_array_ref>), -and a FASTA file (C<$db_file>) or stream. The filename "-" describes stdin. -The new instance is returned on success. If the file supplied cannot be -opened or is invalid, this method returns undefined. This method also returns -undefined if the parameters supplied are invalid. Errors in parsing are written -to the array C<$error_array_ref>, the error file and the log file. - -=cut - - - sub new(;$$$) { - my $pkg = shift; - my @method_args = @_; - - my $error_condition = 0; - my $self = {}; - bless $self, $pkg; - $self->_initialize(); # set up internal variables; - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /foundation/i ) ) { - $self->{foundation} = shift @method_args; - $self->_errorHandler("Got TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{foundation} = undef; - $self->_errorHandler("No TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /array/i ) ) { - $self->{error_ref} = shift @method_args; - $self->_errorHandler("Got Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{error_ref} = undef; - $self->_errorHandler("No Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ! ref ($method_args[0]) ) ) { - my $filename = shift @method_args; - if(defined($filename)) { - $self->{db_file_name} = $filename ; - $self->_errorHandler("Got file name in new()", $DEBUG_LEVEL_4, - $SYS_ERR); - } - else { - $self->_errorHandler("undef passed as filename", $DEBUG_LEVEL_4, - $USR_ERR); - } - } - else { - $self->{db_file_name} = undef; - $self->_errorHandler("No file name in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - # check for invocation errors - if ( ( scalar (@method_args) > 0 ) ) { - $error_condition = 1; - $self->_errorHandler("Too many parameters passed to new() method", - $DEBUG_LEVEL_3, $SYS_ERR); - } - elsif ( defined ( $self->{db_file_name} ) && - ! defined ( $self->open($self->{db_file_name}) ) ) { - # the error message is logged via the open() routine - $self = undef; - } - return ( $error_condition == 0 ) ? $self : undef; - } - - -=item $result = $obj_instance->open($file_name); - -This method opens a FASTA file or pipe for reading. It takes in the filename to -be opened. If the file name is "-" the input is taken from stdin. On success, -this method returns 1. If the file cannot be opened or parsing fails, this -method returns undefined. - -=cut - - - sub open($) { - my $self = shift; - my $db_file_name = shift; - - my $error_condition = 0; - - # close a previously open file - if ( defined ($self->{db_handle}) ) { - $self->_errorHandler("Closing old handle in open()", $DEBUG_LEVEL_3, - $SYS_ERR); - $self->close(); - } - my $name = $self->{db_file_name}; - - if (!( - ( defined ( $db_file_name ) ) && - ( $self->{db_file_name} = $db_file_name ) && - ( defined ( $self->{db_file_name} )) - ) ) { - - $error_condition = 1; - $self->_errorHandler( - "File name does not exist", $DEBUG_LEVEL_3, $USR_ERR); - } - elsif(!defined ( $self->{db_handle} = - new IO::File $self->{db_file_name})) { - $error_condition = 1; - $self->_errorHandler( - "Cannot open file \'$self->{db_file_name}\'", $DEBUG_LEVEL_3, - $USR_ERR); - } - - - if ( $error_condition == 1 ) { - $self->_initialize(); # reset object state - } - - return ($error_condition == 1) ? undef : 1; - } - - -=item $result = $obj_instance->close(); - -This method closes the object file stream and resets all internal data -structures. The result of the operation is returned. If the file stream -is closed successfully, this object returns true (1), otherwise false -(undefined). - -=cut - - - sub close() { - my $self = shift; - my $return_val = undef; - - if ( defined ( $self->{db_handle} ) ) { - $return_val = $self->{db_handle}->close(); - if (!$return_val) { - $return_val = undef; - $self->_errorHandler( - "Error closing FASTA file: $self->{db_file_name}", - $DEBUG_LEVEL_4, $USR_ERR); - } - } - $self->_initialize(); - return $return_val; - } - - -=item $result = $obj_instance->hasNext(); - -This method returns true (1) if there are more elements beyond the current -element in the filestream. If not, this method returns false (undefined). - -=cut - - sub hasNext() { - my $self = shift; - my $next_header = $self->{rec_header}; - $self->_errorHandler( - "Checking to see if the header of the next record is set", - $DEBUG_LEVEL_2, $SYS_ERR); - my $result = undef; - my $newline = undef; - my $line_number = $self->{line_number}; - if ($line_number == 0) { - $self->_errorHandler( - "No record has been parsed", - $DEBUG_LEVEL_3, $SYS_ERR); - my $db_handle = defined ( $self->{db_handle} ) ? - $self->{db_handle} : undef; - - if(defined $db_handle) { - $newline = <$db_handle>; - } - #reading the first line from a file. - if(defined($newline)) { - $line_number++; - $self->{line_number} = $line_number; - $next_header = $newline; - $self->_errorHandler( - "Assigned the header of the next record", - $DEBUG_LEVEL_4, $SYS_ERR); - $self->{rec_header} = $next_header; - $result = 1; - } - } - - if((defined ($next_header)) && (($next_header) ne "")) { - $result = 1; - } - return $result; - } - - -=item $result = $obj_instance->next(); - -This method selects the next record in the file stream for parsing. If the -record parses, it is returned, else the method returns undefined. If there is -no record in the file stream, the method returns undefined. - -=cut - - sub next() { - my $self = shift; - my $record = undef; - if(($self->_parse()) == 1) { - $self->_errorHandler( - "The record parsed", - $DEBUG_LEVEL_3, $SYS_ERR); - #obtaining the stored record. - my $recordarray_ref = $self->{recordinfo}; - - if( (defined $recordarray_ref) && - (( ref ($recordarray_ref) ) =~ /array/i) ) { - - my @recordarray = @$recordarray_ref; - my $array_length = 0; - - if((defined $recordarray_ref) && - ($array_length = @recordarray) && - ($array_length > 0)) { - - my $header = shift @recordarray; - $self->_errorHandler( - "Got the record header", - $DEBUG_LEVEL_5, $SYS_ERR); - my $data = undef; - if ( scalar(@recordarray) > 0 ) { - $data = join "", @recordarray; - $data =~ s/\n//g; # extract new lines from scalar data - } - $self->_errorHandler( - "Got the record data", - $DEBUG_LEVEL_5, $SYS_ERR); - $record = new TIGR::FASTArecord ($header, $data); - - if(defined($record)) { - $self->_errorHandler( - "Created new record", - $DEBUG_LEVEL_6, $SYS_ERR); - } - } - } - } - return $record; - } - - -=item $record_contents = $obj_instance->get(); - -This method returns the current TIGR::FASTArecord object (active record). If -the current object (active record) is undefined, this method returns undefined. - -=cut - - sub get() { - my $self = shift; - my $record = undef; - #obtaining the stored record information. - my $recordarray_ref = $self->{recordinfo}; - if( (defined $recordarray_ref) && - (( ref ($recordarray_ref) ) =~ /array/i) ) { - my @recordarray = @$recordarray_ref; - my $array_length = 0; - - if((defined $recordarray_ref) && - ($array_length = @recordarray) && - ($array_length > 0)) { - - my $header = shift @recordarray; - $self->_errorHandler( - "Got the record header", - $DEBUG_LEVEL_4, $SYS_ERR); - my $data = undef; - if ( scalar(@recordarray) > 0 ) { - $data = join "", @recordarray; - $data =~ s/\n//g; # extract new lines from scalar data - } - $self->_errorHandler( - "Got the record data", - $DEBUG_LEVEL_4, $SYS_ERR); - $record = new TIGR::FASTArecord ($header, $data); - if(defined($record)) { - $self->_errorHandler( - "Created new record", - $DEBUG_LEVEL_5, $SYS_ERR); - } - - } - } - return $record; - } - - -# $obj_instance->_initialize(); - -#This method resets the object to its initial state. Internal data structures -#are reset. This method does not return. - - - sub _initialize() { - my $self = shift; - # look up methods for records here - $self->{error_cnt} = 0; # parse error tabulator - $self->{db_file_name} = ""; - $self->{db_handle} = undef; - $self->{rec_header} = undef; # the next record header - $self->{recordinfo} = undef; # reference to the record contents - $self->{line_number} = 0; # the line number in the FASTA file. - } - -# $obj_instance->_parse(); - -#This method parses a FASTA record from the file stream. -#All the parsing errors for this record are recorded in -#the logfile. If the record parses correctly, the method returns 1, else it -#returns 0. - - - sub _parse() { - - my $self = shift; - my $last_line_length_lt_std_flag = 0; - my $record_identifier = ""; - my $preceding_header_flag = 0; - my $first_data_line_length = 0; - my @recarray; - my $preceding_record_flag = 0; - - my $line_number = $self->{line_number}; - - $self->{error_cnt} = 0; - my $db_handle = defined ( $self->{db_handle} ) ? - $self->{db_handle} : undef; - - # check for FASTA header of next record - my $header = $self->{rec_header}; - - #when in the beginning and end of the filestream. - if(!defined ($header)) { - my $newline = <$db_handle>; - if((defined($newline)) && ($newline ne "")) { - $line_number++; - $header = $newline; - } - } - #parsing the header - if ( ( isValidFASTAheader($header) ) != 0 ) { - # set up the variables for parsing a new record - $last_line_length_lt_std_flag = 0; - $preceding_header_flag = 1; - $preceding_record_flag = 1; - $self->{rec_header} = undef; - - # if it's a valid FASTA header, then don't need to check again - # extract the record IDENTIFIER - $record_identifier = _headerToIdentifier($header); - push @recarray,$header; - } - else { #the header is not valid. - if((defined $header) && (defined $line_number)) { - $self->_errorHandler("Expected: record header " . - "information in FASTA record header. Got: \'$header\' ". - "at line $line_number.", $DEBUG_LEVEL_3, $USR_ERR); - } - $preceding_record_flag = 1; - $self->{rec_header} = undef; - } - - while ( ( defined ( $db_handle ) ) && - ( defined ( my $line = <$db_handle> )) && - (++$line_number)) { - chomp $line; - - # check FASTA data - if ( ( defined ( $record_identifier ) ) && - ( $record_identifier !~ // ) && - ( ( isValidFASTAdata($line) ) != 0 ) ) { - - push @recarray,$line; - if($preceding_header_flag == 1) { - $first_data_line_length = setValidFASTAlineLength($line); - } - - # check $last_line_length_lt_std_flag for an error on - # previous line - if(defined ($first_data_line_length)) { - if ( $last_line_length_lt_std_flag == 1 ) { - $self->{error_cnt}++; - $self->_errorHandler("Expected: FASTA data ". - "definition lines should be ". - "$first_data_line_length bases (characters) ". - "across. Only the last line of a sequence ". - "data definition may be less than " . - "$first_data_line_length bases (characters) " . - "across, if applicable. See line " . - ($line_number - 1) . '.', $DEBUG_LEVEL_6, $USR_ERR); - } - $last_line_length_lt_std_flag = 0; - - # check current line for over-length problem - if ( length($line) > $first_data_line_length ) { - $self->{error_cnt}++; - $self->_errorHandler("Expected: FASTA data ". - "definition lines should be $first_data_line_length ". - "bases (characters) across. Only the last line of a ". - "sequence data definition may be less than " . - "$first_data_line_length bases (characters) ". - "across, if applicable. See line " . $line_number . - '.', - $DEBUG_LEVEL_6,$USR_ERR); - } - - #check current line for under-length problem; - #report only if not - #the last line in the data definition - elsif ( length($line) < $first_data_line_length ) { - $last_line_length_lt_std_flag = 1; - } - } - $preceding_header_flag = 0; - } - elsif($line =~ /^>/) { #the next header - if ($preceding_record_flag == 1) { #its the next record. - $self->{rec_header} = $line; - $self->_errorHandler( - "Assigned the header of the next record", - $DEBUG_LEVEL_5, $SYS_ERR); - last; - } - } - # handle data error types - else { - $self->{error_cnt}++; - - # line has a separator token in it, so it may be header - if ( $line =~ /$UNBOUND_FASTA_SEPARATOR/ ) { - $self->_errorHandler("Expected: record header " . - "information in FASTA record header. Got: \'$line\' ". - "at line $line_number.", $DEBUG_LEVEL_6, $USR_ERR); - $last_line_length_lt_std_flag = 0; - } - # if last data line was small, expect this to be a header too - elsif ( $last_line_length_lt_std_flag == 1 ) { - $self->_errorHandler("Expected: FASTA record header " . - "beginning with \'>\'. Got: \'$line\' at line ". - "$line_number.",$DEBUG_LEVEL_6, $USR_ERR); - $last_line_length_lt_std_flag = 0; - } - elsif ( ( defined ( $record_identifier ) ) && - ( $record_identifier !~ // ) ) { - $self->_errorHandler("Expected: valid FASTA data " . - "definition for record identifier ". - "\'$record_identifier\' Check sequence content at line ". - "$line_number for invalid bases ". - "(data type: invalid data).", $DEBUG_LEVEL_6, $USR_ERR); - } - else { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence. Invalid input at " . - "line $line_number.", $DEBUG_LEVEL_6, $USR_ERR); - } - } - } # end while - - # check terminal case data definition - if ( $preceding_header_flag == 1 ) { - $self->_nullRecordHandler($line_number); - } - $self->{recordinfo} = \@recarray; - $self->{line_number} = $line_number; - return ( $self->{error_cnt} == 0 ) ? 1 : 0; - } - - -# $obj_instance->_nullRecordHandler($); - -#This method handles the case of a null or equivalently empty record -#encountered during parsing. It logs the appropriate message to the -#TIGR Foundation object. The only argument is the line number. - - - sub _nullRecordHandler($) { - my $self = shift; - my $line_number = shift; - - if ( ! defined ($line_number) ) { - $line_number = ""; - } - - $self->{error_cnt}++; - if ( $self->{db_handle}->eof() == 1 ) { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence. " . - "Got end of file after line " . - ($line_number) . ".", $DEBUG_LEVEL_5, $USR_ERR); - } - else { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence " . - "Got FASTA header at line " . - ($line_number-1) . ".", $DEBUG_LEVEL_5, $USR_ERR); - } - } - - - -# $message = $obj_instance->_errorHandler($message, $tf_level, -# $internal_log_flag); - -#This method handles logging to the TIGR::Foundation module and -#internal error record reference array. The C<$message> argument is logged -#to the appropriate service. The C<$tf_level> parameter specifies the -#logging level for TIGR::Foundation, while the C<$internal_log_flag> parameter -#specifies if C<$message> should be written to the internal array reference -#specified in C. If a TIGR::Foundation instance does not exist, -#no logging to that facility occurs. This method returns C<$message>. - - - sub _errorHandler($$$) { - - my $self = shift; - - my ( $message, $tf_level, $log_facility ) = @_; - - if ( defined ($message) && - defined ($tf_level) && - defined ($log_facility) ) { - - if ( defined ($self->{foundation}) ) { - if ( $log_facility != $USR_ERR ) { # all user errors go to .error - $self->{foundation}->logLocal($message, $tf_level); - } - else { - $self->{foundation}->logError($message); - } - } - - if ( ( defined ($self->{error_ref}) ) && - ( $log_facility == $USR_ERR ) ) { - push @{$self->{error_ref}}, $message; - } - } - return $message; - } - -=head1 USAGE - -To use this module, load the C package via the -C function. Then, create a new instance of the object via the -C method, as shown below. There are several invocations possible -for this method since all parameters to C are optional. - -To access records from the C instance, the -C package must be loaded via the C function. - -An example script using this module follows. The C -module is included for completeness but does not have to be used. - - #!/usr/local/bin/perl -w - - # This script accepts FASTA files with the '-i' option - # on the command line and validates every record in the file. - # Parse errors for each record are collected to the - # '@errors_list' array and written to the .error file. - # This program concatenates all of the correct records together to - # one output file specified with the '-o' option. - # NOTE: The '-i' option must be specified before every input file. - - - use strict; - use TIGR::FASTAiterator; - use TIGR::FASTArecord; - - MAIN: - { - my $tf_object = new TIGR::Foundation; - my @errors_list = (); - my @input_files = (); - my $output_file = undef; - - # Capture the return code from the TIGR::Foundation method - my $result = $tf_object->TIGR_GetOptions('i=s' => \@input_files, - 'o=s' => \$output_file); - if ( $result != 1 ) { - $tf_object->bail("Invalid command line options."); - } - - # Create a TIGR::FASTAiterator instance using TIGR::Foundation and - # an error message list. - my $fasta_iterator = new TIGR::FASTAiterator $tf_object, \@errors_list; - - if ( !( defined ( $output_file ) && - open OUTFILE, ">$output_file" ) ) { - $tf_object->bail("Cannot open output file for writing."); - } - - foreach my $in_file ( @input_files ) { - $fasta_iterator->open($in_file) or - $tf_object->logLocal("Cannot open or read file $in_file", 2); - - if ( scalar(@errors_list) > 0 ) { # are there parse errors? - while ( @errors_list ) { # get the messages from the list - my $message = shift @errors_list; - print STDERR $message, "\n"; - } - } - # - while ( $fasta_iterator->hasNext() ) { - # print each record to OUTFILE - my $record = $fasta_iterator->next(); - - # print each record to OUTFILE - if(defined $record) { - print OUTFILE $record->toString(); - } - } - } - } - -=cut - -} - -1; diff --git a/lib/TIGR/FASTAreader.pm b/lib/TIGR/FASTAreader.pm deleted file mode 100755 index c4f96c8d..00000000 --- a/lib/TIGR/FASTAreader.pm +++ /dev/null @@ -1,930 +0,0 @@ -# $Id: FASTAreader.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - - -package TIGR::FASTAreader; -{ - -=head1 NAME - -TIGR::FASTAreader - TIGR::FASTAreader class for parsing and navigating -FASTA format files. - -=head1 SYNOPSIS - - use TIGR::FASTAreader; - my $obj_instance = new TIGR::FASTAreader ($foundation_obj_ref, - $error_array_ref, $fasta_file_name); - -=head1 DESCRIPTION - -This module iterates over a FASTA formatted database file. It provides -data extraction and simple analysis routines. This module utilizes -acceptance validation of FASTA formatted files via the TIGR::FASTAgrammar -module. - -=cut - - BEGIN { - require 5.006_00; - } - - use strict; - use IO::File; - use TIGR::Foundation; - use TIGR::FASTAgrammar ':public'; - use TIGR::FASTAgrammar ':private'; - use TIGR::FASTArecord; - - - ## internal variables and identifiers - - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.21'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = (); - - my $SYS_ERR = 0; # this flag specifies non-user related error - my $USR_ERR = 1; # this flag specifies user related error - - ## external variables - - my $UNBOUND_FASTA_SEPARATOR = $TIGR::FASTAgrammar::UNBOUND_FASTA_SEPARATOR; - - # debugging scheme - # - # Debugging via the TIGR Foundation uses increasing log levels based on - # nesting. 'MAIN' starts at level 1. Every nest increments the level by - # 1. - # Subroutines always start nesting at level 2. As debugging levels - # increase, logging is more verbose. This makes sense as you log at - # greater depth (ie. deeper branching). - # - # The following definitions help emphasize the debugging in the program. - # - my $DEBUG_LEVEL_1 = 1; - my $DEBUG_LEVEL_2 = 2; - my $DEBUG_LEVEL_3 = 3; - my $DEBUG_LEVEL_4 = 4; - my $DEBUG_LEVEL_5 = 5; - my $DEBUG_LEVEL_6 = 6; - my $DEBUG_LEVEL_7 = 7; - my $DEBUG_LEVEL_8 = 8; - my $DEBUG_LEVEL_9 = 9; - - ## prototypes - - sub new(;$$$); - sub open($;$); - sub close(); - sub index(); - sub seekIndex($); - sub getRecordByIdentifier($); - sub seekIdentifer($); - sub get(); - sub next(); - sub hasNext(); - sub count(); - sub path(); - sub _initialize(); - sub _parseDBfile(); - sub _nullRecordHandler($$); - sub _errorHandler($$$); - - - ## implementation - -=over - -=item $obj_instance = new TIGR::FASTAreader ($foundation_object, - $error_array_ref, $db_file); - -This method returns a new instance of a TIGR::FASTAreader object. It takes -three optional parameters: a TIGR::Foundation object (C<$foundation_object>), -a reference to an array for logging user error messages (C<$error_array_ref>), -and FASTA file (C<$db_file>). The new instance is returned on success. If -the file supplied cannot be opened or is invalid, this method returns -undefined. This method also returns undefined if the parameters supplied are -invalid. Errors in parsing are written to the array at C<$error_array_ref> -and the log file. - -=cut - - sub new(;$$$) { - my $pkg = shift; - my @method_args = @_; - - my $error_condition = 0; - my $self = {}; - bless $self, $pkg; - $self->_initialize(); # set up internal variables; - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /foundation/i ) ) { - $self->{foundation} = shift @method_args; - $self->_errorHandler("Got TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{foundation} = undef; - $self->_errorHandler("No TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /array/i ) ) { - $self->{error_ref} = shift @method_args; - $self->_errorHandler("Got Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{error_ref} = undef; - $self->_errorHandler("No Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ! ref ($method_args[0]) ) ) { - my $filename = shift @method_args; - if(defined($filename)) { - $self->{db_file_name} = $filename ; - $self->_errorHandler("Got file name in new()", $DEBUG_LEVEL_4, - $SYS_ERR); - } - else { - $self->_errorHandler("undef passed as filename", $DEBUG_LEVEL_4, - $USR_ERR); - } - } - else { - $self->{db_file_name} = undef; - $self->_errorHandler("No file name in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - # check for invocation errors - if ( ( scalar (@method_args) > 0 ) ) { - $error_condition = 1; - $self->_errorHandler("Too many parameters passed to new() method", - $DEBUG_LEVEL_3, $SYS_ERR); - } - elsif ( defined ( $self->{db_file_name} ) && - ! defined ( $self->open($self->{db_file_name}, "r") ) ) { - # the error message is logged via the open() routine - $self = undef; - } - return ( $error_condition == 0 ) ? $self : undef; - } - - -=item $result = $obj_instance->open($file_name, $flag); - -This method opens a FASTA file for reading. This method also parses the file -for correctness. The file, C<$file_name>, is opened using the C flags -specified by C<$flag>. On success, this method returns 1. If the file cannot -be opened or parsing fails, this method returns undefined. - -=cut - - sub open($;$) { - my $self = shift; - my $db_file_name = shift; - my $open_flags = shift; - - my $error_condition = 0; - - if ( ( ! defined ($open_flags) ) || - ( $open_flags !~ /^r$/i ) ) { - $open_flags = "r"; - } - $self->_errorHandler("Open flags = \'$open_flags\' in open()", - $DEBUG_LEVEL_3, $SYS_ERR); - - # close a previously open file - if ( defined ($self->{db_handle}) ) { - $self->_errorHandler("Closing old handle in open()", $DEBUG_LEVEL_3, - $SYS_ERR); - $self->close(); - } - - if (!( - ( defined ( $db_file_name ) ) && - ( $self->{db_file_name} = $db_file_name ) && - ( defined ( $self->{db_file_name} )) && - ( defined ( $self->{db_handle} = - new IO::File $self->{db_file_name}, $open_flags )) - ) ) { - $error_condition = 1; - $self->_errorHandler( - "Cannot open file \'$self->{db_file_name}\'", $DEBUG_LEVEL_3, - $USR_ERR); - } - elsif ( ( defined ( $self->{db_file_name} ) ) && - ( defined ( $self->{db_handle} ) ) && - ( $self->_parseDBfile() == 0 ) ) { - $error_condition = 1; - $self->_errorHandler("Encountered errors in file " . - "\'$self->{db_file_name}\'.", $DEBUG_LEVEL_3, $USR_ERR); - } - - if ( $error_condition == 1 ) { - $self->_initialize(); # reset object state - } - - return ($error_condition == 1) ? undef : 1; - } - - -=item $result = $obj_instance->close(); - -This method closes the object file stream and resets all internal data -structures. The result of the operation is returned. If the file stream -is closed successfully, this object returns true (1), otherwise false -(undefined). - -=cut - - sub close() { - my $self = shift; - my $return_val = undef; - - if ( defined ( $self->{db_handle} ) ) { - $return_val = $self->{db_handle}->close(); - if (!$return_val) { - $return_val = undef; - $self->_errorHandler( - "Error closing FASTA file: $self->{db_file_name}", - $DEBUG_LEVEL_4, $USR_ERR); - } - } - $self->_initialize(); - return $return_val; - } - - -=item $record_num = $obj_instance->index(); - -This method returns the record number of the active record. If no record has -been selected (ie. made active), then this method returns undefined. If -the active record pointer is before the first record, this method returns -'-1'. - -=cut - - sub index() { - my $self = shift; - my $return_val = undef; - - if ( defined ($self->{active_record}) ) { - $return_val = $self->{active_record}; - } - else { - $return_val = undef; - } - return $return_val; - } - - -=item $result = $obj_instance->seekIndex($num); - -This method selects a record by record order index. The C<$num> ordered -record is selected. If C<$num> is out of range for the database or not -1 -(indicating to seek one record before the first record), this -function returns undefined and the active record pointer is not changed. -Otherwise, the requested record is made active and the method returns 1. - -=cut - - sub seekIndex($) { - my $self = shift; - my $active_num = shift; - my $return_val; - - if ( (defined ($active_num) ) && - ( ($active_num =~ /^\d+$/) || - ($active_num == -1 ) ) && - ($active_num < $self->count()) ) { - $self->_errorHandler( - "Setting active record num to $active_num.", - $DEBUG_LEVEL_3, $SYS_ERR); - $self->{active_record} = $active_num; - $return_val = 1; - } - else { - if ( ! defined ($active_num) ) { - $active_num = ""; - } - $self->_errorHandler( - "Cannot set active record num to $active_num, out of " . - $self->count(), $DEBUG_LEVEL_3, $SYS_ERR); - $return_val = undef; - } - return $return_val; - } - - -=item $result = $obj_instance->next(); - -This method selects the next record in numerical order to be the active -record. It returns the record on success, undefined on failure. If the active -record is equal to -1, the first record is selected. - -=cut - - sub next() { - my $self = shift; - my $return_val = undef; - - if ( defined ( $self->hasNext() ) ) { - $self->{active_record}++; - $return_val = $self->get(); - } - # set undefined if no more records left or no records at all - else { - $return_val = undef; - } - - return $return_val; - } - - -=item $result = $obj_instance->hasNext(); - -This method returns true (1) if there are more elements beyond the -current element. If not, this method returns false (undefined). - -=cut - - sub hasNext() { - my $self = shift; - my $return_val = undef; - - if ( ( defined ($self->{active_record}) ) && - ( $self->{active_record} >= -1 ) && - ( $self->{active_record} < ( $self->count() - 1 ) ) - ) { - $return_val = 1; - } - else { - $return_val = undef; - } - return $return_val; - } - - -=item $result = - $obj_instance->getRecordByIdentifier($identifier); - -This method selects a record by record minimal identifier. -If C<$identifier> does not exist in the set of records, this function -returns undefined and the previously active record remains active. Otherwise, -the requested record is made active and the method returns a -C object representation of the current(active) record. - -=cut - - sub getRecordByIdentifier($) { - - my $self = shift; - my $identifier = shift; - my $fasta_record = undef; - my $seek_result = undef; - if((defined $identifier)) { - - my $seek_result = $self->seekIdentifier($identifier); - if( (defined ($seek_result)) && ($seek_result == 1)) { - $fasta_record = $self->get(); - } - } - else { - $self->_errorHandler("undefined identifier passed", $DEBUG_LEVEL_3, - $USR_ERR); - } - return $fasta_record; - } - - -=item $result = $obj_instance->seekIdentifier($identifier); - -This method selects a record by record minimal identifier. -If C<$identifier> does not exist in the set of records, this function -returns undefined and the previously active record remains active. Otherwise, -the requested record is made active and the method returns 1. - -=cut - - sub seekIdentifier($) { - my $self = shift; - my $identifier = shift; - my $number = undef; - - if ( (defined $identifier) && - (exists $self->{identifier_to_number_hash}->{$identifier}) ) { - $number = $self->{identifier_to_number_hash}->{$identifier}; - $self->_errorHandler( - "Got record number $number for $identifier.", $DEBUG_LEVEL_3, - $SYS_ERR); - } - return $self->seekIndex($number); - } - - -=item $record_contents = $obj_instance->get(); - -This method returns a C object representation of the current -(active) record. If no defined record is active, this method returns -undefined. - -=cut - - sub get() { - my $self = shift; - my $db_handle = defined ($self->{db_handle}) ? - $self->{db_handle} : undef; - my $header = ""; - my $data = ""; - my $f_obj = undef; - my $pos_str = undef; - my @pos_arr = (); - my $record = undef; - - # open FASTA file for reading - if (! defined ($db_handle) ) { - $self->_errorHandler("db_handle not defined: " . - "cannot access FASTA file \'$self->{db_file_name}\'.", - $DEBUG_LEVEL_3); - } - - # search and extract the FASTA record - if (( defined ( $self->{active_record} )) && - ( $self->{active_record} > -1 ) && - ( $self->{active_record} < $self->count() ) && - ( defined ( $db_handle )) && - ( defined ( $pos_str = - $self->{number_to_fp_array}->{$self->{active_record}} )) - ) { - @pos_arr = split " ", $pos_str; - - # seek to the start position of the record in the file and read the - # record information - if((defined $pos_arr[0]) && - (defined($db_handle->seek($pos_arr[0],SEEK_SET))) && - (defined $pos_arr[1]) && - (defined (read $db_handle, $record, - (($pos_arr[1]-$pos_arr[0])+1)))) { - - ($header) = $record =~ /(>.*)\n/; - $record =~ s/(>.*)\n//; - $record =~ s/[\s\n]+//g; - $f_obj = new TIGR::FASTArecord $header, $record; - } - } - return $f_obj; - } - - -=item $db_name = $obj_instance->path(); - -This method returns the path to the file used for processing. - -=cut - - sub path() { - my $self = shift; - # the existence of db_file_name is checked in new() - return $self->{db_file_name}; - } - - -=item $cnt = $obj_instance->count(); - -This method returns the number of records in the database file. - -=cut - - sub count() { - my $self = shift; - return $self->{num_records}; - } - - -# $obj_instance->_initialize(); - -#This method resets the object to its initial state. Internal data structures -#are reset. This method does not return. - - sub _initialize() { - my $self = shift; - - $self->{num_records} = 0; # number of recs - - # look up methods for records here - # the active record is stored as a sequence number - $self->{active_record} = undef; # current working record - $self->{number_to_fp_array} = (); # map seq# to file loc - $self->{number_to_identifier_array} = (); # map seq# to identifier - $self->{identifier_to_number_hash} = (); # map seq identifier to seq# - $self->{error_cnt} = 0; # parse error tabulator - $self->{db_file_name} = ""; - $self->{db_handle} = undef; - } - - -# $obj_instance->_parseDBfile(); - -#This method parses the FASTA database file passed via the C method. -#It defines all of the sequence look-ups and validates every record. This -#method finds the number of sequences and maximum sequence length. This -#method is called from the C method. The active record is un-selected -#by this method. - - sub _parseDBfile() { - my $self = shift; - my $last_line_length_lt_std_flag = 0; - my $line_number = 0; - my $record_identifier = ""; - my $preceding_header_flag = 0; - my $first_data_line_length = undef; - my $empty_line_found = 0; - my $db_handle = defined ( $self->{db_handle} ) ? - $self->{db_handle} : undef; - #the file position where the record starts - my $pos1 = 0; - #the file position where the record ends - my $pos2 = undef; - #the start and end positions of a record separated with a space - my $string = ""; - # variable to give the length of each line - my $line_len = undef; - # the sum of the length of all the lines in a file - my $sum_len = 0; - # loop through FASTA file - while ( ( defined ( $db_handle ) ) && - ( defined ( my $line = <$db_handle> ) ) && - ( ++$line_number ) ) { - chomp $line; - $line_len = length($line); - $sum_len += $line_len; - $sum_len++; - - # check FASTA data - if ( ( defined ( $record_identifier ) ) && - ( $record_identifier !~ // ) && - ( ( isValidFASTAdata($line) ) != 0 ) ) { - - # check if previous line was empty - if ( $empty_line_found == 1 ) { - $self->{error_cnt}++; - $self->_errorHandler("ERROR: Empty line found at line ". - ($line_number - 1). " - empty lines are ". - "allowed only at the end of a file", - $DEBUG_LEVEL_5, $USR_ERR); - $empty_line_found = 0; - } - - if($preceding_header_flag == 1) { - $first_data_line_length = setValidFASTAlineLength($line); - } - - # check $last_line_length_lt_std_flag for an error on previous line - if(defined ($first_data_line_length)) { - if ( $last_line_length_lt_std_flag == 1 ) { - $self->{error_cnt}++; - $self->_errorHandler("Expected: FASTA data definition " . - "lines should be $first_data_line_length bases " . - "(characters) across. Only the last line of a sequence ". - "data definition may be less than " . - "$first_data_line_length bases (characters) " . - "across, if applicable. See line " . - ($line_number - 1) . '.', $DEBUG_LEVEL_5, $USR_ERR); - } - $last_line_length_lt_std_flag = 0; - - # check current line for over-length problem - if ( $line_len > $first_data_line_length ) { - $self->{error_cnt}++; - $self->_errorHandler("Expected: FASTA data definition " . - "lines should be $first_data_line_length bases " . - "(characters) across. Only the last line of a sequence ". - "data definition may be less than " . - "$first_data_line_length bases (characters) ". - "across, if applicable. See line " . $line_number . '.', - $DEBUG_LEVEL_5,$USR_ERR); - } - - #check current line for under-length problem; report only if not - #the last line in the data definition - elsif ( $line_len < $first_data_line_length ) { - $last_line_length_lt_std_flag = 1; - } - } - $preceding_header_flag = 0; - } - # check for FASTA header - elsif ( ( isValidFASTAheader($line) ) != 0 ) { - if ( ! defined ( $self->{active_record} ) ) { - $self->{active_record} = -1; - } - - $self->{active_record}++; - - if( (defined $line_number) && - ($line_number > 1) ) { - $pos2 = (($sum_len - $line_len)-2); - $pos1 = $pos2+1; - - if((defined $pos1) && ( defined $string)) { - $string .= "$pos2"; - # store the start and end of a fasta record in a hash - $self->{number_to_fp_array}->{($self->{active_record})-1} = - $string; - } - } - - if(defined $pos1) { - $string = "$pos1 "; - } - - # check if previous line was a FASTA header - if ( $preceding_header_flag == 1 ) { - $self->_nullRecordHandler($self->{active_record} - 1, - $line_number); - } - - # check if previous line was empty - if ( $empty_line_found == 1 ) { - $self->{error_cnt}++; - $self->_errorHandler("ERROR: Empty line found at line ". - ($line_number - 1). " - empty lines are ". - "allowed only at the end of a file", - $DEBUG_LEVEL_5, $USR_ERR); - $empty_line_found = 0; - } - # if it's a valid FASTA header, then don't need to check again - # extract the record IDENTIFIER - $record_identifier = _headerToIdentifier($line); - if ( defined ( - $self->{identifier_to_number_hash}->{$record_identifier} ) ) { - $self->{error_cnt}++; - $self->_errorHandler("Expected: unique FASTA " . - "identifier. \'$record_identifier\' is a duplicate at " . - "line $line_number.", $DEBUG_LEVEL_5, $USR_ERR); - } - else { - $self->{identifier_to_number_hash}->{$record_identifier} = - $self->{active_record}; - $self->{number_to_identifier_array}->{$self->{active_record}} = - $record_identifier; - } - - # set up the variables for parsing a new record - $last_line_length_lt_std_flag = 0; - $preceding_header_flag = 1; - $self->{num_records}++; - } - # handle empty space - # empty space after the last record is allowed - elsif($line eq "") { - $empty_line_found = 1; - next; - } - # handle error data types - else { - $self->{error_cnt}++; - # check if previous line was empty - if ( $empty_line_found == 1 ) { - $self->{error_cnt}++; - $self->_errorHandler("ERROR: Empty line found at line ". - ($line_number - 1). " - empty lines are ". - "allowed only at the end of a file", - $DEBUG_LEVEL_5, $USR_ERR); - $empty_line_found = 0; - } - - # line has a separator token in it, so it may be header - if ( $line =~ /$UNBOUND_FASTA_SEPARATOR/ ) { - $self->_errorHandler("Expected: record header " . - "information in FASTA record header. Got: \'$line\' at " . - "line $line_number.", $DEBUG_LEVEL_6, $USR_ERR); - $last_line_length_lt_std_flag = 0; - } - # if last data line was small, expect this to be a header too - elsif ( $last_line_length_lt_std_flag == 1 ) { - $self->_errorHandler("Expected: FASTA record header " . - "beginning with \'>\'. Got: \'$line\' at line ". - "$line_number.",$DEBUG_LEVEL_6, $USR_ERR); - $last_line_length_lt_std_flag = 0; - } - elsif ( ( defined ( $record_identifier ) ) && - ( $record_identifier !~ // ) ) { - $self->_errorHandler("Expected: valid FASTA data " . - "definition for record identifier \'$record_identifier\'. " . - "Check sequence content at line $line_number for invalid " . - "bases (data type: invalid data).", $DEBUG_LEVEL_6, - $USR_ERR); - } - else { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence. Invalid input at " . - "line $line_number.", $DEBUG_LEVEL_6, $USR_ERR); - } - } - } # end while - - $pos2 = $sum_len; - if((defined $pos1) && ( defined $string)) { - $string .= "$pos2"; - # store the start and end of a fasta record in a hash - $self->{number_to_fp_array}->{$self->{active_record}} = $string; - } - - # check terminal case data definition - if ( $preceding_header_flag == 1 ) { - $self->_nullRecordHandler($self->{active_record}, $line_number); - } - - $self->{active_record} = -1; # set counter to the beginning - - return ( $self->{error_cnt} == 0 ) ? 1 : 0; - } - - -# $obj_instance->_nullRecordHandler($$); - -#This method handles the case of a null or equivalently empty record -#encountered during parsing. It logs the appropriate message to the -#TIGR Foundation object. The only arguments are the record number -#and the line number. - - sub _nullRecordHandler($$) { - my $self = shift; - my $active_num = shift; - my $line_number = shift; - my $preceding_rec_line_number = undef; - my $record_identifier = undef; - - if ( defined ( $self->{number_to_identifier_array}-> - {$active_num} ) ) { - $record_identifier = $self->{number_to_identifier_array}-> - {$active_num}; - } - else { - $record_identifier = ""; - } - - if(defined $line_number) { - if((defined $active_num) && - ($active_num < ($self->{active_record}))) { - $preceding_rec_line_number = $line_number-1; - } - elsif((defined $active_num) && - ($active_num == ($self->{active_record}))) { - $preceding_rec_line_number = $line_number; - } - } - else { - $preceding_rec_line_number = ""; - } - - $self->{error_cnt}++; - if ( $self->{db_handle}->eof() == 1 ) { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence. Record identifier " . - "\'" . $record_identifier . "\' is undefined from line " . - $preceding_rec_line_number . ". Got end of file after line " . - $line_number . ".", $DEBUG_LEVEL_5, $USR_ERR); - } - else { - $self->_errorHandler("Expected: FASTA record header " . - "followed by definition of sequence. Record identifier " . - "\'" . $record_identifier . "\' is undefined from line " . - $preceding_rec_line_number . ". Got FASTA header at line " . - $line_number . ".", $DEBUG_LEVEL_5, $USR_ERR); - } - } - - -# $message = $obj_instance->_errorHandler($message, $tf_level, -# $internal_log_flag); - -#This method handles logging to the TIGR::Foundation module and -#internal error record reference array. The C<$message> argument is logged -#to the appropriate service. The C<$tf_level> parameter specifies the -#logging level for TIGR::Foundation, while the C<$internal_log_flag> parameter -#specifies if C<$message> should be written to the internal array reference -#specified in C. If a TIGR::Foundation instance does not exist, -#no logging to that facility occurs. This method returns C<$message>. - - sub _errorHandler($$$) { - - my $self = shift; - - my ( $message, $tf_level, $log_facility ) = @_; - - if ( defined ($message) && - defined ($tf_level) && - defined ($log_facility) ) { - - if ( defined ($self->{foundation}) ) { - if ( $log_facility != $USR_ERR ) { # all user errors go to .error - $self->{foundation}->logLocal($message, $tf_level); - } - else { - $self->{foundation}->logError($message); - } - } - - if ( ( defined ($self->{error_ref}) ) && - ( $log_facility == $USR_ERR ) ) { - push @{$self->{error_ref}}, $message; - } - } - return $message; - } - -=head1 USAGE - -To use this module, load the C package via the -C function. Then, create a new instance of the object via the -C method, as shown below. There are several invocations possible -for this method since all parameters to C are optional. -To access records from the C instance, the -C package must be loaded via the C function. -An example script using this module follows. The C -module is included for completeness but does not have to be used. - - #!/usr/local/bin/perl -w - - # This script accepts FASTA files with the '-i' option - # on the command line and validates every one in turn. - # Parse errors are collected to the '@errors_list' array. - # This program concatenates all of the records together to - # one output file specified with the '-o' option. - # NOTE: The '-i' option must be specified before every input file. - # NOTE: The 'TIGR::FASTAwriter' module is intended for writing - # FASTA records. - - use strict; - use TIGR::FASTAreader; - use TIGR::FASTArecord; - - MAIN: - { - my $tf_object = new TIGR::Foundation; - my @errors_list = (); - my @input_files = (); - my $output_file = undef; - - # Capture the return code from the TIGR::Foundation method - my $result = $tf_object->TIGR_GetOptions('i=s' => \@input_files, - 'o=s' => \$output_file); - if ( $result != 1 ) { - $tf_object->bail("Invalid command line options."); - } - - # Create a TIGR::FASTAreader instance using TIGR::Foundation and - # an error message list. - my $fasta_reader = new TIGR::FASTAreader $tf_object, \@errors_list; - - if ( !( defined ( $output_file ) && - open OUTFILE, ">$output_file" ) ) { - $tf_object->bail("Cannot open output file for writing."); - } - - foreach my $in_file ( @input_files ) { - $fasta_reader->open($in_file) or - $tf_object->logLocal("Cannot open or read file $in_file", 2); - - if ( scalar(@errors_list) > 0 ) { # are there parse errors? - while ( @errors_list ) { # get the messages from the list - my $message = shift @errors_list; - print STDERR $message, "\n"; - } - } - - while ( $fasta_reader->hasNext() ) { - # print each record to OUTFILE - print OUTFILE $fasta_reader->next()->toString(); - } - } - } - -=cut - -} - -1; diff --git a/lib/TIGR/FASTArecord.pm b/lib/TIGR/FASTArecord.pm deleted file mode 100755 index 651510f5..00000000 --- a/lib/TIGR/FASTArecord.pm +++ /dev/null @@ -1,479 +0,0 @@ -# $Id: FASTArecord.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - -package TIGR::FASTArecord; -{ - -=head1 NAME - -TIGR::FASTArecord - TIGR::FASTArecord class describing FASTA records - -=head1 SYNOPSIS - - use TIGR::FASTArecord; - my $obj_instance = new TIGR::FASTArecord ($record_header, - $record_data); - -=head1 DESCRIPTION - -This module provides an object definition for a FASTA record. It verifies -data entry on creation, and returns information queried on the record. - -=cut - - BEGIN { - require 5.006_00; - } - - use strict; - use TIGR::FASTAgrammar ':public'; - use TIGR::FASTAgrammar ':private'; - - ## external variables - - my $OUTPUT_LINE_LENGTH_REF = \$TIGR::FASTAgrammar::OUTPUT_LINE_LENGTH; - my $UNBOUND_FASTA_SEPARATOR = $TIGR::FASTAgrammar::UNBOUND_FASTA_SEPARATOR; - - ## internal variables and identifiers - - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.2'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = (); - - ## prototypes - - sub new($$); - sub equals($); - sub getHeader(); - sub getIdentifier(); - sub getData(); - sub size(); - sub toString(); - sub reverseComplement($); - sub reverseComplementData(); - sub subSequence($;$); - sub unGap(); - - ## implementation - -=over - -=item $obj_instance = new TIGR::FASTArecord ($header, $data_rec); - -This method returns a new instance of a TIGR::FASTArecord object. It takes -a record header, C<$header>, and the record data, C<$data_rec> as parameters. -The record header may optionally contain the leading header character, a -C<> symbol. Both parameters are parsed. A new object instance is -returned on success. If parsing fails or a record cannot be created, this -method returns undefined. - -=cut - - - sub new($$) { - my $pkg = shift; - my $header = shift; - my $data_rec = shift; - my $self = undef; - my $identifier = undef; - - if ( ( defined ($header) ) && - ( $header =~ s[^($UNBOUND_FASTA_SEPARATOR){0,1}(.*)] # accept no - [$UNBOUND_FASTA_SEPARATOR$2] ) && # separator - ( defined ($data_rec) ) && - ( defined($identifier = _headerToIdentifier($header)) ) && - # - # parsable id. - ( isValidFASTAdata($data_rec) != 0 ) ) { # valid data? - $self = {}; - bless $self, $pkg; - - chomp($header); - chomp($data_rec); - - $self->{header} = $header; - $self->{identifier} = $identifier; - $self->{data_rec} = $data_rec; - $self->{rec_size} = length($data_rec); # compute size - } - return $self; - } - - -=item $result = $obj_instance->equals($fasta_obj); - -This method takes a I object as a parameter. It compares -the passed object with the calling object's internal structures to test for -equality. If the two objects are equal, this method returns true (1). -Otherwise, this method returns false (undefined). - -=cut - - - sub equals($) { - my $self = shift; - my $other_obj = shift; - my $return_val = undef; - - if ( ( defined ($other_obj) ) && - ( $self->toString() eq $other_obj->toString() ) - ) { - $return_val = 1; - } - else { - $return_val = undef; - } - - return $return_val; - } - - -=item $identifier = $obj_instance->getHeader(); - -This method returns the header string for the record. - -=cut - - - sub getHeader() { - my $self = shift; - my $header = $self->{header}; # should always be defined - if (! isValidFASTAheader($header) ) { - # need to prepend an $UNBOUND_FASTA_SEPARATOR - $header = $UNBOUND_FASTA_SEPARATOR . $header; - } - return $header; - } - - -=item $identifier = $obj_instance->getIdentifier(); - -This method returns the identifier string for the record. - -=cut - - - sub getIdentifier() { - my $self = shift; - return $self->{identifier}; # should always be defined - } - - -=item $data_contents = $obj_instance->getData(); - -This method returns the data contents of the record as an -uninterrupted string. - -=cut - - - sub getData() { - my $self = shift; - return $self->{data_rec}; # should always be defined - } - - -=item $size_of_rec = $obj_instance->size(); - -This method returns the size of the data string contained by the -record. - -=cut - - - sub size() { - my $self = shift; - return $self->{rec_size}; # should always be defined - } - - -=item $str_rep = $obj_instance->toString(); - -This method returns a string representation of the FASTA record. -This string representation conforms to the TIGR definition of a -FASTA record. - -=cut - - - sub toString() { - my $self = shift; - my $data_copy = $self->{data_rec}; - my $str_rep = $self->getHeader() . "\n"; - my $seg = undef; - my @segments = (); - my $last_seg = undef; - my $skip = undef; - - while((defined($seg = substr $data_copy, 0, - $$OUTPUT_LINE_LENGTH_REF,'')) && ($seg ne '')) { - $str_rep .= $seg . "\n"; - } - return $str_rep; - } - -=item $rev_compl_str = $obj_instance->reverseComplement($NA_strand); - -This method takes in a string which represents a Nucleotide strand and -returns the reverse complement of the strand. If the string does not -represent a Nucleotide strand or is undefined, the method returns undefined. -When an empty string is passed into the method, we get an empty string on -return. - -=cut - - - sub reverseComplement($) { - my $self = shift; - my $NA_strand = shift; - my $error_condition = 1; - my $rev_compl_str = undef; - my $validData = 0; - - if((defined ($NA_strand)) && - ($validData = _isNucleotideData($NA_strand)) && ($validData == 1)) { - - $rev_compl_str = ''; - # taking the complement of the sequence - if(!defined( $NA_strand =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbXxNn.-/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvXxNn.-/)) { - $error_condition = undef; - } - # taking the reverse of the complemented sequence - elsif(!defined($rev_compl_str = reverse($NA_strand))) { - $error_condition = undef; - } - } - else { - $error_condition = undef; - } - return ( defined ( $error_condition ) ) ? $rev_compl_str : undef; - } - - -=item $rev_compl_str = $obj_instance->reverseComplementData(); - -This method returns the reverse complement of the FASTA record data. If the -FASTA record data does not represent a Nucleotide strand or is undefined, the -method returns undefined. - -=cut - - - sub reverseComplementData() { - my $self = shift; - my $NA_strand = $self->{data_rec}; - my $error_condition = 1; - my $rev_compl_str = undef; - my $validData = 0; - - if((defined ($NA_strand)) && - ($validData = _isNucleotideData($NA_strand)) && ($validData == 1)) { - - $rev_compl_str = ''; - # taking the complement of the sequence - if(!defined( $NA_strand =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbXxNn.-/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvXxNn.-/)) { - $error_condition = undef; - } - # taking the reverse of the complemented sequence - elsif(!defined($rev_compl_str = reverse($NA_strand))) { - $error_condition = undef; - } - } - else { - $error_condition = undef; - } - return ( defined ( $error_condition ) ) ? $rev_compl_str : undef; - } - - -=item $data_substr = $obj_instance->subSequence($startpos, $length); - -This method behaves like substr(). This method takes in two numbers $startpos -and $length and returns a substring of the record data. The $startpos for the -first base in the data is 0. The $length is optional. The substring is -extracted starting at $startpos characters from the front of the data string. -If $startpos is negative, the substring starts that far from the end of the -string instead. If $length is omitted, everything to the end of the string is -returned. If $length is negative, the length is calculated to leave that many -characters off the end of the string. Otherwise, $length indicates the length -of the substring to extract. If $startpos is undefined the function returns -undefined. If either $startpos or $length are greater than the data length, the -method returns undefined - -=cut - - - sub subSequence($;$) { - my $self = shift; - my ($startpos, $length) = @_; - my $data_str = $self->getData(); - my $data_substr = undef; - my $strlen = length($data_str); - - if((defined ($data_str)) && (defined ($startpos)) && - (defined ($length)) && (abs($startpos) < $strlen) && - (abs($length) <= $strlen)) { - #both parameters are specified - $data_substr = substr($data_str, $startpos, $length); - } - elsif((defined ($data_str)) && (defined ($startpos)) && - (!defined ($length)) && (abs($startpos) < $strlen) ) { - #only one parameter is specified. - $data_substr = substr($data_str, $startpos); - } - else { - #either $data_str is undefined or $startpos is undefined so do - #nothing. - } - return $data_substr; - } - -=item $result = $obj_instance->unGap(); - -This method removes all the gaps('-' characters) from the record data. The -record size is changed after the gaps are removed. The method returns 1 on -success and undefined otherwise. - -=cut - - - sub unGap() { - my $self = shift; - my $data_str = $self->getData(); - my $result = 1; - if((defined ($data_str)) && ($data_str =~ /[-]+/)) { - #removing the gaps in the data. - $data_str =~ s/[-]+//g; - $self->{data_rec} = $data_str; - $self->{rec_size} = length($data_str); - - } - elsif(!defined ($data_str)) { - $result = undef; - } - return $result; - } - -=back - - - -=head1 USAGE - -To use this module, load it using the C function. The object must -be initialized with a header and a data string. An example follows. -Please refer to the C package usage for more examples. - - #!/usr/local/bin/perl -w - - use strict; - use TIGR::FASTArecord; - - MAIN: - { - - # set up a simple example without using the leading carot. - my $header1 = "ORF00001 The first ORF in the series"; - my $data1 = - "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . - "LRFSGPTRKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . - "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . - "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . - "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . - "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . - "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . - "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . - "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . - "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . - "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . - "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . - "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . - "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; - - my $fasta_record = new TIGR::FASTArecord $header1, $data1; - if ( defined ( $fasta_record ) ) { - print STDOUT "Sequence " . $fasta_record->getIdentifier() . " is " . - $fasta_record->size() . " residues.\n"; - print STDOUT $fasta_record->toString(); - } - else { - die "Invalid FASTA record 1"; - } - - # but this header is also valid. - my $header2 = ">ORF00001 The first ORF in the series"; - my $data2 = - "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . - "LRFSGPTRKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . - "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . - "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . - "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . - "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . - "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . - "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . - "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . - "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . - "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . - "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . - "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . - "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; - - my $fasta_record2 = new TIGR::FASTArecord $header2, $data2; - if ( defined ( $fasta_record2 ) ) { - print STDOUT "Sequence " . $fasta_record2->getIdentifier() . " is " . - $fasta_record2->size() . " residues.\n"; - print STDOUT $fasta_record2->toString(); - } - else { - die "Invalid FASTA record 2"; - } - - # this entry fails; note the 'J' in the second line of data (8th char.) - my $header3 = "ORF00001 The first ORF in the series"; - my $data3 = - "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . - "LRFSGPTJKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . - "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . - "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . - "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . - "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . - "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . - "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . - "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . - "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . - "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . - "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . - "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . - "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; - - my $fasta_record3 = new TIGR::FASTArecord $header3, $data3; - if ( defined ( $fasta_record3 ) ) { - print STDOUT "Sequence " . $fasta_record3->getIdentifier() . " is " . - $fasta_record3->size() . " residues.\n"; - print STDOUT $fasta_record3->toString(); - } - else { - die "Invalid FASTA record 3"; - } - } - -=cut - -} - -1; diff --git a/lib/TIGR/FASTAwriter.pm b/lib/TIGR/FASTAwriter.pm deleted file mode 100755 index ff4a8ecb..00000000 --- a/lib/TIGR/FASTAwriter.pm +++ /dev/null @@ -1,408 +0,0 @@ -# $Id: FASTAwriter.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - -package TIGR::FASTAwriter; -{ - -=head1 NAME - -TIGR::FASTAwriter - TIGR::FASTAwriter class for writing TIGR::FASTArecord -objects to a file - -=head1 SYNOPSIS - - use TIGR::FASTAwriter; - my $obj_instance = new TIGR::FASTAwriter ($tigr_foundation_obj, - $output_file_name); - -=head1 DESCRIPTION - -This module provides an object definition for a TIGR::FASTAwriter. -The TIGR::FASTAwriter object accepts TIGR::FASTArecord objects for -printing to an output file. - -=cut - - BEGIN { - require 5.006_00; - } - - use strict; - use IO::File; - use TIGR::FASTArecord; - - - - ## internal variables and identifiers - - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.0'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = (); - - my $SYS_ERR = 0; # this flag specifies non-user related error - my $USR_ERR = 1; # this flag specifies user related error - - #debugging scheme - # - # Debugging via the TIGR Foundation uses increasing log levels based on - # nesting. 'MAIN' starts at level 1. Every nest increments the level by 1. - # Subroutines always start nesting at level 2. As debugging levels - # increase, logging is more verbose. This makes sense as you log at - # greater depth (ie. deeper branching). - # - # The following definitions help emphasize the debugging in the program. - # - my $DEBUG_LEVEL_1 = 1; - my $DEBUG_LEVEL_2 = 2; - my $DEBUG_LEVEL_3 = 3; - my $DEBUG_LEVEL_4 = 4; - my $DEBUG_LEVEL_5 = 5; - my $DEBUG_LEVEL_6 = 6; - my $DEBUG_LEVEL_7 = 7; - my $DEBUG_LEVEL_8 = 8; - my $DEBUG_LEVEL_9 = 9; - - ## prototypes - - sub new(;$$$); - sub open($;$); - sub close(); - sub write($); - sub _errorHandler($$$); - - ## implementation - -=over - -=item $obj_instance = new TIGR::FASTAwriter ($foundation_object, - $error_array_ref, $output_file); - -This method returns a new instance of a TIGR::FASTAwriter object. It takes -three optional parameters: a TIGR::Foundation object (C<$foundation_object>), -a reference to an array for logging user error messages (C<$error_array_ref>), -and an output file name, C<$output_file>, as parameters. A new object instance -is returned on success and successful opening of a specified output -file. -If the file supplied cannot be opened, this method returns undefined. -This method also returns undefined if the parameters supplied are invalid. -Writing errors are written to the array at C<$error_array_ref> and the -log file. - -=cut - - - sub new(;$$$) { - - my $pkg = shift; - my @method_args = @_; - - my $error_condition = 0; - my $self = {}; - bless $self, $pkg; - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /foundation/i ) ) { - $self->{foundation} = shift @method_args; - $self->_errorHandler("Got TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{foundation} = undef; - $self->_errorHandler("No TIGR::Foundation in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ( ref ($method_args[0]) ) =~ /array/i ) ) { - $self->{error_ref} = shift @method_args; - - $self->_errorHandler("Got Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - else { - $self->{error_ref} = undef; - $self->_errorHandler("No Error ARRAY in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - if ( ( scalar (@method_args) > 0 ) && - ( ! ref ($method_args[0]) ) ) { - my $filename = shift @method_args; - if(defined($filename)) { - $self->{db_file_name} = $filename ; - $self->_errorHandler("Got file name in new()", $DEBUG_LEVEL_4, - $SYS_ERR); - } - else { - $self->{db_file_name} = "" ; - $self->_errorHandler("undef passed as filename", $DEBUG_LEVEL_4, - $USR_ERR); - } - } - else { - $self->{db_file_name} = undef; - $self->_errorHandler("No file name in new()", $DEBUG_LEVEL_3, - $SYS_ERR); - } - - # check for invocation errors - if ( ( scalar (@method_args) > 0 ) ) { - $error_condition = 1; - $self->_errorHandler("Too many parameters passed to new() method", - $DEBUG_LEVEL_3, $SYS_ERR); - } - elsif ((defined ( $self->{db_file_name} )) && - (! defined ( $self->open($self->{db_file_name}, "w") ) )) { - # the error message is logged via the open() routine - $self = undef; - } - - return ( $error_condition == 0 ) ? $self : undef; - - } - - -=item $result = $obj_instance->open($file_name, $flag); - -This method opens a FASTA file for writing or appending. The file, -F<$file_name>, is opened using the C flags specified by C<$flag>. -Supported flags include 'w' and 'a'. On success, this method returns 1. -The default C method is 'w', or truncated open. If the file cannot -be opened, this method returns undefined. - -=cut - - - sub open($;$) { - my $self = shift; - my $db_file_name = shift; - my $open_flags = shift; - - my $error_condition = 0; - - if ( ( ! defined ($open_flags) ) || - ( ( $open_flags !~ /^w$/i ) && - ( $open_flags !~ /^a$/i ) ) ) { - $open_flags = "w"; - } - - # close a previously open file - if ( defined ($self->{db_handle}) ) { - $self->close(); - } - - if (!( - ( defined ( $db_file_name ) ) && - ( $self->{db_file_name} = $db_file_name ) && - ( defined ( $self->{db_file_name} )) && - ( defined ( $self->{db_handle} = - new IO::File $self->{db_file_name}, $open_flags )) - ) ) { - $error_condition = 1; - $self->_errorHandler( - "Cannot open file \'$self->{db_file_name}\'", $DEBUG_LEVEL_3, - $USR_ERR); - } - return ($error_condition == 1) ? undef : 1; - } - - -=item $result = $obj_instance->close(); - -This method closes the object file stream and resets all internal data -structures. The result of the operation is returned. If the file stream -is closed successfully, this object returns true (1), otherwise false -(undefined). - -=cut - - - sub close() { - my $self = shift; - my $return_val = undef; - - if ( defined ( $self->{db_handle} ) ) { - $return_val = $self->{db_handle}->close(); - if ( ! defined ($return_val) ) { - $return_val = undef; - $self->_errorHandler( - "Error closing FASTA file: $self->{db_file_name}", - $DEBUG_LEVEL_4, $USR_ERR); - } - } - $self->{db_file_name} = undef; - $self->{db_handle} = undef; - return $return_val; - } - - -=item $result = $obj_instance->write($fasta_obj); - -This method takes a TIGR::FASTArecord object, C<$fasta_obj>, and writes it -to the file specified in C or C. On success, this method -returns true (1). On error, this method returns false (undefined) and logs -an error message. - -=cut - - - sub write($) { - my $self = shift; - my $fasta_obj = shift; - my $return_val = 1; - - if ( ( defined ($fasta_obj) ) && - ( ( ref($fasta_obj) ) =~ /fastarecord/i ) && - ( defined ($self->{db_handle}) ) ) { - - if ( $self->{db_handle}->print($fasta_obj->toString()) ) { - $return_val = 1; - } - else { - - $return_val = undef; - $self->_errorHandler( - "Error printing to FASTA file: $self->{db_file_name}", - $DEBUG_LEVEL_3, $USR_ERR); - } - } - else { - $return_val = undef; - $self->_errorHandler( - "Invalid method of initialization for " . - "TIGR::FASTAwriter", $DEBUG_LEVEL_3, $USR_ERR); - } - return $return_val; - } - -# $message = $obj_instance->_errorHandler($message, $tf_level, -# $internal_log_flag); - -#This method handles logging to the TIGR::Foundation module and -#internal error record reference array. The C<$message> argument is logged -#to the appropriate service. The C<$tf_level> parameter specifies the -#logging level for TIGR::Foundation, while the C<$internal_log_flag> parameter -#specifies if C<$message> should be written to the internal array reference -#specified in C. If a TIGR::Foundation instance does not exist, -#no logging to that facility occurs. This method returns C<$message>. - - - - sub _errorHandler($$$) { - - my $self = shift; - - my ( $message, $tf_level, $log_facility ) = @_; - - if ( defined ($message) && - defined ($tf_level) && - defined ($log_facility) ) { - - if ( defined ($self->{foundation}) ) { - if ( $log_facility != $USR_ERR ) { # all user errors go to .error - $self->{foundation}->logLocal($message, $tf_level); - } - else { - $self->{foundation}->logError($message); - } - } - - if ( ( defined ($self->{error_ref}) ) && - ( $log_facility == $USR_ERR ) ) { - push @{$self->{error_ref}}, $message; - } - } - return $message; - } - - -=back - -=head1 USAGE - -To use this module, load the C and C -modules with the C function. Then, create a new instance of the object -via the C method, as shown below. There are several invocations -possible for this method since all parameters to C are optional. -An example script using this module follows. The C -module is included for completeness but does not have to be used. - - #!/usr/local/bin/perl -w - - # This example uses the TIGR::FASTAwriter object to write - # a simple TIGR::FASTArecord object to a file specified with - # the '-o' option to this script. - # Writing errors are collected to the '@errors_list' array. - - use strict; - use TIGR::Foundation; - use TIGR::FASTArecord; - use TIGR::FASTAwriter; - - MAIN: - { - my $tf_object = new TIGR::Foundation; - my @errors_list = (); - my $output_file = undef; - - my $getopts_result = undef; - - $getopts_result = $tf_object->TIGR_GetOptions( "o=s" => \$output_file ); - - if ( $getopts_result != 1 ) { - $tf_object->bail("Invalid command line option."); - } - - if ( ! defined ( $output_file ) ) { - $tf_object->bail("Must specify an output file with the '-o' option"); - } - - my $header = "ORF00001"; - my $data = "ATGC"; - - my $fasta_record = new TIGR::FASTArecord $header, $data; - if ( ! defined ( $fasta_record ) ) { - $tf_object->bail("Cannot create TIGR::FASTArecord object"); - } - - # Create a TIGR::FASTAwriter instance using TIGR::Foundation and - # an error message list. - - my $fasta_writer = new TIGR::FASTAwriter $tf_object, \@errors_list; - - $fasta_writer->open($output_file) or - $tf_object->logLocal("Cannot open output file $output_file", - $DEBUG_LEVEL_1); - - if ( scalar(@errors_list) > 0 ) { # are there parse errors? - while ( @errors_list ) { # get the messages from the list - my $message = shift @errors_list; - print STDERR $message, "\n"; - } - } - - $fasta_writer->write($fasta_record ) or - $tf_object->logLocal("Cannot write FASTA record to $output_file", - $DEBUG_LEVEL_1); - } - -=cut - -} - -1; diff --git a/lib/TIGR/Foundation.pm b/lib/TIGR/Foundation.pm deleted file mode 100755 index 923919a1..00000000 --- a/lib/TIGR/Foundation.pm +++ /dev/null @@ -1,1822 +0,0 @@ -# $Id: Foundation.pm,v 1.1 2004/04/28 15:03:43 aphillip Exp $ - -# Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). -# All rights reserved. -# -# This software is provided "AS IS". TIGR makes no warranties, express or -# implied, including no representation or warranty with respect to the -# performance of the software and derivatives or their safety, -# effectiveness, or commercial viability. TIGR does not warrant the -# merchantability or fitness of the software and derivatives for any -# particular purpose, or that they may be exploited without infringing the -# copyrights, patent rights or property rights of others. -# -# This software program may not be sold, leased, transferred, exported or -# otherwise disclaimed to anyone, in whole or in part, without the prior -# written consent of TIGR. - -package TIGR::Foundation; -{ - -=head1 NAME - -TIGR::Foundation - TIGR Foundation object - -=head1 SYNOPSIS - - use TIGR::Foundation; - my $obj_instance = new TIGR::Foundation; - -=head1 DESCRIPTION - -This module defines a structure for Perl programs to utilize -logging, version reporting, and dependency checking in a simple way. - -=cut - - BEGIN { - require 5.006_00; # error if using Perl < v5.6.0 - } - - use strict; - use Cwd; - use Cwd 'chdir'; - use Cwd 'abs_path'; - use File::Basename; - use Getopt::Long; - use IO::Handle; - use POSIX qw(strftime); - use Sys::Hostname; - - require Exporter; - - our @ISA; - our @EXPORT; - @ISA = ('Exporter'); - @EXPORT = qw( - isReadableFile - isWritableFile - isExecutableFile - isCreatableFile - isReadableDir - isWritableDir - isCreatableDir - isCreatablePath - - getISODate - getSybaseDate - getMySQLDate - getFilelabelDate - getLogfileDate - ); - - ## internal variables and identifiers - our $REVISION = (qw$Revision: 1.1 $)[-1]; - our $VERSION = '1.41'; - our $VERSION_STRING = "$VERSION (Build $REVISION)"; - our @DEPEND = (); # there are no dependencies - - - ## prototypes - - # Functional Class : general - sub new(); - sub getProgramInfo($); - sub runCommand($); - - # Functional Class : depend - sub printDependInfo(); - sub printDependInfoAndExit(); - sub addDependInfo(@); - - # Functional Class : version - sub getVersionInfo(); - sub printVersionInfo(); - sub printVersionInfoAndExit(); - sub setVersionInfo($); - sub setVersionHandler($); - - # Functional Class : help - sub printHelpInfo(); - sub printHelpInfoAndExit(); - sub setHelpInfo($); - - # Functional Class : usage - sub printUsageInfo(); - sub printUsageInfoAndExit(); - sub setUsageInfo($); - - # Functional Class : files - sub isReadableFile($); - sub isExecutableFile($); - sub isWritableFile($); - sub isCreatableFile($); - sub isReadableDir($); - sub isWritableDir($); - sub isCreatableDir($); - sub isCreatablePath($); - - # Functional Class : date - sub getISODate(;@); - sub getSybaseDate(;@); - sub getMySQLDate(;@); - sub getFilelabelDate(;@); - sub getLogfileDate(;@); - - # Functional Class : logging - sub setDebugLevel($;$); - sub getDebugLevel(); - sub setLogFile($;$); - sub getLogFile(); - sub getErrorFile(); - sub printDependInfo(); - sub invalidateLogFILES(); - sub cleanLogFILES(); - sub closeLogERROR(); - sub closeLogMSG(); - sub openLogERROR(); - sub openLogMSG(); - sub logAppend($;$); - sub debugPush(); - sub debugPop(); - sub logLocal($$); - sub logError($;$); - sub bail($;$); - - # Functional Class : modified methods - sub TIGR_GetOptions(@); - - ## Implementation - - -# Functional Class : general - -=over - -=item $obj_instance = new TIGR::Foundation; - -This function creates a new instance of the TIGR::Foundation -object. A reference pointing to the object is returned on success. Otherwise, -this method returns undefined. - -=cut - - - sub new() { - my $self = {}; - my $pkg = shift; - - # create the object - bless $self, $pkg; - - # Get the program name. - my $pname = basename($0, () ); - if ( (defined ($pname) ) && ($pname =~ /^(.*)$/) ) { - $pname = $1; - $self->{program_name} = $pname ; - } - if ($self->{program_name} =~ /^-$/) { # check if '-' is the input - $self->{program_name} = "STDIN"; - } - # Get the invocation. - my $pcommand = join (' ', @ARGV); - if ( defined $pcommand ) { - $pcommand =~ /^(.*)$/; - $pcommand = $1; - } - else { - $pcommand = ""; - } - $self->{invocation} = $pcommand ; - - # The following variables are to contain information specified by - # the 'host' program; there are methods of setting and retrieving each. - @{$self->{depend_info}} = (); - $self->{version_handler} = undef; - $self->{version_info} = undef; - $self->{help_info} = undef; - $self->{usage_info} = undef; - - # These are used for logging. - $self->{debug_level} = -1; # debug is negative, no logging - @{$self->{debug_store}} = (); # the backup debug level stack - @{$self->{debug_queue}} = (); # queue used by MSG routine - @{$self->{error_queue}} = (); # queue used by ERROR routine - $self->{max_debug_queue_size} = 100; # maximum size for queue before - # log entries are expired - @{$self->{log_files}} = # these log files are consulted - ("$self->{program_name}.log", # on file write error and are - "/tmp/$self->{program_name}.$$.log"); # modified by setLogFile - $self->{msg_file_open_flag} = 0; # flag to check logLocal file - $self->{error_file_open_flag} = 0; # flag to check logError file - $self->{msg_file_used} = 0; # flag to indicate if log file - $self->{error_file_used} = 0; # has been written to - $self->{msg_append_flag} = 0; # by default logs are truncated - $self->{error_append_flag} = 0; # by default logs are truncated - $self->{log_append_setting} = 0; # (truncate == 0) - $self->{static_log_file} = undef; # user defined log file - - # These monitor program execution time. - $self->{start_time} = time; # program start time - $self->{finish_time} = undef; # program stop time - - # Set a user name and a host name. - $self->{'host_name'} = hostname(); - if ( ! defined ( $self->{'host_name'} ) ) { - $self->{'host_name'} = "NOHOSTNAME"; - } - else { - $self->{'host_name'} =~ s/^(\.*)$/$1/; # Taint-check it. - } - - # A __WARN__ handler is needed to keep this sane. - my $tmp_warn_handler = $SIG{__WARN__} || "DEFAULT"; - $SIG{__WARN__} = sub {}; - my @info_arr = getpwuid($<); - $self->{'user_name'} = $info_arr[0]; - $self->{'home_dir'} = $info_arr[7]; - $SIG{__WARN__} = $tmp_warn_handler; - if ( ! defined ( $self->{'user_name'} ) ) { - $self->{'user_name'} = "NOUSERNAME"; - } - else { - $self->{'user_name'} =~ s/^(\.*)$/$1/g;# Taint check. - } - if ( ! defined ( $self->{'home_dir'} ) ) { - $self->{'home_dir'} = "/"; - } - else { - $self->{'home_dir'} =~ s/^(\.*)$/$1/g; # Taint check. - } - - $self->logLocal("START: " . $self->{'program_name'} . " " . - $self->{'invocation'}, 0); - $self->logLocal("Username: " . $self->{'user_name'}, 0); - $self->logLocal("Hostname: " . $self->{'host_name'}, 0); - - return $self; - } - - - -=item $value = $obj_instance->getProgramInfo($field_type); - -This function returns field values for specified field types describing -attributes of the program. The C<$field_type> parameter must be a listed -attribute: C, C, C, C. -The C field specifies the bare name of the executable. The -C field specifies the command line arguments passed to the -executable. The C value returns the environment path to the -working directory. The C value specifies the absolute path to the -working directory. If C is found to be inconsistent, then that -value will return the C value. If an invalid C<$field_type> is -passed, the function returns undefined. - -=cut - - - sub getProgramInfo($) { - my $self = shift; - my $field_type = shift; - my $return_value = undef; - if (defined $field_type) { - $field_type =~ /^name$/ && do { - $return_value = $self->{program_name}; - }; - $field_type =~ /^invocation$/ && do { - $return_value = $self->{invocation}; - }; - $field_type =~ /^env_path$/ && do { - my $return_value = ""; - if ( - (defined $ENV{'PWD'}) && - (abs_path($ENV{'PWD'}) eq abs_path(".") ) && - ($ENV{'PWD'} =~ /^(.*)$/) - ) { - $ENV{'PWD'} = $1; - $return_value = $ENV{'PWD'}; - } - else { - my $tmp_val = abs_path("."); - - if ( (defined ($tmp_val) ) && ($tmp_val =~ /^(.*)$/) ) { - $tmp_val = $1; - $return_value = $tmp_val; - } - } - return $return_value; - }; - - $field_type =~ /^abs_path$/ && do { - my $tmp_val = abs_path("."); - - if ( (defined ($tmp_val) ) && ($tmp_val =~ /^(.*)$/) ) { - $tmp_val = $1; - $return_value = $tmp_val; - } - }; - } - return $return_value; - } - -=item $exit_code = $obj_instance->runCommand($command_str); - -This function passes the argument C<$command_str> to /bin/sh -for processing. The return value is the exit code of the -C<$command_str>. If the exit code is not defined, then either the signal or -core dump value of the execution is returned, whichever is applicable. Perl -variables C<$?> and C<$!> are set accordingly. If C<$command_str> is not -defined, this function returns undefined. Log messages are recorded at log -level 4 to indicate the type of exit status and the corresponding code. -A failure to start the program (invalid program) results in return code -1. - -=cut - - - sub runCommand($) { - my $self = shift; - my $command_str = shift; - my $exit_code = undef; - my $signal_num = undef; - my $dumped_core = undef; - my $return_value = undef; - my $current_dir = $self->getProgramInfo("abs_path"); - - # Return if the command string is not set. - if ( ! defined ( $command_str ) ) { - return undef; - } - - # Substitute out the tilde and dot in the directory paths. - if ( defined ($ENV{PATH}) ) { - ( $ENV{PATH} ) = $ENV{PATH} =~ /^(.*)$/; - my @paths = split /:/, $ENV{PATH}; - for (my $i = 0; $i <= $#paths; $i++) { - $paths[$i] =~ s/^~\/?$/$self->{'home_dir'}/g; - $paths[$i] =~ s/^\.\/?$/$current_dir/g; - } - $ENV{PATH} = join(":", @paths); - } - - $command_str =~ s/^(.*)$/$1/g; # Taint checking. - # Run the command and parse the results. - system($command_str); - my $return_str = $?; - $exit_code = $? >> 8; - $signal_num = $? & 127; - $dumped_core = $? & 128; - if ( $return_str == -1 ) { # Check for invalid program. - $self->logLocal("Invalid execution of \'$command_str\'.", 4); - $return_value = -1; - } - elsif ( $dumped_core != 0 ) { - $self->logLocal("\'$command_str\' dumped core.", 4); - $return_value = $dumped_core; - } - elsif ( $signal_num != 0 ) { - $self->logLocal("\'$command_str\' signalled \'$signal_num\'.", 4); - $return_value = $signal_num; - } - else { - $self->logLocal("\'$command_str\' exited \'$exit_code\'.", 4); - $return_value = $exit_code; - } - return $return_value; - } - - -# Functional Class : depend - -=item $obj_instance->printDependInfo(); - -The C function prints the dependency list created by -C. One item is printed per line. - -=cut - - - sub printDependInfo() { - my $self = shift; - foreach my $dependent (@{$self->{depend_info}}) { - print STDERR $dependent, "\n"; - } - } - - -=item $obj_instance->printDependInfoAndExit(); - -The C function prints the dependency list created by -C. One item is printed per line. The function exits with -exit code 0. - -=cut - - - sub printDependInfoAndExit() { - my $self = shift; - $self->printDependInfo(); - exit 0; - } - - -=item $obj_instance->addDependInfo(@depend_list); - -The C function adds C<@depend_list> information -to the dependency list. If C<@depend_list> is empty, the internal -dependency list is emptied. Contents of C<@depend_list> are not checked -for validity (eg. they can be composed entirely of white space or -multiple files per record). The first undefined record in C<@depend_list> -halts reading in of dependency information. - -=cut - - - sub addDependInfo(@) { - my $self = shift; - my $num_elts = 0; - while (my $data_elt = shift @_) { - push (@{$self->{depend_info}}, $data_elt); - $num_elts++; - } - if ($num_elts == 0) { - @{$self->{depend_info}} = (); - } - } - - -# Functional Class : version - -=item $version_string = $obj_instance->getVersionInfo(); - -The C function returns the version information set by the -C function. - -=cut - - - sub getVersionInfo() { - my $self = shift; - return $self->{version_info}; - } - - -=item $obj_instance->printVersionInfo(); - -The C function calls the version handler, if set. If not, -it prints the version information set by the C function. -If there is no defined version information, a message is returned notifying -the user. - -=cut - - - sub printVersionInfo() { - my $self = shift; - if ( defined $self->{'version_handler'} ) { - $self->{'version_handler'}->(); - } - elsif (defined $self->getVersionInfo() ) { - print STDERR $self->getProgramInfo('name'), " ", - $self->getVersionInfo(), "\n"; - } - else { - print STDERR $self->getProgramInfo('name'), - " has no defined version information\n"; - } - } - - -=item $obj_instance->printVersionInfoAndExit(); - -The C function calls the version handler, if set. -Otherwise, it prints prints version info set by the C -function. If there is no defined version information, a message is printed -notifying the user. This function calls exit with exit code 0. - -=cut - - - sub printVersionInfoAndExit() { - my $self = shift; - $self->printVersionInfo(); - exit 0; - } - - -=item $obj_instance->setVersionInfo($version_string); - -The C function sets the version information to be reported -by C. If C<$version_string> is empty, invalid, or -undefined, the stored version information will be undefined. - -=cut - - - sub setVersionInfo($) { - my $self = shift; - my $v_info = shift; - if ( defined ( $v_info ) && ( $v_info =~ /\S/ ) ) { - $self->{version_info} = $v_info; - } - else { - $self->{version_info} = undef; - } - } - - -=item $obj_instance->setVersionHandler($function_ref); - -The C method establishes a callback function for handling -the reporting of version information to the user. If a handler is set, then -any information passed in via C is not reported. To -remove the handler, call this method without any arguments. If a handler is -not a proper code reference, this method returns undefined and does not set -a handler. This method returns 1 on success. - -=cut - - - sub setVersionHandler($) { - my $self = shift; - my $v_handler = shift; - if ( defined ( $v_handler ) && ( (ref $v_handler) eq "CODE" ) ) { - $self->{version_handler} = $v_handler; - } - elsif ( ! defined ( $v_handler ) ) { - $self->{version_handler} = undef; - } - else { - # Bad input. - return undef; - } - return 1; - } - - -# Functional Class : help - -=item $obj_instance->printHelpInfo(); - -The C function prints the help information passed by the -C function. - -=cut - - - sub printHelpInfo() { - my $self = shift; - if (defined $self->{help_info}) { - print STDERR $self->{help_info}; - } - else { - print STDERR "No help information defined.\n"; - } - } - - -=item $obj_instance->printHelpInfoAndExit(); - -The C function prints the help info passed by the -C function. This function exits with exit code 0. - -=cut - - - sub printHelpInfoAndExit() { - my $self = shift; - $self->printHelpInfo(); - exit 0; - } - - -=item $obj_instance->setHelpInfo($help_string); - -The C function sets the help information via C<$help_string>. -If C<$help_string> is undefined, invalid, or empty, the help information -is undefined. - -=cut - - - sub setHelpInfo($) { - my $self = shift; - my $help_string = shift; - if ( ( defined $help_string ) && ( $help_string =~ /\S/ ) ) { - chomp $help_string; - $self->{help_info} = $help_string . "\n"; - } - else { - $self->{help_info} = undef; - } - } - - -# Functional Class : usage - -=item $obj_instance->printUsageInfo(); - -The C function prints the usage information reported by the -C function. If no usage information is defined, but help -information is defined, help information will be printed. - -=cut - - - sub printUsageInfo() { - - my $self = shift; - if ( defined $self->{usage_info} ) { - print STDERR $self->{usage_info}; - } - elsif ( defined $self->{help_info} ) { - print STDERR $self->{help_info}; - } - else { - print STDERR "No usage information defined.\n"; - } - } - - -=item $obj_instance->printUsageInfoAndExit(); - -The C function prints the usage information the -reported by the C function and exits with status 1. - -=cut - - - sub printUsageInfoAndExit() { - my $self = shift; - $self->printUsageInfo(); - exit 1; - } - - -=item $obj_instance->setUsageInfo($usage_string); - -The C function sets the usage information via C<$usage_string>. -If C<$usage_string> is undefined, invalid, or empty, the usage information -is undefined. - -=cut - - - sub setUsageInfo($) { - my $self = shift; - my $usage_string = shift; - if ( ( defined $usage_string ) && ( $usage_string =~ /\S/ ) ) { - chomp($usage_string); - $self->{usage_info} = $usage_string . "\n"; - } - else { - $self->{usage_info} = undef; - } - } - - -# Functional Class : files - -=item $valid = isReadableFile($file_name); - -This function accepts a single scalar parameter containing a file name. -If the file corresponding to the file name is a readable plain file or symbolic -link, this function returns 1. Otherwise, the function returns 0. If the file -name passed is undefined, this function returns 0 as well. - -=cut - - - sub isReadableFile($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { # class, not instance, invocation - $file = $self; - } - - if (defined ($file) && # was a file name passed? - ( (-f $file) || (-l $file) ) && # is the file a file or sym. link? - (-r $file) # is the file readable? - ) { - return 1; - } - else { - return 0; - } - } - - -=item $valid = isExecutableFile($file_name); - -This function accepts a single scalar parameter containing a file name. -If the file corresponding to the file name is an executable plain file -or symbolic link, this function returns 1. Otherwise, the function returns 0. -If the file name passed is undefined, this function returns 0 as well. - -=cut - - - sub isExecutableFile($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { # class invocation, not instance - $file = $self; - } - - if (defined ($file) && # was a file name passed? - ( (-f $file) || (-l $file) ) && # is the file a file or sym. link? - (-x $file) # is the file executable? - ) { - return 1; - } - else { - return 0; - } - } - - -=item $valid = isWritableFile($file_name); - -This function accepts a single scalar parameter containing a file name. -If the file corresponding to the file name is a writable plain file -or symbolic link, this function returns 1. Otherwise, the function returns 0. -If the file name passed is undefined, this function returns 0 as well. - -=cut - - - sub isWritableFile($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { # class, not instance, invocation - $file = $self; - } - - if (defined ($file) && # was a file name passed? - ( (-f $file) || (-l $file) ) && # is the file a file or sym. link? - (-w $file) # is the file writable? - ) { - return 1; - } - else { - return 0; - } - } - - -=item $valid = isCreatableFile($file_name); - -This function accepts a single scalar parameter containing a file name. If -the file corresponding to the file name is creatable this function returns 1. -The function checks if the location of the file is writable by the effective -user id (EUID). If the file location does not exist or the location is not -writable, the function returns 0. If the file name passed is undefined, -this function returns 0 as well. Note that files with suffix F are not -supported under UNIX platforms, and will return 0. - -=cut - - - sub isCreatableFile($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { - $file = $self; - } - my $return_code = 0; - if ( - (defined ($file) ) && - (! -e $file) && - ($file !~ /\/$/) - ) { - my $dirname = dirname($file); - # check the writability of the directory - $return_code = isWritableDir($dirname); - } - else { - # the file exists, it's not creatable - $return_code = 0; - } - return $return_code; - } - - -=item $valid = isReadableDir($directory_name); - -This function accepts a single scalar parameter containing a directory name. -If the name corresponding to the directory is a readable, searchable directory -entry, this function returns 1. Otherwise, the function returns 0. If the -name passed is undefined, this function returns 0 as well. - -=cut - - - sub isReadableDir($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { # class invocation - $file = $self; - } - - if (defined ($file) && # was a name passed? - (-d $file) && # is the name a directory? - (-r $file) && # is the directory readable? - (-x $file) # is the directory searchable? - ) { - return 1; - } - else { - return 0; - } - } - - -=item $valid = isWritableDir($directory_name); - -This function accepts a single scalar parameter containing a directory name. -If the name corresponding to the directory is a writable, searchable directory -entry, this function returns 1. Otherwise, the function returns 0. If the -name passed is undefined, this function returns 0 as well. - -=cut - - - sub isWritableDir($) { - my $self = shift; - my $file = shift; - if ( ! defined ( $file ) ) { # class invocation - $file = $self; - } - - if (defined ($file) && # was a name passed? - (-d $file) && # is the name a directory? - (-w $file) && # is the directory writable? - (-x $file) # is the directory searchable? - ) { - return 1; - } - else { - return 0; - } - } - - -=item $valid = isCreatableDir($directory_name); - -This function accepts a single scalar parameter containing a directory name. -If the name corresponding to the directory is creatable this function returns -1. The function checks if the immediate parent of the directory is writable by -the effective user id (EUID). If the parent directory does not exist or the -tree is not writable, the function returns 0. If the directory name passed is -undefined, this function returns 0 as well. - -=cut - - - sub isCreatableDir($) { - my $self = shift; - my $dir = shift; - if ( ! defined ( $dir ) ) { - $dir = $self; - } - my $return_code = 0; - - if (defined ($dir) ) { - $dir =~ s/\/$//g; - $return_code = isCreatableFile($dir); - } - return $return_code; - } - - -=item $valid = isCreatablePath($path_name); - -This function accepts a single scalar parameter containing a path name. If -the C<$path_name> is creatable this function returns 1. The function checks -if the directory hierarchy of the path is creatable or writable by the -effective user id (EUID). This function calls itself recursively until -an existing directory node is found. If that node is writable, ie. the path -can be created in it, then this function returns 1. Otherwise, the function -returns 0. This function also returns zero if the C<$path_name> supplied -is disconnected from a reachable directory tree on the file system. -If the path already exists, this function returns 0. The C<$path_name> may -imply either a path to a file or a directory. Path names may be relative or -absolute paths. Any unresolvable relative paths will return 0 as well. This -includes paths with F<..> back references to nonexistent directories. -This function is recursive whereas C and -C are not. - -=cut - - - sub isCreatablePath($) { - my $self = shift; - my $pathname = shift; - if ( ! defined ( $pathname ) ) { # class invocation - $pathname = shift; - } - my $return_code = 0; - - if (defined $pathname) { - # strip trailing '/' - $pathname =~ s/(.+)\/$/$1/g; - my $filename = basename($pathname); - my $dirname = dirname($pathname); - if ( - (! -e $pathname) && - ($dirname ne $pathname) && - ($filename ne "..") - ) { - if (-e $dirname) { - $return_code = isWritableDir($dirname); - } - else { - $return_code = isCreatablePath($dirname); - } - } - else { - $return_code = 0; - } - } - return $return_code; - } - - -# Functional Class : date - -=item $date_string = getISODate($tm); - -This function returns the ISO 8601 datetime as a string given a time -structure as returned by the C