diff --git a/Bio/DB/Fasta.pm b/Bio/DB/Fasta.pm index 6651e69ed5..cb62f5b489 100644 --- a/Bio/DB/Fasta.pm +++ b/Bio/DB/Fasta.pm @@ -139,46 +139,6 @@ use base qw(Bio::DB::IndexedBase); our $obj_class = 'Bio::PrimarySeq::Fasta'; our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}'; -# Compiling the below regular expressions speeds up the Pure Perl -# seq/subseq() by about 7% from 7.76s to 7.22s over 32358 calls on -# Variant Effect Prediction data. -my $nl = qr/\n/; -my $cr = qr/\r/; - -# Remove carriage returns (\r) and newlines (\n) from a string. When -# called from subseq, this can take a signficiant portion of time, in -# Variant Effect Prediction. Therefore we compile the match -# portion. -sub strip_crnl { - my $str = shift; - $str =~ s/$nl//g; - $str =~ s/$cr//g; - return $str; -} - -# C can do perfrom strip_crnl much faster. But this requires the -# Inline::C module which we don't require people to have. So we make -# this optional by wrapping the C code in an eval. If the eval works, -# the Perl strip_crnl() function is overwritten. -eval q{ - use Inline C => <<'END_OF_C_CODE'; - /* Strip all new line (\n) and carriage return (\r) characters - from string str - */ - char* strip_crnl(char* str) { - char *s; - char *s2 = str; - for (s = str; *s; *s++) { - if (*s != '\n' && *s != '\r') { - *s2++ = *s; - } - } - *s2 = '\0'; - return str; - } -END_OF_C_CODE -}; - =head2 new @@ -329,7 +289,7 @@ sub subseq { seek($fh, $filestart,0); read($fh, $data, $filestop-$filestart+1); - $data = strip_crnl($data); + $data = Bio::DB::IndexedBase::_strip_crnl($data); if ($strand == -1) { # Reverse-complement the sequence @@ -371,7 +331,7 @@ sub header { read($fh, $data, $headerlen); # On Windows chomp remove '\n' but leaves '\r' # when reading '\r\n' in binary mode - $data = strip_crnl($data); + $data = Bio::DB::IndexedBase::_strip_crnl($data); substr($data, 0, 1) = ''; return $data; } diff --git a/Bio/DB/IndexedBase.pm b/Bio/DB/IndexedBase.pm index 8b47d07894..b8334e4ba6 100644 --- a/Bio/DB/IndexedBase.pm +++ b/Bio/DB/IndexedBase.pm @@ -268,6 +268,46 @@ use constant DIE_ON_MISSMATCHED_LINES => 1; # you can avoid dying if you want but you may get incorrect results +# Compiling the below regular expressions speeds up the Pure Perl +# seq/subseq() from Bio::DB::Fasta by about 7% from 7.76s to 7.22s +# over 32358 calls on Variant Effect Prediction data. +my $nl = qr/\n/; +my $cr = qr/\r/; + +# Remove carriage returns (\r) and newlines (\n) from a string. When +# called from subseq, this can take a signficiant portion of time, in +# Variant Effect Prediction. Therefore we compile the match portion. +sub _strip_crnl { + my $str = shift; + $str =~ s/$nl//g; + $str =~ s/$cr//g; + return $str; +} + +# C can do perfrom _strip_crnl much faster. But this requires the +# Inline::C module which we don't require people to have. So we make +# this optional by wrapping the C code in an eval. If the eval works, +# the Perl strip_crnl() function is overwritten. +eval q{ + use Inline C => <<'END_OF_C_CODE'; + /* Strip all new line (\n) and carriage return (\r) characters + from string str + */ + char* _strip_crnl(char* str) { + char *s; + char *s2 = str; + for (s = str; *s; *s++) { + if (*s != '\n' && *s != '\r') { + *s2++ = *s; + } + } + *s2 = '\0'; + return str; + } +END_OF_C_CODE +}; + + =head2 new Title : new diff --git a/Bio/DB/Qual.pm b/Bio/DB/Qual.pm index 62c87e020f..675b470748 100644 --- a/Bio/DB/Qual.pm +++ b/Bio/DB/Qual.pm @@ -335,8 +335,7 @@ sub subqual { read($fh, $data, $filestop-$filestart+1); # Process quality score - $data =~ s/\n//g; - $data =~ s/\r//g; + Bio::DB::IndexedBase::_strip_crnl($data); my $subqual = 0; $subqual = 1 if ( $start || $stop ); my @data; @@ -379,9 +378,9 @@ sub header { seek($fh, $offset, 0); read($fh, $data, $headerlen); # On Windows chomp remove '\n' but leaves '\r' - # when reading '\r\n' in binary mode - $data =~ s/\n//g; - $data =~ s/\r//g; + # when reading '\r\n' in binary mode, + # _strip_crnl removes both + $data = Bio::DB::IndexedBase::_strip_crnl($data); substr($data, 0, 1) = ''; return $data; } diff --git a/Build.PL b/Build.PL index e48779e88d..25d7e6ab27 100644 --- a/Build.PL +++ b/Build.PL @@ -107,6 +107,7 @@ my %recommends = ( 'Inline::C' => [0.67, 'Speeding up code like Fasta Bio::DB::Fasta'], + 'IO::Scalar' => [0, 'Deal with non-seekable filehandles/Bio::Tools::GuessSeqFormat'], diff --git a/Changes b/Changes index c649a3276f..1bcf897016 100644 --- a/Changes +++ b/Changes @@ -34,6 +34,8 @@ CPAN releases are branched from 'master'. * Issue #81: Small updates to make sure possible memory leaks are detected [cjfields] * Issue #84: EMBL format wrapping problem [nyamned] * Issue #90: Missing entries for translation tables 24 and 25 [fjossandon] + * Issue #95: Speed up of Bio::DB::Fasta::subseq by using a compiled regex + or compiled C code (when Inline::C is installed) [rocky] * Fix various Bio::Tools::Analysis remote server config problems [cjfields] * Added several missing 'Data::Stag' and 'LWP::UserAgent' requirements [fjossandon] * Added a workaround in Bio::DB::Registry to get Username in Windows [fjossandon] diff --git a/t/Tree/TreeIO/nhx.t b/t/Tree/TreeIO/nhx.t index a95307c703..73ed689d6f 100644 --- a/t/Tree/TreeIO/nhx.t +++ b/t/Tree/TreeIO/nhx.t @@ -13,6 +13,8 @@ BEGIN { } my $verbose = 0; #test_debug(); +my $nl = qr/\n/; +my $cr = qr/\r/; my $treeio = Bio::TreeIO->new( -format => 'nhx', @@ -85,8 +87,8 @@ sub read_file { binmode $IN; $string = <$IN>; close $IN; - $string =~ s/\n//g; - $string =~ s/\r//g; # For files with Windows line-endings + $string =~ s/$nl//g; + $string =~ s/$cr//g; # For files with Windows line-endings #print STDERR "STR: $string\n"; return $string; }