#!/usr/bin/perl -w # 27 September 2004 # Michael E Sparks (mespar1@iastate.edu) # A tool to scrape out conserved nucleotide blocks in a T-COFFEE alignment according to # CORE index / minimum block length thresholds. Will ablate $ABLATE residues from both # ends of a conserved block. # # Assumes T-COFFEE was run with at least the following parameters: # -seqnos=on \ # -output=t_coffee,score_ascii,fasta_aln # # Use mrtrans or a clone (e.g., bpmrtrans.PLS of BioPerl) to create a DNA fasta alignment # from the protein alignment. # # Finally, use this script to extract blocks. # Example usage for a collection of cDNA sequences ``cluster.dna.fst" and # their translation products ``cluster.pep.fst" : # $ cat cluster.pep.fst | t_coffee \ # -infile=stdin \ # -outfile=PROT \ # -type=PROTEIN \ # -seqnos=on \ # -output=t_coffee,fasta_aln,score_ascii # $ bp_mrtrans.PLS \ # -i PROT.fasta_aln \ # -if fasta \ # -o DNA.aln.fst \ # -of fasta \ # -s cluster.dna.fst \ # -sf fasta # $ block-scraper.pl \ # 7 \ # 20 \ # 5 \ # PROT.t_coffee \ # PROT.score_ascii \ # DNA.aln.fst > nucleotide.blocks # Copyright (c) Michael E Sparks # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of the Iowa State University nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. use strict; my $MINCORE=shift; my $MINLEN=shift; my $ABLATE=shift; my $tcoff_file=shift; my $score_file=shift; my $dna_file=shift || die "Usage: ./block-scraper.pl min_CORE minblocklen ablatelen *.t_coffee *.score_ascii DNA.aln.fst\n"; open(TCOFF,"<$tcoff_file") or die "Can't open $tcoff_file! $!\n"; open(SCORE,"<$score_file") or die "Can't open $score_file! $!\n"; open(DNA,"<$dna_file") or die "Can't open $dna_file! $!\n"; my ($line,$seqid,$scorestring,$alnseq)=""; my (%scores,%alnseqs,%alnseqsd)=(); # PARSE CORE INDEX STRINGS until ($line=~m/:\s+\d+$/) { $line=; } # Scroll past overall scores until ($line=~m/^$/) { $line=; } $line=; until ($line=~m/^$/) { $line=~m/^(\S{1,15}).*?\d+\s(.*?)\s+\d+\s$/; ($seqid,$scorestring)=($1,$2); $scores{$seqid}=$scorestring; $line=; } until ($line=~/^Cons/) { $line=; } $line=; while(defined()) { $line=; if ($line=~m/^$/) { last; } until ($line=~m/^$/) { $line=~m/^(\S{1,15}).*?\d+\s(.*?)\s+\d+\s$/; ($seqid,$scorestring)=($1,$2); $scores{$seqid}.=$scorestring; $line=; } until ($line=~/^Cons/) { $line=; } $line=; } close SCORE; # PARSE PROTEIN SEQUENCE STRINGS FROM ALIGNMENT until ($line=~m/^LEN /) { $line=; } # Scroll past header $line=~m/^LEN\s+(\d+)/; my $protalnlength=$1; until ($line=~m/^$/) { $line=; } $line=; until ($line=~m/^\s+.*?\d+$/) { $line=~m/^(\S{1,15}).*?\s+\d+\s(.*?)\s+\d+\s$/; ($seqid,$alnseq)=($1,$2); $alnseqs{$seqid}=$alnseq; $line=; } while(defined()) { $line=; if ($line=~m/^$/) { last; } until ($line=~m/^\s+.*?\d+$/) { $line=~m/^(\S{1,15}).*?\s+\d+\s(.*?)\s+\d+\s$/; ($seqid,$alnseq)=($1,$2); $alnseqs{$seqid}.=$alnseq; $line=; } } close TCOFF; #PARSE DNA ALIGNMENT STRINGS $seqid=""; my $tempstring=""; while ($line=) { if ($line=~m/^>/) { $line=~m/^>.*?(\S{1,15}).*?$/; $seqid=$1; } else { chomp($line); $alnseqsd{$seqid}.=$line; } } close DNA; # Print strings for debug if needed foreach my $key (sort keys %alnseqs) { if (length($alnseqs{$key}) != $protalnlength) { die "Inconsistency\n"; } # print $key," ",$alnseqs{$key},"\n"; } foreach my $key (sort keys %alnseqsd) { if (length($alnseqsd{$key}) != (3*$protalnlength)) { die "Inconsistency\n"; } # print $key," ",$alnseqsd{$key},"\n"; } foreach my $key (sort keys %scores) { if (length($scores{$key}) != $protalnlength) { die "Inconsistency\n"; } # print $key," ",$scores{$key},"\n"; } # NOW, SCRAPE OUT THE BLOCKS my $block_len=0; my $block_ct=1; my $blockcoltrue=1; my $blocksettrue=0; my $i=0; my ($start,$stop)=0; my @keys=keys(%alnseqs); # identical to keys(%scores) for ($i=1;$i<=$protalnlength;++$i) { KEYPROGRESSION : foreach my $key (@keys) { my $char2test=substr($scores{$key},$i-1,1); if (($char2test eq '-') || ($char2test !~ m/[\d]/) || ($char2test < $MINCORE)) { if ($block_len >= $MINLEN) { $stop=$i-1; #Report it print "\n"; $start+=$ABLATE; $stop-=$ABLATE; print "Block $block_ct ($start,$stop):\n"; foreach my $key2 (@keys) { my $protstring=substr($alnseqs{$key2},$start-1,$stop-$start+1); my @protstring=split(//,$protstring); my $preprintstring="$key2 "; $preprintstring=~m/^(.{20})/; print "$1"; foreach my $protpiece (@protstring) { print "$protpiece "; } print "\n"; # Now report the coding data my $dnastring=substr($alnseqsd{$key2},3*($start-1),3*($stop-$start+1)); my @dnastring=split(//,$dnastring); print " "; my $j=0; foreach my $dnapiece (@dnastring) { print "$dnapiece"; ++$j; if ($j%3 == 0 && $j>0) { print " "; } } print "\n"; } ++$block_ct; } $block_len=0; $blockcoltrue=0; $blocksettrue=0; last KEYPROGRESSION; } } # end foreach if ($blockcoltrue) { if (!$blocksettrue) { $start=$i; $blocksettrue=1; } ++$block_len; } else { $blockcoltrue=1; # An alignment position has to be fouled to get set to 0 } } # Does block persist until end of alignment? if ($blocksettrue) { if ($block_len >= $MINLEN) { $stop=$protalnlength; #Report it print "\n"; $start+=$ABLATE; $stop-=$ABLATE; print "Block $block_ct ($start,$stop):\n"; foreach my $key2 (@keys) { my $protstring=substr($alnseqs{$key2},$start-1,$stop-$start+1); my @protstring=split(//,$protstring); my $preprintstring="$key2 "; $preprintstring=~m/^(.{20})/; print "$1"; foreach my $protpiece (@protstring) { print "$protpiece "; } print "\n"; # Now report the coding data my $dnastring=substr($alnseqsd{$key2},3*($start-1),3*($stop-$start+1)); my @dnastring=split(//,$dnastring); print " "; my $j=0; foreach my $dnapiece (@dnastring) { print "$dnapiece"; ++$j; if ($j%3 == 0 && $j>0) { print " "; } } print "\n"; } ++$block_ct; } } print "\n"; exit 0;