blast_parse_2.pl

#!/usr/local/bin/perl -w

# Parsing BLAST reports with the NHGRI::Blastall module
# see http://genome.nhgri.nih.gov/blastall/Blastall.html for documentation
# WI Biocomputing course - Unix and Programming Skills for Biologists - March 2003

# Show Perl where the module is (since it's in my own directory)
use lib '/home/gbell/lib/lib/perl5/site_perl/5.8.0/';
# Use the NHGRI::Blastall module
use NHGRI::Blastall;

if (! $ARGV[0])
{
   print "What is the BLAST file to parse? ";

   # Get input and remove the newline character at the end
   chomp ($inFile = <STDIN>);
}
else
{
   $inFile = $ARGV[0];
}

# Make a new BLAST report "object"
my $b = new NHGRI::Blastall;

# Read BLAST report
$b->read_report("$inFile");

# Parse all results
@results = $b->result();

# Matching sequence IDs (of hit seqs)
@ids = $b->result('id');

# Get defline of query sequence
@query_defline = $b->result('query');

print "HIT_NUM\tQUERY\tQUERY_ID\tSUBJECT_LENGTH\tSCORE\tEXPECT\t";
print "MATCH_LENGTH\tIDENTITY\tQUERY_START\tQUERY_END\tSUBJECT_START\tSUBJECT_END\n";
print "\n**** DATA FOR QUERY $query_defline[0] ****\n\n";

# For each matching sequence in the report 
for ($i = 0; $i <= $#ids; $i++)
{
   # Number of hit (1...)
   $hit_num = $i + 1;
   # Get seq ID for this hit seq
   $this_id = $ids[$i];
   # Get the definition line of the matching sequence
   @deflines = $b->result('defline', $ids[$i]);

   for ($j = 0; $j <= $#deflines; $j++)
   {
      # Get rid of extra spaces at end of lines comprising @deflines
      # (This is a bug in this implementation)
      $deflines[$j] =~ s/(\s){5,}/ /g;
   }

   # Score of hit
   @scores = $b->result('scores', $ids[$i]);
   # Expect value
   @expects = $b->result('expects', $ids[$i]);
   # Length of matching sequence (entire seq, not just matching region)
   @subject_length = $b->result('subject_length', $ids[$i]);
   # Length of match
   @match_lengths = $b->result('match_lengths', $ids[$i]);
   # Percent identities for length of match
   @identities = $b->result('identities', $ids[$i]);

   # nt number for each end of query and subject alignment
   @query_starts = $b->result('query_starts', $ids[$i]);
   @query_ends = $b->result('query_ends', $ids[$i]);
   @subject_starts = $b->result('subject_starts', $ids[$i]);
   @subject_ends = $b->result('subject_ends', $ids[$i]);

   # Go through every alignment between query and hit seq
   # !!! Note that there's often more than one alignment
   # between pair of sequences !!!

   # If you only care about the best alignment region between sequences,
   # only print out results where $k = 0

   for ($k = 0; $k <= $#scores; $k++)
   {
      print "$hit_num\t$deflines[0]\t$this_id\t";
      print "$subject_length[0]\t$scores[$k]\t$expects[$k]\t$match_lengths[$k]\t";
      printf "%0.2f", $identities[$k];
      print "\t$query_starts[$k]\t$query_ends[$k]\t$subject_starts[$k]\t$subject_ends[$k]\n";
   }
}



# To print out all results (using dump_results() subroutine below)
# print "ALL RESULTS:\n";
# dump_results();

sub dump_results
{
   foreach $rh_r (@results)
   {
      while (($key,$value) = each %$rh_r)
      {
         if (ref($value) eq "ARRAY")
         {
            print "$key: ";
            foreach $v (@$value)
            {
               print "$v ";
            }
            print "\n";
         }
         else
         {
            print "$key: $value\n";
         }
      }
      print "\n";
   }
}