blastparse.pl

#!/usr/local/bin/perl -w

# Parsing BLAST reports with BioPerl's Bio::SearchIO module
# WI Unix and Programming Skills for Biologists - class 3

# See documentation at http://www.bioperl.org/HOWTOs/html/SearchIO.html 

use Bio::SearchIO;

# Prompt the user for the filename it it's not given as an argument
if (! $ARGV[0])
{
   print "What is the BLAST text file to parse? ";

   # Get input and remove the newline character at the end
   chomp ($inFile = <STDIN>);
}
else
{
   $inFile = $ARGV[0];
}

# Create a SearchIO "object" from the BLAST report
# This report may contain multiple concatenated reports
$report = new Bio::SearchIO( -file => "$inFile", -format => "blast");

# Print out one line of all desired fields, delimited by tabs:
# QUERY NAME, HIT NUM, HIT DESCRIPTION, HIT E-VALUE, FRACTION IDENTITY   [and ending with a newline]

# 1



# Go through BLAST reports one by one              
while($result = $report->next_result)
{
   # Reset hit number for each query  
   $hitNumber = 0;

   # Go through each each matching sequence
   while($hit = $result->next_hit)
   {
      $hitNumber++;

      # Go through each each HSP for this sequence
      while ($hsp = $hit->next_hsp)
      {
         # Print some tab-delimited data about this HSP
         $queryName = $result->query_name;
         $queryAcc = $result->query_accession;
         $queryLength = $result->query_length;
         $queryDesc = $result->query_description;
         $dbName = $result->database_name;
         $numSeqsInDb = $result->database_entries;
         $numHits = $result->num_hits;

         $hitName = $hit->name;
         $hitAcc = $hit->accession;
         $hitDesc = $hit->description;
         $hitEvalue = $hit->significance;
         $hitBits = $hit->bits;
         $numHsps = $hit->num_hsps;

         $hspEvalue = $hsp->evalue;
         $fracIdentical = $hsp->frac_identical;
         $fracConserved = $hsp->frac_conserved;
         $numGaps = $hsp->gaps;
         $hspLength = $hsp->hsp_length;
         $hspQuerySeq = $hsp->query_string;
         $hspHitSeq = $hsp->hit_string;
         $hspConsensus = $hsp->homology_string;
         $hspLength = $hsp->hsp_length;
         $hspRank = $hsp->rank;

         $queryStrand = $hsp->strand('query');
         $hitStrand = $hsp->strand('hit');
         $queryStart = $hsp->start('query');
         $queryEnd = $hsp->end('query');
         $hspStart = $hsp->start('hit');
         $hspEnd = $hsp->end('hit');

         $hspScore = $hsp->score;
         $hspBits = $hsp->bits;

         # PRINT OUT BEST FIVE HITS FOR EACH QUERY SEQUENCE

         # If $hitNumber is less than or equal to 5 and $hspRank is 1, print out the following fields: 
         # QUERY NAME, HIT NUM, HIT DESCRIPTION, HIT E-VALUE, FRACTION IDENTITY
         # as tab-delimited fields on one line

         # 2


      }
   }
}