genbank2image.pl

#!/usr/local/bin/perl -w

# Use BioPerl to convert sequence formats
# Use GD to generate a PNG figure 
# WIBR - Unix and Programming Skills for Biologists - class 3

# Input file in GenBank format
$inFile = "seqs.gb";

use Bio::SeqIO;      # for sequence handling
use GD;         # for graphics

# Set input format (fasta, genbank, embl, pir, gcg, etc.)
$in = Bio::SeqIO->new('-file' => "$inFile", '-format' => 'Genbank');

# Set width of each nt block on image
$ntWidth = 15;
# Set number of nts per line in figure
$ntPerLine = 50;

# Iterate through the sequences
while ($seqobj = $in->next_seq())
{
   readSequence();

   setupImage();

   # Split sequence ($rawseq) into an array (@nt) of nts
   @nt = split(//, $rawseq);

   for ($i = 0; $i <= $#nt; $i++)
   {
      $thisNT = $nt[$i];
      # print "$nt[$i] ";

      # Create coordinates for boxes and letters 
      $x1 += $ntWidth;
      $x2 = $x1 + $ntWidth;
      $y2 = $y1 + $ntWidth;

      # Print boxes only for nts in the coding region (CDS)
      # See parseSeqFeatures subroutine for variable to use.
      # using the GD command:
      # $img->filledRectangle($x1, $y1, $x2, $y2, $ntToColor{$thisNT});

      # 1 - uncomment the following line

      # $img->filledRectangle($x1, $y1, $x2, $y2, $ntToColor{$thisNT});

      # Print nucleotide in center of box

      # 2 - uncomment the following line

      # $img->string(gdGiantFont, $x1 + 3, $y1, $thisNT, $black);

      # Handle end of row => start new row      
      if (($i + 1) % $ntPerLine == 0)
      {
         $y1 += 2 * $ntWidth;
         $x1 = 25;
         # Print position of nucleotide on left
         $img->string(gdTinyFont, 5, $y1 + 4, $i + 2, $black);
      }
   }

   # Print this image to a file as a PNG graphic
   open(OUT_IMG, ">$imagefile") || die "Cannot write to $imagefile: $!\n";
   print OUT_IMG $img->png;
   close OUT_IMG;

   # Make a file of alternating DNA and protein sequences (in fasta format)

   # 5 - uncomment the following line

   # printSeqs();
}

# Print final message
$imagefileList = join (" ", @imagefiles);
print "\nFor images, see\n$imagefileList\n";

#########################  SUBROUTINES  #########################

sub readSequence
{
   # Get raw sequence for this sequence
   $rawseq = $seqobj->seq();
   # Get length (regular Perl command)
   $seqLength = length($rawseq);

   # Get sequence ID
   $id = $seqobj->display_id();
   # Get sequence description
   # $desc = $seqobj->desc();

   # Parse sequence features (CDS, etc.)
   parseSeqFeatures();

   # Print sequence in fasta format (for debugging, at least)
   # $out->write_seq($seqobj);   

   # Write title of figure
   $title = "$id ($seqLength nt, from $inFile)";

   # If the CDS was found, add the nt for the CDS start and end to the title
   # See the parseSeqFeatures subroutine to find the variable names

   # 3


}

sub setupImage
{
   # Make name of output file and add to array of all images
   $imagefile = $id . ".png";
   push (@imagefiles, $imagefile);

   # Calculate size of image to create
   $imageHeight = $seqLength / $ntPerLine * $ntWidth * 2 + 80;
   $imageWidth = $ntPerLine * $ntWidth + 100;

   # Create a new image
   $img = new GD::Image($imageWidth, $imageHeight);
   defineColors();
   defineNtColors();

   # Initialize location of first nt box
   $x1 = 25; $y1 = 40;

   # Write a title across the top - see documentation for choices of font sizes
   $img->string(gdGiantFont, 45, 5, $title, $black);
   # Print 1 next to 1st nt
   $img->string(gdTinyFont, 5, $y1 + 4, "1", $black);
}

sub defineColors
{
   # Describe colors you want to use: each integer is 0-255 for Red, Green, and Blue

   # Change the nt boxes to your 4 favorite colors 
   # To do this change, the numbers below (and adjust variable names if desired).
   # If you do this adjust the hash in the defineNtColors subroutine

   # 4

   $white = $img->colorAllocate(255,255,255);
   $black = $img->colorAllocate(0,0,0);
   $green = $img->colorAllocate(0, 255, 0);
   $blue = $img->colorAllocate(0, 0, 255);
   $red = $img->colorAllocate(255, 0, 0);
   $yellow = $img->colorAllocate(255, 255, 0);
 }
sub defineNtColors
{
   # Make hash to convert nts to colors
   $ntToColor{"A"} = $red;
   $ntToColor{"G"} = $green;
   $ntToColor{"C"} = $blue;
   $ntToColor{"T"} = $yellow;
 }

sub parseSeqFeatures
{
   # Look at sequence features (but just save CDS data)
   foreach $feat ($seqobj->all_SeqFeatures())
   {
      $featName = $feat->primary_tag;

      if ($featName eq "CDS")
      {
         $cdsStart = $feat->start;
            $cdsEnd = $feat->end;
      }
      foreach $tag ($feat->all_tags())
      {
           if ($tag eq "translation")
           {
              # Get protein sequence
            @protein = $feat->each_tag_value("translation");
            $protein = join ("", @protein);
            $protId = "$id";

            # Make a sequence object of the protein
            $protSeqObj = Bio::PrimarySeq->new('-seq' => $protein, '-format' => 'raw',
               -id => $protId, -type=> 'protein');
         }
         }
   }
}

sub printSeqs
{
   $outDna = Bio::SeqIO->new('-format' => 'Fasta');
   $outDna->write_seq($seqobj);

   $outProt = Bio::SeqIO->new('-format' => 'Fasta');
      $outProt->write_seq($protSeqObj);
}