analyzeGFs_SOLUTION.pl

#!/usr/local/bin/perl -w
# Unix and Programming Skills for Biologists - class 2
# Analyze a list of growth factors

# Input data file
$gfData = "mapped_growth_factors.txt";

# Output files (cDNA and promoters)
$cdnaSeqFile = "factors_cdna.fa";
$promoterFile = "factors_prom.fa";

# Temporary promoter file: to be appended to $promoterFile
$tempFile = "prom.tmp";

# Length of promoter to extract
$prom_length = 1000;

# Full path to human chromosome files
$chrPath = "/cluster/db0/Data/human_gp_jul_03/";

# If $promoterFile or $cdnaSeqFile already exists, delete it.
if (-e "$promoterFile")
    { unlink($promoterFile); }
if (-e "$cdnaSeqFile")
   { unlink($cdnaSeqFile); }

# Input file contains the following fields:
# 0: accession
# 1: description of gene
# 2: date of assembly used for mapping data
# 3: chromosome (ex: chr10)
# 4: strand (+ or -)
# 5: transcript end 1 (start if strand = +; end otherwise)
# 6: transcript end 2 (end if strand = +; start otherwise)

# This file was generated from Ensembl Mart data (using an SQL query)

open (GF_DATA, $gfData) || die "Major problem: cannot open $gfData for reading: $!";
while (<GF_DATA>)
{
   # Read one line at a time

   # Get rid of new line at end of each line
   chomp($_);

   # Split the line in to tab-delimited fields
   @data = split(/\t/, $_);

   $acc = $data[0];
   $desc = $data[1];
   $chr = $data[3];
   $strand = $data[4];
   $end1 = $data[5];
   $end2 = $data[6];

   print "$acc $desc\n";      # for debugging

   # Get a sequence from a BLAST-formatted database using the "fastacmd" syntax
   # fastacmd -d database -s accession               and append (>>) to $cdnaSeqFile

   `fastacmd -d nt -s $acc >> $cdnaSeqFile`;

   # Get the sequence 1 kb upstream of the transcription start
   getUpstreamSeq();
}

# Delete temporary file and close file handles 
unlink("$tempFile", "${tempFile}.new");
close (GF_DATA);

print "\nSee output in $cdnaSeqFile (cDNA) and $promoterFile (promoters)\n\n";

sub getUpstreamSeq
{
   # Get full path to chromosome file
   $chrFile = $chrPath . $chr . ".nib";

   if ($strand eq "+")      # get sequence "before" cDNA - ende1 is 5' start
   {
        $toExtractStart = $end1 - $prom_length;
        $toExtractEnd = $end1;
   }
   elsif ($strand eq "-")      # get sequence "after" cDNA - end2 is 5' start
   {
        $toExtractStart = $end2;
        $toExtractEnd = $end2 + $prom_length;
   }
   else   # problems
   {
        print ">Strand couldn't be determined for $acc\n";
   }
   if ($strand eq "+" || $strand eq "-")
   {
        # Extract sequence using nibFrag from nib files
        # Syntax: nibFrag chrFile toExtractStart toExtractEnd strand outputFile
        `nibFrag -upper $chrFile $toExtractStart $toExtractEnd $strand $tempFile`;

      # Change the header of the temporary promoter file
      $newHeader = "$acc promoter $chr($strand):$toExtractStart-$toExtractEnd $desc";
      # $toAdd = " foo";
      `descseq -seq $tempFile -out ${tempFile}.new -name "$newHeader"`;

      # Add this promoter to the big promoter file
      `cat ${tempFile}.new >> $promoterFile`;
   }
}