#!/usr/bin/perl -w

# oligos.pl
# Create and analyze an overlapping series of oligos
# WI Bioinformatics course - Feb 2002 - Lecture 5
# WI Bioinformatics course - Revised - Sep 2003
# Example of input taken as multiple arguments 

# Check input and give info if arguments are missing
if (! $ARGV[3])
{
	print "
This program creates a series of oligos from 'sequence'
of length 'oligoLength' starting at startPosition nt and
ending at endPosition nt
USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n";

	exit(0);
}
else
{
	$seq = $ARGV[0];	# sequence name
	$mer = $ARGV[1];	# length of oligos to extract
	$start = $ARGV[2];	# nt number for beginning of first oligo
	$end = $ARGV[3];	# nt number for beginning of last oligo
}

# Get continuous sequence from sequence file 
open (SEQ, $seq) || die "cannot open $seq for reading: $!";
@seq = <SEQ>;				# make array of lines
$defline = $seq[0];			# get defline

# Split definition line into fields separated by spaces
@defline_fields = split(/\s/, $defline);	

$seq[0] = "";				# delete defline
$seq = join ("", @seq);			# join($glue, @list) to get seq without defline
$seq =~ s/\s*//g;			# delete spaces and end of lines
$seqLength = length ($seq); 		# get length of sequence (optional) 

# $defline_fields[0] is the first part of defline (up to first space)
print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n";

# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off
# This "off-by-one bug" is a common problem, so always check your code

# Adjust counting since computers start at 0 but people think of sequences starting at 1
for ($i = $start - 1; $i < $end - 1; $i++)
{
    # Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length);
    $oligo = substr($seq, $i, $mer); 
    
    # Adjust to conventional way of counting (e.g., first nt = 1, not 0)
    $nt = $i + 1;
    
    # Count the number of Gs and Cs in the oligo by counting 
    # the number of times G or C are "translated" into the same G or C within $oligo 
    $numGC = $oligo =~ tr/GC/GC/;
    
    # Calculate per cent GC 
    $pcGC = 100 * $numGC / $mer;
    
	if (($pcGC > 40) && ($pcGC < 60)) {  #selecting oligos whose GC% between 40-60
	    
	    # Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places)
	    $pcGCrounded = sprintf("%0.1f", $pcGC);
	    
	# Print results delimited by tabs (for easy importing into spreadsheets, etc)
	    print "$nt\t$oligo\t$pcGCrounded\n";
	} 
    
    
}
print "\n";