#!/usr/local/bin/perl -w

# Unix, Perl, and Python - class 2
# Align a list of human-mouse homology pairs

# Input data file - pairs of human and mouse RefSeq accessions
$hs_mm_pairs = "human_mouse_pairs.txt";

# Files used in data processing
$humanSeq = "mouse.fa";
$mouseSeq = "human.fa";
$seqPairFile = "seqPair.fa";
$bigOutputFile = "alignment-all.txt";

# Remove $bigOutputFile from any previous script outputs
if (-e "$bigOutputFile")				# if the file exists
	 { unlink($bigOutputFile); }			# delete it [Perl's unlink = Unix's rm]

# Open the homology data file
open (HOMOLOGY, "$hs_mm_pairs") || die "Major problem: cannot open $hs_mm_pairs for reading: $!";

while (<HOMOLOGY>)		# Read one line at a time
{
	chomp($_);					# Delete newline at end of each line
	@data = split(/\t/, $_);	# Split the line into tab-delimited fields

	# Get human ($humanAcc) and mouse ($mouseAcc) accessions from @data
	# 0
	$humanAcc = $data[0];
	$mouseAcc = $data[1];

	print STDERR "$humanAcc vs. $mouseAcc\n";

	# Get a sequence from a BLAST-formatted database using the "fastacmd" syntax
	# fastacmd -d database -s accession					and append (>>) to a file

	# Get a human sequence using the "fastacmd" syntax and redirect output to $humanSeq
	# 1
	`fastacmd -d nr -s $humanAcc > $humanSeq`;

	# Change format of sequence headers to make alignments clearer
	`perl -i  -pe 's/>gi.+ref\\|/>/' $humanSeq`;

	# Get a mouse sequence using the "fastacmd" syntax and redirect output to $mouseSeq	
	# 2
	`fastacmd -d nr -s $mouseAcc > $mouseSeq`;

	# Change format of sequence headers to make alignments clearer 
	`perl -i  -pe 's/>gi.+ref\\|/>/' $mouseSeq`;

	# Concatenate both sequences ($humanSeq and $mouseSeq) to $seqPairFile (if doing clustal alignment)
	# 3
	`cat $humanSeq $mouseSeq > $seqPairFile`;

	# Do alignments (perform subroutines below)
	doClustal();
	doGlobal();
	doLocal();
}

# Close file handles and delete temporary files
close (HOMOLOGY);
unlink ($humanSeq, $mouseSeq, $seqPairFile);

print "\nSee $bigOutputFile for output\n\n";

##########  subroutines  ##########

sub doClustal
{
	$clustalOut = "clustal.aln";

	# Do alignment (#4)
	`clustalw -INFILE=$seqPairFile -TYPE=DNA -OUTFILE=$clustalOut`;

	# Add sequence headers to big alignment file (#5) 
	`grep ">" $seqPairFile >> $bigOutputFile`;

	# Append alignment to big file and delete the small files (#5)
	`cat $clustalOut >> $bigOutputFile`;
	unlink("$clustalOut", "seqPair.dnd");
}

sub doGlobal
{
	$globalOut = "global.aln";

	# Do alignment (#4)
	`needle $humanSeq $mouseSeq -outfile $globalOut -auto`;
		
	# Add sequence headers to big alignment file (#5)
	`grep ">" $seqPairFile >> $bigOutputFile`;

	# Append alignment to big file and delete the small file (#5)
	`cat $globalOut >> $bigOutputFile`;
	unlink("$globalOut");
}

sub doLocal
{
	$localOut = "local.aln";

	# Do alignment (#4)
	`water $humanSeq $mouseSeq -outfile $localOut -auto`;
	
	# Add sequence headers to big alignment file (#5)
	`grep ">" $seqPairFile >> $bigOutputFile`;

	# Append alignment to big file and delete the small file (#5)
	`cat $localOut >> $bigOutputFile`;
	unlink("$localOut");
}


syntax highlighted by Code2HTML, v. 0.9.1