#!/usr/bin/perl
#ssrtool.pl
#This is a marked up version of an ssr search tool that was written in 1999
#by Sam Cartinhour.
#Code based on Lincoln Stein code (http://stein.cshl.org/WWW/software/CGI/examples/internal_links.cgi?amenu=FOO1#start).
#Genevieve DeClerck. 10/2000

use lib '..';
use CGI qw(:standard);
use CGI::Carp qw/fatalsToBrowser/;

$/ = ">";
$| = 1;
my $seqcount;

#####################
##    SSR FORM
#####################
# Header
print header();
print start_html(-title=>'SSRIT',-bgcolor=>'#F3F3E0');
print h1("SSRIT - Simple Sequence Repeat Identification Tool");
print 'This tool finds all ', i(' perfect '), ' simple sequence repeats (SSRs) in a given sequence.';
print p 'For source code please click <a href="ftp://www.gramene.org/pub/gramene/scripts/ssr.pl">here</a>.';
print p,i("Note: Netscape 2.0 or greater, or IE 4.0 or greater required."),p,hr;

# Start a form.
print start_form, p, h3("1)  Select search parameters"),p;
print strong("  a) Select the maximum motif-length group you wish to find.    ");
print "For example, if you want to search for all SSRs up to and including pentamers (meaning, you'd like to search for dimers, trimers, tetramers, and pentamers), you should select 'pentamer' from the drop-down menu. ",p,popup_menu(-name=>'motif_length',-value=>['dimer','trimer','tetramer','pentamer','hexamer','heptamer','octamer','nonamer','decamer'],-default=>'tetramer'),p;
#print checkbox(-name=>'monomer',-checked=>'0',-value=>'OFF',-label=>'Include monomers'),br,p,;

print strong("  b) Enter the minimum number of repeats you will allow.  ");
print "Entering a '5,' for example, will match SSRs with three or more motif repeats, such as ag-3 ('agagag').",p;
print textfield(-name=>'min_repeats',-value=>'5',size=>2),p,br,p;

print h3("2)  Paste/Enter your sequence of interest into the textarea");
print "The sequence(s) must be in FASTA format - meaning, there must be a title line with a '>' at the beginning for each sequence.",p;
print " FOR EXAMPLE,",p;
print ">seq1",br;
print "agagattaggatcgatcgcgctctctctctctctctcgatcgagatcgat",br;
print "ggccatcatcatcatcatcattgagatatagcgcgatatcgagagatctc",br;
print "agaatagatatcgcgctatagagagatcgagagagagtaga",br;
print ">seq2",br;
print "agagataggaatatgagatagcgggggggggggggcgctatacgcgctcg",br;
print "gagagagatctctctctctcttatagagatcgatcgactagctagatata",br;
print "agactcactcactcactcactcactcagcgcgat",br,p,br;

print "Paste/Enter your sequence(s) here:",br;
print textarea(-name=>'sequence',-rows=>10,-cols=>60,-wrap=>'physical');

    print p,reset,submit('submit','FIND SSRs'),hr,
    end_form;


#####################
## Find SSRS
#####################

## Create temporary sequence file
my $seq_tmp = param(-name=>'sequence'); #print STDOUT "SEQ->$seq_tmp<-";#works
open (FILE,"+>/tmp/seq_tmp") || die "** 1: Can't open file **";
print FILE $seq_tmp;
close(FILE);


## Get and set parameters
if (param(-name=>'min_repeats')){$min_repeats = param(-name=>'min_repeats');}
$motif = param(-name=>'motif_length'); 

if(param){ 

  if ( $motif eq "dimer"){  @di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di); } 
  elsif ($motif eq "trimer"){  @tri = [3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri); } 
  elsif ($motif eq "tetramer" ){  @tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra); } 
  elsif ($motif eq "pentamer"){  @pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra,@pent); } 
  elsif ($motif eq "hexamer"){  @hex = [6,$min_repeats];@pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra,@pent,@hex); } 
  elsif ($motif eq "heptamer"){  @hept = [7,$min_repeats];@hex = [6,$min_repeats];@pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra,@pent,@hex,@hept); } 
  elsif ($motif eq "octamer"){  @oct = [8,$min_repeats];@hept = [7,$min_repeats];@hex = [6,$min_repeats];@pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra,@pent,@hex,@hept,@oct); } 
  elsif ($motif eq "nonamer"){  @non = [9,$min_repeats];@oct = [8,$min_repeats];@hept = [7,$min_repeats];@hex = [6,$min_repeats];@pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats]; @specs = (@di,@tri,@tetra,@pent,@hex,@hept,@oct,@non); } 
  elsif ($motif eq "decamer"){  @dec = [10,$min_repeats];@non = [9,$min_repeats];@oct = [8,$min_repeats];@hept = [7,$min_repeats];@hex = [6,$min_repeats];@pent = [5,$min_repeats];@tetra = [4,$min_repeats];@tri =[3,$min_repeats];@di = [2,$min_repeats];@mono = [1,$min_repeats];  @specs = (@di,@tri,@tetra,@pent,@hex,@hept,@oct,@non,@dec); } 
 
} 

## Open seq_tmp for ssr search
open (FILE2,"</tmp/seq_tmp") || die "** 2: Can't open file **";

@stats = stat FILE2; #check size of file
#print "STATS-->$stats[7]<---\n";
if($stats[7] > 1000000){die " * * * * * * WARNING: Your sequence data is too large. Please reduce the amount of sequence you paste into the textarea and try again.* * * * * * *                                                                                     "}

# Search for SSRs. Sam Cartinhour code.
   while(<FILE2>){ #FASTA formatted sequences as input
      chomp;# print "IN!!!!!";
      ($titleline, $sequence) = split(/\n/,$_,2);
      next unless ($sequence && $titleline);
      $seqcount++;
      my ($id) = $titleline =~ /^(\S+)/; #the ID is the first whitespace-
                                       #delimited item on titleline
      $sequence =~ s/\s//g; #concatenate multi-line sequence
      study($sequence);     #is this necessary?
      my $seqlength = length($sequence);
      my $ssr_number = 1;   #track multiple ssrs within a single sequence
      my %locations;        #track location of SSRs as detected
      my $i;
      for($i=0; $i<scalar(@specs); $i++){ #test each spec against sequence
	my $motiflength = $specs[$i]->[0]; 
	my $minreps = $specs[$i]->[1] - 1;
	my $regexp = "(([gatc]{$motiflength})\\2{$minreps,})";
	while ($sequence =~ /$regexp/ig){
	    my $motif = lc($2); my $ssr = $1;
	    #reject "aaaaaaaaa", "ggggggggggg", etc.
	    next if &homopolymer($motif,$motiflength);
	    my $ssrlength = length($ssr);          #overall SSR length
	    my $repeats = $ssrlength/$motiflength; #number of rep units
	    my $end = pos($sequence);              #where SSR ends
	    pos($sequence) = $end - $motiflength;  #see docs
	    my $start = $end - $ssrlength + 1;     #where SSR starts
	    ## To print results to screen
	   # print join("\t", $id, $ssr_number++,
	#		      $motiflength, $motif, $repeats, 
	#		      $start, $end, $seqlength), "\n"
	#	if (&novel($start, \%locations));  #count SSR only once
	    ## To deposit results in array for later printing
	    $id_and_num = $id."-".$ssr_number++;
	    @tmp = ($id_and_num, $motif, $repeats, $start, $end, $seqlength) if (&novel($start, \%locations)); #count SSR only once
	    push @ssrs,[@tmp]; 

	    print p;
	}
    }
}

## Print elements (by row) of 2-D array to form in table format
if(param){
@values = (0 .. $#ssrs);
@headings = ('Sequence','Motif','No.of Repeats','SSR start','SSR end','SeqLength');
@rows = th(\@headings); 
foreach $n (@values){ 
  push(@rows,td([$ssrs[$n][0],$ssrs[$n][1],$ssrs[$n][2],$ssrs[$n][3],$ssrs[$n][4],$ssrs[$n][5]]));}  
print table({-border=>undef,-width=>'25%'},
	    caption(b('SSRs found in your sequence(s)')),
	    Tr(\@rows));

print p,br;
print h4("Explanation of table:"),strong("Sequence")," - The identifying tag you've given to your sequence in title line. The appendage (\"-1,\" \"-2,\" etc.) is an index number which identifies the first, second, etc. SSR found in a particular sequence.";print br;
print strong("Motif")," - the simple sequence that is repeated.",br;
print strong("No. of Repeats")," - number of times the simple sequence (motif) is repeated.",br;
print strong("SSR start")," - start coordinate of the SSR. The number is nth nucleotide from the beginning of the sequence.",br;
print strong("SSR end")," - end coordinate of the SSR. The number is nth nucleotide from the beginning of the sequence + the length of the SSR.",br;
print strong("SeqLength")," - the total length of the sequence in which the SSR was found.",br;
}

print hr;

print "<div align=right><font color=\"#008080\"><font size=+1><a href=\"http://www.gramene.org\">Gramene home page</a></font></font><p></div>";
print hr;
print "Questions or comments:  ";
print "<A HREF=\"mailto:gramene\@gramene.org\" TITLE=\"Your Subject\">","Genevieve DeClerck","</A>";
#print "<A HREF=\"mailto:gad14\@cornell.edu\" TITLE=\"Your Subject\">","Genevieve DeClerck","</A>";
print br,br;
#print a({href=>"http://ars-genome.cornell.edu/rice/tools.html"},"Back to Software Tools page");
print hr,br,
    'Last modified: Fri Apr 19 15:12:35 EDT 2002',br,br;
    #'Last modified: Mon Oct 30 11:24 EST 2000',br,br;
    print br;

    end_html;

close(FILE2);
unlink("/tmp/seq_tmp");


#####################
##   Subroutines
#####################

sub homopolymer {             
    #return true if motif is repeat of single nucleotide
    my ($motif,$motiflength) = @_;
    my ($reps) = $motiflength - 1;
    return 1 if ($motif =~ /([gatc])\1{$reps}/);
    return 0;
}

sub novel {
    my($position, $locationsref) = @_;
    if(defined $locationsref->{$position}) {
       return undef;
   } else {
       $locationsref->{$position} = 1;
       return 1;
   }
}
#} #if(my $file...
