#!/usr/local/bin/perl
# ssrtool.pl
# This is a marked up version of an ssr search tool that was written in 1999
# by Sam Cartinhour.
# Code based on Lincoln Stein code 
# (http://stein.cshl.org/WWW/software/CGI/examples/internal_links.cgi?\
# amenu=FOO1#start).
# Genevieve DeClerck. 10/2000
# Steven Schmidt 6/2003 - get a temporary file properly, 
# clean up for 'use strict'
# Steven Schmidt 3/2004 - use GramenePage.pm
# Steven Schmidt 5/2005 - put everything after query box within if(param)

use strict;
use warnings;

$| = 1;

#use lib '..';
use CGI qw(:standard);
use CGI::Carp qw/fatalsToBrowser/;
use File::Temp qw[ tempdir tempfile ];
use Gramene::Page;

my $seqcount;
my $r    = Apache->request;
my $page = Gramene::Page->new( $r )
    || die( "Can't open Gramene page configurator" );

#####################
##    SSR FORM
#####################
# Header
print header();

my $sh = $page->start_html(
    -Title  => 'SSRIT',
    -Class  => 'search',
    -Target => '_top',
);
$sh =~ s/<body\b[^>]*>//i;
my $sb = $page->start_body;

#$sb =~ s/<body\b[^>]*>//i;
print "$sh$sb";

print h1( "SSRIT - Simple Sequence Repeat Identification Tool" );
print 'This tool finds all ', i( ' perfect ' ),
    ' simple sequence repeats (SSRs) in a given sequence.';
print p
    'For source code for a stand-alone version please click <a href="ftp://ftp.gramene.org/pub/gramene/software/scripts/ssr.pl">here</a>.';
print p
    'For citation, please use this reference <a href="/db/literature/pub_search?ref_id=7962"><i>Temnykh et al.</i> (2001)</a>.';
print p, i( "Note: Netscape 2.0 or greater, or IE 4.0 or greater required." ),
    p, hr;

# Start a form.
print start_form, p, h3( "1)  Select search parameters" ), p;
print strong(
    "  a) Select the maximum motif-length group you wish to find.    " );
print
    "For example, if you want to search for all SSRs up to and including pentamers (meaning, you'd like to search for dimers, trimers, tetramers, and pentamers), you should select 'pentamer' from the drop-down menu. ",
    p,
    popup_menu(
    -name  => 'motif_length',
    -value => [
        'dimer',   'trimer',   'tetramer', 'pentamer',
        'hexamer', 'heptamer', 'octamer',  'nonamer',
        'decamer'
    ],
    -default => 'tetramer'
    ),
    p;

#print checkbox(-name=>'monomer',-checked=>'0',-value=>'OFF',-label=>'Include monomers'),br,p,;

print strong( "  b) Enter the minimum number of repeats you will allow.  " );
print
    "Entering a '5,' for example, will match SSRs with five or more motif repeats, such as  ag-5 ('agagagagag').",
    p;
print textfield( -name => 'min_repeats', -value => '5', size => 2 ), p, br, p;

print h3( "2)  Paste/Enter your sequence of interest into the textarea" );
print
    "The sequence(s) must be in FASTA format - meaning, there must be a title line with a '>' at the beginning for each sequence.",
    p;
print " FOR EXAMPLE,",                                      p;
print ">seq1",                                              br;
print "agagattaggatcgatcgcgctctctctctctctctcgatcgagatcgat", br;
print "ggccatcatcatcatcatcattgagatatagcgcgatatcgagagatctc", br;
print "agaatagatatcgcgctatagagagatcgagagagagtaga",          br;
print ">seq2",                                              br;
print "agagataggaatatgagatagcgggggggggggggcgctatacgcgctcg", br;
print "gagagagatctctctctctcttatagagatcgatcgactagctagatata", br;
print "agactcactcactcactcactcactcagcgcgat",                 br, p, br;

print "Paste/Enter your sequence(s) here:", br;
print textarea(
    -name => 'sequence',
    -rows => 10,
    -cols => 60,
    -wrap => 'physical'
);

print p, reset, submit( 'submit', 'FIND SSRs' ), hr, end_form;

if ( param ) {

#####################
## Find SSRS
#####################

## Create temporary sequence file
    my $seq_tmp
        = param( -name => 'sequence' ); #print STDOUT "SEQ->$seq_tmp<-";#works

    #open (FILE,"+>/tmp/seq_tmp") || die "** 1: Can't open file **";
    my $tmpdir = tempdir( CLEANUP => 1 );
    my ( $seq_fh, $seq_fn ) = tempfile(
        "ssrtoolXXXXXX",
        DIR    => $tmpdir,
        SUFFIX => '.dat'
    ) or die "Can't open tmp file: $!";
    print $seq_fh $seq_tmp;

    #close(FILE);

## Get and set parameters
    my $min_repeats = param( -name => 'min_repeats' ) || 5;
    my $motif = param( -name => 'motif_length' );

    my ( @specs );

    my ( @dec, @di, @hept, @hex, @mono, @non, @oct, @pent, @tetra, @tri );

    if ( $motif eq "dimer" ) {
        @di   = [ 2, $min_repeats ];
        @mono = [ 1, $min_repeats ];
        @specs = ( @di );
    }
    elsif ( $motif eq "trimer" ) {
        @tri  = [ 3, $min_repeats ];
        @di   = [ 2, $min_repeats ];
        @mono = [ 1, $min_repeats ];
        @specs = ( @di, @tri );
    }
    elsif ( $motif eq "tetramer" ) {
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra );
    }
    elsif ( $motif eq "pentamer" ) {
        @pent  = [ 5, $min_repeats ];
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent );
    }
    elsif ( $motif eq "hexamer" ) {
        @hex   = [ 6, $min_repeats ];
        @pent  = [ 5, $min_repeats ];
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent, @hex );
    }
    elsif ( $motif eq "heptamer" ) {
        @hept  = [ 7, $min_repeats ];
        @hex   = [ 6, $min_repeats ];
        @pent  = [ 5, $min_repeats ];
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent, @hex, @hept );
    }
    elsif ( $motif eq "octamer" ) {
        @oct   = [ 8, $min_repeats ];
        @hept  = [ 7, $min_repeats ];
        @hex   = [ 6, $min_repeats ];
        @pent  = [ 5, $min_repeats ];
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent, @hex, @hept, @oct );
    }
    elsif ( $motif eq "nonamer" ) {
        @non   = [ 9, $min_repeats ];
        @oct   = [ 8, $min_repeats ];
        @hept  = [ 7, $min_repeats ];
        @hex   = [ 6, $min_repeats ];
        @pent  = [ 5, $min_repeats ];
        @tetra = [ 4, $min_repeats ];
        @tri   = [ 3, $min_repeats ];
        @di    = [ 2, $min_repeats ];
        @mono  = [ 1, $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent, @hex, @hept, @oct, @non );
    }
    elsif ( $motif eq "decamer" ) {
        @dec   = [ 10, $min_repeats ];
        @non   = [ 9,  $min_repeats ];
        @oct   = [ 8,  $min_repeats ];
        @hept  = [ 7,  $min_repeats ];
        @hex   = [ 6,  $min_repeats ];
        @pent  = [ 5,  $min_repeats ];
        @tetra = [ 4,  $min_repeats ];
        @tri   = [ 3,  $min_repeats ];
        @di    = [ 2,  $min_repeats ];
        @mono  = [ 1,  $min_repeats ];
        @specs = ( @di, @tri, @tetra, @pent, @hex, @hept, @oct, @non, @dec );
    }

## Open seq_tmp for ssr search
    #open (FILE2,"</tmp/seq_tmp") || die "** 2: Can't open file **";
    seek $seq_fh, 0, 0;

    my @stats = stat $seq_fh;    #check size of file

    #print "STATS-->$stats[7]<---\n";
    if ( $stats[7] > 1000000 ) {
        print
            " * * * * * * WARNING: Your sequence data is too large. Please reduce the amount of sequence you paste into the textarea and try again.* * * * * * *                                                                                     ",
            $page->end_body;
        close( $seq_fh );
        unlink( $seq_fn );
        return 1;
    }

    # Search for SSRs. Sam Cartinhour code.
    my @ssrs;
    {
        $/ = ">";
        while ( <$seq_fh> ) {    #FASTA formatted sequences as input
            chomp;               # print "IN!!!!!";
            my ( $titleline, $sequence ) = split( /\n/, $_, 2 );
            next unless ( $sequence && $titleline );
            $seqcount++;
            my ( $id ) = $titleline =~ /^(\S+)/;  #the ID is the first whitespace-
                                                  #delimited item on titleline
            $id ||= 'INPUT';
            $sequence =~ s/\s//g;                 #concatenate multi-line sequence
            study( $sequence );                   #is this necessary?
            my $seqlength = length( $sequence );
            my $ssr_number = 1;    #track multiple ssrs within a single sequence
            my %locations;         #track location of SSRs as detected
            my $i;

            for ( $i = 0; $i < scalar( @specs ); $i++ )
            {                      #test each spec against sequence
                my $motiflength = $specs[$i]->[0];
                my $minreps     = $specs[$i]->[1] - 1;
                my $regexp      = "(([gatc]{$motiflength})\\2{$minreps,})";
                while ( $sequence =~ /$regexp/ig ) {
                    my $motif = lc( $2 );
                    my $ssr   = $1;

                    #reject "aaaaaaaaa", "ggggggggggg", etc.
                    next if &homopolymer( $motif, $motiflength );
                    my $ssrlength = length( $ssr );    #overall SSR length
                    my $repeats = $ssrlength / $motiflength;  #number of rep units
                    my $end     = pos( $sequence );           #where SSR ends
                    pos( $sequence ) = $end - $motiflength;   #see docs
                    my $start = $end - $ssrlength + 1;        #where SSR starts
                    ## To print results to screen
                    # print join("\t", $id, $ssr_number++,
                    #		      $motiflength, $motif, $repeats,
                    #		      $start, $end, $seqlength), "\n"
                    #	if (&novel($start, \%locations));  #count SSR only once
                    ## To deposit results in array for later printing
                    my $id_and_num = $id . "-" . $ssr_number++;
                    push @ssrs,
                        [
                        $id_and_num, $motif, $repeats,
                        $start,      $end,   $seqlength
                        ]
                        if ( &novel( $start, \%locations ) ); #count SSR only once
                    print p;
                }
            }
        }
    }

## Print elements (by row) of 2-D array to form in table format
    my @values   = ( 0 .. $#ssrs );
    my @headings = (
        'Sequence', 'Motif', 'No.of Repeats', 'SSR start',
        'SSR end',  'SeqLength'
    );
    my @rows = th( \@headings );
    foreach my $n ( @values ) {
        push(
            @rows,
            td(
                [
                    $ssrs[$n][0], $ssrs[$n][1], $ssrs[$n][2],
                    $ssrs[$n][3], $ssrs[$n][4], $ssrs[$n][5]
                ]
            )
        );
    }
    @values
        or push @rows,
        td(
        { -colspan => 6, align => "center" },
        "No SSR's of length at most $motif"
        ),
        td( { -colspan => 6, align => "center" },
        , "with at least $min_repeats repeats" );
    print table(
        { -border => undef, -width => '25%' },
        caption( b( 'SSRs found in your sequence(s)' ) ),
        Tr( \@rows )
    );

    close( $seq_fh );
    unlink( $seq_fn );

    print p, br;
    print h4( "Explanation of table:" ), strong( "Sequence" ),
        " - The identifying tag you've given to your sequence in title line. The appendage (\"-1,\" \"-2,\" etc.) is an index number which identifies the first, second, etc. SSR found in a particular sequence.";
    print br;
    print strong( "Motif" ), " - the simple sequence that is repeated.", br;
    print strong( "No. of Repeats" ),
        " - number of times the simple sequence (motif) is repeated.", br;
    print strong( "SSR start" ),
        " - start coordinate of the SSR. The number is nth nucleotide from the beginning of the sequence.",
        br;
    print strong( "SSR end" ),
        " - end coordinate of the SSR. The number is nth nucleotide from the beginning of the sequence + the length of the SSR.",
        br;
    print strong( "SeqLength" ),
        " - the total length of the sequence in which the SSR was found.", br;

}    # end if(param)

#print hr;

#print "<div align=right><font color=\"#008080\"><font size=+1><a href=\"http://www.gramene.org\">Gramene home page</a></font></font><p></div>";
#print hr;
#print "Questions or comments:  ";
#print "<A HREF=\"mailto:gramene\@gramene.org\" TITLE=\"Your Subject\">","Gramene Support","</A>";
#print "<A HREF=\"mailto:gad14\@cornell.edu\" TITLE=\"Your Subject\">","Genevieve DeClerck","</A>";
#print br,br;
#print a({href=>"http://ars-genome.cornell.edu/rice/tools.html"},"Back to Software Tools page");
print hr, 'Last modified: Thu Jul 28 13:56:19 EDT 2005';

#'Last modified: Mon Mar 29 15:29:55 EST 2004';
#'Last modified: Fri Apr 19 15:12:35 EDT 2002',br,br;
#'Last modified: Mon Oct 30 11:24 EST 2000',br,br;
#    print br;

print $page->end_body;

#####################
##   Subroutines
#####################

sub homopolymer {

    #return true if motif is repeat of single nucleotide
    my ( $motif, $motiflength ) = @_;
    my ( $reps ) = $motiflength - 1;
    return 1 if ( $motif =~ /([gatc])\1{$reps}/ );
    return 0;
}

sub novel {
    my ( $position, $locationsref ) = @_;
    if ( defined $locationsref->{$position} ) {
        return undef;
    }
    else {
        $locationsref->{$position} = 1;
        return 1;
    }
}

#} #if(my $file...
