package Gramene::Utils;

# $Id: Utils.pm,v 1.14 2007/03/19 18:10:58 kclark Exp $

use strict;
use CGI;
use Data::Pageset;
use Data::Dumper;
use Gramene::Config;
use Carp qw( carp croak );
use List::Util qw( max );
use Log::Dispatch::File;
use String::Similarity;

require Exporter;
use vars qw( $VERSION @EXPORT @EXPORT_OK );
$VERSION = (qw$Revision: 1.14 $)[-1];

use base 'Exporter';

my @subs   = qw[ 
    commify 
    iris_normalize
    iterative_search_values
    get_logger
    paginate
    pager
    parse_words
    similarity_search
    table_name_to_gramene_cdbi_class
];

@EXPORT_OK = @subs;
@EXPORT    = @subs;

# ----------------------------------------------------
sub commify {
    my $number = shift;
    1 while $number =~ s/^(-?\d+)(\d{3})/$1,$2/;
    return $number;
}

# ----------------------------------------------------
sub iterative_search_values {
    my $v       = shift || '';
    my $options = shift || {};

    my @return;

    if ( $v ) {
        push @return, $v;

        unless ( $v =~ /^\*.+\*$/ ) {
            push @return, "$v*" unless $v =~ /\*$/;
            push @return, "*$v*" unless $options->{'no_leading_wildcard'};
        }
    }
    else {
        @return = ('*');
    }

    return @return;
}

# ----------------------------------------------------
sub get_logger {
    my %opts      = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;
    my $config    = Gramene::Config->new;
    my $log_conf  = $config->get('logging');
    my $log_file  = $log_conf->{'log_file'} or croak 'No log file defined';
    my $log_level = $opts{'log_level'} || $log_conf->{'log_level'} || 'warn';

    my ( $pkg, $file, $line ) = caller;

    if ( -e $log_file && !-r _ ) {
        croak "Log file '$log_file' not writable";
    }

    my $logger = Log::Dispatch->new;
    $logger->add( 
        Log::Dispatch::File->new(
            name      => 'file',
            min_level => $log_level,
            filename  => $log_file,
            mode      => 'append',
            callbacks => sub { 
                my %p   = @_; 
                my $msg = $p{'message'};
                my $now = scalar localtime;
                chomp $msg;
                return "[$now] $file:$pkg:$line: $msg\n";
            },
        )
    );

    return $logger;
}

# ----------------------------------------------------
sub pager {
    if ( scalar(@_) % 2 ) {
        carp("odd number of args to pager");
    }

    my %args             =  @_;
    my $entries_per      =  $args{'entries_per_page'} || 25; 
    my $current_page     =  $args{'current_page'}     ||  1;
    my $url              =  $args{'url'}              || '';
       $url             .=  '?' unless $url =~ /\?/;
    my $data             =  $args{'data'};
    my $count            =  $args{'count'};
    my $object_name      =  $args{'object_name'} || 'Items';
    my $total            =  $count || scalar @{ $data || [] } || return;
    my $pager            =  Data::Pageset->new({
        total_entries    => $total,
        entries_per_page => $entries_per,
        current_page     => $current_page,
    });

    my $text = qq[<form method="GET">&nbsp;&nbsp;$object_name ] . 
        commify( $pager->first ) .
        ' to ' . commify( $pager->last ) . 
        ' of ' . commify( $pager->total_entries );

    if ( $pager->last_page > 1 ) {
        $text .= '.' . '&nbsp;' x 15;
        $url  =~ s/[;&]?page_no=\d+//;          # get rid of page_no arg
        $url  =~ s!^http://.*?(?=/)!!;          # remove host
        (my $query_string = $url) =~ s/^.*\?//; # isolate the query string
        my $q =  CGI->new( $query_string );

        if ( my $prev = $pager->previous_page ) {
            $text .= $q->a( 
                { href=> "$url&page_no=${prev}" }, 'Previous' 
            ) . ' | ';
        }

        for my $param ( $q->param ) {
            next if $param eq 'page_no';
            $text .= qq[<input type="hidden" name="$param" value="] .
                $q->param( $param ) . '">';
        }

        $text .= '<input type="submit" value="Page">' .
            '<input name="page_no" size="4" value="' . 
            $pager->current_page . '">' . 
            ' of ' . $pager->last_page . '.';

        if ( my $next = $pager->next_page ) {
            $text .= ' | ' . $q->a( 
                { href=> "$url&page_no=${next}" }, 'Next'
            );
        }
    }

    $text .= '&nbsp;&nbsp;</form>';
    $data  = [ $pager->splice( $data ) ] if @{ $data || [] };
    return wantarray ? ( $text, $data ) : $text;
}

# ----------------------------------------------------
sub paginate {
    carp("paginate is deprecated! Use pager instead.");

    my %args        = @_;
    my $data        = $args{'data'}        || [];
    my $limit_start = $args{'limit_start'} || 1;
    my $page_size   = $args{'page_size'}   || 0;
    my $max_pages   = $args{'max_pages'}   || 1;    
    my $no_elements = $args{'no_elements'} || @$data;

    my $limit_stop;
    if ( $no_elements > $page_size ) {
        $limit_start  = 1 if $limit_start < 1;
        $limit_start  = $no_elements if $limit_start > $no_elements;
        $limit_stop   = ( $limit_start + $page_size >= $no_elements )
            ? $no_elements
            : $limit_start + $page_size - 1;
        $data         = [ @$data[ $limit_start - 1 .. $limit_stop - 1 ] ];
    }
    elsif ( $no_elements ) {
        $limit_stop = $no_elements;
    }
    else {
        $limit_stop = 0;
    }

    my $no_pages = $no_elements
        ? sprintf( "%.0f", ( $no_elements / $page_size ) + .5 ) : 0;
    my $step     = ( $no_pages > $max_pages ) 
        ? sprintf( "%.0f", ($no_pages/$max_pages) + .5 ) : 1;
    my $cur_page = int( ( $limit_start + 1 ) / $page_size ) + 1;
    my ( $done, $prev_page, @pages );
    for ( my $page = 1; $page <= $no_pages; $page += $step ) {
        if ( 
            !$done              &&
            $page != $cur_page  && 
            $page  > $cur_page  && 
            $page  > $prev_page
        ) {
            push @pages, $cur_page unless $pages[-1] == $cur_page;
            $done = 1;
        }
        $done = $cur_page == $page unless $done;
        push @pages, $page;
    }

    if ( @pages ) {
        push @pages, $cur_page unless $done;
        push @pages, $no_pages unless $pages[-1] == $no_pages;
    }
        
    return {
        data        => $data,
        no_elements => $no_elements,
        pages       => \@pages,
        cur_page    => $cur_page,
        page_size   => $page_size,
        no_pages    => $no_pages,
        show_start  => $limit_start,
        show_stop   => $limit_stop,
    };
}

# ----------------------------------------------------
sub parse_words {
    my $string    = shift;
    my @words     = ();
    my $inquote   = 0;
    my $length    = length($string);
    my $nextquote = 0;
    my $nextspace = 0;
    my $pos       = 0;

    # shrink whitespace sets to just a single space
    $string =~ s/\s+/ /g;

    # Extract words from list
    while ( $pos < $length ) {
        $nextquote = index( $string, '"', $pos );
        $nextspace = index( $string, ' ', $pos );
        $nextspace = $length if $nextspace < 0;
        $nextquote = $length if $nextquote < 0;

        if ( $inquote ) {
            push(@words, substr($string, $pos, $nextquote - $pos));
            $pos = $nextquote + 2;
            $inquote = 0;
        } 
        elsif ( $nextspace < $nextquote ) {  
            push @words,
                split /[,\s+]/, substr($string, $pos, $nextspace - $pos);
            $pos = $nextspace + 1;
        } 
        elsif ( $nextspace == $length && $nextquote == $length ) {
            # End of the line
            push @words, 
                map { s/^\s+|\s+$//g; $_ }
                split /,/,
                substr( $string, $pos, $nextspace - $pos );
            $pos = $nextspace; 
        }
        else {
            $inquote = 1;
            $pos = $nextquote + 1;
        }
    }

    push( @words, $string ) unless scalar( @words );

    return @words;
}

# ----------------------------------------------------
sub iris_normalize {
    my $given = shift or croak('No value given to iris_normalize');

    # Uppercase
    $given = uc $given; 

    # Paren spacing
    $given =~ s/\)(?!\s)/) /g;
    $given =~ s/(?<!\s)\(/ (/g;
    $given =~ s/\(\s+/(/g;
    $given =~ s/\s+\)/)/g;

    # Period to space
    $given =~ s/\./ /g;

    # Squash spaces, trim
    $given =~ s/\s+/ /g;
    $given =~ s/^\s+|\s+$//g;

    # letters bumping numbers
    $given =~ s/([A-Z])(\d)/$1 $2/g;

    # numbers bumping letters 
    $given =~ s/(\d)([A-Z])/$1 $2/g;

    # dashes between letters
    $given =~ s/(?<=[A-Z])-(?=[A-Z])/ /g;

    # left-zero-padded numbers
    $given =~ s/(?<=[\sA-Z])0+(\d+)/$1/g;

    # L-N becomes L^N when there is only one '-' in the name and L is not
    # preceded by a space
    $given =~ s/(?<!\s)([A-Z]+)-(\d+)/$1 $2/g;

    
    # Close in crossing symbols
    $given =~ s#\s+/#/#g;
    $given =~ s#/\s+#/#g;

    return $given;
}

# ----------------------------------------------------
sub similarity_search {
    my %args          = @_;
    my @search_values = @{ $args{'search_values'} || [] };
    my @search_fields = @{ $args{'search_fields'} || [] };
    my $threshold     =    $args{'threshold'}     || 0.3;
    my $data          =    $args{'data'};

    croak 'No data' unless UNIVERSAL::isa( $data, 'ARRAY' ) && @$data;

    # get rid of wildcards, trim values
    @search_values = map { s/\*//g; s/^\s+|\s+$//g; $_ || () } @search_values;

    # if nothing to search for, everything is a match, so bail
    unless ( @search_values ) {
        $_->{'similarity'} = 1 for @$data;
        return wantarray ? @$data : $data;
    }
    
    unless ( @search_fields ) {
        die 'Data not hashref' unless UNIVERSAL::isa( $data->[0], 'HASH' );
        @search_fields = keys %{ $data->[0] };
    }

    if ( !@search_fields ) {
        croak "Can't determine search fields\n";
    }

    my @return;
    for my $t ( @$data ) {
        my @scores;
        for my $s1 ( @search_values ) {
            for my $s2 ( map { $t->{ $_ } } @search_fields ) { 
                next unless $s2;
                my $score = sprintf( "%.02f", similarity( $s1, $s2 ) );
                if ( $score == 1 ) {
                    @scores = ( $score );
                    last;
                }
                else {
                    push @scores, $score if $score >= $threshold;;
                }
            }

            last if scalar @scores == 1 && $scores[0] == 1;
        }

        # take the highest score
        if ( @scores ) {
            $t->{'similarity'} = max @scores;
            push @return, $t;
        } 
    }

    return wantarray ? @return : \@return;
}

# ----------------------------------------------------
sub table_name_to_gramene_cdbi_class {
    if ( scalar @_ != 2 ) {
        croak 'table_name_to_gramene_cdbi_class needs module and table name';
    }

    my $module     = shift or croak 'No module name';
    my $table_name = shift or croak 'No table name';
    my $class      = join('::', 
        'Gramene', 'CDBI', 
        join( '', map { ucfirst } split /_/, lc($module)),
        join( '', map { ucfirst } split /_/, $table_name)
    );

    return $class;
}

# ----------------------------------------------------

__END__

=pod

=head1 NAME

Gramene::Utils - generalized utilities

=head1 SYNOPSIS

  use Gramene::Utils qw( sub );

=head1 DESCRIPTION

This module contains general-purpose routines, all of which are
exported by default.

=head1 EXPORTED SUBROUTINES

=head2 commify

Turns "12345" into "12,345"

=head2 iterative_search_values

  for my $v ( 
      iterative_search_values( 'foo', { no_leading_wildcard => 1 } ) 
  ) {
      my @data = search( val => $v ) && last;
  );

Options:

  no_leading_wildcard - don't return a new value with "*" 
                        at the beginning

Aids searches by iteratively adding wildcards, e.g., for "foo" then
"foo*" then "*foo*."

=head2 get_logger

  my $logger = get_logger( log_level => 'debug' );
  $logger->info("Setting db to '$db_name'");

Returns a Log::Dispatch::File logger.

=head2 pager

Create a pager for data using Data::Pageset.

=head3 Arguments

=over 4

=item * data

An array reference of the dataset

=item * count

The number of records if "data" is not present

=item * url

The URL to use in constructing links

=item * current_page

Integer value of the current page number

=item * entries_per_page

Maximum number of records to allow many on a page; optional, default
value = 25.

=item * object_name

Use something like "Markers" instead of the (default) generic "Items"

=back

=head3 Returns

=over 4

=item * A string of HTML that is the pager navigator

=item * The current "page/slice" of data

=back

(In a scalar context, only returns the first.)

E.g.:

  my ( $pager, $data ) = pager(
      data             => $data, # or "count => 504"
      current_page     => $cgi->param('page_no'),
      url              => 'myscript.cgi?foo=bar&baz=quux',
      entries_per_page => 25,
      object_name      => 'Markers', # instead of default "Items"
  );
  $body .= "<center>$pager</center>";
  for my $rec ( @$data ) { ... }

=head2 paginate -- DEPRECATED! USE pager.

Given a set of data, break it up into pages.

Args:

    data        : a reference to an array of rows
    limit_start : where to start the slice [opt]
    page_size   : how big to make each page [opt]
    max_pages   : the maximum number of pages to allow (roughly) [opt]
    no_elements : how many records are in "data" [opt]

Returns a hashref of:

    data        : the slice of data that comprises the current page
    no_elements : how many elements were in the original data set
    pages       : an array of the page numbers to display
    cur_page    : the page number of the current page
    page_size   : how many records are on each page
    no_pages    : the total number of pages returned
    show_start  : the number of the first record in the current page
    show_stop   : the number of the last record in the current page

=head2 parse_words

  "Foo bar" baz => ('Foo bar', 'baz')

Stole this from String::ParseWords::parse by Christian Gilmore
(CPAN ID: CGILMORE), modified to split on commas or spaces.  Allows
quoted phrases within a string to count as a "word," e.g.:

=head2 IRIS Germplasm Name

  my $normalized_name = iris_normalize( $name );

Here are the rules:

  (L= any letter; ^= space; N= any numeral, S= any of {-,',[,],+,.})
  a)  Capitalize all letters  Khao-Dawk-Mali105 becomes 
      KHAO-DAWK-MALI105
  b)  L( becomes L^(and)L becomes)^L  IR64(BPH) becomes ^R64 (BPH)
  c)  N( becomes N^(and )N becomes)^N IR64(5A) becomes ^R64 (5A)
  d)  L. becomes L^   IR 64 SEL. becomes IR 64 SEL
  e)  LN becomes L^N EXCEPT SLN   MALI105 becomes MALI 105 but 
      MALI-F4 is unchanged
  f)  NL becomes N^L EXCEPT SNL   B 533A-1 becomes B 533 A-1 
      but B 533 A-4B is unchanged
  g)  LL-LL becomes LL^LL KHAO-DAWK-MALI105 becomes KHAO DAWK MALI 105
  h)  ^ON becomes ^N  IRTP 00123 becomes IRTP 123
  ^)  ^^ becomes ^    Double spaces changed to single spaces
  j)  REMOVE LEADING OR TRAILING ^    ..
  k)  ^) becomes) and (^ becomes (    Close in braces
  l)  L-N becomes L^N when there is only one '-' in the name and L is 
      not preceded by a space
  m)  ^/ becomes / and /^ becomes /   Close in crossing symbols

=head2 similarity_search

  my @matches = similarity_search(
      data          => \@data,    # arrayref of hashrefs
      threshold     => 0.5,       # a number 0-1, default 0.3
      search_fields => [ 'foo' ], # keys in the hashrefs
      search_values => [ 'bar' ]  # search strings
  );

Returns all the members of "data" where any of the "search_fields"
match any of the "search_values" with a similarity greater than or 
equal to the "threshold."  Each returned member will have a new 
field called "similarity" with a value of the highest similarity 
score for any of the search values in any of the search fields.

=head2 table_name_to_gramene_cdbi_class

  my $class = table_name_to_gramene_cdbi_class( 'Markers', 'marker_type' );
  # $class now has "Gramene::CDBI::Markers::MarkerType"

Turns a module's table name into it's Gramene::CDBI class.  Both arguments
are required.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.orgE<gt>

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
