#!/usr/bin/perl

our $VERSION = '0.02';

# $Id: auxiliary_tables.pl,v 1.1.2.1 2007/06/14 19:03:44 kclark Exp $

=head1 NAME

auxiliary_tables.pl

=head1 SYNOPSIS

 auxiliary_tables.pl -config <config> -log <log> [-sql_file <sql file>] [-steps <steps>]

 config   : Config file (contains datasource, username and password)
 log      : Log file
 sql_file : SQL file (defaults to auxiliary_tables.sql included in the package)
 steps    : Steps to perform 

=head1 DESCRIPTION

This script processes GDPDM-based databases for display by Panzea code base.

=cut

use warnings;
use strict;

use lib '/usr/local/gramene-25/lib/perl/'; # MKFILE:Q:LIB

use Carp;
use Config::General;
use DBI;
use File::Slurp qw(slurp);
use Getopt::Long;
use Time::Format qw(%time);

use Panzea::WebFormUtils;

# Global variables
our $DBH;
our $LOG_FH;
our $NUMBER_OF_STEPS = 36;
our $SQL_FILE_CONTENT;

# Record usage
my $usage = <<USAGE;
$0 -config <config> -log <log> [-sql_file <sql file>] [-steps <steps>]
(-sql_file defaults to auxiliary_tables.sql)
USAGE

# Parse parameters
my $config;
my $log;
my $sql_file = 'auxiliary_tables.sql';
my $steps;

my $result = GetOptions(
    "config=s"   => \$config,
    "log=s"      => \$log,
    "sql_file=s" => \$sql_file,
    "steps=s"    => \$steps
) or croak("Usage: $usage\n");

die("Usage: $usage\n") unless ($config && $log && ($sql_file || $steps));

# Load sql file into memory
$SQL_FILE_CONTENT = slurp($sql_file)
  or croak("Cannot read file ($sql_file): $!");

croak("Empty sql file ($sql_file)!") unless $SQL_FILE_CONTENT;

# Scan sql file and extract steps
my @steps;
if ($steps) {
    @steps = split(',', $steps);
}
else {
    @steps = extract_steps();
    if (@steps != $NUMBER_OF_STEPS) {
        croak(
            "Unexpected number of steps (@steps), instead of $NUMBER_OF_STEPS!"
        );
    }
}

# Parse config file
my $config_obj = Config::General->new(-ConfigFile => $config);
my %config = $config_obj->getall;

my $datasource = $config{"datasource"};
my $username   = $config{"username"};
my $password   = $config{"password"};

# Connect to database
my $dbh = DBI->connect(
    $datasource, $username, $password,
    {PrintError => 1, RaiseError => 1}
) or croak("Cannot connect to database!");

$DBH = $dbh;

# Open log file
open(LOG, ">$log") or croak("Cannot write file ($log): $!\n");

$LOG_FH = \*LOG;

# Initialize log file
log_n("datasource: $datasource");
log_n("username:   $username");
log_n("password:   [removed]");
log_n("sql_file:   $sql_file");
log_n("");

# Run step
foreach my $step (@steps) {
    log_n("");
    log_n("");
    if ($step =~ /^sub:(\S+)/) {
        my $sub = $1;
        log_n("Running sub ($sub) ...");
        my $start_time = time;
        no strict 'refs';
        &{$sub};
        use strict 'refs';
        my $elapsed_time = time - $start_time;
        log_n("Done! - elapsed: $elapsed_time sec");
    }
    else {
        my $statement = get_statement_by_label($step);
        my $dbh       = $DBH;
        my $formatted_statement =
          qq[\n    ] . join(qq[\n    ], split("\n", $statement));
        log_n(  "Running statement [$step]:"
              . "\n+---------------------------------------------------------+\n"
              . $formatted_statement
              . "\n+---------------------------------------------------------+\n"
        );
        my $start_time   = time;
        my $row_count    = $dbh->do($statement) + 0;
        my $elapsed_time = time - $start_time;
        log_n(
            "step: $step done! - rows affected: $row_count; elapsed: $elapsed_time sec");
        show_warnings($step);    
    }
}

# [END]

sub get_statement_by_label {
    my ($step) = @_;

    my ($statement) =
      $SQL_FILE_CONTENT =~ /--\s*LABEL:$step\s*\n(.+?)--\s*END/s;

    croak("Cannot find statement for step ($step)!") unless $statement;

    return $statement;
}

sub extract_steps {
    my ($file) = @_;

    my @steps;
    while ($SQL_FILE_CONTENT =~ /--\s*LABEL:(\S+)/g) {
        my $step = $1;

        my $statement = get_statement_by_label($step);

        push @steps, $step;
    }

    return @steps;
}

sub show_warnings {
    my ($step) = @_;
    
    my $dbh = $DBH;

    my $sth = $dbh->prepare('SHOW WARNINGS');
    $sth->execute;

    my @warnings;
    while (my @ary = $sth->fetchrow_array) {
        push @warnings, join(' ', @ary);
    }
    
    if (@warnings) {
        foreach (@warnings) {
            log_n("** $step mysql_warning ** $_");
        }
    }
    else {
        log_n("** No warnings for step: $step **");                
    }
    
    return 1;
}

sub format_genotypes {
    my $dbh = $DBH;

    my $count_statement = qq[SELECT count(*) FROM aux_genotype_by_accession];
    my $count_sth       = $dbh->prepare($count_statement);
    $count_sth->execute;

    my ($total_count) = $count_sth->fetchrow_array;

    my $update_statement = qq[UPDATE aux_genotype_by_accession
                              SET sorted_genotype = ?, formatted_genotype = ?, resolved_genotype = ?
                              WHERE aux_genotype_by_accession_id = ?
                              ];

    my $update_sth = $dbh->prepare($update_statement);

    my $select_statement =
      qq[SELECT aux_genotype_by_accession_id, marker_type, genotype_string
                              FROM aux_genotype_by_accession];

    my $select_sth = $dbh->prepare($select_statement);
    $select_sth->execute;

    my $counter = 0;
    while (
        my ($aux_genotype_by_accession_id, $marker_type, $genotype_string) =
        $select_sth->fetchrow_array) {
        $counter++;
        log_n("counter: $counter of $total_count\n") if $counter % 10000 == 0;

        my ($sorted_genotype, $formatted_genotype, $resolved_genotype) =
          _format_genotype($marker_type, $genotype_string);

        if (!defined $sorted_genotype or !defined $formatted_genotype) {
            log_n("Error: Cannot format [$genotype_string]!");
        }

        $update_sth->execute(
            $sorted_genotype,   $formatted_genotype,
            $resolved_genotype, $aux_genotype_by_accession_id
        );
    }

    return 1;
}

sub _format_genotype {
    my ($marker_type, $genotype_string) = @_;

    # Sort
    my @pairs =
      sort split(',', $genotype_string);    # split and sort by assay_id
    my $sorted_genotype = join(',', @pairs);

    # Check errors
    my $error = 0;

    # Check whether each assay has 2 and only 2 values
    my %assay_ids;
    foreach my $pair (@pairs) {
        $pair = uc $pair;

        my ($assay_id, $value) = split(':', $pair);

        if ($value !~ /^\d+$/ and $value !~ /^[ATCGN]$/ and $value ne 'ND')
        {    # Null value
            log_n(
                "Warning: Invalid value [$value] in [$genotype_string], skipping ..."
            );
            next;
        }

        push @{$assay_ids{$assay_id}}, $value;
    }

    foreach my $assay_id (keys %assay_ids) {
        if (@{$assay_ids{$assay_id}} != 2) {
            log_n(
                "Error: Invalid value count/assay id ($assay_id) in [$genotype_string]!"
            );
            $error = 1;
        }
    }

    # Format
    my $formatted_genotype = $marker_type eq 'SSR' ? 'ND,ND' : 'N,N';

    my %single_genotypes;
    my $total_count;
    foreach my $assay_id (keys %assay_ids) {
        my $single_genotype = join(',', sort @{$assay_ids{$assay_id}});
        next
          if ($single_genotype eq 'N,N' or $single_genotype eq 'ND,ND')
          ;    # Ignore N,N, ND,ND
        $single_genotypes{$single_genotype}++;
        $total_count++;
    }

    foreach my $single_genotype (keys %single_genotypes) {
        my $ratio = $single_genotypes{$single_genotype} / $total_count;
        $formatted_genotype = $single_genotype if $ratio > 0.5;
    }

    # Resove
    my $resolved_genotype =
      ($formatted_genotype eq 'ND,ND' or $formatted_genotype eq 'N,N')
      ? undef
      : $formatted_genotype;

    return $error
      ? (undef, undef, undef)
      : ($sorted_genotype, $formatted_genotype, $resolved_genotype);
}

sub process_map_info {
    my $dbh = $DBH;

    my $select_statement = qq[SELECT aux_map_info_id,
                              ibm2_2005_chr, ibm2_2005_position, all_ibm2_2005_positions,
                              ibm2_2005_bin, all_ibm2_2005_bins,
                              fpc_contig, fpc_chr, fpc_start, fpc_stop, all_fpc_positions
                              FROM aux_map_info];

    my $select_sth = $dbh->prepare($select_statement);
    $select_sth->execute;

    my $update_statement = qq[UPDATE aux_map_info
                              SET    ibm2_2005_chr           = ?,
                                     ibm2_2005_position      = ?,
                                     all_ibm2_2005_positions = ?,
                                     ibm2_2005_bin           = ?,
                                     all_ibm2_2005_bins      = ?,
                                     fpc_contig              = ?,
                                     fpc_chr                 = ?,
                                     fpc_start               = ?,
                                     fpc_stop                = ?,
                                     all_fpc_positions       = ?,
                                     sort_ibm2_2005_bin      = ?,
                                     sort_ibm2_2005_chr      = ?,
                                     sort_ibm2_2005_position = ?,
                                     sort_fpc_contig         = ?,
                                     sort_fpc_chr            = ?,
                                     sort_fpc_start          = ?,
                                     sort_fpc_stop           = ?
                              WHERE  aux_map_info_id = ?];

    my $update_sth = $dbh->prepare($update_statement);

    while (
        my ($aux_map_info_id,
            $ibm2_2005_chr,
            $ibm2_2005_position,
            $all_ibm2_2005_positions,
            $ibm2_2005_bin,
            $all_ibm2_2005_bins,
            $fpc_contig,
            $fpc_chr,
            $fpc_start,
            $fpc_stop,
            $all_fpc_positions
        )
        = $select_sth->fetchrow_array
      ) {

        # New values
        my $new_ibm2_2005_chr;
        my $new_ibm2_2005_position;
        my $new_all_ibm2_2005_positions;
        my $new_ibm2_2005_bin;
        my $new_all_ibm2_2005_bins;
        my $new_fpc_contig;
        my $new_fpc_chr;
        my $new_fpc_start;
        my $new_fpc_stop;
        my $new_all_fpc_positions;

        # Sort (new) values
        my $sort_ibm2_2005_bin;
        my $sort_ibm2_2005_chr;
        my $sort_ibm2_2005_position;
        my $sort_fpc_contig;
        my $sort_fpc_chr;
        my $sort_fpc_start;
        my $sort_fpc_stop;

        # Fix IBM2 Positions
        if (!defined $all_ibm2_2005_positions) {
            $new_ibm2_2005_chr           = $ibm2_2005_chr;
            $new_ibm2_2005_position      = $ibm2_2005_position;
            $new_ibm2_2005_bin           = $ibm2_2005_bin;
            $new_all_ibm2_2005_positions = $all_ibm2_2005_positions;
            $new_all_ibm2_2005_bins      = $all_ibm2_2005_bins;
        }        
        elsif ($all_ibm2_2005_positions !~ /,/) {
            ($new_ibm2_2005_chr, $new_ibm2_2005_position) =
              split(/\|/, $all_ibm2_2005_positions);
            $new_ibm2_2005_bin           = $all_ibm2_2005_bins;
            $new_all_ibm2_2005_positions = $all_ibm2_2005_positions;
            $new_all_ibm2_2005_bins      = $all_ibm2_2005_bins;
        }
        else {
            $new_ibm2_2005_chr           = 'conflict';
            $new_ibm2_2005_position      = undef;
            $new_ibm2_2005_bin           = undef;
            $new_all_ibm2_2005_positions = $all_ibm2_2005_positions;
            $new_all_ibm2_2005_bins      = $all_ibm2_2005_bins;
        }

        # Add sort columns for IBM2 Positions
        if (!defined $new_ibm2_2005_chr) { $sort_ibm2_2005_chr = 1000; }
        elsif ($new_ibm2_2005_chr eq 'conflict') {
            $sort_ibm2_2005_chr = 999;
        }
        else { $sort_ibm2_2005_chr = $new_ibm2_2005_chr + 0; }

        $sort_ibm2_2005_bin =
          defined $new_ibm2_2005_bin ? $new_ibm2_2005_bin + 0 : 1000;
        $sort_ibm2_2005_position =
          defined $new_ibm2_2005_position
          ? $new_ibm2_2005_position + 0
          : 1000;

        # Fix FPC Contigs
        if (!defined $all_fpc_positions) {
            $new_fpc_contig        = $fpc_contig;
            $new_fpc_chr           = $fpc_chr;
            $new_fpc_start         = $fpc_start;
            $new_fpc_stop          = $fpc_stop;
            $new_all_fpc_positions = $all_fpc_positions;
        }        
        elsif ($all_fpc_positions !~ /,/) {
            ($new_fpc_contig, $new_fpc_chr, $new_fpc_start, $new_fpc_stop) =
              split(/\|/, $all_fpc_positions);
            $new_all_fpc_positions = $all_ibm2_2005_positions;
        }
        else {
            $new_fpc_contig        = 'conflict';
            $new_fpc_chr           = undef;
            $new_fpc_start         = undef;
            $new_fpc_stop          = undef;
            $new_all_fpc_positions = $all_fpc_positions;
        }

        # Add sort columns for FPC Positions
        if (!defined $new_fpc_contig || $new_fpc_contig eq '') { 
            $sort_fpc_contig = 1000; 
        }
        elsif ($new_fpc_contig eq 'conflict') { 
            $sort_fpc_contig = 999; 
        }
        else { 
            $new_fpc_contig =~ /(\d+)/; 
            $sort_fpc_contig = $1 + 0; 
        }

        $sort_fpc_chr   = defined $new_fpc_chr && $new_fpc_chr ne ''  
                        ? $new_fpc_chr + 0   
                        : 1000;
        
        $sort_fpc_start = defined $new_fpc_start && $new_fpc_start ne ''  
                        ? $new_fpc_start + 0 
                        : 1000;
        
        $sort_fpc_stop  = defined $new_fpc_stop && $new_fpc_stop ne ''   
                        ? $new_fpc_stop + 0  
                        : 1000;

        # Update
        $update_sth->execute(
            $new_ibm2_2005_chr,
            $new_ibm2_2005_position,
            $new_all_ibm2_2005_positions,
            $new_ibm2_2005_bin,
            $new_all_ibm2_2005_bins,
            $new_fpc_contig,
            $new_fpc_chr,
            $new_fpc_start,
            $new_fpc_stop,
            $new_all_fpc_positions,
            $sort_ibm2_2005_bin,
            $sort_ibm2_2005_chr,
            $sort_ibm2_2005_position,
            $sort_fpc_contig,
            $sort_fpc_chr,
            $sort_fpc_start,
            $sort_fpc_stop,
            $aux_map_info_id
        );

        #    log_n("******************************************************************************");
        #    log_n("new_ibm2_2005_chr            $ibm2_2005_chr              TO $new_ibm2_2005_chr");
        #    log_n("new_ibm2_2005_position       $ibm2_2005_position         TO $new_ibm2_2005_position");
        #    log_n("new_all_ibm2_2005_positions  $all_ibm2_2005_positions    TO $new_all_ibm2_2005_positions");
        #    log_n("new_ibm2_2005_bin            $ibm2_2005_bin              TO $new_ibm2_2005_bin");
        #    log_n("new_all_ibm2_2005_bins       $all_ibm2_2005_bins         TO $new_all_ibm2_2005_bins");
        #    log_n("new_fpc_contig               $fpc_contig                 TO $new_fpc_contig");
        #    log_n("new_fpc_chr                  $fpc_chr                    TO $new_fpc_chr");
        #    log_n("new_fpc_start                $fpc_start                  TO $new_fpc_start");
        #    log_n("new_fpc_stop                 $fpc_stop                   TO $new_fpc_stop");
        #    log_n("new_all_fpc_positions        $all_fpc_positions          TO $new_all_fpc_positions");
        #    log_n("sort_ibm2_2005_bin           $sort_ibm2_2005_bin");
        #    log_n("sort_ibm2_2005_chr           $sort_ibm2_2005_chr");
        #    log_n("sort_ibm2_2005_position      $sort_ibm2_2005_position");
        #    log_n("sort_fpc_contig              $sort_fpc_contig");
        #    log_n("sort_fpc_chr                 $sort_fpc_chr");
        #    log_n("sort_fpc_start               $sort_fpc_start");
        #    log_n("sort_fpc_stop                $sort_fpc_stop");
        #    log_n("aux_map_info_id              $aux_map_info_id");
        #    log_n("******************************************************************************");

    }

    return 1;
}

sub process_genotypes_gmap {
    my $dbh = $DBH;

    my $code2sampstat = Panzea::WebFormUtils::code2sampstat();

    my $update_statement =
      qq[UPDATE aux_assay_plant_genotype SET germplasm_type = ?
                              WHERE sampstat = ?];

    my $update_sth = $dbh->prepare($update_statement);

    foreach my $sampstat (keys %$code2sampstat) {
        my $germplasm_type = $code2sampstat->{$sampstat};

        log_n(
            "Converting sampstat ($sampstat)to germplasm_type ($germplasm_type) ..."
        );

        $update_sth->execute($germplasm_type, $sampstat);

        log_n("converted!");
    }

    return 1;
}

sub make_genotype_counts {
    my $dbh = $DBH;

    # Get distinct accename
    my $accename_sth =
      $dbh->prepare(
        qq[SELECT DISTINCT accename, marker_type FROM aux_genotype_by_accession]
      );
    $accename_sth->execute;

    my @info;
    while (my ($accename, $marker_type) = $accename_sth->fetchrow_array) {
        push @info, [$accename, $marker_type];
    }

    # Prepare select statement
    my $query_sth = $dbh->prepare(
        qq[
                    INSERT INTO `aux_genotype_by_accession_count`
                    (accename1, accename2, marker_type, count_accename)

                    SELECT agba_a.accename, agba_b.accename, agba_a.marker_type, count(*)
                    FROM
                         aux_genotype_by_accession agba_a
                    JOIN aux_genotype_by_accession agba_b ON (agba_b.cdv_marker_id = agba_a.cdv_marker_id)
                    WHERE
                        agba_a.marker_type  = ?
                    AND agba_a.accename     = ?
                    AND agba_b.accename    != ?
                    GROUP by agba_a.accename, agba_b.accename, agba_a.marker_type
                    ]
    );

    # Loop through accenames and execute query statement
    foreach my $i (0 .. $#info) {
        my ($accename, $marker_type) = @{$info[$i]};

        $query_sth->bind_param(1, $marker_type);
        $query_sth->bind_param(2, $accename);
        $query_sth->bind_param(3, $accename);

        my $start_time = time;

        $query_sth->execute;

        my $elapsed_time = time - $start_time;

        log_n(
            "Completed accename: $accename marker_type: $marker_type; $i of $#info (elapsed_time: $elapsed_time sec)"
        );
    }

    return 1;
}

sub log_n {
    my ($message) = @_;
    $message =~ s/\n+$//;

    my $log_fh    = $LOG_FH;
    my $timestamp = '[' . $time{"dd-Mon-yy hh:mm:ss"} . ']';

    my $log_line = "$timestamp $message\n";

    print $log_line;
    print $log_fh $log_line;

    return 1;
}

=head1 AUTHOR

Payan Canaran <canaran@cshl.edu>

=head1 BUGS

=head1 VERSION

Version 0.02

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright (c) 2006-2007 Cold Spring Harbor Laboratory

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.

=cut

1;
