#!/usr/local/bin/perl -w

=head1 NAME

import_gene_ontology_association.pl


=head1 DESCRIPTION

Imports tab-delimited data (Gene phenotype association) into Gramene Gene schema.
The file type is automatically detected from the header line.  The
column names in the header file may contain spaces and mixed case as
spaces will be converted to underscores and letters will be lowercased
(e.g., "Gene Accession" => "gene_accession").


* The required fields in the header line are:
  gene_accession

* The optional fields in the header line are:
  TO, PO Structure, PO Growth Stage, GRO Growth Stage, GO Function, GO Process, GO Component, EO

Multiple term accesssions can be listed in one field delimited by comma


=head1 SYNOPSIS

import_gene_ontology_association.pl [options] <gene phenotype association file>

 Options:
    --help              help message
    --man               full documentation
                                                                                


=head1 OPTIONS
                                                                                
=over 4
                                                                                
=item B<--help>
                                                                                
print a help message and exit
                                                                                
=item B<--man>
                                                                                
print documentation and exit
                                                                                
                                                                                
=back
                                                                                
=head1 ARGUMENTS

gene ontology association file                                                                             

=cut

use lib '/usr/local/gramene/lib/perl/';
use strict;

use Text::RecordParser;
use File::Temp qw/ tempfile/;

use Pod::Usage;
use Getopt::Long;

use Gramene::DB;

use constant {
    GENE_ACCESSION_PREFIX => 'GR:',
    GENE_ACCESSION_LENGTH => 7,
    TERM_ACCESSION_LENGTH => 7
};

local $^W = 0;  # to turn off the warning of empty string in  Text::RecordParser

{               #Argument Processing
    my $help = 0;
    my $man  = 0;
    GetOptions( "help|?" => \$help, "man" => \$man )
      or pod2usage(2);
    pod2usage( -verbose => 2 ) if $man;
    pod2usage(1)                if $help;
    pod2usage('No import file') if ( scalar(@ARGV) < 1 );
}

my $file = $ARGV[0];

my %term_types = ( 
    'go_component' => {'Cellular Component' => 'GO:'},
    'go_function' => {'Molecular Function' => 'GO:'},
    'go_process' => {'Biological Process' => 'GO:'},
    'to' => {'Trait' => 'TO:'},
    'po_structure' => {'Plant Structure' => 'PO:' },
    'po_growth_stage' => {'Plant Growth and Development Stage' => 'PO:'},
    'gro_growth_stage' => { 'Cereal Plant Growth Stage' => 'GRO:'},
    'eo' => {'Enviroment' => 'EO:' }
);

# The Text::ParseWords modules used in Text::RecordParser will remove the quotes
# we have to add backslash to the quotes for parsing quotes
# the processed_file is a temple file, it will be removed when program exist
my $processed_file = process_file($file);

my $parser = Text::RecordParser->new(
    field_separator => qr/\t/,    #separate the fields by "\t",remove escape
    filename => $processed_file,
);

#a filter for the header field: convert the space to _ and the data to lower case
$parser->header_filter( sub { $_ = shift; s/"//g; s/\s+/_/g; lc $_ } );

#Takes the fields from the next row under the cursor and assigns the field names to the values.
$parser->bind_header;

#Returns the fields bound via bind_fields (or bind_header) and convert to hash.
my %fields = map { $_, 1 } $parser->field_list;

my ( $db, $ot_db );
eval {
    $db = Gramene::DB->new('genes_edit');
    $db->{AutoCommit} = 0;    # set transaction control
};

if ($@) {
    die "DB connection failed: $@\n";
}

my @req_fields = qw[gene_accession];
my @opt_fields = keys %term_types; 

my %acceptable = map { $_, 1 } ( @req_fields, @opt_fields );

our ( $no_imported, $no_processed ) = ( 0, 0 );
our @warning_to_checks;

print "\n***** Importing Gene Ontology Association *****\n\n";

for my $field ( $parser->field_list ) {
    next if $acceptable{$field};
    warn "\nUnknown field: $field!\n";
}

$parser->field_filter( sub { $_ = shift; s/"//g; s/^\s+|\s+$//g; s/\s+//g; $_ } );

eval {
    while ( my $record = $parser->fetchrow_hashref )
    {
        $no_processed++;
        my @missing;
        for my $field (@req_fields) {
            push @missing, $field unless defined $record->{$field};
        }

        my $onto_exist = 0;
        for my $field (@opt_fields) {
            $onto_exist = 1 if $record->{$field};
        }
        push @missing, "ontology" unless $onto_exist;

        if (@missing) {
            warn "Line $no_processed: Missing ",
              join( ', ', map { qq["$_"] } @missing ), ".  Skipping.\n";
            next;
        }

        my $mu_acc = $record->{'gene_accession'};
        $mu_acc = process_accession( GENE_ACCESSION_PREFIX, GENE_ACCESSION_LENGTH, $mu_acc );

        my ($gene_id) = $db->selectrow_array(
            q[
                                           SELECT gene_id
					     FROM gene_gene
					    WHERE accession =?
					  ],
            {},
            ($mu_acc)
        );

        unless ($gene_id) {
            warn "\nLine $no_processed gene gene $mu_acc is not in database. Skipping.\n";
            push @warning_to_checks, "\nLine $no_processed gene gene $mu_acc is not in database. Skipping.\n";
            next;
        }
	
	foreach my $type_header (keys %term_types){
	    my ($term_type) = keys %{$term_types{$type_header}};
	    my $term_prefix = $term_types{$type_header}->{$term_type};

	    #maybe multiple terms delimited by ,
	    my @term_accs = split /,/, $record->{$type_header};
	    if(@term_accs){
		&load_term_assocs( $gene_id, $mu_acc, 'gene',$term_type, $term_prefix, TERM_ACCESSION_LENGTH, @term_accs);
	    }
	}

    }
    $db->commit;

    print "\n\n\nDone.\nProcessed $no_processed  gene ontology association records, ",
	"imported $no_imported.\n\n";
};

if ($@) {
    warn "Unable to save to database: $@\n";
    $db->rollback();
}

if ( @warning_to_checks && $no_imported > 50 ) {    #for visiblity reason
    print "Please check the following lines:\n";
    foreach (@warning_to_checks) {
        print $_;
    }
}

sub next_id {
    my ( $db, $table_name, $field_name ) = @_;
    my $id = $db->selectrow_array("select max($field_name) from $table_name");
    return $id + 1;
}

sub load_term_assocs {
    my ( $gene_id, $mu_acc, $obj_type, $term_type, $term_prefix, $term_acc_length,  @term_accs ) = @_;

    my %uniq_terms = map { $_, 1 } map { s/^\s+|\s+$//g; $_ } @term_accs;
    foreach my $term ( keys %uniq_terms ) {

        #
        #validate the term
	#


        #
        if ( $term =~ /^(\w+:)(\d+)$/ ) {

            unless ( $1 eq $term_prefix) {
                warn "\nLine $no_processed term [", $term,
                  "]'s  syntax is not wrong. Skipping.\n";
                push @warning_to_checks, "\nLine $no_processed term [", $term,
                  "]'s  syntax is not wrong. Skipping.\n";
                next;
            }

	    $term = process_accession($term_prefix, $term_acc_length, $term);

        }
        else {
            warn "\nLine $no_processed term [", $term,
              "]'s  type is not defined. Skipping.\n";
            push @warning_to_checks, "\nLine $no_processed term [", $term,
              "]'s  type is not defined. Skipping.\n";
            next;

        }

        my ($gene_term_assoc_id) = $db->selectrow_array(
            q[
                                           SELECT ontology_association_id
					     FROM gene_ontology_association
					    WHERE object_id=? 
					    AND term_accession =?
					    AND object_table = ?
					  ],
            {},
            ( $gene_id, $term,$obj_type )
        );
        if ($gene_term_assoc_id) {
            print
		"\nLine $no_processed association [$mu_acc,$term] exist in db already. Skipping\n";
        }
        else {


	    $gene_term_assoc_id = next_id( $db, 'gene_ontology_association', 'ontology_association_id' );

                $db->do(
                    q[
                           INSERT INTO  gene_ontology_association
			   (ontology_association_id,object_id,object_table,term_accession,term_type)
                           VALUES (?,?,?,?,?)
                         ],
                    {},
                    ( $gene_term_assoc_id, $gene_id,$obj_type, $term, $term_type )

                );
                $no_imported++;

                print "\nInsert association [$mu_acc,$term].\n";

	}

    }
}

sub process_accession {

    my ( $prefix, $length, $acc ) = @_;
    $acc =~ s/^$prefix//;
    $acc =~ s/^0+//;

    my $str_len = length($acc);
    my $num     = $length - $str_len;
    for my $i ( 1 .. $num ) {
        $acc = '0' . $acc;
    }

    $acc = $prefix . $acc;
    return $acc;

}

sub process_file {

    my $file = shift;
    my ( $fh, $temp_file ) = tempfile( "tmpfileXXXXX", UNLINK => 1 );

    open( RAW, $file ) or die "can't open $file :$!";
    while (<RAW>) {
        chomp;
        $_ = clean_field($_);

        $_ =~ s/'/\\'/g;
        $_ =~ s/"/\\"/g;
        print $fh "$_\n";
    }
    close(RAW);
    close($fh);

    return $temp_file;

}

sub clean_field {
    my $field = shift;
    return unless $field;

    $field =~ s/^\s+|\s+$//g;

    # 0x93 (147) and 0x94 (148) are "smart" quotes
    $field =~ s/[\x93\x94]/"/g;

    # 0x91 (145) and 0x92 (146) are "smart" singlequotes
    $field =~ s/[\x91\x92]/'/g;

    # 0x96 (150) and 0x97 (151) are emdashes
    $field =~ s/[\x96\x97]/--/g;

    # 0x85 (133) is an ellipsis
    $field =~ s/\x85/. . ./g;

    # convert line ending on windows
    $field =~ s/\r\n/\n/;

    return $field;

}
