#!/usr/local/bin/perl -w



=head1 NAME

import_dbxref.pl


=head1 DESCRIPTION

Imports tab-delimited data (DBXref) into Gramene Mutant schema.
The file type is automatically detected from the header line.  The
column names in the header line may contain spaces and mixed case, as
spaces will be converted to underscores and letters will be lowercased
(e.g., "DBXref Name" => "dbxref_name").


* The required fields in the header line are:
  DBXref_name, url


If the dbxref_name already exists in the database, the script will prompt the user whether to replace the url associated with that dbxref_name with the new url.

The url represents a generic url, e.g. /perl/pub_search?id=[%?%]

=head1 SYNOPSIS

import_dbxref.pl [options] <dbxref file>

 Options:
    --help              help message
    --man               full documentation
                                                                                


=head1 OPTIONS
                                                                                
=over 4
                                                                                
=item B<--help>
                                                                                
print a help message and exit
                                                                                
=item B<--man>
                                                                                
print documentation and exit
                                                                                
                                                                                
=back
                                                                                
=head1 ARGUMENTS

dbxref file                                                                                

=cut



use lib '/usr/local/gramene/lib/perl/';

use strict; 



use Text::RecordParser; 
use File::Temp qw/ tempfile/;

use Pod::Usage;
use Getopt::Long;
 
use Gramene::DB; 


# The script runs under -w; switch run-time warnings off again so that
# Text::RecordParser does not warn about empty strings.
local $^W = 0;

# Command-line handling.  --help (or -?) and --man print the usage text /
# full documentation and exit; at least one remaining argument (the import
# file) is required.
{
    my %opt = ( help => 0, man => 0 );

    unless ( GetOptions( \%opt, 'help|?', 'man' ) ) {
        pod2usage(2);
    }

    if ( $opt{man} ) {
        pod2usage( -verbose => 2 );
    }

    if ( $opt{help} ) {
        pod2usage(1);
    }

    if ( !@ARGV ) {
        pod2usage('No import file');
    }
}



# Path of the tab-delimited DBXref file given on the command line.
my $file = $ARGV[0];

# The Text::ParseWords module used in Text::RecordParser removes quotes,
# so the quotes have to be backslash-escaped before parsing.
# process_file() writes that escaped copy to a temporary file, which is
# removed when the program exits.
my $processed_file = process_file($file);

my $parser = Text::RecordParser->new(
    field_separator => qr/\t/,    # fields are separated by tabs
    filename        => $processed_file,
);

# Filter for the header fields: trim surrounding whitespace, convert inner
# whitespace runs to "_" and lowercase the name
# (e.g. "DBXref Name" => "dbxref_name").  The filter works on a lexical
# copy so that it never modifies the caller's $_.
$parser->header_filter(
    sub {
        my $name = shift;
        $name = '' unless defined $name;

        $name =~ s/^\s+|\s+$//g;
        $name =~ s/\s+/_/g;

        return lc $name;
    }
);

# Take the fields from the next row under the cursor (the header line) and
# use them as the field names.
$parser->bind_header;

# Lookup table of the field names bound via bind_header.
my %fields = map { $_ => 1 } $parser->field_list;



# Connect to the 'genes' database and switch AutoCommit off, so that
# import_dbxref() can commit or roll back the whole import as one unit.
my $db;

eval {
    # Without this check a false return value would be autovivified into a
    # plain hash reference by the AutoCommit assignment below, and the
    # failure would only show up later as a confusing method-call error.
    $db = Gramene::DB->new('genes')
        or die "no database handle returned\n";

    $db->{AutoCommit} = 0;    # set transaction control
};

if ($@) {
    die "DB connection failed: $@\n";
}

import_dbxref( $parser, $db );






# import_dbxref($parser, $db)
#
# Reads every record from $parser and stores it in the gene_dbxref table
# through the database handle $db.
#
#   * Header fields other than dbxref_name and url are reported with a
#     warning; records lacking either value are skipped with a warning.
#   * A record whose dbxref_name already exists (compared case-insensitively)
#     is only updated after the user confirms with "y" on STDIN; end of
#     input on STDIN counts as "no".
#   * New records get their id from next_id() and are inserted.
#
# All changes are committed once at the end.  If anything dies on the way,
# a warning is printed and the transaction is rolled back.
sub import_dbxref {

    my ( $parser, $db ) = @_;
    my @fields     = qw[dbxref_name url];
    my %acceptable = map { $_ => 1 } @fields;

    my ( $no_imported, $no_updated, $no_processed ) = ( 0, 0, 0 );

    print "Importing DBXREF\n\n";

    for my $field ( $parser->field_list ) {
        next if $acceptable{$field};
        warn "Unknown field: $field!\n";
    }

    # Remove double quotes and surrounding whitespace from every field value.
    # A lexical copy is used so the caller's $_ is left untouched.
    $parser->field_filter(
        sub {
            my $value = shift;
            return $value unless defined $value;

            $value =~ s/"//g;
            $value =~ s/^\s+|\s+$//g;

            return $value;
        }
    );

    # The block returns 1 on success, so failure is detected from the return
    # value of eval rather than from $@ alone.
    my $saved = eval {
        while ( my $record = $parser->fetchrow_hashref ) {
            $no_processed++;

            unless ( $record->{'dbxref_name'} && $record->{'url'} ) {
                warn "Skipping Line $no_processed, no dbxref name or url !\n";
                next;
            }

            my ( $dbxref_id, $dbxref_name, $url ) = $db->selectrow_array(
                q[
                    SELECT dbxref_id,dbxref_name,url
                    FROM   gene_dbxref
                    WHERE  UPPER(dbxref_name) =?
                ],
                {},
                uc( $record->{'dbxref_name'} )
            );

            # Column values in the order of @fields (dbxref_name, url).
            my @data = map { defined $record->{$_} ? $record->{$_} : '' } @fields;
            my $verb;

            # defined(), not truth: an existing row with id 0 is still a match.
            if ( defined $dbxref_id ) {

                print "\nDBXREF [$dbxref_name,$url] existing in database already !\n";

                print "\nDo you want to override it ? [y/n]\n";
                my $answer = <STDIN>;
                $answer = '' unless defined $answer;    # EOF: treat as "no"
                chomp($answer);

                if ( $answer =~ /^y/i ) {
                    $db->do(
                        q[
                            UPDATE gene_dbxref
                            SET    dbxref_name =?,
                                   url =?
                            WHERE  dbxref_id=?
                        ],
                        {},
                        ( @data, $dbxref_id )
                    );

                    $verb = 'Updated';
                    $no_updated++;
                }
            }
            else {
                $dbxref_id = next_id( $db, 'gene_dbxref', 'dbxref_id' );

                # Columns are named explicitly so the statement does not
                # depend on the physical column order of the table.
                $db->do(
                    q[
                        INSERT INTO gene_dbxref (dbxref_id, dbxref_name, url)
                             VALUES (?,?,?)
                    ],
                    {},
                    ( $dbxref_id, @data )
                );

                $verb = 'Inserted';
                $no_imported++;
            }

            # $verb stays undefined when the user declined an update.
            if ($verb) {
                print "$verb DBXref [",
                    join( ", ", map { defined $_ ? $_ : '' } ( $dbxref_id, @data ) ),
                    "]\n\n";
            }
        }

        $db->commit;

        print "Done.\nProcessed $no_processed records, ",
            "imported $no_imported, updated $no_updated.\n";

        1;
    };

    unless ($saved) {
        warn "Unable to save to database: $@\n";
        $db->rollback();
    }

}



# next_id($db, $table_name, $field_name)
#
# Returns one more than the current maximum of $field_name in $table_name,
# as reported by $db->selectrow_array.  Table and field names are
# interpolated directly into the SQL text.
# NOTE(review): the value is only read here, not reserved; whether
# concurrent writers can obtain the same id should be confirmed.
sub next_id {
    my $db         = shift;
    my $table_name = shift;
    my $field_name = shift;

    my $sql     = "select max($field_name) from $table_name";
    my $current = $db->selectrow_array($sql);

    return $current + 1;
}

# process_file($file)
#
# Copies $file line by line into a temporary file.  Each line is chomped,
# passed through clean_field(), and has its single and double quotes
# backslash-escaped before being written.
#
# Returns the name of the temporary file.  The file is created in the
# system temporary directory and is removed automatically at program exit
# (UNLINK => 1).
#
# Dies if the input file cannot be opened or the copy cannot be written.
sub process_file {

    my $file = shift;

    # TMPDIR => 1 keeps the temp file out of the current directory, which
    # may not be writable.
    my ( $out, $temp_file ) =
        tempfile( "tmpfileXXXXX", TMPDIR => 1, UNLINK => 1 );

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a pipe.
    open( my $in, '<', $file ) or die "can't open $file :$!";

    while ( my $line = <$in> ) {
        chomp $line;

        $line = clean_field($line);

        # clean_field() returns nothing for an empty line; keep the line
        # in the output as an empty one.
        $line = '' unless defined $line;

        $line =~ s/'/\\'/g;
        $line =~ s/"/\\"/g;

        print {$out} "$line\n"
            or die "can't write to $temp_file :$!";
    }

    close($in);

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "can't close $temp_file :$!";

    return $temp_file;

}

# clean_field($field)
#
# Returns a cleaned copy of $field:
#   * leading and trailing whitespace removed
#   * bytes 0x93/0x94 ("smart" double quotes)  -> "
#   * bytes 0x91/0x92 ("smart" single quotes)  -> '
#   * bytes 0x96/0x97 (dashes)                 -> --
#   * byte  0x85 (ellipsis)                    -> . . .
#   * every Windows line ending (\r\n)         -> \n
#
# Returns nothing (undef in scalar context) when $field is undefined or the
# empty string.  The string "0" is a valid value and is returned as is.
sub clean_field {
    my $field = shift;

    # Test definedness and length, not truth, so that "0" is not discarded.
    return unless defined $field && length $field;

    $field =~ s/^\s+|\s+$//g;

    # 0x93 (147) and 0x94 (148) are "smart" quotes
    $field =~ s/[\x93\x94]/"/g;

    # 0x91 (145) and 0x92 (146) are "smart" singlequotes
    $field =~ s/[\x91\x92]/'/g;

    # 0x96 (150) and 0x97 (151) are emdashes
    $field =~ s/[\x96\x97]/--/g;

    # 0x85 (133) is an ellipsis
    $field =~ s/\x85/. . ./g;

    # convert every Windows line ending, not only the first one
    $field =~ s/\r\n/\n/g;

    return $field;

}
