#!/usr/local/bin/perl -w



=head1 NAME

import_gene_dbxref.pl


=head1 DESCRIPTION

Imports tab-delimited data (Gene dbxref) into Gramene Gene schema.
The file type is automatically detected from the header line.  The
column names in the header file may contain spaces and mixed case as
spaces will be converted to underscores and letters will be lowercased
(e.g., "Gene Accession" => "gene_accession").


* The required fields in the header line are:
  gene_accession, dbxref_db_name

Each dbxref column header has the form "dbxref_" followed by the dbxref
database name, for example "dbxref_Gramene Protein".

Multiple dbxrefs can be listed in one field, separated by commas.


If the gene accession does not exist in the database, the entry will be skipped.

If the dbxref db name does not exist in the database, the entry will be skipped.


=head1 SYNOPSIS

import_gene_dbxref.pl [options] <gene dbxref file>

 Options:
    --help              help message
    --man               full documentation
                                                                                


=head1 OPTIONS
                                                                                
=over 4
                                                                                
=item B<--help>
                                                                                
print a help message and exit
                                                                                
=item B<--man>
                                                                                
print documentation and exit
                                                                                
                                                                                
=back
                                                                                
=head1 ARGUMENTS

gene dbxref file                                                                                

=cut




use lib '/usr/local/gramene/lib/perl/';

use strict; 

use Text::RecordParser; 
use File::Temp qw/ tempfile/;

use Pod::Usage;
use Getopt::Long;
 
use Gramene::DB; 


# Shape of a normalised gene accession, as passed to process_accession():
# the prefix followed by a number that is left-padded with zeros to the
# given length.
use constant {
    GENE_ACCESSION_PREFIX => 'GR:',
    GENE_ACCESSION_LENGTH => 7
};

local $^W=0; # to turn off the warning of empty string in  Text::RecordParser


# Argument processing: --help / --man print documentation and exit; at
# least one positional argument (the import file) is required.
{
    my $help = 0;
    my $man  = 0;
    GetOptions( "help|?" => \$help, "man" => \$man )
      or pod2usage(2);
    pod2usage( -verbose => 2 ) if $man;
    pod2usage(1) if $help;
    pod2usage('No import file') if ( scalar(@ARGV) < 1 );
}


my $file = $ARGV[0];


# The Text::ParseWords module used in Text::RecordParser will remove quotes,
# so the quotes have to be backslash-escaped before parsing.
# $processed_file is a temporary file; it is removed when the program exits.
my $processed_file = process_file($file);


my $parser = Text::RecordParser->new(
    field_separator => qr/\t/,    # fields are separated by a tab
    filename        => $processed_file,
);


# Header filter: convert whitespace runs to "_" and lowercase the name.
# A lexical is used so the caller's $_ is left untouched.
$parser->header_filter(
    sub {
        my $name = shift;
        $name =~ s/\s+/_/g;
        return lc $name;
    }
);


# Take the fields from the next row under the cursor (the header line) and
# use them as the field names for the following rows.
$parser->bind_header;


# Connect to the database with transaction control enabled.  The eval block
# returns a true value on success so that failure is detected even if $@
# has been clobbered, and an undefined handle is reported here rather than
# surfacing later as a method call on a non-object.
my $db;
my $connected = eval {
    $db = Gramene::DB->new('genes_edit');
    die "no database handle returned\n" unless $db;
    $db->{AutoCommit} = 0;    # set transaction control
    1;
};
if ( !$connected ) {
    my $error = $@ || 'unknown error';
    die "DB connection failed: $error\n";
}

import_dbxrefs( $parser, $db );




# import_dbxrefs( $parser, $db )
#
# Reads every remaining record from $parser (a Text::RecordParser whose
# header has already been bound) and stores the dbxref values of each record
# in gene_dbxref_to_object.
#
#   * Header fields named "dbxref_<name>" are matched case-insensitively
#     against gene_dbxref.dbxref_name, both as written and with underscores
#     replaced by spaces.  Fields that cannot be matched are reported with a
#     warning and ignored.
#   * Records without a gene_accession, or whose accession is not found in
#     gene_gene, are skipped with a warning.
#   * A dbxref field may hold several comma-separated values.  Values are
#     trimmed, inner whitespace is collapsed, and empty or repeated values
#     are ignored.
#   * A value already linked to the gene is reported and left alone.
#
# All inserts are committed together at the end.  If anything dies while
# processing, a warning is issued and the transaction is rolled back.
# Returns nothing meaningful.
sub import_dbxrefs {

    my ( $parser, $db ) = @_;
    my @req_fields = qw[gene_accession];

    my %acceptable = map { $_, 1 } @req_fields;

    my ( $no_imported, $no_processed ) = ( 0, 0 );

    # Maps the part of the header after "dbxref_" to its dbxref_id.
    my %dbxref_fields;

    print "\n***** Importing Gene DBXref *****\n\n";

    for my $field ( $parser->field_list ) {
        next if $acceptable{$field};

        # Anchored so that only headers that really start with "dbxref_"
        # are treated as dbxref columns.
        if ( $field =~ /^dbxref_(.+)/ ) {
            my $dbx = $1;
            print "dbx=$field\n";

            # Header filtering turned spaces into underscores, so also try
            # the name with the spaces restored.
            my $dbx2 = $dbx;
            $dbx2 =~ s/_/ /g;

            my $dbxref_id = $db->selectrow_array(
                q[
		     SELECT dbxref_id FROM gene_dbxref
		      WHERE UPPER(dbxref_name) = ?
		      OR    UPPER(dbxref_name) = ?
		   ], {}, ( uc($dbx), uc($dbx2) )
            );
            if ($dbxref_id) {
                $dbxref_fields{$dbx} = $dbxref_id;
            }
            else {
                warn "\nUnknown field: $field!\n";
            }
        }
        else {
            warn "\nUnknown field: $field!\n";
        }
    }

    # Strip double quotes and surrounding whitespace from every data field.
    $parser->field_filter(
        sub {
            my $value = shift;
            $value =~ s/"//g;
            $value =~ s/^\s+|\s+$//g;
            return $value;
        }
    );

    # The eval block ends with a true value so that a failure is detected
    # even when $@ happens to be empty.
    my $ok = eval {
        RECORD:
        while ( my $record = $parser->fetchrow_hashref ) {
            $no_processed++;

            # An empty required field is as unusable as an absent one: it
            # would otherwise be normalised to an all-zero accession.
            my @missing =
              grep { !defined $record->{$_} || $record->{$_} eq '' }
              @req_fields;

            if (@missing) {
                warn "Line $no_processed: Missing ",
                  join( ', ', map { qq["$_"] } @missing ), ".  Skipping.\n";
                next RECORD;
            }

            my $mu_acc = process_accession( GENE_ACCESSION_PREFIX,
                GENE_ACCESSION_LENGTH, $record->{'gene_accession'} );

            my ($gene_id) = $db->selectrow_array(
                q[
                                           SELECT gene_id
					     FROM gene_gene
					    WHERE accession =?
					  ],
                {},
                ($mu_acc)
            );

            unless ($gene_id) {
                warn "\nLine $no_processed gene gene $mu_acc is not in database. Skipping.\n";
                next RECORD;
            }

            DBXREF_FIELD:
            foreach my $dbxref_field ( keys %dbxref_fields ) {

                my $dbxref_id = $dbxref_fields{$dbxref_field};

                # Short rows may not carry this column at all.
                my $raw_value = $record->{"dbxref_$dbxref_field"};
                next DBXREF_FIELD unless defined $raw_value;

                # The field may hold multiple values delimited by ",".
                my %seen;
                foreach my $piece ( split /,/, $raw_value ) {
                    my $db_val = $piece;
                    $db_val =~ s/^\s+|\s+$//g;
                    $db_val =~ s/\s+/ /g;

                    # Skip blank items ("a,,b") and values already handled
                    # for this record; whitespace is normalised first so
                    # that "a  b" and "a b" count as the same value.
                    next if $db_val eq '' || $seen{$db_val}++;

                    my $table_name = 'gene';
                    my ($dbxref_to_obj_id) = $db->selectrow_array(
                        q[
			   SELECT dbxref_to_object_id
			     FROM gene_dbxref_to_object
			    WHERE dbxref_id = ? 
			      AND table_name = ?
			      AND record_id = ?
			      AND dbxref_value = ?
			  ], {}, ( $dbxref_id, $table_name, $gene_id, $db_val )
                    );
                    if ($dbxref_to_obj_id) {
                        print "\nLine $no_processed gene dbxref [$mu_acc,$db_val] exist in db already. Skipping\n";
                    }
                    else {
                        $dbxref_to_obj_id = next_id( $db, 'gene_dbxref_to_object', 'dbxref_to_object_id' );

                        $db->do(
                            q[
                           INSERT INTO gene_dbxref_to_object 
			   (dbxref_to_object_id, table_name, record_id, dbxref_id, dbxref_value)
                           VALUES (?,?,?,?,?)
                         ],
                            {},
                            ( $dbxref_to_obj_id, $table_name, $gene_id, $dbxref_id, $db_val )
                        );
                        $no_imported++;
                        print "\nInsert gene dbxref [$mu_acc,$db_val].\n";
                    }
                }
            }
        }

        $db->commit;

        print "\n\nDone.\nProcessed $no_processed  gene dbxref records, ",
          "imported $no_imported.\n";
        1;
    };

    if ( !$ok ) {
        my $error = $@ || 'unknown error';
        warn "Unable to save to database: $error\n";
        $db->rollback();
    }

}



# next_id( $db, $table_name, $field_name )
#
# Returns one more than the current maximum of $field_name in $table_name,
# as reported by $db->selectrow_array.
# NOTE(review): the two names are interpolated straight into the SQL text,
# so callers must only pass trusted identifiers.
sub next_id {
    my ( $db, $table_name, $field_name ) = @_;

    my $query       = "select max($field_name) from $table_name";
    my $current_max = $db->selectrow_array($query);

    return 1 + $current_max;
}


# process_accession( $prefix, $length, $acc )
#
# Normalises an accession: a leading $prefix (matched literally) and any
# leading zeros are removed, the remainder is left-padded with zeros to
# $length characters, and $prefix is put back in front.  A remainder that is
# already $length characters or longer is not shortened.
#
#   process_accession( 'GR:', 7, '123' )     => 'GR:0000123'
#   process_accession( 'GR:', 7, 'GR:0123' ) => 'GR:0000123'
#
# An undefined $acc is treated as an empty string.
sub process_accession {

    my ( $prefix, $length, $acc ) = @_;
    $acc = '' unless defined $acc;

    # \Q...\E keeps characters such as "+" or "." in the prefix from being
    # interpreted as regex metacharacters.
    $acc =~ s/^\Q$prefix\E//;
    $acc =~ s/^0+//;

    my $pad = $length - length($acc);
    $acc = ( '0' x $pad ) . $acc if $pad > 0;

    return $prefix . $acc;

}

# process_file( $file )
#
# Copies $file line by line into a temporary file, passing each line through
# clean_field() and backslash-escaping single and double quotes so that the
# quotes survive parsing by Text::RecordParser.
#
# Returns the name of the temporary file, which File::Temp removes when the
# program exits (UNLINK => 1).  Dies if $file cannot be opened or the
# temporary file cannot be written.
sub process_file {

    my $file = shift;
    my ( $fh, $temp_file ) = tempfile( "tmpfileXXXXX", UNLINK => 1 );

    # Three-argument open with a lexical handle: the file name comes from
    # the command line and must never be interpreted as an open mode or a
    # pipe.
    open( my $raw, '<', $file ) or die "can't open $file :$!";
    while ( my $line = <$raw> ) {
        chomp $line;
        $line = clean_field($line);

        # clean_field() returns nothing for an empty line; keep the line in
        # the output so that record numbering is unchanged.
        $line = '' unless defined $line;

        $line =~ s/'/\\'/g;
        $line =~ s/"/\\"/g;
        print {$fh} "$line\n" or die "can't write to $temp_file :$!";
    }
    close($raw);

    # Buffered write errors only show up when the handle is closed.
    close($fh) or die "can't close $temp_file :$!";

    return $temp_file;

}

# clean_field( $field )
#
# Returns a tidied copy of $field: surrounding whitespace is removed, the
# single-byte "smart" punctuation characters listed below are replaced by
# plain ASCII, and CRLF line endings are converted to LF.
#
# Returns nothing (undef in scalar context) when $field is undefined or the
# empty string.  The string "0" is a valid value and is returned unchanged.
#
# NOTE(review): the byte values below look like Windows-1252 code points;
# presumably the input is a single-byte encoding rather than UTF-8 --
# confirm against the real import files.
sub clean_field {
    my $field = shift;

    # Test definedness and length rather than truth, so that "0" is kept.
    return if !defined $field || $field eq '';

    $field =~ s/^\s+|\s+$//g;

    # 0x93 (147) and 0x94 (148) are "smart" quotes
    $field =~ s/[\x93\x94]/"/g;

    # 0x91 (145) and 0x92 (146) are "smart" singlequotes
    $field =~ s/[\x91\x92]/'/g;

    # 0x96 (150) and 0x97 (151) are emdashes
    $field =~ s/[\x96\x97]/--/g;

    # 0x85 (133) is an ellipsis
    $field =~ s/\x85/. . ./g;

    # convert every Windows line ending, not only the first one
    $field =~ s/\r\n/\n/g;

    return $field;
}
