#!/usr/local/bin/perl -w

=head1 NAME

import_gene.pl


=head1 DESCRIPTION

Imports tab-delimited data (genes) into Gramene Gene schema.
The file type is automatically detected from the header line. 


* The required fields in the header line are:
  gene_accession, symbol, name, species, gene_type, description

* The optional fields in the header line are:
  synonyms, chromosome, public_comment, private_comment, has_phenotype

If the gene accession already exists in the database, the script skips that entry.

=head1 SYNOPSIS

import_gene.pl [options] <gene_file>

bash:import_gene.pl [options] <gene_file> 1>db_gene.txt 2>err.txt
tcsh:(import_gene.pl [options] <gene_file> >db_gene.txt) >& err.txt

 Options:
    --help              help message
    --man               full documentation
                                                                                


=head1 OPTIONS
                                                                                
=over 4
                                                                                
=item B<--help>
                                                                                
print a help message and exit
                                                                                
=item B<--man>
                                                                                
print documentation and exit
                                                                                
                                                                                
=back
                                                                                
=head1 ARGUMENTS

gene file                                                                                

=cut

use lib '/usr/local/gramene/lib/perl/';

use strict;

use Text::RecordParser;
use File::Temp qw/ tempfile/;

use Pod::Usage;
use Getopt::Long;

use Gramene::DB;


# Format of a normalized gene accession: the literal prefix followed by a
# number that process_accession() zero-pads to GENE_ACCESSION_LENGTH
# characters.
use constant {
    GENE_ACCESSION_PREFIX => 'GR:',
    GENE_ACCESSION_LENGTH => 7
};

# The shebang line enables warnings with -w; switch them off again at run
# time to silence the empty-string warnings raised in Text::RecordParser.
local $^W = 0;  # to turn off the warning of empty string in  Text::RecordParser

{               # Argument processing: --help / --man, plus one required file argument
    my $help = 0;
    my $man  = 0;
    GetOptions( "help|?" => \$help, "man" => \$man )
      or pod2usage(2);
    pod2usage( -verbose => 2 ) if $man;
    pod2usage(1)                if $help;
    pod2usage('No import file') if ( scalar(@ARGV) < 1 );
}

# Path of the tab-delimited gene file to import (first positional argument).
my $file = $ARGV[0];

# The Text::ParseWords module used by Text::RecordParser removes quotes, so
# the quotes are backslash-escaped before parsing.
# $processed_file is a temporary file; it is removed when the program exits.
my $processed_file = process_file($file);

my $parser = Text::RecordParser->new(
    field_separator => qr/\t/,    # fields are separated by a tab character
    filename => $processed_file,
);

# Filter for the header fields: replace each run of whitespace with "_" and
# lower-case the name.  The closure works on the global $_ on purpose.
$parser->header_filter( sub { $_ = shift; s/\s+/_/g; lc $_ } );

# Takes the fields from the next row under the cursor and assigns the field names to the values.
$parser->bind_header;

# Returns the fields bound via bind_fields (or bind_header), converted to a
# lookup hash.  NOTE(review): %fields is not referenced anywhere else in the
# visible code.
my %fields = map { $_, 1 } $parser->field_list;

# Connect to the 'genes_edit' database and disable AutoCommit so the whole
# import runs as one transaction (import_genes commits or rolls back).
my $db;
eval {
    $db = Gramene::DB->new('genes_edit');
    $db->{AutoCommit} = 0;    # set transaction control
};
if ($@) {
    die "DB connection failed: $@\n";
}

import_genes( $parser, $db );

# import_genes( $parser, $db )
#
# Reads every record from $parser and inserts one gene_gene row (plus its
# gene_gene_synonym rows) per record through the database handle $db.
#
#   * A record is skipped with a warning when a required field is undefined
#     or when its normalized accession is already present in gene_gene.
#   * Species and gene type are resolved through find_or_create_species()
#     and find_or_create_gene_type().
#   * All work happens inside one eval: the transaction is committed after
#     the last record, and any exception triggers a warning and a rollback.
#
# Progress messages go to STDOUT, warnings to STDERR.  The sub has no
# explicit return value.
sub import_genes {

    my ( $parser, $db ) = @_;
    my @req_fields = qw[gene_accession gene_type symbol name species description];
    my @opt_fields = qw[synonyms chromosome public_comment private_comment has_phenotype];

    # Lookup table of every column name this importer understands.
    my %acceptable = map { $_, 1 } ( @req_fields, @opt_fields );

    # NOTE(review): $no_updated is initialised here but never used below.
    my ( $no_imported, $no_updated, $no_processed ) = ( 0, 0, 0 );

    print "\n***** Importing Gene *****\n\n";

    # Unknown header columns are reported but do not stop the import.
    for my $field ( $parser->field_list ) {
        next if $acceptable{$field};
        warn "\nUnknown field: $field!\n";
    }

    # Trim leading and trailing whitespace from every field value.
    $parser->field_filter( sub { $_ = shift; s/^\s+|\s+$//g; $_ } );

    eval {
        while ( my $record = $parser->fetchrow_hashref )
        {
	

                
            # $no_processed counts data records, so the "Line N" in the
            # warnings below is a record number (the header is not counted).
            $no_processed++;

            # Only undefined values count as missing; an empty string passes.
            my @missing;
            for my $field (@req_fields) {
                push @missing, $field unless defined $record->{$field};
            }

            if (@missing) {
		warn "\nLine $no_processed: Missing ",
                  join( ', ', map { qq["$_"] } @missing ), ".  Skipping.\n";
                next;
            }

            # find or create the 'species'
            my $species_id = find_or_create_species( $db, $record->{'species'} );

            # find or create the 'gene_type'
	    my $gene_type_id = find_or_create_gene_type($db, $record->{'gene_type'});
	

            # Normalize the accession to prefix + zero-padded number.
            my $accession = $record->{'gene_accession'};
	    $accession = process_accession(GENE_ACCESSION_PREFIX, GENE_ACCESSION_LENGTH,$accession);


            my ($gene_id) = $db->selectrow_array(
                q[
                                           SELECT gene_id
					     FROM gene_gene
					    WHERE accession =?
					  ],
                {},
                ($accession)
            );

            if ($gene_id) {    # skip the entry
                warn "\nLine $no_processed: gene gene accession $accession
		existing in database already.  Skipping.\n";
                next;

            }
            else {             # accession not in the database yet: insert it
                $gene_id = next_id( $db, 'gene_gene', 'gene_id' );
                my $is_obsolete = 0;

                # Any false value (undef, '' or 0) falls back to 'not curated'.
		my $has_phenotype = $record->{'has_phenotype'} || 'not curated';

                # Bind values in the column order of the INSERT below;
                # undefined optional values are stored as empty strings.
                my @data = map { defined $_ ? $_ : '' } (
                    $gene_id, 
		    $accession,
                    $record->{'symbol'},
		    $record->{'name'},
                    $species_id,
		    $gene_type_id,
		    $record->{'chromosome'},
                    $record->{'description'}, 
		    $record->{'public_comment'},
		    $record->{'private_comment'},
		    $has_phenotype,
                    $is_obsolete
                );

                $db->do(
                    q[
                           INSERT INTO gene_gene
                           (gene_id,
			   accession,
			   symbol,
			   name,
			   species_id,
			   gene_type_id,
			   chromosome,
			   description, 
			   public_curation_comment,
			   internal_curation_comment,
			   has_phenotype,
			   is_obsolete
			   )
                           VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
                         ],
                    {},
                    @data
                );

                $no_imported++;

            }    #gene_id

            # Only reached for newly inserted genes; the skip branch uses next.
            print "\nInserted gene [", $accession, "].\n";

            #
            # gene synonyms
            #
            # Comma-separated list; each entry is trimmed and duplicates are
            # collapsed through the hash, so the insertion order is not fixed.
            if ( $record->{'synonyms'} ) {
                my @synonyms = split /,/, $record->{'synonyms'};
                my %uniq_synonyms = map { $_, 1 } map { s/^\s+|\s+$//g; $_ } @synonyms;
                foreach my $syn ( keys %uniq_synonyms ) {
                    create_gene_synonym( $db, $gene_id, $accession,$syn );
                }
            }

        }    #while

        # Every record went through without an exception: make it permanent.
        $db->commit;

        print "\n\nDone.\nProcessed $no_processed gene gene records, ",
          "imported $no_imported.\n";
    };

    # Any exception raised inside the eval undoes the whole import.
    if ($@) {
        warn "Unable to save to database: $@\n";
        $db->rollback();
    }

}

# next_id( $db, $table_name, $field_name )
#
# Returns the current maximum of $field_name in $table_name plus one, for use
# as the next primary-key value.  The table and column names are interpolated
# into the SQL text, so they must come from trusted code, never from input.
sub next_id {
    my ( $db, $table_name, $field_name ) = @_;

    my $max_query  = "select max($field_name) from $table_name";
    my $current_max = $db->selectrow_array($max_query);

    return 1 + $current_max;
}

# find_or_create_gene_type( $db, $gene_type )
#
# Looks up $gene_type in gene_gene_type case-insensitively and returns its
# gene_type_id.  When no row matches, a new row is inserted (with the name
# exactly as given) and the newly allocated id is returned.
#
#   $db        - database handle offering selectrow_array() and do()
#   $gene_type - gene type name taken from the import file
#
# Returns the gene_type_id of the existing or newly created row.
sub find_or_create_gene_type {
    my ( $db, $gene_type ) = @_;

    # The query upper-cases the stored name, so the bind value has to be
    # upper-cased too; otherwise mixed-case names never match and a duplicate
    # row is inserted on every call.
    my ($gene_type_id) = $db->selectrow_array(
        q[
            SELECT gene_type_id FROM gene_gene_type
            WHERE  UPPER(gene_type) = ?
        ], {}, uc($gene_type)
    );

    return $gene_type_id if $gene_type_id;

    $gene_type_id = next_id( $db, 'gene_gene_type', 'gene_type_id' );

    $db->do(
        q[
                        INSERT INTO gene_gene_type (gene_type_id, gene_type)
                             VALUES (?,?)
                      ],
        {}, ( $gene_type_id, $gene_type )
    );

    # Return the id explicitly; without this the sub would yield the result
    # of do() (a row count) instead of the new key.
    return $gene_type_id;
}

	


# find_or_create_species( $db, $species )
#
# Resolves a species name to its species_id.  The name is upper-cased and
# matched against both the common_name and species columns of gene_species.
# When nothing matches, a new row is inserted with the upper-cased name as
# common_name.  Returns the species_id in either case.
sub find_or_create_species {
    my ( $db, $species ) = @_;

    my $species_key = uc($species);

    my ($species_id) = $db->selectrow_array(
        q[
                            SELECT species_id
                            FROM   gene_species
                            WHERE  UPPER(common_name) =? 
                               OR  UPPER(species)=?
                          ],
        {},
        ( $species_key, $species_key )
    );

    # Already known: nothing to insert.
    return $species_id if $species_id;

    # Unknown species: allocate the next id and store the new row.
    $species_id = next_id( $db, 'gene_species', 'species_id' );

    $db->do(
        q[
                        INSERT INTO gene_species (species_id,common_name)
                             VALUES (?,?) 
                      ],
        {}, ( $species_id, $species_key )
    );

    return $species_id;
}

# create_gene_synonym( $db, $gene_id, $accession, $syn )
#
# Inserts one gene_gene_synonym row linking the synonym $syn to $gene_id,
# using the next free gene_synonym_id.  $accession is accepted for the
# caller's convenience but is not written to the database.
sub create_gene_synonym {
    my ( $db, $gene_id, $accession, $syn ) = @_;

    my $new_synonym_id = next_id( $db, 'gene_gene_synonym', 'gene_synonym_id' );

    my $insert_sql = q[ 
	      INSERT INTO gene_gene_synonym (gene_synonym_id, gene_id, synonym_name)
		 VALUES   (?, ?, ?) 
                ];

    # Kept as the last statement so the result of do() is still what the
    # sub evaluates to.
    $db->do( $insert_sql, {}, ( $new_synonym_id, $gene_id, $syn ) );
}

# process_accession( $prefix, $length, $acc )
#
# Normalizes an accession to "<prefix><zero-padded number>".
#
#   $prefix - literal accession prefix, e.g. 'GR:'
#   $length - minimum width of the part that follows the prefix
#   $acc    - raw accession, with or without the prefix and leading zeros
#
# A leading prefix and any leading zeros are removed, the remainder is
# left-padded with zeros up to $length characters (longer values are kept
# as they are), and the prefix is put back in front.
#
# Returns the normalized accession string.
sub process_accession {

    my ( $prefix, $length, $acc ) = @_;
    $acc = '' unless defined $acc;

    # \Q...\E makes the prefix match literally even when it contains regex
    # metacharacters such as '.' or '+'.
    $acc =~ s/^\Q$prefix\E//;
    $acc =~ s/^0+//;

    my $pad_width = $length - length($acc);
    $acc = ( '0' x $pad_width ) . $acc if $pad_width > 0;

    return $prefix . $acc;

}

# process_file( $file )
#
# Copies $file line by line into a temporary file, cleaning each line with
# clean_field() and backslash-escaping single and double quotes so that the
# quotes survive parsing by Text::RecordParser.
#
#   $file - path of the raw tab-delimited input file
#
# Returns the name of the temporary file, which File::Temp removes when the
# program exits (UNLINK => 1).  Dies when the input cannot be opened or the
# temporary file cannot be written completely.
sub process_file {

    my $file = shift;
    my ( $fh, $temp_file ) = tempfile( "tmpfileXXXXX", UNLINK => 1 );

    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode or a command, and no global handle leaks.
    open( my $raw_fh, '<', $file ) or die "can't open $file :$!";

    # A lexical line variable keeps the caller's $_ untouched.
    while ( my $line = <$raw_fh> ) {
        chomp $line;
        $line = clean_field($line);    # clean MS characters

        # clean_field() returns nothing for an empty line; keep the line in
        # the output as an empty one.
        $line = '' unless defined $line;

        $line =~ s/'/\\'/g;
        $line =~ s/"/\\"/g;
        print {$fh} "$line\n";
    }
    close($raw_fh);

    # Buffered write errors only surface when the handle is closed.
    close($fh) or die "can't write $temp_file :$!";

    return $temp_file;

}

# clean_field( $field )
#
# Trims surrounding whitespace and replaces Microsoft "smart" punctuation
# bytes with plain ASCII equivalents.
#
#   $field - raw text (a single field or a whole line)
#
# Returns the cleaned string.  Returns nothing (undef in scalar context)
# when $field is undefined or the empty string; the value "0" is cleaned
# and returned like any other text.
sub clean_field {
    my $field = shift;

    # Test for definedness and length instead of truth so "0" is not dropped.
    return unless defined($field) && length($field);

    $field =~ s/^\s+|\s+$//g;

    # 0x93 (147) and 0x94 (148) are "smart" double quotes
    $field =~ s/[\x93\x94]/"/g;

    # 0x91 (145) and 0x92 (146) are "smart" single quotes
    $field =~ s/[\x91\x92]/'/g;

    # 0x96 (150) and 0x97 (151) are dashes
    $field =~ s/[\x96\x97]/--/g;

    # 0x85 (133) is an ellipsis
    $field =~ s/\x85/. . ./g;

    # Convert every Windows line ending, not only the first one.
    $field =~ s/\r\n/\n/g;

    return $field;

}

