#!/usr/local/bin/perl

=head1 NAME

load-sequenced-bacs.pl - Loads sequenced BAC data into a core Ensembl DB

=head1 SYNOPSIS

perl load-sequenced-bacs.pl [options]

Options:
 -h --help
 -m --man
 -r --registry_file
 -s --species

=head1 OPTIONS

Reads the B<overgo-hits-file>, and uses its clones to load the misc_attrib table.

B<-h --help>
  Print a brief help message and exits.

B<-m --man>
  Print man page and exit

B<-r --registry_file>
  Use this Ensembl registry file for database connection info.
  Default is <ENSEMBLHOME>/conf/ensembl.registry

B<-s --species>
  Use this species entry from the registry file [REQUIRED].

=head1 DESCRIPTION

B<This program> 

  Loads overgo-clone associations into a core Ensembl DB
  
  Maintained by Shiran Pasternak <shiranp@cshl.edu>

=cut

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper qw(Dumper);    # For debug

use English;
use Carp;
use DBI;
use DBI qw(:sql_types);
use FindBin qw($Bin);
use File::Basename qw( dirname );

use vars qw($BASEDIR);

BEGIN {

    # Set the perl libraries
    $BASEDIR = dirname($Bin);
    unshift @INC, $BASEDIR . '/ensembl-live/ensembl/modules';
}

use Bio::EnsEMBL::Attribute;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::MiscFeature;
use Bio::EnsEMBL::Attribute;
use Bio::EnsEMBL::SimpleFeature;
use Bio::EnsEMBL::Analysis;

use DBI;

use vars qw($ENS_DBA);

use Readonly;

Readonly my $CLONEPATH_DB   => 'clonepath';
Readonly my $CLONEPATH_HOST => 'ascutney.cshl.edu';
Readonly my $CLONEPATH_PORT => 3306;
Readonly my $CLONEPATH_USER => 'maize_rw';
Readonly my $CLONEPATH_PASS => 'z3@m@ys';

Readonly my @CLONEPATH_COLUMNS =>
    qw(accessionNumber version giNumber sequenceLength chromosome status cloneName);

Readonly my %status_name_for =>
    (1 => 'FULLTOP', 2 => 'ACTIVEFIN', 3 => 'IMPROVED', 0 => 'PREFIN');

my ($misc_feature_adaptor, $attribute_adaptor, $misc_set_adaptor);
my %clone_feature_cache = ();
MAIN: {
    my $help = 0;
    my $man  = 0;
    my ($species_name, $registry_file);
    GetOptions(
        "help|?"          => \$help,
        "man"             => \$man,
        "species=s"       => \$species_name,
        "registry_file=s" => \$registry_file,
        )
        or pod2usage(2);
    pod2usage(-verbose => 2) if $man;
    pod2usage(1) if $help;

    # Validate file paths
    $registry_file ||= $BASEDIR . '/conf/SiteDefs.pm';

    map {
        -e $_ || (warn("File $_ does not exist\n")    && pod2usage(1));
        -r $_ || (warn("Cannot read $_\n")            && pod2usage(1));
        -f $_ || (warn("File $_ is not plain-text\n") && pod2usage(1));
        -s $_ || (warn("File $_ is empty\n")          && pod2usage(1));
    } $registry_file;

    my $clones = fetch_sequenced_clones();

    # Load the ensembl file
    $species_name || (warn("Need a --species\n") && pod2usage(1));
    Bio::EnsEMBL::Registry->load_all($registry_file);
    $ENS_DBA = load_db_adaptor($species_name)
        || (warn("No core DB for $species_name set in $registry_file: $@\n")
        && pod2usage(1));

    my $meta    = $ENS_DBA->get_MetaContainer();
    my $species = $meta->get_Species
        || die("Cannot find the species in the meta table of the DB");
    my $common_name = $species->common_name
        || die(
        "Cannot find the species common name in the meta table of the DB");

    $common_name = ucfirst($common_name);

    ###########
    # Prepare some Ensembl adaptors
    $misc_feature_adaptor = $ENS_DBA->get_adaptor('MiscFeature');
    $attribute_adaptor    = $ENS_DBA->get_adaptor('Attribute');
    $misc_set_adaptor     = $ENS_DBA->get_adaptor('MiscSet');

    reset_previous_clones();
    update_accessioned_bac_set($clones);
    update_clone_features($clones);
    exit;

}

#======================================================================

=pod

=head2 load_db_adaptor
    Loads a DB adaptor from the registry

=cut

sub load_db_adaptor {
    my ($species) = @_;

    ${ Bio::EnsEMBL::Registry->get_all_DBAdaptors(-species => $species) }[0];
}

=pod

=head2 get_clonepath_handle
    Creates a connection to the clonepath database

=cut

sub get_clonepath_handle {
    my $data_source = sprintf('DBI:mysql:database=%s;host=%s;port=%s',
        $CLONEPATH_DB, $CLONEPATH_HOST, $CLONEPATH_PORT);
    my $db_handle
        = DBI->connect($data_source, $CLONEPATH_USER, $CLONEPATH_PASS);
    return $db_handle;
}

=pod

=head2 fetch_sequenced_clones
    Returns clones from the clonepath database

=cut

sub fetch_sequenced_clones {
    my $clonepath_handle = get_clonepath_handle();
    my $fields = join(', ', @CLONEPATH_COLUMNS);
    return $clonepath_handle->selectall_arrayref("SELECT $fields FROM clone",
        { Slice => {} });
}

=pod

=head2 validate_clone
    Run a quality-control check to ensure the GenBank and Ensembl datasets jive

=cut

sub validate_clone {
    my ($feature, $genbank_value, $attribute_name) = @_;
    my $value = $feature->get_scalar_attribute($attribute_name);
    my $name  = $feature->get_scalar_attribute('name');
    if (defined $value && $value ne q{} && $value ne $genbank_value) {
        qc_error(
            "[$name] Clone mismatch on field '$attribute_name' (GenBank=$genbank_value Ensembl=$value)"
        );
    }
}

=pod

=head2 update_clone_attribute
    Adds an attribute for the sequenced clone

=cut

sub update_clone_attribute {
    my ($feature, $attribute_name, $new_value) = @_;
    my $clone_name = $feature->get_scalar_attribute('name');
    my $old_value = $feature->get_scalar_attribute($attribute_name);
    if (!defined $new_value) {
        qc_error("[$clone_name] Undefined new value for '$attribute_name'");
        return;
    }
    if (defined $old_value && $old_value ne q{}) {
        if ($old_value ne $new_value) {
            qc_error(
                "[$clone_name] Overriding attribute '$attribute_name' ($old_value => $new_value)"
            );
            remove_attribute_from_feature($feature, $attribute_name);
        } else {
            print
                "[$clone_name] Identical attribute '$attribute_name' => $old_value\n";
            return;
        }
    }
    my $attribute = Bio::EnsEMBL::Attribute->new(
        -CODE  => $attribute_name,
        -VALUE => $new_value,
    );
    $feature->add_Attribute($attribute);
    $attribute_adaptor->store_on_MiscFeature($feature, [$attribute]);

    print "[$clone_name] Added '$attribute_name' attribute => $new_value\n";
}

=pod

=head2 remove_attribute_from_feature
    Since the API cannot do it for a single attribute

=cut

sub remove_attribute_from_feature {
    my ($feature, $attribute_name) = @_;
    my $statement = $misc_feature_adaptor->prepare(
        q(
DELETE
  FROM misc_attrib
 WHERE misc_feature_id = ?
   AND attrib_type_id = (SELECT attrib_type_id FROM attrib_type WHERE code = ?)
    )
    );

    $statement->bind_param(1, $feature->dbID, SQL_INTEGER);
    $statement->bind_param(2, $attribute_name, SQL_VARCHAR);
    $statement->execute();
}

=pod

=head2 qc_error
    Print a QC error

=cut

sub qc_error {
    print STDERR "[QC] ", join('', @_), "\n";
}

=pod

=head2 reset_previous_clones
    Sets initial attributes for historical clones

=cut

sub reset_previous_clones {
    my @previous_clones
        = @{ $misc_feature_adaptor->fetch_all_by_attribute_type_value(
            'embl_acc') };
    for my $clone (@previous_clones) {
        update_clone_attribute($clone, 'external',   'true');
        update_clone_attribute($clone, 'annotation', 'II');
    }
}

=pod

=head2 update_clone_features
    Adds attributes for incoming sequenced clones

=cut

sub update_clone_features {
    my ($clones) = @_;
    for my $seq_clone (@$clones) {
        print
            "Clone (Name=$seq_clone->{cloneName} Accession=$seq_clone->{accessionNumber})\n";

        my ($clone_feature);
        eval { $clone_feature = fetch_feature_for($seq_clone); };
        if ($@) {
            qc_error($@);
            next;
        }

        validate_clone($clone_feature, $seq_clone->{cloneName},      'name');
        validate_clone($clone_feature, $seq_clone->{sequenceLength}, 'seq_len');

        update_clone_attribute($clone_feature, 'embl_acc',
            $seq_clone->{accessionNumber});
        update_clone_attribute($clone_feature, 'seq_len',
            $seq_clone->{sequenceLength});
        update_clone_attribute($clone_feature, 'seqstatus',
            $status_name_for{ $seq_clone->{status} });
        update_clone_attribute($clone_feature, 'external',   'false');
        update_clone_attribute($clone_feature, 'annotation', 'I');
        update_clone_attribute($clone_feature, 'state',      '12:Accessioned');
    }
}

=pod

=head2 update_accessioned_bac_set
    Updates the acc_bac_map misc_set to contain the sequenced BACs

=cut

sub update_accessioned_bac_set {
    my ($clones) = @_;
    my $acc_bac_map = $misc_set_adaptor->fetch_by_code('acc_bac_map');
    for my $clone (@$clones) {
        my ($feature);
        eval { $feature = fetch_feature_for($clone); };
        if ($@) {
            qc_error($@);
            next;
        }
        add_feature_set($feature, $acc_bac_map);
    }
}

=pod

=head2 add_feature_set
    Store the correspondence between a feature and its misc_set. Required because no such method is provided in the API (must be a new misc_feature)

=cut

sub add_feature_set {
    my ($feature, $misc_set) = @_;
    my $name = $feature->get_scalar_attribute('name');
    if (scalar @{ $feature->get_all_MiscSets('acc_bac_map') } > 0) {
        print "[$name] Already on acc_bac_map\n";
        return;
    }
    $feature->add_MiscSet($misc_set);    # Does nothing in the DB

    my $feature_set_sth = $misc_feature_adaptor->prepare(
        q(INSERT IGNORE misc_feature_misc_set SET misc_feature_id = ?, misc_set_id = ?)
    );

    print "Adding feature correspondence: ", $feature->dbID, " -> ",
        $misc_set->dbID, "\n";
    $feature_set_sth->bind_param(1, $feature->dbID,  SQL_INTEGER);
    $feature_set_sth->bind_param(2, $misc_set->dbID, SQL_INTEGER);

    $feature_set_sth->execute();
}

=pod

=head2 fetch_feature_for
    Fetches a feature for a given sequenced clone

=cut

sub fetch_feature_for {
    my ($sequenced_clone) = @_;
    my $clone_name = $sequenced_clone->{cloneName};
    if (exists $clone_feature_cache{$clone_name}) {
        return $clone_feature_cache{$clone_name};
    }
    my $clone_features
        = $misc_feature_adaptor->fetch_all_by_attribute_type_value('name',
        $clone_name);
    if (scalar @$clone_features != 1) {
        if (scalar @$clone_features == 0) {
            die "[$clone_name] Clone not found in Ensembl";
        } else {
            die
                "[$clone_name] Multiple (@{[scalar @$clone_features]}) features";
        }
        $clone_feature_cache{$clone_name} = undef;
        return;
    }
    my $feature = $clone_features->[0];
    $clone_feature_cache{$clone_name} = $feature;
    return $feature;
}

1;
