#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: export-gene-mappings-to-mdb.pl,v 1.7 2007/05/23 01:55:14 kclark Exp $

use strict;
use warnings;
use English qw( -no_match_vars );
use File::Basename;
use File::Path qw( mkpath );
use File::Spec::Functions;
use Getopt::Long;
use Gramene::CDBI::Genes;
use Gramene::CDBI::Markers;
use List::MoreUtils qw( uniq );
use IO::Prompt;
use Pod::Usage;
use Readonly;

# Literal strings used when joining and formatting output.
Readonly my $EMPTY_STR => q{};
Readonly my $SPACE     => q{ };
Readonly my $COMMA     => q{,};
Readonly my $TAB       => "\t";
Readonly my $NL        => "\n";

# Marker type written into every exported row.
Readonly my $GENE => 'Gene';

# Gene names/synonyms longer than this are left out of the synonym list.
Readonly my $MAX_NAME_LENGTH => 50;

# Command prefix emitted (once per map set) into the generated shell script.
Readonly my $MARKER_IMPORT_SCRIPT => join q{ },
    '/usr/local/gramene/scripts/markers/load-mappings.pl',
    '-p',
    '--no-prompt',
    q{--analysis-name='GENES_DB'};

# Column order of the tab-delimited output files; also used as the header row.
Readonly my @OUT_FIELDS => (
    'marker_name',
    'marker_synonyms',
    'marker_species',
    'marker_type',
    'feature_name',
    'feature_acc',
    'map_acc',
    'map_name',
    'feature_start',
    'feature_stop',
    'description',
);

# Program version derived from the revision keyword string (presumably
# expanded by the version-control system -- leave the keyword text intact).
Readonly my $VERSION => do {
    my ( $major, $minor ) = qq$Revision: 1.7 $ =~ m{(\d+)[.](\d+)};
    sprintf '%d.%02d', $major, $minor;
};

# Command-line state.  Both string options start out empty so that the
# "was it supplied?" checks below are simple truth tests.
my ( $help, $man_page, $show_version );
my $gene_accs = q{};
my $out_dir   = q{};

GetOptions(
    'help'          => \$help,
    'man'           => \$man_page,
    'version'       => \$show_version,
    'g|gene-accs:s' => \$gene_accs,
    'o|out=s'       => \$out_dir,
) or pod2usage(2);

# --help and --man both exit successfully; --man requests the fuller output.
if ( $man_page ) {
    pod2usage( -exitval => 0, -verbose => 2 );
}
elsif ( $help ) {
    pod2usage( -exitval => 0, -verbose => 1 );
}

if ( $show_version ) {
    printf "%s v%s\n", basename( $PROGRAM_NAME ), $VERSION;
    exit 0;
}

# The output directory is the only mandatory argument.
pod2usage('No out directory specified') if !$out_dir;

# Offer to create a missing output directory; refuse to continue otherwise.
unless ( -d $out_dir ) {
    my $ok_to_create = prompt -yn,
        "The directory '$out_dir' does not exist.  OK to create? ";

    die "Not OK, exiting.\n" if !$ok_to_create;

    mkpath( $out_dir );
}

# Species name => species id, as known to the markers db.  Used to verify
# that each gene's "genus species" string exists there.
my %species_id = map { $_->species, $_->id } 
    Gramene::CDBI::Markers::Species->retrieve_all;

# Optional accession filter from --gene-accs.  Tokens are trimmed and empty
# ones dropped, so stray commas or a blank value do not create an
# empty-string key that would filter out every gene.
my %gene_acc;
for my $acc ( split $COMMA, $gene_accs ) {
    $acc =~ s/^\s+|\s+$//g;
    next if $acc eq $EMPTY_STR;
    $gene_acc{ $acc } = 1;
}

my $genes_iter = Gramene::CDBI::Genes::GeneGene->retrieve_all;

# %data        : map set accession => array of output rows (hashrefs keyed by
#                the @OUT_FIELDS names)
# %map_set_acc : cache of map accession => map set accession
# @errors      : newline-terminated messages reported at the end of the run
my ( %data, %map_set_acc, @errors );
GENE:
while ( my $gene = $genes_iter->next ) {
    next GENE if $gene->is_obsolete;
    next GENE if %gene_acc && !$gene_acc{ $gene->accession };

    # Collect the gene name plus its synonyms, skipping missing/empty and
    # over-long names, and double-quoting any that contain the list separator.
    my @synonyms;
    for my $name ( 
        $gene->name, 
        map { $_->synonym_name } $gene->gene_gene_synonyms 
    ) {
        next if !defined $name || $name eq $EMPTY_STR;
        next if length($name) > $MAX_NAME_LENGTH;
        if ( $name =~ /$COMMA/ ) {
             $name = qq["$name"];
        }

        push @synonyms, $name;
    }
    @synonyms = uniq( @synonyms );

    for my $mapping ( $gene->gene_map_positions ) {
        # Use the column accessor (as for every other column read from
        # $mapping below) rather than reaching into the object's hash.
        my $map_acc = $mapping->cmap_map_accession;
        if ( !$map_acc ) {
            push @errors, 
                sprintf("No map accession for gene %s (%s), skipped\n",
                    $gene->accession, $gene->id
                );
            next GENE;
        }

        my $gene_species = join( $SPACE, 
            $gene->species->genus, $gene->species->species 
        );
        my $species_id   = $species_id{ $gene_species };

        if ( !$species_id ) {
            push @errors, 
                sprintf("Bad species '%s' for gene %s (%s), skipped\n",
                    $gene_species, $gene->accession, $gene->id
                );
            next GENE;
        }

        # Resolve (and cache) the map set accession for this map.
        if ( !$map_set_acc{ $map_acc } ) {
            my ($map) = Gramene::CDBI::Markers::Map->search(
                cmap_map_accession => $map_acc,
            );

            # Report a map that is missing from the markers db instead of
            # calling a method on an undefined value.  (An earlier revision
            # also searched MapSet here but never used the result.)
            if ( !$map ) {
                push @errors, sprintf(
                    "Can't find map '%s' in mappings db, gene %s (%s)\n",
                    $map_acc, $gene->accession, $gene->id
                );
                next GENE;
            }

            $map_set_acc{ $map_acc } = $map->map_set->cmap_map_set_accession;
        }

        my $map_set_acc = $map_set_acc{ $map_acc } 
            or die "No map set for map '$map_acc'\n";

        push @{ $data{ $map_set_acc } }, {
            marker_name      => sprintf(
                '%s [[synonym_type=GRAMENE_GENE]]', $gene->accession
            ),
            marker_synonyms  => join( $COMMA, @synonyms ),
            marker_species   => $gene_species,
            marker_type      => $GENE,
            feature_acc      => $mapping->cmap_feature_accession,
            map_acc          => $map_acc,
            map_name         => $mapping->cmap_map_name,
            feature_name     => $gene->symbol,
            feature_start    => $mapping->start_position,
            feature_stop     => $mapping->stop_position,
            description      => $gene->description,
        };
    }
}

# Write one tab-delimited file per map set plus a shell script that feeds
# each file to the marker import script, then print a summary.
my $shell_file = catfile( $out_dir, 'marker-import.sh' );
open my $shell_fh, '>', $shell_file or die "Can't write '$shell_file': $!\n";

# Single-quote a word for the generated shell script so that paths containing
# spaces or shell metacharacters are passed through intact.
my $sh_quote = sub {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return qq{'$word'};
};

# Start at zero so the summary is well-defined when nothing was exported.
my $num_mappings = 0;
my $num_map_sets = 0;

# Sorted so the generated script is the same from run to run.
for my $map_set_acc ( sort keys %data ) {
    $num_map_sets++;

    my $file = catfile( $out_dir, "${map_set_acc}.tab" );

    open my $fh, '>', $file or die "Can't write $file: $!\n";

    print {$fh} join( $TAB, @OUT_FIELDS ), $NL;

    for my $row ( @{ $data{ $map_set_acc } } ) {
        $num_mappings++;

        my @values = map {
            # Missing values become empty columns.
            my $value = defined $row->{ $_ } ? $row->{ $_ } : $EMPTY_STR;
            $value =~ s/'/\\'/g;       # backslash-escape single quotes
            $value =~ s/\r?\n/ /g;     # keep each record on one line
            $value =~ s/\t/ /g;        # embedded tabs would shift columns
            $value;
        } @OUT_FIELDS;

        print {$fh} join( $TAB, @values ), $NL;
    }

    # Buffered write errors only surface when the handle is closed.
    close $fh or die "Can't close $file: $!\n";

    # The first command truncates the "err" file, later ones append to it.
    my $redir = $num_map_sets == 1 ? '>' : '>>';

    printf {$shell_fh} "%s -m %s %s 2%serr\n",
        $MARKER_IMPORT_SCRIPT,
        $sh_quote->( $map_set_acc ),
        $sh_quote->( $file ),
        $redir;
}

# Have the generated script show anything the import commands wrote to
# stderr.  The trailing empty element ends the script with a newline.
print {$shell_fh} join( $NL, 
    'if [ -s err ]; then', 
    '  echo "There were errors:";', 
    '  cat err;', 
    'fi',
    $EMPTY_STR
);

close $shell_fh or die "Can't close '$shell_file': $!\n";

if ( @errors ) {
    print join($NL, 
        'found ' . scalar @errors . ' errors:',
        @errors,
        '',
    );
}

print join($NL, 
    "Finished exporting $num_mappings mappings for $num_map_sets map sets.",
    'Now do this:',
    "sh $shell_file",
    $EMPTY_STR
);

__END__

# ----------------------------------------------------

=pod

=head1 NAME

export-gene-mappings-to-mdb.pl - export gene mapping data to tab file for mappings db 

=head1 VERSION

This documentation refers to version $Revision: 1.7 $

=head1 SYNOPSIS

  export-gene-mappings-to-mdb.pl [options] -o OUT_DIR 

Then note the errors and follow the directions.

Options:

  -o|--out=DIR                    Directory to write output files [REQUIRED]
  -g|--gene-accs=GR:...[,GR:...]  A comma-separated list of accessions
  --help                          Show brief help and exit
  --man                           Show full documentation
  --version                       Show version and exit

=head1 DESCRIPTION

This script exports the gene db's map position data to tab-delimited files.
Each file is named for the CMap map set accession.  A shell script is created
that should be run to import the data when this script has finished.

=head1 SEE ALSO

Gramene::CDBI::Genes, Gramene::CDBI::Markers.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
