#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: convert_mapping_4import.pl,v 1.2 2007/01/30 20:36:36 weix Exp $

#convert from

#HIT_ID  HIT_NAME        CHR_WITH_OFFSET STRAND  HIT_START       HIT_STOP        CHR_START       CHR_STOP        SCORE   PERC_ID CIGAR_LINE      MARKER_ID
#1       AJ228945.1      chr_6   1       1       195     28402209        28402403        164     92      194M    1375463
#2       AJ228934.1      chr_12  1       1       130     11274654        11274783        144     93      129M    1375465
#2       AJ228934.1      chr_12  1       138     188     11318055        11318105        144     93      50M     1375465

#to

#| marker_id | map_id | display_synonym_id | start    | end      |  marker_start | marker_end | marker_strand | cigar_line | score |  percent_identity | analysis_id

use strict;
use warnings;
use English qw( -no_match_vars );
use File::Basename;
use Getopt::Long;
use Gramene::CDBI::Markers;
use Gramene::Marker::DB;
use Gramene::Utils qw( get_logger );
use IO::Prompt;
use Pod::Usage;
use Readonly;
use Text::RecordParser::Tab;
use Date::Calc;

# Extensions stripped from an input file name to obtain its track name.
Readonly my @FILE_SUFFIXES => qw( .dat .txt .tab );

# Script version built from the CVS revision keyword (major.minor, with
# the minor part zero-padded to two digits).
Readonly my $VERSION => sprintf '%d.%02d',
    qq$Revision: 1.2 $ =~ /(\d+)\.(\d+)/;

# Track name (input file base name) => default species and marker type
# for the records in that file.  Consulted when a record does not carry
# its own species/type columns.
Readonly my %FILES_TO_SPECIES_TYPE => (

    # Maize
    'Maize_consensus'  => { species => 'Zea mays', marker_type => 'mRNA' },
    'Maize_est'        => { species => 'Zea mays', marker_type => 'EST' },
    'Maize_markers'    => { species => 'Zea mays', marker_type => 'Undefined' },
    'Maize_MarkerRFLP' => { species => 'Zea mays', marker_type => 'RFLP' },

    # Wild rice BAC ends (OMAP)
    'RiceAlta_BACend_OMAP' =>
        { species => 'Oryza alta', marker_type => 'GSS' },
    'RiceAustraliensis_BACend_OMAP' =>
        { species => 'Oryza australiensis', marker_type => 'GSS' },
    'RiceBrachyantha_BACend_OMAP' =>
        { species => 'Oryza brachyantha', marker_type => 'GSS' },
    'RiceCoarctata_BACend_OMAP' =>
        { species => 'Oryza coarctata', marker_type => 'GSS' },
    'RiceGlaberrima_BACend_OMAP' =>
        { species => 'Oryza glaberrima', marker_type => 'GSS' },
    'RiceNivara_BACend_OMAP' =>
        { species => 'Oryza nivara', marker_type => 'GSS' },
    'RicePunctata_BACend_OMAP' =>
        { species => 'Oryza punctata', marker_type => 'GSS' },
    'RiceRufipogon_BACend_OMAP' =>
        { species => 'Oryza rufipogon', marker_type => 'GSS' },
    'RiceOfficinalis_BACend_OMAP' =>
        { species => 'Oryza officinalis', marker_type => 'GSS' },
    'RiceMinuta_BACend_OMAP' =>
        { species => 'Oryza minuta', marker_type => 'GSS' },

    # Cultivated rice
    'Rice_BAC' =>
        { species => 'Oryza sativa', marker_type => 'Clone' },
    'Rice_rflp_marker' =>
        { species => 'Oryza sativa', marker_type => 'RFLP' },
    'Rice_tos17_insert' => {
        species     => 'Oryza sativa (japonica cultivar-group)',
        marker_type => 'GSS',
    },

    # Sorghum
    'Sorghum_ESTCluster3P_LGBPratt' =>
        { species => 'Sorghum bicolor', marker_type => 'EST Cluster' },
    'Sorghum_Markers' =>
        { species => 'Sorghum bicolor', marker_type => 'Undefined' },
    'Sorghum_est' =>
        { species => 'Sorghum bicolor', marker_type => 'EST' },
    'Sorghum_gss-read_Klein' =>
        { species => 'Sorghum bicolor', marker_type => 'GSS' },

    # Wheat and barley
    'Wheat_est'  => { species => 'Triticum aestivum', marker_type => 'EST' },
    'Barley_est' => { species => 'Hordeum vulgare',   marker_type => 'EST' },
);

# Record terminator for everything this script writes.
Readonly my $NL => qq{\n};

# ----------------------------------------------------
# Command-line handling.
#
# Options with a sensible default are initialised here; the others stay
# undefined unless supplied on the command line.
# ----------------------------------------------------
my $ms_acc          = '';
my $delete_mappings = 0;    # no option sets this; it only picks the prompt wording
my $ms_species      = '';
my $ms_map_type     = '';
my $map_set_name    = '';
my $no_prompt       = 0;
my $ensembl         = 0;

my (
    $help,          $man_page,           $show_version, $analysis_id,
    $analysis_name, $marker_id_required, $outfile,      $date_created,
);

GetOptions(
    'm|ms-acc=s'         => \$ms_acc,
    'help'               => \$help,
    'man'                => \$man_page,
    'version'            => \$show_version,
    'species:s'          => \$ms_species,
    'map-type:s'         => \$ms_map_type,
    'map-set-name:s'     => \$map_set_name,
    'no-prompt'          => \$no_prompt,
    'ensembl'            => \$ensembl,
    'analysis_id=i'      => \$analysis_id,
    'analysis_name:s'    => \$analysis_name,
    'marker_id_required' => \$marker_id_required,
    'outfile=s'          => \$outfile,
    'date_created=s'     => \$date_created,
) or pod2usage(2);

if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
}

if ( $show_version ) {
    my $prog = basename( $PROGRAM_NAME );
    print "$prog v$VERSION\n";
    exit 0;
}

$outfile ||= "mapping.dat";

# The output is meant for mysqlimport into the "mapping" table, so the
# file's base name must be exactly "mapping", optionally followed by an
# extension.  The pattern is anchored at both ends (so "mapping_old.dat"
# is rejected) and applied to the base name (so "some/dir/mapping.dat" is
# accepted).  A bad name is a usage error and exits non-zero.
if ( basename( $outfile ) !~ /\A mapping (?: [.] \S+ )? \z/xms ) {
    print STDERR "output file name need to be mapping.xxx for importing to mapping table\n";
    pod2usage({
        -exitval => 2,
        -verbose => 1
    });
}

# Everything left on the command line is an input file.
my @files = @ARGV or pod2usage('No input files');

# NOTE(review): @ARGV is emptied presumably so that later interactive
# prompts read from the terminal rather than from the data files -- confirm.
@ARGV = ();

# ----------------------------------------------------
# Resolve the map set accession to a map_set_id, offering to create the
# map set when it does not exist yet, then ask for confirmation.
# ----------------------------------------------------
my $map_set_id;
if ( $ms_acc ) {
    my @map_sets = Gramene::CDBI::Markers::MapSet->search(
        cmap_map_set_accession => $ms_acc
    );

    if ( scalar @map_sets > 1 ) {
        die "More than one map set with that accession?!\n";
    }
    elsif ( scalar @map_sets == 1 ) {
        $map_set_id = $map_sets[0]->map_set_id;
    }
    else {
        # Only ask whether to create the map set when some of its
        # attributes were not already given on the command line.
        if ( !$ms_species || !$ms_map_type || !$map_set_name ) {
            my $create = prompt -yn,
                "Can't find CMap map set accession '$ms_acc'. Create? ";

            if ( !$create ) {
                die "Can't do anything.  Please try again.\n";
            }
        }

        my $species_id   = prompt_for('species', $ms_species);
        my $map_type_id  = prompt_for('map_type', $ms_map_type);
        $map_set_name  ||= prompt 'Map Set Name? ';

        if ( $species_id && $map_type_id && $map_set_name ) {
            my $map_set = Gramene::CDBI::Markers::MapSet->insert({
                species_id             => $species_id,
                map_type_id            => $map_type_id,
                map_set_name           => $map_set_name, 
                cmap_map_set_accession => $ms_acc,
            });
            $map_set_id = $map_set->id;
        }
    }
}
else {
    pod2usage('No map set accession');
}

# Every map created later hangs off this id, so stop here rather than
# continue with an undefined map set (e.g. when no map set name was
# entered and therefore nothing was inserted).
if ( !$map_set_id ) {
    die "No map set id for accession '$ms_acc'; cannot continue.\n";
}

unless ( $no_prompt ) {
    # The accession is passed as a sprintf argument, not interpolated
    # into the format, so a '%' in it cannot corrupt the prompt.
    my $ok = prompt( 
        -yn,
        sprintf( 
            "OK to %s mappings for the markers on map set '%s'? ",
            $delete_mappings ? 'delete' : 'load',
            $ms_acc
        )
    );

    if ( !$ok ) {
        print "Exiting.\n";
        exit 0;
    }
}


# ----------------------------------------------------
# Set up the input parser, the database helper, the output file and the
# analysis that the generated mappings will be attached to.
# ----------------------------------------------------
my $p   = Text::RecordParser::Tab->new;
my $mdb = Gramene::Marker::DB->new;

# Default creation date is today, formatted as YYYYMMDD.
$date_created ||= sprintf( '%4.4i%2.2i%2.2i', Date::Calc::Today() );

# Written unchanged into the "strand" column of every output row.
my $strand = 0;

# Normalise header names: whitespace runs become "_" and the name is
# lower-cased.  Works on a private copy so the global $_ is not clobbered.
$p->header_filter(
    sub {
        my $header = shift;
        $header =~ s/\s+/_/g;
        return lc $header;
    }
);

my %deletes_done;
my ( $num_files, $num_mappings, $num_errors ) = ( 0, 0, 0 );

open my $mapping_fh, '>', $outfile
    or die "Cannot open file $outfile to write: $!";

# Header line; the column order must match the rows written per mapping.
my @mapping_columns = qw(
    marker_id
    map_id
    display_synonym_id
    start
    end
    strand
    marker_start
    marker_end
    marker_strand
    cigar_line
    score
    percent_identity
    analysis_id
    date_created
);
print {$mapping_fh} join( "\t", @mapping_columns ) . $NL;

# Without an explicit --analysis_id, a user-supplied --analysis_name is
# looked up (or created) once and used for the whole load.
if ( !$analysis_id && $analysis_name ) {
    my $analysis = Gramene::CDBI::Markers::Analysis->find_or_create({
        analysis_name => $analysis_name,
        type          => 'ensembl',
    });
    $analysis_id = $analysis->id;
}

# True when the analysis was fixed up front (by id or by name); otherwise
# one is derived per input file.  Declared unconditionally because
# "my $x = ... if COND" has undefined behaviour in Perl.
my $analysis_id_defined = $analysis_id ? 1 : 0;

# ----------------------------------------------------
# Main conversion loop.
#
# For each input file, and for each record in it:
#   1. work out the marker (by marker_id, or by name/species/type search,
#      creating the marker when nothing matches),
#   2. find or create the map on the chosen map set,
#   3. skip mappings that already exist in the database,
#   4. append one tab-delimited row to the output file.
# Problems with a record are reported through complain(), counted in
# $num_errors, and the record is skipped.
# ----------------------------------------------------
FILE:
for my $file ( @files ) {
    print "Processing file '$file'\n";
    $p->filename( $file );
    $p->bind_header;

    # Track name: the file name without a known extension and without
    # the first "final_filtered_" it contains.
    my $file_basename = basename( $file, @FILE_SUFFIXES );
    $file_basename =~ s/final_filtered_//;

    my $file_meta = $FILES_TO_SPECIES_TYPE{ $file_basename };

    if ( !$marker_id_required && $ensembl && !$file_meta ) {
        print STDERR "No meta info for $file_basename in the pre-defined hash, skip\n";
        next FILE;
    }

    my $file_species     = $file_meta->{'species'}     || '';
    my $file_marker_type = $file_meta->{'marker_type'} || '';

    # No analysis fixed on the command line: use one named after the file.
    if ( !$analysis_id_defined ) {
        my $file_analysis = Gramene::CDBI::Markers::Analysis->find_or_create({
            analysis_name => $file_basename,
            type          => 'ensembl',
            description   => 'ensembl_track mappings',
        });
        $analysis_id = $file_analysis->id;
    }

    my $line_num = 0;
    MAPPING:
    while ( my $rec = $p->fetchrow_hashref ) {
        $line_num++;

        # Several column aliases are accepted for each value; the file
        # level species/type are the last fallback.
        my $marker_id      = $rec->{'marker_id'} || 0;
        my $marker_type    = $rec->{'marker_type'}
                          || $rec->{'feature_type'}
                          || $file_marker_type;
        my $marker_species = $rec->{'marker_species'}
                          || $rec->{'feature_species'}
                          || $file_species;
        my $marker_name    = $rec->{'marker_name'}
                          || $rec->{'feature_name'}
                          || $rec->{'hit_name'}
                          || '';

        # sometimes extra header lines are embedded in the mapping file
        next MAPPING if $marker_name =~ /^hit_name$/i;

        my $map_name       = $rec->{'map_name'}
                          || $rec->{'chr'}
                          || $rec->{'chr_with_offset'}
                          || '';

        # Normalise e.g. "chr_06" to "Chr. 6": the first run of digits,
        # without leading zeros.
        if ( $map_name =~ /0*(\d+)/ ) {
            $map_name = "Chr. $1";
        }

        my $start          = $rec->{'marker_start'}
                          || $rec->{'feature_start'}
                          || $rec->{'chr_start'}
                          || 0;
        my $end            = $rec->{'marker_end'}
                          || $rec->{'feature_stop'}
                          || $rec->{'chr_stop'}
                          || 0;
        my $map_acc        = $rec->{'cmap_map_accession'}
                          || $rec->{'cmap_map_acc'}
                          || $rec->{'map_acc'};

        my $marker_start     = $rec->{'hit_start'}  || 0;
        my $marker_end       = $rec->{'hit_stop'}   || 0;
        my $marker_strand    = $rec->{'strand'}     || 0;
        my $cigar_line       = $rec->{'cigar_line'} || '';
        my $score            = $rec->{'score'}      || '';
        my $percent_identity = $rec->{'perc_id'}    || '';

        if ( !$marker_id && !$marker_name ) {
            complain("No marker name or marker id, $file line $line_num");
            $num_errors++;
            next MAPPING;
        }

        if ( !$marker_id && $marker_id_required ) {
            complain("No marker id (marker_id required), $file line $line_num");
            $num_errors++;
            next MAPPING;
        }

        if ( !$marker_id ) {

            # types can be tricky, so first search without the type and,
            # only if that is ambiguous, search again with it
            my @markers;
            for my $candidate_type ( '', $marker_type ) {
                my $type = $candidate_type eq 'Undefined'
                         ? ''
                         : $candidate_type;

                @markers = $mdb->marker_search(
                    marker_name => $marker_name,
                    species     => $marker_species,
                    marker_type => $type,
                );

                last if scalar @markers <= 1;
            }

            my $num_markers = scalar @markers;

            if ( $num_markers > 1 ) {
                complain(
                    "Too many markers ($num_markers) match $marker_name, ",
                    "$file line $line_num"
                );
                $num_errors++;
                next MAPPING;
            }
            elsif ( $num_markers == 0 ) {
                if ( $marker_name && $marker_type && $marker_species ) {
                    $marker_id = $mdb->create_marker(
                        marker_name => $marker_name,
                        marker_type => $marker_type,
                        species     => $marker_species,
                    );
                }
                else {
                    complain(
                        "$file, line $line_num: ",
                        "Can't create new marker without name, type, and species"
                    );
                    $num_errors++;
                    next MAPPING;
                }
            }
            else {
                $marker_id = $markers[0]->{'marker_id'};
            }
        }

        if ( !$marker_id ) {
            # Reported through complain() like every other counted error.
            complain("$file, line $line_num: can't figure out marker id");
            $num_errors++;
            next MAPPING;
        }

        my $marker = Gramene::CDBI::Markers::Marker->retrieve( $marker_id );
        if ( !$marker ) {
            complain("$file, line $line_num: Marker id '$marker_id' not found!");
            $num_errors++;
            next MAPPING;
        }

        # A species mismatch is reported but does not stop the mapping;
        # a marker stored with species "UNKNOWN" takes the file's species.
        if (
            $marker_species ne ''
            && $marker->source_species->to_string ne $marker_species
        ) {
            complain(
                sprintf(
                    '%s (%s) species "%s" not the same as file "%s"',
                    $marker->display_synonym->to_string,
                    $marker->id,
                    $marker->source_species->to_string,
                    $marker_species
                )
            );

            if ( $marker->source_species->to_string eq 'UNKNOWN' ) {
                $mdb->update_Marker(
                    marker_id => $marker_id,
                    species   => $marker_species
                );
            }
        }

        my $map = Gramene::CDBI::Markers::Map->find_or_create({
            map_set_id => $map_set_id,
            map_name   => $map_name,
        });

        # cmap map accession, usually not present in the input file,
        # so this block is usually not executed
        if ( $map_acc ) {
            $map->cmap_map_accession( $map_acc );
            $map->update;
        }

        # An analysis named in the record overrides the file/command-line
        # analysis for THIS record only.  It is kept in its own variable
        # so it cannot leak into later records or later files.
        my $rec_analysis_id = $analysis_id;
        if ( my $rec_analysis_name = $rec->{'analysis'} ) {
            my $rec_analysis
                = Gramene::CDBI::Markers::Analysis->find_or_create({
                    analysis_name => $rec_analysis_name,
                });
            $rec_analysis_id = $rec_analysis->id;
        }

        my $map_id                    = $map->id;
        my $marker_display_synonym_id = $marker->display_synonym_id;

        # Do not emit a row for a mapping that is already in the database.
        my @mappings_found = Gramene::CDBI::Markers::Mapping->search(
            marker_id     => $marker_id,
            map_id        => $map_id,
            start         => $start,
            end           => $end,
            marker_start  => $marker_start,
            marker_end    => $marker_end,
            marker_strand => $marker_strand,
        );
        if ( @mappings_found ) {
            print "Skip: marker_id$marker_id already exist ($map_id, $start, $end, $marker_start, $marker_end, $marker_strand)\n";
            next MAPPING;
        }

        # Column order matches the header line written to the output file.
        my @mapping_record = (
            $marker_id,
            $map_id,
            $marker_display_synonym_id,
            $start,
            $end,
            $strand,
            $marker_start,
            $marker_end,
            $marker_strand,
            $cigar_line,
            $score,
            $percent_identity,
            $rec_analysis_id,
            $date_created,
        );

        print {$mapping_fh} join( "\t", @mapping_record ) . $NL;
        ++$num_mappings;
    }

    $num_files++;
}

# Close the output explicitly: buffered write errors (full disk, quota)
# only surface at close time and must not be reported as success.
close $mapping_fh or die "Cannot close file $outfile: $!";

# Final summary of what was processed.
print "Done, processed $num_files files, $num_mappings mappings ",
    "with $num_errors errors.\n"; 

# Report a problem on STDERR.
#
# Args:    the message, as one or more string pieces; they are written
#          back to back and followed by a single newline.
# Returns: the result of print.
sub complain {
    my @message_parts = @_;

    return print {*STDERR} @message_parts, "\n";
}

# Turn a name into the id of the matching database object, asking the
# user again and again until exactly one object matches.
#
# Args:    $object_type - e.g. 'species' or 'map_type'; it is both the
#                         column searched and, CamelCased, the class
#                         under Gramene::CDBI::Markers that is searched
#          $value       - optional first value to try before prompting
# Returns: the id of the single match; nothing when $object_type is
#          missing.  Exits the program when the user answers "q".
sub prompt_for {
    my ( $object_type, $value ) = @_;

    return if !$object_type;
    $value ||= '';

    # 'map_type' => 'MapType'
    my $class_suffix = join '', map { ucfirst $_ } split /_/, $object_type;
    my $class        = 'Gramene::CDBI::Markers::' . $class_suffix;

    while (1) {
        if ( !$value ) {
            $value = prompt "$object_type? ";
        }

        if ( $value eq 'q' ) {
            print "Quitting.\n";
            exit 0;
        }

        my @matches     = $class->search( $object_type => $value );
        my $match_count = scalar @matches;

        return $matches[0]->id if $match_count == 1;

        # Zero or several matches: say so and ask again.
        print "Found $match_count for '$value'", $NL,
            'Please be more specific ("q" to quit)', $NL;
        $value = '';
    }
}

__END__

# ----------------------------------------------------

=head1 NAME

convert_mapping_4import.pl - convert mapping data into a file that mysqlimport can load into the markers db

=head1 VERSION

This documentation refers to convert_mapping_4import.pl version $Revision: 1.2 $

=head1 SYNOPSIS

  convert_mapping_4import.pl [options] file1.dat [file2.dat ...]

Required Arguments:

  -m|--ms-acc   CMap map set accession

Options:

  --help        Show brief help and exit
  --man         Show full documentation
  --version     Show version and exit
  --species     species of the map set, used when the map set has to be created
  -m|--ms-acc   CMap accession for this map set, for example 'gt0506'
  --map-type    map set map type, for example Genetic | QTL | mbin | sequence
  --map-set-name map set name, for example "TIGR/IRGSP Assembly v4 (Jan 2006)"
  --no-prompt    do not ask for confirmation before processing the files
  --ensembl      the input mapping files are in the format used for loading into
                 an ensembl database; the analysis name is taken from the file
                 name, and the species and marker type are looked up in a
                 pre-defined hash
  --analysis_id  analysis_id to use; the program will not create any analysis
  --analysis_name analysis name to use; the analysis is found or created
  --marker_id_required marker_id is in the data file, so there is no need to
                 search for or create the marker of a mapping
  --outfile      output file name, must be like mapping.xxx (default mapping.dat)
  --date_created value for the date_created column (default: today as YYYYMMDD)

=head1 DESCRIPTION

This script converts mapping files into a format that mysqlimport can
load in batch into the mapping table of the markers database. The output
file must be named after the table, i.e. mapping.dat.

You must specify the CMap map set accession to load into.  If the map set
cannot be found, you will be prompted to create it.  If the mapping file has no
marker_id, you must supply the species, map type, and map set name before continuing.

The files should be tab-delimited with the first line containing the
field names.  Ideally, the field names will be the same as what's in
the markers database, but many aliases are accepted, e.g., "perc_id"
for "percent_identity" (these aliases taken from BioPipe output).

Each line of the input files must identify the marker to which to link
the mapping.  Ideally, "marker_id" will do this, but some combination
of "marker_name," "marker_species" and "marker_type" may also be used.
The file names may be used to specify the species and marker types of
the markers if they are not specified in the data, e.g., for the TIGR
tracks (please view the %FILES_TO_SPECIES_TYPE hash in the code).
Otherwise, the following fields are required:

=over 4

=item * marker_id

=item * marker_type

=item * marker_species

=item * marker_name (or "hit_name")

=item * map_name (or "chr" or "chr_with_offset")

=item * marker_start (or "chr_start")

=item * marker_end (or "chr_stop")

=back

The following are optional fields:

=over 4

=item * analysis

=item * marker_strand (or "strand")

=item * cigar_line

=item * score

=item * evalue

=item * percent_identity (or "perc_id")

=item * remark (or "comments")

=back

If no markers can be found for the available criteria, then a new
marker will be created.  If more than one marker matches, then an
error will be printed (to STDERR) and the mapping will be skipped.

When searching for a marker when "marker_id" is not present, then at
least "marker_name" must be present.  If it is not, then an error will
be printed and the mapping skipped.  The same will happen if a
"marker_id" is supplied that does not exist in the database.  The
other criteria (marker type and species) are used in various
combinations until just one marker can be found.  If a marker is found
without using the species in the file when one is present and then the
found marker does not have the same species, an error will be printed
that there is a mismatch.  If the marker db has an "UNKNOWN" species
for the marker and a different value is present in the file, then the
marker db will be updated to match the file.

For each record processed, a line will be printed to STDOUT and a
summary will be shown when the script has finished.
  
=head1 SEE ALSO

Gramene::Marker::DB, Gramene::CDBI::Markers.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
