#!/usr/local/bin/perl

use strict;
use Getopt::Long;
use Pod::Usage;
use Test::More;
use Readonly;

Readonly my $DASH      =>  q{-};
Readonly my $EMPTY_STR =>  q{};
Readonly my $TAB       => qq{\t};
Readonly my $NL        => qq{\n};

my $feature_prefix = '';
my $clones_only    = 0;
my $add_field      = '';
my ( $help, $test, $skip, $fielded_input, $minimum_bac_hits, $show_bac_hits,
    $show_all_clones, $ms_acc );
GetOptions( 
    'h|help'           => \$help,
    't|test'           => \$test,
    's|skip:s'         => \$skip,
    'f|fielded-input'  => \$fielded_input,
	'b|bac-hits:i'     => \$minimum_bac_hits,
    'm|ms-acc:s'       => \$ms_acc,
    'feature-prefix:s' => \$feature_prefix,
    'clones-only'      => \$clones_only,
    'add-field:s'      => \$add_field,
    'show-bac-hits'    => \$show_bac_hits,
    'show-all-clones'  => \$show_all_clones,
);
pod2usage if $help;

if ( !defined $show_all_clones ) {
    $show_all_clones = 1;
}

my %skip = map { $_ =~ s/^\s+|\s+$//g; $_, 1 } split (/,/, $skip);

my $fh;
if ( $test ) {
	require IO::Scalar;
    my $data = $fielded_input ? fielded_data() : normal_data();
    $fh = IO::Scalar->new( \$data );
}
else {
    my $file = shift or die pod2usage( -message => 'No input file' );
    open $fh, "<$file" or die "Can't read '$file': $!\n";
}

my $data = parse_file( $fh, $fielded_input );

if ( $test ) {
    verify_parsing( $data );
}
else {
    output( $data );
}

# ----------------------------------------------------
sub parse_file {
    #
    # Parse the file into contig, marker and clone data.
    #
    my $fh = shift or die "No filehandle to read from.\n";
    my $fielded_input = shift || 0;
    my %contigs;

    if ( $fielded_input ) {
        my $header = <$fh>; # throw away, assume correct order
    }

    while ( my $line = <$fh> ) {
        chomp $line;
        my ( $clone, $contig_name, $start, $end, $linkage_group, $markers ) =
            map{ s/"//g; $_ } split( /\t/, $line, 6 );

        next unless $contig_name;
        my $contig = $contigs{ $contig_name } || 
                     ImportContig->new( $contig_name );
        $contig->start( $start );
        $contig->end( $end );
        $contig->linkage_group( $linkage_group );

        #
        # Save raw clone info for sequenced clones.
        #
        my $is_sequenced = 0;
        if ( $clone =~ m/^(.+)sd\d+$/ ) {
            $clone = $1;
            $is_sequenced = 1;
        }

        $contig->add_bac( 
            clone_name   => $clone,
            start        => $start,
            end          => $end,
            is_sequenced => $is_sequenced,
        );

        #
        # Look up the info for each marker or create a new object.
        #
        my $split_token = $fielded_input ? ',' : "\t";
        for my $marker_name ( split( /$split_token/, $markers ) ) {
            next if defined $skip{ $marker_name };

            my $marker = $contig->{'markers'}{ $marker_name } ||
                Marker->new( $marker_name );

            $marker->marker_name( $marker_name );
            $marker->contig( $contig_name );
            $marker->start( $start );
            $marker->end( $end );
            $marker->inc_bac_hits();

            $contig->{'markers'}{ $marker_name } = $marker;
        }

        $contigs{ $contig_name } = $contig;
    }
    close $fh;

    return \%contigs;
}

# ----------------------------------------------------
#
# Print column headers and parsed data in output format.
#
sub output {
    my $contigs = shift or die "No data to output.\n";
    my @headers = qw[ map_name map_start map_stop feature_name feature_start 
        feature_stop feature_type ];
    push @headers, 'bac_hits' if $show_bac_hits;
    push @headers, qw[ map_acc feature_acc ] if $ms_acc;

    my ($extra_field_name, $extra_field_value) = split /=/, $add_field;
    if ( $extra_field_name && $extra_field_value) {
        $extra_field_value =~ s/^'|'$//g;
        push @headers, $extra_field_name;
    }

    print join("\t", @headers), "\n";

    my %last_number_for_map;
    for my $contig_name ( sort keys %$contigs ) {
        my $contig        = $contigs->{ $contig_name } or next;
        my $contig_start  = defined $contig->start ? $contig->start : 1;
        my $contig_end    = $contig->end;
        my $linkage_group = $contig->linkage_group;
        ( my $map_acc     = join($DASH, $ms_acc, $contig_name) ) =~ s/\s+//g;

        for my $marker ( values %{ $contig->{'markers'} } ) {
            next if $clones_only;
            my $marker_name = $marker->marker_name;
            my $start       = $marker->start || 1;
            my $end         = $marker->end;
#            my $position    = ( $start + $end ) / 2;
            my $bac_hits    = $marker->bac_hits;

			next if $minimum_bac_hits && $bac_hits < $minimum_bac_hits;

            my $feature_acc = join( $DASH, 
                $map_acc, ++$last_number_for_map{ $map_acc }
            );

            my %rec = (
                map_acc       => $map_acc,
                map_name      => $contig_name,
                map_start     => $contig_start,
                map_stop      => $contig_end,
                feature_acc   => $feature_acc,
                feature_name  => join($EMPTY_STR, 
                                 $feature_prefix, $marker->marker_name),
                feature_start => $marker->start || 1,
                feature_stop  => $marker->end,
                feature_type  => 'Marker',
                bac_hits      => $marker->bac_hits,
            );

            if ( $extra_field_name ) {
                $rec{ $extra_field_name } = $extra_field_value;
            }

            print join($TAB, map { $rec{ $_ } } @headers), $NL;
        }

        for my $clone ( 
            $contig->get_bacs( sequenced => $show_all_clones ? 0 : 1 ) 
        ) {
            my $feature_acc = join( $DASH, 
                $map_acc, ++$last_number_for_map{ $map_acc }
            );

            my %clone = (
                map_acc       => $map_acc,
                map_name      => $contig_name,
                map_start     => $contig_start,
                map_stop      => $contig_end,
                feature_acc   => $feature_acc,
                feature_name  => join($EMPTY_STR, 
                                 $feature_prefix, $clone->{'clone_name'}),
                feature_start => $clone->{'start'},
                feature_stop  => $clone->{'end'},
                feature_type  => 'Clone'
            );

            if ( $extra_field_name ) {
                $clone{ $extra_field_name } = $extra_field_value;
            }

            print join($TAB, map { $clone{ $_ } } @headers), $NL;
        }
    }
}

# ----------------------------------------------------
sub verify_parsing {
    plan( tests => 59 );

    my $contigs = shift;
	ok( %$contigs, 'Data' );

	#
	# ctg1 tests
	#
	is( scalar keys %$contigs, 4, 'Number of contigs' );
    my $ctg1 = $contigs->{'ctg1'};
    ok( defined $ctg1, 'ctg1 defined' );
    isa_ok( $ctg1, 'ImportContig', 'ctg1 of right class' );
    is( $ctg1->contig_name, 'ctg1', 'contig_name returns "ctg1"' );
    is( $ctg1->start, 4, 'ctg1 starts at band 4' );
    is( $ctg1->end, 352, 'ctg1 ends at band 352' );

	is( $ctg1->linkage_group, 1, 'ctg1 has linkage group 1' );

    my %markers = %{ $ctg1->{'markers'} || {} };
    is( scalar keys %markers, 1, 'ctg1 has 1 marker' );
    my $marker = $markers{'C421'};
    ok( defined $marker, 'Marker C421 defined' );
    isa_ok( $marker, 'Marker', 'Marker is of right class' );
    is( $marker->start, 22, 'C421 starts at 22' );
	is( $marker->end, 52, 'C421 ends at 52' );

	my @clones = @{ $ctg1->{'clones'} || [] };
	is( scalar @clones, 0, 'ctg1 has 0 clones' );

	#
	# ctg2 tests
	#
    my $ctg2 = $contigs->{'ctg2'};
    ok( defined $ctg2, 'ctg2 defined' );
    isa_ok( $ctg2, 'ImportContig', 'ctg2 of right class' );
    is( $ctg2->contig_name, 'ctg2', 'contig_name returns "ctg2"' );
    is( $ctg2->start, 62, 'ctg2 starts at band 62' );
    is( $ctg2->end, 284, 'ctg2 ends at band 284' );

	is( $ctg2->linkage_group, 9, 'ctg2 has linkage group 9' );

    %markers = %{ $ctg2->{'markers'} || {} };
    is( scalar keys %markers, 4, 'ctg2 has 4 markers' );

    my $rz514 = $markers{'RZ514'};
    ok( defined $rz514, 'Marker RZ514 defined' );
    isa_ok( $rz514, 'Marker', 'Marker is of right class' );
    is( $rz514->start, 258, 'RZ514 starts at 258' );
	is( $rz514->end, 233, 'RZ514 ends at 233' );
	is( $rz514->bac_hits, 2, 'RZ514 has two BAC hits' );

    my $c189 = $markers{'C189'};
    ok( defined $c189, 'Marker C189 defined' );
    isa_ok( $c189, 'Marker', 'Marker is of right class' );
    is( $c189->start, 209, 'C189 starts at 209' );
	is( $c189->end, 233, 'C189 ends at 233' );
	is( $c189->bac_hits, 2, 'C189 has two BAC hits' );

	@clones = @{ $ctg2->{'clones'} || [] };
	is( scalar @clones, 0, 'ctg2 has 0 clones' );

	#
	# ctg3 tests
	#
    my $ctg3 = $contigs->{'ctg3'};
    ok( defined $ctg3, 'ctg3 defined' );
    isa_ok( $ctg3, 'ImportContig', 'ctg3 of right class' );
    is( $ctg3->contig_name, 'ctg3', 'contig_name returns "ctg3"' );
    is( $ctg3->start, -9, 'ctg3 starts at band -9' );
    is( $ctg3->end, 160, 'ctg3 ends at band 160' );

	is( $ctg3->linkage_group, 6, 'ctg3 has linkage group 6' );

    %markers = %{ $ctg3->{'markers'} || {} };
    is( scalar keys %markers, 3, 'ctg3 has 3 markers' );

    my $r2634 = $markers{'R2634'};
    ok( defined $r2634, 'Marker R2634 defined' );
    isa_ok( $r2634, 'Marker', 'Marker is of right class' );
    is( $r2634->start, 60, 'R2634 starts at 60' );
	is( $r2634->end, 98, 'R2634 ends at 98' );
	is( $r2634->bac_hits, 1, 'R2634 has one BAC hit' );

	@clones = $ctg3->get_bacs;
	is( scalar @clones, 3, 'ctg3 has 3 BACs' );

	my $clone1 = $clones[0];
    ok( defined $clone1, 'Clone "AB023482sd1" defined' );
	is( $clone1->{'start'}, 124, 'Clone starts at 124' );
	is( $clone1->{'end'}, 160, 'Clone ends at 160' );

	#
	# ctg4 tests
	#
    my $ctg4 = $contigs->{'ctg4'};
    ok( defined $ctg4, 'ctg4 defined' );
    isa_ok( $ctg4, 'ImportContig', 'ctg4 of right class' );
    is( $ctg4->contig_name, 'ctg4', 'contig_name returns "ctg4"' );
    is( $ctg4->start, -6, 'ctg4 starts at band -6' );
    is( $ctg4->end, 408, 'ctg4 ends at band 408' );
	is( $ctg4->linkage_group, 12, 'ctg4 has linkage group 12' );

    %markers = %{ $ctg4->{'markers'} || {} };
    is( scalar keys %markers, 0, 'ctg4 has 0 markers' );

	@clones = $ctg4->get_bacs;
	is( scalar @clones, 4, 'ctg4 has 4 clones' );

	my $clone2 = $clones[1];
    ok( defined $clone2, 'Clone "AB076665sd3" defined' );
	is( $clone2->{'start'}, -6, 'Clone starts at -6' );
	is( $clone2->{'end'}, 44, 'Clone ends at 44' );
}

exit(0);

# ----------------------------------------------------
package ImportContig;
# ----------------------------------------------------
sub new {
    my $class = shift;
    my $self  = { contig_name => shift() };
    return bless $self, $class;
}

# ----------------------------------------------------
sub add_bac {
#
# Add a BAC.
#
    my $self = shift;
    push @{ $self->{'bacs'} }, ref $_[0] eq 'HASH' ? shift : { @_ };
}

# ----------------------------------------------------
sub get_bacs {
#
# Get all BACs (or filter just for those sequenced).
#
    my ( $self, %args ) = @_;
    my @return;
    if ( $args{'sequenced'} ) {
        @return = grep { $_->{'is_sequenced'} } @{ $self->{'bacs'} };
    }
    else {
        @return = @{ $self->{'bacs'} };
    }

    return wantarray ? @return : \@return;
}

# ----------------------------------------------------
sub start {
#
# Take the minimum start
#
    my ( $self, $arg ) = @_;

    if ( defined $arg ) {
        $self->{'start'} = $arg unless defined $self->{'start'};
        $self->{'start'} = $arg if $arg < $self->{'start'};
    }
    return $self->{'start'};
}

# ----------------------------------------------------
sub end {
#
# Take the maximum end
#
    my ( $self, $arg ) = @_;

    if ( defined $arg ) { 
        $self->{'end'} = $arg if $arg > $self->{'end'};
    }
    return $self->{'end'};
}

# ----------------------------------------------------
sub contig_name {
    my ( $self, $arg ) = @_;
    $self->{'contig_name'} = $arg if $arg;
    return $self->{'contig_name'};
}

# ----------------------------------------------------
sub linkage_group {
    my ( $self, $arg ) = @_;
    $self->{'linkage_group'} = $arg if $arg;
    return $self->{'linkage_group'};
}

# ----------------------------------------------------
sub map_id {
    my ( $self, $arg ) = @_;
    $self->{'map_id'} = $arg if $arg;
    return $self->{'map_id'};
}

# ----------------------------------------------------
package Marker;
# ----------------------------------------------------
sub new {
    my $class       = shift;
    my $marker_name = shift;
    my $self        = { marker_name => $marker_name };
    return bless $self, $class;
}

# ----------------------------------------------------
sub start {
#
# Take the maximum start
#
    my ( $self, $arg ) = @_;

    if ( defined $arg ) {
        $self->{'start'} = $arg if $arg > $self->{'start'};
    }
    return $self->{'start'};
}

# ----------------------------------------------------
sub end {
#
# Take the minimum end
#
    my ( $self, $arg ) = @_;

    if ( defined $arg ) { 
        $self->{'end'} = $arg unless defined $self->{'end'};
        $self->{'end'} = $arg if $arg < $self->{'end'};
    }
    return $self->{'end'};
}

# ----------------------------------------------------
sub marker_name {
    my ( $self, $arg ) = @_;
    $self->{'marker_name'} = $arg if $arg;
    return $self->{'marker_name'};
}

# ----------------------------------------------------
sub contig {
    my ( $self, $arg ) = @_;
    $self->{'contig'} = $arg if $arg;
    return $self->{'contig'};
}

# ----------------------------------------------------
sub inc_bac_hits {
    my $self = shift;
    $self->{'bac_hits'}++;
}

# ----------------------------------------------------
sub bac_hits {
    my $self = shift;
    return $self->{'bac_hits'};
}


# ----------------------------------------------------
#
# Data for test suite.
#
package main;

sub normal_data {
    return <<'EOF';
a022A21	ctg1	5	133	1
a0001A01	ctg1	4	173	1	C421
a0001A02	ctg1	327	352	1
a0001A03	ctg1	22	52	1	C421
a0001A04	ctg2	62	95	9
a0001A05	ctg2	258	284	9	RZ514
a0001A06	ctg2	200	233	9	C189	RZ514
a0001A07	ctg2	209	241	9	C189	RZ537	C130
AB023482sd1	ctg3	124	160	6
AB026295sd1	ctg3	60	98	6	E10921	G30	R2634
AB076665sd1	ctg3	-9	47	
AB076665sd2	ctg4	358	408	12
AB076665sd3	ctg4	-6	44	
AB076665sd4	ctg4	-6	44	
AB076665sd5	ctg4	-6	44	
EOF
}

sub fielded_data {
    return <<'EOF';
clone_name	contig_name	start	end	linkage_group	markers
a022A21	ctg1	5	133	1
a0001A01	ctg1	4	173	1	C421
a0001A02	ctg1	327	352	1
a0001A03	ctg1	22	52	1	C421
a0001A04	ctg2	62	95	9
a0001A05	ctg2	258	284	9	RZ514
a0001A06	ctg2	200	233	9	C189,RZ514
a0001A07	ctg2	209	241	9	C189,RZ537,C130
AB023482sd1	ctg3	124	160	6
AB026295sd1	ctg3	60	98	6	E10921,G30,R2634
AB076665sd1	ctg3	-9	47	
AB076665sd2	ctg4	358	408	12
AB076665sd3	ctg4	-6	44	
AB076665sd4	ctg4	-6	44	
AB076665sd5	ctg4	-6	44	
EOF
}

# ----------------------------------------------------

=pod

=head1 NAME

fpc_converter.pl - Turn parsed AGI-FPC data into CMap import format

=head1 SYNOPSIS

  ./fpc_converter.pl [options] parsed-file > cmap-file

=head1 OPTIONS

  -h|--help            Show brief usage statement

  -m|ms-acc=foo        Map set accession, also used to create map acc's
  -s|--skip=foo[,bar]  Skip markers with names in comma-separated list
  -t|--test            Run automated tests
  -f|--fielded-input   Assume input is fielded data
  -b|--bac-hits        Minimum number of BAC hits (default "0")
  --feature-prefix=X   String to prefix feature names (e.g., clone library)
  --add-field=A=B      Add an arbitrary key/value pair to the ouput
                       (e.g., not a CMap field like a feature's species)
  --clones-only        Only print out the clones, not the markers
  --show-bac-hits      Print out the "bac_hits" for each feature
                       (CMap doesn't like this output)
  --show-all-clones    Print all the clones, not just the sequenced ones

=head1 DESCRIPTION

This program will parse a tab-delimited input file with columns:

  clone   
  contig_name    
  start    
  end    
  linkage_group
  markers

Here's some sample input:

  a0001A06	ctg125	200	233
  a0001A07	ctg219	409	441	2	C189	RZ537
  a0001A08	ctg224	192	215	10	30A09r

Alternately, you may specify that the input is "fielded" such that it 
looks like this:

  clone_name	contig	left	right	linkage_group	markers
  AC023240sd1	ctg220	2	30	10	G291,R1877 

The output of this script will be a fielded, tab-delimited file 
(the standard CMap data import format) with the following columns:

  map_name    
  map_start   
  map_stop    
  feature_name    
  feature_start
  feature_stop    
  feature_type    
  bac_hits      [optional]

There will be many clones that make up a contig.  For each clone
record, a "contig" object is created (see contained "ImportContig"
package).  The start and stop bands of the clone are given to the
contig so that it can determine its overall band length by taking the
minimum start and maximum stop bands from all of its composite clones.
All the clones I<should> have the same linkage group;  in the event
that they don't, the last given linkage group sets the linkage group
for the contig.

There are two types of features that will be extracted from each clone
and placed on the contigs:

=head2 MARKERS

The last fields contain a variable number of markers (all data being
tab-delimited).  For each marker name, a "marker" object is created
(see the contained "Marker" class).  A marker's position will be an
average of the I<maximum> start and I<minimum> stop position from the
clones on which they fall (this is opposite from how contigs determine
their length).

=head2 CLONES

Those clones which have been sequenced will be extracted.  They are
identified by a suffix to their name in the form of "sd\d+".  When
found, they will be added to the contig object with their raw start
and stop coordinates.

=head1 CHANGE LOG

=over 4

=item *

2002-12-11:  Written to convert Lenny's older style updated FPC data
into a format acceptable for importing into CMap.  Most of the code
was cribbed from an older script.

=item *

2003-01-29:  Added code to get linkage group information.

=item *

2003-01-30:  Augmented docs and added test suite.

=item *

2003-06-27:  Added "skip" option. (Ken)

=item *

2003-07-01:  Removed "linkage group" from output as CMap no longer
likes this.

=item * 

2004-04-09:  Added "fielded-input" and "bac-hits" options.  The latter
makes it easy to create a curated map where, e.g., you require a marker
to have a minimum of 2 BAC hits to be placed on a contig.

=back

=head1 AUTHOR

Ken Y. Clark E<lt>kclark@cshl.orgE<gt>

=cut
