#!/usr/local/bin/perl

use strict;
use Data::Dumper;
use File::Basename;
use File::Spec::Functions;
use Pod::Usage;
use Getopt::Long;
use Test::More;

my ( $help, $test, $skip, $fielded, $out_dir );
GetOptions( 
    'd|out-dir:s' => \$out_dir,
    'h|help'      => \$help,
    't|test'      => \$test,
    's|skip:s'    => \$skip,
	'f|fielded'   => \$fielded,
);
pod2usage if $help;

my %skip = map { $_ =~ s/^\s+|\s+$//g; $_, 1 } split (/,/, $skip);

if ( $test ) {
    verify_parsing( parse_file( *DATA ) );
}
else {
    if ( !@ARGV ) {
        pod2usage( -message => 'No input file' );
    }
    elsif ( @ARGV > 1 && !$out_dir ) {
        pod2usage( 
            -message => 'Please specify outdir with multiple FPC files'
        );
    }
}

for my $file ( @ARGV ) {
    my $basename = basename( $file );
    print STDERR "Processing $basename\n";

    open my $fh, "<$file" or die "Can't read '$file': $!\n";

    my $data = parse_file( $fh );

    output( 
        data         => $data, 
        is_fielded   => $fielded, 
        in_file_name => $basename, 
        out_dir      => $out_dir 
    );
}

# ----------------------------------------------------
sub parse_file {
    my $fh = shift or die 'No filehandle to read';
    my ( $clone, $ctg, $left, $right,$bottom_ctg, @markers, @clones, %contigs);
    while ( <$fh> ) {
        chomp;
        $clone          = $1         if m/^(?:Clone|BAC) : \"(.+)\"/;
        ( $ctg, $left ) = ( $1, $2 ) if m/^Map \"(.+)\" Ends Left (.+)\.0/;
        $right          = $1         if m/^Map \".+\" Ends Right (.+)\.0/;

        #
        # "Positive_(*)" means a marker.
        #
        if ( m/^Positive_(STS|OVERGO|RFLP|eMRK|Probe|Probe_weak) \"(.+)\"/ ) {
            my $type   = $1;
            my $marker = $2;

            #
            # Delete all suffixes after marker names, except for maize CL
            # clusters and "Probe*" (Klein sorghum).
            #
            $marker =~ s/(?:[\.\_+\-\W].*$\)// 
                unless ( $type eq 'OVERGO' && $marker =~ m/^CL/ ) ||
                $type =~ /^Probe/;

            $marker =~ s/[A-JX]$//;
            $marker =  '' if defined $skip{ $marker };
            push @markers, $marker if $marker;
        }

        #
        # "Modified_date" is the last field in a record.
        #
        if ( m/^Modified_date|^$|^\n/ ) {
            #
            # An exclamation point at the beginning of a clone name
            # denotes a deletion.
            #
            if ( $clone &&  $clone !~ m/^!/ ) {
                push @clones, {
                    name    => $clone,
                    contig  => $ctg, 
                    left    => $left, 
                    right   => $right, 
                    markers => [ @markers ],
                };
                ( $clone, $ctg, $left, $right ) = ( '', '', '', '' );
                @markers = ();
            }
        }

        #
        # This parses the latter part of the file.
        # If there's a "Chr_remark", it refers to
        # the contig on the line above
        #
        if ( m/^(Ctg\d+)/ ) {
            $bottom_ctg = lc $1;
        }

        if ( m/^Chr_remark\s+\"Chr(\d+)/ ) {
            $contigs{ $bottom_ctg } = $1;
        }
        elsif ( m/^Chr_remark\s+\"Lg(\w)/ ) {
            $contigs{ $bottom_ctg } = $1;
        }
    }
    close $fh;

    #
    # Sort out the linkage group for each contig (if defined).
    #
    for my $clone ( @clones ) {
        $clone->{'linkage_group'} = $contigs{ $clone->{'contig'} } || '';
    }

    return \@clones;
}

# ----------------------------------------------------
sub output {
    my %args         = @_;
    my $clones       = $args{'data'};
	my $is_fielded   = $args{'is_fielded'};
    my $in_file_name = $args{'in_file_name'};
    my $out_dir      = $args{'out_dir'};

    my $out_fh;
    if ( $out_dir ) {
        my $out_file = catfile( $out_dir, $in_file_name . '.parsed' );
        open $out_fh, '>', $out_file or die "Can't write $out_file: $!\n";
    }
    else {
        $out_fh = *STDOUT;
    }

	print join( "\t", qw[
		bac_name contig_name start end linkage_group markers
	] ), "\n" if $fielded;

    my $field_join_token = $is_fielded ? ',' : "\t";
    for my $clone ( @$clones ) {
        print $out_fh join( "\t", 
            $clone->{'name'}, 
            $clone->{'contig'}, 
            $clone->{'left'}, 
            $clone->{'right'}, 
            $clone->{'linkage_group'},
            join( $field_join_token, @{ $clone->{'markers'} })
        ), "\n";
    }
}

# ----------------------------------------------------
sub verify_parsing {
    plan( tests => 26 );

    my $clones = shift;
    ok( @$clones, 'Data' );
    cmp_ok( scalar @$clones, '==', 3, 'Three clones present' );
    my $no_bad_names = grep { $_->{'name'} =~ m/^!/ } @$clones;
    cmp_ok( $no_bad_names, '==', 0, 'Names OK' );

    my $clone1 = shift @$clones;
    is( $clone1->{'name'}, 'AB026295sd1', 'Clone name is "AB026295sd1"' );
    is( $clone1->{'contig'}, 'ctg131', 'Contig is "ctg131"' );
    cmp_ok( $clone1->{'left'}, '==', 60, 'Contig left is "60"' );
    cmp_ok( $clone1->{'right'}, '==', 98, 'Contig right is "98"' );
    cmp_ok( $clone1->{'linkage_group'}, '==', 6, 'Linkage group is "6"' );

    my @markers = @{ $clone1->{'markers'} || [] };
    is( scalar @markers, 4, 'Clone has 4 markers' );
    is( $markers[0], 'E10921', 'First marker is "E10921"' );
    is( $markers[1], 'G30', 'Second marker is "G30"' );
    is( $markers[2], 'R2634', 'Third marker is "R2634"' );
    is( $markers[3], 'RM575', 'Fourth marker is "RM575"' );

    my $clone2 = shift @$clones;
    is( $clone2->{'name'}, 'AY013245sd1', 'Clone name is "AY013245sd1"' );
    is( $clone2->{'contig'}, 'ctg384', 'Contig is "ctg384"' );
    cmp_ok( $clone2->{'left'}, '==', 3, 'Contig left is "3"' );
    cmp_ok( $clone2->{'right'}, '==', 15, 'Contig right is "15"' );
    ok( !$clone2->{'linkage_group'}, 'Linkage group is undefined' );
    @markers = @{ $clone2->{'markers'} || [] };
    cmp_ok( scalar @markers, '==', 1, 'Clone has 1 marker' );
    is( $markers[0], 'R2628', 'Marker is "R2628"' );

    my $clone3 = shift @$clones;
    is( $clone3->{'name'}, 'AB076666sd1', 'Clone name is "AB076666sd1"' );
    ok( !$clone3->{'contig'}, 'Contig is undefined' );
    ok( !$clone3->{'left'}, 'Contig left is undefined' );
    ok( !$clone3->{'right'}, 'Contig right is undefined' );
    ok( !$clone3->{'linkage_group'}, 'Linkage group is undefined' );
    is( scalar @{ $clone3->{'markers'} || [] }, 0, 'Clone has 0 markers' );
}

# ----------------------------------------------------

=pod

=head1 NAME 

fpc_parser.pl - parse FPC files from AGI

=head1 SYNOPSIS

  fpc_parser.pl [options] input.fpc [input2.fpc ...]

=head1 OPTIONS

  -d|--out-dir=X       Directory to write output
  -s|--skip=foo[,bar]  Skip markers with names in comma-separated list
  -h|--help            Print usage
  -t|--test            Run embedded tests
  -f|--fielded         Created fielded output (header row, markers 
                       joined on commas)

=head1 DESCRIPTION

Parses an FPC_rice file from AGI (nee CUGI) to extract clone, contig,
start, stop, markers and linkage group information.  The markers are
stripped of their alphabetical suffixes "A-J,X".  Also, the endings that
follow "_-.+-" delimiters are chopped off.  

If you use the "-d" "output directory" option, the parsed file will be
placed into a file ending in ".parsed" in that directory.  This is
useful if you are parsing many input files.

Here's a input sample record:

  Clone : "AC023240sd1"
  Map "ctg220" Ends Left 2.000
  Map "ctg220" Ends Right 30.000 Oldctg 0
  Gel_number    20010830
  Bands  1949531 29
  Gel_number    AC023240sd
  Bands  1827803 29
  Remark "a0051D19, Chr10 - Buell"
  Remark "eMRK by Gramene (G291)"
  Remark "electronic by Gramene (R1877)"
  Shotgun Full_X FINISHED
  Positive_eMRK "G291" New
  Positive_STS "R1877" New
  Creation_date 101 4 27 14 7 
  Modified_date 102 10 9 10 41 

Each record represents a clone which I<should> have "left" and "right"
("start" and "stop") coordinates on a particular contig.  An
exclamation point at the beginning of a clone name denotes a deletion,
so the clone will be ignored.  The markers on the clone are denoted by
one of "Positive_STS," "Postive_eMRK," "Positive_OVERGO," or
"Positive_RFLP."  The linkage group information is at the bottom of
the file and looks like this:

  Ctg325 21/3/103 14:27 Ok  -1 #
  Chr_remark "Chr12 [2 Seq2]"
  Trace_remark "Add 2. "

As the linkage group is tied to the contig and not the clone, it is
necessary to match up the clone to the linkage group via the contig at
the end of the process.

The output of this program is a tab-delimited file containing the
following columns (though not labelled):

  clone_name
  contig
  left
  right
  linkage_group
  markers

Here's a sample output from the above:

  AC023240sd1     ctg220  2       30      10    G291    R1877 

With the "fielded" option, the field names will be printed on the first
line and the "markers" will be joined on commas:

  clone_name	contig	left	right	linkage_group	markers
  AC023240sd1	ctg220	2	30	10	G291,R1877 

where the "markers" field is simply a tab-delimited listing of the
marker names on the clone.

The next step is to run this output through fpc_converter.pl to get
the CMap import format.

=head1 CHANGE LOG

=over 4

=item * 

2002-12-10:  Received original version of script from Lenny.

=item *

2003-01-29:  Altered to extract linkage group, added docs.

=item *

2003-01-30:  Added command-line options and test suite.

=item *

2003-04-14:  Don't remove value in parentheses from feature name for 
"Probe" markers (Klein sorghum).

=item *

2003-04-30  File's format has changed in regard to linkage group 
information. (Lenny)

=item *

2003-06-27:  Added "skip" option. (Ken)

=item *

2004-04-09:  Added "fielded" output option. (Ken)

=item *

2004-04-13:  Added code to catch assignment of contig to a linkage
group (e.g., "LgB") in addition to a chromosome ("Chr1").

=back

=cut


=head1 AUTHORS

Ken Youens-Clark E<lt>kclark@cshl.orgE<gt>,
Lenny Teytelman E<lt>teytelma@cshl.orgE<gt>

=cut

__DATA__

// Build 3/10/1 11:43 Cut 1e-12 Off 50 0 0 0 TBL 0 0e+00 0 0e+00 0 0e+00
// Clip(600 2000) MinMax(0 32767) AutoRemark

Clone : "!0J1057_B10"
Gel_number    6002A1
Bands  1815240 18
Creation_date 100 12 14 12 39
Modified_date 101 3 19 15 18

Clone : "AB026295sd1"
Map "ctg131" Ends Left 60.000
Map "ctg131" Ends Right 98.000 Oldctg 0
Gel_number    20020525
Bands  2092328 39
Gel_number    20020322
Bands  2062361 39
Gel_number    AB026295sd
Bands  1827294 39
Remark "P0681F10, Chr6 - Sasaki,T."
Remark "P0681F10, Chr6 - Sasaki,T., Nagamura"
Remark "eMRK by Gramene (E10921)"
Remark "eMRK by Gramene (G30)"
Remark "eMRK by Gramene (R2634)"
Shotgun Full_X FINISHED
Positive_STS "E10921" New
Positive_RFLP "G30" New
Positive_eMRK "R2634" New
Positive_OVERGO "RM575" New
Creation_date 101 4 27 14 6
Modified_date 102 10 9 10 41

Clone : "AY013245sd1"
Map "ctg384" Ends Left 3.000
Map "ctg384" Ends Right 15.000 Oldctg 0
Gel_number    20020508
Bands  2084622 12
Gel_number    20020220
Bands  2027306 10
Gel_number    AY013245sd
Bands  1846181 10
Remark "BAC 36I5, Chr3 - Dubcovsky,J."
Remark "eMRK by Gramene (R2628)"
Shotgun Full_X FINISHED
Positive_eMRK "R2628" New
Creation_date 101 4 27 14 10
Modified_date 102 10 9 10 41

BAC : "AB076666sd1"
Gel_number    20030128
Bands  2186098 44
Gel_number    20021224
Bands  2181203 44
Remark " - Kadowaki"
Shotgun Full_X FINISHED
Creation_date 102 12 24 3 37
Modified_date 103 1 28 3 34

Ctg131 21/3/103 14:47 Ok  -1 #
Chr_remark "Chr6  [4 Fw4 Seq3]"
Trace_remark "Add 3. "
Ctg384 26/4/103 3:43 Ok  -1 #
Chr_remark "+     [ ch2-1 ch3-5 ch9-1 Fw2 Seq5]"
Trace_remark "Add 1. Add 4. "

