#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: excel2sexp,v 1.3 2006/08/31 18:24:36 kycl4rk Exp $

use strict;
use warnings;
use Cwd;
use English qw( -no_match_vars );
use File::Basename qw( basename fileparse );
use File::Spec::Functions;
use Getopt::Long;
use List::Util qw( max );
use List::MoreUtils qw( none );
use Pod::Usage;
use Readonly;
use Spreadsheet::ParseExcel;

Readonly my $EMPTY_STR => q{};
Readonly my $TAB       => "\t";
Readonly my $VERSION   => sprintf '%d.%02d', 
                          qq$Revision: 1.3 $ =~ /(\d+)\.(\d+)/;

my ( $help, $man_page, $show_version );
GetOptions(
    'help'    => \$help,
    'man'     => \$man_page,
    'version' => \$show_version,
) or pod2usage(2);

if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
}; 

if ( $show_version ) {
    my $prog = basename( $PROGRAM_NAME );
    print "$prog v$VERSION\n";
    exit 0;
}

my @files     = @ARGV or pod2usage('No input files');
my $cwd       = cwd();
my $num_files = 0;

INPUT_FILE:
for my $file ( @files ) {
    unless ( -e $file && -s _ && -r _ ) {
        warn "'$file' doesn't exist, is zero-length or unreadable, skipping.\n";
        next INPUT_FILE;
    }

    my $basename = fileparse( $file, '.xls' );
    my $out_file = catfile( $cwd, normalize($basename) . '.sxp' );

    if ( -e $out_file ) {
        print "'$out_file' exists.  OK to overwrite? [y/N] ";
        chomp( my $overwrite = <STDIN> );
        next INPUT_FILE if $overwrite !~ /^[Yy]/;
    }

    open my $out_fh, '>', $out_file
        or die "Can't write to '$out_file': $!\n";

    my $workbook = Spreadsheet::ParseExcel::Workbook->Parse( $file );

    if ( ref $workbook->{'Worksheet'} ne 'ARRAY' ) {
        warn "'$file' has no worksheets (not an Excel spreadsheet?)\n";
        next INPUT_FILE;
    }

    print {$out_fh} qq[(file\n];

    WORKSHEET:
    for my $ws ( @{ $workbook->{'Worksheet'} } ) {
        my $min_row = $ws->{'MinRow'};
        my $min_col = $ws->{'MinCol'};
        my $max_row = $ws->{'MaxRow'} or next;
        my $max_col = $ws->{'MaxCol'} or next;
        my $ws_name = $ws->{'Name'};

        my @data;
        for my $row_num ( $min_row .. $max_row ) {
            my @row;
            for my $col_num ( $min_col .. $max_col ) {
                my $cell = $ws->{'Cells'}[ $row_num ][ $col_num ];
                push @row, defined $cell ? $cell->Value : $EMPTY_STR;
            }

            if ( @row ) {
                push @data, \@row;
            }
        }

        if ( !@data ) {
            warn "No data in worksheet '$ws_name' in file '$file'\n";
            next WORKSHEET;
        }

        my $object_name = normalize($ws_name);
        my @columns     = map { normalize($_) || () } @{ shift @data };
        my $max         = max map { length $_ } @columns;
        my $pattern     = qq[  (%-${max}s %s)];

        next WORKSHEET unless @columns;

        ROW:
        for my $row ( @data ) {
            next ROW if none { defined($_) } @$row;
            next ROW if join($EMPTY_STR, @$row) eq $EMPTY_STR;

            my @out;
            for my $col_num ( 0..$#columns ) {
                my $value = $row->[ $col_num ];

                if ( defined $value && $value ne $EMPTY_STR ) {
                    $value =~ s/^"|"$//g;      # remove quotes
                    $value =~ s/([()])/\\$1/g; # escape parens
                    $value = qq["$value"] if $value =~ /[^a-zA-Z0-9]/;
                }

                push @out, sprintf $pattern, $columns[ $col_num ], $value;
            }

            print {$out_fh} "($object_name\n", join( "\n", @out ), ")\n";
        }
    }

    print {$out_fh} ")\n";

    print "$file written to $out_file\n";
    $num_files++;
}

printf "Done, processed %s file%s.\n",
    $num_files, $num_files == 1 ? $EMPTY_STR : 's';

sub normalize {
    my $name = shift;
    $name    = lc $name;            # lowercase
    $name    =~ s/\s+/_/g;          # spaces to underscores
    $name    =~ s/[^a-zA-Z0-9_]//g; # rm bad characters
    return $name;
}

__END__

# ----------------------------------------------------
=head1 NAME

excel2sexp - convert an Excel spreadsheet to an S-expression file

=head1 VERSION

This documentation refers to excel2sexp version $Revision: 1.3 $

=head1 SYNOPSIS

  excel2sexp 

Options:

  --help        Show brief help and exit
  --man         Show full documentation
  --version     Show version and exit

=head1 DESCRIPTION

For each Excel file given, creates a text version.  Each worksheet is
considered an "object/table" (e.g., "div_taxonomy") and each row in that
worksheet is wrapped in an S-expression by that name.  The entire file will be
rooted in an element of '(file "$filename")'.

=head1 SEE ALSO

Spreadsheet::ParseExcel, Data::Stag.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2006 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
