#!/usr/local/bin/perl

# vim: tw=78: sw=4: ts=4: et: 

# $Id: parse-db-requests.pl,v 1.2 2007/02/26 21:02:23 kclark Exp $

use strict;
use warnings;
use DateTime;
use Digest::MD5 qw( md5_hex );
use English qw( -no_match_vars );
use File::Basename;
use Getopt::Long;
use Gramene::DB;
use Pod::Usage;
use Readonly;
use Time::ParseDate;

# Program version, derived from the RCS revision keyword embedded below.
Readonly my $VERSION => sprintf(
    '%d.%02d', qq$Revision: 1.2 $ =~ /(\d+)\.(\d+)/
);

# Command-line switches.  A failed parse prints the usage and exits with 2.
my ( $help, $man_page, $show_version );
my $options_ok = GetOptions(
    'help'    => \$help,
    'man'     => \$man_page,
    'version' => \$show_version,
);
pod2usage(2) if !$options_ok;

# --man asks for verbosity 2, --help for verbosity 1; both exit with 0.
# --man wins when both switches are given.
pod2usage( -exitval => 0, -verbose => 2 ) if $man_page;
pod2usage( -exitval => 0, -verbose => 1 ) if $help;

# --version: print "<program name> v<version>" and stop.
if ( $show_version ) {
    printf "%s v%s\n", basename( $PROGRAM_NAME ), $VERSION;
    exit 0;
}

# At least one input file must be named on the command line.
pod2usage('No input files') unless @ARGV;

# URL path prefixes of the requests worth recording.  Each prefix is escaped
# with quotemeta and the results are joined into one regex alternation, which
# the request pattern further down interpolates.
my @url_prefixes = qw(
    /db
    /Oryza_sativa
    /Oryza_rufipogon
    /Zea_mays
    /Arabidopsis_thaliana
    /organelles
);
my $check = join '|', map { quotemeta $_ } @url_prefixes;

# Handle used for the lookups and inserts against the "request" table.
my $db = Gramene::DB->new('requests');

# Turn on autoflush for the currently selected output handle.
$OUTPUT_AUTOFLUSH = 1;

# Scan every log file named on the command line.  A line is of interest when
# it records a GET request with a query string whose path begins with one of
# the prefixes in $check.  Each such line is keyed by the MD5 digest of its
# full text and inserted into the "request" table unless that key is already
# present, so re-running the script over the same file adds nothing new.
my $last_symbol = '|';    # spinner glyph shown in the progress line
my ( $num_files, $num_inspected, $num_inserted ) = ( 0, 0, 0 );
for my $file ( @ARGV ) {
    open my $fh, '<', $file or die "Can't read '$file': $!\n";

    while ( my $line = <$fh> ) {
        # NB: the pattern is delimited by commas and uses /x, so comments
        # inside it must not contain a comma or a variable sigil.
        if ( $line =~ 
            m,
            ^([^\s]+)    # host
            \s+          # space
            -            # dash
            \s+          # space
            ([^ ]+)      # something not a space
            \s+          # space
            \[           # left square bracket
            (.*?)        # date b/w brackets
            \]           # right square bracket
            \s+          # space
            ["]          # double quote
            GET\s+       # GET string
            (            # capturing paren
            (?:${check}) # non-captured grouping of checked URL prefixes
            [^?]*?       # not question mark (script name) (non-greedy)
            )            # end capturing paren
            \?           # literal question mark
            (.+?)        # capture args (non-greedy - stop at the first HTTP)
            \s+HTTP
            ,xms
        ) {
            my ( $host, $user, $date, $script, $args ) = ( $1, $2, $3, $4, $5 );

            # Static assets are not recorded.  Note that this "next" also
            # bypasses the $num_inspected increment at the bottom of the loop.
            next if $script =~ /\.(css|html)$/;

            # The digest of the whole line is the de-duplication key.
            my $md5 = md5_hex( $line );
            my $exists = $db->selectrow_array(
                q[
                    select count(*)
                    from   request
                    where  request_md5=?
                ],
                {},
                ( $md5 )
            );

            # Refresh the progress line, alternating the spinner glyph.  The
            # trailing "\r" makes the next refresh overwrite this one.
            if ( $num_inspected % 10 == 0 ) {
                print STDERR "Parsing '$file' $last_symbol [$num_inspected]\r";

                if ( $last_symbol eq '|'  ) {
                    $last_symbol = '-';
                }
                else {
                    $last_symbol = '|';
                }
            }

            if ( !$exists ) {
                my $epoch = parsedate( $date );

                if ( defined $epoch ) {
                    # NOTE(review): no time_zone is passed to from_epoch, so
                    # the stored timestamp is presumably UTC rather than the
                    # log's local time -- confirm this is intended.
                    my $dt = DateTime->from_epoch( epoch => $epoch );

                    $db->do(
                        q[
                            insert
                            into   request 
                                   (request_md5, host, date_requested, 
                                    script, arguments)
                            values (?, ?, ?, ?, ?)
                        ],
                        {},
                        ($md5, $host, $dt->strftime('%Y-%m-%d %H:%M:%S'), 
                         $script, $args)
                    );

                    $num_inserted++;
                }
                else {
                    # An unparseable date would otherwise reach from_epoch as
                    # undef and abort the whole run; skip just this line.
                    # The leading newline keeps the message off the
                    # "\r"-terminated progress line.
                    warn "\nCan't parse date '$date' in '$file' "
                       . "line $INPUT_LINE_NUMBER, skipping\n";
                }
            }
        }

        $num_inspected++;
    }

    close $fh;

    # Terminate the progress line on the same stream it was written to.
    print STDERR "\n";
    $num_files++;
}

# Report the final tallies on STDERR.  Every count is followed by its own
# plural suffix, so the format carries one "%s" per argument (six in all).
printf STDERR 
    "Done, examined %s record%s in %s file%s, inserted %s record%s.\n",
    $num_inspected,
    $num_inspected == 1 ? '' : 's',
    $num_files,
    $num_files == 1 ? '' : 's',
    $num_inserted,
    $num_inserted == 1 ? '' : 's',
;

__END__

# ----------------------------------------------------

=pod

=head1 NAME

parse-db-requests.pl - parse GET requests out of log files and store them in the "requests" database

=head1 VERSION

This documentation refers to version $Revision: 1.2 $

=head1 SYNOPSIS

  parse-db-requests.pl [options] logfile [logfile ...]

Options:

  --help        Show brief help and exit
  --man         Show full documentation
  --version     Show version and exit

=head1 DESCRIPTION

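Reads each log file named on the command line and looks for lines that
record a GET request with a query string whose path begins with one of a
fixed set of URL prefixes (/db, /Oryza_sativa, /Oryza_rufipogon, /Zea_mays,
/Arabidopsis_thaliana, /organelles).  Requests for paths ending in ".css"
or ".html" are ignored.

Each matching line is identified by the MD5 digest of the entire line.  If
no row with that digest exists in the "request" table, a new row is
inserted holding the digest, the requesting host, the request date, the
script path and the query arguments.  Lines already recorded are left
alone, so the same log file can safely be processed more than once.

A progress indicator and a final summary of the number of records
examined and inserted are written to STDERR.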

=head1 SEE ALSO

perl.

=head1 AUTHOR

Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.

=head1 COPYRIGHT

Copyright (c) 2007 Cold Spring Harbor Laboratory

This library is free software;  you can redistribute it and/or modify 
it under the same terms as Perl itself.

=cut
