package LookAlign::Alignment::Container;

our $VERSION = '0.01';

# $Id: Container.pm,v 1.1.2.1 2007/06/14 19:03:07 kclark Exp $

=head1 NAME

LookAlign::Alignment::Container

=head1 DESCRIPTION

Object for represent multiple sequence alignments.

=cut

use warnings;
use strict;

use Carp;
use XML::Simple;

use LookAlign::Alignment::Container::Sq;

###############
# CONSTRUCTOR #
###############

sub new {
    my ($class) = @_;
    my %obj;
    bless \%obj, $class;
}

###########
# METHODS #
###########

sub title {
    my ($self, $value) = @_;
    defined $value and $self->{_title} = $value;
    return $self->{_title};
}

sub info {
    my ($self, $value) = @_;
    defined $value and $self->{_info} = $value;
    return $self->{_info};
}

sub offset {
    my ($self, $value) = @_;
    defined $value and $self->{_offset} = $value;
    return $self->{_offset};
}

sub data_file {
    my ($self, $value) = @_;
    defined $value and $self->{_data_file} = $value;
    return $self->{_data_file};
}

sub load_from_file {
    my ($self, $file) = @_;

    my $doc;

    eval {
        $file or croak("Cannot find file ($file)");
        my $xml = XML::Simple->new();
        $doc = $xml->XMLin(
            $file,
            forcearray => [
                '_global', '_sequence', 'attribute', '_offset', '_title',
                '_info'
            ]
        ) || croak("Cannot parse XML file ($file)");
    } or return;

    # Components
    my @sequences;
    @sequences = @{$doc->{_sequence}} if defined $doc->{_sequence};
    my @globals;
    @globals = @{$doc->{_global}} if defined $doc->{_global};
    my @offset;
    @offset = @{$doc->{_offset}} if defined $doc->{_offset};
    my $offset = shift @offset;
    my @title;
    @title = @{$doc->{_title}} if defined $doc->{_title};
    my $title = shift @title;
    my @info;
    @info = @{$doc->{_info}} if defined $doc->{_info};
    my $info = shift @info;

    # Sequences
    foreach my $s (@sequences) {

        my %attributes;
        foreach my $t (@{$s->{attribute}}) {
            my ($label, $content) = ($t->{label}, $t->{content});
            $attributes{$label} = $content;
        }

        eval {
            $self->add_sequence('label' => $s->{sequence}{label},
                'sequence'   => $s->{sequence}{content},
                'attributes' => \%attributes)
              or croak($@);
        } or return;
    }

    # Globals
    foreach my $g (@globals) {
        my %globals;
        my ($label, $content) = ($g->{label}, $g->{content});

        eval {
            $self->add_global('label' => $g->{label}, 'data' => $g->{content})
              or croak($@);
        } or return;
    }

    # Offset
    $self->offset($offset);

    # Data file
    $self->data_file($file);

    # Title
    $self->title($title);

    # Info
    $self->info($info);

    return 1;
}

sub add_sequence {
    my ($self, %args) = @_;
    eval {
        defined $args{label}
          or croak("A 'label' is required to 'add_sequence'");
    } or return;

    my $order = scalar keys %{$self->{_sequences}};
    $order += scalar keys %{$self->{_globals}};
    $order++;

    my $id = "sequence${order}";

    # When uc is used, the following error is obtained:
    # Can't locate object method "SWASHNEW" via package "utf8" (perhaps you forgot to load "utf8"?) at /usr/local/panzea/cgi-perl/snp_alignment/lib/Alignment/Container.pm line 117.
    # Attempts to identify the problem has been unsuccessfull
    # uc is converted to tr as a practical solution but did not work

    my $uc_sequence = uc $args{sequence};    # $uc_sequence =~ tr/a-z/A-Z/;

    $self->{_sequences}{$id}{sequence} = $uc_sequence;
    $self->{_sequences}{$id}{label}    = $args{label};
    $self->{_sequences}{$id}{order}    = $order;

    $args{attributes}
      and $self->{_sequences}{$id}{attributes} = $args{attributes};

    return 1;
}

sub sequence_ids {
    my ($self) = @_;
    my @ids = keys %{$self->{_sequences}};
    return (@ids);
}

sub valid_sequence_ids {
    my ($self) = @_;
    my @valid_sequence_ids;
    foreach my $sequence_id ($self->sequence_ids) {
        next if ($self->{_sequences}{$sequence_id}{attributes}{'disable'});
        push @valid_sequence_ids, $sequence_id;
    }
    return (@valid_sequence_ids);
}

sub disabled_sequence_ids {
    my ($self) = @_;
    my @disabled_sequence_ids;
    foreach my $sequence_id ($self->sequence_ids) {
        next
          unless ($self->{_sequences}{$sequence_id}{attributes}{'disable'});
        push @disabled_sequence_ids, $sequence_id;
    }
    return (@disabled_sequence_ids);
}

sub sequences {
    my ($self) = @_;
    my @ids = keys %{$self->{_sequences}};

    my @id_objs;
    foreach my $id (@ids) {
        my $id_obj =
             LookAlign::Alignment::Container::Sq->new('alignment' => $self, 'id' => $id)
          or croak($@);
        push(@id_objs, $id_obj);
    }
    return (@id_objs);
}

sub valid_sequences {
    my ($self) = @_;
    my @valid_sequences;
    foreach my $sequence ($self->sequences) {
        next
          if (  $sequence->exists_attribute('disable')
            and $sequence->attribute_value('disable'));
        push @valid_sequences, $sequence;
    }
    return (@valid_sequences);
}

sub disabled_sequences {
    my ($self) = @_;
    my @disabled_sequences;
    foreach my $sequence ($self->sequences) {
        next
          unless ($sequence->exists_attribute('disable')
            and $sequence->attribute_value('disable'));
        push @disabled_sequences, $sequence;
    }
    return (@disabled_sequences);
}

sub add_global {
    my ($self, %args) = @_;
    eval {
        $args{label} or croak("A 'label' is required to 'add_global'");

        #defined $args{data} or croak("A 'data' is required to 'add_global' (label: " . $args{label} . ")");
    } or return;

    my $id = $args{label};

    $self->{_globals}{$id} = $args{data};

    return 1;
}

sub globals {
    my ($self) = @_;
    my @ids = keys %{$self->{_globals}};
    return (@ids);
}

sub exists_global {
    my ($self, $key) = @_;
    defined $self->{_globals}{$key} ? return 1 : return;
}

sub global_value {
    my ($self, $key, $value) = @_;
    if ($value) { $self->{_globals}{$key} = $value }
    exists $self->{_globals}{$key}
      or croak("Global key ($key) does not exist");
    $value = $self->{_globals}{$key};
    return $value;
}

sub global_value_ary {
    my ($self, $key) = @_;
    exists $self->{_globals}{$key}
      or croak("Global key ($key) does not exist");
    my $value = $self->global_value($key);

    my @value = split(',', $value);

    #    if ($value =~ /,/) { @value = split(',', $value) }
    #    else               { @value = split('' , $value) }

    return (@value);
}

sub calculate_consensus {
    my ($self) = @_;

    # get maximum length
    my $max_len = $self->longest_seq_len;

    # calculate consensus
    #    my $consensus_with_gaps;
    my $consensus_no_gaps;

    my @frequency;
    my @pic;
    my @n;

    # Temp sequence ary hash
    my %temp_sequence_ary;
    foreach my $s ($self->valid_sequences) {
        my $id = $s->id;
        @{$temp_sequence_ary{$id}} = $s->sequence_ary;
    }

    for (my $i = 0 ; $i < $max_len ; $i++) {
        my %bases;
        foreach my $id ($self->valid_sequence_ids) {
            my $b =
              $temp_sequence_ary{$id}[$i]
              ; # This bypasses the regular object methods but is used for speed
            $bases{$b}++ if $b;
        }

        #        if (scalar(keys %bases) > 1) { $consensus_with_gaps .= '*' }
        #        else { my ($j) = keys %bases; $consensus_with_gaps .= $j }

        my %bases_no_gaps = %bases
          ;     # Ignores gaps (-), question mark (?) and (N) and ('') and (.)
        delete $bases_no_gaps{'-'};
        delete $bases_no_gaps{'?'};
        delete $bases_no_gaps{'N'};
        delete $bases_no_gaps{'.'};
        delete $bases_no_gaps{'~'};
        delete $bases_no_gaps{'+'};

        # delete $bases_no_gaps{''};

        if (scalar(keys %bases_no_gaps) > 1) {
            $consensus_no_gaps .= '*';
            my ($f, $p, $n) = $self->_stats(\%bases_no_gaps);
            push(@frequency, $f);
            push(@pic,       $p);
            push(@n,         $n);
        }

        elsif (scalar(keys %bases_no_gaps) == 1) {
            my ($j) = keys %bases_no_gaps;
            $consensus_no_gaps .= $j;
            push(@frequency, '');
            push(@pic,       '');
            push(@n,         '');
        }

        elsif (scalar(keys %bases_no_gaps) == 0) {
            $consensus_no_gaps .= ' ';
            push(@frequency, '');
            push(@pic,       '');
            push(@n,         '');
        }

    }

    # Calculate consensus_no_gaps_iupac
    my @consensus_no_gaps = split('', $consensus_no_gaps);

    my @consensus_no_gaps_iupac;
    foreach my $i (0 .. $#consensus_no_gaps) {
        my $base = $consensus_no_gaps[$i];
        if ($base eq '*') {
            $base = $self->_iupac($frequency[$i]);
        }
        push @consensus_no_gaps_iupac, $base;
    }

    #    $self->add_global( 'label' => 'consensus_with_gaps', 'data' => $consensus_with_gaps) or croak($@);
    $self->add_global('label' => 'consensus_no_gaps',
        'data' => join(',', @consensus_no_gaps))
      or croak($@);
    $self->add_global('label' => 'consensus_no_gaps_iupac',
        'data' => join(',', @consensus_no_gaps_iupac))
      or croak($@);

    $self->add_global('label' => 'frequencies',
        'data' => join(',', @frequency))
      or croak($@);
    $self->add_global('label' => 'pic_values', 'data' => join(',', @pic))
      or croak($@);
    $self->add_global('label' => 'N', 'data' => join(',', @n)) or croak($@);

    return 1;
}

sub _iupac {
    my ($self, $frequency) = @_;

    my @bases;

    while ($frequency =~ /(.)=/g) {
        push @bases, $1;
    }

    my $base_string = uc(join('', sort @bases));

    my %iupac_codes = qw(A     A
      C     C
      G     G
      T     T
      AG    R
      CT    Y
      GT    K
      AC    M
      CG    S
      AT    W
      CGT   B
      AGT   D
      ACT   H
      ACG   V
      ACGT  N
    );

    if (!$iupac_codes{$base_string}) {
        croak("Cannot resolve base string ($base_string)");
    }

    return $iupac_codes{$base_string};
}

sub _stats {
    my ($self, $ref_bases) = @_;
    my %bases = %{$ref_bases};

    my $total;
    foreach (keys %bases) { $total += $bases{$_} }

    # Frequencies (calculated and formatted)
    my @f;    # Holds frequencies to feed into the PIC calculation
    my $f = '[';
    foreach my $b (keys %bases) {
        my $frequency = sprintf("%.2f", $bases{$b} / $total);
        $f .= $b . '=' . $frequency . ':';
        push(@f, $frequency);
    }
    $f =~ s/:$//;
    $f .= ']';

    # PIC values
    my $p = $self->_pic(@f);

    return ($f, $p, $total);
}

sub _pic {
    my ($self, @af) = @_;

    my $c1 = 0;
    foreach my $i (@af) { $c1 += $i * $i; }

    my $c2 = 0;
    foreach my $i (0 .. $#af - 1) {
        my $c3 = 0;
        foreach my $j ($i + 1 .. $#af) {
            $c3 += 2 * $af[$i] * $af[$i] * $af[$j] * $af[$j];
        }
        $c2 += $c3;
    }
    my $pic = 1 - $c1 - $c2;

    return sprintf("%.2f", $pic);
}

sub longest_seq_len {
    my ($self) = @_;
    my @lengths;
    foreach my $sequence ($self->valid_sequences) {
        push @lengths, length($sequence->sequence);
    }
    my @sorted_lengths = sort { $b <=> $a } @lengths;
    my $longest_length = shift @sorted_lengths;
    return $longest_length ? $longest_length : 0;
}

sub dump_container {
    my ($self, $file, $start, $end) = @_;

    if (defined $start and defined $end and $start > $end) {
        croak(
            "Cannot dump container if start ($start) is greater than end ($end)"
        );
    }
    if (defined $start and $start < 1) { $start = 1 }
    if (defined $end and $end > $self->longest_seq_len) {
        $end = $self->longest_seq_len;
    }
    if (defined $start and !defined $end) { $end = $self->longest_seq_len }
    if (defined $end and !defined $start) { $start = 1 }

    if (defined $start) { $self->offset($self->offset + $start - 1); }

    open(OUT, ">$file") or croak($!);

    flock(OUT, 2) or croak($!);

    print OUT qq[<data>\n];

    foreach my $s ($self->sequences) {
        my $label = $s->label;

        # *** TEMPORARY ***
        $label =~ s/\&/&amp;/g;

        my $sequence = $s->sequence;
        if (defined $start) {
            $sequence = $self->_get_segment($sequence, $start, $end);
        }
        print OUT qq[<_sequence>\n];
        print OUT qq[\t<sequence label="$label">$sequence</sequence>\n];
        foreach my $t ($s->attributes) {
            my $attribute_value = $s->attribute_value($t);
            if (defined $start and $attribute_value =~ /,/) {
                $attribute_value =
                  $self->_get_segment($attribute_value, $start, $end);
            }
            print OUT
              qq[\t<attribute label="$t">$attribute_value</attribute>\n];
        }
        print OUT qq[</_sequence>\n];
    }

    foreach my $g ($self->globals) {
        my $global_value = $self->global_value($g);
        if (defined $start and $global_value =~ /,/) {
            $global_value = $self->_get_segment($global_value, $start, $end);
        }
        print OUT qq[<_global label="$g">$global_value</_global>\n];
    }

    my $offset = $self->offset;
    print OUT qq[<_offset>$offset</_offset>\n] if defined $offset;

    my $title = $self->title;
    print OUT qq[<_title>$title</_title>\n] if defined $title;

    my $info = $self->info;
    print OUT qq[<_info>$info</_info>\n] if defined $info;

    print OUT qq[</data>\n];

    close OUT;

    return 1;
}

sub _get_segment {
    my ($self, $sequence, $start, $end) = @_;

    my $segment;

    if ($sequence =~ /,/) {
        my @ary = split(',', $sequence);
        my @processed_segment =
            map { defined $_ ? $_ : '' } @ary[$start - 1 .. $end - 1];
        $segment = join(',', @processed_segment) if @ary;
    }
    else {
        my @ary = split('', $sequence);
        $segment = join('', @ary[$start - 1 .. $end - 1]) if @ary;
    }

    return $segment;
}

sub mask_low_quality_bases {
    my ($self, $qscore_threshold) = @_;

    # Mask all low-quality bases to '+'
    foreach my $sequence ($self->sequences) {
        unless ($sequence->exists_attribute('quality')) {
            next;    # Later add detailed reporting items
        }
        my @q = $sequence->attribute_value_ary('quality');
        my @s = $sequence->sequence_ary;

        foreach my $i (0 .. $#q) {
            if ($q[$i] and $q[$i] =~ /\d/ and $q[$i] < $qscore_threshold) {
                $q[$i] = '';
                $s[$i] = 'N';
            }        # Skip if there is no quality score
        }

        $sequence->sequence(join('', @s));
        $sequence->attribute_value('quality', join(',', @q));
    }

    return 1;
}

sub filter_by_selection {
    my ($self, $ref_selection_criteria) = @_;

    my %sc = %{$ref_selection_criteria};

    # Disable all that are not included in the selection criteria
    SEQ: foreach my $sequence ($self->sequences) {
        foreach my $sc (keys %sc) {
            if ($sequence->exists_attribute($sc)) {
                foreach my $value (@{$sc{$sc}}) {
                    if ($sequence->attribute_value($sc) eq $value) {
                        next SEQ;
                    }
                }
            }
        }
        $sequence->attribute_value('disable', 'selection');
    }

    return 1;
}

sub _get_identity_percentage {
    my ($self, $s1, $s2) = @_;

    my $seq1_valid_len = $s1->valid_len;
    my $seq2_valid_len = $s2->valid_len;

    my @seq1 = $s1->sequence_ary;
    my @seq2 = $s2->sequence_ary;

    my $matches = 0;
    foreach my $i (0 .. $#seq1) {
        if (    $seq1[$i]
            and $seq2[$i]
            and $seq1[$i] =~ /^[ATCGatcg]$/
            and $seq1[$i] eq $seq2[$i]) {
            $matches++;
        }
    }

    my $valid_len =
      $seq1_valid_len < $seq2_valid_len ? $seq1_valid_len : $seq2_valid_len;

    my $identity_percentage = 0;    # Skips zero (valid-)length sequences
    $identity_percentage = sprintf("%.2f", $matches / $valid_len * 100)
      if $valid_len;

    return $identity_percentage;
}

sub _get_identity_matrix {
    my ($self) = @_;

    my %matrix;

    my @sequences = $self->valid_sequences;

    foreach my $i (0 .. $#sequences - 1) {
        foreach my $j ($i + 1 .. $#sequences) {
            $matrix{$sequences[$i]->id}{$sequences[$j]->id} =
              $self->_get_identity_percentage($sequences[$i], $sequences[$j]);
        }
    }

    return \%matrix;
}

sub filter_alignment_cluster {
    my ($self, $identity_threshold) = @_;

    my %matrix = %{$self->_get_identity_matrix};

    my %analysis;

    foreach my $i (keys %matrix) {
        foreach my $j (keys %{$matrix{$i}}) {
            if ($matrix{$i}{$j} >= $identity_threshold) {
                $analysis{$i}++;
                $analysis{$j}++;
            }
        }
    }

    unless (%analysis) {
        croak(
            "A reference sequence that has similarity above the current identity percent
              threshold does not exist. Please try turn off filtering or decrease identity
              percent threshold"
        );
    }

    # Sort each sequence by the number of above-the-threshold bindings descending
    # For cases that there might be a tie implement sorting to ensure reproducibility
    # The following is used:
    # If there is a tie (same number of above-the-threshold bindings)
    # (i) Get the longest, then (ii) sequence-wise alphabetically smaller, then (iii) the label is alphabetically smaller
    # If all these are met, that means the sequences are identical, sequence-wise and identification-wise
    # After the sorting, grab the ref sequence

    my @sorted_sequences = sort {
             $analysis{$b} <=> $analysis{$a}
          || $self->{_sequences}{$b}{sequence}
          cmp $self->{_sequences}{$a}
          {sequence}    # Sq object bypassed for easier sorting
          || $self->{_sequences}{$b}{label}
          cmp $self->{_sequences}{$a}
          {label}       # Sq object bypassed for easier sorting
    } keys %analysis;

    my $ref_sequence_id = shift @sorted_sequences;
    my $ref_sequence =
      LookAlign::Alignment::Container::Sq->new('alignment' => $self,
        'id' => $ref_sequence_id)
      or croak($@);

    # Determine sequences that are outliers (ones that do not satisfy identity_threshold with the ref sequence)

    my @outlier_sequences;

    foreach my $i (keys %matrix) {
        foreach my $j (keys %{$matrix{$i}}) {
            next
              unless ($i eq $ref_sequence_id or $j eq $ref_sequence_id)
              ; # Skip if neither is the ref sequence id (!!! sequence_id, not sequence !!!)
            if ($matrix{$i}{$j} < $identity_threshold) {
                if ($i eq $ref_sequence_id) {
                    push @outlier_sequences,
                      LookAlign::Alignment::Container::Sq->new('alignment' => $self,
                        'id' => $j)
                      or croak($@);
                }
                elsif ($j eq $ref_sequence_id) {
                    push @outlier_sequences,
                      LookAlign::Alignment::Container::Sq->new('alignment' => $self,
                        'id' => $i)
                      or croak($@);
                }
            }
        }
    }

    # Change the object appropriately for the calculated sequences
    # (i) Mark ref sequence (add an attribute name: ref_sequence value: 1)
    # (ii) Disable outlier sequences (add an attribute name: disable value: filter)

    $ref_sequence->attribute_value('ref_sequence', 1);
    foreach my $s (@outlier_sequences) {
        $s->attribute_value('disable', 'filter');
    }

    # If there were disabled changes, this will cause some of the present globals to change, re-calculate them (if they exist)
    # Adding a ref_seq attribute does not make any changes

    if (@outlier_sequences) {
        $self->calculate_consensus
          if $self->exists_global('consensus_no_gaps');
    }

    return 1;
}

sub remove_common_gaps {
    my ($self) = @_;

    # get maximum length
    my $max_len = $self->longest_seq_len;

    # Temp sequence ary hash
    my %temp_sequence_ary;
    foreach my $s ($self->valid_sequences) {
        my $id = $s->id;
        @{$temp_sequence_ary{$id}} = $s->sequence_ary;
    }

    my %gap_positions;

    # Identify all common gap positions
    for (my $i = 0 ; $i < $max_len ; $i++) {
        my %bases;
        foreach my $id ($self->valid_sequence_ids) {
            my $b =
              $temp_sequence_ary{$id}[$i]
              ; # This bypasses the regular object methods but is used for speed
            $bases{$b}++ if $b;
        }

        # If this is a common gap, the following will be met
        if (scalar(keys %bases) == 1 and $bases{'-'}) {
            $gap_positions{$i} = 1;
        }
    }

    # Replace sequences and sequence attributes that are position-dependent (contain a comma)
    # IF there are any common gaps
    if (keys %gap_positions) {
        foreach my $s ($self->valid_sequences) {
            my @sequence = $s->sequence_ary;
            my @degapped_sequence =
              $self->_degap_ary(\@sequence, \%gap_positions);
            $s->sequence(join('', @degapped_sequence));

            foreach my $a ($s->attributes) {
                next if $s->attribute_value($a) !~ /,/;
                my @attribute = $s->attribute_value_ary($a);
                my @degapped_attribute =
                  $self->_degap_ary(\@attribute, \%gap_positions);
                $s->attribute_value($a, join(',', @degapped_attribute));
            }
        }

        foreach my $g ($self->globals) {
            next if $self->global_value($g) !~ /,/;
            my @global = $self->global_value_ary($g);
            my @degapped_global =
              $self->_degap_ary(\@global, \%gap_positions);
            $self->global_value($a, join(',', @degapped_global));
        }
    }

    return 1;
}

sub _degap_ary {
    my ($self, $ref_ary, $ref_idx) = @_;

    my @ary = @{$ref_ary};
    my @degapped_ary;

    foreach my $i (0 .. $#ary) {
        next if $ref_idx->{$i};
        push @degapped_ary, $ary[$i];
    }

    return @degapped_ary;
}

=head1 AUTHOR

Payan Canaran <canaran@cshl.edu>

=head1 BUGS

=head1 VERSION

Version 0.01

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright (c) 2004-2007 Cold Spring Harbor Laboratory

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.

=cut

1;
