#!/usr/bin/perl
use strict;

my %stop_words = ();
open(STOP, "<../stop_words.txt");
while (<STOP>) {
    chomp;
    $stop_words{$_} = 1;
}
close STOP;

my %pronouns = ();
open(PRO, "<../pronouns.txt");
while (<PRO>) {
    chomp;
    $pronouns{$_} = 1;
}
close PRO;

my @sids = ();
my @sentences = ();
my @vis = ();
my @mps = ();
my @vsids = ();
my @candidates = ();
my %wc = ();
my %verbs = ();
my %sc = (); # syntactic components

my ($sentence, $last_snum, $sid);
my $tokens = 0;
my @files = <*>;
my $threshold = 1000; # from testing different values
foreach my $f (@files) {
  next if ($f =~ m/\.pl$/ || $f =~ m/57/);
  open(NYT, "<$f");
  $sentence = "";
  while (<NYT>) {
    my @fields = split(/\s+/);
    my $snum = $fields[0];
    next if (!($snum =~ m/\d+/));
    my $word = $fields[2];
    next if ($word eq "COMMA" || $word eq "'s" || $word eq "_" || $word eq "." || $word eq "'t");
    my $pos = $fields[3];
    my $minipar = $fields[$#fields];

    $wc{lc $word}++;
    $tokens++;
    
    if ($snum != $last_snum) {
      if ($last_snum > 0) {
        push @sids, $sid;
        push @sentences, $sentence;
        $sentence = "";
      } 
    }
    $sentence .= $word . " ";
    $last_snum = $snum;
    $sid = "$f" . ":" . "$snum";

    if ($pos =~ m/^VB/) {
      push @vis, lc $word;
      push @mps, $minipar;
      push @vsids, $sid;
      $verbs{lc $word}++;
      my @scs = split(/,/, $minipar);
      foreach my $mpsc (@scs) {
        $sc{$mpsc}++;
      }
    }
  }
  if (length($sentence) > 0) { push @sids, $sid; push @sentences, $sentence; }
  close NYT;
}

# same subject, object
for (my $y=0; $y<$#mps; $y++) {
  my $mp1 = $mps[$y];
  my $s1 = extract_subject($mp1);
  my $o1 = extract_obj($mp1);
  next if (!defined $s1 || !defined $o1 || $s1 eq "~");
  next if ($pronouns{$s1} || $pronouns{$o1});
  for (my $z=$y+1; $z<$#mps; $z++) {
    next if ($vis[$y] eq $vis[$z]);
    my $mp2 = $mps[$z];
    my $s2 = extract_subject($mp2);
    my $o2 = extract_obj($mp2);
    next if (!defined $s2 || !defined $o2 || $s2 eq "~");
    next if ($pronouns{$s2} || $pronouns{$o2});
    if ((lc $s1 eq lc $s2) && (lc $o1 eq lc $o2)) {
      push @candidates, "$y,$z";
    }
  }
}

# tfidf over all sentences
# pre-compute all tf-idf vectors
my @tis = ();
for (my $i=0; $i<$#sentences; $i++) {
  my $sen = lc $sentences[$i];
  my $tfidf = tfidf_sen($sen);
  push @tis, $tfidf;
}

# loop twice over tfidf vectors
for (my $i=0; $i<$#tis; $i++) {
  my $tfidf1 = $tis[$i];
  for (my $j=$i+1; $j<$#tis; $j++) {
    my $tfidf2 = $tis[$j];
    my $dot = 0;
    foreach my $w (keys %$tfidf1) {
      $dot += $tfidf1->{$w} * $tfidf2->{$w};
    }
    if ($dot > $threshold && !($sentences[$i] eq $sentences[$j]) ) {
      my $sid1 = $sids[$i];
      my $sid2 = $sids[$j];
      my @vi1 = ();
      my @vi2 = ();
      for (my $x=0; $x<$#vsids; $x++) {
        if ($vsids[$x] eq $sid1) { push @vi1, $x; }
        if ($vsids[$x] eq $sid2) { push @vi2, $x; }
      }
      for (my $m=0; $m<$#vi1; $m++) {
        for (my $n=0; $n<$#vi2; $n++) {
          next if ($vis[$vi1[$m]] eq $vis[$vi2[$n]]);
          push @candidates, $vi1[$m] . "," . $vi2[$n]; #stored as string of indices...hack
        }
      }
    }
  }
}

# score the candidates
my %scores = ();
my %vp2indices = ();
my $n_verbs = keys(%verbs);
my $n_sc = keys(%sc);
foreach my $c (@candidates) {
  if ($c =~ m/(\d+),(\d+)/) {
    my $i1 = $1;
    my $i2 = $2;
    my $v1 = $vis[$i1];
    my $v2 = $vis[$i2];
    my $key = "$v1,$v2";
    my $k2 = "$v2,$v1";
    if (defined $scores{$k2}) { 
      $key = $k2; 
      $vp2indices{$key} = "$i2,$i1"; 
    } else {
      $vp2indices{$key} = "$i1,$i2"; 
    }
    if (!defined $scores{$key}) { $scores{$key} = 1; } 
    $scores{$key} *= (($verbs{$v1}+0.00001)/$n_verbs)*(($verbs{$v2}+0.00001)/$n_verbs);
    my @mpfs1 = split(/,/, $mps[$i1]);
    my @mpfs2 = split(/,/, $mps[$i2]);
    foreach my $sc1 (@mpfs1) {
      foreach my $sc2 (@mpfs2) {
        if ($sc1 eq $sc2) {
          my $p = ($sc{$sc1}+0.00001)/$n_sc;
          $scores{$key} *= $p*$p;
        }
      }
    }
  } else{
    warn "this should not be happening!";
    next;
  }
}

# end of section 3.3
foreach my $k (keys %scores) { if ($scores{$k} == 0) {$scores{$k}=0.00000001; } $scores{$k} = -1*log($scores{$k}); }

foreach my $k (sort scoresSortDesc (keys %scores)) { 
  my $vp = $k;
  $vp =~ s/,/ /g;
  print "$vp $scores{$k} "; 
  my $idxs = $vp2indices{$k};
  if ($idxs =~ m/(\d+),(\d+)/) {
    my $i1 = $1;
    my $i2 = $2;
    my $sid1 = $vsids[$i1];
    my $sid2 = $vsids[$i2];
    print "$sid1 $sid2\n";
  } else{
    warn "this sould not be happening!";
    next;
  }
}

sub scoresSortDesc { $scores{$b} <=> $scores{$a}; }

sub tfidf_sen {
  my $sen = shift;
  my $length = $sen =~ s/((^|\s)\S)/$1/g;
  my $ret;
  if ($length > 42) { return $ret; }
  my @words = split(/ /, $sen);
  foreach my $w (@words) {
    if (!$stop_words{$w}) {
      $ret->{$w}++;
    }
  }
  foreach my $w (keys %$ret) {
    $ret->{$w} *= log($tokens/$wc{$w});
  }
  return $ret;
}

sub extract_subject {
  my $mp = shift;
  my $ret;
  if ($mp =~ m/subj:(.*?),/) {
    $ret = $1;
  } 
  return $ret;
}

sub extract_obj {
  my $mp = shift;
  my $ret;
  if ($mp =~ m/obj:(.*?),/) {
    $ret = $1;
  } 
  return $ret;
}

