#!/usr/bin/perl use strict; my %stop_words = (); open(STOP, "<../stop_words.txt"); while () { chomp; $stop_words{$_} = 1; } close STOP; my %pronouns = (); open(PRO, "<../pronouns.txt"); while () { chomp; $pronouns{$_} = 1; } close PRO; my @sids = (); my @sentences = (); my @vis = (); my @mps = (); my @vsids = (); my @candidates = (); my %wc = (); my %verbs = (); my %sc = (); # syntactic components my ($sentence, $last_snum, $sid); my $tokens = 0; my @files = <*>; my $threshold = 1000; # from testing different values foreach my $f (@files) { next if ($f =~ m/\.pl$/ || $f =~ m/57/); open(NYT, "<$f"); $sentence = ""; while () { my @fields = split(/\s+/); my $snum = $fields[0]; next if (!($snum =~ m/\d+/)); my $word = $fields[2]; next if ($word eq "COMMA" || $word eq "'s" || $word eq "_" || $word eq "." || $word eq "'t"); my $pos = $fields[3]; my $minipar = $fields[$#fields]; $wc{lc $word}++; $tokens++; if ($snum != $last_snum) { if ($last_snum > 0) { push @sids, $sid; push @sentences, $sentence; $sentence = ""; } } $sentence .= $word . " "; $last_snum = $snum; $sid = "$f" . ":" . "$snum"; if ($pos =~ m/^VB/) { push @vis, lc $word; push @mps, $minipar; push @vsids, $sid; $verbs{lc $word}++; my @scs = split(/,/, $minipar); foreach my $mpsc (@scs) { $sc{$mpsc}++; } } } if (length($sentence) > 0) { push @sids, $sid; push @sentences, $sentence; } close NYT; } # same subject, object for (my $y=0; $y<$#mps; $y++) { my $mp1 = $mps[$y]; my $s1 = extract_subject($mp1); my $o1 = extract_obj($mp1); next if (!defined $s1 || !defined $o1 || $s1 eq "~"); next if ($pronouns{$s1} || $pronouns{$o1}); for (my $z=$y+1; $z<$#mps; $z++) { next if ($vis[$y] eq $vis[$z]); my $mp2 = $mps[$z]; my $s2 = extract_subject($mp2); my $o2 = extract_obj($mp2); next if (!defined $s2 || !defined $o2 || $s2 eq "~"); next if ($pronouns{$s2} || $pronouns{$o2}); if ((lc $s1 eq lc $s2) && (lc $o1 eq lc $o2)) { push @candidates, "$y,$z"; } } } # tfidf over all sentences # pre-compute all tf-idf vectors my @tis = (); for (my $i=0; $i<$#sentences; $i++) { my $sen = lc $sentences[$i]; my $tfidf = tfidf_sen($sen); push @tis, $tfidf; } # loop twice over tfidf vectors for (my $i=0; $i<$#tis; $i++) { my $tfidf1 = $tis[$i]; for (my $j=$i+1; $j<$#tis; $j++) { my $tfidf2 = $tis[$j]; my $dot = 0; foreach my $w (keys %$tfidf1) { $dot += $tfidf1->{$w} * $tfidf2->{$w}; } if ($dot > $threshold && !($sentences[$i] eq $sentences[$j]) ) { my $sid1 = $sids[$i]; my $sid2 = $sids[$j]; my @vi1 = (); my @vi2 = (); for (my $x=0; $x<$#vsids; $x++) { if ($vsids[$x] eq $sid1) { push @vi1, $x; } if ($vsids[$x] eq $sid2) { push @vi2, $x; } } for (my $m=0; $m<$#vi1; $m++) { for (my $n=0; $n<$#vi2; $n++) { next if ($vis[$vi1[$m]] eq $vis[$vi2[$n]]); push @candidates, $vi1[$m] . "," . $vi2[$n]; #stored as string of indices...hack } } } } } # score the candidates my %scores = (); my %vp2indices = (); my $n_verbs = keys(%verbs); my $n_sc = keys(%sc); foreach my $c (@candidates) { if ($c =~ m/(\d+),(\d+)/) { my $i1 = $1; my $i2 = $2; my $v1 = $vis[$i1]; my $v2 = $vis[$i2]; my $key = "$v1,$v2"; my $k2 = "$v2,$v1"; if (defined $scores{$k2}) { $key = $k2; $vp2indices{$key} = "$i2,$i1"; } else { $vp2indices{$key} = "$i1,$i2"; } if (!defined $scores{$key}) { $scores{$key} = 1; } $scores{$key} *= (($verbs{$v1}+0.00001)/$n_verbs)*(($verbs{$v2}+0.00001)/$n_verbs); my @mpfs1 = split(/,/, $mps[$i1]); my @mpfs2 = split(/,/, $mps[$i2]); foreach my $sc1 (@mpfs1) { foreach my $sc2 (@mpfs2) { if ($sc1 eq $sc2) { my $p = ($sc{$sc1}+0.00001)/$n_sc; $scores{$key} *= $p*$p; } } } } else{ warn "this should not be happening!"; next; } } # end of section 3.3 foreach my $k (keys %scores) { if ($scores{$k} == 0) {$scores{$k}=0.00000001; } $scores{$k} = -1*log($scores{$k}); } foreach my $k (sort scoresSortDesc (keys %scores)) { my $vp = $k; $vp =~ s/,/ /g; print "$vp $scores{$k} "; my $idxs = $vp2indices{$k}; if ($idxs =~ m/(\d+),(\d+)/) { my $i1 = $1; my $i2 = $2; my $sid1 = $vsids[$i1]; my $sid2 = $vsids[$i2]; print "$sid1 $sid2\n"; } else{ warn "this sould not be happening!"; next; } } sub scoresSortDesc { $scores{$b} <=> $scores{$a}; } sub tfidf_sen { my $sen = shift; my $length = $sen =~ s/((^|\s)\S)/$1/g; my $ret; if ($length > 42) { return $ret; } my @words = split(/ /, $sen); foreach my $w (@words) { if (!$stop_words{$w}) { $ret->{$w}++; } } foreach my $w (keys %$ret) { $ret->{$w} *= log($tokens/$wc{$w}); } return $ret; } sub extract_subject { my $mp = shift; my $ret; if ($mp =~ m/subj:(.*?),/) { $ret = $1; } return $ret; } sub extract_obj { my $mp = shift; my $ret; if ($mp =~ m/obj:(.*?),/) { $ret = $1; } return $ret; }