#!/usr/bin/perl -w ########################################################################### # # Corpus Analysis and Programming, Tips to Final Exam. # Find collocation candidates with a given "collocational window". # # [Reference] Foundations of Statistical Natural Language Processing, # Chapter 5: Collocations. # # Programmed by William Yeh, 2002/12/12. # ########################################################################### # # global data structures # our %table; # key: collocation candidate (w1,w2) # value: count of the (w1,w2) collocation main(); sub main { my $window_size = 3; # size of the collocational window my $line = "Stocks crash as rescue plan teeters."; count_collocation($line, $window_size); while ( my ($coll, $count) = each %table) { print $coll, "\t", $count, "\n"; } } # main() sub count_collocation { my ($line, $window_size) = @_; my @words = split( /[\s\.,!:?\-"]+/, $line); my $max_index = scalar(@words) - 1; # maximal valid index in the @words for (my $i = 0; $i < $max_index; ++$i) { # $i: index of w1 for (my $k = 1; $k <= $window_size; ++$k) { # $k: offset of w2-w1 last if ($i + $k) > $max_index ; add_collocation($words[$i], $words[$i + $k]); } } } # count_collocation() sub add_collocation { my ($w1, $w2) = @_; return if length($w1) < 1; return if length($w2) < 1; # print "[", $w1, ",", $w2, "]\n"; my $key = $w1 . "\t" . $w2; if ( exists $table{ $key } ) { $table{ $key } ++; } else { $table{ $key } = 1; } } # add_collocation