The script is partially working. It should read in a list of words in the form word_word (or word_word_word etc. eventually), splits on the underscore and if a stopword is found for either position then it won't be in the new list created. This works for stopwords in the first position but phrases with stopwords in the second position are still showing up on the new list. It seems that the logical OR is not working and I don't understand why it is failing.
Help appreciated
Code:
#!/usr/bin/perl
use strict;
my (@stopW, $freq, $word, $tuple, $word, $sw2, $c, $d);
######################################### Reading Stopwords ###
open(STOPfile,"<./stopwords.txt");
while (<STOPfile>)
{
chop;
push(@stopW, $_);
}
close(STOPfile);
###################################################################
my $inputDIR="./";
my ($freq,$word);
for ($tuple=1; $tuple<=2; $tuple++)
{
open (INfile,"<".$inputDIR."file".$tuple.".txt");
open (OUTfile,">cleanup".$tuple.".txt");
while(<INfile>){
chop;
($freq,$word)=split(/:/,$_);
# leaving single words alone
if ($tuple==1)
{ foreach (@stopW) {
if (length($word)<=1 ) { next; }
if ($word eq $_) {
$word=$word."*"; last; }
}
}
# $tuple=2, eliminate phrases with a stopword
# or if any of the words is <=1 (0,1)
if ($tuple==2)
{ ($a,$b) = split("_",$word);
if ( (length($a)<=1) || (length($b)<=1) ) { next; }
foreach (@stopW)
{ if ( ($a eq $_) || ($b eq $_))
{ $sw2=1; last; }
}
}
if ($sw2==1) { $sw2=0; next; }
#print "$word\n";
#print $freq.":".$word."\n";
printf OUTfile "%s:%s\n", $freq,$word ;
} # end while
close INfile;
close OUTfile;
} # end for tuple
exit(0);
Help appreciated
Code:
#!/usr/bin/perl
use strict;
my (@stopW, $freq, $word, $tuple, $word, $sw2, $c, $d);
######################################### Reading Stopwords ###
open(STOPfile,"<./stopwords.txt");
while (<STOPfile>)
{
chop;
push(@stopW, $_);
}
close(STOPfile);
###################################################################
my $inputDIR="./";
my ($freq,$word);
for ($tuple=1; $tuple<=2; $tuple++)
{
open (INfile,"<".$inputDIR."file".$tuple.".txt");
open (OUTfile,">cleanup".$tuple.".txt");
while(<INfile>){
chop;
($freq,$word)=split(/:/,$_);
# leaving single words alone
if ($tuple==1)
{ foreach (@stopW) {
if (length($word)<=1 ) { next; }
if ($word eq $_) {
$word=$word."*"; last; }
}
}
# $tuple=2, eliminate phrases with a stopword
# or if any of the words is <=1 (0,1)
if ($tuple==2)
{ ($a,$b) = split("_",$word);
if ( (length($a)<=1) || (length($b)<=1) ) { next; }
foreach (@stopW)
{ if ( ($a eq $_) || ($b eq $_))
{ $sw2=1; last; }
}
}
if ($sw2==1) { $sw2=0; next; }
#print "$word\n";
#print $freq.":".$word."\n";
printf OUTfile "%s:%s\n", $freq,$word ;
} # end while
close INfile;
close OUTfile;
} # end for tuple
exit(0);