#!/usr/bin/perl -w
use strict;
use English;
#
# Arrays in which the records of the two input files are stored
#
my (@File1, @File2);
#
# SortFileToArray ($file, $ref2array) - Store file in an array and sort array
# $file      = Input file name
# $ref2array = Reference to array in which file is stored
#
# Input records are in the form "f1 f2 f3+f4.f5" where fn is field n.
# Records are stored in $SUBSEP delimited form (fields are delimited by $SUBSEP)
# Blank lines are ignored.  Lines that do not have the expected form are
# reported on STDERR and skipped.
# Dies if the file cannot be opened.
# The number of input records is displayed.
#
sub SortFileToArray ($$) {
    my ($file, $ref2array) = @_;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and the handle is private to this sub.
    open(my $in, '<', $file) or die "Unable to open file $file\n$!";
    while (my $line = <$in>) {
        chomp $line;
        next if $line =~ /^\s*$/;
        my @fields = $line =~ /^([^ ]+)\s+([^ ]+)\s+([^+]+)\+([^.]+)\.([^ ]+)$/;
        unless (@fields) {
            # A failed match yields no fields; joining them would store an
            # empty string, which is not a record.
            warn "File: $file - line $INPUT_LINE_NUMBER ignored ( unexpected format ): $line\n";
            next;
        }
        push @$ref2array, join($SUBSEP, @fields);
    }
    close($in);
    # Plain string sort of the joined records.
    @$ref2array = sort @$ref2array;
    print "File: $file - Number of lines ( records ) ".(@$ref2array+0)."\n" ;
}
#
# OutRecord ($record) - Format a stored record for output
# $record = Record whose fields are delimited by $SUBSEP
#
# Returns the record as the line "f1 f2 f3+f4.f5\n" where fn is field n.
# Only the first five fields are used.
#
sub OutRecord($) {
    my ($record) = @_;
    my @parts = split($SUBSEP, $record);
    return sprintf("%s %s %s+%s.%s\n", @parts[0 .. 4]);
}
#
# IdenticalMatches ($file) - Creates file with identical records
# $file = Resulting file.
#
# Identical records from @File1 and @File2 are removed (replaced by empty
# strings in both arrays) and written to $file in display form.
# Empty entries are never treated as records.
# Dies if $file cannot be created or written.
# The identical record count is displayed.
#
sub IdenticalMatches ($) {
    my ($file) = @_;
    my ($count, $indx1, $indx2) = (0, 0, 0);
    # Three-argument open with a lexical handle: $file is only ever a name.
    open(my $out, '>', $file) or die "Unable to create file $file\n$!";
    # Merge-style walk; this relies on both arrays being in string sort
    # order (SortFileToArray leaves them that way).
    while ($indx1 <= $#File1 and $indx2 <= $#File2) {
        my ($val1, $val2) = ($File1[$indx1], $File2[$indx2]);
        # An empty entry is the "no record" marker used throughout this
        # script, so it must not be matched and written out.
        if ($val1 eq '') {
            $indx1++;
            next;
        }
        if ($val2 eq '') {
            $indx2++;
            next;
        }
        if ($val1 eq $val2) {
            $count++;
            # Blank the matched pair so later passes ignore it.
            $File1[$indx1] = "";
            $File2[$indx2] = "";
            print {$out} OutRecord($val1);
            $indx1++;
            $indx2++;
        } elsif ($val1 lt $val2) {
            $indx1++;
        } else {
            $indx2++;
        }
    }
    # Buffered write errors only surface at close time.
    close($out) or die "Unable to write file $file\n$!";
    print "\nIdentical matches ( i.e lines in both files are same ) $count\n";
}
#
# BuildCloseArray ($ref2full, $ref2close, $ignored) - Build array of records
#                                                     for testing close matches
# $ref2full  = Reference to array containing records
# $ref2close = Reference to array to fill
# $ignored   = Number of the field to ignore (first field is 1)
#
# @$ref2close is replaced by the records of @$ref2full with field $ignored
# not copied, sorted as strings.  Empty elements of @$ref2full are skipped.
#
sub BuildCloseArray($$$) {
    my ($ref2full, $ref2close, $ignored) = @_;
    # Field numbers start at 1, array indexes at 0.
    my $skip = $ignored - 1;
    # Build into a private array so the result is only stored once the
    # input has been read completely.
    my @close;
    foreach my $elemf (@$ref2full) {
        next if $elemf eq '';
        my @fields = split($SUBSEP, $elemf);
        my @kept = map { $fields[$_] } grep { $_ != $skip } 0 .. $#fields;
        # join() always puts a separator between fields, so an empty leading
        # field stays distinguishable from a missing one.
        push(@close, join($SUBSEP, @kept));
    }
    @$ref2close = sort @close;
}
#
# SearchCloseMatches ($ignored) - Search for close matching records (ignore
#                                 one field).
# $ignored = Number of the field to ignore when comparing records
#
# The records of the arrays @File1 and @File2 are compared for matching,
# the field $ignored not considered.
#
# Returns the close matching count.
#
sub SearchCloseMatches($) {
    my ($ignored) = @_;
    my (@close1, @close2);
    BuildCloseArray(\@File1, \@close1, $ignored);
    BuildCloseArray(\@File2, \@close2, $ignored);
    # Nothing can match when either side has no record left.  The element
    # count is tested because $#array is 0 for a ONE element array (and -1
    # for an empty one), so "$#array == 0" would discard single records.
    return 0 unless @close1 and @close2;
    my ($count, $indx1, $indx2) = (0, 0, 0);
    # Both arrays come back sorted from BuildCloseArray: walk them in
    # parallel, advancing whichever side holds the smaller key.
    while ($indx1 <= $#close1 and $indx2 <= $#close2) {
        my $order = $close1[$indx1] cmp $close2[$indx2];
        if ($order == 0) {
            $count++;
            $indx1++;
            $indx2++;
        } elsif ($order < 0) {
            $indx1++;
        } else {
            $indx2++;
        }
    }
    return $count;
}
#
# NoMatch ($file, $ref2array, $matches) - Display the non matching record
#                                         count for a file
# $file      = File name (only used for display)
# $ref2array = Reference to array holding the records of $file
# $matches   = Close matching count
#
# Every non empty element of @$ref2array counts as a non matching record;
# the close matching count is subtracted from that total.
# The resulting number is displayed.
#
sub NoMatch($$$) {
    my ($file, $ref2array, $matches) = @_;
    # grep in scalar context yields the number of non empty elements.
    my $remaining = grep { $_ ne '' } @$ref2array;
    my $nomatch = $remaining - $matches;
    print "Lines in file $file with no match $nomatch\n";
}
#
# CloseMatches ($file1, $file2) - Count close and non matching records
# $file1 = Input file 1 name
# $file2 = Input file 2 name
#
# The close matching count is the sum, over each of the five fields, of the
# records that match when that field is ignored.
# Close and non matching counts are displayed.
#
sub CloseMatches($$) {
    my ($file1, $file2) = @_;
    my $count = 0;
    # One pass per field, each pass ignoring that field.
    foreach my $field (1 .. 5) {
        $count += SearchCloseMatches($field);
    }
    print "\nClose Matches ( 4 of the 5 fields match ) $count\n\n";
    NoMatch($file1, \@File1, $count);
    NoMatch($file2, \@File2, $count);
}
#
# Main . . .
#
# Usage: script [input1 [input2 [identical-output]]]
# Arguments left out default to file1.txt, file2.txt and file12.txt, so
# running the script without arguments behaves as it always did.
#
{
    my $input1    = @ARGV ? shift(@ARGV) : 'file1.txt';
    my $input2    = @ARGV ? shift(@ARGV) : 'file2.txt';
    my $identical = @ARGV ? shift(@ARGV) : 'file12.txt';
    SortFileToArray($input1, \@File1);
    SortFileToArray($input2, \@File2);
    IdenticalMatches($identical);
    CloseMatches($input1, $input2);
}