#!C:\perl\bin\perl
#---------------------------------------------------------------------
# Top Dragon FixWord
# (c) 2001 Top Dragon Software
# tracy@bydisn.com or www.bydisn.com/software
#---------------------------------------------------------------------
# Unbuffer STDOUT so each "Processing ..." line appears as soon as it is printed.
$| = 1;

# File names to process. Stored as hash keys so a name given twice on the
# command line is only processed once.
%Files = ();

# Set to 1 by the -U command-line switch; when true, FixFile also collapses
# spaces and linefeeds in the document.
$Unformat = 0;

# Maps a character code (as returned by ord) found inside a Word
# "mso-char-type:symbol" span to the text that replaces it.
# The replacements are HTML entities rather than raw characters so that the
# output does not depend on the encoding this script file was saved in, nor
# on the charset of the document being fixed.
%Symbols = (
    210 => '&reg;',
    211 => '&copy;',
    212 => '&trade;',
    226 => '&reg;',
    227 => '&copy;',
    228 => '&trade;',
);
# Collect the files to process from the command line.
#   -U                 sets $Unformat and is not treated as a file name
#   argument with '*'  expanded with glob
#   anything else      taken as a literal file name
# Names ending in .bak (backup files) are skipped in BOTH cases.  The
# original code tested an out-of-scope $file for literal arguments, so a
# .bak file named explicitly on the command line was never skipped.
# A switch other than -U is not recognized and falls through to be handled
# like any other file name.
foreach my $arg (@ARGV) {
    if ( $arg =~ /\A-(.+)/ ) {          # if it's a switch
        if ( $1 eq "U" ) {              # Unformat switch
            $Unformat = 1;
            next;
        }
    }

    # Expand filespecs; a plain name stands for itself.
    my @names = ( $arg =~ /\*/ ) ? glob($arg) : ($arg);

    foreach my $name (@names) {
        next if ( $name =~ /\.bak\Z/ ); # ignore .bak (backup) files
        $Files{$name} = 1;              # hash key prevents duplicate filenames
    }
}
@ARGV = (); # Remove all arguments (so perl won't see them)
# Main loop: handle every collected file name in sorted order.
# For each file: verify it exists, discard any stale backup, move the
# original aside as <name>.bak, then let FixFile rebuild <name> from it.
# Any failure aborts the whole run via die.
for my $file (sort keys %Files) {
    my $backup = "$file.bak";

    die "$0 Error\nCould not find file $file\nError: $!"
        if !-e $file;

    print "Processing $file\n";

    # An old backup must go first so the rename below can take its name.
    if ( -e $backup ) {
        unlink($backup)
            or die "$0 Error\nCould not delete old backup file $file.bak\nError: $!";
    }

    # The untouched original becomes the backup ...
    rename($file, $backup)
        or die "$0 Error\nCould not rename file $file to $file.bak\nError: $!";

    # ... and the cleaned-up version is written under the original name.
    FixFile($file);
}
exit 0; # all files processed
#---------------------------------------------------------------------
# FixFile($file)
#
# Reads "$file.bak" in one gulp, strips Word-specific HTML clutter from it
# and writes the result to "$file" (overwriting/creating it).
#
# Parameters:
#   $file - name of the output file; the input is that name plus ".bak".
# Globals read:
#   $Unformat - when true, spaces and linefeeds are also normalized.
# Returns 1 on success; dies if either file cannot be opened or the output
# cannot be written.
sub FixFile {
    my ($file) = @_;

    # Slurp mode for this call only; "local" restores the caller's input
    # record separator on return instead of leaving it undefined globally.
    local $/;

    # open the input (.bak) file
    open(my $in, '<', "$file.bak") or
        die "$0 Error\nCould not open INFILE $file.bak\nError: $!";
    # open the new output file (original file name)
    open(my $out, '>', $file) or
        die "$0 Error\nCould not open OUTFILE $file\nError: $!";

    my $doc = <$in>;                    # slurp in entire input file
    $doc = '' unless defined $doc;      # nothing read: treat as empty document

    # Fix up the html
    $doc =~ s{<html[^>]*>}{<html>}gi;   # fix <html> tag

    # The title test uses the same /s and /i flags as the substitution it
    # guards; otherwise an upper-case or multi-line title would be thrown
    # away together with the rest of the head section.
    if ( $doc =~ m{<title>.*?</title>}si ) {
        # remove everything in <head> section but <title>
        $doc =~ s{<head>.*(<title>.*</title>).*</head>}{<head>$1</head>}sgi;
    } else {
        # no title: remove everything in <head> section
        $doc =~ s{<head>.*</head>}{<head></head>}sgi;
    }

    $doc =~ s{<body[^>]*>}{<body>}gi;           # fix <body> tag
    $doc =~ s{<div[^>]*>|</div>}{}gi;           # remove <div> and </div> tags
    # \b keeps <pre>, <param> ... from being mistaken for <p>
    $doc =~ s{<p\b[^>]*>}{<p>}gi;               # fix <p> tags
    $doc =~ s{<span[^>]*?mso-spacerun[^>]*?>}{}gi;  # remove <span ... mso-spacerun...> tags
    # symbol spans: replace the span with its translated content
    $doc =~ s{<span[^>]*?mso-char-type:symbol[^>]*?>(.*?)</span>}{GetSymbol($1)}sgie;
    $doc =~ s{<span[^>]*>|</span>|</p>}{}gi;    # remove <span>, </span>, </p> tags
    # \b keeps <link> from being mistaken for <li>
    $doc =~ s{<(ol|ul|li)\b[^>]*>}{<$1>}gi;     # strip attributes from <ol>, <ul>, <li>
    $doc =~ s{\Q<![if\E.*?\Q<![endif]>\E}{}sgi; # remove if statements
    $doc =~ s{<o:p>|</o:p>}{}gi;                # remove <o:p> and </o:p> tags
    $doc =~ s{<!--.*?-->}{}sg;                  # remove comments

    if ( $Unformat ) {                  # if unformat switch (-U) specified
        # Fix up formatting (spaces and linefeeds)
        $doc =~ s|&nbsp;| |gi;          # convert non-breaking space entities to plain spaces
        $doc =~ s|\xa0| |g;             # convert 0xA0 (non-breaking space) characters to spaces
        $doc =~ s|\n{2,}|\x02|g;        # park runs of linefeeds as a single hex 02
        $doc =~ s|\n| |g;               # convert remaining single linefeeds to spaces
        $doc =~ s| +| |g;               # convert multiple spaces to single spaces
        $doc =~ s|\x02|\n|g;            # convert hex 02 back to linefeed
    }

    print {$out} $doc or
        die "$0 Error\nCould not write OUTFILE $file\nError: $!";
    close($in);
    # buffered write errors only surface when the handle is closed
    close($out) or
        die "$0 Error\nCould not close OUTFILE $file\nError: $!";
    return 1;
} # FixFile
#---------------------------------------------------------------------
# GetSymbol($chars)
#
# Translates the text of a symbol span one character at a time: a character
# whose code (ord) is a key of %Symbols is replaced by the mapped text, any
# other character is copied through unchanged.  The split pattern consumes
# spaces as separators, so space characters never reach the result.
# Returns the translated string ("" for empty input).
sub GetSymbol {
    my ($chars) = @_;
    return join '', map {
        my $code = ord($_);
        exists($Symbols{$code}) ? $Symbols{$code} : $_;
    } split(/ */, $chars);
} # GetSymbol
#---------------------------------------------------------------------
1;