#!/usr/bin/perl

# stripHtml
# This script strips the HTML tags from an HTML file and outputs (to STDOUT)
# the contents of the file as plain text.
# It expects the name of an HTML file as a command-line argument.
# If no command-line argument is supplied, it reads from STDIN.
# Cameron Hayne (macdev@hayne.net)  February 2009

use strict;
use warnings;
use HTML::TreeBuilder;

# Determine the input source: exactly one file named on the command line,
# or STDIN when no command-line argument is given.
my $htmlFile;
my $inputName;    # human-readable name of the input, used in error messages
if (@ARGV)
{
    die "Usage: stripHtml htmlFile\n" unless scalar(@ARGV) == 1;
    $htmlFile = $ARGV[0];
    die "No such file: '$htmlFile'\n" unless -f $htmlFile;
    $inputName = "'$htmlFile'";
}
else
{
    # parse_file() is handed the STDIN glob instead of a filename
    $htmlFile = *STDIN;
    $inputName = 'STDIN';
}

my $tree = HTML::TreeBuilder->new;
# Ask the tree builder to keep the document's whitespace rather than
# discarding "ignorable" whitespace or compacting runs of spaces.
$tree->ignore_ignorable_whitespace(0);
$tree->no_space_compacting(1);
# parse_file() returns undef (with the reason in $!) when the named file
# cannot be opened - e.g. it exists but is not readable. Without this check
# the script would silently print nothing and exit successfully.
defined $tree->parse_file($htmlFile)
    or die "Cannot read $inputName: $!\n";
my $plainText = $tree->as_text();
# change non-breaking-spaces to regular space characters
# NOTE(review): parse_file() presumably reads the input as undecoded bytes;
# if so, for UTF-8 input this substitution would also hit the 0xA0 byte
# inside multi-byte characters - confirm, and decode the input if needed.
$plainText =~ s/\xA0/ /g;
print $plainText;
# Explicitly release the tree once the text has been output
$tree->delete();