#!/usr/bin/perl

# stripHtml
# This script strips the HTML tags from an HTML file and outputs (to STDOUT)
# the contents of the file as plain text.
# It expects the name of an HTML file as a command-line argument.
# If no command-line argument is supplied, it reads from STDIN.
#
# Exits with an error message if more than one argument is given, if the
# argument does not name a regular file, or if the file cannot be opened.
#
# Cameron Hayne (macdev@hayne.net)  February 2009

use strict;
use warnings;
use HTML::TreeBuilder;

# Input source: either a file name or the STDIN glob. parse_file() is handed
# whichever of the two was selected.
my $htmlFile;
if (@ARGV) {
    die "Usage: stripHtml htmlFile\n" unless scalar(@ARGV) == 1;
    $htmlFile = $ARGV[0];
    die "No such file: '$htmlFile'\n" unless -f $htmlFile;
}
else {
    $htmlFile = *STDIN;
}

my $tree = HTML::TreeBuilder->new;

# Ask the tree builder to keep whitespace instead of dropping or collapsing
# it, so the text output retains the document's own spacing.
$tree->ignore_ignorable_whitespace(0);
$tree->no_space_compacting(1);

# The -f test above does not guarantee the file is readable. parse_file()
# returns undef (with the reason in $!) when it cannot open a named file, so
# report that instead of silently printing nothing.
defined $tree->parse_file($htmlFile)
    or die "Cannot read '$htmlFile': $!\n";

my $plainText = $tree->as_text();

# change non-breaking-spaces to regular space characters
# NOTE(review): the input is parsed without an explicit decoding step and
# STDOUT has no encoding layer; presumably this is fine for Latin-1 input,
# but confirm the behaviour for UTF-8 documents.
$plainText =~ s/\xA0/ /g;

print $plainText;

# Release the parse tree explicitly once the text has been written.
$tree->delete();