#!/usr/local/bin/perl -w

use strict;

=head1 DESCRIPTION

Apply these transforms to the input HTML:

   tidy (indent, remove extra styling)
   retain only the body's inner HTML
   fix URLs to be relative

=cut

use File::Basename;

# The first command-line argument is the output directory; every remaining
# argument is an HTML file to clean.
my $outdir = shift @ARGV;
# Without this check a missing argument would reach -d as undef and only
# produce "uninitialized value" warnings plus a confusing message.
die "usage: $0 outdir file.html [file.html ...]\n" unless defined $outdir;
die "dir '$outdir' does not exist or is not a directory" unless -d $outdir;

# Run HTML Tidy on $file and return everything it prints as one string.
# Dies if the tidy process cannot be started.
sub _run_tidy {
    my ($file) = @_;

    # A name starting with '-' would be parsed by tidy as an option.
    my $path = $file =~ m{\A-} ? "./$file" : $file;

    # List-form pipe open: the command runs without a shell, so spaces or
    # shell metacharacters in the file name are passed through verbatim.
    open(my $tidy, '-|', 'tidy', '-indent', '-clean', '-ashtml', '-quiet',
        '-wrap', '1000', $path)
        or die "can't tidy: $!";
    my $html = join('', <$tidy>);
    # The exit status is not checked (same as before); empty output is
    # rejected by the caller instead.
    close($tidy);
    return $html;
}

# Reduce a tidied HTML document to its cleaned-up content fragment.
#
#   $html - complete document text as produced by tidy
#   $f    - name of the source file, used only in error messages
#
# Returns the cleaned fragment.  Dies with a message naming $f as soon as
# any step leaves nothing but whitespace, or when the expected page layout
# is not found.
sub _extract_content {
    my ($html, $f) = @_;

    die "nothing in $f after tidy" if $html =~ m/^\s*$/s;

    # remove all but body innerhtml
    $html =~ s/.*<body[^>]*>//is;
    $html =~ s,</body>.*,,is;
    die "nothing in $f after body innerhtml" if $html =~ m/^\s*$/s;

    # TEMPORARY
    # Strip the page chrome around the content.  The content is taken from
    # the last <td> that is followed (after whitespace) by an <hr>; the text
    # preceding that <td> on its line is remembered so the matching close
    # tag can be located.
    my ($td_prefix, $after_hr) =
        $html =~ m,.*\n(.*)<td[^>]*>[\s\n]+<hr>(.*),si
        or die "didn't match leading crap in $f";

    # The matching </td> is the first one preceded on its line by exactly
    # the same text.  \Q...\E keeps any regex metacharacters in that text
    # from being interpreted as pattern syntax.
    my ($content, $rest) =
        $after_hr =~ m,(.*?)\n\Q$td_prefix\E</td>(.*)$,si
        or die "in $f no closing '$td_prefix</td>'";

    $html = $content;
    # A form located after the content cell is kept and appended.
    if ($rest =~ m,(<form.*        )</td>,si) {
        $html .= "\n$1";
    }
    else {
        die "in $f did not preserve form" if $rest =~ m/form/si;
    }
    die "nothing in $f after menu removal" if $html =~ m/^\s*$/s;

    # remove leading and trailing white
    $html =~ s,^[\n\s]*\n,,;
    die "nothing in $f after leading white removal" if $html =~ m/^\s*$/s;
    $html =~ s,(<p>|</p>|\&nbsp;|[\n\s])+$,\n,;
    die "nothing in $f after trailing white removal" if $html =~ m/^\s*$/s;
    # The trailing cleanup above almost certainly removed a closing </p>.
    $html .= "</p>\n" unless $html =~ m/p>\n*$/;

    # remove common indent (the indent of the first line is taken as common)
    my ($indent) = $html =~ m/^([\s]*)/;
    $html =~ s,^\Q$indent\E,,mg;

    # relativize urls
    $html =~ s,http:[^\'\"]*/([^\'\"]*),$1,g;

    # collapse empty lines
    $html =~ s,\n\s*\n,\n,g;

    # remove class attributes added by -clean
    $html =~ s, class=".*?",,g;

    return $html;
}

# Clean every file named on the command line and write each result into
# $outdir under the file's base name.
for my $f (@ARGV) {
    my $html = _extract_content(_run_tidy($f), $f);

    # Test definedness/length rather than truth so that a directory named
    # "0" is still treated as an output directory.
    if (defined $outdir && length $outdir) {
        my $outfile = "$outdir/" . basename($f);
        open(my $out, '>', $outfile)
            or die "can't open $outfile for write: $!";
        print {$out} $html;
        # Buffered write errors only surface when the handle is closed.
        close($out) or die "can't close $outfile: $!";
        print STDERR "wrote cleaned content to $outfile\n";
    }
    else {
        print $html;
    }
}

