#!/usr/bin/perl -w
use strict;

=head1 NAME

abunique.pl - merge duplicates in an exported CSV address book

=head1 SYNOPSIS

  abunique.pl tb.csv 4

=head1 BUGS

Sometimes getting:

Use of uninitialized value in pattern match (m//) at blib/lib/Text/CSV.pm (autosplit into blib/lib/auto/Text/CSV/combine.al) line 139.
Use of uninitialized value in substitution (s///) at blib/lib/Text/CSV.pm (autosplit into blib/lib/auto/Text/CSV/combine.al) line 154.

=head1 TODO

See "TODO" in the source.

Offer control of warning messages in command line, and count by type.

=cut

use Data::Dumper;
use Text::CSV;
my $csv = Text::CSV->new();

my $DEBUG = 0;

sub debug {
    print STDERR "DEBUG: ", @_, "\n" if $DEBUG;
}

my @ALL_WARNINGS = qw(substring emptyrow nonascii collapse wrap nokey conflict);
my %WARNINGS = map {$_=>1} qw(conflict);

sub warning {
    my ($warning_id, @rest) = @_;
    print STDERR "WARNING($warning_id): ", @rest, "\n" if $WARNINGS{$warning_id} || $DEBUG;
}

sub print_row {
    my ($row) = @_;
    die "can't combine row: ", Dumper($row) unless $csv->combine(@$row);
    print $csv->string(), "\n";
}
    

# 0 if nothing or substring or same. 1 if previous has nothing or is substring. 2 if conflict.
sub has_new_info {
    my ($k, $i, $merged, $r) = @_;
    my $av = $merged->[$i];
    my $bv = $r->[$i];
    return 0 unless $bv; # nothing in this row

    # ignore values identical to the key value
    # TODO: also check for other reundant copies across different columns
    return 0 if $bv eq $k;

    return 1 unless $av;
    return 0 unless lc($av) ne lc($bv);

    # one is a substring of another
    my $qav = quotemeta(lc($av));
    my $qbv = quotemeta(lc($bv));
    if (lc($av) =~ m/$qbv/) {
	# second is a substring of the first, so don't need to replace
	warning('substring', "for '$k' at column $i: ignoring value '$bv' because substring of '$av'");
	return 0;
    }
    if (lc($bv) =~ m/$qav/) {
	# first is a substring of the second, so need to replace
	warning('substring', "for '$k' at column $i: ignoring value '$av' because substring of '$bv'");
	return 1;
    }

    # it conflicts with this most recent row
    return 2;
}

    
sub merge_csv {
    my ($fname, $colind) = @_;

    my $indexed_count = 0;
    my $unindexed_lines = {};
    my $h = {};
    open(CSV, "<$fname") || die "can't open file $fname: $!";
    my $ncols;
    while(<CSV>) {
	chomp;
	# skip entirely empty lines
	if (m/^[,\s]*$/) {
	    warning('emptyrow', "skipping empty row $.: $_");
	    next;
	}
	# normalize fields that are just a single "0"
	# debug("line $. has 0 to get rid of: $_") if m/,0,/ || m/,"0",/;
	# sigh, how do i get global replace to start again ?
	s/,0,/,,/g;
	s/,"0",/,,/g;
	s/,0,/,,/g;
	s/,"0",/,,/g;
	die "did not fix line $.: $_\n" if m/,0,/;

	# sigh; Text::CSV seems to barf on non-ascii
	if (m/[^\040-\176]/) {
	    warning('nonascii', "row $. contains non-ascii, converting to space character: $_");
	    s/[^\040-\176]/ /g;
	}
	# also collapse spaces
	if (m/\s\s/) {
	    warning('collapse', "collapsing white space in row $.: $_");
	    s/\s+/ /g;
	}
	if (!$csv->parse($_)) {
	    for my $attempts (1..4) {
		warning('wrap', "row $. could not be parsed; trying an append of line after it ($attempts lines total): $_");
		my $rest = <CSV>;
		chomp($rest);
		$_ .= ' ' . $rest;
		s/\s+/ /g;
		last if $csv->parse($_);
		die "FATAL: can't parse line $.: $_\n" if $attempts == 3;
	    }
	}

	my $cols = [$csv->fields()];
	# trim fields
	for my $v (@$cols) {
	    $v =~ s/^\s+//;
	    # TODO: warn if removing trailing commas
	    $v =~ s/[\s,]+$//;
	}
	$ncols = scalar(@$cols) unless $ncols;
	my $key = $cols->[$colind];
	if (!$key) {
	    warning('nokey', "row $. has no value for key colum $colind: $_");
	    $unindexed_lines->{$_}++;
	    next;
	}
	my $a = ($h->{$key} ||= []);
	push(@$a, $cols);
	$indexed_count++;
    }
    close(CSV);

    # for each set of rows matching a single key, print just one row (assuming no conflicts)
    my $conflict_count = 0;
    my $merged_count = 0;
    while (my ($k,$a) = each %$h) {
	debug("merging ", scalar(@$a), " rows for '$k'");
	my @rows = @$a;
	my $merged = shift @rows;
	my @printed_rows = ($merged);
	my $maxind = scalar(@$merged) - 1;
	while(@rows) {
	    my $r = shift @rows;
	    # merge this row and the $merged row
	    # if any conflicts, print both lines
	    for my $i (0..$maxind) {
		my $ni = has_new_info($k, $i, $merged, $r);
		next unless $ni;
		die "empty or undefined value" unless $r->[$i];
		if ($ni == 1) {
		    $merged->[$i] = $r->[$i];
		    next;
		}

		# UGLY: also check the row before that.
		# TODO: compare multiple rows
		next if @printed_rows && !has_new_info($k, $i, $printed_rows[0], $r); 

		# there is a conflict
		warning('conflict', "there is a conflict for '$k' at column $i: '$merged->[$i]' != '$r->[$i]' so printing both");
		$conflict_count++;
		print_row($merged);
		push(@printed_rows, $merged);
		$merged_count++;
		$merged = $r;

		# we just made this the new row, so don't compare any more columns
		last;
	    }
	}
	print_row($merged);
	$merged_count++;
    }

    my $num_original_unindexed = 0; for(values %$unindexed_lines) {$num_original_unindexed += $_;}
    my $num_merged_unindexed = scalar(keys %$unindexed_lines);
    debug("printing $num_merged_unindexed unindexed lines (out of original $num_original_unindexed)" );
    print join("\n", keys %$unindexed_lines);
    print STDERR "SUMMARY: containing key: $indexed_count, merged to $merged_count with $conflict_count conflicts; $num_original_unindexed lines with no key, merged to $num_merged_unindexed\n";
}

sub main {
    die "Usage: $0 filename colind\n" unless @ARGV >= 2;
    my ($fname, $colind) = @ARGV;
    merge_csv($fname, $colind);
}

main();



