#!/usr/local/bin/perl -w
#
use strict;

# First command-line argument: the ISBN-to-work-ID mapping file.
# A missing (or false) value aborts with the usage message.
my $isbn2ids = shift @ARGV;
unless ($isbn2ids) {
    die "\nUsage: $0 +isbn2ids [qrels_file|runs]\n\n";
}

# Every remaining argument is a file to deduplicate; with none left there
# is nothing to do, which is reported but not treated as an error.
unless (@ARGV) {
    print STDERR "No data to deduplicate, exiting...\n";
    exit 0;
}

# Lookup table from ISBN to work ID, filled from the mapping file named by
# $isbn2ids and consulted later when the data files are deduplicated.
my %work = ();

print STDERR "Reading ISBN to work ID mappings...\n";
{
    # A name ending in ".gz" is decompressed on the fly through gunzip.
    my $is_gzipped = $isbn2ids =~ /\.gz$/;
    my $id_fh;
    if ($is_gzipped) {
        # List-form pipe open: the file name is handed to gunzip as a single
        # argument and never passes through a shell, so spaces or shell
        # metacharacters in the name cannot break or alter the command.
        open $id_fh, '-|', 'gunzip', '-c', '--', $isbn2ids
            or die "can't read from $isbn2ids: $!\n";
    }
    else {
        # Three-argument open: the name is always treated as a plain file.
        open $id_fh, '<', $isbn2ids
            or die "can't read from $isbn2ids: $!\n";
    }

    # Each usable line holds an ISBN, one separator character (space, tab,
    # CR or LF) and a work ID; any further fields are ignored.
    while (my $line = <$id_fh>) {
        my ($isbn, $id) = split /[ \t\r\n]/, $line;
        unless (defined $id and length $isbn and length $id) {
            # Storing an empty or undefined work ID would later blank out
            # document fields, so incomplete lines are skipped. Blank lines
            # are skipped silently; anything else is reported.
            warn "skipping malformed mapping line $. of $isbn2ids\n"
                if $line =~ /\S/;
            next;
        }
        $work{$isbn} = $id;
    }

    # For a pipe, close() returns false when the child exited non-zero
    # (e.g. a corrupt archive); $! is then 0 and $? holds the wait status.
    unless (close $id_fh) {
        my $exit_code = $? >> 8;
        die $is_gzipped && !$!
            ? "gunzip failed on $isbn2ids (exit status $exit_code)\n"
            : "can't close $isbn2ids: $!\n";
    }
}

# Deduplicate every remaining file named on the command line.
#
# For each input file a sibling "<file>.no_duplicates" is written. Lines are
# split on single space/tab/CR/LF characters; the first field is the query
# ID and the third field is the document ISBN. Per query, only the first
# line for each work ID is kept, with its ISBN field replaced by the work
# ID; later lines mapping to the same work are dropped and counted.
# Lines whose ISBN has no work ID are copied through unchanged and reported
# on STDOUT. The number of removed duplicates is printed on STDOUT.
foreach my $file (@ARGV) {
    print STDERR "Deduplicating file $file...\n";
    my %found = ();        # $found{$qid}{$work_id}: work already emitted for query
    my $duplicate = 0;     # number of lines dropped from this file
    my $outfile = "$file.no_duplicates";

    # Three-argument opens so the names are always treated as plain paths.
    open my $in, '<', $file or die "can't read from $file: $!\n";
    open my $out, '>', $outfile or die "can't write to $outfile: $!\n";

    while (my $line = <$in>) {
        my ($qid, $qit, $doc) = split /[ \t\r\n]/, $line;

        # Blank or short lines carry no document field; keep them verbatim.
        if (not defined $doc) {
            print {$out} $line;
            next;
        }

        # A missing or undefined mapping counts as "no work ID", so the
        # document field is never replaced by an empty string.
        my $work_id = $work{$doc};
        if (not defined $work_id) {
            print "no work ID for isbn $doc\n";
            print {$out} $line;
            next;
        }

        if (exists $found{$qid}{$work_id}) {
            $duplicate++;
            next;
        }
        $found{$qid}{$work_id} = 1;

        # Replace exactly the third field: skip the first two fields and
        # their separators, then match the ISBN literally (\Q..\E), so an
        # identical substring earlier in the line is never touched and the
        # ISBN is never interpreted as a regex.
        $line =~ s/\A((?:[^ \t\r\n]*[ \t\r\n]){2})\Q$doc\E/$1$work_id/;
        print {$out} $line;
    }

    # Buffered write errors (e.g. disk full) only surface at close time.
    close $out or die "can't write to $outfile: $!\n";
    close $in;

    print "$duplicate duplicates removed from $file\n";
}

