#! /usr/bin/env perl -w
#
# Process a recursive md5sum output and hard link the files together that
# have the same md5sum to save space (assuming they have different inodes).
#
# Usage:
#
# 1. Checksum files:
#
# find /basepath [/basepath] -type f | xargs md5sum | tee /tmp/files.md5sum
#
# 2. See files to be linked together (ie, dry run, unset GO)
#
# cat /tmp/files.md5sum | lnsame      # or
# lnsame /tmp/files.md5sum
#
# 3. Replace copies with hard links  (ie, live, set GO=1)
#
# cat /tmp/files.md5sum | GO=1 lnsame   # or
# GO=1 lnsame /tmp/files.md5sum
#
# This is a very naive approach, _assuming_ that all files with the same
# checksum should be linked together, and that they are not already linked
# together.  It does not check file owners, or permissions, just arbitrarily
# choosing one file to be the reference for those.  It also assumes that
# the files remain unmodified both between the checksum and the linking,
# and afterwards (as modifying one of the hard linked files will modify all
# of them).  Ideally it would be used on "read only" backups.
#
# It is intended to be used when, eg, the same camera memory cards have been
# copied into multiple locations in multiple layouts (eg, one arranged 
# by camera card, and another arranged by date).
#
# Written by Ewen McNeill <ewen@naos.co.nz>, 2012-04-29
# Updated by Ewen McNeill <ewen@naos.co.nz>, 2015-07-16
#---------------------------------------------------------------------------
#
# Copyright (c) 2015, Ewen McNeill.  All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# 
#---------------------------------------------------------------------------

use 5;
use strict;
use warnings;

# Operating mode: the mere presence of GO in the environment (whatever its
# value, even empty or 0) selects a live run; otherwise this is a dry run.
my $do_update = defined $ENV{GO};

if (!$do_update) {
    # Show the exact command line to repeat this run for real.
    my $quoted_args = join ' ', map { '"' . $_ . '"' } @ARGV;
    print "Dry run -- will only list changes that would be made\n",
          "For live run set GO=1 environment variable; eg\n",
          "GO=1 $0 $quoted_args\n";
} else {
    print "Live update starting now -- stop program immediately to abort\n";
    sleep 2;    # brief pause so the user has a chance to interrupt
}
print '-' x 75, "\n";


# Maps each checksum to an array ref of the paths seen with that checksum,
# in input order.
my %file_by_sum;

# Split one line of md5sum output into its checksum and path.
#
# Accepts both the text-mode form ("<sum>  <path>") and the binary-mode form
# ("<sum> *<path>").  Returns the two-element list (checksum, path) on a
# match, or the empty list when the line is not in either form.  Lines that
# md5sum prefixes with a backslash (names containing a backslash or newline,
# per the GNU coreutils manual) do not match, as they need unescaping that
# is not attempted here.
sub parse_md5sum_line {
    my ($line) = @_;
    return $line =~ /^([0-9a-f]+)\s[\s*](.+)$/ ? ($1, $2) : ();
}

# Organise by checksum
while (my $line = <>) {
    my ($sum, $path) = parse_md5sum_line($line);
    if (defined $sum) {
        push @{ $file_by_sum{$sum} }, $path;
    } elsif ($line =~ /\S/) {
        # Say so rather than silently dropping the file from consideration;
        # blank lines are ignored quietly.
        chomp $line;
        warn "Ignoring unrecognised line: $line\n";
    }
}

# Look for files with more than one copy
#
# For each checksum seen on two or more paths, the first path listed in the
# input is kept as the master and every other path is reported as
# "copy -> master".  In a live run ($do_update true) each copy is then
# replaced by a hard link to the master.
#
# A copy is never removed before its replacement exists: the new link is
# created under a temporary name next to the copy and then renamed over it.
# If link() fails (different filesystem, permissions, missing master, ...)
# the copy is left untouched.  Problems are reported on STDERR and
# processing continues with the next file.
foreach my $sum (sort keys %file_by_sum) {    # sorted for repeatable output
    my @files = @{ $file_by_sum{$sum} };
    next if @files < 2;

    print scalar @files, " files: ", join(", ", @files), "\n";

    my ($master, @copies) = @files;
    COPY:
    foreach my $copy (@copies) {
        print "$copy -> $master\n";
        next COPY unless $do_update;

        my @master_stat = stat($master);
        if (!@master_stat) {
            warn "Cannot stat $master: $!; leaving $copy untouched\n";
            next COPY;
        }
        my @copy_stat = stat($copy);
        if (!@copy_stat) {
            warn "Cannot stat $copy: $!; skipping\n";
            next COPY;
        }

        # Same device and inode: the two paths already name one file (an
        # existing hard link, or the same path listed twice).  There is
        # nothing to gain, and renaming one link of a file over another
        # link of the same file would leave the temporary name behind.
        if (   $master_stat[0] == $copy_stat[0]
            && $master_stat[1] == $copy_stat[1]) {
            warn "$copy is already the same file as $master; skipping\n";
            next COPY;
        }

        # Build the replacement beside the copy, then swap it into place.
        my $temp_link = "$copy.lnsame-$$";
        if (!link($master, $temp_link)) {
            warn "Cannot link $master to $temp_link: $!; ",
                 "leaving $copy untouched\n";
            next COPY;
        }
        if (!rename($temp_link, $copy)) {
            warn "Cannot rename $temp_link to $copy: $!; ",
                 "leaving $copy untouched\n";
            unlink($temp_link)
                or warn "Cannot remove $temp_link: $!\n";
        }
    }
}
