# detect_duplicates.pl

# Usage: perl detect_duplicates.pl <DIRECTORY>

# --------------------

# By googiek, 2013

# --------------------

# This script searches recursively through a given directory and finds

# duplicate files. Excellent for a large, shared dropbox folder.

#

# The script sorts the files by size, and finds files that have the same size.

# Then, for files with the same size (ie potential duplicates) the script compares

# the md5 checksum of the files to ensure that they are identical. (Checking by size first

# saves time, since md5 is a slower operation than a size check)

#

# NOTE: Be careful when working with old iWork files, or other files that are packages with lots

# of tiny system files in them. Often they'll show up as identical to the system files in other

# Pages, Numbers, etc files, and bring up a lot of duplicates.

#

# Running this script with no argument, "perl detect_duplicates.pl" brings up this help

#

# ----------------------------------------------------------------------------------------

#

# This script only detects files with exactly the same content. It does not find images that

# are resized / different aspect ratio, GIFs in different frame numbers, watermarks, etc.

# Adding this would be a cool exercise though, and would probably involve some Mad Math,

# ie taking into account color profiles, finding the average color of certain areas in the images,

# or even affine transformations.

use strict;
use warnings;

use Data::Dumper;
use Digest::MD5;

# With no directory argument, print the usage/help text and exit cleanly.
# (A bare `die` here would report "Died at ... line N." and exit nonzero,
# which is wrong for a deliberate help request.)
unless (@ARGV) {
    print <<"END_HELP";

# detect_duplicates.pl
# Usage: perl detect_duplicates.pl <DIRECTORY>
# --------------------
# This script searches recursively through a given directory and finds
# duplicate files. Excellent for a large, shared dropbox folder.
#
# The script sorts the files by size, and finds files that have the same size.
# Then, for files with the same size (ie potential duplicates) the script compares
# the md5 checksum of the files to ensure that they are identical. (Checking by size first
# saves time, since md5 is a slower operation than a size check
#
# NOTE: Be careful when working with old iWork files, or other files that are packages with lots
# of tiny system files in them. Often they'll show up as identical to the system files in other
# Pages, Numbers, etc files, and bring up a lot of duplicates.
#
# Running this script with no argument, "perl detect_duplicates.pl" brings up this help
END_HELP
    exit;
}

my $dir = $ARGV[0];
my @todelete;    # duplicates queued for deletion (older copy of each pair)

# Gather every regular file under the target directory.
my @files = @{ AddFiles($dir) };
print scalar(@files) . " files found...\n";

# Sort by size (descending) so files of equal size form contiguous runs.
@files = sort { -s $b <=> -s $a } @files;
print "Files have been sorted...\n";

# Walk each run of same-sized files and compare by MD5. Grouping digests in
# a hash (instead of comparing only adjacent pairs) hashes each file at most
# once AND catches duplicates that are not directly adjacent within a run.
my $i = 0;
while ( $i < @files ) {
    # Find the end of the run of files sharing $files[$i]'s size.
    my $j = $i + 1;
    $j++ while $j < @files and -s $files[$j] == -s $files[$i];

    if ( $j - $i > 1 ) {
        my %seen;    # hex digest -> most recently seen file with that content
        for my $k ( $i .. $j - 1 ) {
            my $file = $files[$k];
            next if $file =~ m/\.DS_Store/;    # Finder metadata, never interesting

            # 3-arg open with a lexical handle; binmode so MD5 sees raw bytes.
            open my $fh, '<', $file or die "Can't open $file\n$!\n";
            binmode $fh;
            my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
            close $fh;

            if ( exists $seen{$digest} ) {
                print "Duplicates found: \t $seen{$digest} \t $file\n";
                # Queue the earlier copy; keep the latest one, matching the
                # original adjacent-pair behavior for runs of duplicates.
                push @todelete, $seen{$digest};
            }
            $seen{$digest} = $file;
        }
    }
    $i = $j;    # jump past this size run
}

# Ask for confirmation before deleting anything; default is to keep files.
print "\nDelete " . scalar(@todelete) . " files? (y/n)";
chomp( my $flag = <STDIN> );

if ( lc($flag) eq "y" ) {
    foreach my $file (@todelete) {
        # Report failures instead of silently ignoring them (permissions,
        # file vanished since the scan, etc.).
        unlink $file or warn "Could not delete $file: $!\n";
    }
    print "Files deleted.\n";
}
else {
    print "Files not deleted.\n";
}

print "\n\nDone!\n\n";

# AddFiles($dir) -- recursively collect regular files under $dir.
#
# Returns an array reference of "$dir/..." paths. Symbolic links to
# directories are not followed (prevents infinite loops via link cycles),
# and "." / ".." are skipped. Dies if $dir cannot be opened.
sub AddFiles {
    my ($dir) = @_;

    opendir( my $dirhandle, $dir ) or die "Ouch!\n$!\n";

    my @files;
    # `defined` guard: a file literally named "0" must not end the loop.
    while ( defined( my $entry = readdir($dirhandle) ) ) {
        my $path = "$dir/$entry";
        if ( -f $path and $entry ne "Icon\r" ) {    # skip classic-Mac Icon files
            push @files, $path;
        }
        # Recurse only into real subdirectories: not ".", "..", not symlinks.
        elsif ( -d $path and !-l $path and $entry !~ /^\.+$/ ) {
            push @files, @{ AddFiles($path) };
        }
    }
    closedir $dirhandle;    # must run BEFORE return, or the handle leaks

    return \@files;
}