#!/usr/bin/perl
#
# rmaiib.pl - remove files in tree of a if they are in tree b
# Note: the files do NOT have to be in same place, 
# i.e., directory names don't have to be the same.
#
# Version 070804, 070122, 061129
# This version works by size of file.
# Need a test of equivalence -- test(x,y) returns true (1) if 
#       x == y in our setting. May in fact have x "earlier" than
#       y if we trust timestamps.
#
#       So far, we use exact equality, and use compare
# 
# Program structure and calling sequence
#  rmaifinb.pl topa topb savedir 
# where
#   topa gives path from current directory to top of tree a, or
#        else an absolute path
#   topb gives path from current directory to top of tree b, or
#        else an absolute path
#   savedir gives path from current directory or absolute path
#        to location where files "removed" from tree a will be
#        saved. ?? Should check this is NOT in a or b trees!
#        That could be interesting if it is a link.??
#
#  TODO:
#  1) manual, info, --help 
#  2) check for savedir location (not in a or b??)
#  3) Possibly put in links for removed files -- that way stuff still works
#  4) Check it runs on Windows and Mac
#  5) rmfilelist more like recovery script -- close but still some glitches

#
# Start:
#  - extract the arguments 
#    @ARGV[0] is topa
#    @ARGV[1] is topb - Use $ rather than @ to access elements
# (optional) @ARGV(2) is savedir -- provide a default, 
#	e.g. ~/rmaifinbsave/ (?? will this work??)
#
#      This program written by Prof. J C Nash
#         nashjc@uottawa.ca
#      This program is Copyright (C) 2006  J C Nash
#      It is distributed under the Gnu Public Licence
#       http://www.gnu.org/licenses/gpl.txt
#
use strict; # ensure all variables declared etc.
use File::Find;  # module that lets us build a full tree.
use File::Basename; # module that parses filenames into path, name, extension etc.
use File::stat; # module to allow access to file characteristics by name
use File::Compare; # module to compare 
#   or compare_txt (f1, f2, [shallow = T for just directory stuff])
use File::Copy;
use Cwd;

my $dirchar = "\/";     # use / for all??  "\\"; # for Windows. For Linux use / 

my ($i, $j, $tmpa, $tmpb, @arec, @brec, @globa, @globb);
my ($saved, $savedir, $nextai, $nextbj, @temprec, $trec);
my ($nfa, $nfb, @tda, @tdb, @tfa, @tfb, $nda, $ndb);
# for use in treefiledir and doit
my ($tmp, @tf, @td, $tfa_ref, $tda_ref, $tdb_ref, $tfb_ref, $uda_ref, $udb_ref, @uda, @udb); 
my ($tempdir, $savegloba, $saveglobb);

my $tempdir = "/temp/"; # default temporary directory. 
### 130902: Some systems do not have tempdir specified and user cannot create it! ???
# If possible, set storage for removed files on the same filesystem as topa to avoid 
# long file copies.

my $joinchar = '>'; # "|" this is "alternative" in regexes. 
# Must assume the joinchar is NOT in file names 
# * may cause Perl problems
# Note piping character | very bad. So is ^, <
# May need to filter all filenames first, then offer user advice.

# Get the top of both trees
my $numclargs = @ARGV; # get the number of arguments on the command line
# print "Number of arguments on command line = $numclargs \n";
if ($numclargs < 2) {
  print "$numclargs < 2 arguments - please supply directory tree names\n";
  exit;
}
if ($numclargs > 4) {
  print "$numclargs > max. 4 - TOO MANY ARGUMENTS\n";
  exit;
}
 
my $topa = $ARGV[0]; # first argument is 0'th
my $topb = $ARGV[1]; # note $ instead of @ and square rather than round brackets
my $wd = cwd; # the working directory


# my $savedir="."; # define just in case
if ($numclargs > 2) {
	$saved = $ARGV[2];
#?? should check that it has a trailing / ??
} else {
	$saved=$tempdir; # Directory to store removed files (Linux)
}
# my $saved="c:/temp/rm"; # Same for Windows
# NOTE: /tmp/ may be useful, but on shutdown files get cleared.

   my @splta=split("/",$topa);
   my $lastsdir=pop(@splta);
   my $tstmp = tdstamp();
   $savedir=$saved . $tstmp;
   $savedir = $savedir . $lastsdir;

print "Removed files go to $savedir \n";
mkdir($savedir) or die("Could not make save-directory $savedir"); # and make the directory

my $outfn="000_rmfilelist".$tstmp.".txt"; # a file to list the removed files
if ($numclargs > 3) {
	$outfn = $ARGV[3];
}
print "Removed duplicates listed to $outfn \n";

# Expand directory names to full path - 'a'
if (!( $topa =~ '^\/')) { # NOT a top level directory  ?? Windows \??
   $tmpa = $topa;
   $topa = $wd . $dirchar . $topa;
   print "Changed $tmpa to $topa \n";
}
($nfa, $tfa_ref, $nda, $tda_ref, $uda_ref) = treefiledir($topa);
@tda=@$tda_ref;
@uda=@$uda_ref;
@tfa=@$tfa_ref;

print "$nfa files and $nda unique directories in $topa\n";

#print "tda\n";
#showfiles(@tda);
#$tmp = <>;


# Expand directory names to full path - 'b'
if (!( $topb =~ '^\/')) { # NOT a top level directory
   $tmpb = $topb;
   $topb = $wd . $dirchar . $topb;
   print "Changed $tmpb to $topb \n";
}
($nfb, $tfb_ref, $ndb, $tdb_ref, $udb_ref) = treefiledir($topb);

@tdb=@$tdb_ref;
@udb=@$udb_ref;
@tfb=@$tfb_ref;


#print "tdb\n";
#showfiles(@tdb);
#$tmp = <>;


# Use uda and udb to find if trees overlap
# ?? ARE THERE SYMLINKS ??
# At the moment, SHOULD be ignoring them, as we are using -f switch
# in doit subroutine. However, this COULD be a bad idea if we are
# referring to directories. Why?? If we point to a from b, then we
# won't find a regular file in b if we don't follow the link. So it
# should be OK in this case. Need to test to make sure i.e., put such
# a case in the test files.

# Check that the savedir is NOT in either tree.
my $sdirbase=basename($savedir);
print "Savedir is in $sdirbase\n";
my $tmp=<>;

for ($i=0;$i<$nda;$i++) {
	if ( $sdirbase eq @uda[$i] ) { die("Savedir is in tree a!"); }
}
for ($j=0;$j<$ndb;$j++) {
	if ( $sdirbase eq @udb[$j] ) { die("Savedir is in tree b!"); }
}
print "Savedir appears safe\n";


# Now check for duplicate directories i.e., directory overlap
# ?? Another reason to NOT allow symlinks!! The link could make an overlap.
$i=0; # index to tree a directories
$j=0; # index to tree b
my $neqdir=0; # we will count how many directory strings are the same
while( ($i<$nda) and ($j<$ndb) )  { # ?? are directories sorted by name?
	if (@uda[$i] gt @udb[$j]) { 
		if ($j<$ndb) { 
			$j++ ;
		}  
	} else {
	  if (@uda[$i] lt @udb[$j]) {
		if ($i<$nda) { 
			$i++;
		 } 
	  } else { ## equal
		$neqdir++;
		print "$neqdir Duplicate directory: @tda[$i] =eq= @tdb[$j]\n";
		## flag equality, count it and possibly stop.
	  }
    	}
}
if ($neqdir > 0) {
	die("Duplicate directories");
}
print "No apparent directory tree overlaps\n";
#my $tmp=<>;

# Build file info for tree 'a' (by size at moment)
my @tglob=(); # initialize null
my $abytes = 0; # counter for number of bytes
for ($i=0;$i<$nfa;$i++) {
   my $ttd = $tda[$i];
   my $ttf = $tfa[$i];
#   print "tda[ $i ] = $ttd tfa = $ttf\n";
   my $ff = $tda[$i] . $dirchar . $tfa[$i]; # full file name with path
#   print "working on $ff\n";
#   print stat($ff);
#   print "\n";
   my $fstat = stat($ff); # get file properties 
   $abytes = $abytes + $fstat->size;  # accumulate number of bytes
   my $temp = join($joinchar, $fstat->size, $tfa[$i], $tda[$i] . $dirchar . $tfa[$i],  $fstat->mtime);
   push(@tglob, $temp); # generate record, size, name, expanded name, mod. time
#   print "$tfa[$i]\n";
 }
print "Total bytes in tree a = $abytes\n";

# print "TGLOB\n";
# dispglob(@tglob);

# sort by size into new array
@globa = sort {(split($joinchar, $a))[0] <=> (split($joinchar, $b))[0]} @tglob;  
print "Tree a list sorted\n";

$savegloba = $tempdir . "sgloba";
open (TFILE, ">$savegloba");
foreach $tmp (@globa) {
	print TFILE "$tmp\n";
}
close(TFILE);

# print "GLOBA\n";
# dispglob(@globa);

@tglob=(); # re-initialize and do same for tree 'b'
my $bbytes = 0;
for ($j=0;$j<$nfb;$j++) {
   my $ttd = $tdb[$i];
   my $ttf = $tfb[$i];
#   print "tdb[ $i ] = $ttd tfb = $ttf\n";
   my $ff = $tdb[$j] . $dirchar . $tfb[$j]; # full file name with path
#   print "working on $ff\n";
#   print stat($ff);
#   print "\n";
   my $fstat = stat($ff);
   $bbytes = $bbytes + $fstat->size; 
   my $temp = join($joinchar, $fstat->size, $tfb[$j], $tdb[$j] . $dirchar . $tfb[$j],  $fstat->mtime);
   push(@tglob, $temp);
#   print "$tfb[$j]\n";
 }
print "Total bytes in tree b = $bbytes\n";
# print "TGLOB\n";
# dispglob(@tglob);

# sort by size into new array
@globb = sort {(split($joinchar, $a))[0] <=> (split($joinchar, $b))[0]} @tglob;  
print "Tree b list sorted\n";

$saveglobb = $tempdir . "sglobb";
open (TFILE, ">$saveglobb");
foreach $tmp (@globb) {
	print TFILE "$tmp\n";
}
close(TFILE);


# print "GLOBB\n";
# dispglob(@globb);

$|=1; # unbuffer printer so we see all actions

# Now do the search to remove duplicates
# open a file to list the files removed
my $outfile=$topa . $dirchar . $outfn;
open (OUTF, ">$outfile") or die("Could not open file $outfile for write"); # and open the file
my $numequal = 0; # To count the number of equalities
my $bytesgone = 0; # to accumulate the savings
# NOTE: because of block size, we actually may free more
# disk space than reported.
# print "Using joinchar = $joinchar\n";
# $i will be index for tree a
# $j will be index for tree b
$i = $nfa-1; # start from top (biggest files)
#   Note: indexing from 0, so start at top-1
$j = $nfb-1; 
# now have both records with 0=size, 1= name, 2=full file, 3=time??
while ( ($i>=0) && ($j>=0) ) { # main loop until one or other set of files exhausted
	# i indexes current top of tree A, j top of tree B
	@arec=split($joinchar, $globa[$i]); # extract data about files
	@brec=split($joinchar, $globb[$j]);
#    print "\r $i : $j : size=$arec[0] : ";
    if ($arec[0] == $brec[0]) { # filesize same
	# Find the lengths of equal sized groups
		if($i > 0) {  # $i must be > 0; only proceed if have not exhaused tree 
			# tree a group will be from $i down to ($nextai+1)
			@temprec = @arec;
			$nextai=$i; # initialize to current tree-a pointer
			while ( ($temprec[0]  ==  $arec[0]) && ($nextai >= 0) ) {
			# always TRUE on first pass
				$nextai--; # but immediately decrement
				# we will exit loop if size changes OR exhaust tree a
				if ($nextai >=0 ) {
					@temprec=split($joinchar,$globa[$nextai]);
				}
			}
		} else { $nextai = -1 }
		if($j > 0) { 
			@temprec = @brec;
			$nextbj=$j;
			while ( ($temprec[0]  ==  $brec[0] ) && ($nextbj >= 0)) {
				$nextbj--;
				if ($nextbj >=0) {
					@temprec=split($joinchar,$globb[$nextbj]);
				}
			} 
		} else { $nextbj = -1 }
	    print "\r $i : $j : size=$arec[0] : $arec[1] : $nextai : $nextbj ->";
	    # attempt to display what is going on in terms of indices of A and B trees
		# groups run down from $i to ($nextai+1), $j to ($nextbj+1) INCLUSIVE		
		while ($i > $nextai) { # loop over the files to be possibly removed from set
			@arec=split($joinchar, $globa[$i]); # extract data about files
			# extract again, just in case of changes in code
			my $tj = $j;
			while ($tj>$nextbj) { # loop over ALL the b files
				@brec=split($joinchar, $globb[$tj]);
				if (eqfile($arec[2], $brec[2]) == 0) {
			    		$numequal++;
					$bytesgone=$bytesgone+$arec[0];
	          			# Now check for file already present then move to savedir
        	  			my $nameidx=0;
          				my $savefn="$savedir/$arec[1]";
          				if (-e $savefn) {
            				# print "File $savefn exists, so adding index\n";
	    					print "i"; 
		            			$nameidx++;
            					$savefn=$savefn."_$nameidx";
          				} else { print "r"; }
		       			move($arec[2], $savefn); # but move (delete) it anyway
					$arec[0] = -1; # indicate file has been removed from glob ?? needed?.
   		    			print OUTF "\# mv $savefn $arec[2]\n";
	    				print OUTF "\# which corresponds to: $brec[2]\n";
## WRONG -- an earlier one may not be removed!!
	#				$i--; # decrement index on tree a, file has been removed
	#				if ($i > $nextai) {
	#					@arec=split($joinchar, $globa[$i]); # extract data about next file
	#				} else { $tj = $nextbj } # we are finished with these  groups
					$tj = $nextbj; # want to stop loop on tree b group
	  			} # end direct comparison of files
				$tj--; # use next file in tree b for comparison (doesn't matter if we've set to nextbj)
			} # end loop on tree b -- not same file
#	        if ($i > $nextai) {$i=$i-1}; # index of next file in tree a
			$i--; # simply decrement tree a index
	     	} # end while on tree a
		$i=$nextai; # This is merely  aprecaution
		$j=$nextbj; # adjust indices to next set of files
		# end of loop -- we have compared all files and possibly removed some
	} else { # unequal sizes
		if ($arec[0] > $brec[0]) { # file from tree a is bigger
			$i--; # next file from tree a (comparison is to tree b)
		} else { # b bigger
 			$j--; # next file in tree b
		}
 	} # end if equal size
} # end main while
print "\n";
print  "# $numequal equivalences found\n";
print "# At least $bytesgone bytes removed \n";
print OUTF "# ================= $numequal equivalences found   ===============\n";
print OUTF "# At least $bytesgone bytes removed \n";
close OUTF; # have to close this out and PREPEND the dirs

my $tmpout = $tempdir . "workfile";
open (TOUTF, ">$tmpout") or die("Could not open file $tmpout for writing"); 
print TOUTF "# List of files removed from $topa tree\n\n";
print TOUTF "# Date/time: $tstmp\n\n";

# Now try to remove empty directories in tree a
print "\n Remove empty directories \n";
# NO notification and no way to rebuild later!! But maybe we can learn how.??
my %rmopts =(
				wanted => "\&remmer",
				no_chdir =>	1
);
my $numrmdirs = 0;

finddepth (\&remmer, $topa);
print "Now add in the file recovery commands\n";
open (INF, $outfile) or die("could not open $outfile for re-read");
@tglob = <INF>; # get the data
close INF;
foreach (@tglob) {
	# Don't need line end -- in the file already.
	print TOUTF "$_";
}
print "# $numrmdirs empty directories removed\n";
print TOUTF "# $numrmdirs empty directories removed";
close TOUTF;
unlink($outfile);
copy($tmpout, $outfile);

my $tofile =$savedir.$dirchar.$outfn;
print "Save filelist again to $tofile\n";
copy($outfile, $tofile ) or print "File $tofile cannot be written/copied.\n";
copy($outfile, $tempdir.$outfn); # another copy just in case
print  "# $numequal equivalences found\n";
print "# $bytesgone bytes removed \n";
exit; # end of procedure


#===================   Subroutines  ==========================================
# Used by finddepth() from File::Find to remove empty dirs.
# Need to figure out how to save info to file about which ones work.
sub remmer {
	if (rmdir( $_ ) ) {
		print "Removed $File::Find::name \n";
		print TOUTF "# mkdir -p $File::Find::name \n";
		$numrmdirs++;
	}
}

# The following function is used to declare if files are equivalent.
# This could be altered depending on the usage of the program.
# An earlier version relied on MD5 sums of the files. If the MD5 sms
#   were equivalent, then the files were taken to be so. However, the
#   present code uses a direct comparison. Timings have not been 
#   carried out as at 2006-7-6.
sub eqfile { # Compare files, returns 0 if equal, 1 if otherwise
   my $one = shift @_;
   my $two = shift @_;
 #  print "Comparing $one and $two \n";
   my $result = compare($one, $two);
   $result; # return the result
}

# The following subroutine builds the tree array of files and 
# directories
sub treefiledir {
   my $treetop = shift @_;
   my @dirs = ();
   push (@dirs, $treetop);
   print "Expanding file and directory list for tree: $treetop \n";
   # Array initialization 
   @tf=(); # initialize. To hold the filenames
   @td=(); # to hold directory names
   find(\&doit, @dirs );
   # Note no return value used.
   # Uses "find" from the top of the a tree.
   # calls subroutine doit (the \& escapes the & so we get a subroutine
   # call and not another interpretaion of & e.g., bitwise and

#   foreach $_  (@td) {
#	print $_,"\n";
#   }

   my $nf=@tf; # the number of files
   my $nd=@td; # the number of directories
   print "$nf files directories found\n";
   # showfiles(@tf); # display the files
   #Now get the number of unique directories in tree
   # see	Unique Arrays
   #		in Perl Tips, Data Structures
   #		by William Ward on July 21, 2002 4:37 pm

#   If you have an array that may contain the same value in several places, and you would like to sort the array and remove all duplicates, here’s one way to do it…
#   
#   First, make a hash using the elements of the array as keys. It doesn’t matter what the values are. For example:
#   
#     my %hash = map { $_ => 1 } @array;
#   
#   The "map" function converts each array element into a tuple consisting of that value and 1. The results are then loaded into the hash as key/value pairs. The key is the array element and the corresponding value in the hash is 1. Even if two or more array elements have the same value, each key will only be in the hash once, because hash keys must be unique.
#   
#   Then we take that hash and sort the keys:
#   
#     my @array2 = sort keys %hash;
#   
#   And voila! Now you have all the elements of the original array, sorted, and each value occuring only once!



   my %hash = map { $_ => 1 } @td;
   my @utd = sort keys %hash;

# 070206 -- the sort commands are arcane and may give lots of trouble
# if the underlying "trick" no longer works. 
# ?? Can we find a sure way to make this work.
   my $ntd=@utd;
   print "$ntd unique directories in tree\n";
   #showfiles(@utd);
   print "==========================\n";
# need to document next line WELL ??
   return ($nf, \@tf, $ntd, \@td, \@utd);
}


# The following routine is needed by find() (from File::Find) in order
#  to process and save the results of the find operations.
# ?? put in links finder -- may be useful, even if commented out.
sub doit {
# ?? do we need??  my (@tf, @td);

  my $olddol = $_; # for safety, since Find::File may mess up with 
  #      changes in $_
  my $fpath=$File::Find::name;
  if (-f $fpath) { # only want regular files
    if (index($fpath, $joinchar)>-1) { 
     print "ERROR - joinchar ($joinchar) in path!!\n";
     print "fpath = $fpath \n";
     exit;
    }
    my $ofp=$fpath;
    my $fname=basename($fpath); # the name only
    my $dname=dirname($fpath);  #  the directory only
#    print "fname=$fname    dname=$dname \n";
    my $fn = $fname;
#    print "fn = $fn\n";
    push (@tf, $fname);
    push (@td, $dname);
  $_ = $olddol; # recover the $_ variable and return with the 
#   filename and directory arrays
#  ($_, \@tf, \@td);
  }
}

# Diagnostic display of files found.
sub showfiles {
 print "\n\n";
 my $numf = @_;
 print "$numf files\n";
  for (my $i=0;$i<$numf;$i++) {
    print "$i  $_[$i]\n"; # $_ not @_
   }
 print "===============\n";
 }

sub dispglob {
	my @tglob=@_;
	foreach my $row (@tglob) {
		print "$row\n";
	}
    $tmp = <>;
}	

sub tdstamp {
#
# Creates a timestamp string that can be appended to filenames
#
# J C Nash formalized on 20070207
#
	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
	($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
# The timestamp at start of program execution.
# printf "%4d-%02d-%02d %02d:%02d:%02d\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;
# Can display timestamp if we wish by uncommenting line above.
	my $tstmp='';
	$year=1900+$year; # Full year number
	$mon=$mon+1; # Human style month number 
	$tstmp = $tstmp. $year; # make a good directory name
	if ($mon < 10) {
	  $tstmp = $tstmp . "0" . $mon;
	  } else {
	  $tstmp = $tstmp . $mon; 
	}
	if ($mday < 10) {
	  $tstmp = $tstmp . "0" . $mday;
	  } else {
	  $tstmp = $tstmp . $mday;
	}
   if ($hour < 10) {
	  $tstmp = $tstmp . "0" . $hour;
	  } else {
	  $tstmp = $tstmp . $hour;
	}
   if ($min < 10) {
	  $tstmp = $tstmp . "0" . $min;
	  } else {
	  $tstmp = $tstmp . $min;
	}
   if ($sec < 10) {
	  $tstmp = $tstmp . "0" . $sec;
	  } else {
	  $tstmp = $tstmp . $sec;
	}
   return($tstmp);
}
# ============ end of rmaiib.pl =================
