r'''
Created on Oct 26, 2015

@author: Brett Paufler
(c) Copyright Brett Paufler

NOT MEANINGFUL
AFTER RUNNING A BEFORE AND AFTER COMPRESSION,
SPACE SAVINGS IS NEGLIGIBLE
HAS NO MEANINGFUL EFFECT ON 7zip size

Deletes duplicate files of noted extension type(s).
Extension types are listed in ext_list.

Two files are considered the same
(and the second one encountered is deleted) if:
    file_name is the same
    file_size is the same
    file_hash is the same

Intent is to clear duplicate image files from a website backup,
as they cannot be compressed, so only one copy of each image is kept,
but multiple copies of everything else are left alone.

    1) Place all files and folders in .\output
    2) Add 'reference' to name of any file/folder to be left intact,
        Program works from the top down (entries are visited in sorted
        order), so if no 'reference', top folder is processed first
    3) Run this python program
'''

from os import walk, listdir, mkdir, remove
from os.path import basename, exists, getsize, isdir, join, splitext
import shutil
from collections import namedtuple

# THIS IS HARDWIRED - Changing anything may have unforeseen side effects
dir_in = r'.\input'
dir_out = r'.\output'
ext_list = ['bmp', 'jpg', 'png', 'pdf', 'gif', 'js']

# Identity of a file for de-duplication purposes: base name, size in bytes
# and the builtin hash() of its raw content.
fileData = namedtuple('fileData', ['name', 'size', 'hash'])


def files_in_dir(directory):
    '''Returns list of paths to all files in the passed directory tree,
    including sub-directories.

    Directories and file names are visited in sorted order so that
    "the second one encountered" is the same file on every run.'''
    paths = []
    for root, dirs, names in walk(directory):
        # Sorting in place steers the order in which walk() descends.
        dirs.sort()
        paths.extend(join(root, name) for name in sorted(names))
    return paths


def _has_listed_extension(path):
    '''True if the extension of path (case-insensitive) is in ext_list.'''
    # splitext copes with extensions of any length; slicing the last three
    # characters could never match the two-letter 'js' entry.
    return splitext(path)[1][1:].lower() in ext_list


def _content_hash(path):
    '''Returns hash() of the raw bytes of the file at path.'''
    # Binary mode: text mode on Windows translates line endings and, on
    # Python 2, stops at the first 0x1A byte, so images were only partly
    # hashed.  The whole file is read into memory in one go.
    with open(path, 'rb') as handle:
        return hash(handle.read())


# THIS ISN'T USED, JUST COPY INTO OUTPUT DIRECTLY
def copy_input(source=None, destination=None):
    r'''All folders/files in .\input are copied to .\output.
    Errors likely if .\output not empty.

    source and destination default to dir_in and dir_out.'''
    source = dir_in if source is None else source
    destination = dir_out if destination is None else destination
    if not exists(destination):
        mkdir(destination)
    for item in sorted(listdir(source)):
        name_in = join(source, item)
        # Build the target from the destination directory instead of
        # text-replacing 'input', which also mangled item names that
        # happen to contain that word.
        name_out = join(destination, item)
        print('%s %s' % (name_in, name_out))
        if isdir(name_in):
            shutil.copytree(name_in, name_out)
        else:
            shutil.copy(name_in, name_out)


def get_file_data(f):
    '''Returns identifying data (a fileData tuple) for passed file path.'''
    return fileData(basename(f), getsize(f), _content_hash(f))


class MemonizeImages(object):
    '''Class to hold the list of previously copied/archived images, etc.'''

    def __init__(self, directory=None):
        '''If a file's fileData is added to this list,
        it should have been copied already.
        That's the intent, anyhow.

        directory is the tree to scan; defaults to dir_out.'''
        self.have_copy = self.memonize_reference(directory)

    def __repr__(self):
        return str(self.have_copy)

    def memonize_reference(self, directory=None):
        '''Returns a list of fileData namedtuples
        as taken from the reference directories, i.e. the top level
        entries of directory (default dir_out) with 'reference' in
        their name.
        Function used by __init__, only.'''
        directory = dir_out if directory is None else directory
        have_copy = []
        for entry in sorted(listdir(directory)):
            if 'reference' not in entry.lower():
                continue
            for path in files_in_dir(join(directory, entry)):
                if _has_listed_extension(path):
                    have_copy.append(get_file_data(path))
        return have_copy

    def archived(self, file_path):
        '''Is there already a copy of the passed file in have_copy?

        Returns a tuple (archived, fileData).  The fileData is always
        complete (name, size and hash): the caller records it for files
        that are new, and a record with placeholder size/hash could
        never match a later duplicate.'''
        print(basename(file_path))
        f_data = get_file_data(file_path)
        # namedtuple equality compares name, size and hash together.
        return f_data in self.have_copy, f_data


def delete_duplicates(directory=None):
    '''Main function, see module header,
    deletes duplicate image files in the ./output directory.

    directory defaults to dir_out.  Top level entries with 'reference'
    in their name are memonized and left intact; every other top level
    entry is walked in sorted order and any listed-extension file that
    matches an already seen file is removed.'''
    directory = dir_out if directory is None else directory
    print('RUNNING: delete_duplicates()')

    print('Memonizing all images (etc) in reference directories')
    # Processes directories with 'reference' in name in __init__
    memon = MemonizeImages(directory)

    # Archive directories are everything without 'reference' in the name.
    for entry in sorted(listdir(directory)):
        if 'reference' in entry.lower():
            continue
        # files_in_dir returns a finished list, so removing files while
        # looping over it is safe.
        for path in files_in_dir(join(directory, entry)):
            if not _has_listed_extension(path):
                continue
            archived, f_data = memon.archived(path)
            if archived:
                print('KILLING (already archived):  %s' % path)
                remove(path)
            else:
                print('ARCHIVING (new resource):  %s' % path)
                memon.have_copy.append(f_data)

    print('\n\n \t FINISHED')


if __name__ == '__main__':
    # HAS NO EFFECT ON 7zip size
    delete_duplicates()