'''
Created on Oct 26, 2015
@author: Brett Paufler
(c) Copyright Brett Paufler

NOT MEANINGFUL
    AFTER RUNNING A BEFORE AND AFTER COMPRESSION
    SPACE SAVINGS ARE NEGLIGIBLE
        HAS NO MEANINGFUL EFFECT ON 7zip size

Deletes duplicate files of noted extension type(s).
    Extension types are listed in ext_list.
    Two files are considered the same
        (and the second one encountered is deleted) if:
        file_name is the same
        file_size is the same
        file_hash is the same

Intent is to clear duplicate image files from website back up as they cannot be compressed,
    so only keeps one copy of each image, but multiple copies of everything else

1) Place all files and folders in .\output
2) Add 'reference' to name of any file/folder to be left intact,
    Program works from the top down,
    So if no 'reference', top folder is processed first
3) Run this python program
'''

import shutil
from collections import namedtuple
from os import listdir, mkdir, remove, walk
from os.path import basename, exists, getsize, isdir, join

# Hard-wired configuration. The functions in this module read these names
# directly, so changing any value may have unforeseen side effects.
dir_in = '.\\input'     # tree that copy_input() copies from
dir_out = '.\\output'   # tree that is scanned and de-duplicated in place
# Extensions (lower case, no leading dot) of the file types to de-duplicate.
ext_list = ['bmp', 'jpg', 'png', 'pdf', 'gif', 'js']
# Identity record for one file: its name, its size in bytes, a content hash.
fileData = namedtuple('fileData', 'name size hash')


def files_in_dir(directory):
    '''Collects the path of every file found below the passed directory.

    directory: root of the tree to scan; sub-directories are included.
    Returns a list of paths, each one the containing folder (as reported
        by walk) joined with the file name.
        The list is empty when walk yields nothing for directory.'''
    return [join(folder, filename)
            for folder, _subdirs, filenames in walk(directory)
            for filename in filenames]

#THIS ISN'T USED, JUST COPY INTO OUTPUT DIRECTLY
def copy_input():
    '''Copies every top-level file and folder found in dir_in into dir_out.

    dir_out is created if it does not exist yet.
    Folders are copied recursively with shutil.copytree,
        everything else with shutil.copy.
    Each source/destination pair is printed before it is copied.
    Errors likely if dir_out is not empty
        (shutil.copytree refuses to overwrite an existing folder).'''
    if not exists(dir_out):
        mkdir(dir_out)
    for item in listdir(dir_in):
        name_in = join(dir_in, item)
        # Build the destination from dir_out itself; a textual
        # 'input' -> 'output' replace would also rename any item whose
        # own name happens to contain 'input'.
        name_out = join(dir_out, item)
        print('%s %s' % (name_in, name_out))
        # Ask the file system what the item is: a '.' near the end of the
        # name does not reliably tell files (README) from folders (site.v2).
        if isdir(name_in):
            shutil.copytree(name_in, name_out)
        else:
            shutil.copy(name_in, name_out)

            
            
def get_file_data(f):
    '''Returns identifying data for passed file.

    f: path to an existing, readable file.
    Returns a fileData namedtuple:
        name = last component of the path
        size = size in bytes as reported by getsize
        hash = built-in hash() of the complete file contents

    The file is read in binary mode. Text mode can alter or truncate
        binary data (newline translation and Ctrl-Z handling on Windows,
        decoding errors on Python 3), so the hash would not describe the
        whole file.
    hash() values are only compared with each other during a single run;
        nothing in this module stores them between runs.'''
    with open(f, 'rb') as source:  #handle is closed even if read() fails
        contents = source.read()
    return fileData(basename(f), getsize(f), hash(contents))


class MemonizeImages(object):
    '''Class to hold the list of previously copied/archived images, etc.

    have_copy: list of fileData namedtuples, one per file already kept.
        Seeded from the 'reference' entries of dir_out by __init__,
        callers append the records of newly accepted files.'''

    def __repr__(self):
        return str(self.have_copy)

    def __init__(self):
        '''If a file's fileData is added to this list, it should have been copied already.
        That's the intent, anyhow.'''
        self.have_copy = self.memonize_reference()

    def memonize_reference(self):
        '''Returns a list of fileData namedTuples as taken from the reference directories.
            Function used by __init__, only.

        A reference directory is any entry of dir_out with 'reference'
            in its name (case ignored).
        Only files ending in '.' + one of the ext_list entries
            (case ignored) are recorded.'''
        # Compare whole '.ext' suffixes: a fixed three character slice can
        # never equal a two letter entry such as 'js'.
        suffixes = tuple('.' + ext for ext in ext_list)
        have_copy = []
        for entry in listdir(dir_out):
            if 'reference' not in entry.lower():
                continue
            for f in files_in_dir(join(dir_out, entry)):
                if f.lower().endswith(suffixes):
                    have_copy.append(get_file_data(f))
        return have_copy

    def archived(self, file_path):
        '''Is passed file already in the have_copy archive,
            i.e. is there already a copy of the passed file.

        file_path: path to an existing file.
        Returns (archived, f_data):
            archived = True when a record with the same name, size and
                hash is in have_copy
            f_data = complete fileData record for file_path

        The record is always fully computed. Callers append it to
            have_copy when the file is new, and a record holding
            placeholder size/hash values could never match a later copy.
        Prints the file name as a progress message.'''
        print(basename(file_path))
        f_data = get_file_data(file_path)
        #namedtuple equality compares name, size and hash together
        return f_data in self.have_copy, f_data
    
        
def delete_duplicates():
    '''Main function, see module header,
    deletes duplicate image files in ./output directory.

    Entries of dir_out with 'reference' in their name seed the list of
        known files (done by MemonizeImages) and are left untouched.
    Every other entry of dir_out is scanned recursively; only files
        ending in '.' + one of the ext_list entries (case ignored)
        are examined.
    A file reported as already archived is removed from disk,
        otherwise its record is added to the known files.
    Progress is printed for every file handled.'''
    print('RUNNING: delete_duplicates()')
    print('Memonizing all images (etc) in reference directories')
    memon = MemonizeImages() #processes directories with 'reference' in name in __init__
    # Compare whole '.ext' suffixes: a fixed three character slice can
    # never equal a two letter entry such as 'js'.
    suffixes = tuple('.' + ext for ext in ext_list)
    #arch_dirs are the archive directories, everything without 'reference' in the name
    arch_dirs = [d for d in listdir(dir_out) if 'reference' not in d.lower()]
    for arch_dir in arch_dirs:
        arch_path = join(dir_out, arch_dir)
        #files_in_dir returns a complete list, so removing files below is safe
        for f in files_in_dir(arch_path):
            if not f.lower().endswith(suffixes):
                continue
            archived, f_data = memon.archived(f)
            if archived:
                print('KILLING (already archived):  %s' % f)
                remove(f)
            else:
                print('ARCHIVING (new resource):  %s' % f)
                memon.have_copy.append(f_data)
    print('\n\n \t FINISHED')



if __name__ == '__main__':
    
    #Script entry point: runs the de-duplication pass when executed directly.
    #NOTE: per the module header, this HAS NO MEANINGFUL EFFECT ON 7zip size
    delete_duplicates()