'''
Created on Feb 29, 2016
@author: Brett Paufler
Copyright Brett Paufler

Image src links in a single html file are updated to conform to a
single ./images directory format,
    sequentially renaming the images in the process

Input
    Single html file
    folder containing images (any name, OK)

Output
    Single html file
        same name
        img tag src links changed
    directory containing images
        name is ./images
        image names = stub_NN.ext (numbered from 1, zero padded)
            where stub is 'this' for this_not_this.html
                this_01.jpg
                this_02.jpg

This works, as far as I know (3-2-16)
    created principally to automate renaming of BrettFood images
    
'''

#TODO - Diff test is meaningless, always true


import re
from os import walk, mkdir
from os.path import join, basename, exists
from shutil import copyfile
from difflib import Differ


def src_to_filename(src):
    '''Converts an html src value into the matching on-disk file name.

    Only the escapes listed below are decoded, one after the other
    in the listed order; every other character passes through untouched.
    '''
    escapes = (
        ('%20', ' '),
        ('&amp;', '&'),
        ('%28', '('),
        ('%29', ')'),
    )
    decoded = src
    for escaped, literal in escapes:
        #split/join is the long hand form of a plain substring replace
        decoded = literal.join(decoded.split(escaped))
    return decoded

class src_rename(object):
    '''Rewrites one html file and its images into the ./images layout.

    Creating an instance walks ./input, loads the single html file
    found there and collects the src values it contains.
    copy_rename() then writes the renamed images and the updated
    html under ./output, and changes() prints the altered lines.
    '''

    def __init__(self):
        '''Transforms HTML image-directory pair in input
        to single ./images directory format in output.

        Raises ValueError (from the helper methods) if ./input does not
        hold exactly one html file, if that file holds non ASCII text,
        or if an image in ./input is not referenced by any src.
        '''

        #join() yields the same '.\input' style paths as before on Windows
        #and valid paths everywhere else
        self.input_dir = join('.', 'input') #source directory
        self.output_dir = join('.', 'output') #destination directory
        self.output_image_dir = join(self.output_dir, 'images') #destination directory for images

        self.input_files = [] #All files in directory walk of ./input
        self.html_filename = '' #source html file path
        self.html_text = '' #source html text
        self.image_files = [] #source image listing
        self.src_list = [] #source html listing of all src="()" values
        self.stub = '' #base name of source html used in renaming

        #Populates the variables initialized with holding variables above
        self.get_input_files()
        self.split_input_files()
        self.get_html_text()
        self.get_src_list()


    #Maybe combine this with split_input_files
    def get_input_files(self):
        '''Walks ./input, pushes files as list self.input_files.'''

        #Start from scratch so a second call does not duplicate entries
        self.input_files = []
        for root, _, names in walk(self.input_dir):
            for name in names:
                self.input_files.append(join(root, name))


    def split_input_files(self):
        '''Splits input_files into html_filename and image_files,
        and extracts the stub string from the html file name.

        The stub is the html base name, minus '.html', up to the
        first underscore.
        Raises ValueError unless exactly one '.html' file is present.
        '''

        htmls = [i for i in self.input_files if i.endswith('.html')]
        if len(htmls) != 1:
            raise ValueError('Code Hardwired for a Single HTML instance')
        self.html_filename = htmls[0]

        #Everything that is not the html file is treated as an image
        self.image_files = [i for i in self.input_files
                            if i != self.html_filename]

        #[:-5] strips the '.html' extension
        self.stub = basename(self.html_filename)[:-5].split('_')[0]


    def get_html_text(self):
        '''Retrieves text of source html file.

        Raises ValueError if the text holds a non ASCII character.
        '''

        with open(self.html_filename, 'r') as f:
            self.html_text = f.read()

        #No provision for UTF-8/16/32 provided
        #ASCII ends at 127, so anything above that is rejected
        if any(ord(c) > 127 for c in self.html_text):
            raise ValueError('Original HTML contains none ASCII characters')


    #TODO: I expect this will fail if there are non-image src's, js, etc
    def get_src_list(self):
        '''Extracts listing of all src's from html_text.

        self.src_list keeps each distinct src once, in the order it
        first appears in the html, so numbering follows the document.
        Raises ValueError if an input image is not referenced by any src.
        '''

        found = re.findall(r'src="(.+?)"',
                           self.html_text, re.DOTALL | re.IGNORECASE)

        #De-duplicate while preserving document order
        seen = set()
        self.src_list = []
        for src in found:
            if src not in seen:
                seen.add(src)
                self.src_list.append(src)

        #Confirm each resource has a src
        #(a src without a resource is reported by copy_rename)
        basename_image_set = set(basename(s) for s in self.image_files)
        basename_src_set = set(src_to_filename(basename(s))
                               for s in self.src_list)
        image_viewpoint = basename_image_set - basename_src_set
        if image_viewpoint:
            raise ValueError('Image Resource not utilized: %s' % image_viewpoint)


    def _ensure_dir(self, path):
        '''Creates directory path unless it already exists.'''

        if not exists(path):
            mkdir(path)


    def _output_html_path(self):
        '''Returns the path under output_dir that mirrors html_filename.

        Only the leading input_dir is swapped, so an html file whose
        own name contains 'input' keeps that name.
        '''

        name = self.html_filename
        if name.startswith(self.input_dir):
            return self.output_dir + name[len(self.input_dir):]
        #html_filename was set by hand to something outside input_dir
        return join(self.output_dir, basename(name))


    def copy_rename(self):
        '''New html with updated text and copied images created in ./output.
        This is the workhorse of this class.  All the heavy lifting done here.

        Raises ValueError if a src matches no input image, or matches
        more than one; nothing is copied or rewritten in that case.
        '''

        #Insure ./output and ./output/images exist (parent first)
        self._ensure_dir(self.output_dir)
        self._ensure_dir(self.output_image_dir)

        #Insures stub_1, stub_01, or stub_001
        pad = len(str(len(self.src_list)))

        #Pair every src with its image before anything is copied
        plan = [] #(src, source image path, new base name)
        for i, src in enumerate(self.src_list, 1):

            ext = basename(src).split('.')[-1] #jpg, png, etc.
            old_base = src_to_filename(basename(src)) #file_name, no path
            new_base = '%s_%0*d.%s' % (self.stub, pad, i, ext)

            images_old = [img for img in self.image_files
                          if old_base == basename(img)]
            if not images_old:
                raise ValueError('SRC points to missing resource: %s' % src)
            if len(images_old) > 1:
                raise ValueError('Image Names Clash (shared base): %s' % images_old)
            plan.append((src, images_old[0], new_base))

        #Copy images with new name
        renames = {} #old src text -> new src text
        for src, image_old, new_base in plan:
            image_new = join(self.output_image_dir, new_base)
            print('Copying Image (from, to): %s, %s' % (image_old, image_new))
            copyfile(image_old, image_new)
            renames[src] = './images/' + new_base

        #Update html_text in one pass: longest src first, so a src that is
        #a substring of another cannot clobber it, and freshly written
        #names are never replaced a second time
        if renames:
            longest_first = sorted(renames, key=len, reverse=True)
            pattern = re.compile('|'.join(re.escape(s) for s in longest_first))
            self.html_text = pattern.sub(lambda m: renames[m.group(0)],
                                         self.html_text)

        #Write updated html_text to ./output
        with open(self._output_html_path(), 'w') as f:
            f.write(self.html_text)


    #TODO: This is a failed test
    def changes(self):
        '''Prints the html lines that differ between input and output.

        Intent was to automate detection of errors.
        The length comparison in place is not a meaningful check.
        Raises AssertionError if removed and added line counts differ.
        '''

        with open(self.html_filename, 'r') as f:
            text_in = f.read().split('\n')
        with open(self._output_html_path(), 'r') as f:
            text_out = f.read().split('\n')

        #Differ prefixes removed lines with '- ' and added lines with '+ '
        difference = list(Differ().compare(text_in, text_out))
        diff_in = [line[2:] for line in difference if line.startswith('-')]
        diff_out = [line[2:] for line in difference if line.startswith('+')]

        #This is close to an empty test: a changed line shows up once on
        #each side, so the two lists are normally of equal length
        if len(diff_in) != len(diff_out):
            raise AssertionError('Unequal Changes in HTML')

        for a, b in zip(diff_in, diff_out):
            print(a)
            print(b)

        
        

if __name__ == '__main__':

    #Load ./input, write the renamed images and updated html to ./output,
    #then print the html lines that changed
    renamer = src_rename()
    renamer.copy_rename()
    renamer.changes()