'''
Created on Feb 29, 2016

@author: Brett Paufler
Copyright Brett Paufler

Image links (src attributes) in an html file are updated to conform to a
single ./images directory format, sequentially renaming the images in
the process.

Input
    Single html file
    Folder containing images (any name, OK)

Output
    Single html file
        same name
        img tag src links changed
    Directory containing images
        name is ./images
        image names = stub_N.ext
            where stub is the html base name up to the first underscore
            and N counts from 1, zero-padded to the width of the src count
            e.g. this_not_this.html -> this_01.jpg, this_02.jpg, ...

This works, as far as I know (3-2-16)
created principally to automate renaming of BrettFood images
'''
#TODO - Diff test is meaningless, always true

import re
from os import walk, mkdir, makedirs
from os.path import join, basename, exists
from shutil import copyfile
from difflib import Differ


def src_to_filename(src):
    '''Converts an html src value into the file name it refers to on disk.

    Only the escapes listed below are decoded (space, ampersand and
    parentheses); any other escape sequence is returned unchanged.
    '''
    #The ampersand pair must decode the html entity; ('&', '&') was a no-op
    for a, b in [('%20', ' '), ('&amp;', '&'), ('%28', '('), ('%29', ')')]:
        src = src.replace(a, b)
    return src


class src_rename():

    def __init__(self,
                 input_dir=join('.', 'input'),
                 output_dir=join('.', 'output')):
        '''Transforms HTML image-directory pair in input
        to single ./image directory format in output.

        input_dir:  directory holding the single html file and its images
        output_dir: directory receiving the html file and ./images folder

        Raises ValueError (from the helper methods) if the input does not
        hold exactly one html file, if the html is not pure ASCII, or if
        an image file is not referenced by any src.
        '''
        self.input_dir = input_dir      #source directory
        self.output_dir = output_dir    #destination directory
        #destination directory for images
        self.output_image_dir = join(output_dir, 'images')

        self.input_files = []    #All files in directory walk of input_dir
        self.html_filename = ''  #source html file path
        self.html_text = ''      #source html text
        self.image_files = []    #source image listing
        self.src_list = []       #source html listing of all src="()" values
        self.stub = ''           #base name of source html used in renaming

        #Populates the variables initialized with holding variables above
        self.get_input_files()
        self.split_input_files()
        self.get_html_text()
        self.get_src_list()

    #Maybe combine this with split_input_files
    def get_input_files(self):
        '''Walks self.input_dir, pushes file paths onto self.input_files.'''
        for root, _, names in walk(self.input_dir):
            for name in names:
                self.input_files.append(join(root, name))

    def split_input_files(self):
        '''Splits input files in html_file, image_files,
        and extracts the stub string from html file_name.

        Raises ValueError unless exactly one .html file is present.
        '''
        htmls = [i for i in self.input_files if i.endswith('.html')]
        if len(htmls) != 1:
            raise ValueError('Code Hardwired for a Single HTML instance')
        self.html_filename = htmls.pop()

        #Everything that is not the html file is treated as an image
        self.image_files = self.input_files[:]
        self.image_files.remove(self.html_filename)

        #[:-5] strips '.html'; stub is the text before the first underscore
        self.stub = basename(self.html_filename)[:-5].split('_')[0]

    def get_html_text(self):
        '''Retrieves text of source html file.

        Raises ValueError if the text holds any non-ASCII character.
        '''
        with open(self.html_filename, 'r') as f:
            self.html_text = f.read()

        #No provision for UTF-8/16/32 provided
        #ASCII is 0-127 inclusive, so 128 itself must also be rejected
        for c in self.html_text:
            if ord(c) > 127:
                raise ValueError('Original HTML contains none ASCII characters')

    #TODO: I expect this will fail if there are non-image src's, js, etc
    def get_src_list(self):
        '''Extracts listing of all src's from html_text.

        self.src_list ends up holding each distinct src value once, in
        order of first appearance, so that the numbering applied later
        follows the document.

        Raises ValueError if an image file is not referenced by any src.
        '''
        found = re.findall(r'src="(.+?)"', self.html_text,
                           re.DOTALL | re.IGNORECASE)

        #De-duplicate while keeping document order (a set loses the order)
        seen = set()
        self.src_list = []
        for src in found:
            if src not in seen:
                seen.add(src)
                self.src_list.append(src)

        #Confirm each resource has a src
        #The reverse check (each src has a resource) is left disabled here;
        #a src without a matching image is reported by copy_rename instead
        basename_image_set = set(basename(s) for s in self.image_files)
        basename_src_set = set(src_to_filename(basename(s))
                               for s in self.src_list)
        image_viewpoint = basename_image_set - basename_src_set
        if image_viewpoint:
            raise ValueError('Image Resource not utilized: %s' % image_viewpoint)

    def _html_out_path(self):
        '''Returns the path of the html file written to self.output_dir.'''
        #Built from the base name so that 'input' appearing elsewhere in
        #the path or file name is never rewritten by accident
        return join(self.output_dir, basename(self.html_filename))

    def copy_rename(self):
        '''New html with updated text and copied images created in output.

        This is the workhorse of this class.
        All the heavy lifting done here.

        Raises ValueError if a src matches no image file, or matches
        more than one image file sharing the same base name.
        '''
        #Insure output/images exists (makedirs also creates output itself)
        if not exists(self.output_image_dir):
            makedirs(self.output_image_dir)

        #Insures stub_1, stub_01, or stub_001
        pad = len(str(len(self.src_list)))

        #Processes each src/image pair one by one
        for i, src in enumerate(self.src_list, 1):
            ext = basename(src).split('.')[-1]           #jpg, png, etc.
            old_base = src_to_filename(basename(src))    #file_name, no path
            new_base = '%s_%0*d.%s' % (self.stub, pad, i, ext)

            #updates html_text
            self.html_text = self.html_text.replace(src, './images/' + new_base)

            #isolate relevant image and format new name
            images_old = [img for img in self.image_files
                          if old_base == basename(img)]
            if not images_old:
                raise ValueError('SRC points to missing resource: %s' % src)
            if len(images_old) != 1:
                raise ValueError('Image Names Clash (shared base): %s' % images_old)
            image_old = images_old.pop()
            image_new = join(self.output_image_dir, new_base)

            #Copy images with new name
            print('Copying Image (from, to): %s, %s' % (image_old, image_new))
            copyfile(image_old, image_new)

        #Write updated html_text to output
        with open(self._html_out_path(), 'w') as f:
            f.write(self.html_text)

    #TODO: This is a failed test
    def changes(self):
        '''Intent was to automate detection of errors.
        Code in place is meaningless, will always be True.

        Prints each removed line followed by its added counterpart.
        Raises AssertionError if removed and added line counts differ.
        '''
        with open(self.html_filename, 'r') as f:
            text_in = f.read().split('\n')
        with open(self._html_out_path(), 'r') as f:
            text_out = f.read().split('\n')

        difference = list(Differ().compare(text_in, text_out))
        #Differ prefixes each line with a two character code ('- ', '+ ')
        diff_in = [line[2:] for line in difference if line.startswith('-')]
        diff_out = [line[2:] for line in difference if line.startswith('+')]

        #This is an empty test as the two will always be of equal length
        if len(diff_in) != len(diff_out):
            raise AssertionError('Unequal Changes in HTML')

        for a, b in zip(diff_in, diff_out):
            print(a)
            print(b)


if __name__ == '__main__':
    src = src_rename()
    src.copy_rename()
    src.changes()