'''
Created on Oct 30, 2015

@author: Brett Paufler
Copyright Brett Paufler

Changes links for website.
You'll be back to this shortly, I would think.

link_reports()
    provides information on:
        files not referenced
        links with no corresponding files

report_name_collisions()
    advises of name collisions
    #TODO - add this to link report

extension_count()
    provides count of various extensions

base_update()
    changes all file names to be more url friendly:
        all lower case
        _ separated
        no spaces or special characters
    updates all occurrences of these file names
        (and all url variations)
        in all html and js files, so all links, src, and href are updated

rename_resource()
    renames a single file and updates every reference to it

Make sure your live analyser (GetLinks) is working first

NOTE: print is always called with ONE parenthesised string, so the same
line works as a Python 2 print statement and as a Python 3 print() call.

#TODO - Clean Up - Links to Nowhere

#TODO
1) (c) symbol to the html copyright entity throughout
    other symbols, as well
2) Scrub beginning HTML info
3) Look at Minataur Tails TOC for more ideas
'''
#TODO - Brett Paufler is Wanted in Many Places
#Resumes for instance

from os import walk, rename
from os.path import join
from collections import Counter
from itertools import permutations
import re

dir_path = r'.\output'  #path to directory to be effected
img_ext = ['bmp', 'jpg', 'gif', 'png']

#The html entity for an ampersand. Built by concatenation on purpose:
#html aware tooling tends to collapse the literal entity back into a bare '&',
#which silently turns the replacement pairs below into no-ops.
_AMP_ENTITY = '&' + 'amp;'


def _split_path(file_path):
    '''Returns (directory_part, name_part) of file_path.

    Both '\\' and '/' count as separators, so paths produced by walk() on
    any platform (and url style paths) are handled.
    directory_part keeps its trailing separator, so
    directory_part + name_part == file_path.'''
    cut = max(file_path.rfind('\\'), file_path.rfind('/')) + 1
    return file_path[:cut], file_path[cut:]


def files_in_dir(directory=dir_path):
    '''Returns list of path to all files in passed directory tree,
    including sub-directories.'''
    return [join(root, name)
            for root, _, names in walk(directory)
            for name in names]


def html_files():
    '''Returns list of paths to all files under dir_path
    whose name ends in .html (any case).'''
    return [f for f in files_in_dir() if f.lower().endswith('.html')]


def extension_count():
    '''Prints and returns a Counter object of the various file types:
        jpg, JPG, js, py, html, etc

    Only the name part of each path is inspected, so a '.' in a directory
    name is never mistaken for an extension.
    Files without any '.' in their name are counted under ''.'''
    names = [_split_path(f)[1] for f in files_in_dir()]
    ext_list = [n.rpartition('.')[2] if '.' in n else '' for n in names]
    count = Counter(ext_list)
    print('extension_count(): %s' % str(count))
    return count


def url_variations(file_path):
    '''Returns a list of the possible url variations
    for the name portion of the passed file_path.

    Directory Information is discarded:
        './this/that the other.jpg' reduces to 'that the other.jpg'

    And 'that the other.jpg' becomes a listing of:
        ['that the other.jpg',
         'that%20the%20other.jpg',
         'that the other.JPG',
         'that%20the%20other.JPG']

    The unmodified name always comes first, lower case extensions
    precede upper case extensions, and there are no duplicates.
    A name without any '.' has no extension appended.

    NOTE: If url_variations > 2: is a great test for bad link names.
        (2 because jpg & JPG).'''
    name = _split_path(file_path)[1]
    if '.' in name:
        stem, _, ext = name.rpartition('.')
    else:
        stem, ext = name, None

    possible_replacements = [('%', '%25'),
                             (' ', '%20'),
                             ('&', '%26'),
                             ('&', _AMP_ENTITY),
                             ("'", '%27'),
                             ('(', '%28'),
                             (')', '%29'),
                             (',', '%82')]  #add to as appropriate
    replacements = [p for p in possible_replacements if p[0] in stem]

    variations = [stem]  #unmodified version starts the list
    seen = set(variations)
    #Every ordering of every subset is tried, so partially encoded names
    #(only the spaces escaped, say) are covered as well as fully encoded ones.
    for size in range(1, len(replacements) + 1):
        for ordering in permutations(replacements, size):
            encoded = stem
            for raw, escaped in ordering:
                encoded = encoded.replace(raw, escaped)
            if encoded not in seen:
                seen.add(encoded)
                variations.append(encoded)

    if ext is None:
        return variations

    lower_ext = ['.'.join([v, ext.lower()]) for v in variations]
    upper_ext = ['.'.join([v, ext.upper()]) for v in variations]
    result = []
    for candidate in lower_ext + upper_ext:
        if candidate not in result:  #ext such as '7' has only one case
            result.append(candidate)
    return result


def file_references_path_string(path_string, file_to_check):
    '''Returns True if path_string (or one of its url variations)
    is found within the text of file_to_check.

    The name part is matched case sensitively;
    the extension is tried in both lower and upper case.'''
    with open(file_to_check) as text_file:
        file_contents = text_file.read()
    return any(v in file_contents for v in url_variations(path_string))


def files_referencing_path_string(path_string):
    '''Returns list of all html and js files that reference passed
    path_string or its url encoded variations,
    so a simple text string will work as well.'''
    html_or_js = [f for f in files_in_dir()
                  if f.lower().endswith(('.html', '.js'))]
    return [f for f in html_or_js
            if file_references_path_string(path_string, f)]


def file_names_not_referenced():
    '''Returns list of files whose name (or its url encoded variations)
    is not found in any html or js document.

    Note: this in itself is NOT a positive indication of a link,
    and as such, this function is a dead end.'''
    return [f for f in files_in_dir()
            if not files_referencing_path_string(f)]


def unencode_url(url):
    '''Returns plain string version of url encoded string.

    e.g. 'this%20that.html' is translated into 'this that.html'

    Note: the encoding list is very limited.

    For base changes, if url_in == url_out:
        there is no point in changing a url.'''
    rP = [('%20', ' '),
          ('%26', '&'),
          (_AMP_ENTITY, '&'),
          ('%27', "'"),
          ('%28', '('),
          ('%29', ')'),
          ('%82', ','),
          ('%25', '%')]  #('%25', '%') needs to be last
    for encoded, plain in rP:
        url = url.replace(encoded, plain)
    return url


def links_html():
    '''For all src or href in all html, returns sorted list of the
    un-encoded resource names (%20 converted to ' ') without duplicates.

    It's simplistic in that directories are not taken into account
    and in that the text is split on white space, so a link containing
    a raw space is cut short at that space.'''
    tokens = []
    for h_file in html_files():
        with open(h_file) as text_file:
            words = text_file.read().split()
        tokens += [w for w in words if 'src="' in w or 'href="' in w]

    links = set()
    for token in tokens:
        target = unencode_url(token.split('"')[1])
        name = target.split('/')[-1]   #Scrubs Directory Information
        if not name.startswith('#'):   #Scrubs #, used by anchors
            links.add(name)
    return sorted(links)


#Way Too Specialized
def slide_show_links():
    '''Images that the slideShow page links to.

    Looks at every line of every js file that contains 'src' and one of
    the img_ext extensions; keeps what lies between the last './' and the
    next double quote, un-encoded.

    It's really too specific to be much good for anything else.'''
    link_list = []
    js_files = [f for f in files_in_dir() if f.lower().endswith('.js')]
    for js in js_files:
        with open(js) as text_file:
            text_lines = text_file.read().split('\n')
        for line in text_lines:
            lowered = line.lower()
            if 'src' in line and any(e.lower() in lowered for e in img_ext):
                link = line.split('./')[-1].split('"')[0]
                link_list.append(unencode_url(link))
    return link_list


def link_reports(name_filter='tom'):
    '''One questions how long-term useful this is.

    Spits out a report (prints to screen)
    a simplistic listing
        Files with no obvious links
        Links with no obvious Files.
    Only takes the name part of the path into account,
    compared in lower case.

    name_filter: only names containing this text are reported
        (compared in lower case). The default keeps the original
        behaviour of the report; pass None or '' to report on everything.'''

    #In other words, no src, but referenced inline javascript
    print('THESE FILES ARE REFERENCED: LOGIC WOULD BE A ONE OFF')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_winner.jpg')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_empty_ha_ha.jpg')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_empty.jpg')

    needle = name_filter.lower() if name_filter else ''

    #HTML Links List
    html = set(h.lower() for h in links_html() + slide_show_links())
    html = set(h for h in html if needle in h)

    #Files Listing
    dir_files = set(_split_path(f)[1].lower() for f in files_in_dir())
    dir_files = set(d for d in dir_files if needle in d)

    print('\nBY NAME ONLY (no dir referencing)\nLinks that Point to NoWhere (appropriate file does not exist)')
    for i in sorted(html - dir_files):
        print(i)

    print('\n\nBY NAME ONLY (no dir referencing)\nFILES that are NOT Utilized (unused file resource listing)')
    print('\t\t\tSEE TOP OF REPORT')
    for i in sorted(dir_files - html):
        print(i)
    #TODO - Additional report for these final only (if anywhere)
    #    See above for more clarification


def update_links_in_file(old, new, file_path):
    '''Updates all variations (url) of old
    with new (base no variations)
    in file_path.

    The file is only written back if its text actually changed.
    If new is a busted path (needs escaping), link won't work.'''
    with open(file_path, 'r') as f:
        original = f.read()
    updated = original
    for var in url_variations(old):
        updated = updated.replace(var, new)
    if updated != original:
        with open(file_path, 'w') as f:
            f.write(updated)


def update_all_links(old, new):
    '''Updates all links (all url variations of old) to new
    in all html and js files referencing old.'''
    for file_to_update in files_referencing_path_string(old):
        print('UPDATING (all links): %s\n\tOLD: %s\n\tNEW: %s'
              % (file_to_update, old, new))
        update_links_in_file(old, new, file_to_update)


def update_file_name(old, new):
    '''Renames the one file whose path contains old,
    replacing old with new in that path.

    Raises AssertionError unless exactly one file path contains old.'''
    target_file = [f for f in files_in_dir() if old in f]
    assert len(target_file) == 1, \
        'expected exactly one file matching %r, found %d' % (old, len(target_file))
    path_old = target_file[0]
    path_new = path_old.replace(old, new)
    print('FILE RENAMED:\n\tOLD:%s\n\tNEW:%s' % (path_old, path_new))
    rename(path_old, path_new)


def clean_name(file_path):
    '''Tidies up name_part of file path (dir_path_part/name_part.ext)
    by removing spaces, special characters, and miscellaneous key words.

    Returned string is '_' separated lower case.
    Trailing '.ext' if any is converted to lower case.
    Directory Path is completely unaffected
    ('\\' and '/' are both accepted as separators).'''
    directory, file_name = _split_path(file_path)

    #Only the last '.' (the one before the extension) survives
    extra_dots = max(file_name.count('.') - 1, 0)
    file_name = file_name.replace('.', '_', extra_dots)

    #Order matters: longer phrases have to go before the words they contain
    kill_list = ['Copyright Brett Paufler',
                 '(c)',
                 'Copyright',
                 'Brett Food',
                 ',', '&', '-', "'", '(', ')', '%'
                 ]
    for c in kill_list:
        file_name = re.sub(re.escape(c), ' ', file_name, flags=re.IGNORECASE)

    file_name = '_'.join(file_name.split())
    file_name = file_name.replace('_.', '.')  # del trailing '_'
    file_name = file_name.lower()
    return directory + file_name


def base_update():
    '''Updates bad url's with cleaner ones:
    every file under dir_path whose clean_name() differs from its current
    name has its references updated and is then renamed.'''
    for old in files_in_dir():
        new = clean_name(old)
        if old != new:
            print('UPDATING:\n\tOLD: %s\n\tNEW: %s' % (old, new))
            update_all_links(_split_path(old)[1], _split_path(new)[1])
            update_file_name(old, new)


def rename_resource(old, new):
    '''Resource with name == old is renamed to new
    and all references to it are updated.

    There are no errors (should there be):
        Warnings are printed to screen
        when no file or more than one file matches old.
    '''
    print('renaming_resource:')
    matching_files = [f for f in files_in_dir() if old in f]
    num_matches = len(matching_files)
    if num_matches == 0:
        print('No Matching Files')
    elif num_matches == 1:
        update_all_links(old, new)
        update_file_name(old, new)
    else:
        print('File Name Collision: Process Aborted')
        for m in matching_files:
            print('\t%s' % m)


def report_name_collisions():
    '''Prints all pairs of files that share the same name and extension
    (but live in different directories) to screen.

    Returns the list of (path, colliding_path) tuples that was reported.'''
    print('RUNNING: report_name_collisions:')

    #Logic: could break out, but why bother
    file_paths = files_in_dir()
    collisions = []
    while file_paths:
        next_path = file_paths.pop()
        next_name = _split_path(next_path)[1]
        hits = [f for f in file_paths if _split_path(f)[1] == next_name]
        for hit in hits:
            file_paths.remove(hit)
            collisions.append((next_path, hit))

    #Report
    if collisions:
        print('\tNAME COLLISIONS:')
        for first, second in collisions:
            print('\t\t%s  <->  %s' % (first, second))
    else:
        print('\tNO COLLISIONS\n\tGood to Go!!!')
    return collisions


#TODO - Directory Name CHanger

#def replace_text(old, new):


if __name__ == '__main__':
    print('Loading: links_and_layout')

    #base_update()
    #link_reports()
    #extension_count()
    #report_name_collisions()
    #for f in files_in_dir():
    #    print(f)

    print(chr(169))

    #rename_resource
    #old = "pork_tom_kha_gha_2013_9_3_pork_bones_soup_split_4.jpg"
    #new = "pork_tom_kha_gha_soup_4.jpg"
    #old = 'r_tritip_cherry_salad_2013_9_3_with_blue_cheese_1.jpg'
    #old = 'tritip_cherry_salad_with_blue_cheese_1.jpg'
    #new = 'tritip_cherry_salad_with_blue_cheese_1-HAPPY.jpg'
    #rename_resource(old, new)

    print('Finished')

#TODO - Diablo FIre (the js scripting portion) will Need hand updating
#Diablo fire uses a for/next so only part of the string is used.
#If all are updated to similar, should be easy