'''
Created on Oct 30, 2015
@author: Brett Paufler
Copyright Brett Paufler


Changes links for website.
You'll be back to this shortly, I would think.


link_report()
    provides information on:
        files not referenced
        links with no corresponding files

#TODO - add this to link report        
report_name_collisions()
    advises of name collisions

extension_count()
    provides count of various extensions

base_update()
    changes all file names to be more url friendly:
        all lower case
        _ separated
        no spaces or special characters
    updates all occurrences of these file names (and all url variations)
        in all files, so all links, src, and href are updated 

    
Make sure your live analyser (GetLinks) is working first

#TODO - Clean Up - Links to Nowhere

#TODO
    1) (c) symbol to &#169; throughout
        other symbols, as well
    2) Scrub beginning HTML info
    3) Look at Minataur Tails TOC for more ideas

'''


#TODO - Brett Paufler is Wanted in Many Places
#Resumes for instance

from os import walk, rename
from os.path import join
from collections import Counter
from itertools import permutations
import re

#from fileinput import FileInput
#import fileinput # um, probably not

# Default root of the directory tree walked by files_in_dir()
# (a Windows-style relative path).
dir_path = r'.\output' #path to directory to be affected
# Image extensions (lower case, no dot) that slide_show_links() looks for
# when deciding whether a javascript line refers to an image.
img_ext = ['bmp', 'jpg', 'gif', 'png']

def files_in_dir(directory=dir_path):
    '''Collects the path of every file found beneath directory.

    The tree is walked recursively, so files in sub-directories are included.
    Each entry is the containing folder joined with the file name;
    directories themselves are not listed.'''
    return [join(folder, file_name)
            for folder, _, file_names in walk(directory)
            for file_name in file_names]

def html_files():
    '''Returns the paths of all files under the default directory
    whose name ends in .html (compared case-insensitively).'''
    found = []
    for path in files_in_dir():
        if path.lower().endswith('.html'):
            found.append(path)
    return found

def extension_count():
    '''Prints and returns a Counter of the file extensions in use:
        jpg, JPG, js, py, html, etc

    Extensions are counted case-sensitively (jpg and JPG are separate keys).
    Files whose name contains no '.' are counted under the empty string.'''
    extensions = []
    for path in files_in_dir():
        # Look at the file name only: the directory part of the path
        # (e.g. the leading '.\') also contains dots, which would otherwise
        # be mistaken for an extension on files that have none.
        name = path.split('\\')[-1]
        extensions.append(name.rpartition('.')[2] if '.' in name else '')
    count = Counter(extensions)
    print('extension_count(): %s' % str(count))
    return count

def url_variations(file_path):
    '''Returns a list of the possible url variations
    for the name portion of the passed file_path.
        Directory Information is discarded (both '\\' and '/' separators):
            './this/that the other.jpg' reduces to 'that the other.jpg'
            And 'that the other.jpg' becomes a listing of:
                ['that the other.jpg', 'that%20the%20other.jpg',
                 'that the other.JPG', 'that%20the%20other.JPG']
        The extension is offered in lower case, upper case and, when it is
            mixed case (e.g. 'Jpg'), exactly as passed.
        A name without any '.' is treated as a plain string:
            its variations are returned with no extension attached.
        The result is de-duplicated and in a stable order:
            the unmodified name always comes first.
        NOTE: If url_variations > 2:
            is a great test for bad link names.
            (2 because jpg & JPG; a mixed case extension adds a third).'''
    # Keep only the file name; accept either kind of path separator.
    name = file_path.replace('/', '\\').split('\\')[-1]
    stem, dot, ext = name.rpartition('.')
    if not dot:
        # rpartition puts everything in the last slot when there is no '.'
        stem, ext = name, ''
    possible_replacements = [('%', '%25'), (' ', '%20'), ('&', '%26'), ('&', '&amp;'),
                             ("'", '%27'), ('(', '%28'), (')', '%29'), (',', '%82')] #add to as appropriate
    replacements = [p for p in possible_replacements if p[0] in stem]
    stems = [stem] #unmodified version starts the list
    seen = set(stems)
    # Every ordering is tried, and every intermediate result is kept,
    # so partially encoded names are matched as well.
    for ordering in permutations(replacements):
        candidate = stem
        for plain, encoded in ordering:
            candidate = candidate.replace(plain, encoded)
            if candidate not in seen:
                seen.add(candidate)
                stems.append(candidate)
    if not dot:
        return stems
    extensions = []
    for e in (ext.lower(), ext.upper(), ext):
        if e not in extensions:
            extensions.append(e)
    return ['.'.join([s, e]) for e in extensions for s in stems]

def file_references_path_string(path_string, file_to_check):
    '''Returns True if path_string (or one of its url derivations, as produced
        by url_variations) is found within the text of file_to_check.
        The comparison is a plain, case-sensitive substring test;
        no lower-casing of the file contents is done.'''
    # Context manager so the handle is closed as soon as the text is read.
    with open(file_to_check) as f:
        file_contents = f.read()
    return any(v in file_contents for v in url_variations(path_string))

def files_referencing_path_string(path_string):
    '''Returns the list of .html and .js files whose text contains
        path_string or one of its url encoded derivations.
        Any text string may be passed, not just a file path.'''
    referencing = []
    for candidate in files_in_dir():
        # Only html and javascript sources are searched for references.
        if not candidate.endswith(('.html', '.js')):
            continue
        if file_references_path_string(path_string, candidate):
            referencing.append(candidate)
    return referencing

def file_names_not_referenced():
    '''Returns the list of files whose name (or a url encoded derivation of it)
        does not appear in any html or js file.
        Note: a name appearing in a file is NOT proof of a real link,
            so treat the result as a rough indication only.'''
    unreferenced = []
    for path in files_in_dir():
        if not files_referencing_path_string(path):
            unreferenced.append(path)
    return unreferenced

def unencode_url(url):
    '''Translates a url encoded string back into its plain form.
        e.g. 'this%20that.html' comes back as 'this that.html'
        Note: only the handful of encodings listed below are understood.
        For base changes, when the result equals the input
            there is nothing to change in the url.'''
    decode_table = (
        ('%20', ' '),
        ('%26', '&'),
        ('&amp;', '&'),
        ('%27', "'"),
        ('%28', '('),
        ('%29', ')'),
        ('%82', ','),
        ('%25', '%'),  # must stay last, so a decoded '%' is never re-read
    )
    decoded = url
    for encoded, plain in decode_table:
        decoded = decoded.replace(encoded, plain)
    return decoded

def links_html():
    '''For all src="..." or href="..." in all .html files, returns a
        de-duplicated list of the resource names linked to.
        Links are un-encoded (%20 converted to ' ', etc.).
        It's simplistic in that directories are not taken into account:
            only the text after the last '/' is kept.
        Pure anchor links (starting with '#') are dropped.'''
    # The whole quoted value is captured, so a link containing spaces is
    # returned intact and several attributes on one tag are all found.
    link_pattern = re.compile(r'(?:src|href)="([^"]*)"')
    links = set()
    for h_file in files_in_dir():
        if not h_file.endswith('.html'):
            continue
        with open(h_file) as f:
            text = f.read()
        for url in link_pattern.findall(text):
            name = unencode_url(url).split('/')[-1] #Scrubs Directory Information
            if not name.startswith('#'): #Scrubs #, used by anchors
                links.add(name)
    return list(links)


#Way Too Specialized
def slide_show_links():
    '''Images that the slideShow page links to.
    It's really too specific to be much good for anything else.

    Scans every .js file line by line. A line counts when it contains 'src'
    and (case-insensitively) one of the img_ext extensions. The link is the
    text after the last './' up to the next double quote, un-encoded.
    Returns the links as a list (duplicates are kept).'''
    link_list = []
    for js in files_in_dir():
        if not js.endswith('.js'):
            continue
        # Context manager so each script is closed once it has been read.
        with open(js) as f:
            text_lines = f.read().split('\n')
        for line in text_lines:
            if 'src' not in line:
                continue
            lowered = line.lower()
            if not any(ext.lower() in lowered for ext in img_ext):
                continue
            resource = line.split('./')[-1].split('"')[0]
            link_list.append(unencode_url(resource))
    return link_list

def link_reports(name_filter='tom'):
    '''One questions how long-term useful this is.
    Prints to screen a simplistic report listing:
        Links with no obvious Files.
        Files with no obvious links.
    Only the name part of each path is taken into account (no directories),
    and all names are compared in lower case.

    name_filter: only names containing this text (case-insensitive) are
        reported. Defaults to 'tom', the filter that used to be hard-coded.
        Pass '' or None to report on every name.'''
    #In other words, no src, but referenced inline javascript
    print('THESE FILES ARE REFERENCED: LOGIC WOULD BE A ONE OFF')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_winner.jpg')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_empty_ha_ha.jpg')
    print('\t2013_6_5_brett_paufler_tom_kha_gai_quail_empty.jpg')
    # Names are lower-cased below, so the filter has to be as well.
    # An empty filter is contained in every string, i.e. filters nothing.
    name_filter = (name_filter or '').lower()
    #HTML Links List
    linked = set(h.lower() for h in links_html() + slide_show_links())
    linked = set(h for h in linked if name_filter in h)
    #Files Listing
    on_disk = set(f.split('\\')[-1].lower() for f in files_in_dir())
    on_disk = set(d for d in on_disk if name_filter in d)
    print('\nBY NAME ONLY (no dir referencing)\nLinks that Point to NoWhere (appropriate file does not exist)')
    # Sorted so that repeated runs produce the report in the same order.
    for name in sorted(linked - on_disk):
        print(name)
    print('\n\nBY NAME ONLY (no dir referencing)\nFILES that are NOT Utilized (unused file resource listing)')
    print('\t\t\tSEE TOP OF REPORT')
    for name in sorted(on_disk - linked):
        print(name)
    #TODO - Additional report for these final only (if anywhere)
    #    See above for more clarification

def update_links_in_file(old, new, file_path):
    '''Rewrites file_path so that every url variation of old reads new.
        new is written exactly as passed (no variations, no escaping),
        so if it needs escaping the resulting link won't work.
        The file is always written back, changed or not.'''
    with open(file_path, 'r') as source:
        updated_text = source.read()
    for variation in url_variations(old):
        updated_text = updated_text.replace(variation, new)
    with open(file_path, 'w') as target:
        target.write(updated_text)

def update_all_links(old, new):
    '''Replaces old (in all its url variations) with new
        in every html/js file that references old.
        Each file touched is reported on screen before it is rewritten.'''
    report = 'UPDATING (all links): %s\n\tOLD: %s\n\tNEW: %s'
    for referencing_file in files_referencing_path_string(old):
        print(report % (referencing_file, old, new))
        update_links_in_file(old, new, referencing_file)
        
def update_file_name(old, new):
    '''Renames the single file whose path contains old,
        replacing old with new inside that path.
        old may be a bare file name or a full path.
        Raises AssertionError unless exactly one file path contains old.'''
    target_file = [f for f in files_in_dir() if old in f]
    # Raised explicitly rather than via an assert statement: asserts are
    # stripped under python -O, which would let an ambiguous match rename
    # the wrong file.
    if len(target_file) != 1:
        raise AssertionError(
            'expected exactly one file matching %r, found %d: %r'
            % (old, len(target_file), target_file))
    path_old = target_file[0]
    path_new = path_old.replace(old, new)
    print('FILE RENAMED:\n\tOLD:%s\n\tNEW:%s' % (path_old, path_new))
    rename(path_old, path_new)

def clean_name(file_path):
    '''Returns file_path with its name part (dir_path_part\\name_part.ext)
            made url friendly.
        Every '.' except the last becomes '_'.
        Key phrases and special characters from the kill list are removed
            (matched regardless of case).
        What is left is joined with '_' and lower cased, extension included.
        The directory part of the path is returned untouched.'''
    directory, separator, base = file_path.rpartition('\\')
    # Only the final '.' (the extension marker) may survive.
    stem, dot, extension = base.rpartition('.')
    base = stem.replace('.', '_') + dot + extension
    unwanted = ('Copyright Brett Paufler', '(c)', 'Copyright', 'Brett Food',
                ',', '&', '-', "'", '(', ')', '%')
    for phrase in unwanted:
        # Swap for a space, so the words either side stay separate.
        base = re.sub(re.escape(phrase), ' ', base, flags=re.IGNORECASE)
    words = base.split()
    base = '_'.join(words).replace('_.', '.')  # no '_' just before the '.'
    return directory + separator + base.lower()

def base_update():
    '''Gives every file a clean, url friendly name (see clean_name).
        For each file whose name changes, the links to it are
        updated first and then the file itself is renamed.'''
    # Work from a snapshot of the listing taken before anything is renamed.
    for current_path in list(files_in_dir()):
        cleaned_path = clean_name(current_path)
        if current_path == cleaned_path:
            continue
        print('UPDATING:\n\tOLD: %s\n\tNEW: %s' % (current_path, cleaned_path))
        # Links are matched by bare file name, the rename by full path.
        old_name = current_path.split('\\')[-1]
        new_name = cleaned_path.split('\\')[-1]
        update_all_links(old_name, new_name)
        update_file_name(current_path, cleaned_path)

def rename_resource(old, new):
    '''Renames the one resource whose path contains old to new,
    updating the links that point at it.
    Nothing is raised when the match is missing or ambiguous:
        a warning is printed to screen and nothing is changed.
    '''
    print('renaming_resource:')
    matching_files = []
    for path in files_in_dir():
        if old in path:
            matching_files.append(path)

    if not matching_files:
        print('No Matching Files')
        return
    if len(matching_files) == 1:
        update_all_links(old, new)
        update_file_name(old, new)
        return
    # Two or more candidates: refuse to guess which one was meant.
    print('File Name Collision: Process Aborted')
    for match in matching_files:
        print('\t%s' % match)

def report_name_collisions():
    '''Prints to screen every file name (name plus extension) that is used
    by more than one file in the directory tree, along with the paths
    of the files sharing it. Names are compared exactly (case-sensitive).'''
    print('RUNNING: report_name_collisions:')
    # Group the paths by bare file name in a single pass.
    paths_by_name = {}
    for path in files_in_dir():
        paths_by_name.setdefault(path.split('\\')[-1], []).append(path)
    collisions = [(name, paths)
                  for name, paths in sorted(paths_by_name.items())
                  if len(paths) > 1]
    #Report
    if collisions:
        print('\tNAME COLLISIONS:')
        for name, paths in collisions:
            print('\t\t%s' % name)
            for path in paths:
                print('\t\t\t%s' % path)
    else:
        print('\tNO COLLISIONS\n\tGood to Go!!!')


#TODO - Directory Name CHanger


#def replace_text(old, new):
    


# Script entry point. Acts as a scratch pad: the wanted operation is
# un-commented by hand before each run. As written it only prints the
# start/finish banners and one test character.
if __name__ == '__main__':

    print 'Loading: links_and_layout'

    
    # The main operations, all currently disabled:
    #base_update()
    #link_reports()
    #extension_count()
    
    #report_name_collisions()
    
    
    #for f in files_in_dir():
    #    print f

    # Prints the character with code 169 (the copyright sign in Latin-1);
    # presumably an experiment for the "(c) symbol" TODO in the module docstring.
    print chr(169)
    




        # Abandoned experiment with fileinput (in-place editing):
        #with  as text_file:
        #a = fileinput.input(f, inplace=True)# as text_file:
        #a..for line in text_file:
        #    print text_file

    # Examples of one-off renames done through rename_resource(old, new):
    #rename_resource
    #old = "pork_tom_kha_gha_2013_9_3_pork_bones_soup_split_4.jpg"
    #new = "pork_tom_kha_gha_soup_4.jpg"
    #old = 'r_tritip_cherry_salad_2013_9_3_with_blue_cheese_1.jpg'
    #old = 'tritip_cherry_salad_with_blue_cheese_1.jpg'
    #new = 'tritip_cherry_salad_with_blue_cheese_1-HAPPY.jpg'
    #rename_resource(old, new)
     


    print 'Finished'
#TODO - Diablo Fire (the js scripting portion) will need hand updating
#Diablo Fire uses a for/next so only part of the string is used.
#If all are updated to something similar, it should be easy