'''
Created on Aug 24, 2018
@author: Brett Paufler
Copyright Brett Paufler

Removes:
    All Header Information
    All Style Information

Leaves Base Tags in Place
    div, i, u, img, span, etc
    
For working on imported HTML Files
Not so much for maintaining my website


Works as intended, so good show
'''

from os import listdir
from os.path import join as path_join
from os.path import splitext

import re

#List of Files
dir_in = './/input//'
dir_out = './/output//'
file_paths = [path_join(dir_in, f)
              for f in listdir(dir_in)]

#Patterns are compiled once, outside the per-file loop

#End of the header section; HTML tag names are case-insensitive,
#so '</HEAD>' has to be recognised as well as '</head>'
head_end_re = re.compile(r'</head\s*>', re.IGNORECASE)

#One tag attribute: whitespace, a name, '=', and a double-quoted value.
#The value is matched with [^"]* (not \S*) so that values containing
#spaces, such as class="a b" or style="color: red", are stripped too
attribute_re = re.compile(r'\s+\S*?="[^"]*"')

#DOTALL lets '.' cross line breaks, so comments that span
#several lines are removed along with single-line ones
comment_re = re.compile(r'<!--.*?-->', re.DOTALL)

#Minimal header that replaces whatever header the input file had
new_header = '<html>\n<head>\n</head>'

for f_path in file_paths:

    #Single-argument print(...) behaves the same on Python 2 and 3
    print(f_path)

    #Get Input Text
    with open(f_path, 'r') as f:
        raw_text = f.read()

    #Text Stripped of All Header Material
    #Split only at the FIRST closing head tag, so any later
    #occurrence of '</head>' stays part of the body text
    parts = head_end_re.split(raw_text, maxsplit=1)
    if len(parts) < 2:
        #No header to strip: report and move on to the next file
        #rather than aborting the whole batch with an IndexError
        print('Skipped, no </head> found: ' + f_path)
        continue
    nh_text = new_header + parts[1]

    #Removes Tag Style (every double-quoted attribute)
    ns_text = attribute_re.sub('', nh_text)

    #Removes Comments
    nc_text = comment_re.sub('', ns_text)

    #Strips Whitespace from Lines, dropping lines that end up empty
    stripped_lines = (line.strip() for line in nc_text.splitlines())
    strip_text = '\n'.join(line for line in stripped_lines if line)

    #Output goes to dir_out under the same file name, with
    #'_no_style' inserted before the extension
    save_path = f_path.replace(dir_in, dir_out)
    head, tail = splitext(save_path)
    save_name = ''.join([head, '_no_style', tail])
    print(save_name)

    with open(save_name, 'w') as f:
        f.write(strip_text)