'''
Created on Aug 24, 2018

@author: Brett Paufler
Copyright Brett Paufler

Removes:
    All Header Information
    All Style Information

Leaves Base Tags in Place
    div, i, u, img, span, etc

For working on imported HTML Files
    Not so much for maintaining my website

Works as intended, so good show

Usage:
    Run as a script.  Every regular file found in dir_in is read,
    cleaned by strip_style(), and written to dir_out with '_no_style'
    inserted before the file extension.
'''
from os import listdir
from os.path import isfile
from os.path import join as path_join
from os.path import splitext
import re

# Input / output directories (relative to the working directory).
dir_in = './/input//'
dir_out = './/output//'

# Marker ending the header material that gets discarded.
# NOTE(review): the separator literal was empty in the reviewed text, and
# str.split('') raises ValueError.  '</head>' is chosen to match the stated
# purpose ("Removes All Header Information") -- confirm against the
# author's original.
HEAD_END = '</head>'

# Text placed in front of the kept body.
# NOTE(review): this is only newlines, which the blank-line filter in
# strip_style() discards again; confirm whether opening tags were intended.
new_header = '\n\n'

# Whitespace, then an attribute name, then a double-quoted value that
# itself contains no whitespace.  Values containing spaces are left alone.
ATTRIBUTE_RE = re.compile(r'\s+\S*?="\S*?"')

# An HTML comment; DOTALL lets a comment span several lines.
# NOTE(review): the pattern literal was empty in the reviewed text (a
# no-op substitution); this pattern implements the "Removes Comments"
# step -- confirm against the author's original.
COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)


def strip_style(raw_text):
    '''Return raw_text without header, comments, or tag attributes.

    Steps, in order:
        1. Drop everything up to and including the first HEAD_END.
           Text without HEAD_END is kept whole instead of failing.
        2. Remove HTML comments.
        3. Remove name="value" attributes (see ATTRIBUTE_RE).
        4. Strip each line and drop the lines left empty.

    raw_text: the full text of one HTML file.
    Returns the cleaned text, lines joined with a newline and no
    trailing newline.
    '''
    _, found, body = raw_text.partition(HEAD_END)
    if not found:
        body = raw_text
    text = new_header + body

    # Comments go first so the attribute pattern cannot swallow a '-->'
    # that is directly followed by attribute-looking text.
    text = COMMENT_RE.sub('', text)
    text = ATTRIBUTE_RE.sub('', text)

    stripped = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in stripped if line)


def main():
    '''Clean every regular file in dir_in and save the result in dir_out.'''
    for f_name in listdir(dir_in):
        f_path = path_join(dir_in, f_name)
        # Sub-directories cannot be opened as text; skip them.
        if not isfile(f_path):
            continue
        print(f_path)

        # Get Input Text
        with open(f_path, 'r') as f:
            raw_text = f.read()

        strip_text = strip_style(raw_text)

        # name.ext -> name_no_style.ext inside dir_out
        head, tail = splitext(path_join(dir_out, f_name))
        save_name = ''.join([head, '_no_style', tail])
        print(save_name)
        with open(save_name, 'w') as f:
            f.write(strip_text)


if __name__ == '__main__':
    main()