'''
Created on May 12, 2016

@author: Brett Paufler
Copyright Brett Paufler

Rants in Input Formats to New in Output

Reads an HTML file from the input directory, applies a series of textual
clean-ups, and writes the result under the matching output directory.

TODO
    External Tags for style
        text-align
        italics
        etc
    Reduce Repeated Tags
        Nested ... ... ... kill kill
'''
import os
import re
from collections import namedtuple
from os import listdir

# One substitution step: pattern, replacement, and a label printed as progress.
Sub = namedtuple('sub', 'old, new, comment')


class DirList(object):
    '''Base Class that lists files in input directory.'''

    def __init__(self, dir_input='./input/'):
        '''Collect the path of every entry found in dir_input.'''
        # os.path.join yields the same paths as plain concatenation when
        # dir_input ends with a separator, and also works when it does not.
        self.files = [os.path.join(dir_input, f) for f in listdir(dir_input)]

    def __repr__(self):
        '''Return the collected paths, one per line.'''
        return '\n'.join(self.files)


class RantFormatter(object):
    '''Holds the text of one file and rewrites it in place.'''

    def __init__(self, file_path):
        '''Read the whole of file_path into self.text.'''
        self.file_path = file_path
        # Context manager so the handle is closed right after reading.
        with open(self.file_path) as file_in:
            self.text = file_in.read()

    def __repr__(self):
        '''Return a header line, the file path, and the current text.'''
        text = 'RantFormatter:\n'
        text += '%s\n' % self.file_path
        text += '\n\n%s' % self.text
        return text

    def save(self):
        '''Write self.text to the output counterpart of self.file_path.

        The output path is the input path with 'input' replaced by 'output'
        in the directory part only, so a file name that itself contains
        'input' keeps its name.
        '''
        directory, file_name = os.path.split(self.file_path)
        sN = os.path.join(directory.replace('input', 'output'), file_name)
        with open(sN, 'w') as file_out:
            file_out.write(self.text)

    def swap(self, old, new):
        '''Replace every literal occurrence of old with new in self.text.'''
        self.text = self.text.replace(old, new)

    def reformat(self):
        '''Apply all clean-up passes to self.text, printing progress.

        Passes, in order: regex substitutions, forcing a newline after
        structural tags, trimming everything outside <html>...</html>,
        collapsing runs of spaces, and a final literal clean-up.
        '''
        # NOTE(review): several literals in this list were damaged before
        # review (HTML-looking text had been stripped from the strings).
        # '&copy;' and '&nbsp;' are restored from their labels. The three
        # entries with empty or newline-only patterns could not be
        # recovered; they are kept as the no-ops they were and need their
        # patterns restored from the author's copy.
        replacements = [
            # Join a newline that follows plain text, unless the next line
            # starts a tag. The lookahead used to be spelled (?!=<), which
            # only rejects a literal "=<" and therefore never protected tags.
            Sub(r'(?<=[a-zA-Z0-9])\n(?!<)', ' ', 'newlines for plain text'),
            Sub('\xa9', '&copy;', 'utc copyright symbol'),
            Sub('&nbsp;', ' ', 'html blank space (&nbsp;)'),
            Sub('', '', 'html header'),
            Sub('', '', 'meta content tag'),
            Sub('\n', '\n', '\nwith italics tag'),
        ]
        for old, new, comment in replacements:
            print('Removing %s' % comment)
            self.text = re.sub(old, new, self.text)

        # Ends the line on open or closing of these tags
        tag_ends_line = ['html', 'head', 'body', 'center', 'title', 'br']
        # The closing-tag template was an empty string, which makes the
        # %-format raise TypeError; '</%s>' is the counterpart of '<%s>'.
        open_close_tags = [tag
                           for name in tag_ends_line
                           for tag in ('<%s>' % name, '</%s>' % name)]
        for tag in open_close_tags:
            self.text = re.sub('%s(?!\n)' % tag, '%s\n' % tag, self.text)

        # Kill leading/trailing newlines, blanks, comments, everything
        # outside the html element. As found, both patterns were a bare
        # '.*', which erased the whole document.
        # NOTE(review): '<html>' / '</html>' anchors are reconstructed.
        self.text = re.sub('.*<html>', '<html>', self.text, flags=re.DOTALL)
        self.text = re.sub('</html>.*', '</html>', self.text, flags=re.DOTALL)

        # Collapse runs of spaces down to a single space. Looping on a
        # single space (as found) never terminates.
        while '  ' in self.text:
            self.text = self.text.replace('  ', ' ')

        final_clean_up = [
            Sub('\n ', '\n', 'Kill Space at Start of Lines'),
            # Pattern restored from the label; a bare newline pattern would
            # turn every line break into '<title>'.
            Sub('<title>\n', '<title>', 'Title all on one line'),
        ]
        for old, new, comment in final_clean_up:
            print(comment)
            self.text = self.text.replace(old, new)

    # TODO - Finish or live without
    def center_tags(self):
        '''Return the first span/div/h1-h4 element styled italic, or None.

        Start of a method to extract italic and center from span and div
        tags. Only the search is implemented; nothing is rewritten yet.
        '''
        # Alternation, not a character class: [span|div|...] matches one
        # character only, so the back-reference could never find its
        # closing tag.
        open_tag = r'<(?P<tag>span|div|h1|h2|h3|h4)\b'
        # Anything that does not run past this element's closing tag.
        no_closing_tag = r'(?:(?!</(?P=tag)>).)*'
        keyword = 'font-style: italic;'
        # Lazy, so the match stops at the first matching closing tag
        # instead of the last one in the document.
        anything = r'.*?'
        close_tag = r'</(?P=tag)>'
        reg = open_tag + no_closing_tag + keyword + anything + close_tag
        matches = re.search(reg, self.text, flags=re.DOTALL)
        if matches is None:
            return None
        return matches.group(0)


def main(dir_input='./input/'):
    '''Reformat the first file listed in dir_input and save the result.'''
    file_listing = DirList(dir_input)
    if not file_listing.files:
        print('No input files found in %s' % dir_input)
        return
    r = RantFormatter(file_listing.files[0])
    r.reformat()
    r.save()


if __name__ == '__main__':
    main()