'''
Created on May 12, 2016
@author: Brett Paufler
Copyright Brett Paufler
Reads rants from the input directory.
Writes reformatted copies to the output directory.
TODO
External Tags for
style text-align
italics
etc
Reduce Repeated Tags
Nested
... ... ...
kill kill
'''
import os
import re
from collections import namedtuple
from os import listdir
#from re import DOTALL
#from tables._past import old2newnames
class DirList():
    '''Base class that lists the files in an input directory.

    Attributes:
        files -- one path per entry returned by listdir(dir_input), each
                 joined onto dir_input; the order is whatever listdir
                 returns.
    '''

    def __init__(self, dir_input='./input/'):
        '''Collect the path of every entry found in dir_input.

        dir_input -- directory to list; may be given with or without a
                     trailing separator.
        '''
        # os.path.join inserts a separator only when dir_input lacks one,
        # so the default './input/' yields the same paths as plain string
        # concatenation while './input' no longer produces './inputname'.
        self.files = [os.path.join(dir_input, f) for f in listdir(dir_input)]

    def __repr__(self):
        '''Return the collected paths, one per line.'''
        return '\n'.join(self.files)
class RantFormatter():
def __init__(self, file_path):
self.file_path = file_path
self.text = open(self.file_path).read() #self.load_old_rant_text()
def __repr__(self):
text = 'RantFormatter:\n'
text += '%s\n' % self.file_path
text += '\n\n%s' % self.text
return text
def save(self):
sN = self.file_path.replace('input', 'output')
with open(sN, 'w') as file_out:
file_out.write(self.text)
def swap(self, old, new):
self.text = self.text.replace(old, new)
def reformat(self):
sub = namedtuple('sub', 'old, new, comment')
replacements = [sub('(?<=[a-zA-Z0-9])\n(?!=<)',
' ',
'newlines for plain text'),
sub('\xa9',
'©',
'utc copyright symbol'),
sub(' ',
' ',
'html blank space ( )'),
sub('',
'',
'html header'),
sub('',
'',
'meta content tag'),
sub('
',
'
',
'
with italics tag'),
]
for old, new, comment in replacements:
print 'Removing %s' % comment
self.text = re.sub(old, new, self.text)
#Ends the line on open or closing of these tags
tag_ends_line = ['html', 'head', 'body',
'center', 'title', 'br' ]
open_close_tags = [tag for tags in tag_ends_line
for tag in ['<%s>' % tags, '%s>' % tags]]
for tag in open_close_tags:
self.text = re.sub('%s(?!\n)' % tag,
'%s\n' % tag,
self.text)
#Kill Leading trailing newlines, blanks, comments, everything
self.text = re.sub('.*', '', self.text, flags=re.DOTALL)
self.text = re.sub('.*', '', self.text, flags=re.DOTALL)
while ' ' in self.text:
self.text = self.text.replace(' ', ' ')
#sub = namedtuple('sub', 'old, new, comment')
final_clean_up = [sub('\n ',
'\n',
'Kill Space at Start of Lines'),
sub('