'''
Created on Nov 29, 2014
@author: Brett Paufler
Copyright Brett Paufler
Takes the type of HTML page that
Microsoft Word spits out
and strips it of mso tags
NOTE: THERE ARE A LOT OF MSO TAGS
the ones in the returns list are replaced with
newlines (written "/n/n" in the original note)
the ones in the eliminate list are replaced with a single space " "
THESE LISTS SHOULD BE ADDED TO
'''
import os
import re
def microsoftHTMLtoStrippedHTML(text, title):
text = text.replace("\n", " ")
while " " in text:
text = text.replace(" ", " ")
newHeader = "\n
", "
", "
", "
", "
", '
', "
", "
", "
", "
",
]
for r in returns:
text = text.replace(r,"
\n
\n\n")
#These phrases are replaced by a blank space " "
eliminate = ["
", "