"""Strip Microsoft Word "mso" markup from Word-exported HTML pages.

Created on Nov 29, 2014

@author: Brett Paufler
Copyright Brett Paufler

Takes the type of HTML page that Microsoft Word spits out and strips it of
mso tags.

NOTE: THERE ARE A LOT OF MSO TAGS.
    * phrases in the "returns" list are replaced with a line break
    * phrases in the "eliminate" list are replaced with a single space " "
THESE LISTS SHOULD BE ADDED TO as new Word markup turns up; callers can also
pass their own lists to microsoftHTMLtoStrippedHTML().

NOTE(review): the copy of this file that was reviewed had lost the HTML that
used to sit inside its string literals.  Every tag phrase, pattern and
replacement template below is therefore a reconstruction of the evident
intent, not the author's exact text -- confirm each one against real Word
output before relying on it.
"""

import os
import re

# Replacement used for every "returns" phrase: a visible paragraph gap.
_LINE_BREAK = "<br>\n<br>\n\n"

# Phrases that mark a paragraph boundary (matched after newlines have been
# turned into single spaces).  Reconstructed -- extend as needed.
_DEFAULT_RETURNS = (
    "</p> <p class=MsoNormal>",
    "</p><p class=MsoNormal>",
)

# Phrases that are simply blanked out.  Only the mso-pagination entry
# survived in the reviewed copy; the rest are reconstructed.
_DEFAULT_ELIMINATE = (
    "<o:p>",
    "</o:p>",
    "<p class=MsoNormal>",
    "</p>",
    "style='mso-pagination:none'",
)

# Everything from the start of the document through the closing head tag.
_HEADER_RE = re.compile(r".*?</head>", re.DOTALL | re.IGNORECASE)
# Word's opening body tag, including whatever attributes it carries.
_BODY_OPEN_RE = re.compile(r"<body\b[^>]*>", re.IGNORECASE)
# The closing body/html pair at the end of the document.
_FOOTER_RE = re.compile(r"</body>\s*</html>", re.IGNORECASE)
_SPACE_RUN_RE = re.compile(r" {2,}")

_NEW_HEADER = "<!DOCTYPE html>\n<html>\n<head>\n<title>%s</title>\n</head>\n\n"
_NEW_BODY = "<body>\n\n"
_NEW_FOOTER = "\n\n</body>\n\n</html>\n"


def _collapse_spaces(text):
    """Return *text* with every run of two or more spaces reduced to one."""
    return _SPACE_RUN_RE.sub(" ", text)


def _replace_phrases(text, phrases, replacement):
    """Replace each literal phrase in *phrases*, in order, with *replacement*."""
    for phrase in phrases:
        # str.replace("", x) would insert x between every character, so an
        # empty phrase is skipped rather than allowed to wreck the document.
        if phrase:
            text = text.replace(phrase, replacement)
    return text


def microsoftHTMLtoStrippedHTML(text, title, returns=None, eliminate=None):
    """Convert Word-exported HTML into a plain, mso-free HTML page.

    Args:
        text: the raw HTML as a single string.
        title: text placed inside the <title> element of the new header.
            It is inserted as-is (not HTML-escaped).
        returns: optional iterable of literal phrases to replace with a
            line break; defaults to the module's built-in list.
        eliminate: optional iterable of literal phrases to replace with a
            single space; defaults to the module's built-in list.

    Returns:
        The cleaned HTML string.  The result is also printed to stdout, as
        the original implementation did.
    """
    if returns is None:
        returns = _DEFAULT_RETURNS
    if eliminate is None:
        eliminate = _DEFAULT_ELIMINATE

    # Flatten to one line so that multi-line tags can be matched as literal
    # phrases, then squeeze the padding this introduces.
    text = _collapse_spaces(text.replace("\n", " "))

    # Swap the header for a minimal one.  Input without a closing head tag
    # is left alone instead of raising AttributeError on a failed match.
    header = _HEADER_RE.match(text)
    if header is not None:
        text = _NEW_HEADER % (title,) + text[header.end():]

    text = _BODY_OPEN_RE.sub(_NEW_BODY, text, count=1)
    text = _FOOTER_RE.sub(_NEW_FOOTER, text, count=1)

    # Order matters: paragraph boundaries must become line breaks before the
    # individual tags they are made of are blanked out.
    text = _replace_phrases(text, returns, _LINE_BREAK)
    text = _replace_phrases(text, eliminate, " ")

    # Tidy the whitespace left behind by the replacements.
    text = _collapse_spaces(text)
    text = text.replace("\n ", "\n")
    text = text.replace(" <br>", "<br>")

    # Kept for backward compatibility; this form works on Python 2 and 3.
    print(text)
    return text


def main(dirIn="./docFilesIn/", dirOut="./htmlOut/"):
    """Convert every file in *dirIn*, writing same-named results to *dirOut*."""
    if not os.path.isdir(dirOut):
        os.makedirs(dirOut)
    for f in os.listdir(dirIn):
        source_path = os.path.join(dirIn, f)
        # Subdirectories cannot be opened as files; skip them.
        if not os.path.isfile(source_path):
            continue
        with open(source_path, "r") as handle:
            rawText = handle.read()
        # The file name doubles as the page title.
        alteredText = microsoftHTMLtoStrippedHTML(rawText, f)
        with open(os.path.join(dirOut, f), "w") as handle:
            handle.write(alteredText)


if __name__ == "__main__":
    main()