'''
Created on Apr 16, 2014
refactor 5-17-17

@author: Brett Paufler
Copyright Brett Paufler

For all .doc files in ./input, outputs
    .txt
    .html

Works on Microsoft Word 2003
On upgrade, may fail
On upgrade, pdf conversion may be available

NOTE: the html output sucks
so if using, create utility to clean it up
'''

from os import listdir
from os.path import abspath, join, splitext

from win32com.client import constants
from win32com.client.gencache import EnsureDispatch

# File-format code handed to Document.SaveAs for the .txt output.
# 7 is wdFormatUnicodeText / wdFormatEncodedText in Word's WdSaveFormat
# enumeration; the literal is kept (rather than a constants lookup) so the
# value sent to Word is exactly what this script has always sent.
TXT_FILE_FORMAT = 7

# if upgrade to better office: pdf magic number below
# wdFormatPDF = 17


def all_docs_to_txt_and_html(txt=True, html=True, dirs_in='./input/'):
    '''Creates new .txt & .html files next to every .doc file in dirs_in.

    txt:     when true, save a text copy (<name>.txt) of each document.
    html:    when true, save an HTML copy (<name>.html) of each document.
    dirs_in: directory scanned for .doc files; outputs are written there too.

    Drives Microsoft Word through COM, so it needs Windows, Word and pywin32.
    Raises OSError if dirs_in cannot be listed; errors raised by Word while
    opening or saving a document propagate to the caller.
    '''
    # List first: if the directory is missing we fail before Word is started.
    # Names are matched on the real '.doc' extension (any case), and Word's
    # '~$' owner/lock files, which are not documents, are skipped.
    # abspath because the path is handed to the Word process.
    docs = [abspath(join(dirs_in, name))
            for name in listdir(dirs_in)
            if name.lower().endswith('.doc') and not name.startswith('~$')]
    if not docs:
        return

    word = EnsureDispatch('Word.Application')
    try:
        for doc_file in docs:
            print('CONVERTING')
            print(doc_file)

            # Swap only the extension; str.replace('.doc', ...) would also
            # rewrite any '.doc' appearing earlier in the path.
            base = splitext(doc_file)[0]

            doc = word.Documents.Open(doc_file)
            try:
                if txt:
                    txt_file = base + '.txt'
                    print(txt_file)
                    doc.SaveAs(txt_file, TXT_FILE_FORMAT)

                if html:
                    html_file = base + '.html'
                    print(html_file)
                    doc.SaveAs(html_file, constants.wdFormatHTML)
            finally:
                # Close even when a save fails so the file is not left open.
                doc.Close()
    finally:
        # Do not leave a Word process running after the batch, but leave
        # Word alone if it still has documents open that we did not close.
        if word.Documents.Count == 0:
            word.Quit()


if __name__ == '__main__':
    print('Starting')
    all_docs_to_txt_and_html()
    print('Finished')