'''
Created on May 4, 2014 (major refactor 5-29-16)

@author: Brett Paufler
Copyright Brett Paufler

To use:
    1) Use snapScan, scan into ./input
    2) Use nuance PDF converter, to convert pdf to doc
    3) run this python program
        a) converts all .doc to .txt
        b) outputs a receipts csv (saved as ./input/<today's date>.csv)
    4) if any misses, refine store_hint_list

Receipt():
    text: raw receipt text
        is mangled maybe 10% of time
        information not salvageable
    date: found if in form {m/d/yy to mm/dd/yyyy}
        month name data will have to wait until I get an example
        but likely falls into that 10%, just not going to work category
    amount: clever algorithm if I do say so myself
        by keyword name, first hit
        by keyword name, most common
        by dollar amount, most common
        And then, by the most common (or first found) among these three
    store: simplistic keyword match algorithm

Old File had 480 LOC, this as of final rewrite 430

TODO (ideas):
    Testing:
        easy enough to have known values for some raw receipts
    Machine Learning:
        might be interesting to train something to output the correct answers
        after all, in time, will have extensive training data
        (receipts_master vs. raw receipt .txt documents)
'''

import os
import re
from datetime import date, timedelta

# win32com (pywin32) only exists on Windows.  Guarding the import keeps the
# pure text-parsing helpers importable everywhere; convert_doc_to_txt raises
# a clear ImportError when the package is actually needed but missing.
try:
    import win32com.client
except ImportError:
    win32com = None


# Placeholder returned whenever no usable date can be determined.
DUMMY_DATE = date(year=1999, month=9, day=9)

# Byte-order marks that may start a UTF-16 encoded file.
_UTF16_BOMS = (b'\xff\xfe', b'\xfe\xff')

# Store (first) is identified by hint (second)
# Once a match is found, search stops
# So, hints hand sorted in order of targeting power

# Dead Code Post It Note
# Yep, I spent a lot of time in
#     California
#     Hawaii
# No, I will not bother to obfuscate this.

store_hint_list = [
    #Strong name matches
    ('Safeway', 'safeway'),
    ('Liliha Bakery', 'liliha'),
    ('Navy Exchange', 'navy ex'),
    ('Petco', 'petc'),
    ('McDonald', 'mcdon'),
    ('Walmart', 'walmar'),
    ('Home Depot', 'home dep'),
    ('Toritos', 'torit'),
    ('Target', 'targe'),
    ('Costco', 'costco'),
    ('Fischer Hawaii', 'fisher'),
    ('Jamba Juice', 'jamba'),
    ('SuperCuts', 'super cu'),
    ('Comfort Inn', 'comfort inn'),
    ('lynardys', 'lynar'),
    ('lynardys', 'lunard'),
    ('Walgreens', 'walgre'),

    #weaker name matches
    ('Pizza Guys', 'pizza gu'),
    ('Trader Joes', 'trader j'),
    ('Panda Express', 'panda'),
    ('Pho Think', 'thinh'),
    ('Golden Palace', 'golden'),
    ('Longs CVS', 'longs'),
    ('Longs CVS', 'cvs'),
    ('Family BBQ', 'bbq'),

    #Keyword Matches (note the use of addresses)
    ('Costco', '525 ala'),
    ('Joses Mexican', '1134 koko'),
    ('Toritos', '7991'),
    ('Safeway', '1360 pali'),
    ('Longs CVS', '1030 s. king'),
    ('Walgreens', '1613 nuua'),
    ('Bento Musubi', 'musubi'),
    ('Walmart', 'live better'),
    ]


#Could be a class method
#But I felt like making it easier to pull into other modules
def assign_name(text_to_search, hint_list=store_hint_list):
    '''Returns the store 'name' for the passed text based on hint_list.

    text_to_search: receipt text; it is lower-cased before matching,
        so hints must be lower case
    hint_list: a list of (name, hint) tuples, searched in order;
        the name of the first tuple whose hint occurs in the
        lower-cased text is returned

    Returns '' when no hint matches.
    '''
    text = text_to_search.lower()
    for store, hint in hint_list:
        if hint in text:
            return store
    return ''


def all_files_of_type(directory='./input', extension='.doc'):
    '''Returns absolute paths of the files in directory with given extension.

    Used to separate .doc from .pdf
    and again later to separate .txt from .doc

    The listing is sorted by file name so that repeated runs
    process (and output) the files in the same order.
    '''
    dir_path = os.path.abspath(directory)
    return [os.path.join(dir_path, file_name)
            for file_name in sorted(os.listdir(dir_path))
            if file_name.endswith(extension)]


def convert_all_docs_to_txt():
    '''Converts all Word Documents in receipt directory
    to simple text documents.'''
    for doc_file_path in all_files_of_type(extension='.doc'):
        convert_doc_to_txt(doc_file_path)


def convert_doc_to_txt(absolute_doc_file_path):
    '''Takes a Microsoft Word .doc and outputs standard .txt to same directory.

    The .txt file has the same base name as the .doc file.
    Note: the output is in utf-16 which is less than ideal.

    Raises ImportError if win32com (pywin32) is not installed.
    '''
    if win32com is None:
        raise ImportError(
            'win32com (pywin32) is required to convert .doc files')

    save_name = os.path.splitext(absolute_doc_file_path)[0] + '.txt'
    print('converting file from .doc to .txt\n%s\n%s\n' % (
        absolute_doc_file_path, save_name))

    word = win32com.client.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(absolute_doc_file_path)
    try:
        print(doc)
        # 7 is Word's WdSaveFormat value for Unicode text
        magic_number_for_txt_format = 7
        doc.SaveAs(save_name, magic_number_for_txt_format)
    finally:
        # Always release the document, even if the save failed
        doc.Close()


def convert_all_utf16_to_ascii():
    '''All text files are reduced to ASCII standard.

    The text files outputted from the win32com word converter
    are in utf-16 format, which is not overly helpful
    (spaces everywhere, leading BOM, who knows what else).
    Everything that is not ASCII compatible is simply dropped.'''
    for text_file in all_files_of_type(extension='.txt'):
        print("Converting text file from utf-16 to ascii:\n%s\n\n" % text_file)
        utf16_txt_to_ascii(text_file)


def utf16_txt_to_ascii(utf16_txt_file):
    '''Converts utf16 encoded txt file to ascii txt file (inplace).

    Characters with no ASCII equivalent are dropped and
    double quotes are replaced by spaces.

    A file showing no sign of being utf-16 (no BOM and no NUL bytes)
    is treated as already converted, so running the conversion
    a second time does not destroy its contents.
    '''
    with open(utf16_txt_file, 'rb') as f:
        raw = f.read()

    # utf-16 text holding any ASCII character always contains NUL bytes
    looks_like_utf16 = raw.startswith(_UTF16_BOMS) or b'\x00' in raw
    if looks_like_utf16:
        text = raw.decode('utf-16')
    else:
        text = raw.decode('ascii', 'ignore')

    ascii_bytes = text.encode('ascii', 'ignore').replace(b'"', b' ')
    with open(utf16_txt_file, 'wb') as f:
        f.write(ascii_bytes)


def parse_dates_from_string(input_string):
    '''Converts 'm+/d+/yy' in input_string into a list of Python date objects.

    The vast majority of dates on receipts take the following form:
        05/27/16, 5/27/16, and/or 05/27/2016
    any one of which would return a list containing one object:
        [datetime.date(2016, 5, 27)]
    or all would return a list of objects:
        [datetime.date(2016, 5, 27), datetime.date(2016, 5, 27), etc...]

    Matches that do not form a real calendar date come back as DUMMY_DATE.

    NOTE: only the CURRENT four digit year is shortened to two digits
    before matching; for any other four digit year the first two
    digits are what get matched as the year (e.g. 12/30/2015 -> year 2020).
    '''
    four_digit_year = date.today().year  # datetime.now().year
    two_digit_year = '%02d' % (four_digit_year % 100)
    text = input_string.replace(str(four_digit_year), two_digit_year)

    re_for_slash_separated_month_day_year = r'1?[0-9]/\d{1,2}/\d{2}'
    list_of_date_strings = re.findall(
        re_for_slash_separated_month_day_year, text)
    return [_string_to_date_object(date_string)
            for date_string in list_of_date_strings]


def _string_to_date_object(date_string):
    '''Given date of form 'm+/d+/yy' returns a standard Python date object.

    The two digit year is taken to be in the 2000s.
    If string not a valid date, returns the DUMMY_DATE'''
    add_millenium_to_two_digit_year = 2000
    try:
        m, d, y = [int(m_d_y) for m_d_y in date_string.split('/')]
        return date(year=y + add_millenium_to_two_digit_year, month=m, day=d)
    except ValueError:
        # Raised for non-numeric parts, a wrong number of parts,
        # or values that do not form a real calendar date
        return DUMMY_DATE


def select_best_date(list_of_dates):
    '''Filters a date list returning the best or the dummy.

    If exists, return:
        most recent date that is
        today or earlier (not a future date)
        but less than 90 days old
    Otherwise returns DUMMY_DATE.
    '''
    today = date.today()
    ninety_days_ago = today - timedelta(days=90)
    candidates = [d for d in list_of_dates if ninety_days_ago < d <= today]
    return max(candidates) if candidates else DUMMY_DATE


# A dollar amount: one to three digits, '.' or ',', then two digits
_RE_DOLLAR_AMOUNT = r'(\d{1,3}[\.,]\d{2})'
_RE_PAYMENT_KEYWORDS = (r'(?:visa|amex|american express|'
                        r'charge|amount|credit|total|final)')
_RE_GARBAGE_ON_THIS_OR_NEXT_LINE = r'.*?\n??.*?'
_KEYWORD_AMOUNT_RE = re.compile(
    _RE_PAYMENT_KEYWORDS + _RE_GARBAGE_ON_THIS_OR_NEXT_LINE
    + _RE_DOLLAR_AMOUNT)
_ANY_AMOUNT_RE = re.compile(_RE_DOLLAR_AMOUNT)


def _amounts_to_floats(amount_strings):
    '''Converts matched amount strings ('12.34' or '12,34') to floats.'''
    return [float(amount.replace(',', '.')) for amount in amount_strings]


def _most_common(values):
    '''Returns the most frequent item; ties go to the earliest in values.'''
    return max(values, key=values.count)


def determine_receipt_total(receipt_text):
    '''Returns the best_guess of the total purchase given a receipt_text.

    guesses uses three methods
        keyword search, most common result
        keyword search, first result in the text
        most common dollar amount on receipt
            (ties go to the highest amount; 0.0 if there are no amounts)
    best_guess is the most common of the three,
        if all three are different, defaults to the first on the list
    '''
    text = receipt_text.lower()
    guesses = []

    #Find Total on same line or next by listed keywords (doesn't always work)
    #Note, keyword adds two values to guesses
    keyword_amounts = _amounts_to_floats(_KEYWORD_AMOUNT_RE.findall(text))
    if keyword_amounts:
        guesses.append(_most_common(keyword_amounts))
        guesses.append(keyword_amounts[0])

    #Keyword Method doesn't always work,
    #  seeds guesses 0.00 if nothing else
    #  but typically all values on receipt
    all_dollar_amounts = _amounts_to_floats(
        ['0.00'] + _ANY_AMOUNT_RE.findall(text))

    #Add to guesses the most common; or if all equal, the highest total
    all_dollar_amounts.sort(reverse=True)
    guesses.append(_most_common(all_dollar_amounts))

    #best_guess is the most common guess
    #  or if all equal, the first added value
    return _most_common(guesses)


#Perhaps more of the above functions should be folded into Receipt class
#For now, just hanging loose
class Receipt(object):
    '''The __init__ of the Receipt from the passed txt_file_path
    does ALL the heavy lifting of the class.

    Printing or pulling csv is all that will typically be required.
    '''

    def __init__(self, txt_file_path):
        '''
        name: file name base (file name minus its last four characters)
        text: raw text from file
        date: best guess as to the receipt's date of issuance
        store: best guess as to the store that issued receipt
        amount: best guess as to the receipt's total amount
        csv: date, store, amount
            formatted nicely for one line of csv output
        '''
        self.name = os.path.basename(txt_file_path)[:-4]
        self.txt_file_path = txt_file_path
        self.text = 'Loads in a second'
        self.load_text()
        self.date = select_best_date(parse_dates_from_string(self.text))
        self.amount = determine_receipt_total(self.text)
        self.store = assign_name(self.text, hint_list=store_hint_list)
        self.csv = self.csv_format()

    def __repr__(self):
        text = 'NAME:   %s\n' % self.name
        text += 'DATE:   %s\n' % self.date
        text += 'AMOUNT: %s\n' % str(self.amount)
        text += 'STORE:  %s\n' % self.store
        text += 'CSV:    %s\n' % self.csv
        return text

    def load_text(self):
        '''Reads the whole of txt_file_path into self.text.'''
        with open(self.txt_file_path, 'r') as f:
            self.text = f.read()

    def csv_format(self):
        '''Converts relevant receipt attributes into csv format,
        zeroing out the holding values.'''

        #Outputs dates in 'm/d/yy' format, removing the DUMMY_DATE
        if self.date == DUMMY_DATE:
            date_string = ''
        else:
            date_string = '%s/%s/%02d' % (
                self.date.month, self.date.day, self.date.year % 100)

        #0.0 amounts become '' for ease of detection in output
        amount_string = str(self.amount) if self.amount else ''

        #Commas to match existing
        csv_text = '%s,%s,,%s,,,,%s' % (
            date_string, self.name, self.store, amount_string)
        return csv_text


# run_all acts as a 'main'
#
# From here down are consolidation functions
# that feed into run_all.


def convert_all_word_to_txt():
    '''Converts all Word .doc to usable .txt files.'''
    convert_all_docs_to_txt()
    convert_all_utf16_to_ascii()


def init_receipt_list():
    '''Returns a list of receipt items,
    one for each .txt files in ./input.

    Each receipt is also printed as it is listed.'''
    txt_files = all_files_of_type(extension='.txt')
    receipt_list = [Receipt(f) for f in txt_files]
    for receipt in receipt_list:
        print(receipt)
    return receipt_list


def save_receipt_list_to_csv(receipt_list):
    '''Outputs all receipt data into a csv.

    One line per receipt, written to ./input/<today's date>.csv'''
    csv_text = ''.join(receipt.csv + '\n' for receipt in receipt_list)
    csv_save_name = './input/%s.csv' % str(date.today())
    with open(csv_save_name, 'w') as f:
        f.write(csv_text)


def run_all():
    '''Runs the lot, essentially a main function.'''
    convert_all_word_to_txt()
    receipt_list = init_receipt_list()
    save_receipt_list_to_csv(receipt_list)


if __name__ == '__main__':
    run_all()
    print('Receipts as __main__ finished successfully!')
    print('If using, refactor to use doc_converter')