'''
Created on May 4, 2014 (major refactor 5-29-16)
@author: Brett Paufler
Copyright Brett Paufler

To use:
    1) Use snapScan, scan into ./input
    2) Use nuance PDF converter, to convert pdf to doc
    3) run this python program
        a) converts all .doc to .txt
        b) outputs a receipts.csv
        
    4) if any misses, refine store_hint_list


Receipt():
    text: raw receipt text is mangled maybe 10% of time
        information not salvageable
    date: found if in form {m/d/yy to mm/dd/yyyy}
        month name data will have to wait until I get an example
            but likely falls into that 10%,
                just not going to work category
    amount: clever algorithm if I do say so myself
        by keyword name, first hit
            so visa trumps total
        by keyword name, most common
        by dollar amount, most common
        And then, by the most common (or first found) among these three
    store: simplistic keyword match algorithm
        
        
Old File had 480 LOC, this as of final rewrite 430

TODO (ideas):
    Testing:
        easy enough to have known values for some raw receipts
    Machine Learning:
        might be interesting to train something to output the correct answers
        after all, in time, will have extensive training data
            (receipts_master vs. raw receipt .txt documents)
'''


import os
import re
import win32com.client
from datetime import date, timedelta


# Placeholder date used when no usable date can be read from a receipt;
# Receipt.csv_format() blanks out any date whose year is 1999.
DUMMY_DATE = date(year=1999, month=9, day=9)


'''
#Store (first) is identified by hint (second)
#Once a match is found, search stops
#So, hints hand sorted in order of targeting power
'''

'''
Dead Code
Post It Note

Yep, I spent a lot of time in
    California
    Hawaii
No, I will not bother to obfuscate this.
'''

# Ordered list of (store name, hint) tuples used by assign_name().
# Each hint is a lowercase substring looked for in the lowercased receipt text;
# the first hint found decides the store, so list order is significant.
# The same store may appear more than once with different hints.
store_hint_list = [
                   
    #Strong name matches
    ('Safeway', 'safeway'),
    ('Liliha Bakery', 'liliha'),
    ('Navy Exchange', 'navy ex'),
    ('Petco', 'petc'), 
    ('McDonald', 'mcdon'),
    ('Walmart', 'walmar'),
    ('Home Depot', 'home dep'),
    ('Toritos', 'torit'),
    ('Target', 'targe'),
    ('Costco', 'costco'),
    # NOTE(review): name is spelled 'Fischer' but the hint is 'fisher' - confirm
    ('Fischer Hawaii', 'fisher'),
    ('Jamba Juice', 'jamba'),
    ('SuperCuts', 'super cu'),
    ('Comfort Inn', 'comfort inn'),
    # Two hints for the same store, presumably to catch alternate spellings
    ('lynardys', 'lynar'),
    ('lynardys', 'lunard'),
    ('Walgreens', 'walgre'),
    
    #weaker name matches
    ('Pizza Guys', 'pizza gu'),
    ('Trader Joes', 'trader j'),
    ('Panda Express', 'panda'),
    # NOTE(review): name says 'Think' while the hint is 'thinh' - confirm
    ('Pho Think', 'thinh'),
    ('Golden Palace', 'golden'),
    ('Longs CVS', 'longs'),
    ('Longs CVS', 'cvs'),
    ('Family BBQ', 'bbq'),
    
    #Keyword Matches (note the use of addresses)
    ('Costco', '525 ala'),
    ('Joses Mexican', '1134 koko'),
    # NOTE(review): a bare number can match any text containing '7991'
    # (presumably a street number) - kept near the end to limit false hits
    ('Toritos', '7991'),
    ('Safeway', '1360 pali'),
    ('Longs CVS', '1030 s. king'),
    ('Walgreens', '1613 nuua'),
    ('Bento Musubi', 'musubi'),
    ('Walmart', 'live better'),
    ]

#Could be a class method
#But I felt like making it easier to pull into other modules
def assign_name(
    text_to_search,
    hint_list=None
    ):
    '''Returns the store 'name' for the passed text based on hint_list.

    text_to_search: raw receipt text (any case, it is lowercased here)
    hint_list: an ordered list of (name, hint) tuples
        such that if hint is found in text_to_search.lower(),
            name is returned
        hints must therefore be lowercase
        earlier entries win, the search stops at the first match
        if omitted (None), the module level store_hint_list is used

    Returns '' if none of the hints are found.

    Note: most of the work has already been done in the passed hint_list
    '''

    # Resolved at call time so a caller supplied list is actually honoured
    # (previously the parameter was ignored in favour of the global list).
    if hint_list is None:
        hint_list = store_hint_list

    text = text_to_search.lower()

    for store, hint in hint_list:
        if hint in text:
            return store

    return ''
    

def all_files_of_type(directory='./input', extension='.doc'):
    '''Returns a list of absolute paths of files in directory with given extension.

    directory: folder to list (not searched recursively)
    extension: file name ending to keep, matched case sensitively
        by str.endswith, so '.doc' does not match '.docx'

    The paths are returned sorted by file name so that repeated runs
    process (and output) the files in the same order.

    Used to separate .doc from .pdf
        and again later to separate .txt from .doc'''

    # os.path (the original 'os.path_in' does not exist and raised
    # AttributeError on every call)
    dir_path = os.path.abspath(directory)

    all_file_paths = [os.path.join(dir_path, file_name)
        for file_name in sorted(os.listdir(dir_path))
        if file_name.endswith(extension)]

    return all_file_paths


def convert_all_docs_to_txt():
    '''Runs convert_doc_to_txt on every '.doc' file
    that all_files_of_type finds in its default directory.'''

    word_documents = all_files_of_type(extension='.doc')

    for word_document in word_documents:
        convert_doc_to_txt(word_document)


def convert_doc_to_txt(absolute_doc_file_path):
    '''Takes a Microsoft Word .doc and outputs standard .txt to same directory.

    absolute_doc_file_path: full path of the Word document to convert
        the output has the same path with the extension replaced by '.txt'

    Word is driven through COM (win32com), so this only works on a
    machine with Microsoft Word installed.

    Note: the output is in utf-16 which is less than ideal.'''

    # splitext rather than [:-4] so the extension length does not matter
    save_name = os.path.splitext(absolute_doc_file_path)[0] + '.txt'

    print('converting file from .doc to .txt\n%s\n%s\n' % (
        absolute_doc_file_path, save_name))

    word = win32com.client.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(absolute_doc_file_path)

    # 7 is Word's WdSaveFormat value for Unicode text, hence the utf-16 output
    magic_number_for_txt_format = 7

    # Always release the document, even if the save fails, so Word does not
    # keep the file open and locked.
    # NOTE(review): the Word application itself is left running - confirm
    # whether calling word.Quit() once after all conversions is wanted.
    try:
        print(doc)
        doc.SaveAs(save_name, magic_number_for_txt_format)
    finally:
        doc.Close()


def convert_all_utf16_to_ascii():
    '''Rewrites every '.txt' file found by all_files_of_type as plain ASCII.

    The text files output by the win32com word converter are utf-16,
    which is awkward to search (spaces everywhere, leading BOM, and so on).

    Each file is announced on stdout and then handed to utf16_txt_to_ascii,
    where anything that is NOT ASCII compatible is simply dropped.'''

    txt_paths = all_files_of_type(extension='.txt')

    for txt_path in txt_paths:
        print("Converting text file from utf-16 to ascii:\n%s\n\n" % txt_path)
        utf16_txt_to_ascii(txt_path)


def utf16_txt_to_ascii(utf16_txt_file):
    '''Converts utf16 encoded txt file to ascii txt file (inplace).

    utf16_txt_file: path of the text file to rewrite

    Characters with no ASCII equivalent are dropped
    and every double quote is replaced by a space.

    Files that do not start with a utf-16 byte order mark are left
    untouched: they are taken to be already converted. Decoding such a
    file as utf-16 a second time would wipe or garble its content, so
    this makes re-running the conversion safe.'''

    with open(utf16_txt_file, 'rb') as f:
        raw_bytes = f.read()

    # Little endian and big endian utf-16 byte order marks
    utf16_byte_order_marks = (b'\xff\xfe', b'\xfe\xff')
    if not raw_bytes.startswith(utf16_byte_order_marks):
        return

    # Quotes are replaced while still text, then encoded once, so the
    # same code works whether the encoded result is a str or bytes object
    text = raw_bytes.decode('utf-16')
    text = text.replace(u'"', u' ')
    ascii_bytes = text.encode('ascii', 'ignore')

    with open(utf16_txt_file, 'wb') as f:
        f.write(ascii_bytes)


def parse_dates_from_string(input_string):
    '''Converts every 'm+/d+/yy' or 'm+/d+/20yy' in input_string
    into a Python date object, returning them as a list in text order.

        The vast majority of dates on receipts take the following form:

            05/27/16, 5/27/16, and/or 05/27/2016
            any one of which would return a list containing one object:
                [datetime.date(2016, 5, 27)]
            or all would return a list of objects:
                [datetime.date(2016, 5, 27), datetime.date(2016, 5, 27), etc...]

        Strings that look like a date but are not one (month 15, day 42)
        come back as whatever _string_to_date_object returns for them.

        Returns [] if nothing date like is found.
    '''

    # The year is either a full 20yy or just two digits. Matching 20yy
    # directly handles ANY four digit year of this century, not only the
    # current one (the old approach rewrote the current year in the text,
    # so last year's 12/28/2015 was misread as 12/28/20).
    re_for_slash_seperated_month_day_year = (
        r'(1?[0-9])/(\d{1,2})/(20\d{2}|\d{2})')
    month_day_year_groups = re.findall(
        re_for_slash_seperated_month_day_year, input_string)

    list_of_dates = []
    for month, day, year in month_day_year_groups:
        # _string_to_date_object expects a two digit year
        two_digit_year_date_string = '%s/%s/%s' % (month, day, year[-2:])
        list_of_dates.append(
            _string_to_date_object(two_digit_year_date_string))

    return list_of_dates


def _string_to_date_object(date_string):
    '''Given date of form 'm+/d+/yy' returns a standard Python date object.

        date_string: month, day and year separated by slashes
            a year below 100 is taken to be in the 2000s (16 -> 2016)
            a longer year (2016) is used as given

        If string not a valid date
            (impossible month or day, wrong number of parts, not numbers),
            returns the DUMMY_DATE'''

    add_millenium_to_two_digit_year = 2000

    try:
        m, d, y = [int(m_d_y) for m_d_y in date_string.split('/')]

        # Only two digit years need the millenium added
        if y < 100:
            y += add_millenium_to_two_digit_year

        valid_date = date(year=y, month=m, day=d)
    except (ValueError, OverflowError):
        # Raised by int(), by the unpacking, or by date() for bad values
        valid_date = DUMMY_DATE

    return valid_date


def select_best_date(list_of_dates, max_age_days=90, today=None):
    '''Filters a date list returning the best or the dummy.

    list_of_dates: date objects to choose from (may be empty)
    max_age_days: dates this many days old, or older, are rejected
    today: the date to measure from, defaults to date.today()
        (pass an earlier date to process a backlog of old receipts)

    If exists, return:
        most recent date
        that is today or earlier (not a future date)
        and less than max_age_days old
    Otherwise, return DUMMY_DATE.
    '''

    if today is None:
        today = date.today()
    oldest_allowed = today - timedelta(days=max_age_days)

    # Strictly newer than the cut off, and not in the future
    dates = [d for d in list_of_dates if oldest_allowed < d <= today]

    if dates:
        best_date = max(dates)
    else:
        best_date = DUMMY_DATE

    return best_date


#This could easily be pulled apart into three or more functions
def determine_receipt_total(receipt_text):
    '''Returns the best guess (a float) of the total on a receipt.

        receipt_text is lowercased, then up to three guesses are collected:
            1) the most common amount that follows a payment keyword
                (visa, amex, total, etc...) on the same or the next line
            2) the first such keyword amount, in the order found in the text
            3) the most common dollar amount anywhere on the receipt
                (on a tie, the largest amount; 0.0 if there are no amounts)
        guesses 1 and 2 are only added if a keyword amount was found

        The value returned is the most common of the guesses,
            if they are all different,
                the earliest guess in the list above wins

        Amounts are one to three digits, a '.' or ',', then two digits;
            the comma is read as a decimal point.
    '''

    def _as_floats(amount_strings):
        #'12,34' and '12.34' both become 12.34
        return [float(s.replace(',', '.')) for s in amount_strings]

    lowered = receipt_text.lower()

    amount_pattern = r'(\d{1,3}[\.,]\d{2})'

    #keyword, then anything on this line (and optionally the next), then amount
    keyword_pattern = ''.join([
        '(?:visa|amex|american express|',
        'charge|amount|credit|total|final)',
        '.*?\n??.*?',
        amount_pattern,
        ])

    candidates = []

    #The keyword method contributes two guesses, or none at all
    from_keywords = _as_floats(re.findall(keyword_pattern, lowered))
    if from_keywords:
        candidates.append(max(from_keywords, key=from_keywords.count))
        candidates.append(from_keywords[0])

    #Seeded with 0.0 so there is always something to return
    #Sorted high to low so that a tie in frequency goes to the larger amount
    every_amount = sorted(
        [0.0] + _as_floats(re.findall(amount_pattern, lowered)),
        reverse=True)
    candidates.append(max(every_amount, key=every_amount.count))

    #max returns the first of equally common candidates
    return max(candidates, key=candidates.count)


#Perhaps more of the above functions should be folded into Receipt class
#For now, just hanging loose
class Receipt(object):
    '''One scanned receipt: its raw text plus the best guesses at
    the date, store and amount, ready for output as a line of csv.

    The __init__ of the Receipt from the passed txt_file_path
    does ALL the heavy lifting of the class.

    Printing or pulling csv is all that will typically be required.
    '''

    def __init__(self, txt_file_path):
        '''
        txt_file_path: path of the (ascii) text file holding the receipt

        Sets the attributes:
            name: file name base (no directory, no extension)
            txt_file_path: as passed
            text: raw text from file

            date: best guess as to the receipt's date of issuance
            store: best guess as to the store that issued receipt
            amount: best guess as to the receipt's total amount

            csv: date, store, amount formatted nicely for one line of csv output
        '''

        # os.path (the original 'os.path_in' does not exist and raised
        # AttributeError); splitext so extension length does not matter
        self.name = os.path.splitext(os.path.basename(txt_file_path))[0]
        self.txt_file_path = txt_file_path

        self.text = ''
        self.load_text()

        self.date = select_best_date(parse_dates_from_string(self.text))
        self.amount = determine_receipt_total(self.text)
        self.store = assign_name(self.text, hint_list=store_hint_list)
        self.csv = self.csv_format()

    def __repr__(self):
        '''Multi line summary, one labelled attribute per line.'''
        text = 'NAME: %s\n' % self.name
        text += 'DATE: %s\n' % self.date
        text += 'AMOUNT: %s\n' % str(self.amount)
        text += 'STORE: %s\n' % self.store
        text += 'CSV: %s\n' % self.csv
        return text

    def load_text(self):
        '''Reads the whole of txt_file_path into self.text.'''
        with open(self.txt_file_path, 'r') as f:
            self.text = f.read()

    def csv_format(self):
        '''Converts relevant receipt attributes into csv format,
        zeroing out the holding values.

        Returns 'date,name,,store,,,,amount' where
            date is blank for the 1999 dummy date
            amount is blank for 0.0
        '''

        #Outputs dates in 'm/d/yy' format, removing 9-9-1999
        y, m, d = self.date.year, self.date.month, self.date.day
        if y == 1999:
            date_string = ''
        else:
            date_string = '%s/%s/%s' % (m, d, y % 100)

        #0.0 amounts become '' for ease of detection in output
        amount_string = str(self.amount) if self.amount else ''

        #Commas to match existing
        csv_text = '%s,%s,,%s,,,,%s' % (
            date_string,
            self.name,
            self.store,
            amount_string)
        return csv_text


'''
    run_all acts as a 'main'
    From here down are consolidation functions that feed into run_all.
'''

def convert_all_word_to_txt():
    '''Turns every Word .doc into a usable .txt file, in two passes:
    first .doc to (utf-16) .txt, then utf-16 .txt to ascii .txt.'''

    conversion_passes = (
        convert_all_docs_to_txt,
        convert_all_utf16_to_ascii,
        )

    for conversion_pass in conversion_passes:
        conversion_pass()


def init_receipt_list():
    '''Builds a Receipt for each .txt file found by all_files_of_type,
    prints them all, and returns them as a list.'''

    #Every Receipt is built before any is printed
    receipt_list = []
    for txt_file in all_files_of_type(extension='.txt'):
        receipt_list.append(Receipt(txt_file))

    for built_receipt in receipt_list:
        print(built_receipt)

    return receipt_list


def save_receipt_list_to_csv(receipt_list, directory='./input'):
    '''Outputs all receipt data into a csv.

    receipt_list: objects with a .csv attribute (one csv line, no newline)
    directory: existing folder to save into, defaults to ./input

    The file is named after today's date (yyyy-mm-dd.csv),
    so a second run on the same day overwrites the first.

    Returns the path of the file written.'''

    csv_text = ''.join(receipt.csv + '\n' for receipt in receipt_list)

    csv_save_name = os.path.join(directory, '%s.csv' % str(date.today()))

    with open(csv_save_name, 'w') as f:
        f.write(csv_text)

    return csv_save_name


def run_all():
    '''The whole pipeline, in order (essentially a main function):
    convert the Word documents to text, build the receipts, save the csv.'''

    convert_all_word_to_txt()
    save_receipt_list_to_csv(init_receipt_list())


if __name__ == '__main__':

    #Run the full pipeline, then report
    run_all()

    for closing_message in (
            'Receipts as __main__ finished successfully!',
            'If using, refactor to use doc_converter',
            ):
        print(closing_message)