# -*- coding: utf-8 -*-
'''
Created on Dec 22, 2014

@author: Brett Paufler
Copyright Brett Paufler

thousands_to_dataframe has the following column headings
    author
    category (new | front_page)
    created_utc
    domain
    id
    num_comments
    over_18
    permalink
    score
    selftext
    subreddit
    title
    url
'''

#Keeps rows whose column text does NOT contain the pattern
#(drop the ~ to keep only the rows that DO contain it)
#dF = dF[~dF["url"].str.contains("reddit")]

import pandas as pd
import glob
from collections import Counter
import datetime
#import urllib


def quick_report(dF, entries=2, col_width=10):
    '''A quick glance at a pandas DataFrame,
    with several display options reset for compact output
    '''
    pd.set_option('display.width', 500)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_colwidth', col_width)

    print "\nQUICK REPORT (%d, %d): %s" % (entries, col_width, str(dF.shape))
    print dF.head(entries)


def thousands_to_dataframe(dfr="C:\\ string data_reddit\\",
                           last_first=False,
                           hour_filter=0,
                           max_num_files=1000):
    '''assembles files in dfr of form "*reddit_thousands.txt"
    that pass the filters into one comprehensive pandas DataFrame

    dfr = data file repository (where files are kept)
    hour_filter = min number of hours between data sets
    last_first = reverses sort order
    max_num_files = maximum number of files to compose data set from

    sample usage:
        thousands_to_dataframe()
            returns the first 1000 saved csv files in one giant pandas dataframe
        thousands_to_dataframe(hour_filter=24)
            returns sets sequenced at least 24 hours apart
        thousands_to_dataframe(last_first=True, max_num_files=1)
            returns the last crawl as a pandas dataframe
    '''
    #dfr
    rawThousands = glob.glob(dfr + "*reddit_thousands.txt")

    #last_first
    rawThousands.sort()
    if last_first:
        rawThousands.reverse()

    #hour_filter
    thousands = [rawThousands[0]]
    print "Files Passing Filter (last_first=%r, hour_filter=%d):\n\t%s" % (
        last_first, hour_filter, rawThousands[0])
    #timestamp format assumed to match the "*reddit_thousands.txt" glob above
    lastTime = datetime.datetime.strptime(
        rawThousands[0], dfr + "%Y-%m-%d-%H-%M-%S_reddit_thousands.txt")
    for t in rawThousands[1:]:
        thisTime = datetime.datetime.strptime(
            t, dfr + "%Y-%m-%d-%H-%M-%S_reddit_thousands.txt")
        if abs(lastTime - thisTime) > datetime.timedelta(hours=hour_filter):
            print "\t%s" % t
            thousands.append(t)
            lastTime = thisTime

    #max_num_files
    print "max_num_files=%d" % max_num_files
    thousands = thousands[:max_num_files]

    #to pandas DataFrame
    dFList = [pd.DataFrame.from_csv(t) for t in thousands]
    dF = pd.concat(dFList, ignore_index=True)

    print "%d Files Considered: %d Files Passed Filter" % (
        len(rawThousands), len(dFList))
    print "Returning Pandas Array (thousands_to_dataframe): %s" % str(dF.shape)
    quick_report(dF, entries=2, col_width=10)
    print "End thousands_to_dataframe(last_first=%r, hour_filter=%d, max_num_files=%d)\n\n" % (
        last_first, hour_filter, max_num_files)

    return dF


def tally_column_groups(dF, col_name):
    '''returns a list of (count, name) tuples for the items in col_name,
    sorted most common first
    '''
    sC = Counter(dF[col_name].values.tolist())
    sL = [(int(n), c) for c, n in sC.items()]
    sL.sort()
    sL.reverse()

    print "\ntally_column_groups called: grouping '%s'" % col_name
    print "\t%d Values In: %d Categories Out" % (len(dF), len(sL))
    print "\tHead: %s" % str(sL[:5])
    print "\tTail: %s" % str(sL[-5:])

    return sL


def tally_report(dF):
    '''runs tally_column_groups on every column of the passed dF
    good fast summary
    '''
    for k in list(dF.columns.values):
        tally_column_groups(dF, k)
    print "\ntally_report Ending: Nothing Returned"


if __name__ == "__main__":

    #dF = thousands_to_dataframe()
    #dF = thousands_to_dataframe(hour_filter=-6)
    #dF = thousands_to_dataframe(last_first=True, hour_filter=24)
    dF = thousands_to_dataframe(last_first=False, max_num_files=1)
    tally_report(dF)

    #over_18
    #dF = dF[~dF.over_18]

    #dF = dF[~dF["url"].str.contains("reddit")]
    #gF = dF[dF["url"].str.endswith('.gif')]
    #iF = dF[dF["url"].str.endswith('.jpg')]
    #dF = dF[["over_18", "url"]]

    #quick_report(dF, 5, 10)
    #quick_report(gF, 5, 100)
    #quick_report(iF, 5, 10)

    #imgList = iF.url.tolist()[:5]
    #print imgList

    #sD = "C:/ full path needed data_reddit/img/"
    #for img in imgList:
    #    sN = sD + img.split("/")[-1]
    #    urllib.urlretrieve(img, sN)
    #    print sN


'''
def subreddit_count(dF, n_most_common=None):
    #returns a list of tuples (name, number) of most_common subreddits in dF
    #if n_most_common == None: all are returned, else: n
    subCount = Counter(dF.subreddit.values.tolist()).most_common(n_most_common)
    return_count = sum([t[1] for t in subCount])
    print "\n%d values passed to subreddit_count" % len(dF)
    print "%d items represented in returned tuple list (%.1f%%)" % (
        return_count, 100 * float(return_count) / len(dF))
    print subCount
    return subCount
'''

'''
def media_source_count(dF):
    #returns a list of tuples (source, count) of outside media source from dF
    #requires: import re
    mediaList = dF.media_embed[dF.media_embed != "{}"].values.tolist()
    mediaList = [re.search('schema=(.*)" width', d).group(1) for d in mediaList]
    mediaList = Counter(mediaList).most_common()
    return_count = sum([m[1] for m in mediaList])
    print "\n%d records passed to media_source_count: %d link to outside media sources" % (
        len(dF), return_count)
    print mediaList
    return mediaList
'''
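
#Hedged usage sketch (not part of the original script): one way the tally
#output might be consumed, assuming a dF already built by
#thousands_to_dataframe() above and that "domain" is one of the column
#headings listed in the module docstring.
#domain_tally = tally_column_groups(dF, "domain")   #list of (count, name) tuples
#top_domains = [name for count, name in domain_tally[:10]]
#print "Top 10 domains by post count: %s" % top_domains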