# -*- coding: utf-8 -*-
'''
Created on Dec 22, 2014

@author: Brett Paufler
Copyright Brett Paufler

thousands_to_dataframe has the following column headings
    author
    category (new | front_page)
    created_utc
    domain
    id
    num_comments
    over_18
    permalink
    score
    selftext
    subreddit
    title
    url
'''

#Keeps rows whose column text does NOT contain the pattern
#(drop the ~ to keep only the rows that DO contain it)
#dF = dF[~dF["url"].str.contains("reddit")]

import pandas as pd
import glob
from collections import Counter
import datetime
#import urllib


def quick_report(dF, entries=2, col_width=10):
    '''A quick glance at a pandas DataFrame,
    with several display options reset for compact output
    '''
    pd.set_option('display.width', 500)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_colwidth', col_width)

    print "\nQUICK REPORT (%d, %d): %s" % (entries, col_width, str(dF.shape))
    print dF.head(entries)


def thousands_to_dataframe(dfr="C:\\ string data_reddit\\",
                           last_first=False,
                           hour_filter=0,
                           max_num_files=1000):
    '''assembles files in dfr of form "*reddit_thousands.txt"
    that pass the filters into one comprehensive pandas DataFrame

    dfr = data file repository (where files are kept)
    hour_filter = min number of hours between data sets
    last_first = reverses sort order
    max_num_files = maximum number of files to compose data set from

    sample usage:
        thousands_to_dataframe()
            returns the first 1000 saved csv files in one giant pandas dataframe
        thousands_to_dataframe(hour_filter=24)
            returns sets sequenced at least 24 hours apart
        thousands_to_dataframe(last_first=True, max_num_files=1)
            returns the last crawl as a pandas dataframe
    '''
    #dfr
    rawThousands = glob.glob(dfr + "*reddit_thousands.txt")

    #last_first
    rawThousands.sort()
    if last_first:
        rawThousands.reverse()

    #hour_filter
    thousands = [rawThousands[0]]
    print "Files Passing Filter (last_first=%r, hour_filter=%d):\n\t%s" % (
        last_first, hour_filter, rawThousands[0])
    #timestamp format assumed to match the "*reddit_thousands.txt" glob above
    lastTime = datetime.datetime.strptime(
        rawThousands[0], dfr + "%Y-%m-%d-%H-%M-%S_reddit_thousands.txt")
    for t in rawThousands[1:]:
        thisTime = datetime.datetime.strptime(
            t, dfr + "%Y-%m-%d-%H-%M-%S_reddit_thousands.txt")
        if abs(lastTime - thisTime) > datetime.timedelta(hours=hour_filter):
            print "\t%s" % t
            thousands.append(t)
            lastTime = thisTime

    #max_num_files
    print "max_num_files=%d" % max_num_files
    thousands = thousands[:max_num_files]

    #to pandas DataFrame
    dFList = [pd.DataFrame.from_csv(t) for t in thousands]
    dF = pd.concat(dFList, ignore_index=True)

    print "%d Files Considered: %d Files Passed Filter" % (
        len(rawThousands), len(dFList))
    print "Returning Pandas Array (thousands_to_dataframe): %s" % str(dF.shape)
    quick_report(dF, entries=2, col_width=10)
    print "End thousands_to_dataframe(last_first=%r, hour_filter=%d, max_num_files=%d)\n\n" % (
        last_first, hour_filter, max_num_files)

    return dF


def tally_column_groups(dF, col_name):
    '''returns a list of (count, name) tuples for the items in col_name,
    sorted most common first
    '''
    sC = Counter(dF[col_name].values.tolist())
    sL = [(int(n), c) for c, n in sC.items()]
    sL.sort()
    sL.reverse()

    print "\ntally_column_groups called: grouping '%s'" % col_name
    print "\t%d Values In: %d Categories Out" % (len(dF), len(sL))
    print "\tHead: %s" % str(sL[:5])
    print "\tTail: %s" % str(sL[-5:])

    return sL


def tally_report(dF):
    '''runs tally_column_groups on every column of the passed dF
    good fast summary
    '''
    for k in list(dF.columns.values):
        tally_column_groups(dF, k)
    print "\ntally_report Ending: Nothing Returned"


if __name__ == "__main__":

    #dF = thousands_to_dataframe()
    #dF = thousands_to_dataframe(hour_filter=-6)
    #dF = thousands_to_dataframe(last_first=True, hour_filter=24)
    dF = thousands_to_dataframe(last_first=False, max_num_files=1)
    tally_report(dF)

    #over_18
    #dF = dF[~dF.over_18]

    #dF = dF[~dF["url"].str.contains("reddit")]
    #gF = dF[dF["url"].str.endswith('.gif')]
    #iF = dF[dF["url"].str.endswith('.jpg')]
    #dF = dF[["over_18", "url"]]

    #quick_report(dF, 5, 10)
    #quick_report(gF, 5, 100)
    #quick_report(iF, 5, 10)

    #imgList = iF.url.tolist()[:5]
    #print imgList

    #sD = "C:/ full path needed data_reddit/img/"
    #for img in imgList:
    #    sN = sD + img.split("/")[-1]
    #    urllib.urlretrieve(img, sN)
    #    print sN


'''
def subreddit_count(dF, n_most_common=None):
    #returns a list of tuples (name, number) of most_common subreddits in dF
    #if n_most_common == None: all are returned, else: n
    subCount = Counter(dF.subreddit.values.tolist()).most_common(n_most_common)
    return_count = sum([t[1] for t in subCount])
    print "\n%d values passed to subreddit_count" % len(dF)
    print "%d items represented in returned tuple list (%.1f%%)" % (
        return_count, 100 * float(return_count) / len(dF))
    print subCount
    return subCount
'''

'''
def media_source_count(dF):
    #returns a list of tuples (source, count) of outside media source from dF
    #requires: import re
    mediaList = dF.media_embed[dF.media_embed != "{}"].values.tolist()
    mediaList = [re.search('schema=(.*)" width', d).group(1) for d in mediaList]
    mediaList = Counter(mediaList).most_common()
    return_count = sum([m[1] for m in mediaList])
    print "\n%d records passed to media_source_count: %d link to outside media sources" % (
        len(dF), return_count)
    print mediaList
    return mediaList
'''
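
#Hedged usage sketch (not part of the original script): one way the tally
#output might be consumed, assuming a dF already built by
#thousands_to_dataframe() above and that "domain" is one of the column
#headings listed in the module docstring.
#domain_tally = tally_column_groups(dF, "domain")   #list of (count, name) tuples
#top_domains = [name for count, name in domain_tally[:10]]
#print "Top 10 domains by post count: %s" % top_domains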