'''
Created on Jan 21, 2015

@author: Brett Paufler
Copyright Brett Paufler

The file and directory paths will need to be fixed
as they have been obfuscated for posting
2020-03-24

Exploratory reports over the reddit "mass pull" data: each *_review()
function loads the data, tallies one column and prints (and sometimes
charts) what it finds.

All output uses single-argument print(...) so the module produces the same
text under Python 2 and also parses under Python 3.
'''

import pandas as pd
from reddit_offline import quick_report, tally_report, tally_column_groups
import matplotlib.pyplot as plt
#import datetime

#Location of the mass pull data (obfuscated for posting; fix before running).
#Other commented-out path settings kept from the original:
#dfr = "C:\\data_reddit\\",
#mPath = r"C:\\mass_pull.txt"
#mPath = r"C:\\\mass_pull.txt"
MASS_PULL_PATH = r"C:\mass_pull.txt"

#Length of the capture that time_review() assumes: 5h 25m 58s, in seconds.
#NOTE(review): hard-coded for one particular pull - confirm for other data.
CAPTURE_SECONDS = 5 * 60 * 60 + 25 * 60 + 58


def _mean(values):
    '''returns the arithmetic mean of values as a float,
    or NaN when values is empty (rather than dividing by zero)
    '''
    values = list(values)
    if not values:
        return float('nan')
    return float(sum(values)) / len(values)


def _histogram(values, min_size):
    '''returns a list where item i is how many times i occurs in values

    values must be non-negative integers; the list has at least min_size
    items and grows as needed so a large value can never index past the end
    '''
    values = list(values)
    size = int(max([min_size] + [v + 1 for v in values]))
    counts = [0] * size
    for v in values:
        counts[v] += 1
    return counts


def _total(pairs):
    '''returns the sum of the first item of every (count, label) pair'''
    return sum([a for a, b in pairs])


def load_mass_pull(mPath=MASS_PULL_PATH):
    '''returns the mass_pull.csv as a pd.Dataframe

    mPath: csv file to read, defaults to MASS_PULL_PATH

    The first column becomes the index and the 'time' column is converted
    from epoch seconds to timestamps. A quick_report of the frame is
    printed as a side effect.
    '''
    #read_csv(index_col=0, parse_dates=True) is the documented replacement
    #for the deprecated (and since removed) pd.DataFrame.from_csv
    dF = pd.read_csv(mPath, index_col=0, parse_dates=True)
    dF['time'] = pd.to_datetime(dF['time'], unit='s')
    quick_report(dF, 5, 25)
    return dF


def timespan(dF):
    '''returns the total timespan of a pd.Dataframe in mass_pull format

    Prints the earliest time, the latest time and their difference,
    then returns the difference.
    '''
    start = dF['time'].min()
    finish = dF['time'].max()
    timeDelta = finish - start
    print("Start Time: %s" % start)
    print("Finis Time: %s" % finish)
    print("TIME SPAN: %s" % timeDelta)
    return timeDelta


def author_review():
    '''prints statistics on posts per author, with a rough look at bots

    The tally is used as (post count, author name) pairs.
    NOTE(review): that pair layout is inferred from how the values are
    used here - confirm against reddit_offline.tally_column_groups.
    '''
    dF = load_mass_pull()
    a = tally_column_groups(dF, 'author')
    #44997 values, 29902 authors, total
    #7533 being attributed to 'None'
    a = a[1:]    #kills the initial 'None' Grouping
    print(a[:100])

    topTen = [p for p, v in a][:10]
    print("topTen %s" % topTen)

    totalAve = _mean([p for p, v in a])
    print("TOTAL AVE: %.2f" % totalAve)

    #NOTE(review): this test is case sensitive while the later bot tests
    #lower-case the name first - left as is so the figure does not change
    botAve = _mean([p for p, v in a if 'bot' in v])
    print("BOT AVE: %.2f" % botAve)

    #an author counts as a bot when the name contains any of these words
    botList = ['bot', 'reddit', 'auto', 'moderator', 'roto', 'download']
    bL = [p for p, v in a if any(b in v.lower() for b in botList)]
    bots = len(bL)
    botPosts = sum(bL)
    #print bL
    botListAve = _mean(bL)
    print("BOT LIST AVE: %.2f" % botListAve)

    underTenAve = _mean([p for p, v in a if p < 10])
    print("Under Ten AVE: %.2f" % underTenAve)

    underTenBotAve = _mean([p for p, v in a
                            if ('bot' in v.lower()) and (p < 10)])
    print("BOT Under Ten AVE: %.2f" % underTenBotAve)

    #count[n] = number of authors with exactly n posts (150 slots minimum)
    count = _histogram([p for p, v in a], 150)
    print("COUNT")
    print(count)

    print("%d Bots Were Deemed to Make %d Posts" % (bots, botPosts))
#author_review()


def over_18_review():
    '''prints the over_18 tally and saves it as a pie chart

    Writes stream_explore_over_18_pie.png to the working directory.
    '''
    dF = load_mass_pull()
    a = tally_column_groups(dF, 'over_18')
    #NOTE(review): the index labels assume the tally comes back
    #False first, True second - confirm
    dF = pd.DataFrame(a, columns=["Count", "NSFW"], index=['False', 'True'])
    dF = dF.drop("NSFW", axis=1)
    print(dF)

    #Chart Output
    s = 5
    dF.plot(kind='pie', figsize=(s, s), legend=False,
            title="NSFW Posts", subplots=True)    #, fontsize=5)
    #call xlabel (the original assigned to it, replacing the function)
    plt.xlabel("")
    plt.ylabel("")
    sN = "stream_explore_over_18_pie.png"
    plt.savefig(sN)


def time_review():
    '''prints how many seconds saw 0, 1, 2... new submissions

    The tally of the 'time' column is used as (submission count, time)
    pairs. Seconds with no submission never show up in that tally, so the
    zero slot is filled in from the hard-coded CAPTURE_SECONDS.
    '''
    dF = load_mass_pull()
    tL = tally_column_groups(dF, 'time')
    start = dF['time'].min()
    finish = dF['time'].max()
    print(start)
    print(finish)
    print(finish - start)

    #count[n] = number of seconds holding exactly n submissions
    count = _histogram([t for t, l in tL], 11)
    #dF = pd.DataFrame(t)
    #print t
    #for
    #quick_report(dF, 25, 25)
    count[0] = CAPTURE_SECONDS - sum(count)
    print(count)
    print(sum(count))
    print(CAPTURE_SECONDS)
    dF = pd.DataFrame(count, columns=["New Submissions per Second"])
    print(dF)

    #weighted average: submissions per second over the whole capture
    tot = sum(count)
    adder = [n * perSecond for perSecond, n in enumerate(count)]
    ave = sum(adder) / float(tot)
    print("%s %s %s" % (tot, adder, ave))
    #sanity check of the multiply-and-zip used above, kept from original
    print([a * b for a, b in zip([1, 2], [0, 1])])

    #Disabled chart output, kept for reference:
    ##Chart Output
    #s = 5
    #dF.plot(kind='bar', figsize=(s,s), legend=False, subplots=True) #, fontsize=5)
    #plt.xlabel("")
    #plt.ylabel("")
    #sN = "stream_explore_posts_per_second_bar.png"
    #plt.savefig(sN)
    ##title="Posts per Second",


def domain_review():
    '''prints a progressively reduced tally of posts per domain

    Steps: fold self/reddit domains into one 'self' entry, fold domains
    with ten or fewer posts into per-count buckets, fold the big hosts
    (imgur, youtube, tumblr, wikipedia, google) into one entry each, then
    print and chart what is left. "Keeping it Honest" lines print the
    running total so a reduction that loses or double counts posts shows.
    '''
    dF = load_mass_pull()
    rawTally = tally_column_groups(dF, 'domain')
    print("Keeping it Honest: %d" % _total(rawTally))

    def isSelf(domain):
        return domain.startswith('self') or ('redd' in domain)

    #self posts and reddit's own domains become a single entry
    tally = [(a, b) for a, b in rawTally if not isSelf(b)]
    selfCount = sum([a for a, b in rawTally if isSelf(b)])
    tally.append((selfCount, 'self'))
    print("selfCount: %d" % selfCount)
    print("Keeping it Honest: %d" % _total(tally))

    #domains with 1..10 posts are merged into one bucket per post count
    breakOut = [(1, 'Ones'), (2, 'Twos'), (3, 'Three'), (4, 'Four'),
                (5, 'Five'), (6, 'Six'), (7, 'Seven'), (8, 'Eight'),
                (9, 'Nine'), (10, 'Ten')]
    shortList = [(sum([n for a, b in tally if a == n]), label)
                 for n, label in breakOut]
    longList = [(a, b) for a, b in tally if a > 10]
    tally = shortList + longList
    print("ShortList: %s" % shortList)
    print("Keeping it Honest: %d" % _total(tally))

    #(text to look for, label of the merged entry); the same text decides
    #both what is added to a group and what is removed from the tally, and
    #a domain only ever joins the first group it matches, so nothing can
    #be counted twice
    hosts = [('imgur', 'imgur'), ('youtu', 'youtube'), ('tumb', 'tumblr'),
             ('wiki', 'wikipedia'), ('goog', 'google')]
    hostCount = dict([(label, 0) for needle, label in hosts])
    remainder = []
    for a, b in tally:
        for needle, label in hosts:
            if needle in b:
                hostCount[label] += a
                break
        else:
            remainder.append((a, b))
    tally = remainder
    print("Keeping it Honest: %d" % _total(tally))
    merged = [(hostCount[label], label) for needle, label in hosts]
    imgur, youtube = merged[0], merged[1]
    print("%s %s" % (imgur, youtube))
    tally += merged

    tally.sort(reverse=True)
    print(tally)
    print(len(tally))
    print("Keeping it Honest: %d" % _total(tally))

    rawTally = sorted(rawTally, reverse=True)[:20]
    print(rawTally)

    #drop the small-count buckets, keep the (up to) 100 biggest entries
    killList = [label for n, label in breakOut]
    important = [(a, b) for a, b in tally if b not in killList]
    print(len(tally))
    print(len(important))
    important = important[:100]

    gD = important    #[:10]
    index = [t for n, t in gD]
    count = [n for n, t in gD]
    tF = pd.DataFrame(count, index=index, columns=['Posts'])
    #tF = tF.set_index('Domain')
    print(tF)
    ax = tF.plot(kind='bar', figsize=(20, 10), legend=False, fontsize=5,
                 title="Posts per Domain (Top Ten Final Reduction)")
    ax.grid(False)
    sN = "stream_explore_posts_per_domain_final_top_10.png"
    #plt.savefig(sN)
    print("HTML \n\n\n \n")
    print(tF.to_html())
    print("Keeping it Honest: %d" % _total(tally))


def subreddit_review():
    '''prints selected slices of the posts per subreddit tally

    The tally is used as (post count, subreddit name) pairs; name tests
    are case insensitive.
    '''
    dF = load_mass_pull()
    tally = tally_column_groups(dF, 'subreddit')

    def named(text):
        '''returns the tally entries whose name contains text'''
        return [(a, b) for a, b in tally if text in b.lower()]

    python = named('python')
    gifs = [(a, b) for a, b in named('gif') if 'gift' not in b.lower()]
    popular = [(a, b) for a, b in tally if a >= 200]
    significant = [b for a, b in tally if a >= 10]
    programming = named('programming')
    this_one = named('this_one')
    that_one = named('that_one')

    print("TALLY: %s" % tally[:50])
    print("Python: %s" % python)
    print("GiF: %s" % gifs)
    print("Programming (%d): %s" % (len(programming), programming))
    print("PoPular (%d): %s" % (len(popular), popular))
    print("\n\nSIGNIFICANT: \n")
    print(len(significant))
    print(" ".join(significant))
    print("this_one: %s" % this_one)
    print("that_one: %s" % that_one)
    print("RedditDev: %s" % named('redditdev'))

    #Disabled chart output, kept for reference:
    ##Chart Programming
    #gF = pd.DataFrame(programming, columns=["", "Reddits with 'Programming' in Name"]) #count, index=index, columns=['Posts'])
    #gF = gF.set_index("Reddits with 'Programming' in Name")
    #print(gF)
    #plt.xlabel("")
    #plt.ylabel("")
    #gF.plot(kind='pie', figsize=(5,5), legend=False, title="SubReddits with 'Programming' in Name", subplots=True, fontsize=5)
    #sN = "stream_explore_programming"
    #plt.savefig(sN)
    #print(gF.to_html())

    ##Chart Popular
    #gF = pd.DataFrame(popular, columns=["", "Most Popular"]) #count, index=index, columns=['Posts'])
    #gF = gF.set_index('Most Popular')
    #print(gF)
    #plt.xlabel("")
    #plt.ylabel("")
    #gF.plot(kind='pie', figsize=(5,5), legend=False, title="Most Popular Subreddits", subplots=True, fontsize=5)
    #sN = "stream_explore_most_popular.png"
    #plt.savefig(sN)
    #print(gF.to_html())

    ##Chart GiFs
    #gF = pd.DataFrame(gifs, columns=["", "GiFs"]) #count, index=index, columns=['Posts'])
    #gF = gF.set_index('GiFs')
    #print(gF)
    #plt.xlabel("")
    #plt.ylabel("")
    #gF.plot(kind='pie', figsize=(5,5), legend=False, title="GiF Subreddits", subplots=True, fontsize=5)
    #sN = "stream_explore_subreddits_gifs.png"
    #plt.savefig(sN)


if __name__ == "__main__":
    #dF = load_mass_pull()
    #tally_report(dF)
    subreddit_review()
    print(1725.0 / 44997)