'''
Created on Jan 15, 2015
@author: Brett Paufler
(c) Copyright Brett Paufler

stream_view() runs for an hour, saving the stream to a csv
'''
import praw
import pandas as pd
import datetime
import time

from reddit_thousands import time_label
from reddit_offline import quick_report


def stream_info(s):
    '''returns one csv line (id,domain,subreddit,author,over_18,time)
    for the given submission'''
    sub = ",".join([s.id.encode('ascii', 'replace'),
                    s.domain.encode('ascii', 'replace'),
                    s.subreddit.display_name.encode('ascii', 'replace'),
                    str(s.author).encode('ascii', 'replace'),
                    str(s.over_18),
                    str(s.created_utc),
                    ])
    return sub


def stream_view(sleepTime=300, postsToPull=1000, runTime=4000):
    '''partial view of the reddit stream (new)
    With Duplicates: no effort is made to screen out duplicates

    sleepTime is in seconds: 5 for test, 300 (five minutes) for a run
    postsToPull: 5 for test, 1000 for a run
    runTime: 30 for test, 3600 (one hour) for a run

    Typical Usage
    TEST: stream_view(sleepTime=5, postsToPull=5, runTime=30)
    MID:  stream_view(sleepTime=60, postsToPull=100, runTime=600)
    RUN:  stream_view(sleepTime=300, postsToPull=1000, runTime=3600)
    '''
    tako = praw.Reddit(user_agent="nope")

    #initializes the csv output file
    sN = "save_path/%s_%s.txt" % (runTime, time_label())
    f = open(sN, "w")
    f.write("id,domain,subreddit,author,over_18,time\n")

    #list of posts reviewed
    #subIds = []

    start = datetime.datetime.now()
    finish = start + datetime.timedelta(seconds=runTime)
    print "Starting: %s \t Ending: %s" % (start, finish)

    #main timer loop: pull, write, sleep, repeat until runTime is up
    while datetime.datetime.now() < finish:
        newSubs = tako.get_new(limit=postsToPull)
        for s in newSubs:
            text = stream_info(s)
            print text
            f.write(text)
            f.write("\n")
        f.flush()
        print "Now: %s \t Sleep: %s \t End: %s" % (
            datetime.datetime.now(), sleepTime, finish)
        time.sleep(sleepTime)
    f.flush()
    f.close()
    print "ALL ENDS THAT ENDS WELL: %s" % datetime.datetime.now()
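
#A minimal sketch of the duplicate screening hinted at by the
#commented-out subIds list in stream_view() above: the same pull loop,
#but ids already written are kept in a set and repeats are skipped.
#This is a hypothetical variant, not part of the original runs; the
#user_agent string and save path are placeholders, as above.
def stream_view_unique(sleepTime=300, postsToPull=1000, runTime=3600):
    '''stream_view() variant that writes each submission only once'''
    tako = praw.Reddit(user_agent="nope")
    sN = "save_path/unique_%s_%s.txt" % (runTime, time_label())
    f = open(sN, "w")
    f.write("id,domain,subreddit,author,over_18,time\n")
    seenIds = set()  #ids already written this run
    finish = datetime.datetime.now() + datetime.timedelta(seconds=runTime)
    while datetime.datetime.now() < finish:
        for s in tako.get_new(limit=postsToPull):
            if s.id not in seenIds:
                seenIds.add(s.id)
                f.write(stream_info(s) + "\n")
        f.flush()
        time.sleep(sleepTime)
    f.close()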

def sixty_minutes_of_reddit(fN="C:/Users/etc"):
    '''returns a pandas data frame from a stream_view csv file
    of all data time-stamped within an hour of the last entry
    (i.e. gives a 60min view of the stream)

    fN = an absolute path "C:/Users/etc"
    '''
    dF = pd.DataFrame.from_csv(fN)
    quick_report(dF, 2, 25)

    #drop_duplicates: as it sounds, keeps only the unique rows
    dF = dF.drop_duplicates()
    quick_report(dF, 2, 25)

    #working backwards from the max time stamp,
    #limits the data to within an hour of it
    dF = dF[dF["time"] >= (max(dF["time"]) - 3600)]
    quick_report(dF, 2, 25)

    return dF

#fN = "C:/data_reddit/"
#fN += "stream_view_4000_2015-01-16-06-45-24.txt"
#dF = sixty_minutes_of_reddit(fN)


def next_sub_id(subId):
    '''given a reddit id, increments the id by one
    (base 36 over [0-9a-z]): XXXX7z to XXXX80, XXXyzz to XXXz00
    '''
    plusOne = True
    d = -1                                  #work from the last character back
    subId = [ord(x) for x in list(subId)]
    while plusOne:
        subId[d] += 1
        if subId[d] == 123:                 #past 'z': wrap to '0', carry left
            subId[d] = 48
            d -= 1
        elif subId[d] == 58:                #past '9': step up to 'a', done
            subId[d] = 97
            plusOne = False
        else:                               #plain increment, done
            plusOne = False
    subId = ''.join([chr(x) for x in subId])
    return subId


def mass_pull(start="6 digit alphanumeric id", howMany=10000,
              info=stream_info, addToExisting=True):
    '''pulls sequential reddit submissions and saves them to mass_pull

    start = reddit id to start the sequential pull from
            (ignored when addToExisting=True)
    howMany = number of sequential submissions to pull (if they exist)
    info = a function that turns a submission into a csv line
           (see stream_info for the format)

    mass_pull() resumes where it left off
    '''
    tako = praw.Reddit(user_agent="agent id")

    #resume from the old file, or start a new one
    if addToExisting:
        sN = "C:/data_reddit/mass_pull.txt"
        with open(sN, "r") as f:
            start = f.readlines()[-1].split(',')[0]
        subId = next_sub_id(start)
        print "mass_pull() resuming at %s" % start
        f = open(sN, "a")
    else:
        sN = "C:/data_reddit/mass_pull_%s_%d.txt" % (start, howMany)
        f = open(sN, "w")
        f.write("id,domain,subreddit,author,over_18,time\n")
        subId = start

    count = 0
    while count < howMany:
        try:
            sub = tako.get_submission(submission_id=subId)
            sub = info(sub)
            f.write(sub)
            f.write("\n")
            f.flush()
            print sub
        except Exception:
            print "Bad ID: %s" % subId
        subId = next_sub_id(subId)
        count += 1
    f.flush()
    f.close()
    print "MASS_PULL() FINISHED: %s %d" % (start, howMany)


if __name__ == "__main__":
    mass_pull()
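
#Cross-check for next_sub_id(): reddit ids are base 36 over [0-9a-z],
#so the same increment can be done with plain integer arithmetic.
#A minimal sketch (hypothetical helper, not used above); it assumes
#a lower-case id that does not overflow past all 'z's:
def next_sub_id_base36(subId):
    '''arithmetic equivalent of next_sub_id()'''
    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
    n = int(subId, 36) + 1        #reddit ids parse as base-36 integers
    out = ""
    while n:
        out = digits[n % 36] + out
        n //= 36
    return out.rjust(len(subId), "0")

#e.g. next_sub_id("2xxx7z") and next_sub_id_base36("2xxx7z")
#both give "2xxx80"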