''' Created on Dec 27, 2014
@author: Brett Paufler
Copyright Brett Paufler

Either run as main (or via a call to update_all()) updates the manifest
listing of all subreddits in subreddits_list for further processing.

See text files for further info:
    Desktop/PROGRAMS/data_reddit/subreddit_name.txt
'''
import json
import os
import re
import time

import pandas as pd
import praw

from reddit_offline import quick_report
from reddit_thousands import praw_submission_to_dict
#import datetime

#THESE ARE THE SUBREDDITS TO BE UPDATED
subreddits_list = ["Some_Subreddit", ]

# Directory holding the per-subreddit manifest .txt files.
mP = "C:/dir_full_path/"

# Age (seconds, roughly two weeks) after which a post's score is final.
FINAL_AGE_SECONDS = 1250000

tako = praw.Reddit(user_agent="get your own")


def check_if_new(sub):
    '''Creates new manifest for subreddit if none exists.

    sub.txt made in dir_full_path, seeded with the manifest CSV header.
    '''
    fN = "%s%s.txt" % (mP, sub)
    if not os.path.isfile(fN):
        with open(fN, "w") as f:
            f.write("id,time,score,comm,author,title,final\n")
        print("NEW FILE: %s" % fN)
    else:
        print("EXISTS %s" % fN)


def scrub_text(text):
    '''Collapses text to ascii alphanumerics separated by single spaces.

    Never raises: returns 'ASCII FAIL' if the value cannot be coerced,
    so manifest rows always get a printable, comma-free field.
    '''
    try:
        return re.sub(r'[^a-zA-Z0-9]+', ' ',
                      str(text).encode('ascii', 'ignore'))
    except Exception:
        return 'ASCII FAIL'


def manifest_data(sO):
    '''Reduces a praw reddit submission object to a tuple of
    (id, created_utc, score, num_comments, author, title).
    '''
    return (sO.id,
            sO.created_utc,  # time stamp
            sO.score,
            sO.num_comments,
            scrub_text(sO.author),
            scrub_text(sO.title),
            )


def pull_new_posts(sub):
    '''Appends not-yet-seen posts from sub's /new listing to its manifest.

    Stops at the first post already recorded (listing is newest-first).
    '''
    fN = "%s%s.txt" % (mP, sub)
    previous = pd.read_csv(fN)['id'].tolist()
    print(previous)

    # NOTE: renamed the loop variable — the original shadowed it with
    # the unpacked score field.
    for submission in tako.get_subreddit(sub).get_new(limit=1000):
        postID, utc, score, comm, author, title = manifest_data(submission)

        # BUG FIX: age is now - created_utc.  The original computed
        # utc - time.time(), which is negative for any past post, so
        # finalCode could never be 99 here.
        if time.time() - utc > FINAL_AGE_SECONDS:
            finalCode = 99  # already old enough to be final
        else:
            finalCode = 0

        data = "%s,%d,%d,%d,%s,%s,%d\n" % (
            postID, utc, score, comm, author, title, finalCode)
        if postID not in previous:
            with open(fN, 'a') as f:
                f.write(data)
            print("%s: Pulled: %s" % (sub, data))
        else:
            print("%s: REPEAT: %s" % (sub, data))
            break


def update_old_posts(sub):
    '''Gets final score, comments on 2 week + old posts.

    'final' == 99 marks a finished row; lookup failures increment the
    retry counter stored in 'final' and give up after 5 attempts.
    '''
    fN = mP + sub + ".txt"
    dF = pd.read_csv(fN, index_col=0)

    #limits check to those posts 2 weeks old (+ or -, roughly)
    timeOffset = max(dF['time']) - FINAL_AGE_SECONDS
    uF = dF[dF['time'] <= timeOffset]
    uF = uF[uF['final'] < 99]
    quick_report(uF, 5, 25)

    #Guts nested in a Try if reddit fails or post disappears
    for subID in uF.index.values:
        print('UPDATING: %s (contacting reddit)' % subID)
        tryNum = dF.loc[subID, 'final']
        try:
            submission = tako.get_submission(submission_id=subID)
            postID, _, score, comments, _, _ = manifest_data(submission)
            assert postID == subID  # Probably Useless in the Try...
            dF.loc[subID, 'comm'] = comments
            dF.loc[subID, 'score'] = score
            dF.loc[subID, 'final'] = 99
        except Exception:
            print('FAILED UPDATE #%d for: %s' % (tryNum, subID))
            tryNum += 1
            if tryNum > 5:
                # Give up: mark final so the row is never retried.
                dF.loc[subID, 'final'] = 99
            else:
                dF.loc[subID, 'final'] = tryNum

    quick_report(dF, 5, 25)
    dF.to_csv(fN)
    print('FINISHED: update_old_posts(), %d updated' % len(uF))


def base_pull(sub):
    '''Convenience lumping function: a call to one, rather than several.'''
    check_if_new(sub)
    pull_new_posts(sub)
    update_old_posts(sub)


if __name__ == '__main__':
    #for sub in subreddits_list:
    #    base_pull(sub)

    print("Explore Post Deeper")
    subID = 'nope'
    subID = 'a real subID'
    tako.config.store_json_result = True
    s = tako.get_submission(submission_id=subID)
    print(s)
    #print s.comments

    with open(subID + '_json.txt', 'w') as f:
        print("Pickle Starting %s" % subID)
        json.dump(s.json_dict, f)
        print("Pickle Ending %s" % subID)

    with open(subID + '_comments.txt', 'w') as f:
        for c in s.comments:
            print(c)
            f.write(str(c))


def subreddit_update_manifest(tako, subreddit):
    '''checks for new submissions and updates manifest

    nothing returned
    tako = reddit client crawler
    subreddit = sub to crawl, outputed to subreddit.txt (no limit on size)
    '''
    print("UPDATING MANIFEST: %s" % subreddit)
    manifest_path = "C:/ reddit/%s.txt" % subreddit

    #creates new subreddit.txt file if one does not exist
    if not os.path.isfile(manifest_path):
        with open(manifest_path, "w") as f:
            header = ",author,created_utc,id,num_comments,over_18,score,title\n"
            f.write(header)

    existing = pd.read_csv(manifest_path)  #, index=id)
    if any(existing['created_utc']):
        print(max(existing['created_utc']))

    #Pulls submissions until the limit or a repeat is hit
    # BUG FIX: count is initialized once, before the loop.  The original
    # reset it to 0 on every iteration, so the final tally was never
    # more than 1.
    count = 0
    for sub in tako.get_subreddit(subreddit).get_new(limit=5):
        s = praw_submission_to_dict(sub, subreddit)

        #deleting categories from default praw_submission_to_dict
        #not needed for the manifests (self text takes too much memory)
        del s['category']
        del s['domain']
        del s['permalink']
        del s['selftext']
        del s['subreddit']
        del s['url']

        existing_ids = existing['id'].tolist()
        if s["id"] not in existing_ids:
            count += 1
            print("ADDING (%s): %r" % (subreddit, sub))
            s = pd.DataFrame([s])
            # Removed dead `s.set_index('id')` — its result was
            # discarded, so the row is (still) written with the default
            # integer index, matching the header's leading empty column.
            quick_report(existing, 5)
            s.to_csv(path_or_buf=manifest_path, header=False, mode='a')
            existing = pd.concat([existing, s], ignore_index=True)
        else:
            print("REPEAT[BREAK LOOP](%s): %r" % (subreddit, sub))
            break

    quick_report(existing, 5, 50)
    print("FINISHED UPDATE: %s (%d added)" % (subreddit, count))


def update_all(sr_list=subreddits_list):
    '''runs subreddit_update_manifest for all subs in sr_list

    BUG FIX: iterates the sr_list argument — the original ignored the
    parameter and always walked the module-level subreddits_list.
    '''
    tako = praw.Reddit(user_agent="should have used a variable")
    for subreddit in sr_list:
        subreddit_update_manifest(tako, subreddit)


#def update_last_week():
#import sqlite3