# -*- coding: utf-8 -*-
'''
Created on Dec 16, 2014
@author: Brett Paufler
Copyright Brett Paufler

Pulls 1000 new and 1000 front_page submissions
and saves all sorts of data on them.

v1.3 12-30-14
pulled praw_submission_to_dict out as a separate function

v1.2 new & front page only

v1.1 12-29-14
Fixed slow to_csv call:
dF.to_csv(path_or_buf=sN, encoding='utf-8')
was very slow, so all text is force-encoded to ascii instead,
which works much faster
old data presumed dead 12-29-14

No longer recording:
"media_embed": s.media_embed,
'''
import datetime

import praw
import pandas as pd

from reddit_offline import quick_report


def time_label():
    '''Returns a date string for 'now', used as a file name prefix
    so saved pulls sort chronologically, e.g. "2014-12-22-10-21-35".
    '''
    # strftime avoids the old isoformat()[:-7] slice, which broke
    # whenever now() landed on a whole second (no microseconds to trim)
    return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")


def praw_submission_to_dict(s, category_name):
    '''Given a praw submission object (s), returns a dictionary of values
    usable as a row in a pandas DataFrame.

    category_name is the name of the pull (new, front_page, games, pr_n).
    '''
    try:
        # scrub s.selftext: control/whitespace characters break the csv layout
        sText = s.selftext
        killList = ["\n", "\r", "\t", "\0"]
        for k in killList:
            sText = sText.replace(k, " ")

        # everything textual is force-encoded to ascii
        # (see the v1.1 note above on to_csv speed)
        row = {"id": s.id.encode('ascii', 'replace'),
               "category": category_name.encode('ascii', 'replace'),
               "title": s.title.encode('ascii', 'replace'),
               "selftext": sText.encode('ascii', 'replace'),
               "domain": s.domain.encode('ascii', 'replace'),
               "url": s.url.encode('ascii', 'replace'),  # img link
               "permalink": s.permalink.encode('ascii', 'replace'),
               "subreddit": s.subreddit.display_name.encode('ascii', 'replace'),
               "author": str(s.author).encode('ascii', 'replace'),
               "num_comments": s.num_comments,
               "score": s.score,
               "over_18": s.over_18,
               "created_utc": s.created_utc,  # time stamp
               }
        print row
        return row
    except AttributeError:
        # submissions missing an expected attribute are skipped (returns None)
        print "\tBad Egg: Incomplete Submission %s" % category_name


def get_thousands(tako, n):
    '''Pulls up to n submissions for each of the listed categories,
    saves them all to a single csv, and returns the DataFrame.
    '''
    categoryDictionary = {"new": tako.get_new(limit=n),
                          "front_page": tako.get_front_page(limit=n),
                          #"controversial": tako.get_controversial(limit=n),
                          #"rising": tako.get_rising(limit=n),
                          #"top": tako.get_top(limit=n),
                          }
    subs = []
    for catName, catGen in categoryDictionary.items():
        print "GETTING %s" % catName
        for s in catGen:
            row = praw_submission_to_dict(s, catName)
            if row:  # skips the Bad Eggs, which come back as None
                subs.append(row)

    dF = pd.DataFrame(subs)
    sN = "C:/%s_thousands.txt" % time_label()
    print "IN PROCESS OF SAVING PANDAS: %s" % sN
    dF.to_csv(path_or_buf=sN)
    quick_report(dF)

    print "Got %d Links" % len(subs)
    print "Thousands Ending"
    return dF


###
### thousands Above, update_subreddits Below
###

if __name__ == "__main__":
    tako = praw.Reddit(user_agent="unknown")
    get_thousands(tako, 1000)
    print datetime.datetime.now()

    #TODO - subReddit Specific Pulls with memory
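
# A minimal usage sketch (not part of the original script): reloading a
# saved pull for a quick look. The file name below is hypothetical;
# substitute a real csv written by get_thousands().
#
#   import pandas as pd
#   dF = pd.read_csv("C:/2014-12-22-10-21-35_thousands.txt", index_col=0)
#   print dF["subreddit"].value_counts().head(10)      # busiest subreddits
#   print dF.groupby("category")["score"].describe()   # score spread per pull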