'''
Created on Mar 11, 2014
@author: Brett Paufler
copyright Brett Paufler

WebCrawl Utilities

Downloads the imgur front page, scrapes gallery metadata (gallery id,
title, points, views) out of the saved HTML, and maintains a rolling
pickled archive of everything seen so far.

NOTE(review): this module was recovered from a whitespace-mangled,
Python-2-era source.  It has been ported to Python 3 (urllib2 ->
urllib.request, print statements -> print(), binary-mode pickling).
The HTML-splitting marker in imageFrontPageToDataTextFile was lost in
the original and has been reconstructed -- see the NOTE there.
'''
import os
import re
import pickle
import urllib.request
# Kept from the original import block even though currently unused here;
# other parts of the project may rely on them being imported.
import string
import unicodedata


def makeOutputDirectory(a, letter=0):
    """Create output directory ``a`` (or sub-directory ``a/letter``) if needed.

    a      -- base directory name
    letter -- optional sub-directory name; falsy (default 0) means "just a"
    """
    print("makeOutputDirectory called")
    # Original built the path by string concatenation with ".\\" separators
    # (Windows only, and in the wrong order); os.path.join is portable.
    target = os.path.join(a, letter) if letter else a
    if not os.path.exists(target):
        os.makedirs(target)
        print("Directory " + target + " was created")
    else:
        print("Directory " + target + " already existed")


def createEmptyGalleryObject():
    """Return a fresh gallery record dict with all fields zeroed/blank."""
    return {
        "views": 0,
        "points": 0,
        "weight": 0,
        "title": "",
        "gallery": "",
        "c1Text": "",
        "c2Text": "",
        "c3Text": "",
        "poster": "",
        "c1poster": "",
        "c2poster": "",
        "c3poster": "",
    }


def downloadWebpage(thisURL="http://imgur.com",
                    name="imgurFrontPage.html",
                    dirThis="imgur"):
    """Download ``thisURL`` and save it as ``dirThis/name`` (or just ``name``).

    The result is a static snapshot: no scripts run, so dynamically loaded
    content (often the pictures) is not included.
    """
    print("downloadWebpage called")
    localName = os.path.join(dirThis, name) if dirThis else name
    # with-blocks guarantee both the HTTP response and the local file are
    # closed; the original leaked the urlopen handle.
    with urllib.request.urlopen(thisURL) as response:
        with open(localName, "wb") as localFile:
            localFile.write(response.read())
    if os.path.exists(localName):
        print(thisURL + " webpage saved as " + localName)


def imageFrontPageToDataTextFile(
        savedWebpageLocation=".//imgur//imgurFrontPage.html",
        outputFile=".//imgur//imgurFrontPageDatumFile.txt"):
    """Parse the saved imgur front page into a list of gallery dicts.

    Pickles the list to ``outputFile``, writes a human-readable repr()
    backup alongside it (*TEXTCBU.txt), and returns the list.

    Specialized for imgur front-page markup; will not work on e.g. reddit.
    """
    print("imageFrontPageToDataTextFile(): starting")

    with open(savedWebpageLocation, "r") as rawFile:
        fileString = rawFile.read()
    # Flatten all whitespace to single spaces so substring searches work.
    for junk in ("\n", "\t", "\r"):
        fileString = fileString.replace(junk, " ")
    while "  " in fileString:
        fileString = fileString.replace("  ", " ")

    imgurFrontPageDataArray = []
    # NOTE(review): the original split delimiter was lost to file
    # corruption.  '<a ' is reconstructed from the surviving check that
    # each piece starts with 'href=' -- confirm against a saved page.
    for piece in fileString.split("<a "):
        if not piece.startswith("href="):
            print("This piece of page rejected")
            continue
        temp = createEmptyGalleryObject()

        # --- gallery id: the href between 'href="' and '">' ---
        match = re.search(re.escape('href="') + "(.*?)" + re.escape('">'),
                          piece)
        # Original called .group(1) unconditionally and crashed on no match.
        if match:
            urlString = match.group(1)
            if urlString.startswith("/gallery/"):
                temp["gallery"] = urlString.replace("/gallery/", "")

        # --- title: text of the title="..." attribute ---
        piece = piece[piece.find("title"):]
        piece = piece.replace('title="', "")
        temp["title"] = piece[:piece.find('"')]

        # --- points: first tag body after the word "points" ---
        piece = piece[piece.find("points"):]
        piece = piece[piece.find(">") + 1:]
        temp["points"] = piece[:piece.find("<")]

        # --- views: text between ": " and the word "views" ---
        piece = piece[piece.find(":") + 2:]
        temp["views"] = piece[:piece.find("views")]

        # Strip HTML garbage out of the title before storing.
        temp["title"] = scrubHTML(temp["title"])
        temp["weight"] = 0
        imgurFrontPageDataArray.append(temp)
        print("NEXT")

    # Drop records with any missing field; saves existence checks downstream.
    for field in ("gallery", "title", "points", "views"):
        imgurFrontPageDataArray = [x for x in imgurFrontPageDataArray
                                   if x[field] != ""]

    # Pickle requires a binary-mode file (the original used text mode).
    with open(outputFile, "wb") as textOut:
        pickle.dump(imgurFrontPageDataArray, textOut)

    # Human-readable backup copy, one repr()'d record per line.
    cbuName = outputFile[:-4] + "TEXTCBU.txt"
    with open(cbuName, "w") as CBUtextOut:
        for datum in imgurFrontPageDataArray:
            CBUtextOut.write(repr(datum) + "\n")

    print(repr(imgurFrontPageDataArray))
    print("imageFrontPageToDataTextFile(): FINISHING")
    return imgurFrontPageDataArray


def scrubHTML(text):
    """Reduce ``text`` to lowercase letters and single spaces.

    Strips common HTML-entity garbage, then drops every character that is
    not a-z or space, and finally collapses/trims whitespace.  Used to
    clean title and comment entries.
    """
    # Common garbage; add more entries as needed.
    text = text.replace(";quot;", " ")
    text = text.replace(";#039;", " ")
    text = text.replace("&", " ")
    text = text.lower()
    # Whitelist: lowercase letters and space -- everything else is dropped.
    allowed = set("abcdefghijklmnopqrstuvwxyz ")
    text = "".join(ch for ch in text if ch in allowed)
    # split()/join collapses any run of spaces and trims the ends in one
    # pass (the original's fixed 10-iteration replace loop could miss very
    # long runs).
    return " ".join(text.split())


def initialImgurCrawl():
    """First crawl of the day.

    Saves the imgur front page as imgurFrontPage.html, writes the raw
    analytical data to imgurFrontPageDatumFile.txt, and returns the
    parsed data list.
    """
    print("Initial Crawl called")
    # Make the directory if none exists.
    makeOutputDirectory("imgur")
    # Touch rollingGalleryData.txt if none exists.
    rollingPath = os.path.join("imgur", "rollingGalleryData.txt")
    if not os.path.exists(rollingPath):
        open(rollingPath, "wb").close()
    downloadWebpage()  # imgur front page
    testArray = imageFrontPageToDataTextFile()
    print(testArray)
    print("Initial Crawl Returned the Above")
    return testArray


def frontPageReadtoRollingGalleryFile():
    """Merge the freshly crawled front page into the rolling gallery pickle.

    Records are de-duplicated by their "gallery" id; only ids not already
    in the rolling archive are appended.  Also refreshes the repr() text
    backup of the archive.
    """
    print("def frontPageReadtoRollingGalleryFile(): called")
    frontPagePath = os.path.join("imgur", "imgurFrontPageDatumFile.txt")
    rollingPath = os.path.join("imgur", "rollingGalleryData.txt")

    # Original used a bare except around the whole body; catch only the
    # errors a missing/garbled datum file can actually raise.
    try:
        with open(frontPagePath, "rb") as rawFrontPage:
            newData = pickle.load(rawFrontPage)
    except (OSError, EOFError, pickle.PickleError):
        print("NO imgurFrontPageDatumFile")
        print("Crawl Loaded... or You're running the wrong file")
        return
    print("newData")
    print(repr(newData))
    print(len(newData))

    # A missing or empty rolling file just means "no history yet".
    try:
        with open(rollingPath, "rb") as rollingGalleryData:
            oldData = pickle.load(rollingGalleryData)
    except (OSError, EOFError, pickle.PickleError):
        oldData = []
    print("OlD Data Values")
    print(repr(oldData))
    print(len(oldData))

    # Set-based membership replaces the original O(new*old) nested loops.
    knownGalleries = {item["gallery"] for item in oldData}
    uniqueData = [item for item in newData
                  if item["gallery"] not in knownGalleries]
    print("uniqueData")
    print(repr(uniqueData))

    print("updatedData")
    updatedData = oldData + uniqueData
    print(repr(updatedData))

    with open(rollingPath, "wb") as rollingGalleryData:
        pickle.dump(updatedData, rollingGalleryData)
    print("After Pickle")
    print(repr(updatedData))

    cbuPath = os.path.join("imgur", "rollingGalleryDataTEXTCBU.txt")
    with open(cbuPath, "w") as rollingGalleryText:
        for datum in updatedData:
            rollingGalleryText.write(repr(datum) + "\n")
    print("def frontPageReadtoRollingGalleryFile(): ended")