"""
Created on Mar 11, 2014

@author: Brett Paufler
Copyright Brett Paufler

imageCrawler driver script (Python 2).

Everything below runs top to bottom as soon as the module is executed or
imported: the front-page crawl, the "page up" test, a second call to
imgur_logic.runLogic(), the gallery-page text dump and finally
imageManagement.resizePicture().
"""
# NOTE: import order is kept exactly as in the original script in case any of
# the project modules do work at import time.
import re
import urllib2
import os
import imgur_crawl
import sets
import pickle
import random
import imgur_logic
import webbrowser
import imgur_pageup
import Tkinter as tk
import Image
import ImageTk
import imageManagement

# Interesting REG EX
# '''href=["'](.[^"']+)["']'''

# NOTE: print calls use the single-argument parenthesised form, which prints
# exactly the same text under Python 2 as the bare print statement.
print("imageCrawler Started")

# TODO BRING THIS BACK
#


def frontPageCrawl():
    """Initial front page crawl, run when the program starts.

    Calls three imgur_crawl steps in order: the initial crawl, the front
    page pre-processing step and the step that adds the front page to the
    rolling gallery file. Progress is printed before and after.
    """
    print("frontPageCrawl(): Initiated")

    # Crawl imgur Gallery
    imgur_crawl.initialImgurCrawl()

    # Pre-Processes Front Page
    imgur_crawl.imageFrontPageToDataTextFile()

    # Adds Front page to Rolling Gallery File
    imgur_crawl.frontPageReadtoRollingGalleryFile()

    print("frontPageCrawl(): Finished")


frontPageCrawl()


def testerPageUpAfterInitialCrawl():
    """Run the gallery logic once and show the resulting gallery.

    Per the original note this processes the existing gallery database
    (presumably inside imgur_logic.runLogic() -- confirm there).

    Returns:
        Whatever imgur_logic.runLogic() returned, after it has been printed
        and handed to imgur_pageup.viewNextGallery().
    """
    print("testerPageUpAfterInitialCrawl(): Initiated")

    nextImgurGalleryObject = imgur_logic.runLogic()
    print(repr(nextImgurGalleryObject))

    imgur_pageup.viewNextGallery(nextImgurGalleryObject)

    # The original returned before this print, leaving it (and a second
    # `return nextImgurGalleryObject()` that would have *called* the gallery
    # object) unreachable. Log completion first, then return once.
    print("testerPageUpAfterInitialCrawl(): Finished")
    return nextImgurGalleryObject


testerPageUpAfterInitialCrawl()

# Save Imgur Page based on Next gallery
# Show Gallery
# Update database
# Wash Repeat

# This is redundant
nIGO = imgur_logic.runLogic()
print(repr(nIGO))

# This Pulls the Updated Page, saves at imgurGalleryPage
# Commented out while work out rest of code
# imgGalleryURL = "http://imgur.com/gallery/" + nIGO["gallery"]
# imgur_crawl.downloadWebpage(imgGalleryURL, "imgurGalleryPage.html", "imgur")

# Extracting Info from Saved Webpage -- IMGUR GALLERY
# AND THIS PROBABLY ISN'T GOING TO WORK WITHOUT A PLUG-IN OR LIBRARY EXTENSION


def imgurGalleryPageDataTextFile(savedWebpageLocation=".//imgur//imgurGalleryPage.html",
                                 outputFile=".//imgur//thisImgurPageDatumFile.txt"):
    """Read a saved gallery page, print it flattened, and write the data files.

    The page text is flattened to a single line (newlines, tabs and carriage
    returns become spaces, runs of spaces are collapsed) and printed. No data
    is extracted from the page yet, so the list that gets pickled, mirrored
    as text and returned is always empty.

    Args:
        savedWebpageLocation: path of the saved HTML page to read.
        outputFile: path the pickled list is written to. A plain-text copy
            goes to the same path with its last four characters replaced by
            "TEXTCBU.txt", so a four-character extension such as ".txt" is
            expected.

    Returns:
        The list of extracted gallery data (currently always []).

    Raises:
        IOError: if the saved page cannot be read or an output file cannot
            be written.
    """
    print("imgurGalleryPageDataTextFile(): starting")
    currentGalleryData = []

    with open(savedWebpageLocation, 'r') as rawFile:
        fileString = rawFile.read()

    # Flatten the page onto one line.
    for lineBreaker in ("\n", "\t", "\r"):
        fileString = fileString.replace(lineBreaker, " ")

    # The original looped ten times replacing a space with a space, which
    # changes nothing. Collapse every run of two or more spaces in one pass.
    fileString = re.sub(" {2,}", " ", fileString)
    print(fileString)

    # Disabled draft of the href extraction, kept as found (incomplete):
    #     fileBits = fileString.split("")]
    #     print piece
    #     if piece.startswith("href="):
    #         start = 'href="'
    #         end = '">'
    #         urlString = re.search(re.escape(start)+"(.*?)"+re.escape(end),piece).group(1)

    # `with` closes each handle even if dumping or writing raises.
    with open(outputFile, "w") as textOut:
        pickle.dump(currentGalleryData, textOut)

    # Text copy of the same data, one repr() per line.
    cbuName = outputFile[:-4] + "TEXTCBU.txt"
    with open(cbuName, "w") as CBUtextOut:
        for datum in currentGalleryData:
            CBUtextOut.write(repr(datum) + "\n")

    print(repr(currentGalleryData))
    print("imgurGalleryPageDataTextFile(): FINISHING")
    return currentGalleryData
# end imageGalleryPageDataTextFile


imgurGalleryPageDataTextFile()

# DON"T NEED THESE FOR NOW
# TODO BRING BACK OR REDUCE TO FUNCTION (probably not
imageManagement.resizePicture()
#imgur_pageup.runSavePictureButtonTKinputBox()

print("imageCrawler Finished")

#http://imgur.com/gallery/------