'''
Created on Mar 11, 2014
@author: Brett Paufler
copyright Brett Paufler

WebCrawl Utilities

Downloads the imgur front page, scrapes gallery metadata (gallery id,
title, points, views) out of the saved HTML, and maintains a rolling
pickled archive of everything seen so far.

NOTE(review): this module was recovered from a whitespace-mangled,
Python-2-era source.  It has been ported to Python 3 (urllib2 ->
urllib.request, print statements -> print(), binary-mode pickling).
The HTML-splitting marker in imageFrontPageToDataTextFile was lost in
the original and has been reconstructed -- see the NOTE there.
'''
import os
import re
import pickle
import urllib.request
# Kept from the original import block even though currently unused here;
# other parts of the project may rely on them being imported.
import string
import unicodedata


def makeOutputDirectory(a, letter=0):
    """Create output directory ``a`` (or sub-directory ``a/letter``) if needed.

    a      -- base directory name
    letter -- optional sub-directory name; falsy (default 0) means "just a"
    """
    print("makeOutputDirectory called")
    # Original built the path by string concatenation with ".\\" separators
    # (Windows only, and in the wrong order); os.path.join is portable.
    target = os.path.join(a, letter) if letter else a
    if not os.path.exists(target):
        os.makedirs(target)
        print("Directory " + target + " was created")
    else:
        print("Directory " + target + " already existed")


def createEmptyGalleryObject():
    """Return a fresh gallery record dict with all fields zeroed/blank."""
    return {
        "views": 0,
        "points": 0,
        "weight": 0,
        "title": "",
        "gallery": "",
        "c1Text": "",
        "c2Text": "",
        "c3Text": "",
        "poster": "",
        "c1poster": "",
        "c2poster": "",
        "c3poster": "",
    }


def downloadWebpage(thisURL="http://imgur.com",
                    name="imgurFrontPage.html",
                    dirThis="imgur"):
    """Download ``thisURL`` and save it as ``dirThis/name`` (or just ``name``).

    The result is a static snapshot: no scripts run, so dynamically loaded
    content (often the pictures) is not included.
    """
    print("downloadWebpage called")
    localName = os.path.join(dirThis, name) if dirThis else name
    # with-blocks guarantee both the HTTP response and the local file are
    # closed; the original leaked the urlopen handle.
    with urllib.request.urlopen(thisURL) as response:
        with open(localName, "wb") as localFile:
            localFile.write(response.read())
    if os.path.exists(localName):
        print(thisURL + " webpage saved as " + localName)


def imageFrontPageToDataTextFile(
        savedWebpageLocation=".//imgur//imgurFrontPage.html",
        outputFile=".//imgur//imgurFrontPageDatumFile.txt"):
    """Parse the saved imgur front page into a list of gallery dicts.

    Pickles the list to ``outputFile``, writes a human-readable repr()
    backup alongside it (*TEXTCBU.txt), and returns the list.

    Specialized for imgur front-page markup; will not work on e.g. reddit.
    """
    print("imageFrontPageToDataTextFile(): starting")

    with open(savedWebpageLocation, "r") as rawFile:
        fileString = rawFile.read()
    # Flatten all whitespace to single spaces so substring searches work.
    for junk in ("\n", "\t", "\r"):
        fileString = fileString.replace(junk, " ")
    while "  " in fileString:
        fileString = fileString.replace("  ", " ")

    imgurFrontPageDataArray = []
    # NOTE(review): the original split delimiter was lost to file
    # corruption.  '<a ' is reconstructed from the surviving check that
    # each piece starts with 'href=' -- confirm against a saved page.
    for piece in fileString.split("<a "):
        if not piece.startswith("href="):
            print("This piece of page rejected")
            continue
        temp = createEmptyGalleryObject()

        # --- gallery id: the href between 'href="' and '">' ---
        match = re.search(re.escape('href="') + "(.*?)" + re.escape('">'),
                          piece)
        # Original called .group(1) unconditionally and crashed on no match.
        if match:
            urlString = match.group(1)
            if urlString.startswith("/gallery/"):
                temp["gallery"] = urlString.replace("/gallery/", "")

        # --- title: text of the title="..." attribute ---
        piece = piece[piece.find("title"):]
        piece = piece.replace('title="', "")
        temp["title"] = piece[:piece.find('"')]

        # --- points: first tag body after the word "points" ---
        piece = piece[piece.find("points"):]
        piece = piece[piece.find(">") + 1:]
        temp["points"] = piece[:piece.find("<")]

        # --- views: text between ": " and the word "views" ---
        piece = piece[piece.find(":") + 2:]
        temp["views"] = piece[:piece.find("views")]

        # Strip HTML garbage out of the title before storing.
        temp["title"] = scrubHTML(temp["title"])
        temp["weight"] = 0
        imgurFrontPageDataArray.append(temp)
        print("NEXT")

    # Drop records with any missing field; saves existence checks downstream.
    for field in ("gallery", "title", "points", "views"):
        imgurFrontPageDataArray = [x for x in imgurFrontPageDataArray
                                   if x[field] != ""]

    # Pickle requires a binary-mode file (the original used text mode).
    with open(outputFile, "wb") as textOut:
        pickle.dump(imgurFrontPageDataArray, textOut)

    # Human-readable backup copy, one repr()'d record per line.
    cbuName = outputFile[:-4] + "TEXTCBU.txt"
    with open(cbuName, "w") as CBUtextOut:
        for datum in imgurFrontPageDataArray:
            CBUtextOut.write(repr(datum) + "\n")

    print(repr(imgurFrontPageDataArray))
    print("imageFrontPageToDataTextFile(): FINISHING")
    return imgurFrontPageDataArray


def scrubHTML(text):
    """Reduce ``text`` to lowercase letters and single spaces.

    Strips common HTML-entity garbage, then drops every character that is
    not a-z or space, and finally collapses/trims whitespace.  Used to
    clean title and comment entries.
    """
    # Common garbage; add more entries as needed.
    text = text.replace(";quot;", " ")
    text = text.replace(";#039;", " ")
    text = text.replace("&", " ")
    text = text.lower()
    # Whitelist: lowercase letters and space -- everything else is dropped.
    allowed = set("abcdefghijklmnopqrstuvwxyz ")
    text = "".join(ch for ch in text if ch in allowed)
    # split()/join collapses any run of spaces and trims the ends in one
    # pass (the original's fixed 10-iteration replace loop could miss very
    # long runs).
    return " ".join(text.split())


def initialImgurCrawl():
    """First crawl of the day.

    Saves the imgur front page as imgurFrontPage.html, writes the raw
    analytical data to imgurFrontPageDatumFile.txt, and returns the
    parsed data list.
    """
    print("Initial Crawl called")
    # Make the directory if none exists.
    makeOutputDirectory("imgur")
    # Touch rollingGalleryData.txt if none exists.
    rollingPath = os.path.join("imgur", "rollingGalleryData.txt")
    if not os.path.exists(rollingPath):
        open(rollingPath, "wb").close()
    downloadWebpage()  # imgur front page
    testArray = imageFrontPageToDataTextFile()
    print(testArray)
    print("Initial Crawl Returned the Above")
    return testArray


def frontPageReadtoRollingGalleryFile():
    """Merge the freshly crawled front page into the rolling gallery pickle.

    Records are de-duplicated by their "gallery" id; only ids not already
    in the rolling archive are appended.  Also refreshes the repr() text
    backup of the archive.
    """
    print("def frontPageReadtoRollingGalleryFile(): called")
    frontPagePath = os.path.join("imgur", "imgurFrontPageDatumFile.txt")
    rollingPath = os.path.join("imgur", "rollingGalleryData.txt")

    # Original used a bare except around the whole body; catch only the
    # errors a missing/garbled datum file can actually raise.
    try:
        with open(frontPagePath, "rb") as rawFrontPage:
            newData = pickle.load(rawFrontPage)
    except (OSError, EOFError, pickle.PickleError):
        print("NO imgurFrontPageDatumFile")
        print("Crawl Loaded... or You're running the wrong file")
        return
    print("newData")
    print(repr(newData))
    print(len(newData))

    # A missing or empty rolling file just means "no history yet".
    try:
        with open(rollingPath, "rb") as rollingGalleryData:
            oldData = pickle.load(rollingGalleryData)
    except (OSError, EOFError, pickle.PickleError):
        oldData = []
    print("OlD Data Values")
    print(repr(oldData))
    print(len(oldData))

    # Set-based membership replaces the original O(new*old) nested loops.
    knownGalleries = {item["gallery"] for item in oldData}
    uniqueData = [item for item in newData
                  if item["gallery"] not in knownGalleries]
    print("uniqueData")
    print(repr(uniqueData))

    print("updatedData")
    updatedData = oldData + uniqueData
    print(repr(updatedData))

    with open(rollingPath, "wb") as rollingGalleryData:
        pickle.dump(updatedData, rollingGalleryData)
    print("After Pickle")
    print(repr(updatedData))

    cbuPath = os.path.join("imgur", "rollingGalleryDataTEXTCBU.txt")
    with open(cbuPath, "w") as rollingGalleryText:
        for datum in updatedData:
            rollingGalleryText.write(repr(datum) + "\n")
    print("def frontPageReadtoRollingGalleryFile(): ended")