'''
Created on Mar 27, 2014
@author: Brett Paufler
Copyright Brett Paufler
'''
import os
import urllib2

'''
STEP TWO
Gets the Scratchers webpages, based on info from Step One (lotto_download_html).
lotto_pickle is the next step.
'''


def extractScratchersURL():
    '''Depends upon files downloaded with lotto_download_html.
    Cycles through the saved scratchers pages (scratchers 1-20)
    and returns a set of full Scratchers game URLs.
    '''
    sourceDir = ".\\lottery"
    dirListing = os.listdir(sourceDir)
    #print dirListing

    sourceFiles = []
    #finds current Scratcher page names by scraping hrefs out of the saved html
    #VERY BRITTLE: depends on the exact markup of the calottery.com pages
    for webPage in dirListing:
        if webPage.startswith("scratch"):
            #print webPage
            pathString = os.path.join(sourceDir, webPage)
            with open(pathString, 'r') as page:
                for line in page:
                    #print line
                    if 'href="/Play/Scratchers' in line:
                        for bit in line.split('href="'):
                            #print bit
                            if "/Play/Scratchers-games/" in bit:
                                for bM in bit.split('">'):
                                    if "/Play/Scratchers-games/" in bM:
                                        sourceFiles.append(bM)

    #prepend the domain and de-duplicate
    baseURL = "http://www.calottery.com"
    unique = set(baseURL + item + '.html' for item in sourceFiles)
    print unique
    return unique


def downloadWebpage(thisURL="http://www.calottery.com",
                    name="caLottery.html",
                    dirThis="lottery"):
    '''Downloads thisURL and saves it locally as dirThis/name.'''
    print "downloadWebpage called"

    if dirThis == "":
        localName = name
    else:
        localName = os.path.join(".", dirThis, name)

    remotePage = urllib2.urlopen(thisURL)
    localWebPage = open(localName, 'wb')
    localWebPage.write(remotePage.read())
    localWebPage.close()

    if os.path.exists(localName):
        print thisURL + " webpage saved as " + localName


def downloadALLScratchers():
    '''Runs the wad: downloads all of the individual Scratchers game pages.'''
    print "downloadALLScratchers started"

    #list of pages to download, complete URLs
    pages = extractScratchersURL()

    for url in pages:
        #derive a local file name from the tail of the game URL
        #(brittle: assumes the URL contains "-Scratchers/")
        nameList = url.split("-Scratchers/")
        for word in nameList:
            if "Scratchers" not in word:
                sN = "SGAME-" + word + ".html"
                print "DOWNLOADING URL: %s \t sN: %s" % (url, sN)
                downloadWebpage(url, sN)


if __name__ == '__main__':
    print "downloading all scratchers"
    downloadALLScratchers()