'''
Created on Jul 11, 2014 (Major Refactor 8-17-15)

@author: Brett Paufler
Copyright Brett Paufler

sequential_downloader(url_list) tries to download all numeric
combinations of all urls in the passed url list

url_list = [url, url, url, ...]
    with each url containing one or more optional '*'
    each '*' being replaced with all possible numeric combinations

if url = test_00*.img
    test_000.img thru test_009.img will be downloaded
if url = sample_*0*.pdf
    sample_000.pdf thru sample_909.pdf will be downloaded
'''
import os
import urllib2

WEBPAGE_DIR = os.path.join('.', 'webPages')


def create_webpage_dir():
    '''Ensures the output directory exists.'''
    if not os.path.exists(WEBPAGE_DIR):
        os.makedirs(WEBPAGE_DIR)


def numberfy(list_of_text):
    '''Given a list of text strings, returns a list of text strings
    wherein every * is replaced by all possible values of 0-9,
    so text** yields 100 replacement values.'''
    assert isinstance(list_of_text, list)
    if not any('*' in t for t in list_of_text):
        return list_of_text
    new_list = []
    for text in list_of_text:
        if '*' not in text:
            new_list.append(text)
        else:
            # Expand only the first *; recursion handles any that remain
            for n in range(10):
                new_list.append(text.replace('*', str(n), 1))
    return numberfy(new_list)


def rip_url(url):
    '''Downloads and saves the contents of the given url,
    using the last part of the url as the file name.'''
    print 'CONTACTING: %s' % url
    try:
        data_to_rip = urllib2.urlopen(url).read()
    except (urllib2.URLError, ValueError):
        data_to_rip = None
        print 'FAILED:\n\t%s' % url
    if data_to_rip:
        create_webpage_dir()
        save_name = os.path.join(WEBPAGE_DIR, url.split('/')[-1])
        with open(save_name, 'wb') as download:
            download.write(data_to_rip)
        print 'SUCCEEDED:\n\t%s\n\t%s' % (url, save_name)


def sequential_downloader(url_list):
    '''Tries to download and save all numeric combinations of each
    url in url_list. For test**.pdf, tries every name from
    test00.pdf through test99.pdf.'''
    for url in numberfy(url_list):
        rip_url(url)


if __name__ == '__main__':

    #NOTE FORM
    #EACH * replaced with 0-9, sequentially, so 00-99 here
    #
    #NOTE FORM: [url_1_*, url_2_**, url_3_***, etc]

    #THIS IS THE BASE
    #url_list = [r'http: - something lec**.pdf']
    #sequential_downloader(url_list)

    #CUSTOM for ---
    for n in range(1, 16):
        #url = 'http://www/%d something sciences%d.pdf' % (n, n)
        #url = 'http://www. something %dsciences%d.pdf' % (n, n)
        url = 'http:// missing chap%d.pdf' % n
        rip_url(url)

    print 'FINISHED RUN'
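
# Usage sketch (the host and file names below are hypothetical; the
# pattern only pays off if the server actually serves a file at each
# expanded name):
#
#   numberfy(['lec*.pdf'])
#   # -> ['lec0.pdf', 'lec1.pdf', ..., 'lec9.pdf']
#
#   sequential_downloader([r'http://example.com/notes/lec**.pdf'])
#   # tries lec00.pdf through lec99.pdf, saving any hits to .\webPages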