'''
Created on Jul 11, 2014 (Major Refactor 8-17-15)

@author: Brett Paufler
Copyright Brett Paufler

sequential_downloader(url_list) tries to download all numeric
combinations of all urls in the passed url list

url_list = [url, url, url, ...]
    with each url containing one or more optional '*'
    each '*' being replaced with all possible numeric combinations

if url = test_00*.img
    test_000.img thru test_009.img will be downloaded
if url = sample_*0*.pdf
    sample_000.pdf thru sample_909.pdf will be downloaded
'''
import os
import urllib2

WEBPAGE_DIR = os.path.join('.', 'webPages')


def create_webpage_dir():
    '''Ensures the output directory exists.'''
    if not os.path.exists(WEBPAGE_DIR):
        os.makedirs(WEBPAGE_DIR)


def numberfy(list_of_text):
    '''Given a list of text strings, returns a list of text strings
    wherein every * is replaced by all possible values of 0-9,
    so text** yields 100 replacement values.'''
    assert isinstance(list_of_text, list)
    if not any('*' in t for t in list_of_text):
        return list_of_text
    new_list = []
    for text in list_of_text:
        if '*' not in text:
            new_list.append(text)
        else:
            # Expand only the first *; recursion handles any that remain
            for n in range(10):
                new_list.append(text.replace('*', str(n), 1))
    return numberfy(new_list)


def rip_url(url):
    '''Downloads and saves the contents of the given url,
    using the last part of the url as the file name.'''
    print 'CONTACTING: %s' % url
    try:
        data_to_rip = urllib2.urlopen(url).read()
    except (urllib2.URLError, ValueError):
        data_to_rip = None
        print 'FAILED:\n\t%s' % url
    if data_to_rip:
        create_webpage_dir()
        save_name = os.path.join(WEBPAGE_DIR, url.split('/')[-1])
        with open(save_name, 'wb') as download:
            download.write(data_to_rip)
        print 'SUCCEEDED:\n\t%s\n\t%s' % (url, save_name)


def sequential_downloader(url_list):
    '''Tries to download and save all numeric combinations of each
    url in url_list. For test**.pdf, tries every name from
    test00.pdf through test99.pdf.'''
    for url in numberfy(url_list):
        rip_url(url)


if __name__ == '__main__':

    #NOTE FORM
    #EACH * replaced with 0-9, sequentially, so 00-99 here
    #
    #NOTE FORM: [url_1_*, url_2_**, url_3_***, etc]

    #THIS IS THE BASE
    #url_list = [r'http: - something lec**.pdf']
    #sequential_downloader(url_list)

    #CUSTOM for ---
    for n in range(1, 16):
        #url = 'http://www/%d something sciences%d.pdf' % (n, n)
        #url = 'http://www. something %dsciences%d.pdf' % (n, n)
        url = 'http:// missing chap%d.pdf' % n
        rip_url(url)

    print 'FINISHED RUN'
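
# Usage sketch (the host and file names below are hypothetical; the
# pattern only pays off if the server actually serves a file at each
# expanded name):
#
#   numberfy(['lec*.pdf'])
#   # -> ['lec0.pdf', 'lec1.pdf', ..., 'lec9.pdf']
#
#   sequential_downloader([r'http://example.com/notes/lec**.pdf'])
#   # tries lec00.pdf through lec99.pdf, saving any hits to .\webPages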