''' Created on Apr 8, 2020 @author: Brett Paufler (c) Copyright Brett Paufler Finished on Apr 23, 2020 I really just worked on the first and last day Takes html files as posted to web for 2018 Docket Year Extracts Case Summary Data Exports the lot as: Plain Text (easiest to read) JSON (as this was my original intent) Pickle (the easiest to reload) Interpretation of Data Happens elsewhere all_cases =[Case, Case, Case... test_case = CASE( R=2, Date='2020-15-26', Docket='16-245', Name='Test Case', Worthy=True, Opinions=[test_opinion, test_opinion]) test_opinion = OPINION( type='Court', author='Some Judge', joining='Judge A, Judge B', good=True, pages=10) ''' ########################### # # Get The HTMLs # ########################### from os import listdir from os.path import join as path_join from astropy.units import year def get_list_html(DIR_HTML = '.\\html\\'): list_html = [path_join(DIR_HTML, f) for f in listdir(DIR_HTML) if f.endswith('.html')] return list_html html_list = get_list_html() #print html_list ################################## # # CASE # OPINION, OPINION # # namedtuples # ################################### from collections import namedtuple OPINION = namedtuple( typename='OPINION', field_names=[ 'type', 'author', 'joining', 'good', 'pages' ]) test_opinion = OPINION( type='Court', author='Some Judge', joining='Judge A, Judge B', good=True, pages=10) #print test_opinion CASE = namedtuple( typename='CASE', field_names=[ 'R', 'Date', 'Docket', 'Name', 'Worthy', 'Opinions', ]) test_case = CASE( R=2, Date='2020-15-26', Docket='16-245', Name='Test Case', Worthy=True, Opinions=[test_opinion, test_opinion]) #print test_case ################################# # # RAW HTML TO CASE/OPINION # # The helper functions work in order # Best To Work Logic from Final Function # ################################# from datetime import date OPINION_TYPES = ['Court', 'Dissenting', 'Concurring'] AUTHOR_TYPES = ['Roberts', 'Thomas', 'Ginsburg', 'Breyer', 'Alito', 'Sotomayor', 'Kagan', 'Gorsuch', 'Kavanaugh',] def get_html_text(path_to_html): '''Return text of html file''' with open(path_to_html, 'r') as f: text = f.read() return text def reduce_to_pertinent(text_in): '''Kills Extraneous Header, Body, Footer Returns raw text of interest for Case/Opinion''' text_working = text_in.split('')[1] text_working = text_working.split('') #Corrects For Error Wherein #Opinion Inserted after break text_check = text_working[1] assert 'WORTHY:' not in text_check assert '   AUTHOR: ' not in text_check assert 'GOOD:' not in text_check text = text_working[0] return text def get_case_head_opinions_tail(text): '''Returns a list [Case, Opinion, Opinion... Raw unworked text''' return text.split('
') def objectify_opinion(op_text): '''Turns Opinion Text into Named Tuple Also does error checking, so we can assume perfectly formatted from here out''' working_list = op_text.split('
\n') this_opinion_type = working_list[0][10:] assert this_opinion_type in OPINION_TYPES this_author = working_list[1][21:] assert this_author in (AUTHOR_TYPES + ['Per Curiam']) joining_list = working_list[2][22:] #print joining_list this_joining = joining_list.split(', ') for this_j in this_joining: assert this_j in (AUTHOR_TYPES + ['None']) this_good = working_list[3][19:] assert this_good in ['No', 'Yes'] this_pages = int(working_list[4][7:]) this_opinion = OPINION( type=this_opinion_type, author=this_author, joining=this_joining, good=this_good, pages=this_pages) #print this_opinion_type #print this_author #print this_joining #print this_good #print this_pages #print working_list #print this_opinion return this_opinion def objectify_case(case_text, opinions_list): case_listing = case_text.split('
\n') this_r = int(case_listing[0][3:]) #YEAR-MM-DD Outputted as String year_month_date = case_listing[1][6:] year, month, day = year_month_date.split('-') this_date = date(int(year), int(month), int(day)) #Returns a String but confirms of right format #XX-XXXX, int(XX) and int(XXXX) should work this_docket = case_listing[2][8:] test_docket = this_docket.split('-') int(test_docket[0]), int(test_docket[1]) this_name = case_listing[3][6:].strip() this_worthy_text = case_listing[4][8:] if this_worthy_text == 'True': this_worthy = True elif this_worthy_text == 'False': this_worthy = False else: assert this_worthy_text == 'Must be True or False' this_case = CASE( R=this_r, Date=year_month_date, Docket=this_docket, Name=this_name, Worthy=this_worthy, Opinions=opinions_list[:]) print this_case return this_case #print case_text #print this_r #print year_month_date #print this_date #print type(this_date) #print this_docket #print test_docket #print this_name #print this_worthy_text #print this_worthy print case_listing def html_to_case_opinion(path_to_html): '''File Path to Case/Opinion Objects The Main of this Section''' #Full Raw HTML Text text = get_html_text(path_to_html) #print text #Reduces to Case/Option Only Text text = reduce_to_pertinent(text) #print text #A Listing [Case, Opinion, Opinion... #In Raw Unworked Text case_opinions = get_case_head_opinions_tail(text) #print case_opinions case_text = case_opinions[0] opinions_text_list = case_opinions[1:] print case_text print opinions_text_list opinions_list = [objectify_opinion(op_text) for op_text in opinions_text_list] this_case = objectify_case(case_text, opinions_list) return this_case #html_to_case_opinion(html_list[-1]) all_cases = [html_to_case_opinion(file_path) for file_path in html_list] ######################################### # # OUTPUT # later scripts only use these # ######################################### #As Text with open('.//input//2018_judges_text.txt', 'w') as f: for case in all_cases: f.write(str(case)) f.write('\n') #Not as Pretty as I would like import json with open('.//input//2018_judges_json.txt', 'w') as f: json.dump(all_cases, f) #As an Unreadable Pickle import pickle with open('.//input//2018_judges_pickle.txt', 'w') as f: pickle.dump(all_cases, f)