'''
Created on Apr 8, 2020
@author: Brett Paufler
(c) Copyright Brett Paufler
Finished on Apr 23, 2020
I really just worked on the first and last day
Takes html files as posted to web for 2018 Docket Year
Extracts Case Summary Data
Exports the lot as:
Plain Text (easiest to read)
JSON (as this was my original intent)
Pickle (the easiest to reload)
Interpretation of Data Happens elsewhere
all_cases = [CASE, CASE, CASE, ...]
test_case = CASE(
R=2,
Date='2020-15-26',
Docket='16-245',
Name='Test Case',
Worthy=True,
Opinions=[test_opinion, test_opinion])
test_opinion = OPINION(
type='Court',
author='Some Judge',
joining='Judge A, Judge B',
good=True,
pages=10)
'''
###########################
#
# Get The HTMLs
#
###########################
from os import listdir
from os.path import join as path_join
from astropy.units import year
def get_list_html(DIR_HTML = '.\\html\\'):
    '''Return the paths of the .html files found directly in DIR_HTML

    DIR_HTML: directory to scan (sub-directories are not searched)
    Returns: list of paths, each one DIR_HTML joined with a file name,
    ordered by file name so the result does not depend on the
    arbitrary order in which the file system lists the directory
    Raises: OSError if DIR_HTML cannot be listed'''
    html_names = sorted(
        name for name in listdir(DIR_HTML)
        if name.endswith('.html'))
    return [path_join(DIR_HTML, name) for name in html_names]
# Paths of the html files to process, gathered once when the
# module is run, using get_list_html's default directory
html_list = get_list_html()
#print html_list
##################################
#
# CASE
# OPINION, OPINION
#
# namedtuples
#
###################################
from collections import namedtuple
# One opinion filed in a case
OPINION = namedtuple(
    'OPINION',
    'type author joining good pages')

# Sample OPINION, handy for checking the record layout
test_opinion = OPINION(
    'Court',                # type
    'Some Judge',           # author
    'Judge A, Judge B',     # joining
    True,                   # good
    10)                     # pages

# One case; Opinions holds the OPINION records belonging to it
CASE = namedtuple(
    'CASE',
    'R Date Docket Name Worthy Opinions')

# Sample CASE carrying the sample opinion twice
test_case = CASE(
    2,                      # R
    '2020-15-26',           # Date
    '16-245',               # Docket
    'Test Case',            # Name
    True,                   # Worthy
    [test_opinion] * 2)     # Opinions
#################################
#
# RAW HTML TO CASE/OPINION
#
# The helper functions work in order
# Best To Work Logic from Final Function
#
#################################
from datetime import date
# Kinds of opinion that objectify_opinion accepts
OPINION_TYPES = [
    'Court',
    'Dissenting',
    'Concurring',
]
# Names that objectify_opinion accepts as the author of,
# or a joiner to, an opinion
AUTHOR_TYPES = [
    'Roberts',
    'Thomas',
    'Ginsburg',
    'Breyer',
    'Alito',
    'Sotomayor',
    'Kagan',
    'Gorsuch',
    'Kavanaugh',
]
def get_html_text(path_to_html):
    '''Read the file at path_to_html and hand back
    its whole contents as a single string'''
    with open(path_to_html, 'r') as html_file:
        return html_file.read()
def reduce_to_pertinent(text_in):
    '''Kills Extraneous Header, Body, Footer
    Returns raw text of interest for Case/Opinion

    text_in: full text of one html file
    Returns: the part of text_in that follows the first
    delimiter and comes before the second delimiter
    Raises: AssertionError if the piece right after the second
    delimiter still holds any of the field labels checked below;
    IndexError if a split yields fewer than two pieces

    NOTE(review): both split() delimiters look damaged in this
    copy of the file (one is empty, the other is a quote broken
    across two lines); presumably they were html tags that got
    stripped somewhere along the way. Restore them from the
    original script before running this.'''
    #Keep only what follows the first delimiter
    text_working = text_in.split('')[1]
    #Cut what remains at the second delimiter
    text_working = text_working.split('
')
    #Corrects For Error Wherein
    #Opinion Inserted after break
    #(fails loudly if the piece about to be discarded
    #still contains case or opinion fields)
    text_check = text_working[1]
    assert 'WORTHY:' not in text_check
    assert ' AUTHOR: ' not in text_check
    assert 'GOOD:' not in text_check
    #Everything before the second delimiter is the data
    text = text_working[0]
    return text
def get_case_head_opinions_tail(text):
    '''Returns a list [Case, Opinion, Opinion...
    Raw unworked text

    text: the string to cut up (html_to_case_opinion passes
    the output of reduce_to_pertinent)
    Returns: list of strings, one per piece between delimiters

    NOTE(review): the split() delimiter looks damaged in this
    copy of the file (a quote broken across two lines);
    presumably it was an html tag that got stripped. Restore
    it from the original script before running this.'''
    return text.split('
')
def objectify_opinion(op_text):
    '''Turns Opinion Text into Named Tuple
    Also does error checking, so we can assume
    perfectly formatted from here out

    op_text: raw text of one opinion
    Returns: an OPINION where type and author are strings,
    joining is a list of strings, good is the string
    'Yes' or 'No' (it is not converted to a bool) and
    pages is an int
    Raises: AssertionError when a value is not one of the
    accepted ones; ValueError when pages is not an integer;
    IndexError when there are fewer than five lines

    NOTE(review): the split() delimiter looks damaged in this
    copy of the file (a quote broken across two lines);
    presumably it was an html tag that got stripped. Restore
    it from the original script before running this.'''
    #One list entry per field line of the opinion
    working_list = op_text.split('
\n')
    #Each slice below drops a fixed number of leading characters,
    #presumably the field label in front of the value
    #TODO confirm the widths against a sample html file
    this_opinion_type = working_list[0][10:]
    assert this_opinion_type in OPINION_TYPES
    this_author = working_list[1][21:]
    assert this_author in (AUTHOR_TYPES + ['Per Curiam'])
    joining_list = working_list[2][22:]
    #print joining_list
    #Comma separated names become a list, each name checked in turn
    this_joining = joining_list.split(', ')
    for this_j in this_joining:
        assert this_j in (AUTHOR_TYPES + ['None'])
    #Stays the text 'Yes' or 'No'
    this_good = working_list[3][19:]
    assert this_good in ['No', 'Yes']
    this_pages = int(working_list[4][7:])
    this_opinion = OPINION(
        type=this_opinion_type,
        author=this_author,
        joining=this_joining,
        good=this_good,
        pages=this_pages)
    #print this_opinion_type
    #print this_author
    #print this_joining
    #print this_good
    #print this_pages
    #print working_list
    #print this_opinion
    return this_opinion
def objectify_case(case_text, opinions_list):
    '''Turns Case Text plus its Opinions into a CASE Named Tuple

    case_text: raw text of the case header
    opinions_list: the opinions to attach; a shallow copy
    of this list is stored in the CASE
    Returns: a CASE where R is an int, Date, Docket and Name
    are strings, Worthy is a bool and Opinions is a list
    Raises: ValueError when R, the date or the docket parts
    are not valid; AssertionError when the worthy text is
    neither 'True' nor 'False'; IndexError when there are
    fewer than five lines
    Side effect: prints the finished CASE

    NOTE(review): the split() delimiter looks damaged in this
    copy of the file (a quote broken across two lines);
    presumably it was an html tag that got stripped. Restore
    it from the original script before running this.'''
    #One list entry per field line of the case header
    case_listing = case_text.split('
\n')
    #The slices below drop a fixed number of leading characters,
    #presumably the field label in front of the value
    #TODO confirm the widths against a sample html file
    this_r = int(case_listing[0][3:])
    #YEAR-MM-DD Outputted as String
    year_month_date = case_listing[1][6:]
    #The local name year hides the year imported from
    #astropy.units, but only inside this function
    year, month, day = year_month_date.split('-')
    #Built only as a check: date() raises ValueError on an
    #impossible date, and this_date is not used afterwards
    this_date = date(int(year), int(month), int(day))
    #Returns a String but confirms of right format
    #XX-XXXX, int(XX) and int(XXXX) should work
    this_docket = case_listing[2][8:]
    test_docket = this_docket.split('-')
    #Result is thrown away; int() raising is the whole point
    int(test_docket[0]), int(test_docket[1])
    this_name = case_listing[3][6:].strip()
    this_worthy_text = case_listing[4][8:]
    if this_worthy_text == 'True':
        this_worthy = True
    elif this_worthy_text == 'False':
        this_worthy = False
    else:
        #Any other text lands here; the comparison is against
        #a message, so the assert fails unless the field is
        #literally that message
        #NOTE(review): in that one case this_worthy is never set
        assert this_worthy_text == 'Must be True or False'
    this_case = CASE(
        R=this_r,
        Date=year_month_date,
        Docket=this_docket,
        Name=this_name,
        Worthy=this_worthy,
        Opinions=opinions_list[:])
    #Echoes every finished case (Python 2 print statement)
    print this_case
    return this_case
    #Nothing below runs, it all sits after the return;
    #presumably left over from debugging
    #print case_text
    #print this_r
    #print year_month_date
    #print this_date
    #print type(this_date)
    #print this_docket
    #print test_docket
    #print this_name
    #print this_worthy_text
    #print this_worthy
    print case_listing
def html_to_case_opinion(path_to_html):
    '''Build one CASE, complete with its OPINIONs,
    from the html file at path_to_html
    This is the entry point for the parsing section
    Prints the raw case text and the raw opinion texts
    along the way'''
    #Whole file, then just the Case/Opinion portion of it
    pertinent_text = reduce_to_pertinent(get_html_text(path_to_html))

    #First piece describes the case, the rest are its opinions
    #All of it is still raw text at this point
    pieces = get_case_head_opinions_tail(pertinent_text)
    case_text, opinions_text_list = pieces[0], pieces[1:]
    print(case_text)
    print(opinions_text_list)

    #Opinions are converted first so the case can hold them
    opinions_list = []
    for op_text in opinions_text_list:
        opinions_list.append(objectify_opinion(op_text))
    return objectify_case(case_text, opinions_list)
#html_to_case_opinion(html_list[-1])
#Every html file becomes one CASE, kept in html_list order
all_cases = []
for file_path in html_list:
    all_cases.append(html_to_case_opinion(file_path))
#########################################
#
# OUTPUT
# later scripts only use these
#
#########################################
#Plain text, one CASE per line
with open('.//input//2018_judges_text.txt', 'w') as f:
    for case in all_cases:
        f.write(str(case) + '\n')

#JSON, not as pretty as I would like
#(json writes the namedtuples as plain arrays,
#so the field names do not appear in the file)
import json
with open('.//input//2018_judges_json.txt', 'w') as f:
    json.dump(all_cases, f)

#Pickle, unreadable but the easiest to load back
#NOTE(review): the file is opened in text mode; that matches
#Python 2's default pickle protocol, but any other protocol
#(and Python 3) needs 'wb' - check how the reading scripts
#open this file before changing it
import pickle
with open('.//input//2018_judges_pickle.txt', 'w') as f:
    pickle.dump(all_cases, f)