'''
Created on Apr 8, 2020
@author: Brett Paufler
(c) Copyright Brett Paufler
Finished on Apr 23, 2020
I really just worked on the first and last day
Takes html files as posted to web for 2018 Docket Year
Extracts Case Summary Data
Exports the lot as:
    Plain Text (easiest to read)
    JSON (as this was my original intent)
    Pickle (the easiest to reload)
Interpretation of Data Happens elsewhere
all_cases =[Case, Case, Case...
test_case = CASE(
            R=2,
            Date='2020-15-26',
            Docket='16-245',
            Name='Test Case',
            Worthy=True,
            Opinions=[test_opinion, test_opinion])
test_opinion = OPINION(
            type='Court',
            author='Some Judge',
            joining='Judge A, Judge B',
            good=True,
            pages=10)
'''
###########################
#
#    Get The HTMLs
#
###########################
from os import listdir
from os.path import join as path_join
from astropy.units import year
def get_list_html(DIR_HTML = '.\\html\\'):
    '''Return the paths of the .html files found directly in DIR_HTML.

    DIR_HTML: directory to scan (sub-directories are not searched).
    Returns a list of DIR_HTML joined with each file name that ends
        in '.html', sorted by file name.  os.listdir makes no promise
        about ordering, so sorting keeps the order of the cases (and
        of everything exported from them) the same from run to run.
    Raises OSError if DIR_HTML cannot be listed.
    '''
    return [path_join(DIR_HTML, name)
            for name in sorted(listdir(DIR_HTML))
            if name.endswith('.html')]
# Module-level list of html file paths, built when this script runs
# by scanning the default directory of get_list_html.
html_list = get_list_html()
#print html_list
##################################
#
#  CASE
#    OPINION, OPINION
#
#    namedtuples
#
###################################
from collections import namedtuple
# One opinion filed in a case: its type, who wrote it, who joined it,
# the reviewer's good/not-good call and its length in pages.
OPINION = namedtuple(
    'OPINION',
    ['type', 'author', 'joining', 'good', 'pages'])

# Hand-made sample OPINION, handy for eyeballing the record layout.
test_opinion = OPINION(
    'Court',                # type
    'Some Judge',           # author
    'Judge A, Judge B',     # joining
    True,                   # good
    10)                     # pages
#print test_opinion
# One case summary; Opinions holds the OPINION records for the case.
CASE = namedtuple(
    'CASE',
    ['R', 'Date', 'Docket', 'Name', 'Worthy', 'Opinions'])
            
# Hand-made sample CASE holding two copies of the sample opinion.
# The Date value is only a placeholder string (month 15 is not a
# real calendar month).
test_case = CASE(
    2,                              # R
    '2020-15-26',                   # Date
    '16-245',                       # Docket
    'Test Case',                    # Name
    True,                           # Worthy
    [test_opinion, test_opinion])   # Opinions
#print test_case
#################################
#
#    RAW HTML TO CASE/OPINION
#
#    The helper functions are listed in the order they are called.
#        To follow the logic, start from the final function
#        (html_to_case_opinion) and work backwards.
#
#################################
from datetime import date
# Opinion types accepted by the parser's validation.
OPINION_TYPES = [
    'Court',
    'Dissenting',
    'Concurring',
]
# Names accepted as an opinion's author or as a joining judge.
AUTHOR_TYPES = [
    'Roberts',
    'Thomas',
    'Ginsburg',
    'Breyer',
    'Alito',
    'Sotomayor',
    'Kagan',
    'Gorsuch',
    'Kavanaugh',
]
def get_html_text(path_to_html):
    '''Read the file at path_to_html and hand back its whole content
    as one string.'''
    with open(path_to_html, 'r') as html_file:
        return html_file.read()
def reduce_to_pertinent(text_in):
    '''Kills Extraneous Header, Body, Footer
    Returns raw text of interest for Case/Opinion

    text_in: full text of one html file.
    Keeps what follows the first occurrence of the opening
        separator, then returns the part of that which comes
        before the closing separator.
    Raises AssertionError if the text right after the closing
        separator contains 'WORTHY:', '   AUTHOR: ' or 'GOOD:'.
    Raises IndexError if either separator is absent.

    NOTE(review): both separator literals read as empty strings in
        this copy of the file, and str.split('') raises ValueError.
        They were presumably HTML tags lost along the way -- confirm
        against the original file.
    '''
    # Everything after the opening separator
    text_working = text_in.split('')[1]
    # [0] is the wanted text, [1] is whatever follows the closing separator
    text_working = text_working.split('')
    
    #Corrects For Error Wherein
    #Opinion Inserted after  break
    #i.e. none of the field labels may show up past the closing separator
    text_check = text_working[1]
    assert 'WORTHY:' not in text_check
    assert '   AUTHOR: ' not in text_check
    assert 'GOOD:' not in text_check
    
    text = text_working[0]
    return text
def get_case_head_opinions_tail(text):
    '''Returns a list [Case, Opinion, Opinion...
    Raw unworked text

    Splits text on a fixed separator.  The caller
        (html_to_case_opinion) treats item 0 as the case header
        and every later item as one opinion.

    NOTE(review): the separator literal is broken across two lines
        in this copy of the file, which a single-quoted string does
        not allow.  It was presumably an HTML break tag -- confirm
        against the original file.
    '''
    return text.split('
')
def objectify_opinion(op_text):
    '''Turns Opinion Text into Named Tuple
    Also does error checking, so we can assume
        perfectly formatted from here out

    op_text: raw text of one opinion; its first five rows are read
        as type, author, joining, good and pages, in that order.
    Returns an OPINION where joining is a list of names, good is
        the string 'Yes' or 'No' (not a bool) and pages is an int.
    Raises AssertionError when a value is not one of the allowed
        ones, IndexError when there are fewer than five rows and
        ValueError when pages is not an integer.

    NOTE(review): the row separator literal is broken across two
        lines in this copy of the file; it was presumably an HTML
        break tag followed by a newline -- confirm against the
        original file.
    '''
    
    working_list = op_text.split('
\n')
    
    # Each slice below skips a fixed number of leading characters,
    # presumably the row's label -- the label text itself is not
    # visible here, TODO confirm against a sample html file
    this_opinion_type = working_list[0][10:]
    assert this_opinion_type in OPINION_TYPES
    
    # 'Per Curiam' is accepted as an author on top of the named judges
    this_author = working_list[1][21:]
    assert this_author in (AUTHOR_TYPES + ['Per Curiam'])
    
    joining_list = working_list[2][22:]
    #print joining_list
    # Comma-space separated names; the literal 'None' is accepted too
    this_joining = joining_list.split(', ')
    for this_j in this_joining:
        assert this_j in (AUTHOR_TYPES + ['None'])
    
    # Kept as the text 'Yes'/'No', not converted to a bool
    this_good = working_list[3][19:]
    assert this_good in ['No', 'Yes']
    
    this_pages = int(working_list[4][7:])
    
    this_opinion = OPINION(
                    type=this_opinion_type,
                    author=this_author,
                    joining=this_joining,
                    good=this_good,
                    pages=this_pages)
    
    #print this_opinion_type
    #print this_author
    #print this_joining
    #print this_good
    #print this_pages
    #print working_list
    #print this_opinion
    return this_opinion
    
def objectify_case(case_text, opinions_list):
    '''Turns Case Header Text plus its Opinions into a CASE

    case_text: raw text of the case header; its first five rows are
        read as R, Date, Docket, Name and Worthy, in that order.
    opinions_list: list of OPINION records; a shallow copy of it is
        stored in the CASE.
    Returns the CASE, and prints it first (Python 2 print statement).
    Raises ValueError when R, the date parts or the docket parts are
        not integers or the date is not a real calendar date,
        IndexError when rows or parts are missing, and
        AssertionError when Worthy is neither 'True' nor 'False'.

    NOTE(review): the row separator literal is broken across two
        lines in this copy of the file; it was presumably an HTML
        break tag followed by a newline -- confirm against the
        original file.
    '''
    
    case_listing = case_text.split('
\n')
    
    # Each slice below skips a fixed number of leading characters,
    # presumably the row's label -- TODO confirm against a sample file
    this_r = int(case_listing[0][3:])
    
    #YEAR-MM-DD Outputted as String
    year_month_date = case_listing[1][6:]
    # Note: the local name year hides the module-level import of the
    # same name for the rest of this function
    year, month, day = year_month_date.split('-')
    # Built only to validate the date; the CASE below stores the
    # original string, this_date itself is never used again
    this_date = date(int(year), int(month), int(day))
    
    #Returns a String but confirms of right format
    #XX-XXXX, int(XX) and int(XXXX) should work
    this_docket = case_listing[2][8:]
    test_docket = this_docket.split('-')
    # Validation only: the converted values are thrown away
    int(test_docket[0]), int(test_docket[1])
    this_name = case_listing[3][6:].strip()
    
    this_worthy_text = case_listing[4][8:]
    if this_worthy_text == 'True':
        this_worthy = True
    elif this_worthy_text == 'False':
        this_worthy = False
    else:
        # Used as a forced failure: the comparison can only hold if
        # the text is literally 'Must be True or False'.  If it did
        # hold (or asserts are disabled) this_worthy stays unbound.
        assert this_worthy_text == 'Must be True or False'
    
    this_case = CASE(
            R=this_r,
            Date=year_month_date,
            Docket=this_docket,
            Name=this_name,
            Worthy=this_worthy,
            Opinions=opinions_list[:])
    print this_case
    return this_case
    # Everything below the return is unreachable debugging residue
    #print case_text
    #print this_r
    #print year_month_date
    #print this_date
    #print type(this_date)
    #print this_docket
    #print test_docket
    #print this_name
    #print this_worthy_text
    #print this_worthy
    print case_listing
def html_to_case_opinion(path_to_html):
    '''Build the CASE (with its OPINIONs) for one html file.

    This is the entry point of the parsing section: it chains the
    helpers above, from reading the file through to the finished
    record.  The raw case header and the raw opinion texts are
    printed along the way.

    path_to_html: path of the html file to convert.
    Returns whatever objectify_case returns for that file.
    '''
    # Whole file -> just the case/opinion portion
    raw_html = get_html_text(path_to_html)
    pertinent = reduce_to_pertinent(raw_html)

    # First piece is the case header, the rest are opinions
    pieces = get_case_head_opinions_tail(pertinent)
    case_text = pieces[0]
    opinions_text_list = pieces[1:]
    print(case_text)
    print(opinions_text_list)

    # Raw opinion text -> OPINION records, in file order
    opinions_list = []
    for op_text in opinions_text_list:
        opinions_list.append(objectify_opinion(op_text))

    return objectify_case(case_text, opinions_list)
#html_to_case_opinion(html_list[-1])
# Convert every collected html file, keeping the order of html_list.
all_cases = []
for file_path in html_list:
    all_cases.append(html_to_case_opinion(file_path))
#########################################
#
#    OUTPUT
#        later scripts only use these
#
#########################################
#As Text
#One line per case: the str() of each CASE followed by a newline
with open('.//input//2018_judges_text.txt', 'w') as f:
    for case in all_cases:
        f.write(str(case))
        f.write('\n')
#Not as Pretty as I would like
#json writes tuples (and so namedtuples) as plain arrays, which is
#why the field names do not appear in this file
import json
with open('.//input//2018_judges_json.txt', 'w') as f:
    json.dump(all_cases, f)
#As an Unreadable Pickle
#Opened in binary mode: pickle output is bytes, so a text-mode file
#fails outright on Python 3; binary mode works on Python 2 as well
import pickle
with open('.//input//2018_judges_pickle.txt', 'wb') as f:
    pickle.dump(all_cases, f)