''' Created on Oct 24, 2017
@author: Brett Paufler
Copyright Brett Paufler

Given a very large .csv file
    Source: City of Chicago
    Data: All Crimes 2001 to present
Breaks data up by year
Culls unneeded and outputs Yearly .csv files
Saves to output in programs/eclipse/crime

As Follows is 2017-11-05: create_yearly_report
When I ran

Data Points: RAW = 6385918
Data Points: Non Nulls = 6303034
Ratio Preserved: 0.987021

Data Points per Year
2001 482878
2002 471522
2003 472037
2004 467195
2005 449920
2006 445542
2007 435685
2008 420030
2009 386028
2010 368612
2011 350660
2012 334426
2013 304251
2014 269323
2015 259592
2016 250491
2017 134842

Crime Codes Downloadable (do a search)
Chicago Police Department Illinois Uniform Crime

The Original Pandas DataFrame Structure
Prior to (I believe, it's been a few years)
Splitting To Years or Reducing Data Content

RangeIndex: 6385918 entries, 0 to 6385917
Data columns (total 22 columns):
ID                      int64
Case Number             object
Date                    object
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    int64
District                float64
Ward                    float64
Community Area          float64
FBI Code                object
X Coordinate            float64
Y Coordinate            float64
Year                    int64
Updated On              object
Latitude                float64
Longitude               float64
Location                object
dtypes: bool(2), float64(7), int64(3), object(10)
memory usage: 986.6+ MB

My Yearly Crime files clock in at +/-600mb
So, that's a 40% reduction
'''
import logging
import os

import pandas as pd

# The log file (and the per-year csv files) live in ./output --
# create it up front so logging.basicConfig doesn't crash on a
# fresh checkout where the directory doesn't exist yet.
os.makedirs('output', exist_ok=True)

logging.basicConfig(
    filename=os.path.join('output', 'create_yearly_report.txt'),
    level=logging.INFO,
    filemode='w',
    format='%(message)s')
logger = logging.getLogger()

# Mirror everything logged to the file onto the console as well.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console)

full_data_file = 'crimes_full.csv'
partial_data_file = ''

# The only columns kept from the 22-column raw export (see module
# docstring); everything else is culled at read time to save memory.
data_columns = ['Date', 'IUCR', 'Primary Type', 'Location Description',
                'Arrest', 'Domestic', 'Latitude', 'Longitude']


def print_first_fifty(data_file, num_lines=50):
    '''Prints the first num_lines lines of a large csv.

    This was useful in creating the rest: eliminated the need to
    load_crime_csv 1.4G during debug.

    data_file -- path to the csv to peek at
    num_lines -- how many lines to print (default 50)
    '''
    with open(data_file, 'r') as f:
        for _ in range(num_lines):
            line = f.readline()
            if not line:
                # Stop at EOF rather than printing blank lines
                # when the file is shorter than num_lines.
                break
            print(line, end='')
#print_first_fifty(data_file=partial_data_file)


def load_crime_csv(data_file):
    '''Loads the raw crime csv, keeping only data_columns.

    Returns a pandas DataFrame.
    '''
    dF = pd.read_csv(data_file, usecols=data_columns)
    return dF


def save_data_by_years(dF):
    '''Writes one output/<year>.csv per calendar year present in dF.

    dF -- DataFrame whose 'Date' column has already been converted
          to datetime (groups on Date.dt.year).

    Logs the per-year row counts.  A single groupby pass replaces the
    old hard-coded range(2001, 2020) loop, so every year actually in
    the data is written -- no more "see hard coded values" surprises --
    and no empty placeholder files are produced.
    '''
    logger.info('Data Points per Year')
    for year, year_data in dF.groupby(dF['Date'].dt.year):
        logger.info('%d %d' % (year, year_data.shape[0]))
        year_data.to_csv(
            'output/{}.csv'.format(year),
            index=False)


if __name__ == '__main__':
    print('Processing Raw Crime Data\n')
    print('If Crashes, then Need File Location Most Likely')
    data_file = './input/crimes_full.csv'

    dF = load_crime_csv(data_file)
    start = dF.shape[0]
    print('CSV LOADED: %d' % start)

    dF = dF.dropna(axis=0, how='any')
    finish = dF.shape[0]
    print('After Dropped Nulls: %d' % finish)

    # Input Time format example: 06/04/2001 05:00:00 AM
    date_format = '%m/%d/%Y %I:%M:%S %p'
    dF['Date'] = pd.to_datetime(dF['Date'], format=date_format)

    logger.info('Data Points: RAW = %d' % start)
    logger.info('Data Points: Non Nulls = %d' % finish)
    # Guard against an empty input file (start == 0).
    ratio = float(finish) / float(start) if start else 0.0
    logger.info('Ratio Preserved: %f' % ratio)
    logger.info('\n')

    save_data_by_years(dF)

    logger.info('\n')
    logger.info('Raw Crime Data Processed')