''' Created on Oct 24, 2017
@author: Brett Paufler
Copyright Brett Paufler

Given a very large .csv file
    Source: City of Chicago
    Data: All Crimes 2001 to present
Breaks data up by year
Culls unneeded and outputs Yearly .csv files
Saves to output in programs/eclipse/crime

As Follows is 2017-11-05: create_yearly_report
When I ran

Data Points: RAW = 6385918
Data Points: Non Nulls = 6303034
Ratio Preserved: 0.987021

Data Points per Year
2001 482878
2002 471522
2003 472037
2004 467195
2005 449920
2006 445542
2007 435685
2008 420030
2009 386028
2010 368612
2011 350660
2012 334426
2013 304251
2014 269323
2015 259592
2016 250491
2017 134842

Crime Codes Downloadable (do a search)
Chicago Police Department Illinois Uniform Crime

The Original Pandas DataFrame Structure
Prior to (I believe, it's been a few years)
Splitting To Years or Reducing Data Content

RangeIndex: 6385918 entries, 0 to 6385917
Data columns (total 22 columns):
ID                      int64
Case Number             object
Date                    object
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    int64
District                float64
Ward                    float64
Community Area          float64
FBI Code                object
X Coordinate            float64
Y Coordinate            float64
Year                    int64
Updated On              object
Latitude                float64
Longitude               float64
Location                object
dtypes: bool(2), float64(7), int64(3), object(10)
memory usage: 986.6+ MB

My Yearly Crime files clock in at +/-600mb
So, that's a 40% reduction
'''
import logging
import os

import pandas as pd

# The log file (and the per-year csv files) live in ./output --
# create it up front so logging.basicConfig doesn't crash on a
# fresh checkout where the directory doesn't exist yet.
os.makedirs('output', exist_ok=True)

logging.basicConfig(
    filename=os.path.join('output', 'create_yearly_report.txt'),
    level=logging.INFO,
    filemode='w',
    format='%(message)s')
logger = logging.getLogger()

# Mirror everything logged to the file onto the console as well.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console)

full_data_file = 'crimes_full.csv'
partial_data_file = ''

# The only columns kept from the 22-column raw export (see module
# docstring); everything else is culled at read time to save memory.
data_columns = ['Date', 'IUCR', 'Primary Type', 'Location Description',
                'Arrest', 'Domestic', 'Latitude', 'Longitude']


def print_first_fifty(data_file, num_lines=50):
    '''Prints the first num_lines lines of a large csv.

    This was useful in creating the rest: eliminated the need to
    load_crime_csv 1.4G during debug.

    data_file -- path to the csv to peek at
    num_lines -- how many lines to print (default 50)
    '''
    with open(data_file, 'r') as f:
        for _ in range(num_lines):
            line = f.readline()
            if not line:
                # Stop at EOF rather than printing blank lines
                # when the file is shorter than num_lines.
                break
            print(line, end='')
#print_first_fifty(data_file=partial_data_file)


def load_crime_csv(data_file):
    '''Loads the raw crime csv, keeping only data_columns.

    Returns a pandas DataFrame.
    '''
    dF = pd.read_csv(data_file, usecols=data_columns)
    return dF


def save_data_by_years(dF):
    '''Writes one output/<year>.csv per calendar year present in dF.

    dF -- DataFrame whose 'Date' column has already been converted
          to datetime (groups on Date.dt.year).

    Logs the per-year row counts.  A single groupby pass replaces the
    old hard-coded range(2001, 2020) loop, so every year actually in
    the data is written -- no more "see hard coded values" surprises --
    and no empty placeholder files are produced.
    '''
    logger.info('Data Points per Year')
    for year, year_data in dF.groupby(dF['Date'].dt.year):
        logger.info('%d %d' % (year, year_data.shape[0]))
        year_data.to_csv(
            'output/{}.csv'.format(year),
            index=False)


if __name__ == '__main__':
    print('Processing Raw Crime Data\n')
    print('If Crashes, then Need File Location Most Likely')
    data_file = './input/crimes_full.csv'

    dF = load_crime_csv(data_file)
    start = dF.shape[0]
    print('CSV LOADED: %d' % start)

    dF = dF.dropna(axis=0, how='any')
    finish = dF.shape[0]
    print('After Dropped Nulls: %d' % finish)

    # Input Time format example: 06/04/2001 05:00:00 AM
    date_format = '%m/%d/%Y %I:%M:%S %p'
    dF['Date'] = pd.to_datetime(dF['Date'], format=date_format)

    logger.info('Data Points: RAW = %d' % start)
    logger.info('Data Points: Non Nulls = %d' % finish)
    # Guard against an empty input file (start == 0).
    ratio = float(finish) / float(start) if start else 0.0
    logger.info('Ratio Preserved: %f' % ratio)
    logger.info('\n')

    save_data_by_years(dF)

    logger.info('\n')
    logger.info('Raw Crime Data Processed')