From Knowing to Saving Lives: Learning to Reduce High Fatality Traffic Accidents
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Data processing

    Import modules

    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    sns.set_theme(context='notebook', 
                  style='dark', 
                  font_scale=1.2)

    Read the dataset

    # read the dataset 'accidents', parse 'date' and 'time' as datetime objects, 
    # drop 'accident_reference' and 'accident_year' columns as 'accident_index' column contains them both, 
    # set the index column
    accidents = (pd.read_csv(r'./data/accident-data.csv',
                             parse_dates=[['date', 'time']])
                   .set_index('accident_index')
                   .drop(columns=['accident_reference', 'accident_year']))
    
    # extract 'hour' column with hour values of 'date_time' column
    accidents['hour'] = accidents.date_time.dt.hour
    
    # add column 'level' that indicates if the accident is major or minor
    accidents['level'] = [1 if (accident['number_of_casualties'] >= 3)
                             & (accident['accident_severity'] == 1) 
                            else 0
                          for index, accident in accidents.iterrows()]
    accidents.isna().sum()

    Read the discriptions

    # read the lookup file
    lookup = pd.read_csv(r'./data/road-safety-lookups.csv',
                         keep_default_na=True, 
                         error_bad_lines=False, 
                         skip_blank_lines=True)
    
    lookup = lookup.drop(columns='note').dropna()
    
    # add discription for 'minor' and 'major' values of column 'level' as rows in 'lookup' DataFrame
    lookup = lookup.append({'table': 'Accident', 
                            'field name': 'level', 
                            'code/format': 0, 
                            'label': 'minor'}, 
                           ignore_index=True)
    
    lookup = lookup.append({'table': 'Accident', 
                            'field name': 'level', 
                            'code/format': 1, 
                            'label': 'major'}, 
                           ignore_index=True)
    
    # drop redundant items from lookup including badly coded 'speed_limit'
    lookup = lookup[~lookup['field name'].isin(['accident_index',
                                                'accident_year',
                                                'accident_reference', 
                                                'date', 'time', 
                                                'first_road_number', 
                                                'second_road_number',
                                                'speed_limit'])]
    
    # recreate 'speed_limit' as categories in 'lookup'
    for limit in range(20, 80, 10):
        lookup = lookup.append({'table': 'Accident',
                                'field name': 'speed_limit',
                                'code/format': limit,
                                'label': limit},
                               ignore_index=True)

    Create boolean masks for dtypes

    # columns with corresponding labels in 'lookup' are grouped as 'categoricals'
    categoricals = lookup['field name'].unique()
    
    # columns that are not from categoricals are masked as 'numericals'
    numericals = accidents.columns[~np.isin(accidents.columns, 
                                            categoricals)].drop(
                                                labels=['date_time', 
                                                        'longitude', 
                                                        'latitude', 
                                                        'first_road_number', 
                                                        'second_road_number'])

    Deal with missing values

    Looking at value counts of 'junction_details' and 'junction_control' revealed that same accidents that were labeled 0 for 'not at junction' in 'junction_control' were labeled -1 for 'missing data' in 'junction_details'. Assuming that the first is correct we can rename the missing values to 0 in the second.

    accidents.junction_control = accidents.junction_control.replace(-1, 0)
    accidents.speed_limit = accidents.speed_limit.replace(-1, 30)
    accidents = accidents[(accidents.iloc[:, :] != -1).all(axis=1)]

    Write a function to decode categories

    def decode_categorical(df, categorical):
        '''
        Take categorical column from DataFrame df and return 
        decoded values using codes with corresponding labels from lookup DataFrame.
        
        df: (DataFrame)
        categorical: (string)
        '''
        labels = lookup['label'].loc[
                    lookup['field name'] == categorical].values
        
        code_format = lookup['code/format'].loc[
                        lookup['field name'] == categorical].astype('int').values
       
        dtype = pd.CategoricalDtype(code_format, 
                                    ordered=True)
        
        decoded = pd.Categorical(df[categorical].values,
                                 dtype=dtype)
        
        return decoded.rename_categories(labels)

    Subset the data using dtype masks, merge and decode categorical variables