Beta
Data processing
Import modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Global seaborn theme for all plots in this analysis:
# notebook context, dark background style, fonts scaled up 20%.
sns.set_theme(context='notebook',
style='dark',
font_scale=1.2)
Read the dataset
# read the dataset 'accidents', parse 'date' and 'time' as one combined
# 'date_time' datetime column,
# drop 'accident_reference' and 'accident_year' columns as 'accident_index' column contains them both,
# set the index column
# NOTE(review): the list-of-lists form of parse_dates is deprecated in
# pandas 2.x — when upgrading, read the columns as text and combine them
# with pd.to_datetime after the read.
accidents = (pd.read_csv(r'./data/accident-data.csv',
                         parse_dates=[['date', 'time']])
             .set_index('accident_index')
             .drop(columns=['accident_reference', 'accident_year']))
# extract 'hour' column with hour values of 'date_time' column
accidents['hour'] = accidents.date_time.dt.hour
# add column 'level': 1 ('major') when an accident has 3+ casualties AND
# severity code 1, otherwise 0 ('minor').  Vectorized comparison replaces
# the original row-by-row iterrows() loop, which iterates in Python and
# is far slower for no behavioral difference.
accidents['level'] = ((accidents['number_of_casualties'] >= 3)
                      & (accidents['accident_severity'] == 1)).astype(int)
# count missing values per column (notebook-style inspection output)
accidents.isna().sum()
Read the descriptions
# read the lookup file.  'on_bad_lines' replaces the 'error_bad_lines'
# keyword, which was deprecated in pandas 1.3 and removed in 2.0; the
# behavior (silently skip malformed lines) is the same.
lookup = pd.read_csv(r'./data/road-safety-lookups.csv',
                     keep_default_na=True,
                     on_bad_lines='skip',
                     skip_blank_lines=True)
lookup = lookup.drop(columns='note').dropna()
# add description for 'minor' and 'major' values of column 'level' as rows
# in 'lookup'.  pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; both rows are added in a single concat.
level_rows = pd.DataFrame([{'table': 'Accident',
                            'field name': 'level',
                            'code/format': 0,
                            'label': 'minor'},
                           {'table': 'Accident',
                            'field name': 'level',
                            'code/format': 1,
                            'label': 'major'}])
lookup = pd.concat([lookup, level_rows], ignore_index=True)
# drop redundant items from lookup including badly coded 'speed_limit'
lookup = lookup[~lookup['field name'].isin(['accident_index',
                                            'accident_year',
                                            'accident_reference',
                                            'date', 'time',
                                            'first_road_number',
                                            'second_road_number',
                                            'speed_limit'])]
# recreate 'speed_limit' as categories in 'lookup' (codes 20..70 in steps
# of 10); build all rows at once instead of appending inside the loop
speed_rows = pd.DataFrame([{'table': 'Accident',
                            'field name': 'speed_limit',
                            'code/format': limit,
                            'label': limit}
                           for limit in range(20, 80, 10)])
lookup = pd.concat([lookup, speed_rows], ignore_index=True)
Create boolean masks for dtypes
# columns with corresponding labels in 'lookup' are grouped as 'categoricals'
categoricals = lookup['field name'].unique()
# columns that are not from categoricals are masked as 'numericals'.
# Index.isin is used directly instead of the round-trip through np.isin —
# same result, one fewer conversion.  Non-numeric / identifier columns
# are excluded explicitly.
numericals = accidents.columns[~accidents.columns.isin(categoricals)].drop(
    labels=['date_time',
            'longitude',
            'latitude',
            'first_road_number',
            'second_road_number'])
Deal with missing values
Looking at the value counts of 'junction_details' and 'junction_control' revealed that the same accidents that were labeled 0 for 'not at junction' in 'junction_control' were labeled -1 for 'missing data' in 'junction_details'. Assuming that the first is correct, we can rename the missing values to 0 in the second.
# Recode -1 ('missing data') in 'junction_control' to 0 ('not at
# junction') — see the rationale above the cell.  Bracket assignment is
# used instead of attribute assignment, which pandas discourages for
# setting columns.
accidents['junction_control'] = accidents['junction_control'].replace(-1, 0)
# Impute missing 'speed_limit' (-1) as 30 — presumably the most common
# limit in the data; TODO confirm this imputation choice.
accidents['speed_limit'] = accidents['speed_limit'].replace(-1, 30)
# Drop every row that still carries a -1 'missing data' code in any
# column (the redundant no-op `.iloc[:, :]` slice is removed).
accidents = accidents[(accidents != -1).all(axis=1)]
Write a function to decode categories
def decode_categorical(df, categorical):
    '''
    Take categorical column from DataFrame df and return
    decoded values using codes with corresponding labels from lookup DataFrame.
    df: (DataFrame)
    categorical: (string)
    '''
    # rows of the module-level 'lookup' table describing this field
    field_rows = lookup['field name'] == categorical
    labels = lookup.loc[field_rows, 'label'].values
    codes = lookup.loc[field_rows, 'code/format'].astype('int').values
    # ordered categorical dtype keyed by the integer codes, then the
    # codes are renamed to their human-readable labels
    coded_dtype = pd.CategoricalDtype(codes, ordered=True)
    decoded = pd.Categorical(df[categorical].values, dtype=coded_dtype)
    return decoded.rename_categories(labels)
Subset the data using dtype masks, merge and decode categorical variables