Beta
Table of Contents
The outline of your notebook will show up here. You can include headings in any text cell by starting a line with #, ##, ###, etc., depending on the desired title hierarchy.
import pandas as pd
from matplotlib import pyplot as plt
# Load "bureau.csv" from the working directory into a DataFrame and print
# its summary (DataFrame.info()).
bureau_csv_path = r"bureau.csv"
df = pd.read_csv(bureau_csv_path)
df.info()
Hidden output
# Remove two columns from df in place (this mutates the loaded frame).
df.drop(columns=["AMT_ANNUITY", "CNT_CREDIT_PROLONG"], inplace=True)

# Every column that is not listed as categorical is treated as numerical.
categorical_columns = ["CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE"]
numerical_cols = df.columns.difference(categorical_columns)
numerical_cols

# Select the numerical columns and discard SK_ID_BUREAU / SK_ID_CURR
# (presumably row identifiers -- confirm against the data dictionary).
# drop() without inplace returns a new frame, so nothing is mutated on a
# selection taken from df; an in-place drop on such a selection can trigger
# pandas' SettingWithCopyWarning.
bureau_numerical = df[numerical_cols].drop(columns=["SK_ID_BUREAU", "SK_ID_CURR"])
bureau_numerical.head()
import numpy as np
# Flag outliers in each numerical column with the 1.5 * IQR rule and print
# the column name, upper fence, lower fence and number of flagged values.
for column in bureau_numerical.columns:
    values = bureau_numerical[column]
    # np.nanpercentile expects percentiles on a 0-100 scale: 25 and 75 are
    # the quartiles (0.25 / 0.75 would be the 0.25th and 0.75th percentiles).
    q1, q3 = np.nanpercentile(values, [25, 75])
    iqr = q3 - q1
    maxim = q3 + 1.5 * iqr
    minim = q1 - 1.5 * iqr
    # NaN compares False on both sides, so missing values are never counted.
    outliers = np.where((values > maxim) | (values < minim), 1, 0)
    print(column, maxim, minim, outliers.sum())
# MICE imputer (more robust but computationally expensive)
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Work on a deep copy so the imputed values never overwrite bureau_numerical.
bureau_numerical_imputed = bureau_numerical.copy(deep=True)

# IterativeImputer uses its own random_state only for feature sub-sampling,
# random imputation order and posterior sampling; it does not seed the
# estimator it is given. ExtraTreesRegressor is itself randomized, so it gets
# the same seed explicitly -- otherwise repeated runs impute different values.
ii_imp = IterativeImputer(
    estimator=ExtraTreesRegressor(random_state=1121218),
    max_iter=10,
    random_state=1121218,
)

# Fit and impute in one step. fit_transform returns a plain array; assigning
# it through .loc[:, :] keeps the DataFrame's index and column labels.
bureau_numerical_imputed.loc[:, :] = ii_imp.fit_transform(bureau_numerical_imputed)
Hidden output