Duplicate of Sample Data Analyst Associate Solution
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner
    import pandas as pd
    from matplotlib import pyplot as plt
    # Read the bureau CSV (path is relative to the working directory) into a
    # DataFrame, then print pandas' summary of the frame.
    bureau_csv_path = r"bureau.csv"
    df = pd.read_csv(bureau_csv_path)
    df.info()
    Hidden output
    # Remove two columns from the dataset. `df` is modified in place so any
    # other reference to the same object sees the change as well.
    df.drop(columns=["AMT_ANNUITY", "CNT_CREDIT_PROLONG"], inplace=True)

    # Every column that is not listed as categorical is treated as numerical
    # (at this point that still includes the two SK_ID_* columns).
    categorical_columns = ["CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE"]
    numerical_cols = df.columns.difference(categorical_columns)
    numerical_cols

    # Build the numerical working frame without the SK_ID_* columns.
    # A non-inplace `drop` returns a new DataFrame, which avoids the
    # SettingWithCopyWarning that an in-place drop can raise on a frame that
    # was itself selected out of `df`.
    bureau_numerical = df[numerical_cols].drop(
        columns=["SK_ID_BUREAU", "SK_ID_CURR"]
    )
    bureau_numerical.head()
    import numpy as np
    # Flag outliers in every numerical column with the 1.5 * IQR rule and
    # print, per column: name, upper fence, lower fence, number of outliers.
    for column in bureau_numerical.columns:
        values = bureau_numerical[column]

        # np.nanpercentile takes percentiles on a 0-100 scale, so the first
        # and third quartiles are 25 and 75 (0.25 / 0.75 would ask for the
        # 0.25th and 0.75th percentiles). NaN entries are ignored.
        q1 = np.nanpercentile(values, 25)
        q3 = np.nanpercentile(values, 75)
        iqr = q3 - q1
        maxim = q3 + 1.5 * iqr
        minim = q1 - 1.5 * iqr

        # 1 marks a value outside the fences, 0 everything else. NaN compares
        # False on both sides, so missing values are never counted.
        outliers = np.where((values > maxim) | (values < minim), 1, 0)

        # ndarray.sum() is vectorised; the builtin sum() would walk the array
        # one element at a time in Python.
        print(column, maxim, minim, outliers.sum())
    # MICE imputer (more robust, but computationally expensive)
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    
    # Work on an independent copy so the un-imputed frame stays available.
    bureau_numerical_imputed = bureau_numerical.copy()

    # Iterative imputer that uses an extra-trees regressor as its estimator,
    # limited to 10 iterations, with a fixed random_state.
    tree_estimator = ExtraTreesRegressor()
    ii_imp = IterativeImputer(
        estimator=tree_estimator,
        max_iter=10,
        random_state=1121218,
    )

    # Fit on the copy, then write the returned array back into the same
    # cells so the frame keeps its index and column labels.
    imputed_values = ii_imp.fit_transform(bureau_numerical_imputed)
    bureau_numerical_imputed.loc[:, :] = imputed_values
    Hidden output