<Python> Data Wrangling (Demo)
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Clean multilevel table

    import sys
    import json
    import ast # handle single/double quote in dict/json string
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    pd.set_option('display.max_rows', None)
    %matplotlib inline
    %config InlineBackend.figure_format='retina'
    df = pd.read_excel('https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx')
    df
    df = df.dropna(subset=['Product Name']).reset_index(drop=True) # drop Null-values in all columns by rows
    df
    df.info()
    df.loc[0] # this is what we don't like
    df.loc[1] # this is what we want
    np.isnan(df.loc[0, 'Calories'])
    np.isnan(df.loc[1, 'Calories'])
    df['Category'] = df.apply(lambda r: r['Product Name'] if np.isnan(r['Calories']) else np.nan, axis=1)
    df
    df['Category'] = df['Category'].ffill()
    df
    df = df.dropna(subset=['Calories']).reset_index(drop=True)
    df
    df.columns
    df = df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',
           'Saturated  Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',
           'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',
           'Iron (%DV)']]
    df
    
    # df.to_excel('bakery_clean.xlsx', index=False)
    df.groupby('Category').describe()