Beta
Clean multilevel table
import sys
import json
import ast # handle single/double quote in dict/json string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
%matplotlib inline
%config InlineBackend.figure_format='retina'
df = pd.read_excel('https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx')
df
df = df.dropna(subset=['Product Name']).reset_index(drop=True) # drop Null-values in all columns by rows
df
df.info()
df.loc[0] # this is what we don't like
df.loc[1] # this is what we want
np.isnan(df.loc[0, 'Calories'])
np.isnan(df.loc[1, 'Calories'])
df['Category'] = df.apply(lambda r: r['Product Name'] if np.isnan(r['Calories']) else np.nan, axis=1)
df
df['Category'] = df['Category'].ffill()
df
df = df.dropna(subset=['Calories']).reset_index(drop=True)
df
df.columns
df = df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',
'Saturated Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',
'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',
'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',
'Iron (%DV)']]
df
# df.to_excel('bakery_clean.xlsx', index=False)
df.groupby('Category').describe()