Lesson3 — DataLab

Lesson3

Beta

# Start coding here... 
import pandas as pd
import numpy as np
# Load the dietary-habits survey and keep only the four questions analysed
# below. The mapping pairs each original CSV header with the short column name
# used in the rest of the notebook; rows with a missing answer in any of the
# kept columns are dropped.
file = 'Dietary Habits Survey Data.csv'
column_names = {
    'Age': 'Age',
    'Gender': 'Gender',
    'How many meals do you have a day? (number of regular occasions in a day when a significant and reasonably filling amount of food is eaten)': 'meals_per_day',
    'How many times a week do you order-in or go out to eat?': 'eat_out_per_wk',
}
df = (
    pd.read_csv(file)[list(column_names)]
    .dropna()
    .rename(columns=column_names)
)
df

# Frequency distribution
# Share of respondents per meals-per-day answer (normalize=True gives
# proportions instead of raw counts).
df['meals_per_day'].value_counts(normalize=True).to_frame()
#df.Gender.value_counts().plot(kind='pie')
# Raw number of respondents per meals-per-day answer.
df['meals_per_day'].value_counts()#.plot(kind='hist')
# to_frame returns a one-column DataFrame; the column is named 'Number' because
# it holds the number of respondents per gender, and the index is the Gender.
gender_counts = df['Gender'].value_counts().to_frame('Number')
#print(gender_counts)
gender_counts.plot(kind='bar')
# From the table and the plot, we can see that slightly more men than women took part in the study.
#df.Age.value_counts().to_frame("Number").plot(kind='bar')

# Measures of central tendency
# numeric_only=True restricts mean/median to the numeric columns. The frame
# also carries Gender (presumably text labels), and recent pandas versions
# raise TypeError on non-numeric columns instead of silently skipping them.
print(df.mean(numeric_only=True))  # the average of each numeric column
print(df.median(numeric_only=True))  # value exactly in the middle of each numeric column
df.mode()  # most popular response(s) in each column, numeric or not

df.mean(numeric_only=True)  # calculates the average of each numeric column; by default the mean is calculated over columns

df.median(numeric_only=True)  # calculates the value exactly in the middle of the data in each numeric column; by default the median is calculated over columns

df.describe()  # generates descriptive statistics for numeric columns by default; missing values are automatically excluded. Note that the mean is reported here, and the 50th percentile is the same as the median.

# Measures of variability
meals = df['meals_per_day']
eat_out = df['eat_out_per_wk']

# Range of meals eaten per day (largest answer minus smallest answer).
print(meals.max() - meals.min())
# The lowest number of meals per day reported is 3 meals apart from the highest number of meals per day reported.

# Range of meals eaten out or ordered in per week.
range_value = eat_out.max() - eat_out.min()
print(range_value)

# Standard deviation of meals eaten out or ordered in per week.
std_dev_value = eat_out.std()
print(std_dev_value)

# Standard deviation of meals per day.
meals.std()
# On average, each meal value deviates from the mean by 0.65 points. There isn't much variability within the dataset.

# Variance of meals per day.
meals.var()
# The presence of variance shows that the survey has different responses to this question; however, the degree of scatter or variability among the observations is quite low.

#df.plot(kind='scatter',x=)

import pandas as pd
import scipy.stats as stats

# Load the young-people survey, keeping only the demographic and body
# measurement columns used for the z-score analysis.
file = 'young_people.csv'
kept_columns = ['Gender', 'Age', 'Height', 'Weight']
df = pd.read_csv(file)[kept_columns]
df.head()

# Z-scores of every numeric column, first 20 rows, for a quick look.
# nan_policy='omit' computes the mean and standard deviation from the
# non-missing values only. With the default ('propagate'), a single missing
# value turns the whole column's z-scores into NaN, which would silently make
# the outlier filter below return nothing. Results are unchanged when the
# column has no missing values.
df1 = df.select_dtypes(include='number').apply(stats.zscore, nan_policy='omit').head(20)

# Add one <column>_zscore column per measurement.
for column in ('Age', 'Height', 'Weight'):
    df[column + '_zscore'] = stats.zscore(df[column], nan_policy='omit')

# Outliers: rows whose age lies three or more standard deviations from the mean
# in either direction.
df2 = df[df.Age_zscore.abs() >= 3]
#df.Age_zscore.plot(kind='hist')
print(df.Age.mean())
#df2.Age.mean()
df2#.Age.unique()#2
# Rows with an age of 29 or 30.
df[df.Age.isin([29, 30])]

# Plot the distribution of values in the Age column using a histogram.
df.Age.plot(kind='hist')

import pandas as pd
import matplotlib.pyplot as plt
# Load the World Happiness 2019 table and preview the first 20 rows.
file = 'World_Happiness_2019.csv'
df = pd.read_csv(file)
df.head(20)
#df.corr()

# Relationship between social support and healthy life expectancy: a scatter
# plot for the visual check, then the correlation coefficient between the two
# columns (Series.corr uses Pearson's method by default).
social_support = df['Social support']
life_expectancy = df['Healthy life expectancy']
plt.scatter(social_support, life_expectancy)
social_support.corr(life_expectancy)

#Helliwell, J., Layard, R., & Sachs, J. (2019). World Happiness Report 2019, New York: Sustainable Development Solutions Network.
#The 7th World Happiness Report presents the available global data on national happiness, showing how the quality of people's lives can be assessed by a variety of subjective well-being measures. In this particular edition, the report examines the links between government and happiness.

‌
‌
‌