Lesson3
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner
    # Start coding here... 
    import pandas as pd
    import numpy as np
    # --- Dietary habits survey: frequency tables, central tendency, variability ---
    file = 'Dietary Habits Survey Data.csv'
    # Map the verbose survey questions to short column names. Selecting and
    # renaming by name (rather than assigning df.columns positionally) keeps
    # every label attached to the right question if the selection is edited.
    survey_columns = {
        'Age': 'Age',
        'Gender': 'Gender',
        'How many meals do you have a day? (number of regular occasions in a day when a significant and reasonably filling amount of food is eaten)': 'meals_per_day',
        'How many times a week do you order-in or go out to eat?': 'eat_out_per_wk',
    }
    df = (
        pd.read_csv(file)[list(survey_columns)]
        .dropna()  # discard respondents with a missing answer in any kept column
        .rename(columns=survey_columns)
    )
    df

    # Frequency distribution
    # Share of respondents per reported number of meals a day.
    df.meals_per_day.value_counts(normalize=True).to_frame()
    #df.Gender.value_counts().plot(kind='pie')
    # Absolute counts per reported number of meals a day.
    df.meals_per_day.value_counts()#.plot(kind='hist')
    #print(df.Gender.value_counts().to_frame('Number'))
    df.Gender.value_counts().to_frame('Number').plot(kind='bar')
    # to_frame returns a one-column DataFrame; the column is named 'Number'
    # because it holds the number of respondents, and the index is the Gender.
    # From the table and the plot, we can see that slightly more men than women
    # took part in the study.
    #df.Age.value_counts().to_frame("Number").plot(kind='bar')

    # Measures of central tendency
    # numeric_only=True is required: the frame still contains the text column
    # Gender, and pandas >= 2.0 raises TypeError instead of silently skipping
    # non-numeric columns when computing mean/median.
    print(df.mean(numeric_only=True))  # the average of each numeric column
    print(df.median(numeric_only=True))  # value exactly in the middle of each numeric column
    df.mode()  # most popular response in each column (works for text columns too)
    df.mean(numeric_only=True)  # by default the mean is calculated per column
    df.median(numeric_only=True)  # by default the median is calculated per column
    # describe() generates descriptive statistics for numeric columns by
    # default; missing values are automatically excluded. The mean is listed,
    # and the 50th percentile is the same as the median.
    df.describe()

    # Measures of variability
    # Range of meals eaten per day.
    print(df.meals_per_day.max() - df.meals_per_day.min())
    # The lowest number of meals per day reported is 3 meals apart from the
    # highest number of meals per day reported.
    range_value = df.eat_out_per_wk.max() - df.eat_out_per_wk.min()
    print(range_value)

    # Find the standard deviation of meals eaten out or ordered
    std_dev_value = df.eat_out_per_wk.std()
    print(std_dev_value)
    # Standard deviation (pandas uses the sample formula, ddof=1, by default).
    df.meals_per_day.std()
    # On average, each meal value deviates from the mean by 0.65 points. There
    # isn't much variability within the dataset.
    # Variance
    df.meals_per_day.var()
    # The presence of variance shows that the survey has different responses to
    # this question; however, the degree of scatter or variability among the
    # observations is quite low.
    #df.plot(kind='scatter',x=)
    import pandas as pd
    import scipy.stats as stats
    
    # --- Young people survey: z-scores and outlier detection on Age ---
    file = 'young_people.csv'
    df = pd.read_csv(file)[['Gender', 'Age', 'Height', 'Weight']]
    df.head()

    # z-scores of every numeric column, first 20 rows only (quick preview).
    # nan_policy='omit' ignores missing values when computing the mean and the
    # standard deviation; with SciPy's default ('propagate') a single NaN in a
    # column would turn every z-score of that column into NaN.
    df1 = (
        df.select_dtypes(include='number')
        .apply(stats.zscore, nan_policy='omit')
        .head(20)
    )

    # Add one '<column>_zscore' column per measurement. Rows whose original
    # value is missing keep NaN as their z-score.
    for column in ('Age', 'Height', 'Weight'):
        df[f'{column}_zscore'] = stats.zscore(df[column], nan_policy='omit')

    # Outliers: respondents whose age lies at least 3 standard deviations away
    # from the mean, in either direction.
    df2 = df[df.Age_zscore.abs() >= 3]
    #df.Age_zscore.plot(kind='hist')
    print(df.Age.mean())
    #df2.Age.mean()
    df2#.Age.unique()#2
    # Inspect the respondents aged 29 or 30.
    df[(df.Age == 29) | (df.Age == 30)]
    # Plot the distribution of values in the Age column using a histogram.
    df.Age.plot(kind='hist')
    import pandas as pd
    import matplotlib.pyplot as plt
    # --- World Happiness 2019: social support vs. healthy life expectancy ---
    file = 'World_Happiness_2019.csv'
    df = pd.read_csv(file)
    df.head(20)  # preview the first 20 rows
    #df.corr()
    # Name the two columns of interest once and reuse them below.
    social_support = df['Social support']
    life_expectancy = df['Healthy life expectancy']
    # Scatter plot: social support on the x-axis, life expectancy on the y-axis.
    plt.scatter(social_support, life_expectancy)
    # Correlation coefficient between the two columns (Series.corr uses the
    # Pearson method unless told otherwise).
    social_support.corr(life_expectancy)
    #Helliwell, J., Layard, R., & Sachs, J. (2019). World Happiness Report 2019, New York: Sustainable Development Solutions Network.
    #The 7th World Happiness Report presents the available global data on national happiness, showing how the quality of people's lives can be assessed by a variety of subjective well-being measures. In this particular edition, the report examines the links between government and happiness.