Eric Manuel Villegas Gómez














Sign up
Importing Data in Python
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Introduction to Importing Data in Python

    Run the hidden code cell below to import the data used in this course.

    Hidden code

    Pasos para Importar datos en Python

    Previo Comandos para visualizar datos

    1. Importar Flats o CSV ! Acceso completo

    ! ls #Directorio actual

    Ejemplo 1 (NumPy): datos en archivo plano (flat file), separados por un delimitador (tabulador en seaslug.txt).

    Ejemplos 2 y 3 (pandas): para extraer datos, lo estándar es usar pandas.

    1. Importar Exceles
    2. Importar SAS
    3. Importar Stata Files
    4. HDF5 Files
    5. Matlab Files
    6. SQL
    7. Importar SQL con una linea con Pandas
    8. Importar y unir tablas con SQL
    # Example 1: load a tab-delimited flat file with NumPy and plot it.
    import numpy as np

    # Path of the flat file to read.
    file = 'datasets/seaslug.txt'

    # First pass: read every field as a string: data
    data = np.loadtxt(file, dtype=str, delimiter='\t')

    # Show the first row of the string array.
    print(data[0])

    # Second pass: parse the values as floats, skipping the first row
    # (presumably a header line -- confirm against the file): data_float
    data_float = np.loadtxt(file, skiprows=1, dtype=float, delimiter='\t')

    # Show the row at index 9, i.e. the 10th row.
    print(data_float[9])

    # Scatter the first column (x) against the second column (y).
    # NOTE(review): `plt` is not imported in this snippet; presumably
    # matplotlib.pyplot is provided by the hidden setup cell -- confirm.
    plt.scatter(x=data_float[:, 0], y=data_float[:, 1])
    plt.xlabel('time (min.)')
    plt.ylabel('percentage of larvae')
    plt.show()
    # Example 2: read the top of a CSV file with pandas and expose it as NumPy.
    # NOTE(review): `pd` is not imported in this snippet; presumably pandas is
    # provided by the hidden setup cell -- confirm.

    # Path of the CSV file to read.
    file = 'datasets/titanic_sub.csv'

    # Load only the first five rows; header=None means no row is treated as
    # column names: data
    data = pd.read_csv(file, nrows=5, header=None)

    # NumPy array backing the DataFrame: data_array
    data_array = data.values

    # Show the type of data_array.
    print(type(data_array))
    # Example 3: tab-delimited .txt file read with pandas (left commented out).
    # Assign filename: file
    #file = 'titanic_corrupt.txt'
    
    # Import file: data
    # (sep='\t' -> tab-delimited; comment='#' -> text after '#' is ignored;
    #  na_values='Nothing' -> the string 'Nothing' is read as NaN)
    #data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')
    
    # Print the head of the DataFrame
    #print(data.head())
    
    # Plot 'Age' variable in a histogram
    #pd.DataFrame.hist(data[['Age']])
    #plt.xlabel('Age (years)')
    #plt.ylabel('count')
    #plt.show()
    

    # Import Excel spreadsheets with pandas.
    # NOTE(review): `pd` is not imported in this snippet; presumably pandas is
    # provided by the hidden setup cell -- confirm.

    # Assign spreadsheet filename: file
    file = 'datasets/battledeath.xlsx'

    # Load spreadsheet: xls
    # Opened in a `with` block so the workbook's file handle is closed once all
    # sheets have been parsed (the DataFrames remain usable afterwards).
    with pd.ExcelFile(file) as xls:
        # Print sheet names
        print(xls.sheet_names)

        # Load a sheet into a DataFrame by name: df1
        df1 = xls.parse('2004')

        # Print the head of the DataFrame df1
        print(df1.head())

        # Load a sheet into a DataFrame by index: df2
        df2 = xls.parse(0)

        # Print the head of the DataFrame df2
        print(df2.head())

        # Parse the first sheet again, skipping row 0 and renaming the
        # columns: df1
        df1 = xls.parse(0, skiprows=[0], names=['Country','AAM due to War (2002)'])

        # Print the head of the DataFrame df1
        print(df1.head())

        # Parse only the first column of the second sheet, skipping row 0 and
        # renaming the column: df2
        df2 = xls.parse(1, usecols=[0], skiprows=[0], names=['Country'])

        # Print the head of the DataFrame df2
        print(df2.head())

    Importar SAS

    # Read a SAS dataset through the sas7bdat package.
    from sas7bdat import SAS7BDAT

    # Open the SAS file as a context manager and convert it to a
    # DataFrame: df_sas
    with SAS7BDAT('datasets/sales.sas7bdat') as file:
        df_sas = file.to_data_frame()

    # Preview the first rows of the DataFrame.
    print(df_sas.head())

    # Histogram of column 'P' (pyplot is expected to be imported already as plt).
    df_sas[['P']].hist()
    plt.ylabel('count')
    plt.show()

    Importar Stata

    # Read a Stata file with pandas.
    import pandas as pd

    # Load the .dta file into a DataFrame: df
    df = pd.read_stata('datasets/disarea.dta')

    # Preview the first rows of the DataFrame.
    print(df.head())

    # Histogram of the 'disa10' column.
    # NOTE(review): `plt` is not imported in this snippet; presumably
    # matplotlib.pyplot is provided by an earlier cell -- confirm.
    df[['disa10']].hist()
    plt.xlabel('Extent of disease')
    plt.ylabel('Number of countries')
    plt.show()

    Importar HDF5

    # Read an HDF5 file with h5py and plot part of its strain time series.

    # Import packages
    import numpy as np
    import h5py

    # Assign filename: file
    file = 'datasets/L-L1_LOSC_4_V1-1126259446-32.hdf5'

    # Load file: data
    # Opened in a `with` block so the HDF5 handle is closed when reading is
    # done; everything needed later is copied into memory before that.
    with h5py.File(file, 'r') as data:
        # Print the datatype of the loaded file
        print(type(data))

        # Print the keys of the file
        for key in data.keys():
            print(key)

        # Get the HDF5 group: group
        group = data['strain']

        # Check out keys of group
        for key in group.keys():
            print(key)

        # Copy the time series into a NumPy array so it stays valid after
        # the file is closed: strain
        strain = np.array(group['Strain'])

    # Set number of time points to sample: num_samples
    num_samples = 10000

    # Set time vector: num_samples evenly spaced points covering [0, 1)
    time = np.arange(0, 1, 1/num_samples)

    # Plot the first num_samples points of the series.
    # NOTE(review): `plt` is not imported in this snippet; presumably
    # matplotlib.pyplot is provided by an earlier cell -- confirm.
    plt.plot(time, strain[:num_samples])
    plt.xlabel('GPS Time (s)')
    plt.ylabel('strain')
    plt.show()