Beta
Introduction to Importing Data in Python
Run the hidden code cell below to import the data used in this course.
Hidden code
Pasos para Importar datos en Python
Previo Comandos para visualizar datos
- Importar Flats o CSV ! Acceso completo
! ls #Directorio actual
Ejemplo 1 Numpy: Datos Flat. Divididos por comas.
Ejemplos 2 y 3 (pandas): para extraer datos, lo estándar es usar pandas.
- Importar archivos Excel
- Importar SAS
- Importar Stata Files
- HDF5 Files
- Matlab Files
- SQL
- Importar SQL con una linea con Pandas
- Importar y unir tablas con SQL
# Example 1: read a tab-delimited flat file with NumPy
import numpy as np

# Location of the sea slug data file
file = 'datasets/seaslug.txt'

# Read every field as text, then show the first row of the result
data = np.loadtxt(file, dtype=str, delimiter='\t')
print(data[0])

# Read the file again as floats, this time skipping the first row,
# and show the 10th row of the numeric array
data_float = np.loadtxt(file, dtype=float, skiprows=1, delimiter='\t')
print(data_float[9])

# Scatter plot: first column on the x axis, second column on the y axis.
# NOTE(review): `plt` is not imported in this cell -- presumably it is
# matplotlib.pyplot provided by an earlier (hidden) cell; confirm.
plt.scatter(x=data_float[:, 0], y=data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()
# Example 2: read the start of a CSV file with pandas
# Imported here so this cell does not depend on an earlier cell having
# brought pandas into scope.
import pandas as pd

# Path to the Titanic CSV subset
file = 'datasets/titanic_sub.csv'

# Read only the first 5 lines into a DataFrame: data.
# header=None means no line is used as column names, so the columns get
# integer labels and the file's first line is kept as a data row.
data = pd.read_csv(file, header=None, nrows=5)

# Build a NumPy array from the DataFrame: data_array.
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute.
data_array = data.to_numpy()

# Show the type of the converted object
print(type(data_array))
# Example 3 (.txt): disabled -- every statement below is commented out.
# NOTE(review): the path has no 'datasets/' prefix, unlike the other
# examples; possibly why this example was disabled -- confirm.
# Assign filename: file
#file = 'titanic_corrupt.txt'
# Import file: data (tab-separated, '#' starts a comment, 'Nothing' is NaN)
#data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')
# Print the head of the DataFrame
#print(data.head())
# Plot 'Age' variable in a histogram
#pd.DataFrame.hist(data[['Age']])
#plt.xlabel('Age (years)')
#plt.ylabel('count')
#plt.show()
# Import Excel spreadsheets
# Imported here so this cell does not depend on an earlier cell having
# brought pandas into scope.
import pandas as pd

# Path to the spreadsheet
file = 'datasets/battledeath.xlsx'

# Open the workbook in a context manager so the underlying file handle is
# closed once all sheets have been read. df1 and df2 remain available
# afterwards; xls itself is closed.
with pd.ExcelFile(file) as xls:
    # List the sheet names available in the workbook
    print(xls.sheet_names)

    # Load a sheet into a DataFrame by name: df1
    df1 = xls.parse('2004')
    print(df1.head())

    # Load a sheet into a DataFrame by position (0 = first sheet): df2
    df2 = xls.parse(0)
    print(df2.head())

    # Parse the first sheet again and rename its columns: df1.
    # header=0 marks the first row as the header that `names` replaces.
    # Do not combine this with skiprows=[0]: that skips the header row and
    # the first data row is then consumed as the header and lost.
    df1 = xls.parse(0, header=0, names=['Country', 'AAM due to War (2002)'])
    print(df1.head())

    # Parse only the first column of the second sheet and rename it: df2
    df2 = xls.parse(1, usecols=[0], header=0, names=['Country'])
    print(df2.head())
Importar SAS
# Import the SAS7BDAT reader from the sas7bdat package
from sas7bdat import SAS7BDAT

# Read the SAS file into a DataFrame: df_sas.
# The context manager closes the SAS file as soon as the frame is built.
with SAS7BDAT('datasets/sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Print head of DataFrame
print(df_sas.head())

# Plot a histogram of column 'P'. Selecting with a list keeps a DataFrame,
# so DataFrame.hist is called as a regular method.
# (pyplot is expected to be already imported as plt.)
df_sas[['P']].hist()
plt.ylabel('count')
plt.show()
Importar Stata
# Stata import: pandas reads .dta files directly
import pandas as pd

# Read the Stata file into a DataFrame (df) and preview its first rows
df = pd.read_stata('datasets/disarea.dta')
print(df.head())

# Histogram of the 'disa10' column; the list selection keeps a DataFrame.
# NOTE(review): `plt` is not imported in this cell -- presumably it is
# matplotlib.pyplot provided by an earlier cell; confirm.
df[['disa10']].hist()
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()
Importar HDF5
# Import packages
import numpy as np
import h5py

# Path to the HDF5 file
file = 'datasets/L-L1_LOSC_4_V1-1126259446-32.hdf5'

# Open the file read-only inside a context manager so it is closed once
# the needed data has been copied out.
with h5py.File(file, 'r') as data:
    # Print the datatype of the loaded file
    print(type(data))

    # Print the top-level keys of the file
    for key in data.keys():
        print(key)

    # Get the HDF5 group: group
    group = data['strain']

    # Print the keys inside the group
    for key in group.keys():
        print(key)

    # Copy the 'Strain' dataset into an in-memory array so that `strain`
    # stays usable after the file is closed.
    strain = np.array(group['Strain'])

# Number of time points to sample: num_samples
num_samples = 10000

# Time vector on [0, 1). linspace with endpoint=False yields exactly
# num_samples points; arange with a fractional step does not guarantee
# the length, which must match strain[:num_samples] for plotting.
time = np.linspace(0, 1, num_samples, endpoint=False)

# Plot the first num_samples strain values against the time vector.
# NOTE(review): `plt` is not imported in this cell -- presumably it is
# matplotlib.pyplot provided by an earlier cell; confirm.
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()