Beta
Machine learning for time series data with Python
Notes by César Muro Cabral
Time series means data that changes over time.
Time series data consists of at least two things: one, an array of numbers that represents the data itself; two, another array that contains a timestamp for each data point.
Plotting a time series (I)
In this exercise, you'll practice plotting the values of two time series without the time component.
Two DataFrames, `data` and `data2`, are available in your workspace.
Unless otherwise noted, assume that all required packages are loaded with their common aliases throughout this course.
import h5py
# NOTE(review): this handle is never explicitly closed in the visible code, and
# the name `f` is rebound by a later `with ... as f:` block. Consider dropping
# this line or opening the file in a `with` block here as well.
f = h5py.File('datasets/audio_munged.hdf5', 'r')
def print_attrs(name, obj) -> None:
    """Print an object's name followed by each of its attributes.

    Used as the callback for ``visititems`` when walking an HDF5 file, but it
    works with any object exposing a mapping-like ``attrs``.

    Args:
        name: Name (path) of the visited object; printed on its own line.
        obj: Object whose ``attrs.items()`` yields ``(key, value)`` pairs.

    Returns:
        None. (h5py's ``visititems`` keeps traversing while the callback
        returns ``None``.)
    """
    print(name)
    for key, val in obj.attrs.items():
        # One attribute per line, offset by a single space under the name.
        print(f" {key}: {val}")
# Walk every object stored in the file and print its name and attributes.
# The `with` block guarantees the file is closed once the walk finishes.
with h5py.File('datasets/audio_munged.hdf5', 'r') as hdf:
    hdf.visititems(print_attrs)
import pandas as pd
import h5py
# Load three datasets stored under `h5io/key_data` into DataFrames.
# Each dataset is materialized with `[()]` inside the `with` block, so the
# resulting DataFrames stay usable after the file has been closed.
with h5py.File('datasets/audio_munged.hdf5', 'r') as hdf:
    key_data = hdf['h5io/key_data']

    dataset_1 = key_data['axis0']
    df_1 = pd.DataFrame(dataset_1[()])

    dataset_2 = key_data['axis1']
    df_2 = pd.DataFrame(dataset_2[()])

    dataset_3 = key_data['block0_items']
    df_3 = pd.DataFrame(dataset_3[()])

# Preview the first rows of the first DataFrame.
df_1.head()
import h5py
# Open the HDF5 file in read-only mode and explore its layout.
with h5py.File('datasets/audio_munged.hdf5', 'r') as f:
    print(f['/h5io/key_data'].keys())

    # List the keys (names) of the top-level groups and datasets.
    print('Claves del archivo HDF5:')
    for key in f:
        print(key)

    # Explore each top-level entry: list a group's members, or show a
    # dataset's contents.
    for key, node in f.items():
        print('\n' + key + ':')
        if isinstance(node, h5py.Group):
            for subkey, child in node.items():
                print(' ' + subkey + ':', child)
        else:
            # `Dataset.value` was removed in h5py 3.0; indexing with `[()]`
            # reads the whole dataset instead.
            print(' Valor:', node[()])
# Import any packages you want to use here
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#data2=pd.concat([df_1,df_2],axis=1)
# Load the price data and take a first look at it.
data = pd.read_csv("datasets/prices.csv")
data.head()

# Number of distinct symbols, and how many rows each symbol has.
data['symbol'].nunique()
data['symbol'].value_counts()

# Plot the closing price of every symbol, one figure per symbol.
symbols = list(data['symbol'].unique())
for symbol in symbols:
    # Create the axes explicitly and hand them to pandas. Without `ax=`,
    # `DataFrame.plot` opens a new figure of its own, which leaves an empty
    # figure behind and ignores the requested size.
    _, ax = plt.subplots(figsize=(4, 2.5))
    data[data['symbol'] == symbol].plot(x='date', y='close', ax=ax)
    plt.xticks(rotation='vertical')
    plt.title(f'{symbol} daily closing price')
    plt.tight_layout()
    plt.show()
# Example for opening wav files (audio files).
# The function is `glob`; the original import misspelled it as `glop`, which
# raises ImportError.
from glob import glob

# Collect the paths of all .wav files matching the pattern. The result is an
# empty list when nothing matches.
files = glob('data/heartbeat-sounds/files/*.wav')
print(files)
# To read the audio in a dataset we use the librosa library.
import librosa as lr
# Load a single recording; `sfreq` receives its sample frequency and `audio`
# presumably the signal samples (confirm against the librosa docs).
# NOTE(review): this path goes through `proc/files/`, while the glob pattern
# earlier in the file uses `files/` directly. Confirm which folder is right.
audio, sfreq = lr.load('data/heartbeat-sounds/proc/files/murmur__345345.wav')
`sfreq` is the sampling frequency: the number of samples recorded per second.
- If we know the sampling rate of a timeseries, then we know the timestamp of each datapoint relative to the first datapoint
- This assumes that sampling rate is fixed and no data points are lost
We can create an array of indices, one for each sample, and divide it by the sampling frequency to obtain the time (in seconds) of each sample relative to the first one.