Deep Learning mit Python
Time Series Forecast mit LSTM
# Time series forecast with LSTM
#
# Workflow: load pickled daily sales data, scale per year, window it into
# (past=60, future=31) samples, fit a stateless multi-step ("one shot") LSTM
# ten times to gauge stability, then roll a 31-day forecast over May 2020 and
# record the cumulative percentage error.

# Import packages
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Import file
dataset = pd.read_pickle("./dataset_lstm.pkl")


# Functions
## Window and split function
def split_window(features, input_width, label_width, shift, label_columns,
                 columns=None):
    """Split a batch of time windows into (inputs, labels) tensors.

    Parameters
    ----------
    features : tf.Tensor
        Batched windows of shape (batch, total_window_size, n_features).
    input_width : int
        Number of past time steps fed to the model.
    label_width : int
        Number of future time steps to predict.
    shift : int
        Offset between the end of the inputs and the end of the labels;
        total_window_size = input_width + shift.
    label_columns : list[str] | None
        Column names to keep in the labels; None keeps all features.
    columns : Index | list[str] | None
        Column order used to resolve label positions. Defaults to the
        module-level ``train_data.columns`` (the original implementation's
        implicit global dependency, kept for backward compatibility).

    Returns
    -------
    (tf.Tensor, tf.Tensor)
        inputs of shape (batch, input_width, n_features) and labels of
        shape (batch, label_width, len(label_columns)).
    """
    total_window_size = input_width + shift
    if columns is None:
        # NOTE(review): the original resolved column positions via the
        # module-level train_data; kept as the default so callers are
        # unaffected, but pass `columns` explicitly for other frames.
        columns = train_data.columns
    column_indices = {name: i for i, name in enumerate(columns)}
    input_slice = slice(0, input_width)
    label_start = total_window_size - label_width
    labels_slice = slice(label_start, None)
    inputs = features[:, input_slice, :]   # apply on tf.Tensor
    labels = features[:, labels_slice, :]  # apply on tf.Tensor
    if label_columns is not None:
        labels = tf.stack(
            [labels[:, :, column_indices[name]] for name in label_columns],
            axis=-1)
    # Slicing doesn't preserve static shape information, so set the shapes
    # manually. This way the `tf.data.Datasets` are easier to inspect.
    inputs.set_shape([None, input_width, None])
    labels.set_shape([None, label_width, None])
    return inputs, labels


# Data preprocessing
past = 60    # days of history fed to the model
future = 31  # days to forecast (May has 31 days)

## Data used for training and validation
dataset2 = dataset[:'2020-04'].copy()
dataset2.Sales.describe()

## Split in train, validation
train_data = dataset2[:-future].copy()
val_data = dataset2[-past - future:].copy()

## Remove outliers in train
# NOTE(review): `outlier` is defined elsewhere in the project — confirm it
# returns a cleaned copy with the same columns.
train_data = outlier(train_data)
train_data.Sales.describe()

## Normalize per complete year (2020 respective to 2019)
# FIX: the original reused ONE MinMaxScaler instance for all three fits.
# `fit` returns `self`, so scaler1/scaler2/scaler3 were aliases of the same
# object and only the last fit (2019) survived; the code worked only because
# each transform was applied immediately after its fit. Independent instances
# keep every year's scaler valid on its own.
scaler1 = MinMaxScaler(feature_range=(-1, 1)).fit(
    train_data.loc[:'2017-12', ["Sales", "Year"]])
train_data.loc[:'2017-12', ["Sales", "Year"]] = scaler1.transform(
    train_data.loc[:'2017-12', ["Sales", "Year"]])  # apply scaler on train data
scaler2 = MinMaxScaler(feature_range=(-1, 1)).fit(
    train_data.loc['2018-1':'2018-12', ["Sales", "Year"]])
train_data.loc['2018-1':'2018-12', ["Sales", "Year"]] = scaler2.transform(
    train_data.loc['2018-1':'2018-12', ["Sales", "Year"]])  # apply scaler on train data
scaler3 = MinMaxScaler(feature_range=(-1, 1)).fit(
    train_data.loc['2019-1':'2019-12', ["Sales", "Year"]])
train_data.loc['2019-1':'2019-12', ["Sales", "Year"]] = scaler3.transform(
    train_data.loc['2019-1':'2019-12', ["Sales", "Year"]])  # apply scaler on train data
# 2020 has no complete year, so it is scaled with the 2019 scaler.
train_data.loc['2020-1':, ["Sales", "Year"]] = scaler3.transform(
    train_data.loc['2020-1':, ["Sales", "Year"]])  # apply scaler on 2020 train data
val_data.iloc[:, [0, 1]] = scaler3.transform(
    val_data.iloc[:, [0, 1]])  # apply scaler on 2020 val data

## Reshape train data
data = np.array(train_data, dtype=np.float32)
batch_train = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=data,
    targets=None,
    sequence_length=91,  # past (60) + future (31)
    shuffle=False,
    batch_size=2)
batch_train = batch_train.map(
    lambda x: split_window(x, 60, 31, 31, ["Sales"]))  # apply split_window

# Check train
for inputs, labels in batch_train:
    print(inputs.shape)
    print(labels.shape)

## Reshape val data
data = np.array(val_data, dtype=np.float32)
batch_val = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=data,
    targets=None,
    sequence_length=91,
    batch_size=1)
batch_val = batch_val.map(
    lambda x: split_window(x, 60, 31, 31, ["Sales"]))  # apply split_window

# Check val
for inputs, labels in batch_val:
    print(inputs.shape)
    print(labels.shape)

# Fit architecture: Stateless multistep one shot LSTM
## Initialize RMSE and cumulative error for the validation and test dataset
rmse_val = np.array([])
cumerror_test = np.array([])

## Repeat NN to check stability with respect to validation dataset
for i in range(10):
    learning_rate = 0.001
    epochs = 100
    n_neurons = 32
    reg = 0.0001
    # assumes 21 input features in the dataset — TODO confirm column count
    inputs = keras.layers.Input(shape=(60, 21))
    lstm_out1 = keras.layers.LSTM(
        n_neurons,
        return_sequences=True,
        kernel_regularizer=tf.keras.regularizers.l2(reg),
        recurrent_regularizer=tf.keras.regularizers.l2(reg))(inputs)
    lstm_out2 = keras.layers.LSTM(
        n_neurons,
        kernel_regularizer=tf.keras.regularizers.l2(reg),
        recurrent_regularizer=tf.keras.regularizers.l2(reg))(lstm_out1)
    outputs = keras.layers.Dense(
        31,
        kernel_initializer=tf.initializers.zeros)(lstm_out2)  # initialize with 0 to start with small deltas
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse")
    model.summary()

    # Model fit with early epoch stopping
    path_checkpoint = "model_checkpoint.h5"
    es_callback = keras.callbacks.EarlyStopping(
        monitor="val_loss", min_delta=0, patience=5)
    modelckpt_callback = keras.callbacks.ModelCheckpoint(
        monitor="val_loss",
        filepath=path_checkpoint,
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )
    history = model.fit(
        batch_train,
        epochs=epochs,
        validation_data=batch_val,
        callbacks=[es_callback, modelckpt_callback],
    )

    # Vizualize loss
    def visualize_loss(history, title):
        """Plot training vs. validation loss over epochs."""
        loss = history.history["loss"]
        val_loss = history.history["val_loss"]
        epochs = range(len(loss))
        plt.figure()
        plt.plot(epochs, loss, "b", label="Training loss")
        plt.plot(epochs, val_loss, "r", label="Validation loss")
        plt.title(title)
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    visualize_loss(history, "Training and Validation Loss")

    # Final evaluation on validation data and append to check stability
    rmse_val_this = np.sqrt(model.evaluate(batch_val))  # sqrt(MSE) -> RMSE
    rmse_val = np.append(rmse_val, rmse_val_this)

    # Prediction
    ## Expected
    expected = dataset['2020-5-1':'2020-5-31'].copy()
    ## Initiate forecast error vector for each day of the may month
    error = np.array([])
    ## Loop over forecast horizon
    # Walk forward through May: at each step one more actual day is "known"
    # and the remaining `horizon` days are forecast.
    for horizon in range(future, 0, -1):
        known = future - horizon
        if horizon == future:
            date = '2020-4-30'
        elif horizon < future:
            date = '2020-5-' + str(known)
        test_data = dataset[:date][-past:].copy()  # take only past values
        ## Prepare past data
        test_data.iloc[:, [0, 1]] = scaler3.transform(
            test_data.iloc[:, [0, 1]])  # apply scaler
        ## Reshape past dataset (only X)
        data = np.array(test_data, dtype=np.float32)
        batch_test = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=60,  # only past needed
            batch_size=1)
        ## Make forecast
        yhat = model.predict(batch_test)
        ## Invert scaling of forecast
        invert = pd.DataFrame(
            {'Sales': yhat.reshape(31),
             'Year': np.ones(31)})  # generate a df suitable for inverting
        yhat = scaler3.inverse_transform(invert)  # invert both Sales and Year
        yhat = yhat[:, 0]  # keep only Sales
        yhat_final = yhat[:horizon]  # keep only required horizon
        yhat_final[yhat_final < 0] = 0  # constrain manually to zero
        ## Plot forecast at the beginning of month
        if horizon == 31:
            plt.plot(yhat_final)
        ## Final percentage forecast error
        currenterror = 1 - (np.sum(yhat_final)
                            + np.sum(expected.Sales[:known].copy())) / np.sum(expected.Sales)
        error = np.append(error, currenterror)

    ## Plot error
    x = np.arange(31, 0, -1)
    y = np.abs(error)
    fig, ax = plt.subplots()
    ax.plot(x, y, color='r')
    plt.show()
    ## Cumerror max
    cumerror_test = np.append(cumerror_test, max(y))

# Check error distribution
plt.figure()
plt.boxplot(rmse_val)  # looks normally distributed
plt.show()
plt.figure()
plt.boxplot(cumerror_test)  # looks normally distributed
plt.show()

# Median of the max error
np.median(cumerror_test)

# Save
results = pd.DataFrame({'RMSE_Validation': rmse_val,
                        'Cumerror': cumerror_test})
results.to_pickle("./results_stateless.pkl")
Links: