
Multivariate Forecasting with LSTM (Implemented in TensorFlow 2)


The content of this article comes from a video by the Bilibili creator 唐国梁Tommy:

TensorFlow 2.0: LSTM-based multivariate forecasting of bike-share usage: https://www.bilibili.com/video/BV1y5411K7NR


Implementation outline: import the required modules, load and preprocess the bike-share dataset, explore it visually, build sliding-window features, then define, train, and validate an LSTM model.

1. Import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras import Sequential, layers, utils, losses
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

import warnings
warnings.filterwarnings('ignore')

2. Load the dataset

The bike-share usage dataset:

# Load the dataset; parse the timestamp column and use it as the index
dataset = pd.read_csv("BikeShares.csv", parse_dates=['timestamp'], index_col=['timestamp'])
# dataset.shape: (17414, 9)
# head() shows the first 5 rows by default
dataset.head()
timestamp            cnt  t1   t2   hum    wind_speed  weather_code  is_holiday  is_weekend  season
2015-01-04 00:00:00  182  3.0  2.0  93.0   6.0         3.0           0.0         1.0         3.0
2015-01-04 01:00:00  138  3.0  2.5  93.0   5.0         1.0           0.0         1.0         3.0
2015-01-04 02:00:00  134  2.5  2.5  96.5   0.0         1.0           0.0         1.0         3.0
2015-01-04 03:00:00   72  2.0  2.0  100.0  0.0         1.0           0.0         1.0         3.0
2015-01-04 04:00:00   47  2.0  0.0  93.0   6.5         1.0           0.0         1.0         3.0

Field descriptions:

- cnt: number of bike-share rides (the prediction target)
- t1: air temperature
- t2: feels-like temperature
- hum: humidity
- wind_speed: wind speed
- weather_code: weather category (1 = clear, 2 = scattered clouds, 3 = broken clouds, 4 = cloudy, 7 = rain/light rain, 10 = thunderstorm, 26 = snowfall, 94 = freezing fog)
- is_holiday: 1 = holiday, 0 = workday
- is_weekend: 1 = weekend, 0 = weekday
- season: 0 = spring, 1 = summer, 2 = autumn, 3 = winter

3. Data visualization

Relationship between t1 (air temperature) and cnt (bike usage):

plt.figure(figsize=(16,8))
sns.pointplot(x='t1', y='cnt', data=dataset)
plt.show()

Relationship between t2 (feels-like temperature) and cnt:

plt.figure(figsize=(16,8))
sns.lineplot(x='t2', y='cnt', data=dataset)
plt.show()

Relationship between hum (humidity) and cnt:

plt.figure(figsize=(16,8))
sns.lineplot(x='hum', y='cnt', data=dataset)
plt.xticks([])  # hide the dense x-axis tick labels
plt.show()

Relationship between weather_code (weather category) and cnt:

# weather_code: weather category (1 = clear, 2 = scattered clouds, 3 = broken clouds,
# 4 = cloudy, 7 = rain/light rain, 10 = thunderstorm, 26 = snowfall, 94 = freezing fog)
plt.figure(figsize=(16,8))
sns.pointplot(x='weather_code', y='cnt', data=dataset)
plt.show()

Note: create time-derived fields to support the analysis.

# Create an hour field from the timestamp index
dataset['hour'] = dataset.index.hour
# Create a year field
dataset['year'] = dataset.index.year
# Create a month field
dataset['month'] = dataset.index.month

Distribution of cnt by hour, grouped by is_holiday:

# is_holiday: 1 = holiday / 0 = workday

plt.figure(figsize=(16,8))
sns.lineplot(x='hour', y='cnt', data=dataset, hue='is_holiday')
plt.xticks(list(range(24)))
plt.show()

Distribution of cnt by hour, grouped by season:

# season: 0 = spring; 1 = summer; 2 = autumn; 3 = winter

plt.figure(figsize=(16,8))
sns.pointplot(x='hour', y='cnt', data=dataset, hue='season')
plt.xticks(list(range(24)))
plt.show()

Distribution of cnt by month, grouped by is_holiday:

# is_holiday: 1 = holiday / 0 = workday

plt.figure(figsize=(16,8))
sns.lineplot(x='month', y='cnt', data=dataset, hue='is_holiday')
plt.show()

4. Data preprocessing

# Drop the auxiliary hour, year, and month columns added for visualization
# (axis=1 is redundant when the columns keyword is used)
dataset.drop(columns=['hour', 'year', 'month'], inplace=True)
dataset.head()
timestamp            cnt  t1   t2   hum    wind_speed  weather_code  is_holiday  is_weekend  season
2015-01-04 00:00:00  182  3.0  2.0  93.0   6.0         3.0           0.0         1.0         3.0
2015-01-04 01:00:00  138  3.0  2.5  93.0   5.0         1.0           0.0         1.0         3.0
2015-01-04 02:00:00  134  2.5  2.5  96.5   0.0         1.0           0.0         1.0         3.0
2015-01-04 03:00:00   72  2.0  2.0  100.0  0.0         1.0           0.0         1.0         3.0
2015-01-04 04:00:00   47  2.0  0.0  93.0   6.5         1.0           0.0         1.0         3.0
# MinMax-normalize the cnt, t1, t2, hum, and wind_speed columns independently

columns = ['cnt', 't1', 't2', 'hum', 'wind_speed']

for col in columns:
    scaler = MinMaxScaler()
    dataset[col] = scaler.fit_transform(dataset[col].values.reshape(-1, 1))
dataset.head()
timestamp            cnt       t1        t2      hum       wind_speed  weather_code  is_holiday  is_weekend  season
2015-01-04 00:00:00  0.023155  0.126761  0.2000  0.911950  0.106195    3.0           0.0         1.0         3.0
2015-01-04 01:00:00  0.017557  0.126761  0.2125  0.911950  0.088496    1.0           0.0         1.0         3.0
2015-01-04 02:00:00  0.017048  0.112676  0.2125  0.955975  0.000000    1.0           0.0         1.0         3.0
2015-01-04 03:00:00  0.009160  0.098592  0.2000  1.000000  0.000000    1.0           0.0         1.0         3.0
2015-01-04 04:00:00  0.005980  0.098592  0.1500  0.911950  0.115044    1.0           0.0         1.0         3.0
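
A caveat worth noting: the loop above overwrites scaler on each iteration, so the fitted cnt scaler is discarded and the predictions produced in section 7 stay on the normalized [0, 1] scale. Here is a minimal variant (my addition, not in the original post) that keeps each fitted scaler for a later inverse transform; use it in place of the loop above:

# Keep each column's fitted scaler so predictions can be mapped back
# to raw ride counts later (replaces the loop above; do not run both)
scalers = {}
for col in columns:
    scalers[col] = MinMaxScaler()
    dataset[col] = scalers[col].fit_transform(dataset[col].values.reshape(-1, 1))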

5. Feature engineering

# Feature matrix: every column except the target cnt
X = dataset.drop(columns=['cnt'])
# X.shape: (17414, 8)
# Label vector
y = dataset['cnt']
# y.shape: (17414,)

1 Split the dataset into training and test sets

# shuffle=False: keep chronological order, since this is time-series forecasting
# (random_state has no effect when shuffle=False, so it is omitted)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# X_train.shape (13931, 8)
# y_train.shape (13931,)
# X_test.shape (3483, 8)
# y_test.shape (3483,)

2 Build sliding-window feature datasets

def create_dataset(X, y, seq_len=10):
    features = []
    targets = []

    for i in range(0, len(X) - seq_len, 1):
        data = X.iloc[i:i+seq_len]  # one window of seq_len consecutive rows
        label = y.iloc[i+seq_len]   # the label is the value right after the window
        # collect the windows and their labels
        features.append(data)
        targets.append(label)

    return np.array(features), np.array(targets)
# 1) Build the training feature dataset
train_dataset, train_labels = create_dataset(X_train, y_train, seq_len=10)

# 13931 - 10 = 13921 sliding windows, each holding 10 rows of 8 features
# train_dataset.shape: (13921, 10, 8)
# train_labels.shape: (13921,)

# 2) Build the test feature dataset
test_dataset, test_labels = create_dataset(X_test, y_test, seq_len=10)
# test_dataset.shape: (3473, 10, 8)
# test_labels.shape: (3473,)
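
To make the windowing concrete, here is a tiny sanity check on made-up toy data (illustrative only): with seq_len=3, rows 0-2 form the first window and row 3 supplies its label, so 6 rows yield 6 - 3 = 3 windows.

# Toy example: 6 rows, 2 feature columns, seq_len=3
toy_X = pd.DataFrame({'a': range(6), 'b': range(6)})
toy_y = pd.Series(range(6))
f, t = create_dataset(toy_X, toy_y, seq_len=3)
print(f.shape, t.shape)  # (3, 3, 2) (3,)
print(t)                 # [3 4 5]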

3 Build batched datasets

def create_batch_dataset(X, y, train=True, buffer_size=1000, batch_size=128):
    # Wrap the numpy arrays as a tf.data.Dataset of (features, label) tensors
    batch_data = tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y)))
    if train:  # training set: cache, shuffle, then batch
        return batch_data.cache().shuffle(buffer_size).batch(batch_size)
    else:      # test set: batch only, preserving chronological order
        return batch_data.batch(batch_size)

# Batched training data
train_batch_dataset = create_batch_dataset(train_dataset, train_labels)
# Batched test data
test_batch_dataset = create_batch_dataset(test_dataset, test_labels, train=False)
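
A quick check (my addition, not in the original post) that the pipeline emits what the LSTM expects, features of shape (batch, seq_len, n_features) and labels of shape (batch,):

# Inspect a single training batch
for x_batch, y_batch in train_batch_dataset.take(1):
    print(x_batch.shape, y_batch.shape)  # e.g. (128, 10, 8) (128,)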

6. Build, compile, and train the model

# Model definition -- version 1
model = Sequential([  # input windows have shape (10, 8)
    layers.LSTM(units=256, input_shape=train_dataset.shape[-2:], return_sequences=True),
    layers.Dropout(0.4),
    layers.LSTM(units=256, return_sequences=True),
    layers.Dropout(0.3),
    layers.LSTM(units=128, return_sequences=True),
    layers.LSTM(units=32),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

checkpoint_file = "best_model.hdf5"

# Save the weights whenever the training loss reaches a new minimum
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_file,
                                      monitor='loss',
                                      mode='min',
                                      save_best_only=True,
                                      save_weights_only=True)
# Train the model
history = model.fit(train_batch_dataset,
                    epochs=30,
                    validation_data=test_batch_dataset,
                    callbacks=[checkpoint_callback])

# Plot the training and validation loss curves
plt.figure(figsize=(16,8))
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.legend(loc='best')
plt.show()
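
Since ModelCheckpoint was set to save_weights_only=True with monitor='loss', best_model.hdf5 holds the weights from the lowest-training-loss epoch, while model still carries the final epoch's weights. To evaluate the checkpointed weights instead (a small addition of mine, not in the original post), restore them before predicting:

# Restore the best weights recorded by the checkpoint callback
model.load_weights(checkpoint_file)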

7. Model validation

# test_dataset.shape: (3473, 10, 8)
test_preds = model.predict(test_dataset, verbose=1)
# test_preds.shape: (3473, 1)
# ground truth:
# test_labels.shape: (3473,)

# Compute the r^2 score
score = r2_score(test_labels, test_preds)
print("r^2 score: ", score)

r^2 score: 0.4916024135223812

# Plot predictions against ground truth

plt.figure(figsize=(16,8))
plt.plot(test_labels[:300], label="True value")
plt.plot(test_preds[:300], label="Pred value")
plt.legend(loc='best')
plt.show()
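
Note that test_labels and test_preds are both still on the MinMax-normalized scale. The r^2 score is unaffected, because both series are rescaled by the same linear map, but to plot real ride counts you can invert the transform, assuming the fitted scalers were kept as in the sketch from the preprocessing step:

# Map normalized values back to raw ride counts (requires the `scalers`
# dict from the earlier sketch; not part of the original post)
true_counts = scalers['cnt'].inverse_transform(test_labels.reshape(-1, 1))
pred_counts = scalers['cnt'].inverse_transform(test_preds)  # already (n, 1)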

Source: https://www.cnblogs.com/wkfvawl/p/16220836.html