Sentiment Analysis of Weibo Comments with LSTM + CNN + Pretrained GloVe Word Vectors (Binary Classification)
Code and dataset first:
https://pan.baidu.com/s/1tpEKb0nCun2oxlBXGlPvxA
Extraction code: cryy
Everything you need is inside the archive.
The data is Weibo comments (120k in total; if I remember correctly, 60k each for labels 0 and 1).
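Before preprocessing, it is worth a quick sanity check on the raw file. A minimal sketch (my addition; it assumes the CSV has the label and content columns that the preprocessing script below expects):

import pandas as pd

raw = pd.read_csv("weibo_senti_100k.csv", encoding="gb18030")
print(raw.shape)                    # expect roughly 120k rows
print(raw['label'].value_counts())  # expect roughly 60k each of 0 and 1
print(raw.head(3))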
First, the text-preprocessing code:
'''
Tokenization and stop-word removal
'''
# -*- coding:utf-8 -*-
import csv
import jieba
import jieba.analyse

# Optionally load a custom user dictionary for jieba
# jieba.load_userdict("user_dict.txt")

# Load the stop-word list, one word per line.
# (Reading the file directly avoids pandas' deprecated delimiter="\n" /
# error_bad_lines options used in the original.)
with open('stop_word.txt', encoding='utf-8') as f:
    stop_list = [line.strip() for line in f if line.strip()]

# -----------------------------------------------------------------------
# Tokenize a sentence with jieba and drop stop words
def txt_cut(juzi):
    return [w for w in jieba.lcut(juzi) if w not in stop_list]

# -----------------------------------------------------------------------
# Read the raw file, tokenize each comment, and write the result
def fenci(filename, result):
    # Output file for the tokenized comments
    fw = open(result, "w", newline='', encoding='gb18030')
    writer = csv.writer(fw)
    writer.writerow(['label', 'cutword'])
    # Read the input with csv.DictReader
    labels = []
    contents = []
    with open(filename, "r", encoding="gb18030") as f:
        reader = csv.DictReader(f)
        for row in reader:
            labels.append(row['label'])
            content = row['content']
            # Tokenize, then join the tokens with spaces
            seglist = txt_cut(content)
            output = ' '.join(seglist)
            contents.append(output)
            # Write one row: label, tokenized text
            writer.writerow([row['label'], output])
    print(labels[:5])
    print(contents[:5])
    fw.close()

# -----------------------------------------------------------------------
if __name__ == '__main__':
    fenci("weibo_senti_100k.csv", "twofenlei_data_fc.csv")
One small gotcha after tokenization: you will find stray tokens like // and / left in the output CSV. I usually remove them with the find-and-replace button in WPS, replacing them with an empty string (not a space — literally nothing at all).
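If you prefer to do this cleanup in code instead of WPS, a small sketch (my addition; it assumes the column names written by fenci above):

import pandas as pd

df = pd.read_csv("twofenlei_data_fc.csv", encoding="gb18030")
# Drop stray slash tokens left over from URLs and emoticons
df['cutword'] = df['cutword'].astype(str).str.replace(r'/+', '', regex=True)
df.to_csv("twofenlei_data_fc.csv", index=False, encoding="gb18030")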
Next comes the main program.
'''
lstm + cnn + glove
'''
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
from keras.models import load_model
from sklearn import metrics
import seaborn as sns
from keras import Sequential
from keras.layers import LSTM, Embedding, Dropout, Dense, Flatten, Conv1D, MaxPool1D
from keras.optimizers import Adam
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

## Make matplotlib render Chinese text
plt.rcParams['font.sans-serif'] = ['KaiTi']  # default font (SimHei also works)
plt.rcParams['axes.unicode_minus'] = False   # render minus signs correctly in saved figures

# ------------------------------------------ basic hyperparameters --------------------------------------
epochs_num = 1
num_label = 2
max_len = 30
embedding_dim = 50
lstm_size = 64
conkernel_num = 32
conkernel_size = 5
pool_size_num = 3
dropout_rate = 0.3
kernel_num = 64

# ----------------------------------------------- load the dataset ----------------------------------------
data_df = pd.read_csv("twofenlei_data_fc.csv", encoding='gb18030')
data = data_df['cutword'].values
label = data_df['label'].values
trainval_data, test_data, trainval_label, test_label = train_test_split(data, label, test_size=0.2, random_state=1000)
train_data, val_data, train_label, val_label = train_test_split(trainval_data, trainval_label, test_size=0.25, random_state=50)
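# (With test_size=0.2 followed by test_size=0.25 on the remainder, the final
# split is 60% train / 20% validation / 20% test.)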
# ------------------------------------------------------ label processing -------------------------------------------
# Make sure every sample is a plain string
train_data = [str(a) for a in train_data.tolist()]
val_data = [str(a) for a in val_data.tolist()]
test_data = [str(a) for a in test_data.tolist()]
# One-hot encode the labels
le = LabelEncoder()
train_label = le.fit_transform(train_label).reshape(-1, 1)
val_label = le.transform(val_label).reshape(-1, 1)
test_label = le.transform(test_label).reshape(-1, 1)
ohe = OneHotEncoder()
train_label = ohe.fit_transform(train_label).toarray()
val_label = ohe.transform(val_label).toarray()
test_label = ohe.transform(test_label).toarray()

# ----------------------------- encode the tokens with a Tokenizer --------------------------
# Note: num_words=100 means texts_to_sequences keeps only the 100 most frequent
# tokens; raise it (or pass num_words=None) to use the full vocabulary.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(train_data)
tokenizer.fit_on_texts(val_data)   # fitting on val/test as well leaks their vocabulary
tokenizer.fit_on_texts(test_data)
train_data_sq = tokenizer.texts_to_sequences(train_data)
val_data_sq = tokenizer.texts_to_sequences(val_data)
test_data_sq = tokenizer.texts_to_sequences(test_data)
vocab_size = len(tokenizer.word_index) + 1
# Pad short sequences and truncate long ones to max_len
from keras_preprocessing.sequence import pad_sequences
train_data_sq_pading = pad_sequences(train_data_sq, padding='post', maxlen=max_len)
val_data_sq_pading = pad_sequences(val_data_sq, padding='post', maxlen=max_len)
test_data_sq_pading = pad_sequences(test_data_sq, padding='post', maxlen=max_len)
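# For intuition: with maxlen=30 and padding='post', a short sequence such as
# [12, 7, 4] becomes [12, 7, 4, 0, 0, ..., 0] (padded with trailing zeros to
# length 30), while sequences longer than 30 are truncated (from the front,
# by pad_sequences' default truncating='pre').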
def create_embedding_matrix(filepath, word_index, embedding_dim):
    # Build a (vocab_size, embedding_dim) matrix; rows for words that do not
    # appear in the GloVe file stay all-zero.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

embedding_matrix = create_embedding_matrix("glove.6B.50d.txt", tokenizer.word_index, embedding_dim)
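# Coverage check (my addition): glove.6B.50d.txt contains *English* word vectors,
# so most Chinese tokens will not be found and their rows remain zero; those rows
# are then learned from scratch during training (trainable=True below).
coverage = np.mean(np.any(embedding_matrix != 0, axis=1))
print("pretrained vectors found for {:.2%} of the vocabulary".format(coverage))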
# (The hyperparameter-search variant of this script is given in full in the last listing below.)
# ------------------------------------------------- model --------------------------------------------------
vocab_size = len(tokenizer.index_word) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    weights=[embedding_matrix], input_length=max_len,
                    trainable=True))
model.add(LSTM(lstm_size, return_sequences=True))
model.add(Conv1D(conkernel_num, conkernel_size, padding='same', strides=1, activation='relu'))
model.add(MaxPool1D(pool_size=pool_size_num))
model.add(Flatten())
model.add(Dense(kernel_num, activation='relu'))
model.add(Dropout(dropout_rate))
model.add(Dense(num_label, activation='softmax'))
model.summary()
model.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', metrics=["accuracy"])
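# Shapes through the network (with the defaults above):
#   Embedding -> (30, 50)   one 50-d vector per token
#   LSTM      -> (30, 64)   return_sequences=True keeps the whole sequence
#   Conv1D    -> (30, 32)   32 filters, 'same' padding preserves the length
#   MaxPool1D -> (10, 32)   pool_size=3 downsamples the time axis
#   Flatten   -> 320, then Dense(64) -> Dropout -> Dense(2, softmax)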
flag = "train"
if flag == "train":
    print("Training the model ...")
    model_fit = model.fit(train_data_sq_pading, train_label, epochs=epochs_num,
                          validation_data=(val_data_sq_pading, val_label), batch_size=32, verbose=True)
    model.save("model_lstm_word2vec.h5")
    loss, accuracy = model.evaluate(train_data_sq_pading, train_label)
    print("train_acc= {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(val_data_sq_pading, val_label)
    print("val_acc= {:.4f}".format(accuracy))
    # Record training/validation loss and accuracy
    history_dict = model_fit.history
    train_loss = history_dict["loss"]
    train_accuracy = history_dict["accuracy"]
    val_loss = history_dict["val_loss"]
    val_accuracy = history_dict["val_accuracy"]
    # Plot the loss curves
    plt.figure()
    plt.plot(range(epochs_num), train_loss, label='train_loss')
    plt.plot(range(epochs_num), val_loss, label='val_loss')
    plt.title('Loss curves')
    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel('loss')
    # Plot the accuracy curves
    plt.figure()
    plt.plot(range(epochs_num), train_accuracy, label='train_accuracy')
    plt.plot(range(epochs_num), val_accuracy, label='val_accuracy')
    plt.legend()
    plt.title("Accuracy curves")
    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.show()
else:
    print("Running prediction ...")
    ## Load the trained model
    model = load_model('model_lstm_word2vec.h5')
    ## Predict on the test set
    test_pre = model.predict(test_data_sq_pading)
    ## Evaluate: compute the confusion matrix
    confm = metrics.confusion_matrix(np.argmax(test_label, axis=1), np.argmax(test_pre, axis=1))
    print(confm)
    ## Visualize the confusion matrix
    Labname = ["0", "1"]
    print(metrics.classification_report(np.argmax(test_label, axis=1), np.argmax(test_pre, axis=1)))
    plt.figure(figsize=(8, 8))
    sns.heatmap(confm.T, square=True, annot=True,
                fmt='d', cbar=False, linewidths=.6,
                cmap="YlGnBu")
    plt.xlabel('True label', size=14)
    plt.ylabel('Predicted label', size=14)
    plt.xticks(np.arange(num_label) + 0.5, Labname, size=12)
    plt.yticks(np.arange(num_label) + 0.5, Labname, size=12)
    plt.title('lstm+cnn+glove')
    plt.savefig('result_lstm_word2vec.png')
    plt.show()
    loss, accuracy = model.evaluate(test_data_sq_pading, test_label)
    print("test_acc= {:.4f}".format(accuracy))
Set flag = "train" to train; to run prediction instead, just change flag to any other value.
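For spot-checking individual comments, a small helper like the following can be handy (my sketch; it assumes the tokenizer fitted above is still in memory and that jieba is imported):

import jieba

def predict_sentiment(text):
    seg = ' '.join(jieba.lcut(text))      # same tokenization as training
    seq = tokenizer.texts_to_sequences([seg])
    pad = pad_sequences(seq, padding='post', maxlen=max_len)
    probs = model.predict(pad)[0]
    return int(np.argmax(probs)), probs   # predicted label and class probabilities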
The final test accuracy should reach roughly 98%.
The model follows Eastmount's articles and Tang Yudi's videos on Bilibili.
If you have questions, feel free to leave them in the comments.
Finally, here is the program that does a randomized search over the hyperparameter space.
You can run this first to find the best parameter combination, then run the program above with it.
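One caveat before the listing: keras.wrappers.scikit_learn has been removed from recent TensorFlow/Keras releases. If the import below fails, the scikeras package offers a near drop-in replacement (my note; with scikeras, build-function arguments are routed with a model__ prefix, e.g. model__lstm_size, so the param_grid keys change accordingly):

# pip install scikeras
from scikeras.wrappers import KerasClassifier
model = KerasClassifier(model=created_model, epochs=epochs_num, batch_size=16, verbose=True)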
'''
lstm + cnn + glove — randomized hyperparameter search
'''
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
from keras import Sequential
from keras.layers import LSTM, Embedding, Dropout, Dense, Flatten, Conv1D, MaxPool1D
from keras.optimizers import Adam
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

## Make matplotlib render Chinese text
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False

# ------------------------------------------ basic hyperparameters --------------------------------------
epochs_num = 20
num_label = 2
max_len = 30
embedding_dim = 50

# ----------------------------------------------- load the dataset ----------------------------------------
# (The original post reads a different file here; use twofenlei_data_fc.csv for the Weibo data.)
data_df = pd.read_csv('computer_comments_fc.csv', encoding='gb18030')
data = data_df['cutword'].values
label = data_df['label'].values
train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.2, random_state=50)
# ------------------------------------------------------ label processing -------------------------------------------
# Make sure every sample is a plain string
train_data = [str(a) for a in train_data.tolist()]
test_data = [str(a) for a in test_data.tolist()]
# One-hot encode the labels
le = LabelEncoder()
train_label = le.fit_transform(train_label).reshape(-1, 1)
test_label = le.transform(test_label).reshape(-1, 1)
ohe = OneHotEncoder()
train_label = ohe.fit_transform(train_label).toarray()
test_label = ohe.transform(test_label).toarray()

# ----------------------------- encode the tokens with a Tokenizer --------------------------
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(train_data)
tokenizer.fit_on_texts(test_data)
train_data_sq = tokenizer.texts_to_sequences(train_data)
test_data_sq = tokenizer.texts_to_sequences(test_data)
vocab_size = len(tokenizer.word_index) + 1
# Pad short sequences and truncate long ones to max_len
from keras_preprocessing.sequence import pad_sequences
train_data_sq_pading = pad_sequences(train_data_sq, padding='post', maxlen=max_len)
test_data_sq_pading = pad_sequences(test_data_sq, padding='post', maxlen=max_len)

def create_embedding_matrix(filepath, word_index, embedding_dim):
    # Rows for words not found in the GloVe file stay all-zero.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

embedding_matrix = create_embedding_matrix("glove.6B.50d.txt", tokenizer.word_index, embedding_dim)
## ------------------------------------------------- model builder -------------------------------------
def created_model(lstm_size, vocab_size, embedding_dim, max_len, conkernel_num, conkernel_size, pool_size_num, dropout_rate, kernel_num, num_label):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                        weights=[embedding_matrix], input_length=max_len,
                        trainable=True))
    model.add(LSTM(lstm_size, return_sequences=True))
    model.add(Conv1D(conkernel_num, conkernel_size, padding='same', strides=1, activation='relu'))
    model.add(MaxPool1D(pool_size=pool_size_num))
    model.add(Flatten())
    model.add(Dense(kernel_num, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_label, activation='softmax'))
    model.summary()
    model.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', metrics=["accuracy"])
    return model
# NOTE: the keys of the parameter space must match the arguments of created_model!
param_grid = dict(lstm_size=[64],
                  vocab_size=[len(tokenizer.index_word) + 1],
                  embedding_dim=[embedding_dim],
                  max_len=[max_len],
                  conkernel_num=[32],
                  conkernel_size=[3, 5],
                  pool_size_num=[3],
                  dropout_rate=[0.3],
                  kernel_num=[64],
                  num_label=[2],
                  )
param_outputfile = './param_out.txt'

# ------------------------------------------- run the search -----------------------------------------
model = KerasClassifier(build_fn=created_model, epochs=epochs_num, batch_size=16, verbose=True)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(train_data_sq_pading, train_label)

test_accuracy = grid.score(test_data_sq_pading, test_label)
with open(param_outputfile, 'a') as f:
    s = ('best Accuracy:'
         '{:.4f}\n{}\n test accuracy: {:.4f}\n\n')
    output_string = s.format(
        grid_result.best_score_,
        grid_result.best_params_,
        test_accuracy
    )
    print(output_string)
    f.write(output_string)
One final note: this program is not limited to binary sentiment analysis; it also works for three classes (positive / neutral / negative), four classes, and so on.
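For example, very little changes for a three-class setup (my sketch; it assumes your CSV labels the classes 0/1/2):

num_label = 3                  # positive / neutral / negative
Labname = ["0", "1", "2"]      # tick labels for the confusion-matrix plot
# In the search program, also set num_label=[3] in param_grid.
# LabelEncoder/OneHotEncoder and the softmax output layer adapt automatically
# once num_label and the label column contain three classes.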
Source: https://blog.csdn.net/qq_44182694/article/details/117705167