LSTM Movie Review Sentiment Analysis
Author: internet (repost)
Original article: https://blog.csdn.net/duanlianvip/article/details/103584543
Loading the data
We use Google's existing pre-trained word vectors.
import numpy as np
wordsList = np.load('./training_data/wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() # Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] # Decode words from UTF-8
wordVectors = np.load('./training_data/wordVectors.npy')
print('Loaded the word vectors!')
Each word in wordsList has an index into wordVectors, and every index maps to a 50-dimensional vector.
So a sentence such as
I thought the movie was incredible and inspiring
maps to the index vector [41 804 201534 1005 15 7446 5 13767 0 0] (with maxSeqLength set to 10 here).
Replacing each index with its word vector then yields a 10×50 matrix.
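The lookup can be verified directly. The post never shows how firstSentence is constructed, so as a minimal sketch we hard-code the index vector from above (TensorFlow 1.x session semantics are assumed throughout):
import tensorflow as tf
firstSentence = np.array([41, 804, 201534, 1005, 15, 7446, 5, 13767, 0, 0], dtype='int32')  # indices of the example sentence, padded with zeros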
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors, firstSentence).eval().shape)  # prints (10, 50)
There are also two review datasets: one positive (positiveReviews) and one negative (negativeReviews).
Mapping words to vectors
First we need to choose maxSeqLength, a cap on the sequence length processed at a time, i.e. the maximum number of words kept per review.
To pick it, we traverse all the reviews and compute their average word count.
from os import listdir
from os.path import isfile, join
positiveFiles = ['./training_data/positiveReviews/' + f for f in listdir('./training_data/positiveReviews/') if isfile(join('./training_data/positiveReviews/', f))]
negativeFiles = ['./training_data/negativeReviews/' + f for f in listdir('./training_data/negativeReviews/') if isfile(join('./training_data/negativeReviews/', f))]
numWords = []
for pf in positiveFiles:
    with open(pf, 'r', encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())  # split on whitespace
        numWords.append(counter)
print('Positive files finished')
for nf in negativeFiles:
    with open(nf, 'r', encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        numWords.append(counter)
print('Negative files finished')
numFiles = len(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords) / len(numWords))
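To see why a cap of 250 is reasonable, it helps to plot the distribution of review lengths; a minimal matplotlib sketch (the axis limits are illustrative, not from the post):
import matplotlib.pyplot as plt
plt.hist(numWords, 50)        # histogram of the word counts, 50 bins
plt.xlabel('Sequence Length')
plt.ylabel('Frequency')
plt.axis([0, 1200, 0, 8000])  # illustrative axis limits
plt.show()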
We therefore choose maxSeqLength = 250.
Next, convert each text (i.e. each review) into its row of the index matrix.
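The loop below calls a cleanSentences helper that the post does not define; a minimal version, assuming the usual IMDB preprocessing, lowercases the text, replaces the HTML line breaks found in the review files, and strips non-alphanumeric characters:
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")  # the IMDB files contain literal <br /> tags
    return re.sub(strip_special_chars, "", string)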
maxSeqLength = 250  # as chosen above
ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
fileCounter = 0
for pf in positiveFiles:
    with open(pf, 'r', encoding='UTF-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[fileCounter][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[fileCounter][indexCounter] = 399999  # index reserved for unknown words
            indexCounter = indexCounter + 1
            if indexCounter >= maxSeqLength:
                break
    fileCounter = fileCounter + 1
for nf in negativeFiles:
    with open(nf, 'r', encoding='UTF-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[fileCounter][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[fileCounter][indexCounter] = 399999  # index reserved for unknown words
            indexCounter = indexCounter + 1
            if indexCounter >= maxSeqLength:
                break
    fileCounter = fileCounter + 1
np.save('idsMatrix', ids)
This yields a 25000×250 index matrix.
Next comes the word embedding.
Since feeding all 25000 reviews at once would be too large,
we process one batch at a time, with a batch size of 24.
Besides a batchSize × 250 index matrix (embedded later into a batchSize × 250 × 50 tensor), we also need a batchSize × 2 labels matrix that records whether each review is positive or negative, for training.
How do we get a batch? Rows 0–12499 of ids hold the positive reviews and rows 12500–24999 the negative ones; training samples from indices 1–11499 and 13499–24999, leaving the middle range 11499–13499 as a held-out test set.
from random import randint

def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0):
            num = randint(1, 11499)  # random integer in [1, 11499]: a positive review
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)  # a negative review
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(11499, 13499)
        if (num <= 12499):  # there are 12500 positive reviews in total; anything above 12500 is negative
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels
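A quick sanity check of the batch shapes (batchSize = 24 is assumed here; the model section below sets the same value):
batchSize = 24  # defined here for the sketch
nextBatch, nextBatchLabels = getTrainBatch()
print(np.shape(nextBatch))        # (24, 250)
print(np.shape(nextBatchLabels))  # (24, 2)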
Building the model
batchSize = 24
lstmUnits = 64      # number of hidden units
numClasses = 2      # binary classification
iterations = 50000  # number of training iterations
numDimensions = 50  # dimensionality of the word vectors (used below)
import tensorflow as tf
tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
We feed TensorFlow these two matrices, of shapes batchSize × 250 and batchSize × 2,
then call tf.nn.embedding_lookup() to look up the word vectors, producing a batchSize × 250 × 50 tensor.
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)  # each id in input_data indexes one vector in wordVectors
With the input data and labels in place, we can start building the network itself.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)  # a single LSTM cell
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)  # apply dropout to the outputs
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)  # value holds the output at every time step
# tf.nn.dynamic_rnn unrolls the network over time, building the complete RNN model
# fully connected layer
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])  # reorder to [maxSeqLength, batchSize, lstmUnits] so time comes first
# take the result at the final time step
last = tf.gather(value, int(value.get_shape()[0]) - 1)  # last is the LSTM output at the final step
prediction = (tf.matmul(last, weight) + bias)
Define the correct-prediction op and an accuracy metric: a prediction counts as correct when the argmax of the output vector matches the argmax of the one-hot label.
correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))
We use a standard softmax cross-entropy loss. For the optimizer we choose Adam with its default learning rate.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)
Training the model
Feed in one batch of data at a time.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
for i in range(iterations):
    nextBatch, nextBatchLabels = getTrainBatch()
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
    if (i % 1000 == 0 and i != 0):
        loss_ = sess.run(loss, {input_data: nextBatch, labels: nextBatchLabels})  # evaluated on the training batch; switch to getTestBatch() to track test performance
        accuracy_ = sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})
        print('iteration {}/{}...'.format(i+1, iterations),
              'loss {}...'.format(loss_),
              'accuracy {}...'.format(accuracy_))
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, './training_data/models/pretrained_lstm.ckpt', global_step=i)
        print('saved to %s' % save_path)
Testing the model
Load a previously trained model.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('./training_data/models'))
Then pull some movie-review batches from the test range (new batch tensors) and measure accuracy.
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch()
    print('iteration {}/{}...'.format(i + 1, iterations))
    print('Accuracy for this batch:', (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)
Source: https://www.cnblogs.com/monster-little/p/16558457.html