# CNN & RNN 对商品名称分类 — classify product names with CNN & RNN
# ("NRR" in the scraped title is a typo for "RNN"). 作者: 互联网
import torch
import torch.nn as nn
from torch import optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader,dataset
import torch.nn.functional as F
from matplotlib import pyplot as plt
EPOCHES = 3
#TEXT_CNN参数:
IN_CHANNELS = 100 # 输入特征的维度,在文本处理中就是词向量的维度,在图像处理中就是通道数
OUT_CHANNELS = 256 # 卷积产生的通道数,即卷积核个数,滤波器个数
KERNEL_SIZE = 2 # 卷积核尺寸,实际输入的滤波器尺寸应该为(KERNEL_SIZE,embedding_size),
BATCH_SIZE = 5 # 这里可以用(KERNEL_SIZE,IN_CHANNELS)来表示
WORD_MAX_LENGTH = 10
#TEXT_RNN参数:
def read_data():
#读取数据
data_corpus = pd.read_excel(’./keyword_all.xlsx’) #读取全部关键词
corpus_list = list(data_corpus[‘keyword_all’].values) # 转化为列表,长度为22933
data_goods = pd.read_excel('./分词后数据.xlsx')
# print(data_goods)
return corpus_list,data_goods
def preprocess(corpus_list,data_goods):
#建立语料字典(词:下标)
corpus_dict = dict() #收集词下标
for index, word in enumerate(corpus_list):
corpus_dict[word] = index
length_corpus_dict = len(corpus_dict)
print("length_corpus_dict = ", length_corpus_dict)
#提取分类
class_fiction_name = data_goods['一级分类']
class_list = list(set(list(class_fiction_name.values)))
print("length_class:",len(class_list))
target_dict = {}
for i,class_name in enumerate(class_list):
target_dict[class_name] = i
print(target_dict)
embedding = nn.Embedding(length_corpus_dict+1, 100) #设置把文本转化为向量的形状
goods_vector_target_dict = {} #收集词向量
for i in range(data_goods.shape[0]): # 遍历每个商品
keywords = data_goods['sku_name_afterdivide'][i] #商品名称
keyword_list = keywords.split('|') # 获取一个商品所有关键词
# print(keyword_list)
idx_list_onegoods = [] # 收集一个商品名称所有词语的下标
for w in keyword_list: # 遍历每个词语
if w != '':
idx = corpus_dict.get(w,0) #若自于存在于语料库中,输出语料库中的下标,若不存在则等于一个无效值的下标0
if idx != 0:
idx_list_onegoods.append(idx)
#经过观察分布图可以发现,99%的商品名称的分词结果的词语数量是在10(WORD_MAX_LENGTH) 个以内的,所有以10个为准,多的舍去,少的补0
if len(idx_list_onegoods) > WORD_MAX_LENGTH:
idx_list_onegoods = idx_list_onegoods[:WORD_MAX_LENGTH]
elif len(idx_list_onegoods) < WORD_MAX_LENGTH:
for j in range(WORD_MAX_LENGTH-len(idx_list_onegoods)):
idx_list_onegoods.append(0)
idx_list_tensor_onegoods = torch.LongTensor(idx_list_onegoods) # 把词下标列表转化为,tensor格式的下标矩阵
embedding_one_goods = embedding(idx_list_tensor_onegoods) #把下标转化为向量
#读取该商品所属的分类
class_name = data_goods['一级分类'][i]
target = target_dict[class_name]
# print("class_name :",class_name,"target : ",target)
goods_vector_target_dict[str(i)] = [embedding_one_goods,target]
# print(len(goods_vector_target_dict))
return goods_vector_target_dict,target_dict
class goods_vector_target_Dataset(dataset.Dataset): #继承Dataset类
def init(self,goods_vector_target_dict):
self.data = goods_vector_target_dict
def len(self):
#返回数据的长度
return len(goods_vector_target_dict)
def getitem(self, g_v_index):
[x,target] = self.data[str(g_v_index)]
label = torch.IntTensor([target])
return x,label
def set_dataloader(goods_vector_target_dict):
g_v_target_Dataset = goods_vector_target_Dataset(goods_vector_target_dict)
x,target = g_v_target_Dataset[0]
DL = DataLoader(g_v_target_Dataset,
batch_size=BATCH_SIZE,
shuffle = True)
return DL
class TEXT_CNN(nn.Module):
def init(self):
super(TEXT_CNN,self).init()
self.conv1 = nn.Conv1d(in_channels=IN_CHANNELS, #卷积前的特征维度
out_channels=OUT_CHANNELS, #卷积后的特征维度
kernel_size=KERNEL_SIZE, #滤波器尺寸
)
self.word_length = WORD_MAX_LENGTH
self.input_word_length_conv = int((self.word_length-KERNEL_SIZE)/1 +1) #卷积之后的特征数量
self.input_word_length_pool = int((self.input_word_length_conv-2)/2 +1) #池化后的特征数量,池化过程不会改变特征维度
self.fc1 = nn.Linear(OUT_CHANNELS * self.input_word_length_pool,500)
self.fc2 = nn.Linear(500,17)
def forward(self,x):
x = self.conv1(x)
# print("x.size() - after conv1 = ", x.size())
x = F.relu(x)
# print("x.size() - after relu = ", x.size())
x = F.max_pool1d(x,2)
# print("x.size() ---after pool ",x.size())
x = x.view(-1, self.input_word_length_pool * OUT_CHANNELS) #第一个参数是batch
# print("x.size() -- after reshape = ",x.size())
x = self.fc1(x)
x = F.relu(x)
# print("x.size() -- x.size() after linear-1 = ",x.size())
x = self.fc2(x)
x = F.log_softmax(x,dim=1)
return x
def train_text_cnn(dataloader):
print(“初始化cnn类,设置损失函数和优化函数”)
net_cnn = TEXT_CNN()
Loss = nn.MultiLabelSoftMarginLoss()
optimizer = optim.Adam(net_cnn.parameters())
print("开始训练")
correct_number = 0
for i, item in enumerate(dataloader): # 遍历loader
# print("i = ",i)
x, label = item # loader出数据和分类
x = x.transpose(2, 1) # 转置维度参数为1和2的数据,让词向量维度(此处为in_channel作为第二个维度)
label = label.float() # 把label标签转化为float类型
# 正向传播
out = net_cnn(x)
# 把数据喂入损失函数
loss = Loss(out, label)
# 清空梯度
optimizer.zero_grad()
# 反向传播
loss.backward()
# 优化权重
optimizer.step()
# 计算预测准确的样本数量
correct = (torch.argmax(out, 1) == torch.argmax(label, 1)).sum().float()
correct_number += correct.item()
# 每训练一千个商品计算损失和准确率
if (i + 1) % 200 == 0:
acc = correct_number / (200 * BATCH_SIZE)
print("acc = %.2f " % (acc * 100) + '%', " i = ", i, " loss = ", loss)
# 清空 correct_num
correct_number = 0
def train_text_rnn(dataloader):
pass
if name == ‘main’:
print(“读取数据…”)
corpus_list,data_goods = read_data()
print("预处理数据............")
goods_vector_target_dict,target_dict = preprocess(corpus_list,data_goods)
length_sample = len(goods_vector_target_dict)
print("length_sample",length_sample)
print("生成数据批量读取器..........")
g_v_target_Dataset = goods_vector_target_Dataset(goods_vector_target_dict)
dataloader = set_dataloader(goods_vector_target_dict)
#train_cnn #训练卷积网络
train_text_cnn(dataloader)
#train_rnn
# 来源 (source): https://blog.csdn.net/weixin_42681868/article/details/100749545