其他分享
首页 > 其他分享 > CNN&RNN对商品名称分类

CNN&RNN对商品名称分类

作者:互联网

import torch
import torch.nn as nn
from torch import optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader,dataset
import torch.nn.functional as F
from matplotlib import pyplot as plt

EPOCHES = 3  # number of training epochs (not yet used by the loops below)

# TEXT_CNN hyper-parameters:
IN_CHANNELS = 100 # input feature dimension: the word-vector size for text (the channel count for images)
OUT_CHANNELS = 256 # channels produced by the convolution, i.e. the number of filters
KERNEL_SIZE = 2 # kernel size along the word axis; the effective filter is (KERNEL_SIZE, embedding_size),
BATCH_SIZE = 5 # i.e. it can be written as (KERNEL_SIZE, IN_CHANNELS)
WORD_MAX_LENGTH = 10  # keep at most this many words per goods name (truncate/pad to this length)
# TEXT_RNN hyper-parameters:

def read_data():
    """Read the keyword corpus and the segmented goods data from Excel.

    Returns:
        corpus_list: list of every corpus keyword (the vocabulary).
        data_goods: DataFrame holding the segmented goods names and their
            first-level categories.
    """
    # NOTE: the original used curly quotes (’…’) here, which are a syntax
    # error in Python source; the paths themselves are unchanged.
    data_corpus = pd.read_excel('./keyword_all.xlsx')  # read all keywords
    corpus_list = list(data_corpus['keyword_all'].values)  # to list (len ~22933)

    data_goods = pd.read_excel('./分词后数据.xlsx')
    return corpus_list, data_goods

def preprocess(corpus_list, data_goods, word_max_length=None, embedding_dim=100):
    """Build per-goods word-embedding tensors and integer class targets.

    Args:
        corpus_list: vocabulary as a list of keywords.
        data_goods: DataFrame with columns 'sku_name_afterdivide'
            ('|'-separated keywords per goods name) and '一级分类' (category).
        word_max_length: max words kept per goods name; defaults to the
            module-level WORD_MAX_LENGTH.
        embedding_dim: dimension of the (randomly initialised) embeddings.

    Returns:
        goods_vector_target_dict: {str(row_index): [embedding_tensor, target]}
            where embedding_tensor has shape (word_max_length, embedding_dim).
        target_dict: {category_name: class_index}.
    """
    if word_max_length is None:
        word_max_length = WORD_MAX_LENGTH

    # Map each word to a 1-based index; index 0 is reserved for padding and
    # out-of-vocabulary words.  (The original used the raw 0-based enumerate
    # index AND treated index 0 as "not found", which silently discarded the
    # first corpus word.)
    corpus_dict = {word: index + 1 for index, word in enumerate(corpus_list)}
    length_corpus_dict = len(corpus_dict)
    print("length_corpus_dict = ", length_corpus_dict)

    # Collect the distinct first-level categories and number them.
    class_list = list(set(data_goods['一级分类'].values))
    print("length_class:", len(class_list))
    target_dict = {class_name: i for i, class_name in enumerate(class_list)}
    print(target_dict)

    # +1 rows so the reserved padding/OOV index 0 is a valid embedding row.
    embedding = nn.Embedding(length_corpus_dict + 1, embedding_dim)

    goods_vector_target_dict = {}  # collects [embedding tensor, target] pairs
    for i in range(data_goods.shape[0]):  # iterate over every goods row
        keywords = data_goods['sku_name_afterdivide'][i]  # goods name
        keyword_list = keywords.split('|')  # all keywords of one goods
        # Indices of this goods' words that exist in the corpus.
        idx_list_onegoods = [corpus_dict[w]
                             for w in keyword_list
                             if w != '' and w in corpus_dict]
        # ~99% of names have <= word_max_length words, so truncate longer
        # names and pad shorter ones with the reserved index 0.
        idx_list_onegoods = idx_list_onegoods[:word_max_length]
        idx_list_onegoods += [0] * (word_max_length - len(idx_list_onegoods))

        idx_tensor = torch.LongTensor(idx_list_onegoods)
        embedding_one_goods = embedding(idx_tensor)  # indices -> vectors

        # Look up the integer target for this goods' category.
        target = target_dict[data_goods['一级分类'][i]]
        goods_vector_target_dict[str(i)] = [embedding_one_goods, target]

    return goods_vector_target_dict, target_dict

class goods_vector_target_Dataset(dataset.Dataset):  # inherits Dataset
    """Dataset over {str(index): [embedding_tensor, target]} samples.

    Fixes vs. the original: the protocol methods must be the dunders
    __init__/__len__/__getitem__ (the plain names were never called by
    DataLoader), and __len__ read the *global* dict instead of the one
    stored on the instance.
    """

    def __init__(self, goods_vector_target_dict):
        # keys are str(row_index); values are [embedding tensor, int target]
        self.data = goods_vector_target_dict

    def __len__(self):
        # Number of samples held by this dataset instance.
        return len(self.data)

    def __getitem__(self, g_v_index):
        [x, target] = self.data[str(g_v_index)]
        label = torch.IntTensor([target])
        return x, label

def set_dataloader(goods_vector_target_dict):
    """Wrap the preprocessed samples in a shuffling DataLoader.

    Args:
        goods_vector_target_dict: {str(index): [embedding_tensor, target]}.

    Returns:
        DataLoader yielding shuffled batches of BATCH_SIZE samples.
    """
    g_v_target_dataset = goods_vector_target_Dataset(goods_vector_target_dict)
    # (the original also fetched sample 0 here; that probe was unused)
    return DataLoader(g_v_target_dataset,
                      batch_size=BATCH_SIZE,
                      shuffle=True)

class TEXT_CNN(nn.Module):
    """1-D CNN text classifier: Conv1d -> ReLU -> max-pool -> two FC layers.

    Fixes vs. the original: the constructor must be __init__ (plain `init`
    is never called by nn.Module), and `super().init()` must be
    `super().__init__()`.  Hyper-parameters are now overridable; the
    defaults (None -> module constants) keep TEXT_CNN() behaving exactly
    as before.
    """

    def __init__(self, in_channels=None, out_channels=None, kernel_size=None,
                 word_max_length=None, num_classes=17):
        super(TEXT_CNN, self).__init__()
        # Fall back to the module-level hyper-parameters.
        in_channels = IN_CHANNELS if in_channels is None else in_channels
        out_channels = OUT_CHANNELS if out_channels is None else out_channels
        kernel_size = KERNEL_SIZE if kernel_size is None else kernel_size
        word_max_length = WORD_MAX_LENGTH if word_max_length is None else word_max_length

        self.out_channels = out_channels
        self.conv1 = nn.Conv1d(in_channels=in_channels,    # feature dim before conv
                               out_channels=out_channels,  # feature dim after conv
                               kernel_size=kernel_size)    # filter width (word axis)
        self.word_length = word_max_length
        # Features remaining after the conv (stride 1, no padding).
        self.input_word_length_conv = (self.word_length - kernel_size) // 1 + 1
        # After max_pool1d with kernel 2 (stride defaults to the kernel size);
        # pooling does not change the channel dimension.
        self.input_word_length_pool = (self.input_word_length_conv - 2) // 2 + 1

        self.fc1 = nn.Linear(out_channels * self.input_word_length_pool, 500)
        self.fc2 = nn.Linear(500, num_classes)  # 17 first-level categories

    def forward(self, x):
        """Map x of shape (batch, in_channels, words) to log-probabilities."""
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 2)
        # Flatten (channels x positions) per sample; first dim is the batch.
        x = x.view(-1, self.input_word_length_pool * self.out_channels)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

def train_text_cnn(dataloader):
    """Run one pass of TEXT_CNN training over `dataloader`.

    Each batch is (x, label) with x of shape (batch, words, dim) and label
    of shape (batch, 1) holding integer class indices.  Logs running
    accuracy and loss every 200 batches.
    """
    print("初始化cnn类,设置损失函数和优化函数")
    net_cnn = TEXT_CNN()
    # The network outputs log-probabilities (log_softmax), so NLLLoss with
    # integer class targets is the matching criterion.  The original fed a
    # (batch, 1) float of class *indices* to MultiLabelSoftMarginLoss
    # (which expects multi-hot targets), and its accuracy term
    # argmax(label, 1) over a single column was always 0.
    Loss = nn.NLLLoss()
    optimizer = optim.Adam(net_cnn.parameters())

    print("开始训练")
    correct_number = 0
    for i, item in enumerate(dataloader):  # iterate over batches
        x, label = item
        # (batch, words, dim) -> (batch, dim, words): Conv1d wants the
        # word-vector dimension as the channel (second) axis.
        x = x.transpose(2, 1)
        target = label.view(-1).long()  # class indices, shape (batch,)

        # forward pass
        out = net_cnn(x)
        loss = Loss(out, target)

        # clear gradients, backpropagate, update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # count correctly-predicted samples
        correct = (torch.argmax(out, 1) == target).sum().float()
        correct_number += correct.item()

        # report loss and accuracy every 200 batches
        if (i + 1) % 200 == 0:
            acc = correct_number / (200 * BATCH_SIZE)
            print("acc = %.2f " % (acc * 100) + '%', "   i = ", i, "   loss = ", loss)
            correct_number = 0

def train_text_rnn(dataloader):
    """Placeholder for the TEXT_RNN training loop (not implemented yet)."""
    pass

if __name__ == '__main__':
    # (the original `if name == ‘main’:` used curly quotes and dropped the
    # dunder underscores — both syntax/logic errors)
    print("读取数据…")
    corpus_list, data_goods = read_data()

    print("预处理数据............")
    goods_vector_target_dict, target_dict = preprocess(corpus_list, data_goods)
    length_sample = len(goods_vector_target_dict)
    print("length_sample", length_sample)

    print("生成数据批量读取器..........")
    # set_dataloader builds its own Dataset; the extra construction the
    # original did here was unused.
    dataloader = set_dataloader(goods_vector_target_dict)

    # train the convolutional network
    train_text_cnn(dataloader)

    # train_rnn: see train_text_rnn (not implemented yet)

#train_rnn

标签:goods,target,商品名称,self,list,print,dict,CNN,RNN
来源: https://blog.csdn.net/weixin_42681868/article/details/100749545