fasttext训练模型代码
作者:互联网
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author ChenYongSheng
# date 20201222
import pandas as pd
import jieba
# Data preprocessing: load the labelled corpus and the stop-word list.
# The first CSV row is treated as the header (header=0).
df = pd.read_csv('data/8qi/xx.csv', header=0)
# Read the stop words inside a `with` block so the file handle is closed as
# soon as the list is built; each entry is stripped of surrounding whitespace.
with open('data/all/stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]
def remove_stopwords(text_cut, stopwords):
    """Return the tokens of ``text_cut`` that are not stop words.

    Args:
        text_cut: Iterable of tokens, e.g. the list produced by a tokenizer.
        stopwords: Container supporting ``in`` that holds the tokens to drop.
            Passing a ``set`` makes each lookup constant-time.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [word for word in text_cut if word not in stopwords]
# Build one fastText sample per CSV row, formatted as
# "__label__<label> , <space-separated tokens>", and split the samples into a
# training set and a test set.
lines = []
test_lines = []
# Set membership is O(1); converting once here avoids scanning the whole
# stop-word list for every token of every row.
stopword_set = set(stopwords)
for data in df.itertuples():
    label = '__label__' + str(data.label)
    text = str(data.text)
    text_cut = jieba.lcut(text)
    text_remove_stop = remove_stopwords(text_cut, stopword_set)
    # Drop whitespace-only tokens. A token holding a line break would
    # otherwise be written verbatim and split this sample across two lines
    # of the output file, leaving the second half without a label.
    tokens = [word for word in text_remove_stop if word.strip()]
    # The tokens are emitted in reverse order, exactly as the previous
    # prepend-in-a-loop code did; join() builds the string in a single pass.
    words = ' '.join(reversed(tokens))
    body = label + ' , ' + words
    # Rows whose index is a multiple of 10 go to the test set, the rest to
    # the training set.
    if data.Index % 10 == 0:
        test_lines.append(body)
    else:
        lines.append(body)

# The `with` statement closes each file on exit; no explicit close() needed.
with open('data/8qi/train.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in lines)
with open('data/8qi/test.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_lines)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author ChenYongSheng
# date 20201222
import fasttext
# Model training and evaluation.
import os

# Create the output directory up front so a missing folder is not discovered
# only after training has finished, when save_model() tries to write.
os.makedirs('model', exist_ok=True)

trainDataFile = 'data/8qi/train.txt'
model = fasttext.train_supervised(trainDataFile, lr=0.1, dim=100, epoch=30, word_ngrams=2, loss='softmax')
model.save_model("model/fasttext_model.bin")

testDataFile = 'data/8qi/test.txt'
# Reload the saved model from disk and evaluate that copy on the test file.
model = fasttext.load_model('model/fasttext_model.bin')
# Per the fastText Python docs, test() returns (number of samples,
# precision@1, recall@1); the printed labels are kept as in the original.
result = model.test(testDataFile)
print('测试集上数据量', result[0])
print('测试集上准确率', result[1])
print('测试集上召回率', result[2])
训练和测试数据的每一行都必须符合如下格式:__label__分类名(空格)(逗号)(空格)(以空格分隔的切词结果),例如:
__label__安静程度 , 吵不吵 房子 那套 肯德基
__label__安静程度 , 吵
__label__安静程度 , 位置 吵 卧室
如果报错 ValueError: data/7期/train.txt cannot be opened for training!
说明数据文件路径中包含中文字符,请把路径改成英文或拼音。
标签:__,fasttext,text,代码,label,result,model,data,模型 来源: https://blog.csdn.net/qq236237606/article/details/111572554