其他分享
首页 > 其他分享 > fasttext训练模型代码

fasttext训练模型代码

作者:互联网

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# author ChenYongSheng
# date 20201222

import pandas as pd
import jieba

# Data preprocessing: load the labelled CSV and the stop-word list.
df = pd.read_csv('data/8qi/xx.csv', header=0)
# Read the stop-word file inside ``with`` so the handle is closed as soon as
# loading finishes. A set is used because the words are only ever tested for
# membership, which is O(1) on a set versus a linear scan on a list.
with open('data/all/stopwords.txt', encoding='utf-8') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}


def remove_stopwords(text_cut, stopwords):
    """Return the tokens of ``text_cut`` that are not stop words.

    Args:
        text_cut: Iterable of tokens, e.g. the list returned by ``jieba.lcut``.
        stopwords: Collection of stop words to drop. Any container that
            supports ``in`` is accepted.

    Returns:
        A new list with the surviving tokens in their original order;
        duplicates are kept.
    """
    # A list/tuple makes every ``in`` test a linear scan over all stop words;
    # hash it once so each lookup is O(1). Other containers (set, frozenset,
    # dict, ...) are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return [word for word in text_cut if word not in stopwords]


# Build one fastText sample per CSV row in the form
#   "__label__<label> , <space-separated tokens>"
# Rows whose index is a multiple of 10 go to the test split, the rest to the
# training split.
lines = []
test_lines = []
for data in df.itertuples():
    label = '__label__' + str(data.label)
    text = str(data.text)
    text_cut = jieba.lcut(text)
    text_remove_stop = remove_stopwords(text_cut, stopwords)
    # Join the tokens in their original order. (Prepending each token to the
    # accumulated string would reverse the sentence, so the word n-grams the
    # model learns would not match naturally ordered text at prediction time.)
    # Whitespace-only tokens are dropped: a '\n' token would otherwise split
    # one sample across several lines of the output file.
    words = ' '.join(word for word in text_remove_stop if word.strip())
    body = label + ' , ' + words
    if data.Index % 10 == 0:
        test_lines.append(body)
    else:
        lines.append(body)

# Write the training split, one sample per line. The ``with`` block closes
# the file on exit, so no explicit close() call is needed.
with open('data/8qi/train.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in lines)

# Write the test split in the same one-sample-per-line format.
with open('data/8qi/test.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_lines)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# author ChenYongSheng
# date 20201222

import fasttext

'''模型训练'''

# Locations of the training and test files consumed below.
trainDataFile = 'data/8qi/train.txt'
testDataFile = 'data/8qi/test.txt'

# Train a supervised fastText classifier and save it to disk.
model = fasttext.train_supervised(
    trainDataFile,
    lr=0.1,
    dim=100,
    epoch=30,
    word_ngrams=2,
    loss='softmax',
)
model.save_model("model/fasttext_model.bin")

# Load the saved model back and evaluate it on the test file.
model = fasttext.load_model('model/fasttext_model.bin')
result = model.test(testDataFile)

# Print the first three entries of the test result, each with its caption.
captions = ('测试集上数据量', '测试集上准确率', '测试集上召回率')
for caption, value in zip(captions, result):
    print(caption, value)

训练数据必须是如下格式(每行一条样本):__label__分类名(空格)(逗号)(空格)(以空格分隔的切词结果)
__label__安静程度 , 吵不吵 房子 那套 肯德基
__label__安静程度 , 吵
__label__安静程度 , 位置 吵 卧室

如果报错 `ValueError: data/7期/train.txt cannot be opened for training!`,
说明数据文件路径中包含中文字符,请将路径改为英文或拼音。

标签:__,fasttext,text,代码,label,result,model,data,模型
来源: https://blog.csdn.net/qq236237606/article/details/111572554