首页 > 其他分享> > 『NLP学习笔记』BERT命名实体识别(NER)实战

『NLP学习笔记』BERT命名实体识别(NER)实战

2021-12-20 23:03:18 作者：互联网

BERT命名实体识别(NER)实战！

文章目录

一. 数据集介绍
二. 数据集读取&预处理
三. 数据分词tokenizer
四. 定义数据读取(继承Dataset)
五. 定义模型&优化器&学习率
六. 训练测试以及准确率
七. 模型预测
八. 整个代码
八. 参考

BERT技术详细介绍： https://zhangkaifang.blog.csdn.net/article/details/120507302
BERT命名实体识别模型如下：

一. 数据集介绍

实验使用的数据集是微软亚洲研究院提供的词性标注数据集，其目标是识别文本中具有特定意义的实体,包括人名、地名、机构名。链接：https://mirror.coggle.club/dataset/ner/msra.zip
百度云链接: https://pan.baidu.com/s/17MRMTrQKWJ6-HUI-rWL80A 提取码: oq7w

二. 数据集读取&预处理

import codecs

################## 1. 读取数据
# 训练数据和标签
train_lines = codecs.open('msra/train/sentences.txt').readlines()
train_lines = [x.replace(' ', '').strip() for x in train_lines]  # 用于移除字符串开头和结尾指定的字符（默认为空格或换行符）或字符序列。
train_tags = codecs.open('msra/train/tags.txt').readlines()
train_tags = [x.strip().split(' ') for x in train_tags]
train_tags = [[tag_type.index(x) for x in tag] for tag in train_tags]
train_lines, train_tags = train_lines[:20000], train_tags[:20000]  # 只取两万数据
print(train_lines[0], "\n", train_tags[0])
# 如何解决足球界长期存在的诸多矛盾，重振昔日津门足球的雄风，成为天津足坛上下内外到处议论的话题。 
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# 验证数据和标签
val_lines = codecs.open('msra/val/sentences.txt').readlines()
val_lines = [x.replace(' ', '').strip() for x in val_lines]
val_tags = codecs.open('msra/val/tags.txt').readlines()
val_tags = [x.strip().split(' ') for x in val_tags]
val_tags = [[tag_type.index(x) for x in tag] for tag in val_tags]

三. 数据分词tokenizer

注意：中文注意加list(train_lines),因为不加因为单词作为整体了。

################## 2. 对数据进行分词
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# 中文注意加list(train_lines),因为不加因为单词作为整体了。
max_length = 64
train_encoding = tokenizer.batch_encode_plus(list(train_lines), truncation=True, padding=True, max_length=max_length)
val_encoding = tokenizer.batch_encode_plus(list(val_lines), truncation=True, padding=True, max_length=max_length)

四. 定义数据读取(继承Dataset)

注意：下面labels需要填充开头cls，结尾部分不够maxlen也要填0。

################## 3. 定义Dataset类对象
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx][:maxlen]) for key, value in self.encodings.items()}
        # 字级别的标注，注意填充cls，这里[0]代表cls。后面不够长的这里也是补充0，样本tokenizer的时候已经填充了
        # item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (63-len(self.labels[idx])))[:64]
        item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (maxlen - 1 - len(self.labels[idx])))[:maxlen]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = TextDataset(train_encoding, train_tags)
test_dataset = TextDataset(val_encoding, val_tags)
print(train_dataset[0])

# Dataset转换成Dataloader
batchsz = 32
train_loader = DataLoader(train_dataset, batch_size=batchsz, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchsz, shuffle=True)

五. 定义模型&优化器&学习率

from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

################## 4. 定义模型
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=7)
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)

# 优化器和学习率
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)  # Default value in run_glue.py

六. 训练测试以及准确率

注意：outputs输出结果中的logits，NER其实就是对每个token进行分类。

然后对dim=2维度上取argmax，找出每个位置所属的类别下标。

# 这里测试计算准确率中的：
a = torch.tensor([1, 2, 3, 4, 2])
b = torch.tensor([1, 2, 4, 3, 2])
print((a==b).float().mean())
print((a==b).float().mean().item())

代码如下：

from tqdm import tqdm

def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for idx, batch in enumerate(train_loader):
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # loss = outputs[0]

        loss = outputs.loss
        
        if idx % 20 == 0:
            with torch.no_grad():
                # 64 * 7
                print((outputs[1].argmax(2).data == labels.data).float().mean().item(), loss.item())
        
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += (outputs[1].argmax(2).data == labels.data).float().mean().item()
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

七. 模型预测

model = torch.load('bert-ner.pt')

tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

def predcit(s):
    item = tokenizer([s], truncation=True, padding='longest', max_length=64) # 加一个list
    with torch.no_grad():
        input_ids = torch.tensor(item['input_ids']).to(device).reshape(1, -1)
        attention_mask = torch.tensor(item['attention_mask']).to(device).reshape(1, -1)
        labels = torch.tensor([0] * attention_mask.shape[1]).to(device).reshape(1, -1)
        
        outputs = model(input_ids, attention_mask, labels)
        outputs = outputs[0].data.cpu().numpy()
        
    outputs = outputs[0].argmax(1)[1:-1]
    ner_result = ''
    ner_flag = ''
    
    for o, c in zip(outputs,s):
        # 0 就是 O，没有含义
        if o == 0 and ner_result == '':
            continue
        
        # 
        elif o == 0 and ner_result != '':
            if ner_flag == 'O':
                print('机构：', ner_result)
            if ner_flag == 'P':
                print('人名：', ner_result)
            if ner_flag == 'L':
                print('位置：', ner_result)
                
            ner_result = ''
        
        elif o != 0:
            ner_flag = tag_type[o][2]
            ner_result += c
    return outputs

s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '人工智能是未来的希望，也是中国和美国的冲突点。'
data = predcit(s)
s = '明天我们一起在海淀吃个饭吧，把叫刘涛和王华也叫上。'
data = predcit(s)
s = '同煤集团同生安平煤业公司发生井下安全事故 19名矿工遇难'
data = predcit(s)
s = '山东省政府办公厅就平邑县玉荣商贸有限公司石膏矿坍塌事故发出通报'
data = predcit(s)
s = '[新闻直播间]黑龙江:龙煤集团一煤矿发生火灾事故'
data = predcit(s)

位置： 华盛顿
位置： 美国总统府白宫
位置： 菲律宾总统府马拉卡南宫
位置： 华盛顿
位置： 美国总统府白宫
位置： 菲律宾总统府马拉卡南宫
位置： 中国
位置： 美国
位置： 海淀
人名： 刘涛
人名： 王华
机构： 同煤集团同生安平煤业公司
机构： 山东省政府办公厅
机构： 平邑县玉荣商贸有限公司
位置： 黑龙江
机构： 龙煤集团

八. 整个代码

此外提供了notebook版本代码，百度云: https://pan.baidu.com/s/1tiLqvsdzuBgWFb6defNyWg 提取码: gu8t

# !/usr/bin/env python
# -*- encoding: utf-8 -*-
"""=====================================
@author : kaifang zhang
@time   : 2021/12/19 1:33 PM
@contact: kaifang.zkf@dtwave-inc.com
====================================="""
import codecs
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import BertTokenizer
from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']
# B-ORG I-ORG 机构的开始位置和中间位置
# B-PER I-PER 人物名字的开始位置和中间位置
# B-LOC I-LOC 位置的开始位置和中间位置

################## 1. 读取数据
# 训练数据和标签
train_lines = codecs.open('msra/train/sentences.txt').readlines()
train_lines = [x.replace(' ', '').strip() for x in train_lines]  # 用于移除字符串开头和结尾指定的字符（默认为空格或换行符）或字符序列。
train_tags = codecs.open('msra/train/tags.txt').readlines()
train_tags = [x.strip().split(' ') for x in train_tags]
train_tags = [[tag_type.index(x) for x in tag] for tag in train_tags]
train_lines, train_tags = train_lines[:20000], train_tags[:20000]  # 只取两万数据
print(f"样例数据：{train_lines[0]} \n样例标签：{train_tags[0]}")

# 验证数据和标签
val_lines = codecs.open('msra/val/sentences.txt').readlines()
val_lines = [x.replace(' ', '').strip() for x in val_lines]
val_tags = codecs.open('msra/val/tags.txt').readlines()
val_tags = [x.strip().split(' ') for x in val_tags]
val_tags = [[tag_type.index(x) for x in tag] for tag in val_tags]  # 标签转换为数值

################## 2. 对数据进行分词
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# 中文注意加list(train_lines),因为不加因为单词作为整体了。
maxlen = 64
train_encoding = tokenizer.batch_encode_plus(list(train_lines), truncation=True, padding=True, max_length=maxlen)
val_encoding = tokenizer.batch_encode_plus(list(val_lines), truncation=True, padding=True, max_length=maxlen)

################## 3. 定义Dataset类对象
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx][:maxlen]) for key, value in self.encodings.items()}
        # 字级别的标注，注意填充cls，这里[0]代表cls。后面不够长的这里也是补充0，样本tokenizer的时候已经填充了
        # item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (63-len(self.labels[idx])))[:64]
        item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (maxlen - 1 - len(self.labels[idx])))[:maxlen]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encoding, train_tags)
test_dataset = TextDataset(val_encoding, val_tags)
batchsz = 32
train_loader = DataLoader(train_dataset, batch_size=batchsz, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchsz, shuffle=True)

# print(train_dataset[0])

# 测试样本是否满足最大长度
for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)
for idx in range(len(test_dataset)):
    item = test_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)

################## 4. 定义模型
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=7)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

# 优化器和学习率
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)  # Default value in run_glue.py

################## 4. 训练测试以及字符的分类准确率
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # shape: [32, 64]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        # loss = outputs.loss
        logits1 = outputs[1]  # shape: [32, 64, 7]
        out = logits1.argmax(dim=2)
        out1 = out.data
        # logits2 = outputs.logits

        if idx % 20 == 0:  # 看模型的准确率
            with torch.no_grad():
                # 假如输入的是64个字符，64 * 7
                print((outputs[1].argmax(2).data == labels.data).float().mean().item(), loss.item())

        total_train_loss += loss.item()

        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 参数更新
        optimizer.step()
        scheduler.step()

        iter_num += 1
        if (iter_num % 100 == 0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (
                epoch, iter_num, loss.item(), iter_num / total_iter * 100))

    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))

def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += (outputs[1].argmax(2).data == labels.data).float().mean().item()

    avg_val_accuracy = total_eval_accuracy / len(test_loader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" % (total_eval_loss / len(test_loader)))
    print("-------------------------------")

# tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']
def predcit(s):
    item = tokenizer([s], truncation=True, padding='longest', max_length=64)  # 加一个list
    with torch.no_grad():
        input_ids = torch.tensor(item['input_ids']).to(device).reshape(1, -1)
        attention_mask = torch.tensor(item['attention_mask']).to(device).reshape(1, -1)
        labels = torch.tensor([0] * attention_mask.shape[1]).to(device).reshape(1, -1)

        outputs = model(input_ids, attention_mask, labels)
        outputs = outputs[0].data.cpu().numpy()

    outputs = outputs[0].argmax(1)[1:-1]
    ner_result = ''
    ner_flag = ''

    for o, c in zip(outputs, s):
        # 0 就是 O，没有含义
        if o == 0 and ner_result == '':
            continue
        #
        elif o == 0 and ner_result != '':
            if ner_flag == 'O':
                print('机构：', ner_result)
            if ner_flag == 'P':
                print('人名：', ner_result)
            if ner_flag == 'L':
                print('位置：', ner_result)

            ner_result = ''

        elif o != 0:
            ner_flag = tag_type[o][2]
            ner_result += c
    return outputs

# for epoch in range(4):
#     print("------------Epoch: %d ----------------" % epoch)
#     train()
#     validation()
# torch.save(model, 'bert-ner.pt')

model = torch.load('/data/aibox/kaifang/NLP学习资料/bert-ner.pt')
s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '人工智能是未来的希望，也是中国和美国的冲突点。'
data = predcit(s)
s = '明天我们一起在海淀吃个饭吧，把叫刘涛和王华也叫上。'
data = predcit(s)
s = '同煤集团同生安平煤业公司发生井下安全事故 19名矿工遇难'
data = predcit(s)
s = '山东省政府办公厅就平邑县玉荣商贸有限公司石膏矿坍塌事故发出通报'
data = predcit(s)
s = '[新闻直播间]黑龙江:龙煤集团一煤矿发生火灾事故'
data = predcit(s)

八. 参考

主要参考dasou博主的视频：https://www.bilibili.com/video/BV1Ey4y1874y?p=6&spm_id_from=pageDriver
腾讯Bugly的专栏：图解BERT模型：从零开始构建BERT

Bert源代码解读-以BERT文本分类代码为例子：https://github.com/DA-southampton/Read_Bert_Code
BERT大火却不懂Transformer？读这一篇就够了：https://zhuanlan.zhihu.com/p/54356280
pytorch 中加载BERT模型, 获取词向量：https://blog.csdn.net/znsoft/article/details/107725285
Bert生成句向量(pytorch)
https://blog.csdn.net/weixin_41519463/article/details/100863313
学习率预热(transformers.get_linear_schedule_with_warmup)：https://blog.csdn.net/orangerfun/article/details/120400247

标签：NLP,loss,outputs,labels,item,train,BERT,NER,tags
来源： https://blog.csdn.net/abc13526222160/article/details/122052004