首页 > 其他分享 > 【动手学深度学习pytorch】学习笔记 8.3 语言模型和数据集

【动手学深度学习pytorch】学习笔记 8.3 语言模型和数据集

2022-06-10 00:01:10 作者：互联网

8.3. 语言模型和数据集 — 动手学深度学习 2.0.0-beta0 documentation (d2l.ai)

这一小节有一定难度，需要耐心阅读。

主要讲了三段程序：自然语言统计；读取长序列数据；数据迭代器

自然语言统计

知识点：n元语法。属于NLP基础。

简单地说，一元语法只需考虑“自己”；二元语法要考虑“自己”和“自己前面的词元”；三元语法要考虑“自己”、“自己前面的词元”以及“自己前面的词元的前面的词元”。

读取长序列数据

这段程序比较晦涩，需要熟悉python语法。多加点print语句输出，观察数值变化，有助于读懂程序。

目标：长序列拆成短序列，方便处理

方法：随机生成一个小批量数据的特征和标签以供读取

标签的值：目标是基于到目前为止我们看到的词元来预测下一个词元，因此标签是移位了一个词元的原始序列

随机采样（random sampling）：每个样本都是在原始的长序列上任意捕获的子序列

顺序分区（sequential partitioning）：基于小批量的迭代过程中保留了拆分的子序列的顺序

数据迭代器

将上面的两个采样函数包装到一个类中，以便稍后可以将其用作数据迭代器。

函数load_data_time_machine，它同时返回数据迭代器和词表

作者把前面的知识点综合起来，写了一个类，一个函数，便于后面使用，很贴心。

如果觉得这部分读起来有点吃力的话，可以当黑盒处理，了解输入和输出即可。

这部分主要是考验基础编程，对学习DL影响不大。

自然语言统计；

import random
import matplotlib.pyplot as plt
import torch
from d2l import torch as d2l

# Tokenise the text returned by d2l.read_time_machine().
tokens = d2l.tokenize(d2l.read_time_machine())
# A text line is not necessarily a sentence or a paragraph, so all lines are
# concatenated into one flat token list.
corpus = [tok for row in tokens for tok in row]
vocab = d2l.Vocab(corpus)
print(vocab.token_freqs[:10])

# Unigram frequencies, drawn on log-log axes.
freqs = [count for _, count in vocab.token_freqs]
d2l.plot(
    freqs,
    xlabel='token: x',
    ylabel='frequency: n(x)',
    xscale='log',
    yscale='log',
)
plt.show()

# Bigrams: each token paired with the token that follows it.
bigram_tokens = list(zip(corpus[:-1], corpus[1:]))
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])

# Trigrams: each token grouped with the two tokens that follow it.
trigram_tokens = list(zip(corpus[:-2], corpus[1:-1], corpus[2:]))
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])

# Overlay the three frequency curves in a single log-log figure.
bigram_freqs = [count for _, count in bigram_vocab.token_freqs]
trigram_freqs = [count for _, count in trigram_vocab.token_freqs]
d2l.plot(
    [freqs, bigram_freqs, trigram_freqs],
    xlabel='token: x',
    ylabel='frequency: n(x)',
    xscale='log',
    yscale='log',
    legend=['unigram', 'bigram', 'trigram'],
)
plt.show()

输出一元语法、二元语法、三元语法的词元频率：

[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
[(('of', 'the'), 309), (('in', 'the'), 169), (('i', 'had'), 130), (('i', 'was'), 112), (('and', 'the'), 109), (('the', 'time'), 102), (('it', 'was'), 99), (('to', 'the'), 85), (('as', 'i'), 78), (('of', 'a'), 73)]
[(('the', 'time', 'traveller'), 59), (('the', 'time', 'machine'), 30), (('the', 'medical', 'man'), 24), (('it', 'seemed', 'to'), 16), (('it', 'was', 'a'), 15), (('here', 'and', 'there'), 15), (('seemed', 'to', 'me'), 14), (('i', 'did', 'not'), 14), (('i', 'saw', 'the'), 13), (('i', 'began', 'to'), 13)]

读取长序列数据；

import random
import matplotlib.pyplot as plt
import torch
from d2l import torch as d2l


# Toy sequence 0..34; consecutive integers make the sampled windows easy to
# recognise in the printed output.
my_seq = [*range(35)]
print('my_seq :', my_seq)


# """使用随机抽样生成一个小批量子序列"""
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Yield minibatches of subsequences sampled at random from ``corpus``.

    A random-length prefix is dropped, the remainder is cut into
    non-overlapping windows of ``num_steps`` items, the window start
    positions are shuffled, and the windows are served ``batch_size`` at a
    time.  Intermediate values are printed so the sampling can be traced.

    Args:
        corpus: Sliceable sequence of items that ``torch.tensor`` can convert.
        batch_size: Number of windows per minibatch.
        num_steps: Number of items per window.

    Yields:
        ``(X, Y)`` pairs of tensors built from ``batch_size`` windows each;
        every row of ``Y`` is the matching row of ``X`` moved one position
        to the right in the sequence.
    """
    # Drop between 0 and num_steps - 1 leading items (randint is inclusive on
    # both ends) so window boundaries are not fixed.
    offset = random.randint(0, num_steps - 1)
    print('random_num:', offset)

    # Everything from position `offset` to the end forms the working sequence.
    shifted = corpus[offset:]
    print('corpus:', shifted)

    # One item is held back (the "- 1") because every label window reaches
    # one position past its feature window.  "//" is floor division.
    subseq_count = (len(shifted) - 1) // num_steps
    print('num_subseqs:', subseq_count)

    # Start index of each length-num_steps window inside `shifted`.
    starts = [k * num_steps for k in range(subseq_count)]
    print('initial_indices:', starts)

    # After shuffling, windows that share or neighbour a minibatch are not
    # necessarily adjacent in the original sequence.
    random.shuffle(starts)

    batch_count = subseq_count // batch_size
    for batch_no in range(batch_count):
        lo = batch_no * batch_size
        # Randomised start indices belonging to this minibatch.
        batch_starts = starts[lo:lo + batch_size]
        print('initial_indices_per_batch:', batch_starts)
        features = [shifted[s:s + num_steps] for s in batch_starts]
        labels = [shifted[s + 1:s + 1 + num_steps] for s in batch_starts]
        yield torch.tensor(features), torch.tensor(labels)


# print('随机抽样')
# Draw random-sampling minibatches from the toy sequence and print each
# feature/label pair.
random_batches = seq_data_iter_random(my_seq, batch_size=2, num_steps=5)
for X, Y in random_batches:
    print('X: ', X, '\nY:', Y)


# """使用顺序分区生成一个小批量子序列"""
# def seq_data_iter_sequential(corpus, batch_size, num_steps):  # @save
#     # 从随机偏移量开始划分序列
#     offset = random.randint(0, num_steps)
#     num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
#     Xs = torch.tensor(corpus[offset: offset + num_tokens])
#     Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
#     Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
#     num_batches = Xs.shape[1] // num_steps
#     for i in range(0, num_steps * num_batches, num_steps):
#         X = Xs[:, i: i + num_steps]
#         Y = Ys[:, i: i + num_steps]
#         yield X, Y
#
#
# print('顺序分区')
# for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
#     print('X: ', X, '\nY:', Y)

输出：

my_seq : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
random_num: 3
corpus: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
num_subseqs: 6
initial_indices: [0, 5, 10, 15, 20, 25]
initial_indices_per_batch: [20, 25]
X: tensor([[23, 24, 25, 26, 27], [28, 29, 30, 31, 32]])
Y: tensor([[24, 25, 26, 27, 28], [29, 30, 31, 32, 33]])
initial_indices_per_batch: [10, 5]
X: tensor([[13, 14, 15, 16, 17], [ 8, 9, 10, 11, 12]])
Y: tensor([[14, 15, 16, 17, 18], [ 9, 10, 11, 12, 13]])
initial_indices_per_batch: [0, 15]
X: tensor([[ 3, 4, 5, 6, 7], [18, 19, 20, 21, 22]])
Y: tensor([[ 4, 5, 6, 7, 8], [19, 20, 21, 22, 23]])

数据迭代器

import random
import matplotlib.pyplot as plt
import torch
from d2l import torch as d2l


class SeqDataLoader:
    """Iterable that serves minibatches of sequence data.

    The constructor selects one of the two ``d2l`` sampling functions and
    loads a corpus and vocabulary via ``d2l.load_corpus_time_machine``.
    Iterating the object calls the selected function with the stored corpus,
    batch size and step count.
    """

    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        """Choose the sampling function and load the corpus.

        Args:
            batch_size: Forwarded to the sampling function on iteration.
            num_steps: Forwarded to the sampling function on iteration.
            use_random_iter: Truthy selects ``d2l.seq_data_iter_random``;
                falsy selects ``d2l.seq_data_iter_sequential``.
            max_tokens: Passed to ``d2l.load_corpus_time_machine``
                (presumably a cap on corpus length -- confirm against d2l).
        """
        self.data_iter_fn = (
            d2l.seq_data_iter_random
            if use_random_iter
            else d2l.seq_data_iter_sequential
        )
        loaded = d2l.load_corpus_time_machine(max_tokens)
        self.corpus, self.vocab = loaded
        self.batch_size = batch_size
        self.num_steps = num_steps

    def __iter__(self):
        """Return whatever the selected sampling function produces."""
        sampler = self.data_iter_fn
        return sampler(self.corpus, self.batch_size, self.num_steps)


def load_data_time_machine(batch_size, num_steps, use_random_iter=False, max_tokens=10000):
    """Build a ``SeqDataLoader`` and return it together with its vocabulary.

    Args:
        batch_size: Forwarded to ``SeqDataLoader``.
        num_steps: Forwarded to ``SeqDataLoader``.
        use_random_iter: Forwarded to ``SeqDataLoader``; defaults to False.
        max_tokens: Forwarded to ``SeqDataLoader``; defaults to 10000.

    Returns:
        A ``(loader, vocab)`` tuple, where ``vocab`` is the loader's
        ``vocab`` attribute.
    """
    loader = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    vocab = loader.vocab
    return loader, vocab

标签：8.3,random,batch,学习,pytorch,num,steps,corpus,size
来源： https://www.cnblogs.com/hbuwyg/p/16361670.html