其他分享
首页 > 其他分享 > 加载词向量

加载词向量

作者:互联网

import numpy as np
from numpy import dtype, fromstring, float32 as REAL
# Example input: fname = 'glove.refine.txt'
class WordLoader(object):
    """Load word embeddings from a file with a ``<vocab_size> <vec_size>`` header."""

    def load_word_vector(self, fname, binary=None):
        """Read word vectors from ``fname`` into a dictionary.

        The first line of the file must contain two integers: the number of
        entries and the vector dimensionality.

        * Text layout: one entry per line, ``word v1 v2 ... vN`` separated by
          whitespace.
        * Binary layout: for each entry, the word bytes, a single space, then
          ``vec_size`` float32 values in raw form. Newline bytes encountered
          while reading a word are ignored.

        Args:
            fname: Path of the embedding file.
            binary: ``True`` for the binary layout, ``False`` for the text
                layout. When ``None`` the layout is inferred from the file
                extension (``.txt`` -> text, ``.bin`` -> binary).

        Returns:
            dict mapping each word (``str``) to a 1-D ``numpy.ndarray`` of
            dtype float32.

        Raises:
            NotImplementedError: ``binary`` is ``None`` and the extension is
                neither ``.txt`` nor ``.bin``.
            ValueError: the header line does not hold exactly two integers.
        """
        if binary is None:
            if fname.endswith('.txt'):
                binary = False
            elif fname.endswith('.bin'):
                binary = True
            else:
                # The layout cannot be guessed from an unknown extension.
                raise NotImplementedError('Cannot infer binary from %s' % (fname))

        vocab = {}
        # Always read raw bytes: the binary payload must not go through text
        # decoding, and the text layout is decoded explicitly per token.
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vec_size = map(int, header.split())
            if binary:
                # Number of bytes occupied by one vector (4 bytes per float32).
                binary_len = dtype(REAL).itemsize * vec_size
                for _ in range(vocab_size):
                    word_bytes = bytearray()
                    hit_eof = False
                    while True:
                        ch = fin.read(1)
                        if not ch:
                            # End of file before the word terminator: stop
                            # instead of spinning forever on empty reads.
                            hit_eof = True
                            break
                        if ch == b' ':
                            break
                        if ch != b'\n':
                            word_bytes += ch
                    if hit_eof:
                        break
                    raw = fin.read(binary_len)
                    if len(raw) < binary_len:
                        # Truncated vector: nothing usable is left in the file.
                        break
                    try:
                        word = word_bytes.decode('utf-8')
                    except UnicodeDecodeError:
                        # Best effort: skip entries whose word is not valid UTF-8.
                        continue
                    # Copy so the array is writable and owns its memory.
                    vocab[word] = np.frombuffer(raw, dtype=REAL).copy()
            else:
                for line in fin:
                    # Splitting the bytes only breaks on ASCII whitespace, so
                    # unusual Unicode characters inside a token are preserved.
                    parts = line.split()
                    if not parts:
                        continue
                    try:
                        word = parts[0].decode('utf-8')
                        weights = np.array([float(p) for p in parts[1:]], dtype=REAL)
                    except ValueError:
                        # Best effort: skip malformed lines (bad number or bad
                        # UTF-8; UnicodeDecodeError is a ValueError subclass).
                        continue
                    vocab[word] = weights
        return vocab

标签:vocab,binary,word,REAL,fname,fin,向量,加载
来源: https://blog.51cto.com/u_14540820/2759518