1. 磐创AI首页
  2. 自然语言处理

nltk 获取 gutenberg 语料,gensim 生成词库和 onehot 编码

nltk 获取 gutenberg 语料
gensim 生成词库和 onehot 编码
笔者正在尝试基于 TensorFlow 的 LSTM 模型开发另外一个项目,需要用到自然语言处理的工具和语料。下面的代码使用 nltk 获取 gutenberg 语料,并使用 gensim 生成词库和 one-hot 编码。

import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array(object):
    """Turn tokenised sentences into fixed-size one-hot encoded arrays.

    A gensim ``Dictionary`` is built from the sentences to assign every
    token an integer id; each word is then encoded as a one-hot vector of
    width ``vocab_size`` and each sentence as a ``(max_len, vocab_size)``
    matrix (truncated or zero-padded to ``max_len`` rows).

    Args:
        sentences: Iterable of sentences, each a sequence of string tokens.
        vocab_size: Width of every one-hot vector. Must be larger than the
            highest token id, otherwise encoding raises ``IndexError``.
        max_len: Number of rows (words) kept per encoded sentence.
    """

    # Class-level defaults so the encoders also work on instances that
    # were created without going through __init__.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a plain list."""
        # Use the instance's own sentences rather than a module-level global,
        # so the method works when the class is imported from another module.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build the token -> integer id mapping from ``self.sentences``.

        Prints the vocabulary size and returns the mapping.
        """
        # Collect statistics about all tokens.
        dictionary = corpora.Dictionary(self.sentences)
        # No tokens are filtered out here; compactify() only reassigns ids
        # so that the id sequence has no gaps.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector (length ``vocab_size``) for ``word``.

        Raises:
            KeyError: If ``word`` is not in the token mapping.
            IndexError: If the word's id does not fit in ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode ``sentence`` as a ``(max_len, vocab_size)`` array.

        Sentences longer than ``max_len`` are truncated; shorter ones are
        padded with all-zero rows.
        """
        # Preallocate the full matrix: unused rows already act as padding.
        matrix = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            matrix[row] = self.word2onehot(word)
        return matrix

    def sentences2array(self):
        """Return a list with one encoded matrix per sentence.

        NOTE: every matrix holds ``max_len * vocab_size`` floats, so the
        result can be very large for a big corpus.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass

if __name__ == '__main__':
    # Demo: encode every sentence of Macbeth and show the shape of the
    # first encoded sentence.
    # The name `macbeth_sentences` is kept stable on purpose: it becomes a
    # module-level global when this file is run as a script.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')

    # Building the converter also builds (and prints the size of) the vocabulary.
    converter = Book2Array(macbeth_sentences)
    converter.get_sentences()

    encoded_sentences = converter.sentences2array()
    first_encoded = np.array(encoded_sentences[0])
    print(first_encoded.shape)

原创文章,作者:fendouai,如若转载,请注明出处:https://panchuang.net/2017/08/02/chatgirl-project-gensim-nltk-onehot/

发表评论

登录后才能评论

联系我们

400-800-8888

在线咨询:点击这里给我发消息

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息