nltk 获取 gutenberg 语料，gensim 生成词库和 onehot 编码

fendouai • 2017年8月2日 pm5:05 • 自然语言处理

nltk 获取 gutenberg 语料
gensim 生成词库和 onehot 编码
正在尝试基于 Tensorflow LSTM 模型开发另外一个项目，需要自然语言处理的工具和语料。

import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array(object):
    sentences=None
    token2id_dic=None
    def __init__(self,sentences):
        self.sentences=sentences
        self.token2id_dic=self.get_token2id_dic()

    def get_sentences(self):
        #macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
        #print(macbeth_sentences)
        #print(type(macbeth_sentences))
        print(len(macbeth_sentences))
        sentences_list=[sentence for sentence in self.sentences]
        #print(type(macbeth_list))
        return sentences_list

    def get_token2id_dic(self):
        # collect statistics about all tokens
        dictionary = corpora.Dictionary(self.sentences)
        # remove stop words and words that appear only once
        dictionary.compactify() # remove gaps in id sequence after words that were removed
        print(len(dictionary))
        token2id_dic=dictionary.token2id
        return token2id_dic

    def word2onehot(self,word):
        onehot_list=np.zeros(8192)
        onehot_list[self.token2id_dic[word]]=1
        return onehot_list

    def sent2vec(self,sentence):
        vec=[]
        if(len(sentence)>20):
            sentence=sentence[0:20]
        for word in sentence:
            onehot_list=self.word2onehot(word)
            vec.append(onehot_list)
        len_vec=len(vec)
        for i in range(0,20-len_vec):
            vec.append(np.zeros(8192))
        #print(len(vec))
        vec_np=np.asarray(vec)
        return vec_np

    def sentences2array(self):
        array=[]
        for sentence in self.sentences:
            array.append(self.sent2vec(sentence))
        return array

    def gen_batch(self):
        pass

if __name__ == '__main__':
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    book_array=Book2Array(macbeth_sentences)
    book_array.get_sentences()
    array=book_array.sentences2array()
    np_array=np.array(array[0])
    print(np_array.shape)

原创文章，作者：fendouai，如若转载，请注明出处：https://panchuang.net/2017/08/02/chatgirl-project-gensim-nltk-onehot/

赞 (0)

fendouai 注册用户

0 0

FaceRank Github Trending Python 排名第4

« 上一篇 2017年8月2日 pm4:48

Tensorflow BasicLSTMCell 初始化参数

下一篇 » 2017年8月2日 pm9:36

发表评论取消回复

登录后才能评论

联系我们

400-800-8888

在线咨询：

邮件：admin@example.com

工作时间：周一至周五，9:30-18:30，节假日休息