使用 nltk 获取 gutenberg 语料,使用 gensim 生成词库和 one-hot 编码

使用 nltk 获取 gutenberg 语料
使用 gensim 生成词库和 one-hot 编码
我正在尝试基于 TensorFlow 的 LSTM 模型开发另一个项目,因此需要自然语言处理的工具和语料。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array(object):
    """Turn tokenized sentences into fixed-size one-hot arrays.

    Each sentence is a sequence of string tokens. A token-to-id mapping is
    built from all sentences with gensim's ``corpora.Dictionary``; every
    sentence is then encoded as a ``(max_len, vocab_size)`` array in which
    row ``i`` is the one-hot vector of the ``i``-th token. Sentences longer
    than ``max_len`` are truncated and shorter ones are padded with
    all-zero rows.

    Args:
        sentences: Iterable of tokenized sentences (each a sequence of
            string tokens). It is iterated more than once, so it must be
            re-iterable and support ``len()``.
        vocab_size: Width of every one-hot vector. Token ids must be
            smaller than this value. Defaults to 8192.
        max_len: Number of token rows per encoded sentence. Defaults to 20.
    """

    # Class-level defaults; __init__ overrides them on each instance.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a list."""
        # Use the instance's own sentences rather than a module-level
        # variable, so the method also works when the class is imported.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build and return the token -> integer id mapping.

        The vocabulary size is printed as a side effect.
        """
        # Collect statistics about all tokens.
        dictionary = corpora.Dictionary(self.sentences)
        # No tokens are filtered out before this point; compactify() only
        # reassigns ids so that the id sequence has no gaps.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector of ``word``.

        Returns:
            A 1-D float array of length ``vocab_size`` with a single 1 at
            the id of ``word``.

        Raises:
            KeyError: If ``word`` is not in the token-to-id mapping.
            IndexError: If the id of ``word`` is not below ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one tokenized sentence.

        Returns:
            A ``(max_len, vocab_size)`` float array. Only the first
            ``max_len`` tokens are encoded; unused rows stay all-zero.
        """
        # Preallocating the full array provides the zero padding for free.
        encoded = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            encoded[row] = self.word2onehot(word)
        return encoded

    def sentences2array(self):
        """Encode every sentence and return the arrays as a list.

        All encoded sentences are held in memory at once, each as a
        ``(max_len, vocab_size)`` array.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass

if __name__ == '__main__':
    # Demo: encode Shakespeare's Macbeth from the NLTK Gutenberg corpus and
    # print the shape of the first encoded sentence.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    converter = Book2Array(macbeth_sentences)
    converter.get_sentences()
    encoded_sentences = converter.sentences2array()
    first_sentence = np.array(encoded_sentences[0])
    print(first_sentence.shape)

原创文章,作者:fendouai,如若转载,请注明出处:http://panchuang.net/2017/08/02/chatgirl-project-gensim-nltk-onehot/

发表评论

电子邮件地址不会被公开。

联系我们

400-800-8888

在线咨询:点击这里给我发消息

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息