Unable to use a trained LDA model to assign topics

萨朗曼杰瑞卡

I created an LDA model with Gensim. To choose the number of topics I first iterated num_topics over the range 3 to 10 and, based on the pyLDAvis plots, picked n = 3 for the final LDA model.

import glob
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis.gensim
from tqdm import tqdm
from gensim import models, corpora
from gensim.models.coherencemodel import CoherenceModel

# Local text-normalization helper
sys.path.append('/Users/tcssig/Documents/NLP_code_base/Doc_Similarity')
import normalization

# Read and normalize every speech file into a list of token lists
datalist = []
for filename in glob.iglob('/Users/tcssig/Documents/Speech_text_files/*.*'):
    text = open(filename).readlines()
    text = normalization.normalize_corpus(text, only_text_chars=True, tokenize=True)
    datalist.append(text)

# normalize_corpus returns one token list per input line; keep the first per file
datalist = [doc[0] for doc in datalist]

# Build the dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]

Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []
num_topics_list = np.arange(3, 10)
# Train an LDA model for each candidate topic count, record coherence scores
# and save a pyLDAvis visualization per model
for num_topics in tqdm(num_topics_list):
    lda = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary,
              passes=20, chunksize=4000, random_state=43)
    cm = CoherenceModel(model=lda, corpus=doc_term_matrix, dictionary=dictionary, coherence='u_mass')
    coherenceList_umass.append(cm.get_coherence())
    cm_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=datalist, dictionary=dictionary, coherence='c_v')
    coherenceList_cv.append(cm_cv.get_coherence())
    vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis, 'pyLDAvis_%d.html' % num_topics)


# Plot the u_mass coherence score against the number of topics
plotData = pd.DataFrame({'Number of topics': num_topics_list, 'CoherenceScore': coherenceList_umass})
f, ax = plt.subplots(figsize=(10, 6))
sns.set_style("darkgrid")
sns.pointplot(x='Number of topics', y='CoherenceScore', data=plotData)
plt.axhline(y=-3.9)
plt.title('Topic coherence')
plt.savefig('Topic coherence plot.png')

#################################################################
#################################################################

# Train the final model with the chosen number of topics and persist everything
lda_final = Lda(doc_term_matrix, num_topics=3, id2word=dictionary,
                passes=20, chunksize=4000, random_state=43)

lda_final.save('lda_final')
dictionary.save('dictionary')
corpora.MmCorpus.serialize('doc_term_matrix.mm', doc_term_matrix)

# show_topics gives (topic_id, [(word, prob), ...]);
# top_topics gives ([(prob, word), ...], coherence_score) pairs
a = lda_final.show_topics(num_topics=3, formatted=False, num_words=10)
b = lda_final.top_topics(doc_term_matrix, dictionary=dictionary, topn=10)


topic2wordb = {}   # topic index -> word set, from top_topics
topic2csb = {}     # topic index -> coherence score, from top_topics
topic2worda = {}   # topic id -> word set, from show_topics
topic2csa = {}     # topic id -> coherence score, matched from top_topics
num_topics = lda_final.num_topics
cnt = 1

for ws in b:
    wset = set(w[1] for w in ws[0])
    topic2wordb[cnt] = wset
    topic2csb[cnt] = ws[1]
    cnt += 1

for ws in a:
    wset = set(w[0] for w in ws[1])
    topic2worda[ws[0] + 1] = wset

# Match each show_topics topic to a top_topics entry by word overlap
# so the coherence score can be carried over
for i in range(1, num_topics + 1):
    for j in range(1, num_topics + 1):
        if topic2worda[i].intersection(topic2wordb[j]) == topic2worda[i]:
            topic2csa[i] = topic2csb[j]

print('the final data block')
finalData = pd.DataFrame([], columns=['Topic', 'words'])
finalData['Topic'] = topic2worda.keys()
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic' + str(x))
finalData['words'] = topic2worda.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs', ascending=False, inplace=True)
finalData.to_csv('CoherenceScore.csv')
print(finalData)
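For later sessions, a minimal sketch of how the saved model, dictionary and corpus above could be loaded back (assuming the same file names as in the code above):

from gensim import models, corpora

# Reload the artifacts persisted by save()/serialize() above
lda_final = models.LdaMulticore.load('lda_final')
dictionary = corpora.Dictionary.load('dictionary')
doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')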

Now that I have the trained model, I would like to know how to use it to assign topics both to the documents it was trained on and to new, unseen documents.

I used the code below to do that, but I get the error shown further down:

unseen_document = 'How a Pentagon deal became an identity crisis for Google'

text = normalization.normalize_corpus(unseen_document, only_text_chars=True, tokenize=True)

bow_vector = dictionary.doc2bow(text)

corpora.MmCorpus.serialize('x.bow_vector', bow_vector)

corpus = [dictionary.doc2bow(text)]

x = lda_final[corpus]

Error message:

    Topic                                              words        cs
2  Topic3  {senator, people, power, home, year, believe, ... -0.175486
1  Topic2  {friend, place, love, play, general, house, ye... -0.318839
0  Topic1  {money, doe, fucking, play, love, people, worl... -1.360688

Traceback (most recent call last):
  File "LDA_test.py", line 141, in <module>
    corpus = [dictionary.doc2bow(text)]
  File "/Users/tcssig/anaconda/lib/python3.5/site-packages/gensim/corpora/dictionary.py", line 250, in doc2bow
    counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: coercing to str: need a bytes-like object, list found
Sam H.

On this line

corpus = [dictionary.doc2bow(text)]

you are creating a list of BOW vectors. You need to look up the individual vectors, not the list itself, e.g.

for v in corpus:
    print(lda_final[v])

will print the topic probability distribution for each document.
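Along the same lines, a minimal sketch of how topics could be assigned both to the training documents and to a new, unseen document. The plain lower()/split() tokenization here is only a stand-in for the normalize_corpus helper from the question; the key point is that doc2bow expects a flat list of token strings.

# Topic distribution for each training document
for i, bow in enumerate(doc_term_matrix):
    print(i, lda_final.get_document_topics(bow))

# Topic distribution for a new, unseen document:
# doc2bow needs a flat list of token strings, so tokenize the raw text first
# (lower()/split() is just a stand-in for the normalize_corpus helper).
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
tokens = unseen_document.lower().split()
bow_vector = dictionary.doc2bow(tokens)
print(lda_final.get_document_topics(bow_vector))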

See the gensim documentation.
