import re
import gensim

# Load the stop-word list (one word per line) into a set for fast lookup
stop_words = []
with open('stop_words.txt', encoding='utf-8') as f:
    line = f.readline()
    while line:
        stop_words.append(line.strip())
        line = f.readline()
stop_words = set(stop_words)
print('Stop words loaded: {n} words in total'.format(n=len(stop_words)))
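For reference, the same stop-word set can be built in a single pass with a set comprehension (assuming, as above, one stop word per line in stop_words.txt):

# Equivalent construction of the stop-word set
with open('stop_words.txt', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f if line.strip()}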
# Segment each document with HanLP (the segmenter and the raw documents in
# doc_list are assumed to have been set up earlier), keep only Chinese tokens,
# and drop stop words
texts = []
for doc in doc_list:
    text = str(HanLP.GetFenword(doc)).split(',')
    text_list = []
    for t in text:
        # Keep tokens containing at least one Chinese character
        if re.findall('[\u4e00-\u9fa5]', t):
            t = t.replace(' ', '').replace('[', '').replace(']', '')
            if t not in stop_words:
                text_list.append(t)
    if text_list:
        texts.append(text_list)

# Assign an ID to each unique word and record its occurrence count
dictionary = gensim.corpora.Dictionary(texts)
# Document term-frequency (bag-of-words) matrix
corpus = [dictionary.doc2bow(text) for text in texts]
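To make the last two steps concrete, here is a minimal sketch using two hypothetical tokenized documents in place of texts; it shows that Dictionary maps each unique token to an integer ID and doc2bow turns each document into a list of (token_id, count) pairs:

import gensim

# Two hypothetical tokenized documents standing in for `texts`
sample_texts = [['机器', '学习', '模型'],
                ['学习', '数据', '数据']]

sample_dict = gensim.corpora.Dictionary(sample_texts)
sample_corpus = [sample_dict.doc2bow(text) for text in sample_texts]

# Each token gets an integer ID, e.g. {'学习': 0, '机器': 1, '模型': 2, '数据': 3}
print(sample_dict.token2id)
# Each document becomes (token_id, count) pairs,
# e.g. [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 2)]]
print(sample_corpus)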