def initdata(filedir=None):
    """Read every file under *filedir*, hash it, and build a stop-word-filtered token list.

    For each file in the directory:
      1. compute the MD5 hex digest of the raw bytes (computed but not yet
         consumed in this visible span — presumably a dedup/identity key),
      2. read the file as text and segment it with ``jieba.cut``,
      3. keep only the tokens that are not in the stop-word list.

    Parameters:
        filedir: directory to scan. Defaults to ``<cwd>/data`` resolved at
            call time. (The original signature used ``os.getcwd() + '/data'``
            as the default, which froze the working directory at import time —
            a classic default-argument-evaluation pitfall.)
    """
    if filedir is None:
        # Resolve the default lazily so it tracks the CURRENT working
        # directory instead of the one active when the module was imported.
        filedir = os.path.join(os.getcwd(), 'data')
    stopwordlist = getStopword()
    for path in os.listdir(filedir):
        filepath = os.path.join(filedir, path)
        # MD5 of the raw file bytes; open in binary mode so the digest is
        # byte-exact regardless of text encoding.
        with open(filepath, 'rb') as f:
            m = hashlib.md5()
            m.update(f.read())
            filemd5 = m.hexdigest()
        with open(filepath, 'r') as f:
            text = f.read()
        # Segment the text, then drop stop words. Use a distinct loop name
        # (`word`) — the original reused `f`, shadowing the file handle.
        fence = jieba.cut(text)
        fencelist = [word for word in fence if word not in stopwordlist]
import json import math from article.models import ArticleModel
def tfidf(keylist): reskeylist = [] for key in keylist: articles = ArticleModel.objects.filter(file_fence__icontains=key) # print(articles) reskeylist.append(articles)
keyDict = dict() for articles in reskeylist: for article in articles: id = int(article.id) keyDict[id] = 0 #获取文章总数 allFileNum = ArticleModel.objects.count()