1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
| import hashlib import os from time import sleep
import MySQLdb import elasticsearch from EMDT import EMDT from QGDT import QGDT
from utils.mysql_to_es import es, QAType, getConnect, MySQLQueryPagination
def get_md5(question):
    """Return the hexadecimal MD5 digest of ``question`` encoded as UTF-8."""
    encoded = question.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()
def summery(summery_list):
    """Generate one question per entry and attach the alternative questions.

    Every entry of ``summery_list`` is indexed as ``entry[0]`` (the value
    handed to ``QGDT``), ``entry[1]`` (topic) and ``entry[2]`` (answer).

    Returns:
        A list of ``[question, topic, answer, expand_list]`` lists, where
        ``expand_list`` holds every generated question that does not compare
        equal to the entry's own question.
    """
    # First pass: run the question generator over every entry.
    generated = []
    for entry in summery_list:
        generator = QGDT(entry[0],
                         LOG_ENABLE=True,
                         LOG_LEVEL='WARNING',
                         MAX_SAMPLE=10,
                         RANDOM=True,
                         LAMBDA=0.2,
                         ALPHA=0.3,
                         BETA=0.5)
        generator.ranking_algorithm()
        generated.append(generator.question_generation())

    # Second pass: pair each entry with its question and the other questions.
    records = []
    for question, entry in zip(generated, summery_list):
        topic = entry[1]
        answer = entry[2]
        others = [candidate for candidate in generated
                  if not (candidate == question)]
        records.append([question, topic, answer, others])
    return records
class Mysql(object):
    """A single question/answer record and the MySQL connection that stores it.

    Each instance opens its own connection in ``__init__``; ``save`` inserts
    the record into ``huawei_qamodel`` and then closes that connection, so an
    instance can be saved only once.
    """

    def __init__(self, i, file_name=None):
        """Open the database connection and unpack the record.

        Args:
            i: Sequence indexed as ``[question, topic, answer, expand]``.
            file_name: Name of the source file stored with the record. When
                omitted, the module-level ``path`` variable (set by the
                script's main loop) is used, as the original code did.
        """
        # NOTE(review): connection parameters are hard-coded here.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'QADB',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        self.question = i[0]
        self.md5 = get_md5(self.question)
        self.topic = i[1]
        self.answer = i[2]
        self.file_name = path if file_name is None else file_name
        # The expansion list is stored as its string representation.
        self.expand = str(i[3])

    def save(self):
        """Insert the record, committing on success and rolling back on error.

        Saving is best-effort: a failure is printed and rolled back instead of
        being raised. The connection is closed in every case.
        """
        insert_sql = """
            insert into huawei_qamodel(md5, question, topic, answer,file_name,expand)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        params = (self.md5, self.question, self.topic, self.answer,
                  self.file_name, self.expand)
        try:
            # Execute the SQL statement and commit it to the database.
            self.cursor.execute(insert_sql, params)
            self.conn.commit()
        except Exception as err:
            # Report the failure instead of hiding it, then roll back.
            print('failed to save record {}: {}'.format(self.md5, err))
            self.conn.rollback()
        finally:
            # Close the database connection even if the rollback fails.
            self.conn.close()
def mysql_to_es():
    """Rebuild the ``qa`` Elasticsearch index from the ``huawei_qamodel`` table.

    The existing index is deleted (400/404 responses are ignored),
    ``QAType.init()`` is called, and the rows returned by the paginated MySQL
    query are bulk-indexed one batch at a time. Rows that fail to serialize
    are printed and skipped. The MySQL connection is closed even when
    indexing fails.
    """
    # Import the submodule explicitly: a plain ``import elasticsearch`` does
    # not guarantee that ``elasticsearch.helpers`` has been loaded.
    from elasticsearch import helpers

    # Elasticsearch initialisation.
    es.indices.delete(index='qa', ignore=[400, 404])
    QAType.init()

    # MySQL initialisation.
    conn = getConnect()
    try:
        pag = MySQLQueryPagination(conn)
        # NOTE(review): the ``%s`` placeholder is presumably filled in by
        # MySQLQueryPagination -- confirm against utils.mysql_to_es.
        sql = r'SELECT * FROM `huawei_qamodel` WHERE id<%s'

        # Copy MySQL rows to Elasticsearch, one batch per iteration.
        for ret in pag.queryForList(sql):
            actions = []
            for row in ret:
                try:
                    actions.append(pag._parse_serialize_table_data(row))
                except Exception as e:
                    # Best effort: report the bad row and keep going.
                    print(e)
            helpers.bulk(es, actions)
    finally:
        conn.close()
if __name__ == "__main__":
    # Directory holding the saved Huawei help pages, located relative to the
    # parent of the current working directory.
    html_dir = os.path.join(os.path.dirname(os.getcwd()),
                            'IQAS', 'media', 'huawei')
    count = 0

    # NOTE: the loop variable must remain a module-level name called ``path``;
    # Mysql.__init__ reads it as a global to record the source file name.
    for path in os.listdir(html_dir):
        filepath = os.path.join(html_dir, path)
        with open(filepath, 'r', encoding='utf-8') as html:
            page_source = html.read()

        # Analyse the page with EMDT, then turn its summaries into records.
        extractor = EMDT(page_source,
                         LOG_ENABLE=False,
                         LOG_LEVEL='WARNING',
                         FORMAT='%(asctime)s - %(levelname)s - %(message)s',
                         BLOCKSIZE=10,
                         CAPACITY=5,
                         TIMEOUT=5,
                         SAVEIMAGE=False,
                         CONTENT_RULE=['.help-details.webhelp', '.help-center-title'],
                         TOPIC_RULE=['.crumbs', '.parentlink'],
                         QA_JACCARD_THRESHOLD=0.25,
                         REMOVE_HTML=False,
                         )
        extractor.analyse()
        extractor.format()
        summery_list = summery(extractor.summery)

        for record in summery_list:
            m = Mysql(record)
            m.save()
            count += 1
            # Re-sync Elasticsearch after every 100 stored records.
            if count % 100 == 0:
                mysql_to_es()
            print('md5:{}\nquestion:{}\ntopic:{}\nanswer:{}\nfile_name:{}\nexpand:{}\n'.format(
                m.md5, m.question, m.topic, m.answer, m.file_name, m.expand))

    # Final sync so records stored after the last multiple of 100 are indexed.
    mysql_to_es()
|