Ex_treme's blog.

云端构建知识库

2018/06/28 Share

理论脱离不开实践,脱离了实践的理论叫做扯淡,本地和云的截然不同就是这个概念。
以后必须给自己打个预防针了,异常处理的重要之处不言而喻。

知识库构建————本地构建

  • 下载华为测试集
  • 下载github知识库构建包
  • 云端运行

下载华为测试集

1
2
3
4
5
6
7
8
9
$ ssh root@39.105.124.151 -i IQAS.pem
$ git clone https://github.com/pzs741/IQAS.git
$ wget http://www.cnsoftbei.com/upload_files/other/znwdxtsjykf_cssj.rar
$ apt install unrar-free
$ cd huawei
$ unrar -x znwdxtsjykf_cssj.rar
$ rm -rf znwdxtsjykf_cssj.rar 问答对\(QA\)样例.xlsx
$ cd IQAS
$ python local_bulid.py

配置mysql

1
2
3
4
SET innodb_lock_wait_timeout=50;
SET autocommit=on;
flush privileges;
service mysql restart

本地构建

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import hashlib
import os
from time import sleep

import MySQLdb
import elasticsearch
from EMDT import EMDT
from QGDT import QGDT

from utils.mysql_to_es import es, QAType, getConnect, MySQLQueryPagination


def get_md5(question):
    """Return the hexadecimal MD5 digest of *question*.

    Args:
        question: Text to hash. A ``str`` is encoded as UTF-8 first;
            ``bytes``/``bytearray`` are hashed as-is.

    Returns:
        The 32-character lowercase hex digest.
    """
    if isinstance(question, (bytes, bytearray)):
        payload = bytes(question)
    else:
        payload = question.encode("utf-8")
    return hashlib.md5(payload).hexdigest()


def summery(summery_list):
    """Generate one question per record and attach the other questions to it.

    Args:
        summery_list: Sequence of records indexable as
            ``(text, topic, answer)``; ``record[0]`` is passed to ``QGDT``.

    Returns:
        A list with one ``[question, topic, answer, expand_list]`` entry per
        input record, in input order. ``expand_list`` holds every generated
        question that is not equal to the record's own question.
    """
    # First pass: run the question generator on every record's text.
    questions = []
    for record in summery_list:
        generator = QGDT(record[0],
                         LOG_ENABLE=True,
                         LOG_LEVEL='WARNING',
                         MAX_SAMPLE=10,
                         RANDOM=True,
                         LAMBDA=0.2,
                         ALPHA=0.3,
                         BETA=0.5)
        generator.ranking_algorithm()
        questions.append(generator.question_generation())

    # Second pass: pair each record with its question and the sibling
    # questions. Distinct names are used here; the earlier version reused
    # ``i`` for both the record and the inner loop variable.
    result = []
    for question, record in zip(questions, summery_list):
        expand_list = [other for other in questions if other != question]
        result.append([question, record[1], record[2], expand_list])
    return result


class Mysql(object):
    """One-shot writer that inserts a single QA record into ``huawei_qamodel``.

    Each instance opens its own MySQL connection in ``__init__``; ``save``
    always closes it, so an instance can be saved only once.
    """

    def __init__(self, i, file_name=None):
        """Unpack the record and open the database connection.

        Args:
            i: Record indexable as ``[question, topic, answer, expand]``.
            file_name: Name of the source file stored with the row. When
                omitted, the module-level ``path`` variable (assigned by the
                ``__main__`` loop) is used, which is the previous behaviour.
        """
        # Resolve all plain fields before connecting, so a malformed record
        # cannot leave an open connection behind.
        self.question = i[0]
        self.md5 = get_md5(self.question)
        self.topic = i[1]
        self.answer = i[2]
        self.file_name = path if file_name is None else file_name
        self.expand = str(i[3])

        # NOTE(review): host and credentials are hard-coded; consider moving
        # them to configuration.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'QADB', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def save(self):
        """Insert the record, commit, and close the connection.

        Best-effort: a failed insert is printed and rolled back rather than
        raised. The connection is closed in every case.
        """
        insert_sql = """
            insert into huawei_qamodel(md5, question, topic, answer,file_name,expand)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # Execute the parameterised statement.
            self.cursor.execute(insert_sql,
                                (self.md5, self.question, self.topic, self.answer, self.file_name, self.expand))
            # Commit so the row is actually written.
            self.conn.commit()
        except Exception as exc:
            # Report instead of silently swallowing, then roll back.
            print(exc)
            self.conn.rollback()
        finally:
            # Close the connection even if commit or rollback itself failed.
            self.conn.close()

def mysql_to_es():
    """Rebuild the ``qa`` Elasticsearch index from the ``huawei_qamodel`` table.

    Deletes the ``qa`` index (passing ``ignore=[400, 404]``), calls
    ``QAType.init()``, then reads the table page by page through
    ``MySQLQueryPagination`` and bulk-indexes each page. Rows that fail to
    serialise are printed and skipped. The MySQL connection is closed even
    when indexing fails.
    """
    # Imported explicitly: ``import elasticsearch`` on its own is not
    # guaranteed to load the ``helpers`` submodule.
    from elasticsearch import helpers

    # Elasticsearch initialisation.
    es.indices.delete(index='qa', ignore=[400, 404])
    QAType.init()

    # MySQL initialisation.
    conn = getConnect()
    try:
        pag = MySQLQueryPagination(conn)
        sql = r'SELECT * FROM `huawei_qamodel` WHERE id<%s'
        # Copy MySQL rows to Elasticsearch one page at a time.
        for ret in pag.queryForList(sql):
            actions = []
            for row in ret:
                try:
                    actions.append(pag._parse_serialize_table_data(row))
                except Exception as e:
                    # Skip the bad row but keep the rest of the page.
                    print(e)
            helpers.bulk(es, actions)
    finally:
        conn.close()

if __name__ == "__main__":
    # Build the knowledge base: extract QA content from every HTML file in
    # the huawei media directory, store each record in MySQL, and sync MySQL
    # to Elasticsearch every 100 records plus once at the end.
    #
    # NOTE(review): the original indentation was lost; the nesting below is
    # reconstructed from statement order (counter and print per stored
    # record) -- confirm against the repository copy.

    # The directory is resolved from the parent of the current working
    # directory, so the script depends on where it is launched from.
    html_dir = os.path.dirname(os.getcwd()) + '/IQAS/media/huawei'
    count = 0

    # ``path`` must remain a module-level name: Mysql.__init__ reads it as
    # the file_name of every stored record.
    for path in os.listdir(html_dir):
        filepath = os.path.join(html_dir, path)
        # Read the page and release the file handle before extraction.
        with open(filepath, 'r', encoding='utf-8') as html:
            page = html.read()

        extractor = EMDT(page,
                         LOG_ENABLE=False,
                         LOG_LEVEL='WARNING',
                         FORMAT='%(asctime)s - %(levelname)s - %(message)s',
                         BLOCKSIZE=10,
                         CAPACITY=5,
                         TIMEOUT=5,
                         SAVEIMAGE=False,
                         CONTENT_RULE=['.help-details.webhelp', '.help-center-title'],
                         TOPIC_RULE=['.crumbs', '.parentlink'],
                         QA_JACCARD_THRESHOLD=0.25,
                         REMOVE_HTML=False,
                         )
        extractor.analyse()
        extractor.format()

        for record in summery(extractor.summery):
            m = Mysql(record)
            m.save()
            count += 1
            # Periodic sync so Elasticsearch follows progress on long runs.
            if count % 100 == 0:
                mysql_to_es()
            print('md5:{}\nquestion:{}\ntopic:{}\nanswer:{}\nfile_name:{}\nexpand:{}\n'.format(m.md5, m.question, m.topic, m.answer,m.file_name,m.expand))

    # Final sync for the records stored since the last multiple of 100.
    mysql_to_es()
CATALOG
  1. 1. 知识库构建————本地构建
    1. 1.1. 下载华为测试集
    2. 1.2. 配置mysql
    3. 1.3. 本地构建