Ex_treme's blog.

复赛笔记(一):EXCEL和集合SET的处理

2018/07/20 Share

主要内容:使用xlrd和xlwt对EXCEL进行读取和写入,并利用集合(SET)进行去重过滤;先将EMDT的原始输出记录写入EXCEL,再利用QGDT对这些数据做进一步处理后重新写入EXCEL。

EXCEL的基本操作

EXCEL的操作使用了两个库:读取使用xlrd,写入使用xlwt。

EXCEL读取操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
import xlrd
import xlwt
from datetime import date,datetime

def read_excel(path=r'F:\demo.xlsx'):
    """Print sheet metadata and a few sample values from an Excel workbook.

    Args:
        path: Workbook to open. Defaults to the demo location used by the
            original snippet.

    The demo reads the workbook's second sheet, its fourth row, its third
    column and the cell at row 1 / column 0, so the workbook is assumed to
    be at least that large.
    """
    # NOTE(review): xlrd 2.0+ only reads legacy .xls files -- confirm the
    # installed xlrd version can open an .xlsx workbook.
    workbook = xlrd.open_workbook(path)

    # List every sheet name, e.g. ['sheet1', 'sheet2'].
    sheet_names = workbook.sheet_names()
    print(sheet_names)
    sheet2_name = sheet_names[1]

    # A sheet can be fetched by zero-based index or by name; both lookups
    # below return the second sheet.
    sheet2 = workbook.sheet_by_index(1)
    sheet2 = workbook.sheet_by_name(sheet2_name)

    # Sheet name, number of rows, number of columns.
    print(sheet2.name, sheet2.nrows, sheet2.ncols)

    # Whole-row and whole-column values are returned as lists.
    rows = sheet2.row_values(3)  # fourth row
    cols = sheet2.col_values(2)  # third column
    print(rows)
    print(cols)

    # Three equivalent ways to read one cell. The values are printed as-is:
    # calling .encode() would fail on numeric cells and print a bytes repr.
    print(sheet2.cell(1, 0).value)
    print(sheet2.cell_value(1, 0))
    print(sheet2.row(1)[0].value)

    # Type code of the cell's content.
    print(sheet2.cell(1, 0).ctype)


if __name__ == '__main__':
    read_excel()

EXCEL写入操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def set_style(name, height, bold=False):
    """Build an xlwt cell style carrying the requested font settings.

    Args:
        name: Font face name, e.g. 'Times New Roman'.
        height: Font height in xlwt units (1/20 of a point, so 220 is 11pt).
        bold: Whether the font is bold.

    Returns:
        An ``xlwt.XFStyle`` whose font is configured as requested.
    """
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # xlwt spells this attribute ``colour_index``; assigning ``color_index``
    # only creates an unused attribute and leaves the colour unchanged.
    # NOTE(review): index 4 is presumably blue in the default palette.
    font.colour_index = 4
    font.height = height

    style = xlwt.XFStyle()
    style.font = font
    # To add borders, create an xlwt.Borders(), set its left/right/top/bottom
    # values and assign it to style.borders.
    return style


# Write a demo Excel workbook.
def write_excel(path='demo1.xlsx'):
    """Create a one-sheet demo workbook with a header row and merged cells.

    Layout: row 0 holds the column titles; each business category occupies
    one block of ``len(status)`` rows whose first and last columns are
    merged vertically; the row after the last block holds the grand-total
    label merged across the first two columns.

    Args:
        path: Destination file name. Defaults to the original demo name.
    """
    # NOTE(review): xlwt writes the legacy .xls format regardless of the
    # file extension -- confirm the '.xlsx' suffix is intended.
    book = xlwt.Workbook()

    sheet1 = book.add_sheet(u'sheet1', cell_overwrite_ok=True)
    row0 = [u'业务', u'状态', u'北京', u'上海', u'广州', u'深圳', u'状态小计', u'合计']
    column0 = [u'机票', u'船票', u'火车票', u'汽车票', u'其它']
    status = [u'预订', u'出票', u'退票', u'业务小计']

    # Build each style once instead of once per cell.
    header_style = set_style('Times New Roman', 220, True)
    category_style = set_style('Arial', 220, True)

    # Derive the layout from the lists so it stays correct if they change.
    block_height = len(status)
    last_col = len(row0) - 1

    # Header row.
    for col, title in enumerate(row0):
        sheet1.write(0, col, title, header_style)

    for block, category in enumerate(column0):
        top = 1 + block * block_height
        bottom = top + block_height - 1
        # First column: category name merged over the whole block.
        sheet1.write_merge(top, bottom, 0, 0, category, category_style)
        # Last column: empty merged cell reserved for the block total.
        sheet1.write_merge(top, bottom, last_col, last_col)
        # Second column: one status label per row of the block.
        for offset, label in enumerate(status):
            sheet1.write(top + offset, 1, label)

    # Grand-total label directly below the last block.
    total_row = 1 + len(column0) * block_height
    sheet1.write_merge(total_row, total_row, 0, 1, u'合计', header_style)

    book.save(path)


if __name__ == '__main__':
    write_excel()

集合基本操作

1
2
3
# Creating a set
s = set()
s = {11,22,33,44} # An empty set must be created with set(); s = {} creates an empty dict
1
2
3
4
5
6
7
8
9
10
# Difference between two sets
se = {11, 22, 33}
be = {22, 55}
temp1 = se.difference(be) # elements in se but not in be, returned as a new set
print(temp1) # {33, 11}
print(se) # {33, 11, 22} -- se itself is unchanged

temp2 = se.difference_update(be) # same difference, but se is modified in place
print(temp2) # None
print(se) # {33, 11}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Removing elements
se = {11, 22, 33}
se.discard(11)
se.discard(44)  # discard() silently ignores an element that is not present
print(se)

se = {11, 22, 33}
se.remove(11)
try:
    se.remove(44)  # remove() raises KeyError when the element is not present
except KeyError as err:
    # Catch the error so the rest of the demo still runs.
    print('KeyError:', err)
print(se)

se = {11, 22, 33}
temp = se.pop()  # removes and returns an arbitrary element (sets are unordered)
print(temp)  # one of 11, 22, 33
print(se)  # the two remaining elements
1
2
3
4
5
6
7
8
9
10
11
# Intersection
se = {11, 22, 33}
be = {22, 55}

temp1 = se.intersection(be) # elements common to both sets, returned as a new set
print(temp1) # {22}
print(se) # {11, 22, 33} -- se itself is unchanged

temp2 = se.intersection_update(be) # same intersection, but se is modified in place
print(temp2) # None
print(se) # {22}
1
2
3
4
5
6
7
# Relationship tests
se = {11, 22, 33}
be = {22}

print(se.isdisjoint(be)) # False: isdisjoint() is True only when the sets share no element
print(se.issubset(be)) # False: se is not a subset of be
print(se.issuperset(be)) # True: se contains every element of be
1
2
3
4
5
6
7
8
9
10
11
12

# Symmetric difference
se = {11, 22, 33}
be = {22}

temp1 = se.symmetric_difference(be) # elements in exactly one of the two sets, returned as a new set
print(temp1) #{33, 11}
print(se) #{33, 11, 22} -- se itself is unchanged

temp2 = se.symmetric_difference_update(be) # same result, but se is modified in place
print(temp2) #None
print(se) #{33, 11}
1
2
3
4
5
6
7
# Union
se = {11, 22, 33}
be = {22,44,55}

temp=se.union(be) # all elements of both sets, returned as a new set
print(se) #{33, 11, 22} -- se itself is unchanged
print(temp) #{33, 22, 55, 11, 44}
1
2
3
4
5
6
7
8
# In-place update
se = {11, 22, 33}
be = {22,44,55}

se.update(be) # add every element of be to se in place
print(se)
se.update([66, 77]) # any iterable can be passed, not only a set
print(se)
1
2
3
4
5
6
7
8
9
10
11
12
13
14

# Converting a set to other types
se = set(range(4))
li = list(se)
tu = tuple(se)
st = str(se)
print(li,type(li))
print(tu,type(tu))
print(st,type(st))

OUTPUT:
[0, 1, 2, 3] <class 'list'>
(0, 1, 2, 3) <class 'tuple'>
{0, 1, 2, 3} <class 'str'>

EMDT数据写入EXCEL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
# @Time : 18-7-20 上午9:01
# @Author : Ex_treme
# @Email : pzsyjsgldd@163.com
# @File : emdt_debug.py
# @Software: PyCharm

import os
from EMDT import EMDT
from xlwt import *

if __name__ == "__main__":
    # Maximum number of characters written to a single cell; longer text is
    # truncated to this length before writing.
    max_cell_chars = 32767
    # Directory holding the downloaded HTML pages to analyse.
    source_dir = 'support.huaweicloud.com/'

    # Create the output workbook with UTF-8 string encoding.
    book = Workbook(encoding='utf-8')
    # The sheet is named after the output file.
    table = book.add_sheet('EMDT抽取及挖掘结果.xlsx')

    # All rows to write; the first row is the header.
    # NOTE(review): the header names four columns; each data row is an EMDT
    # record with the source file name appended -- confirm the widths match.
    ldata = [['标准问题(必填)', '主题(可选)', '答案(必填)', '答案链接(必填)']]

    count = 0
    for page_name in os.listdir(source_dir):
        page_path = os.path.join(source_dir, page_name)
        with open(page_path, encoding='utf-8', mode='r') as f:
            e = EMDT(f.read(),
                     LOG_ENABLE=False,
                     LOG_LEVEL='INFO',
                     BLOCKSIZE=10,
                     CAPACITY=5,
                     TIMEOUT=5,
                     SAVEIMAGE=False,
                     CONTENT_RULE=['.help-details.webhelp', '.help-center-title'],
                     TOPIC_RULE=['.crumbs', '.parentlink'],
                     QA_JACCARD_THRESHOLD=0.25,
                     REMOVE_HTML=False,
                     )
        e.analyse()
        e.format()
        for record in e.summery:
            # Tag every extracted record with the file it came from.
            record.append(page_name)
            count += 1
            print('第{}个:{}'.format(count, record))
            ldata.append(record)

    # Write every row; the enumerate() indices are the sheet coordinates.
    for row_index, row in enumerate(ldata):
        for col_index, value in enumerate(row):
            if len(value) >= max_cell_chars:
                value = value[0:max_cell_chars]
            table.write(row_index, col_index, value)
    book.save('EMDT抽取及挖掘结果.xlsx')

QGDT读取并写入数据到EXCEL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
# @Time : 18-7-20 上午11:15
# @Author : Ex_treme
# @Email : pzsyjsgldd@163.com
# @File : qgdt_debug.py
# @Software: PyCharm
import hashlib

from QGDT import QGDT
import xlrd
from xlwt import Workbook


def get_md5(question):
    """Return the hexadecimal MD5 digest of ``question`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(question.encode("utf-8"))
    return digest.hexdigest()

def summery(summery_list):
    """Generate one question per record and build the output rows.

    Args:
        summery_list: Records that are indexed as
            ``[source text, topic, answer, file name]``.

    Returns:
        One list per record:
        ``[md5 of question, question, topic, answer, file name, expansions]``,
        where ``expansions`` is the ``str()`` of the list of all generated
        questions in this batch that differ from the record's own question.
    """
    # First pass: generate a question for every record.
    generated = []
    for record in summery_list:
        generator = QGDT(record[0],
                         LOG_ENABLE=True,
                         LOG_LEVEL='WARNING',
                         MAX_SAMPLE=10,
                         RANDOM=True,
                         LAMBDA=0.2,
                         ALPHA=0.3,
                         BETA=0.5)
        generator.ranking_algorithm()
        generated.append(generator.question_generation())

    # Second pass: pair every record with its question and its expansions.
    rows = []
    for record, question in zip(summery_list, generated):
        topic, answer, file_name = record[1], record[2], record[3]
        expansions = [other for other in generated if not (other == question)]
        rows.append([get_md5(question), question, topic, answer,
                     file_name, str(expansions)])
    return rows


def group_rows_by_file(rows):
    """Split rows into runs of consecutive rows that share the same last cell.

    Args:
        rows: Sequence of non-empty row lists; the last cell of each row is
            the grouping key (the source file name in this script).

    Returns:
        A list of groups, each a list of rows, in the original order. Every
        input row appears in exactly one group, including the final run.
    """
    groups = []
    for row in rows:
        # Compare against the key of the last row in the current group.
        if groups and groups[-1][-1][-1] == row[-1]:
            groups[-1].append(row)
        else:
            groups.append([row])
    return groups


if __name__ == "__main__":
    # Open the workbook produced by the EMDT step.
    workbook = xlrd.open_workbook('EMDT抽取及挖掘结果(11818条).xlsx')
    # Print the name of the first sheet.
    sheet_name = workbook.sheet_names()[0]
    print(sheet_name)
    # Fetch the sheet by name.
    sheet = workbook.sheet_by_name('EMDT抽取及挖掘结果.xlsx')
    # Print the sheet name, row count and column count.
    print(sheet.name, sheet.nrows, sheet.ncols)

    # Read every row after the first and group them by source file.
    # The previous hand-written loop never stored the final group, so the
    # last file's rows were dropped.
    data_rows = [sheet.row_values(row_index)
                 for row_index in range(1, sheet.nrows)]
    ldata = group_rows_by_file(data_rows)

    # Create the output workbook with UTF-8 string encoding.
    out_book = Workbook(encoding='utf-8')
    table = out_book.add_sheet('QGDT问句生成最终结果.xlsx')

    # Rows are written starting at index 1; row 0 is left empty.
    row_count = 0
    seen_md5 = set()
    for group in ldata:
        for record in summery(group):
            # Skip questions whose MD5 has already been written.
            if record[0] in seen_md5:
                continue
            seen_md5.add(record[0])
            row_count += 1
            print('生成第{}条问句!'.format(row_count))
            for col_index, value in enumerate(record):
                table.write(row_count, col_index, value)
    out_book.save('QGDT问句生成最终结果.xlsx')
CATALOG
  1. 1. EXCEL的基本操作
    1. 1.1. EXCEL读取操作
    2. 1.2. EXCEL写入操作
    3. 1.3. 集合基本操作
  2. 2. EMDT数据写入EXCEL
  3. 3. QGDT读取并写入数据到EXCEL