Ex_treme's blog.

文章摘要器————HanLP的简单使用

2018/03/27 Share

HanLP介绍

HanLP是由一系列模型与算法组成的Java工具包,目标是普及自然语言处理在生产环境中的应用。HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
HanLP实现了中文分词、词性标注、关键词提取、自动摘要、短语转换、拼音转换、文本推荐、依存句法分析等功能。

HanLP简单使用

  • 下载HanLP Portable版

http://t.cn/RfMdLJf

  • 下载Jpype

https://pypi.python.org/pypi/JPype1-py3#downloads

1
2
3
4
$ sudo apt-get install g++ python3-dev
$ wget https://pypi.python.org/packages/59/90/149647ac2c8649a5983fcc47c78f2881af80cbd99f54248ac31b3d611618/JPype1-py3-0.5.5.2.tar.gz#md5=06481b851244abb37d45f3a03f0f0455
$ tar -zvxf JPype1-py3-0.5.5.2.tar.gz
$ cd JPype1-py3-0.5.5.2
$ python3 setup.py install

注意:安装JPype要提前配置好java环境,下面推荐一种源安装方式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
$ sudo apt-get install default-jre
$ sudo apt-get install default-jdk
$ sudo add-apt-repository ppa:webupd8team/java
$ sudo apt-get update
$ sudo apt-get install oracle-java8-installer
$ sudo apt-get install oracle-java8-set-default
$ sudo apt-get install software-properties-common python-software-properties
$ su -
$ gedit /root/.bashrc
export JAVA_HOME=/usr/lib/jvm/java-8-oracle
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin
export JAVA_HOME PATH CLASSPATH
$ source /root/.bashrc

HanLP开发文章摘要器

实现思路

  • WxPython实现页面布局,为两个按钮添加相应的点击事件
  • 通过文件后缀来判断需要处理的文件类型
  • 如果是PDF或者docx类型文件就进行处理
  • 将处理后的内容通过HanLP分词
  • 将分词结果显示在界面中

配置开发环境

  • 开发环境配置
1
2
3
4
创建虚拟环境 virtualenv或者conda都行
$ pip install jpype1 numpy wxPython docx2txt pdfminer.six
$ wget https://github.com/hankcs/HanLP/releases/download/v1.3.2/hanlp-1.3.2-release.zip
$ get data from https://pan.baidu.com/s/1pKUVNYF
  • HanLP配置
1
2
3
4
$ cd hanlp-1.3.2-release
$ gedit hanlp.properties
$ root=/home/pzs741/PycharmProjects/acc/
$ ShowTermNature=false
  • HanLP调用
1
2
3
4
5
6
7
# The star import brings startJVM, get_default_jvm_path and JClass into scope.
from jpype import *
# Start the JVM with the HanLP jar on the Java class path.
# NOTE(review): the path is machine-specific and must be adapted; the trailing
# ':' looks intentional (extra class-path entry) -- confirm before removing.
startJVM(get_default_jvm_path(), '-Djava.class.path=/home/pzs741/PycharmProjects/acc/hanlp-1.3.2.jar:')
# Obtain a Python handle on the Java class com.hankcs.hanlp.HanLP.
HanLP = JClass('com.hankcs.hanlp.HanLP')
# The lines below only reference the methods; nothing is called here.
# They list the HanLP entry points this article relies on (presumably
# segmentation, keyword, summary and phrase extraction, going by their names).
HanLP.segment
HanLP.extractKeyword
HanLP.extractSummary
HanLP.extractPhrase

实现PDF和Docx字符转换

Docx转换

1
2
3
4
5
6
7
8
9
10
11
12
import docx2txt

filepath = '/home/pzs741/PycharmProjects/acc/Untitled 1.docx'
text = docx2txt.process(filepath)
print(text)

def WtoT(filepath):
    """Return the text of a Word document, or None for any other file.

    Args:
        filepath: Path of the file to convert.

    Returns:
        The string produced by ``docx2txt.process`` when the file extension
        is ``.docx`` or ``.doc`` (compared case-insensitively), otherwise
        ``None``.
    """
    # Local import so this function stays self-contained in the snippet.
    import os

    # splitext only looks at the final path component, so a dot-less name
    # such as "docx" or a dotted directory name is not mistaken for a match.
    extension = os.path.splitext(filepath)[1].lower()
    if extension not in ('.docx', '.doc'):
        return None
    # NOTE(review): '.doc' is accepted as in the original code; confirm that
    # docx2txt can actually read legacy .doc files.
    return docx2txt.process(filepath)

print(WtoT(filepath))

PDF转换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams


def PDFToT(fname, pages=None):
    """Extract the text of a PDF file with pdfminer.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When it is falsy an empty set is passed
            instead (presumably meaning "no page filter" -- see the pdfminer
            documentation).

    Returns:
        The text collected by the ``TextConverter`` as a single string.
    """
    pagenums = set(pages) if pages else set()

    # Resource manager shared by the converter and the interpreter.
    manager = PDFResourceManager()

    # The context managers guarantee that both the output buffer and the PDF
    # file are closed even when page processing raises.
    with StringIO() as output, open(fname, 'rb') as pdf_file:
        # Text converter that writes the extracted text into ``output``.
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            # Interpreter that feeds each page into the converter.
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(pdf_file, pagenums):
                interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()


def GetPDF(filepath):
    """Return the text of a PDF file, or None for any other file.

    Args:
        filepath: Path of the file to convert.

    Returns:
        The result of ``PDFToT(filepath)`` when the file extension is
        ``.pdf`` (compared case-insensitively), otherwise ``None``.
    """
    # Local import so this function stays self-contained in the snippet.
    import os

    # splitext only inspects the final path component, so a dot-less name
    # such as "pdf" is not mistaken for a PDF file.
    if os.path.splitext(filepath)[1].lower() != '.pdf':
        return None
    return PDFToT(filepath)


if __name__ == '__main__':
print(GetPDF('text.pdf'))

文章摘要器页面布局

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import wx

def OnOpen(event=None):
    """Placeholder handler for the "choose file" button.

    Accepts the wx event passed by the button binding; does nothing yet.
    """
    pass


def OnSave(event=None):
    """Placeholder handler for the "run" button.

    Accepts the wx event passed by the button binding; does nothing yet.
    """
    pass


class Acc(wx.Frame):
    """Main window: a path field, two buttons and a multi-line result box."""

    def __init__(self):
        wx.Frame.__init__(self, None, 0, 'Acc', size=(700, 400))
        # Panel that hosts every control.
        panel = wx.Panel(self, 1)
        # Shows the path of the selected file.
        title_box = wx.TextCtrl(panel, 3, "显示文件路径")
        # Shows the extraction result.
        content_box = wx.TextCtrl(panel, 2, "显示文章抽取结果",
                                  style=wx.TE_MULTILINE)

        # Buttons. Bind must receive the handler itself: writing OnOpen()
        # would call it immediately and register its return value (None).
        loadButton = wx.Button(panel, 4, '选择文件')
        saveButton = wx.Button(panel, 5, '运算')
        loadButton.Bind(wx.EVT_BUTTON, OnOpen)
        saveButton.Bind(wx.EVT_BUTTON, OnSave)

        # Top row: path field (stretching) followed by the two buttons.
        hbox = wx.BoxSizer()
        hbox.Add(title_box, proportion=1, flag=wx.EXPAND)
        hbox.Add(loadButton, proportion=0, flag=wx.LEFT, border=5)
        hbox.Add(saveButton, proportion=0, flag=wx.LEFT, border=5)

        # Column: top row above the result box, which takes the spare space.
        vbox = wx.BoxSizer(wx.VERTICAL)
        vbox.Add(hbox, proportion=0, flag=wx.EXPAND | wx.LEFT, border=10)
        vbox.Add(content_box, proportion=1,
                 flag=wx.EXPAND | wx.LEFT | wx.BOTTOM | wx.RIGHT, border=10)

        panel.SetSizer(vbox)


if __name__ == '__main__':
    # Application object. wx.App replaces wx.PySimpleApp, which is
    # deprecated in wxPython Phoenix.
    app = wx.App()
    # Create and show the top-level window.
    frame = Acc()
    frame.Show()
    # Enter the GUI event loop.
    app.MainLoop()

文章摘要器的实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import DoxcToStr as doc
import PDFToStr as pdf
import os
import atexit
import wx
import useHanLP as HanLP



class Acc(wx.Frame):
    """Main window of the summarizer.

    The user picks a file, then the "run" button converts it through the
    ``doc`` / ``pdf`` helper modules and shows the result in the text box.
    """

    def __init__(self):
        wx.Frame.__init__(self, None, 0, 'Acc', size=(700, 400))
        # Panel that hosts every control.
        panel = wx.Panel(self, 1)
        # Shows the path of the selected file.
        self.filename = wx.TextCtrl(panel, 3, "显示文件路径")
        # Shows the extraction result.
        self.contents = wx.TextCtrl(panel, 2, "显示文章抽取结果",
                                    style=wx.TE_MULTILINE)

        # Buttons and their click handlers.
        loadButton = wx.Button(panel, 4, '选择文件')
        saveButton = wx.Button(panel, 5, '运算')
        loadButton.Bind(wx.EVT_BUTTON, self.OnOpen)
        saveButton.Bind(wx.EVT_BUTTON, self.OnSave)

        # Top row: path field (stretching) followed by the two buttons.
        hbox = wx.BoxSizer()
        hbox.Add(self.filename, proportion=1, flag=wx.EXPAND)
        hbox.Add(loadButton, proportion=0, flag=wx.LEFT, border=5)
        hbox.Add(saveButton, proportion=0, flag=wx.LEFT, border=5)

        # Column: top row above the result box, which takes the spare space.
        vbox = wx.BoxSizer(wx.VERTICAL)
        vbox.Add(hbox, proportion=0, flag=wx.EXPAND | wx.LEFT, border=10)
        vbox.Add(self.contents, proportion=1,
                 flag=wx.EXPAND | wx.LEFT | wx.BOTTOM | wx.RIGHT, border=10)

        panel.SetSizer(vbox)

    def OnOpen(self, click):
        """Let the user choose a file and copy its path into the path field.

        Args:
            click: The event delivered by the button binding; nothing is
                done when it is falsy.
        """
        if not click:
            return
        dialog = wx.FileDialog(parent=None, message='选择一个文件',
                               defaultDir=os.getcwd(), style=wx.FD_OPEN)
        try:
            if dialog.ShowModal() == wx.ID_OK:
                self.filename.SetValue(dialog.GetPath())
        finally:
            # Release the dialog even if reading the selection fails.
            dialog.Destroy()

    def OnSave(self, click):
        """Process the selected file and display the result.

        Shows an error message in the result box, and stops, when no file
        was chosen or the extension is not docx/doc/PDF/pdf. Otherwise the
        four items returned by the converter are formatted into the report.

        Args:
            click: The event delivered by the button binding; nothing is
                done when it is falsy.
        """
        if not click:
            return

        filepath = self.filename.GetValue()
        if filepath == '' or filepath == '显示文件路径':
            self.contents.SetValue('必须选择一个文件!!!')
            # Stop here: there is no result to format.
            return

        fileExtension = filepath.split('.')[-1]
        if fileExtension in ('docx', 'doc'):
            res = doc.WtoT(filepath)
        elif fileExtension in ('PDF', 'pdf'):
            res = pdf.GetPDF(filepath)
        else:
            self.contents.SetValue('必须是DOC或者PDF文件!!!')
            # Stop here: `res` was never assigned for this file type.
            return

        # NOTE(review): res is indexed 0..3 as keywords, summary, phrases and
        # original text -- confirm against the doc/pdf helper modules.
        content = """
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
关键词:{}
摘要:{}
短语:{}
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
原文:{}
""".format(res[0], res[1], res[2],res[3])

        self.contents.SetValue(content)


if __name__ == '__main__':
    # Shut the Java virtual machine down at interpreter exit. The function
    # object itself is registered: writing HanLP.StopJVM() would call it
    # right away and hand its return value to atexit.register.
    atexit.register(HanLP.StopJVM)
    # Application object. wx.App replaces wx.PySimpleApp, which is
    # deprecated in wxPython Phoenix.
    app = wx.App()
    # Create and show the top-level window.
    frame = Acc()
    frame.Show()
    # Enter the GUI event loop.
    app.MainLoop()

总结

整个ACC主要调用了三个库,一个是DOC转换器,一个是PDF转换器,最重要的是HanLP的语言转换器,然后通过wxPython做前端,整个项目就完成了。

  • 配置并封装了HanLP
  • 获得PDF和Docx文件的内容
  • wxPython编写了界面布局
  • 为按钮添加点击事件
  • 调用HanLP,提取PDF或Docx文件中的关键词、摘要和短语
  • 通过atexit优化代码

最后附上ACC的效果图

image

CATALOG
  1. 1. HanLP介绍
    1. 1.1. HanLP简单使用
  2. 2. HanLP开发文章摘要器
    1. 2.1. 实现思路
    2. 2.2. 配置开发环境
    3. 2.3. 实现PDF和Docx字符转换
    4. 2.4. 文章摘要器页面布局
    5. 2.5. 文章摘要器的实现
  3. 3. 总结