
Quickly Deploying a Crawler Frontend with pyspider

2018/05/19

Using pyspider's self.crawl

This post introduces the basics of pyspider and how its self.crawl API works.

self.crawl(url, **kwargs)

def on_start(self):
    self.crawl('http://localhost:8000/gys/0/', callback=self.index_page)

url is the starting address, and callback is the callback function that parses the fetched page.
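
A minimal sketch of what a callback pair can look like; the a[href^="http"] selector here is hypothetical, and whatever a callback returns is stored as that task's result:

def index_page(self, response):
    # Every link found on the page becomes a new crawl task.
    for each in response.doc('a[href^="http"]').items():
        self.crawl(each.attr.href, callback=self.detail_page)

def detail_page(self, response):
    # The returned dict is saved as the result of this task.
    return {'url': response.url, 'title': response.doc('title').text()}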

age

@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    ...

age is the validity period of a task: while it has not expired, the same page will not be fetched again. A value of -1 means the page is never re-crawled after the first fetch; the default is a 10-day validity period, which is easy to understand.
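
age can also be passed directly to an individual self.crawl call instead of via the decorator; a sketch that marks a static page as never needing a re-crawl:

def on_start(self):
    # age=-1: fetch once, then treat the result as valid forever.
    self.crawl('http://www.example.org/static.html',
               callback=self.detail_page, age=-1)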

priority

def index_page(self, response):
    self.crawl('http://www.example.org/page2.html', callback=self.index_page)
    self.crawl('http://www.example.org/233.html', callback=self.detail_page,
               priority=1)

The default priority is 0, and tasks with larger values are scheduled first. Priorities can be used to keep the traversal breadth-first and reduce the memory consumed by the URL queue.
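
As a sketch of the idea (the a.detail selector and URLs are hypothetical): raising the priority of detail pages drains them from the queue before more list pages are expanded, which keeps the queue short.

def index_page(self, response):
    # The next list page stays at the default priority 0.
    self.crawl('http://www.example.org/page2.html', callback=self.index_page)
    # Detail pages are fetched first so they don't pile up in the queue.
    for link in response.doc('a.detail').items():
        self.crawl(link.attr.href, callback=self.detail_page, priority=2)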

exetime

import time

def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               exetime=time.time() + 30 * 60)

exetime takes a Unix timestamp and defaults to 0 (fetch immediately). Written as in the example, the page will be fetched 30 minutes from now.
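
Since exetime is just an absolute timestamp, a fetch can also be scheduled for a specific wall-clock time; a sketch using only the standard library (the date is arbitrary):

import time
from datetime import datetime

def on_start(self):
    # Fetch the page at 03:00 on 2018-05-20 local time.
    when = datetime(2018, 5, 20, 3, 0)
    self.crawl('http://www.example.org/report.html',
               callback=self.callback,
               exetime=time.mktime(when.timetuple()))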

itag

def index_page(self, response):
    for item in response.doc('.item').items():
        self.crawl(item.find('a').attr.url, callback=self.detail_page,
                   itag=item.find('.update-time').text())

The example binds itag to the text of the .update-time element: as long as that value has not changed, the page will not be re-crawled in the next cycle. To lift this restriction, write the handler configuration as shown below.

class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v223'
    }
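
Bumping the value (say from 'v223' to 'v224') marks every previously seen task as stale and forces a full re-crawl on the next run. Tying itag to the current date is one way to get a periodic refresh; a sketch (note the value is computed when the script is loaded, not per task):

import time
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {
        # Changes once per day, so all pages look "new" again each day.
        'itag': time.strftime('%Y-%m-%d')
    }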

auto_recrawl

def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               age=5 * 60 * 60, auto_recrawl=True)

Auto re-crawl: when enabled, the task is re-crawled automatically each time its age expires.

params

def on_start(self):
    self.crawl('http://httpbin.org/get', callback=self.callback,
               params={'a': 123, 'b': 'c'})
    self.crawl('http://httpbin.org/get?a=123&b=c', callback=self.callback)

params attaches query parameters to the crawled URL; the two requests in the example are equivalent.
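
self.crawl is not limited to GET; per the pyspider docs it also accepts method and data for form posts:

def on_start(self):
    self.crawl('http://httpbin.org/post', callback=self.callback,
               method='POST', data={'a': 123, 'b': 'c'})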

@config(**kwargs)

@config(age=15 * 60)
def index_page(self, response):
    self.crawl('http://www.example.org/list-1.html', callback=self.index_page)
    self.crawl('http://www.example.org/product-233', callback=self.detail_page)

@config(age=10 * 24 * 60 * 60)
def detail_page(self, response):
    return {...}

Many parameters can be set through this decorator. The example sets ages per callback: list pages are valid for 15 minutes, while product pages are valid for 10 days, because the @config on detail_page overrides the setting for tasks routed to it.
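
The decorator accepts any self.crawl keyword, so several defaults can be combined for one callback; a sketch (retries is a standard self.crawl option, default 3):

@config(age=10 * 24 * 60 * 60, priority=2, retries=5)
def detail_page(self, response):
    return {'url': response.url, 'title': response.doc('title').text()}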

Expert database example

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-05-18 20:17:17
# Project: test

from pyspider.libs.base_handler import *
import MySQLdb
import MySQLdb.cursors
import hashlib

def get_md5(data):
    # MD5 of the URL serves as a dedup key in the database.
    m = hashlib.md5(data.encode("utf-8"))
    return m.hexdigest()

class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v223'
    }

    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'pyspider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://localhost:8000/zjk/0/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Enqueue the 99 paginated expert list pages.
        for i in range(99):
            self.crawl('http://localhost:8000/zjk/' + str(i) + '/',
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        url = response.url
        md5 = get_md5(url)
        # Each field lives in an element whose id is '<field>Value'.
        xm = response.doc('#xmValue').text()
        jsjb = response.doc('#jsjbValue').text()
        zjlb = response.doc('#zjlbValue').text()
        dwmc = response.doc('#dwmcValue').text()
        bgdh = response.doc('#bgdhValue').text()
        sj = response.doc('#sjValue').text()
        zjcj = response.doc('#zjcjValue').text()
        zgxlsxzy = response.doc('#zgxlsxzyValue').text()
        kpsjshqzy1 = response.doc('#kpsjshqzy1Value').text()
        kpsjshqzy2 = response.doc('#kpsjshqzy2Value').text()
        kpsjshqzy3 = response.doc('#kpsjshqzy3Value').text()
        kpszylb = response.doc('#kpszylbValue').text()
        insert_sql = """
            insert into zjk(url,md5,xm,jsjb,zjlb,dwmc,bgdh,sj,zjcj,zgxlsxzy,kpsjshqzy1,kpsjshqzy2,kpsjshqzy3,kpszylb)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        try:
            self.cursor.execute(insert_sql, (url, md5, xm, jsjb, zjlb, dwmc,
                                             bgdh, sj, zjcj, zgxlsxzy,
                                             kpsjshqzy1, kpsjshqzy2,
                                             kpsjshqzy3, kpszylb))
            self.conn.commit()
        except Exception:
            # Roll back so a failed insert does not poison the connection.
            self.conn.rollback()
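
The insert assumes a zjk table already exists in the pyspider database. A minimal sketch of a matching schema (the column types here are guesses based on the scraped fields, not the author's actual DDL):

import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'pyspider',
                       charset='utf8', use_unicode=True)
conn.cursor().execute("""
    CREATE TABLE IF NOT EXISTS zjk (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(512),
        md5 CHAR(32) UNIQUE,          -- dedup key derived from the URL
        xm VARCHAR(64), jsjb VARCHAR(64), zjlb VARCHAR(64),
        dwmc VARCHAR(255), bgdh VARCHAR(64), sj VARCHAR(64),
        zjcj VARCHAR(255), zgxlsxzy VARCHAR(255),
        kpsjshqzy1 VARCHAR(255), kpsjshqzy2 VARCHAR(255),
        kpsjshqzy3 VARCHAR(255), kpszylb VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()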

Supplier example

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-05-18 20:17:17
# Project: test

from pyspider.libs.base_handler import *
import MySQLdb
import MySQLdb.cursors
import hashlib

def get_md5(data):
    # MD5 of the URL serves as a dedup key in the database.
    m = hashlib.md5(data.encode("utf-8"))
    return m.hexdigest()

class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v223'
    }

    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'pyspider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://localhost:8000/gys/0/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Enqueue the 99 paginated supplier list pages.
        for i in range(99):
            self.crawl('http://localhost:8000/gys/' + str(i) + '/',
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        url = response.url
        md5 = get_md5(url)
        # Each field lives in an element whose id is '<field>Value'.
        gsmc = response.doc('#gsmcValue').text()
        jjxz = response.doc('#jjxzValue').text()
        gyslx = response.doc('#gyslxValue').text()
        gysdj = response.doc('#gysdjValue').text()
        gsclsj = response.doc('#gsclsjValue').text()
        lxrxm = response.doc('#lxrxmValue').text()
        lxrsj = response.doc('#lxrsjValue').text()
        lxrgddh = response.doc('#lxrgddhValue').text()
        lxrcz = response.doc('#lxrczValue').text()
        fddbrxm = response.doc('#fddbrxmValue').text()
        fddbrgddh = response.doc('#fddbrgddhValue').text()
        fddbrsj = response.doc('#fddbrsjValue').text()
        fddbrsfzh = response.doc('#fddbrsfzhValue').text()
        lxryx = response.doc('#lxryxValue').text()
        lxrdz = response.doc('#lxrdzValue').text()
        zzjgdm = response.doc('#zzjgdmValue').text()
        yzbm = response.doc('#yzbmValue').text()
        zcdz = response.doc('#zcdzValue').text()
        gswz = response.doc('#gswzValue').text()
        ssdq = response.doc('#ssdqValue').text()
        swdjh_gs = response.doc('#swdjh_gsValue').text()
        swdjh_ds = response.doc('#swdjh_dsValue').text()
        swdjfzjg_gs = response.doc('#swdjfzjg_gsValue').text()
        swdjfzjg_ds = response.doc('#swdjfzjg_dsValue').text()
        swdjyxq_gs = response.doc('#swdjyxq_gsValue').text()
        swdjyxq_ds = response.doc('#swdjyxq_dsValue').text()
        yyzzzch = response.doc('#yyzzzchValue').text()
        yyzzfzjg = response.doc('#yyzzfzjgValue').text()
        yyzzzczj = response.doc('#yyzzzczjValue').text()
        yyzzzcd = response.doc('#yyzzzcdValue').text()
        jyfw = response.doc('#jyfwValue').text()
        yyzzyxq = response.doc('#yyzzyxqValue').text()
        yyzzzjnjsj = response.doc('#yyzzzjnjsjValue').text()
        gsjj = response.doc('#gsjjValue').text()
        khyh = response.doc('#khyhValue').text()
        yhzh = response.doc('#yhzhValue').text()
        ctbjl = response.doc('#ctbjlValue').text()
        zhtjsj = response.doc('#zhtjsjValue').text()
        cpml = response.doc('#cpmlValue').text()
        insert_sql = """
            insert into gys(url,md5,gsmc, jjxz, gyslx, gysdj, gsclsj, lxrxm, lxrsj, lxrgddh, lxrcz, fddbrxm, fddbrgddh, fddbrsj, fddbrsfzh, lxryx, lxrdz, zzjgdm, yzbm, zcdz, gswz, ssdq, swdjh_gs, swdjh_ds, swdjfzjg_gs, swdjfzjg_ds, swdjyxq_gs, swdjyxq_ds, yyzzzch, yyzzfzjg, yyzzzczj, yyzzzcd, jyfw, yyzzyxq, yyzzzjnjsj, gsjj, khyh, yhzh, ctbjl, zhtjsj, cpml)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        try:
            self.cursor.execute(insert_sql, (
                url, md5, gsmc, jjxz, gyslx, gysdj, gsclsj, lxrxm, lxrsj,
                lxrgddh, lxrcz, fddbrxm, fddbrgddh, fddbrsj, fddbrsfzh,
                lxryx, lxrdz, zzjgdm, yzbm, zcdz, gswz, ssdq, swdjh_gs,
                swdjh_ds, swdjfzjg_gs, swdjfzjg_ds, swdjyxq_gs, swdjyxq_ds,
                yyzzzch, yyzzfzjg, yyzzzczj, yyzzzcd, jyfw, yyzzyxq,
                yyzzzjnjsj, gsjj, khyh, yhzh, ctbjl, zhtjsj, cpml))
            self.conn.commit()
        except Exception:
            # Roll back so a failed insert does not poison the connection.
            self.conn.rollback()

pyspider frontend screenshot

