pyspider快速部署爬虫前台

pyspider 爬虫 self.crawl

 2018/05/19   Share

Pyspider的self.crawl使用

本文介绍一下Pyspider的基本知识和原理

self.crawl(url, **kwargs)

def on_start(self):
    """Seed the crawl: fetch the first page and parse it with ``index_page``."""
    self.crawl('http://localhost:8000/gys/0/', callback=self.index_page)

url是要爬取的地址，callback是用来解析网页响应的回调函数。

age

1
2
3

@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    """Listing-page callback; the ``age`` above is 10 days expressed in seconds."""
    # Placeholder body: the tutorial only demonstrates the decorator argument.
    ...

age是任务的有效期（单位：秒），有效期内页面被视为未修改，不会重复爬取。默认值是-1，表示永不过期（即永远不重新爬取）；示例中设置的是10天的有效期。

priority

def index_page(self):
    """Schedule one more listing page and one detail page.

    The detail page is enqueued with ``priority=1``; the listing page is
    enqueued without an explicit priority.
    """
    listing_url = 'http://www.example.org/page2.html'
    detail_url = 'http://www.example.org/233.html'

    self.crawl(listing_url, callback=self.index_page)
    self.crawl(detail_url, callback=self.detail_page, priority=1)

默认的优先级是0，数值越大优先级越高（越先被调度）。可以利用优先级实现广度优先遍历，减少队列中的任务数量，从而降低内存消耗。

exetime

import time


def on_start(self):
    """Schedule the start page to be fetched 30 minutes from now."""
    delay_seconds = 30 * 60
    self.crawl(
        'http://www.example.org/',
        callback=self.callback,
        exetime=time.time() + delay_seconds,
    )

exetime是任务的执行时间，采用unix时间戳格式，默认是0（立即执行）。按示例的写法，网页爬取将发生在30分钟后。

itag

def index_page(self, response):
    """Enqueue every ``.item`` entry, tagging each task with its update time.

    The text of the entry's ``.update-time`` element is passed as ``itag``.
    """
    for entry in response.doc('.item').items():
        # NOTE(review): this reads the anchor's ``url`` attribute; links
        # normally carry ``href`` -- confirm against the target markup.
        target = entry.find('a').attr.url
        stamp = entry.find('.update-time').text()
        self.crawl(target, callback=self.detail_page, itag=stamp)

示例中把.update-time元素的文本作为itag。每次调度时itag会与上一次的值比较，只有发生变化才会重新爬取；如果这个值不变，请求会被丢弃。如果想强制重新爬取所有任务，可以在crawl_config里修改itag（相当于脚本的版本号），写成下面这样。

class Handler(BaseHandler):
    """Handler whose crawl defaults carry a project-wide ``itag`` value."""

    # Changing this value changes the itag sent with every task.
    crawl_config = {'itag': 'v223'}

auto_recrawl

1
2
3

def on_start(self):
    """Fetch the start page with a 5-hour age and automatic re-crawl enabled."""
    five_hours = 5 * 60 * 60
    self.crawl(
        'http://www.example.org/',
        callback=self.callback,
        age=five_hours,
        auto_recrawl=True,
    )

auto_recrawl开启后，任务会在每个age周期结束后自动重新爬取；示例中是每5小时重新爬取一次。

params

def on_start(self):
    """Issue the same GET twice: once via ``params``, once with an inline query."""
    query = {'a': 123, 'b': 'c'}
    self.crawl('http://httpbin.org/get', callback=self.callback, params=query)
    self.crawl('http://httpbin.org/get?a=123&b=c', callback=self.callback)

params是附加到url上的查询参数（字典形式），示例中的两个请求是等价的。

@config(**kwargs)

@config(age=15*60)
def index_page(self, response):
    """Listing-page callback (15-minute age): enqueue a listing and a product page."""
    listing_url = 'http://www.example.org/list-1.html'
    product_url = 'http://www.example.org/product-233'

    self.crawl(listing_url, callback=self.index_page)
    self.crawl(product_url, callback=self.detail_page)

@config(age=10*24*60*60)
def detail_page(self, response):
    """Detail-page callback; the ``age`` above is 10 days expressed in seconds."""
    # Placeholder result: the tutorial only demonstrates the decorator.
    return {...}

这个装饰器用来给回调函数设置self.crawl的默认参数，可以写很多参数，示例里面写的是有效期：list-1.html的有效期是15分钟，而product-233的有效期是10天，因为它的回调是detail_page，使用的是detail_page上的配置。

专家库实例

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-05-18 20:17:17
# Project: test

from pyspider.libs.base_handler import *
import MySQLdb
import MySQLdb.cursors
import hashlib

def get_md5(data):
    """Return the hexadecimal MD5 digest of *data*.

    Text is encoded as UTF-8 before hashing; byte strings are hashed as-is,
    so callers may pass either kind of string.  The digest is used as a URL
    fingerprint, not for anything security-sensitive.
    """
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    return hashlib.md5(data).hexdigest()

class Handler(BaseHandler):
    """Scrape the expert-library (zjk) detail pages into the MySQL table ``zjk``.

    ``on_start`` seeds the crawl every 24 hours, ``index_page`` schedules the
    detail pages and ``detail_page`` inserts one row per fetched page.
    """

    crawl_config = {
        'itag': 'v223',
    }

    # Detail pages live at ``<_BASE_URL><id>/``.
    _BASE_URL = 'http://localhost:8000/zjk/'
    # index_page schedules ids 0 .. _PAGE_COUNT - 1.
    _PAGE_COUNT = 99
    # Scraped fields in column order: field ``x`` is read from the element
    # with id ``xValue`` and stored in column ``x``.
    _FIELDS = (
        'xm', 'jsjb', 'zjlb', 'dwmc', 'bgdh', 'sj', 'zjcj', 'zgxlsxzy',
        'kpsjshqzy1', 'kpsjshqzy2', 'kpsjshqzy3', 'kpszylb',
    )
    _COLUMNS = ('url', 'md5') + _FIELDS
    # The column list and the placeholders are generated from the same tuple
    # so they cannot drift apart; values are still bound by the driver.
    _INSERT_SQL = 'insert into zjk(%s) VALUES (%s)' % (
        ','.join(_COLUMNS), ','.join(['%s'] * len(_COLUMNS)))

    def __init__(self):
        """Open the MySQL connection and cursor used by ``detail_page``."""
        super(Handler, self).__init__()
        # NOTE(review): host and credentials are hard-coded; move them to
        # configuration before using this outside a local setup.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'pyspider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with page 0, parsed by ``index_page``."""
        self.crawl(self._BASE_URL + '0/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule every detail page; the fetched *response* itself is not parsed."""
        # NOTE(review): page 0 was already fetched by on_start under the same
        # URL -- confirm the scheduler still dispatches it to detail_page.
        for page_id in range(self._PAGE_COUNT):
            self.crawl('%s%d/' % (self._BASE_URL, page_id),
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract all fields from one detail page and insert them as a row.

        Storing is best-effort: a database error is logged and rolled back
        instead of failing the task.
        """
        url = response.url
        row = [url, get_md5(url)]
        row.extend(response.doc('#%sValue' % field).text()
                   for field in self._FIELDS)
        try:
            self.cursor.execute(self._INSERT_SQL, tuple(row))
            self.conn.commit()
        except MySQLdb.Error:
            # Imported here so the block does not depend on a file-level import.
            import logging
            logging.getLogger(__name__).exception('failed to store %s', url)
            try:
                # Drop the failed statement so later inserts start clean.
                self.conn.rollback()
            except MySQLdb.Error:
                # The connection itself is unusable; nothing left to undo.
                pass

供应商实例

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-05-18 20:17:17
# Project: test

from pyspider.libs.base_handler import *
import MySQLdb
import MySQLdb.cursors
import hashlib

def get_md5(data):
    """Return the hexadecimal MD5 digest of *data*.

    Text is encoded as UTF-8 before hashing; byte strings are hashed as-is,
    so callers may pass either kind of string.  The digest is used as a URL
    fingerprint, not for anything security-sensitive.
    """
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    return hashlib.md5(data).hexdigest()

class Handler(BaseHandler):
    """Scrape the supplier (gys) detail pages into the MySQL table ``gys``.

    ``on_start`` seeds the crawl every 24 hours, ``index_page`` schedules the
    detail pages and ``detail_page`` inserts one row per fetched page.
    """

    crawl_config = {
        'itag': 'v223',
    }

    # Detail pages live at ``<_BASE_URL><id>/``.
    _BASE_URL = 'http://localhost:8000/gys/'
    # index_page schedules ids 0 .. _PAGE_COUNT - 1.
    _PAGE_COUNT = 99
    # Scraped fields in column order: field ``x`` is read from the element
    # with id ``xValue`` and stored in column ``x``.
    _FIELDS = (
        'gsmc', 'jjxz', 'gyslx', 'gysdj', 'gsclsj',
        'lxrxm', 'lxrsj', 'lxrgddh', 'lxrcz',
        'fddbrxm', 'fddbrgddh', 'fddbrsj', 'fddbrsfzh',
        'lxryx', 'lxrdz', 'zzjgdm', 'yzbm', 'zcdz', 'gswz', 'ssdq',
        'swdjh_gs', 'swdjh_ds', 'swdjfzjg_gs', 'swdjfzjg_ds',
        'swdjyxq_gs', 'swdjyxq_ds',
        'yyzzzch', 'yyzzfzjg', 'yyzzzczj', 'yyzzzcd',
        'jyfw', 'yyzzyxq', 'yyzzzjnjsj',
        'gsjj', 'khyh', 'yhzh', 'ctbjl', 'zhtjsj', 'cpml',
    )
    _COLUMNS = ('url', 'md5') + _FIELDS
    # The column list and the placeholders are generated from the same tuple
    # so they cannot drift apart; values are still bound by the driver.
    _INSERT_SQL = 'insert into gys(%s) VALUES (%s)' % (
        ','.join(_COLUMNS), ','.join(['%s'] * len(_COLUMNS)))

    def __init__(self):
        """Open the MySQL connection and cursor used by ``detail_page``."""
        super(Handler, self).__init__()
        # NOTE(review): host and credentials are hard-coded; move them to
        # configuration before using this outside a local setup.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'pyspider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with page 0, parsed by ``index_page``."""
        self.crawl(self._BASE_URL + '0/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule every detail page; the fetched *response* itself is not parsed."""
        # NOTE(review): page 0 was already fetched by on_start under the same
        # URL -- confirm the scheduler still dispatches it to detail_page.
        for page_id in range(self._PAGE_COUNT):
            self.crawl('%s%d/' % (self._BASE_URL, page_id),
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract all fields from one detail page and insert them as a row.

        Storing is best-effort: a database error is logged and rolled back
        instead of failing the task.
        """
        url = response.url
        row = [url, get_md5(url)]
        row.extend(response.doc('#%sValue' % field).text()
                   for field in self._FIELDS)
        try:
            self.cursor.execute(self._INSERT_SQL, tuple(row))
            self.conn.commit()
        except MySQLdb.Error:
            # Imported here so the block does not depend on a file-level import.
            import logging
            logging.getLogger(__name__).exception('failed to store %s', url)
            try:
                # Drop the failed statement so later inserts start clean.
                self.conn.rollback()
            except MySQLdb.Error:
                # The connection itself is unusable; nothing left to undo.
                pass

pyspider前台效果图

Next Post

xadmin快速开发后台管理系统
Previous Post

django前端快速部署

CATALOG

1. Pyspider的self.crawl使用
2. 专家库实例
3. 供应商实例



缺失模块。
1、请确保node版本大于6.2
2、在博客根目录（注意不是archer根目录）执行以下命令：
npm i hexo-generator-json-content --save
3、在根目录_config.yml里添加配置：

jsonContent:
  meta: false
  pages: false
  posts:
    title: true
    date: true
    path: true
    text: false
    raw: false
    content: false
    slug: false
    updated: false
    comments: false
    link: false
    permalink: false
    excerpt: false
    categories: true
    tags: true