1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
import json
import os
import time
import urllib

from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb import SQL
class Handler(BaseHandler):
    """pyspider handler that crawls project listings from home.mi.com.

    Flow: ``on_start`` requests the home list, ``index_page`` issues one
    detail request per listed ``gid``, ``detail_page`` dumps the raw JSON
    to disk and extracts a flat record, and ``on_result`` inserts that
    record into the ``t_dream_xm_project`` table.

    NOTE: this code targets Python 2 (``urllib.quote`` and the ``.encode``
    calls whose results are fed to ``%`` formatting).
    """

    # Directory that receives one raw JSON dump per crawled project.
    DUMP_DIR = '/tmp/xiaomi'

    # Request body template for the detail endpoint; the three ``%s`` slots
    # all take the project's gid. Kept as a literal string (rather than
    # ``json.dumps``) so the bytes sent to the server stay exactly the same.
    DETAIL_TEMPLATE = (
        '{"detail":{"model":"Shopv2","action":"getDetail",'
        '"parameters":{"gid":"%s"}},'
        '"comment":{"model":"Comment","action":"getList",'
        '"parameters":{"goods_id":"%s","orderby":"1","pageindex":"0","pagesize":3}},'
        '"activity":{"model":"Activity","action":"getAct",'
        '"parameters":{"gid":"%s"}}}'
    )

    crawl_config = {
        'headers': {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'content-type': 'application/x-www-form-urlencoded',
            # NOTE(review): this Referer has no URL scheme; it may have been
            # lost in a copy/paste -- confirm before changing it.
            'Referer': '//home.mi.com/crowdfundinglist?id=78&title=%E4%BC%97%E7%AD%B9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        }
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the home-list request."""
        # URL-encoded form of:
        #   data={"HomeList":{"model":"Homepage","action":"BuildHome","parameters":{"id":12}}}
        param = 'data=%7B%22HomeList%22%3A%7B%22model%22%3A%22Homepage%22%2C%22action%22%3A%22BuildHome%22%2C%22parameters%22%3A%7B%22id%22%3A12%7D%7D%7D'
        self.crawl(
            'https://home.mi.com/app/shopv3/pipe',
            method="GET",
            params=param,
            callback=self.index_page,
        )

    @config(age=60 * 60)
    def index_page(self, response):
        """Schedule one detail request for every entry in the home list.

        Reads ``response.json['result']['HomeList']['data']`` and, for each
        entry's ``gid``, POSTs the URL-quoted detail payload to the shop
        pipe endpoint with ``detail_page`` as the callback.
        """
        for each in response.json['result']['HomeList']['data']:
            gid = each['gid']
            detail_param = self.DETAIL_TEMPLATE % (gid, gid, gid)
            detail_req = "data=" + urllib.quote(detail_param)
            detail_url = "https://home.mi.com/app/shop/pipe?gid=%s" % gid
            self.crawl(
                detail_url,
                method='POST',
                data=detail_req,
                callback=self.detail_page,
            )

    @config(priority=2)
    def detail_page(self, response):
        """Dump the raw detail JSON to disk and return a flat project record.

        Returns a dict with the keys ``original_id``, ``project_name``,
        ``project_desc``, ``curr_money`` and ``begin_date``; ``begin_date``
        is the ``ctime`` field rendered as local time in
        ``%Y-%m-%d %H:%M:%S`` format.
        """
        payload = response.json
        # ``response.json`` is already the parsed document, so it is
        # serialized once for the dump and indexed directly for the fields.
        raw_json = json.dumps(payload)
        good = payload['result']['detail']['data']['good']

        gid = good['gid'].encode('utf-8')
        self._dump_raw_json(gid, raw_json)

        created_at = time.localtime(float(good['ctime'].encode('utf-8')))
        return {
            "original_id": gid,
            "project_name": good['name'].encode('utf-8'),
            "project_desc": good['summary'].encode('utf-8'),
            "curr_money": good['saled'].encode('utf-8'),
            "begin_date": time.strftime("%Y-%m-%d %H:%M:%S", created_at),
        }

    def on_result(self, result):
        """Insert a scraped record into ``t_dream_xm_project``.

        Empty results and results without a truthy ``original_id`` are
        skipped, so callbacks that return nothing do not touch the database.
        """
        # ``.get`` so a result missing the key is skipped instead of raising.
        if not result or not result.get('original_id'):
            return
        sql = SQL()
        sql.insert('t_dream_xm_project', **result)

    def _dump_raw_json(self, gid, raw_json):
        """Write ``raw_json`` to ``<DUMP_DIR>/<gid>.txt``, creating the directory if needed."""
        try:
            os.makedirs(self.DUMP_DIR)
        except OSError:
            # An already-existing directory is fine; anything else
            # (permissions, a regular file in the way) is a real error.
            if not os.path.isdir(self.DUMP_DIR):
                raise
        # gid comes from the remote response: keep only its last path
        # component so it cannot point outside DUMP_DIR.
        dump_path = "%s/%s.txt" % (self.DUMP_DIR, os.path.basename(gid))
        # ``with`` closes the handle even if the write fails.
        with open(dump_path, 'w') as dump_file:
            dump_file.write(raw_json)
|