From d4f63ce51b84fe9ad9e536e41ef64e0d7be0c814 Mon Sep 17 00:00:00 2001 From: Danceiny Date: Thu, 28 Mar 2019 23:53:37 +0800 Subject: [PATCH] add example code to README --- README.md | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f29676f..debf1a8 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,157 @@ >`pip install -U parser_engine` ### 示例 +- 极简版,使用`CrawlSpider`的rules机制。 +```python +from parser_engine import TemplateAnnotation +@TemplateAnnotation(tpls="demo") +class DemoSpider4(CrawlSpider): + name = "demo4" + start_urls = [ + "http://github.cannot.cc/baixing-helper" + ] +``` + +- 使用scrapy_redis,解析start_urls的响应。 +```python +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider +@TemplateAnnotation(start_url_tpl=({ + "name": "zhongguozhongqi_xiaoshouwangluo", + "itemname": "HuocheDealerItem", + "parent": { + "xpath": "//tr[@class=\"bgcolor2\"]" + }, + "fields": [ + { + "key": "area", + "xpath": "td[1]/text()", + "value_type": "stripped_string" + }, { + "key": "leads_name", + "xpath": "td[2]/text()", + "value_type": "stripped_string" + }, { + "key": "address", + "xpath": "td[3]/text()", + "value_type": "stripped_string" + }, { + "key": "phone", + "xpath": "td[5]/text()", + "value_type": "stripped_string" + } + ] +}), channel='zhongguozhongqi', leads_src='中国重汽') +class ZhongguozhongqiSpider(ClueSpider): + name = 'zhongguozhongqi' + def parse(self, response): + items = self._parse_start_url(response) + for item in items: + phone = item.get('phone') + if phone: + item['phone'] = phone.replace('、', ',') + yield item + self.finish_clue(response, len(items)) +``` + +- 使用scrapy_redis,灵活运用多种PE特性。 +```python +from parser_engine.clue.spider import ClueSpider +from parser_engine import TemplateAnnotation +from parser_engine.clue.items import ClueItem +from parser_engine.request import TaskRequest +from scrapy import Request +@TemplateAnnotation(start_url_tpl=({ + "name": "youka_shop_listing_api", + "parent": { + "json_key": "data", + }, + "fields": [{ + "key": "totalPage", + "json_key": "totalPage", + + }, { + "key": "ids", + "json_path": "dataList[*].id" + }] + },), + tpls=({ + "name": "youka_shop_detail_api", + "itemname": "HuocheDealerItem", + "parent": { + "json_key": "data", + }, + "fields": [{ + "key": "company_type", + "json_key": "category", + "mapper": { + 1: "二手车直营店", + 2: "4S店" + } + }, { + "key": "dealer_id", + "json_key": "id", + "required": 1, + }, { + "key": "leads_name", + "json_key": "shopName", + }, { + "key": "area", + "json_path": "districtDto.districtName", + "value_type": "singleton" + }, { + "key": "city", + "json_path": "cityDto.cityName", + "value_type": "singleton" + }, { + "key": "service_phone", + "default_value": "", + }, { + "key": "wechat", + "json_key": "wechat", + }, { + "key": "tags", + "json_key": "tags", + "join": "," + }] + }), channel='youka', leads_src='优卡') +class YoukaSpider(ClueSpider): + name = 'youka' + custom_settings = { + 'CONCURRENT_REQUESTS': 2, + 'CONCURRENT_REQUESTS_PER_DOMAIN': 1 + } + def parse(self, response): + items = self._parse_start_url(response) + meta = response.meta + clue_id = meta.get('clue_id') + from_url = response.request.url + if meta.get('open_pages'): + total_page = items[0]['totalPage'] + import re + current_page = int(re.findall('page=(\\d+)', from_url)[0]) + for i in range(1, total_page + 1): + if current_page == i: + continue + url = "http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10" % i + yield ClueItem({"project": "huoche", "spider": self.name, "req": TaskRequest( + url=url, + meta={"from_clue_id": clue_id} + )}) + for item in items: + for id in item['ids']: + r = Request(url="http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d" % int(id), + callback=self._response_downloaded) + r.meta.update(rule=0, from_clue_id=clue_id) + yield r + + def process_results(self, response, results): + for item in results: + item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id']) + return results +``` -请参考:[examples](./examples)。 +更多请参考:[examples](./examples)。 ### 原理 - 解析器