Skip to content

Commit

Permalink
add example code to README
Browse files Browse the repository at this point in the history
  • Loading branch information
Danceiny committed Mar 28, 2019
1 parent 629e677 commit d4f63ce
Showing 1 changed file with 150 additions and 1 deletion.
151 changes: 150 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,157 @@
>`pip install -U parser_engine`
### 示例
- 极简版,使用`CrawlSpider`的rules机制。
```python
from parser_engine import TemplateAnnotation
@TemplateAnnotation(tpls="demo")
class DemoSpider4(CrawlSpider):
name = "demo4"
start_urls = [
"http://github.cannot.cc/baixing-helper"
]
```

- 使用scrapy_redis,解析start_urls的响应。
```python
from parser_engine import TemplateAnnotation
from parser_engine.clue.spider import ClueSpider
@TemplateAnnotation(start_url_tpl=({
"name": "zhongguozhongqi_xiaoshouwangluo",
"itemname": "HuocheDealerItem",
"parent": {
"xpath": "//tr[@class=\"bgcolor2\"]"
},
"fields": [
{
"key": "area",
"xpath": "td[1]/text()",
"value_type": "stripped_string"
}, {
"key": "leads_name",
"xpath": "td[2]/text()",
"value_type": "stripped_string"
}, {
"key": "address",
"xpath": "td[3]/text()",
"value_type": "stripped_string"
}, {
"key": "phone",
"xpath": "td[5]/text()",
"value_type": "stripped_string"
}
]
}), channel='zhongguozhongqi', leads_src='中国重汽')
class ZhongguozhongqiSpider(ClueSpider):
name = 'zhongguozhongqi'
def parse(self, response):
items = self._parse_start_url(response)
for item in items:
phone = item.get('phone')
if phone:
item['phone'] = phone.replace('', ',')
yield item
self.finish_clue(response, len(items))
```

- 使用scrapy_redis,灵活运用多种PE特性。
```python
from parser_engine.clue.spider import ClueSpider
from parser_engine import TemplateAnnotation
from parser_engine.clue.items import ClueItem
from parser_engine.request import TaskRequest
from scrapy import Request
@TemplateAnnotation(start_url_tpl=({
"name": "youka_shop_listing_api",
"parent": {
"json_key": "data",
},
"fields": [{
"key": "totalPage",
"json_key": "totalPage",

}, {
"key": "ids",
"json_path": "dataList[*].id"
}]
},),
tpls=({
"name": "youka_shop_detail_api",
"itemname": "HuocheDealerItem",
"parent": {
"json_key": "data",
},
"fields": [{
"key": "company_type",
"json_key": "category",
"mapper": {
1: "二手车直营店",
2: "4S店"
}
}, {
"key": "dealer_id",
"json_key": "id",
"required": 1,
}, {
"key": "leads_name",
"json_key": "shopName",
}, {
"key": "area",
"json_path": "districtDto.districtName",
"value_type": "singleton"
}, {
"key": "city",
"json_path": "cityDto.cityName",
"value_type": "singleton"
}, {
"key": "service_phone",
"default_value": "",
}, {
"key": "wechat",
"json_key": "wechat",
}, {
"key": "tags",
"json_key": "tags",
"join": ","
}]
}), channel='youka', leads_src='优卡')
class YoukaSpider(ClueSpider):
name = 'youka'
custom_settings = {
'CONCURRENT_REQUESTS': 2,
'CONCURRENT_REQUESTS_PER_DOMAIN': 1
}
def parse(self, response):
items = self._parse_start_url(response)
meta = response.meta
clue_id = meta.get('clue_id')
from_url = response.request.url
if meta.get('open_pages'):
total_page = items[0]['totalPage']
import re
current_page = int(re.findall('page=(\\d+)', from_url)[0])
for i in range(1, total_page + 1):
if current_page == i:
continue
url = "http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10" % i
yield ClueItem({"project": "huoche", "spider": self.name, "req": TaskRequest(
url=url,
meta={"from_clue_id": clue_id}
)})
for item in items:
for id in item['ids']:
r = Request(url="http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d" % int(id),
callback=self._response_downloaded)
r.meta.update(rule=0, from_clue_id=clue_id)
yield r

def process_results(self, response, results):
for item in results:
item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id'])
return results
```

请参考[examples](./examples)
更多请参考[examples](./examples)

### 原理
- 解析器
Expand Down

0 comments on commit d4f63ce

Please sign in to comment.