diff --git a/README.md b/README.md index 41c4a8e..f29676f 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ - 安装稳定版: >`pip install -U parser_engine` +### 示例 + +请参考:[examples](./examples)。 + ### 原理 - 解析器 >PE向调用方提供一套简单、易懂的参数,实际会将其`编译`成较为复杂的xpath表达式,再借助scrapy封装的解析器将所需内容提取出来。 @@ -41,8 +45,6 @@ >一个简单的需求场景:API返回的性别字段是0和1,但是需要将其转换成"男"和"女"。 ### 待做清单 -- 功能 - - 优化 - [ ] 支持直接在`Item`的类定义中定义模板 >用法示例:原模板的`itemname`参数通过注解传参,其他的模板参数定义在`Item`类中,如下所示。 @@ -124,9 +126,6 @@ TemplateAnnotation注解中传进来的参数,除了下面列出的,其他 - tpls: 模板的数组,或者模板id的数组 -其它约定: -- Spider类的`name`类变量,会被翻译成`business`赋值给item。 - 具体请参考[decorator.py](./parser_engine/decorator.py)中的注释及源代码。 #### Html格式 diff --git a/VERSION b/VERSION index 6da28dd..8294c18 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 \ No newline at end of file +0.1.2 \ No newline at end of file diff --git a/demo/demo/items.py b/demo/demo/items.py deleted file mode 100644 index 8c56a00..0000000 --- a/demo/demo/items.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/items.html - -from scrapy.item import Item, Field -from peewee import Model, MySQLDatabase, PrimaryKeyField, CharField, IntegerField - -db = None - - -# db = PostgresqlDatabase("bxdw", host='bi.baixing.com', port=35432, user="biz_user", passwd="biz_user", charset="utf8") -def init_db(datasource): - global db - if not db: - from scrapy.utils import project - settings = project.get_project_settings() - if datasource == 'mysql': - db = MySQLDatabase(database=settings.get("MYSQL_DATABASE"), - host=settings.get("MYSQL_HOST"), - user=settings.get("MYSQL_USER"), - passwd=settings.get("MYSQL_PASSWORD"), - port=3306, charset="utf8") - return db - - -class BaseItem(Item): - channel_id = Field() - channel = Field() - created_time = Field() - - -class DemoItem(BaseItem): - # define the fields for your item here like: - name = Field() - text = Field() - author = Field() - steps = Field() - - -class ClueItem(Item): - channel = Field() - name = Field() - index = Field() - url = Field() - from_url = Field() - status = Field() - created_time = Field() - finished_time = Field() - - -class LeadsItem(Item): - channel_id = Field() - channel = Field() - name = Field() - contact = Field() - contact_type = Field() - city = Field() - category = Field() - address = Field() - created_time = Field() - extra = Field() - - -class Leads(Model): - id = PrimaryKeyField() - channel_id = CharField(verbose_name="渠道id", max_length=50, null=False, unique=True) - channel = CharField(verbose_name="渠道名称", null=False) - name = CharField(verbose_name="名称", null=False, unique=True) - contact = CharField(verbose_name="联系方式", null=False) - contact_type = CharField(verbose_name="联系方式类型", null=False) - city = CharField(verbose_name="城市") - category = CharField(verbose_name="类目/行业") - address = CharField(verbose_name="地址") - created_time = IntegerField(verbose_name="创建时间") - extra = CharField(verbose_name="附加信息") - - class Meta: - database = init_db("mysql") diff --git a/demo/demo/middlewares.py b/demo/demo/middlewares.py deleted file mode 100644 index fd80448..0000000 --- a/demo/demo/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class ProcessorSpiderMiddleware(object): - # Not all methods need to be defined. 
If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Response, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class ProcessorDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/demo/demo/pipelines.py b/demo/demo/pipelines.py deleted file mode 100644 index a997c40..0000000 --- a/demo/demo/pipelines.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - -import pymongo -import logging -from .items import LeadsItem, Leads, DemoItem - - -class DuplicatesPipeline(object): - def __init__(self): - self.leads_id_set = set() - self.leads_name_set = set() - - def process_item(self, item, spider): - channel_id = item['channel_id'] - name = item.get('name') - if channel_id in self.leads_id_set: - pass - if name in self.leads_name_set: - pass - self.leads_id_set.add(channel_id) - self.leads_name_set.add(name) - return item - - -class MongoDBPipeline(object): - collection_name = 'leads' - - def __init__(self, mongo_uri, mongo_db): - self.mongo_uri = mongo_uri - self.mongo_db = mongo_db - - @classmethod - def from_crawler(cls, crawler): - return cls( - mongo_uri=crawler.settings.get('MONGO_URI'), - mongo_db=crawler.settings.get('MONGO_DATABASE') - ) - - def open_spider(self, spider): - self.client = pymongo.MongoClient(self.mongo_uri) - self.db = self.client[self.mongo_db] - - def close_spider(self, spider): - self.client.close() - - def process_item(self, item, spider): - self.db[self.collection_name].insert(dict(item)) - logging.debug("item added to MongoDB") - return item - - -class MySQLPipeline(object): - def process_item(self, item, spider): - if isinstance(item, LeadsItem): - if not Leads.table_exists(): - Leads.create_table() - leads = Leads( - channel_id=item['channel_id'], - channel=item['channel'], - name=item['name'], - contact=item['contact'], - contact_type=item['contact_type'], - city=item['city'], - category=item['category'], - address=item['address'], - created_time=item['created_time'], - extra=item.get('extra', '')) - leads.save() - return item - elif isinstance(item, DemoItem): - print("pipeline get DemoItem", item) diff --git a/demo/demo/settings.py b/demo/demo/settings.py deleted file mode 100644 index 97ab155..0000000 --- a/demo/demo/settings.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for demo project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# https://doc.scrapy.org/en/latest/topics/settings.html -# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# https://doc.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'demo' - -SPIDER_MODULES = ['demo.spiders'] -NEWSPIDER_MODULE = 'demo.spiders' -PARSER_ENGINE_CONFIG_FILE = "parser_engine2.json" -# SCHEDULER = "scrapy_redis.scheduler.Scheduler" -# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" -# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue' - -# REDIS_HOST = '127.0.0.1' -# REDIS_PORT = 6379 - -MYSQL_HOST = '127.0.0.1' -MYSQL_USER = 'root' -MYSQL_PASSWORD = '' -MYSQL_DATABASE = 'test' - -SCHEDULER_PERSIS = True - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'demo (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7' -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -CONCURRENT_REQUESTS_PER_DOMAIN = 5 -# CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -# TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -# DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -# } - -# Enable or disable spider middlewares -# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# 'demo.middlewares.ProcessorSpiderMiddleware': 543, -# } - -# Enable or disable downloader middlewares -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'demo.middlewares.ProcessorDownloaderMiddleware': 543, -# } - -# Enable or disable extensions -# See https://doc.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -# } - -# Configure item pipelines -# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - # 'scrapy_redis.pipelines.RedisPipeline': 300, - # 'demo.pipelines.DuplicatesPipeline': 300, - # 'demo.pipelines.MongoDBPipeline': 350, - 'demo.pipelines.MySQLPipeline': 350, -} -# MONGO_URI = 'mongodb://localhost:27017' -# MONGO_DATABASE = 'test' - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True -# The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -# AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -# HTTPCACHE_ENABLED = True -# HTTPCACHE_EXPIRATION_SECS = 0 -# HTTPCACHE_DIR = 
'httpcache' -# HTTPCACHE_IGNORE_HTTP_CODES = [] -# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..1072a26 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,41 @@ +## Examples of parser-engine +### demo + +>written before v0.1.0 + +为了验证PE的设计理念,从`http://github.cannot.cc/baixing-helper/`这一GitHub Pages的简单目录页着手,主要测试了`parser_engine.spider.PECrawlSpider`和PE模板配置文件的编写,以及PE对配置文件的加载、执行、输出等。 + +该项目不需要任何redis、db等依赖,可以直接进入到目录下`scrapy crawl **`运行,观察控制台标准输出即可。 + +注意,GitHub Pages似乎有轻微的反爬(症状是`连接被拒绝`),需要控制爬取速率。 + +### huoche + +>written after v0.1.0 + +抓取国内几家货车网站的经销商信息。 + +PE的大量特性,是在该项目开发过程中遇到问题之后开发的,因此该demo具有较高的参考意义。 + +`parser_engine.spider.PESpider`及其子类`parser_engine.clue.spider.ClueSpider`,基于`scrapy_redis`进行了二次开发,需要构造一个[TaskRequest](../parser_engine/request.py)对象,经json序列化后扔进某个spider对应的redis队列(通常是redis的list结构)中。 + +如果对如何构造该demo中所需的`TaskRequest`有兴趣,可以联系 [Danceiny](mailto:danceiny@gmail.com)。这里给出[中国重汽](./huoche/huoche/spiders/zhongguozhongqi.py)的实际例子: +```python +import json +import redis +r = redis.from_url("redis://127.0.0.1:6379") +task_reqs = [] +for i in range(34): + task_reqs.append({ + 'url': 'http://www.cnhtc.com.cn/View/XiaoShouWangLuoDetail.aspx?sc=5&Category=1&PV=0010%s' % ( + str(i) if i >= 10 else ('0%d' % i)), + 'headers': { + 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8' + } + }) +for task_req in task_reqs: + r.lpush('huoche:zhongguozhongqi:start_urls', json.dumps(task_req)) +``` + + +运行该项目前,除了安装python依赖(`pip install -r requirements.txt`)之外,还需要部署并配置好redis、mysql,相应的连接配置项见[settings.py](./huoche/huoche/settings.py)。 diff --git a/demo/demo/__init__.py b/examples/demo/__init__.py similarity index 100% rename from demo/demo/__init__.py rename to examples/demo/__init__.py diff --git a/examples/demo/demo/__init__.py b/examples/demo/demo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/demo/demo/items.py b/examples/demo/demo/items.py new file mode 100644 index 0000000..0c5347a --- /dev/null +++ b/examples/demo/demo/items.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +from scrapy.item import Item, Field + + +class BaseItem(Item): + channel_id = Field() + channel = Field() + created_time = Field() + + +class DemoItem(BaseItem): + # define the fields for your item here like: + name = Field() + text = Field() + author = Field() + steps = Field() + + +class ClueItem(Item): + channel = Field() + name = Field() + index = Field() + url = Field() + from_url = Field() + status = Field() + created_time = Field() + finished_time = Field() + + +class LeadsItem(Item): + channel_id = Field() + channel = Field() + name = Field() + contact = Field() + contact_type = Field() + city = Field() + category = Field() + address = Field() + created_time = Field() + extra = Field() diff --git a/examples/demo/demo/pipelines.py b/examples/demo/demo/pipelines.py new file mode 100644 index 0000000..b57c400 --- /dev/null +++ b/examples/demo/demo/pipelines.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +class DemoPipeline(object): + def process_item(self, item, spider): + print("pipeline receive item, type: ", type(item), item) + return item diff --git a/examples/demo/demo/settings.py b/examples/demo/demo/settings.py new file mode 100644 index 0000000..6d97708 --- /dev/null +++ b/examples/demo/demo/settings.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +BOT_NAME = 'demo' +SPIDER_MODULES = ['demo.spiders'] +NEWSPIDER_MODULE = 'demo.spiders' +PARSER_ENGINE_CONFIG_FILE = 
"parser_engine2.json" +SCHEDULER_PERSIS = True +USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7' +ROBOTSTXT_OBEY = False +DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +COOKIES_ENABLED = False +ITEM_PIPELINES = { + 'demo.pipelines.DemoPipeline': 350, +} \ No newline at end of file diff --git a/demo/demo/spiders/__init__.py b/examples/demo/demo/spiders/__init__.py similarity index 100% rename from demo/demo/spiders/__init__.py rename to examples/demo/demo/spiders/__init__.py diff --git a/demo/demo/spiders/demo_spider.py b/examples/demo/demo/spiders/demo_spider.py similarity index 80% rename from demo/demo/spiders/demo_spider.py rename to examples/demo/demo/spiders/demo_spider.py index 16e8a10..5e15a5a 100644 --- a/demo/demo/spiders/demo_spider.py +++ b/examples/demo/demo/spiders/demo_spider.py @@ -4,9 +4,9 @@ from scrapy.spiders import CrawlSpider -@TemplateAnnotation(tpls=("demo", "json-api-demo"), channel_id="cannot.cc", channel="Danceiny") +@TemplateAnnotation(tpls=("demo", "dict-api-demo"), channel_id="cannot.cc", channel="Danceiny") class DemoSpider(PECrawlSpider): - name = "demo" + name = "demo1" start_urls = [ "http://github.cannot.cc/baixing-helper/" @@ -44,13 +44,9 @@ class DemoSpider3(PECrawlSpider): name = "demo3" start_urls = [ - "http://172.31.1.4:30815/api/dict/area/0?childrenDepth=1", - # "https://restapi.amap.com/v3/place/text?citylimit=true&output=json&offset=20&city=shanghai&page=1&key=0f1ef779f17ac1f0541bef5452eb7570&keywords=%E6%95%99%E8%82%B2" + "https://restapi.amap.com/v3/place/text?citylimit=true&output=json&offset=20&city=shanghai&page=1&key=0f1ef779f17ac1f0541bef5452eb7570&keywords=%E6%95%99%E8%82%B2" ] - def callback(self, data): - print("准备持久化", data) - def process_results(self, response, results): print("处理结果", results) return results diff --git a/demo/demo/spiders/gaode_spider.py b/examples/demo/demo/spiders/gaode_spider.py similarity index 60% rename from demo/demo/spiders/gaode_spider.py rename to examples/demo/demo/spiders/gaode_spider.py index b0aa9ae..9d42d81 100644 --- a/demo/demo/spiders/gaode_spider.py +++ b/examples/demo/demo/spiders/gaode_spider.py @@ -10,28 +10,11 @@ class GaodeSpider(CrawlSpider): name = "gaode" def generate_urls(self): - # keywords = getattr(self, 'keywords', None) - # if keywords is None: keywords = "教育|培训" key = '0f1ef779f17ac1f0541bef5452eb7570' total = 2 adcodes = [ 310101, # 黄浦区 - # 310104,#徐汇区 - # 310105,#长宁区 - # 310106,#静安区 - # 310107,#普陀区 - # 310109,#虹口区 - # 310110,#杨浦区 - # 310115,#浦东新区 - # 310112,#闵行区 - # 310113,#宝山区 - # 310114,#嘉定区 - # 310116,#金山区 - # 310117,#松江区 - # 310118,#青浦区 - # 310120,#奉贤区 - # 310151,#崇明区 ] urls = [] for adcode in adcodes: @@ -42,7 +25,7 @@ def generate_urls(self): return urls def parse(self, response): - items = self._parse(response) + items = self._parse_start_url(response) if items: for item in items: yield item diff --git a/demo/parser_engine2.json b/examples/demo/parser_engine2.json similarity index 98% rename from demo/parser_engine2.json rename to examples/demo/parser_engine2.json index d6a1281..8e4b742 100644 --- a/demo/parser_engine2.json +++ b/examples/demo/parser_engine2.json @@ -6,7 +6,7 @@ "fields": [ { "dom_id": null, - "_css": null, + "css": null, "xpath": null, "tags": [ "h3" diff --git a/demo/scrapy.cfg b/examples/demo/scrapy.cfg similarity index 88% rename from demo/scrapy.cfg rename to examples/demo/scrapy.cfg index 0dd4859..306e2a1 100644 --- a/demo/scrapy.cfg +++ 
b/examples/demo/scrapy.cfg @@ -7,7 +7,6 @@ default = demo.settings [deploy] -;url = http://172.31.1.4:30217/ url = http://127.0.0.1:6800/ project = demo diff --git a/examples/huoche/huoche/__init__.py b/examples/huoche/huoche/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/huoche/huoche/items.py b/examples/huoche/huoche/items.py new file mode 100644 index 0000000..f316e8c --- /dev/null +++ b/examples/huoche/huoche/items.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +from scrapy.item import Item, Field +from peewee import Model, PrimaryKeyField, CharField, IntegerField +from parser_engine.config import mysqldb + + +class HuocheDealerItem(Item): + # channel + dealer_id 联合构成该dealer的唯一id + channel = Field() + dealer_id = Field() + leads_src = Field() # 线索渠道 + url = Field() # 网站URL + company_type = Field() # 公司类型 + leads_name = Field() # leads名称:公司名称,服务站名称 + area = Field() # 区域 + province = Field() # 省份 + city = Field() # 城市 + address = Field() # 地址 + phone = Field() # 电话 + service_phone = Field() # 24 小时服务电话 + wechat = Field() # 微信 + linkman = Field() # 联系人 + main_model = Field() # 主销车型 + online_source = Field() # 在线车源 + business_scope = Field() # 经营范围 + brand = Field() # 品牌 + tags = Field() # 标签 + + crawled_time = Field() + + +class HuocheDealerModel(Model): + id = PrimaryKeyField() + dealer_id = CharField(default='', max_length=32) # 在该渠道的id + channel = CharField(default='', max_length=16) # channel是英文版的leads_src + leads_src = CharField(default='', max_length=16) # 线索渠道 + phone = CharField(default='', max_length=64) # 电话 + wechat = CharField(default='', max_length=32) # 微信 + url = CharField(default='', max_length=64) # 网站URL + brand = CharField(default='', max_length=16) # 品牌 + tags = CharField(default='', max_length=64) # 标签 + company_type = CharField(default='', max_length=16) # 公司类型 + leads_name = CharField(default='', max_length=64) # leads名称:公司名称,服务站名称 + area = CharField(default='', max_length=16) # 区域 + province = CharField(default='', max_length=16) # 省份 + city = CharField(default='', max_length=16) # 城市 + address = CharField(default='', max_length=64) # 地址 + service_phone = CharField(default='', max_length=64) # 24 小时服务电话 + linkman = CharField(default='', max_length=64) # 联系人 + main_model = CharField(default='', max_length=64) # 主销车型 + online_source = CharField(default='', max_length=64) # 在线车源 + business_scope = CharField(default='', max_length=64) # 经营范围 + + crawled_time = IntegerField(default=0) + + class Meta: + database = mysqldb + table_name = 'huoche_dealer' diff --git a/examples/huoche/huoche/logger.py b/examples/huoche/huoche/logger.py new file mode 100644 index 0000000..de050af --- /dev/null +++ b/examples/huoche/huoche/logger.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +import json +from parser_engine.patch import get_redis +from parser_engine.singleton import Singleton + + +@Singleton +class DwLogger: + def __init__(self, write_filename='dw_local.txt'): + from scrapy.utils import project + settings = project.get_project_settings() + self.r = get_redis(**settings.getdict('REDIS_PARAMS')) + self.ENV = settings.get('ENV') + if write_filename: + self.f = open(write_filename, 'a+') + else: + self.f = None + + def __del__(self): + if self.f: + self.f.close() + + def log_to_dw(self, action, **data): + if self.ENV == 'local': + if self.f: + self.f.write(json.dumps(data) + '\n') + return + + # dev环境才打数据到dw + pass diff 
--git a/examples/huoche/huoche/pipelines.py b/examples/huoche/huoche/pipelines.py new file mode 100644 index 0000000..8dec984 --- /dev/null +++ b/examples/huoche/huoche/pipelines.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +from cpca import transform +from playhouse.shortcuts import dict_to_model +from .items import HuocheDealerItem, HuocheDealerModel +from .logger import DwLogger + + +class HuocheDealerItemPipeline(object): + """ + 地址分词,https://github.com/DQinYuan/chinese_province_city_area_mapper + """ + + def process_item(self, item, spider): + if isinstance(item, HuocheDealerItem): + if item.get('address') and (not item.get('province') or not item['city']): + dataframe = transform([item['address']]) + item['province'] = dataframe['省'].values[0] + item['city'] = dataframe['市'].values[0] + if item.get('tags') and isinstance(item['tags'], list): + item['tags'] = ','.join(item['tags']) + return item + + +class HuocheDealerDwPipeline(object): + + def __init__(self): + self.logger = DwLogger() + + def process_item(self, item, spider): + if isinstance(item, HuocheDealerItem): + self.logger.log_to_dw("huoche_dealer", **item) + return item + + +class HuocheDealerMySQLPipeline(object): + def process_item(self, item, spider): + if isinstance(item, HuocheDealerItem): + if not HuocheDealerModel.table_exists(): + HuocheDealerModel.create_table() + try: + model = HuocheDealerModel.get_or_none(dealer_id=item.get('dealer_id'), channel=item['channel']) + if model: + HuocheDealerModel.update(**item).where(HuocheDealerModel.id == model.id).execute() + else: + model = dict_to_model(HuocheDealerModel, item, True) + model.save() + except Exception as e: + spider.error("huoche_dealer MySQL pipeline failed, exception: %s" % str(e)) + print(item) + return item diff --git a/examples/huoche/huoche/settings.py b/examples/huoche/huoche/settings.py new file mode 100644 index 0000000..fc1306f --- /dev/null +++ b/examples/huoche/huoche/settings.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +BOT_NAME = 'huoche' + +SPIDER_MODULES = ['huoche.spiders'] +NEWSPIDER_MODULE = 'huoche.spiders' + +SCHEDULER = "scrapy_redis.scheduler.Scheduler" +DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" + +REDIS_PARAMS = { + "url": "redis://127.0.0.1:6379" +} +MYSQL_HOST = '127.0.0.1' +MYSQL_PORT = 3306 +MYSQL_USER = 'root' +MYSQL_PASSWORD = '' +MYSQL_DATABASE = 'crawler' +ENV = 'local' + +SCHEDULER_PERSIS = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue' +REDIS_START_URLS_KEY = BOT_NAME + ":" + '%(name)s:start_urls' + +PARSER_ENGINE_CONFIG_FILE = "templates/gaode_pe.json" +ROBOTSTXT_OBEY = False + +DOWNLOADER_MIDDLEWARES = { + "scrapy.downloadermiddlewares.retry.RetryMiddleware": 543 +} +RETRY_TIMES = 1 +RETRY_HTTP_CODES = [] +ITEM_PIPELINES = { + 'huoche.pipelines.HuocheDealerMySQLPipeline': 339, + 'huoche.pipelines.HuocheDealerDwPipeline': 340, + 'parser_engine.clue.pipelines.CluePersistentPipeline': 341, + 'parser_engine.clue.pipelines.CluePipeline': 342, +} \ No newline at end of file diff --git a/examples/huoche/huoche/spiders/__init__.py b/examples/huoche/huoche/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/examples/huoche/huoche/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
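The `HuocheDealerItemPipeline` added earlier in this patch fills in missing `province`/`city` values by running the raw `address` through `cpca.transform` (the chinese_province_city_area_mapper package listed in `requirements.txt`). Below is a minimal standalone sketch of that lookup, kept outside the pipeline for clarity — the sample address is purely illustrative and not taken from the repository:

```python
# Illustrative sketch of the cpca lookup used by HuocheDealerItemPipeline;
# not part of the patch itself.
from cpca import transform


def split_address(address):
    """Return (province, city) parsed from a raw Chinese address string."""
    df = transform([address])  # one-row pandas DataFrame per input address
    return df['省'].values[0], df['市'].values[0]


if __name__ == '__main__':
    # demonstration address only
    print(split_address('上海市黄浦区人民大道200号'))
```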
diff --git a/examples/huoche/huoche/spiders/dongfeng_spider.py b/examples/huoche/huoche/spiders/dongfeng_spider.py new file mode 100644 index 0000000..f663adf --- /dev/null +++ b/examples/huoche/huoche/spiders/dongfeng_spider.py @@ -0,0 +1,45 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider + + +@TemplateAnnotation(start_url_tpl=({ + "name": "dongfeng_pe", + "parent": { + "xpath": "//li" + }, + "itemname": "HuocheDealerItem", + "fields": [ + { + "key": "dealer_id", + "xpath": "@data-id", + "value_type": "singleton" + }, + { + "key": "leads_name", + "xpath": "div[contains(@class,'data-Title')]/text()", + "value_type": "singleton" + }, + { + "key": "address", + "xpath": "p/span[@class='data-Address']/text()", + "value_type": "singleton" + }, + { + "key": "phone", + "xpath": "p/span[@class='data-Tel']/text()", + "value_type": "singleton" + }, + { + "key": "brand", + "xpath": "p/span[@class='data-Main']/text()", + "value_type": "singleton" + }, + ]},), channel='dongfeng', leads_src='东风') +class DongfengSpider(ClueSpider): + name = 'dongfeng' + + def parse(self, response): + items = self._parse_start_url(response) + for item in items: + yield item + self.finish_clue(response, len(items)) \ No newline at end of file diff --git a/examples/huoche/huoche/spiders/futian_spider.py b/examples/huoche/huoche/spiders/futian_spider.py new file mode 100644 index 0000000..e058d8a --- /dev/null +++ b/examples/huoche/huoche/spiders/futian_spider.py @@ -0,0 +1,21 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider + + +@TemplateAnnotation(start_url_tpl=({ + "name": "futian_pe", + "itemname": "HuocheDealerItem", + "extract_keys_map": { + "id": "dealer_id", + "dealerName": "leads_name", + "dealerAddress": "address", + "dealerTel": "phone" + }},), channel='futian', leads_src='福田汽车') +class FutianSpider(ClueSpider): + name = 'futian' + + def parse(self, response): + items = self._parse_start_url(response) + for item in items: + yield item + self.finish_clue(response, len(items)) diff --git a/examples/huoche/huoche/spiders/kachezhijia_spider.py b/examples/huoche/huoche/spiders/kachezhijia_spider.py new file mode 100644 index 0000000..e69655c --- /dev/null +++ b/examples/huoche/huoche/spiders/kachezhijia_spider.py @@ -0,0 +1,95 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider +from parser_engine.clue.items import ClueItem +import re + + +@TemplateAnnotation(start_url_tpl=({ + "name": "kachezhijia_listing", + "parent": { + "xpath": "//ul[@class=\"dealers\"]/li/div[@class=\"detail\"]" + }, + "itemname": "HuocheDealerItem", + "fields": [ + { + "key": "dealer_id", + "xpath": "p[@class=\"contact\"]/a/@href", + "regexp": "360che.com/(\\d+)/", + "value_type": "singleton" + }, + { + "key": "url", + "xpath": "p[@class=\"contact\"]/a/@href", + "value_type": "singleton" + }, + { + "key": "leads_name", + "xpath": "h2/a[@href]/text()", + "value_type": "singleton" + }, + { + "key": "tags", + "xpath": "span[@class=\"inline-block\"]/text()" + }, + { + "key": "address", + "xpath": "p[last()-1]/text()", + "regexp": "地址 : (\\w+)", + "value_type": "stripped_string" + }, + { + "key": "brand", + "xpath": "p[@class=\"contact\"]/a[@href]/text()", + "value_type": "singleton" + }, + { + "key": "phone", + "xpath": "p[@class=\"contact\"]/span[@class=\"tel\"]/text()", + "value_type": "singleton" + } + ] + }, { + "name": "kachezhijia_pageinfo", + "fields": [{ + "key": "totalPage", + "xpath": 
"//ul[@class=\"page-list\"]/li[last()-1]//a[@href]/text()", + "value_type": "int" + }, { + "key": "totalCount", + "xpath": "//ul[@id=\"site-list\"]/li[1]/a[@href]/text()", + "regexp": "不限 \((\\d+)\)", + "value_type": "int" + } + ] + }), channel='kachezhijia', leads_src='卡车之家') +class CachezhijiaSpider(ClueSpider): + name = 'kachezhijia' + + def parse(self, response): + from_url = response.request.url + from_clue_id = response.meta.get('clue_id') + # 翻页 + if response.meta.get('open_pages', False): + data = self._parse_start_url(response, 1) + try: + total_count = data[0]['totalCount'] + total_page = data[0]['totalPage'] + self.info("卡车之家今日共计%d个HuocheDealer" % total_count) + except (KeyError, IndexError) as e: + self.error("get kachezhijia page totalCount error: %s data: %s, request.body: %s" + % (str(e), data, response.request.body)) + else: + response.request.meta['open_pages'] = 0 + current_page = int(re.findall('0_c(\\d+)', from_url)[0]) + for i in range(0, total_page + 1): + if i == current_page: + continue + task = self.request_to_task(response.request) + task.url = re.sub('c(\\d)', 'c%d' % i, task.url) + yield ClueItem( + {"url": task.url, "req": task, "project": self.project, "spider": self.name, + "from_clue_id": from_clue_id, }) + items = self._parse_start_url(response) + for item in items: + yield item + self.finish_clue(response, len(items)) diff --git a/examples/huoche/huoche/spiders/sxqc_spider.py b/examples/huoche/huoche/spiders/sxqc_spider.py new file mode 100644 index 0000000..dcc908a --- /dev/null +++ b/examples/huoche/huoche/spiders/sxqc_spider.py @@ -0,0 +1,31 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider +from ..items import HuocheDealerItem +import json + + +@TemplateAnnotation(start_url_tpl=({ + "name": "sxqc_pe", + "itemname": "HuocheDealerItem", + "extract_keys_map": { + "title": "leads_name", + "address": "address", + "phone": "phone" + }},), channel='', leads_src='') +class FutianSpider(ClueSpider): + name = 'sxqc' + + def parse(self, response): + body = '[' + bytes.decode(response.body) + ']' + body = body.replace("'", '"') + data = json.loads(body) + for v in data: + item = HuocheDealerItem( + leads_name=v['title'], + address=v['address'], + phone=v['phone'], + channel="sxqc", + leads_src="陕西重卡" + ) + yield item + self.finish_clue(response, len(data)) diff --git a/examples/huoche/huoche/spiders/yiqijiefang_spider.py b/examples/huoche/huoche/spiders/yiqijiefang_spider.py new file mode 100644 index 0000000..a47774a --- /dev/null +++ b/examples/huoche/huoche/spiders/yiqijiefang_spider.py @@ -0,0 +1,50 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider +from six.moves.urllib.parse import parse_qsl + + +@TemplateAnnotation(start_url_tpl=({ + "name": "yiqijiefang_pe", + "parent": { + "xpath": "//table[@class='list_1']/tr" + }, + "itemname": "HuocheDealerItem", + "fields": [ + { + "key": "city", + "xpath": "td[@class='city']/text()", + "value_type": "singleton" + }, + { + "key": "leads_name", + "xpath": "td[@class='fwz_name']/text()", + "value_type": "singleton" + }, + { + "key": "address", + "xpath": "td[@class='address']/text()", + "value_type": "singleton" + }, + { + "key": "phone", + "xpath": "td[@class='phone']/text()", + "value_type": "singleton" + }, + { + "key": "service_phone", + "xpath": "td[@class='bei1']/text()", + "value_type": "singleton" + }, + ]},), channel='jiefang', leads_src='解放') +class YiqijiefangSpider(ClueSpider): + name = 'yiqijiefang' + + def 
parse(self, response): + items = self._parse_start_url(response) + request_body = str(response.request.body, encoding="utf-8") + request_data = dict(parse_qsl(request_body)) + province = request_data.get('province') + for item in items: + item['province'] = province + yield item + self.finish_clue(response, len(items)) diff --git a/examples/huoche/huoche/spiders/youka.py b/examples/huoche/huoche/spiders/youka.py new file mode 100644 index 0000000..26a04b5 --- /dev/null +++ b/examples/huoche/huoche/spiders/youka.py @@ -0,0 +1,118 @@ +from parser_engine.clue.spider import ClueSpider +from parser_engine import TemplateAnnotation +from parser_engine.clue.items import ClueItem +from parser_engine.request import TaskRequest +from scrapy import Request + + +@TemplateAnnotation(start_url_tpl=({ + "name": "youka_shop_listing_api", + "parent": { + "json_key": "data", + }, + "fields": [{ + "key": "totalPage", + "json_key": "totalPage", + + }, { + "key": "ids", + "json_path": "dataList[*].id" + }] + },), + tpls=({ + "name": "youka_shop_detail_api", + "itemname": "HuocheDealerItem", + "parent": { + "json_key": "data", + }, + "fields": [{ + "key": "company_type", + "json_key": "category", + "mapper": { + 1: "二手车直营店", + 2: "4S店" + } + }, { + "key": "dealer_id", + "json_key": "id", + "required": 1, + }, { + "key": "leads_name", + "json_key": "shopName", + }, { + "key": "area", + "json_path": "districtDto.districtName", + "value_type": "singleton" + }, { + "key": "province", + "json_path": "provinceDto.provinceName", + "value_type": "singleton" + }, { + "key": "city", + "json_path": "cityDto.cityName", + "value_type": "singleton" + }, { + "key": "address", + "json_key": "wholeAddress", + }, { + "key": "phone", + "json_key": "mobile", + }, { + "key": "service_phone", + "default_value": "", + }, { + "key": "wechat", + "json_key": "wechat", + }, { + "key": "linkman", + "json_key": "contactName" + }, { + "key": "tags", + "json_key": "tags", + "join": "," + }, { + "key": "brand", + "json_key": "brandList", + "join": "," + }, { + "key": "business_scope", + "json_key": "scope" + }] + }), channel='youka', leads_src='优卡') +class YoukaSpider(ClueSpider): + name = 'youka' + custom_settings = { + 'CONCURRENT_REQUESTS': 2, + 'CONCURRENT_REQUESTS_PER_DOMAIN': 1 + } + + # 二手车直营店 "category": 1, + # 4S店 "category": 2, + def parse(self, response): + items = self._parse_start_url(response) + meta = response.meta + clue_id = meta.get('clue_id') + from_url = response.request.url + if meta.get('open_pages'): + total_page = items[0]['totalPage'] + import re + current_page = int(re.findall('page=(\\d+)', from_url)[0]) + for i in range(1, total_page + 1): + if current_page == i: + continue + url = "http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10" % i + yield ClueItem({"project": "huoche", "spider": self.name, "req": TaskRequest( + url=url, + meta={"from_clue_id": clue_id} + )}) + for item in items: + for id in item['ids']: + r = Request(url="http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d" % int(id), + callback=self._response_downloaded) + r.meta.update(rule=0, from_clue_id=clue_id) + yield r + + def process_results(self, response, results): + for item in results: + item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id']) + return results diff --git a/examples/huoche/huoche/spiders/zhongguozhongqi.py b/examples/huoche/huoche/spiders/zhongguozhongqi.py new file mode 100644 index 0000000..6f7c96a --- /dev/null +++ 
b/examples/huoche/huoche/spiders/zhongguozhongqi.py @@ -0,0 +1,44 @@ +from parser_engine import TemplateAnnotation +from parser_engine.clue.spider import ClueSpider + + +@TemplateAnnotation(start_url_tpl=({ + "name": "zhongguozhongqi_xiaoshouwangluo", + "itemname": "HuocheDealerItem", + "parent": { + "xpath": "//tr[@class=\"bgcolor2\"]" + }, + "fields": [ + { + "key": "area", + "xpath": "td[1]/text()", + "value_type": "stripped_string" + }, { + "key": "leads_name", + "xpath": "td[2]/text()", + "value_type": "stripped_string" + }, { + "key": "address", + "xpath": "td[3]/text()", + "value_type": "stripped_string" + }, { + "key": "linkman", + "xpath": "td[4]/text()", + "value_type": "stripped_string" + }, { + "key": "phone", + "xpath": "td[5]/text()", + "value_type": "stripped_string" + } + ] +}), channel='zhongguozhongqi', leads_src='中国重汽') +class ZhongguozhongqiSpider(ClueSpider): + name = 'zhongguozhongqi' + def parse(self, response): + items = self._parse_start_url(response) + for item in items: + phone = item.get('phone') + if phone: + item['phone'] = phone.replace('、', ',') + yield item + self.finish_clue(response, len(items)) diff --git a/examples/huoche/requirements.txt b/examples/huoche/requirements.txt new file mode 100644 index 0000000..7beb861 --- /dev/null +++ b/examples/huoche/requirements.txt @@ -0,0 +1,5 @@ +scrapy_redis +scrapy +parser_engine +cpca +peewee \ No newline at end of file diff --git a/parser_engine/clue/pipelines.py b/parser_engine/clue/pipelines.py index 66c7a7b..ce26ba9 100644 --- a/parser_engine/clue/pipelines.py +++ b/parser_engine/clue/pipelines.py @@ -12,14 +12,17 @@ def process_item(self, item, spider): model = ClueModel.from_item(item) model.save() item['req'].meta['clue_id'] = model.id - spider.info('CluePersistentPipeline save clue [clue_id] %s to database' % item['req'].meta.get('clue_id')) + spider.info('CluePersistentPipeline save clue {clue_id} to database' + .format(clue_id=item['req'].meta.get('clue_id'))) return item # route clue to queue class CluePipeline(object): + def process_item(self, item, spider): if isinstance(item, ClueItem): - spider.info('CluePipeline route clue [clue_id] %s to queue' % item['req'].meta.get('clue_id')) + clue_id = item['req'].meta.get('clue_id') + spider.info('CluePipeline route clue {clue_id} to queue'.format(clue_id=clue_id)) spider.route('%s:%s:start_urls' % (item['project'], item['spider']), item['req']) return item diff --git a/parser_engine/decorator.py b/parser_engine/decorator.py index cc92e99..fc154f4 100644 --- a/parser_engine/decorator.py +++ b/parser_engine/decorator.py @@ -4,7 +4,7 @@ from scrapy.linkextractors import LinkExtractor from .template import PETemplate -from .parser import parse_with_tpl +from .parser import parse_with_tpl, PEParser from .utils import is_sequence, is_string from .singleton import Singleton from .config import init_config, get_config_data @@ -19,13 +19,19 @@ def get_method(method): return getattr(self, method, None) self._rules = [copy.copy(r) for r in self.rules] + global c for rule in self._rules: # diff start + # support PECrawlSpider: use template driven callback processor + if getattr(rule, "template", None): + rule.parser = get_method(PEParser(rule.template)) + continue tpl_id = rule.cb_kwargs.pop('tpl_id', None) if tpl_id: tpl = PETemplate.from_json(find_by_id(tpl_id)) rule.link_extractor = tpl.get_link_extractor() rule.callback = parse_with_tpl + rule.cb_kwargs['tpl'] = tpl # diff end rule.process_links = get_method(rule.process_links) rule.process_request = 
get_method(rule.process_request) @@ -212,18 +218,8 @@ def _parse_start_url(self, response, tpl_index_or_id=None): # do patch spcls._compile_rules = _compile_rules_patch else: - pass # FIXME: scrapy.Spider && scrapy_redis.spiders.RedisSpider case - # def parse_response_patch(self, response): - # return self.start_rule.callback(response) - # - # spcls.parse_response = classmethod(parse_response_patch) - # - # def start_requests(self): - # for url in self.start_urls: - # yield Request(url, callback=spcls.parse_response) - # - # spcls.start_requests = classmethod(start_requests) + pass return spcls return _deco diff --git a/parser_engine/request.py b/parser_engine/request.py index 539a1eb..474707b 100644 --- a/parser_engine/request.py +++ b/parser_engine/request.py @@ -4,22 +4,6 @@ from six.moves.urllib.parse import urlencode -class JsonRequest(Request): - - def __init__(self, *args, **kwargs): - jsondata = kwargs.pop('jsondata', None) - if jsondata and kwargs.get('method') is None: - kwargs['method'] = 'POST' - - super(JsonRequest, self).__init__(*args, **kwargs) - - if jsondata: - data = json.dumps(jsondata) if isinstance(jsondata, dict) else jsondata - if self.method == 'POST': - self.headers.setdefault(b'Content-Type', b'application/json') - self._set_body(data) - - class TaskRequest(dict): def __init__(self, url=None, method='GET', body=None, headers=None, cookies=None, meta=None, **kwargs): if headers is None: @@ -59,6 +43,22 @@ def __getattr__(self, item): return self.get(item) +class JsonRequest(Request): + + def __init__(self, *args, **kwargs): + jsondata = kwargs.pop('jsondata', None) + if jsondata and kwargs.get('method') is None: + kwargs['method'] = 'POST' + + super(JsonRequest, self).__init__(*args, **kwargs) + + if jsondata: + data = json.dumps(jsondata) if isinstance(jsondata, dict) else jsondata + if self.method == 'POST': + self.headers.setdefault(b'Content-Type', b'application/json') + self._set_body(data) + + def make_request(url, method='GET', formdata=None, jsondata=None, headers=None, **kwargs): if formdata: return FormRequest(url=url, method=method, formdata=formdata, headers=headers, **kwargs) diff --git a/parser_engine/spider.py b/parser_engine/spider.py index 022cd6b..d77d4fe 100644 --- a/parser_engine/spider.py +++ b/parser_engine/spider.py @@ -44,12 +44,14 @@ def __str__(self): class PECrawlSpider(CrawlSpider): + """ + start_urls driven + """ # subclass should init rules before call super init start_rule = None def __init__(self, *a, **kw): super(PECrawlSpider, self).__init__(*a, **kw) - self._compile_rules() def start_requests(self): """ @@ -104,7 +106,7 @@ def _parse_response_v2(self, response, parser, callback, cb_kwargs, follow=True) if parser: cb_res = parser(response, **cb_kwargs) or () if callback: - cb_res = callback(response, **cb_kwargs) or () + cb_res = callback(response, cb_res=cb_res, **cb_kwargs) or () cb_res = self.process_results(response, cb_res) for requests_or_item in iterate_spider_output(cb_res): yield requests_or_item @@ -132,6 +134,9 @@ def get_method(method): class PESpider(RedisCrawlSpider): + """ + redis driven + """ def __init__(self, *args, **kwargs): super(PESpider, self).__init__(*args, **kwargs)
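
The final `spider.py` hunk changes `PECrawlSpider._parse_response_v2` so that the template parser's output is forwarded to the rule callback via a `cb_res` keyword argument instead of being overwritten by the callback's own return value; existing callbacks therefore need to accept `cb_res` (or `**kwargs`). A minimal sketch of a callback written against the new contract — the spider and method names here are illustrative, not taken from the repository:

```python
# Illustrative only: a rule callback under the new _parse_response_v2 contract,
# where the PE template parser's results arrive via the cb_res keyword argument.
from parser_engine.spider import PECrawlSpider


class ExamplePESpider(PECrawlSpider):  # hypothetical spider, not part of the patch
    name = 'example'

    def handle_parsed(self, response, cb_res=(), **cb_kwargs):
        # cb_res is the iterable the template parser extracted for this response;
        # post-process it and return/yield the items or requests to emit.
        for item in cb_res:
            yield item
```

Passing `cb_res` through lets a rule combine a PE template with a hand-written callback without the callback having to re-parse the response.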