本文共 3508 字,大约阅读时间需要 11 分钟。
以爬取伯乐文章为例,使用 ItemLoader 后的代码如下:
import datetime

import scrapy
from scrapy.loader import ItemLoader

try:
    # Newer Scrapy releases ship the processors in the ``itemloaders`` package.
    from itemloaders.processors import MapCompose, TakeFirst
except ImportError:
    # Fall back to the legacy location used by older Scrapy releases.
    from scrapy.loader.processors import MapCompose, TakeFirst


class ArticleItemLoader(ItemLoader):
    """Custom ItemLoader that runs every field through ``TakeFirst`` on output."""

    default_output_processor = TakeFirst()


def transform_date(publish_date):
    """Convert the scraped ``publish_date`` text into a ``datetime``.

    The text before the first space is parsed as ``%Y/%m/%d``.

    Args:
        publish_date: Raw text located by the XPath for the publish date.

    Returns:
        The parsed ``datetime.datetime``; the current local time when the
        text cannot be parsed (best-effort fallback kept from the original).
    """
    try:
        date_text = publish_date.strip().split(' ')[0]
        return datetime.datetime.strptime(date_text, "%Y/%m/%d")
    except (AttributeError, TypeError, ValueError):
        # Non-string input or an unexpected date format: keep the item
        # loadable instead of failing the whole page.
        return datetime.datetime.now()


def get_collect_num(value):
    """Convert the scraped ``collect_num`` text into an integer.

    Args:
        value: Raw text located by the XPath; the token before the first
            space is expected to be the number.

    Returns:
        The parsed integer, or ``0`` when no number can be read.
    """
    try:
        return int(value.strip().split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        return 0


def return_value(value):
    """Return ``value`` unchanged.

    Used to override the loader's default output processor so that a field
    keeps its collected values as they are.
    """
    return value


class JobboleArticleItem(scrapy.Item):
    """Item holding the fields scraped from one article page."""

    title = scrapy.Field()
    publish_date = scrapy.Field(
        # Pre-process each value passed into the item with the given
        # function; the current field value is passed in automatically.
        input_processor=MapCompose(transform_date),
    )
    cate = scrapy.Field()
    favor_num = scrapy.Field(
        input_processor=MapCompose(lambda x: int(x)),
    )
    # Number of bookmarks.
    collect_num = scrapy.Field(
        input_processor=MapCompose(get_collect_num),
    )
    # Cover image URL(s); the output processor is overridden so the value is
    # not reduced by the loader's default TakeFirst.
    img_url = scrapy.Field(
        output_processor=MapCompose(return_value),
    )
    # Save path of the cover image.
    img_save_path = scrapy.Field()
    # URL of the scraped page.
    url = scrapy.Field()
from urllib.parse import urljoin

from scrapy.loader import ItemLoader

from ..items import JobboleArticleItem, ArticleItemLoader


def parse_detail(self, response):
    """Parse the individual fields of one article page.

    NOTE: takes ``self``, so this is presumably a spider method shown
    outside its class in this excerpt.

    Args:
        response: The article page response; ``response.meta`` may carry an
            ``img_url`` entry with the (possibly relative) cover image URL.

    Returns:
        The ``JobboleArticleItem`` populated by the item loader.
    """
    # 1. Instantiate the ArticleItemLoader.
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # 2. Collect data: give the target field name and the XPath to read from.
    # item_loader.add_css('title', '...')
    item_loader.add_xpath('title', '//*[@class="entry-header"]/h1/text()')
    item_loader.add_xpath('publish_date', '//*[@class="entry-meta-hide-on-mobile"]/text()[1]')
    item_loader.add_xpath('cate', '//p[@class="entry-meta-hide-on-mobile"]/a[1]/text()')
    item_loader.add_xpath('favor_num', '//*[@class="post-adds"]/span[1]/h10/text()')
    item_loader.add_xpath('collect_num', '//*[@class="post-adds"]/span[2]/text()')

    # 3. Add the values that are already known.
    img_url = response.meta.get('img_url', '')
    if img_url:
        # Special field: must be a list or tuple, for use by the default
        # image pipeline. Skipped when there is no cover image, because
        # joining an empty string would yield the page URL itself.
        item_loader.add_value('img_url', [urljoin(response.url, img_url)])
    item_loader.add_value('url', response.url)
    item_loader.add_value('img_save_path', '')

    # 4. Call load_item() to build the final item and return it.
    return item_loader.load_item()


# Alternative: the same idea implemented with CSS selectors.
# front_image_url = response.meta.get("front_image_url", "")  # article cover image
# item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
# item_loader.add_css("title", ".entry-header h1::text")
# item_loader.add_value("url", response.url)
# item_loader.add_value("url_object_id", get_md5(response.url))
# item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
# item_loader.add_value("front_image_url", [front_image_url])
# item_loader.add_css("praise_nums", ".vote-post-up h10::text")
# item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
# item_loader.add_css("fav_nums", ".bookmark-btn::text")
# item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
# item_loader.add_css("content", "div.entry")
#
# article_item = item_loader.load_item()
# yield article_item
转载地址:http://cuksi.baihongyu.com/