```python
# start_requests is called automatically when the spider starts, and only
# once, to generate the initial Request objects.
# This method implements pagination: it issues a request for each page URL
# and registers parse() as the callback.
def start_requests(self):
    for i in range(0, 10):
        url = f'https://movie.douban.com/top250?start={i * 25}'
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
    items = []
    soup = BeautifulSoup(response.text, 'html.parser')
    title_list = soup.find_all('div', attrs={'class': 'hd'})
    for i in range(len(title_list)):
        item = DouBanMovieItem()
        title = title_list[i].find('a').find('span').text
        link = title_list[i].find('a').get('href')
        item['title'] = title
        item['link'] = link
        items.append(item)
    # Return the items instead of persisting them here; storage is handed
    # off to the pipeline, which keeps the spider decoupled from I/O.
    return items
```
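As an aside (not part of the original post), the same extraction can be written with Scrapy's built-in CSS selectors, which removes the BeautifulSoup dependency entirely. A minimal sketch:

```python
# Sketch: parse() using Scrapy's own selectors instead of BeautifulSoup.
# ::text extracts the node text and ::attr(href) extracts the href attribute.
def parse(self, response):
    for hd in response.css('div.hd'):
        item = DouBanMovieItem()
        item['title'] = hd.css('a span::text').get()
        item['link'] = hd.css('a::attr(href)').get()
        yield item
```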
items.py
```python
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


# class SpidersItem(scrapy.Item):
#     # define the fields for your item here like:
#     # name = scrapy.Field()
#     pass


# Define an Item class to hold the scraped data.
class DouBanMovieItem(scrapy.Item):
    # scrapy.Field() declares a field; an Item behaves much like a Python dict.
    title = scrapy.Field()
    link = scrapy.Field()
```
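Since an Item behaves like a dict, a quick sketch (the sample values are made up for illustration) shows the access patterns, including the KeyError Scrapy raises for fields that were never declared:

```python
# Items accept keyword arguments and support dict-style access.
item = DouBanMovieItem(title='肖申克的救赎',
                       link='https://movie.douban.com/subject/1292052/')
print(item['title'])   # dict-style read
print(dict(item))      # convert to a plain dict, e.g. for serialization
item['rating'] = 9.7   # raises KeyError: 'rating' was never declared as a Field
```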
A cleaner version of the extraction loop iterates over the tags directly and yields each item as it is built:

```python
for i in title_list:
    item = DouBanMovieItem()
    title = i.find('a').find('span').text
    link = i.find('a').get('href')
    item['title'] = title
    item['link'] = link
    # Yield Request or Item objects to the engine, which decides how to
    # handle each one:
    # 1. hand a Request to the scheduler to be queued
    # 2. hand a Request to the downloader to be fetched
    # 3. hand a Response back to the spider to be parsed
    yield item
```
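To make the engine's routing concrete, here is a sketch (not from the original post) where the same callback yields both kinds of objects; `parse_detail` is a hypothetical method that does not exist in the original spider:

```python
# Sketch: a callback may yield Items and Requests from the same loop; the
# engine sends Items to the pipelines and Requests to the scheduler.
for i in title_list:
    link = i.find('a').get('href')
    item = DouBanMovieItem()
    item['title'] = i.find('a').find('span').text
    item['link'] = link
    yield item
    # follow the detail-page link with a new Request (parse_detail is hypothetical)
    yield scrapy.Request(url=link, callback=self.parse_detail)
```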
pipelines.py

```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os
```
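The pipeline class itself is cut off here. As a hedged sketch of where this file is headed, assuming the `os` import is for path handling and that items are appended to a plain text file (the class name, file name, and output format are all assumptions, not the original author's code):

```python
class DouBanMoviePipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts;
        # the path is an assumption for illustration
        path = os.path.join(os.getcwd(), 'douban_top250.txt')
        self.file = open(path, 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # ItemAdapter gives a uniform dict-like view over any item type
        adapter = ItemAdapter(item)
        self.file.write(f"{adapter['title']}\t{adapter['link']}\n")
        return item  # pass the item on to any later pipelines

    def close_spider(self, spider):
        self.file.close()
```

For the pipeline to run it must be registered in settings.py; the module path below assumes the project is named `spiders`, as the commented-out `SpidersItem` in items.py suggests:

```python
ITEM_PIPELINES = {
    'spiders.pipelines.DouBanMoviePipeline': 300,
}
```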