【python】scrapy爬虫小项目-豆瓣电影top250
https://www.jianshu.com/p/79af7806d01a
https://blog.csdn.net/firewall5788/article/details/73526387
# doubanspider.py
import scrapy from scrapy.http import Request from scrapy.selector import Selector from Spider.doubanmovie.doubanmovie.items import DoubanmovieItem from urllib.parse import urljoin
class Douban(scrapy.spiders.Spider):
    """Spider that scrapes the Douban Top-250 movie chart.

    Starts at page 1 and follows the "next page" link until the last
    page (page 10), yielding one ``DoubanmovieItem`` per movie.
    """

    name = "douban"
    allowed_domains = ["douban.com"]
    # redis_key = 'douban:start_urls'
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one chart page.

        Yields one ``DoubanmovieItem`` per movie and, when present, a
        ``Request`` for the next page (absent on the last page).
        """
        selector = Selector(response)
        movies = selector.xpath('//div[@class="info"]')
        for each_movie in movies:
            # Build a fresh item per movie. The original created ONE item
            # before the loop and mutated/yielded it repeatedly, so every
            # yielded item aliased the same object.
            item = DoubanmovieItem()
            # The title is spread across several <span> tags; join the
            # fragments into a single string.
            title_parts = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
            full_title = "".join(title_parts)
            movie_info = each_movie.xpath('div[@class="bd"]/p/text()').extract()
            # Guard against a missing rating: the original indexed [0]
            # blindly and raised IndexError on an empty result list.
            star_list = each_movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span/text()').extract()
            star = star_list[0] if star_list else ''
            # The quote can be absent for some entries, so check first.
            quote_list = each_movie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            quote = quote_list[0] if quote_list else ''
            item['title'] = full_title
            item['movieInfo'] = ';'.join(movie_info)
            item['star'] = star
            item['quote'] = quote
            yield item
        # Page 10 is the last page and carries no next-page link.
        next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
        if next_link:
            yield Request(urljoin(response.url, next_link[0]), callback=self.parse)
#items.py
import scrapy
class DoubanmovieItem(scrapy.Item): title = scrapy.Field() # 电影名字 movieInfo = scrapy.Field() # 电影的描述信息,包括导演、主演、电影类型等等 star = scrapy.Field() # 电影评分 quote = scrapy.Field() # 电影中最经典或者说脍炙人口的一句话 pass
#main.py
from scrapy.cmdline import execute execute("scrapy crawl douban".split())
#middlewares.py
# -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals
class DoubanmovieSpiderMiddleware(object):
    """Stock spider middleware generated by `scrapy startproject`.

    Scrapy treats undefined hooks as pass-throughs; the hooks below
    reproduce that default no-op behaviour explicitly.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it
        # to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Returning None lets every response pass into the spider
        # unchanged.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward each request/item produced by the spider untouched.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling — defer to other middlewares / the engine.
        pass

    def process_start_requests(self, start_requests, spider):
        # Pass the spider's start requests through unmodified.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
#pipelines.py
import pymysql from Spider.doubanmovie.doubanmovie import settings
class DoubanmoviePipeline(object):
    """Pipeline that persists scraped movies into a MySQL table."""

    def __init__(self):
        # Connection parameters come from the project settings module.
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item into doubantop250 and commit.

        On failure the transaction is rolled back so the connection is
        left in a clean state (the original left the failed transaction
        open), and the item is returned either way so later pipelines
        still see it.
        """
        try:
            self.cursor.execute(
                """insert into doubantop250(title,movieInfo,star,quote) value (%s,%s,%s,%s)""",
                (item['title'], item['movieInfo'], item['star'], item['quote']))
            self.connect.commit()
        except Exception as err:
            self.connect.rollback()
            # NOTE(review): the message assumes a duplicate-key error, but
            # any database error lands here — the printed `err` carries the
            # real cause.
            print("重复插入了==>错误信息为:" + str(err))
        return item
#settings.py
# -*- coding: utf-8 -*- BOT_NAME = 'doubanmovie' SPIDER_MODULES = ['doubanmovie.spiders'] NEWSPIDER_MODULE = 'doubanmovie.spiders' ROBOTSTXT_OBEY = True USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50' MYSQL_HOST = 'localhost' # 数据库地址 MYSQL_DBNAME = 'doubanmovie' # 数据库名字 MYSQL_USER = 'root' # 数据库登录名 MYSQL_PASSWD = 'mysql' # 数据库登录密码 # 数据传输 ITEM_PIPELINES = { 'doubanmovie.pipelines.DoubanmoviePipeline': 301, }
#scrapy.cfg
# Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = doubanmovie.settings [deploy] #url = http://localhost:6800/ project = doubanmovie
热门话题 · · · · · · ( 去话题广场 )
- 歌手2024直播 4.6万次浏览
- 身边的科普现场 新话题
- 我在网络上感受到的陌生人善意 新话题
- 少年时代的科普读物 新话题
- 现代人的“卡夫卡时刻” 354次浏览
- 一人一杯一口入魂的夏日特饮 新话题 · 3705次浏览