import requests
from bs4 import BeautifulSoup
import csv
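
# Scrape the Douban Top 250 list: for each listing page, collect the title,
# director, cast, year, genre and rating, follow each entry's detail page for
# its plot summary, and append everything to douban_movies.csv.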

def get_html(url):
    """Fetch a page and return its text, using a browser-like User-Agent."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    r.encoding = r.apparent_encoding
    return r.text

def extract_movies(html):
    """Parse one listing page, follow each movie's detail link for its summary,
    and append the collected records to the CSV."""
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='item')
    movies = []
    for item in items:
        movie = {}
        title = item.find('span', class_='title')
        movie['name'] = title.get_text().strip() if title else "N/A"
        # Director, cast, year and genre are packed into a single <p> block
        info = item.find('p', class_='')
        if info:
            details = [line.strip() for line in info.get_text().split('\n') if line.strip()]
            director_and_cast = details[0]
            movie['director'] = director_and_cast.split('主演')[0].replace('导演:', '').strip()
            movie['cast'] = director_and_cast.split('主演')[-1].lstrip('：: ').strip()
            if len(details) > 1:
                additional_info = details[1].split('/')
                movie['release_year'] = additional_info[0].strip()
                movie['genre'] = additional_info[-1].strip()
        rating = item.find('span', class_='rating_num')
        movie['rating'] = rating.get_text().strip() if rating else "N/A"
        # The plot summary only lives on the movie's own detail page
        movie_link = item.find('div', class_='hd').find('a')['href']
        detail_html = get_html(movie_link)
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        summary_span = detail_soup.find('span', property='v:summary')
        movie['summary'] = summary_span.get_text().strip() if summary_span else "N/A"
        movies.append(movie)
    save_csv(movies)

def save_csv(data):
    """Append records to douban_movies.csv; write the header only for a new or empty file."""
    fieldnames = ['name', 'director', 'cast', 'release_year', 'genre', 'rating', 'summary']
    with open('douban_movies.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if f.tell() == 0:  # avoid repeating the header row on every page
            writer.writeheader()
        writer.writerows(data)

# First 5 pages (125 movies); use range(0, 250, 25) to cover the full Top 250.
urls = ["https://movie.douban.com/top250?start={}".format(i) for i in range(0, 125, 25)]
for page, url in enumerate(urls, 1):
    print(f'Downloading movies from page {page}')
    html = get_html(url)
    extract_movies(html)