# 豆瓣电影TOP250 (Douban Movie Top 250)
# 英文21世纪 (21st Century English, i21st.cn)
import requests
from bs4 import BeautifulSoup
import os
def getHtml(url):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to request.

    Returns:
        The response body decoded to ``str``, using the encoding that
        requests detects from the content (``apparent_encoding``).

    Raises:
        requests.RequestException: If the request fails, exceeds the
            10-second timeout, or the server answers with a 4xx/5xx status.
    """
    # Send a browser-like User-Agent instead of the requests default.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors rather than passing an error page on to
    # the parser, where it would be saved as if it were an article.
    response.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # declared in the response headers.
    response.encoding = response.apparent_encoding
    return response.text
def extractHtml(html):
    """Parse an article page and save its body text to a ``.txt`` file.

    The file name is derived from the first ``<span class="TitleCn_1">``
    element: its text is reduced to alphanumeric characters, spaces and
    ``_-.`` and given a ``.txt`` suffix. When no title element exists, or
    nothing usable remains after filtering, ``default_article.txt`` is used.
    The text of every ``<div class="artcontent txt-16">`` element is
    concatenated and written via ``save_txt``.

    Args:
        html: HTML source of the article page.

    Returns:
        None. The extracted text is written to a file named as described
        above (a relative path).
    """
    # Parse with the parser bundled with the standard library.
    soup = BeautifulSoup(html, 'html.parser')

    # NOTE: every page without a usable title maps to this same default
    # name, so such pages overwrite each other's output.
    filename = 'default_article.txt'
    # The selector is specific to the target site's markup.
    title_tag = soup.find('span', class_='TitleCn_1')
    if title_tag is not None:
        title = title_tag.get_text().strip()
        # Keep only characters that are safe in a file name.
        safe_title = ''.join(
            c for c in title if c.isalnum() or c in " _-."
        ).strip()
        # Guard against a title made only of disallowed characters, which
        # would otherwise produce the file name ".txt".
        if safe_title:
            filename = safe_title + '.txt'

    # Collect all article-body containers (div elements, not p tags).
    content_blocks = soup.find_all('div', class_="artcontent txt-16")
    # Echo the matched elements to the console, as the original script did.
    print(content_blocks)
    # Strip the markup and join the text of every container.
    txt = "".join(block.get_text() for block in content_blocks)
    save_txt(filename, txt)
def save_txt(filename, data):
    """Write text to a UTF-8 encoded file, replacing any existing content.

    Args:
        filename: Path of the file to create or overwrite.
        data: The text to store. Either a single string, or an iterable of
            strings that are written back to back with no separator.
    """
    # newline='' turns off newline translation, so line endings are stored
    # exactly as they appear in ``data``.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        if isinstance(data, str):
            # A single write; writelines() on a str would emit it one
            # character at a time.
            f.write(data)
        else:
            f.writelines(data)
# Article pages to download; each one is fetched and handed to extractHtml.
urls = [
    'https://www.i21st.cn/article/translate/30122_1.html',
    'https://www.i21st.cn/article/translate/30116_1.html',
    'https://www.i21st.cn/article/translate/30088_1.html',
    'https://www.i21st.cn/article/translate/30085_1.html',
    'https://www.i21st.cn/article/translate/30077_1.html',
]
for url in urls:
    try:
        html = getHtml(url)
    except requests.RequestException as err:
        # One unreachable page should not abort the rest of the batch.
        print(f'Failed to fetch {url}: {err}')
        continue
    extractHtml(html)