import time
import requests
from bs4 import BeautifulSoup as bs
from time import sleep
def get_url_douban(url, output_path='douban.txt', timeout=10):
    """Fetch one Douban book-list page and append its titles and links to a file.

    Every ``<a>`` found inside a ``<div class="pl2">`` element is treated as
    one entry: its ``title`` attribute is written on one line (followed by
    ``:``) and its ``href`` attribute on the next line. Each entry is also
    echoed to stdout.

    Args:
        url: Address of the page to download.
        output_path: Text file the entries are appended to (UTF-8).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/96.0.4664.93 Safari/537.36'
        ),
    }
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    douban_info = bs(response.text, 'html.parser')

    # Open the output once per page rather than once per link; the ``with``
    # block closes the file, so no explicit close() call is needed.
    with open(output_path, 'a', encoding='utf-8') as file:
        for tags in douban_info.find_all('div', attrs={'class': 'pl2'}):
            for atag in tags.find_all('a'):
                hrdf_url = atag.get('href')
                hrdf_name = atag.get('title')
                # Tag.get() returns None for a missing attribute; skip such
                # anchors instead of crashing on ``None + str``.
                if not hrdf_url or not hrdf_name:
                    continue
                file.write(hrdf_name + ':' + '\n')
                file.write(hrdf_url + '\n')
                print(f'正在写入--{hrdf_name}:{hrdf_url}')
# Method 1: request the ten list pages in order (start offsets 0, 25, ..., 225).
for page_number, start_offset in enumerate(range(0, 250, 25), start=1):
    page_url = f'https://book.douban.com/top250?start={start_offset}'
    get_url_douban(page_url)
    # Report progress using the 1-based page number.
    print(f'正在输出第{page_number}页')
    # Pause for one second before the next request.
    time.sleep(1)
# # Method 2
# url = tuple(f'https://book.douban.com/top250?start={ page * 25}' for page in range(10))
# if __name__ == '__main__':
# for page in url:
# get_url_douban(page)
# print()
# sleep(5)
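# NOTE(review): stray text "相关" ("Related") stood here; it appears to be leftover page text rather than part of the script.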