python模块 / Python爬虫 / python记录 · 2019年6月26日

Python爬虫BeautifulSoup

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scrape the Autohome news index page: for every article entry print its
# title, link and summary, then download the entry's thumbnail into ./Web/.
PAGE_URL = "https://www.autohome.com.cn/news/"
IMAGE_DIR = './Web/'
# requests has no default timeout; without one a stalled server hangs the script.
REQUEST_TIMEOUT = 10  # seconds

response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
# Force GBK decoding of the body instead of the encoding requests guessed.
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, 'html.parser')

div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})

# find() returns None when the container is missing (e.g. the page layout
# changed); treat that as "no articles" rather than crashing.
li_list = div.find_all(name='li') if div is not None else []

# open(..., 'wb') does not create missing directories.
os.makedirs(IMAGE_DIR, exist_ok=True)

for li in li_list:

    title = li.find(name='h3')
    if title is None:
        # Entries without a headline are skipped entirely.
        continue
    p = li.find(name='p')
    a = li.find(name='a')

    print(title.text)
    # Any of the sub-elements may be absent; print a placeholder instead of
    # raising AttributeError on None.
    print(a.attrs.get('href') if a is not None else None)
    print(p.text if p is not None else '')

    img = li.find(name='img')
    src = img.get('src') if img is not None else None
    if not src:
        # No thumbnail for this entry: nothing to download.
        continue
    # urljoin turns a protocol-relative "//host/path" into "https://host/path"
    # (same result as the old "https:" prefix) and leaves absolute URLs intact.
    src = urljoin(PAGE_URL, src)
    print(src)

    # Issue a second request to download the image.
    file_name = src.rsplit('/', maxsplit=1)[1]
    if not file_name:
        # URL ends with '/', so there is no usable file name.
        continue
    try:
        ret = requests.get(src, timeout=REQUEST_TIMEOUT)
        ret.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the remaining articles.
        print('skipped', src, exc)
        continue
    with open(os.path.join(IMAGE_DIR, file_name), 'wb') as fp:
        fp.write(ret.content)

相关

标签： Python 爬虫

您可能还喜欢...