# Author: Sooele
# coding=utf-8
from lxml import etree
import requests
# Fetch one listing page and print every link found inside the
# <table class="tbspan"> elements on that page.
url = "https://www.dytt8.net/html/gndy/dyzz/list_23_2.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)

# response.text is decoded by requests using an encoding it guesses on its
# own; when the guess is wrong the text comes out garbled. response.content
# is the raw bytes, so decode it explicitly instead.
# errors='ignore' drops bytes that are not valid GBK; per the original
# troubleshooting note, decoding without it raised UnicodeDecodeError.
# (Decoding with 'gb18030' was noted as an alternative.)
text = response.content.decode('gbk', errors='ignore')
html = etree.HTML(text)

# Collect the href attribute of every <a> nested under a tbspan table.
detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
for detail_url in detail_urls:
    print(detail_url)