# Author: Sooele
# coding=utf-8
from lxml import etree
import requests
# HTTP headers sent with every request; the User-Agent is a desktop
# Chrome 76 identification string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/76.0.3809.132 Safari/537.36"
    ),
}

# Scheme and host that get_detail_urls() prepends to every scraped href.
BASE_DOMAIN = "https://dytt8.net"
def get_detail_urls(url, timeout=10):
    """Download one listing page and return the detail-page URLs it links to.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A list of strings: every ``href`` of an ``<a>`` located under a
        ``<table class="tbspan">`` element, each prefixed with
        ``BASE_DOMAIN``. The list is empty when no such link is found.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    response.raise_for_status()

    # ``response.text`` decodes with an encoding that requests guesses, and a
    # wrong guess produces mojibake. Decode the raw bytes explicitly as GBK
    # instead, dropping any bytes that are not valid in that encoding.
    text = response.content.decode('gbk', errors='ignore')

    html = etree.HTML(text)
    # NOTE(review): etree.HTML may yield None when the body holds no parsable
    # markup (e.g. an empty response) -- treated here as "no links".
    if html is None:
        return []

    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # Build a real list (not a one-shot lazy ``map``) so callers can iterate
    # the result more than once or take its length.
    return [BASE_DOMAIN + href for href in hrefs]
def spider(first_page=1, last_page=7):
    """Print the URL of every listing page in an inclusive page range.

    Args:
        first_page: Number of the first listing page (default 1).
        last_page: Number of the last listing page, inclusive (default 7).
            The defaults reproduce the original fixed range of pages 1..7.

    Returns:
        None. Each URL is written to standard output, one per line.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    for page in range(first_page, last_page + 1):
        print(base_url.format(page))
if __name__ == '__main__':
    # Run the crawl only when this file is executed as a script, not when
    # it is imported as a module.
    spider()
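# (Blog comment) The blogger posts nothing but code.
# (Blog reply) Yes! This is just a notes blog; keeping a copy for the record.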