# 爬虫dytt8详细页面顺序(map用法)

# Author: Sooele

# Author: Sooele
# coding=utf-8

from lxml import etree
import requests


# Request headers sent with every page download.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.132 Safari/537.36'
    ),
}
# Prefix used to turn the hrefs found on a listing page into full URLs.
BASE_DOMAIN = 'https://dytt8.net'


def get_detail_urls(url):
    """Download one listing page and return the URLs of its detail pages.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A lazy ``map`` iterator over the hrefs found inside
        ``<table class="tbspan">``, each resolved against ``BASE_DOMAIN``.
        It can be consumed only once.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the timeout.
    """
    from urllib.parse import urljoin

    # requests has no default timeout; without one a stalled server would
    # block the crawler forever.
    response = requests.get(url, headers=HEADERS, timeout=10)

    # Decode the raw bytes explicitly rather than reading response.text:
    # .text relies on a guessed encoding and yields garbled text when the
    # guess is wrong.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    # NOTE(review): the parser may hand back None for an empty body;
    # treat that as "no links" instead of failing on None.xpath.
    if html is None:
        hrefs = []
    else:
        hrefs = html.xpath("//table[@class='tbspan']//a/@href")

    # urljoin gives the same result as BASE_DOMAIN + href for the usual
    # "/path" hrefs, and additionally copes with hrefs that are already
    # absolute or that lack the leading slash.
    return map(lambda href: urljoin(BASE_DOMAIN, href), hrefs)



def spider():
    """Print the URL of each listing page, for page numbers 1 through 7."""
    page_template = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    # Fill the page number into the template for every page, in order.
    for page_url in map(page_template.format, range(1, 8)):
        print(page_url)

if __name__ == '__main__':
    # Start the crawl only when this file is run as a script, not on import.
    spider()

 

# map usage: detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)

2
说点什么

avatar
1 Comment threads
1 Thread replies
0 Followers
 
Most reacted comment
Hottest comment thread
2 Comment authors
sooelerepostone Recent comment authors
  Subscribe  
提醒
repostone
游客

博主全是发代码呀。