使用scrapy框架出现callback指定的函数不被调用的情况

allowed_domains = ['book.suning.com']
start_urls = ['https://book.suning.com']

问题在于:用 XPath 提取到的链接在跳转后,会离开子域名 book.suning.com,转到其他 xx.suning.com 子域名。这些请求不在 allowed_domains 的范围内,会被 Scrapy 当作站外请求过滤掉,所以 callback 指定的函数不会被调用。

解决办法:

allowed_domains = ['suning.com']

也就是把 allowed_domains 从 ['book.suning.com'] 改成 suning 的主域名 ['suning.com']。

 

成功:

import scrapy


class BookSpider(scrapy.Spider):
    """Crawl the category menu of book.suning.com and follow each category link.

    ``allowed_domains`` is set to ``suning.com`` rather than the
    ``book.suning.com`` subdomain, so that category links pointing at other
    ``*.suning.com`` hosts are not filtered out as offsite requests and
    ``parse_detail`` is actually called.
    """

    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        """Yield one request per category heading found in the menu.

        Each request carries an ``item`` dict in ``meta`` with the keys
        ``classification``, ``classification_url`` and ``url``.
        """
        # Only the menu sections at positions 3..7 are used.
        menu_sections = response.xpath("//div[@class='menu-list']/div")[3:8]
        for section in menu_sections:
            name = section.xpath("./dl/dt/h3/a/text()").extract_first()
            href = section.xpath("./dl/dt/h3/a/@href").extract_first()
            if not href:
                # extract_first() returns None when the heading has no link;
                # there is nothing to follow in that case.
                self.logger.warning("Skipping category %r: no link found", name)
                continue

            item = {
                "classification": name,
                "classification_url": href,
                # Resolve relative / protocol-relative hrefs against the page URL.
                "url": response.urljoin(href),
            }
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_detail,
                meta={"item": item},
            )

    def parse_detail(self, response):
        """Handle a category (purchase) page reached from ``parse``."""
        print(response)
        item = response.meta["item"]
        # TODO: extract the book list, e.g. from "//div[@class='res-info']",
        # and store it under item["book_list"].

 

失败:

import scrapy


class BookSpider(scrapy.Spider):
    """Failing variant: crawl the category menu of book.suning.com.

    ``allowed_domains`` is deliberately left as the ``book.suning.com``
    subdomain here. Category links that point at other ``*.suning.com`` hosts
    are then treated as offsite and dropped, so ``parse_detail`` is never
    called for them.
    """

    name = 'book'
    # Intentionally too narrow -- this is what makes the callback never fire.
    allowed_domains = ['book.suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        """Yield one request per category heading found in the menu.

        Each request carries an ``item`` dict in ``meta`` with the keys
        ``classification``, ``classification_url`` and ``url``.
        """
        # Only the menu sections at positions 3..7 are used.
        menu_sections = response.xpath("//div[@class='menu-list']/div")[3:8]
        for section in menu_sections:
            name = section.xpath("./dl/dt/h3/a/text()").extract_first()
            href = section.xpath("./dl/dt/h3/a/@href").extract_first()
            if not href:
                # extract_first() returns None when the heading has no link;
                # there is nothing to follow in that case.
                self.logger.warning("Skipping category %r: no link found", name)
                continue

            item = {
                "classification": name,
                "classification_url": href,
                # Resolve relative / protocol-relative hrefs against the page URL.
                "url": response.urljoin(href),
            }
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_detail,
                meta={"item": item},
            )

    def parse_detail(self, response):
        """Handle a category (purchase) page reached from ``parse``."""
        print(response)
        item = response.meta["item"]
        # TODO: extract the book list, e.g. from "//div[@class='res-info']",
        # and store it under item["book_list"].