allowed_domains = ['book.suning.com']
start_urls = ['https://book.suning.com']
问题在于:通过 XPath 获取的链接在跳转后,可能会跳出二级域名 book.suning.com,
进入其他子域名(如 xx.suning.com),这些请求会被 Scrapy 的 OffsiteMiddleware 过滤掉。
解决办法:
allowed_domains = ['suning.com']
即把 allowed_domains 改成苏宁的顶级域名 'suning.com',这样所有 *.suning.com 子域名下的请求都不会被过滤。
成功:
import scrapy


class BookSpider(scrapy.Spider):
    """Crawl book-category pages starting from book.suning.com.

    allowed_domains is the top-level domain 'suning.com' so that category
    links redirecting to sibling sub-domains (e.g. list.suning.com) are
    not dropped by Scrapy's OffsiteMiddleware.
    """

    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        """Extract each book category name/link and follow the link."""
        # Divs 3..7 of the menu list hold the book category groups.
        html_list = response.xpath("//div[@class='menu-list']/div")[3:8]
        for list_ul in html_list:
            item = {}
            item["classification"] = list_ul.xpath(
                "./dl/dt/h3/a/text()").extract_first()
            item["classification_url"] = list_ul.xpath(
                "./dl/dt/h3/a/@href").extract_first()
            # Guard: a <h3> without an <a href> yields None; the original
            # '"" + None' raised TypeError here.
            if not item["classification_url"]:
                continue
            # urljoin resolves protocol-relative ('//...') and relative
            # hrefs against the response URL; plain '"" + href' produced
            # invalid URLs for those cases.
            item["url"] = response.urljoin(item["classification_url"])
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_detail,
                meta={"item": item},
            )

    def parse_detail(self, response):
        """Handle the category (purchase) page reached from parse()."""
        print(response)
        item = response.meta["item"]
        # TODO: extract the book list, e.g.
        # item["book_list"] = response.xpath("//div[@class='res-info']")
失败:
import scrapy


class BookSpider(scrapy.Spider):
    # FAILING variant kept for comparison: allowed_domains is the
    # sub-domain 'book.suning.com', so category links that lead to other
    # sub-domains (e.g. list.suning.com) are filtered out by Scrapy's
    # OffsiteMiddleware and parse_detail is never reached for them.
    name = 'book'
    allowed_domains = ['book.suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        # Divs 3..7 of the menu list hold the book category groups.
        html_list = response.xpath("//div[@class='menu-list']/div")[3:8]
        # print(html_list.extract())
        # test_list = html_list.xpath("./div/dl/dt/h3")
        # print(test_list.extract())
        for list_ul in html_list:
            item = {}
            item["classification"] = list_ul.xpath("./dl/dt/h3/a/text()").extract_first()
            item["classification_url"] = list_ul.xpath("./dl/dt/h3/a/@href").extract_first()
            # NOTE(review): '"" +' does not absolutize the href and raises
            # TypeError if the href is missing (None) — presumably the menu
            # hrefs are always present and absolute; confirm.
            item["url"] = "" + item["classification_url"]
            # print(item["url"])
            yield scrapy.Request(url=item["url"], callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        # Handle the category (purchase) page.
        print(response)
        item = response.meta["item"]
        # print(item)
        # item["book_list"]=response.xpath("//div[@class='res-info']")
        # print(item)