allowed_domains = ['book.suning.com']
start_urls = ['https://book.suning.com']
问题在于:通过 XPath 提取到的链接在跳转后,如果跳出了二级域名 book.suning.com、
去了其他 xx.suning.com 子域名,该请求就会被 Scrapy 当作站外请求过滤掉,回调函数不会被执行。
解决办法:
allowed_domains = ['suning.com']
把 allowed_domains 从 ['book.suning.com'] 改成 suning 的主域名 ['suning.com'],这样所有 xx.suning.com 子域名的请求都会被放行。
成功:
import scrapy
class BookSpider(scrapy.Spider):
    """Crawl the category menu of book.suning.com and follow each category link.

    ``allowed_domains`` is set to the parent domain ``suning.com`` (not
    ``book.suning.com``) so that category links leading to other
    ``*.suning.com`` subdomains are not discarded as offsite requests.
    """

    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        """Yield one follow-up request per category heading in the menu.

        Only the menu blocks at indexes 3 to 7 of the ``menu-list`` container
        are processed. Each request carries a dict with the keys
        ``classification``, ``classification_url`` and ``url`` in
        ``meta["item"]``.
        """
        menu_blocks = response.xpath("//div[@class='menu-list']/div")[3:8]
        for menu_block in menu_blocks:
            href = menu_block.xpath("./dl/dt/h3/a/@href").extract_first()
            if not href:
                # A block without a heading link cannot be followed; skipping
                # it avoids the TypeError that `"" + None` would raise.
                continue

            item = {
                "classification": menu_block.xpath(
                    "./dl/dt/h3/a/text()"
                ).extract_first(),
                "classification_url": href,
                # Resolve relative and protocol-relative hrefs against the
                # page URL; scrapy.Request rejects URLs without a scheme.
                "url": response.urljoin(href),
            }
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_detail,
                meta={"item": item},
            )

    def parse_detail(self, response):
        """Handle a category (purchase) page reached from ``parse``.

        Currently only prints the response, which shows that the request was
        not filtered out, and reads back the item passed through ``meta``.
        """
        print(response)
        item = response.meta["item"]
        # TODO: extract the book list from the page and yield the item,
        # e.g. starting from response.xpath("//div[@class='res-info']").
失败:
import scrapy
class BookSpider(scrapy.Spider):
    """Failing variant of the spider, kept to demonstrate the problem.

    ``allowed_domains`` is restricted to ``book.suning.com`` here, so
    follow-up requests whose URL is on another ``*.suning.com`` subdomain
    are treated as offsite and ``parse_detail`` is not called for them.
    """

    name = 'book'
    allowed_domains = ['book.suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        """Yield one request per category heading in menu blocks 3 to 7."""
        menu_blocks = response.xpath("//div[@class='menu-list']/div")[3:8]
        for menu_block in menu_blocks:
            title = menu_block.xpath("./dl/dt/h3/a/text()").extract_first()
            href = menu_block.xpath("./dl/dt/h3/a/@href").extract_first()
            item = {
                "classification": title,
                "classification_url": href,
                "url": "" + href,
            }
            yield scrapy.Request(
                url=item["url"],
                callback=self.parse_detail,
                meta={"item": item},
            )

    def parse_detail(self, response):
        """Handle a purchase page: print the response and read the item."""
        print(response)
        item = response.meta["item"]
        # TODO: extract the book list, e.g. starting from
        # response.xpath("//div[@class='res-info']").