# Python爬虫抓取数据示例

# Goal: fetch the Sogou result page for a user-supplied search term and save
# it to "<term>.html" in the current directory.
import requests

url = 'https://www.sogou.com/web'

# The search term is sent as the "query" URL parameter.
wd = input('enter a word:')
param = dict(query=wd)
response = requests.get(url, params=param)

# Write the raw response bytes; the file is opened in binary mode to match.
page_text = response.content
fileName = '{}.html'.format(wd)
with open(fileName, mode='wb') as fp:
    fp.write(page_text)
    print('over')
#需求:爬取百度翻译结果
# POST a user-supplied word to the Baidu Translate /sug endpoint and print the
# decoded JSON reply.
url = 'https://fanyi.baidu.com/sug'
wd = input('enter a word:')
data = dict(kw=wd)  # form field carrying the word
response = requests.post(url, data=data)

print(response.json())

# Ways to read a response body:
#   response.text    -> string
#   response.content -> binary (bytes)
#   response.json()  -> object
# Goal: fetch movie detail data from the Douban category chart
# (https://movie.douban.com/).
url = 'https://movie.douban.com/j/chart/top_list'

# Query-string parameters sent with the GET request.
param = dict(
    type="5",
    interval_id="100:90",
    action='',
    start="60",
    limit="100",
)

chart_response = requests.get(url, params=param)
movie_data = chart_response.json()

print(movie_data)
# Goal: crawl the cosmetics production licence data published by the National
# Medical Products Administration at http://125.35.6.84:81/xk/
# Anti-scraping measure: User-Agent check --> send a browser User-Agent.
#
# Step 1 walks list pages 1-10 and collects every record's 'ID';
# step 2 posts each ID to the detail endpoint and prints the JSON reply.


url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
id_list = []
for page in range(1, 11):
    data = {
        'on': 'true',
        # Send the actual page number. The previous quoted value 'str(page)'
        # posted that literal text on every iteration, so the loop never
        # advanced through the pages.
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    json_data = requests.post(url=url, data=data, headers=headers).json()
    # Each entry of json_data['list'] is read for its 'ID' value only.
    id_list.extend(dic['ID'] for dic in json_data['list'])

detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
# Named record_id rather than id so the builtin id() is not shadowed.
for record_id in id_list:
    detail_data = {
        'id': record_id,
    }
    detail_json = requests.post(url=detail_url, data=detail_data, headers=headers).json()
    print(detail_json)
#爬取照片
# import requests
# url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1561020728161&di=ccb2385ec14262c372f91b35d7155600&imgtype=0&src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201304%2F12%2F20130412110111_8E4Qs.thumb.700_0.jpeg'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
# }
# image_data = requests.get(url=url,headers=headers).content
# with open('./chenjing.jpg','wb')as fp:
#     fp.write(image_data)


# Using the urllib module: download an image straight to a local file.
import requests
# Import the submodule explicitly: a bare "import urllib" does not guarantee
# that urllib.request is loaded.
import urllib.request

url = 'https://ss2.bdstatic.com/70cFvnSh_Q1YnxGkpoWK1HF6hhy/it/u=806201715,3137077445&fm=26&gp=0.jpg'
# Fixed the "urllib.reques" typo (AttributeError) and dropped the stray
# trailing backslash line continuation.
urllib.request.urlretrieve(url=url, filename='./123.jpg')

 

说点什么

avatar
  Subscribe  
提醒