# Author: Sooele
"""Scrape 51job search-result pages for a job keyword.

For each result page the embedded JSON payload is extracted with a regular
expression, and the company / job title / salary of every posting is appended
to a text file as one JSON object per line.
"""
import json
import re
import time

import requests  # HTTP client used to download the pages
from bs4 import BeautifulSoup  # HTML parsing library (not used by the code below)
import lxml  # parser backend for BeautifulSoup (not used by the code below)

# The job data is pulled out of the page with regular expressions rather than
# an HTML parser: the page embeds it as a JSON object starting with
# {"top_ads" and running up to the closing </script> tag.
_JOB_JSON_RE = re.compile('{"top_ads".*?</script>')
# Case-insensitive so that any spelling of the closing tag is stripped.
_SCRIPT_CLOSE_RE = re.compile('</script>', re.I)

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}


def get_html(url):
    """Download *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``. Bytes that are not valid GBK are
        replaced instead of aborting the whole run.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # NOTE(review): the page is requested with POST, as in the original code;
    # a plain GET may be what was intended -- confirm against the site.
    raw = requests.post(url=url, headers=headers, timeout=REQUEST_TIMEOUT).content
    # ``.content`` is raw bytes and has to be decoded explicitly.
    return raw.decode('gbk', errors='replace')


def _extract_jobs(html):
    """Return the job records embedded in *html*, or ``None`` if there is no payload."""
    match = _JOB_JSON_RE.search(html)
    if match is None:
        return None
    # Drop the trailing </script> so that only the JSON text remains. The text
    # is parsed with json.loads only; it comes from the network and must never
    # be passed to eval().
    payload = _SCRIPT_CLOSE_RE.sub('', match.group(0))
    listing = json.loads(payload)
    return [
        {
            '公司': info.get('company_name'),  # company name
            '职位': info.get('job_name'),  # job title
            '工资': info.get('providesalary_text'),  # salary text
        }
        for info in listing.get('engine_jds') or []
    ]


def search_job(job_name, pages=range(1, 5), output_path='test6.txt'):
    """Fetch search-result pages for *job_name* and append the jobs found to a file.

    Args:
        job_name: Keyword that is inserted into the search URL.
        pages: Page numbers to visit; defaults to pages 1 through 4.
        output_path: File the results are appended to, one JSON object per line.

    Raises:
        requests.RequestException: If a page cannot be downloaded.
        json.JSONDecodeError: If the embedded payload is not valid JSON.
    """
    for index in pages:
        url = (
            'https://search.51job.com/list/030200,000000,0000,00,9,99,'
            f'{job_name},2,{index}.html'
            '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        )
        print(f'正在访问:第{index}页')
        time.sleep(1)  # pause between requests

        jobs = _extract_jobs(get_html(url))
        if jobs is None:
            # The page did not contain the expected JSON payload.
            print(f'第{index}页未找到职位数据,已跳过')
            continue
        if not jobs:
            continue

        # Open the file once per page instead of once per record.
        with open(output_path, 'a', encoding='utf-8') as file:
            for item in jobs:
                # ensure_ascii=False keeps the Chinese text readable in the file.
                file.write(json.dumps(item, ensure_ascii=False) + '\n')
                print(f'正在写入:{item}')


if __name__ == '__main__':
    search_job(input('请输入你要查询的岗位'))