# Author: Sooele
import json
import re
import time
import requests
# Web page selector (BeautifulSoup)
from bs4 import BeautifulSoup
# HTML parsing library
import lxml
# Extract the embedded JSON data with regular expressions (re)
# HTTP request headers; the User-Agent value presents the client as desktop
# Chrome 72 on 64-bit Windows.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.119 Safari/537.36'
    ),
}
def get_html(url):
    """Fetch *url* and return the response body as text.

    Sends a POST request carrying the module-level ``headers`` and decodes
    the raw response bytes as GBK.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source as a ``str``.

    Raises:
        requests.RequestException: On connection failure or when the server
            does not answer within the timeout.
    """
    # Without a timeout requests waits forever on a stalled connection.
    response = requests.post(url=url, headers=headers, timeout=10)
    # ``.content`` is raw bytes and must be decoded explicitly; a stray
    # invalid byte is replaced rather than aborting the whole run.
    return response.content.decode('gbk', errors='replace')
def _parse_job_listings(html):
    """Return the job postings found in the JSON embedded in *html*.

    The data is located as the text that starts with ``{"top_ads"`` and runs
    up to the next ``</script>``; the closing tag is stripped and the rest is
    parsed as JSON. Each entry of its ``engine_jds`` list is reduced to the
    company, position and salary fields.

    Args:
        html: Page source as text.

    Returns:
        A list of dicts with the keys '公司', '职位' and '工资'; empty when the
        page carries no such JSON or no ``engine_jds`` entries.

    Raises:
        json.JSONDecodeError: If the located text is not valid JSON.
    """
    found = re.search('{"top_ads".*?</script>', html)
    if found is None:
        return []
    # The pattern uses the closing tag as its end marker; remove it
    # (case-insensitively) so only the JSON text remains.
    payload = re.sub('</script>', '', found.group(0), flags=re.I)
    # The page is untrusted input: parse it strictly as JSON, never eval it.
    listings = json.loads(payload).get('engine_jds') or []
    return [
        {
            '公司': info.get('company_name'),        # company name
            '职位': info.get('job_name'),            # position title
            '工资': info.get('providesalary_text'),  # salary text
        }
        for info in listings
    ]


def search_job(job_name):
    """Scrape result pages 1-4 for *job_name* and append them to test6.txt.

    For each page: wait one second, fetch the page with ``get_html``, extract
    the postings and append each one to ``test6.txt`` as one JSON object per
    line (UTF-8, non-ASCII characters written unescaped). Pages without any
    postings are reported and skipped.

    Args:
        job_name: Search keyword inserted into the result-page URL.

    Returns:
        None.
    """
    for index in range(1, 5):
        url = (
            'https://search.51job.com/list/030200,000000,0000,00,9,99,'
            f'{job_name},2,{index}.html'
            '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        )
        print(f'正在访问:第{index}页')
        time.sleep(1)  # pause between requests
        items = _parse_job_listings(get_html(url))
        if not items:
            print(f'第{index}页未找到职位数据,已跳过')
            continue
        # Open the output once per page instead of once per posting.
        with open('test6.txt', 'a', encoding='utf-8') as file:
            for item in items:
                # ensure_ascii=False keeps Chinese text readable in the file.
                file.write(json.dumps(item, ensure_ascii=False) + '\n')
                print(f'正在写入:{item}')
# Script entry: prompt for a job keyword, then scrape and save its listings.
job_keyword = input('请输入你要查询的岗位')
search_job(job_keyword)
# 相关  ("Related" - stray text after the script; commented out so it is not evaluated as a name)