# Python: 51job (前程无忧) job-listing scraper

# Author: Sooele
import json
import re
import time
import requests
#网页选择器
from bs4 import BeautifulSoup

#html解析库
import lxml

##re正则表达式的方式去获取json数据

# HTTP headers handed to requests by get_html(); the User-Agent value is a
# desktop Chrome 72 (Windows, 64-bit) browser identification string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.119 Safari/537.36'
    ),
}

def get_html(url, timeout=10):
    """Request *url* and return the response body decoded from GBK.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx status (``requests.HTTPError``).
        UnicodeDecodeError: If the body contains bytes that are not valid GBK.
    """
    # NOTE(review): the page is requested with POST, exactly as before;
    # confirm whether a plain GET is what the site expects.
    response = requests.post(url=url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the caller as if it were a search-result page.
    response.raise_for_status()
    # .content is raw bytes, so it has to be decoded explicitly.
    return response.content.decode('gbk')


def _extract_job_items(html):
    """Return the job records embedded in a result page, or None if absent.

    The page source is searched for the first span that starts with
    ``{"top_ads"`` and ends at the next ``</script>``; that span, minus the
    closing tag, is parsed as JSON and its ``engine_jds`` list is reduced to
    the three fields that get saved.
    """
    match = re.search('{"top_ads".*?</script>', html)
    if match is None:
        return None
    # Strip the trailing closing tag (case-insensitively) so that only the
    # JSON text remains. The previous version did this by converting the match
    # list to its repr and calling eval() on it, which ran scraped web content
    # through eval for no benefit.
    payload = re.sub('</script>', '', match.group(0), flags=re.I)
    listings = json.loads(payload).get('engine_jds') or []  # missing/null -> empty
    return [
        {
            '公司': info.get('company_name'),        # company name
            '职位': info.get('job_name'),            # job title
            '工资': info.get('providesalary_text'),  # salary text
        }
        for info in listings
    ]


def search_job(job_name, pages=4, output_path='test6.txt'):
    """Scrape 51job search results for *job_name* and append them to a file.

    For each result page the embedded JSON is extracted, and one JSON object
    per job (company, title, salary) is appended as a line to *output_path*.
    Progress messages are printed to stdout.

    Args:
        job_name: Search keyword. It is percent-encoded before being placed
            in the URL path, so characters such as ``#``, ``/`` or ``,`` no
            longer corrupt the request URL.
        pages: Number of result pages to fetch, starting at page 1
            (default 4, the previously hard-coded amount).
        output_path: File that records are appended to, UTF-8 encoded.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If the embedded payload is not valid JSON.
        Whatever get_html() raises for a failed request.
    """
    # Function-scope import: the file's import block does not provide it.
    from urllib.parse import quote

    keyword = quote(job_name, safe='')

    for index in range(1, pages + 1):
        url = (f'https://search.51job.com/list/030200,000000,0000,00,9,99,{keyword},2,{index}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
        print(f'正在访问:第{index}页')
        time.sleep(1)  # pause before each request; presumably to throttle the crawl
        items = _extract_job_items(get_html(url))
        if items is None:
            # No embedded payload on this page: skip it instead of crashing
            # with an IndexError as before.
            print(f'第{index}页未找到职位数据,已跳过')
            continue
        if not items:
            continue  # nothing to write; do not create or touch the file

        # Open the output once per page rather than once per record.
        with open(output_path, 'a+', encoding='utf-8') as file:
            for item in items:
                # ensure_ascii=False keeps the Chinese text readable in the file.
                file.write(json.dumps(item, ensure_ascii=False) + '\n')
                print(f'正在写入:{item}')

# Script entry: read the job keyword from the console, then start the scrape.
job_keyword = input('请输入你要查询的岗位')
search_job(job_keyword)