
Web scraping in Jupyter

Bounty: 100 园豆 [unsolved question]

I keep getting an error while scraping; it looks like the JSON is malformed.

import requests
import time
import re
import csv
import json
import pandas as pd
from lxml import etree

Create a CSV file and set its encoding

file = open('qcwy.csv', 'a+', encoding='gbk')

Write the header row

writer = csv.writer(file)
writer.writerow(['公司','岗位','薪资','福利','工作经验','学历','城市','招聘人数','公司规模','公司方向'])
file.close()
from urllib.parse import urlencode

Loop over the pages (10 pages)

for page in range(1, 11):  # pages 1-10 (range(1, 10) would only cover 9 pages)
    try:
        url0 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?'.format(page)
        # Set request headers so the site is less likely to flag the crawler
        headers = {
            'Connection': 'keep-alive',
            'Host': 'search.51job.com',
            'Cookie': 'guid=eafda637f951289cc3971b74087ee992; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22184f1556385bb4-00d9951ae6397368-7a575474-3686400-184f1556386a8d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg0ZjE1NTYzODViYjQtMDBkOTk1MWFlNjM5NzM2OC03YTU3NTQ3NC0zNjg2NDAwLTE4NGYxNTU2Mzg2YThkIn0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22184f1556385bb4-00d9951ae6397368-7a575474-3686400-184f1556386a8d%22%7D; ssxmod_itna=eqUxnDu7GQG=6xGq0du7t8mx4=Dgn0phxhx0yDi=TDSxGKidDqxBnWjeDQTdSdfnPh0EGY0A4rwTrGmR051vYj72oElt4B3DEx0=edIixiicDCeDIDWeDiDG4Gml4GtDpxG=Djnz/1lXxGWDmbkDWPDYxDr61KDRxi7DDydCx07DQHk3Rpw9Oo1YABDqnKD9hoDsEbfSnImfRftlEAAmtBovx0kS40OBOHszOoDUDvsg=7NKCR4KQxNdih3EBres7Gt3ZQ4fG6x=Yn45=iepNi43o9Pl0tDi=GtHjUDD; ssxmod_itna2=eqUxnDu7GQG=6xGq0du7t8mx4=Dgn0phxxnIgDi=eDlrGlxjRRieZrTMD6CrLfzOzoWadeRi/qDQ4c2TELIl/2LE3wTZ6DrdD6iaKLXdCbFAdXpdNW6Qq/nx/1lurV+lUkBVILs11ura8igqh/WvFZaIbRPFsP4EOu2F802yhjo5bloHOxc5C=BTesx5x/7mH+8BOw99p6UfQX8E7RKeAEPn=6=SQZxSWEhSDToyAPXLQPdy=3VPfS/aURROGDRe+btCd3805zduC=jm5MQtsKYUHYZWS/naO6y3Nz=Zl194CyT8duyd1CqXndH0NAPe4FvrxK=xm2x=/a0Z=tiEq/EN/K4QHm0HEwN93/ad+nXh00b+fTS+afXe+EF00M0031m2Oue9Y33YNttfC2s9f893mnuVowUaIE7mp9fICWM=Yn1QGG3qnGR0GKA3LYIpjef8YTDG2KG2WrHiiTYieKc4Qi2iL8grkskCwvxD08DijpYD; partner=51jobhtml5',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36 Edg/108.0.1462.54'
        }
        # Query-string parameters appended to the URL; their values are fixed
        params = {
            'lang': 'c',
            'postchannel': '0000',
            'workyear': '99',
            'cotype': '99',
            'degreefrom': '99',
            'jobterm': '99',
            'companysize': '99',
            'ord_field': '0',
            'dibiaoid': '0',
            'line': '',
            'welfare': '',
        }
        # Assemble the final URL
        url = url0 + urlencode(params)
        print(url)
        # Send the request; time out with an error after 30 seconds
        r = requests.get(url, headers=headers, timeout=30)
        # print(r.text)
        # Parse the response text into an HTML tree
        html = etree.HTML(r.text)
        # Locate the <script> tag holding the embedded search-result JSON
        nr = html.xpath('//script[@type="text/javascript"]/text()')[0].replace('\n', '').replace('\t', '').replace('window.SEARCH_RESULT = ', '')
        # Parse the string as JSON
        datas = json.loads(nr)['engine_search_result']
        # Loop over the result items and extract the fields
        for sjs in datas:
            # attribute_text has 4 entries when city, experience, education and headcount are all present
            if len(sjs['attribute_text']) == 4:
                workyear = sjs['attribute_text'][1]
                education = sjs['attribute_text'][2]
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
            else:
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
                test = sjs['attribute_text'][1]
                # Decide whether the remaining entry is experience ('经验') or education
                if '经验' in test:
                    workyear = test
                    education = '无'
                else:
                    education = test
                    workyear = '无'
            company_name = sjs['company_name']
            job_name = sjs['job_name']
            providesalary_text = sjs['providesalary_text'].replace('\\', '')  # strip escape backslashes
            jobwelf = sjs['jobwelf'].replace('\\', '')
            companysize_text = sjs['companysize_text'].replace('\\', '')
            companyind_text = sjs['companyind_text'].replace('\\', '')
            # Fall back to '无' (none) when a field is empty
            if not providesalary_text:
                providesalary_text = '无'
            if not jobwelf:
                jobwelf = '无'
            if not companysize_text:
                companysize_text = '无'
            if not companyind_text:
                companyind_text = '无'
            file = open('qcwy.csv', 'a+', encoding='gbk')
            writer = csv.writer(file)
            # Append one row per job posting
            writer.writerow([company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text])
            print(company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text)
            file.close()  # close so each row is flushed to disk
    # On any error, print it and move on to the next page
    except Exception as e:
        print(e)
    time.sleep(1)
    # break

Convert the CSV to an Excel file

datas = pd.read_csv('qcwy.csv', encoding='gbk')
datas.to_excel('qcwy.xlsx', index=False)  # requires an Excel writer such as openpyxl to be installed

liky1010 | Beginner Level 1 | 园豆: 16
Asked: 2022-12-18 19:43
All answers (1)

The overall approach of the code is sound, but the failure is most likely in the JSON-handling step. The prime suspect is this line:

datas = json.loads(nr)['engine_search_result']

You need to verify that nr is actually valid JSON and that the parsed object contains the key engine_search_result; if either condition fails, this line raises an exception.
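A minimal defensive version of that step might look like the sketch below. It assumes the page still embeds the results as window.SEARCH_RESULT = {...} inside a script tag (the structure the question's code relies on); the helper name extract_results is hypothetical. It prints diagnostics instead of crashing when the JSON or the key is missing:

import json
import re

def extract_results(page_text):
    # Pull out the object assigned to window.SEARCH_RESULT (assumed page structure)
    m = re.search(r'window\.SEARCH_RESULT\s*=\s*(\{.*\})', page_text, re.S)
    if m is None:
        print('window.SEARCH_RESULT not found -- the page layout may have changed')
        return []
    try:
        data = json.loads(m.group(1))
    except json.JSONDecodeError as e:
        print('invalid JSON:', e)
        return []
    # .get() avoids a KeyError when the key is absent
    results = data.get('engine_search_result')
    if results is None:
        print('engine_search_result missing; top-level keys:', list(data))
        return []
    return results

Calling datas = extract_results(r.text) in place of the xpath/json.loads pair will either return the result list or tell you exactly which assumption broke; if the printed keys don't include engine_search_result, the site has probably changed its response format or served an anti-bot page.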

npe0 | 园豆: 1299 (Level 3) | 2023-12-12 17:13