Here is part of the result captured by the regular expression:
{\"count\":4,\"sub_images\":[{\"url\":\"http:\\/\\/p1.pstatp.com\\/origin\\/615d000107f351c21106\",\"width\":753,\"url_list\":[{\"url\":\"http:\\/\\/p1.pstatp.com\\/origin\\/615d000107f351c21106\"},{\"url\":\"http:\\/\\/pb3.pstatp.com\\/origin\\/615d000107f351c21106\"},{\"url\":\"http:\\/\\/pb9.pstatp.com\\/origin\\/615d000107f351c21106\"}],\"uri\":\"origin\\/615d000107f351c21106\",\"height\":1024},{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f64abe1ff1\",\"width\":682,\"url_list\":[{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f64abe1ff1\"},{\"url\":\"http:\\/\\/pb9.pstatp.com\\/origin\\/615d000107f64abe1ff1\"},{\"url\":\"http:\\/\\/pb1.pstatp.com\\/origin\\/615d000107f64abe1ff1\"}],\"uri\":\"origin\\/615d000107f64abe1ff1\",\"height\":1024},{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f7f96f2989\",\"width\":682,
I can't run json.loads() on this data.
The error is: json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
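A minimal repro of the error, where the sample string is a shortened, hypothetical stand-in for the captured text:

import json

# the captured text still contains literal \" pairs, so at position 1 the
# parser sees a backslash where it expects an opening double quote
sample = '{\\"count\\":4,\\"sub_images\\":[]}'
json.loads(sample)  # raises JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

My full script: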
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import requests
import json
import re
import demjson
# Fetch the index page; the response body is JSON
def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)  # index page URL
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text  # return the index page response (JSON text)
        return None
    except RequestException:
        print('Error requesting the index page')
        return None
# Parse the index-page JSON and yield the detail-page URLs
def parse_page_index(html):
    data = json.loads(html)  # convert the JSON string into a dict
    if data and 'data' in data.keys():  # make sure the JSON contains the 'data' field
        for item in data.get('data'):
            yield item.get('article_url')  # detail-page URL

# Request a detail page by URL and return its HTML so the image URLs can be extracted
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text  # return the detail-page HTML
        return None
    except RequestException:
        print('Error requesting the detail page', url)
        return None
# Parse the detail-page HTML to get the title and the image URLs
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    print(title)
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)  # the parentheses need escaping
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            print(type(sub_images))
            images = [item.get('url') for item in sub_images]
            return {
                'title': title,
                'url': url,
                'images': images
            }
def main():
    html = get_page_index(0, '街拍')
    for url in parse_page_index(html):
        html = get_page_detail(url)  # request each detail page
        if html:
            result = parse_page_detail(html, url)
            print(result)

if __name__ == '__main__':
    main()
The problem is those \" sequences: the double quotes are still escaped. You can use the replace function to replace every \" with a plain ".
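For example, a minimal sketch of the change inside parse_page_detail, assuming the captured group still carries the literal backslash escapes from the page's JavaScript string (as in your sample above); raw is just a local name for the cleaned-up text:

    result = re.search(images_pattern, html)
    if result:
        # the capture comes from inside a JS string literal, so every inner
        # double quote is still written as \" ; strip those backslashes first
        raw = result.group(1).replace('\\"', '"')
        data = json.loads(raw)

After that replacement, json.loads() sees properly quoted property names and parses the gallery data normally.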
Thanks, it works now.