 悬赏园豆:50
                [已解决问题] 
            
                    解决于 2018-02-08 20:58
                悬赏园豆:50
                [已解决问题] 
            
                    解决于 2018-02-08 20:58 
                 
        以下是正则表达式获得的部分结果
{\"count\":4,\"sub_images\":[{\"url\":\"http:\\/\\/p1.pstatp.com\\/origin\\/615d000107f351c21106\",\"width\":753,\"url_list\":[{\"url\":\"http:\\/\\/p1.pstatp.com\\/origin\\/615d000107f351c21106\"},{\"url\":\"http:\\/\\/pb3.pstatp.com\\/origin\\/615d000107f351c21106\"},{\"url\":\"http:\\/\\/pb9.pstatp.com\\/origin\\/615d000107f351c21106\"}],\"uri\":\"origin\\/615d000107f351c21106\",\"height\":1024},{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f64abe1ff1\",\"width\":682,\"url_list\":[{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f64abe1ff1\"},{\"url\":\"http:\\/\\/pb9.pstatp.com\\/origin\\/615d000107f64abe1ff1\"},{\"url\":\"http:\\/\\/pb1.pstatp.com\\/origin\\/615d000107f64abe1ff1\"}],\"uri\":\"origin\\/615d000107f64abe1ff1\",\"height\":1024},{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/615d000107f7f96f2989\",\"width\":682,
对于这种数据,无法直接用 json.loads() 解析。
错误提示是json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import requests
import json
import re
import demjson
def get_page_index(offset, keyword, timeout=10):
    """Fetch one index (search result) page from Toutiao's gallery search.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text (requested with ``format=json``) when the
        server answers with HTTP 200; ``None`` on any other status code or
        when the request fails.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    # URL of the index page, with the query string appended.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Timeout errors are RequestException subclasses, so they are
        # reported by the handler below like any other request failure.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页出错')
        return None
def parse_page_index(html):
    """Yield the detail-page URLs found in an index-page JSON response.

    Args:
        html: JSON text of the index page, or ``None``/empty when the index
            request failed.

    Yields:
        Each non-empty ``article_url`` value from the objects listed under
        the top-level ``data`` key.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    # A failed index request yields None; there is nothing to parse then.
    if not html:
        return
    data = json.loads(html)
    # Only a JSON object can carry the 'data' list we are looking for.
    if not isinstance(data, dict):
        return
    # 'data' may be absent or null; treat both as an empty list.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Some entries carry no article_url; skip them instead of yielding None.
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Download the HTML of a detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    try:
        # Timeout errors are RequestException subclasses, so they are
        # reported by the handler below like any other request failure.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求详情页出错', url)
        return None
# Matches the argument of ``gallery: JSON.parse("...")`` in the page source.
# Raw string so that ``\(`` and ``\.`` are regex escapes, not string escapes.
_GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)


def _decode_gallery_payload(raw):
    """Decode the text captured from inside ``JSON.parse("...")``.

    The captured text is the body of a JavaScript string literal, so quotes
    and slashes are escaped (``\\"`` and ``\\\\/``) and it is not valid JSON
    on its own. Plain JSON is tried first; if that fails, the text is wrapped
    in quotes and decoded as a JSON string to undo the escaping, and the
    result is decoded again as the actual JSON document.

    Raises:
        json.JSONDecodeError: If the text cannot be decoded either way.
    """
    try:
        return json.loads(raw)
    except ValueError:
        return json.loads(json.loads('"' + raw + '"'))


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Args:
        html: HTML source of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``'title'``, ``'url'`` and ``'images'`` (a list with
        the ``url`` value of every ``sub_images`` entry), or ``None`` when
        the page contains no gallery data with a ``sub_images`` key.

    Raises:
        json.JSONDecodeError: If gallery data is found but cannot be decoded.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.select_one('title')
    # Pages without a <title> get an empty title instead of an IndexError.
    title = title_tag.get_text() if title_tag is not None else ''
    print(title)
    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    data = _decode_gallery_payload(match.group(1))
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    # A null 'sub_images' is treated as an empty gallery.
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def main():
    """Crawl the first index page for '街拍' and print each gallery found.

    Fetches the index page, then requests every detail page it lists and
    prints the parsed result for each page that could be downloaded.
    """
    index_html = get_page_index(0, '街拍')
    # The index request returns None on failure; there is nothing to crawl.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        # Entries without an article URL cannot be requested.
        if not url:
            continue
        detail_html = get_page_detail(url)  # one request per detail page
        if detail_html:
            result = parse_page_detail(detail_html, url)
            print(result)


if __name__ == '__main__':
    main()
这主要是因为结果里的双引号被转义成了 \"。你可以用 replace 函数把所有的 \" 替换为 ",例如 result.group(1).replace('\\"', '"'),然后再调用 json.loads()。
谢谢,已经可以了。