
How do I remove the escape sequences from content scraped off a website?

Bounty: 50 园豆 [Solved] Resolved on 2020-11-26 14:21


from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error

def first(baseurl):
    return getData(baseurl)

# Patterns for the article link, the headline, and the two body-paragraph layouts
findLink = re.compile(r'<a href="(.*?)" target="_blank">', re.S)
findTitle = re.compile(r'<!-- headline start -->(.*?)<!-- headline end -->', re.S)
findContent1 = re.compile(r'<p style="width: 100%;">(.*?)</p>', re.S)
findContent2 = re.compile(r'justify;">(.*?)</p>', re.S)
dr = re.compile(r'<[^>]+>', re.S)  # matches any leftover HTML tag

def getData(baseurl):
    # Collect the article links from the listing page
    datalist = []
    html = askUrl(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for presscolumn in soup.find_all('div', class_="presscolumn"):
        item = str(presscolumn)
        link = re.findall(findLink, item)
        datalist.append(link[0])
    return datalist

def askUrl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def getData2(baseurl2):
    # Pull the headline and body text out of a single article page
    datalist2 = []
    html = askUrl(baseurl2)
    soup = BeautifulSoup(html, "html.parser")
    for overall in soup.find_all('div', class_="overall"):
        item = str(overall)
        title = re.findall(findTitle, item)[0]
        datalist2.append(title)
        content = re.findall(findContent1, item)
        if len(content) <= 0:
            content = re.findall(findContent2, item)
        content = dr.sub('', str(content))  # strip remaining tags
        datalist2.append(content)
    return datalist2

def main():
    baseurl = input("Enter the target site URL: ")
    data = []
    baseurl2 = first(baseurl)
    for n in range(0, len(baseurl2)):
        result = getData2(baseurl2[n])
        print(result)
        data.append(result)

if __name__ == "__main__":
    main()
Guys, how do I get rid of these \ characters and the stuff after them?
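
(For context: the \n and \t sequences are real newline and tab characters captured inside the (.*?) groups; they only show up in escaped form because a whole list is being printed. A minimal cleanup sketch, assuming only whitespace runs and HTML entities need removing; the clean() helper below is illustrative, not part of the original script:)

import html
import re

def clean(text):
    # Decode HTML entities such as &nbsp; and &amp; into plain characters
    text = html.unescape(text)
    # Collapse newline/tab runs (the "\ and the stuff after it") into single spaces
    return re.sub(r'\s+', ' ', text).strip()

print(clean('\n\t  Example&nbsp;headline \r\n'))  # -> Example headline

Calling clean() on title and content before appending them in getData2 would keep the escapes out of the printed output.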

歐瀚 | Beginner Level 1 | 园豆: 5
Asked: 2020-11-26 10:25
Best answer

Post the URL of the site you're scraping. Why are you still using such ancient modules in this day and age?

Earned: 50 园豆
小小咸鱼YwY | Veteran Level 4 | 园豆: 3312 | 2020-11-26 10:37

I'm a newbie qaq
http://www.prnasia.com/m/mediafeed/rss?id=2303&t=240
I want to scrape the title and article content of every page listed there.
Thanks a lot!

歐瀚 | 园豆: 5 (Beginner Level 1) | 2020-11-26 10:41

@歐瀚:

# A junk crawler project like this does nothing to improve your scraping skills
import requests
from lxml import etree

url = 'http://www.prnasia.com/m/mediafeed/rss?id=2303&t=240'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
res = requests.get(url, headers=headers)
res_dom = etree.HTML(res.text)  # parse the listing page into an element tree
url_list = res_dom.xpath('//h3/a/@href')  # article links
title_list = res_dom.xpath('//*[@id="container"]/div/div[1]/div/div/h3[1]/text()')  # headlines
new_zip = zip(title_list, url_list)
for title, link in new_zip:
    print(f'Title: {title}\nLink: {link}\n')
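
(The snippet above only prints the listing's titles and links; the article bodies, the other half of the question, would need one more request per link. A rough sketch continuing from the variables above; the body XPath //div[@class="overall"]//p//text() is a guess keyed off the "overall" div the OP's regexes target, not verified page structure:)

import re

# Visit each article link and flatten its paragraphs into one clean string
for title, link in zip(title_list, url_list):
    article = etree.HTML(requests.get(link, headers=headers).text)
    paragraphs = article.xpath('//div[@class="overall"]//p//text()')  # assumed layout
    body = re.sub(r'\s+', ' ', ' '.join(paragraphs)).strip()  # collapse \n and \t runs
    print(f'Title: {title.strip()}\nBody: {body[:100]}\n')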
小小咸鱼YwY | 园豆: 3312 (Veteran Level 4) | 2020-11-26 13:49

@小小咸鱼YwY: Thanks! I only started learning recently, mostly following videos online.

歐瀚 | 园豆: 5 (Beginner Level 1) | 2020-11-26 14:15

@歐瀚: Work on the fundamentals: front-end basics, back-end basics, Android development.

小小咸鱼YwY | 园豆: 3312 (Veteran Level 4) | 2020-11-26 14:24

@小小咸鱼YwY: Thanks!

歐瀚 | 园豆: 5 (Beginner Level 1) | 2020-11-26 14:45