首页 新闻 搜索 专区 学院

我用python爬下来网址怎么带个括号和单引号啊

0
悬赏园豆:50 [已解决问题] 解决于 2020-11-25 10:41


我看别人最后输出的都是一串一串的网址,没有 [] 和 '' 的。
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error

import sqlite3

def main():
    """Run the scraper: fetch the feed page, print its links, then save.

    The feed URL and the output path are fixed inside this function.
    """
    baseurl = "http://www.prnasia.com/m/mediafeed/rss?id=2303&t=240"
    # getData prints the links it finds; saveData only accepts a path,
    # so the returned list is not bound to a name here.
    getData(baseurl)
    # Raw string: "\w" is not a valid escape sequence in a normal literal.
    savepath = r".\wenjian"
    saveData(savepath)

# Captures the href value of anchors written exactly as
# <a href="..." target="_blank">; the group is non-greedy so it stops at
# the first closing quote.
findLink = re.compile(
    '<a href="(.*?)" target="_blank">'
)

def getData(baseurl):
    """Download *baseurl* and collect the links found in its press columns.

    Every ``<div class="presscolumn">`` element is searched with the
    module-level ``findLink`` pattern.

    Args:
        baseurl: Address of the page to scrape.

    Returns:
        A flat list of URL strings in document order; empty when the page
        could not be fetched or contains no matching links.
    """
    datalist = []
    html = askUrl(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for presscolumn in soup.find_all('div', class_="presscolumn"):
        # re.findall returns a *list* of captured groups. Printing that list
        # directly is what shows the surrounding [] and '' characters, so
        # print each URL on its own line instead.
        links = re.findall(findLink, str(presscolumn))
        for link in links:
            print(link)
        # Keep the results; previously they went into a throwaway list and
        # the function always returned an empty datalist.
        datalist.extend(links)

    return datalist

def askUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser-like User-Agent header is sent with the request.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (the error code and/or reason is printed).
    """
    head = {
        "User-Agent": "Mozilla/5.0(Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries a status code; plain
        # URLError only carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def saveData(savepath):
    """Placeholder for persisting scraped data.

    Currently only prints a progress message; *savepath* is accepted but
    not used yet.
    """
    print("save....")

# Script entry point. The guard must compare the dunder names: a bare
# `name` is undefined (NameError), and the module name of a directly
# executed script is "__main__", not "main".
if __name__ == "__main__":
    main()

歐瀚的主页 歐瀚 | 初学一级 | 园豆:5
提问于:2020-11-24 16:20
< >
分享
最佳答案
0
import requests
from lxml import etree

# Page to scrape and a browser-like User-Agent to send with the request.
url = 'http://www.prnasia.com/m/mediafeed/rss?id=2303&t=240'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
# requests has no default timeout; without one a stalled server would
# block this script forever.
res = requests.get(url, headers=headers, timeout=10)
# Parse the HTML and select the href attribute of every <a> directly
# inside an <h3>.
res_dome = etree.HTML(res.text)
# xpath() returns a list, so this prints the links in list form.
print(res_dome.xpath('//h3/a/@href'))
收获园豆:40
小小咸鱼YwY | 老鸟四级 |园豆:2547 | 2020-11-24 17:02
其他回答(1)
0

正则表达式的 findall 返回的本来就是列表(list),直接 print 列表就会带上 [] 和 '';遍历列表逐个打印,或者取其中的元素,就只剩网址了。

收获园豆:10
yytxdy | 园豆:1318 (小虾三级) | 2020-11-24 19:33
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册