求大佬写个爬虫并发布在 GitHub 上,我可以捐款 30 块。
只需要爬取两种语言:EN 和 RU。
爬取 Khan Academy 所有视频的字幕(transcripts)。
爬虫类别
爬取步骤:先进入 Math,再进入二级目录,例如:
Early math review
>进入目录 Unit 1
>点击播放图标,页面底部的 Video transcript
就是字幕。
把 Early math review
下所有章节的字幕合并成一个文件,例如 Early math review.txt。我用它来制作 AnkiDroid 或 https://github.com/VaibhavCodeClub/learn 的学习列表。
import requests
from bs4 import BeautifulSoup
# Language codes to scan; each one is substituted into the URL path below.
languages = ["EN", "RU"]

# requests has no default timeout, so without this a stalled connection
# would block the script forever. A timeout surfaces as a RequestException.
REQUEST_TIMEOUT_SECONDS = 30

# For every language: fetch the transcript listing page, then print the title
# and link of each entry found on it.
# NOTE(review): the URL path and the CSS class names used below are assumed;
# confirm them against the live site's markup.
for language in languages:
    url = f"https://www.khanacademy.org/{language}/video-transcripts"

    # Keep the try body to the network call only, so that parsing problems
    # are never misreported as download failures.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # turn 4xx/5xx responses into an exception
    except requests.exceptions.RequestException as e:
        print(f"在获取 {language} 语言页面时发生错误: {e}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    transcripts = soup.find_all("div", class_="video-transcript")

    for transcript in transcripts:
        try:
            title = transcript.find("h3", class_="video-title").text.strip()
            link = transcript.find("a", class_="video-link")["href"]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: the <h3> is missing (None.text).
            # TypeError: the <a> is missing (None["href"]).
            # KeyError: the <a> exists but has no href attribute.
            # Skip the malformed entry instead of aborting the whole crawl.
            print(f"在 {language} 语言页面中,获取标题或链接时出现错误")
            continue

        print(f"语言: {language}")
        print(f"视频标题: {title}")
        print(f"视频链接: {link}")
        print("=" * 50)