# -*- coding: utf-8 -*-
"""Repeatedly load one tianyancha.com company page through PhantomJS.

Each of the 50 iterations starts a fresh PhantomJS session with the
capabilities in ``dcap`` and the command-line switches in ``proxy``, loads
``url``, waits a fixed five seconds, overwrites ``logs/2.html`` with the
page source and prints the iteration index followed by the page title.
"""
import io
from time import sleep

from lxml import html
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

url = 'http://www.tianyancha.com/company/638562997'

# Earlier experiment, kept disabled: fetch the page with plain ``requests``
# and print its title, body length and HTTP status code.
# for i in range(0, 100000):
#     data = requests.get(url)
#     dataHtml = html.fromstring(data.content)
#     print dataHtml.xpath('//title/text()')[0], len(data.content), data.status_code

# Start from the stock PhantomJS capabilities and extend them. The previous
# code rebound ``dcap`` to a brand-new dict, which silently threw the
# defaults away.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap.update({
    "phantomjs.page.settings.userAgent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
    ),
    "phantomjs.page.settings.loadImages": False,
    "phantomjs.page.settings.resourceTimeout": 5000,
})

# Command-line switches handed to the PhantomJS executable.
proxy = [
    '--proxy=120.27.142.209:82',
    '--proxy-type=http',
    '--ignore-ssl-errors=true',
    '--ssl-protocol=tlsv1',
]

for i in range(0, 50):
    driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=proxy)
    # driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Disabled alternative to the fixed sleep below. Note that the
        # locator has to be passed as ONE (by, value) tuple:
        # wait = WebDriverWait(driver, 10)
        # a = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.datatable')))
        sleep(5)
        # io.open encodes on write and behaves the same on Python 2 and 3;
        # the ``with`` block guarantees the file is flushed and closed.
        with io.open('logs/2.html', 'w', encoding='utf8') as page_file:
            page_file.write(driver.page_source)
        print('%d %s' % (i, driver.title))
    finally:
        # Shut the browser process down every iteration; otherwise each
        # pass through the loop leaves another PhantomJS instance running.
        driver.quit()
问题1:
如果我将
driver=webdriver.PhantomJS(desired_capabilities=dcap,service_args=proxy)
换成
driver=webdriver.PhantomJS(desired_capabilities=dcap),即不使用代理ip的话,最后爬取的结果里就有js动态加载的内容
问题2:
如果我将url换成含有中文字符的url : http://www.tianyancha.com/search/%E6%B1%95%E5%A4%B4%E9%87%91%E7%9F%B3%E5%88%B6%E8%8D%AF%E6%80%BB%E5%8E%82,
使用phantomjs爬取下来的内容并不包含有我需要的那部分信息的内容,也只是个空架子。这是为什么?谢谢!!!
您好,请问这个问题解决了吗