# -*-coding:utf-8-*- '''# 9.18 url = 'http://fjrb.fjsen.com/nasb/html/2017-09/18/node_122.htm' # 第一版 urlend = 'http://fjrb.fjsen.com/nasb/html/2017-09/18/node_131.htm' # 第十版 # 9.20 url = 'http://fjrb.fjsen.com/nasb/html/2017-09/20/node_122.htm' # 第一版 url = 'http://fjrb.fjsen.com/nasb/html/2017-09/20/node_129.htm' # 第十版 ''' import time from selenium import webdriver import pymysql import uuid class mainAll(object): def __init__(self): self.conn = pymysql.connect(host='localhost', user='root', passwd='123', db='tianyan', port=3306, charset='utf8') self.cur = self.conn.cursor() # 获取一个游标 self.main() self.cur.close() self.conn.close() def main(self): # 获取当前年月日 y = time.strftime('%Y', time.localtime(time.time())) # 年 m = time.strftime('%m', time.localtime(time.time())) # 月 d = time.strftime('%d', time.localtime(time.time())) # 日 data_time = time.strftime('%Y-%m-%d', time.localtime(time.time())) # 抓取时间 data_time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) website = '海丝商报' # 创建相应时间的url地址 url = 'http://fjrb.fjsen.com/nasb/html/%s-%s/%s/node_122.htm' % (y, m, d) driver = webdriver.Chrome() driver.get(url) # 找到版面数 sheets = driver.find_element_by_xpath("//table[@cellpadding='2']") sheets_len = len(sheets.find_elements_by_tag_name('tr')) # 找到每个版面的标题数量 for sheet in range(sheets_len): titles = driver.find_element_by_xpath("//table[@cellpadding='1']") titles_len = int(len(titles.find_elements_by_tag_name('tr')) / 2) content_type = driver.find_element_by_xpath("//table[@cellpadding='2']").find_elements_by_tag_name('tr') content_type = content_type[sheet].text.split(':')[-1] # 以冒号为分隔符切开版面的文字 # 点击版面的第一篇文章 title_button = driver.find_element_by_xpath("//*[@id='demo']/table[1]/tbody/tr[3]/td[2]/table/tbody/" "tr[4]/td/table/tbody/tr/td[2]/table/tbody/tr[1]/td/table/" "tbody/tr[4]/td/div/table/tbody/tr[1]/td[2]/a") title_button.click() for title in range(titles_len): # 找到主标题和子标题的table表 title_table = driver.find_element_by_xpath( "//*[@id='demo']/table/tbody/tr[3]/td[2]/table/tbody/tr[4]//tr") content_title = title_table.find_elements_by_tag_name('p')[0].text content_subtitle = title_table.find_elements_by_tag_name('p')[1].text content = driver.find_element_by_xpath("//table[@class='content_tt']").text # 获取左下角每一版的所有标题的链接 content_id = driver.find_elements_by_xpath("//*[@id='demo']/table/tbody/tr[3]/td[1]/table/tbody/tr[3]/" "td/table//a") content_id = content_id[title].get_attribute('href') content_id = content_id.split('content_')[-1].split('.')[0] # 正则表达式没有处理成功!!!!! # content_id = driver.current_url # 'http://fjrb.fjsen.com/nasb/html/2017-09/21/content_1055929.htm?div=-1' idd = str(uuid.uuid1()) idd.replace('-', '') # 新闻时间和爬取时间是一个时候 sentiment_source 和sentiment_website是同一处理的 lists = (idd, content_title, content_subtitle, website, data_time, url, website, data_time_now, content, content_id, content_type) self.conn(lists) driver.find_elements_by_xpath("//a[@class='preart']")[-1].click() # 点击下一篇章 # 当把一版的所有标题都走完以后,点击下一版,回到外层循环的页面 if title == titles_len - 1 and sheet == 0: driver.find_elements_by_xpath("//a[@class='preart']")[0].click() elif title == titles_len - 1: driver.find_elements_by_xpath("//a[@class='preart']")[1].click() elif title == 0 and sheet == 0: flag = self.judge(content_id) if flag > 0: break # 我这里的break会不会让定时程序都停止了 driver.close() def conn(self, table): # 名称 职位 公司名称 entuid sql = "INSERT INTO sentiment_info (sentiment_id, sentiment_title, sentiment_subtitle, sentiment_source," \ "sentiment_time, sentiment_url,sentiment_website,sentiment_create_time,sentiment_content," \ "sentiment_source_id,sentiment_type) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s'," \ " '%s','%s')" self.cur.execute(sql % table) self.conn.commit() # 第一页点击 driver.find_element_by_xpath("//a[@class='preart']").click() 即可到下一页 def judge(self, content_id): sql = "SELECT COUNT(*) FROM sentiment_info WHERE sentiment_source='海丝商报' AND sentiment_type='要闻'" \ " AND sentiment_source_id=%s", content_id self.cur.execute(sql) a = self.cur.fetchall() a = max(max(a)) self.conn.commit() return a if __name__ == '__main__': mainAll()
main()函数内部调用conn()和judge(),但是在main()函数内部走到调用conn那一步就报错,报错内容为,请问这个类哪里写错了?
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py" --multiproc --qt-support --client 127.0.0.1 --port 53782 --file D:/pyworkpeace/HaiSi warning: Debugger speedups using cython not found. Run '"C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe" "D:\pycharm\PyCharm 2016.3\helpers\pydev\setup_cython.py" build_ext --inplace' to build. pydev debugger: process 7620 is connecting Connected to pydev debugger (build 163.8233.8) Traceback (most recent call last): File "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py", line 1596, in <module> globals = debugger.run(setup['file'], None, None, is_module) File "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py", line 974, in run pydev_imports.execfile(file, globals, locals) # execute the script File "D:\pycharm\PyCharm 2016.3\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile exec(compile(contents+"\n", file, 'exec'), glob, loc) File "D:/pyworkpeace/HaiSi", line 110, in <module> mainAll() File "D:/pyworkpeace/HaiSi", line 20, in __init__ self.main() File "D:/pyworkpeace/HaiSi", line 75, in main self.conn(lists) TypeError: 'Connection' object is not callable Process finished with exit code 1
Cython加速调试没有打开。Cython是Python的一个扩展模块工具,采用Python和C语言混合编成,用于加速Python计算速度。
解决方法:
1. 打开终端:找到“Pycharm/pycharm-community-2016.2.2/helpers/pydev”完整路径,然后cd到那里,比如我的是:“cd madd/soft/Pycharm/pycharm-community-2016.2.2/helpers/pydev”;
2. 输入:“python setup_cython.py build_ext --inplace”,回车;