首页新闻找找看学习计划

Python: download users' large avatar images (part 2)

0
[已关闭问题] 关闭于 2016-01-26 17:38

# Proxy credentials and address used to build the urllib2 opener.
proxy_info = {'user': 'xxxxxxx', 'password': 'xxxx%xxxx', 'server': 'openproxy.xxxxxxx.com:8080'}
if enable_proxy:
    opener = url_build_proxy_opener(proxy_info)
else:
    # No proxy requested: use a plain opener. url_build_proxy_opener indexes
    # into proxy_info, so it must not be called with None (and `null` is not
    # a Python name at all).
    proxy_info = None
    opener = urllib2.build_opener()
# Install globally so every later urllib2.urlopen() call goes through it.
urllib2.install_opener(opener)

# Find the link to each poster's home page on the "ing" (flash message) page.
gethtml = get_html(url)
userhomepage_patten = '<a href="(.+?)" class="ing-author" target="_blank">'
userhomepage = re.findall(userhomepage_patten, gethtml)

# Collect (user id, display name) pairs for every poster on the page, e.g.
# <a href="http://home.cnblogs.com/u/775401/" class="ing-author" target="_blank">RosonJ</a>
user_patten = '<a href="http://home.cnblogs.com/u/(.+?)/" class="ing-author" target="_blank">(.+?)</a>'
users = re.findall(user_patten, gethtml)
print(users)

# Every path written so far; a name that is already present gets a numbered
# "(j)" suffix instead of overwriting the earlier file.
CheckPathList = []
newpath = '0'
j = 1

# Fetch each user's home page, locate the large avatar image and save it.
# zip() pairs the two independently matched lists and stops at the shorter
# one, so a length mismatch cannot raise IndexError.
for i, (homepage, user) in enumerate(zip(userhomepage, users)):
    username = user[1]
    newgethtml = get_html(homepage)
    # The display name is arbitrary text, so escape it before embedding it
    # in the pattern; otherwise characters such as "(" or "+" break the regex.
    bigpic_patten = '<img src="(.+?)" alt=' + '"' + re.escape(username) + '的头像"' + ' class="img_avatar">'
    bigpic = re.findall(bigpic_patten, newgethtml)
    print(bigpic)
    if not bigpic:
        # No avatar found on this home page: skip the user instead of
        # crashing on bigpic[0].
        continue
    imgurl = bigpic[0]

    # Name the file after the user's display name.
    fileSavePath = file_dir + "/" + username + ".png"
    print(fileSavePath)

    print("正在下载第" + str(i + 1) + "个图片...")

    if fileSavePath in CheckPathList:
        # Same display name seen before: save under a numbered variant.
        fileSavePath_repeat = file_dir + "/" + username + "(" + str(j) + ")" + ".png"
        downimg(imgurl, fileSavePath_repeat)
        newpath = updatefilesavepath(fileSavePath_repeat)
        j += 1
        print("j = " + str(j))
    else:
        downimg(imgurl, fileSavePath)
        newpath = updatefilesavepath(fileSavePath)
    # Record whichever path was actually written.
    CheckPathList.append(newpath)

    print("newpath = " + newpath)
    print("下载完成,准备下一个")
    # Pause between downloads to avoid hammering the server.
    time.sleep(1)
    print("")
    print("")
west_Tang风的主页 west_Tang风 | 菜鸟二级 | 园豆:201
提问于:2016-01-26 17:35
< >
分享
所有回答(1)
0

Python: download users' large avatar images (part 1)

# coding=utf-8
import os
import re
import time
import sys
import json
import urllib
import urllib2
import codecs

def get_newdir(dir_name):
    """Return the path of ``dir_name`` under the current working directory.

    Backslashes in the working directory are converted to forward slashes,
    and the result is joined with a single "/". The directory itself is not
    created here.
    """
    base = os.getcwd().replace('\\', '/')
    return '/'.join((base, dir_name))

def get_html(url):
    """Fetch ``url`` with urllib2 and return the raw response body.

    The request goes through whatever opener is currently installed via
    ``urllib2.install_opener``. Errors raised by ``urllib2.urlopen``
    propagate to the caller.
    """
    req = urllib2.Request(url)
    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Python 2 responses are not context managers; close explicitly so
        # the underlying socket is released even if read() fails.
        res.close()

def downimgs(url, filepath):
    """Download ``url`` straight to ``filepath`` using urllib.urlretrieve."""
    urllib.urlretrieve(url, filename=filepath)

def downimg(imgurl, filepath):
    """Download the image at ``imgurl`` and write it to ``filepath``.

    ``filepath`` is decoded from UTF-8 before opening, so it is expected to
    be a UTF-8 encoded byte string (file names here are built from user
    display names taken from the page).
    """
    fpath = filepath.decode("utf-8")
    # Fetch first: a failed download then leaves no empty file behind.
    # (The original called an undefined `getHtml`; the helper is `get_html`.)
    data = get_html(imgurl)
    with open(fpath, 'wb') as f:
        f.write(data)

def updatefilesavepath(savepath):
    """Return ``savepath`` unchanged (identity helper used by the download loop)."""
    return savepath

def url_build_proxy_opener(proxy_info):
    """Build a urllib2 opener that sends HTTP traffic through an authenticated proxy.

    ``proxy_info`` is a dict with the keys ``'server'`` (host:port),
    ``'user'`` and ``'password'``. The returned opener is not installed;
    the caller decides whether to pass it to ``urllib2.install_opener``.
    """
    server = proxy_info['server']

    # Credentials apply to any realm (None) presented by the proxy server.
    credentials = urllib2.HTTPPasswordMgrWithDefaultRealm()
    credentials.add_password(None, server, proxy_info['user'], proxy_info['password'])

    proxy_handler = urllib2.ProxyHandler({'http': server})
    auth_handler = urllib2.ProxyBasicAuthHandler(credentials)
    return urllib2.build_opener(proxy_handler, auth_handler)

if __name__ == "__main__":
    for pageIndex in range(1, 100):
        # The feed is filled in by a dynamic JS request; query that endpoint
        # directly, changing the page index on every iteration.
        url = "http://ing.cnblogs.com/ajax/ing/GetIngList?IngListType=all&PageIndex=" + str(pageIndex) + "&PageSize=30"
        # Access restrictions require going through the local proxy.
        enable_proxy = True
        # Downloaded pictures go into a folder next to the working directory
        # the script is run from; create it on first use.
        dir_name = "BlogsPictures_test"
        file_dir = get_newdir(dir_name)

        if not os.path.exists(file_dir):
            os.mkdir(file_dir)

west_Tang风 | 园豆:201 (菜鸟二级) | 2016-01-26 17:37
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册