返回信息流2015年1月4日16:31:02更新说明:
1:怎么能快速地保存mm图片呢?我曾经试验过多进程和gevent,但结果和单进程直接下载相比,耗时没有减少,有时反而增加了。原因是多方面的,一部分是网速的原因,但主要原因是我用的请求方法urllib2和requests都不是异步的,所以效率上没有提升。
2:所以要用到异步框架,这里主要用到的是scrapy框架。
下图是项目结构
几点说明:
1:scrapy框架中有个ImagesPipeline,需要自己实现一个子类,为的是自定义图片文件夹名称,在这个地方走过弯路,需要有两个Pipeline,一个是在新建项目的时候,自动生成的class TaommPipeline(object),另一个则是需要我们实现的子类class MMImagesPipeline(ImagesPipeline),我当时在做的时候,把这两个类合并了,所以出现错误。
2:settings中的ITEM_PIPELINES = {'taomm.pipelines.TaommPipeline': 1, 'taomm.pipelines.MMImagesPipeline': 2}顺序很重要,决定了数据先经过谁处理
主要文件:
mmspider.py文件:
# coding:utf-8
import re, os, requests, sys, uuid, hashlib, time
from scrapy import Spider
from bs4 import BeautifulSoup
from taomm.items import TaommItem
# Python 2 idiom: reload sys, then set the interpreter-wide default string
# encoding to UTF-8. The two statements must stay in this order.
reload(sys)
sys.setdefaultencoding('utf-8')
# Captures the URL inside every src="http://img....jpg" attribute.
rc = re.compile(r'src=\"(http\:\/\/img[\w\/!\-\.]*\.jpg)\"') # regex matching all images, compiled once ahead of time
class mmspider(Spider):
    """Scrapy spider that yields one ``TaommItem`` per profile page.

    On construction the spider downloads the ranking list (``page_url``),
    records each profile URL together with the displayed name, and uses those
    profile URLs as ``start_urls``. ``parse`` then extracts the image URLs
    found inside the ``J_AixiuShow`` div of each profile page.
    """

    name = 'taomm'
    allowed_domains = ['taobao.com']
    # Class-level defaults are kept for backward compatibility; __init__ gives
    # every instance its own containers so state is never shared.
    # For debugging a single profile, e.g. ['http://mm.taobao.com/687471686.htm'].
    start_urls = []
    mm_url_name_dict = {}
    page_url = 'http://mm.taobao.com/json/request_top_list.htm?page=%s'
    hello = ''

    def __init__(self, *args, **kwargs):
        """Pre-load the names and profile URLs listed on the ranking pages.

        Positional and keyword arguments are forwarded to ``Spider.__init__``
        so that arguments supplied by Scrapy are not silently dropped.
        """
        super(mmspider, self).__init__(*args, **kwargs)
        # Per-instance containers instead of mutating the shared class lists.
        self.start_urls = []
        self.mm_url_name_dict = {}
        for page_index in range(1, 2):  # only the first page
            p_url = self.page_url % page_index
            bs_html = BeautifulSoup(requests.get(p_url).content.decode('GBK'))
            div_personal_info_list = bs_html.find_all('div', {'class': 'personal-info'})
            for personal_info in div_personal_info_list:
                name_tag = personal_info.find('a', {'class': 'lady-name', 'target': '_blank'})
                avatar_tag = personal_info.find('a', {'class': 'lady-avatar', 'target': '_blank'})
                if name_tag is None or avatar_tag is None:
                    # Entry without both anchors: skip it instead of crashing
                    # on None.get_text() / None.get().
                    continue
                mm_name = name_tag.get_text()
                mm_href = avatar_tag.get('href')
                self.mm_url_name_dict[mm_href] = mm_name
                self.start_urls.append(mm_href)

    def parse(self, response):
        """Build the item for one profile page.

        Returns a ``TaommItem`` whose ``mm_name`` is the name recorded for
        ``response.url`` and whose ``image_urls`` is the list of URLs matched
        by ``rc`` inside the ``J_AixiuShow`` div.

        Raises ``KeyError`` when ``response.url`` is not one of the URLs
        recorded in ``__init__``.
        """
        mm_html = BeautifulSoup(response.body).find('div', {'id': 'J_AixiuShow'})
        # str(None) contains no match, so a missing div yields an empty list.
        image_urls = re.findall(rc, str(mm_html))
        mm = TaommItem()
        mm['mm_name'] = self.mm_url_name_dict[response.url]
        mm['image_urls'] = image_urls
        return mm
if __name__ == '__main__':
    # Benchmark: download the same images sequentially with requests (no
    # Scrapy) and print the elapsed seconds, for comparison with the spider.
    start_time = time.time()
    start_urls = []
    mm_url_name_dict = {}
    page_url = 'http://mm.taobao.com/json/request_top_list.htm?page=%s'
    for page_index in range(1, 2):  # only the first page
        p_url = page_url % page_index
        bs_html = BeautifulSoup(requests.get(p_url).content.decode('GBK'))
        div_personal_info_list = bs_html.find_all('div', {'class': 'personal-info'})
        for personal_info in div_personal_info_list:
            name_tag = personal_info.find('a', {'class': 'lady-name', 'target': '_blank'})
            avatar_tag = personal_info.find('a', {'class': 'lady-avatar', 'target': '_blank'})
            if name_tag is None or avatar_tag is None:
                # Skip entries that lack either anchor instead of crashing.
                continue
            mm_name = name_tag.get_text()
            mm_href = avatar_tag.get('href')
            mm_url_name_dict[mm_href] = mm_name
            start_urls.append(mm_href)
    # Single-argument print(...) behaves the same as the print statement.
    print('page 1 is over')
    for mm_url in start_urls:
        mm_name = mm_url_name_dict[mm_url]
        path = '../../pics/%s' % mm_name
        # os.makedirs raises OSError when the folder already exists, which
        # made every second run of the benchmark fail.
        if not os.path.isdir(path):
            os.makedirs(path)
        mm_html = BeautifulSoup(requests.get(mm_url).content).find('div', {'id': 'J_AixiuShow'})
        image_urls = re.findall(rc, str(mm_html))
        for img_url in image_urls:
            resp = requests.get(img_url)
            # File name is the SHA-1 hex digest of the image URL.
            image_guid = hashlib.sha1(img_url).hexdigest()
            image_path = '../../pics/%s/%s.jpg' % (mm_name, image_guid)
            with open(image_path, 'wb') as f:
                f.write(resp.content)
        print('%s over' % mm_name)
    end_time = time.time()
    print(end_time - start_time)
pipelines.py文件
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http.request import Request
class TaommPipeline(object):
    """Pass-through item pipeline: hands every item on untouched."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        # Testing aid: trimming the list here, e.g.
        # item['image_urls'] = item['image_urls'][1:2], requests only one
        # image and shows whether this pipeline is active.
        unchanged = item
        return unchanged
class MMImagesPipeline(ImagesPipeline):
    """Images pipeline that files each picture under the item's ``mm_name``."""

    def get_media_requests(self, item, info):
        """Yield one ``Request`` per entry of ``item['image_urls']``.

        ``file_path`` has no access to the item, so the name travels with
        each request in its ``meta`` dict.
        """
        for image_url in item['image_urls']:
            request_meta = {'mm_name': item['mm_name']}
            yield Request(image_url, meta=request_meta)

    def item_completed(self, results, item, info):
        """Delegate unchanged to the parent implementation."""
        outcome = super(MMImagesPipeline, self).item_completed(results, item, info)
        return outcome

    def file_path(self, request, response=None, info=None):
        """Return the parent's path with its first 'full' swapped for the name."""
        default_path = super(MMImagesPipeline, self).file_path(request, response, info)
        folder_name = request.meta['mm_name']  # name stored by get_media_requests
        return default_path.replace('full', folder_name, 1)
items.py文件
from scrapy import Item, Field
class TaommItem(Item):
    """One scraped profile: a name plus the list of its image URLs."""

    # Display name; also used as the image folder name.
    mm_name = Field()
    # A list of image URLs.
    image_urls = Field()
    # Not written by the spider; presumably filled by the images pipeline
    # (TODO confirm).
    images = Field()
settings.py文件
# Scrapy project settings for the taomm crawler.
BOT_NAME = 'taomm'

SPIDER_MODULES = ['taomm.spiders']
NEWSPIDER_MODULE = 'taomm.spiders'

# The stock alternative would be
# {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}.
# The numbers fix the processing order: TaommPipeline (1) handles an item
# before MMImagesPipeline (2).
ITEM_PIPELINES = {
    'taomm.pipelines.TaommPipeline': 1,
    'taomm.pipelines.MMImagesPipeline': 2,
}

# Root directory for the downloaded images.
IMAGES_STORE = './mms'
最后奉上源码
附件(15.1KB) taomm.zip
下一步工作和感慨:
1:校园网收费了,怎么办啊……………………我的大长腿
2:分布式
3:让spider动起来,crawlspider
【更新】版本2采用多线程,一页有10个mm,然后申请10个线程分别下载每个美眉的所有图片。代码在下面
思路还是以上思路,出现的问题:
1:没有体现出多线程的优势来啊,加入多线程之后的统计了下时间,还是和当时单线程的时间差不多。
2:即使采用我所谓的多线程,总是有部分图片因为下载不完整,直接把线程给阻塞了,然后图片是这样的
我的问题:
1:采用多线程下载,一个线程下载一个mm的,我的思路错了吗?
2:urllib.urlretrieve是线程安全的吗?为什么会出现被打断,然后导致图片破损数据不完整
谢谢!
代码是:
#!/usr/bin/env python
#coding: utf-8
#多线程
import sys,os,urllib,urllib2,re,codecs,bs4,threading,time,random,socket
# Default timeout, in seconds, for every socket created afterwards.
socket.setdefaulttimeout(30)
# Ranking-list endpoint; the page number is appended to it.
base_url = 'http://mm.taobao.com/json/request_top_list.htm?page='
# Built with os.path.join instead of hard-coded '\\pics\\' so the download
# root is a real sub-directory of the working directory on every platform.
base_dir = os.path.join(os.getcwd(), 'pics')
# Regex matching all image URLs, compiled once ahead of time.
rc = re.compile(r'src=\"(http\:\/\/img[\w\/!\-\.]*\.jpg)\"')
# A global lock was tried and left disabled:
#lock=threading.Lock()
def download(mm_name, mm_url):
    """Download all showcase images of one profile into its own folder.

    Args:
        mm_name: display name; used as the folder name below ``base_dir``.
        mm_url: URL of the profile page to scan for images.

    Images are saved as ``1.jpg``, ``2.jpg``, ... in page order. An image
    whose transfer fails is reported, its partial file is removed, and the
    remaining images are still fetched.
    """
    path_name = os.path.join(base_dir, mm_name)
    if not os.path.exists(path_name):
        os.mkdir(path_name)  # one folder per name
    print(path_name)
    mm_page = urllib2.urlopen(mm_url, timeout=60)
    try:
        mm_html = mm_page.read()
    finally:
        mm_page.close()  # do not leak the connection
    # The profile page has two image areas; only this one is wanted.
    mm_html = bs4.BeautifulSoup(mm_html).find('div', {'id': 'J_AixiuShow'})
    pics = re.findall(rc, str(mm_html))  # extract the URLs with the regex
    print('-------name: ' + mm_name + ' images count: ' + str(len(pics)) + '------')
    print('-------name:' + path_name + '------begin download-------------')
    for count, pic_url in enumerate(pics, 1):  # count is the image number
        pic_name = os.path.join(path_name, str(count) + '.jpg')
        print('-------pic name:' + pic_name)
        try:
            urllib.urlretrieve(pic_url, pic_name)
        except IOError as err:
            # In Python 2, socket timeouts and urllib.ContentTooShortError
            # are IOError subclasses. Without this handler one bad transfer
            # ended the whole thread and left a truncated image on disk.
            print('-------pic failed:' + pic_name + ' (' + str(err) + ')')
            if os.path.exists(pic_name):
                os.remove(pic_name)
    print('-------name:' + path_name + '------finish download------------')
class DownloadThread(threading.Thread):
    """Thread that downloads the images of a single profile."""

    def __init__(self, mm_name, mm_url):
        """Remember the name and profile URL handed to ``download`` in run()."""
        threading.Thread.__init__(self)
        self.mm_name = mm_name
        self.mm_url = mm_url

    def __unicode__(self):
        """Return "<name> <url>" as text.

        The previous version returned a tuple, which makes ``unicode(thread)``
        raise TypeError.
        """
        return u'%s %s' % (self.mm_name, self.mm_url)

    def run(self):
        """Thread body: download every image of this profile."""
        download(self.mm_name, self.mm_url)
def get_mms(start_page, end_page):
    """Download the images of every profile on the given ranking pages.

    Args:
        start_page: first page number to fetch (inclusive).
        end_page: page number to stop at (exclusive).

    For each page one ``DownloadThread`` per profile is started, and all
    threads of that page are joined before the next page is fetched.
    """
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)  # create the download root
    for i in range(start_page, end_page):
        page_num = str(i)
        print('-------page:' + page_num + '----------------')
        list_url = base_url + page_num
        list_page = urllib2.urlopen(list_url)
        try:
            list_html = list_page.read().decode('GBK')  # avoids garbled Chinese text
        finally:
            list_page.close()  # do not leak the connection
        bs_html = bs4.BeautifulSoup(list_html)
        # All <a> tags carrying a name.
        list_mm_tag_a = bs_html.find_all('a', {'class': 'lady-name'})
        # All <a> tags linking to a profile's photo page.
        list_mm_url = bs_html.find_all('a', {'class': 'lady-avatar', 'target': '_blank'})
        threads = []
        # zip pairs names with URLs and stops at the shorter list. Indexing
        # both lists by range(len(names)) raised IndexError on a mismatch.
        for name_tag, url_tag in zip(list_mm_tag_a, list_mm_url):
            mm_name = name_tag.get_text()
            mm_url = url_tag.get('href')
            # One thread per profile; a page holds about 10 of them.
            t = DownloadThread(mm_name, mm_url)
            threads.append(t)
            time.sleep(random.random())  # wait up to one second before each start
            t.start()
        for t in threads:
            t.join()
if __name__ == '__main__':
    # Time one run over ranking page 1 and print the elapsed seconds.
    started_at = time.time()
    get_mms(1, 2)
    elapsed = time.time() - started_at
    print(elapsed)
话说,撸主喜欢大长腿妹纸,加上近来学习python,所以就爬淘宝mm的图片,先上图片
图片按照mm的姓名分文件夹,
然后在淘宝页面中用BeautifulSoup转化一下,然后用正则匹配
代码如下
#!/usr/bin/env python
#coding: utf-8
import sys,os,urllib,urllib2,re,codecs,bs4
# Ranking-list endpoint; the page number is appended to it.
base_url = 'http://mm.taobao.com/json/request_top_list.htm?page='
# Built with os.path.join instead of hard-coded '\\pics\\' so the download
# root is a real sub-directory of the working directory on every platform.
base_dir = os.path.join(os.getcwd(), 'pics')
def get_mms(start_page, end_page):
    """Sequentially download the images of every profile on the given pages.

    Args:
        start_page: first ranking page to fetch (inclusive).
        end_page: page number to stop at (exclusive).

    Each profile gets a folder named after it below ``base_dir``; its images
    are saved there as ``1.jpg``, ``2.jpg``, ... in page order.
    """
    # Regex matching all image URLs, compiled once ahead of time.
    rc = re.compile(r'src=\"(http\:\/\/img[\w\/!\-\.]*\.jpg)\"')
    for i in range(start_page, end_page):
        page_num = str(i)
        print('-------page:' + page_num + '----------------')
        list_url = base_url + page_num
        list_page = urllib2.urlopen(list_url)
        try:
            list_html = list_page.read().decode('GBK')  # avoids garbled Chinese text
        finally:
            list_page.close()  # do not leak the connection
        bs_html = bs4.BeautifulSoup(list_html)
        list_mm_tag_a = bs_html.find_all('a', {'class': 'lady-name'})
        list_mm_url = bs_html.find_all('a', {'class': 'lady-avatar', 'target': '_blank'})
        if not os.path.exists(base_dir):
            os.mkdir(base_dir)  # create the download root
        # zip pairs names with URLs and stops at the shorter list. Indexing
        # both lists by range(len(names)) raised IndexError on a mismatch.
        for name_tag, url_tag in zip(list_mm_tag_a, list_mm_url):
            name = name_tag.get_text()
            path_name = os.path.join(base_dir, name)
            if not os.path.exists(path_name):
                os.mkdir(path_name)  # one folder per name
            mm_url = url_tag.get('href')
            mm_page = urllib2.urlopen(mm_url)
            try:
                mm_html = mm_page.read()
            finally:
                mm_page.close()
            # The profile page has two image areas; only this one is wanted.
            mm_html = bs4.BeautifulSoup(mm_html).find('div', {'id': 'J_AixiuShow'})
            pics = re.findall(rc, str(mm_html))  # extract the URLs with the regex
            print('-------name:' + name + '------begin download-------------')
            for count, pic_url in enumerate(pics, 1):  # count is the image number
                pic_name = os.path.join(path_name, str(count) + '.jpg')
                print('-------pic name:' + pic_name)
                urllib.urlretrieve(pic_url, pic_name)
            print('-------name:' + name + '------finish download------------')
if __name__ == '__main__':
    # Fetch ranking page 2 only (the end page is exclusive).
    first_page, stop_page = 2, 3
    get_mms(first_page, stop_page)
这是一条镜像帖。来源:北邮人论坛 / python / #4379同步于 2014/11/21
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Python机器人发帖
【更新】基于scrapy框架的淘宝美眉图片爬虫
WTF
2014/11/21镜像同步36 回复
订阅后,新回复会通过你的通知中心匿名送达。
9 条回复