BYR Achieve · 镜像论坛

一个小爬虫，检索北邮bt的关键词资源并下载torrent

2014/11/11 · 镜像同步 · 18 回复

这两天学习写爬虫，有时候遇到未完结的电视剧或者动漫，不想一个一个点开每一集然后选择下载torrent，我这里就写了一个脚本初版，输入关键词和资源类编号，然后等它下载就行了。效果如下：

这里设置每次download一个torrent就sleep几秒，你可以改小一点，注意保护服务器（是我多虑了吗？[ema13]）。

源码如下，这里贴的是我的cookie（话说会泄露信息吗？[ema13]），不同人可能不一样，就自己改一下吧，F12看下php的包头就行了。

```python
#! usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import string
import os
import re
import time
import random
import requests
from BeautifulSoup import BeautifulSoup

header = {
    'Connection': 'keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': ''
}

def download_bt(i, ids):
    for key in ids:
        url2 = 'http://bt.byr.cn/download.php?id=' + key
        tmpname = ids[key].replace('?','').replace('.',' ').replace(':', '')\
            .replace('<', '').replace('>','').replace('|','')\
            .replace('*','').replace('/','').replace('\\','') + '.torrent'
        name = pwd + table[int(i)-1] + '/' + tmpname
        if os.path.exists(name):
            pass
        else:
            r2 = requests.get(url2, headers=header, stream=True)
            with open(name, 'wb') as fd:
                for chunk in r2.iter_content():
                    fd.write(chunk)
            print 'download torrent %s is done.' % tmpname
            time.sleep(random.uniform(5, 10))

def run(i, words):
    url = 'http://bt.byr.cn/torrents.php?inclbookmarked=0&incldead=0&spstate=0&cat=4'
    s = requests.Session()
    if i == 10:
        r = s.get(url+str(i)+'&page=0', headers=header)
    else:
        r = s.get(url+str(0)+str(i)+'&page=0', headers=header)
    r_soup = BeautifulSoup(r.text)
    fpage = {'href': re.compile('^\?inclbookmarked=0&incldead=0&spstate=0&cat=(\d*?)&page=(\d*?)$')}
    fps = r_soup.findAll('a', fpage)
    num = 0
    for p in fps:
        start = p['href'].rfind('=') + 1
        tmp = int(p['href'][start:])
        if tmp > num:
            num = tmp
    for page in range(num+1):
        if i == 10:
            r = s.get(url+str(i)+'&page='+str(page), headers=header)
        else:
            r = s.get(url+str(0)+str(i)+'&page='+str(page), headers=header)
        r_soup = BeautifulSoup(r.text)
        fatt = {'href': re.compile('^details\.php\?id=(\d*?)&hit=1$')}
        fres = r_soup.findAll('a', fatt)
        ids = {}
        for det in fres:
            if re.search(words.decode('utf-8'), det['title'], re.IGNORECASE):
                start = det['href'].find('=') + 1
                end = det['href'].find('&')
                ids[det['href'][start:end]] = det['title']
            else:
                pass
        download_bt(i, ids)
        print 'page %d is done.' % page
    print 'now all download mission completed'

if __name__ == "__main__":
    # put global varialbe here or behind the import.
    pwd = os.getcwd() + '/'
    table = (
        u'剧集', u'音乐', u'游戏', u'动漫', u'综艺',
        u'软件', u'资料', u'电影', u'体育', u'纪录')
    print 'the classification number is as follows:'
    for i in range(len(table)):
        print '%d <---> %s' % (i, table[i])
    i = int(raw_input('please input classification number: '))
    words = raw_input('please input keywords: ')
    if i >=0 and i <= 9 :
        if not os.path.exists(pwd + table[i]):
            os.mkdir(pwd + table[i])
        run(i+1, words)
    else:
        print 'not valid classification.'
        sys.exit()
```

订阅后，新回复会通过你的通知中心匿名送达。