BYR Achieve · 镜像论坛

受版主提示，改进了之前的按回复数抓取论坛的帖子，现在按有标记的来抓取保存本地金钻黄钻蓝钻火钻上图：上代码： #encoding='GB2312' import requests import os i=1 print '本脚本可以帮助您筛选某个版面历史上部分有特殊标记的精华帖子，并部分保存在本地，请按下面提示操作'+'\n' board = raw_input('请直接输入您所选择查询的版面的英文完整名称，例如："Python","StudyShare": ') PAGE=int(raw_input('请输入您想查询的页数，一页代表最新的30篇帖子，为了爱护论坛的服务器，强烈建议您不要经常超过 10页(300篇啊): ')) ROW=30 bourl = "http://bbs.byr.cn/board/" + board +"?p=" filepath= 'bbs.byr.cn/tag_list/' + board +'/' headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, compress', 'Accept-Language': 'en-us;q=0.5,en;q=0.3', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36' } if os.path.exists(filepath): pass else: os.makedirs(filepath) if(int(PAGE)>=50): PAGE=50 while (i <= PAGE) : bbourl = bourl + str(i) print bbourl print '--------------------------' bbocont =requests.get(bbourl,headers=headers).content articleurl = ['']*ROW titlelist = ['']*ROW taglist = ['']*ROW replylist =['']*ROW j=0 ptitle_8h = bbocont.find('<td class="title_8') ptitle_8t = bbocont.find(r'</samp>',ptitle_8h) while(ptitle_8h!= -1 and ptitle_8t!=-1 and j<=ROW): psamph = bbocont.find(r'<samp class="tag',ptitle_8h) psampt = bbocont.find(r'"></samp>',psamph) tagcont = bbocont[ psamph + len(r'<samp class="tag ico-pos-article-') : psampt ] #print tagcont ptitle_11htmp = bbocont.find(r'title_11 middle', ptitle_8h) ptitle_11h = bbocont.find(r'">',ptitle_11htmp) #print ptitle_11h ptitle_11t = bbocont.find(r'</td>', ptitle_11h) #print ptitle_11t replyamnt = bbocont[ ptitle_11h + len('">') : ptitle_11t ] if(tagcont=='b' or tagcont=='m' or tagcont=='g' or tagcont=='fire'): particleurlh = bbocont.find(r'<a target="_blank" href="',ptitle_8h ) #print particleurlh particleurlt = bbocont.find(r'" title', particleurlh) #print particleurlt articleurl[j] = bbocont[ particleurlh + len('<a target="_blank" href="') : particleurlt ] ptitlehtmp = bbocont.find(r'<a href="', psampt) ptitleh = bbocont.find(r'">', ptitlehtmp) ptitlet = bbocont.find(r'</a>', ptitleh) titlelist[j]= bbocont[ ptitleh + len('">') : ptitlet ] replylist[j]=replyamnt if(tagcont=='b'): taglist[j]="金钻" if(tagcont=='m'): taglist[j]="黄钻" if(tagcont=='g'): taglist[j]="蓝钻" if(tagcont=='fire'): taglist[j]="火钻" artipages = round((int(replylist[j])+1+5)/10) artipaurltmp='http://bbs.byr.cn'+articleurl[j]+'?p=' n=1 while(n<=artipages): artipaurl= artipaurltmp +str(n) articont = requests.get(artipaurl,headers=headers).content open(filepath+taglist[j]+replylist[j]+'_'+titlelist[j]+'.html','a').write(artipaurl+'<br><br>') print 'Downloading page: '+taglist[j]+artipaurl pcwh = articont.find(r'<div class="a-content-wrap">') pcwt = articont.find(r'<font class=',pcwh) NANU=10 k=0 namecont = articont[pcwh + len('<div class="a-content-wrap">') : pcwt] while(pcwh!=-1 and pcwt!=-1 and namecont!=-1 and k<NANU): open(filepath+taglist[j]+replylist[j]+ '_' + titlelist[j] + '.html','a').write(namecont+'<br><br>') pcwh = articont.find(r'<div class="a-content-wrap">',pcwt) pcwt = articont.find(r'<font',pcwh) namecont = articont[pcwh + len('<div class="a-content-wrap">') : pcwt] k +=1 n +=1 ptitle_8h = bbocont.find('<td class="title_8',ptitle_8t) ptitle_8t = bbocont.find(r'</samp>',ptitle_8h) j +=1 i +=1 print 'Completed!'

保存论坛最重要帖