返回信息流
受版主提示,我改进了之前按回复数抓取论坛帖子的脚本,现在改为按帖子的特殊标记来抓取,并保存到本地。
金钻
黄钻
蓝钻
火钻
上图:
上代码:
#encoding='GB2312'
import requests
import os
# Python 2 script (it relies on ``raw_input`` and on byte-string page bodies).
#
# It walks the listing pages of one bbs.byr.cn board, picks the threads whose
# listing row carries one of the special "ico-pos-article-*" tags, and appends
# the posts of every page of each such thread to a local HTML file.
#
# NOTE(review): the ``#encoding='GB2312'`` comment near the top of the script
# does not match the PEP 263 pattern (``coding[:=]\s*([-\w.]+)``), so Python 2
# will not treat it as a source-encoding declaration -- confirm the encoding
# the file is saved in and declare it as ``# -*- coding: <name> -*-``.

ROW = 30        # listing rows examined per board page
NANU = 10       # posts saved per thread page
MAX_PAGES = 50  # upper bound on listing pages fetched in one run

SITE = 'http://bbs.byr.cn'

HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
}

# Suffix of the row's "ico-pos-article-*" CSS class -> label used as the
# prefix of the saved file name.
TAG_LABELS = {
    'b': "金钻",
    'm': "黄钻",
    'g': "蓝钻",
    'fire': "火钻",
}


def fetch(url):
    """Return the raw response body of *url*, requested with HEADERS."""
    return requests.get(url, headers=HEADERS).content


def thread_page_count(reply_text):
    """Return the number of thread pages needed for a thread.

    *reply_text* is the reply count as text; the thread holds that many
    replies plus the opening post, split into pages of NANU posts.  The
    result is the ceiling of ``(replies + 1) / NANU`` computed with integer
    arithmetic, so a thread with no replies still yields one page.

    Raises ValueError when *reply_text* is not an integer.
    """
    posts = int(reply_text) + 1
    return (posts + NANU - 1) // NANU


def thread_file_name(out_dir, label, title, reply_text):
    """Return the path of the HTML file a thread is appended to.

    '/' inside the title is replaced by '_' because it would otherwise be
    interpreted as a directory separator when the file is opened.
    """
    return out_dir + label + reply_text + '_' + title.replace('/', '_') + '.html'


def iter_tagged_rows(listing, limit=ROW):
    """Yield ``(label, url_path, title, reply_text)`` for tagged listing rows.

    *listing* is the HTML of one board listing page.  Rows are located by
    plain substring search starting at each ``<td class="title_8`` cell; at
    most *limit* rows are examined, and only rows whose tag suffix is a key
    of TAG_LABELS are yielded.
    """
    row_marker = '<td class="title_8'
    tag_prefix = '<samp class="tag ico-pos-article-'
    link_prefix = '<a target="_blank" href="'

    row_start = listing.find(row_marker)
    samp_close = listing.find(r'</samp>', row_start)
    scanned = 0
    while row_start != -1 and samp_close != -1 and scanned < limit:
        tag_start = listing.find(r'<samp class="tag', row_start)
        tag_end = listing.find(r'"></samp>', tag_start)
        tag = listing[tag_start + len(tag_prefix):tag_end]

        if tag in TAG_LABELS:
            # Reply count: text of the first "title_11 middle" cell after the row start.
            cell = listing.find(r'title_11 middle', row_start)
            cell_open = listing.find(r'">', cell)
            cell_close = listing.find(r'</td>', cell_open)
            reply_text = listing[cell_open + len('">'):cell_close]

            # Thread URL: href of the first target="_blank" link in the row.
            link_start = listing.find(link_prefix, row_start)
            link_end = listing.find(r'" title', link_start)
            url_path = listing[link_start + len(link_prefix):link_end]

            # Title: text of the first plain <a href=...> link after the tag icon.
            anchor = listing.find(r'<a href="', tag_end)
            anchor_open = listing.find(r'">', anchor)
            anchor_close = listing.find(r'</a>', anchor_open)
            title = listing[anchor_open + len('">'):anchor_close]

            yield TAG_LABELS[tag], url_path, title, reply_text

        # Continue the search after the current row's </samp>.
        row_start = listing.find(row_marker, samp_close)
        samp_close = listing.find(r'</samp>', row_start)
        scanned += 1


def iter_posts(thread_page, limit=NANU):
    """Yield up to *limit* post fragments from the HTML of one thread page.

    A fragment is the text between ``<div class="a-content-wrap">`` and the
    next ``<font`` marker.  The first fragment is delimited by
    ``<font class=`` and the following ones by the shorter ``<font``.
    """
    opener = '<div class="a-content-wrap">'
    start = thread_page.find(opener)
    end = thread_page.find(r'<font class=', start)
    emitted = 0
    while start != -1 and end != -1 and emitted < limit:
        yield thread_page[start + len(opener):end]
        start = thread_page.find(opener, end)
        end = thread_page.find(r'<font', start)
        emitted += 1


def save_thread(out_dir, label, url_path, title, reply_text):
    """Download every page of one thread and append it to its local file.

    For each page the page URL is written first, then each extracted post,
    every entry followed by ``<br><br>``.  The file is opened in append mode,
    so running the script again adds to an existing file.
    """
    page_total = thread_page_count(reply_text)
    url_base = SITE + url_path + '?p='
    # One handle for the whole thread, closed deterministically by ``with``.
    with open(thread_file_name(out_dir, label, title, reply_text), 'a') as out:
        for page_no in range(1, page_total + 1):
            page_url = url_base + str(page_no)
            page_html = fetch(page_url)
            out.write(page_url + '<br><br>')
            print('Downloading page: ' + label + page_url)
            for post in iter_posts(page_html):
                out.write(post + '<br><br>')


def main():
    """Prompt for a board and a page count, then save its tagged threads."""
    print('本脚本可以帮助您筛选某个版面历史上部分有特殊标记的精华帖子,并部分保存在本地,请按下面提示操作' + '\n')
    board = raw_input('请直接输入您所选择查询的版面的英文完整名称,例如:"Python","StudyShare": ')
    pages = int(raw_input('请输入您想查询的页数,一页代表最新的30篇帖子,为了爱护论坛的服务器,强烈建议您不要经常超过 10页(300篇啊): '))

    listing_base = SITE + '/board/' + board + '?p='
    out_dir = 'bbs.byr.cn/tag_list/' + board + '/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Requests for more than MAX_PAGES listing pages are silently capped.
    pages = min(pages, MAX_PAGES)

    for page_no in range(1, pages + 1):
        listing_url = listing_base + str(page_no)
        print(listing_url)
        print('--------------------------')
        listing = fetch(listing_url)
        for label, url_path, title, reply_text in iter_tagged_rows(listing):
            save_thread(out_dir, label, url_path, title, reply_text)

    print('Completed!')


if __name__ == '__main__':
    main()
这是一条镜像帖。来源:北邮人论坛 / python / #2152,同步于 2014/7/28
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Python机器人发帖
保存论坛最重要帖
heamon7
2014/7/28 镜像同步 · 4 回复
订阅后,新回复会通过你的通知中心匿名送达。