返回信息流
刚入门 scrapy,想要模拟登录知乎。网上的资料看了很多,但都没有解决我的登录问题。
这个爬虫的设计功能是爬取我的收藏夹内容,但因为模拟登录失败,所以仍然没办法爬到我私密收藏夹的内容。怀疑是 FormRequest 那里不行。最下面是 CMD 中显示的可能出问题的过程。
恳请各位巨巨帮忙看看把把脉,谢谢!
代码如下
# coding=utf-8
import codecs
import json
import re
import sys

import requests
import scrapy
from scrapy.selector import Selector
try:
    from scrapy.spiders import Spider
except ImportError:
    from scrapy.spider import BaseSpider as Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import FormRequest

from myzhihucollections.items import *
# The next import can be ignored; it only provides the info() logging helper.
from myzhihucollections.misc.log import *
# Python 2 idiom: reload the sys module and then set the interpreter-wide
# default string encoding to UTF-8.
# NOTE(review): presumably here so implicit str/unicode conversions of Chinese
# text do not fail; neither call is available in this form on Python 3 -- confirm
# the target interpreter before keeping this.
reload(sys)
sys.setdefaultencoding('utf-8')
class zhihuSpider(CrawlSpider):
    """Spider that logs in to Zhihu and walks a user's collection folders.

    Request flow: ``start_requests`` fetches the sign-in page, ``post_login``
    submits the login form, ``after_login`` requests every URL in
    ``start_urls``, and the ``parse_*`` callbacks follow collection folders
    down to individual answers. ``parse_answer_detail`` returns one
    ``{'title': ..., 'url': ...}`` dict per answer page.
    """

    BASE = "https://www.zhihu.com"

    # Browser-like headers sent with every request.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
        "Upgrade-Insecure-Requests": "1",
        # The original value ended with a stray "}" inside the string literal.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
    }

    # NOTE(review): session cookies (and the credentials in post_login) are
    # hard-coded in source; consider loading them from settings or spider
    # arguments instead.
    cookies = {
        "__utma": "51854390.19274654.14532989.14533548.14534952.10",
        "__utmc": "51854390",
        "__utmv": "51854390.100-1|2=registration_date=201210=1^3=entry_date=201210=1",
        "cap_id": "NDc4MkMzU5ZmIyOWM=|14547863|a708881e8fe136125e3c4",
        "__utmz": "51854390.1453307541.8.4.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/question/39663757/answer/82465040",
        "n_c": "1",
        "_za": "8540ee22-74-4cdc-9d8d-8e773a1e",
        "aliyungf_tc": "AQAAANKRWDR/OAsAugp",
        "q_c1": "df4489b8eeedc3016402|1454280000|141160000",
        "unlock_ticket": "QUFEQTljd1pBQU",
        "z_c0": "QUQTljd1pBQUFd2lTMThRPT0=|1454848365|860bf6a3d9f1035",
    }

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = [
        "https://www.zhihu.com/people/zhang-jie-48/collections",
    ]

    # NOTE(review): every request built in this class sets an explicit
    # callback, so these CrawlSpider rules look like they are never applied
    # -- confirm before relying on them.
    rules = [
        Rule(LinkExtractor(allow=(r"\/collections")), follow=True, callback='parse_collection_dir'),
    ]

    # Maps answer URL -> number of times it has been encountered.
    # Defined on the class, so it is shared by all instances.
    answer_dict = dict()

    def _session_meta(self, response):
        """Return request meta that keeps using *response*'s cookie jar.

        Scrapy's ``cookiejar`` meta key is not carried over automatically, so
        it has to be passed along on every follow-up request; otherwise the
        request falls back to the default jar and leaves the login session.
        Returns an empty dict when *response* has no ``cookiejar`` key.
        """
        if 'cookiejar' in response.meta:
            return {'cookiejar': response.meta['cookiejar']}
        return {}

    def start_requests(self):
        """Fetch the sign-in page first; ``post_login`` handles the response."""
        info("start url:" + self.start_urls[0])
        yield scrapy.Request("https://www.zhihu.com/#signin",
                             meta={'cookiejar': 1}, callback=self.post_login,
                             headers=self.headers, cookies=self.cookies)

    def post_login(self, response):
        """Submit the login form found on the sign-in page.

        Returns a one-element list holding the login ``FormRequest`` (handled
        by ``after_login``), or an empty list when the page carries no
        ``_xsrf`` token.
        """
        print('Preparing login')
        # Grab the _xsrf token from the returned page; it has to be sent back
        # for the form submission to succeed.
        tokens = Selector(response).xpath("//input[@name='_xsrf']/@value").extract()
        if not tokens:
            # Without the token the POST cannot succeed; report it instead of
            # failing with an IndexError.
            info("no _xsrf token found on:" + response.url)
            return []
        xsrf = tokens[0]
        print(xsrf)
        # FormRequest.from_response is Scrapy's helper for POSTing a form
        # taken from a response; after_login is called with the result.
        # NOTE(review): from_response submits the first <form> of the page by
        # default -- if login keeps failing, confirm that this is the sign-in
        # form (formxpath / formnumber can select another one).
        return [FormRequest.from_response(response,
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.headers,
                                          formdata={
                                              '_xsrf': xsrf,
                                              'email': 'hahaha@hkjournalist.org',
                                              'password': 'zhichijibenfa',
                                              'remember_me': 'true',
                                          },
                                          callback=self.after_login,
                                          dont_filter=True)]

    def after_login(self, response):
        """Print the login response and request every URL in ``start_urls``."""
        print('\n')
        # The author's note: the URL shown in this response indicated that the
        # login had not succeeded, which pointed at the FormRequest built in
        # post_login.
        print(response)
        print('\n')
        session_meta = self._session_meta(response)
        for url in self.start_urls:
            yield scrapy.Request(url, meta=dict(session_meta),
                                 callback=self.parse_collection_dir,
                                 headers=self.headers, cookies=self.cookies)

    def parse_follow_id(self, id_url):
        """Return the second path segment of *id_url* after dropping its first character.

        For example ``"/people/some-id"`` gives ``"some-id"``. Raises
        ``IndexError`` when the remainder contains no ``"/"``.
        """
        segments = id_url[1:].split("/")
        return segments[1]

    def parse_collection_dir(self, response):
        """Yield one request per collection folder linked from *response*."""
        info("collection rsp url:" + response.url)
        session_meta = self._session_meta(response)
        hrefs = response.xpath("//a[@class='zm-profile-fav-item-title']/@href").extract()
        for href in hrefs:
            info("fav dir:" + href)
            yield scrapy.Request(url=self.BASE + href,
                                 meta=dict(session_meta),
                                 callback=self.parse_collection_list,
                                 headers=self.headers, cookies=self.cookies)

    def parse_collection_list(self, response):
        """Yield one request per answer in a folder that was not seen before."""
        info("parse collection list:" + response.url)
        session_meta = self._session_meta(response)
        hrefs = response.xpath("//a[@class='toggle-expand']/@href").extract()
        for href in hrefs:
            answer_url = self.BASE + href
            info("answer url:" + answer_url)
            if not self.is_answer_in_dict(answer_url):
                yield scrapy.Request(url=answer_url,
                                     meta=dict(session_meta),
                                     callback=self.parse_answer_detail,
                                     headers=self.headers, cookies=self.cookies)

    def is_answer_in_dict(self, url):
        """Record a sighting of *url*; return True if it had been seen before."""
        seen_before = url in self.answer_dict
        self.answer_dict[url] = self.answer_dict.get(url, 0) + 1
        return seen_before

    def parse_answer_detail(self, response):
        """Return ``{'title', 'url'}`` for an answer page, or None without a title."""
        answer_url = response.url
        titles = response.xpath("//h2[@class='zm-item-title zm-editable-content']/a/text()").extract()
        info("answer url:" + answer_url)
        if not titles:
            info("answer title not found:" + answer_url)
            return None
        # extract() already yields decoded text, so no .decode() is needed.
        title = titles[0]
        info("answer :" + title)
        return {'title': title, 'url': answer_url}
这是一条镜像帖。来源:北邮人论坛 / python / #12240,同步于 2016/2/7
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Python机器人发帖
求教scrapy中模拟登录的一个问题
jaegerstar
2016/2/7 镜像同步 29 回复
订阅后,新回复会通过你的通知中心匿名送达。
9 条回复
[ema2][ema2][ema2],不给人家解答问题,还在这笑
【 在 jh1 (邯是郸身|土豆|鱼欲遇雨|儿时吹过的牛逼) 的大作中提到: 】
: [ema21]过年还码代码,也是拼
通过『我邮2.0』发布
[ema2][ema2][ema2],不相信
【 在 jh1 (邯是郸身|土豆|鱼欲遇雨|儿时吹过的牛逼) 的大作中提到: 】
: [ema1]不会
通过『我邮2.0』发布