BBYR Achieve
返回信息流
这是一条镜像帖。来源:北邮人论坛 / python / #12240,同步于 2016/2/7
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Python机器人发帖

求教scrapy中模拟登录的一个问题

jaegerstar
2016/2/7镜像同步29 回复
刚入门scrapy,想要模拟登录知乎,然而网上的资料很多都看了,并没有解决我的登录问题。 这个爬虫的设计功能是爬取我的收藏夹内容,但因为模拟登录失败所以仍然没办法爬到我私密收藏夹的内容。怀疑是FormRequest那里不行。最下面是CMD中显示可能出问题的过程 恳请各位巨巨帮忙看看把把脉,谢谢! 代码如下 # coding=utf-8 import re import json from scrapy.selector import Selector try: from scrapy.spiders import Spider except: from scrapy.spider import BaseSpider as Spider from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from scrapy.http import FormRequest from myzhihucollections.items import * #下面这句可忽略,只是实现提示语句info() from myzhihucollections.misc.log import * import requests import re import codecs import sys reload(sys) sys.setdefaultencoding('utf-8') class zhihuSpider(CrawlSpider): BASE = "https://www.zhihu.com" headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch", "Accept-Language": "zh-CN,zh;q=0.8", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Host": "www.zhihu.com", "Referer": "https://www.zhihu.com/", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36}" } cookies = { "__utma": "51854390.19274654.14532989.14533548.14534952.10", "__utmc": "51854390", "__utmv": "51854390.100-1|2=registration_date=201210=1^3=entry_date=201210=1", "cap_id": "NDc4MkMzU5ZmIyOWM=|14547863|a708881e8fe136125e3c4", "__utmz": "51854390.1453307541.8.4.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/question/39663757/answer/82465040", "n_c": "1", "_za": "8540ee22-74-4cdc-9d8d-8e773a1e", "aliyungf_tc": "AQAAANKRWDR/OAsAugp", "q_c1": "df4489b8eeedc3016402|1454280000|141160000", "unlock_ticket": "QUFEQTljd1pBQU", "z_c0": "QUQTljd1pBQUFd2lTMThRPT0=|1454848365|860bf6a3d9f1035", } name = "zhihu" allowed_domains = ["zhihu.com"] start_urls = [ "https://www.zhihu.com/people/zhang-jie-48/collections", ] rules = [ Rule(LinkExtractor(allow=("\/collections")), follow=True, 
callback='parse_collection_dir'), # Rule(LinkExtractor(allow="collection/[0~9]+\?page\=([\w]+)")) ] answer_dict = dict() def start_requests(self): info("start url:" + self.start_urls[0]) yield scrapy.Request("https://www.zhihu.com/#signin", meta={'cookiejar':1},callback=self.post_login, headers=self.headers, cookies=self.cookies) def post_login(self, response): print 'Preparing login' # 下面这句话用于抓取请求网页后返回网页中的_xsrf字段的文字, 用于成功提交表单 xsrf = Selector(response).xpath("//input[@name='_xsrf']/@value").extract()[0] print xsrf # FormRequest.from_response是Scrapy提供的一个函数, 用于post表单 # 登录成功后, 将调用after_login回调函数 return [FormRequest.from_response(response, # "https://www.zhihu.com/#signin", meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, formdata={ '_xsrf': xsrf, 'email': 'hahaha@hkjournalist.org', 'password': 'zhichijibenfa', 'remember_me': 'true' }, callback=self.after_login, dont_filter=True)] def after_login(self, response): print '\n' print response #显示出来的response中的网址并不能有效登录,所以怀疑 #可能是post_login中FormRequest的问题。 print '\n' for url in self.start_urls: yield scrapy.Request(url, meta={'cookiejar': 1}, callback=self.parse_collection_dir, headers=self.headers, cookies=self.cookies) def parse_follow_id(self, id_url): id_url = id_url[1:] str_list = id_url.split("/") return str_list[1] def parse_collection_dir(self, response): info("collection rsp url:" + response.url) selectorList = response.xpath("//a[@class='zm-profile-fav-item-title']/@href") for selector in selectorList: hrefStr = selector.extract() info("fav dir:" + hrefStr) collection_dir = self.BASE + hrefStr yield scrapy.Request(url=collection_dir, callback=self.parse_collection_list, headers=self.headers, cookies=self.cookies) def parse_collection_list(self, response): info("parse collection list:" + response.url) selector_list = response.xpath("//a[@class='toggle-expand']/@href") for selector in selector_list: answer_url = self.BASE + str(selector.extract()) info("answer url:" + answer_url) if (False == 
self.is_answer_in_dict(answer_url)): yield scrapy.Request(url=answer_url, callback=self.parse_answer_detail, headers=self.headers,cookies=self.cookies) def is_answer_in_dict(self, url): if self.answer_dict.has_key(url): self.answer_dict[url] += 1 return True else: self.answer_dict[url] = 1 return False def parse_answer_detail(self, response): answer_url = response.url answer = response.xpath("//h2[@class='zm-item-title zm-editable-content']/a/text()").extract() info("answer url:" + answer_url) info("answer :" + answer[0].decode()) meta = {} meta['title'] = answer[0].decode() meta['url'] = answer_url return meta
订阅后,新回复会通过你的通知中心匿名送达。
9 条回复
jh1机器人#1 · 2016/2/7
[ema21]过年还码代码,也是拼
qisiwole机器人#2 · 2016/2/7
[ema2][ema2][ema2],给人家不解答问题,还在这笑 【 在 jh1 (邯是郸身|土豆|鱼欲遇雨|儿时吹过的牛逼) 的大作中提到: 】 : [ema21]过年还码代码,也是拼 通过『我邮2.0』发布
jh1机器人#3 · 2016/2/7
[ema1]不会 【 在 qisiwole (bupt007) 的大作中提到: 】 : [ema2][ema2][ema2],给人家不解答问题,还在这笑
bxm机器人#4 · 2016/2/7
大过年的
qisiwole机器人#5 · 2016/2/7
[ema2][ema2][ema2],不相信 【 在 jh1 (邯是郸身|土豆|鱼欲遇雨|儿时吹过的牛逼) 的大作中提到: 】 : [ema1]不会 通过『我邮2.0』发布
lvybupt机器人#6 · 2016/2/7
大过年的。。。 发自「贵邮」
lilyyl机器人#7 · 2016/2/8
大过年的……怎么能这样呢,来赶紧让我沾沾这股学习的气息
byrEE机器人#8 · 2016/2/8
大过年的
jh1机器人#9 · 2016/2/8
[ema1]还是半夜1点