Python 模拟登录知乎

Last updated on 2018-02-18

之前模拟登录 pixiv 时不需要验证码，而知乎需要验证码，于是写这篇文章记录我学习模拟登录知乎的过程。

需要的工具及环境

Python 3.6
requests 库
pillow 库
re 库
Chrome 浏览器

尝试登录

打开知乎登录页面，按 F12 打开开发者工具，在 Network 面板中勾选“Preserve log”。登录界面如图，填写信息后点击提交，即可获得 POST 数据和提交网址。

我们可以获得 POST 请求的 URL 地址：

post_url = 'https://www.zhihu.com/login/phone_num'  # URL the login form is POSTed to

从 POST 提交的数据可以看出，我们需要 4 个参数：'_xsrf'、'captcha'、'password'、'phone_num'，所以我们构建提交的数据字典如下：

# Form fields submitted with the login POST request.
datas = {
    '_xsrf': '',                      # empty placeholder in this listing
    'password': 'your_password',      # replace with the real password
    'captcha': '',                    # empty placeholder in this listing
    'phone_num': 'your_phonenumber',  # replace with the real phone number
}

获取验证码

右键点击验证码，查看源码，可以看到 img 标签的 src 属性中存放着验证码的网址，将鼠标移到 src 属性上即可查看完整网址。

网址中的参数 r 是当前时间的时间戳，单位为毫秒，具体的获取代码如下：

1
2
3

# Millisecond timestamp used as the ``r`` query parameter. It is stored under
# its own name so the ``time`` module is not shadowed by a string.
timestamp = str(int(time.time() * 1000))
captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + timestamp + "&type=login"
# ``s`` is the requests session created elsewhere in the script.
captcha = s.get(captcha_url)

完整代码

# -*- coding=utf-8 -*-
import requests
import re
import time
from PIL import Image

# User-Agent header installed on the session, so it is sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'}

# Form fields submitted with the login POST; '_xsrf' and 'captcha' are filled
# in further down once they are known.
datas = {
    '_xsrf': '',
    'password': 'your_password',
    'captcha': '',
    'phone_num': 'your_phonenumber',
}

login_url = 'https://www.zhihu.com/'  # page scanned for the hidden _xsrf field
post_url = 'https://www.zhihu.com/login/phone_num'  # URL the login form is POSTed to

# One session is used for every request so cookies carry over between them.
s = requests.Session()
s.headers = headers

# Extract the _xsrf token from the hidden form field of the login page.
res1 = s.get(login_url)
pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"/>')
p = pattern.findall(res1.text)
if not p:
    # Stop with a readable message instead of a bare IndexError from p[0].
    raise SystemExit('_xsrf token not found in the login page')
datas['_xsrf'] = p[0]

# Download the captcha image. The ``r`` parameter is the current time in
# milliseconds; it gets its own name so the ``time`` module is not shadowed.
timestamp = str(int(time.time() * 1000))
captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + timestamp + "&type=login"
captcha = s.get(captcha_url)
with open('captcha.gif', 'wb') as f:  # the with-block closes the file itself
    f.write(captcha.content)

# Show the captcha to the user; the image is closed even if show() fails.
with Image.open('captcha.gif') as im:
    im.show()

# The captcha is typed in by hand.
login_captcha = input('input captcha:\n')
datas['captcha'] = login_captcha

# Submit the login form and print the server's response body.
result = s.post(post_url, data=datas)
print(result.text)

2017.5.16 完善代码

增加了cookie，登录成功后可以用cookie登录
增加了交互式
完善代码，将代码进行了封装

# -*- coding=utf-8 -*-
import requests
import re
import time
import http.cookiejar
from PIL import Image

class zhihuSpider(object):
    """Log in to Zhihu through a requests session and persist its cookies.

    The session's cookie jar is an ``LWPCookieJar`` backed by the file
    ``cookies`` in the working directory; ``login`` saves it so that a later
    run can reuse the stored cookies instead of logging in again.
    """

    # Hidden form field that get_xsrf() looks for in the fetched page.
    _XSRF_PATTERN = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"/>')

    def __init__(self):
        """Create the session and try to load previously saved cookies."""
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'}
        # Form fields of the login POST; filled in by get_xsrf(),
        # get_captcha() and login().
        self.datas = {
            '_xsrf': '',
            'password': '',
            'captcha': '',
            'phone_num': '',
        }
        self.session = requests.Session()
        self.session.headers = self.headers
        self.session.cookies = http.cookiejar.LWPCookieJar('cookies')
        try:
            self.session.cookies.load('cookies', ignore_discard=True)
        except (OSError, http.cookiejar.LoadError):
            # A missing or unreadable cookie file is expected on the first
            # run; report it and continue with an empty jar.
            print('cookies 不能加载')

    def get_xsrf(self):
        """Fetch the Zhihu front page and store its ``_xsrf`` token.

        Raises:
            IndexError: if the page contains no ``_xsrf`` hidden field.
        """
        login_url = 'https://www.zhihu.com/'  # page scanned for the token
        res = self.session.get(login_url)
        match = self._XSRF_PATTERN.search(res.text)
        if match is None:
            # Same exception type the former ``findall(...)[0]`` lookup
            # raised, but with a message that names the actual problem.
            raise IndexError('_xsrf token not found in the login page')
        self.datas['_xsrf'] = match.group(1)

    def get_captcha(self):
        """Download the captcha, display it and store what the user types."""
        # The ``r`` query parameter is the current time in milliseconds.
        t = str(int(time.time() * 1000))
        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
        captcha = self.session.get(captcha_url)
        with open('captcha.gif', 'wb') as f:  # the with-block closes the file
            f.write(captcha.content)
        # Show the image; it is closed even if show() raises.
        with Image.open('captcha.gif') as im:
            im.show()
        # The captcha is typed in by hand.
        login_captcha = input('input captcha:\n')
        self.datas['captcha'] = login_captcha

    def already_login(self):
        """Return True if the settings page answers 200 without redirecting.

        Redirects are not followed, so any other status code (presumably a
        redirect to the login page for anonymous sessions) yields False.
        """
        url = 'https://www.zhihu.com/settings/profile'
        login_code = self.session.get(url, allow_redirects=False).status_code
        return login_code == 200

    def login(self, account, password):
        """Log in with a phone number and password, then save the cookies.

        Args:
            account: value sent as the ``phone_num`` form field.
            password: value sent as the ``password`` form field.
        """
        post_url = 'https://www.zhihu.com/login/phone_num'  # login form target
        self.get_xsrf()
        self.get_captcha()
        self.datas['phone_num'] = account
        self.datas['password'] = password
        # Submit the form and print the server's response body.
        result = self.session.post(post_url, data=self.datas)
        print(result.text)
        # The cookies are written out whatever the response was; the
        # response is only printed, never inspected.
        self.session.cookies.save(ignore_discard=True, ignore_expires=True)

if __name__ == "__main__":
    # Imported locally: only the interactive entry point needs it.
    import getpass

    spider = zhihuSpider()
    if spider.already_login():
        print('用户已经登录')
    else:
        account = input('请输入用户名 \n> ')
        # getpass reads the password without echoing it to the terminal.
        password = getpass.getpass('请输入密码 \n> ')
        spider.login(account, password)