Requests 请求
4744 字约 16 分钟
2026-05-20
1,requests模块
1-1,response的常用属性:
1,response.text #响应体 str类型
response.encoding #从HTTP header中猜测的响应内容的编码方式
2,response.content #响应体 bytes类型
3,response.status_code #响应状态码
4,response.request.headers #响应对应的请求头
5,response.headers #响应头
6,response.cookies #响应的cookie(经过了set-cookie动作)
7,response.url #获取访问的url
8,response.json() #获取json数据 得到内容为字典 (如果接口响应体的格式是json格式时)
9,response.ok
# 如果status_code小于400,response.ok返回True。
# 如果status_code大于等于400(4xx客户端错误或5xx服务端错误),response.ok返回False。
# 先安装 requests:pip install requests
import requests
# 给定抓取的网址的urlurl = 'http://www.baidu.com'# 发起get请求response = requests.get(url)
# 看一下响应的状态码print(response.status_code)
# 看一下请求的urlprint('response.url', response.url)
# 获取编码print(response.encoding)
# 设定编码# response.encoding = 'GBK'# response.encoding = 'UTF-8'# response.encoding = response.apparent_encoding# 看一下返回的数据 以字符串形式返回的(一般情况下会乱码 需要给定编码)print(response.text)
# 获取bytesprint( response.content)
print(response.content.decode()) # 使用默认解码方式(UTF-8)
print(response.content.decode('UTF-8')) # 指定使用UTF-8解码
print(response.content.decode('GBK')) # 指定使用GBK解码
# 获取响应头
print(response.headers)
# 获取响应对应的请求头print(response.request.headers)
print(response.ok)
# 获取JSON数据print(response.json())1-2, response.text 和response.content的区别
## response.text
# 类型:str
# 解码类型: requests模块自动根据HTTP 头部对响应的编码作出有根据的推测,推测的文本编码
# 如何修改编码方式:`response.encoding="gbk"` 或 `response.encoding="UTF-8"`
## response.content
## 类型:bytes
## 解码类型: 没有指定
## 如何修改编码方式:`response.content.decode("utf8")`
获取网页源码的通用方式:
response.content.decode()
response.content.decode("UTF-8")
response.text
## 以上三种方法从前往后尝试,基本可以解决绝大多数网页的解码问题
## 所以:更推荐使用`response.content.decode()`的方式获取响应的html页面
1-3,下载图片实例
import requests
# 图片的URLurl = 'https://ww4.sinaimg.cn/mw690/0076vsZ6ly1hiw5krf9wdj31401hcb2a.jpg'# 发起请求response = requests.get(url)
# 图片使用wbwith open('dlrb.jpg', 'wb') as f:
f.write(response.content)1-4,添加请求头
import requests
url = 'http://www.baidu.com'## 添加请求头headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 携带请求头response = requests.get(url, headers=headers)
# print(response.content.decode())print(response.request.headers)1-5,get传参数
1-5-1 第一种方式
import requests
## url## get参数在URL里以查询字符串形式传递url = 'http://www.zishazx.com/product?page=1&size_id=0&volum_id=0&price_id=2&category_id=1001&prize_id=0&pug_id=25#views'# 设置请求头headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 发起get请求response = requests.get(url, headers=headers)
# 获取返回数据data = response.content.decode()
print(data)
# 写入with open('zsh.html', 'w', encoding='UTF-8') as f:
f.write(data)1-5-2 第二种方式
import requests
url = 'http://www.zishazx.com/product'# 设置请求头headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 携带的参数 get传参
params = {
'page': 1,
'size_id': 0,
'volum_id': 0,
'price_id': 2,
'category_id': 1001,
'prize_id': 0,
'pug_id': 25}
# 发起get请求 携带了 get传参 携带了headers请求头response = requests.get(url, headers=headers, params=params)
# 获取返回数据data = response.content.decode()
print(response.url)
# print(data)# 写入with open('zsh.html', 'w', encoding='UTF-8') as f:
f.write(data)1-6 实战案例
1-6-1 获取图片案例
import requests
from lxml import etree
url = 'https://app.mi.com/subject/115150'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
data = response.content.decode()
tree = etree.HTML(data)
li_list = tree.xpath('//ul[@class="applist"]/li')
for li in li_list:
# 获取图片地址 img_src = li.xpath('./a/img/@data-src')[0]
# 获取名称 name = li.xpath('./h5/a/text()')[0]
# 获取简介 decr = li.xpath('./p/a/text()')[0]
print(img_src, name, decr)1-6-2 xpath紫砂之星抓取单页数据
import requests
from lxml import etree
# 要抓取的urlurl = 'http://www.zishazx.com/product'# 给请求头 uaheaders = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 发起get请求res = requests.get(url, headers=headers)
# 获取响应的页面内容data = res.content.decode()
# 实例化匹配对象tree = etree.HTML(data)
# print(data)# 获取到了所有的lili_list = tree.xpath('//ul[@class="list clearfix"]/li')
for li in li_list:
# 获取到图片的src地址 img = li.xpath('./p[@class="img"]/a/img/@src')[0]
name = li.xpath('./p[@class="name"]/text()')[0]
p_no = li.xpath('./p[@class="p_no"]/text()')[0]
print(img, name, p_no)1-6-3 bs4紫砂之星抓取单页数据
import requests
from bs4 import BeautifulSoup
# 要抓取的urlurl = 'http://www.zishazx.com/product'# 给请求头 uaheaders = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 发起get请求res = requests.get(url, headers=headers)
# 获取响应的页面内容data = res.content.decode()
# 实例化匹配对象soup = BeautifulSoup(data, 'lxml')
# print(data)# 获取到了所有的li# li_list = tree.xpath('//ul[@class="list clearfix"]/li')li_list = soup.find('ul', class_="list clearfix").findAll('li')
for li in li_list:
# 获取到图片的src地址 # img = li.xpath('./p[@class="img"]/a/img/@src')[0] img = li.find('p', class_="img").a.img['src']
# name = li.xpath('./p[@class="name"]/text()')[0] name = li.find('p', class_="name").string
# p_no = li.xpath('./p[@class="p_no"]/text()')[0] p_no = li.find('p', class_="p_no").string
print(img, name, p_no)1-6-4 xpath小说阅读网
from lxml import etree
import requests
url = 'https://www.readnovel.com/category'# 给定请求头headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 发起请求res = requests.get(url, headers=headers)
data = res.content.decode()
# print(data)# 实例化tree对象tree = etree.HTML(data)
# 获取所有图书的lili_list = tree.xpath('//div[@class="right-book-list"]/ul/li')
for li in li_list:
# 获取封面 img = li.xpath('./div[@class="book-img"]/a/img/@src')[0]
# 获取标题 title = li.xpath('./div[@class="book-info"]/h3/a/text()')[0]
# 获取简介 intro = li.xpath('.//p[@class="intro"]/text()')[0]
print(img, title, intro)1-6-5 bs4小说阅读网
from bs4 import BeautifulSoup
import requests
url = 'https://www.readnovel.com/category'# 给定请求头headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 发起请求res = requests.get(url, headers=headers)
data = res.content.decode()
# print(data)# 实例化tree对象soup = BeautifulSoup(data, 'lxml')
# 获取所有图书的li# li_list = tree.xpath('//div[@class="right-book-list"]/ul/li')li_list = soup.find('div', class_="right-book-list").ul.findAll('li')
for li in li_list:
# 获取封面 # img = li.xpath('./div[@class="book-img"]/a/img/@src')[0] img = li.find('div', class_="book-img").a.img['src']
# 获取标题 # title = li.xpath('./div[@class="book-info"]/h3/a/text()')[0] title = li.find('div', class_="book-info").h3.a.string
# 获取简介 # intro = li.xpath('.//p[@class="intro"]/text()')[0] intro = li.find('p', class_="intro").string
print(img, title, intro)1-6-6 xpath抓取优美图库
import os
import random
import time
from lxml import etree
import requests
url = 'http://www.umeituku.com/bizhitupian/'headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
res = requests.get(url, headers=headers)
data = res.content.decode()
tree = etree.HTML(data)
# 获取所有二级标签页的url地址a_list = tree.xpath('//div[@class="TypeList"]/ul/li/a/@href')
# 创建图片目录 存储当前的图片path= 'img'if not os.path.exists(path):
os.mkdir(path)
for url in a_list:
# 开始访问二级标签页内容 res = requests.get(url, headers=headers)
data = res.content.decode()
tree = etree.HTML(data)
try:
# 找二级标签页当前大图的src地址和图片名称 src = tree.xpath('//p[@align="center"]/a/img/@src')[0]
alt = tree.xpath('//p[@align="center"]/a/img/@alt')[0]
print(src, alt, '图片下载中======')
# 图片存入本地 with open(os.path.join(path, alt+'.jpg'), 'wb') as f:
f.write(requests.get(src, headers=headers).content)
print(src, alt, '图片下载完成......')
# 给个时间自省 except Exception as e:
print(f'在抓取网址:{url}的过程中出现了问题, 问题为:', e)
time.sleep(random.randint(1, 4))
print('OVER 下载结束!')1-6-7 抓取冶金信息
from lxml import etree
import requests
url = 'http://www.metalinfo.cn/mi.html'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
"""pageSize: 20current: 2resourceType: r_newsfacetFilter: {}order: descsort: sort_time"""res = requests.get(url, headers=headers)
data = res.content.decode()
tree = etree.HTML(data)
li_list = tree.xpath('//ul[@id="searchLists"]/li')
print(li_list)
for li in li_list:
# Extract the fields of the current <li> only. The leading "." makes each
# XPath relative to `li`; a bare "//" would search the whole document and
# return the same nodes on every iteration of the loop.
list_title = ''.join(li.xpath('.//div[@class="list-title "]//text()'))
list_intro = li.xpath('.//div[@class="list-intro"]/text()')[0]
list_keys = ''.join(li.xpath('.//div[@class="list-keys"]/text()'))
print(list_title, list_intro, list_keys)1-6-8 异步请求
from lxml import etree
import requests
url = 'http://www.metalinfo.cn/json/search/list'headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'}
# 传递post数据data = {
'pageSize': 200,
'current': 1,
'resourceType': 'r_news',
'facetFilter': {},
'order': 'desc',
'sort': 'sort_time',
}
# 发起post请求res = requests.post(url, headers=headers, data=data)
# 获取json数据json_data = res.json()['result']['records']
i = 0for j in json_data:
# 获取标题 title = j['title']
# 获取简介 r_abstract = j['r_abstract']
print(title)
print(r_abstract)
i += 1print(i)1-6-9 中信证券抓取单页
import requests
from lxml import etree
url = 'http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index.html'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
res = requests.get(url, headers=headers)
html = res.content.decode()
tree = etree.HTML(html)
li_list = tree.xpath('//ul[@class="list-con"]/li')
# print(li_list)for li in li_list:
con = li.xpath('./span/text()')
th1 = con[0]
th2 = con[1]
th3 = con[2]
th4 = con[3]
th5 = con[4]
print(th1, th2, th3, th4, th5)
2,反爬措施
2-1 代理的使用
# Library used
import requests

# URL to fetch through the proxy. It is an http:// URL because the proxy
# mapping below only has an 'http' entry: requests selects the proxy by the
# scheme of the requested URL, so an https:// URL would not use it.
url = 'http://www.baidu.com'

# A single proxy address, keyed by the URL scheme it applies to
proxy = {
    'http': 'http://221.178.232.130:8080',
}
res = requests.get(url, proxies=proxy)
print(res.content.decode())
### ----------------------------------------------------------------------------# 多个ip地址proxy = [
{'http':'http://221.178.232.130:8080'},
{'http':'http://221.178.232.130:8080'}
]
import random
proxy = random.choice(proxy) # 随机获取ipres = requests.get(url, proxies=proxy)
print(res.content.decode())2-2 处理cookie
2-2-1 带cookie的请求
import requests
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=554225&size=15'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://xueqiu.com/',
## 携带cookie信息 'Cookie':'cookiesu=121697804017738; device_id=2cb32776fe1f32adba3aefc52173bcdc; xq_a_token=e2f0876e8fd368a0be2b6d38a49ed2dd5eec7557; xqat=e2f0876e8fd368a0be2b6d38a49ed2dd5eec7557; xq_r_token=2a5b753b2db675b4ac36c938d20120660651116d; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTcwMDY5OTg3NSwiY3RtIjoxNjk4MjM4NzI4MTU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.RtY0JREVs0R4s9sgP2RsybzTrLY7UD5dDElnpf16r7-F02lOLkU7mdgAm0HjvKvbcAYYeRyP6Ke6rdy3WfbFI-RlJwzxIo5wZ4ScGzy0Vj3VYKqsh7-Wx8MnzyRjVcJPtVUfBlN_Plj5nmxnQPykmZwKSRjKT02YBy2XH4OHNaN0sG1Rst37mAj2f42lTogbHdfZBsRUkweP-UezUkEyvSncUYIe9IAMZmHf7d5AQ94BK5h3nhSqy01KyyTf2aonnwWG7rNrOeuo7F28S50Wz-1JBKtbQYhRbOEZL2FVpizmpC_h98pYl3RtDBVvbiUEJPxx1-bRN6J78h3bduYu0w; u=121697804017738; Hm_lvt_1db88642e346389874251b5a1eded6e3=1697804019,1698238782;'}
# 再发请求。拿数据res = requests.get(url, headers=headers)
print(res.json())
2-2-2, cookie的字典形式
import requests
# 携带cookie登录雪球网 抓取完善个人资料页面headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
'Referer': 'https://xueqiu.com/u/1990923459',
'Host': 'xueqiu.com',
}
url = 'https://xueqiu.com/users/connectnew?redirect=/setting/user'cookie_dict = {
'u': '1990923459',
'bid': '1f110dfd43538f4b8362dfcd21ffbb64_l27g4lfl',
'xq_is_login': '1',
'xq_r_token': '5dcbe83944f0b75325f91246061d4a2a01999367'}
res = requests.get(url, headers=headers, cookies=cookie_dict)
# Save the page. The encoding is given explicitly so the write does not
# depend on the platform default (e.g. GBK on Chinese-locale Windows), which
# can fail on characters outside that code page.
with open('雪球网.html', 'w', encoding='UTF-8') as f:
    f.write(res.content.decode('UTF-8'))
print(res.content.decode('UTF-8'))2-2-3 获取服务端返回的cookie
import requests
url = 'https://xueqiu.com/'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
res = requests.get(url, headers=headers)
# 获取cookiecookies = res.cookies
print(cookies)
# 获取字典格式的cookieprint(dict(cookies))2-2-4 携带首页服务器响应的cookie
import requests
# 就是为了获取cookieindex_url = 'https://xueqiu.com/'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://xueqiu.com/',
}
res = requests.get(index_url, headers=headers)
# 获取cookiecookies = dict(res.cookies)
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=554225&size=15'# 携带cookie进行请求res = requests.get(url, headers=headers, cookies=cookies)
print(res.json())2-2-5, 使用session处理cookie
import requests
# 就是为了获取cookieheaders = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://xueqiu.com/',
}
# 使用这个session对象进行维护session = requests.Session()
# session = requests.session()# 请求首页 获取返回的cookieindex_url = 'https://xueqiu.com/'session.get(index_url, headers=headers)
# 获取数据url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=554225&size=15'# 携带cookie进行请求res = session.get(url, headers=headers)
print(res.json())2-2-6 模拟登录
6-1 手动处理cookie
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
# 登录的url地址login_url = 'https://passport.17k.com/ck/user/login'data = {
'loginName': '17346570232',
'password': 'xlg17346570232',
}
# 发起登录请求res = requests.post(login_url, headers=headers, data=data)
cookies = dict(res.cookies) # 获取登录后的cookie# 获取登录后的数据url = 'https://user.17k.com/ck/user/myInfo/96139098?bindInfo=1&appKey=2406394919'res = requests.get(url, headers=headers, cookies=cookies)
print(res.content.decode())6-2 自动维护cookie
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
# 登录的url地址login_url = 'https://passport.17k.com/ck/user/login'data = {
'loginName': '17346570232',
'password': 'xlg17346570232',
}
# 发起登录请求session = requests.Session()
session.post(login_url, headers=headers, data=data)
# 获取登录后的数据url = 'https://user.17k.com/ck/user/myInfo/96139098?bindInfo=1&appKey=2406394919'res = session.get(url, headers=headers)
print(res.content.decode())2-3 案例
2-3-1 处理验证码
1-1 下载验证码
## 首先找到发放验证码的URL地址,获取验证码保存到本地import requests
# 验证码地址url = 'https://so.gushiwen.cn/RandCode.ashx'res = requests.get(url)
with open('yzm.jpg', 'wb') as f:
f.write(res.content)1-2 识别验证码
## 用验证码识别模块识别验证码
# 终端用pip安装ddddocr包
pip install ddddocr
# ddddocr可能会出现的问题# 1 在运行过程中 报错最底部出现dll的问题 安装一下c++环境# 2 运行顶部会出现PIL的问题# 可能没安装pillow模块 pip install pillow# 安装了(版本高了 卸载原有pillow 安装9.5.0的pillow模块)# pip uninstall pillow# pip install pillow==9.5.0import ddddocr
ocr = ddddocr.DdddOcr()
with open('yzm.jpg', 'rb') as f:
data = f.read()
result = ocr.classification(data)
print(result)1-3 登录
import requests
url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
}
# Login credentials as plain strings. A trailing comma after the value would
# turn each of these into a one-element tuple instead of a str.
email = '[email protected]'
pwd = 'xlg17346570232'
data = {
'__VIEWSTATE': 'AaJHgmeyf8Le6GErv1HkNyY9sDsTvgOx5w95HI82SkYSEWCpd9gSo2mvsYno9ZIc1D/tjgrPiujAhdRcKtnUjN5RdyvONf3MAk83da/5zRoc2WtYcqhyh1iEk9hVU6e7jmM8I07Z3dNPLNcAouMrW4mUaGk=',
'__VIEWSTATEGENERATOR': 'C93BE1AE',
'from': 'http://so.gushiwen.cn/user/collect.aspx',
'email': email,
'pwd': pwd,
'code': 'k04c',
'denglu': '登录',
}
res = requests.post(url, headers=headers, data=data)
# print(res.content.decode())with open('gsw.html', 'w', encoding='UTF-8') as f:
f.write(res.content.decode())1-4 上面三步整合
import requests
import ddddocr
url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
}
# 验证码处理img_url = 'https://so.gushiwen.cn/RandCode.ashx'res = requests.get(img_url)
img = res.content
with open('yzm.jpg', 'wb') as f:
f.write(img)
ocr = ddddocr.DdddOcr()
result = ocr.classification(img)
# Login credentials as plain strings. A trailing comma after the value would
# turn each of these into a one-element tuple instead of a str.
email = '[email protected]'
pwd = 'xlg17346570232'
print(result)
data = {
'__VIEWSTATE': 'AaJHgmeyf8Le6GErv1HkNyY9sDsTvgOx5w95HI82SkYSEWCpd9gSo2mvsYno9ZIc1D/tjgrPiujAhdRcKtnUjN5RdyvONf3MAk83da/5zRoc2WtYcqhyh1iEk9hVU6e7jmM8I07Z3dNPLNcAouMrW4mUaGk=',
'__VIEWSTATEGENERATOR': 'C93BE1AE',
'from': 'http://so.gushiwen.cn/user/collect.aspx',
'email': email,
'pwd': pwd,
'code': result,
'denglu': '登录',
}
res = requests.post(url, headers=headers, data=data)
# print(res.content.decode())with open('gsw.html', 'w', encoding='UTF-8') as f:
f.write(res.content.decode())但是还是提示验证码不对?? 问题出现在哪里??
1-5 最终版本
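需要添加cookie数据:验证码是和请求验证码时服务器下发的cookie(会话)绑定的。上面的代码中,获取验证码和提交登录是两次互不相关的请求,登录请求没有携带获取验证码时的cookie,所以服务器认为验证码不匹配。用同一个session对象发起这两次请求,cookie就会被自动携带。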
import requests
import ddddocr
session = requests.Session()
url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
'Referer': 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
}
# 验证码处理img_url = 'https://so.gushiwen.cn/RandCode.ashx'res = session.get(img_url)
img = res.content
with open('yzm.jpg', 'wb') as f:
f.write(img)
ocr = ddddocr.DdddOcr()
result = ocr.classification(img)
# Login credentials as plain strings. A trailing comma after the value would
# turn each of these into a one-element tuple instead of a str.
email = '[email protected]'
pwd = 'xlg17346570232'
print(result)
data = {
'__VIEWSTATE': 'AaJHgmeyf8Le6GErv1HkNyY9sDsTvgOx5w95HI82SkYSEWCpd9gSo2mvsYno9ZIc1D/tjgrPiujAhdRcKtnUjN5RdyvONf3MAk83da/5zRoc2WtYcqhyh1iEk9hVU6e7jmM8I07Z3dNPLNcAouMrW4mUaGk=',
'__VIEWSTATEGENERATOR': 'C93BE1AE',
'from': 'http://so.gushiwen.cn/user/collect.aspx',
'email': email,
'pwd': pwd,
'code': result,
'denglu': '登录',
}
res = session.post(url, headers=headers, data=data)
# print(res.content.decode())with open('gsw.html', 'w', encoding='UTF-8') as f:
f.write(res.content.decode())1-6 打码平台使用
import base64
import json
import requests
def base64_api(uname, pwd, img, typeid, timeout=60):
    """Send a captcha image to the ttshitu recognition API and return the text.

    Args:
        uname: Account name on the recognition platform.
        pwd: Account password on the recognition platform.
        img: Path of the captcha image file to recognise.
        typeid: Numeric captcha type id understood by the platform.
        timeout: Seconds to wait for the HTTP response before requests raises
            a timeout error, so a stalled request cannot hang the script.

    Returns:
        The recognised text when the response's ``success`` field is truthy,
        otherwise the ``message`` field of the response.
    """
    with open(img, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    response = requests.post(
        "http://api.ttshitu.com/predict", json=data, timeout=timeout
    )
    result = json.loads(response.text)
    if result['success']:
        return result["data"]["result"]
    # NOTE: on failure (for example when no human workers are available) the
    # message is returned instead of a result; callers should check for this
    # and retry so the script neither hangs nor submits the message as a code.
    return result["message"]


if __name__ == "__main__":
    img_path = "./code.png"
    result = base64_api(uname='xxxxx', pwd='xxxxx', img=img_path, typeid=3)
    print(result)
import base64
import json
import requests
# 一、图片文字类型(默认 3 数英混合):# 1 : 纯数字# 1001:纯数字2# 2 : 纯英文# 1002:纯英文2# 3 : 数英混合# 1003:数英混合2# 4 : 闪动GIF# 7 : 无感学习(独家)# 11 : 计算题# 1005: 快速计算题# 16 : 汉字# 32 : 通用文字识别(证件、单据)# 66: 问答题# 49 :recaptcha图片识别# 二、图片旋转角度类型:# 29 : 旋转类型## 三、图片坐标点选类型:# 19 : 1个坐标# 20 : 3个坐标# 21 : 3 ~ 5个坐标# 22 : 5 ~ 8个坐标# 27 : 1 ~ 4个坐标# 48 : 轨迹类型## 四、缺口识别# 18 : 缺口识别(需要2张图 一张目标图一张缺口图)# 33 : 单缺口识别(返回X轴坐标 只需要1张图)# 五、拼图识别# 53:拼图识别def base64_api(uname, pwd, img, typeid):
with open(img, 'rb') as f:
base64_data = base64.b64encode(f.read())
b64 = base64_data.decode()
data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
if result['success']:
return result["data"]["result"]
else:
#!!!!!!!注意:返回 人工不足等 错误情况 请加逻辑处理防止脚本卡死 继续重新 识别 return result["message"]
return ""if __name__ == "__main__":
img_path = "C:/Users/Administrator/Desktop/file.jpg" result = base64_api(uname='你的账号', pwd='你的密码', img=img_path, typeid=3)
print(result)3,抓取多页数据
3-1 xpath紫砂之星抓取多页数据
import random
import time
import requests
from lxml import etree
for i in range(1, 11):
url = f'http://www.zishazx.com/product?page={i}&volum_start=0&volum_end=0&volum_id=0&price_start=0&price_end=0&category_id=0&pug_id=0&size_id=0&price_id=0&prize_id=0&search=&sflag=#views' print(f'正在抓取第{i}页数据======》')
# 给请求头 ua headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' }
# 发起get请求 res = requests.get(url, headers=headers)
# 获取响应的页面内容 data = res.content.decode()
# 实例化匹配对象 tree = etree.HTML(data)
# print(data) # 获取到了所有的li li_list = tree.xpath('//ul[@class="list clearfix"]/li')
for li in li_list:
# 获取到图片的src地址 img = li.xpath('./p[@class="img"]/a/img/@src')[0]
name = li.xpath('./p[@class="name"]/text()')[0]
p_no = li.xpath('./p[@class="p_no"]/text()')[0]
print(img, name, p_no)
time.sleep(random.randint(1,4))3-2 抓取多页数据案例
"""http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index.html 1http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index_1.html 2http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index_4.html 5"""import requests
import re
from lxml import etree
url = 'http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index.html'headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
res = requests.get(url, headers=headers)
html = res.content.decode()
# print(html)
# Read the total page count out of the inline JavaScript. The pattern is a
# raw string so that \d reaches the regex engine instead of being treated as
# an invalid string escape sequence.
page = re.search(r'var countPage = (?P<page>\d+)//共多少页', html).group('page')
for i in range(int(page)+1):
url = f'http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index_{i}.html' if not i:
url = f'http://www.cs.ecitic.com/newsite/cpzx/jrcpxxgs/zgcp/index.html' print(url)3-3 登录处理
import requests
index_url = 'https://v3pro.houjiemeishi.com/PC/index.html'headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
session = requests.Session()
session.get(index_url, headers=headers)
# 登录的url地址login_url = 'https://v3pro.houjiemeishi.com//index.php?store_id=1&store_type=6'# 登录携带的表单数据data = {
'module': 'app_pc',
'action': 'login',
'm': 'login',
'phone': 'luckyboy',
'password': 'bHVja3lib3k=',
'imgCode': 'flms',
'language': '',
}
headers['Referer'] = 'https://v3pro.houjiemeishi.com/PC/pages/login/login.html'res = session.post(login_url, data=data, headers=headers)
data = res.json()
print('登录后返回的', data)
access_id = data['data']['access_id']
# 抓取个人资料own = 'https://v3pro.houjiemeishi.com//index.php?store_id=1&store_type=6'data = {
'module': 'app_pc',
'action': 'user',
'm': 'personal_resources',
'access_id': access_id,
'language': '',
}
headers['Referer'] = 'https://v3pro.houjiemeishi.com/PC/index.html?module=my&action=my&m=my&a=myinfor'res = session.post(login_url, data=data, headers=headers)
print('获取个人资料的', res.json())
4,requests处理证书错误

12306ssl错误
import requests
url = "https://www.12306.cn/mormhweb/"
response = requests.get(url)
# 如果出现SSL证书验证错误,可以添加verify=False参数跳过证书验证:
# response = requests.get(url, verify=False)
5、超时参数的使用
response = requests.get(url,timeout=3)6,请求参数类型
1. Query String Parameters
# 最终是要被添加在url上面的. # 此时, 你可以有两个选择. 1. 把参数直接怼在url上面
url = "https://movie.douban.com/j/chart/top_list?type=13&interval_id=100%3A90&action=&start=0&limit=20"
requests.get(url)
2. 把参数弄出来. 弄成字典. 然后通过params传递给服务器
url = "https://movie.douban.com/j/chart/top_list"
params = {
"type": 13,
"interval_id": "100:90",
"action": "",
"start": 0,
"limit": 20,
}
requests.get(url, params=params)
3. 还可以混合搭配.
url = "https://movie.douban.com/j/chart/top_list?type=13"
params = {
"interval_id": "100:90",
"action": "",
"start": 0,
"limit": 20,
}
requests.get(url, params=params)
# 上述三种方案. 最终在服务器是一样的..原因是, 请求是要遵守http协议的# 在http协议中. get请求的所有参数都要怼在url上面 2. Form Data
# 首先Form Data一定是post请求 # 它的参数. 需要处理成字典. 然后再requests模块里. url = "xxxxx" data = {
数据
}
requests.post(url, data=data)
3. Request Payload 表示的就是挂载.
# 挂载的数据类型不一定是json. 最多的是json... # 在请求头里. 会有一个content-type属性. 来描述该挂载内容的格式 # 我们处理的时候. 一般是按照浏览器上面的提示. 来组装参数以及请求头 url = "xxxxx" data = {
字典
}
解决方案:
1. 直接用json参数传递
requests.post(url, json=data)
# 把你传递的字典. 处理成json然后发送给服务器 # 隐藏的逻辑: # 自动在请求头里帮你添加content-type: json.... # 上述逻辑是自动的 2. 手动把字典处理成json. 然后再请求头里增加content-type 把处理好的json用data传递出去
s = json.dumps(data, separators=(',', ':')) # json字符串 requests.post(url, data=s, headers={
"Content-Type": "application/json" })
# 上述逻辑是自己手动来... 4. 三种参数可以混搭....
# Query String 和 Form Data
# Query String 和 Request Payload
# 用上面的三种方案. 混合处理即可.
5. 请求头.
User-Agent: 表示的是你的网络设备...
Referer: 防盗链.
# a页面 发请求到b页面. 在b页面的请求头上面就会有referer, 是a页面的地址. Cookie:
# 客户端和服务器的会话状态. # 应该如何处理cookie 1. 服务器在返回的http数据包的时候. 在响应头上会有Set-Cookie
# Set-Cookie: xxxxx=xxxxx # 告诉浏览器. 把该信息保存下来.以后使用我的网站的时候. 带着该cookie信息 # 保持cookie状态: session #session表示会话. 你再session中保存的请求头信息. cookie信息 # 可以重复使用... # 可以帮我们保持住cookie的状态. # 它只能帮你保持住, 响应头回来的cookie信息. 2. 在js中可以动态的修改cookie
# 有些网站不是通过`响应头`加载的cookie..... 这时候session就不行了....该信息是`js动态`加载的 # 此时. 我们只能想办法手工去维护cookie的改变策略. 还是要用session.... # 综上. 我们发请求的时候. 一定要用session