day28-常见反爬机制及应对方法
作者:互联网
day28
总结
-
事件循环:死循环,将函数(协程对象)挂载到事件循环上 - asyncio.get_event_loop() 协程对象---> 基于生成器 / 原生协程对象 - loop.run_until_complete(...) Task / Future ---> 对协程对象进行了封装 - [co1, co2, co3, ...] ---> asyncio.wait([...]) 阻塞:绝大多数都是由IO操作(网络IO,文件IO、用户输入等)造成的 非阻塞: 同步:同步是有序的, 异步:无序的 ---> 效率提升 ---> I/O密集型任务 异步爬虫 - aiohttp - httpx 两个关键字 - async ---> @asyncio.coroutine - await 协程:一个线程中的多个子程序,它是用户态的微线程 ---> 纤程 常见反爬机制及其应对: - 封禁或限制IP ---> 商业IP代理 ---> 蘑菇代理 / 芝麻代理 / 快代理 ----> 自行实现对IP代理的封装(IP代理池: 获取代理、判断代理、判断失效、更换代理) - 强制登录 ---> 如果用户登录成功,服务器通常会在浏览器的存储中放置用户身份信息 - Cookie - Local Storage / Session Storage - 嵌入式数据库 ----> 每次请求(请求头/消息体) 带上一个身份标识 ---> Cookies池 ----> 每次请求(请求头/消息体) 带上一个身份标识 - 限流(限制访问速率) - 验证码 ---> 文字验证码 ---> OCR(光学文字识别) ---> 接口 / easyocr 程序中自己解决不了的问题就可以考虑使用三方接口(付费/免费) ---> 行为验证码 ---> 超级鹰 - 手机号+短信验证码 - 超级鹰 - 动态内容 ---> JavaScript逆向 ---> 找到提供数据的API接口 ---> 手机抓接口 ---> 抓包工具(Charles / Fiddler) ---> Selenium直接模拟浏览器操作获取动态内容 - find_element_by_xxx / - page_source ---> 获取包含动态内容的网页源代码 ---> JavaScript 混淆技术 - 字体反爬 / 内容来自于抠图
-
天行数据接口测试
-
pillow图片处理
def test_pil():
    """Demonstrate three basic Pillow operations on a sample image.

    Opens ``./八重樱.jpg`` and displays, in order: a cropped region, a
    contour-filtered copy, and the image after it has been shrunk in place
    with ``thumbnail``. Takes no arguments and returns nothing; the only
    effects are the viewer windows opened by ``show()``.
    """
    # Imported locally so this snippet is runnable on its own; the original
    # used Image / ImageFilter without importing them.
    from PIL import Image, ImageFilter

    # The context manager releases the underlying file handle when done.
    with Image.open(r'./八重樱.jpg') as guido:
        # Crop: the tuple is (left, upper, right, lower).
        guido.crop((10, 30, 40, 50)).show()
        # Filter: crop() and filter() return new images, the source is untouched.
        guido.filter(ImageFilter.CONTOUR).show()
        # Thumbnail: modifies the image in place, so it has to come last.
        guido.thumbnail((100, 80))
        guido.show()
-
超级鹰平台验证码使用
from hashlib import md5

import requests


class ChaojiyingClient:
    """Minimal client for the Chaojiying captcha-solving HTTP API.

    The constructor stores the account name, the MD5 hex digest of the
    password and the software id; these are sent as form fields with every
    request.
    """

    def __init__(self, username, password, soft_id, timeout=10):
        """Prepare the form fields and headers shared by all requests.

        Args:
            username: Chaojiying account name.
            password: Plain-text password; only its MD5 hex digest is kept.
            soft_id: Software id registered in the Chaojiying user centre.
            timeout: Seconds to wait for each HTTP request. Without a
                timeout ``requests`` waits indefinitely on a stalled server.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def post_pic(self, image_date, code_type):
        """Upload a captcha image and return the decoded JSON response.

        Args:
            image_date: Raw image bytes. (The parameter name is a historical
                misspelling of ``image_data``; it is kept so keyword callers
                keep working.)
            code_type: Captcha type code, see
                http://www.chaojiying.com/price.html

        Returns:
            The JSON body of the response, parsed by ``requests``.
        """
        params = {
            'codetype': code_type,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', image_date)}
        resp = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                             data=params, files=files, headers=self.headers,
                             timeout=self.timeout)
        return resp.json()

    def report_error(self, image_id):
        """Report a wrongly solved captcha and return the decoded JSON response.

        Args:
            image_id: Id of the picture whose answer was wrong.

        Returns:
            The JSON body of the response, parsed by ``requests``.
        """
        params = {
            'id': image_id,
        }
        params.update(self.base_params)
        resp = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                             data=params, headers=self.headers,
                             timeout=self.timeout)
        return resp.json()


if __name__ == '__main__':
    # NOTE(review): credentials are hard-coded in the original snippet; they
    # should come from configuration or the environment in real use.
    # The third argument is the software id generated under
    # "User centre >> Software ID".
    chaojiying = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
    print(chaojiying.report_error("9143209112576300341"))
    # Example of solving a local image instead (1902 is a captcha type code
    # from the official price list):
    # with open('files/result.png', 'rb') as f:
    #     image_data = f.read()
    # print(chaojiying.post_pic(image_data, 1902))
-
实现打码平台自动登录
import io

# Kept from the original, still disabled: a workaround that turns off HTTPS
# certificate verification process-wide.
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
import easyocr
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# The cropped captcha is written to this path and read back by the OCR step.
# The original saved to 'files/result.png' but read 'result.png', so OCR never
# saw the freshly captured image.
CAPTCHA_PATH = 'files/result.png'

browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')

# Explicit wait: block (up to 10 s) until the outer login iframe exists.
wait = WebDriverWait(browser, 10)
wait.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, '.login_panel_iframe')))
# Implicit-wait alternative (every lookup waits up to 10 s):
# browser.implicitly_wait(10)

# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
# which no longer exists in Selenium 4.
iframe1 = browser.find_element(By.CSS_SELECTOR, '.login_panel_iframe')
x1, y1 = iframe1.location['x'], iframe1.location['y']
# switch_to.frame moves the driver's context from the page into the iframe.
browser.switch_to.frame(iframe1)

iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)

# Simulate the user typing the credentials.
# NOTE(review): credentials are hard-coded in the original snippet.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('111hao@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('Abc123!!')

# Wait until the captcha image is clickable before measuring it.
wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico')))
captcha_img = browser.find_element(By.CSS_SELECTOR, '#login_checkcode_ico')
# size holds the element's width/height, location its x/y position.
size, location = captcha_img.size, captcha_img.location
print(size, location)
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']

# Screenshot the whole browser window as PNG bytes and open it with Pillow.
# io.BytesIO wraps the bytes in the file-like object Image.open needs.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))

# Crop the captcha out of the screenshot. The two iframe offsets are added to
# the element's own offset to get its position in the window.
x, y = x1 + x2 + x3, y1 + y2 + y3
# Windows variant; if the crop is off, hard-code the coordinates instead.
# print(x, y, width, height)
checkcode_image = browser_image.crop((x, y, x + width, y + height))
# macOS variant from the original (all coordinates doubled):
# checkcode_image = browser_image.crop((x * 2, y * 2, (x + width) * 2, (y + height) * 2))
checkcode_image.save(CAPTCHA_PATH)

# Optical character recognition with easyocr.
reader = easyocr.Reader(['en'], gpu=False)
ocr_results = reader.readtext(CAPTCHA_PATH, detail=0)
if not ocr_results:
    # The original indexed [0] unconditionally and died with a bare IndexError.
    raise RuntimeError(f'OCR found no text in {CAPTCHA_PATH}')
code = ocr_results[0]

# Type the recognised captcha and submit the form.
checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(code)
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()
-
ocr练习
import easyocr

# OCR exercise: build an English reader with the GPU disabled, run it on a
# sample image and print the result returned for detail=0.
reader = easyocr.Reader(['en'], gpu=False)
res = reader.readtext('files/test.jpg', detail=0)
print(res)
作业
-
使用超级鹰实现自动登录
import asyncio
import io
import time

import selenium
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from chaojiying import ChaojiyingClient
import getcode

urls = ['https://ke.qq.com/',
        'https://www.nowcoder.com/login',
        'http://tljh5.h5.wan.360.cn/game',
        'https://passport.bilibili.com/login',
        'https://passport.vip.com/login',
        'https://www.iqiyi.com/',
        'https://passport.liepin.com/account/v1/hlogin',
        'https://mail.163.com/',
        'https://pan.baidu.com/',
        'http://www.kuwo.cn/']

# The captcha crop is shrunk by this factor before upload, and the x offset
# returned by the solver is multiplied by the same factor to undo it.
CAPTCHA_SCALE = 1.5


def tenxunketang(brower):
    """Fill in and submit the QQ password-login form on ke.qq.com.

    Args:
        brower: A Selenium WebDriver already showing the ke.qq.com page.
            (The name is a misspelling of "browser", kept for callers.)
    """
    # NOTE(review): credentials are hard-coded in the original snippet.
    brower.implicitly_wait(10)
    brower.find_element(By.CSS_SELECTOR, ".mod-header__link-login.js-login-op").click()

    iframe_selector = "body div.login-wrapper.login-wrapper--qq > div > div > iframe"
    wait = WebDriverWait(brower, 10)
    wait.until(expected_conditions.presence_of_element_located(
        (By.CSS_SELECTOR, iframe_selector)))
    iframe_login = brower.find_element(By.CSS_SELECTOR, iframe_selector)
    brower.switch_to.frame(iframe_login)

    # Switch from QR-code login to account/password login.
    brower.find_element(By.CSS_SELECTOR, "#switcher_plogin").click()
    wait.until(expected_conditions.presence_of_element_located(
        (By.CSS_SELECTOR, "#u")))

    brower.find_element(By.CSS_SELECTOR, "#u").send_keys("2209229157")
    brower.find_element(By.CSS_SELECTOR, "#p").send_keys("lei.15183394253")
    brower.find_element(By.CSS_SELECTOR, "#login_button").click()


async def main(url):
    """Log in to the page at *url* via SMS code, solving the slider captcha.

    Steps: switch to SMS login, enter the phone number, request a code,
    solve the slider captcha through ChaojiyingClient (retrying while the
    failure popup is present), then poll ``getcode.main()`` for the SMS text
    and submit it.

    Args:
        url: Login page to open. The selectors used below match the
            nowcoder.com login page (``urls[1]``).
    """
    # Built once instead of on every captcha retry.
    # NOTE(review): credentials are hard-coded in the original snippet.
    client = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')

    with webdriver.Chrome() as brower:
        brower.set_window_size(1280, 720)
        brower.get(url)
        brower.implicitly_wait(10)

        duanxin_login = brower.find_element(
            By.CSS_SELECTOR, "body form.js-login-form > div.login-tips > span > a")
        duanxin_login.click()
        await asyncio.sleep(1)

        phone_input = brower.find_element(By.CSS_SELECTOR, "#jsEmailIpt")
        phone_input.send_keys("16521686439")
        get_code_btn = brower.find_element(By.CSS_SELECTOR, "#jsSendCaptcha")
        get_code_btn.click()

        while True:
            wait = WebDriverWait(brower, 10)
            wait.until(expected_conditions.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.yidun_panel > div > div.yidun_bgimg > img[src]')))

            # Locate the captcha background image and crop it from a screenshot.
            image_ele = brower.find_element(
                By.CSS_SELECTOR, "div.yidun_panel > div > div.yidun_bgimg > img")
            size, location = image_ele.size, image_ele.location
            x, y, width, height = location['x'], location['y'], size['width'], size['height']
            image_data = brower.get_screenshot_as_png()
            brower_image = Image.open(io.BytesIO(image_data))
            brower_image = brower_image.crop((x, y, x + width, y + height))
            image_w, image_h = brower_image.size
            # `//` with a float yields a float; convert so Pillow gets ints.
            brower_image.thumbnail((int(image_w // CAPTCHA_SCALE),
                                    int(image_h // CAPTCHA_SCALE)))
            brower_image.save(r'./files/temp.png')

            with open(r'./files/temp.png', 'rb') as f:
                resp = client.post_pic(f.read(), 9101)
            print(resp)
            # pic_str is parsed as comma-separated values; only the first
            # one is used, as the horizontal drag distance.
            code = resp["pic_str"]

            slider = brower.find_element(
                By.CSS_SELECTOR, "div.yidun_control > div.yidun_slider")
            ac = ActionChains(brower)
            # Scale the offset back up to screen pixels and round it, so the
            # drag offset is an integer.
            move_x = round(int(code.split(',')[0]) * CAPTCHA_SCALE)
            ac.click_and_hold(slider).perform()
            ac.move_by_offset(move_x, 0)
            await asyncio.sleep(1)
            ac.release().perform()
            await asyncio.sleep(3)

            try:
                # This popup being present means the slider was rejected.
                brower.find_element(By.CSS_SELECTOR, ".layer-container-content .pop-content")
                # A wrong answer could be reported here:
                # print(client.report_error(resp["pic_id"]))
            except NoSuchElementException:
                # No failure popup: captcha accepted, stop retrying.
                break
            # NOTE(review): nesting reconstructed from a collapsed source line;
            # this pause is assumed to sit inside the retry loop.
            await asyncio.sleep(2)

        try:
            brower.find_element(By.CSS_SELECTOR, ".confirm-content")
            suc_btn = brower.find_element(By.CSS_SELECTOR, "div.pop-footer.clearfix > a")
        except NoSuchElementException:
            # No confirmation dialog: poll for the SMS code for up to ~60 s.
            # NOTE(review): getcode.main() is defined elsewhere; it is used
            # here as returning a (title, text) pair.
            for _ in range(60):
                title, pwd_text = await getcode.main()
                print(title if title else "None-Title", pwd_text if pwd_text else "None-Pwd")
                if title == "牛客网":
                    break
                await asyncio.sleep(1)
            # As in the original, the last text seen is submitted even if no
            # message with the expected title arrived.
            pwd = brower.find_element(By.CSS_SELECTOR, "#jsCaptcha")
            pwd.send_keys(pwd_text)
            login_btn = brower.find_element(By.CSS_SELECTOR, "#jsLoginBtn")
            login_btn.click()
        else:
            # Dialog present: dismiss it and report failure.
            suc_btn.click()
            print('失败')

        # Keep the browser open for inspection before the context manager
        # closes it. NOTE(review): placement inferred from collapsed source.
        await asyncio.sleep(300)


if __name__ == '__main__':
    # asyncio.run creates and closes the event loop itself.
    asyncio.run(main(urls[1]))
标签:day28,反爬,image,常见,element,---,brower,login,find 来源: https://blog.csdn.net/qq_46137199/article/details/117672410