爬虫进阶(五)——selenium
作者:互联网
selenium基本操作(需要提前下载浏览器driver.exe)
# Basic Selenium workflow: open JD.com, search "mac pro", scroll, dump the page.
# Fixed for Selenium 4: find_element_by_* and the executable_path kwarg were
# removed; use Service + find_element(By...) instead.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep

bro = webdriver.Chrome(service=Service('chromedriver.exe'))
bro.get('https://www.jd.com/')
sleep(1)

# Locate the search box by id and type the query.
search_input = bro.find_element(By.ID, 'key')
search_input.send_keys('mac pro')

# Click the search button.
btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)

# Run JS to scroll to the bottom so lazy-loaded results render.
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# page_source reflects the DOM after JS execution (unlike a raw HTTP GET).
page_text = bro.page_source
print(page_text)
sleep(2)
bro.quit()
动态加载数据爬取(个人认为极难爬取的网站)
https://www.aqistudy.cn/
# Scrape dynamically-loaded, paginated data: collect page_source for the first
# page plus three "next page" clicks, then parse each snapshot with lxml.
# Fixed for Selenium 4 (Service + By instead of the removed legacy API).
# NOTE(review): the '//ul[@id="gzlist"]/li' XPath and 'pageIto_next' id look
# like they belong to a different site than aqistudy.cn — verify against the
# actual target page before relying on this.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree

bro = webdriver.Chrome(service=Service('chromedriver.exe'))
bro.get('https://www.aqistudy.cn/')
sleep(1)

# Snapshot the first page, then click "next page" three times,
# snapshotting after each click.
page_text_list = [bro.page_source]
for _ in range(3):
    bro.find_element(By.ID, 'pageIto_next').click()  # click next page
    sleep(1)  # give the page time to load before grabbing the source
    page_text_list.append(bro.page_source)

# Parse every captured snapshot offline.
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="gzlist"]/li')
    for li in li_list:
        title = li.xpath('./dl/@title')[0]
        num = li.xpath('./ol/@title')[0]
        print(title + ':' + num)

sleep(2)
bro.quit()
selenium动作链+iframe标签获取
# ActionChains drag demo + iframe handling.
# Fixed for Selenium 4 (Service + By instead of the removed legacy API).
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep

bro = webdriver.Chrome(service=Service('chromedriver.exe'))
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The draggable element lives inside an <iframe>; find_element cannot see
# into it until we switch the driver's context into the frame.
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element(By.ID, 'draggable')

# Dragging = click-and-hold, then a series of small relative moves.
action = ActionChains(bro)
action.click_and_hold(div_tag)
for _ in range(5):
    # perform() flushes the queued actions immediately.
    action.move_by_offset(17, 2).perform()
    sleep(0.5)
action.release()

sleep(3)
bro.quit()
12306模拟登录,很早之前做的,现在不知道行不行,可以借鉴一下
"""12306 mock login (written long ago; the site has likely changed since).

Intended flow (the click-captcha part is kept below, commented out):
  1. screenshot the page and crop out the click-captcha image
  2. send the crop to the Chaojiying OCR service to get click coordinates
  3. click each coordinate relative to the captcha image's top-left corner
  4. fill in username/password and submit

Fixed for Selenium 4 (Service + By) and removed a duplicate ActionChains
import that appeared twice in the original.
"""
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
from PIL import Image  # used by the disabled captcha-cropping section below
from Cjy import Chaojiying_Client  # third-party captcha-recognition client

bro = webdriver.Chrome(service=Service('chromedriver.exe'))
# bro.get('https://kyfw.12306.cn/otn/login/init')  # old login URL
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
sleep(5)

# --- click-captcha handling (kept for reference, disabled) ---------------
# bro.save_screenshot('main.png')
#
# code_img_tag = bro.find_element(
#     By.XPATH, '//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
# location = code_img_tag.location
# size = code_img_tag.size
# # crop rectangle of the captcha inside the full-page screenshot
# rangle = (int(location['x']), int(location['y']),
#           int(location['x'] + size['width']),
#           int(location['y'] + size['height']))
#
# i = Image.open('./main.png')
# frame = i.crop(rangle)
# frame.save('code.png')
#
# def get_text(imgPath, imgType):
#     chaojiying = Chaojiying_Client('666', '666', '899370')
#     im = open(imgPath, 'rb').read()
#     return chaojiying.PostPic(im, imgType)['pic_str']
#
# # service returns '55,70|267,133' -> parse into [[55, 70], [267, 133]]
# result = get_text('./code.png', 9004)
# all_list = []
# if '|' in result:
#     list_1 = result.split('|')
#     count_1 = len(list_1)
#     for i in range(count_1):
#         xy_list = []
#         x = int(list_1[i].split(',')[0])
#         y = int(list_1[i].split(',')[1])
#         xy_list.append(x)
#         xy_list.append(y)
#         all_list.append(xy_list)
# else:
#     x = int(result.split(',')[0])
#     y = int(result.split(',')[1])
#     xy_list = []
#     xy_list.append(x)
#     xy_list.append(y)
#     all_list.append(xy_list)
# print(all_list)
#
# # click every coordinate, offset from the captcha image element
# action = ActionChains(bro)
# for a in all_list:
#     x = a[0]
#     y = a[1]
#     ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
#     sleep(1)
# --------------------------------------------------------------------------

bro.find_element(By.ID, 'username').send_keys('123456')
sleep(1)
bro.find_element(By.ID, 'password').send_keys('67890000000')
sleep(1)
bro.find_element(By.ID, 'loginSub').click()
sleep(5)
bro.quit()
selenium规避检测,无头浏览器
# --- Headless Chrome example (disabled) ----------------------------------
# from selenium import webdriver
# from time import sleep
# from selenium.webdriver.chrome.options import Options
#
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
#
# # Selenium 4: pass options via the 'options' kwarg
# # (the old 'chrome_options' kwarg was removed).
# driver = webdriver.Chrome(service=Service('chromedriver.exe'),
#                           options=chrome_options)
# driver.get('https://www.cnblogs.com/')
# print(driver.page_source)

# --- Evading Selenium detection ------------------------------------------
# Removed an unused 'headers' dict from the original: Selenium drives a real
# browser, so requests-style headers were dead code here.
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from time import sleep

option = ChromeOptions()
# Drop the "Chrome is being controlled by automated test software" switch,
# which some sites use to detect Selenium.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Spoof a regular desktop-browser user agent.
option.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"')

driver = webdriver.Chrome(service=Service('chromedriver.exe'), options=option)

# driver.get('https://www.taobao.com/')
url = 'https://www.aqistudy.cn/'
driver.get(url)
sleep(4)
page_text = driver.page_source
print(page_text)
sleep(2)
driver.quit()
示例:梨视频爬取
"""Download videos from a pearvideo.com category listing.

For each <li> in the listing: build the detail-page URL, fetch it, extract
the video URL, and save the stream as '<title>.mp4'.

Fixes over the original:
  * '//video/@src' was indexed with [0] unconditionally — pearvideo injects
    the <video> tag with JavaScript, so the raw HTML often lacks it and the
    script crashed with IndexError. Now guarded, with the original regex
    approach (srcUrl in the inline JS) kept as a fallback.
  * The inner 'tree' no longer shadows the listing-page 'tree'.
  * Removed a stray debug print of the full detail-page HTML.
"""
import re

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    print(detail_url)

    detail_page_text = requests.get(detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_page_text)

    # Prefer a literal <video src=...>; fall back to the srcUrl value
    # embedded in the page's inline JavaScript.
    src_list = detail_tree.xpath('//video/@src')
    if src_list:
        video_url = src_list[0]
    else:
        matches = re.findall(r'srcUrl="(.*?)",vdoUrl', detail_page_text, re.S)
        if not matches:
            print('skip %s: no video url found' % title)
            continue
        video_url = matches[0]

    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)
标签:进阶,selenium,list,爬虫,bro,sleep,text,import,page 来源: https://www.cnblogs.com/zzj666/p/14747529.html