
Web scraper for a certain course-group site (2)


This installment adds a loop that answers the quiz multiple times, plus handling for the popup that appears when the network connection is poor.

from selenium import webdriver
from lxml import etree
import requests
import time
import random
import json

from selenium.webdriver.remote.webelement import WebElement

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56 "
}
browser = webdriver.Edge(executable_path = "MicrosoftWebDriver.exe")
# Have the browser request the specified URL
browser.get("https://www.yooc.me/login")
# Locate the login form elements
account_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[2]/input')
account_input.send_keys('your account')
password_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[3]/input')
password_input.send_keys('your password')
# Use page_source to get the HTML source of the current page
response = browser.page_source
tree = etree.HTML(response)
code_url = tree.xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[4]/img/@src')[0]
text_response = requests.get(url = code_url, headers = headers).content
with open("./code_text.jpg", "wb") as fp:
    fp.write(text_response)
    code_text = input("Check the captcha image and enter it within 30 seconds: ")

code_text_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[4]/input[1]')
# Interact with the element: type in the captcha text
code_text_input.send_keys(code_text)

login = browser.find_element_by_id('submit')
login.click()
time.sleep(5)

topic_url = browser.find_element_by_xpath('/html/body/div[2]/div[2]/table/tbody/tr/td/div[1]/div[3]/div[1]/a[2]')
topic_url.click()
time.sleep(2)

# The topic link opens a new tab, so switch the driver to it
handles = browser.window_handles
browser.switch_to.window(handles[1])
exam_url = browser.find_element_by_xpath('/html/body/section/section/div[1]/div[4]/a')
exam_url.click()
time.sleep(2)

for every in range(100):  # repeat the take-exam / record-answers cycle up to 100 times
    exam_detail = browser.find_element_by_xpath('/html/body/section/section/div[2]/div[3]/ul/li[1]/div[2]/a[2]')
    exam_detail.click()
    time.sleep(2)
    confirm_btn = browser.find_element_by_xpath('/html/body/div[12]/div[3]/div/div[1]')
    confirm_btn.click()
    time.sleep(5)
    while True:
        try:  # if the "network problem" popup appears, close it and retry until it succeeds
            network_anomaly = browser.find_element_by_xpath('/html/body/div[12]/div[3]/div/div/button')
            network_anomaly.click()
        except:
            break  # no popup found, so the confirm click went through
        else:
            confirm_btn.click()

    bodylist = browser.find_elements_by_xpath('/html/body/section/section/div[5]/div[@class="question-board"]')
    print(bodylist)
    browser.maximize_window()

    for each in bodylist:
        print(each.text)
        templist = each.find_elements_by_tag_name('label')
        islist = random.choice(templist)
        while True:
            try:  # if the "network problem" popup appears, close it and retry until it succeeds
                network_anomaly = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div/button')
                network_anomaly.click()
            except:
                break  # no popup found
            else:
                islist.click()
                time.sleep(0.5)
        islist.click()
        time.sleep(0.5)
        print("选项已勾选!")

    print("正在交卷!")
    submit = browser.find_element_by_xpath('/html/body/section/aside/div[2]/div[2]/div[3]/a[2]')
    submit.click()
    time.sleep(2)

    confirm_again = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]')
    confirm_again.click()
    time.sleep(2)

    check_detail = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]')
    check_detail.click()
    time.sleep(2)

    print("正在读取题目!")
    exam_answers = browser.page_source
    exam_answers_tree = etree.HTML(exam_answers)
    answers_list = exam_answers_tree.xpath('/html/body/section/section/div[3]/div[@class="question-board"]')
    questions = {}
    for answers in answers_list:
        if answers.xpath('./@id')[0] not in list(questions.keys()):
            questions[answers.xpath('./@id')[0]] = answers.xpath('.//text()')
            print(answers, "read successfully!")

    again_btn = browser.find_element_by_xpath('/html/body/section/div/a')
    again_btn.click()

    # 2. Dump the Python dict for this attempt to a JSON file
    # 2.1 Open a file object for appending; UTF-8 so the non-ASCII text survives ensure_ascii=False
    with open("./yiban_questionbank.json", "a+", encoding = "utf-8") as fp:
        # 2.2 json.dump converts the Python types to their JSON equivalents
        json.dump(questions, fp, ensure_ascii = False)
        fp.write("\n")  # one JSON object per line, so repeated attempts stay separable
        print("Saved successfully!")

browser.quit()
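
The popup handling above repeats the same try/except loop in two places with hard-coded XPaths. A tidier variant is to pull it into a small helper that keeps dismissing the popup and, if needed, repeats the click the popup interrupted. The sketch below is only an illustration built on the same Selenium 3 API as the script; the XPath, the retry limit and the helper name are my own assumptions, not part of the original code.

from selenium.common.exceptions import NoSuchElementException
import time

def dismiss_network_popup(browser, popup_xpath, retry_click = None, max_tries = 10):
    # Close the "network problem" popup if it shows up, optionally repeating the blocked click
    for _ in range(max_tries):
        try:
            browser.find_element_by_xpath(popup_xpath).click()  # popup present: close it
        except NoSuchElementException:
            return  # no popup, nothing left to do
        if retry_click is not None:
            retry_click.click()  # repeat the click the popup interrupted
        time.sleep(0.5)

# usage inside the exam loop, with the popup XPath used above (illustrative):
# dismiss_network_popup(browser, '/html/body/div[12]/div[3]/div/div/button', retry_click = confirm_btn)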

 

Next I'm considering adding third-party captcha recognition to make the whole thing fully automatic. If you have any suggestions, please share them!
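
For the captcha step, a local recognition library is one option. As an illustration only (the library choice is my assumption, not something the post uses), ddddocr takes the raw image bytes and returns its best guess at the text, which could replace the manual input() step since the script already holds the captcha bytes in text_response:

import ddddocr  # pip install ddddocr; assumed third-party OCR library

def recognize_captcha(image_bytes):
    # Return the text ddddocr predicts for the captcha image
    ocr = ddddocr.DdddOcr()
    return ocr.classification(image_bytes)

# e.g. code_text = recognize_captcha(text_response)

Recognition is not perfect, so a retry path for failed logins would still be worth keeping.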

Source: https://blog.csdn.net/weixin_57258167/article/details/117456549