首页 > 其他分享 > selenium_斗鱼直播爬取

selenium_斗鱼直播爬取

2021-01-05 22:02:08 作者：互联网

斗鱼直播主播信息采集

from selenium import webdriver
import time
from lxml import etree
from excel_utils.excel_utils import write_to_excel,append_to_excel
import os


# Load the page in the browser and parse the rendered result
def get_page_content_by_selenium(url):
    """Open *url* in the shared browser and return its parsed HTML tree.

    Relies on the module-level ``driver``. After navigating, the function
    sleeps for a fixed two seconds (presumably to let the page's scripts
    finish rendering), maximizes the window, and hands the browser's current
    page source to ``etree.HTML``.

    Args:
        url: Address of the page to load.

    Returns:
        The value returned by ``etree.HTML`` for the rendered page source.
    """
    driver.get(url)
    time.sleep(2)
    driver.maximize_window()
    return etree.HTML(driver.page_source)


def main():
    """Crawl the Douyu LOL listing page by page and save every page to Excel.

    For each listing page the room title, streamer name and popularity text
    are extracted and written to ``anchor.xls`` in the working directory: the
    file is created with ``write_to_excel`` when it does not exist yet and
    extended with ``append_to_excel`` otherwise. Paging stops as soon as the
    last pager item no longer reports ``aria-disabled="false"``.

    Relies on the module-level ``driver`` created by the script entry point.
    """
    # Function-scope import so the block is self-contained; ``By`` ships with
    # the selenium package the file already depends on.
    from selenium.webdriver.common.by import By

    start_url = 'https://www.douyu.com/g_LOL'
    file_name = 'anchor.xls'
    list_xpath = '//section[@id="listAll"]//ul[@class="layout-Cover-list"]'
    next_btn_xpath = '//div[@class="ListFooter"]/ul/li[last()]'

    page_content = get_page_content_by_selenium(start_url)
    # ``find_element(By.XPATH, ...)`` replaces ``find_element_by_xpath``,
    # which newer Selenium releases no longer provide.
    print(driver.find_element(By.XPATH, next_btn_xpath).tag_name)

    n = 1
    while True:
        print(f'爬取第{n}页')
        titles = page_content.xpath(f'{list_xpath}//h3/text()')
        anchors = page_content.xpath(
            f'{list_xpath}//h2/div[@class="DyListCover-userName"]/text()'
        )
        hots = page_content.xpath(
            f'{list_xpath}//span[@class="DyListCover-hot"]/text()'
        )
        # zip() stops at the shortest list, so a card with a missing field no
        # longer aborts the whole crawl with an IndexError.
        # NOTE(review): the three lists are matched by position only; if a
        # card in the middle lacks a field, later rows shift - same as before.
        anchor_list = [
            {'title': title, 'anchor': name, 'focus': hot}
            for title, name, hot in zip(titles, anchors, hots)
        ]

        if not os.path.exists(file_name):
            write_to_excel(anchor_list, file_name)
        else:
            append_to_excel(anchor_list, file_name)

        # Look the button up again on every page: the pager may be re-rendered
        # after a click, which would leave a cached element reference stale.
        next_btn = driver.find_element(By.XPATH, next_btn_xpath)
        if next_btn.get_attribute('aria-disabled') != 'false':
            break
        next_btn.click()
        # Short fixed pause before re-reading the page source after the click.
        time.sleep(0.5)
        page_content = etree.HTML(driver.page_source)
        n += 1


if __name__ == '__main__':
    # ``driver`` is deliberately a module-level global: the functions above
    # read it directly instead of receiving it as a parameter.
    driver = webdriver.Chrome()
    try:
        main()
    finally:
        # Always close the browser and end the WebDriver session, even when
        # the crawl raises; otherwise the browser process is left running.
        driver.quit()

标签：selenium,content,name,excel,list,anchor,爬取,斗鱼,page
来源： https://www.cnblogs.com/childheart/p/14238274.html