
Scraping Pear Video (pearvideo.com), following the 路飞学城 crawler tutorial on Bilibili (B站)


import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool
import random

if __name__ == '__main__':
    # create a local folder for the downloaded videos
    if not os.path.exists("./video"):
        os.mkdir("./video")

    url="https://www.pearvideo.com/category_59"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4455.2 Safari/537.36"
    }
    page_text=requests.get(url=url,headers=headers).text
    tree=etree.HTML(page_text)

    li_list=tree.xpath('//*[@id="listvideoList"]/ul/li')
    video_ajax = "https://www.pearvideo.com/videoStatus.jsp?"  # AJAX endpoint found with the browser's network / packet-capture tools
    urls = []  # holds the name and real URL of every video
    for li in li_list:
        video_id = li.xpath('./div/a/@href')[0]  # video id, e.g. video_1727785
        video_num = video_id.split('_')[1]  # numeric part of the id
        video_name=li.xpath("./div/a/div[2]/text()")[0]+'.mp4'
        params={
            'contId':video_num,
            'mrd': str(random.random())  # random number
        }
        video_headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4455.2 Safari/537.36",
            'Referer': 'https://www.pearvideo.com/' +video_id
        }
        video_dic=requests.get(url=video_ajax,headers=video_headers,params=params).json()
        # print(video_dic)  # with the Referer header set, the response no longer reports the video as taken down; it is a dict containing the video addresses

        video_url=video_dic["videoInfo"]["videos"]["srcUrl"]
        # The address returned by the AJAX call is obfuscated: one numeric segment has to be
        # replaced with "cont-" plus the video id to get the real address (see the standalone
        # sketch after the script).
        # real address: "https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4"
        # fake address: "https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4"
        re_list = re.split('[/-]', video_url)  # split on "/" and "-"
        re_str = re_list[6]  # the timestamp-like segment that must be swapped out
        video_true_url = video_url.replace(re_str, "cont-" + video_num)
        # print(video_true_url)
        dic={
            "name":video_name,
            "url":video_true_url
        }
        urls.append(dic)

    # use a thread pool to request the video data (multiprocessing.dummy.Pool is thread-based, which suits these I/O-bound downloads)
    def get_video_data(dic):
        url=dic["url"]
        name=dic["name"]
        print(name + " is downloading......")
        video_data = requests.get(url=url, headers=headers).content
        with open("./video/" + name, "wb") as fp:
            fp.write(video_data)
            print(name + " downloaded successfully!")

    pool=Pool(4)
    pool.map(get_video_data,urls)

    pool.close()
    pool.join()
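
The URL rewrite is the only non-obvious step, so here it is in isolation: a minimal sketch that only uses the sample "fake" and "real" addresses quoted in the comments above (the id 1708144 comes from that same example and is otherwise arbitrary).

import re

fake_url = "https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4"
real_url = "https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4"
video_num = "1708144"  # numeric part of the video id for this sample

segments = re.split('[/-]', fake_url)  # split on "/" and "-"
fake_stamp = segments[6]               # "1606132035863", the timestamp-like segment
rebuilt = fake_url.replace(fake_stamp, "cont-" + video_num)
assert rebuilt == real_url  # the rebuilt address matches the real one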

 

Source: https://www.cnblogs.com/zhouchengbin/p/14709472.html