某MP3爬虫

2022-02-28 19:34:13 作者：互联网

某MP3网站爬虫：先爬取每个作品的链接、标题、CV 和 RJ 号等元数据并保存为 Excel（sample.xlsx），供后面的下载脚本使用。

import requests
import re
from lxml import etree
from openpyxl import Workbook


"""
Collect each work's link, title, CV and RJ number.
"""

# Matches an RJ work code such as "RJ334558".
pattern = re.compile(r'RJ\d+')

# Request headers sent with every crawler request.
# The cookie was pasted as one string literal broken across two physical
# lines, which is a SyntaxError for a single-quoted string. It is rebuilt
# here from adjacent literals (one per cookie item) so the runtime value is a
# single line with items separated by "; ".
# NOTE(review): the "\/" sequences in __cf_bm look like JSON escaping left
# over from copying the cookie; they are written as "\\/" so the value keeps
# the literal backslash without an invalid-escape warning -- confirm whether
# the backslashes belong there.
# NOTE(review): this is a captured browser session cookie; it has most likely
# expired and should be replaced with a fresh one before running.
hd = {
    'cookie': (
        '_ga=GA1.2.1877631639.1626354328; '
        '_gid=GA1.2.556826390.1626510667; '
        'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
        'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
        '__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
    ),
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

def gethtml(url,title,cv,rj,href):
    """Scrape one listing page and append each work's metadata to the lists.

    Args:
        url: Listing page URL to fetch.
        title: List that receives each work's title as ``str``.
        cv: List that receives the text of the second-to-last ``<p style=...>``
            element of each entry.
        rj: List that receives the first ``RJ<digits>`` match found in the
            text of the last ``<p style=...>`` element of each entry.
        href: List that receives each work's link (the ``h2/a`` ``href``).

    The four lists are mutated in place and stay index-aligned: an entry is
    either appended to all of them or, when a required field is missing,
    skipped with a message instead of aborting the whole crawl.
    """
    r = requests.get(url, headers=hd, stream=True)
    print(r.status_code)
    html = etree.HTML(r.text)
    entries = html.xpath('//*[@id="site-main"]/div/div/div[1]/div/ul/li')
    # str + int raised TypeError in the original; format the count instead.
    print("当前页作品数有{}".format(len(entries)))
    for i, entry in enumerate(entries, start=1):
        # Query relative to the <li> itself rather than re-walking the whole
        # document with an li[i] index for every field.
        links = entry.xpath('./div/div/div/h2/a/@href')
        names = entry.xpath('./div/div/div/h2/a/text()')
        cv_rj = entry.xpath('./div/div/div//p[@style]')

        name = names[0] if names else None
        if len(cv_rj) == 3:
            # With three styled paragraphs the title is read from the <strong>
            # inside the third <p>. Take its first text node: the original
            # stored the whole result list, so "['...']" ended up as the title.
            strong_text = entry.xpath('./div/div/div/p[3]/strong/text()')
            if strong_text:
                name = strong_text[0]

        # [-2] and [-1] below need at least two styled paragraphs.
        rj_match = pattern.search(cv_rj[-1].text or '') if len(cv_rj) >= 2 else None
        if not links or name is None or rj_match is None:
            print("第{}个条目缺少必要字段，已跳过".format(i))
            continue

        link = links[0]
        cv_text = cv_rj[-2].text
        rj_code = rj_match[0]

        print("--------------------第{}个---------------------------".format(i))
        print(link)  # url
        print(name)  # title
        print(cv_text)  # CV
        print(rj_code)  # RJ

        # Store the record in the caller's lists.
        href.append(link)
        title.append(str(name))
        cv.append(cv_text)
        rj.append(rj_code)

def touchfile(title,cv,rj,href):
    """Write the collected records to ``sample.xlsx``.

    One row is written per element of ``href``; each row holds the values
    taken at the same index from ``href``, ``rj``, ``title`` and ``cv``, in
    that column order. No header row is written.
    """
    workbook = Workbook()
    sheet = workbook.active
    for idx, link in enumerate(href):
        row = [link, rj[idx], title[idx], cv[idx]]
        sheet.append(row)
    # Relative path: the file lands in the current working directory.
    workbook.save('sample.xlsx')
def main():
    """Crawl every listing page of the tag and save the results to Excel.

    Fetches the tag's first page to read the page count from the pagination
    links, calls ``gethtml`` for each page to fill the four result lists, then
    writes them out with ``touchfile``.
    """
    title = []
    cv = []
    rj = []
    href = []
    r = requests.get('https://xxxx.com/tag/xxxx/', headers=hd, stream=True)
    print(r.status_code)
    html = etree.HTML(r.text)
    num = html.xpath('//*[@id="site-main"]/div/div/div[1]/div/nav/div/a[2]/text()')
    # The original indexed num[0] unconditionally and raised IndexError when
    # the pagination link was absent.
    # NOTE(review): falling back to a single page assumes "no pagination link"
    # means the tag has only one page -- confirm against the site.
    total_pages = int(num[0]) if num else 1
    print("总页数：{}页".format(total_pages))

    for i in range(1, total_pages + 1):
        url = 'https://xxxx/tag/xxxx/page/{}/'.format(i)
        gethtml(url,title,cv,rj,href)
    print("开始保存excel文件")
    touchfile(title,cv,rj,href)
    print("文件保存成功-------------------完成")

main()

单个mp3下载脚本

import requests
 
# Request headers for the single-file download.
# The cookie was pasted as one string literal broken across two physical
# lines, which is a SyntaxError for a single-quoted string. It is rebuilt
# here from adjacent literals (one per cookie item) so the runtime value is a
# single line with items separated by "; ".
# NOTE(review): the "\/" sequences in __cf_bm look like JSON escaping left
# over from copying the cookie; they are written as "\\/" so the value keeps
# the literal backslash without an invalid-escape warning -- confirm whether
# the backslashes belong there.
# NOTE(review): this is a captured browser session cookie; it has most likely
# expired and should be replaced with a fresh one before running.
hd = {
    'sec-fetch-dest': 'audio',
    'cookie': (
        '_ga=GA1.2.1877631639.1626354328; '
        '_gid=GA1.2.556826390.1626510667; '
        'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
        'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
        '__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
    ),
    'Referer':'https://xxxx/51938/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
 
# Download one MP3 to a fixed local file name, streaming it in 1 MiB chunks.
print("开始下载")
url = 'https://xxxx/xx.mp3'
# Using the response as a context manager releases the streamed connection
# even if writing the file fails part-way.
with requests.get(url, headers=hd, stream=True) as r:
    # Stop on 4xx/5xx instead of saving an error page under an .mp3 name.
    r.raise_for_status()
    # Content-Length is optional (e.g. chunked responses); it arrives as a
    # numeric string when present.
    file_size_str = r.headers.get('Content-Length')
    if file_size_str is None:
        print('文件大小为：未知')
    else:
        file_size = int(file_size_str)/1024/1024  # bytes -> MiB
        print('文件大小为：'+str(file_size)+'M')
    with open('RJ334558.mp3', "wb") as mp3:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                mp3.write(chunk)
                print("正在下载")

print("下载结束")

改进后的批量下载脚本：从 sample.xlsx 读取每个作品的链接和 RJ 号，逐个下载对应的 mp3

import pandas as pd
import requests
import time
# Load the page links (column 0) and RJ codes (column 1) from sample.xlsx.
# header=None is required: the sheet written by touchfile has no header row,
# and read_excel's default (header=0) would consume the first record as
# column labels and silently drop it. `names=None` is just the default and
# does not disable the header. Both columns are read in a single pass.
sheet_df = pd.read_excel("sample.xlsx", usecols=[0, 1], header=None)
url = sheet_df.iloc[:, 0].tolist()
rj = sheet_df.iloc[:, 1].tolist()

# Request headers for the batch download; 'Referer' starts empty and is
# filled in per file by the download loop.
# The cookie was pasted as one string literal broken across two physical
# lines, which is a SyntaxError for a single-quoted string. It is rebuilt
# here from adjacent literals (one per cookie item) so the runtime value is a
# single line with items separated by "; ".
# NOTE(review): the "\/" sequences in __cf_bm look like JSON escaping left
# over from copying the cookie; they are written as "\\/" so the value keeps
# the literal backslash without an invalid-escape warning -- confirm whether
# the backslashes belong there.
# NOTE(review): this is a captured browser session cookie; it has most likely
# expired and should be replaced with a fresh one before running.
hd = {
    'sec-fetch-dest': 'audio',
    'cookie': (
        '_ga=GA1.2.1877631639.1626354328; '
        '_gid=GA1.2.556826390.1626510667; '
        'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
        'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
        '__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
    ),
    'Referer':'',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Index of the first record to download.
# NOTE(review): 68 was hard-coded in the loop header, presumably to resume an
# interrupted run -- set it to 0 to download every record.
START_INDEX = 68

# Download each record's MP3 as "<RJ code>.mp3", pausing between requests.
for i in range(START_INDEX, len(url)):
    rj_code = rj[i]
    page_url = url[i]
    print("{}开始下载".format(rj_code))
    manurl = 'https://xxxxxxx/f/{}.mp3'.format(rj_code)
    # Each file is requested with its own work page as the Referer.
    hd['Referer'] = page_url
    print(hd['Referer'])
    print("对应url：{}".format(page_url))
    # Using the response as a context manager releases the streamed
    # connection even if writing the file fails part-way.
    with requests.get(manurl, headers=hd, stream=True) as r:
        print("状态码：{}".format(r.status_code))
        if r.ok:
            # Content-Length is optional (e.g. chunked responses); it arrives
            # as a numeric string when present.
            file_size_str = r.headers.get('Content-Length')
            if file_size_str is None:
                print('文件大小为：未知')
            else:
                file_size = int(file_size_str)/1024/1024  # bytes -> MiB
                print('文件大小为：'+str(file_size)+'M')
            with open('{}.mp3'.format(rj_code), "wb") as mp3:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        mp3.write(chunk)
                        print("{}正在下载".format(rj_code))
            print("{}下载结束".format(rj_code))
        else:
            # Skip this record rather than saving an error page as an .mp3,
            # and keep going with the rest of the list.
            print("{}下载失败，已跳过".format(rj_code))
    # Pause between requests, whether or not this one succeeded.
    time.sleep(5)

标签：爬虫,MP3
来源： https://www.cnblogs.com/JKding233/p/15947116.html