首页 > 其他分享 > 【爬虫】爬取网易云热门歌曲歌曲信息-歌手、链接、歌手信息

【爬虫】爬取网易云热门歌曲歌曲信息-歌手、链接、歌手信息

2021-02-28 23:02:12 作者：互联网

# By Vax
# At time - 2020/12/27 21:59
# linked from

import requests,json
from lxml import etree

# Entry page of the crawl; parse() fetches it to discover the category links.
url = 'https://music.163.com/discover/artist'
# Module-level accumulator: parse_detail() appends every scraped singer record here.
singer_infos = []


# ---------------通过url获取该页面的内容，返回xpath对象
def get_xpath(url, timeout=10):
    """Download *url* and return its body parsed as an lxml HTML tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The object produced by ``etree.HTML`` for the response text; callers
        run ``.xpath(...)`` queries on it.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout elapses.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(response.text)


# -------------- After get_xpath fetches the page, collect the category links (华语, 华语男歌手, etc.)
def parse():
    """Crawl the artist entry page and descend into every category.

    Fetches the module-level ``url``, reads the href of each link in the
    ``nav f-cb`` list and hands every category URL to ``parse_fenlei``.
    """
    page = get_xpath(url)
    for href in page.xpath('//ul[@class="nav f-cb"]/li/a/@href'):
        # Only links whose href contains 'id' are followed; per the original
        # author's note this drops the "hot" and "recommended" columns.
        if 'id' not in href:
            continue
        parse_fenlei('https://music.163.com' + href)


# -------------通过传入的分类url，获取A,B，C页面内容
def parse_fenlei(url):
    """Visit one category page and crawl each of its initial-letter pages.

    Args:
        url: Absolute URL of a category page.
    """
    site_root = 'https://music.163.com'
    page = get_xpath(url)
    # position()>1 skips the first <li> of the initial-letter selector.
    letter_hrefs = page.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
    for href in letter_hrefs:
        parse_singer(site_root + href)


# ---------------------传入获得的字母链接，开始爬取歌手内容
def parse_singer(url):
    """Scrape every singer listed on one initial-letter page.

    For each singer a new record dict is built with the singer's name and
    music-page link, then passed to ``parse_detail`` (which adds the
    description, stores and writes it). The finished record is printed.

    Args:
        url: Absolute URL of an initial-letter listing page.
    """
    html = get_xpath(url)
    singer_names = html.xpath('//ul[@id="m-artist-box"]/li/p/a/text()')
    # Each entry can contain two <a> tags, so only the first one's href is taken.
    singer_hrefs = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href')
    # zip() stops at the shorter list, so a length mismatch between the two
    # queries can no longer raise IndexError.
    for name, href in zip(singer_names, singer_hrefs):
        # A fresh dict per singer: parse_detail() stores the dict it receives
        # in singer_infos, so reusing one dict would leave every stored entry
        # pointing at the same (last) singer's data.
        item = {
            '歌手名': name,
            '音乐链接': 'https://music.163.com' + href.strip(),
        }
        # The description page uses the same link with '/desc' inserted
        # before the query string.
        detail_url = item['音乐链接'].replace('?id', '/desc?id')
        parse_detail(detail_url, item)

        print(item)


# ---------获取详情页url和存着歌手名字和音乐列表的字典，在字典中添加详情页数据
def parse_detail(url, item):
    """Add the singer's description to *item*, then store and persist it.

    Args:
        url: Absolute URL of the singer's description page.
        item: Record dict for the singer; mutated in place by adding the
            '歌手信息' key.
    """
    page = get_xpath(url)
    # The description is kept as the list of text nodes found under the
    # paragraphs of the "n-artdesc" block.
    item['歌手信息'] = page.xpath('//div[@class="n-artdesc"]/p/text()')
    singer_infos.append(item)
    write_singer(item)


# ----------------将数据字典写入歌手文件
def write_singer(item):
    """Append one singer record to ``singer.json`` as a single JSON line.

    Each call writes one JSON object followed by a newline, so the file can
    be read back record by record. Previously the objects were written
    back-to-back with no separator and could not be parsed again.

    Args:
        item: JSON-serializable dict describing one singer.
    """
    with open('singer.json', 'a', encoding='utf-8') as file:
        # ensure_ascii=False keeps Chinese text readable in the UTF-8 file
        # instead of escaping every character as \uXXXX.
        json.dump(item, file, ensure_ascii=False)
        file.write('\n')


# Run the full crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    parse()

效果图：

标签：xpath,singer,url,list,歌手,爬取,item,html,歌曲
来源： https://blog.csdn.net/qq_41823684/article/details/114241838