首页 > 其他分享 > 爬取B站【原神】相关视频播放数前1000的数据并生成词云图

爬取B站【原神】相关视频播放数前1000的数据并生成词云图

2021-02-02 19:33:46 作者：互联网

老规矩，先上代码

# -*- coding: utf-8 -*
import pandas as pd
import matplotlib.pyplot as plt
import requests
import sys
import time
from bs4 import BeautifulSoup
import jieba
import wordcloud

def mihoyo(pages=50):
    """Scrape Bilibili search results for the keyword "原神" into output.xls.

    Requests the search result pages (ordered by play count, as selected by
    ``order=click`` in the URL), extracts title, upload time and view count
    of every listed video and writes all rows to ``output.xls``.

    Args:
        pages: Number of result pages to request, starting from page 1.
            Defaults to 50, the value that was previously hard-coded.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    target = 'https://search.bilibili.com/all?keyword=%E5%8E%9F%E7%A5%9E&order=click&duration=0&tids_1=0'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    result = []
    for page in range(1, pages + 1):
        mid_target = target + "&page={}".format(page)
        # The headers were built but never sent before; pass them along and
        # bound the wait so one stalled connection cannot hang the crawl.
        req = requests.get(url=mid_target, headers=headers, timeout=10)
        html = req.text
        html = html.replace('<br>', ' ').replace('<br/>', ' ').replace('/>', '>')
        bf = BeautifulSoup(html, "html.parser")
        texts = bf.find('ul', class_='video-list clearfix')  # result list
        if texts is None:
            # No result list on this page (e.g. empty or unexpected response).
            print("page {}: result list not found, skipped".format(page),
                  file=sys.stderr)
            continue
        for item in texts.find_all('li', class_='video-item matrix'):
            link = item.find('a')
            watch = item.find('span', class_='so-icon watch-num')
            uptime = item.find('span', class_='so-icon time')
            item_name = link.get('title') if link is not None else None
            if item_name is None or watch is None or uptime is None:
                # Skip entries that lack one of the expected tags.
                continue
            item_refer_watch_num = watch.text.replace(" ", "").replace("\n", "")
            item_refer_uptime = uptime.text.replace(" ", "").replace("\n", "")
            # Column order matters: info_of_b() reads the rows by position.
            result.append([item_name, item_refer_uptime, item_refer_watch_num])
    pd.DataFrame(result).to_excel("output.xls")

def info_of_b(month="2020-08", font_path="C:/Windows/Fonts/simfang.ttf"):
    """Show a word cloud built from the titles of videos uploaded in a month.

    Reads sheet ``Sheet1`` of ``output.xls`` (the file written by mihoyo()),
    keeps the rows whose upload-time string starts with ``month``, joins
    their titles, removes a few filler words, segments the text with jieba
    and displays the resulting word cloud with matplotlib.

    Args:
        month: Prefix the upload-time string has to start with. Defaults to
            "2020-08", the value that was previously hard-coded.
        font_path: Font file passed to WordCloud. Defaults to the Windows
            path that was previously hard-coded.

    Raises:
        ValueError: If no row matches ``month``.
    """
    excel = pd.read_excel(r'output.xls', sheet_name='Sheet1')
    countries = excel.columns.tolist()  # header row
    values = excel.values.tolist()
    print(countries)
    if values:  # an empty sheet has no first row to show
        print(values[0])

    # Row layout as written by mihoyo(): [saved index, title, upload time,
    # view count], hence title = item[1] and upload time = item[2].
    titles = []
    for item in values:
        uptime = item[2]
        # Empty cells are read back as NaN (a float) and cannot be sliced.
        if isinstance(uptime, str) and uptime.startswith(month):
            titles.append(str(item[1]))
            print(item[1])
    if not titles:
        raise ValueError("no video titles found for upload month {!r}".format(month))

    yun = "".join(titles)
    # Filler words removed in the original order; the former
    # replace("", "") calls were no-ops and are dropped.
    for word in ("你", "我", "的", "了", "吗", "个", "是", "吧", "这", "原神"):
        yun = yun.replace(word, "")

    cut_text = " ".join(jieba.cut(yun))
    wc = wordcloud.WordCloud(
        font_path=font_path,
        background_color="white",
        width=1000,
        height=880,
    ).generate(cut_text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Script entry: build the word cloud from output.xls.
# output.xls is produced by mihoyo(), so that function must be run once first.
info_of_b()

先运行mihoyo()函数爬取数据，并将结果写入output.xls文件中。

这里的网址是在B站搜索关键词（本例中为“原神”）后，从浏览器地址栏复制的url，可以看到关键词被URL编码（百分号编码）为