爬取B站排行榜内容并生成词云图
作者:互联网
import requests
import linecache  # imported by the original script; not used below
import wordcloud
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Bilibili "popular / rank / all" page.
RANK_URL = 'https://www.bilibili.com/v/popular/rank/all'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
# A font with CJK glyphs must be supplied, otherwise the Chinese words are
# rendered as empty boxes. This is a typical Windows font location; change
# it to any other CJK-capable font file if needed.
FONT_PATH = "C:/Windows/Fonts/simfang.ttf"


def _fetch_rank_html(url=RANK_URL, timeout=10):
    """Download the ranking page and return its HTML with tags normalised.

    The User-Agent header is actually sent with the request (the original
    script built the header dict but never passed it to ``requests.get``).

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(
        url=url,
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    response.raise_for_status()
    # Turn line breaks into spaces and rewrite self-closing tags so that the
    # text of neighbouring elements does not run together after parsing.
    return (
        response.text
        .replace('<br>', ' ')
        .replace('<br/>', ' ')
        .replace('/>', '>')
    )


def _parse_rank_entries(html):
    """Extract ranking rows from *html*.

    Returns a list of ``[rank, title, video_id, stat1, stat2]`` lists, where
    ``rank`` is the 1-based position of the entry in the list, ``video_id``
    is the last path segment of the entry's link, and ``stat1`` / ``stat2``
    are the texts of the first two ``span.data-box`` elements (empty strings
    when they are missing).

    Raises:
        ValueError: if the page does not contain ``<ul class="rank-list">``.
    """
    soup = BeautifulSoup(html, "html.parser")
    rank_list = soup.find('ul', class_='rank-list')
    if rank_list is None:
        raise ValueError("rank list (<ul class='rank-list'>) not found in page")

    entries = []
    for rank, info in enumerate(rank_list.find_all('div', class_='info'), start=1):
        link = info.find('a')
        if link is None:
            # Malformed entry without a title link: nothing to report.
            continue
        title = link.text
        # Keep only what follows the last '/' of the link.
        video_id = link.get('href', '').rsplit('/', 1)[-1]
        stats = [span.text for span in info.find_all('span', class_='data-box')]
        stat1, stat2 = (stats + ["", ""])[:2]
        entries.append([rank, title, video_id, stat1, stat2])
    return entries


def _show_wordcloud(text, font_path=FONT_PATH):
    """Segment *text* with jieba and display the resulting word cloud."""
    # wordcloud cannot split Chinese text into words by itself, so segment
    # it with jieba first and join the tokens with spaces.
    cut_text = " ".join(jieba.cut(text))
    cloud = wordcloud.WordCloud(
        font_path=font_path,
        background_color="white",
        width=1000,
        height=880,
    ).generate(cut_text)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def main():
    """Print every ranking entry, then show a word cloud of all titles."""
    try:
        entries = _parse_rank_entries(_fetch_rank_html())
    except (requests.RequestException, ValueError) as err:
        raise SystemExit(f"Failed to load ranking: {err}")

    for entry in entries:
        print(entry)

    titles = "".join(entry[1] for entry in entries)
    if not titles.strip():
        # WordCloud.generate() raises on empty input; exit with a clear message.
        raise SystemExit("No titles found; nothing to draw.")
    _show_wordcloud(titles)


if __name__ == "__main__":
    main()
标签:item,text,texts,生成,排行榜,href,import,云图,find 来源: https://www.cnblogs.com/ljy1227476113/p/14155188.html