简单入门-美团(猫眼电影 maoyan.com)主页词云
作者:互联网
# Build a word cloud from the text of the Maoyan (https://maoyan.com) home page.
#
# Steps:
#   1. Download the page HTML with requests and save it to source.txt.
#   2. Extract the page text with BeautifulSoup and save it to webContent.txt.
#   3. Remove a fixed list of noise words, render the remaining text as a
#      word cloud, display it with matplotlib and save it to
#      webToWordCloud.png.
#
# All text files are written and read as UTF-8 so the script behaves the same
# regardless of the platform's default encoding.
import requests as rs
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# 1. Fetch the raw page
# ---------------------------------------------------------------------------
# Send a browser-like User-Agent so the request passes the site's basic
# anti-crawler check.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
url = 'https://maoyan.com'
# A timeout keeps the script from hanging forever if the server never answers.
resp = rs.get(url, headers=headers, timeout=10)
print(resp.status_code)
print(type(resp))
print('----------------')

# Force UTF-8 decoding of the response body to avoid mojibake.
resp.encoding = 'utf-8'

# Save the page source; the context manager guarantees the file is closed.
webText = resp.text
with open("source.txt", "w", encoding="utf-8") as source_file:
    source_file.write(webText)
print('----souce download------------')

# ---------------------------------------------------------------------------
# 2. Parse the HTML with BeautifulSoup
# ---------------------------------------------------------------------------
# 'html.parser' is the HTML parser bundled with Python.
soup = BeautifulSoup(resp.text, 'html.parser')

# Keep only the text content of the page and save it.
webContent = soup.text
with open("webContent.txt", "w", encoding="utf-8") as content_file:
    content_file.write(webContent)
print('----written---------')

# ---------------------------------------------------------------------------
# 3. Generate the word cloud and visualise it
# ---------------------------------------------------------------------------
# Substrings stripped from the text before building the cloud. They are
# removed one after another in exactly this order, so longer phrases
# ("想看", "预告片") are listed before the single characters they contain.
NOISE_WORDS = (
    "票", "购", "想看", "人", "分", "预告片", "想",
    "预", "万", "售", "上映", "猫眼电影", "maoyan",
)

# Read the extracted text back and drop the noise words.
with open("webContent.txt", encoding="utf-8") as content_file:
    text = content_file.read()
for noise_word in NOISE_WORDS:
    text = text.replace(noise_word, "")
print(text)

# Font file used to render the words; it must contain CJK glyphs.
# NOTE(review): this absolute path is machine-specific - confirm it exists.
font = r'/simhei.ttf'

# Build the word cloud image from the cleaned text.
wc = WordCloud(font_path=font, width=1400, height=1400, margin=2).generate(text)

# Hand the image to matplotlib and open the display window.
# NOTE(review): with a non-interactive backend configuration plt.show()
# usually blocks until the window is closed, so the PNG below is written
# afterwards - confirm this ordering is intended.
plt.imshow(wc)
plt.show()

# Save the word cloud as an image file.
wc.to_file('webToWordCloud.png')
print('----pic saved---------')
标签:主页,text,美团,replace,WordCloud,词云,print,resp 来源: https://www.cnblogs.com/zx3707/p/15240113.html