其他分享
首页 > 其他分享> > 爬取大学排名 用pyecharts进行可视化

爬取大学排名 用pyecharts进行可视化

作者:互联网

先找到对应的全部list

需要先安装requests,lxml

可直接用 pip install requests 和 pip install lxml 这两条命令分别安装

导入需要的相关包

import requests

from lxml import etree

import time

import random

import csv

 

# Send a browser-like User-Agent so the site's anti-crawler check does not
# reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

url = 'http://college.gaokao.com/schlist/p'

# Without a timeout a stalled connection would block the script forever.
response = requests.get(url, headers=headers, timeout=10)

# Pause a random 0-2 seconds after the request; also an anti-crawler measure.
time.sleep(random.randint(0, 2))

再调用 lxml 获取到整页的学校名称

# Parse the downloaded page text into an lxml HTML tree.
selector = etree.HTML(response.text)

# Select every <dl> that is a direct child of an element whose class
# attribute starts with "scores_List".
all_list = selector.xpath('//*[starts-with(@class,"scores_List")]/dl') # all schools on the page, i.e. every dl entry

调用 for 循环获取dl中所有需要的数据

# Pull the wanted fields out of each school's <dl> entry.
for sel in all_list:
    name = sel.xpath('dt/strong/a/text()')[0]  # school name

    # The [6:] / [5:] slices drop the leading characters of each <li> text
    # (presumably the field label such as "高校所在地：" -- confirm against
    # the live page).
    place = sel.xpath('dd/ul/li[1]/text()')[0][6:]  # location
    # Named school_type rather than `type` so the builtin is not shadowed.
    school_type = sel.xpath('dd/ul/li[3]/text()')[0][5:]  # school type
    nature = sel.xpath('dd/ul/li[5]/text()')[0][5:]  # school nature

    # Some schools have no "special features" entry; the xpath then returns
    # an empty list and indexing it raises IndexError. Only that error is
    # caught, so unrelated failures are no longer silently swallowed.
    try:
        tese = sel.xpath('dd/ul/li[2]/span/text()')[0]  # special features
    except IndexError:
        tese = ''  # missing value: fall back to an empty string

    lishu = sel.xpath('dd/ul/li[4]/text()')[0][5:]  # affiliation

最后将爬取的数据保存(保存成CSV文件格式)

    # Append one row to school.csv. The file is opened in append mode with
    # GBK encoding; newline='' leaves line endings to the csv module.
    # NOTE(review): `item` is not defined in this fragment -- presumably the
    # list of fields for one school; the complete listing further down wraps
    # these same lines in csv_writer(item).
    with open('school.csv','a',encoding='gbk',newline='')as file:

        writer = csv.writer(file)

        try:

            writer.writerow(item)

        except Exception as e:

            # Best effort: report the failure and carry on instead of
            # aborting.
            print(e)

            

最后用函数将以上各部分代码串接起来
附上完整代码

import requests

from lxml import etree

import time

import random

import csv

 

def csv_writer(item):
    """Append ``item`` as one row to ``school.csv`` in the working directory.

    The file is opened in append mode with GBK encoding. Any exception raised
    while writing the row is printed rather than propagated, so one bad row
    does not stop the caller.

    Args:
        item: Sequence of field values forming a single CSV row.
    """
    with open('school.csv', 'a', encoding='gbk', newline='') as handle:
        write_row = csv.writer(handle).writerow
        try:
            write_row(item)
        except Exception as err:
            print(err)

def spider(url_, timeout=10):
    """Download a page and return it parsed as an lxml HTML tree.

    Args:
        url_: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The result of ``etree.HTML`` applied to the response text.
    """
    # Random 0-2 second pause before every request to throttle the crawl.
    time.sleep(random.randint(0, 2))

    # `headers` is the module-level User-Agent dict defined by the script.
    res = requests.get(url_, headers=headers, timeout=timeout)
    return etree.HTML(res.text)

def parse(list_url):
    """Scrape one school-list page and append each school to the CSV file.

    The page is fetched with ``spider``. Each matching <dl> entry becomes one
    row -- name, location, type, nature, special features, affiliation --
    which is passed to ``csv_writer``.

    Args:
        list_url: URL of the listing page to scrape.

    Raises:
        IndexError: If a required field (anything except the special-features
            entry) is missing from a school's entry.
    """
    selector = spider(list_url)
    all_list = selector.xpath('//*[starts-with(@class,"scores_List")]/dl')

    for sel in all_list:
        name = sel.xpath('dt/strong/a/text()')[0]

        # The [6:] / [5:] slices drop the leading characters of each <li>
        # text (presumably the field label -- confirm against the live page).
        place = sel.xpath('dd/ul/li[1]/text()')[0][6:]
        # Named school_type rather than `type` so the builtin is not shadowed.
        school_type = sel.xpath('dd/ul/li[3]/text()')[0][5:]
        nature = sel.xpath('dd/ul/li[5]/text()')[0][5:]

        # The special-features entry is optional: an empty xpath result makes
        # the [0] lookup raise IndexError. Only that error is caught, so
        # unrelated failures are not hidden.
        try:
            tese = sel.xpath('dd/ul/li[2]/span/text()')[0]
        except IndexError:
            tese = ''

        lishu = sel.xpath('dd/ul/li[4]/text()')[0][5:]

        csv_writer([name, place, school_type, nature, tese, lishu])

 

# Browser-like User-Agent sent with every request made by spider().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
}

# Listing pages are addressed by appending the page number to this prefix.
url_ = 'http://college.gaokao.com/schlist/p'

# Page URLs for pages 1 through 106.
all_url = ['%s%d' % (url_, page_no) for page_no in range(1, 107)]

# Scrape the pages one after another.
for url in all_url:
    parse(url)

将爬取的文件进行整合并进行可视化

柱状图

from pyecharts.charts import Bar

from pyecharts import options as opts

import pandas as pd

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Column1 supplies the x-axis categories; Column2 and Column3 are the two
# series plotted below (labelled undergraduate and junior-college).
x1, y1, y2 = (
    data[column].tolist() for column in ('Column1', 'Column2', 'Column3')
)

# Build the bar chart one call at a time, then write it out as HTML.
bar = Bar()
bar.add_xaxis(x1)
bar.add_yaxis("本科", y1)
bar.add_yaxis("专科", y2)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="大学", subtitle="情况"),
)
bar.render(path='bar.html')

前十折线图

from pyecharts.charts import Line

import pandas as pd

from pyecharts import options as opts

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Keep only the first ten rows of each column. The stray `encoding='utf-8'`
# assignment that used to sit here was never read by anything and has been
# removed.
x1 = data['Column1'].tolist()[:10]
y1 = data['Column2'].tolist()[:10]
y2 = data['Column3'].tolist()[:10]

# Line chart with two series, written out as HTML.
line = Line()
line.add_xaxis(x1)
line.add_yaxis("本科", y1)
line.add_yaxis("专科", y2)
line.set_global_opts(title_opts=opts.TitleOpts(title="前十"))
line.render(path='line.html')

高校数前十名 环形图

from pyecharts.charts import Pie

import pandas as pd

from pyecharts import options as opts

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Top ten by number of schools: pair the first ten Column1 labels with the
# first ten Column2 values and draw them as a ring-shaped rose chart.
pie = (
    Pie()
    .add(
        "",
        [
            [label, count]
            for label, count in zip(
                data['Column1'].values.tolist()[:10],
                data['Column2'].values.tolist()[:10],
            )
        ],
        radius=["30%", "75%"],
        center=["40%", "50%"],
        rosetype="radius",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="高校数量前十名"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="80%", orient="vertical"
        ),
    )
)
pie.render('高校数量前十名.html')

散点图

import pyecharts.options as opts

from pyecharts.charts import Scatter

import pandas as pd

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# First ten entries of the category column and of the two value columns.
x1, y1, y2 = (
    data[column].tolist()[:10] for column in ('Column1', 'Column2', 'Column3')
)

# Scatter chart with two series, assembled as one call chain.
scatter = (
    Scatter()
    .add_xaxis(x1)
    .add_yaxis('本科', y1)
    .add_yaxis('专科', y2)
    .set_global_opts(title_opts=opts.TitleOpts(title="高校"))
)
scatter.render(path='scatter.html')

Geo

from pyecharts.charts import Geo

import pandas as pd

from pyecharts import options as opts

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Geo chart on the China map: each data point pairs a Column1 entry with its
# Column2 value. The visual map is piecewise with an upper bound of 150, and
# per-point labels are hidden.
geo = (
    Geo()
    .add_schema(maptype="china")
    .add(
        "高校分布图",
        [
            [region, count]
            for region, count in zip(
                data['Column1'].values.tolist(),
                data['Column2'].values.tolist(),
            )
        ],
    )
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(is_piecewise=True, max_=150),
        title_opts=opts.TitleOpts(title="各地区高校数量"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
geo.render(path='geo.html')

Map

from pyecharts.charts import Map

import pandas as pd

from pyecharts import options as opts

# Load the spreadsheet holding the aggregated data.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# The chart object is named map_chart rather than `map` so the builtin
# map() is not shadowed for any code that runs afterwards.
map_chart = Map()

# Each data point pairs a Column1 entry with its Column2 value. No maptype
# is passed, so pyecharts' default map is used.
map_chart.add(
    "高校分布图",
    [
        list(z)
        for z in zip(
            data['Column1'].values.tolist(),
            data['Column2'].values.tolist(),
        )
    ],
)
map_chart.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(max_=150),
    title_opts=opts.TitleOpts(title="各地区高校数量"),
)
map_chart.render(path='map.html')

原文链接:https://blog.csdn.net/zql200008/article/details/103716683

标签:tolist,pyecharts,大学排名,text,爬取,import,sel,data,opts
来源: https://www.cnblogs.com/benming/p/12106937.html