首页 > 其他分享 > 基金数据爬取

基金数据爬取

2022-05-02 16:01:10 作者：互联网

起因

昨天被老板娘安排了一个爬取基金网站的活（我一个搞信息论的，来做爬虫？），当时说比较急，问我什么时候可以搞完，我说下午就可以，但其实我心里很慌，毕竟上次做爬虫还是两年前。有人强要面子，我不说是谁。然后即开始复习了一下爬虫相关的知识，调网页源码，直到中午才把代码写完。下午开始跑程序，爬虫的速度一直很慢，网站还动不动拒绝访问，程序一直拖到第二天。昨天答应老板娘今早上一定可以给她的，起个大早远程一看，程序罢工了。Deadline近在咫尺，生产力迅速提高，想到可以多开几个程序，分段重新跑，这样就可以避免网站拒绝访问。但可惜自己进程与线程学艺不精，只能暴力开了十几个项目来操作，记录下后续自己学习的过程。

教训

拿不准的事情，不要强揽。
写代码要细心，低级bug不能有。
多小步测试，降低时间成本。

问题描述

从网址https://www.izaiwen.cn/中下载所有符合条件的数据，筛选条件为学科分类为信息科学部，总共有66368条记录，整体数据量偏大。

问题分析

找到静态网址

这个网站的页面是JS动态渲染的，没有直接显示出搜索的URL，我们一般可以在浏览器开发者工具Network面板的Fetch/XHR中，从请求的Headers里找到。不确定是哪个请求时，可以多次尝试搜索，新出现的条目就是新发起的请求，其Headers中对应的URL就是我们要找的静态网址。

第一页的静态URL为https://www.izaiwen.cn/list/sonFpg1?psnname=&orgname=&prjno=&keyword=，而第二页的静态URL为https://www.izaiwen.cn/list/sonFpg2?psnname=&orgname=&prjno=&keyword=，我们就可以合理推测sonFpg后的数字就是当前页数。随机选取几个页面的静态URL即可验证猜想。

分析返回内容

我们直接在浏览器中输入上面找到的静态网址，就可以得到下面的页面。我注意到其中明显是采用了unicode的编码，例如\u5c71\u897f，使用unicode解码后就可以从其中提取出我们所需信息，比如项目id、学科分类等信息。这里记录一个坑，在使用find进行字符串匹配的时候，尽量把两端的引号加上，并且测试一下搜索到的是否是符合要求的信息，比如进行id匹配时使用cont.find("\"id\":")可以避开干扰项，优化代码。

老板娘要的信息比较全，这个页面还是无法满足所有的要求。当我打开搜索结果中的一个项目时，发现它的URL为https://www.izaiwen.cn/detail?id=NDI5ODgx，很明显后台数据库中id就是主键。我只需要在上面的页面中把一页10个项目的id全部记录下来，就可以以https://www.izaiwen.cn/detail?id=+id的形式导航到对应的项目详细信息网页。

实验代码

为了避免爬虫速度过快，网站限制访问，我采用了如下三种措施：

借助time库，适当暂停爬取0.1秒。
使用try，except语句。
多个工程同时运行，爬取不同的页面。

爬取代码

import requests
import xlsxwriter
from bs4 import BeautifulSoup
import time
import re
import json


def getfoundation_id(cont):
    """Extract project ids and subject classifications from one list-page response.

    The text is scanned with plain substring searches (no JSON parsing). For
    every record the values following the ``sbjcode"``, ``flname"`` and
    ``id":`` markers are sliced out.

    Args:
        cont: Decoded text of a list page. At most 10 records are read.

    Returns:
        A ``(id_list, term_list)`` tuple of equally long lists: the project
        ids and the matching ``"<sbjcode>.<flname>"`` classification strings.
        Both lists are empty when no complete record is found.
    """
    id_marker = "id\":"
    id_list = []
    term_list = []
    offset = 0
    for _ in range(10):
        cont = cont[offset:]
        sbj_pos = cont.find("sbjcode\"")
        fl_pos = cont.find("flname\"")
        prj_pos = cont.find("prjno\"")
        id_pos = cont.find(id_marker)
        id_end = cont.find("kd_gzr_id")
        # A short or empty page runs out of markers before 10 records; stop
        # here instead of slicing with -1 and emitting garbage entries.
        if -1 in (sbj_pos, fl_pos, prj_pos, id_pos, id_end):
            break

        # +10 / +9 skip `sbjcode":"` / `flname":"`; -3 drops the `","` that
        # separates the value from the next key.
        subject_code = cont[sbj_pos + 10:fl_pos - 3]
        subject_name = cont[fl_pos + 9:prj_pos - 3]
        term_list.append(subject_code + "." + subject_name)
        # +5 skips `id":"`; the value ends 3 characters before `kd_gzr_id`.
        id_list.append(cont[id_pos + 5:id_end - 3])

        # Jump 100 characters past the third `id":` occurrence so the next
        # search starts beyond this record's id fields.
        # NOTE(review): assumes three `id":` occurrences per record and at
        # least 100 characters before the next record's fields - confirm
        # against a real response.
        second_id = cont.find(id_marker, id_pos + 5)
        third_id = cont.find(id_marker, second_id + 5) if second_id != -1 else -1
        if third_id == -1:
            break
        offset = third_id + 100
    return id_list, term_list


def getinformation(id):
    """Fetch one project's detail page and return its field texts in page order.

    Args:
        id: Project id string, appended to the ``detail?id=`` URL.

    Returns:
        A list of strings: the text of the first ``<span>`` inside every
        ``layui-col-xs8 info-row`` div, followed by those of every
        ``layui-col-xs4 info-row`` div. Empty or missing text becomes "无".

    Raises:
        requests.RequestException: if the request fails twice in a row.
    """
    url = "https://www.izaiwen.cn/detail?id=" + id
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    try:
        res = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # One retry after a short pause; only network-level failures are
        # retried so Ctrl-C and programming errors are not swallowed.
        time.sleep(0.1)
        res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, "html.parser")
    info_list = []
    # The wide (xs8) rows are collected first, then the narrow (xs4) rows;
    # callers index into the result by position, so the order must not change.
    for css_class in ("layui-col-xs8 info-row", "layui-col-xs4 info-row"):
        for row in soup.find_all('div', class_=css_class):
            span = row.find('span')
            # A row without a <span> keeps its slot so later fields do not shift.
            text = span.text if span is not None else ""
            info_list.append(text or "无")
    return info_list


_COLUMN_TITLES = ('项目名称', '学科分类', '资助类别', '依托单位', '关键词',
                  '项目批准号', '项目负责人', '负责人职称', '资助金额', '起止时间')


def _create_workbook(path):
    """Create an xlsx workbook at *path* with one "基金数据" sheet and its title row.

    Returns the ``(workbook, worksheet)`` pair. The caller must ``close()`` the
    workbook: xlsxwriter writes the file to disk on close.
    """
    book = xlsxwriter.Workbook(path)
    sheet = book.add_worksheet("基金数据")
    for col, title in enumerate(_COLUMN_TITLES):
        sheet.write(0, col, title)
    return book, sheet


def main(first_page=1, last_page=499):
    """Crawl list pages and save every project's details to xlsx workbooks.

    Output starts in "基金数据1.xlsx". Whenever a detail page yields fewer than
    10 fields (presumably a blocked or incomplete page), that record is
    dropped, the current workbook is saved and writing continues in the next
    numbered workbook ("基金数据2.xlsx", ...).

    Args:
        first_page: First list page to crawl (inclusive).
        last_page: Last list page to crawl (inclusive). The original note
            says the site has 6637 list pages in total.
    """
    list_url = "https://www.izaiwen.cn/list/sonFpg{}?psnname=&orgname=&prjno=&keyword="
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    data, sheet = _create_workbook("基金数据1.xlsx")
    index = 0  # last written row in the current sheet (row 0 is the title row)
    flag = 2   # numeric suffix of the next workbook to open

    # try/finally: a failed request must not discard the rows collected so
    # far, because nothing reaches the disk until the workbook is closed.
    try:
        for i in range(first_page, last_page + 1):
            res = requests.get(list_url.format(i), headers=headers, timeout=30)
            # The list endpoint returns \uXXXX-escaped text.
            res.encoding = "unicode-escape"
            id_list, term_list = getfoundation_id(res.text)
            # zip keeps each id paired with its own classification even when
            # an earlier record on the page was dropped.
            for project_id, term in zip(id_list, term_list):
                temp = getinformation(project_id)
                if len(temp) < len(_COLUMN_TITLES):
                    data.close()
                    data, sheet = _create_workbook("基金数据" + str(flag) + ".xlsx")
                    flag += 1
                    index = 0
                    continue
                index += 1
                sheet.write(index, 0, temp[0])
                # Column 1 comes from the list page; temp[1] is not used.
                sheet.write(index, 1, term)
                for col in range(2, len(_COLUMN_TITLES)):
                    sheet.write(index, col, temp[col])
    finally:
        data.close()


# Script entry point: start the crawl only when this file is run directly.
if __name__ == "__main__":
    main()

整合及分类代码

import xlsxwriter
import xlrd


def getnamelist(filename):
    """Return the values of the first column of the first sheet in *filename*.

    NOTE(review): xlrd 2.0+ only reads ``.xls`` files - confirm the installed
    version can still open the ``.xlsx`` workbooks passed in here.
    """
    first_sheet = xlrd.open_workbook(filename).sheets()[0]
    return first_sheet.col_values(0)


_COLUMN_TITLES = ('项目名称', '学科分类', '资助类别', '依托单位', '关键词',
                  '项目批准号', '项目负责人', '负责人职称', '资助金额', '起止时间')


def _create_workbook(path):
    """Create an xlsx workbook at *path* with one "基金数据" sheet and its title row.

    Returns the ``(workbook, worksheet)`` pair. The caller must ``close()`` the
    workbook: xlsxwriter writes the file to disk on close.
    """
    book = xlsxwriter.Workbook(path)
    sheet = book.add_worksheet("基金数据")
    for col, title in enumerate(_COLUMN_TITLES):
        sheet.write(0, col, title)
    return book, sheet


def _write_record(sheet, row, record):
    """Write the values of *record* into consecutive columns of *row*."""
    for col, value in enumerate(record):
        sheet.write(row, col, value)


def main(base_dir=""):
    """Merge the crawled workbooks and split the records by project-leader roster.

    Reads "基金数据1.xlsx" .. "基金数据18.xlsx" from *base_dir*, copies every
    data row into "基金数据（汇总）.xlsx", and additionally copies each row whose
    project leader (column 6) appears in roster N into "基金数据_名单N.xlsx".

    Args:
        base_dir: Prefix prepended to every input file name (for example
            "D:/Project/PyCharm_Code/NationalFoundation/"). Output files are
            always created in the current working directory.
    """
    # NOTE(review): "名单61.xlsx" looks like a typo for "名单1.xlsx" (its output
    # is "基金数据_名单1.xlsx") - confirm before renaming.
    roster_files = ("名单61.xlsx", "名单2.xlsx", "名单3.xlsx")
    # Sets make the per-row membership test O(1) instead of a list scan.
    rosters = [set(getnamelist(base_dir + roster)) for roster in roster_files]

    books = []
    try:
        data, sheet = _create_workbook("基金数据（汇总）.xlsx")
        books.append(data)
        # One [names, worksheet, last_written_row] entry per roster.
        targets = []
        for number, names in enumerate(rosters, start=1):
            book, roster_sheet = _create_workbook("基金数据_名单" + str(number) + ".xlsx")
            books.append(book)
            targets.append([names, roster_sheet, 0])

        index = 0  # last written row of the summary sheet
        for i in range(1, 19):
            temp_data = xlrd.open_workbook(base_dir + "基金数据" + str(i) + ".xlsx")
            temp_sheet = temp_data.sheets()[0]
            # Row 0 is the title row; a sheet with nothing else contributes no rows.
            for a in range(1, temp_sheet.nrows):
                record = temp_sheet.row_values(a)[:len(_COLUMN_TITLES)]
                index += 1
                _write_record(sheet, index, record)
                for target in targets:
                    if record[6] in target[0]:
                        target[2] += 1
                        _write_record(target[1], target[2], record)
    finally:
        # Close (and thereby save) every workbook that was opened, even when
        # one of the inputs is missing or unreadable.
        for book in books:
            book.close()


# Script entry point: run the merge only when this file is run directly.
if __name__ == "__main__":
    main()

代码优化

不懂python中的进程与线程，只能暴力创建十几个工程项目同时运行，过程十分繁琐。进程与线程的区别主要为：

地址空间：同一进程的线程共享本进程的地址空间，而进程之间则是独立的地址空间。
资源拥有：同一进程内的线程共享本进程的资源（如内存、I/O、cpu等），但是进程之间的资源是独立的。
一个进程崩溃后，在保护模式下不会对其他进程产生影响，但是一个线程崩溃整个进程都死掉。所以多进程要比多线程健壮。
进程切换时，消耗的资源大，效率低。所以涉及到频繁的切换时，使用线程要好于进程。同样如果要求同时进行并且又要共享某些变量的并发操作，只能用线程不能用进程。
执行过程：每个独立的进程有一个程序运行的入口、顺序执行序列和程序出口。但是线程不能独立执行，必须依存在应用程序中，由应用程序提供多个线程执行控制。
线程是处理器调度的基本单位，但是进程不是。

import requests
import xlsxwriter
from bs4 import BeautifulSoup
import time
import multiprocessing as mp
import re
import json


def getfoundation_id(cont):
    """Extract project ids and subject classifications from one list-page response.

    The text is scanned with plain substring searches (no JSON parsing). For
    every record the values following the ``sbjcode"``, ``flname"`` and
    ``id":`` markers are sliced out.

    Args:
        cont: Decoded text of a list page. At most 10 records are read.

    Returns:
        A ``(id_list, term_list)`` tuple of equally long lists: the project
        ids and the matching ``"<sbjcode>.<flname>"`` classification strings.
        Both lists are empty when no complete record is found.
    """
    id_marker = "id\":"
    id_list = []
    term_list = []
    offset = 0
    for _ in range(10):
        cont = cont[offset:]
        sbj_pos = cont.find("sbjcode\"")
        fl_pos = cont.find("flname\"")
        prj_pos = cont.find("prjno\"")
        id_pos = cont.find(id_marker)
        id_end = cont.find("kd_gzr_id")
        # A short or empty page runs out of markers before 10 records; stop
        # here instead of slicing with -1 and emitting garbage entries.
        if -1 in (sbj_pos, fl_pos, prj_pos, id_pos, id_end):
            break

        # +10 / +9 skip `sbjcode":"` / `flname":"`; -3 drops the `","` that
        # separates the value from the next key.
        subject_code = cont[sbj_pos + 10:fl_pos - 3]
        subject_name = cont[fl_pos + 9:prj_pos - 3]
        term_list.append(subject_code + "." + subject_name)
        # +5 skips `id":"`; the value ends 3 characters before `kd_gzr_id`.
        id_list.append(cont[id_pos + 5:id_end - 3])

        # Jump 100 characters past the third `id":` occurrence so the next
        # search starts beyond this record's id fields.
        # NOTE(review): assumes three `id":` occurrences per record and at
        # least 100 characters before the next record's fields - confirm
        # against a real response.
        second_id = cont.find(id_marker, id_pos + 5)
        third_id = cont.find(id_marker, second_id + 5) if second_id != -1 else -1
        if third_id == -1:
            break
        offset = third_id + 100
    return id_list, term_list


def getinformation(id):
    """Fetch one project's detail page and return its field texts in page order.

    Args:
        id: Project id string, appended to the ``detail?id=`` URL.

    Returns:
        A list of strings: the text of the first ``<span>`` inside every
        ``layui-col-xs8 info-row`` div, followed by those of every
        ``layui-col-xs4 info-row`` div. Empty or missing text becomes "无".

    Raises:
        requests.RequestException: if the request fails twice in a row.
    """
    url = "https://www.izaiwen.cn/detail?id=" + id
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    try:
        res = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # One retry after a short pause; only network-level failures are
        # retried so Ctrl-C and programming errors are not swallowed.
        time.sleep(0.1)
        res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, "html.parser")
    info_list = []
    # The wide (xs8) rows are collected first, then the narrow (xs4) rows;
    # callers index into the result by position, so the order must not change.
    for css_class in ("layui-col-xs8 info-row", "layui-col-xs4 info-row"):
        for row in soup.find_all('div', class_=css_class):
            span = row.find('span')
            # A row without a <span> keeps its slot so later fields do not shift.
            text = span.text if span is not None else ""
            info_list.append(text or "无")
    return info_list


_COLUMN_TITLES = ('项目名称', '学科分类', '资助类别', '依托单位', '关键词',
                  '项目批准号', '项目负责人', '负责人职称', '资助金额', '起止时间')


def _create_workbook(path):
    """Create an xlsx workbook at *path* with one "基金数据" sheet and its title row.

    Returns the ``(workbook, worksheet)`` pair. The caller must ``close()`` the
    workbook: xlsxwriter writes the file to disk on close.
    """
    book = xlsxwriter.Workbook(path)
    sheet = book.add_worksheet("基金数据")
    for col, title in enumerate(_COLUMN_TITLES):
        sheet.write(0, col, title)
    return book, sheet


def main(name, page1, page2):
    """Crawl list pages ``page1``..``page2`` (inclusive) into xlsx workbooks.

    Output starts in "基金数据a<name>.xlsx". Whenever a detail page yields fewer
    than 10 fields (presumably a blocked or incomplete page), that record is
    dropped, the current workbook is saved and writing continues in
    "基金数据a<name>.1.xlsx", "基金数据a<name>.2.xlsx", and so on.

    Args:
        name: Label of this worker, used in the output file names.
        page1: First list page to crawl.
        page2: Last list page to crawl. The original note says the site has
            6637 list pages in total.
    """
    list_url = "https://www.izaiwen.cn/list/sonFpg{}?psnname=&orgname=&prjno=&keyword="
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    data, sheet = _create_workbook("基金数据a" + str(name) + ".xlsx")
    index = 0  # last written row in the current sheet (row 0 is the title row)
    # Integer counter for continuation files; adding 0.1 to a float produced
    # names such as "1.2000000000000002".
    part = 1

    # try/finally: a failed request must not discard the rows collected so
    # far, because nothing reaches the disk until the workbook is closed.
    try:
        for i in range(page1, page2 + 1):
            res = requests.get(list_url.format(i), headers=headers, timeout=30)
            # The list endpoint returns \uXXXX-escaped text.
            res.encoding = "unicode-escape"
            id_list, term_list = getfoundation_id(res.text)
            # zip keeps each id paired with its own classification even when
            # an earlier record on the page was dropped.
            for project_id, term in zip(id_list, term_list):
                temp = getinformation(project_id)
                if len(temp) < len(_COLUMN_TITLES):
                    data.close()
                    data, sheet = _create_workbook(
                        "基金数据a" + str(name) + "." + str(part) + ".xlsx")
                    part += 1
                    index = 0
                    continue
                index += 1
                sheet.write(index, 0, temp[0])
                # Column 1 comes from the list page; temp[1] is not used.
                sheet.write(index, 1, term)
                for col in range(2, len(_COLUMN_TITLES)):
                    sheet.write(index, col, temp[col])
    finally:
        data.close()


# Script entry point: crawl two page ranges in parallel, one process each.
if __name__ == "__main__":
    # target: the function the process will execute
    # args: arguments for the task, passed positionally as a tuple
    # kwargs: arguments for the task, passed by name as a dict
    p1 = mp.Process(target=main, args=(1, 1, 2))
    p2 = mp.Process(target=main, kwargs={"name": 2, "page1": 3, "page2": 4})
    p1.start()
    p2.start()
    
    # Example of starting the processes in a loop (kept as an inert string literal)
    """for i in range(1, 10):
        mp.Process(target=main, args=(i, i*2-1, i*2)).start()"""

标签：write,sheet,temp,index,id,爬取,数据,基金,row
来源： https://www.cnblogs.com/xuzhang/p/16215988.html