首页 > 其他分享 > CVPR顶会论文爬取

CVPR顶会论文爬取

2021-06-12 14:35:14 作者：互联网

main.py

import pymysql
import re
import requests

# 连接数据库函数
from bs4 import BeautifulSoup


def insertCvpr(value):
    """Insert one paper record into the ``cvpr`` table.

    Opens a new connection to the local ``article`` database, executes a
    parameterized INSERT and commits it. If pymysql reports an error, the
    failure is printed and the transaction is rolled back (when a connection
    was actually established). The connection is always closed before
    returning.

    Args:
        value: Sequence of six values bound, in order, to the columns
            ``title, ab, author, hotword, pdf, path``.
    """
    # Start unbound so the error/cleanup paths can tell whether connect()
    # succeeded; otherwise a failed connect would raise NameError below.
    db = None
    try:
        db = pymysql.connect(host="localhost", user="root", password="password", database="article", charset="utf8")
        print("数据库连接成功!")
        cur = db.cursor()
        sql = 'INSERT INTO cvpr(title,ab,author,hotword,pdf,path) VALUE (%s,%s,%s,%s,%s,%s)'
        # Values are passed separately so the driver escapes them.
        cur.execute(sql, value)
        db.commit()
        print("增加数据成功!")
    except pymysql.Error as e:
        print("增加数据失败:  " + str(e))
        if db is not None:
            db.rollback()
    finally:
        # Release the connection on every path, including unexpected errors.
        if db is not None:
            db.close()


# Main script: crawl the CVPR 2020 open-access listing for one conference day
# and store each paper's metadata through insertCvpr.
print("1")
url = "https://openaccess.thecvf.com/CVPR2020.py?day=2020-06-16"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
# Use the same (connect, read) timeout as the per-paper requests so a stalled
# server cannot hang the script forever.
res = requests.get(url, headers=headers, timeout=(3, 7))
res.encoding = "utf-8"
# First collect the relative URL of every paper page from the listing.
web = re.findall(r"""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", res.text, re.S)
print("2")
for href in web:
    try:
        page_url = "http://openaccess.thecvf.com/" + href
        print("3")
        print(page_url)
        res = requests.get(page_url, headers=headers, timeout=(3, 7))
        # Set the encoding BEFORE reading res.text, so the soup and the
        # regexes below all see the same UTF-8 decoded page.
        res.encoding = "utf-8"
        html = res.text
        # Name the parser explicitly: the result no longer depends on which
        # optional parsers happen to be installed.
        paper = BeautifulSoup(html, "html.parser")
        # Extract the details from the individual paper page.
        title = re.findall(r"""<div id="papertitle">(.*?)</div>""", html, re.S)  # title
        ab = re.findall(r"""<div id="abstract" >(.*?)</div>""", html, re.S)  # abstract
        author = paper.find("div", {"id": "authors"}).find("b").find("i").text  # authors
        pdf = re.findall(r"""\[<a href="\.\./\.\./(.*?)">pdf</a>\]""", html, re.S)  # PDF download link
        path = page_url  # URL of the paper's summary page
        if title:
            title = title[0].replace("\n", "")
            ab = ab[0].replace("\n", "")
            pdf = "http://openaccess.thecvf.com/" + pdf[0]
            print(title)
            print(author)
            value = (title, ab, author, "", pdf, path)
            insertCvpr(value)
    except Exception as exc:
        # Best-effort crawl: report the failure and move on to the next
        # paper. Catching Exception (not a bare except) keeps Ctrl+C working.
        print("异常")
        print(repr(exc))

2.数据库

遇到的问题：

注意：varchar(255) 这类常见的默认长度可能不够存放摘要等较长的数据（MySQL 中 varchar 的上限为 65535 字节，且受行大小限制），建议使用 text 或 longtext 类型存储。

MySQL中tinytext、text、mediumtext和longtext等各个类型详解

标签：re,res,title,爬取,CVPR,text,print,pdf,顶会
来源： https://www.cnblogs.com/Arisf/p/14878182.html