首页 > 其他分享 > 顶会热词统计

顶会热词统计

2022-06-14 13:01:37 作者：互联网

作业要求

用户可给定论文列表：

- 通过论文列表，爬取论文的题目、摘要、关键词、原文链接；可对论文列表进行增删改操作（今年、近两年、近三年）；
- 对爬取的信息进行结构化处理，分析 top10 个热门领域或热门研究方向；可进行论文检索，当用户输入论文编号、题目、关键词等基本信息时，分析并返回相关的 paper、source code、homepage 等信息，形成如关键词图谱之类直观的查看方式；
- 可对多年间、不同顶会的热词呈现热度走势对比（这里将范畴限定在计算机视觉的三大顶会 CVPR、ICCV、ECCV 内）。

代码部分

Lunwen.java

/**
 * Data bean for one paper ("lunwen") entry.
 *
 * <p>Carries five string properties: {@code id}, {@code title}, {@code keyword},
 * {@code net} and {@code author}. Each has a getter and a setter, and the class
 * keeps a public no-argument constructor so it can be used as a JavaBean.
 */
public class Lunwen {
    private String id;
    private String title;
    private String keyword;
    private String net;
    private String author;

    /** Creates an instance with every property left {@code null}. */
    public Lunwen() {
    }

    /**
     * Creates an instance with {@code net} and {@code author} left {@code null}.
     *
     * @param id      value for the id property
     * @param title   value for the title property
     * @param keyword value for the keyword property
     */
    public Lunwen(String id, String title, String keyword) {
        this(id, title, keyword, null, null);
    }

    /**
     * Creates an instance with {@code author} left {@code null}.
     *
     * @param id      value for the id property
     * @param title   value for the title property
     * @param keyword value for the keyword property
     * @param net     value for the net property
     */
    public Lunwen(String id, String title, String keyword, String net) {
        this(id, title, keyword, net, null);
    }

    /**
     * Creates a fully populated instance.
     *
     * @param id      value for the id property
     * @param title   value for the title property
     * @param keyword value for the keyword property
     * @param net     value for the net property
     * @param author  value for the author property
     */
    public Lunwen(String id, String title, String keyword, String net, String author) {
        this.id = id;
        this.title = title;
        this.keyword = keyword;
        this.net = net;
        this.author = author;
    }

    /** @return the id property */
    public String getId() {
        return id;
    }

    /** @param id new value for the id property */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title property */
    public String getTitle() {
        return title;
    }

    /** @param title new value for the title property */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the keyword property */
    public String getKeyword() {
        return keyword;
    }

    /** @param keyword new value for the keyword property */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /** @return the net property */
    public String getNet() {
        return net;
    }

    /** @param net new value for the net property */
    public void setNet(String net) {
        this.net = net;
    }

    /** @return the author property */
    public String getAuthor() {
        return author;
    }

    /** @param author new value for the author property */
    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Renders all five properties as {@code Lunwen{id='..', title='..', ...}};
     * a {@code null} property is printed as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Lunwen{");
        text.append("id='").append(id).append('\'');
        text.append(", title='").append(title).append('\'');
        text.append(", keyword='").append(keyword).append('\'');
        text.append(", net='").append(net).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append('}');
        return text.toString();
    }
}

Lunwendao.java

import Bean.Lunwen;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import util.DButil;

import java.sql.SQLException;
import java.util.List;

/**
 * Data-access object for the {@code lunwen} table, implemented with Apache
 * Commons DbUtils on top of the data source supplied by {@code DButil}.
 *
 * <p>All lookups use exact equality on the given columns. When a statement
 * fails with an {@link SQLException} the stack trace is printed and the method
 * reports failure to the caller ({@code false} for writes, {@code null} for
 * queries) instead of throwing.
 */
public class Lunwendao {
    private QueryRunner queryRunner = new QueryRunner(DButil.getDruidDataSource());

    /**
     * Inserts one row built from the bean's title, keyword, net and author.
     *
     * @param lunwen bean supplying the column values; its id is not written
     * @return {@code true} only when exactly one row was inserted
     */
    public boolean insert(Lunwen lunwen) {
        Object[] prams = {lunwen.getTitle(), lunwen.getKeyword(), lunwen.getNet(), lunwen.getAuthor()};
        try {
            // The third column is "net" (matching the bean property); the previous
            // statement named it "new", which is not a column the bean maps to.
            int result = queryRunner.update(
                    "insert into lunwen(title,keyword,net,author) values (?,?,?,?)", prams);
            return result == 1;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Deletes the row with the given id.
     *
     * @param id value compared against the {@code id} column
     * @return {@code true} only when exactly one row was deleted
     */
    public boolean delete(String id) {
        try {
            int result = queryRunner.update("delete from lunwen where id=?", id);
            return result == 1;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Finds rows by id.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectid(String id) {
        return query("select * from lunwen where id=?", id);
    }

    /**
     * Finds rows by exact title.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selecttitle(String title) {
        return query("select * from lunwen where title=?", title);
    }

    /**
     * Finds rows by exact keyword value.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectkeyword(String keyword) {
        return query("select * from lunwen where keyword=?", keyword);
    }

    /**
     * Finds rows matching both id and title.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectidandtitle(String id, String title) {
        return query("select * from lunwen where id=? and title=?", id, title);
    }

    /**
     * Finds rows matching both id and keyword.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectidandkeyword(String id, String keyword) {
        return query("select * from lunwen where id=? and keyword =?", id, keyword);
    }

    /**
     * Finds rows matching both title and keyword.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selecttitleandkeyword(String title, String keyword) {
        return query("select * from lunwen where title=? and keyword=?", title, keyword);
    }

    /**
     * Finds rows matching id, title and keyword together.
     *
     * @return matching rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectall(String id, String title, String keyword) {
        return query("select * from lunwen where id=? and title=? and keyword=?", id, title, keyword);
    }

    /**
     * Returns every row of the table.
     *
     * @return all rows, or {@code null} if the query failed
     */
    public List<Lunwen> selectAll() {
        return query("select * from lunwen");
    }

    /**
     * Runs a parameterised select and maps each result row onto a {@link Lunwen}.
     *
     * @param sql    statement text with {@code ?} placeholders
     * @param params values bound to the placeholders, in order
     * @return the mapped rows, or {@code null} when an {@link SQLException} occurred
     */
    private List<Lunwen> query(String sql, Object... params) {
        try {
            return queryRunner.query(sql, new BeanListHandler<Lunwen>(Lunwen.class), params);
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        }
    }
}

selectservlet.java

import Bean.Lunwen;
import Dao.Lunwendao;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

/**
 * Servlet behind {@code /selectservlet}: optionally deletes one paper, then runs
 * a lookup chosen by which of the {@code id}, {@code title} and {@code keyword}
 * request parameters are non-empty, and forwards the result to
 * {@code /select.jsp}.
 */
@WebServlet("/selectservlet")
public class selectservlet extends HttpServlet {
    /**
     * Handles the search form and the delete link.
     *
     * <p>Request parameters read: {@code id}, {@code title}, {@code keyword} and
     * {@code delete}. A missing parameter is treated the same as an empty one.
     * When {@code delete} is present the row with the given id is removed and
     * the filters are cleared so the full list is shown afterwards.
     *
     * <p>Request attributes set for the JSP: {@code id}, {@code title},
     * {@code keyword} (the effective filter values) and {@code lunwenList}.
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        resp.setContentType("text/html;charset=utf-8");

        // getParameter returns null for an absent parameter; normalise so the
        // emptiness checks below cannot throw NullPointerException.
        String id = orEmpty(req.getParameter("id"));
        String title = orEmpty(req.getParameter("title"));
        String keyword = orEmpty(req.getParameter("keyword"));
        String delete = req.getParameter("delete");

        Lunwendao lunwendao = new Lunwendao();
        if (delete != null) {
            // Nothing to delete without an id; still fall through to the full listing.
            if (!id.isEmpty()) {
                lunwendao.delete(id);
            }
            id = "";
            title = "";
            keyword = "";
        }

        boolean hasId = !id.isEmpty();
        boolean hasTitle = !title.isEmpty();
        boolean hasKeyword = !keyword.isEmpty();

        // Exactly one branch applies for any combination of the three flags.
        List<Lunwen> lunwenList;
        if (hasId && hasTitle && hasKeyword) {
            lunwenList = lunwendao.selectall(id, title, keyword);
        } else if (hasId && hasTitle) {
            lunwenList = lunwendao.selectidandtitle(id, title);
        } else if (hasId && hasKeyword) {
            lunwenList = lunwendao.selectidandkeyword(id, keyword);
        } else if (hasTitle && hasKeyword) {
            lunwenList = lunwendao.selecttitleandkeyword(title, keyword);
        } else if (hasId) {
            lunwenList = lunwendao.selectid(id);
        } else if (hasTitle) {
            lunwenList = lunwendao.selecttitle(title);
        } else if (hasKeyword) {
            lunwenList = lunwendao.selectkeyword(keyword);
        } else {
            lunwenList = lunwendao.selectAll();
        }

        req.setAttribute("id", id);
        req.setAttribute("title", title);
        req.setAttribute("keyword", keyword);
        req.setAttribute("lunwenList", lunwenList);
        req.getRequestDispatcher("/select.jsp").forward(req, resp);
    }

    /** POST requests are handled exactly like GET requests. */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        doGet(req, resp);
    }

    /** Maps a missing request parameter ({@code null}) to the empty string. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }
}

select.jsp

<%@ page import="java.util.List" %>
<%@ page import="Bean.Lunwen" %><%--
  Created by IntelliJ IDEA.
  User: ACER-CN
  Date: 2022/5/13
  Time: 10:06
  To change this template use File | Settings | File Templates.

  Search page: shows a form with the three filters (id, title, keyword) and,
  when the request carries a "lunwenList" attribute, a table of the matching
  papers with a delete link per row.
--%>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%!
    // Escapes the HTML-significant characters so request parameters and database
    // values cannot break out of element text or a quoted attribute value.
    // A null value is rendered as the empty string.
    private static String esc(String value) {
        if (value == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '&':
                    sb.append("&amp;");
                    break;
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                case '"':
                    sb.append("&quot;");
                    break;
                case '\'':
                    sb.append("&#39;");
                    break;
                default:
                    sb.append(c);
            }
        }
        return sb.toString();
    }
%>
<%
    // Attributes are set by the servlet before forwarding; all of them are null
    // when this page is opened directly.
    @SuppressWarnings("unchecked")
    List<Lunwen> lunwenList = (List<Lunwen>) request.getAttribute("lunwenList");
    String id = (String) request.getAttribute("id");
    String title = (String) request.getAttribute("title");
    String keyword = (String) request.getAttribute("keyword");
    // Use the deployed context path instead of a hard-coded one.
    String servletUrl = request.getContextPath() + "/selectservlet";
%>
<html>
<head>
    <title>查询</title>
    <STYLE type="text/css">
        input {
            width: 500px;
            height: 30px;
        }

        td {
            font-family: 隶书;
            font-size: 20px;
        }

        #submet {
            height: 25px;
            width: 300px;
            margin-left: 35%;
        }

        #table_2 {
            margin-top: 50px;
        }

        /* fixed table layout */
        table {
            table-layout: fixed;
            word-break: break-all;
            word-wrap: break-word;
        }

        #table_2, #table_2 tr td {
            border: 1px solid #000;
            text-align: center;
            border-collapse: collapse;
        }

        /* remove the underline from hyperlinks */
        a {
            text-decoration: none;
        }

    </STYLE>
</head>
<body>
<form action="<%=esc(servletUrl)%>" method="get">
    <table align="center">
        <tr>
            <td>论文编号：</td>
            <td><input type="text" name="id" value="<%=esc(id)%>"></td>
        </tr>
        <tr>
            <td>&nbsp;</td>
        </tr>
        <tr>
            <td>论文题目：</td>
            <td><input type="text" name="title" value="<%=esc(title)%>"></td>
        </tr>
        <tr>
            <td>&nbsp;</td>
        </tr>
        <tr>
            <td>关键字：</td>
            <td><input type="text" name="keyword" value="<%=esc(keyword)%>"></td>
        </tr>
        <tr>
            <td>&nbsp;</td>
        </tr>
        <tr>
            <td>&nbsp;</td>
        </tr>
        <tr>
            <td colspan="2"><p><input type="submit" value="查询" id="submet"></p></td>
        </tr>
    </table>
</form>
<%
    if (lunwenList != null) {
%>
<table align="center" id="table_2">
    <%
        for (Lunwen temp : lunwenList) {
    %>
    <tr>
        <td>论文编号：<%=esc(temp.getId())%>
        </td>
        <td>论文题目：<%=esc(temp.getTitle())%>
        </td>
        <td>论文摘要：<%=esc(temp.getKeyword())%>
        </td>
        <td>发布作者：<%=esc(temp.getAuthor())%>
        </td>
        <td>论文网址：<a href="<%=esc(temp.getNet())%>"><%=esc(temp.getNet())%>
        </a>
        </td>
        <td><a href="<%=esc(servletUrl)%>?id=<%=esc(temp.getId())%>&amp;delete=1">删除</a></td>
    </tr>
    <%
        }
    %>
</table>
<%
    }
%>
</body>
</html>

main.py

import urllib.request

import urllib.request
from bs4 import BeautifulSoup
import requests
from lxml import etree
import json

# Crawl the WACV 2021 open-access listing, visit every paper page, and store
# title / abstract / PDF link / authors in the MySQL table `lunwen`.
import pymysql

url = "https://openaccess.thecvf.com/WACV2021"
url1 = "https://openaccess.thecvf.com"

# Fetch the listing page and collect the absolute URL of every paper page.
resp = urllib.request.urlopen(url)
content = resp.read().decode("utf-8")
soup = BeautifulSoup(content, 'lxml')
list_title = soup.select('.ptitle a')
title_url = [url1 + title.get('href') for title in list_title]

# Placeholders let the driver quote the values, so titles, abstracts or author
# names containing quotes no longer break (or inject into) the statement.
sql = "insert into lunwen(title,keyword,net,author) values (%s,%s,%s,%s)"

# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file.
# One connection is shared by the whole run and closed in the finally block.
db = pymysql.connect(host="localhost", user="root", password="123456", database="lunwenpaqu", charset="utf8")
try:
    for two_url in title_url:
        print("开始爬取" + two_url)
        resp = requests.get(two_url)
        body = etree.HTML(resp.text)
        try:
            down_title = body.xpath('//*[@id="papertitle"]/text()')[0].strip()  # paper title
            down_author = body.xpath('//*[@id="authors"]/b/i/text()')[0].strip()  # paper authors
            down_abstract = body.xpath('//*[@id="abstract"]/text()')[0].strip()  # paper abstract
            down_pdf = url1 + body.xpath('//*[@id="content"]/dl/dd/a/@href')[0].strip()  # PDF link
        except IndexError:
            # An expected element is missing on this page; skip it rather than
            # aborting the whole crawl.
            print("页面解析失败，已跳过: " + two_url)
            continue
        print(down_author)

        try:
            with db.cursor() as cursor:
                # The abstract is stored in the `keyword` column.
                cursor.execute(sql, (down_title, down_abstract, down_pdf, down_author))
            db.commit()
            print("插入成功")
        except pymysql.Error as e:
            print("增加数据失败:  " + str(e))
            db.rollback()
finally:
    db.close()

第一次接触爬虫，对Python还并不熟悉，对于爬虫也是套的模板

并不是很了解其中的原理，在写这个小程序的时候遇到了很多困难

Java 部分只运用了简单的增删改查，没有什么创新点，也没有实现模糊查询，程序还有很大的完善空间

标签：lunwenList,String,keyword,title,热词,public,顶会,id,统计
来源： https://www.cnblogs.com/-0112/p/16374251.html