顶会热词统计
作者:互联网
作业要求
• 用户可给定论文列表:
  ◦ 通过论文列表,爬取论文的题目、摘要、关键词、原文链接;
  ◦ 可对论文列表进行增删改操作(今年、近两年、近三年);
• 对爬取的信息进行结构化处理,分析 top10 个热门领域或热门研究方向;
• 可进行论文检索:当用户输入论文编号、题目、关键词等基本信息时,分析并返回相关的 paper、source code、homepage 等信息;
• 形成如关键词图谱之类直观的查看方式;
• 可对多年间、不同顶会的热词呈现热度走势对比(这里将范畴限定在计算机视觉的三大顶会 CVPR、ICCV、ECCV 内)。
代码部分
Lunwen.java
/**
 * Mutable bean holding the five text fields of one paper record:
 * {@code id}, {@code title}, {@code keyword}, {@code net} and {@code author}.
 *
 * <p>All fields are plain strings and may be {@code null}. The no-argument constructor and the
 * getter/setter pairs follow the JavaBean convention.
 */
public class Lunwen {

    private String id;
    private String title;
    private String keyword;
    private String net;
    private String author;

    /** Creates an empty record; every field starts as {@code null}. */
    public Lunwen() {
    }

    /** Creates a record with {@code net} and {@code author} left {@code null}. */
    public Lunwen(String id, String title, String keyword) {
        this.id = id;
        this.title = title;
        this.keyword = keyword;
    }

    /** Creates a record with {@code author} left {@code null}. */
    public Lunwen(String id, String title, String keyword, String net) {
        this(id, title, keyword);
        this.net = net;
    }

    /** Creates a fully populated record. */
    public Lunwen(String id, String title, String keyword, String net, String author) {
        this(id, title, keyword, net);
        this.author = author;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getKeyword() {
        return keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public String getNet() {
        return net;
    }

    public void setNet(String net) {
        this.net = net;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Renders all five fields as {@code Lunwen{id='..', title='..', ...}}; a {@code null} field
     * is printed as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Lunwen{");
        text.append("id='").append(id).append('\'');
        text.append(", title='").append(title).append('\'');
        text.append(", keyword='").append(keyword).append('\'');
        text.append(", net='").append(net).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append('}');
        return text.toString();
    }
}
Lunwendao.java
import Bean.Lunwen; import org.apache.commons.dbutils.QueryRunner; import org.apache.commons.dbutils.handlers.BeanListHandler; import util.DButil; import java.sql.SQLException; import java.util.List; public class Lunwendao { private QueryRunner queryRunner = new QueryRunner(DButil.getDruidDataSource()); public boolean insert(Lunwen lunwen) { Object[] prams = {lunwen.getTitle(), lunwen.getKeyword(), lunwen.getNet(), lunwen.getAuthor()}; int result = 0; try { result = queryRunner.update("insert into lunwen(title,keyword,new,author) value (?,?,?,?)", prams); if (result == 1) return true; } catch (SQLException e) { e.printStackTrace(); return false; } return false; } public boolean delete(String id) { int result = 0; try { result = queryRunner.update("delete from lunwen where id=?", id); if (result == 1) { return true; } } catch (SQLException e) { e.printStackTrace(); } return false; } //id public List<Lunwen> selectid(String id) { List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where id=?", new BeanListHandler<Lunwen>(Lunwen.class), id); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //title public List<Lunwen> selecttitle(String title) { List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where title=?", new BeanListHandler<Lunwen>(Lunwen.class), title); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //keyword public List<Lunwen> selectkeyword(String keyword) { List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where keyword=?", new BeanListHandler<Lunwen>(Lunwen.class), keyword); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //id title public List<Lunwen> selectidandtitle(String id, String title) { Object[] prams = {id, title}; List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where id=? 
and title=?", new BeanListHandler<Lunwen>(Lunwen.class), prams); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //id keyword public List<Lunwen> selectidandkeyword(String id, String keyword) { Object[] prams = {id, keyword}; List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where id=? and keyword =?", new BeanListHandler<Lunwen>(Lunwen.class), prams); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //title keyword public List<Lunwen> selecttitleandkeyword(String title, String keyword) { Object[] prams = {title, keyword}; List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where title=? and keyword=?", new BeanListHandler<Lunwen>(Lunwen.class), prams); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } //all public List<Lunwen> selectall(String id,String title, String keyword) { Object[] prams = {id,title, keyword}; List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen where id=? and title=? and keyword=?", new BeanListHandler<Lunwen>(Lunwen.class), prams); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } public List<Lunwen> selectAll() { List<Lunwen> lunwenList = null; try { lunwenList = queryRunner.query("select * from lunwen", new BeanListHandler<Lunwen>(Lunwen.class)); } catch (SQLException e) { e.printStackTrace(); } return lunwenList; } }
selectservlet.java
import Bean.Lunwen; import Dao.Lunwendao; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; import java.io.PrintWriter; import java.util.List; @WebServlet("/selectservlet") public class selectservlet extends HttpServlet { @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { resp.setContentType("text/html;charset=utf-8"); PrintWriter printWriter = resp.getWriter(); String id = req.getParameter("id"); String title = req.getParameter("title"); String keyword = req.getParameter("keyword"); String delete = req.getParameter("delete"); Lunwendao lunwendao = new Lunwendao(); List<Lunwen> lunwenList = null; if(delete != null){ lunwendao.delete(id); delete = null; id = ""; title = ""; keyword = ""; } if (!id.equals("") && title.equals("") && keyword.equals("")) { lunwenList = lunwendao.selectid(id); } if (id.equals("") && !title.equals("") && keyword.equals("")) { lunwenList = lunwendao.selecttitle(title); } if (id.equals("") && title.equals("") && !keyword.equals("")) { lunwenList = lunwendao.selectkeyword(keyword); } if (!id.equals("") && !title.equals("") && keyword.equals("")) { lunwenList = lunwendao.selectidandtitle(id, title); } if (!id.equals("") && title.equals("") && !keyword.equals("")) { lunwenList = lunwendao.selectidandkeyword(id, keyword); } if (id.equals("") && !title.equals("") && !keyword.equals("")) { lunwenList = lunwendao.selecttitleandkeyword(title, keyword); } if (!id.equals("") && !title.equals("") && !keyword.equals("")) { lunwenList = lunwendao.selectall(id, title, keyword); } if (id.equals("") && title.equals("") && keyword.equals("")) { lunwenList = lunwendao.selectAll(); } req.setAttribute("id",id); req.setAttribute("title",title); req.setAttribute("keyword",keyword); 
req.setAttribute("lunwenList",lunwenList); req.getRequestDispatcher("/select.jsp").forward(req,resp); } @Override protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { doGet(req, resp); } }
select.jsp
<%@ page import="java.util.List" %>
<%@ page import="Bean.Lunwen" %>
<%--
  Search page for crawled papers.
  Created with IntelliJ IDEA by ACER-CN on 2022/5/13.

  Reads the request attributes "id", "title", "keyword" (the previous search input) and
  "lunwenList" (the result rows). Any of them may be absent when the page is opened directly.
--%>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%!
    /**
     * Escapes the HTML-significant characters so that request parameters and crawled text
     * cannot inject markup, both in element content and inside quoted attribute values.
     * A null value becomes the empty string.
     */
    private static String esc(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }
%>
<%
    @SuppressWarnings("unchecked") // the attribute is stored under this name as a List of Lunwen
    List<Lunwen> lunwenList = (List<Lunwen>) request.getAttribute("lunwenList");
    String id = (String) request.getAttribute("id");
    String title = (String) request.getAttribute("title");
    String keyword = (String) request.getAttribute("keyword");
    // Resolve the deployment path at runtime instead of hard-coding the exploded-war name.
    String ctx = request.getContextPath();
%>
<html>
<head>
    <title>查询</title>
    <style type="text/css">
        input {
            width: 500px;
            height: 30px;
        }

        td {
            font-family: 隶书;
            font-size: 20px;
        }

        #submet {
            height: 25px;
            width: 300px;
            margin-left: 35%;
        }

        #table_2 {
            margin-top: 50px;
        }

        table {
            /* fixed table layout */
            table-layout: fixed;
            word-break: break-all;
            word-wrap: break-word;
        }

        #table_2, #table_2 tr td {
            border: 1px solid #000;
            text-align: center;
            border-collapse: collapse;
        }

        a {
            /* remove the underline from links */
            text-decoration: none;
        }
    </style>
</head>
<body>
<form action="<%=ctx%>/selectservlet" method="get">
    <table align="center">
        <tr>
            <td>论文编号:</td>
            <td><input type="text" name="id" value="<%=esc(id)%>"></td>
        </tr>
        <tr>
            <td> </td>
        </tr>
        <tr>
            <td>论文题目:</td>
            <td><input type="text" name="title" value="<%=esc(title)%>"></td>
        </tr>
        <tr>
            <td> </td>
        </tr>
        <tr>
            <td>关键字:</td>
            <td><input type="text" name="keyword" value="<%=esc(keyword)%>"></td>
        </tr>
        <tr>
            <td> </td>
        </tr>
        <tr>
            <td> </td>
        </tr>
        <tr>
            <td colspan="2"><p><input type="submit" value="查询" id="submet"></p></td>
        </tr>
    </table>
</form>
<%-- The result table is rendered only when a result list was supplied. --%>
<% if (lunwenList != null) { %>
<table align="center" id="table_2">
    <% for (Lunwen temp : lunwenList) { %>
    <tr>
        <td>论文编号:<%=esc(temp.getId())%></td>
        <td>论文题目:<%=esc(temp.getTitle())%></td>
        <td>论文摘要:<%=esc(temp.getKeyword())%></td>
        <td>发布作者:<%=esc(temp.getAuthor())%></td>
        <td>论文网址:<a href="<%=esc(temp.getNet())%>"><%=esc(temp.getNet())%></a></td>
        <td><a href="<%=ctx%>/selectservlet?id=<%=esc(temp.getId())%>&amp;delete=1">删除</a></td>
    </tr>
    <% } %>
</table>
<% } %>
</body>
</html>
main.py
import json
import urllib.request

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Listing page to crawl, and the site root that relative links are appended to.
url = "https://openaccess.thecvf.com/WACV2021"
url1 = "https://openaccess.thecvf.com"

resp = urllib.request.urlopen(url)
content = resp.read().decode("utf-8")

# Parse the listing page and collect the detail-page URL of every paper title link.
soup = BeautifulSoup(content, 'lxml')
list_title = soup.select('.ptitle a')
title_url = [url1 + title.get('href') for title in list_title]

# Placeholders let the driver quote the values, so titles and abstracts containing quotes are
# stored unchanged and cannot alter the statement.
INSERT_SQL = "insert into lunwen(title,keyword,net,author) values (%s,%s,%s,%s)"


def first_match(body, path):
    """Return the stripped first XPath result for ``path``, or None when nothing matches."""
    found = body.xpath(path)
    return found[0].strip() if found else None


# One connection is shared by the whole crawl and always closed at the end.
# NOTE(review): the credentials are hard-coded; consider reading them from configuration.
db = pymysql.connect(host="localhost", user="root", password="123456",
                     database="lunwenpaqu", charset="utf8")
cursor = db.cursor()
try:
    for two_url in title_url:
        print("开始爬取" + two_url)
        # A timeout keeps one unresponsive page from hanging the crawl forever.
        resp = requests.get(two_url, timeout=30)
        body = etree.HTML(resp.text)

        down_title = None
        down_author = None
        down_abstract = None
        pdf_href = None
        if body is not None:
            down_title = first_match(body, '//*[@id="papertitle"]/text()')  # paper title
            down_author = first_match(body, '//*[@id="authors"]/b/i/text()')  # paper authors
            down_abstract = first_match(body, '//*[@id="abstract"]/text()')  # paper abstract
            pdf_href = first_match(body, '//*[@id="content"]/dl/dd/a/@href')  # PDF link

        if None in (down_title, down_author, down_abstract, pdf_href):
            # The page does not have the expected structure; skip it instead of aborting.
            print("页面字段缺失,已跳过: " + two_url)
            continue

        down_pdf = url1 + pdf_href
        print(down_author)

        try:
            # The abstract is stored in the `keyword` column, as the table has no separate one.
            cursor.execute(INSERT_SQL, (down_title, down_abstract, down_pdf, down_author))
            print("插入成功")
            db.commit()
        except pymysql.Error as e:
            print("增加数据失败: " + str(e))
            db.rollback()
finally:
    cursor.close()
    db.close()

print('爬取完成')
第一次接触爬虫,对 Python 还不熟悉,爬虫部分也是套用的模板,
并不是很了解其中的原理,因此在写这个小程序的时候遇到了很多困难。
Java 部分只运用了简单的增删改查,没有什么创新点,也没有实现模糊查询,程序还有很大的完善空间。
标签:lunwenList,String,keyword,title,热词,public,顶会,id,统计 来源: https://www.cnblogs.com/-0112/p/16374251.html