
Scraping News with R


library(rvest)
library(magrittr)
library(jiebaR)
library(ggplot2)
# Load the scraping, segmentation, and plotting packages
url <- 'https://www.thepaper.cn/'
# The front page we want to scrape
web <- read_html(url)
# Parse the news homepage into an HTML document
news <- web %>% html_nodes('h2 a')
# Select the headline <a> nodes inside <h2> elements
titles <- news %>% html_text()
# Pipe the nodes through html_text() to get the news titles
link <- news %>% html_attrs()

# Each <a> node carries three attributes (href, id, target); keep the first, href
link1 <- c(1:length(link))
for (i in 1:length(link)) {
  link1[i] <- link[[i]][1]
}
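As an aside (my addition, not in the original post), rvest can pull a single named attribute directly with html_attr(), which replaces the loop above:

# Equivalent one-liner: html_attr() extracts one attribute from each node
# and returns NA where it is missing.
link1 <- news %>% html_attr("href")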
link2 <- paste("https://www.thepaper.cn/", link1, sep = "")
# Build absolute article URLs from the relative hrefs on the front page
# Fetch each article body; pages come in a video layout (div.video_txt_l)
# and a plain-text layout (div.news_txt)
news_content <- c(1:length(link2))
for (i in 1:length(link2)) {
  x <- read_html(link2[i]) %>% html_nodes('div.video_txt_l p') %>% html_name()
  if (identical(x, 'p'))
    news_content[i] <- read_html(link2[i]) %>%
      html_nodes('div.video_txt_l p') %>% html_text() %>% paste(collapse = "\n")
  else
    news_content[i] <- read_html(link2[i]) %>%
      html_nodes('div.news_txt') %>% html_text() %>% paste(collapse = "\n")
}
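A more defensive variant (a sketch of my own, not from the original post) wraps each request in tryCatch so one failed page does not abort the whole loop, and returns NA for layouts it does not recognize:

# Sketch: fetch one article body; try the video layout first, then the
# plain-text layout; return NA on request errors or unknown layouts.
get_body <- function(u) {
  tryCatch({
    page <- read_html(u)
    ps <- page %>% html_nodes('div.video_txt_l p') %>% html_text()
    if (length(ps) == 0)
      ps <- page %>% html_nodes('div.news_txt') %>% html_text()
    if (length(ps) > 0) paste(ps, collapse = "\n") else NA_character_
  }, error = function(e) NA_character_)
}
news_content <- vapply(link2, get_body, character(1))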
# Grab each article's publication timestamp
news_date <- c(1:length(link2))
for (i in 1:length(link2)) {
  news_date[i] <- (read_html(link2[i]) %>% html_nodes('div p') %>% html_text())[2]
}

# Tokens 21 and 22 of the space-split string happen to hold the date and time
date <- c(1:length(link2))
time <- c(1:length(link2))
parts <- strsplit(news_date, split = ' ')
for (i in 1:length(link2)) {
  date[i] <- parts[[i]][21]
  time[i] <- parts[[i]][22]
}
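Picking tokens 21 and 22 is brittle; a regex sketch (my addition, assuming the page embeds timestamps shaped like 2021-11-13 09:30) is less dependent on the layout:

# Sketch: extract a "YYYY-MM-DD HH:MM" stamp with a regex instead of
# relying on fixed token positions.
library(stringr)
stamp <- str_extract(news_date, "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}")
date <- str_sub(stamp, 1, 10)
time <- str_sub(stamp, 12, 16)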

news_01 <- data.frame(titles, date, time, url = link2, news_content)
save(news_01, file = "thepaper1.Rdata")

# Export the assembled table to CSV for inspection outside R
write.csv(news_01, file = "newspaperto.csv")
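To pick the work back up in a later session, either file can be read back in:

# Restore the saved data from either format.
load("thepaper1.Rdata")  # brings news_01 back into the workspace
news_01 <- read.csv("newspaperto.csv", stringsAsFactors = FALSE)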

# Chinese word segmentation
library(jiebaR)
wk <- worker()
words <- segment(news_content, wk)
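worker() builds a segmenter with jieba's default dictionary, and segment() returns a plain character vector of tokens. A quick smoke test on a short string (the output shown is indicative; actual segmentation depends on the dictionary):

# Smoke test of the segmenter on a short sentence.
segment("今天天气很好", wk)
# e.g. "今天" "天气" "很" "好"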
library(dplyr)
library(stringr)

# Segment again with a stop-word list
wktest <- worker(stop_word = 'stop_words.txt')
wordstest <- segment(news_content, wktest)
text_dftest <- tibble(line = c(1:length(wordstest)), word = wordstest)
text_dftest %>% count(word, sort = TRUE)
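A common extra cleanup step (my addition, not in the original) is dropping single-character tokens and pure digits before counting, since they tend to dominate Chinese frequency tables:

# Sketch: filter out one-character tokens and numbers, then recount.
text_dftest %>%
  filter(nchar(word) > 1, !str_detect(word, "^[0-9]+$")) %>%
  count(word, sort = TRUE)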


# Plot the most frequent words
text_dftest %>%
  count(word, sort = TRUE) %>%
  filter(n > 40) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "pink") +
  xlab(NULL) +
  coord_flip()
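To keep the chart, ggsave() writes the most recent plot to disk (the file name and dimensions here are arbitrary choices):

# Save the last ggplot to a PNG file.
ggsave("word_freq.png", width = 6, height = 8, dpi = 300)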

Note: this scraper only handles the video and plain-text article layouts that existed at the time of writing; other formats were not considered, and the CSS selectors may no longer match.

GitHub download link

[R web scraper](https://github.com/lehoso/RWebCrawler)

Source: https://www.cnblogs.com/lehoso/p/15550194.html