# R语言 迪士尼点评文本挖掘
# 作者:互联网
# Set the working directory so the relative file names used below resolve.
# NOTE(review): this is a hard-coded absolute Windows path; the script fails on
# any machine that does not have this folder — consider launching the script
# from the project directory instead of calling setwd().
setwd("D:\\迪士尼点评文本挖掘")
# getwd() shows the current working directory.
#加载包()
library(DBI)
#library(RMySQL)
library(rJava)
library(openxlsx)
library(stringr)
library(xlsxjars)
library(reshape)
library(readxl)
library(xlsx)
library(sqldf)
library(wordcloud)
library(Rwordseg) #加载分词包
library(tm)
library(tmcn)
library(jiebaRD)
library(jiebaR) #里面的segmentC用于分词
#library(wordcloud2)
# ---- Read input sheets ----------------------------------------------------
# Raw reviews: first sheet of the workbook.
comment_01 <- read_excel("点评文本挖掘.xlsx", 1)
stopifnot("点评内容" %in% names(comment_01))
# read_excel() returns a tibble, and `tbl[, "col"]` on a tibble is still a
# one-column tibble. Passing that to gsub() coerces the whole column into one
# deparsed string, so extract the column as a plain vector with `[[`.
comment_01 <- comment_01[["点评内容"]]
# Empty cells are read as NA; drop them so they are not segmented as text.
comment_01 <- comment_01[!is.na(comment_01)]
# Stop-word list: first sheet of the workbook.
stopwords_01 <- read_excel("停用词汇总.xlsx", 1)
# str(comment_01)  # compact view of the object's structure, similar to head()

# ---- Segmentation dictionaries ---------------------------------------------
# Install the Sogou cell dictionaries and the custom dictionary.
# listDict() shows installed dictionaries; uninstallDict("<dictname>") removes
# one (e.g. "disney", "Vocabulary_books", "dictionary_01").
installDict(
  dictpath = '旅游词汇大全【官方推荐】.scel',
  dictname = 'Vocabulary_books',
  dicttype = 'scel'
) # travel vocabulary
installDict(
  dictpath = 'disney.scel',
  dictname = 'disney',
  dicttype = 'scel'
) # Disney vocabulary
installDict(
  dictpath = '自定义词典.txt',
  dictname = 'dictionary_01'
) # custom user dictionary
listDict()

# Extra words that must be kept as single tokens by the segmenter.
insertWords(c(
  "排队", "不满意", "非常满意", "很好", "不方便", "非常好", "很棒",
  "驴妈妈", "不舒服", "不值", "七个小矮人", "飞越地平线", "创极速光轮",
  "米奇大街", "奇想花园", "梦幻世界", "探险岛", "宝藏湾", "明日世界",
  "巴斯光年星际营救", "喷气背包飞行器", "太空幸会史迪奇",
  "星球大战远征基地", "皮克斯玩具总动员", "快速通道", "加勒比海盗",
  "灯光秀", "飞跃地平线"
))

# ---- Clean and segment -----------------------------------------------------
# Strip ASCII letters and digits from every review.
comment_02 <- gsub('[0-9a-zA-Z]', '', comment_01)
# Reviews that consisted only of letters/digits are now empty; skip them.
comment_02 <- comment_02[nzchar(trimws(comment_02))]
# Segment each review and flatten the result into one vector of tokens.
segword <- unlist(lapply(X = comment_02, FUN = segmentCN))
# segword[1:10]  # peek at the first tokens

# ---- Stop words as a plain vector ------------------------------------------
# The sheet is read as a data frame; take its first column as a vector and
# drop empty cells (NA) so they cannot interfere with matching.
# head(stopwords_01); class(stopwords_01)  # inspect before converting
stopwords_01 <- stopwords_01[[1]]
stopwords_01 <- stopwords_01[!is.na(stopwords_01)]
# Helper that removes stop words from a token vector:
# Remove stop words from a vector of tokens.
#
# Args:
#   x:         vector of tokens (may be empty or NULL).
#   stopwords: vector of words to remove.
#
# Returns:
#   The elements of `x` that do not occur in `stopwords`, in their original
#   order. NA tokens are dropped. Empty input yields character(0).
removeStopWords <- function(x, stopwords) {
  if (length(x) == 0L) {
    return(character(0))
  }
  # %in% never returns NA, so an NA inside `stopwords` cannot poison the
  # comparison (with `==` it made every token look like a stop word), and the
  # whole vector is filtered at once instead of growing a result with c().
  keep <- !is.na(x) & !(x %in% stopwords)
  x[keep]
}
# ---- Remove stop words -----------------------------------------------------
# removeStopWords() is applied to each token; removed tokens come back as
# character(0) and vanish when the list is flattened below.
segword3 <- lapply(segword, removeStopWords, stopwords_01)

# ---- Word cloud --------------------------------------------------------------
# createWordFreq() comes from the tmcn package; its result is used below via
# the `word` and `freq` columns.
word_freq <- createWordFreq(unlist(segword3))
# Save the current graphics parameters so the black background is temporary.
opar <- par(no.readonly = TRUE)
par(bg = 'black')
# Plot at most the 100 most frequent words.
wordcloud(
  words = word_freq$word,
  freq = word_freq$freq,
  max.words = 100,
  random.color = TRUE,
  colors = rainbow(n = 7)
)
par(opar)

# ---- Export ------------------------------------------------------------------
# File name is "<today's date>数据.xlsx".
result_filename <- paste0(Sys.Date(), '数据', ".xlsx")
# Both openxlsx and xlsx are attached and both export write.xlsx(); xlsx is
# attached last, so it is the one the unqualified call resolved to. Qualify it
# so the result no longer depends on the order of the library() calls.
xlsx::write.xlsx(word_freq, result_filename, sheetName = '明细')
# 标签:迪士尼,01,word,点评,library,uninstallDict,stopwords,freq,文本 来源: https://www.cnblogs.com/daisy-ma/p/10846086.html