R语言数据预处理基础
作者:互联网
本文包含数据的创建、缺失值的判断与处理、日期处理、数据排序以及观测的选取,并用三种方式(下标/逻辑比较、subset()函数、SQL语句)演示了如何选取观测;其中SQL语句依赖sqldf包,需要先用 install.packages("sqldf") 安装,并用 library(sqldf) 加载。
类型转换函数本文没有进行测试,
常用的类型转换函数有 as.numeric()、as.character()、as.vector()、as.matrix()、as.data.frame()、as.factor()、as.logical() 等。
> manager <- c(1,2,3,4,5)
> date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
> country <- c("US","US","UK","UK","UK")
> gender <- c("M","F","F","M","F")
> age <- c(32,42,25,39,99)
> q1 <- c(5,3,3,3,2)
> q2 <- c(4,5,5,3,2)
> q3 <- c(5,2,5,4,1)
> q4 <- c(5,5,5,NA,2)
> q5 <- c(5,5,2,NA,1)
> leadership <- data.frame(manager,date,country,gender,age,
+ q1,q2,q3,q4,q5,stringsAsFactors = FALSE)
> leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 42 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F 99 2 2 1 2 1
> #对测试数据的缺失值判断
> is.na(leadership[,6:10])
q1 q2 q3 q4 q5
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE TRUE TRUE
[5,] FALSE FALSE FALSE FALSE FALSE
> #去除缺失值
> newdata <- na.omit(leadership)
> newdata
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 42 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
5 5 5/1/09 UK F 99 2 2 1 2 1
> #数据排序
> newdata1 <- leadership[order(leadership$age),]
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:
age, country, date, gender, manager, q1, q2, q3, q4, q5
> #按age排序
> newdata1
manager date country gender age q1 q2 q3 q4 q5
3 3 10/1/08 UK F 25 3 5 5 5 2
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
2 2 10/28/08 US F 42 3 5 2 5 5
5 5 5/1/09 UK F 99 2 2 1 2 1
> newdata2 <- leadership[order(gender,age),]
> detach(leadership)
> #按性别分组,年龄升序排序
> newdata2
manager date country gender age q1 q2 q3 q4 q5
3 3 10/1/08 UK F 25 3 5 5 5 2
2 2 10/28/08 US F 42 3 5 2 5 5
5 5 5/1/09 UK F 99 2 2 1 2 1
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:
age, country, date, gender, manager, q1, q2, q3, q4, q5
> newdata3 <- leadership[order(gender,-age),]
> detach(leadership)
> #按性别分组,降序年龄
> newdata3
manager date country gender age q1 q2 q3 q4 q5
5 5 5/1/09 UK F 99 2 2 1 2 1
2 2 10/28/08 US F 42 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
1 1 10/24/08 US M 32 5 4 5 5 5
> #入选观测,选择1-3行
> newdata4 <- leadership[1:3,]
> #逻辑比较
> newdata5 <- leadership[leadership$gender=="M" & leadership$age >30,]
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:
age, country, date, gender, manager, q1, q2, q3, q4, q5
> newdata5
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
> #入选观测性别为男,年龄大于30
> newdata6 <- leadership[gender=='M' & age > 30,]
> detach(leadership)
> newdata6
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
> #subset()函数实现选择观测
>
> newdata7 <- subset(leadership,age >= 35 | age<24,select = c(q1,q2,q3,q4))
> newdata7
q1 q2 q3 q4
2 3 5 2 5
4 3 3 4 NA
5 2 2 1 2
> newdata8 <- subset(leadership,gender=="M" & age>25,select = gender:q4)
> newdata8
gender age q1 q2 q3 q4
1 M 32 5 4 5 5
4 M 39 3 3 4 NA
> #直接使用sql语句查询(需先安装并用library(sqldf)加载sqldf包)
> newdf <- sqldf("select * from leadership where country='US'",row.names = TRUE)
> newdf
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 42 3 5 2 5 5
> #日期格式的观测
>
> #默认方式
> mydates <- as.Date(c("2021-06-22","2022-02-13"))
> mydates
[1] "2021-06-22" "2022-02-13"
> #将非默认格式(月/日/年)的字符串按指定格式转换为日期
>
> strDates <- c("01/05/2022","08/16/2021")
> dates <- as.Date(strDates,"%m/%d/%Y")
> dates
[1] "2022-01-05" "2021-08-16"
> #将数据框里原本为字符型的日期列转换为日期型(Date)
>
> myformat <- "%m/%d/%y"
> leadership$date <- as.Date(leadership$date,myformat)
> leadership$date
[1] "2008-10-24" "2008-10-28" "2008-10-01" "2008-10-12" "2009-05-01"
> #变量的创建
> mydata <- data.frame(x1=c(2,2,6,4),x2=c(3,4,2,8))
> mydata$sumx <- mydata$x1+mydata$x2
> mydata$meanx <- (mydata$x1 + mydata$x2)/2
> attach(mydata)
> mydata
x1 x2 sumx meanx
1 2 3 5 2.5
2 2 4 6 3.0
3 6 2 8 4.0
4 4 8 12 6.0
> mydata$sumx <- x1+x2
> mydata$meanx <- (x1+x2)/2
> detach(mydata)
> mydata
x1 x2 sumx meanx
1 2 3 5 2.5
2 2 4 6 3.0
3 6 2 8 4.0
4 4 8 12 6.0
> mydata <- transform(mydata,sumx=x1+x2,meanx=(x1+x2)/2)
> mydata
x1 x2 sumx meanx
1 2 3 5 2.5
2 2 4 6 3.0
3 6 2 8 4.0
4 4 8 12 6.0
标签:q1,10,FALSE,语言,gender,08,数据,预处理,q4 来源: https://blog.csdn.net/weixin_44692890/article/details/122383224