✨ 데이터 전처리 과정(Data Preprocessing Data)
주어진 데이터를 그대로 Data mining 하지 않고, 분석하기 적합하게 데이터를 가공하는 작업
✨ dplyr package
데이터를 빨리 가공할 수 있게 도와주는 package
이 패키지의 내부 함수를 알아보고자 한다.
- filter() : 행 추출
- select() : 열(variable) 추출
- arrange() : 정렬
- mutate() : variable 추가
- summarise() : 통계치 산출
- group_by() : grouping, 집단별로 나누기
- left_join() : 데이터 합치기(열, variable)
- bind_rows() : 데이터 행(record) 합치기
bind_rows()์์๋ ๋ณ์์ ๊ฐ์์ ๋ณ์์ ์ด๋ฆ์ด ๊ฐ์์ผํ๋ค.
summarise()์ group_by()๋ aggregation function์ ์ฃผ๋ก ์ฌ์ฉํ๋ค. db์์ group by, having๊ณผ ๋น์ทํ ์ญํ ์ ํ๋ ๊ฒ์ด๋ค.
install.packages("dplyr")
library(dplyr)
✨ dplyr method 사용
출처의 데이터를 바탕으로 dplyr method 사용을 익혀보고자 한다.
📌 filter()
# filter: keep only the rows where class equals 1
# Base R form: logical row index, empty column index keeps every column
exam[exam$class == 1, ]
# dplyr form: pipe exam into filter() with the %>% operator
exam %>% filter(class == 1)
위 코드와 아래 코드는 정확히 같은 데이터를 보여준다.
%>%는 파이프 연산자로 Ctrl + Shift + M을 누르면 %>%이 삽입된다.
# Rows with math above 80 and english below 90; comma-separated
# conditions passed to filter() are combined with &
exam %>%
  filter(math > 80, english < 90)
또한 값을 꺼낼 수도 있다.
# Store the class 1 rows, then average their math column
class1 <- exam %>% filter(class == 1)
mean(class1$math)
filter()๋ db์ where ์ ๊ณผ ๊ฐ์ ์ญํ ์ ํ๋ค.
๐ select()
# A single column
exam %>%
  select(math)

# Several columns at once
exam %>%
  select(math, english, science)

# Every column except math
exam %>%
  select(-math)

# id and math columns, first 10 rows only
exam %>%
  select(id, math) %>%
  head(n = 10)
📌 filter() & select()
# Class 1 rows only, reduced to the math and english columns
exam %>%
  filter(class == 1) %>%
  select(math, english)
๐ arrange()
์ ๋ ฌ, order by์ ๊ฐ์
# Ascending sort on a single column
exam %>%
  arrange(math)
exam %>%
  arrange(id)
exam %>%
  arrange(class)

# Sort by id first, then by class within equal ids
exam %>%
  arrange(id, class)

# Descending class order, first 10 rows
exam %>%
  arrange(desc(class)) %>%
  head(n = 10)
๋ด๋ฆผ์ฐจ์์ผ๋ก ๋ณ๊ฒฝํ๊ณ ์ถ์ผ๋ฉด desc() method๋ฅผ ์ฌ์ฉํ๋ฉด ๋๋ค.
๐ mutate()
ํ์๋ณ์ ์์ฑ method => db add column (๋ณ์(variable), ์ด ์ถ๊ฐ)
# Add total as the sum of the three subject columns
exam %>%
  mutate(total = english + math + science)

# total is created first and reused by mean within the same mutate() call
exam %>%
  mutate(
    total = english + math + science,
    mean = total / 3
  )

# Label each row "P" when science is at least 60, otherwise "F"
exam %>%
  mutate(test = ifelse(science >= 60, "P", "F")) %>%
  head()
📌 arrange() & mutate()
# Add the total column, sort from highest to lowest total, show the first rows
exam %>%
  mutate(total = english + math + science) %>%
  arrange(desc(total)) %>%
  head()
📌 group_by() & summarise()
# Mean of math over the whole table
exam %>% summarise(math_mean = mean(math))

# Mean of math per class
exam %>%
  group_by(class) %>%
  summarise(math_mean = mean(math))

# Several summary statistics per class
exam %>%
  group_by(class) %>%
  summarise(
    mean_math = mean(math),
    sum_math = sum(math),
    median_math = median(math),
    n = n() # n() is the frequency, i.e. the number of rows in each group
  )
함수 | 의미 |
---|---|
mean() | 평균 |
sd() | 표준편차 |
sum() | 합계 |
median() | 중위수 |
min() | 최솟값 |
max() | 최댓값 |
n() | 빈도 |
📌 연습
# NOTE(review): mpg is not defined in this section — presumably the ggplot2
# sample data set; confirm it is loaded before running.

# Mean hwy for audi rows versus toyota rows
mpg_audi <- mpg %>%
  filter(manufacturer == "audi")
mpg_toyota <- mpg %>%
  filter(manufacturer == "toyota")
mean(mpg_audi$hwy)
mean(mpg_toyota$hwy)

# Keep only the class and cty columns, then print the result
mpg_new <- mpg %>%
  select(class, cty)
mpg_new

# The 5 audi rows with the highest hwy
mpg %>%
  filter(manufacturer == "audi") %>%
  arrange(desc(hwy)) %>%
  head(n = 5)

# Among "suv" rows: average (cty + hwy) / 2 per manufacturer,
# then show the 5 manufacturers with the highest average
mpg %>%
  group_by(manufacturer) %>%
  filter(class == "suv") %>%
  mutate(mean_y = (cty + hwy) / 2) %>%
  summarise(mean_total = mean(mean_y)) %>%
  arrange(desc(mean_total)) %>%
  head(n = 5)
[출처]
https://rstudio-pubs-static.s3.amazonaws.com/382545_098d268806f449c496734236e0b97493.html
'Programming Language > R' 카테고리의 다른 글
[R studio] 환경, 파일 창이 사라졌을 때 (0) | 2022.04.25 |
---|---|
R 기초 명령어(qplot graph, 빈도 표, dplyr rename, 파생 변수, ifelse) (0) | 2022.04.08 |
R 기초 명령어(excel, ggplot2, rm) (0) | 2022.04.08 |
R 기초 명령어(c(), factor(), class(), levels(), as.numeric(), is.numeric()) (0) | 2022.04.07 |