티스토리 뷰
INTRO
data.table
Rstudio Console> install.packages("data.table")
data.table
data.table은 간결한 syntax로 grouping, ordering 등의 기능을 제공하는 data.frame의 확장 데이터 구조이다. 이 글에서는 data.table을 이용한 데이터 조작, 관리, 처리에 대해 알아보자.
test_datatable.R
# Basic tour of data.table: creating a table, inspecting it, subsetting rows
# and columns, and looking rows up through a key.

# Start from an empty workspace so that tables() below lists only the objects
# created by this script. NOTE: this wipes the caller's global environment.
rm(list = ls())

# The original line `setwd = "~/Rcoding"` did not change the working
# directory; it only created a character variable named `setwd`. This script
# reads and writes no files, so no working directory needs to be set.

library(data.table)

# Small example table: a grouping column `x` and five random normal values.
# No seed is set, so the values in `v` differ from run to run.
DT <- data.table(x = c("b", "b", "b", "a", "a"), v = rnorm(5))
print(DT)

## data.frame type
data(cars)
head(cars)

## data.table type
CARS <- data.table(cars)
head(CARS)

## data.table status
# tables() summarises every data.table in memory (rows, columns, size, key).
tables()
sapply(CARS, class)

## data.table handle
DT
DT[2, ]             # second row
DT[, 1]             # first column
DT[DT$x == "b", ]   # rows selected with a logical vector

## data.table assign the key
DT
tables()
# setkey() sorts DT by `x` in place and records `x` as the key.
setkey(DT, x)
DT
tables()
# With a key set, a character value in `i` is matched against the key column.
DT["b", ]
DT["b", mult = "first"]   # only the first matching row
DT["b", mult = "last"]    # only the last matching row
출력결과
> source("~/Rcoding/test_datatable.R", echo=TRUE)
> rm(list=ls())
> setwd = "~/Rcoding"
> library(data.table)
> DT = data.table(x=c('b','b','b','a','a'), v=rnorm(5))
> print(DT)
x v
1: b -0.7970895
2: b 1.2540831
3: b 0.7721422
4: a -0.2195156
5: a -0.4248103
> ## data.frame type
> data(cars)
> head(cars)
speed dist
1 4 2
2 4 10
3 7 4
4 7 22
5 8 16
6 9 10
> ## data.table type
> CARS = data.table(cars)
> head(CARS)
speed dist
1: 4 2
2: 4 10
3: 7 4
4: 7 22
5: 8 16
6: 9 10
> ## data.table status
> tables()
NAME NROW NCOL MB COLS KEY
1: CARS 50 2 0 speed,dist
2: DT 5 2 0 x,v
Total: 0MB
> sapply(CARS,class)
speed dist
"numeric" "numeric"
> ## data.table handle
> DT
x v
1: b -0.7970895
2: b 1.2540831
3: b 0.7721422
4: a -0.2195156
5: a -0.4248103
> DT[2,]
x v
1: b 1.254083
> DT[,1]
x
1: b
2: b
3: b
4: a
5: a
> DT[DT$x=="b",]
x v
1: b -0.7970895
2: b 1.2540831
3: b 0.7721422
> ## data.table assign the key
> DT
x v
1: b -0.7970895
2: b 1.2540831
3: b 0.7721422
4: a -0.2195156
5: a -0.4248103
> tables()
NAME NROW NCOL MB COLS KEY
1: CARS 50 2 0 speed,dist
2: DT 5 2 0 x,v
Total: 0MB
> setkey(DT,x)
> DT
x v
1: a -0.2195156
2: a -0.4248103
3: b -0.7970895
4: b 1.2540831
5: b 0.7721422
> tables()
NAME NROW NCOL MB COLS KEY
1: CARS 50 2 0 speed,dist
2: DT 5 2 0 x,v x
Total: 0MB
> DT["b",]
x v
1: b -0.7970895
2: b 1.2540831
3: b 0.7721422
> DT["b", mult="first"]
x v
1: b -0.7970895
> DT["b", mult="last"]
x v
1: b 0.7721422
data.table 활용 : data search
data.frame(DF)의 vector scan vs. data.table(DT)의 binary search
test_datatable_advanced.R
# Compares the time needed to find one (x, y) group in ~10 million rows using
# a data.frame logical subset versus a keyed data.table lookup.

# Start from an empty workspace. NOTE: this wipes the caller's global
# environment.
rm(list = ls())

# The original line `setwd = "~/Rcoding"` did not change the working
# directory; it only created a character variable named `setwd`. This script
# reads and writes no files, so no working directory needs to be set.

# rm() does not load or unload packages, so load data.table explicitly
# instead of relying on an earlier session having done so.
library(data.table)

#### create data of 10 million rows and 676 groups
grpsize <- ceiling(1e7 / 26^2)

## time to create data.frame
# The argument must be spelled `stringsAsFactors`. The original misspelling
# `stringAsFactors` was taken by data.frame() as a data column, which added a
# fourth column filled with FALSE (visible in the printed output that
# accompanies this script).
ttCreateDF <- system.time(DF <- data.frame(
  x = rep(LETTERS, each = 26 * grpsize),
  y = rep(letters, each = grpsize),
  v = runif(grpsize * 26^2),
  stringsAsFactors = FALSE
))
print(ttCreateDF)

# DF status
head(DF, 3)
tail(DF, 3)
dim(DF)

# ---- time to find target data : vector scan ----
# Builds two full-length logical vectors and combines them elementwise.
ttSearchres <- system.time(resDF <- DF[DF$x == "R" & DF$y == "h", ])
print(ttSearchres)

# resDF status
head(resDF, 3)
tail(resDF, 3)
dim(resDF)

## ---- time to find target data : binary search ----
DT <- data.table(DF)
# Key on (x, y): sorts DT by those columns so that .("R", "h") can be looked
# up through the key.
setkey(DT, x, y)
ttSearchresDT <- system.time(resDT <- DT[.("R", "h")])
print(ttSearchresDT)

# resDT status
head(resDT, 3)
tail(resDT, 3)
dim(resDT)

# Both approaches must return the same values in the same order.
identical(resDF$v, resDT$v)
출력결과
DF search result : elapsed 0.177
DT search result : elapsed 0.003
> source("~/Rcoding/test_datatable_advanced.R", echo=TRUE)
> rm(list=ls())
> setwd = "~/Rcoding"
> #### create data of 10milion rows and 676 groups
> grpsize = ceiling(1e7/26^2)
> ## time to create data.frame
> ttCreateDF = system.time(DF <- data.frame(
+ x = rep(LETTERS, each=26*grpsize),
+ y = rep(letters, each=grpsize) .... [TRUNCATED]
> print(ttCreateDF)
user system elapsed
0.539 0.002 0.540
> # DF status
> head(DF,3)
x y v stringAsFactors
1 A a 0.71354214 FALSE
2 A a 0.05661365 FALSE
3 A a 0.53363919 FALSE
> tail(DF,3)
x y v stringAsFactors
10000066 Z z 0.09806447 FALSE
10000067 Z z 0.99961317 FALSE
10000068 Z z 0.73183891 FALSE
> dim(DF)
[1] 10000068 4
> # ---- time to find target data : vector scan ----
> ttSearchres = system.time(resDF <- DF[DF$x=='R' & DF$y=='h',])
> print(ttSearchres)
user system elapsed
0.167 0.009 0.177
> # resDF status
> head(resDF,3)
x y v stringAsFactors
6642058 R h 0.7099618 FALSE
6642059 R h 0.7154811 FALSE
6642060 R h 0.3180660 FALSE
> tail(resDF,3)
x y v stringAsFactors
6656848 R h 0.57967734 FALSE
6656849 R h 0.58723313 FALSE
6656850 R h 0.06696274 FALSE
> dim(resDF)
[1] 14793 4
> ## ---- time to find target data : binary search ----
> DT <- data.table(DF)
> setkey(DT,x,y)
> ttSearchresDT = system.time(resDT <- DT[.('R','h')])
> print(ttSearchresDT)
user system elapsed
0.003 0.001 0.003
> # resDT status
> head(resDT,3)
x y v stringAsFactors
1: R h 0.7099618 FALSE
2: R h 0.7154811 FALSE
3: R h 0.3180660 FALSE
> tail(resDT,3)
x y v stringAsFactors
1: R h 0.57967734 FALSE
2: R h 0.58723313 FALSE
3: R h 0.06696274 FALSE
> dim(resDT)
[1] 14793 4
> identical(resDF$v,resDT$v)
[1] TRUE
data.table 활용 : j, by
test_datatable_advanced.R
동일한 R 파일 뒤에 다음 코드를 붙여 실행한다
...
(상단 생략)
...
## 2nd arg. : j, 3rd arg. : by
# `j` is evaluated over the rows of DT; `by` evaluates it once per group.
DT[, sum(v)]
DT[, sum(v), by = x]

# search time between tapply and by
# Grouped sum of `v` by `x` with base R tapply().
tttDF <- system.time(ttSearchresDF <- tapply(DT$v, DT$x, sum))
tttDF

# The same grouped sum with data.table's `by`.
tttDT <- system.time(ttSearchresDT <- DT[, sum(v), by = x])
tttDT

head(ttSearchresDF)
head(ttSearchresDT)

# tapply() returns a named array, so drop the names before comparing with the
# V1 column of the data.table result.
identical(as.vector(ttSearchresDF), ttSearchresDT$V1)

# Group by both columns at once.
tttDT <- system.time(ttSearchresDT <- DT[, sum(v), by = c("x", "y")])
tttDT
ttSearchresDT
출력결과
...
(상단 생략)
...
> ## 2nd arg. : j, 3rd arg. : by
> DT[,sum(v)]
[1] 5000784
> DT[,sum(v),by=x]
x V1
1: A 192535.5
2: B 192443.5
3: C 192570.7
4: D 192413.4
5: E 192484.7
6: F 192385.8
7: G 192109.2
8: H 192227.4
9: I 192026.9
10: J 192476.7
11: K 192384.9
12: L 192374.9
13: M 192046.9
14: N 192316.5
15: O 192499.7
16: P 192325.3
17: Q 192590.1
18: R 192388.0
19: S 192199.5
20: T 192202.6
21: U 192453.2
22: V 192077.6
23: W 192272.3
24: X 192360.7
25: Y 192226.3
26: Z 192391.6
x V1
> # search time between tapply and by
> tttDF = system.time(ttSearchresDF <- tapply(DT$v,DT$x,sum));tttDF
user system elapsed
0.522 0.068 0.589
> tttDT = system.time(ttSearchresDT <- DT[,sum(v),by=x]);tttDT
user system elapsed
0.194 0.037 0.231
> head(ttSearchresDF)
A B C D E F
192535.5 192443.5 192570.7 192413.4 192484.7 192385.8
> head(ttSearchresDT)
x V1
1: A 192535.5
2: B 192443.5
3: C 192570.7
4: D 192413.4
5: E 192484.7
6: F 192385.8
> identical(as.vector(ttSearchresDF),ttSearchresDT$V1)
[1] TRUE
> tttDT = system.time(ttSearchresDT <- DT[,sum(v),by='x,y']);tttDT;ttSearchresDT
user system elapsed
0.554 0.021 0.574
x y V1
1: A a 7380.790
2: A b 7379.921
3: A c 7407.945
4: A d 7353.961
5: A e 7424.646
---
672: Z v 7433.949
673: Z w 7401.457
674: Z x 7384.072
675: Z y 7419.421
676: Z z 7417.403
반응형
'R' 카테고리의 다른 글
(macOS)[R] 기초 통계 분석 : 기술 통계 (0) | 2022.04.05 |
---|---|
(macOS)[R] 결측값 처리 / 이상치 탐색 (0) | 2022.04.04 |
(macOS)[R] sqldf, plyr (0) | 2022.03.30 |
(macOS)[R] reshape (0) | 2022.03.29 |
(macOS)[R] plot : scatter plot, histogram, box plot (0) | 2022.03.29 |
댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
- Total
- Today
- Yesterday
링크
TAG
- github
- sublime text
- 코로나19
- raspberrypi
- SSH
- Raspberry Pi
- MacOS
- Regression
- 라즈베리파이
- Model
- Django
- vscode
- 자가격리
- arduino
- git
- DAQ
- Pandas
- server
- Templates
- COVID-19
- pyserial
- CSV
- r
- template
- 코로나
- DS18B20
- analysis
- 확진
- ERP
- Python
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
1 | ||||||
2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 | 25 | 26 | 27 | 28 |
글 보관함