티스토리 뷰

R

(macOS)[R] data.table

jinozpersona 2022. 3. 30. 13:48

INTRO

data.table

Rstudio Console> install.packages("data.table")

 

data.table

data.table은 간결한 syntax로 grouping, ordering 등의 기능을 제공하는 data.frame의 확장 데이터 구조이다. 여기서는 data.table의 조작, 관리, 처리 방법에 대해 알아보자.

test_datatable.R
# Basic data.table walkthrough: build two tables, inspect them, subset rows
# and columns, then set a key and look rows up by key value.

# Start from an empty workspace so that tables() below lists only the
# objects created by this script.
rm(list = ls())
# Change the working directory. The earlier form `setwd = "~/Rcoding"` only
# created a character variable named `setwd`; the directory never changed.
setwd("~/Rcoding")

library(data.table)
# Five rows: a character column x and five random normal values in v.
DT <- data.table(x = c("b", "b", "b", "a", "a"), v = rnorm(5))
print(DT)

## data.frame type
data(cars)
head(cars)

## data.table type
CARS <- data.table(cars)
head(CARS)

## data.table status
# tables() prints one summary row per data.table (name, size, columns, key).
tables()
sapply(CARS, class)

## data.table handle
DT
DT[2, ]            # second row
DT[, 1]            # first column, still printed as a data.table
DT[DT$x == "b", ]  # rows whose x equals "b"

## data.table assign the key
DT
tables()
# setkey() reorders DT by x and records x as its key, which then shows up in
# the KEY column of tables().
setkey(DT, x)
DT
tables()

# With a key set, a character value in i is looked up in the key column.
DT["b", ]
DT["b", mult = "first"]  # keep only the first matching row
DT["b", mult = "last"]   # keep only the last matching row

출력결과

> source("~/Rcoding/test_datatable.R", echo=TRUE)

> rm(list=ls())

> setwd = "~/Rcoding"

> library(data.table)

> DT = data.table(x=c('b','b','b','a','a'), v=rnorm(5))

> print(DT)
   x          v
1: b -0.7970895
2: b  1.2540831
3: b  0.7721422
4: a -0.2195156
5: a -0.4248103

> ## data.frame type
> data(cars)

> head(cars)
  speed dist
1     4    2
2     4   10
3     7    4
4     7   22
5     8   16
6     9   10

> ## data.table type
> CARS = data.table(cars)

> head(CARS)
   speed dist
1:     4    2
2:     4   10
3:     7    4
4:     7   22
5:     8   16
6:     9   10

> ## data.table status
> tables()
   NAME NROW NCOL MB       COLS KEY
1: CARS   50    2  0 speed,dist    
2:   DT    5    2  0        x,v    
Total: 0MB

> sapply(CARS,class)
    speed      dist 
"numeric" "numeric" 

> ## data.table handle
> DT
   x          v
1: b -0.7970895
2: b  1.2540831
3: b  0.7721422
4: a -0.2195156
5: a -0.4248103

> DT[2,]
   x        v
1: b 1.254083

> DT[,1]
   x
1: b
2: b
3: b
4: a
5: a

> DT[DT$x=="b",]
   x          v
1: b -0.7970895
2: b  1.2540831
3: b  0.7721422

> ## data.table assign the key
> DT
   x          v
1: b -0.7970895
2: b  1.2540831
3: b  0.7721422
4: a -0.2195156
5: a -0.4248103

> tables()
   NAME NROW NCOL MB       COLS KEY
1: CARS   50    2  0 speed,dist    
2:   DT    5    2  0        x,v    
Total: 0MB

> setkey(DT,x)

> DT
   x          v
1: a -0.2195156
2: a -0.4248103
3: b -0.7970895
4: b  1.2540831
5: b  0.7721422

> tables()
   NAME NROW NCOL MB       COLS KEY
1: CARS   50    2  0 speed,dist    
2:   DT    5    2  0        x,v   x
Total: 0MB

> DT["b",]
   x          v
1: b -0.7970895
2: b  1.2540831
3: b  0.7721422

> DT["b", mult="first"]
   x          v
1: b -0.7970895

> DT["b", mult="last"]
   x         v
1: b 0.7721422

 

 

data.table 활용 : data search

data.frame(DF)의 vector scan vs. data.table(DT)의 binary search

test_datatable_advanced.R
# Benchmark: find one (x, y) group in roughly 10 million rows, first with a
# data.frame vector scan and then with a keyed data.table lookup.

rm(list = ls())
# Change the working directory. The earlier form `setwd = "~/Rcoding"` only
# created a character variable named `setwd`; the directory never changed.
setwd("~/Rcoding")

# Attach data.table explicitly. rm(list = ls()) neither loads nor unloads
# packages, so without this line the script only runs when an earlier script
# in the same session has already attached data.table.
library(data.table)

#### create data of 10 million rows and 676 groups
grpsize <- ceiling(1e7 / 26^2)

## time to create data.frame
# x repeats each upper-case letter 26 * grpsize times. y repeats each
# lower-case letter grpsize times and is recycled by data.frame() to the
# length of x, which yields 26 * 26 = 676 (x, y) groups.
# The argument is spelled `stringsAsFactors`; the earlier misspelling
# `stringAsFactors` was taken as a fourth data column filled with FALSE.
ttCreateDF <- system.time(
  DF <- data.frame(
    x = rep(LETTERS, each = 26 * grpsize),
    y = rep(letters, each = grpsize),
    v = runif(grpsize * 26^2),
    stringsAsFactors = FALSE
  )
)
print(ttCreateDF)
# DF status
head(DF, 3)
tail(DF, 3)
dim(DF)

# ---- time to find target data : vector scan ----
# The logical mask compares every row of x and y against the target values.
ttSearchres <- system.time(resDF <- DF[DF$x == "R" & DF$y == "h", ])
print(ttSearchres)
# resDF status
head(resDF, 3)
tail(resDF, 3)
dim(resDF)

## ---- time to find target data : binary search ----
DT <- data.table(DF)
# Key on (x, y) so that .("R", "h") in i is matched against the key columns.
setkey(DT, x, y)
ttSearchresDT <- system.time(resDT <- DT[.("R", "h")])
print(ttSearchresDT)
# resDT status
head(resDT, 3)
tail(resDT, 3)
dim(resDT)

# Both searches must return exactly the same values in the same order.
identical(resDF$v, resDT$v)

출력결과

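DF search result : elapsed 0.177

DT search result : elapsed 0.003

※ 아래 출력에 보이는 stringAsFactors 열은, 실행 당시 data.frame()의 인자 stringsAsFactors를 stringAsFactors로 잘못 적어 일반 데이터 열로 추가된 것이다.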

 

> source("~/Rcoding/test_datatable_advanced.R", echo=TRUE)

> rm(list=ls())

> setwd = "~/Rcoding"

> #### create data of 10milion rows and 676 groups
> grpsize = ceiling(1e7/26^2)

> ## time to create data.frame 
> ttCreateDF = system.time(DF <- data.frame(
+   x = rep(LETTERS, each=26*grpsize),
+   y = rep(letters, each=grpsize) .... [TRUNCATED] 

> print(ttCreateDF)
   user  system elapsed 
  0.539   0.002   0.540 

> # DF status
> head(DF,3)
  x y          v stringAsFactors
1 A a 0.71354214           FALSE
2 A a 0.05661365           FALSE
3 A a 0.53363919           FALSE

> tail(DF,3)
         x y          v stringAsFactors
10000066 Z z 0.09806447           FALSE
10000067 Z z 0.99961317           FALSE
10000068 Z z 0.73183891           FALSE

> dim(DF)
[1] 10000068        4

> # ---- time to find target data : vector scan ----
> ttSearchres = system.time(resDF <- DF[DF$x=='R' & DF$y=='h',])

> print(ttSearchres)
   user  system elapsed 
  0.167   0.009   0.177 

> # resDF status
> head(resDF,3)
        x y         v stringAsFactors
6642058 R h 0.7099618           FALSE
6642059 R h 0.7154811           FALSE
6642060 R h 0.3180660           FALSE

> tail(resDF,3)
        x y          v stringAsFactors
6656848 R h 0.57967734           FALSE
6656849 R h 0.58723313           FALSE
6656850 R h 0.06696274           FALSE

> dim(resDF)
[1] 14793     4

> ## ---- time to find target data : binary search ----
> DT <- data.table(DF)

> setkey(DT,x,y)

> ttSearchresDT = system.time(resDT <- DT[.('R','h')])

> print(ttSearchresDT)
   user  system elapsed 
  0.003   0.001   0.003 

> # resDT status
> head(resDT,3)
   x y         v stringAsFactors
1: R h 0.7099618           FALSE
2: R h 0.7154811           FALSE
3: R h 0.3180660           FALSE

> tail(resDT,3)
   x y          v stringAsFactors
1: R h 0.57967734           FALSE
2: R h 0.58723313           FALSE
3: R h 0.06696274           FALSE

> dim(resDT)
[1] 14793     4

> identical(resDF$v,resDT$v)
[1] TRUE

 

data.table 활용 : j, by

test_datatable_advanced.R
동일한 R 파일 뒤에 다음 코드를 붙여 실행한다
...
(상단 생략)
...

## 2nd arg. : j, 3rd arg. : by
# This snippet continues the script above and uses the data.table DT built
# there (columns x, y, v).
DT[, sum(v)]          # sum of v over all rows
DT[, sum(v), by = x]  # sum of v within each value of x

# aggregation time: base tapply versus data.table by
tttDF <- system.time(ttSearchresDF <- tapply(DT$v, DT$x, sum))
tttDF
tttDT <- system.time(ttSearchresDT <- DT[, sum(v), by = x])
tttDT
head(ttSearchresDF)
head(ttSearchresDT)
# Compare the two sets of sums with a numeric tolerance. They are
# floating-point totals computed by different implementations, so exact
# equality via identical() is not a reliable check.
isTRUE(all.equal(as.vector(ttSearchresDF), ttSearchresDT$V1))

# Group by both x and y; by accepts a single comma-separated string of
# column names.
tttDT <- system.time(ttSearchresDT <- DT[, sum(v), by = "x,y"])
tttDT
ttSearchresDT

출력결과

...
(상단 생략)
...


> ## 2nd arg. : j, 3rd arg. : by
> DT[,sum(v)]
[1] 5000784

> DT[,sum(v),by=x]
    x       V1
 1: A 192535.5
 2: B 192443.5
 3: C 192570.7
 4: D 192413.4
 5: E 192484.7
 6: F 192385.8
 7: G 192109.2
 8: H 192227.4
 9: I 192026.9
10: J 192476.7
11: K 192384.9
12: L 192374.9
13: M 192046.9
14: N 192316.5
15: O 192499.7
16: P 192325.3
17: Q 192590.1
18: R 192388.0
19: S 192199.5
20: T 192202.6
21: U 192453.2
22: V 192077.6
23: W 192272.3
24: X 192360.7
25: Y 192226.3
26: Z 192391.6
    x       V1

> # search time between tapply and by
> tttDF = system.time(ttSearchresDF <- tapply(DT$v,DT$x,sum));tttDF
   user  system elapsed 
  0.522   0.068   0.589 

> tttDT = system.time(ttSearchresDT <- DT[,sum(v),by=x]);tttDT
   user  system elapsed 
  0.194   0.037   0.231 

> head(ttSearchresDF)
       A        B        C        D        E        F 
192535.5 192443.5 192570.7 192413.4 192484.7 192385.8 

> head(ttSearchresDT)
   x       V1
1: A 192535.5
2: B 192443.5
3: C 192570.7
4: D 192413.4
5: E 192484.7
6: F 192385.8

> identical(as.vector(ttSearchresDF),ttSearchresDT$V1)
[1] TRUE

> tttDT = system.time(ttSearchresDT <- DT[,sum(v),by='x,y']);tttDT;ttSearchresDT
   user  system elapsed 
  0.554   0.021   0.574 
     x y       V1
  1: A a 7380.790
  2: A b 7379.921
  3: A c 7407.945
  4: A d 7353.961
  5: A e 7424.646
 ---             
672: Z v 7433.949
673: Z w 7401.457
674: Z x 7384.072
675: Z y 7419.421
676: Z z 7417.403
반응형

'R' 카테고리의 다른 글

(macOS)[R] 기초 통계 분석 : 기술 통계  (0) 2022.04.05
(macOS)[R] 결측값 처리 / 이상치 탐색  (0) 2022.04.04
(macOS)[R] sqldf, plyr  (0) 2022.03.30
(macOS)[R] reshape  (0) 2022.03.29
(macOS)[R] plot : scatter plot, histogram, box plot  (0) 2022.03.29
댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2025/02   »
1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28
글 보관함