r - Why is the time complexity of this loop non-linear?


Why is the time complexity of this loop non-linear, and why is it so slow? The loop takes ~38 s at n = 50k and ~570 s at n = 200k. Is there a faster way to do this? Rprof() seems to indicate that writing to memory is the slow part.

df <- data.frame(replicate(5, runif(200000)))
df[, 1:3] <- round(df[, 1:3])

Rprof(line.profiling = TRUE); timer <- proc.time()
x <- df; n <- nrow(df); i <- 1
ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
rind <- which(apply(ind, 1, all))
n <- length(rind)
while (i <= n)
{
    x$X4[rind[i]+1] <- x$X4[rind[i]+1] + x$X4[rind[i]]
    x$X5[rind[i]+1] <- x$X4[rind[i]+1] * x$X3[rind[i]+1]
    x$X5[rind[i]+1] <- trunc(x$X5[rind[i]+1] * 10^8) / 10^8
    x$X1[rind[i]] <- NA
    i <- i + 1
}; x <- na.omit(x)
proc.time() - timer; Rprof(NULL)
summaryRprof(lines = "show")

The purpose of the algorithm is to iterate over the data frame and combine adjacent rows that match on the first three elements. That is, it removes one of the rows and adds some of that row's values to the other row. The resulting data frame should have n fewer rows, where n is the number of matching adjacent rows in the original data frame. Every time a pair of rows is combined, the indices of the source data frame and the new data frame go out of sync by 1, since one row is removed/omitted from the new frame, so i keeps track of the position in the source data frame, and q keeps track of the position in the new data frame.
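
For example (a toy illustration with made-up numbers), rows 1 and 2 below match on X1:X3, so they would be combined:

## toy illustration (made-up numbers): rows 1 and 2 match on X1:X3
before <- data.frame(X1 = c(1, 1, 0), X2 = c(0, 0, 1), X3 = c(1, 1, 1),
                     X4 = c(0.2, 0.3, 0.5), X5 = c(0.1, 0.4, 0.9))
## after combining: row 1 is removed, row 2 carries X4 = 0.2 + 0.3 = 0.5,
## and its X5 is recomputed as trunc(X4 * X3 * 10^8) / 10^8 = 0.5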

The code above has been updated following @joran's comment. The performance improved substantially, to ~5.5 s at n = 50k and ~88 s at n = 200k. However, the time complexity is still non-linear, which I can't fathom. I need to run this at n = 1 million or more, so the speed is still a big problem.

Only the X4 column's update depends on previous values, so the loop can be 'vectorized' (with a little bit of optimization, avoiding the addition of 1 to rind in each iteration) as

rind1 <- rind + 1L
for (i in seq_len(n))
    x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]

x$X5[rind1] <- x$X4[rind1] * x$X3[rind1]
x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8
x$X1[rind] <- NA
na.omit(x)

X4 is a numeric vector, and the update can be made more efficient by updating the vector rather than a column of the data.frame:

x4 <- x$X4
for (i in seq_len(n))
    x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]
x$X4 <- x4

For comparison, we have

f0 <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[, 1:3] <- round(df[, 1:3])

    x <- df; n <- nrow(df); i <- 1
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))
    n <- length(rind)

    while (i <= n)
    {
        x$X4[rind[i]+1] <- x$X4[rind[i]+1] + x$X4[rind[i]]
        x$X5[rind[i]+1] <- x$X4[rind[i]+1] * x$X3[rind[i]+1]
        x$X5[rind[i]+1] <- trunc(x$X5[rind[i]+1] * 10^8) / 10^8
        x$X1[rind[i]] <- NA
        i <- i + 1
    }
    na.omit(x)
}

f1a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[, 1:3] <- round(df[, 1:3])

    x <- df; n <- nrow(df)
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))

    rind1 <- rind + 1L
    for (i in seq_along(rind))
        x$X4[rind1[i]] <- x$X4[rind1[i]] + x$X4[rind[i]]

    x$X5[rind1] <- x$X4[rind1] * x$X3[rind1]
    x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8
    x$X1[rind] <- NA
    na.omit(x)
}

f4a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[, 1:3] <- round(df[, 1:3])

    x <- df; n <- nrow(df)
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))

    rind1 <- rind + 1L
    x4 <- x$X4
    for (i in seq_along(rind))
        x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]
    x$X4 <- x4

    x$X1[rind] <- NA
    x$X5[rind1] <- x4[rind1] * x$X3[rind1]
    x$X5[rind1] <- trunc(x$X5[rind1] * 10^8) / 10^8

    na.omit(x)
}

The results are the same:

> identical(f0(1000), f1a(1000))
[1] TRUE
> identical(f0(1000), f4a(1000))
[1] TRUE

The speedup is substantial (using library(microbenchmark)):

> microbenchmark(f0(10000), f1a(10000), f4a(10000), times=10)
Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
  f0(10000) 346.35906 354.37637 361.15188 363.71627 366.74944 373.88275    10
 f1a(10000) 124.71766 126.43532 127.99166 127.39257 129.51927 133.01573    10
 f4a(10000)  41.70401  42.48141  42.90487  43.00584  43.32059  43.83757    10

The reason for the difference can be seen when R has been compiled with memory profiling enabled:

> tracemem(x)
[1] "<0x39d93a8>"
> tracemem(x$X4)
[1] "<0x6586e40>"
> x$X4[1] <- 1
tracemem[0x39d93a8 -> 0x39d9410]:
tracemem[0x6586e40 -> 0x670d870]:
tracemem[0x39d9410 -> 0x39d9478]:
tracemem[0x39d9478 -> 0x39d94e0]: $<-.data.frame $<-
tracemem[0x39d94e0 -> 0x39d9548]: $<-.data.frame $<-
>

Each line indicates a memory copy, so updating a single cell in the data frame incurs 5 copies of the outer structure or of the vector itself. In contrast, a plain vector can be updated without copies:

> tracemem(x4)
[1] "<0xdd44460>"
> x4[1] = 1
tracemem[0xdd44460 -> 0x9d26c10]:
> x4[1] = 2
>

(The first assignment is expensive because it represents the duplication of the data.frame column; for subsequent updates only x4 refers to the vector being updated, so the vector does not need to be duplicated.)

The data.frame implementation seems to scale non-linearly:

> microbenchmark(f1a(100), f1a(1000), f1a(10000), f1a(100000), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f1a(100)    2.372266    2.479458    2.551568    2.524818    2.640244    2.727221    10
  f1a(1000)   10.831288   11.100009   11.210483   11.194863   11.432533   11.581644    10
 f1a(10000)  130.011104  138.686445  139.556787  141.138329  141.522686  147.993499    10
 f1a(1e+05) 4092.439956 4117.818817 4145.809235 4143.634663 4172.282888 4216.129732    10

The reason is apparent in the second line of the tracemem output above: updating a single row triggers a copy of the entire column. So the algorithm scales as the number of rows to update times the number of rows in a column, which is approximately quadratic.
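
To see the effect in isolation (a quick sketch, with arbitrarily chosen sizes), one can time a single-element update of a data.frame column against the same update of a plain vector as the length grows:

library(microbenchmark)
for (n in c(1e4, 1e5, 1e6)) {
    d <- data.frame(Y = numeric(n))  # a one-column data.frame of length n
    v <- numeric(n)                  # a plain vector of the same length
    ## the column update copies the whole column on each assignment, so its
    ## cost grows with n; the plain vector update stays cheap
    print(microbenchmark(d$Y[1] <- 1, v[1] <- 1, times = 10))
}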

f4a() appears to scale linearly:

> microbenchmark(f4a(100), f4a(1000), f4a(10000), f4a(100000), f4a(1e6), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f4a(100)    1.741458    1.756095    1.827886    1.773887    1.929943    2.003700    10
  f4a(1000)    5.286016    5.517491    5.558091    5.569514    5.671840    5.764022    10
 f4a(10000)   42.906895   43.025385   43.880020   43.928631   44.633684   44.983002    10
 f4a(1e+05)  467.698285  478.919843  539.696364  552.896109  576.707913  644.927832    10
 f4a(1e+06) 5385.029968 5521.645185 5614.960871 5573.475270 5794.307470 5823.868167    10

One could try to be clever about vectorizing the loop, but is it still necessary?
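
(For the record, here is one way the loop could be vectorized, sketched under the assumption that n = nrow(df) and rind is computed as above, and not benchmarked here: the running update of x4 is just a cumulative sum within each run of consecutive matching rows.)

## matched[i] is TRUE when row i matches row i + 1, i.e., when i is in rind
matched <- logical(n - 1L)
matched[rind] <- TRUE
## a new run starts at row 1 and wherever the chain of matches breaks
run_id <- cumsum(c(TRUE, !matched))
## within-run running sums reproduce the loop's effect on x4
x4 <- ave(df$X4, run_id, FUN = cumsum)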

A tuned version of the data-processing part of the function uses negative indexing (e.g., -nrow(df)) to remove rows from the data frame, rowSums() instead of apply(), and unname() so that subset operations don't carry around unused names:

g0 <- function(df) {
    ind <- df[-nrow(df), 1:3] == df[-1, 1:3]
    rind <- unname(which(rowSums(ind) == ncol(ind)))
    rind1 <- rind + 1L

    x4 <- df$X4
    for (i in seq_along(rind))
        x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]

    df$X4 <- x4
    df$X1[rind] <- NA
    df$X5[rind1] <- trunc(df$X4[rind1] * df$X3[rind1] * 10^8) / 10^8

    na.omit(df)
}

Compared with the data.table solution suggested by @Khashaa,

g1 <- function(df) {
    x <- setDT(df)[, r := rleid(X1, X2, X3)]
    x <- x[, .(X1 = X1[.N], X2 = X2[.N], X3 = X3[.N], X4 = sum(X4), X5 = X5[.N]), by = r]
    x <- x[, X5 := trunc(X3 * X4 * 10^8) / 10^8]
    x
}

the base R version performs favorably, with times

> n_row <- 200000
> set.seed(123)
> df <- data.frame(replicate(5, runif(n_row)))
> df[, 1:3] <- round(df[, 1:3])
> system.time(g0res <- g0(df))
   user  system elapsed
  0.247   0.000   0.247
> system.time(g1res <- g1(df))
   user  system elapsed
  0.551   0.000   0.551

(The pre-tuning version in f4a() takes 760 ms, more than twice as slow.)

The results from the data.table implementation are apparently not correct:

> head(g0res)
  X1 X2 X3        X4        X5
1  0  1  1 0.4708851 0.8631978
2  1  1  0 0.8977670 0.8311355
3  0  1  0 0.7615472 0.6002179
4  1  1  1 0.6478515 0.5616587
5  1  0  0 0.5329256 0.5805195
6  0  1  1 0.8526255 0.4913130
> head(g1res)
   r X1 X2 X3        X4        X5
1: 1  0  1  1 0.4708851 0.4708851
2: 2  1  1  0 0.8977670 0.0000000
3: 3  0  1  0 0.7615472 0.0000000
4: 4  1  1  1 0.6478515 0.6478515
5: 5  1  0  0 0.5329256 0.0000000
6: 6  0  1  1 0.8526255 0.8526255

and I'm not enough of a data.table wizard (barely a data.table user) to know what the correct formulation is.
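
One guess (untested, offered only as a sketch): g1() recomputes X5 for every group, while g0() recomputes it only for rows that were actually merged. Restricting the update to groups with .N > 1 might reconcile the values, though the result is still a data.table rather than a data.frame, so identical() would not be a fair comparison:

g1b <- function(df) {
    x <- setDT(df)[, r := rleid(X1, X2, X3)]
    ## remember which groups were actually merged (.N > 1)
    x <- x[, .(X1 = X1[.N], X2 = X2[.N], X3 = X3[.N],
               X4 = sum(X4), X5 = X5[.N], merged = .N > 1L), by = r]
    ## recompute X5 only for merged rows, mirroring g0()'s update at rind1
    x[merged == TRUE, X5 := trunc(X3 * X4 * 10^8) / 10^8]
    x[, c("r", "merged") := NULL]
    x[]
}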

Compiling the function (with the benefit coming exclusively from the loop?) increases speed by about 20%:

> g0c <- compiler::cmpfun(g0)
> microbenchmark(g0(df), g0c(df), times=10)
Unit: milliseconds
    expr      min      lq     mean   median       uq      max neval
  g0(df) 250.0750 262.941 276.1549 276.8848 281.1966 321.3778    10
 g0c(df) 214.3132 219.940 228.0784 230.2098 235.4579 242.6636    10
