r - Why is the time complexity of this loop non-linear?
Why is the time complexity of this loop non-linear, and why is it so slow? The loop takes ~38 s for n = 50k and ~570 s for n = 200k. Is there a faster way to do this? Rprof() seems to indicate that writing to memory is the bottleneck.
df <- data.frame(replicate(5, runif(200000)))
df[,1:3] <- round(df[,1:3])

Rprof(line.profiling = TRUE); timer <- proc.time()
x <- df; n <- nrow(df); i <- 1
ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
rind <- which(apply(ind, 1, all))
n <- length(rind)
while (i <= n) {
    x$x4[rind[i]+1] <- x$x4[rind[i]+1] + x$x4[rind[i]]
    x$x5[rind[i]+1] <- x$x4[rind[i]+1] * x$x3[rind[i]+1]
    x$x5[rind[i]+1] <- trunc(x$x5[rind[i]+1] * 10^8) / 10^8
    x$x1[rind[i]] <- NA
    i <- i + 1
}; x <- na.omit(x)
proc.time() - timer; Rprof(NULL)
summaryRprof(lines = "show")
The purpose of the algorithm is to iterate over the data frame and combine adjacent rows that match on the first three elements. That is, it removes one of the rows and adds that row's values to the other row. The resulting data frame should have n fewer rows, where n is the number of matching adjacent rows in the original data frame. Every time a pair of rows is combined, the indices of the source data frame and the new data frame go out of sync by 1, since one row is removed/omitted from the new frame, so i keeps track of the position in the source data frame and q keeps track of the position in the new data frame.
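For concreteness, here is a minimal illustration of the intended transformation on a toy data frame (constructed for illustration only; not part of the original data):

# Toy input: rows 2 and 3 match on x1:x3 and should be combined.
toy <- data.frame(x1 = c(0, 1, 1, 0),
                  x2 = c(1, 0, 0, 1),
                  x3 = c(1, 1, 1, 0),
                  x4 = c(0.10, 0.20, 0.30, 0.40),
                  x5 = c(0.51, 0.62, 0.73, 0.84))
# Expected output: row 2 is dropped; row 3 carries x4 = 0.2 + 0.3 = 0.5
# and x5 = trunc(0.5 * 1 * 10^8) / 10^8 = 0.5; rows 1 and 4 are unchanged.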
The code above was updated following @joran's comment. Performance improved substantially, to ~5.5 s for n = 50k and ~88 s for n = 200k. However, the time complexity is still non-linear, which I can't fathom. I need to run this at n = 1 million or more, so this is still not acceptable speed.
Only the x4 column update depends on previous values, so the loop can be 'vectorized' (with a little bit of optimization, avoiding the addition of 1 to rind in each iteration) as
rind1 <- rind + 1L
for (i in seq_len(n))
    x$x4[rind1[i]] <- x$x4[rind1[i]] + x$x4[rind[i]]
x$x5[rind1] <- x$x4[rind1] * x$x3[rind1]
x$x5[rind1] <- trunc(x$x5[rind1] * 10^8) / 10^8
x$x1[rind] <- NA
na.omit(x)
x4 is a numeric value, and the update can be made more efficient by updating a vector rather than a column of the data.frame:
x4 <- x$x4
for (i in seq_len(n))
    x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]
x$x4 <- x4
For comparison, we have:
f0 <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])

    x <- df; n <- nrow(df); i <- 1
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))
    n <- length(rind)
    while (i <= n) {
        x$x4[rind[i]+1] <- x$x4[rind[i]+1] + x$x4[rind[i]]
        x$x5[rind[i]+1] <- x$x4[rind[i]+1] * x$x3[rind[i]+1]
        x$x5[rind[i]+1] <- trunc(x$x5[rind[i]+1] * 10^8) / 10^8
        x$x1[rind[i]] <- NA
        i <- i + 1
    }
    na.omit(x)
}

f1a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])

    x <- df; n <- nrow(df)
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))
    rind1 <- rind + 1L
    for (i in seq_along(rind))
        x$x4[rind1[i]] <- x$x4[rind1[i]] + x$x4[rind[i]]
    x$x5[rind1] <- x$x4[rind1] * x$x3[rind1]
    x$x5[rind1] <- trunc(x$x5[rind1] * 10^8) / 10^8
    x$x1[rind] <- NA
    na.omit(x)
}

f4a <- function(nrow) {
    set.seed(123)
    df <- data.frame(replicate(5, runif(nrow)))
    df[,1:3] <- round(df[,1:3])

    x <- df; n <- nrow(df)
    ind <- df[1:(n-1), 1:3] == df[2:n, 1:3]
    rind <- which(apply(ind, 1, all))
    rind1 <- rind + 1L
    x4 <- x$x4
    for (i in seq_along(rind))
        x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]
    x$x4 <- x4
    x$x1[rind] <- NA
    x$x5[rind1] <- x4[rind1] * x$x3[rind1]
    x$x5[rind1] <- trunc(x$x5[rind1] * 10^8) / 10^8
    na.omit(x)
}
The results are the same:
> identical(f0(1000), f1a(1000))
[1] TRUE
> identical(f0(1000), f4a(1000))
[1] TRUE
The speedup is substantial (using library(microbenchmark)):
> microbenchmark(f0(10000), f1a(10000), f4a(10000), times=10)
Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
  f0(10000) 346.35906 354.37637 361.15188 363.71627 366.74944 373.88275    10
 f1a(10000) 124.71766 126.43532 127.99166 127.39257 129.51927 133.01573    10
 f4a(10000)  41.70401  42.48141  42.90487  43.00584  43.32059  43.83757    10
The reason for the difference can be seen when R has been compiled with memory profiling enabled --
> tracemem(x)
[1] "<0x39d93a8>"
> tracemem(x$x4)
[1] "<0x6586e40>"
> x$x4[1] <- 1
tracemem[0x39d93a8 -> 0x39d9410]:
tracemem[0x6586e40 -> 0x670d870]:
tracemem[0x39d9410 -> 0x39d9478]:
tracemem[0x39d9478 -> 0x39d94e0]: $<-.data.frame $<-
tracemem[0x39d94e0 -> 0x39d9548]: $<-.data.frame $<-
>
Each line indicates a memory copy, so updating a single cell in the data frame incurs 5 copies of the outer structure or of the vector itself. In contrast, a vector can be updated without any copies:
> tracemem(x4)
[1] "<0xdd44460>"
> x4[1] = 1
tracemem[0xdd44460 -> 0x9d26c10]:
> x4[1] = 2
>
(The first assignment is expensive because it represents the duplication of the data.frame column; subsequent updates are cheap, since x4 now refers to the vector being updated and the vector does not need to be duplicated.)
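The same copy-on-modify behavior can be reproduced in a fresh session; a minimal sketch (assuming a build where tracemem() is enabled, as in the memory-profiling build above):

x4 <- runif(5)
y <- x4          # x4 is now shared, so the next write must copy it
tracemem(x4)
x4[1] <- 1       # one copy reported: x4 is duplicated before modification
x4[2] <- 2       # silent: x4 is now uniquely referenced and updated in place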
The data.frame implementation seems to scale non-linearly:
> microbenchmark(f1a(100), f1a(1000), f1a(10000), f1a(100000), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f1a(100)    2.372266    2.479458    2.551568    2.524818    2.640244    2.727221    10
  f1a(1000)   10.831288   11.100009   11.210483   11.194863   11.432533   11.581644    10
 f1a(10000)  130.011104  138.686445  139.556787  141.138329  141.522686  147.993499    10
 f1a(1e+05) 4092.439956 4117.818817 4145.809235 4143.634663 4172.282888 4216.129732    10
The reason is apparent in the second line of the tracemem output above -- updating a row triggers a copy of the entire column. So the algorithm scales as the number of rows to update times the number of rows in a column, i.e., approximately quadratically.
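As a rough sanity check (computed here from the medians reported above, not part of the original answer), the empirical scaling exponent between successive problem sizes climbs well above 1:

n   <- c(1e2, 1e3, 1e4, 1e5)
med <- c(2.52, 11.19, 141.14, 4143.63)   # median f1a() times in ms, from above
diff(log10(med)) / diff(log10(n))        # approx. 0.65, 1.10, 1.47 and rising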
f4a() appears to scale linearly:
> microbenchmark(f4a(100), f4a(1000), f4a(10000), f4a(100000), f4a(1e6), times=10)
Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
   f4a(100)    1.741458    1.756095    1.827886    1.773887    1.929943    2.003700    10
  f4a(1000)    5.286016    5.517491    5.558091    5.569514    5.671840    5.764022    10
 f4a(10000)   42.906895   43.025385   43.880020   43.928631   44.633684   44.983002    10
 f4a(1e+05)  467.698285  478.919843  539.696364  552.896109  576.707913  644.927832    10
 f4a(1e+06) 5385.029968 5521.645185 5614.960871 5573.475270 5794.307470 5823.868167    10
One could try to be clever about vectorizing the loop, but is it necessary now?
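For what it's worth, the loop can in fact be eliminated entirely: within a run of adjacent matching rows the x4 updates amount to a cumulative sum, so run ids plus ave() reproduce the result. A minimal sketch (g2 is a hypothetical name added here, and its output matches na.omit()'s apart from the na.action attribute):

g2 <- function(df) {
    # TRUE where row i matches row i+1 on the first three columns
    same <- rowSums(df[-nrow(df), 1:3] == df[-1, 1:3]) == 3L
    grp  <- cumsum(c(TRUE, !same))        # run id for each row
    x4   <- ave(df$x4, grp, FUN = cumsum) # running totals within each run
    upd  <- c(FALSE, same)                # rows the loop would have updated (rind + 1)
    df$x4 <- x4
    df$x5[upd] <- trunc(x4[upd] * df$x3[upd] * 10^8) / 10^8
    df[c(!same, TRUE), ]                  # keep only the last row of each run
}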
A tuned version of the data-processing part of the function uses negative indexing (e.g., -nrow(df)) to remove rows from the data frame, rowSums() instead of apply(), and unname() so that subset operations don't carry around unused names:
g0 <- function(df) {
    ind <- df[-nrow(df), 1:3] == df[-1, 1:3]
    rind <- unname(which(rowSums(ind) == ncol(ind)))
    rind1 <- rind + 1L
    x4 <- df$x4
    for (i in seq_along(rind))
        x4[rind1[i]] <- x4[rind1[i]] + x4[rind[i]]
    df$x4 <- x4
    df$x1[rind] <- NA
    df$x5[rind1] <- trunc(df$x4[rind1] * df$x3[rind1] * 10^8) / 10^8
    na.omit(df)
}
Compare this with the data.table solution suggested by @Khashaa:
g1 <- function(df) {
    x <- setDT(df)[, r := rleid(x1, x2, x3)]
    x <- x[, .(x1=x1[.N], x2=x2[.N], x3=x3[.N], x4=sum(x4), x5=x5[.N]), by=r]
    x <- x[, x5 := trunc(x3 * x4 * 10^8) / 10^8]
    x
}
The base R version compares favorably, with times
> n_row <- 200000
> set.seed(123)
> df <- data.frame(replicate(5, runif(n_row)))
> df[,1:3] <- round(df[,1:3])
> system.time(g0res <- g0(df))
   user  system elapsed
  0.247   0.000   0.247
> system.time(g1res <- g1(df))
   user  system elapsed
  0.551   0.000   0.551
(The pre-tuning version in f4a takes 760 ms, so it is more than twice as slow.)
The results of the data.table implementation are not correct, though:
> head(g0res)
  x1 x2 x3        x4        x5
1  0  1  1 0.4708851 0.8631978
2  1  1  0 0.8977670 0.8311355
3  0  1  0 0.7615472 0.6002179
4  1  1  1 0.6478515 0.5616587
5  1  0  0 0.5329256 0.5805195
6  0  1  1 0.8526255 0.4913130
> head(g1res)
   r x1 x2 x3        x4        x5
1: 1  0  1  1 0.4708851 0.4708851
2: 2  1  1  0 0.8977670 0.0000000
3: 3  0  1  0 0.7615472 0.0000000
4: 4  1  1  1 0.6478515 0.6478515
5: 5  1  0  0 0.5329256 0.0000000
6: 6  0  1  1 0.8526255 0.8526255
and I'm not enough of a data.table wizard (barely a data.table user) to know what the correct formulation is.
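One hedged guess at a fix (untested against the full output, and only inferred from the semantics of g0 above): x5 should be recomputed only for runs longer than one row, which can be expressed by carrying .N through the aggregation. g1b is a hypothetical name:

g1b <- function(df) {
    x <- setDT(df)[, r := rleid(x1, x2, x3)]
    x <- x[, .(x1 = x1[.N], x2 = x2[.N], x3 = x3[.N],
               x4 = sum(x4), x5 = x5[.N], n = .N), by = r]
    x[n > 1L, x5 := trunc(x3 * x4 * 10^8) / 10^8]  # recompute only for combined runs
    x[, c("r", "n") := NULL]
    x[]
}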
Compiling (do the benefits accrue exclusively to the loop?) increases speed by about 20%:
> g0c <- compiler::cmpfun(g0)
> microbenchmark(g0(df), g0c(df), times=10)
Unit: milliseconds
    expr      min      lq     mean   median       uq      max neval
  g0(df) 250.0750 262.941 276.1549 276.8848 281.1966 321.3778    10
 g0c(df) 214.3132 219.940 228.0784 230.2098 235.4579 242.6636    10
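(A note for readers on current R, not part of the original answer: since R 3.4.0 functions are byte-compiled just-in-time by default, so an explicit cmpfun() call mostly matters on older versions. If I recall the compiler API correctly, the active JIT level can be queried with:

compiler::enableJIT(-1)   # a negative level queries without changing; 3 is the default since R 3.4.0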