What about data.table?
I am not sure how the logic should work in every case (e.g. SNP.x != SNP.y,
or both being NA), but you can amend it yourself.
Edit: Few approaches benchmarked.
Prepare data:
# Attach dependencies with library(): a missing package then stops the script
# right here, whereas require() would only return FALSE and carry on.
library(data.table)
library(microbenchmark)

# Six template rows, one per case the functions below have to handle:
#   row 1    - SNP.x and SNP.y both present and equal
#   rows 2,4 - only SNP.x present
#   row 3    - only SNP.y present
#   row 5    - SNP.x and SNP.y both NA (the strand columns are still filled)
#   row 6    - SNP.x and SNP.y both present but different
dat1 <- data.table(
  Name = c("exm-rs10128711", "exm-rs10134944", "exm-rs10218696",
           "exm-rs10223421", "both_NAs", "no_NAs_just_diff"),
  SNP.x = c("[T/C]", "[A/G]", NA, "[A/C]", NA, "new_x"),
  ILMN.Strand.x = c("BOT", "TOP", NA, "TOP", "new_x", "new_x"),
  Customer.Strand.x = c("BOT", "BOT", NA, "BOT", "new_x", "new_x"),
  SNP.y = c("[T/C]", NA, "[T/C]", NA, NA, "new_y"),
  ILMN.Strand.y = c("BOT", NA, "BOT", NA, "new_y", "new_y"),
  Customer.Strand.y = c("BOT", NA, "TOP", NA, "new_y", "new_y")
)
# Make it a bit bigger: every pass doubles the table, so 15 passes turn the
# 6 template rows into 6 * 2^15 = 196608 rows (about 15 MB).
for (i in seq_len(15)) dat1 <- rbind(dat1, dat1)
# If needed cast to characters (to get rid of "level sets of factors are different" error...)
# dat1 <- dat1[, lapply(.SD, as.character)]
Functions:
# Row-by-row variant: a scalar if / else chain is evaluated once per row
# (each row is its own group) and yields the three values to assign.
# Updates the global `dat1` by reference and returns it.
f1 <- function() {
  targets <- c("SNP", "ILMN.Strand", "Customer.Strand")
  dat1[, (targets) := {
    if (is.na(SNP.x) && is.na(SNP.y)) {
      # Neither side has a SNP: all three outputs are missing.
      list(NA_character_, NA_character_, NA_character_)
    } else if (is.na(SNP.x)) {
      # Only the .y side has a SNP.
      list(SNP.y, ILMN.Strand.y, Customer.Strand.y)
    } else {
      # The .x side has a SNP and takes precedence.
      list(SNP.x, ILMN.Strand.x, Customer.Strand.x)
    }
  }, by = seq_len(nrow(dat1))][]
}
# Column-by-column variant: one nested ifelse() per output column, all three
# driven by which of SNP.x / SNP.y is present.
# Updates the global `dat1` by reference and returns it.
f2 <- function() {
  dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") := {
    has_x <- !is.na(SNP.x)
    has_y <- !is.na(SNP.y)
    list(
      ifelse(has_x, SNP.x,
             ifelse(has_y, SNP.y, NA_character_)),
      ifelse(has_x, ILMN.Strand.x,
             ifelse(has_y, ILMN.Strand.y, NA_character_)),
      ifelse(has_x, Customer.Strand.x,
             ifelse(has_y, Customer.Strand.y, NA_character_))
    )
  }][]
}
# Vectorised variant: decide once per row which side to take, then fill all
# three output columns from that decision.
#
# Selection rule (same as f1 / f2):
#   SNP.x present          -> take the .x columns
#   else SNP.y present     -> take the .y columns
#   else                   -> NA in all three outputs
#
# Why not ifelse() over list branches followed by [[1]]: ifelse() then builds
# a list with one element per row, and [[1]] keeps only the element chosen for
# the FIRST row, so every row would receive whole columns picked by row 1.
#
# Assumes the source columns are character (cast them first if they are
# factors). Updates the global `dat1` by reference and returns it.
f3 <- function() {
  dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") := {
    use_x <- !is.na(SNP.x)
    use_y <- !use_x & !is.na(SNP.y)
    # Start from all-NA so rows matching neither mask stay missing.
    pick <- function(x, y) {
      out <- rep(NA_character_, length(x))
      out[use_x] <- x[use_x]
      out[use_y] <- y[use_y]
      out
    }
    list(
      pick(SNP.x, SNP.y),
      pick(ILMN.Strand.x, ILMN.Strand.y),
      pick(Customer.Strand.x, Customer.Strand.y)
    )
  }][]
}
Benchmarking
# Time the three implementations (5 runs each).
microbenchmark(
  d1 <- f1(),
  d2 <- f2(),
  d3 <- f3(),
  times = 5
)
# Timings recorded in the original run (they vary by machine and version):
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 303.03681 316.91054 354.9147 330.91177 403.3858 420.3286 5 b
# d2 <- f2() 658.27527 660.19131 723.9005 664.31352 737.0994 899.6230 5 c
# d3 <- f3() 78.20754 84.91487 110.3533 86.73539 104.9149 196.9938 5 a

# `:=` updates dat1 by reference and each function returns that same table,
# so the d1 / d2 / d3 assigned inside microbenchmark() are all aliases of one
# object and would always compare equal. Take an independent snapshot of each
# result with copy() before inspecting and comparing them.
d1 <- copy(f1())
d2 <- copy(f2())
d3 <- copy(f3())
d1[1:6, ]
# Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y SNP ILMN.Strand Customer.Strand
# 1: exm-rs10128711 [T/C] BOT BOT [T/C] BOT BOT [T/C] BOT BOT
# 2: exm-rs10134944 [A/G] TOP BOT NA NA NA [A/G] TOP BOT
# 3: exm-rs10218696 NA NA NA [T/C] BOT TOP [T/C] BOT TOP
# 4: exm-rs10223421 [A/C] TOP BOT NA NA NA [A/C] TOP BOT
# 5: both_NAs NA new_x new_x NA new_y new_y NA NA NA
# 6: no_NAs_just_diff new_x new_x new_x new_y new_y new_y new_x new_x new_x

# vapply() pins the result to one logical per implementation.
vapply(list(d1, d2, d3), FUN = identical, FUN.VALUE = logical(1), d1)
# Expected when all three implementations agree:
# [1] TRUE TRUE TRUE
Comments
f2
is here only because I could not figure out how to return a list
from the ifelse
, just by luck I got this double list
idea used in f3
.
To read about multiple assignments in data.table,
refer to e.g. "Assign multiple columns using := in data.table, by group".
Smaller sets
96 rows:
# Unit: microseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 1964.988 1968.936 2238.697 2273.276 2404.722 2581.564 5 b
# d2 <- f2() 976.574 998.284 1147.020 1033.021 1038.942 1688.280 5 a
# d3 <- f3() 684.471 845.916 1026.389 1141.573 1209.466 1250.519 5 a
6144 rows:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 11.977032 12.128610 13.869310 12.52532 12.585317 20.130271 5 b
# d2 <- f2() 17.200552 17.627260 21.616209 20.76224 22.830254 29.660738 5 c
# d3 <- f3() 2.945114 3.009456 3.317191 3.04064 3.071429 4.519314 5 a