The usual and fundumental result of the analysis of a microarray data experiment is a list of differentially expressed genes, always according to the statistical design and the biological questions asked. However, what we see most of the times in such a list is a set of differentially expressed probes and not a list of differentially expressed genes (according to official HUGO nomenclature for example). Although this final list contains also gene names next to probes, I have been asked a lot of times to provide a list of genes and discard the probes as they are rather confusing the bench biologists. The latter are asking questions of the type "which probe should I believe" or "why the expression in one probe is so much different from the expression of the other probe corresponding to the same gene"?
The answer is not always easy,
if there is a satisfactory one. First and foremost, it has to do with the probe design itself. Fortunately, RNA-Seq has partially remedied such problems when one wants to study only gene expression and not differential splicing for example. But what about microarrays? One way is to use the avereps function from the Bioconductor limma package. Unfortunately, using this on the gene level after the replicate probe level (e.g. some Agilent probes exist many times in an array), not only may have an impact in the normalization procedure but it may affect the subsequent statistical analysis by incorporating bad quality probes. I have written a small R script which sumarizes gene expression from probes to genes, after filtering, normalization and the assignment of statistical scores (p-values etc.). Thus, if the remaining probes are not unique, they are summarized to a gene value based on:
Additionally, it offers three simple methods for estimating the final expression:
The answer is not always easy,
if there is a satisfactory one. First and foremost, it has to do with the probe design itself. Fortunately, RNA-Seq has partially remedied such problems when one wants to study only gene expression and not differential splicing for example. But what about microarrays? One way is to use the avereps function from the Bioconductor limma package. Unfortunately, using this on the gene level after the replicate probe level (e.g. some Agilent probes exist many times in an array), not only may have an impact in the normalization procedure but it may affect the subsequent statistical analysis by incorporating bad quality probes. I have written a small R script which sumarizes gene expression from probes to genes, after filtering, normalization and the assignment of statistical scores (p-values etc.). Thus, if the remaining probes are not unique, they are summarized to a gene value based on:
- Each probe's p-value (must be greater than a threshold).
- Each probe's fold change (must be greater than a threshold).
Additionally, it offers three simple methods for estimating the final expression:
- maxf: summarizes probes to genes, assuming the probe with the highest absolute fold change as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
- minp: summarizes probes to genes, assuming the probe with the minimum p-value as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
- wmean: summarizes probes to genes, assuming that the final gene expression is the weighted mean of the gene's probes. The weights are calculated based on the probe's p-values, after filtering probes based on the above criteria. This method is suitable for multiple contrasts (e.g. ANOVA designs).
The function works only for limma fit objects for the time being and is a little under-documented... But please, ask me if there is something you don't understand, something does not work or you find it useful. Additionally, it runs parallely if the package multicore is present and your machine has multiple cores of course! Here it is...
# maxf summarizes ONE condition/comparison based on maximum fold change # minp summarizes ONE condition/comparison based on minimum p-value # wmean summarizes ONE OR MORE conditions/comparisons based on the weighted mean of the fold chnages and weights are based on the # p-values # maxf and minp CANNOT be used when there are more than one conditions/comparisons summarize.probes <- function(fit,chip,method=c("maxf","minp","wmean"),exclude.p=NULL,exclude.f=NULL,adjust="none",...) { if (missing(fit)) stop("No data given to summarize probes! fit is required!") if (missing(chip)) stop("Please provide the platform used in Bioconductor nomenclature!") if (class(fit) != "MArrayLM") stop("Only limma fit objects currently supported! Will add more soon...") if (!is.null(exclude.p) && (exclude.p < 0 || exclude.p > 1)) stop("Exclude p should be either NULL or a value between 0 and 1...") if (!(adjust %in% p.adjust.methods)) stop(paste("adjust should be one of ",paste(p.adjust.methods,collapse=", "),"...",sep="")) d <- dim(fit$coefficients) if (!is.null(d) && d[2] > 1) { if (method %in% c("maxf","minp")) { warning("maxf and minp can be used only with a single contrast! Switching to wmean",call.=FALSE) method <- "wmean" } } else d <- 1 paral <- ifelse(require(multicore),TRUE,FALSE) # Load annotation DB cat("\n===== Loading annotation...\n") require(annotate) chip2i<-paste(chip,".db",sep="") if (!require(chip2i,character.only=TRUE)) { setRepositories(graphics=FALSE,5) install.packages(chip2i,dependencies=TRUE) } require(chip2i,character.only=TRUE) # Some values probe.names <- fit$genes$ProbeName folds <- fit$coefficients p.values <- fit$p.value if (is.matrix(folds)) rownames(folds) <- rownames(p.values) <- probe.names else names(folds) <- names(p.values) <- probe.names if (!is.null(adjust)) for (j in 1:ncol(p.values)) p.values[,j] <- p.adjust(p.values[,j],adjust) if (is.null(fit$F.p.value)) total.p.values <- NULL else { total.p.values <- fit$F.p.value names(total.p.values) <- rownames(folds) if (!is.null(adjust)) total.p.values <- p.adjust(total.p.values,adjust) } # Create summarization list cat("===== Summarizing gene probes to gene names...\n") probe2gene <- eval(parse(text=paste("unlist(as.list(",chip,"SYMBOL[mappedkeys(",chip,"SYMBOL)]))",sep=""))) fitted.probes <- intersect(names(probe2gene),probe.names) probe2gene <- probe2gene[fitted.probes] if (paral) sum.list <- mclapply(unique(probe2gene),function(x,y) { names(which(y==x)) },probe2gene) else sum.list <- lapply(unique(probe2gene),function(x,y) { which(y==x) },probe2gene) names(sum.list) <- unique(probe2gene) cat("===== Summarizing probe values to gene values...\n") if (paral) # Split the probles per gene name and then do the rest { gene.split <- mclapply(sum.list,function(i,f,p,a) { if (!missing(a) && !is.null(a) && !is.na(a)) { if (is.vector(f)) return(list(f=f[i],p=p[i],a=a[i])) else { if (is.vector(f[i,])) return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i])) else return(list(f=f[i,],p=p[i,],a=a[i])) } } else { if (is.vector(f)) return(list(f=f[i],p=p[i])) else { if (is.vector(f[i,])) return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])))) else return(list(f=f[i,],p=p[i,])) } } },folds,p.values,total.p.values) sum.val <- summarize.parallel(gene.split,method,exclude.p,exclude.f) tmp <- mclapply(sum.val,unlist) tmp <- tmp[!sapply(tmp,is.null)] } else { gene.split <- lapply(sum.list,function(i,f,p,a) { if (!missing(a) && !is.null(a) && !is.na(a)) { if (is.vector(f)) return(list(f=f[i],p=p[i],a=a[i])) else { if (is.vector(f[i,])) return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i])) else return(list(f=f[i,],p=p[i,],a=a[i])) } } else { if (is.vector(f)) return(list(f=f[i],p=p[i])) else { if (is.vector(f[i,])) return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])))) else return(list(f=f[i,],p=p[i,])) } } },folds,p.values,total.p.values) sum.val <- summarize.single(gene.split,method,exclude.p,exclude.f) tmp <- lapply(sum.val,unlist) tmp <- tmp[!sapply(tmp,is.null)] } cat("===== Creating output...\n\n") result <- do.call("rbind",tmp) if (is.matrix(folds)) the.cnames <- c(paste("fc",colnames(folds),sep="_"),paste("p",colnames(p.values),sep="_")) else the.cnames <- c("log2_fc","p-value") if (!is.null(total.p.values)) the.cnames <- c(the.cnames,"F_p-value") colnames(result) <- the.cnames return(result) } summarize.parallel <- function(gene.split,method,exclude.p,exclude.f) { if (method == "maxf") # Summarize one condition per maximum fold change sum.val <- mclapply(gene.split,sum.maxf,exclude.p,exclude.f) else if (method == "minp") # Summarize one condition per minimum p-value sum.val <- mclapply(gene.split,sum.minp,exclude.p,exclude.f) else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values sum.val <- mclapply(gene.split,sum.wmean,exclude.p,exclude.f) return(sum.val) } summarize.single <- function(gene.split,method,exclude.p,exclude.f) { if (method == "maxf") # Summarize one condition per maximum fold change sum.val <- lapply(gene.split,sum.maxf,exclude.p,exclude.f) else if (method == "minp") # Summarize one condition per minimum p-value sum.val <- lapply(gene.split,sum.minp,exclude.p,exclude.f) else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values sum.val <- lapply(gene.split,sum.wmean,exclude.p,exclude.f) return(sum.val) } sum.maxf <- function(x,ep,ef) { # 1st pass: p-value if (!is.null(ep)) { g <- which(x$p<ep) if (length(g) == 0) return(NULL) else { x$f <- x$f[g] x$p <- x$p[g] if (!is.null(x$a)) x$a <- x$a[g] } } # 2nd pass: fold-change or whatever if (!is.null(ef)) { g <- which(abs(x$f)<ef) if (length(g) == 0) return(NULL) else { x$f <- x$f[g] x$p <- x$p[g] if (!is.null(x$a)) x$a <- x$a[g] } } # What to do by default i <- which(x$f==max(abs(x$f))) if (!is.null(x$a)) return(list(f=x$f[i],p=x$p[i],a=x$a[i])) else return(list(f=x$f[i],p=x$p[i])) } sum.minp <- function(x,ep,ef) { # 1st pass: p-value if (!is.null(ep)) { g <- which(x$p<ep) if (length(g) == 0) return(NULL) else { x$f <- x$f[g] x$p <- x$p[g] if (!is.null(x$a)) x$a <- x$a[g] } } # 2nd pass: fold-change or whatever if (!is.null(ef)) { g <- which(abs(x$f)<ef) if (length(g) == 0) return(NULL) else { x$f <- x$f[g] x$p <- x$p[g] if (!is.null(x$a)) x$a <- x$a[g] } } # What to do by default i <- which(x$p==min(x$p)) if (!is.null(x$a)) return(list(f=x$f[i],p=x$p[i],a=x$a[i])) else return(list(f=x$f[i],p=x$p[i])) } sum.wmean <- function(x,ep,ef) { eps <- 1e-6 if (!is.null(ep)) { g <- which(apply(x$p,1,function(z,ep) return(any(z<ep)),ep)) if (length(g) == 0) return(NULL) if (is.matrix(x$f)) { x$f <- x$f[g,] x$p <- x$p[g,] if (is.vector(x$f)) { x$f <- t(as.matrix(x$f)) x$p <- t(as.matrix(x$p)) } } else { x$f <- x$f[g] x$p <- x$p[g] } if (!is.null(x$a)) x$a <- x$a[g] } if (!is.null(ef)) { g <- which(apply(x$f,1,function(z,ef) return(any(abs(z)>ef)),ef)) if (length(g) == 0) return(NULL) if (is.matrix(x$f)) { x$f <- x$f[g,] x$p <- x$p[g,] if (is.vector(x$f)) { x$f <- t(as.matrix(x$f)) x$p <- t(as.matrix(x$p)) } } else { x$f <- x$f[g] x$p <- x$p[g] } if (!is.null(x$a)) x$a <- x$a[g] } if (!is.null(x$a)) { if (is.vector(x$f)) { ia <- which(x$p==min(x$p))[1] pw <- 1 - x$p/max(x$p) pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps return(list(f=weighted.mean(x$f,pw),p=min(x$p),a=x$a[ia])) } else { # The hard part... is a matrix pw <- matrix(0,nrow(x$p),ncol(x$p)) fn <- numeric(ncol(x$f)) rownames(pw) <- rownames(x$f) colnames(pw) <- colnames(x$f) names(fn) <- colnames(x$f) ia <- apply(x$p,2,function(x) return(which(x==min(x))[1]))[1] for (j in 1:ncol(x$p)) { pw[,j] <- 1 - x$p[,j]/max(x$p[,j]) pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps fn[j] <- weighted.mean(x$f[,j],pw[,j]) } an <- x$a[ia] return(list(f=fn,p=apply(x$p,2,min),a=an)) } } else { if (is.vector(x$f)) { pw <- 1 - x$p/max(x$p) pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps return(list(f=weighted.mean(x$f,pw),p=min(x$p))) } else { # The hard part... is a matrix pw <- matrix(0,nrow(x$p),ncol(x$p)) fn <- numeric(ncol(x$f)) rownames(pw) <- rownames(x$f) colnames(pw) <- colnames(x$f) names(fn) <- colnames(x$f) for (j in 1:ncol(x$p)) { pw[,j] <- 1 - x$p[,j]/max(x$p[,j]) pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps fn[,j] <- weighted.mean(x$f[,j],pw[,j]) } return(list(f=fn,p=apply(x$p,2,min))) } } }
0 comments:
Post a Comment