The usual and fundumental result of the analysis of a microarray data experiment is a list of differentially expressed genes, always according to the statistical design and the biological questions asked. However, what we see most of the times in such a list is a set of differentially expressed probes and not a list of differentially expressed genes (according to official HUGO nomenclature for example). Although this final list contains also gene names next to probes, I have been asked a lot of times to provide a list of genes and discard the probes as they are rather confusing the bench biologists. The latter are asking questions of the type "which probe should I believe" or "why the expression in one probe is so much different from the expression of the other probe corresponding to the same gene"?
The answer is not always easy,
if there is a satisfactory one. First and foremost, it has to do with the probe design itself. Fortunately, RNA-Seq has partially remedied such problems when one wants to study only gene expression and not differential splicing for example. But what about microarrays? One way is to use the avereps function from the Bioconductor limma package. Unfortunately, using this on the gene level after the replicate probe level (e.g. some Agilent probes exist many times in an array), not only may have an impact in the normalization procedure but it may affect the subsequent statistical analysis by incorporating bad quality probes. I have written a small R script which sumarizes gene expression from probes to genes, after filtering, normalization and the assignment of statistical scores (p-values etc.). Thus, if the remaining probes are not unique, they are summarized to a gene value based on:
Additionally, it offers three simple methods for estimating the final expression:
The answer is not always easy,
if there is a satisfactory one. First and foremost, it has to do with the probe design itself. Fortunately, RNA-Seq has partially remedied such problems when one wants to study only gene expression and not differential splicing for example. But what about microarrays? One way is to use the avereps function from the Bioconductor limma package. Unfortunately, using this on the gene level after the replicate probe level (e.g. some Agilent probes exist many times in an array), not only may have an impact in the normalization procedure but it may affect the subsequent statistical analysis by incorporating bad quality probes. I have written a small R script which sumarizes gene expression from probes to genes, after filtering, normalization and the assignment of statistical scores (p-values etc.). Thus, if the remaining probes are not unique, they are summarized to a gene value based on:
- Each probe's p-value (must be greater than a threshold).
- Each probe's fold change (must be greater than a threshold).
Additionally, it offers three simple methods for estimating the final expression:
- maxf: summarizes probes to genes, assuming the probe with the highest absolute fold change as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
- minp: summarizes probes to genes, assuming the probe with the minimum p-value as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
- wmean: summarizes probes to genes, assuming that the final gene expression is the weighted mean of the gene's probes. The weights are calculated based on the probe's p-values, after filtering probes based on the above criteria. This method is suitable for multiple contrasts (e.g. ANOVA designs).
The function works only for limma fit objects for the time being and is a little under-documented... But please, ask me if there is something you don't understand, something does not work or you find it useful. Additionally, it runs parallely if the package multicore is present and your machine has multiple cores of course! Here it is...
# maxf summarizes ONE condition/comparison based on maximum fold change
# minp summarizes ONE condition/comparison based on minimum p-value
# wmean summarizes ONE OR MORE conditions/comparisons based on the weighted mean of the fold chnages and weights are based on the
# p-values
# maxf and minp CANNOT be used when there are more than one conditions/comparisons
summarize.probes <- function(fit,chip,method=c("maxf","minp","wmean"),exclude.p=NULL,exclude.f=NULL,adjust="none",...)
{
if (missing(fit))
stop("No data given to summarize probes! fit is required!")
if (missing(chip))
stop("Please provide the platform used in Bioconductor nomenclature!")
if (class(fit) != "MArrayLM")
stop("Only limma fit objects currently supported! Will add more soon...")
if (!is.null(exclude.p) && (exclude.p < 0 || exclude.p > 1))
stop("Exclude p should be either NULL or a value between 0 and 1...")
if (!(adjust %in% p.adjust.methods))
stop(paste("adjust should be one of ",paste(p.adjust.methods,collapse=", "),"...",sep=""))
d <- dim(fit$coefficients)
if (!is.null(d) && d[2] > 1)
{
if (method %in% c("maxf","minp"))
{
warning("maxf and minp can be used only with a single contrast! Switching to wmean",call.=FALSE)
method <- "wmean"
}
} else d <- 1
paral <- ifelse(require(multicore),TRUE,FALSE)
# Load annotation DB
cat("\n===== Loading annotation...\n")
require(annotate)
chip2i<-paste(chip,".db",sep="")
if (!require(chip2i,character.only=TRUE))
{
setRepositories(graphics=FALSE,5)
install.packages(chip2i,dependencies=TRUE)
}
require(chip2i,character.only=TRUE)
# Some values
probe.names <- fit$genes$ProbeName
folds <- fit$coefficients
p.values <- fit$p.value
if (is.matrix(folds))
rownames(folds) <- rownames(p.values) <- probe.names
else
names(folds) <- names(p.values) <- probe.names
if (!is.null(adjust))
for (j in 1:ncol(p.values))
p.values[,j] <- p.adjust(p.values[,j],adjust)
if (is.null(fit$F.p.value))
total.p.values <- NULL
else
{
total.p.values <- fit$F.p.value
names(total.p.values) <- rownames(folds)
if (!is.null(adjust))
total.p.values <- p.adjust(total.p.values,adjust)
}
# Create summarization list
cat("===== Summarizing gene probes to gene names...\n")
probe2gene <- eval(parse(text=paste("unlist(as.list(",chip,"SYMBOL[mappedkeys(",chip,"SYMBOL)]))",sep="")))
fitted.probes <- intersect(names(probe2gene),probe.names)
probe2gene <- probe2gene[fitted.probes]
if (paral)
sum.list <- mclapply(unique(probe2gene),function(x,y) { names(which(y==x)) },probe2gene)
else
sum.list <- lapply(unique(probe2gene),function(x,y) { which(y==x) },probe2gene)
names(sum.list) <- unique(probe2gene)
cat("===== Summarizing probe values to gene values...\n")
if (paral) # Split the probles per gene name and then do the rest
{
gene.split <- mclapply(sum.list,function(i,f,p,a) {
if (!missing(a) && !is.null(a) && !is.na(a)) {
if (is.vector(f))
return(list(f=f[i],p=p[i],a=a[i]))
else {
if (is.vector(f[i,]))
return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i]))
else
return(list(f=f[i,],p=p[i,],a=a[i]))
}
}
else {
if (is.vector(f))
return(list(f=f[i],p=p[i]))
else {
if (is.vector(f[i,]))
return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,]))))
else
return(list(f=f[i,],p=p[i,]))
}
}
},folds,p.values,total.p.values)
sum.val <- summarize.parallel(gene.split,method,exclude.p,exclude.f)
tmp <- mclapply(sum.val,unlist)
tmp <- tmp[!sapply(tmp,is.null)]
}
else
{
gene.split <- lapply(sum.list,function(i,f,p,a) {
if (!missing(a) && !is.null(a) && !is.na(a)) {
if (is.vector(f))
return(list(f=f[i],p=p[i],a=a[i]))
else {
if (is.vector(f[i,]))
return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i]))
else
return(list(f=f[i,],p=p[i,],a=a[i]))
}
}
else {
if (is.vector(f))
return(list(f=f[i],p=p[i]))
else {
if (is.vector(f[i,]))
return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,]))))
else
return(list(f=f[i,],p=p[i,]))
}
}
},folds,p.values,total.p.values)
sum.val <- summarize.single(gene.split,method,exclude.p,exclude.f)
tmp <- lapply(sum.val,unlist)
tmp <- tmp[!sapply(tmp,is.null)]
}
cat("===== Creating output...\n\n")
result <- do.call("rbind",tmp)
if (is.matrix(folds))
the.cnames <- c(paste("fc",colnames(folds),sep="_"),paste("p",colnames(p.values),sep="_"))
else
the.cnames <- c("log2_fc","p-value")
if (!is.null(total.p.values))
the.cnames <- c(the.cnames,"F_p-value")
colnames(result) <- the.cnames
return(result)
}
summarize.parallel <- function(gene.split,method,exclude.p,exclude.f)
{
if (method == "maxf") # Summarize one condition per maximum fold change
sum.val <- mclapply(gene.split,sum.maxf,exclude.p,exclude.f)
else if (method == "minp") # Summarize one condition per minimum p-value
sum.val <- mclapply(gene.split,sum.minp,exclude.p,exclude.f)
else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values
sum.val <- mclapply(gene.split,sum.wmean,exclude.p,exclude.f)
return(sum.val)
}
summarize.single <- function(gene.split,method,exclude.p,exclude.f)
{
if (method == "maxf") # Summarize one condition per maximum fold change
sum.val <- lapply(gene.split,sum.maxf,exclude.p,exclude.f)
else if (method == "minp") # Summarize one condition per minimum p-value
sum.val <- lapply(gene.split,sum.minp,exclude.p,exclude.f)
else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values
sum.val <- lapply(gene.split,sum.wmean,exclude.p,exclude.f)
return(sum.val)
}
sum.maxf <- function(x,ep,ef)
{
# 1st pass: p-value
if (!is.null(ep))
{
g <- which(x$p<ep)
if (length(g) == 0)
return(NULL)
else
{
x$f <- x$f[g]
x$p <- x$p[g]
if (!is.null(x$a))
x$a <- x$a[g]
}
}
# 2nd pass: fold-change or whatever
if (!is.null(ef))
{
g <- which(abs(x$f)<ef)
if (length(g) == 0)
return(NULL)
else
{
x$f <- x$f[g]
x$p <- x$p[g]
if (!is.null(x$a))
x$a <- x$a[g]
}
}
# What to do by default
i <- which(x$f==max(abs(x$f)))
if (!is.null(x$a))
return(list(f=x$f[i],p=x$p[i],a=x$a[i]))
else
return(list(f=x$f[i],p=x$p[i]))
}
sum.minp <- function(x,ep,ef)
{
# 1st pass: p-value
if (!is.null(ep))
{
g <- which(x$p<ep)
if (length(g) == 0)
return(NULL)
else
{
x$f <- x$f[g]
x$p <- x$p[g]
if (!is.null(x$a))
x$a <- x$a[g]
}
}
# 2nd pass: fold-change or whatever
if (!is.null(ef))
{
g <- which(abs(x$f)<ef)
if (length(g) == 0)
return(NULL)
else
{
x$f <- x$f[g]
x$p <- x$p[g]
if (!is.null(x$a))
x$a <- x$a[g]
}
}
# What to do by default
i <- which(x$p==min(x$p))
if (!is.null(x$a))
return(list(f=x$f[i],p=x$p[i],a=x$a[i]))
else
return(list(f=x$f[i],p=x$p[i]))
}
sum.wmean <- function(x,ep,ef)
{
eps <- 1e-6
if (!is.null(ep))
{
g <- which(apply(x$p,1,function(z,ep) return(any(z<ep)),ep))
if (length(g) == 0)
return(NULL)
if (is.matrix(x$f))
{
x$f <- x$f[g,]
x$p <- x$p[g,]
if (is.vector(x$f))
{
x$f <- t(as.matrix(x$f))
x$p <- t(as.matrix(x$p))
}
}
else
{
x$f <- x$f[g]
x$p <- x$p[g]
}
if (!is.null(x$a))
x$a <- x$a[g]
}
if (!is.null(ef))
{
g <- which(apply(x$f,1,function(z,ef) return(any(abs(z)>ef)),ef))
if (length(g) == 0)
return(NULL)
if (is.matrix(x$f))
{
x$f <- x$f[g,]
x$p <- x$p[g,]
if (is.vector(x$f))
{
x$f <- t(as.matrix(x$f))
x$p <- t(as.matrix(x$p))
}
}
else
{
x$f <- x$f[g]
x$p <- x$p[g]
}
if (!is.null(x$a))
x$a <- x$a[g]
}
if (!is.null(x$a))
{
if (is.vector(x$f))
{
ia <- which(x$p==min(x$p))[1]
pw <- 1 - x$p/max(x$p)
pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps
return(list(f=weighted.mean(x$f,pw),p=min(x$p),a=x$a[ia]))
}
else
{
# The hard part... is a matrix
pw <- matrix(0,nrow(x$p),ncol(x$p))
fn <- numeric(ncol(x$f))
rownames(pw) <- rownames(x$f)
colnames(pw) <- colnames(x$f)
names(fn) <- colnames(x$f)
ia <- apply(x$p,2,function(x) return(which(x==min(x))[1]))[1]
for (j in 1:ncol(x$p)) {
pw[,j] <- 1 - x$p[,j]/max(x$p[,j])
pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps
fn[j] <- weighted.mean(x$f[,j],pw[,j])
}
an <- x$a[ia]
return(list(f=fn,p=apply(x$p,2,min),a=an))
}
}
else
{
if (is.vector(x$f))
{
pw <- 1 - x$p/max(x$p)
pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps
return(list(f=weighted.mean(x$f,pw),p=min(x$p)))
}
else
{
# The hard part... is a matrix
pw <- matrix(0,nrow(x$p),ncol(x$p))
fn <- numeric(ncol(x$f))
rownames(pw) <- rownames(x$f)
colnames(pw) <- colnames(x$f)
names(fn) <- colnames(x$f)
for (j in 1:ncol(x$p)) {
pw[,j] <- 1 - x$p[,j]/max(x$p[,j])
pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps
fn[,j] <- weighted.mean(x$f[,j],pw[,j])
}
return(list(f=fn,p=apply(x$p,2,min)))
}
}
}
RSS Feed
Facebook
Twitter
0 comments:
Post a Comment