Summarizing microarray normalized probe expression to gene expression

The usual and fundumental result of the analysis of a microarray data experiment is a list of differentially expressed genes, always according to the statistical design and the biological questions asked. However, what we see most of the times in such a list is a set of differentially expressed probes and not a list of differentially expressed genes (according to official HUGO nomenclature for example). Although this final list contains also gene names next to probes, I have been asked a lot of times to provide a list of genes and discard the probes as they are rather confusing the bench biologists. The latter are asking questions of the type "which probe should I believe" or "why the expression in one probe is so much different from the expression of the other probe corresponding to the same gene"?

The answer is not always easy,
if there is a satisfactory one. First and foremost, it has to do with the probe design itself. Fortunately, RNA-Seq has partially remedied such problems when one wants to study only gene expression and not differential splicing for example. But what about microarrays? One way is to use the avereps function from the Bioconductor limma package. Unfortunately, using this on the gene level after the replicate probe level (e.g. some Agilent probes exist many times in an array), not only may have an impact in the normalization procedure but it may affect the subsequent statistical analysis by incorporating bad quality probes. I have written a small R script which sumarizes gene expression from probes to genes, after filtering, normalization and the assignment of statistical scores (p-values etc.). Thus, if the remaining probes are not unique, they are summarized to a gene value based on:

  1. Each probe's p-value (must be greater than a threshold).
  2. Each probe's fold change (must be greater than a threshold).

Additionally, it offers three simple methods for estimating the final expression:

  1. maxf: summarizes probes to genes, assuming the probe with the highest absolute fold change as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
  2. minp: summarizes probes to genes, assuming the probe with the minimum p-value as the final gene expression (suitable only for a single contrast, e.g. Control vs Treatment), after filtering probes based on the above criteria.
  3. wmean: summarizes probes to genes, assuming that the final gene expression is the weighted mean of the gene's probes. The weights are calculated based on the probe's p-values, after filtering probes based on the above criteria. This method is suitable for multiple contrasts (e.g. ANOVA designs).

The function works only for limma fit objects for the time being and is a little under-documented... But please, ask me if there is something you don't understand, something does not work or you find it useful. Additionally, it runs parallely if the package multicore is present and your machine has multiple cores of course! Here it is...
# maxf summarizes ONE condition/comparison based on maximum fold change
# minp summarizes ONE condition/comparison based on minimum p-value
# wmean summarizes ONE OR MORE conditions/comparisons based on the weighted mean of the fold chnages and weights are based on the
# p-values
# maxf and minp CANNOT be used when there are more than one conditions/comparisons
summarize.probes <- function(fit,chip,method=c("maxf","minp","wmean"),exclude.p=NULL,exclude.f=NULL,adjust="none",...)
{
 if (missing(fit))
  stop("No data given to summarize probes! fit is required!")
 if (missing(chip))
  stop("Please provide the platform used in Bioconductor nomenclature!")

 if (class(fit) != "MArrayLM")
  stop("Only limma fit objects currently supported! Will add more soon...")
 if (!is.null(exclude.p) && (exclude.p < 0 || exclude.p > 1))
  stop("Exclude p should be either NULL or a value between 0 and 1...")
 if (!(adjust %in% p.adjust.methods))
  stop(paste("adjust should be one of ",paste(p.adjust.methods,collapse=", "),"...",sep=""))

 d <- dim(fit$coefficients)
 if (!is.null(d) && d[2] > 1)
 {
  if (method %in% c("maxf","minp"))
  {
   warning("maxf and minp can be used only with a single contrast! Switching to wmean",call.=FALSE)
   method <- "wmean"
  }
 } else d <- 1

 paral <- ifelse(require(multicore),TRUE,FALSE)
 
 # Load annotation DB
 cat("\n===== Loading annotation...\n")
 require(annotate)
 chip2i<-paste(chip,".db",sep="")
 if (!require(chip2i,character.only=TRUE))
 {
  setRepositories(graphics=FALSE,5)
  install.packages(chip2i,dependencies=TRUE)
 }
 require(chip2i,character.only=TRUE)

 # Some values
 probe.names <- fit$genes$ProbeName
 folds <- fit$coefficients
 p.values <- fit$p.value
 if (is.matrix(folds))
  rownames(folds) <- rownames(p.values) <- probe.names
 else
  names(folds) <- names(p.values) <- probe.names
 if (!is.null(adjust))
  for (j in 1:ncol(p.values))
   p.values[,j] <- p.adjust(p.values[,j],adjust)
 if (is.null(fit$F.p.value))
  total.p.values <- NULL
 else
 {
  total.p.values <- fit$F.p.value
  names(total.p.values) <- rownames(folds)
  if (!is.null(adjust))
   total.p.values <- p.adjust(total.p.values,adjust)
 }

 # Create summarization list
 cat("===== Summarizing gene probes to gene names...\n")
 probe2gene <- eval(parse(text=paste("unlist(as.list(",chip,"SYMBOL[mappedkeys(",chip,"SYMBOL)]))",sep="")))
 fitted.probes <- intersect(names(probe2gene),probe.names)
 probe2gene <- probe2gene[fitted.probes]
 if (paral)
  sum.list <- mclapply(unique(probe2gene),function(x,y) { names(which(y==x)) },probe2gene)
 else
  sum.list <- lapply(unique(probe2gene),function(x,y) { which(y==x) },probe2gene)
 names(sum.list) <- unique(probe2gene)

 cat("===== Summarizing probe values to gene values...\n")
 if (paral) # Split the probles per gene name and then do the rest
 {
  gene.split <- mclapply(sum.list,function(i,f,p,a) {
   if (!missing(a) && !is.null(a) && !is.na(a)) {
    if (is.vector(f))
     return(list(f=f[i],p=p[i],a=a[i]))
    else {
     if (is.vector(f[i,]))
      return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i]))
     else
      return(list(f=f[i,],p=p[i,],a=a[i]))
    }
   }
   else {
    if (is.vector(f))
     return(list(f=f[i],p=p[i]))
    else {
     if (is.vector(f[i,]))
      return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,]))))
     else
      return(list(f=f[i,],p=p[i,]))
    }
   }
  },folds,p.values,total.p.values)
  sum.val <- summarize.parallel(gene.split,method,exclude.p,exclude.f)
  tmp <- mclapply(sum.val,unlist)
  tmp <- tmp[!sapply(tmp,is.null)]
 }
 else
 {
  gene.split <- lapply(sum.list,function(i,f,p,a) {
   if (!missing(a) && !is.null(a) && !is.na(a)) {
    if (is.vector(f))
     return(list(f=f[i],p=p[i],a=a[i]))
    else {
     if (is.vector(f[i,]))
      return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,])),a=a[i]))
     else
      return(list(f=f[i,],p=p[i,],a=a[i]))
    }
   }
   else {
    if (is.vector(f))
     return(list(f=f[i],p=p[i]))
    else {
     if (is.vector(f[i,]))
      return(list(f=t(as.matrix(f[i,])),p=t(as.matrix(p[i,]))))
     else
      return(list(f=f[i,],p=p[i,]))
    }
   }
  },folds,p.values,total.p.values)
  sum.val <- summarize.single(gene.split,method,exclude.p,exclude.f)
  tmp <- lapply(sum.val,unlist)
  tmp <- tmp[!sapply(tmp,is.null)]
 }

 cat("===== Creating output...\n\n")
 result <- do.call("rbind",tmp)
 if (is.matrix(folds))
  the.cnames <- c(paste("fc",colnames(folds),sep="_"),paste("p",colnames(p.values),sep="_"))
 else
  the.cnames <- c("log2_fc","p-value")
 if (!is.null(total.p.values))
  the.cnames <- c(the.cnames,"F_p-value")
 colnames(result) <- the.cnames
 
 return(result)
}

summarize.parallel <- function(gene.split,method,exclude.p,exclude.f)
{
 if (method == "maxf") # Summarize one condition per maximum fold change
  sum.val <- mclapply(gene.split,sum.maxf,exclude.p,exclude.f)
 else if (method == "minp") # Summarize one condition per minimum p-value
  sum.val <- mclapply(gene.split,sum.minp,exclude.p,exclude.f)
 else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values
  sum.val <- mclapply(gene.split,sum.wmean,exclude.p,exclude.f)
 return(sum.val)
}

summarize.single <- function(gene.split,method,exclude.p,exclude.f)
{
 if (method == "maxf") # Summarize one condition per maximum fold change
  sum.val <- lapply(gene.split,sum.maxf,exclude.p,exclude.f)
 else if (method == "minp") # Summarize one condition per minimum p-value
  sum.val <- lapply(gene.split,sum.minp,exclude.p,exclude.f)
 else if (method == "wmean") # Summarize one or more conditions per weighted mean of fold changes, weights are based on p-values
  sum.val <- lapply(gene.split,sum.wmean,exclude.p,exclude.f)
 return(sum.val)
}

sum.maxf <- function(x,ep,ef)
{
 # 1st pass: p-value
 if (!is.null(ep))
 {
  g <- which(x$p<ep)
  if (length(g) == 0)
   return(NULL)
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
   if (!is.null(x$a))
    x$a <- x$a[g]
  }
 }
 # 2nd pass: fold-change or whatever
 if (!is.null(ef))
 {
  g <- which(abs(x$f)<ef)
  if (length(g) == 0)
   return(NULL)
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
   if (!is.null(x$a))
    x$a <- x$a[g]
  }
 }
 # What to do by default
 i <- which(x$f==max(abs(x$f)))
 if (!is.null(x$a))
  return(list(f=x$f[i],p=x$p[i],a=x$a[i]))
 else
  return(list(f=x$f[i],p=x$p[i]))
}

sum.minp <- function(x,ep,ef)
{
 # 1st pass: p-value
 if (!is.null(ep))
 {
  g <- which(x$p<ep)
  if (length(g) == 0)
   return(NULL)
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
   if (!is.null(x$a))
    x$a <- x$a[g]
  }
 }
 # 2nd pass: fold-change or whatever
 if (!is.null(ef))
 {
  g <- which(abs(x$f)<ef)
  if (length(g) == 0)
   return(NULL)
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
   if (!is.null(x$a))
    x$a <- x$a[g]
  }
 }
 # What to do by default
 i <- which(x$p==min(x$p))
 if (!is.null(x$a))
  return(list(f=x$f[i],p=x$p[i],a=x$a[i]))
 else
  return(list(f=x$f[i],p=x$p[i]))
}

sum.wmean <- function(x,ep,ef)
{
 eps <- 1e-6
 if (!is.null(ep))
 {
  g <- which(apply(x$p,1,function(z,ep) return(any(z<ep)),ep))
  if (length(g) == 0)
   return(NULL)
  if (is.matrix(x$f))
  {
   x$f <- x$f[g,]
   x$p <- x$p[g,]
   if (is.vector(x$f))
   {
    x$f <- t(as.matrix(x$f))
    x$p <- t(as.matrix(x$p))
   }
  }
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
  }
  if (!is.null(x$a))
   x$a <- x$a[g]
 }
 if (!is.null(ef))
 {
  g <- which(apply(x$f,1,function(z,ef) return(any(abs(z)>ef)),ef))
  if (length(g) == 0)
   return(NULL)
  if (is.matrix(x$f))
  {
   x$f <- x$f[g,]
   x$p <- x$p[g,]
   if (is.vector(x$f))
   {
    x$f <- t(as.matrix(x$f))
    x$p <- t(as.matrix(x$p))
   }
  }
  else
  {
   x$f <- x$f[g]
   x$p <- x$p[g]
  }
  if (!is.null(x$a))
   x$a <- x$a[g]
 }
 if (!is.null(x$a))
 {
  if (is.vector(x$f))
  {
   ia <- which(x$p==min(x$p))[1]
   pw <- 1 - x$p/max(x$p)
   pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps
   return(list(f=weighted.mean(x$f,pw),p=min(x$p),a=x$a[ia]))
  }
  else
  {
   # The hard part... is a matrix
   pw <- matrix(0,nrow(x$p),ncol(x$p))
   fn <- numeric(ncol(x$f))
   rownames(pw) <- rownames(x$f)
   colnames(pw) <- colnames(x$f)
   names(fn) <- colnames(x$f)
   ia <- apply(x$p,2,function(x) return(which(x==min(x))[1]))[1]
   for (j in 1:ncol(x$p)) {
    pw[,j] <- 1 - x$p[,j]/max(x$p[,j])
    pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps
    fn[j] <- weighted.mean(x$f[,j],pw[,j])
   }
   an <- x$a[ia]
   return(list(f=fn,p=apply(x$p,2,min),a=an))
  }
 }
 else
 {
  if (is.vector(x$f))
  {
   pw <- 1 - x$p/max(x$p)
   pw[which(pw==min(pw))] <- pw[which(pw==min(pw))] + eps
   return(list(f=weighted.mean(x$f,pw),p=min(x$p)))
  }
  else
  {
   # The hard part... is a matrix
   pw <- matrix(0,nrow(x$p),ncol(x$p))
   fn <- numeric(ncol(x$f))
   rownames(pw) <- rownames(x$f)
   colnames(pw) <- colnames(x$f)
   names(fn) <- colnames(x$f)
   for (j in 1:ncol(x$p)) {
    pw[,j] <- 1 - x$p[,j]/max(x$p[,j])
    pw[which(pw[,j]==min(pw[,j])),j] <- pw[which(pw[,j]==min(pw[,j])),j] + eps
    fn[,j] <- weighted.mean(x$f[,j],pw[,j])
   }
   return(list(f=fn,p=apply(x$p,2,min)))
  }
 }
}

0 comments:

Post a Comment

Copyright © Bioinformatics dance