ASCI 896 Statistical Genomics

0.1 Bayesian penalized regression

0.1.1 Rice data

Download rice data from http://ricediversity.org/data/sets/44kgwas/

0.1.2 Data cleaning

library(BGLR)
out<- read_ped("RiceDiversity_44K_Genotypes_PLINK/sativas413.ped")
p=out$p
n=out$n
out=out$x
#Recode snp to 0,1,2 format using allele 1
# 0 --> 0
# 1 --> 1
# 2 --> NA
# 3 --> 2
out[out==2]=NA
out[out==3]=2
X=matrix(out,nrow=p,ncol=n,byrow=TRUE)
X=t(X)

# genotype imputation
for (j in 1:ncol(X)){
  X[,j] <- ifelse(is.na(X[,j]), mean(X[,j], na.rm=TRUE), X[,j])
}

# accession ID
fam <-read.table("RiceDiversity_44K_Genotypes_PLINK/sativas413.fam", header = FALSE, stringsAsFactors = FALSE)  
head(fam)

##           V1 V2 V3 V4 V5 V6
## 1 081215-A05  1  0  0 -9 -9
## 2 081215-A06  3  0  0 -9 -9
## 3 081215-A07  4  0  0 -9 -9
## 4 081215-A08  5  0  0 -9 -9
## 5 090414-A09  6  0  0 -9 -9
## 6 090414-A10  7  0  0 -9 -9

rownames(X) <- fam$V2 # 413 x 36901

# MAF=0.05
p <- colSums(X) / (2*nrow(X))
maf <- ifelse(p > 0.5, 1-p, p)
maf.index <- which(maf < 0.05)
X <- X[, -maf.index]  # 413 x 33701

# phenotypes
rice.pheno <- read.table("http://www.ricediversity.org/data/sets/44kgwas/RiceDiversity_44K_Phenotypes_34traits_PLINK.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t")
table(rownames(X) == rice.pheno$NSFTVID)

## 
## TRUE 
##  413

y <- rice.pheno$Flowering.time.at.Arkansas # # use the first trait 
index <- !is.na(y)
y <- y[index] # 374
X <- X[index,] # 374 33701

0.1.3 Bayesian ridge regression

# training - testing sets partition
set.seed(0728)
tst <- sample(1:nrow(X), size=80)
y.na <- y
y.na[tst] <- NA

# user specified priors
ETA <- list(MRK=list(X = X, df0 = 5, S0 = 1, model="BRR"))
fit.BRR <- BGLR(y = y.na, ETA = ETA, nIter = 3000, burnIn = 1000, verbose = FALSE)
cor(y[tst], fit.BRR$yHat[tst])

## [1] 0.728925

# rule-based priors
ETA <- list(MRK=list(X = X, model="BRR"))
fit.BRR <- BGLR(y = y.na, ETA = ETA, nIter = 3000, burnIn = 1000, verbose = FALSE)
cor(y[tst], fit.BRR$yHat[tst])

## [1] 0.7223458

0.1.4 Bayesian LASSO

# training - testing sets partition
set.seed(0728)
tst <- sample(1:nrow(X), size=80)
y.na <- y
y.na[tst] <- NA

# user specified priors
ETA <- list(MRK=list(X = X, type="gamma", lambda=10, shape=1.1, rate=0.5, model="BL"))
fit.BL <- BGLR(y = y.na, ETA = ETA, nIter = 3000, burnIn = 1000, verbose = FALSE)
cor(y[tst], fit.BL$yHat[tst])

## [1] 0.6936454

# rule-based priors 
ETA <- list(MRK=list(X = X, model="BL"))
fit.BL <- BGLR(y = y.na, ETA = ETA, nIter = 3000, burnIn = 1000, verbose = FALSE)
cor(y[tst], fit.BL$yHat[tst])

## [1] 0.7155831

ASCI 896 Statistical Genomics

Contents

0.1 Bayesian penalized regression

0.1.1 Rice data

0.1.2 Data cleaning

0.1.3 Bayesian ridge regression

0.1.4 Bayesian LASSO