ASCI 896 Statistical Genomics

ridge regression BLUP

Rice data

Download rice data from http://ricediversity.org/data/sets/44kgwas/

##################################
# data cleaning 
##################################
library(BGLR)
out<- read_ped("RiceDiversity_44K_Genotypes_PLINK/sativas413.ped")
p=out$p
n=out$n
out=out$x
#Recode snp to 0,1,2 format using allele 1
# 0 --> 0
# 1 --> 1
# 2 --> NA
# 3 --> 2
out[out==2]=NA
out[out==3]=2
X=matrix(out,nrow=p,ncol=n,byrow=TRUE)
X=t(X)

# genotype imputation
for (j in 1:ncol(X)){
  X[,j] <- ifelse(is.na(X[,j]), mean(X[,j], na.rm=TRUE), X[,j])
}

# accession ID
fam <-read.table("RiceDiversity_44K_Genotypes_PLINK/sativas413.fam", header = FALSE, stringsAsFactors = FALSE)  
head(fam)

##           V1 V2 V3 V4 V5 V6
## 1 081215-A05  1  0  0 -9 -9
## 2 081215-A06  3  0  0 -9 -9
## 3 081215-A07  4  0  0 -9 -9
## 4 081215-A08  5  0  0 -9 -9
## 5 090414-A09  6  0  0 -9 -9
## 6 090414-A10  7  0  0 -9 -9

rownames(X) <- fam$V2 # 413 x 36901

# phenotypes
rice.pheno <- read.table("http://www.ricediversity.org/data/sets/44kgwas/RiceDiversity_44K_Phenotypes_34traits_PLINK.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t")
table(rownames(X) == rice.pheno$NSFTVID)

## 
## TRUE 
##  413

y <- rice.pheno$Flowering.time.at.Arkansas # # use the first trait 
index <- !is.na(y)
y <- y[index] # 374
X <- X[index,] # 374x36901


##################################
# mixed.solve()
# ridge regression BLUP 
##################################
library(rrBLUP)
# center genotype matrix
X2 <- scale(X, center = TRUE, scale = FALSE)
fit <- mixed.solve(y = y, Z=X2)
# marker additive genetic variance
fit$Vu

## [1] 0.008110318

# residual variance
fit$Ve

## [1] 40.5186

# intercept 
fit$beta

## [1] 87.94439

# marker effects 
head(fit$u)

## [1] -0.015622948 -0.015272687 -0.001885351 -0.012680621 -0.016499673
## [6] -0.008310810

#########################################################
# GWAS()
# single marker regression + polygenic effects (G matrix) 
##########################################################
# create data frame pheno 
my.pheno <- data.frame(NSFTV_ID=rice.pheno[,2], y=rice.pheno[,3]) # use the first trait 
my.pheno <- my.pheno[index,]
table(rownames(X) == my.pheno[,1])

## 
## TRUE 
##  374

# read map file
map <-read.table("RiceDiversity_44K_Genotypes_PLINK/sativas413.map", header = FALSE, stringsAsFactors = FALSE)  
my.geno <- data.frame(marker=map[,2], chrom=map[,1], pos=map[,4], t(X-1), check.names = FALSE) # X = \in{-1, 0, 1}
scores <- GWAS(my.pheno, my.geno, min.MAF=0.05, P3D=TRUE, plot=FALSE)

## [1] "GWAS for trait: y"
## [1] "Variance components estimated. Testing markers."

head(scores[,4]) # returns -log10p

## [1] 1.6629346 1.6122190 0.1745523 1.1612296 1.8167903 0.7408174