Deterministic formulas for genomic prediction

Population parameter-based deterministic prediction
Data
Imputation
Quality control
Centering and scaling
G matrix
Relationship-based deterministic prediction of VanRaden (2008)
Leave-one-out cross-validation

Population parameter-based deterministic prediction

ShinyGPAS - Shiny Genomic Prediction Accuracy Simulator

library(shiny)
library(plotly)
runApp("app.R")

Data

Resende Jr. et al. (2012) (DOI: 10.1534/genetics.111.137026) analyzed 17 traits in loblolly pine (Pinus taeda) data, which include 951 individuals genotyped with 4853 SNPs. In this homework assignment, we will use the derregressed breeding values of crown width across the planting beds at age 6 (CWAC6). Download the zip file and type the following code.

# read phenotype and SNP files
CWAC6 <- read.csv("Loblolly_Pine_Resende/Phenotypic_Data Folder/DATA_nassau_age6_CWAC.csv", 
    header = TRUE, stringsAsFactors = FALSE)
SNP <- read.csv("Loblolly_Pine_Resende/Snp_Data.csv", header = TRUE, stringsAsFactors = FALSE)

# remove missing phenotypes
na.index <- which(is.na(CWAC6$Derregressed_BV))
CWAC6 <- CWAC6[-na.index, ]
SNP <- SNP[-na.index, ]
table(CWAC6$Genotype == SNP[, 1])

# phenotypes
y <- CWAC6$Derregressed_BV
y <- matrix(y, ncol = 1)

# markers
SNP <- SNP[, -1]  # 861 x 4853
SNP[SNP == -9] <- NA

Imputation

Replace missing marker genotypes with mean values. Then store the marker genotypes in a matrix object W.

W <- matrix(0, ncol = ncol(SNP), nrow = nrow(SNP))
for (j in 1:ncol(SNP)) {
    # cat('j = ', j, '\n')
    W[, j] <- ifelse(is.na(SNP[, j]), mean(SNP[, j], na.rm = TRUE), SNP[, j])
}

Quality control

Perform a quality control by removing markers with MAF < 0.05. How many markers are removed? Save the filtered genotype matrix in W2.

freq <- colMeans(W)/2
maf <- ifelse(freq > 0.5, 1 - freq, freq)
maf.index <- which(maf < 0.05)
length(maf.index)
W2 <- W[, -maf.index]

Centering and scaling

Standardize the genotype matrix to have a mean of zero and variance of one. Save this matrix as Xs.

Ws <- scale(W2, center = TRUE, scale = TRUE)

# dimensions
n <- nrow(Ws)
m <- ncol(Ws)

G matrix

Compute the second genomic relationship matrix of VanRaden (2008) G using the entire markers. Then add a very small positive constant (e.g., 0.001) to the diagonal elements so that G matrix is invertible.

G <- tcrossprod(Ws)/ncol(Ws)
G <- G + diag(n) * 0.001

Relationship-based deterministic prediction of VanRaden (2008)

# training-testing sets partitioning
n.trn <- 761
n.tst <- 100
y.trn <- y[1:n.trn]
y.tst <- y[n.trn + 1:n.tst]
Ws.trn <- Ws[1:n.trn, ]
Ws.tst <- Ws[n.trn + 1:n.tst, ]

# G matrix
Gtrn <- tcrossprod(Ws.trn)/ncol(Ws.trn)
Gtrn <- Gtrn + diag(n.trn) * 0.001
Gtst.trn <- tcrossprod(Ws.tst, Ws.trn)/ncol(Ws.tst)

# When training-testing sets relationship is equal to zero
lambda <- 1.029561  # fit.trn$Ve / fit.trn$Vu
I <- diag(n.trn)
sqrt(matrix(0, ncol = 761, nrow = 1) %*% solve(Gtrn + I * lambda) %*% matrix(0, 
    nrow = 761, ncol = 1))

# fit GREML to obtain variance components
library(rrBLUP)
fit.trn <- mixed.solve(y = y.trn, K = Gtrn)

# Compute deterministic prediction accuracy
r <- array()
for (i in 1:n.tst) {
    cat("Running ", "individual ", i, "\n")
    r[i] <- sqrt(matrix(Gtst.trn[i, ], nrow = 1) %*% solve(Gtrn + I * lambda) %*% 
        matrix(Gtst.trn[i, ], ncol = 1))
}

# Historgram of accuracy (r)
hist(r, breaks = 25, xlim = c(0, 1))

# Average squared relationship vs. Reliability (r^2)
plot(apply(Gtst.trn^2, 1, median), r^2, xlab = "Average squared relationship", 
    ylab = "Reliability")
abline(lm(r^2 ~ apply(Gtst.trn^2, 1, median)), col = "red")

Leave-one-out cross-validation

# Do not run
ghat <- array()
for (i in 1:n.tst) {
    yNA <- y.trn
    yNA <- c(yNA, NA)
    cat("Running ", "individual ", i, "\n")
    Gtrn2 <- rbind(Gtrn, Gtst.trn[i, ])
    Gtrn2 <- cbind(Gtrn2, c(Gtst.trn[i, ], G[i + n.trn, i + n.trn]))
    fit <- mixed.solve(yNA, K = Gtrn2)
    ghat[i] <- fit$u[n.trn + 1]
}

# square root of heritability
h <- sqrt(fit$Vu/(fit$Vu + fit$Ve))

# genomic prediction accuracy - cor(g, ghat)/h
cor(ghat, y.tst)/h  # 

# comparison
mean(r)

Deterministic formulas for genomic prediction

Quantitative Genetics and Genomics Workshop @ESALQ