From 8bb7398383b7c084164665d26e4567d35a1af770 Mon Sep 17 00:00:00 2001 From: Junming Guan Date: Tue, 3 Dec 2024 14:35:17 -0500 Subject: [PATCH] Add code for conditioinal gaussian imputation; add note on paper results in readme --- README.md | 5 +- .../analyze_imputation_from_cousins.py | 36 ++ .../simulate_cousins_with_pop_struc.py | 365 ++++++++++++++++++ 3 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 paper_results/simulations/analyze_imputation_from_cousins.py create mode 100644 paper_results/simulations/simulate_cousins_with_pop_struc.py diff --git a/README.md b/README.md index 169deb04..755638ba 100644 --- a/README.md +++ b/README.md @@ -165,4 +165,7 @@ it in the container. To install snipar, follow these steps: # Running tests To check that the code is working properly and that the C modules have been compiled, you can run the tests using this command: - python -m unittest snipar.tests \ No newline at end of file + python -m unittest snipar.tests + +# Code for reproducing results +See paper_results/ for code to reproduce results in https://www.biorxiv.org/content/10.1101/2022.10.24.513611v1 \ No newline at end of file diff --git a/paper_results/simulations/analyze_imputation_from_cousins.py b/paper_results/simulations/analyze_imputation_from_cousins.py new file mode 100644 index 00000000..4cb0e514 --- /dev/null +++ b/paper_results/simulations/analyze_imputation_from_cousins.py @@ -0,0 +1,36 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.interpolate import make_interp_spline +from sklearn.model_selection import LeaveOneOut +import pandas as pd + + +def plot(alpha, alpha_cov, non_sampling=False): + s = np.zeros(alpha.shape[0]) + loo = LeaveOneOut() + loo.get_n_splits(alpha) + for i, (index, _) in enumerate(loo.split(alpha)): + if alpha_cov.ndim == 1: + if not non_sampling: + s[i] = np.var(alpha[index, 0] / np.sqrt(alpha_cov)[index]) #/ (np.var(alpha_max[index, 0] / np.sqrt(alpha_cov_max)[index,0,0])) + else: + s[i] = np.mean(alpha[index,0] ** 2 - alpha_cov[index]) #/ (np.mean(alpha_max[index,0] ** 2) - np.mean(alpha_cov_max[index,0,0])) + else: + if not non_sampling: + s[i] = np.var(alpha[index, 0] / np.sqrt(np.diagonal(alpha_cov, axis1=1, axis2=2))[index,0]) #/ (np.var(alpha_max[index, 0] / np.sqrt(alpha_cov_max)[index,0,0])) + else: + s[i] = (np.mean(alpha[index,0] ** 2) - np.mean(alpha_cov[index,0,0])) #/(np.mean(alpha_max[index,0] ** 2) - np.mean(alpha_cov_max[index,0,0])) + mean = np.mean(s) + std = np.sqrt(np.sum(np.power(mean - s, 2)) * (s.shape[0] - 1) / s.shape[0]) + return mean, std + +df = [] +for F in [0, 0.001, 0.01, 0.1]: + x = np.load(f'impute_cond_gau_results/3000SNPs_5000cousinpairs_0.5h2pop_{F}Fst.npz') + m, std, = plot(x['alpha'], x['cov']) + df.append(pd.DataFrame({'Fst': [F], + 'method': ['cond. Gaussian'], + 'non_sampling_var': m, 'non_sampling_var_std': std})) + plot(x['alpha'], x['cov'], True) + +pd.concat(df).to_csv('impute_cond_gau_results/sp_non_sampling_var.csv', index=False, sep='\t') diff --git a/paper_results/simulations/simulate_cousins_with_pop_struc.py b/paper_results/simulations/simulate_cousins_with_pop_struc.py new file mode 100644 index 00000000..3319aba4 --- /dev/null +++ b/paper_results/simulations/simulate_cousins_with_pop_struc.py @@ -0,0 +1,365 @@ + +import numpy as np +from scipy.stats import multivariate_normal +from scipy.linalg import block_diag +from itertools import combinations_with_replacement, product +from scipy.optimize import fmin_l_bfgs_b, minimize, OptimizeResult +from numpy.linalg import slogdet, solve, inv, norm +from scipy.linalg import cho_factor, cho_solve +from scipy.sparse import csc_matrix, tril +from scipy.sparse.linalg import splu, SuperLU, spsolve, cg +from collections import defaultdict +import scipy +import os + + +save = False +num_threads = 1 +os.environ["OMP_NUM_THREADS"] = str(num_threads) +# export OPENBLAS_NUM_THREADS=... +os.environ["OPENBLAS_NUM_THREADS"] = str(num_threads) +# export MKL_NUM_THREADS=... +os.environ["MKL_NUM_THREADS"] = str(num_threads) +# export VECLIB_MAXIMUM_THREADS=... +os.environ["VECLIB_MAXIMUM_THREADS"] = str(num_threads) +# export NUMEXPR_NUM_THREADS=... +os.environ["NUMEXPR_NUM_THREADS"] = str(num_threads) + +def sp_solve_dense3d_lu(sps_mat: csc_matrix, + dense_mat: np.ndarray) -> np.ndarray: + """Compute product of the inverse of given sparse matrix and given 3-d dense array uisng LU; used for repeated OLS. + + Args: + sps_mat (csc_matrix): 2-d sparse matrix. + dense_mat (np.ndarray): 3-d dense array. + + Raises: + ValueError: dense_mat should be 3-d. + ValueError: 2nd dimensions of both inputs should match. + + Returns: + np.ndarray: 3-d dense array. + """ + if dense_mat.ndim != 3: + raise ValueError('dense_mat must be a 3-d array.') + n1: int + n2: int + m: int + n: int + c: int + n1, n2 = sps_mat.shape + m, n, c = dense_mat.shape + if n != n2: + raise ValueError(f'Input dims do not match: {n2} and {n}.') + lu = splu(sps_mat) + out: np.ndarray = np.empty((m, n1, c)) + for i in range(m): + for j in range(c): + out[i, :, j] = lu.solve(dense_mat[i, :, j]) + return out + +def build_sib_arr(fam_labels, x=1, y=1): + """Build lower-triangular nonzero entries of sibship matrix. + + Args: + fam_labels (FamLabels): ndarray of family id strings corresponding to each individual. + + Returns: + SparseGRMRepr: sparse GRM representation. + """ + data = [] + row_ind = [] + col_ind = [] + + label_indices = defaultdict(list) + for l, f in enumerate(fam_labels): + label_indices[f].append(l) + f = lambda lst: len(lst) * (len(lst) + 1) / 2 + n = sum(map(f, label_indices.values())) + + for f, indices_lst in label_indices.items(): + for pair in combinations_with_replacement(indices_lst, 2): + ind1, ind2 = max(pair[0], pair[1]), min(pair[0], pair[1]) + if ind1 == ind2: + data.append(1) + elif '_' in f: + data.append(y) + else: + data.append(x) + row_ind.append(ind1) + col_ind.append(ind2) + return np.array(data), np.array(row_ind, dtype='uint32'), np.array(col_ind, dtype='uint32') + +def simulate_ind(nsnp,f): + return np.random.binomial(1,f,nsnp*2).reshape((nsnp,2)) + +def simulate_sibs(father,mother, blocksize=200): + # Compute blocks without recombination + n_blocks = np.int(np.ceil(father.shape[0]/float(blocksize))) + blocksizes = np.zeros((n_blocks),dtype=int) + for i in range(n_blocks-1): + blocksizes[i] = blocksize + blocksizes[n_blocks-1] = father.shape[0]-np.sum(blocksizes) + meioses = [[np.zeros((father.shape[0]),dtype=np.int8),np.zeros((father.shape[0]),dtype=np.int8)], # sib1_f, sib1_m + [np.zeros((father.shape[0]),dtype=np.int8),np.zeros((father.shape[0]),dtype=np.int8)]] # sib2_f, sib2_m + for i in range(2): + for j in range(2): + block_start=0 + for k in range(n_blocks): + block_end = block_start+blocksizes[k] + meioses[i][j][block_start:block_end] = np.random.binomial(1,0.5) + block_start = block_end + ibd = np.array(meioses[0][0]==meioses[1][0],dtype=np.int8)+np.array(meioses[0][1]==meioses[1][1],dtype=np.int8) + gts = np.zeros((2,father.shape[0],2),dtype=np.int8) + snp_index = np.array([x for x in range(0, father.shape[0])]) + for i in range(0,2): + gts[i, :, 0] = father[snp_index,meioses[i][0]] + gts[i, :, 1] = mother[snp_index, meioses[i][1]] + return [gts,ibd] + +def simulate_sibs_phased(father,mother, blocksize=200): + # Compute blocks without recombination + n_blocks = np.int(np.ceil(father.shape[0]/float(blocksize))) + blocksizes = np.zeros((n_blocks),dtype=int) + for i in range(n_blocks-1): + blocksizes[i] = blocksize + blocksizes[n_blocks-1] = father.shape[0]-np.sum(blocksizes) + meioses = [[np.zeros((father.shape[0]),dtype=np.int8),np.zeros((father.shape[0]),dtype=np.int8)], # sib1_f, sib1_m + [np.zeros((father.shape[0]),dtype=np.int8),np.zeros((father.shape[0]),dtype=np.int8)]] # sib2_f, sib2_m + for i in range(2): + for j in range(2): + block_start=0 + for k in range(n_blocks): + block_end = block_start+blocksizes[k] + meioses[i][j][block_start:block_end] = np.random.binomial(1,0.5) # the whole block passes to sibling j + block_start = block_end + # (nfams, 2) ibd: entry in first column is 1 (0) if (not) sharing allele from father, entry in second column is 1 if sharing allele from mother + # sum of two column give IBD state + ibd = np.vstack([meioses[0][0]==meioses[1][0], meioses[0][1]==meioses[1][1]],) + gts = np.zeros((2,father.shape[0],2),dtype=np.int8) + snp_index = np.array([x for x in range(0, father.shape[0])]) + for i in range(0,2): + gts[i, :, 0] = father[snp_index,meioses[i][0]] # get father allele from meiosis + gts[i, :, 1] = mother[snp_index, meioses[i][1]] # get mother allele from meiosis + return [gts,ibd] + + + + + +def impute_cond_gau_gwas(sib_gts, f, true_par=None): # true_father, true_mother, true_grandpar_f=None, true_grandpar_m=None): + R12 = scipy.linalg.block_diag(*[np.array([[1/2,1/8],[1/8,1/2]]) for i in range(int(sib_gts.shape[0]/2))]) + R22 = scipy.linalg.block_diag(*[np.array([[1,1/8],[1/8,1]]) for i in range(int(sib_gts.shape[0]/2))]) + # R12 = 2*f*(1-f)*R12 + # R22 = 2*f*(1-f)*R22 + factor = R12.T @ inv(R22) + a_minus_mu = sib_gts.T - 2*np.expand_dims(f, axis=-1) + imp = np.einsum('jk,...k', factor, a_minus_mu) + 2*np.expand_dims(f, axis=-1) + imp[imp > 2] = 2 + imp[imp < 0] = 0 + linimp = sib_gts + 2*f + father_imp = imp[:, ::2] + mother_imp = imp[:, 1::2] + + return 2*father_imp, 2*mother_imp + + +def loglik(sf, se, Vf, y): + n = Vf.shape[0] + V = sf * Vf + se * csc_matrix((np.ones(n), (np.arange(0, n), np.arange(0, n))), shape=(n, n)) + e: np.ndarray = np.ones(n) + lu = splu(V) + Vinv_y = lu.solve(y) + Vinv_e = lu.solve(e) + yT_Vinv_y: float = y @ Vinv_y + eT_Vinv_e: float = e @ Vinv_e + eT_Vinv_y: float = e @ Vinv_y + logdet_eT_Vinv_e: float = np.log(e @ Vinv_e) + diag_l: np.ndarray = lu.L.diagonal() + diag_u: np.ndarray = lu.U.diagonal() + V_logdet: float = np.log(diag_l).sum() + np.log(diag_u).sum() + # _, V_logdet = np.linalg.slogdet(V) + return -0.5 * (V_logdet + logdet_eT_Vinv_e + yT_Vinv_y - eT_Vinv_y ** 2 / eT_Vinv_e) + + +from scipy.optimize import minimize, OptimizeResult +def optimize(Vf, y): + yvar = np.var(y) + sf = se = yvar / 2 + def nll(x): + sf, se = x + return -1 * loglik(sf, se, Vf, y) + res: OptimizeResult = minimize(nll, x0=np.array([sf, se]), + method='L-BFGS-B', bounds=[(0.0001 * yvar, yvar) for i in range(2)]) + if res.success: + optimized = True + sf, se = tuple(res.x) + else: + raise ValueError(f'Scipy minimize failed: {res.message}') + return sf, se + + + + +def savez(*args, **kwds): + if save: + print('SAVING') + np.savez(*args, **kwds) + else: + print('NOT SAVING') + + + +def main(nsnp=1500, nfam=1500, v=1, phased=True, blocksize=1, f=0.5, h2quad=0.5, h2_pop=0.1, unrel=False, npop=4, Fst=0.01, balding_nicols=True, save=True, diff=0.05, robust=False): + + fsize = 2 + + direct_var = 1/100 #1/nsnp + indirect_var = 1/100#0.25/nsnp + direct_indirect_corr = 0. + direct_indirect_cov = direct_indirect_corr*np.sqrt(indirect_var*direct_var) + direct_indirect_cov_matrix = [[direct_var, direct_indirect_cov],[direct_indirect_cov, indirect_var]] + direct_indirect = multivariate_normal.rvs(cov = direct_indirect_cov_matrix, size=nsnp) + if nsnp == 1: + direct_indirect = direct_indirect.reshape((-1, 2)) + direct_effects = direct_indirect[:,0].reshape((-1, 1)) + direct_effects[np.random.choice([i for i in range(nsnp)], size=int(nsnp*0.9), replace=True), 0] = 0. + direct_effects = [1] + + + + + cousin_gts_lst = [] + cousin_phen_lst = [] + father_gts_lst = [] + + + if Fst == 0 and balding_nicols: + pop_fs = np.ones((npop, nsnp), dtype=float) * 0.5 + else: + if balding_nicols: + a = (1-Fst)/Fst*0.5 + b = (1-Fst)/Fst*0.5 + pop_fs = np.zeros((npop, nsnp), dtype=float) + for snp in range(nsnp): + if nsnp == 1 and npop == 2: + pop_fs[0, snp] = 0.5 + np.sqrt(Fst) / 2 + pop_fs[1, snp] = 0.5 - np.sqrt(Fst) / 2 + else: + if balding_nicols: + pop_fs[:, snp] = np.random.beta(a, b, size=npop) + else: + pop_fs[0, snp] = 0.5 + np.sqrt(Fst) / 2 + pop_fs[1, snp] = 0.5 - np.sqrt(Fst) / 2 + + + pop_effs = np.random.randn(npop) + pop_indiv = np.hstack([[e for i in range(nfam * 2)] for e in pop_effs]) + scale_fctr = np.sqrt(h2_pop/np.var(pop_indiv)) + pop_effs = pop_effs * scale_fctr + for s, pf in enumerate(pop_fs): + if phased: ibd = np.zeros((nfam, 2, nsnp),dtype=np.int8) + else: ibd = np.zeros((nfam,fsize*(fsize-1)//2,nsnp),dtype=np.int8) + father_gts = np.full((nfam,nsnp,2), fill_value=np.nan, dtype=int) + mother_gts = np.full((nfam,nsnp,2), fill_value=np.nan, dtype=int) + for i,f in enumerate(pf): + father_gts[:,i,:] = np.random.binomial(1, f, size=(nfam,2)) + mother_gts[:,i,:] = np.random.binomial(1, f, size=(nfam,2)) + + sib_gts = np.zeros((nfam,fsize,nsnp,2)) + for i in range(0,nfam): + + father = father_gts[i, :, :] + mother = mother_gts[i, :, :] + if phased: + sibs = simulate_sibs_phased(father,mother, blocksize = blocksize) + else: + sibs = simulate_sibs(father,mother, blocksize = blocksize) + sib_gts[i, :, :, :] = sibs[0] + # ibd[i,:,:] = sibs[1] + + + + father_i = sum([[0+3*i, 2+3*i] for i in range(int(sib_gts.shape[0] / 3))],[]) + mother_i = sum([[1+3*i, 0+3*i] for i in range(int(sib_gts.shape[0] / 3))],[]) + + + father_gts = sib_gts[father_i, 0, :, :].copy() + mother_gts = sib_gts[mother_i, 1, :, :].copy() + sib_gts = np.zeros((father_gts.shape[0],fsize,nsnp,2)) + for i in range(0,father_gts.shape[0]): + father = father_gts[i, :, :] + mother = mother_gts[i, :, :] + if phased: + sibs = simulate_sibs_phased(father,mother, blocksize = blocksize) + else: + sibs = simulate_sibs(father,mother, blocksize = blocksize) + sib_gts[i, :, :, :] = sibs[0] + # ibd[i,:,:] = sibs[1] + # tmp=((father_sum+mother_sum)@indirect_effects).flatten() + + # n = sib_gts.shape[0] + cousin_gts = sib_gts[:, 0, :, :].sum(axis=2) + + cousin_noise = np.random.normal(0, v, size=(cousin_gts.shape[0],)) + cousin_phen = pop_effs[s] + np.sqrt(1-h2_pop)* cousin_noise + cousin_gts_lst.append(cousin_gts) + cousin_phen_lst.append(cousin_phen) + father_gts_lst.append(father_gts.sum(axis=2)) + + + + + cousin_gts = np.vstack(cousin_gts_lst) + cousin_phen = np.hstack(cousin_phen_lst) + father_gts = np.vstack(father_gts_lst) + pop_ids = np.hstack([np.ones(cousin_gts.shape[0], dtype=int) * i for i in range(npop)]) + + + + + + + + X = np.full((cousin_gts.shape[0], 2, cousin_gts.shape[1]), fill_value=np.nan) + X[:, 0, :] = cousin_gts + par1, par2 = impute_cond_gau_gwas(cousin_gts, cousin_gts.mean(axis=0)) + X[::2,1,:] = par1.T + X[1::2,1,:] = par2.T + + + fam_labels = sum([[str(i), str(i)] for i in range(int(cousin_gts.shape[0])//2)],[]) + sib_data, sib_row_ind, sib_col_ind = build_sib_arr(fam_labels, x=1, y=1) + n = cousin_gts.shape[0] + tril_mat: csc_matrix = csc_matrix((sib_data, (sib_row_ind, sib_col_ind)), shape=(n,n)) + triu_mat: csc_matrix = tril(tril_mat, k=-1, format='csc') + V=tril_mat + triu_mat.T + sf_cousin,se = optimize(V, cousin_phen-cousin_phen.mean()) + V_cousin = sf_cousin * V + se * csc_matrix((np.ones(n), (np.arange(0, n), np.arange(0, n))), shape=(n,n)) + alpha_cousin,alpha_ses_cousin, alpha_cov_cousin = standard_gwas(V_cousin, X, cousin_phen) + Z = alpha_cousin/alpha_ses_cousin + return alpha_cousin,alpha_cov_cousin + + +def standard_gwas(V_, X_, y_, n_rel=None): + X = X_.copy() + y = y_.copy() + lu = splu(V_) + y -= y.mean() + for i in range(0, X.shape[1]): + X[:, i, :] = X[:, i, :] - np.mean(X[:, i, :], axis=0) + Vinv_X: np.ndarray = sp_solve_dense3d_lu(V_, X.transpose(2,0,1)) + XT_Vinv_X: np.ndarray = np.einsum('...ij,...ik', X.transpose(2,0,1), Vinv_X) + XT_Vinv_y: np.ndarray = np.einsum('...ij,i', X.transpose(2,0,1), lu.solve(y)) + alpha: np.ndarray = solve(XT_Vinv_X, XT_Vinv_y) + alpha_cov: np.ndarray = np.linalg.inv(XT_Vinv_X) + alpha_ses = np.zeros((alpha_cov.shape[0], 2)) + for i in range(alpha_cov.shape[0]): + alpha_ses[i, :] = np.sqrt(np.diag(alpha_cov[i,:,:])) + return alpha, alpha_ses, alpha_cov + +save = True + +import sys +alpha, cov = main(nsnp=3000, nfam=5000, h2quad=0.9, h2_pop=0.5, unrel=False, Fst=0, npop=2, save=save, balding_nicols=True) +np.savez(f'impute_cond_gau_results/3000SNPs_50000cousinpairs_0.5h2pop_0Fst.npz', alpha=alpha, cov=cov) \ No newline at end of file