PRS Tutorial
In [1]:
Copied!
import prslink as pl
import prslink as pl
In [2]:
Copied!
a= pl.PRS()
a= pl.PRS()
Thu Jun 13 12:28:18 2024 PRSLink v0.0.1 Thu Jun 13 12:28:18 2024 (C) 2023-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com
In [3]:
Copied!
a.add_score("./1kgeas.pT0.1.sscore", "#IID",["SCORE1_AVG"],["pT0.1"],sep="\s+")
a.add_score("./1kgeas.pT0.05.sscore", "#IID",["SCORE1_AVG"],["pT0.05"],sep="\s+")
a.add_score("./1kgeas.pT0.2.sscore", "#IID",["SCORE1_AVG"],["pT0.2"],sep="\s+")
a.add_score("./1kgeas.pT0.3.sscore", "#IID",["SCORE1_AVG"],["pT0.3"],sep="\s+")
a.add_score("./1kgeas.pT0.4.sscore", "#IID",["SCORE1_AVG"],["pT0.4"],sep="\s+")
a.add_score("./1kgeas.pT0.5.sscore", "#IID",["SCORE1_AVG"],["pT0.5"],sep="\s+")
a.add_score("./1kgeas.pT0.001.sscore","#IID",["SCORE1_AVG"],["pT0.01"],sep="\s+")
a.add_score("./1kgeas.pT0.1.sscore", "#IID",["SCORE1_AVG"],["pT0.1"],sep="\s+")
a.add_score("./1kgeas.pT0.05.sscore", "#IID",["SCORE1_AVG"],["pT0.05"],sep="\s+")
a.add_score("./1kgeas.pT0.2.sscore", "#IID",["SCORE1_AVG"],["pT0.2"],sep="\s+")
a.add_score("./1kgeas.pT0.3.sscore", "#IID",["SCORE1_AVG"],["pT0.3"],sep="\s+")
a.add_score("./1kgeas.pT0.4.sscore", "#IID",["SCORE1_AVG"],["pT0.4"],sep="\s+")
a.add_score("./1kgeas.pT0.5.sscore", "#IID",["SCORE1_AVG"],["pT0.5"],sep="\s+")
a.add_score("./1kgeas.pT0.001.sscore","#IID",["SCORE1_AVG"],["pT0.01"],sep="\s+")
Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (0, 1) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.1.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.1 Thu Jun 13 12:28:18 2024 - Overlapping IDs:0 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Added new IDs : 500 Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 2) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 2) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.05.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.05 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 3) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 3) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.2.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.2 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 4) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 4) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.3.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.3 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 5) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 5) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.4.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.4 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 6) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 6) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.5.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.5 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 7) Thu Jun 13 12:28:18 2024 Finished loading datasets... Thu Jun 13 12:28:18 2024 Start loading datasets... Thu Jun 13 12:28:18 2024 - Dataset shape before loading : (500, 7) Thu Jun 13 12:28:18 2024 - Loading score data from file: ./1kgeas.pT0.001.sscore Thu Jun 13 12:28:18 2024 - Setting ID:#IID Thu Jun 13 12:28:18 2024 - Loading score:SCORE1_AVG Thu Jun 13 12:28:18 2024 - Loaded columns: pT0.01 Thu Jun 13 12:28:18 2024 - Overlapping IDs:500 Thu Jun 13 12:28:18 2024 - Merging to main dataset... Thu Jun 13 12:28:18 2024 - Loading finished successfully! Thu Jun 13 12:28:18 2024 - Dataset shape after loading : (500, 8) Thu Jun 13 12:28:18 2024 Finished loading datasets...
In [4]:
Copied!
a.add_pheno("../01_Dataset/t2d/1kgeas_t2d.txt","IID",["T2D"],types="B",sep="\s+")
a.add_pheno("../01_Dataset/t2d/1kgeas_t2d.txt","IID",["T2D"],types="B",sep="\s+")
Thu Jun 13 12:28:19 2024 Start loading datasets... Thu Jun 13 12:28:19 2024 - Dataset shape before loading : (500, 8) Thu Jun 13 12:28:19 2024 - Loading pheno data from file: ../01_Dataset/t2d/1kgeas_t2d.txt Thu Jun 13 12:28:19 2024 - Setting ID:IID Thu Jun 13 12:28:19 2024 - Loading pheno:T2D Thu Jun 13 12:28:19 2024 - Loaded columns: T2D Thu Jun 13 12:28:19 2024 - Overlapping IDs:500 Thu Jun 13 12:28:19 2024 - Merging to main dataset... Thu Jun 13 12:28:19 2024 - Added new IDs : 4 Thu Jun 13 12:28:19 2024 - Loading finished successfully! Thu Jun 13 12:28:19 2024 - Dataset shape after loading : (504, 9) Thu Jun 13 12:28:19 2024 Finished loading datasets...
In [5]:
Copied!
a.add_covar("./1kgeas.eigenvec","IID",["PC1","PC2","PC3","PC4","PC5"],sep="\s+")
a.add_covar("./1kgeas.eigenvec","IID",["PC1","PC2","PC3","PC4","PC5"],sep="\s+")
Thu Jun 13 12:28:19 2024 Start loading datasets... Thu Jun 13 12:28:19 2024 - Dataset shape before loading : (504, 9) Thu Jun 13 12:28:19 2024 - Loading covar data from file: ./1kgeas.eigenvec Thu Jun 13 12:28:19 2024 - Setting ID:IID Thu Jun 13 12:28:19 2024 - Loading covar:PC1 PC2 PC3 PC4 PC5 Thu Jun 13 12:28:19 2024 - Loaded columns: PC1 PC2 PC3 PC4 PC5 Thu Jun 13 12:28:19 2024 - Overlapping IDs:500 Thu Jun 13 12:28:19 2024 - Merging to main dataset... Thu Jun 13 12:28:19 2024 - Loading finished successfully! Thu Jun 13 12:28:19 2024 - Dataset shape after loading : (504, 14) Thu Jun 13 12:28:19 2024 Finished loading datasets...
In [6]:
Copied!
# convert 1/2 coding to 0/1 coding
a.data["T2D"] = a.data["T2D"]-1
# convert 1/2 coding to 0/1 coding
a.data["T2D"] = a.data["T2D"]-1
In [7]:
Copied!
a.data
a.data
Out[7]:
IID | pT0.1 | pT0.05 | pT0.2 | pT0.3 | pT0.4 | pT0.5 | pT0.01 | T2D | PC1 | PC2 | PC3 | PC4 | PC5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HG00403 | 2.844550e-05 | -0.000010 | 0.000085 | 0.000070 | 0.000076 | 0.000060 | 1.331020e-05 | 0 | 0.002265 | 0.030121 | -0.009379 | 0.014346 | -0.000008 |
1 | HG00404 | 5.651720e-05 | 0.000035 | 0.000114 | 0.000111 | 0.000114 | 0.000091 | -5.712170e-05 | 1 | 0.000120 | 0.037074 | 0.005504 | 0.032108 | 0.003675 |
2 | HG00406 | -3.918720e-05 | -0.000098 | 0.000015 | 0.000037 | 0.000052 | 0.000030 | -3.058490e-04 | 0 | 0.005554 | 0.043812 | -0.001995 | -0.001842 | -0.000286 |
3 | HG00407 | -9.876060e-05 | -0.000142 | -0.000014 | 0.000012 | 0.000029 | 0.000014 | -9.035270e-04 | 1 | 0.005308 | 0.036650 | -0.013282 | -0.006384 | 0.004915 |
4 | HG00409 | 1.671570e-05 | 0.000030 | 0.000056 | 0.000054 | 0.000061 | 0.000037 | -4.403280e-04 | 1 | 0.000620 | 0.042519 | -0.006477 | -0.026379 | -0.012448 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
499 | NA19087 | 6.296510e-07 | -0.000033 | 0.000063 | 0.000083 | 0.000088 | 0.000068 | -3.409900e-04 | 0 | -0.075750 | -0.044500 | 0.019200 | 0.073495 | -0.000317 |
500 | NA19088 | 1.460160e-04 | 0.000170 | 0.000153 | 0.000141 | 0.000135 | 0.000107 | 6.891660e-04 | 0 | -0.076007 | -0.049376 | 0.004827 | 0.061414 | 0.002162 |
501 | NA19089 | -3.846640e-05 | -0.000047 | 0.000044 | 0.000053 | 0.000059 | 0.000038 | -6.212790e-04 | 0 | -0.077956 | -0.045994 | -0.019492 | -0.073673 | 0.015631 |
502 | NA19090 | 3.620130e-05 | 0.000042 | 0.000061 | 0.000085 | 0.000099 | 0.000071 | -3.948200e-04 | 0 | -0.065308 | -0.032709 | -0.010401 | -0.050258 | -0.006227 |
503 | NA19091 | 8.955840e-05 | 0.000044 | 0.000130 | 0.000130 | 0.000131 | 0.000109 | 1.222930e-07 | 0 | -0.076216 | -0.054380 | -0.010428 | -0.059694 | -0.007113 |
504 rows × 14 columns
In [8]:
Copied!
# drop missing values
a.data = a.data[a.data["T2D"]>=0]
# drop missing values
a.data = a.data[a.data["T2D"]>=0]
In [9]:
Copied!
# set population prevalence rate
a.set_k({"T2D":0.2})
# set population prevalence rate
a.set_k({"T2D":0.2})
In [10]:
Copied!
a.evaluate(a.pheno_cols,
a.score_cols,
a.covar_cols,
r2_lia=True)
a.evaluate(a.pheno_cols,
a.score_cols,
a.covar_cols,
r2_lia=True)
Thu Jun 13 12:28:24 2024 Strat to evaluate PRS performance... Optimization terminated successfully. Current function value: 0.665421 Iterations 5 Thu Jun 13 12:28:24 2024 - Fitting Null Logistic: T2D~PC1+PC2+PC3+PC4+PC5 Thu Jun 13 12:28:24 2024 - Fitting Full Logistic : T2D~pT0.1+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.659809 Iterations 5 Thu Jun 13 12:28:24 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:24 2024 - Fitting Full Logistic : T2D~pT0.05+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.657462 Iterations 5 Thu Jun 13 12:28:24 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:24 2024 - Fitting Full Logistic : T2D~pT0.2+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.658569 Iterations 5 Thu Jun 13 12:28:24 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:24 2024 - Fitting Full Logistic : T2D~pT0.3+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.657275 Iterations 5 Thu Jun 13 12:28:25 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:25 2024 - Fitting Full Logistic : T2D~pT0.4+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.656994 Iterations 5 Thu Jun 13 12:28:25 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:25 2024 - Fitting Full Logistic : T2D~pT0.5+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.657091 Iterations 5 Thu Jun 13 12:28:25 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:25 2024 - Fitting Full Logistic : T2D~pT0.01+PC1+PC2+PC3+PC4+PC5 Optimization terminated successfully. Current function value: 0.660222 Iterations 5 Thu Jun 13 12:28:25 2024 - Population prevalence for T2D : 0.2 Thu Jun 13 12:28:25 2024 - Fitted models have been stored into .bmodels and .qmodels Thu Jun 13 12:28:25 2024 - Results have been stored into .bresults and .qresults Thu Jun 13 12:28:25 2024 Finished evaluating PRS performance...
/home/yunye/anaconda3/envs/gwaslab_py39/lib/python3.9/site-packages/prslink/model_evaluate.py:424: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. bresults = pd.concat([bresults, row],ignore_index=True) /home/yunye/anaconda3/envs/gwaslab_py39/lib/python3.9/site-packages/prslink/g_PRS_object.py:157: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. return pd.concat([bresults,qresults],ignore_index=True)
Out[10]:
PHENO | TYPE | PRS | N_CASE | N | BETA | SE | CI_L | CI_U | P | ... | Delta_R2 | AUC_null | AUC_full | Delta_AUC | R2_lia_null | R2_lia_full | Delta_R2_lia | NULL_formula | FULL_formula | NORM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | T2D | B | pT0.01 | 199 | 498 | 0.215648 | 0.095424 | 0.028620 | 0.402675 | 0.023828 | ... | 0.013780 | 0.538226 | 0.581352 | 0.043125 | 0.019143 | 0.033083 | 0.013939 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.01+PC1+PC2+PC3+PC4+PC5 | True |
1 | T2D | B | pT0.1 | 199 | 498 | 0.231887 | 0.098979 | 0.037892 | 0.425882 | 0.019140 | ... | 0.014867 | 0.538226 | 0.579452 | 0.041226 | 0.019143 | 0.034344 | 0.015201 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.1+PC1+PC2+PC3+PC4+PC5 | True |
2 | T2D | B | pT0.2 | 199 | 498 | 0.260394 | 0.100672 | 0.063079 | 0.457708 | 0.009694 | ... | 0.018131 | 0.538226 | 0.581856 | 0.043630 | 0.019143 | 0.037758 | 0.018615 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.2+PC1+PC2+PC3+PC4+PC5 | True |
3 | T2D | B | pT0.05 | 199 | 498 | 0.272784 | 0.098132 | 0.080449 | 0.465120 | 0.005440 | ... | 0.021035 | 0.538226 | 0.590948 | 0.052722 | 0.019143 | 0.040672 | 0.021528 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.05+PC1+PC2+PC3+PC4+PC5 | True |
4 | T2D | B | pT0.3 | 199 | 498 | 0.285854 | 0.101471 | 0.086974 | 0.484734 | 0.004846 | ... | 0.021526 | 0.538226 | 0.585167 | 0.046940 | 0.019143 | 0.041221 | 0.022078 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.3+PC1+PC2+PC3+PC4+PC5 | True |
5 | T2D | B | pT0.5 | 199 | 498 | 0.291207 | 0.102281 | 0.090741 | 0.491673 | 0.004411 | ... | 0.022008 | 0.538226 | 0.586982 | 0.048755 | 0.019143 | 0.041654 | 0.022510 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.5+PC1+PC2+PC3+PC4+PC5 | True |
6 | T2D | B | pT0.4 | 199 | 498 | 0.290829 | 0.101559 | 0.091777 | 0.489881 | 0.004188 | ... | 0.022262 | 0.538226 | 0.586377 | 0.048150 | 0.019143 | 0.041912 | 0.022769 | T2D~PC1+PC2+PC3+PC4+PC5 | T2D~pT0.4+PC1+PC2+PC3+PC4+PC5 | True |
7 rows × 22 columns
In [11]:
Copied!
a.plot_roc(a.pheno_cols, a.score_cols, a.covar_cols)
a.plot_roc(a.pheno_cols, a.score_cols, a.covar_cols)
Thu Jun 13 12:28:25 2024 -Color map: tab10 Thu Jun 13 12:28:25 2024 -Color number: 7 (head: 0; tail: 0) Thu Jun 13 12:28:25 2024 -Color map norm: range(0, 7) Thu Jun 13 12:28:25 2024 -Color to use: ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2'] Optimization terminated successfully. Current function value: 0.665421 Iterations 5
In [12]:
Copied!
a.plot_prs(scores=a.score_cols,fig_args={"figsize":(10,5)})
a.plot_prs(scores=a.score_cols,fig_args={"figsize":(10,5)})
Thu Jun 13 12:28:26 2024 -Plot args: {} Thu Jun 13 12:28:26 2024 -Figure args: {'figsize': (10, 5)} Thu Jun 13 12:28:26 2024 -Color map: coolwarm Thu Jun 13 12:28:26 2024 -Color number: 2 (head: 0; tail: 0) Thu Jun 13 12:28:26 2024 -Color map norm: [0.0, 1.0] Thu Jun 13 12:28:26 2024 -Color to use: ['#3b4cc0', '#b40426']
In [13]:
Copied!
a.plot_prs(scores=a.score_cols,phenos=["T2D"],fig_args={"figsize":(10,5)})
a.plot_prs(scores=a.score_cols,phenos=["T2D"],fig_args={"figsize":(10,5)})
Thu Jun 13 12:28:27 2024 -Plot args: {} Thu Jun 13 12:28:27 2024 -Figure args: {'figsize': (10, 5)} Thu Jun 13 12:28:27 2024 -Color map: coolwarm Thu Jun 13 12:28:27 2024 -Color number: 2 (head: 0; tail: 0) Thu Jun 13 12:28:27 2024 -Color map norm: [0.0, 1.0] Thu Jun 13 12:28:27 2024 -Color to use: ['#3b4cc0', '#b40426']