set.seed(123456)
library(dplyr)Example pipeline
Working with data from Statistics Denmark (DST)
Below I will construct some DST-like datasets and show how a full data preprocessing pipeline might look.
Let’s construct some fake data
Population
n <- 2000
bef <- data.frame(
PNR = runif(n, min = 1e6, max = 1e7-1) |> floor() |> as.character(),
koen = rbinom(n, size = 1, prob = 0.49) + 1,
FOED_DAG = runif(n, as.Date("1940-01-01"), as.Date("1959-12-31")) |> floor() |> as.Date()
)Preview:
bef |> head(3) |> gt::gt()| PNR | koen | FOED_DAG |
|---|---|---|
| 8180058 | 2 | 1955-09-19 |
| 7782085 | 2 | 1947-06-21 |
| 4521300 | 1 | 1959-04-11 |
LMDB
# Making an API call to a website that sends back all existing ATC-codes + corresponding drug names.
ATC_codes <- httr2::request("https://pgx-db.org/rest-api/atc/atc_code/CS/") |>
httr2::req_headers(
accept = "application/json",
`X-CSRFToken`= "3T14DJ7uB6gqdVnsbPclbAonNeAAiP4rAA47TXe3lsagl9PGOlPM9BKNhIFughWy"
) |>
httr2::req_perform() |>
httr2::resp_body_json() |>
dplyr::bind_rows()
# Select a subset
ATC_select_glucose_lowering <- ATC_codes |>
rename(ATC = `ATC code`) |>
filter(grepl("^A10BA02|^A10BB12|^A10BH0[1,2,3]|^A10BJ0[2,6]|^A10BK0[1-4]", ATC))
# Simulate prescriptions
n_lmdb <- n * 10
lmdb <- sample_n(ATC_select_glucose_lowering,
size = n_lmdb,
replace=T) |>
mutate(PNR = sample(bef$PNR, size = n_lmdb, replace=T),
EKSD = runif(n_lmdb, as.Date("2000-01-01"), as.Date("2020-12-31")) |> floor() |> as.Date(),
APK = runif(n_lmdb, 1, 4) |> floor(),
PACKSIZE = runif(n_lmdb, 1, 4) |> floor() * 10,
STRNUM = round(rnorm(n_lmdb, 15, 4)),
VOLUME = PACKSIZE * APK)Preview:
lmdb |> head(3) |> gt::gt()| ATC | Description | PNR | EKSD | APK | PACKSIZE | STRNUM | VOLUME |
|---|---|---|---|---|---|---|---|
| A10BB12 | Glimepiride | 8913161 | 2010-04-16 | 2 | 20 | 12 | 40 |
| A10BJ06 | Semaglutide | 7155623 | 2005-01-23 | 3 | 10 | 19 | 30 |
| A10BJ02 | Liraglutide | 2448276 | 2005-03-13 | 2 | 20 | 23 | 40 |
LPR2
n_lpr <- n * 30
hip_injury_icd <- c("S70.0", "S71.0", "S72.0", "S72.1", "S72.2", "S72.3", "S72.4", "S72.7", "S72.8", "S72.9")
lpr_adm <- data.frame(
PNR = sample(bef$PNR, size = n_lpr, replace = T),
RECNUM = runif(n_lpr, min = 1e9, max = 1e10-1) |> floor() |> unique(),
D_INDDTO = runif(n_lpr, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor() |> as.Date(),
C_ADIAG = sample(hip_injury_icd, n_lpr, replace=T)
)
lpr_diag <- lpr_adm |>
select(RECNUM, C_ADIAG) |>
rename(C_DIAG = C_ADIAG) |>
mutate(C_DIAGTYPE = "A") |>
# Add in a B-diagnosis for a third of people
rbind(
data.frame(
RECNUM = sample(lpr_adm$RECNUM, n_lpr/3),
C_DIAG = sample(hip_injury_icd, n_lpr/3, replace=T),
C_DIAGTYPE = "B"
)
)Previews:
lpr_subset <- lpr_adm$RECNUM |> head(3)
lpr_adm |> filter(RECNUM %in% lpr_subset) |> gt::gt()| PNR | RECNUM | D_INDDTO | C_ADIAG |
|---|---|---|---|
| 7465102 | 7161939417 | 2003-03-07 | S72.4 |
| 4352898 | 3238130082 | 2005-04-19 | S72.4 |
| 3752898 | 3030336058 | 2009-03-18 | S72.1 |
lpr_diag |> filter(RECNUM %in% lpr_subset) |> gt::gt()| RECNUM | C_DIAG | C_DIAGTYPE |
|---|---|---|
| 7161939417 | S72.4 | A |
| 3238130082 | S72.4 | A |
| 3030336058 | S72.1 | A |
| 3030336058 | S72.8 | B |
LAB
n_lab <- n * 40
n_lab_missing <- floor(n / 10)
lab_dm_forsker <- data.frame(
PNR = sample(bef$PNR, size = n - n_lab_missing) |> sample(size = n_lab, replace = T),
SAMPLINGDATE = runif(n_lab, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor() |> as.Date(),
ANALYSISCODE = rep("eGFR", n_lab)
) |>
left_join(bef |> select(c(PNR, FOED_DAG)), by = "PNR") |>
mutate(
VALUE = rnorm(n_lab,
mean = 100 - as.numeric((as.Date("2000-01-01") - as.Date(FOED_DAG))/3650)^10/(2^20) |> round(2),
sd = 10) |> round()
) |>
select(-c(FOED_DAG))Preview:
lab_dm_forsker |> head(3) |> gt::gt()| PNR | SAMPLINGDATE | ANALYSISCODE | VALUE |
|---|---|---|---|
| 8172410 | 2006-12-10 | eGFR | 101 |
| 2697955 | 2004-02-22 | eGFR | 38 |
| 1088015 | 2009-09-23 | eGFR | 98 |
Putting things together
Identify first hip fracture
lpr <- left_join(lpr_adm, lpr_diag, by = "RECNUM") |>
# Only care about A-diagnoses pertaining to hip fracture (S72...)
filter(C_DIAGTYPE == "A",
grepl("^S72", C_DIAG)
) |>
# For each person, keep only first date
group_by(PNR) |>
filter(D_INDDTO == min(D_INDDTO)) |>
# Keep one row if multiple diagnoses on the same date
slice(1) |>
ungroup() # Always best to ungroup in the endPreview:
lpr |> head(3) |> gt::gt()| PNR | RECNUM | D_INDDTO | C_ADIAG | C_DIAG | C_DIAGTYPE |
|---|---|---|---|---|---|
| 1000746 | 5843620927 | 2000-05-07 | S72.0 | S72.0 | A |
| 1006023 | 7507439271 | 2000-01-03 | S72.1 | S72.1 | A |
| 1006164 | 1399413324 | 2000-04-04 | S72.0 | S72.0 | A |
Add background info
pop <- left_join(lpr, bef, by = "PNR") |>
# calculate age in days, divide into years, round to two decimals
mutate(age = ((D_INDDTO - FOED_DAG) / 365.25) |> as.numeric() |> round(2)) |>
select(c(PNR, D_INDDTO, FOED_DAG, koen, age)) |>
rename(index_date = D_INDDTO)Preview:
pop |> head(3) |> gt::gt()| PNR | index_date | FOED_DAG | koen | age |
|---|---|---|---|---|
| 1000746 | 2000-05-07 | 1947-04-20 | 1 | 53.05 |
| 1006023 | 2000-01-03 | 1957-02-07 | 1 | 42.90 |
| 1006164 | 2000-04-04 | 1947-04-19 | 2 | 52.96 |
Get last eGFR before fracture
last_eGFR <- left_join(pop, lab_dm_forsker, by = "PNR")
# Note, approx 10% of individuals have never had an eGFR measured; they now have NAs
last_eGFR <- last_eGFR |>
# Get eGFRs prior to fracture
filter(SAMPLINGDATE <= index_date) |>
# keep only last eGFR for each person
group_by(PNR) |>
filter(SAMPLINGDATE == max(SAMPLINGDATE)) |>
slice(1) |>
ungroup() |>
# keep only necessary columns to add back onto the full pop
select(c(PNR, SAMPLINGDATE, VALUE)) |>
rename(eGFR_val = VALUE, eGFR_date = SAMPLINGDATE)
pop <- left_join(pop, last_eGFR, by ="PNR")Previews:
last_eGFR |> head(3) |> gt::gt()| PNR | eGFR_date | eGFR_val |
|---|---|---|
| 1000746 | 2000-04-19 | 80 |
| 1010815 | 2000-01-17 | 107 |
| 1019932 | 2001-03-12 | 107 |
pop |> head(3) |> gt::gt()| PNR | index_date | FOED_DAG | koen | age | eGFR_date | eGFR_val |
|---|---|---|---|---|---|---|
| 1000746 | 2000-05-07 | 1947-04-20 | 1 | 53.05 | 2000-04-19 | 80 |
| 1006023 | 2000-01-03 | 1957-02-07 | 1 | 42.90 | NA | NA |
| 1006164 | 2000-04-04 | 1947-04-19 | 2 | 52.96 | NA | NA |
Get first glucose-lowering drug prescription
lmdb_first_gld <- lmdb |>
# For each person, only keep the earliest date of redemption
group_by(PNR) |>
filter(EKSD == min(EKSD)) |>
# keep one line per person
slice(1) |>
ungroup() |>
select(c(PNR, EKSD))Preview:
lmdb_first_gld |> head(3) |> gt::gt()| PNR | EKSD |
|---|---|
| 1000746 | 2000-12-27 |
| 1006023 | 2000-07-12 |
| 1006164 | 2000-11-09 |
Add in medication data and remove individuals with prior drug exposure
pop <- left_join(pop, lmdb_first_gld, by ="PNR")
# Only keep people with no prior glucose-lowering drug use before hip fracture
pop <- pop |>
filter(is.na(EKSD) | index_date <= EKSD) |>
select(-c(EKSD))Preview:
pop |> head(3) |> gt::gt()| PNR | index_date | FOED_DAG | koen | age | eGFR_date | eGFR_val |
|---|---|---|---|---|---|---|
| 1000746 | 2000-05-07 | 1947-04-20 | 1 | 53.05 | 2000-04-19 | 80 |
| 1006023 | 2000-01-03 | 1957-02-07 | 1 | 42.90 | NA | NA |
| 1006164 | 2000-04-04 | 1947-04-19 | 2 | 52.96 | NA | NA |
Get first metformin prescription
lmdb_metformin <- lmdb |>
filter(ATC == "A10BA02")
lmdb_metformin <- lmdb_metformin |>
# For each person, only keep the earliest date of redemption
group_by(PNR) |>
filter(EKSD == min(EKSD)) |>
# For each prescription, multiply the pill dosage by the number of pills to get a total dose and sum it over all prescriptions for that date (in case of multiple redemptions same day) to get a grand total
mutate(no_of_pills = sum(APK * PACKSIZE),
total_dose = sum(APK * PACKSIZE * STRNUM)) |>
# keep one line per person
slice(1) |>
ungroup() |>
select(c(PNR, EKSD, no_of_pills, total_dose)) |>
rename(metf_date = EKSD,
metf_no_of_pills = no_of_pills,
metf_total_dose = total_dose)Preview:
lmdb_metformin |> head(3) |> gt::gt()| PNR | metf_date | metf_no_of_pills | metf_total_dose |
|---|---|---|---|
| 1010815 | 2005-07-08 | 30 | 360 |
| 1019932 | 2001-02-20 | 60 | 840 |
| 1024746 | 2019-11-14 | 30 | 540 |
Add in metformin data
pop <- left_join(pop, lmdb_metformin, by ="PNR")Preview:
pop |> head(3) |> gt::gt()| PNR | index_date | FOED_DAG | koen | age | eGFR_date | eGFR_val | metf_date | metf_no_of_pills | metf_total_dose |
|---|---|---|---|---|---|---|---|---|---|
| 1000746 | 2000-05-07 | 1947-04-20 | 1 | 53.05 | 2000-04-19 | 80 | NA | NA | NA |
| 1006023 | 2000-01-03 | 1957-02-07 | 1 | 42.90 | NA | NA | NA | NA | NA |
| 1006164 | 2000-04-04 | 1947-04-19 | 2 | 52.96 | NA | NA | NA | NA | NA |
Calculate time to metformin
pop <- pop |>
mutate(tte_metf = difftime(metf_date, index_date))Preview:
pop |> head(3) |> gt::gt()| PNR | index_date | FOED_DAG | koen | age | eGFR_date | eGFR_val | metf_date | metf_no_of_pills | metf_total_dose | tte_metf |
|---|---|---|---|---|---|---|---|---|---|---|
| 1000746 | 2000-05-07 | 1947-04-20 | 1 | 53.05 | 2000-04-19 | 80 | NA | NA | NA | NA |
| 1006023 | 2000-01-03 | 1957-02-07 | 1 | 42.90 | NA | NA | NA | NA | NA | NA |
| 1006164 | 2000-04-04 | 1947-04-19 | 2 | 52.96 | NA | NA | NA | NA | NA | NA |
Now you can start analysing the data…
library(ggplot2)
ggplot(pop, aes(x=age, y = eGFR_val)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ splines::bs(x, 3)) +
theme_bw()Warning: Removed 750 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 750 rows containing missing values or values outside the scale range
(`geom_point()`).
