Example pipeline

Working with data from Statistics Denmark (DST)

Below I will construct some DST-like datasets and show how a full data preprocessing pipeline might look.

set.seed(123456)
library(dplyr)

Let’s construct some fake data

Population

n <- 2000
bef <- data.frame(
  PNR = runif(n, min = 1e6, max = 1e7-1) |> floor() |> as.character(),
  koen = rbinom(n, size = 1, prob = 0.49) + 1,
  FOED_DAG = runif(n, as.Date("1940-01-01"), as.Date("1959-12-31")) |> floor()  |> as.Date()
)

Preview:

bef |> head(3) |> gt::gt()
PNR koen FOED_DAG
8180058 2 1955-09-19
7782085 2 1947-06-21
4521300 1 1959-04-11

LMDB

# Making an API call to a website that sends back all existing ATC-codes + corresponding drug names.
ATC_codes <- httr2::request("https://pgx-db.org/rest-api/atc/atc_code/CS/") |>
  httr2::req_headers(
    accept = "application/json",
    `X-CSRFToken`= "3T14DJ7uB6gqdVnsbPclbAonNeAAiP4rAA47TXe3lsagl9PGOlPM9BKNhIFughWy"
  ) |>
  httr2::req_perform() |>
  httr2::resp_body_json() |> 
  dplyr::bind_rows()

# Select a subset
ATC_select_glucose_lowering <- ATC_codes |> 
  rename(ATC = `ATC code`) |>
  filter(grepl("^A10BA02|^A10BB12|^A10BH0[1,2,3]|^A10BJ0[2,6]|^A10BK0[1-4]", ATC))

# Simulate prescriptions
n_lmdb <- n * 10
lmdb <- sample_n(ATC_select_glucose_lowering,
               size = n_lmdb,
               replace=T) |>
  mutate(PNR = sample(bef$PNR, size = n_lmdb, replace=T),
         EKSD = runif(n_lmdb, as.Date("2000-01-01"), as.Date("2020-12-31")) |> floor()  |> as.Date(),
         APK = runif(n_lmdb, 1, 4) |> floor(),
         PACKSIZE = runif(n_lmdb, 1, 4) |> floor() * 10,
         STRNUM = round(rnorm(n_lmdb, 15, 4)),
         VOLUME = PACKSIZE * APK)

Preview:

lmdb |> head(3) |> gt::gt()
ATC Description PNR EKSD APK PACKSIZE STRNUM VOLUME
A10BB12 Glimepiride 8913161 2010-04-16 2 20 12 40
A10BJ06 Semaglutide 7155623 2005-01-23 3 10 19 30
A10BJ02 Liraglutide 2448276 2005-03-13 2 20 23 40

LPR2

n_lpr <- n * 30
hip_injury_icd <- c("S70.0", "S71.0", "S72.0", "S72.1", "S72.2", "S72.3", "S72.4", "S72.7", "S72.8", "S72.9")
lpr_adm <- data.frame(
  PNR = sample(bef$PNR, size = n_lpr, replace = T),
  RECNUM = runif(n_lpr, min = 1e9, max = 1e10-1) |> floor() |> unique(),
  D_INDDTO = runif(n_lpr, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor() |> as.Date(),
  C_ADIAG = sample(hip_injury_icd, n_lpr, replace=T)
)

lpr_diag <- lpr_adm |> 
  select(RECNUM, C_ADIAG) |>
  rename(C_DIAG = C_ADIAG) |>
  mutate(C_DIAGTYPE = "A") |>
  # Add in a B-diagnosis for a third of people
  rbind(
    data.frame(
      RECNUM = sample(lpr_adm$RECNUM, n_lpr/3),
      C_DIAG = sample(hip_injury_icd, n_lpr/3, replace=T),
      C_DIAGTYPE = "B"
    )
  )

Previews:

lpr_subset <- lpr_adm$RECNUM |> head(3)
lpr_adm |> filter(RECNUM %in% lpr_subset) |> gt::gt()
PNR RECNUM D_INDDTO C_ADIAG
7465102 7161939417 2003-03-07 S72.4
4352898 3238130082 2005-04-19 S72.4
3752898 3030336058 2009-03-18 S72.1
lpr_diag |> filter(RECNUM %in% lpr_subset) |> gt::gt()
RECNUM C_DIAG C_DIAGTYPE
7161939417 S72.4 A
3238130082 S72.4 A
3030336058 S72.1 A
3030336058 S72.8 B

LAB

n_lab <- n * 40
n_lab_missing <- floor(n / 10)
lab_dm_forsker <- data.frame(
  PNR = sample(bef$PNR, size = n - n_lab_missing) |> sample(size = n_lab, replace = T),
  SAMPLINGDATE = runif(n_lab, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor()  |> as.Date(),
  ANALYSISCODE = rep("eGFR", n_lab)
) |>
  left_join(bef |> select(c(PNR, FOED_DAG)), by = "PNR") |>
  mutate(
    VALUE = rnorm(n_lab, 
                  mean = 100 - as.numeric((as.Date("2000-01-01") - as.Date(FOED_DAG))/3650)^10/(2^20) |> round(2), 
                  sd = 10) |> round()
  ) |>
  select(-c(FOED_DAG))

Preview:

lab_dm_forsker |> head(3) |> gt::gt()
PNR SAMPLINGDATE ANALYSISCODE VALUE
8172410 2006-12-10 eGFR 101
2697955 2004-02-22 eGFR 38
1088015 2009-09-23 eGFR 98

Putting things together

Identify first hip fracture

lpr <- left_join(lpr_adm, lpr_diag, by = "RECNUM") |>
  # Only care about A-diagnoses pertaining to hip fracture (S72...)
  filter(C_DIAGTYPE == "A",
         grepl("^S72", C_DIAG)
  ) |>
  # For each person, keep only first date
  group_by(PNR) |>
  filter(D_INDDTO == min(D_INDDTO)) |>
  # Keep one row if multiple diagnoses on the same date
  slice(1) |>
  ungroup() # Always best to ungroup in the end

Preview:

lpr |> head(3) |> gt::gt()
PNR RECNUM D_INDDTO C_ADIAG C_DIAG C_DIAGTYPE
1000746 5843620927 2000-05-07 S72.0 S72.0 A
1006023 7507439271 2000-01-03 S72.1 S72.1 A
1006164 1399413324 2000-04-04 S72.0 S72.0 A

Add background info

pop <- left_join(lpr, bef, by = "PNR") |>
  # calculate age in days, divide into years, round to two decimals
  mutate(age = ((D_INDDTO - FOED_DAG) / 365.25) |> as.numeric() |> round(2)) |>
  select(c(PNR, D_INDDTO, FOED_DAG, koen, age)) |>
  rename(index_date = D_INDDTO)

Preview:

pop |> head(3) |> gt::gt()
PNR index_date FOED_DAG koen age
1000746 2000-05-07 1947-04-20 1 53.05
1006023 2000-01-03 1957-02-07 1 42.90
1006164 2000-04-04 1947-04-19 2 52.96

Get last eGFR before fracture

last_eGFR <- left_join(pop, lab_dm_forsker, by = "PNR")
# Note, approx 10% of individuals have never had an eGFR measured; they now have NAs

last_eGFR <- last_eGFR |>
  # Get eGFRs prior to fracture
  filter(SAMPLINGDATE <= index_date) |>
  # keep only last eGFR for each person
  group_by(PNR) |>
  filter(SAMPLINGDATE == max(SAMPLINGDATE)) |>
  slice(1) |>
  ungroup() |>
  # keep only necessary columns to add back onto the full pop
  select(c(PNR, SAMPLINGDATE, VALUE)) |>
  rename(eGFR_val = VALUE, eGFR_date = SAMPLINGDATE)

pop <- left_join(pop, last_eGFR, by ="PNR")

Previews:

last_eGFR |> head(3) |> gt::gt()
PNR eGFR_date eGFR_val
1000746 2000-04-19 80
1010815 2000-01-17 107
1019932 2001-03-12 107
pop |> head(3) |> gt::gt()
PNR index_date FOED_DAG koen age eGFR_date eGFR_val
1000746 2000-05-07 1947-04-20 1 53.05 2000-04-19 80
1006023 2000-01-03 1957-02-07 1 42.90 NA NA
1006164 2000-04-04 1947-04-19 2 52.96 NA NA

Get first glucose-lowering drug prescription

lmdb_first_gld <- lmdb |>
  # For each person, only keep the earliest date of redemption
  group_by(PNR) |>
  filter(EKSD == min(EKSD)) |>
  # keep one line per person
  slice(1) |>
  ungroup() |>
  select(c(PNR, EKSD))

Preview:

lmdb_first_gld |> head(3) |> gt::gt()
PNR EKSD
1000746 2000-12-27
1006023 2000-07-12
1006164 2000-11-09

Add in medication data and remove individuals with prior drug exposure

pop <- left_join(pop, lmdb_first_gld, by ="PNR")

# Only keep people with no prior glucose-lowering drug use before hip fracture
pop <- pop |>
  filter(is.na(EKSD) | index_date <= EKSD) |>
  select(-c(EKSD))

Preview:

pop |> head(3) |> gt::gt()
PNR index_date FOED_DAG koen age eGFR_date eGFR_val
1000746 2000-05-07 1947-04-20 1 53.05 2000-04-19 80
1006023 2000-01-03 1957-02-07 1 42.90 NA NA
1006164 2000-04-04 1947-04-19 2 52.96 NA NA

Get first metformin prescription

lmdb_metformin <- lmdb |>
  filter(ATC == "A10BA02")

lmdb_metformin <- lmdb_metformin |>
  # For each person, only keep the earliest date of redemption
  group_by(PNR) |>
  filter(EKSD == min(EKSD)) |>
  # For each prescription, multiply the pill dosage by the number of pills to get a total dose and sum it over all prescriptions for that date (in case of multiple redemptions same day) to get a grand total
  mutate(no_of_pills = sum(APK * PACKSIZE),
         total_dose = sum(APK * PACKSIZE * STRNUM)) |>
  # keep one line per person
  slice(1) |>
  ungroup() |>
  select(c(PNR, EKSD, no_of_pills, total_dose)) |>
  rename(metf_date = EKSD,
         metf_no_of_pills = no_of_pills,
         metf_total_dose = total_dose)

Preview:

lmdb_metformin |> head(3) |> gt::gt()
PNR metf_date metf_no_of_pills metf_total_dose
1010815 2005-07-08 30 360
1019932 2001-02-20 60 840
1024746 2019-11-14 30 540

Add in metformin data

pop <- left_join(pop, lmdb_metformin, by ="PNR")

Preview:

pop |> head(3) |> gt::gt()
PNR index_date FOED_DAG koen age eGFR_date eGFR_val metf_date metf_no_of_pills metf_total_dose
1000746 2000-05-07 1947-04-20 1 53.05 2000-04-19 80 NA NA NA
1006023 2000-01-03 1957-02-07 1 42.90 NA NA NA NA NA
1006164 2000-04-04 1947-04-19 2 52.96 NA NA NA NA NA

Calculate time to metformin

pop <- pop |>
  mutate(tte_metf = difftime(metf_date, index_date))

Preview:

pop |> head(3) |> gt::gt()
PNR index_date FOED_DAG koen age eGFR_date eGFR_val metf_date metf_no_of_pills metf_total_dose tte_metf
1000746 2000-05-07 1947-04-20 1 53.05 2000-04-19 80 NA NA NA NA
1006023 2000-01-03 1957-02-07 1 42.90 NA NA NA NA NA NA
1006164 2000-04-04 1947-04-19 2 52.96 NA NA NA NA NA NA

Now you can start analysing the data…

library(ggplot2)
ggplot(pop, aes(x=age, y = eGFR_val)) +
  geom_point() +
  geom_smooth(method = lm, formula = y ~ splines::bs(x, 3)) +
  theme_bw()
Warning: Removed 750 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 750 rows containing missing values or values outside the scale range
(`geom_point()`).