Example pipeline

Working with data from Statistics Denmark (DST)

Here, we’ll use the simulated data from the previous note to show how a full data preprocessing pipeline might look. In practice, you would load in the data from DST using the appropriate functions (e.g. read_dta, read_csv, etc.) and then follow a similar pipeline to clean and prepare the data for analysis.

library(dplyr)

Loading in the simulated data here, but in practice you would load in the data from DST using the appropriate functions (e.g. read_dta, read_csv, etc.):

load("simulated_data.RData")

Identify first hip fracture

lpr <- left_join(lpr_adm, lpr_diag, by = "RECNUM") |>
  # Only care about A-diagnoses pertaining to hip fracture (S72...)
  filter(C_DIAGTYPE == "A",
         grepl("^S72", C_DIAG)
  ) |>
  # For each person, keep only first date
  group_by(PNR) |>
  filter(D_INDDTO == min(D_INDDTO)) |>
  # Keep one row if multiple diagnoses on the same date
  slice(1) |>
  ungroup() # Always best to ungroup in the end

Preview:

lpr |> head(3) |> gt::gt(id = "lpr")
PNR RECNUM D_INDDTO C_ADIAG C_DIAG C_DIAGTYPE
1000746 4520098676 2000-08-26 S72.7 S72.7 A
1006023 3028465586 2000-01-13 S72.7 S72.7 A
1006164 6616548853 2000-02-13 S72.1 S72.1 A

Add background info

pop <- left_join(lpr, bef, by = "PNR") |>
  # calculate age in days, divide into years, round to two decimals
  mutate(age = ((D_INDDTO - FOED_DAG) / 365.25) |> as.numeric() |> round(2)) |>
  select(c(PNR, D_INDDTO, FOED_DAG, koen, age)) |>
  rename(index_date = D_INDDTO)

Preview:

pop |> head(3) |> gt::gt(id = "pop")
PNR index_date FOED_DAG koen age
1000746 2000-08-26 1947-04-20 1 53.35
1006023 2000-01-13 1957-02-07 1 42.93
1006164 2000-02-13 1947-04-19 2 52.82

Get last eGFR before fracture

last_eGFR <- left_join(pop, lab_dm_forsker, by = "PNR")

Previews:

last_eGFR |> head(3) |> gt::gt(id = "last_eGFR")
PNR index_date FOED_DAG koen age SAMPLINGDATE ANALYSISCODE VALUE
1000746 2000-08-26 1947-04-20 1 53.35 2008-10-14 eGFR 90
1000746 2000-08-26 1947-04-20 1 53.35 2003-04-19 eGFR 92
1000746 2000-08-26 1947-04-20 1 53.35 2007-07-26 eGFR 104

Note, approx 10% of individuals have never had an eGFR measured; they now have NAs for the lab values, which is realistic and something you will have to deal with in real data.

Also, “ANALYSISCODE” is currently just “eGFR” for all rows, but in real data you would have many different types of lab measurements, each represented not with readable names but with NPU-codes.

last_eGFR <- last_eGFR |>
  # Get eGFRs prior to fracture
  filter(SAMPLINGDATE <= index_date) |>
  # keep only last eGFR for each person
  group_by(PNR) |>
  filter(SAMPLINGDATE == max(SAMPLINGDATE)) |>
  slice(1) |>
  ungroup() |>
  # keep only necessary columns to add back onto the full pop
  select(c(PNR, SAMPLINGDATE, VALUE)) |>
  rename(eGFR_val = VALUE, eGFR_date = SAMPLINGDATE)

pop <- left_join(pop, last_eGFR, by ="PNR")

Previews:

last_eGFR |> head(3) |> gt::gt(id = "last_eGFR2")
PNR eGFR_date eGFR_val
1000746 2000-06-19 67
1006164 2000-01-25 71
1010815 2000-05-29 79
pop |> head(3) |> gt::gt(id = "pop2")
PNR index_date FOED_DAG koen age eGFR_date eGFR_val
1000746 2000-08-26 1947-04-20 1 53.35 2000-06-19 67
1006023 2000-01-13 1957-02-07 1 42.93 NA NA
1006164 2000-02-13 1947-04-19 2 52.82 2000-01-25 71

Get first glucose-lowering drug prescription

lmdb_first_gld <- lmdb |>
  # only get glucose-lowering drugs (ATC code starting with A10BA02, A10BB12, A10BH0[1,2,3], A10BJ0[2,6], or A10BK0[1-4])
  filter(grepl("^A10BA02|^A10BB12|^A10BH0[1,2,3]|^A10BJ0[2,6]|^A10BK0[1-4]", ATC)) |>
  # For each person, only keep the earliest date of redemption
  group_by(PNR) |>
  filter(EKSD == min(EKSD)) |>
  # keep one line per person
  slice(1) |>
  ungroup() |>
  select(c(PNR, EKSD))

Preview:

lmdb_first_gld |> head(3) |> gt::gt(id = "lmdb_first_gld")
PNR EKSD
1000746 2001-04-25
1006023 2006-03-21
1006164 2000-08-04

Add in medication data and remove individuals with prior drug exposure

pop <- left_join(pop, lmdb_first_gld, by ="PNR")

# Only keep people with no prior glucose-lowering drug use before hip fracture
pop <- pop |>
  filter(is.na(EKSD) | index_date <= EKSD) |>
  select(-c(EKSD))

Preview:

pop |> head(3) |> gt::gt(id = "pop3")
PNR index_date FOED_DAG koen age eGFR_date eGFR_val
1000746 2000-08-26 1947-04-20 1 53.35 2000-06-19 67
1006023 2000-01-13 1957-02-07 1 42.93 NA NA
1006164 2000-02-13 1947-04-19 2 52.82 2000-01-25 71

Get first metformin prescription

lmdb_metformin <- lmdb |>
  filter(ATC == "A10BA02")

lmdb_metformin <- lmdb_metformin |>
  # For each person, only keep the earliest date of redemption
  group_by(PNR) |>
  filter(EKSD == min(EKSD)) |>
  # For each prescription, multiply the pill dosage by the number of pills to get a total dose and sum it over all prescriptions for that date (in case of multiple redemptions same day) to get a grand total
  mutate(no_of_pills = sum(APK * PACKSIZE),
         total_dose = sum(APK * PACKSIZE * STRNUM)) |>
  # keep one line per person
  slice(1) |>
  ungroup() |>
  select(c(PNR, EKSD, no_of_pills, total_dose)) |>
  rename(metf_date = EKSD,
         metf_no_of_pills = no_of_pills,
         metf_total_dose = total_dose)

Preview:

lmdb_metformin |> head(3) |> gt::gt(id = "lmdb_metformin")
PNR metf_date metf_no_of_pills metf_total_dose
1000746 2014-01-18 90 1260
1019932 2017-10-14 90 1170
1033372 2008-05-08 40 600

Add in metformin data

pop <- left_join(pop, lmdb_metformin, by ="PNR")

Preview:

pop |> head(3) |> gt::gt(id = "pop4")
PNR index_date FOED_DAG koen age eGFR_date eGFR_val metf_date metf_no_of_pills metf_total_dose
1000746 2000-08-26 1947-04-20 1 53.35 2000-06-19 67 2014-01-18 90 1260
1006023 2000-01-13 1957-02-07 1 42.93 NA NA NA NA NA
1006164 2000-02-13 1947-04-19 2 52.82 2000-01-25 71 NA NA NA

Calculate time to metformin

pop <- pop |>
  mutate(tte_metf = difftime(metf_date, index_date))

Preview:

pop |> head(3) |> gt::gt(id = "pop5")
PNR index_date FOED_DAG koen age eGFR_date eGFR_val metf_date metf_no_of_pills metf_total_dose tte_metf
1000746 2000-08-26 1947-04-20 1 53.35 2000-06-19 67 2014-01-18 90 1260 4893
1006023 2000-01-13 1957-02-07 1 42.93 NA NA NA NA NA NA
1006164 2000-02-13 1947-04-19 2 52.82 2000-01-25 71 NA NA NA NA

Now you can start analysing the data…

library(ggplot2)
ggplot(pop, aes(x=age, y = eGFR_val)) +
  geom_point() +
  geom_smooth(method = lm, formula = y ~ splines::bs(x, 3)) +
  theme_bw()
Warning: Removed 753 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 753 rows containing missing values or values outside the scale range
(`geom_point()`).

Or save the cleaned dataset for later use

saveRDS(pop, "cleaned_data.rds")

Making it easy to load the dataset in at a later time without having to repeat the whole pipeline:

pop <- readRDS("cleaned_data.rds")