Simulate some data for further work

Author

Zheer Kejlberg Al-Mashhadi

Published

March 19, 2026

Below I will construct some DST-like datasets and show how a full data preprocessing pipeline might look.

set.seed(123456)
library(dplyr)

Let’s construct some fake data

Population

n <- 2000
bef <- data.frame(
  PNR = runif(n, min = 1e6, max = 1e7-1) |> floor() |> as.character(),
  koen = rbinom(n, size = 1, prob = 0.49) + 1,
  FOED_DAG = runif(n, as.Date("1940-01-01"), as.Date("1959-12-31")) |> floor()  |> as.Date()
)

Preview:

bef |> head(3) |> gt::gt()
PNR koen FOED_DAG
8180058 2 1955-09-19
7782085 2 1947-06-21
4521300 1 1959-04-11

LMDB

# Making an API call to a website that sends back all existing ATC-codes + corresponding drug names.
ATC_codes <- httr2::request("https://pgx-db.org/rest-api/atc/atc_code/CS/") |>
  httr2::req_headers(
    accept = "application/json",
    `X-CSRFToken`= "3T14DJ7uB6gqdVnsbPclbAonNeAAiP4rAA47TXe3lsagl9PGOlPM9BKNhIFughWy"
  ) |>
  httr2::req_perform() |>
  httr2::resp_body_json() |> 
  dplyr::bind_rows()

# Select a subset
ATC_select_A <- ATC_codes |> 
  rename(ATC = `ATC code`) |>
  filter(grepl("^A", ATC))

# Simulate prescriptions
n_lmdb <- n * 600
lmdb <- sample_n(ATC_select_A,
               size = n_lmdb,
               replace=T) |>
  mutate(PNR = sample(bef$PNR, size = n_lmdb, replace=T),
         EKSD = runif(n_lmdb, as.Date("2000-01-01"), as.Date("2020-12-31")) |> floor()  |> as.Date(),
         APK = runif(n_lmdb, 1, 4) |> floor(),
         PACKSIZE = runif(n_lmdb, 1, 4) |> floor() * 10,
         STRNUM = round(rnorm(n_lmdb, 15, 4)),
         VOLUME = PACKSIZE * APK)

Preview:

lmdb |> head(3) |> gt::gt()
ATC Description PNR EKSD APK PACKSIZE STRNUM VOLUME
A11HA03 Tocopherol (vit e) 2265313 2014-10-27 3 20 11 60
A03FA05 Alizapride 7571615 2013-08-25 3 10 11 30
A10AD06 Insulin degludec and insulin aspart 5854842 2000-11-01 3 10 2 30

LPR2

n_lpr <- n * 30
hip_injury_icd <- c("S70.0", "S71.0", "S72.0", "S72.1", "S72.2", "S72.3", "S72.4", "S72.7", "S72.8", "S72.9")
lpr_adm <- data.frame(
  PNR = sample(bef$PNR, size = n_lpr, replace = T),
  RECNUM = runif(n_lpr, min = 1e9, max = 1e10-1) |> floor() |> unique(),
  D_INDDTO = runif(n_lpr, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor() |> as.Date(),
  C_ADIAG = sample(hip_injury_icd, n_lpr, replace=T)
)

lpr_diag <- lpr_adm |> 
  select(RECNUM, C_ADIAG) |>
  rename(C_DIAG = C_ADIAG) |>
  mutate(C_DIAGTYPE = "A") |>
  # Add in a B-diagnosis for a third of people
  rbind(
    data.frame(
      RECNUM = sample(lpr_adm$RECNUM, n_lpr/3),
      C_DIAG = sample(hip_injury_icd, n_lpr/3, replace=T),
      C_DIAGTYPE = "B"
    )
  )

Previews:

lpr_subset <- lpr_adm$RECNUM |> head(3)
lpr_adm |> filter(RECNUM %in% lpr_subset) |> gt::gt()
PNR RECNUM D_INDDTO C_ADIAG
9768595 3499773625 2006-04-06 S72.9
5026141 2430170757 2003-05-21 S72.0
4377798 3828151776 2003-07-27 S72.1
lpr_diag |> filter(RECNUM %in% lpr_subset) |> gt::gt()
RECNUM C_DIAG C_DIAGTYPE
3499773625 S72.9 A
2430170757 S72.0 A
3828151776 S72.1 A
3828151776 S72.8 B
3499773625 S72.1 B

LAB

n_lab <- n * 40
n_lab_missing <- floor(n / 10)
lab_dm_forsker <- data.frame(
  PNR = sample(bef$PNR, size = n - n_lab_missing) |> sample(size = n_lab, replace = T),
  SAMPLINGDATE = runif(n_lab, as.Date("2000-01-01"), as.Date("2009-12-31")) |> floor()  |> as.Date(),
  ANALYSISCODE = rep("eGFR", n_lab)
) |>
  left_join(bef |> select(c(PNR, FOED_DAG)), by = "PNR") |>
  mutate(
    VALUE = rnorm(n_lab, 
                  mean = 100 - as.numeric((as.Date("2000-01-01") - as.Date(FOED_DAG))/3650)^10/(2^20) |> round(2), 
                  sd = 10) |> round()
  ) |>
  select(-c(FOED_DAG))

Preview:

lab_dm_forsker |> head(3) |> gt::gt()
PNR SAMPLINGDATE ANALYSISCODE VALUE
4827513 2001-02-12 eGFR 83
8230182 2009-03-16 eGFR 75
4513978 2002-02-28 eGFR 68