practical-genai-coding-guide/scripts/simulate_ehr_data.R at main · FritscheLab/practical-genai-coding-guide · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
###############################################################################
# Script Title: Simulate EHR Data
# Purpose: Simulate electronic health record (EHR) data including BMI, height, weight,
#          and demographics, while injecting data quality issues for downstream analysis.
# Author: Lars Fritsche
# Date Created: 2025-03-26
# Date Last Modified: 2025-03-28 # <--- Updated modification date
# Usage: Rscript scripts/simulate_ehr_data.R [options]
# Example:
# Rscript scripts/simulate_ehr_data.R \
#    --output_ehr "./data/raw/ehr_bmi_simulated_data.tsv" \
#    --output_ehr_dict "./data/raw/data_dictionary.txt" \
#    --output_demo "./data/raw/demographics_simulated_data.tsv" \
#    --output_demo_dict "./data/raw/demographics_data_dictionary.txt" \
#    --seed 123 \
#    --n_individuals 1000
#
# Input Data: None (data is simulated within the script)
# Output Data:
#   - EHR dataset: Simulated EHR data with anthropometric measurements.
#   - Data dictionary: Description of variables in the EHR dataset.
#   - Demographics dataset: Simulated patient demographics.
#   - Demographics data dictionary: Description of variables in the demographics dataset.
# Dependencies:
#   - R version: 4.3.0 or higher
#   - Packages: optparse
###############################################################################

#------------------------------------------------------------------------------
# Library Management: Install and load required packages
#------------------------------------------------------------------------------
required_packages <- c("optparse") # For handling command-line arguments

# Install missing packages if necessary
for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        message("Installing package: ", pkg)
        install.packages(pkg, repos = "https://cloud.r-project.org")
    }
    # Load required packages
    library(pkg, character.only = TRUE)
    message("Package loaded: ", pkg)
}

#------------------------------------------------------------------------------
# Parameter Initialization: Set up command-line options using optparse
#------------------------------------------------------------------------------
option_list <- list(
    make_option(c("--output_ehr"),
        type = "character", default = "./data/ehr_bmi_simulated_data.tsv",
        help = "Path to the EHR output TSV file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_ehr_dict"),
        type = "character", default = "./data/data_dictionary.txt",
        help = "Path to the EHR data dictionary file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_demo"),
        type = "character", default = "./data/demographics_simulated_data.tsv",
        help = "Path to the demographics output TSV file [default= %default]", metavar = "character"
    ),
    make_option(c("--output_demo_dict"),
        type = "character", default = "./data/demographics_data_dictionary.txt",
        help = "Path to the demographics data dictionary file [default= %default]", metavar = "character"
    ),
    make_option(c("--seed"),
        type = "numeric", default = 123,
        help = "Random seed for reproducibility [default= %default]", metavar = "numeric"
    ),
    make_option(c("--n_individuals"),
        # Changed type to integer for better validation
        type = "integer", default = 1000,
        help = "Number of unique individuals to simulate [default= %default]", metavar = "integer"
    ),
    make_option(c("-v", "--verbose"),
        action = "store_true", default = TRUE,
        help = "Print extra output [default= %default]"
    ),
    make_option(c("-q", "--quiet"),
        action = "store_false", dest = "verbose",
        help = "Print little output"
    )
)

opt_parser <- OptionParser(
    option_list = option_list,
    usage = "Usage: Rscript %prog [options]"
)
opt <- parse_args(opt_parser)

# --- Input Validation ---
if (!is.integer(opt$n_individuals) || opt$n_individuals <= 0) {
    stop("--n_individuals must be a positive integer.", call. = FALSE)
}
if (!is.numeric(opt$seed)) {
     stop("--seed must be numeric.", call. = FALSE)
}
# --- End Input Validation ---

# Assign options to variables for easier use
output_ehr <- opt$output_ehr
output_ehr_dict <- opt$output_ehr_dict
output_demo <- opt$output_demo
output_demo_dict <- opt$output_demo_dict
seed_val <- opt$seed
n_individuals <- opt$n_individuals
verbose_output <- opt$verbose

#------------------------------------------------------------------------------
# Verbose Messaging Functionality
#------------------------------------------------------------------------------
vmsg <- function(...) {
    # Use message directly, no need for paste
    if (verbose_output) message(...)
}

#------------------------------------------------------------------------------
# Check and Prepare Output Directory
#------------------------------------------------------------------------------
output_dirs <- unique(dirname(c(output_ehr, output_ehr_dict, output_demo, output_demo_dict)))
for (dir in output_dirs) {
    if (!dir.exists(dir)) {
        message("Output directory '", dir, "' does not exist. Creating directory...")
        dir.create(dir, recursive = TRUE)
    }
}

#------------------------------------------------------------------------------
# Simulation Parameters Setup
#------------------------------------------------------------------------------
# Set seed for reproducibility
set.seed(seed_val)
vmsg("Random seed set to: ", seed_val)

# --- General Simulation Parameters ---
min_records_per_person <- 50  # Minimum measurements per individual
max_records_per_person <- 150 # Maximum measurements per individual
max_years_span         <- 10  # Maximum span of years for measurements
ehr_start_date         <- as.Date("2010-01-01")
ehr_end_date           <- as.Date("2019-12-31")

# --- ID Generation Parameters ---
person_id_length    <- 12 # Length of hashed person_id strings
encounter_id_length <- 12 # Length of hashed encounter_id strings
id_charset          <- c(0:9, LETTERS, letters) # Characters for IDs

# --- Anthropometric Simulation Parameters ---
height_mean           <- 170 # cm
height_sd             <- 10  # cm
height_min_plausible  <- 130 # cm (used in generation)
height_max_plausible  <- 210 # cm (used in generation)
bmi_mean              <- 26  # kg/m^2
bmi_sd                <- 6   # kg/m^2
bmi_min_plausible     <- 15  # kg/m^2 (used in generation)
bmi_max_plausible     <- 50  # kg/m^2 (used in generation)
weight_change_mean    <- 0.2 # kg/year trend
weight_change_sd      <- 0.5 # kg/year trend SD
weight_noise_sd       <- 2.0 # kg short-term fluctuation SD
height_measurement_error_prob <- 0.1 # Probability of +/- 1cm error

# --- Data Quality Injection Parameters ---
# Missingness
prop_height_missing      <- 0.10
prop_weight_missing      <- 0.05
prop_bmi_missing_added <- 0.03 # Additional BMI missing (where H/W present)
# Implausible Outliers
prop_height_outliers     <- 0.01
prop_weight_outliers     <- 0.01
prop_bmi_outliers        <- 0.01
height_outlier_low_max   <- 99    # cm, values below this are outliers
height_outlier_high_min  <- 251   # cm, values above this are outliers
weight_outlier_low_max   <- 24.9  # kg, values below this are outliers
weight_outlier_high_min  <- 300.1 # kg, values above this are outliers
bmi_outlier_low_max      <- 9.9   # kg/m^2, values below this are outliers
bmi_outlier_high_min     <- 70.1  # kg/m^2, values above this are outliers
# BMI Mismatches
prop_bmi_mismatch        <- 0.05
bmi_mismatch_sd_error    <- 2.0   # SD of error added to create mismatch
# Extreme/Nonsensical Outliers
n_extreme_outliers       <- 5     # Fixed number of specific extreme errors

# --- Demographics Simulation Parameters ---
dob_min_date            <- as.Date("1930-01-01")
dob_max_date            <- as.Date("2000-12-31")
demographics_ref_date   <- as.Date("2019-12-31") # Date for age calculation
prop_age_missing        <- 0.05
prob_deceased           <- c(Yes = 0.1, No = 0.9)
race_categories         <- c("White", "Black", "Asian", "Native American", "Pacific Islander", "Other", "Patient Refused", "Unknown", "")
race_probs              <- c(0.4, 0.15, 0.1, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05)
ethnicity_categories    <- c("Hispanic", "Non-Hispanic", "Patient Refused", "Unknown", "")
ethnicity_probs         <- c(0.2, 0.7, 0.05, 0.03, 0.02)
na_race_eth_values      <- c("Patient Refused", "Unknown", "") # Values to map to NA
sex_gender_categories   <- c("Male", "Female")
marital_status_categories <- c("Single", "Married", "Divorced", "Widowed")
marital_status_probs    <- c(0.4, 0.4, 0.15, 0.05)
zip3_min                <- 0
zip3_max                <- 999


#------------------------------------------------------------------------------
# Main Code Logic: Simulate EHR and Demographics Data
#------------------------------------------------------------------------------
message("Starting simulation of EHR and demographics data...")

# -----------------------------
# Generate Unique Person IDs
# -----------------------------
vmsg("Generating unique person IDs...")
person_ids <- replicate(n_individuals, paste(sample(id_charset, person_id_length, replace = TRUE), collapse = ""))
# Ensure uniqueness by regenerating any duplicates.
# Note: Collisions are extremely unlikely with these ID parameters but check is kept for robustness.
while (length(unique(person_ids)) < n_individuals) {
    vmsg("...regenerating duplicate person IDs (rare event)...")
    dup_idx <- duplicated(person_ids) | duplicated(person_ids, fromLast = TRUE)
    person_ids[dup_idx] <- replicate(sum(dup_idx), paste(sample(id_charset, person_id_length, replace = TRUE), collapse = ""))
}

# -----------------------------
# Simulate Measurements for Each Person
# -----------------------------
vmsg("Simulating measurements for each individual...")

# Functions to generate plausible base height and BMI values
generate_height <- function() {
    # Note: Could add max iterations for safety, but unlikely necessary here.
    h <- rnorm(1, mean = height_mean, sd = height_sd)
    while (h < height_min_plausible || h > height_max_plausible) {
        h <- rnorm(1, mean = height_mean, sd = height_sd)
    }
    return(round(h)) # Round to nearest cm
}

generate_bmi <- function() {
    # Note: Could add max iterations for safety, but unlikely necessary here.
    b <- rnorm(1, mean = bmi_mean, sd = bmi_sd)
    while (b < bmi_min_plausible || b > bmi_max_plausible) {
        b <- rnorm(1, mean = bmi_mean, sd = bmi_sd)
    }
    return(b)
}

# Initialize list to collect records for each individual
records_list <- vector("list", n_individuals)

for (i in 1:n_individuals) {
    if (i %% 100 == 0) vmsg("...simulating individual ", i, " of ", n_individuals) # Progress indicator

    # Determine a random number of records for the individual
    n_rec <- sample(min_records_per_person:max_records_per_person, 1)
    pid <- person_ids[i]

    # Generate stable attributes for the individual
    height_cm_base <- generate_height()
    bmi_base <- generate_bmi()
    weight_kg_base <- bmi_base * (height_cm_base / 100)^2 # Derive weight from BMI and height
    yearly_weight_change <- rnorm(1, mean = weight_change_mean, sd = weight_change_sd) # Annual weight change (kg/year)

    # Determine the measurement date range
    start_day_offset <- sample(0:(365 * (max_years_span - 1)), 1)
    person_start_date <- ehr_start_date + start_day_offset
    span_days <- sample(0:(365 * max_years_span), 1)
    span_days <- max(span_days, n_rec - 1) # Ensure span is sufficient for n_rec distinct days if needed
    person_end_date <- person_start_date + span_days
    if (person_end_date > ehr_end_date) {
        person_end_date <- ehr_end_date
    }
    # Ensure start date is not after end date if span is short or hits boundary
    if(person_start_date > person_end_date) person_start_date <- person_end_date

    # Sample measurement dates (with replacement to allow multiple per day initially)
    if(person_start_date == person_end_date) {
         all_dates <- person_start_date
    } else {
         all_dates <- seq.Date(person_start_date, person_end_date, by = "day")
    }
    measurement_dates <- sample(all_dates, size = n_rec, replace = TRUE)

    # Initialize data frame for the individual's records
    person_df <- data.frame(
        person_id        = pid,
        measurement_date = measurement_dates,
        height_cm        = numeric(n_rec),
        weight_kg        = numeric(n_rec),
        bmi              = numeric(n_rec)
    )

    # Order records chronologically to simulate realistic trends, then revert to original order
    ord <- order(person_df$measurement_date)
    sorted_dates <- person_df$measurement_date[ord]

    for (j in seq_along(sorted_dates)) {
        date_j <- sorted_dates[j]
        # Calculate elapsed years since the start date for trend calculation
        years_since_start <- as.numeric(difftime(date_j, person_start_date, units = "days")) / 365.25

        # Simulate height with occasional +/- 1 cm measurement error
        if (runif(1) < height_measurement_error_prob) {
            height_val <- height_cm_base + sample(c(-1, 1), 1)
        } else {
            height_val <- height_cm_base
        }
        # Ensure height doesn't go below a reasonable minimum due to error
        if(height_val < height_min_plausible - 5) height_val <- height_min_plausible - 5

        # Simulate weight as baseline plus trend and short-term fluctuation
        expected_weight <- weight_kg_base + yearly_weight_change * years_since_start
        weight_val <- expected_weight + rnorm(1, mean = 0, sd = weight_noise_sd)
        if (weight_val < 0) weight_val <- 0 # Ensure weight is non-negative

        # Calculate BMI from the current height and weight
        # Use max(1, height_val) to avoid division by zero or negative heights
        bmi_val <- ifelse(height_val > 0, weight_val / ((height_val / 100)^2), NA)

        # Assign computed values back to the record (in the original sampled date order)
        person_df$height_cm[ord[j]] <- height_val
        person_df$weight_kg[ord[j]] <- weight_val
        person_df$bmi[ord[j]] <- bmi_val
    }

    # Round values to appropriate precision before storing
    person_df$height_cm <- round(person_df$height_cm)
    person_df$weight_kg <- round(person_df$weight_kg, 1)
    person_df$bmi <- round(person_df$bmi, 1)

    records_list[[i]] <- person_df
}

# Combine all individuals' records into one data frame
# Note: For very large n_individuals, consider data.table::rbindlist or dplyr::bind_rows
ehr_data <- do.call(rbind, records_list)
total_records <- nrow(ehr_data)
vmsg("Total EHR records simulated: ", total_records)

# -----------------------------
# Inject Data Quality Issues
# -----------------------------
vmsg("Injecting data quality issues into EHR data...")
n <- total_records # Use shorter alias

# 1. Introduce missing values
vmsg("...injecting missing values...")
height_missing_idx <- sample(1:n, size = floor(prop_height_missing * n))
weight_missing_idx <- sample(1:n, size = floor(prop_weight_missing * n))
ehr_data$height_cm[height_missing_idx] <- NA
ehr_data$weight_kg[weight_missing_idx] <- NA

# Add BMI missing where weight or height is missing, plus some extra
bmi_missing_from_hw <- which(is.na(ehr_data$height_cm) | is.na(ehr_data$weight_kg))
potential_bmi_missing_added <- setdiff(1:n, bmi_missing_from_hw)
bmi_missing_added_idx <- sample(potential_bmi_missing_added, size = floor(prop_bmi_missing_added * n))
ehr_data$bmi[union(bmi_missing_from_hw, bmi_missing_added_idx)] <- NA


# 2. Inject implausible anthropometric values (outliers)
vmsg("...injecting outlier values...")
# Height outliers
valid_h_idx <- which(!is.na(ehr_data$height_cm))
height_outlier_idx <- sample(valid_h_idx, size = floor(prop_height_outliers * n))
h_n <- length(height_outlier_idx)
if (h_n > 0) {
    low_h_idx <- sample(height_outlier_idx, size = floor(h_n / 2))
    high_h_idx <- setdiff(height_outlier_idx, low_h_idx)
    ehr_data$height_cm[low_h_idx] <- sample(50:height_outlier_low_max, length(low_h_idx), replace = TRUE)
    ehr_data$height_cm[high_h_idx] <- sample(height_outlier_high_min:300, length(high_h_idx), replace = TRUE)
}

# Weight outliers
valid_w_idx <- which(!is.na(ehr_data$weight_kg))
# Ensure we don't pick indices already used for height outliers if possible
potential_w_outlier_idx <- setdiff(valid_w_idx, height_outlier_idx)
weight_outlier_idx <- sample(potential_w_outlier_idx, size = floor(prop_weight_outliers * n))
w_n <- length(weight_outlier_idx)
if (w_n > 0) {
    low_w_idx <- sample(weight_outlier_idx, size = floor(w_n / 2))
    high_w_idx <- setdiff(weight_outlier_idx, low_w_idx)
    ehr_data$weight_kg[low_w_idx] <- round(runif(length(low_w_idx), 10, weight_outlier_low_max), 1)
    ehr_data$weight_kg[high_w_idx] <- round(runif(length(high_w_idx), weight_outlier_high_min, 500), 1)
}

# BMI outliers
valid_b_idx <- which(!is.na(ehr_data$bmi))
# Ensure we don't pick indices already used for height/weight outliers if possible
potential_b_outlier_idx <- setdiff(valid_b_idx, c(height_outlier_idx, weight_outlier_idx))
bmi_outlier_idx <- sample(potential_b_outlier_idx, size = floor(prop_bmi_outliers * n))
b_n <- length(bmi_outlier_idx)
if (b_n > 0) {
    low_b_idx <- sample(bmi_outlier_idx, size = floor(b_n / 2))
    high_b_idx <- setdiff(bmi_outlier_idx, low_b_idx)
    ehr_data$bmi[low_b_idx] <- round(runif(length(low_b_idx), 5, bmi_outlier_low_max), 1)
    ehr_data$bmi[high_b_idx] <- round(runif(length(high_b_idx), bmi_outlier_high_min, 100), 1)
}

# 3. Introduce BMI discrepancies (where BMI != calculated BMI)
vmsg("...injecting BMI calculation mismatches...")
# Identify rows with valid H, W, B that haven't been made outliers yet
all_outlier_or_missing_idx <- unique(c(
    which(is.na(ehr_data$height_cm)), which(is.na(ehr_data$weight_kg)), which(is.na(ehr_data$bmi)),
    height_outlier_idx, weight_outlier_idx, bmi_outlier_idx
))
clean_idx <- setdiff(1:n, all_outlier_or_missing_idx)
mismatch_idx <- sample(clean_idx, size = floor(prop_bmi_mismatch * n))

for (idx in mismatch_idx) {
    # Double check validity just in case (should be valid based on clean_idx)
    if (!is.na(ehr_data$height_cm[idx]) && ehr_data$height_cm[idx] > 0 &&
        !is.na(ehr_data$weight_kg[idx]) && ehr_data$weight_kg[idx] >= 0 &&
        !is.na(ehr_data$bmi[idx]))
    {
        orig_bmi <- ehr_data$bmi[idx]
        # Add a small error to the recorded BMI to create a mismatch
        err <- rnorm(1, mean = 0, sd = bmi_mismatch_sd_error)
        new_bmi <- orig_bmi + err
        # Keep the mismatched BMI within a broadly plausible range
        if (new_bmi < bmi_min_plausible - 5) new_bmi <- bmi_min_plausible - 5
        if (new_bmi > bmi_max_plausible + 20) new_bmi <- bmi_max_plausible + 20
        ehr_data$bmi[idx] <- round(new_bmi, 1)
    }
}

# 4. Inject rare extreme or nonsensical outliers
vmsg("...injecting extreme/nonsensical outliers...")
# Sample from indices not already heavily modified
potential_extreme_idx <- setdiff(clean_idx, mismatch_idx)
if(length(potential_extreme_idx) >= n_extreme_outliers && n_extreme_outliers > 0) {
    extreme_idx <- sample(potential_extreme_idx, size = n_extreme_outliers)
    # Assign specific nonsensical values to showcase different error types
    # Ensure indices are assigned sequentially if n_extreme_outliers < 5
    if(n_extreme_outliers >= 1) ehr_data$height_cm[extreme_idx[1]] <- -5    # Negative height
    if(n_extreme_outliers >= 2) ehr_data$height_cm[extreme_idx[2]] <- 999   # Very large height
    if(n_extreme_outliers >= 3) ehr_data$weight_kg[extreme_idx[3]] <- -10   # Negative weight
    if(n_extreme_outliers >= 4) ehr_data$weight_kg[extreme_idx[4]] <- 800   # Very large weight
    if(n_extreme_outliers >= 5) ehr_data$bmi[extreme_idx[5]]       <- 150   # Very large BMI
}

# Ensure rounding consistency after all injections
ehr_data$height_cm <- round(ehr_data$height_cm)
ehr_data$weight_kg <- round(ehr_data$weight_kg, 1)
ehr_data$bmi <- round(ehr_data$bmi, 1)

# -----------------------------
# Assign Encounter IDs
# -----------------------------
vmsg("Assigning encounter IDs to each record...")
encounter_ids <- replicate(total_records, paste(sample(id_charset, encounter_id_length, replace = TRUE), collapse = ""))
# Check for unlikely duplicates
while (length(unique(encounter_ids)) < total_records) {
    vmsg("...regenerating duplicate encounter IDs (rare event)...")
    dup_idx <- duplicated(encounter_ids) | duplicated(encounter_ids, fromLast = TRUE)
    encounter_ids[dup_idx] <- replicate(sum(dup_idx), paste(sample(id_charset, encounter_id_length, replace = TRUE), collapse = ""))
}
ehr_data$encounter_id <- encounter_ids

# -----------------------------
# Convert Measurement Dates to Date-Time Format
# -----------------------------
vmsg("Converting measurement dates to date-time format...")
# Convert Date objects to POSIXct at the start of the day
ehr_data$measurement_date <- as.POSIXct(as.Date(ehr_data$measurement_date))
# Add a random time offset (0 to 86399 seconds) within each day
random_seconds <- runif(total_records, 0, 86400 - 1)
ehr_data$measurement_date <- ehr_data$measurement_date + random_seconds
# Format as 'YYYY-MM-DD HH:MM:SS'
ehr_data$measurement_date <- format(ehr_data$measurement_date, "%Y-%m-%d %H:%M:%S")

# -----------------------------
# Reorder Columns for Final EHR Output
# -----------------------------
ehr_data <- ehr_data[, c("person_id", "encounter_id", "bmi", "height_cm", "weight_kg", "measurement_date")]

# -----------------------------
# Output: Write EHR Dataset and Data Dictionary
# -----------------------------
vmsg("Writing EHR dataset and data dictionary to files...")

# Write the simulated EHR dataset to a TSV file
tryCatch(
    {
        write.table(ehr_data, file = output_ehr, sep = "\t", quote = FALSE, row.names = FALSE, na = "")
        vmsg("EHR dataset successfully saved to: ", output_ehr)
    },
    error = function(e) {
        stop("Error writing EHR dataset to '", output_ehr, "': ", e$message, call. = FALSE)
    }
)

# --- Generate and Write EHR Data Dictionary ---
ehr_colnames <- colnames(ehr_data)
dict_descriptions_ehr <- c(
    # Descriptions MUST be in the same order as columns in ehr_data
    "Hashed unique identifier for an individual (consistent across records).",
    "Hashed unique identifier for the encounter (unique per record).",
    "Body Mass Index (kg/m^2); may be missing, implausible, inconsistent, or nonsensical.",
    "Height in centimeters; may be missing, implausible (<100 or >250), or nonsensical.",
    "Weight in kilograms; may be missing, implausible (<25 or >300), or nonsensical.",
    "Date and time of measurement (YYYY-MM-DD HH:MM:SS)."
)

if (length(ehr_colnames) == length(dict_descriptions_ehr)) {
     dict_lines_ehr <- paste0(ehr_colnames, ": ", dict_descriptions_ehr)
} else {
     warning("Mismatch between EHR column names and dictionary descriptions. Writing basic names only.")
     dict_lines_ehr <- paste0(ehr_colnames, ": [Description Mismatch - Check Script]")
}

tryCatch(
    {
        writeLines(dict_lines_ehr, con = output_ehr_dict)
        vmsg("EHR data dictionary successfully saved to: ", output_ehr_dict)
    },
    error = function(e) {
        stop("Error writing EHR data dictionary to '", output_ehr_dict, "': ", e$message, call. = FALSE)
    }
)
# --- End EHR Dictionary ---

# -----------------------------
# Generate Demographics Data
# -----------------------------
vmsg("Generating demographics data...")
n_demo <- n_individuals # Number of demographic records = number of unique individuals

# Simulate Date of Birth
date_of_birth <- sample(seq.Date(dob_min_date, dob_max_date, by = "day"), n_demo, replace = TRUE)

# Compute age as of the reference date
age <- floor(as.numeric(difftime(demographics_ref_date, date_of_birth, units = "days")) / 365.25)
# Randomly set some ages to NA
na_age_idx <- sample(1:n_demo, size = floor(prop_age_missing * n_demo))
age[na_age_idx] <- NA

# Define age bins based on computed age
age_bin <- ifelse(is.na(age), NA_character_,
    ifelse(age < 18, "<18",
        ifelse(age < 35, "18-34",
            ifelse(age < 55, "35-54",
                ifelse(age < 75, "55-74", "75+")
            )
        )
    )
)

# Simulate deceased status
deceased <- sample(names(prob_deceased), n_demo, replace = TRUE, prob = prob_deceased)

# Simulate race and map specified values to NA
race_raw <- sample(race_categories, n_demo, replace = TRUE, prob = race_probs)
race_clean <- ifelse(race_raw %in% na_race_eth_values, NA_character_, race_raw)

# Simulate ethnicity and map specified values to NA
ethnicity_raw <- sample(ethnicity_categories, n_demo, replace = TRUE, prob = ethnicity_probs)
ethnicity_clean <- ifelse(ethnicity_raw %in% na_race_eth_values, NA_character_, ethnicity_raw)

# Combine race and ethnicity (only if both are non-NA)
race_ethnicity <- ifelse(is.na(race_clean) | is.na(ethnicity_clean),
    NA_character_,
    paste(race_clean, ethnicity_clean)
)

# Harmonize race and ethnicity into broader categories (vectorized approach)
race_ethnicity_harmonized <- ifelse(is.na(race_clean) | is.na(ethnicity_clean), NA_character_,
    ifelse(ethnicity_clean == "Hispanic", "Hispanic",
        ifelse(race_clean == "White", "Non-Hispanic White",
            ifelse(race_clean == "Black", "Non-Hispanic Black",
                ifelse(race_clean %in% c("Asian", "Native American", "Pacific Islander"),
                       "Non-Hispanic Asian/Pacific Islander/Native American",
                       "Other" # Includes 'Other' race category when Non-Hispanic
                )
            )
        )
    )
)

# Simulate sex/gender
sex_gender <- sample(sex_gender_categories, n_demo, replace = TRUE)

# Simulate marital status
marital_status_name <- sample(marital_status_categories, n_demo, replace = TRUE, prob = marital_status_probs)

# Simulate a three-digit ZIP code (with leading zeros)
zip3 <- sprintf("%03d", sample(zip3_min:zip3_max, n_demo, replace = TRUE))

# Combine demographics into a data frame
demo_data <- data.frame(
    person_id = person_ids,
    date_of_birth = format(date_of_birth, "%Y-%m-%d"),
    age = age,
    age_bin = age_bin,
    deceased = deceased,
    race_clean = race_clean,
    ethnicity_clean = ethnicity_clean,
    race_ethnicity = race_ethnicity,
    race_ethnicity_harmonized = race_ethnicity_harmonized,
    sex_gender = sex_gender,
    marital_status_name = marital_status_name,
    zip3 = zip3,
    stringsAsFactors = FALSE # Good practice
)

# -----------------------------
# Output: Write Demographics Dataset and Data Dictionary
# -----------------------------
# Write the demographics dataset to a TSV file
tryCatch(
    {
        write.table(demo_data, file = output_demo, sep = "\t", quote = FALSE, row.names = FALSE, na = "")
        vmsg("Demographics dataset successfully saved to: ", output_demo)
    },
    error = function(e) {
        stop("Error writing demographics dataset to '", output_demo, "': ", e$message, call. = FALSE)
    }
)

# --- Generate and Write Demographics Data Dictionary ---
demo_colnames <- colnames(demo_data)
dict_descriptions_demo <- c(
    # Descriptions MUST be in the same order as columns in demo_data
    "Unique identifier for the individual (matches EHR data).",
    "Date of birth in YYYY-MM-DD format.",
    paste("Age in years as of", format(demographics_ref_date), ". May contain NA."),
    "Age category based on calculated age (<18, 18-34, 35-54, 55-74, 75+).",
    "Indicator if the person is recorded as deceased (Yes/No).",
    paste("Self-reported race (NA for:", paste(na_race_eth_values, collapse=", "), ")."),
    paste("Self-reported ethnicity (NA for:", paste(na_race_eth_values, collapse=", "), ")."),
    "Combined race and ethnicity string (NA if either component is NA).",
    "Harmonized race/ethnicity classification (e.g., Hispanic, Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Asian/Pacific Islander/Native American, Other).",
    "Recorded sex/gender of the individual.",
    "Self-reported marital status.",
    "Simulated three-digit ZIP code prefix."
)

if (length(demo_colnames) == length(dict_descriptions_demo)) {
     dict_lines_demo <- paste0(demo_colnames, ": ", dict_descriptions_demo)
} else {
     warning("Mismatch between Demographics column names and dictionary descriptions. Writing basic names only.")
     dict_lines_demo <- paste0(demo_colnames, ": [Description Mismatch - Check Script]")
}

tryCatch(
    {
        writeLines(dict_lines_demo, con = output_demo_dict)
        vmsg("Demographics data dictionary successfully saved to: ", output_demo_dict)
    },
    error = function(e) {
        stop("Error writing demographics data dictionary to '", output_demo_dict, "': ", e$message, call. = FALSE)
    }
)
# --- End Demographics Dictionary ---


#------------------------------------------------------------------------------
# Session Information: Record the R environment for reproducibility
#------------------------------------------------------------------------------
message("-------------------------------------------------------------------------------")
message("Session Information:")
sessionInfo()
message("-------------------------------------------------------------------------------")

# End of script message
message("Script execution completed successfully.")