############################################################
# AKI trial data cleaning (TidierOrg / Tidier.jl workflow)
# - Reads messy Excel
# - Detects the true header row
# - Cleans/standardizes fields (dates, categories, numerics)
# - Validates + flags HITL edge cases
# - Exports clean CSV + a markdown QA report
############################################################
using Tidier # meta-package (re-exports TidierData, TidierFiles, TidierStrings, TidierDates, etc.)
using DataFrames
using Dates
using Statistics
# -------------------------
# 0) I/O paths
# -------------------------
# Input workbook (messy clinical export) and the two artifacts this script
# produces: a cleaned CSV and a markdown QA report for human review.
in_path = "/mnt/data/messy_aki.xlsx"
out_csv = "aki_clean.csv"
out_md = "aki_cleaning_report.md"
# -------------------------
# 1) Read Excel (TidierFiles)
# We read with col_names=false because the file has title rows above the real header
# -------------------------
# col_names=false: every sheet row (titles, the real header row, data, trailing
# notes) comes in as data under auto-generated column names; trim_ws=true
# strips stray whitespace inside cells at read time.
df_raw = TidierFiles.read_xlsx(in_path; sheet=1, col_names=false, trim_ws=true)
# -------------------------
# 2) Find the header row containing "study_id" in the first column, promote it
#    to column names, and keep only the rows below it.
# -------------------------
firstcol = df_raw[:, 1]
# Fix: use `string(x)` rather than `String(x)` so non-string cells in column 1
# (numbers, dates from Excel) don't throw a MethodError during the scan.
header_row = findfirst(x -> !ismissing(x) && lowercase(string(x)) == "study_id", firstcol)
if header_row === nothing
    error("Could not find a header row with 'study_id' in column 1. Inspect df_raw to locate headers.")
end
# Extract header values and data rows
hdr_vals = df_raw[header_row, :]
df = df_raw[(header_row + 1):nrow(df_raw), :]
# Assign column names from the header row; missing header cells become "".
# NOTE(review): two or more blank header cells would all be named "" and
# rename! would throw on the duplicate — handle upstream if that occurs.
rename!(df, Symbol.(map(x -> ismissing(x) ? "" : string(x), collect(hdr_vals))))
# Drop any columns with empty names (common when there are blank columns in the sheet).
# Fix: `names(df)` returns a Vector{String}, so compare against "" — the
# original compared against Symbol(""), which never matched, so blank columns
# silently survived.
empty_name_cols = [nm for nm in names(df) if nm == ""]
if !isempty(empty_name_cols)
    select!(df, Not(empty_name_cols))
end
# Clean column names to snake_case (TidierData)
df = @chain df begin
    @clean_names()
end
# -------------------------
# 3) Drop trailing note rows / non-data rows
# (in this file, rows with missing study_id are notes/permissions)
# -------------------------
# Any row without a study_id is treated as a footer/annotation, not data.
df = @chain df begin
@filter(!ismissing(study_id))
end
# -------------------------
# 4) Helper parsers (used inside @mutate)
# -------------------------
# Pull the first numeric token out of a messy cell such as
# "19 but poor collection". Numbers pass through as Float64; missing
# propagates; strings with no numeric token yield missing.
function first_number(x)
    ismissing(x) && return missing
    x isa Number && return Float64(x)
    hit = match(r"-?\d+(\.\d+)?", strip(String(x)))
    return hit === nothing ? missing : parse(Float64, hit.match)
end
# Parse a date of birth from Date/DateTime objects or from strings in several
# common layouts ("1970-05-05", "5/6/1970", "May 5 1970", "May 5, 1970").
# Empty or unparseable inputs yield missing.
function parse_dob(x)
    ismissing(x) && return missing
    x isa Date && return x
    x isa DateTime && return Date(x)
    raw = strip(String(x))
    isempty(raw) && return missing
    # Title-case each word so month names like "may" match dateformat"u".
    canon = join(uppercasefirst.(split(lowercase(raw), ' ')), ' ')
    # Try the known layouts in order; tryparse returns nothing on mismatch.
    for fmt in (dateformat"y-m-d", dateformat"m/d/y", dateformat"u d y", dateformat"u d, y")
        parsed = tryparse(Date, canon, fmt)
        parsed === nothing || return parsed
    end
    # Last resort: the default Date constructor handles some ISO-ish strings.
    try
        return Date(canon)
    catch
        return missing
    end
end
# Map assorted yes/no spellings ("y", "TRUE", "0", ...) onto the canonical
# "Yes"/"No"; anything unrecognized — and missing itself — becomes missing.
function yesno(x)
    ismissing(x) && return missing
    key = lowercase(strip(String(x)))
    key in ("yes", "y", "true", "1") && return "Yes"
    key in ("no", "n", "false", "0") && return "No"
    return missing
end
# Canonicalize sex entries: "m"/"male" -> "Male", "f"/"female" -> "Female";
# anything else (typos, blanks) becomes missing for HITL follow-up.
function sex_std(x)
    ismissing(x) && return missing
    token = lowercase(strip(String(x)))
    if token == "m" || token == "male"
        return "Male"
    elseif token == "f" || token == "female"
        return "Female"
    end
    return missing
end
# Standardize race (example mapping—edit if your protocol uses different groupings)
#
# Maps free-text race entries onto {White, Black, Asian, Mixed, Other}.
# Unrecognized (but present) values fall through to "Other"; missing stays missing.
#
# Fix: the original tested `s in ("asian")` and `s in ("other")` — `("asian")`
# is a parenthesized String, not a 1-tuple, and `in(::String, ::String)` throws
# in Julia ("use occursin"), so any value reaching those branches crashed.
function race_std(x)
    ismissing(x) && return missing
    s = lowercase(strip(String(x)))
    if s in ("white", "caucasian", "w")   # "white " entry was dead after strip
        return "White"
    elseif s in ("black", "af-am", "african american", "african-american")
        return "Black"
    elseif s == "asian"
        return "Asian"
    elseif s in ("mixed", "multiracial")
        return "Mixed"
    else
        # "other" and anything unrecognized both map to "Other"
        return "Other"
    end
end
# Normalize Hispanic-ethnicity entries, tolerating common misspellings
# ("hisspanic", "hispannic"). Unrecognized values become missing so they
# surface in the QA report.
function hispanic_std(x)
    ismissing(x) && return missing
    v = lowercase(strip(String(x)))
    hispanic_forms = ("hispanic", "h", "h ", "hisspanic", "hispannic")
    not_hispanic_forms = ("not hispanic", "nh", "non-hispanic", "non hispanic")
    v in hispanic_forms && return "Hispanic"
    v in not_hispanic_forms && return "Not Hispanic"
    return missing
end
# -------------------------
# 5) Main cleaning transforms (TidierData)
# - Parse types (ids -> Int, dob -> Date, GFRs -> Float64)
# - Keep “raw” copies where useful for audit
# - Create QA flags for HITL review
# -------------------------
# NOTE(review): TidierData auto-vectorizes most function calls inside @mutate,
# and a `~` prefix normally *suppresses* that vectorization (the whole column
# is passed to the function). The helpers here (parse_dob, sex_std, ...) are
# written for scalars — confirm `~` applies row-wise in the pinned Tidier
# version, otherwise these should be plain `parse_dob(dob)` (auto-vectorized)
# or explicit `parse_dob.(dob)`.
df_clean = @chain df begin
@mutate(
# IDs
study_id = as_integer(study_id),
# DOB -> Date
dob_raw = dob,
dob = ~parse_dob(dob),
# Demographics standardization
sex_raw = sex,
sex = ~sex_std(sex),
race_raw = race,
race = ~race_std(race),
hispanic_raw = hispanic,
hispanic = ~hispanic_std(hispanic),
# GFR numerics (keep raw columns + parsed numeric columns)
admission_gfr_raw = admission_gfr,
day_1_gfr_raw = day_1_gfr,
day2gfr_raw = day2gfr,
day_3_gfr_raw = day_3_gfr,
day_4_gfr_raw = day_4_gfr,
day5_gfr_raw = day5_gfr,
admission_gfr = ~first_number(admission_gfr),
day_1_gfr = ~first_number(day_1_gfr),
# note the rename: source column `day2gfr` becomes parsed `day2_gfr`
day2_gfr = ~first_number(day2gfr),
day_3_gfr = ~first_number(day_3_gfr),
day_4_gfr = ~first_number(day_4_gfr),
day5_gfr = ~first_number(day5_gfr),
# Flags: “had extra text” (e.g., "19 but poor collection")
# NOTE(review): this uses scalar `&&` on column-valued expressions and refers
# to `day2gfr_raw`, created earlier in this same @mutate — verify TidierData
# evaluates it sequentially and row-wise as intended.
day2_parse_note = !(ismissing(day2gfr_raw)) && !(String(day2gfr_raw) |> strip |> x -> occursin(r"^-?\d+(\.\d+)?$", x)),
# Devices
hd_catheter_raw = hd_catheter,
hd_catheter = ~yesno(hd_catheter),
av_fistula_raw = av_fistula,
av_fistula = ~yesno(av_fistula)
# NOTE(review): a `|>` between steps inside @chain is unusual — @chain threads
# across *lines*; verify this actually pipes @mutate's result into @select.
) |>
# Keep a tidy column order (optional)
@select(
study_id, dob, sex, race, hispanic,
admission_gfr, day_1_gfr, day2_gfr, day_3_gfr, day_4_gfr, day5_gfr,
diagnosis, sediment, hd_catheter, av_fistula,
day2_parse_note,
# audit columns at the end
dob_raw, sex_raw, race_raw, hispanic_raw,
admission_gfr_raw, day_1_gfr_raw, day2gfr_raw, day_3_gfr_raw, day_4_gfr_raw, day5_gfr_raw,
hd_catheter_raw, av_fistula_raw
)
end
# -------------------------
# 6) Validation + HITL flags
# -------------------------
# Any study_id appearing more than once needs human review.
dup_ids = @chain df_clean begin
    @count(study_id)
    @filter(n > 1)
end
# Per-column missingness: count and percentage, one row per column.
colnames = names(df_clean)
miss_counts = [count(ismissing, df_clean[!, nm]) for nm in colnames]
missing_summary = DataFrame(
    column = String.(colnames),
    n_missing = miss_counts,
    pct_missing = round.(100 .* miss_counts ./ nrow(df_clean); digits=1),
)
# Outlier thresholds (IQR rule) for each GFR column
gfr_cols = [:admission_gfr, :day_1_gfr, :day2_gfr, :day_3_gfr, :day_4_gfr, :day5_gfr]
# Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) over the non-missing values of `v`.
# Returns (missing, missing) when fewer than 4 observations are available.
#
# Fix: accept any AbstractVector — the original signature demanded exactly
# Vector{Union{Missing, Float64}}, which raises a MethodError for columns that
# materialize as e.g. Vector{Float64} (no missings present) or a view.
function iqr_bounds(v::AbstractVector)
    x = collect(skipmissing(v))
    if length(x) < 4
        return (missing, missing) # too few points for stable quartiles
    end
    q1 = quantile(x, 0.25)
    q3 = quantile(x, 0.75)
    spread = q3 - q1
    return (q1 - 1.5 * spread, q3 + 1.5 * spread)
end
outlier_rows = DataFrame()
# Collect, per GFR column, the rows falling outside the IQR fences.
for c in gfr_cols
lo, hi = iqr_bounds(df_clean[!, c])
if !(ismissing(lo) || ismissing(hi))
# NOTE(review): TidierData documents `!!` for interpolating outer variables
# into its macros — `$(c)` and the bare `lo`/`hi` references inside @mutate/
# @filter/@select may not resolve as intended. Verify against the TidierData
# interpolation docs for the pinned version.
tmp = @chain df_clean begin
@mutate(_col = String(c), _lo = lo, _hi = hi)
@filter(!ismissing($(c)) && (($(c) < lo) || ($(c) > hi)))
@select(study_id, _col, $(c), _lo, _hi)
end
# cols=:union tolerates schema differences between per-column frames
outlier_rows = vcat(outlier_rows, tmp; cols=:union)
end
end
# Extra clinical sanity flags (edit thresholds to your protocol)
sanity_flags = @chain df_clean begin
@mutate(
# dob is missing only when parse_dob could not interpret the raw cell
flag_dob_unparsed = ismissing(dob),
# NOTE(review): `any.((col1, col2, ...) .< 0)` broadcasts over a *tuple of
# columns*; combined with @mutate auto-vectorization the row-wise semantics
# are subtle — verify each flag yields one Bool per row, and that missing GFR
# values don't propagate `missing` into the flags.
flag_gfr_negative = any.((admission_gfr, day_1_gfr, day2_gfr, day_3_gfr, day_4_gfr, day5_gfr) .< 0),
# >200 treated as implausibly high here — adjust threshold to protocol
flag_gfr_very_high = any.((admission_gfr, day_1_gfr, day2_gfr, day_3_gfr, day_4_gfr, day5_gfr) .> 200),
flag_day3_missing = ismissing(day_3_gfr),
# carries forward the text-extraction flag computed in step 5
flag_day2_note = day2_parse_note
)
end
# -------------------------
# 7) Export clean CSV (TidierFiles)
# -------------------------
# Writes df_clean (including the *_raw audit columns) to out_csv.
TidierFiles.write_csv(df_clean, out_csv)
# -------------------------
# 8) Write a markdown QA report
# -------------------------
# Render up to `max_rows` rows of `d` as a GitHub-style markdown table.
# Missing cells print as the literal text "missing"; when `d` has more than
# `max_rows` rows, a truncation note is appended. Output is byte-identical to
# the previous implementation.
#
# Fix: build the table in an IOBuffer instead of repeated `rows *= ...`
# string concatenation, which is O(n^2) in the number of rows.
function df_to_md_table(d::DataFrame; max_rows::Int=50)
    n = min(nrow(d), max_rows)
    cols = names(d)
    buf = IOBuffer()
    print(buf, "|", join(string.(cols), "|"), "|\n")
    print(buf, "|", join(fill("---", length(cols)), "|"), "|\n")
    for i in 1:n
        cells = [ismissing(d[i, c]) ? "missing" : string(d[i, c]) for c in cols]
        print(buf, "|", join(cells, "|"), "|\n")
    end
    if nrow(d) > max_rows
        print(buf, "\n*(truncated to first $(max_rows) rows)*\n")
    end
    return String(take!(buf))
end
# Assemble the QA report: row counts, missingness, duplicate IDs, parse notes,
# IQR outliers, and sanity flags — each table rendered via df_to_md_table at
# string-construction time.
# NOTE(review): confirm `@filter(day2_parse_note == true)` drops rows where the
# flag is `missing` as intended — comparisons against missing propagate missing
# in Julia.
report = """
# AKI Data Cleaning QA Report
## Files
- Input: `$(in_path)`
- Output CSV: `$(out_csv)`
## Row counts
- Rows after header detection + dropping note rows: **$(nrow(df_clean))**
- Columns: **$(ncol(df_clean))**
## Missing data (all columns)
$(df_to_md_table(sort(missing_summary, :n_missing, rev=true)))
## Duplicate study IDs (requires HITL)
$(nrow(dup_ids) == 0 ? "_None detected._" : df_to_md_table(dup_ids))
## Parsed-text notes (e.g., GFR cell had extra text)
Rows with `day2_parse_note = true` should be reviewed to confirm the numeric extraction matches intent.
$(df_to_md_table(@chain df_clean begin
@filter(day2_parse_note == true)
@select(study_id, day2gfr_raw, day2_gfr)
end))
## Outliers by IQR rule (requires HITL)
$(nrow(outlier_rows) == 0 ? "_None detected by IQR rule._" : df_to_md_table(outlier_rows))
## Sanity flags (requires HITL)
$(df_to_md_table(@chain sanity_flags begin
@filter(flag_dob_unparsed || flag_gfr_very_high || flag_day3_missing || flag_day2_note)
@select(study_id, flag_dob_unparsed, flag_gfr_very_high, flag_day3_missing, flag_day2_note,
admission_gfr, day_1_gfr, day2_gfr, day_3_gfr, day_4_gfr, day5_gfr,
dob_raw)
end))
"""
# "w" truncates: the report is rewritten from scratch on every run.
open(out_md, "w") do io
write(io, report)
end
println("Wrote clean CSV -> $(out_csv)")
println("Wrote QA report -> $(out_md)")