3 SQL for Cleansing Data

3.1 Issues

Missing data: This refers to the absence of values in data fields.
Incorrect data: Erroneous, inaccurate, or invalid values.
Duplicate data: Multiple instances of the same or similar records.
Inconsistent data: Values that deviate from expected patterns or formats.
Outliers: Extreme values that differ significantly from the majority of data points.

3.2 Load data

# Load necessary packages
library(DBI)
library(duckdb)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(RPostgres)
library(tools)

# Close any existing connections first
if (exists("con")) {
  try(dbDisconnect(con), silent = TRUE)
}

# Connect to DuckDB database
con <- dbConnect(
  duckdb::duckdb(),
  dbdir = "/Users/nnthieu/SyntheaData/SyntheaCovid19/synthea19.duckdb"
)

# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")

[1] 0

# Verify tables
dbListTables(con)

 [1] "allergies"         "careplans"         "conditions"       
 [4] "devices"           "encounters"        "imaging_studies"  
 [7] "immunizations"     "medications"       "observations"     
[10] "organizations"     "patients"          "payer_transitions"
[13] "payers"            "procedures"        "providers"        
[16] "supplies"

q1 <- "
SELECT * FROM conditions
"
df1 = dbGetQuery(con, q1)
head(df1)

       START       STOP                              PATIENT
1 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd763fa99482
2 2020-03-01 2020-03-01 1ff7f10f-a204-4bb1-aa72-dd763fa99482
3 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd763fa99482
4 2020-02-12 2020-02-26 9bcf6ed5-d808-44af-98a0-7d78a29ede72
5 2020-03-13 2020-04-14 9bcf6ed5-d808-44af-98a0-7d78a29ede72
6 2020-03-13 2020-04-14 9bcf6ed5-d808-44af-98a0-7d78a29ede72
                             ENCOUNTER      CODE              DESCRIPTION
1 60584c6a-e26b-4176-a660-86ad84c7ceca 386661006          Fever (finding)
2 60584c6a-e26b-4176-a660-86ad84c7ceca 840544004       Suspected COVID-19
3 60584c6a-e26b-4176-a660-86ad84c7ceca 840539006                 COVID-19
4 5d9dfe80-7edf-4181-9753-770e361934d1  44465007          Sprain of ankle
5 a78e78d9-33bb-40bc-9e42-e47ff7e910cc  49727002          Cough (finding)
6 a78e78d9-33bb-40bc-9e42-e47ff7e910cc 248595008 Sputum finding (finding)
  days_duration
1            29
2             0
3            29
4            14
5            32
6            32

3.3 Removing duplicate records

For more sophisticated deduplication that preserves the most recent record:

q_dedup <- "
SELECT DISTINCT *
FROM conditions;
"

df_dedup <- dbGetQuery(con, q_dedup)
head(df_dedup)

       START       STOP                              PATIENT
1 2020-03-17 2020-03-30 83082363-aef0-4f91-96a8-c9f864ea6a35
2 2020-02-18 2020-03-19 7c74d68f-4a82-4d74-8381-7329af8123f6
3 1968-05-26       <NA> 2ec42d0a-23c9-4feb-a537-3d883a07e93c
4 2020-03-09 2020-03-09 cb33c8ca-cda7-4127-8624-d32042a05651
5 2020-01-29 2020-02-25 5058675e-160b-478b-8f14-55ace1a4b690
6 2020-03-01 2020-04-02 599ec95d-48b3-460d-99dc-97dc1099a998
                             ENCOUNTER      CODE        DESCRIPTION
1 7e8e4d7b-13a7-43dd-9ae8-cdfca239ecc5 386661006    Fever (finding)
2 28d1d62a-4bfe-423d-a9f9-f942d3504ac1  56018004 Wheezing (finding)
3 2c13ca08-4073-4339-97f6-25879939fe45  15777000        Prediabetes
4 dba29554-4e69-4f7e-a27e-034168e2b37e 840544004 Suspected COVID-19
5 b0dbf4d5-1d43-4141-af66-9814ecdd9a1e  49727002    Cough (finding)
6 1ef4dc01-de0d-404e-8117-3f4b2e9ce82e 840539006           COVID-19
  days_duration
1            13
2            30
3            NA
4             0
5            27
6            32

3.4 Handling missing values

Remove rows with null values:

q3 <- "DELETE FROM conditions WHERE PATIENT IS NULL;"

# Execute the DELETE statement (returns number of rows affected)
rows_deleted <- dbExecute(con, q3)
rows_deleted

[1] 0

# If you want to see the results after deletion, run a separate SELECT
q3_check <- "SELECT COUNT(*) as remaining_rows FROM conditions;"
remaining_count <- dbGetQuery(con, q3_check)
remaining_count

  remaining_rows
1        1069109

Impute null values with a default:

# Execute the UPDATE statement
q4 <- "UPDATE conditions
SET PATIENT = 'unknown'
WHERE PATIENT IS NULL;"

rows_updated <- dbExecute(con, q4)
rows_updated

[1] 0

# Query to see records with 'unknown' PATIENT values
q4_check <- "SELECT * FROM conditions WHERE PATIENT = 'unknown';"
df_unknown <- dbGetQuery(con, q4_check)
head(df_unknown)

[1] START         STOP          PATIENT       ENCOUNTER     CODE         
[6] DESCRIPTION   days_duration
<0 rows> (or 0-length row.names)

Use conditional logic for context-sensitive replacements:

# DuckDB supports IF NOT EXISTS
q_add_column_safe <- "ALTER TABLE conditions ADD COLUMN IF NOT EXISTS days_duration INTEGER;"
dbExecute(con, q_add_column_safe)

[1] 0

# Calculate days_duration for all records
q_calc_days <- "UPDATE conditions SET days_duration = STOP - START;"
dbExecute(con, q_calc_days)

[1] 1069109

# Update missing values with average by CODE
q_update_missing <- "
UPDATE conditions
SET days_duration = (
  SELECT AVG(days_duration) 
  FROM conditions c2 
  WHERE c2.CODE = conditions.CODE 
    AND c2.days_duration IS NOT NULL
)
WHERE STOP IS NULL;
"

rows_updated <- dbExecute(con, q_update_missing)
rows_updated

[1] 457067

# See the first few rows with the new days_duration column
q_check_modified <- "SELECT * FROM conditions LIMIT 10;"
df_modified <- dbGetQuery(con, q_check_modified)
head(df_modified)

       START       STOP                              PATIENT
1 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd763fa99482
2 2020-03-01 2020-03-01 1ff7f10f-a204-4bb1-aa72-dd763fa99482
3 2020-03-01 2020-03-30 1ff7f10f-a204-4bb1-aa72-dd763fa99482
4 2020-02-12 2020-02-26 9bcf6ed5-d808-44af-98a0-7d78a29ede72
5 2020-03-13 2020-04-14 9bcf6ed5-d808-44af-98a0-7d78a29ede72
6 2020-03-13 2020-04-14 9bcf6ed5-d808-44af-98a0-7d78a29ede72
                             ENCOUNTER      CODE              DESCRIPTION
1 60584c6a-e26b-4176-a660-86ad84c7ceca 386661006          Fever (finding)
2 60584c6a-e26b-4176-a660-86ad84c7ceca 840544004       Suspected COVID-19
3 60584c6a-e26b-4176-a660-86ad84c7ceca 840539006                 COVID-19
4 5d9dfe80-7edf-4181-9753-770e361934d1  44465007          Sprain of ankle
5 a78e78d9-33bb-40bc-9e42-e47ff7e910cc  49727002          Cough (finding)
6 a78e78d9-33bb-40bc-9e42-e47ff7e910cc 248595008 Sputum finding (finding)
  days_duration
1            29
2             0
3            29
4            14
5            32
6            32

3.5 Correcting inconsistent or invalid data

UPDATE conditions
SET PATIENT = TRIM(PATIENT);

UPDATE conditions
SET PATIENT = UPPER(PATIENT);

UPDATE conditions
SET PATIENT = REPLACE(PATIENT, 'old_value', 'new_value');

3.5.1 Data normalization

# For DuckDB, use CAST or :: operator for date conversion
q_normal <- "
UPDATE conditions
SET START = CAST(START AS DATE);
"

rows_updated <- dbExecute(con, q_normal)
rows_updated

[1] 1069109

3.6 Handling outliers

Use aggregate functions to calculate summary statistics, then remove or adjust values outside acceptable ranges:

First, check for outliers before deleting:

# Check current statistics and potential outliers
q_check_outliers <- "
SELECT 
  COUNT(*) as total_records,
  AVG(days_duration) as mean_duration,
  STDDEV(days_duration) as std_duration,
  MIN(days_duration) as min_duration,
  MAX(days_duration) as max_duration,
  AVG(days_duration) + 3 * STDDEV(days_duration) as upper_limit,
  AVG(days_duration) - 3 * STDDEV(days_duration) as lower_limit
FROM conditions;
"

outlier_stats <- dbGetQuery(con, q_check_outliers)
outlier_stats

  total_records mean_duration std_duration min_duration max_duration
1       1069109      17.66605     10.57688            0           49
  upper_limit lower_limit
1    49.39667   -14.06458

Count how many outliers will be removed:

q_count_outliers <- "
SELECT COUNT(*) as outliers_to_remove
FROM conditions
WHERE days_duration > (
    SELECT AVG(days_duration) + 3 * STDDEV(days_duration) FROM conditions
)
OR days_duration < (
    SELECT AVG(days_duration) - 3 * STDDEV(days_duration) FROM conditions
);
"

outlier_count <- dbGetQuery(con, q_count_outliers)
outlier_count

  outliers_to_remove
1                  0

Execute the deletion (your query is correct):

q_remove_outliers <- "
DELETE FROM conditions
WHERE days_duration > (
    SELECT AVG(days_duration) + 3 * STDDEV(days_duration) FROM conditions
)
OR days_duration < (
    SELECT AVG(days_duration) - 3 * STDDEV(days_duration) FROM conditions
);
"

rows_deleted <- dbExecute(con, q_remove_outliers)
rows_deleted

[1] 0

Verify the results:

# Check final statistics after outlier removal
q_final_stats <- "
SELECT 
  COUNT(*) as remaining_records,
  AVG(days_duration) as new_mean,
  STDDEV(days_duration) as new_std,
  MIN(days_duration) as new_min,
  MAX(days_duration) as new_max
FROM conditions;
"

final_stats <- dbGetQuery(con, q_final_stats)
final_stats

  remaining_records new_mean  new_std new_min new_max
1           1069109 17.66605 10.57688       0      49

3.7 Sumary

A systematic approach ensures comprehensive data quality improvement while maintaining data integrity and business context. The following sequence provides a proven framework for SQL-based cleaning workflows:

Profiling and assessment - Analyze data distributions, identify patterns, and quantify quality issues
Data validation and filtering - Apply business rules and constraints to flag problematic records
Fixing missing data - Implement appropriate imputation strategies based on business requirements
Standardization and transformation - Normalize formats, data types, and categorical values
Removing duplicates - Eliminate redundant records while preserving data completeness
Correcting errors - Fix identified inaccuracies through automated rules and manual review
Handling outliers - Evaluate extreme values for legitimacy and business impact
Data-integrity checks and constraints - Implement ongoing validation mechanisms