# Delete records whose PATIENT identifier is missing.
# dbExecute() runs a data-modifying statement and returns the
# number of rows affected.
q3 <- "DELETE FROM conditions WHERE PATIENT IS NULL;"
rows_deleted <- dbExecute(con, q3)
rows_deleted
[1] 0
# To inspect the table after the deletion, run a separate SELECT.
q3_check <- "SELECT COUNT(*) as remaining_rows FROM conditions;"
remaining_count <- dbGetQuery(con, q3_check)
remaining_count
remaining_rows
1 1069109
Impute null values with a default:
# Impute missing PATIENT values with a sentinel default.
# Note: whitespace is required between SQL clauses; the string is
# written across lines so UPDATE / SET / WHERE stay separated.
q4 <- "
UPDATE conditions
SET PATIENT = 'unknown'
WHERE PATIENT IS NULL;
"
rows_updated <- dbExecute(con, q4)
rows_updated
[1] 0
# Fetch the records that now carry the 'unknown' PATIENT sentinel.
q4_check <- "SELECT * FROM conditions WHERE PATIENT = 'unknown';"
df_unknown <- dbGetQuery(con, q4_check)
head(df_unknown)
Use conditional logic for context-sensitive replacements:
# DuckDB supports IF NOT EXISTS, so re-running this is harmless.
q_add_column_safe <- "ALTER TABLE conditions ADD COLUMN IF NOT EXISTS days_duration INTEGER;"
dbExecute(con, q_add_column_safe)
[1] 0
# Populate days_duration for every record as STOP minus START.
q_calc_days <- "UPDATE conditions SET days_duration = STOP - START;"
dbExecute(con, q_calc_days)
[1] 1069109
# Impute missing durations with the average duration observed for
# the same CODE (a correlated subquery against the same table).
# Clauses are kept on separate lines so the SQL stays well-formed.
q_update_missing <- "
UPDATE conditions
SET days_duration = (
  SELECT AVG(days_duration)
  FROM conditions c2
  WHERE c2.CODE = conditions.CODE
    AND c2.days_duration IS NOT NULL
)
WHERE STOP IS NULL;
"
rows_updated <- dbExecute(con, q_update_missing)
rows_updated
[1] 457067
# Preview a handful of rows including the new days_duration column.
q_check_modified <- "SELECT * FROM conditions LIMIT 10;"
df_modified <- dbGetQuery(con, q_check_modified)
head(df_modified)
-- Strip leading/trailing whitespace from every PATIENT value.
UPDATE conditions
SET PATIENT = TRIM(PATIENT);
-- Normalize PATIENT values to upper case for consistent matching.
UPDATE conditions
SET PATIENT = UPPER(PATIENT);
-- Substitute a specific substring in PATIENT values
-- (replace 'old_value'/'new_value' with the real strings).
UPDATE conditions
SET PATIENT = REPLACE(PATIENT, 'old_value', 'new_value');
3.5.1 Data normalization
# Normalize the START column: in DuckDB, use CAST (or the ::
# operator) for date conversion. UPDATE and SET are separated
# across lines to keep the statement syntactically valid.
q_normal <- "
UPDATE conditions
SET START = CAST(START AS DATE);
"
rows_updated <- dbExecute(con, q_normal)
rows_updated
[1] 1069109
3.6 Handling outliers
Use aggregate functions to calculate summary statistics, then remove or adjust values outside acceptable ranges:
First, check for outliers before deleting:
# Profile days_duration before deleting anything: count, mean,
# spread, range, and the mean +/- 3 SD limits used as the outlier
# fence. The final alias and FROM are kept on separate lines so
# the query parses correctly.
q_check_outliers <- "
SELECT
  COUNT(*) as total_records,
  AVG(days_duration) as mean_duration,
  STDDEV(days_duration) as std_duration,
  MIN(days_duration) as min_duration,
  MAX(days_duration) as max_duration,
  AVG(days_duration) + 3 * STDDEV(days_duration) as upper_limit,
  AVG(days_duration) - 3 * STDDEV(days_duration) as lower_limit
FROM conditions;
"
outlier_stats <- dbGetQuery(con, q_check_outliers)
outlier_stats
# Count how many rows fall outside the mean +/- 3 SD fence before
# deleting them. SELECT / FROM / WHERE / OR are separated across
# lines so the statement is well-formed SQL.
q_count_outliers <- "
SELECT COUNT(*) as outliers_to_remove
FROM conditions
WHERE days_duration > (
  SELECT AVG(days_duration) + 3 * STDDEV(days_duration) FROM conditions
)
OR days_duration < (
  SELECT AVG(days_duration) - 3 * STDDEV(days_duration) FROM conditions
);
"
outlier_count <- dbGetQuery(con, q_count_outliers)
outlier_count
outliers_to_remove
1 0
Execute the deletion:
# Delete rows outside the mean +/- 3 SD fence. Clauses are kept on
# separate lines (DELETE ... FROM / WHERE / OR) so the SQL parses.
q_remove_outliers <- "
DELETE FROM conditions
WHERE days_duration > (
  SELECT AVG(days_duration) + 3 * STDDEV(days_duration) FROM conditions
)
OR days_duration < (
  SELECT AVG(days_duration) - 3 * STDDEV(days_duration) FROM conditions
);
"
rows_deleted <- dbExecute(con, q_remove_outliers)
rows_deleted
[1] 0
Verify the results:
# Recompute summary statistics after outlier removal to verify the
# effect of the deletion. The last alias and FROM are separated so
# the query stays syntactically valid.
q_final_stats <- "
SELECT
  COUNT(*) as remaining_records,
  AVG(days_duration) as new_mean,
  STDDEV(days_duration) as new_std,
  MIN(days_duration) as new_min,
  MAX(days_duration) as new_max
FROM conditions;
"
final_stats <- dbGetQuery(con, q_final_stats)
final_stats
A systematic approach ensures comprehensive data quality improvement while maintaining data integrity and business context. The following sequence provides a proven framework for SQL-based cleaning workflows:
Profiling and assessment - Analyze data distributions, identify patterns, and quantify quality issues
Data validation and filtering - Apply business rules and constraints to flag problematic records
Fixing missing data - Implement appropriate imputation strategies based on business requirements
Standardization and transformation - Normalize formats, data types, and categorical values
Removing duplicates - Eliminate redundant records while preserving data completeness
Correcting errors - Fix identified inaccuracies through automated rules and manual review
Handling outliers - Evaluate extreme values for legitimacy and business impact
Data-integrity checks and constraints - Implement ongoing validation mechanisms