4  Data Analysis With SQL

4.1 Introduction

In this chapter, we perform data processing and analysis tasks, including filtering, recoding values, formatting string variables, creating date durations, joining tables, and counting rows using SQL. COVID-19 data are utilized to generate summary calculations such as counts, sum, and percentage of events across different patient groups.

4.2 Load Data and necessary r packages

# Load necessary packages
library(DBI)
library(duckdb)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(RPostgres)
library(tools)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.1     ✔ readr     2.1.5
✔ ggplot2   4.0.0     ✔ stringr   1.6.0
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.2.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Connect to DuckDB database

# Close any existing connections first
if (exists("con")) {
  try(dbDisconnect(con), silent = TRUE)
}

# Connect to DuckDB database
con <- dbConnect(
  duckdb::duckdb(),
  dbdir = "/Users/nnthieu/SyntheaData/SyntheaCovid19/synthea19.duckdb"
)

Verify tables

# Load the PostgreSQL extension
dbExecute(con, "INSTALL postgres; LOAD postgres;")
[1] 0
# Verify tables
dbListTables(con)
 [1] "allergies"         "careplans"         "conditions"       
 [4] "devices"           "encounters"        "imaging_studies"  
 [7] "immunizations"     "medications"       "observations"     
[10] "organizations"     "patients"          "payer_transitions"
[13] "payers"            "procedures"        "providers"        
[16] "supplies"         

4.3 Data analysis

4.3.0.1 Count number of patients seen in year 2000

 q1 <- "
SELECT COUNT(DISTINCT PATIENT) AS patients_counts FROM encounters
  WHERE EXTRACT(YEAR FROM START) = 2020
  ; "
 df1 <- dbGetQuery(con,q1)
 head(df1)
  patients_counts
1           98400

4.3.0.2 Count total patients diagnosed with COVID-19 in the year 2020

First, identify COVID-19 patients in the conditions table where code = '840539006', which represents a COVID-19 diagnosis.

q2 <- "SELECT DISTINCT PATIENT AS covid_patient_ids FROM conditions
  WHERE CODE = '840539006'
  AND EXTRACT(YEAR FROM START) = 2020
  "
covid_patient_ids = dbGetQuery(con, q2)

print(head(covid_patient_ids))
                     covid_patient_ids
1 94181a8e-4f80-45ed-89ff-b60aa6b8f4b4
2 67a7eef9-2e8f-4a5f-828e-39adf83495e6
3 9e20ba93-c722-46c2-a99e-a4c87a34e55a
4 ad0342e3-6958-40d3-9e0b-3dadcd05ba99
5 e199f46f-6785-4e5a-bb70-70ee08852d63
6 b2cf32f7-5ae1-4996-bd78-1e58c9402905
cat("Number of COVID-19 patients:", nrow(covid_patient_ids), "\n")
Number of COVID-19 patients: 88166 

List of the covid19 patients in year 2020

q3 <- "
SELECT DISTINCT PATIENT AS covid_patient_ids FROM encounters
  WHERE REASONDESCRIPTION = 'COVID-19'
  AND EXTRACT(YEAR FROM START) = 2020
  "
covid_patient_ids = dbGetQuery(con, q3)

print(head(covid_patient_ids))
                     covid_patient_ids
1 346b2b95-b8c5-4fe1-acf3-94de7e6e965b
2 e43fe5fb-3841-4690-b2a6-cca56edd7916
3 3079f00b-3055-4b37-ae4d-bf81d30e345e
4 5c99edc3-b1c2-45c0-a876-f879c45dab98
5 1cb463bf-6b1d-4ac8-9538-aeccc562f9e3
6 e5701c1d-05da-472f-b490-6b6c57601ef5
cat("Number of COVID-19 patients:", nrow(covid_patient_ids), "\n")
Number of COVID-19 patients: 18180 

However, there are just 18,180 patients with covid-19 diagnosis seeing care via encounters. The reason is that the patients with covid-19 diagnosis are not always seen in the encounters table.

q4 <- "
SELECT COUNT(DISTINCT PATIENT) AS covid19_counts FROM encounters
  WHERE REASONDESCRIPTION = 'COVID-19'
  AND EXTRACT(YEAR FROM START) = 2020
  ; "
dbGetQuery(con, q4)
  covid19_counts
1          18180

4.3.0.3 Count total patients with covid-19 in year 2020 by age group

In this example, we join two tables, create an ‘age’ column based on the date of birth and the date of COVID-19 infection, then recode ‘age’ into age groups, and count the number and percentage of distinct patients who contracted COVID-19 in each age group.

q5 <- "
SELECT 
  age_group,
  COUNT(DISTINCT id) AS covid_patient_count,
  ROUND(
    COUNT(DISTINCT id) * 100.0 / 
    SUM(COUNT(DISTINCT id)) OVER (), 
    2
  ) AS percent_of_total
FROM (
  SELECT 
    p.id,
    CASE 
      WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 < 18 THEN '0-17'
      WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 18 AND 34 THEN '18-34'
      WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 35 AND 49 THEN '35-49'
      WHEN DATEDIFF('day', p.birthdate, c.start) / 365.25 BETWEEN 50 AND 64 THEN '50-64'
      ELSE '65+'
    END AS age_group
  FROM conditions c
  LEFT JOIN patients p ON c.patient = p.id 
  WHERE c.code = '840539006'
    AND EXTRACT(YEAR FROM c.start) = 2020
) sub
GROUP BY age_group
ORDER BY age_group;"
results = dbGetQuery(con, q5)
print(results)
  age_group covid_patient_count percent_of_total
1      0-17               17416            19.75
2     18-34               19364            21.96
3     35-49               15351            17.41
4     50-64               17682            20.06
5       65+               18353            20.82

4.3.0.4 Hospitalized patients for covid-19

Number of patients hospitalized because of covid-19.

q6 <- "
SELECT COUNT(*) AS hospitalized_patients,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(DISTINCT patient)
  FROM conditions
  WHERE code = '840539006'), 2) AS percent_of_covid_hospitalized_patients
FROM encounters 
WHERE REASONCODE = '840539006' AND CODE = '1505002' ;
"
dbGetQuery(con, q6)
  hospitalized_patients percent_of_covid_hospitalized_patients
1                 18177                                  20.62

4.3.0.5 Patients recovered from covid-19 whose covid-19 status is negative after home isolation

q7 <- "
SELECT COUNT(DISTINCT c.patient) AS recovered_patients,
ROUND(100.0 * COUNT(*) / (SELECT COUNT(DISTINCT patient)
  FROM conditions
  WHERE code = '840539006'), 2) AS percent_of_home_covid_recovered_patients
FROM careplans c
LEFT JOIN observations o ON c.patient = o.patient
WHERE c.code = '736376001'
  AND c.reasoncode = '840539006'
  AND c.stop IS NOT NULL
  AND o.code = '94531-1'
  AND o.value = 'Not detected (qualifier value)';
"
dbGetQuery(con, q6)
  hospitalized_patients percent_of_covid_hospitalized_patients
1                 18177                                  20.62

4.3.0.6 Health outcomes experienced by COVID-19 patients

This example shows how to recode string values into categories using CASE statements, join tables to link COVID-19 patients with their health outcomes, and aggregate the results to count the number and percentage of patients experiencing each outcome.

q7. <- "
WITH covid_patients AS (
  SELECT DISTINCT patient
  FROM conditions
  WHERE code = '840539006'
),
outcome_patients AS (
  SELECT DISTINCT patient,
    CASE 
      WHEN description ILIKE '%sepsis%' THEN 'Sepsis'
      WHEN description ILIKE '%septic shock%' THEN 'Septic Shock'
      WHEN description ILIKE '%respiratory failure%' THEN 'Respiratory Failure'
      WHEN description ILIKE '%Respiratory distress%' THEN 'Respiratory distress'
      WHEN description ILIKE '%acute cardiac injury%' THEN 'Acute Cardiac Injury'
      WHEN description ILIKE '%heart failure%' THEN 'Heart Failure'
      WHEN description ILIKE '%pneumonia%' THEN 'Pneumonia'
      WHEN description ILIKE '%acute kidney failure%' THEN 'Acute Kidney Failure'
      WHEN description ILIKE '%Acute pulmonary embolism%' THEN 'Acute pulmonary embolism'
    END AS outcome
  FROM conditions
  WHERE description ILIKE '%sepsis%' 
     OR description ILIKE '%septic shock%' 
     OR description ILIKE '%respiratory failure%' 
     OR description ILIKE '%Respiratory distress%' 
     OR description ILIKE '%acute cardiac injury%' 
     OR description ILIKE '%heart failure%' 
     OR description ILIKE '%pneumonia%' 
     OR description ILIKE '%acute kidney failure%' 
     OR description ILIKE '%Acute pulmonary embolism%'
),
covid_with_outcomes AS (
  SELECT cp.patient, op.outcome
  FROM covid_patients cp
  JOIN outcome_patients op ON cp.patient = op.patient
)
SELECT 
  outcome,
  COUNT(DISTINCT patient) AS patient_count,
  ROUND(100.0 * COUNT(DISTINCT patient) / (SELECT COUNT(*) FROM covid_patients), 2) AS percent_of_covid_patients
FROM covid_with_outcomes
GROUP BY outcome
ORDER BY percent_of_covid_patients DESC;
"
dbGetQuery(con, q7)
  recovered_patients percent_of_home_covid_recovered_patients
1                437                                     0.99

4.3.0.7 Labs for COVID-19 patients

This example demonstrates how to recode ‘lab code’ values into descriptive test names using CASE statements, filter specific lab values needed for insights, join multiple tables to link COVID-19 patients with their lab results, calculate the duration since COVID-19 diagnosis, and categorize patients based on their survival status.

q8 <- "
SELECT 
  o.date, 
  o.patient, 
  o.encounter, 
  o.code, 
  o.value, 
  o.units, 
  CASE o.code
    WHEN '48065-7' THEN 'D-dimer'
    WHEN '2276-4' THEN 'Serum Ferritin'
    WHEN '89579-7' THEN 'High Sensitivity Cardiac Troponin I'
    WHEN '26881-3' THEN 'IL-6'
    WHEN '731-0' THEN 'Lymphocytes'
    WHEN '14804-9' THEN 'Lactate dehydrogenase'
    ELSE NULL
  END AS test,
  c.start,
  o.date - c.start AS covid_days,
  c.code AS con_code, 
  c.description AS con_description,
  CASE 
    WHEN p.deathdate IS NULL THEN 'survivor'
    ELSE 'die'
  END AS survivor
FROM observations o
LEFT JOIN conditions c ON o.patient = c.patient
LEFT JOIN patients p ON o.patient = p.id
WHERE c.code = '840539006'
  AND o.code IN ('48065-7', '26881-3', '2276-4', '89579-7', '731-0', '14804-9')
"
labs_covid19 <- dbGetQuery(con, q8)
head(labs_covid19, 5)
        DATE                              PATIENT
1 2020-02-19 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e
2 2020-02-19 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e
3 2020-02-19 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e
4 2020-02-19 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e
5 2020-02-19 bd1c4ffc-7f1d-4590-adbb-1d6533fb623e
                             ENCOUNTER    CODE VALUE   UNITS
1 b7455838-3607-47f4-aaa5-fd89abea7d29   731-0   1.1 10*3/uL
2 b7455838-3607-47f4-aaa5-fd89abea7d29 48065-7   0.4   ug/mL
3 b7455838-3607-47f4-aaa5-fd89abea7d29  2276-4 332.4    ug/L
4 b7455838-3607-47f4-aaa5-fd89abea7d29 89579-7   2.3   pg/mL
5 b7455838-3607-47f4-aaa5-fd89abea7d29 14804-9 223.9     U/L
                                 test      START covid_days  con_code
1                         Lymphocytes 2020-02-19          0 840539006
2                             D-dimer 2020-02-19          0 840539006
3                      Serum Ferritin 2020-02-19          0 840539006
4 High Sensitivity Cardiac Troponin I 2020-02-19          0 840539006
5               Lactate dehydrogenase 2020-02-19          0 840539006
  con_description survivor
1        COVID-19 survivor
2        COVID-19 survivor
3        COVID-19 survivor
4        COVID-19 survivor
5        COVID-19 survivor

Lab visualization

library(ggplot2)
options(repr.plot.width = 21, repr.plot.height = 17) 

ggplot(labs_covid19, aes(x = as.factor(covid_days), y = as.numeric(VALUE), fill = survivor)) +
  geom_boxplot(outlier.alpha = 0.3) +
  facet_wrap(~ test, scales = "free_y", ncol = 2, nrow = 3) +
  labs(
    title = "Distribution of Lab Values by Days Since COVID-19 Diagnosis",
    x = "Days Since COVID-19",
    y = "Test Result Value"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    strip.text = element_text(face = "bold")
  )

4.3.0.8 Symptoms experienced by COVID-19 patients

q9 <-  "
WITH covid_patients AS (
  SELECT DISTINCT patient
  FROM conditions
  WHERE code = '840539006'
),
patient_symptoms AS (
  SELECT c.description
  FROM conditions c
  JOIN covid_patients cp ON c.patient = cp.patient
  WHERE LOWER(c.description) NOT IN (
    'covid-19',
    'suspected covid-19',
    'body mass index 30+ - obesity (finding)',
    'prediabetes',
    'hypertension'
  )
  AND LOWER(c.description) NOT LIKE '%chronic%'
  AND LOWER(c.description) NOT LIKE '%anemia%'
  AND LOWER(c.description) NOT LIKE '%Hypoxemia%'
  
)
SELECT 
  description,
  COUNT(*) AS symptom_count,
  ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM patient_symptoms), 2) AS percent
FROM patient_symptoms
GROUP BY description
ORDER BY symptom_count DESC
LIMIT 14;
"
symptoms = dbGetQuery(con,q9)
symptoms
                      DESCRIPTION symptom_count percent
1                 Fever (finding)         78341   13.09
2                 Cough (finding)         59766    9.99
3         Loss of taste (finding)         44646    7.46
4               Fatigue (finding)         33855    5.66
5        Sputum finding (finding)         29666    4.96
6  Respiratory distress (finding)         18175    3.04
7            Pneumonia (disorder)         18175    3.04
8            Hypoxemia (disorder)         18175    3.04
9              Wheezing (finding)         17516    2.93
10              Dyspnea (finding)         17516    2.93
11 Miscarriage in first trimester         14972    2.50
12           Joint pain (finding)         13290    2.22
13          Muscle pain (finding)         13290    2.22
14  Sore throat symptom (finding)         12397    2.07

4.3.0.9 Length of symptoms in days

q10 <- "
WITH covid_patients AS (
  SELECT DISTINCT patient
  FROM conditions
  WHERE code = '840539006'
)
SELECT 
  c.description,
  ROUND(AVG(EXTRACT(DAY FROM c.stop) - EXTRACT(DAY FROM c.start)), 2) AS avg_duration
FROM conditions c
JOIN covid_patients cp ON c.patient = cp.patient
WHERE c.stop IS NOT NULL AND c.start IS NOT NULL
AND c.description IN (
'Fever (finding)',  
'Cough (finding)'   ,
'Loss of taste (finding)'   ,
'Fatigue (finding)',    
'Sputum finding (finding)'
)
GROUP BY c.description
ORDER BY avg_duration DESC;
"
symptom_length <- dbGetQuery(con, q10)

print(symptom_length)
               DESCRIPTION avg_duration
1 Sputum finding (finding)         4.90
2  Loss of taste (finding)         4.78
3        Fatigue (finding)         4.77
4          Fever (finding)         4.77
5          Cough (finding)         4.76

4.3.0.10 Deaths with COVID-19 by age-group

q11 <- "
SELECT 
  CASE 
    WHEN age < 18 THEN '0-17'
    WHEN age BETWEEN 18 AND 34 THEN '18-34'
    WHEN age BETWEEN 35 AND 49 THEN '35-49'
    WHEN age BETWEEN 50 AND 64 THEN '50-64'
    ELSE '65+'
  END AS age_group,
  COUNT(DISTINCT id) AS death_count,
  ROUND(100.0 * COUNT(DISTINCT id) / (SELECT COUNT(DISTINCT patient)
  FROM conditions
  WHERE code = '840539006'), 2) AS percent_of_covid_patients
FROM (
  SELECT 
    p.id,
    DATE_PART('year', c.start) - DATE_PART('year', p.birthdate) AS age
  FROM patients p
  JOIN conditions c ON p.id = c.patient
  WHERE c.code = '840539006'
    AND p.deathdate IS NOT NULL
    AND EXTRACT(YEAR FROM c.start) = 2020
) AS covid_deaths
GROUP BY age_group
ORDER BY age_group;

"
age_deaths = dbGetQuery(con, q11)
print(age_deaths)
  age_group death_count percent_of_covid_patients
1      0-17           6                      0.01
2     18-34         166                      0.19
3     35-49         206                      0.23
4     50-64         819                      0.93
5       65+        2444                      2.77

4.3.0.11 Visualize deaths with COVID-19 by age-group

ggplot(age_deaths, aes(x = age_group, y = death_count)) +
  geom_bar(stat = "identity", fill = "gray") +
  geom_text(
    aes(label = round(death_count, 2)),
    vjust = 1.5,        # Puts the count inside the bar
    color = "black",    # White text for count
    size = 4
  ) +
  geom_text(
    aes(label = paste("(", round(percent_of_covid_patients, 2), "%)", sep = "")),
    vjust = 3.5,       # Puts the percentage below the count
    color = "blue",    # White text for percentage
    size = 3
  ) +
  labs(
    title = "Covid-19 Deaths by Age Group in 2020",
    x = "Age Group",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 12))

4.4 Conclusion

In summary, this chapter demonstrated how to process and analyze healthcare data using SQL techniques. By applying filtering, transformation, and aggregation methods, we successfully derived meaningful insights from COVID-19 patient data to support further analysis and reporting.