13.5. Grouping and Aggregates#

The GROUP BY allows you to group values together and then apply one of the following five functions

  1. MAX()

  2. MIN()

  3. AVG()

  4. SUM()

  5. COUNT()

The HAVING keyword, which follows the GROUP BY keyword allows you to filter the aggregated results.

13.5.1. A Simple Grouping Examples#

Find the total number of lab results for each patients

SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable 
GROUP BY PatientID
ORDER BY -LabCount
Hide code cell source
import sqlite3
import pandas as pd

conn = sqlite3.connect('100_patients.db')

sql_statement = """
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable 
GROUP BY PatientID
ORDER BY -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 2
      1 import sqlite3
----> 2 import pandas as pd
      4 conn = sqlite3.connect('100_patients.db')
      6 sql_statement = """
      7 SELECT PatientID, Count(LabName) LabCount
      8 FROM LabsCorePopulatedTable 
      9 GROUP BY PatientID
     10 ORDER BY -LabCount
     11 """

ModuleNotFoundError: No module named 'pandas'
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable 
GROUP BY PatientID
HAVING LabCount > 2000
ORDER BY -LabCount
Hide code cell source
import sqlite3
import pandas as pd

conn = sqlite3.connect('100_patients.db')

sql_statement = """
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable 
GROUP BY PatientID
HAVING LabCount > 2000
ORDER BY -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[2], line 2
      1 import sqlite3
----> 2 import pandas as pd
      4 conn = sqlite3.connect('100_patients.db')
      6 sql_statement = """
      7 SELECT PatientID, Count(LabName) LabCount
      8 FROM LabsCorePopulatedTable 
   (...)     11 ORDER BY -LabCount
     12 """

ModuleNotFoundError: No module named 'pandas'
SELECT PatientID, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable 
WHERE LabName = 'URINALYSIS: RED BLOOD CELLS'
GROUP BY PatientID
ORDER BY -AVERAGE
Hide code cell source
import sqlite3
import pandas as pd

conn = sqlite3.connect('100_patients.db')

sql_statement = """
SELECT PatientID, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable 
WHERE LabName = 'URINALYSIS: RED BLOOD CELLS'
GROUP BY PatientID
ORDER BY -AVERAGE
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[3], line 2
      1 import sqlite3
----> 2 import pandas as pd
      4 conn = sqlite3.connect('100_patients.db')
      6 sql_statement = """
      7 SELECT PatientID, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
      8 FROM LabsCorePopulatedTable 
   (...)     11 ORDER BY -AVERAGE
     12 """

ModuleNotFoundError: No module named 'pandas'

13.5.2. Multicolumn Grouping#

SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable 
GROUP BY PatientID, LabName
ORDER BY -PatientID, -LabCount
Hide code cell source
import sqlite3
import pandas as pd

conn = sqlite3.connect('100_patients.db')

sql_statement = """
SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable 
GROUP BY PatientID, LabName
ORDER BY -PatientID, -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[4], line 2
      1 import sqlite3
----> 2 import pandas as pd
      4 conn = sqlite3.connect('100_patients.db')
      6 sql_statement = """
      7 SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue),  Min(LabValue),  SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
      8 FROM LabsCorePopulatedTable 
      9 GROUP BY PatientID, LabName
     10 ORDER BY -PatientID, -LabCount
     11 """

ModuleNotFoundError: No module named 'pandas'

13.5.3. Date Difference#

SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
FROM AdmissionsCorePopulatedTable 
GROUP BY PatientID
HAVING MAX_STAY >= 20
ORDER BY -MAX_STAY
Hide code cell source
import sqlite3
import pandas as pd

conn = sqlite3.connect('100_patients.db')

sql_statement = """
SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
FROM AdmissionsCorePopulatedTable 
GROUP BY PatientID
HAVING MAX_STAY >= 20
ORDER BY -MAX_STAY
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[5], line 2
      1 import sqlite3
----> 2 import pandas as pd
      4 conn = sqlite3.connect('100_patients.db')
      6 sql_statement = """
      7 SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
      8 FROM AdmissionsCorePopulatedTable 
   (...)     11 ORDER BY -MAX_STAY
     12 """

ModuleNotFoundError: No module named 'pandas'