13.5. Grouping and Aggregates#
The GROUP BY
allows you to group values together and then apply one of the following five functions
MAX()
MIN()
AVG()
SUM()
COUNT()
The HAVING
keyword, which follows the GROUP BY
keyword allows you to filter the aggregated results.
13.5.1. A Simple Grouping Examples#
Find the total number of lab results for each patients
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable
GROUP BY PatientID
ORDER BY -LabCount
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable
GROUP BY PatientID
ORDER BY -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT PatientID, Count(LabName) LabCount
8 FROM LabsCorePopulatedTable
9 GROUP BY PatientID
10 ORDER BY -LabCount
11 """
ModuleNotFoundError: No module named 'pandas'
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable
GROUP BY PatientID
HAVING LabCount > 2000
ORDER BY -LabCount
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT PatientID, Count(LabName) LabCount
FROM LabsCorePopulatedTable
GROUP BY PatientID
HAVING LabCount > 2000
ORDER BY -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[2], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT PatientID, Count(LabName) LabCount
8 FROM LabsCorePopulatedTable
(...) 11 ORDER BY -LabCount
12 """
ModuleNotFoundError: No module named 'pandas'
SELECT PatientID, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable
WHERE LabName = 'URINALYSIS: RED BLOOD CELLS'
GROUP BY PatientID
ORDER BY -AVERAGE
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT PatientID, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable
WHERE LabName = 'URINALYSIS: RED BLOOD CELLS'
GROUP BY PatientID
ORDER BY -AVERAGE
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[3], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT PatientID, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
8 FROM LabsCorePopulatedTable
(...) 11 ORDER BY -AVERAGE
12 """
ModuleNotFoundError: No module named 'pandas'
13.5.2. Multicolumn Grouping#
SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable
GROUP BY PatientID, LabName
ORDER BY -PatientID, -LabCount
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
FROM LabsCorePopulatedTable
GROUP BY PatientID, LabName
ORDER BY -PatientID, -LabCount
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[4], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT PatientID, LabName, Count(LabName) LabCount, Max(LabValue), Min(LabValue), SUM(LabValue), ROUND(AVG(LabValue),2) AVERAGE
8 FROM LabsCorePopulatedTable
9 GROUP BY PatientID, LabName
10 ORDER BY -PatientID, -LabCount
11 """
ModuleNotFoundError: No module named 'pandas'
13.5.3. Date Difference#
SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
FROM AdmissionsCorePopulatedTable
GROUP BY PatientID
HAVING MAX_STAY >= 20
ORDER BY -MAX_STAY
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
FROM AdmissionsCorePopulatedTable
GROUP BY PatientID
HAVING MAX_STAY >= 20
ORDER BY -MAX_STAY
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[5], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT PatientID, ROUND(MAX(julianday(AdmissionEndDate)-julianday(AdmissionStartDate))) MAX_STAY
8 FROM AdmissionsCorePopulatedTable
(...) 11 ORDER BY -MAX_STAY
12 """
ModuleNotFoundError: No module named 'pandas'