13.8. Analytic Functions#
Analytic functions allow you to group rows into windows, partitioning
the data. Windows are defined using the OVER
clause and optionally
combined with the PARTITION
subclause.
Date Reference: https://www.techonthenet.com/sqlite/functions/julianday.php
13.8.1. Data Windows#
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
sum(total) Monthly_Sales
FROM
bakery_sales
GROUP BY
Quarter,
Month
ORDER BY
strftime('%m', sale_date)
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
sum(total) Monthly_Sales
FROM
bakery_sales
GROUP BY
Quarter,
Month
ORDER BY
strftime('%m', sale_date)
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 SELECT
8 CASE
(...) 39 strftime('%m', sale_date)
40 """
ModuleNotFoundError: No module named 'pandas'
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
Max(sum(total)) over() max_overall_sales,
Max(sum(total)) over(partition by quarter) max_quarter_sales
FROM SalesTable
GROUP BY Quarter, Month
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
Max(sum(total)) over() max_overall_sales,
Max(sum(total)) over(partition by quarter) max_quarter_sales
FROM SalesTable
GROUP BY Quarter, Month
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[2], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 44 GROUP BY Quarter, Month
45 """
ModuleNotFoundError: No module named 'pandas'
13.8.2. Localized Sorting#
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
rank() OVER (ORDER BY -sum(total)) SalesRank
FROM SalesTable
GROUP BY Quarter, Month
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
rank() OVER (ORDER BY -sum(total)) SalesRank
FROM SalesTable
GROUP BY Quarter, Month
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[3], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 43 GROUP BY Quarter, Month
44 """
ModuleNotFoundError: No module named 'pandas'
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
rank() OVER (PARTITION BY Quarter ORDER BY -sum(total)) SalesRank
FROM SalesTable
GROUP BY Quarter, Month
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
CASE
WHEN 0 + strftime('%m', sale_date) BETWEEN 1
AND 3 THEN 'Q1'
WHEN 0 + strftime('%m', sale_date) BETWEEN 4
AND 6 THEN 'Q2'
WHEN 0 + strftime('%m', sale_date) BETWEEN 7
AND 9 THEN 'Q3'
WHEN 0 + strftime('%m', sale_date) BETWEEN 10
AND 12 THEN 'Q4'
END Quarter,
CASE
WHEN strftime('%m', sale_date) = '01' THEN 'January'
WHEN strftime('%m', sale_date) = '02' THEN 'February'
WHEN strftime('%m', sale_date) = '03' THEN 'March'
WHEN strftime('%m', sale_date) = '04' THEN 'April'
WHEN strftime('%m', sale_date) = '05' THEN 'May'
WHEN strftime('%m', sale_date) = '06' THEN 'June'
WHEN strftime('%m', sale_date) = '07' THEN 'July'
WHEN strftime('%m', sale_date) = '08' THEN 'August'
WHEN strftime('%m', sale_date) = '09' THEN 'September'
WHEN strftime('%m', sale_date) = '10' THEN 'October'
WHEN strftime('%m', sale_date) = '11' THEN 'November'
WHEN strftime('%m', sale_date) = '12' THEN 'December'
END Month,
Total
FROM
bakery_sales
)
SELECT
Quarter,
Month,
Sum(Total) MonthlySales,
rank() OVER (PARTITION BY Quarter ORDER BY -sum(total)) SalesRank
FROM SalesTable
GROUP BY Quarter, Month
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[4], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 43 GROUP BY Quarter, Month
44 """
ModuleNotFoundError: No module named 'pandas'
13.8.3. Window Frames#
ref: https://www.sqlitetutorial.net/sqlite-window-functions/sqlite-window-frame/
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
sum(sum(total)) OVER (ORDER BY sale_year, sale_week ROWS UNBOUNDED PRECEDING) rolling_sum
FROM SalesTable
GROUP BY sale_year, sale_week
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
sum(sum(total)) OVER (ORDER BY sale_year, sale_week ROWS UNBOUNDED PRECEDING) rolling_sum
FROM SalesTable
GROUP BY sale_year, sale_week
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[5], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 22 GROUP BY sale_year, sale_week
23 """
ModuleNotFoundError: No module named 'pandas'
SELECT
sale_date,
sum(total) total,
max(sum(total)) over (order by cast(strftime('%j', sale_date) AS INT) range between 3 preceding and 3 following) seven_day_max
FROM bakery_sales
GROUP BY sale_date
order by sale_date
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
SELECT
sale_date,
sum(total) total,
max(sum(total)) over (order by cast(strftime('%j', sale_date) AS INT) range between 3 preceding and 3 following) seven_day_max
FROM bakery_sales
GROUP BY sale_date
order by sale_date
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[6], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 SELECT
8 sale_date,
(...) 13 order by sale_date
14 """
ModuleNotFoundError: No module named 'pandas'
13.8.4. Lag and Lead#
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
lag(sum(total), 1) over (order by sale_week) prev_week_total,
lead(sum(total), 1) over (order by sale_week) next_week_total
FROM SalesTable
GROUP BY sale_year, sale_week
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
lag(sum(total), 1) over (order by sale_week) prev_week_total,
lead(sum(total), 1) over (order by sale_week) next_week_total
FROM SalesTable
GROUP BY sale_year, sale_week
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 23 GROUP BY sale_year, sale_week
24 """
ModuleNotFoundError: No module named 'pandas'
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY
sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
round(
CAST(
(
sum(total) - lag(sum(total), 1) over (
order by
sale_week
)
) AS REAL
) / CAST(
lag(sum(total), 1) over (
order by
sale_week
) AS REAL
) * 100,
1
) CHANGE
FROM
SalesTable
GROUP BY
sale_year,
sale_week
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('bakery_sales.db')
sql_statement = """
WITH SalesTable AS (
SELECT
cast(strftime('%Y', sale_date) AS INT) sale_year,
cast(strftime('%W', sale_date) AS INT) sale_week,
Total
FROM
bakery_sales
ORDER BY
sale_date
)
SELECT
sale_year,
sale_week,
sum(total) week_total,
round(
CAST(
(
sum(total) - lag(sum(total), 1) over (
order by
sale_week
)
) AS REAL
) / CAST(
lag(sum(total), 1) over (
order by
sale_week
) AS REAL
) * 100,
1
) CHANGE
FROM
SalesTable
GROUP BY
sale_year,
sale_week
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[8], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('bakery_sales.db')
6 sql_statement = """
7 WITH SalesTable AS (
8 SELECT
(...) 41 sale_week
42 """
ModuleNotFoundError: No module named 'pandas'
13.8.5. Ranking#
rank
: returns the same ranking in case of a tie, with gaps in the rankingsrow_number
: returns a unique number for each row, with rankings arbitrarily assigned in case of a tiedense_rank
: returns the same ranking in the case of a tie, with no gaps in the rankings
Ref: https://blog.jooq.org/the-difference-between-row_number-rank-and-dense_rank/
SELECT
PatientGender,
PatientMaritalStatus,
count(*) StatusCount,
rank() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusRank,
row_number() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusRowNumber,
dense_rank() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusDenseRank
FROM
PatientCorePopulatedTable
GROUP BY PatientGender, PatientMaritalStatus
Show code cell source
import sqlite3
import pandas as pd
conn = sqlite3.connect('100_patients.db')
sql_statement = """
SELECT
PatientGender,
PatientMaritalStatus,
count(*) StatusCount,
rank() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusRank,
row_number() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusRowNumber,
dense_rank() OVER (PARTITION BY PatientGender ORDER BY COUNT(*) DESC) StatusDenseRank
FROM
PatientCorePopulatedTable
GROUP BY PatientGender, PatientMaritalStatus
"""
df = pd.read_sql_query(sql_statement, conn)
df.style.set_table_attributes('style="font-size: 12px"')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[9], line 2
1 import sqlite3
----> 2 import pandas as pd
4 conn = sqlite3.connect('100_patients.db')
6 sql_statement = """
7 SELECT
8 PatientGender,
(...) 16 GROUP BY PatientGender, PatientMaritalStatus
17 """
ModuleNotFoundError: No module named 'pandas'