15.3. Cross-referenced Exercises#
15.3.1. Query Primer#
15.3.1.1. Basic query#
SQL cross-reference: Basic query
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Male | 1947-12-28 02:45:40.547 | Unknown | Married | Icelandic | 18.08 |
1 | 64182B95-EB72-4E2B-BE77-8050B71498CE | Male | 1952-01-18 19:51:12.917 | African American | Separated | English | 13.03 |
2 | DB22A4D9-7E4D-485C-916A-9CD1386507FB | Female | 1970-07-25 13:04:20.717 | Asian | Married | English | 6.67 |
3 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | Male | 1979-01-04 05:45:29.580 | White | Married | English | 16.09 |
4 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | Female | 1921-04-11 11:39:49.197 | White | Married | English | 18.20 |
... | ... | ... | ... | ... | ... | ... | ... |
95 | 135C831F-7DA5-46C0-959C-EBCBD8810B43 | Male | 1971-05-13 04:40:05.623 | White | Unknown | Spanish | 12.38 |
96 | 8856096E-E59C-4156-A767-C091AF799C80 | Female | 1988-11-25 02:59:36.373 | White | Divorced | English | 11.08 |
97 | 6623F5D6-D581-4268-9F9B-21612FBBF7B5 | Female | 1943-02-17 15:36:13.787 | Asian | Single | Spanish | 14.49 |
98 | 65A7FBE0-EA9F-49E9-9824-D8F3AD98DAC0 | Female | 1962-11-30 06:28:33.110 | White | Separated | Spanish | 17.98 |
99 | FB909FAE-72DD-4F6F-9828-D92183DF185F | Male | 1940-07-15 12:18:41.080 | White | Single | Spanish | 14.90 |
100 rows × 7 columns
15.3.1.2. Limit rows#
SQL cross-reference: Limit rows
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
# Get first 10 rows
display(df.head(10))
# Get last 10 rows
display(df.tail(10))
# Get a range of rows (positions 50 through 59)
display(df[50:60])
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Male | 1947-12-28 02:45:40.547 | Unknown | Married | Icelandic | 18.08 |
1 | 64182B95-EB72-4E2B-BE77-8050B71498CE | Male | 1952-01-18 19:51:12.917 | African American | Separated | English | 13.03 |
2 | DB22A4D9-7E4D-485C-916A-9CD1386507FB | Female | 1970-07-25 13:04:20.717 | Asian | Married | English | 6.67 |
3 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | Male | 1979-01-04 05:45:29.580 | White | Married | English | 16.09 |
4 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | Female | 1921-04-11 11:39:49.197 | White | Married | English | 18.20 |
5 | 7FD13988-E58A-4A5C-8680-89AC200950FA | Male | 1965-07-12 15:41:20.523 | White | Married | Spanish | 12.41 |
6 | C60FE675-CA52-4C55-A233-F4B27E94987F | Male | 1957-10-30 23:26:15.303 | Asian | Married | Spanish | 12.80 |
7 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | Female | 1935-11-03 21:07:09.040 | White | Married | English | 15.31 |
8 | FA157FA5-F488-4884-BF87-E144630D595C | Female | 1932-11-01 06:19:56.577 | White | Single | English | 16.32 |
9 | B7E9FC4C-5182-4A34-954E-CEF5FC07E96D | Female | 1985-12-11 02:48:16.907 | Unknown | Single | English | 11.43 |
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
90 | 36E2F89E-777A-4D77-9D95-0D70A8AB416F | Male | 1980-05-30 13:23:50.703 | African American | Separated | English | 19.36 |
91 | 672D554B-D6D1-40B2-A6A4-21A4CB6B1AA6 | Male | 1983-06-17 04:46:13.753 | African American | Single | Icelandic | 11.04 |
92 | 03A481F5-B32A-4A91-BD42-43EB78FEBA77 | Female | 1968-02-07 23:02:38.017 | Asian | Single | Unknown | 93.23 |
93 | 2A5251B1-0945-47FA-A65C-7A6381562591 | Female | 1942-01-07 16:45:33.060 | Unknown | Married | Spanish | 18.05 |
94 | 8AF47463-8534-4203-B210-C2290F6CE689 | Female | 1952-06-27 17:29:04.187 | White | Divorced | English | 11.88 |
95 | 135C831F-7DA5-46C0-959C-EBCBD8810B43 | Male | 1971-05-13 04:40:05.623 | White | Unknown | Spanish | 12.38 |
96 | 8856096E-E59C-4156-A767-C091AF799C80 | Female | 1988-11-25 02:59:36.373 | White | Divorced | English | 11.08 |
97 | 6623F5D6-D581-4268-9F9B-21612FBBF7B5 | Female | 1943-02-17 15:36:13.787 | Asian | Single | Spanish | 14.49 |
98 | 65A7FBE0-EA9F-49E9-9824-D8F3AD98DAC0 | Female | 1962-11-30 06:28:33.110 | White | Separated | Spanish | 17.98 |
99 | FB909FAE-72DD-4F6F-9828-D92183DF185F | Male | 1940-07-15 12:18:41.080 | White | Single | Spanish | 14.90 |
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
50 | 35FE7491-1A1D-48CB-810C-8DC2599AB3DD | Male | 1969-11-02 06:34:34.527 | White | Married | English | 4.51 |
51 | 868E700E-3C56-458F-A477-078D671DCB20 | Female | 1978-09-21 15:29:44.770 | African American | Married | Icelandic | 5.94 |
52 | 56A35E74-90BE-44A0-B7BA-7743BB152133 | Female | 1968-11-09 15:29:19.557 | White | Married | English | 8.81 |
53 | CC12B481-B516-455B-884F-4CA900B29F2E | Female | 1985-10-21 07:59:04.777 | Unknown | Divorced | English | 14.58 |
54 | 714823AF-C52C-414C-B53B-C43EACD194C3 | Male | 1952-05-08 23:51:50.127 | White | Married | English | 13.23 |
55 | 3231F930-2978-4F50-8234-755449851E7B | Male | 1979-05-26 04:58:10.627 | White | Single | English | 18.36 |
56 | C2CCB1AB-6633-4CB3-B4E8-157E6FB02376 | Female | 1964-05-07 10:20:37.740 | White | Single | Spanish | 15.98 |
57 | 1311FEE4-2FDC-46E4-83D3-1550A3E51D2C | Female | 1988-03-28 03:09:22.807 | White | Single | English | 14.99 |
58 | 21792512-2D40-4326-BEA2-A40127EB24FF | Male | 1938-03-24 19:25:53.980 | White | Single | Unknown | 89.44 |
59 | EEAFC0B3-B835-4D99-AB33-2F9428E54E5F | Female | 1961-01-08 15:19:15.490 | Asian | Unknown | English | 18.95 |
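As a small aside (not part of the original exercise), the same positional slices can also be written with .iloc, pandas' explicit position-based indexer; this is a minimal sketch assuming the same PatientCorePopulatedTable.txt file.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
# .iloc selects rows by position, mirroring head(), tail(), and the slice above
print(df.iloc[:10])    # first 10 rows
print(df.iloc[-10:])   # last 10 rows
print(df.iloc[50:60])  # rows 50 through 59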
15.3.1.3. Select some columns#
SQL cross-reference: Select some columns
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df[['PatientID', 'PatientDateOfBirth']].head(10)
| PatientID | PatientDateOfBirth |
---|---|---|
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | 1947-12-28 02:45:40.547 |
1 | 64182B95-EB72-4E2B-BE77-8050B71498CE | 1952-01-18 19:51:12.917 |
2 | DB22A4D9-7E4D-485C-916A-9CD1386507FB | 1970-07-25 13:04:20.717 |
3 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | 1979-01-04 05:45:29.580 |
4 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | 1921-04-11 11:39:49.197 |
5 | 7FD13988-E58A-4A5C-8680-89AC200950FA | 1965-07-12 15:41:20.523 |
6 | C60FE675-CA52-4C55-A233-F4B27E94987F | 1957-10-30 23:26:15.303 |
7 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | 1935-11-03 21:07:09.040 |
8 | FA157FA5-F488-4884-BF87-E144630D595C | 1932-11-01 06:19:56.577 |
9 | B7E9FC4C-5182-4A34-954E-CEF5FC07E96D | 1985-12-11 02:48:16.907 |
15.3.1.4. Using column alias#
SQL cross-reference: Using column alias
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df[['PatientID', 'PatientDateOfBirth']].rename(columns={'PatientDateOfBirth': 'Date of Birth'}).head(10)
| PatientID | Date of Birth |
---|---|---|
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | 1947-12-28 02:45:40.547 |
1 | 64182B95-EB72-4E2B-BE77-8050B71498CE | 1952-01-18 19:51:12.917 |
2 | DB22A4D9-7E4D-485C-916A-9CD1386507FB | 1970-07-25 13:04:20.717 |
3 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | 1979-01-04 05:45:29.580 |
4 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | 1921-04-11 11:39:49.197 |
5 | 7FD13988-E58A-4A5C-8680-89AC200950FA | 1965-07-12 15:41:20.523 |
6 | C60FE675-CA52-4C55-A233-F4B27E94987F | 1957-10-30 23:26:15.303 |
7 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | 1935-11-03 21:07:09.040 |
8 | FA157FA5-F488-4884-BF87-E144630D595C | 1932-11-01 06:19:56.577 |
9 | B7E9FC4C-5182-4A34-954E-CEF5FC07E96D | 1985-12-11 02:48:16.907 |
15.3.1.5. Adding columns not from the table#
SQL cross-reference: Adding columns not from the table
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df_new = df[['PatientID', 'PatientPopulationPercentageBelowPoverty']].copy()
df_new.rename(
columns={
'PatientID': 'PTID',
'PatientPopulationPercentageBelowPoverty': 'Poverty Level'
},
inplace=True
)
df_new.insert(loc=1, column='Hospital', value='Buffalo Hospital')
df_new['Poverty Level'] = df_new['Poverty Level'] * 10
df_new.head(10)
| PTID | Hospital | Poverty Level |
---|---|---|---|
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Buffalo Hospital | 180.8 |
1 | 64182B95-EB72-4E2B-BE77-8050B71498CE | Buffalo Hospital | 130.3 |
2 | DB22A4D9-7E4D-485C-916A-9CD1386507FB | Buffalo Hospital | 66.7 |
3 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | Buffalo Hospital | 160.9 |
4 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | Buffalo Hospital | 182.0 |
5 | 7FD13988-E58A-4A5C-8680-89AC200950FA | Buffalo Hospital | 124.1 |
6 | C60FE675-CA52-4C55-A233-F4B27E94987F | Buffalo Hospital | 128.0 |
7 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | Buffalo Hospital | 153.1 |
8 | FA157FA5-F488-4884-BF87-E144630D595C | Buffalo Hospital | 163.2 |
9 | B7E9FC4C-5182-4A34-954E-CEF5FC07E96D | Buffalo Hospital | 114.3 |
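A hedged alternative sketch: the same result can be built with method chaining (assign plus rename) instead of an in-place rename and insert; nothing here goes beyond the column names already used above.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
# Chained sketch: scale the poverty column, add a literal column, then rename
df_new = (
    df[['PatientID', 'PatientPopulationPercentageBelowPoverty']]
    .assign(
        Hospital='Buffalo Hospital',
        PatientPopulationPercentageBelowPoverty=lambda d: d['PatientPopulationPercentageBelowPoverty'] * 10
    )
    .rename(columns={
        'PatientID': 'PTID',
        'PatientPopulationPercentageBelowPoverty': 'Poverty Level'
    })
)
df_new[['PTID', 'Hospital', 'Poverty Level']].head(10)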
15.3.1.6. Removing duplicates#
SQL cross-reference: Removing duplicates
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
pd.DataFrame(df['PatientMaritalStatus'].unique(), columns=['PatientMaritalStatus'])
| PatientMaritalStatus |
---|---|
0 | Married |
1 | Separated |
2 | Single |
3 | Divorced |
4 | Unknown |
5 | Widowed |
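For comparison, a minimal sketch that stays a DataFrame throughout: calling drop_duplicates on a single-column frame is another way to express SELECT DISTINCT.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
# drop_duplicates() on a one-column frame returns the distinct values as a DataFrame
df[['PatientMaritalStatus']].drop_duplicates().reset_index(drop=True)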
15.3.1.7. Removing duplicates with multiple columns#
SQL cross-reference: Removing duplicates with multiple columns
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df[['PatientRace', 'PatientMaritalStatus']].drop_duplicates().sort_values(['PatientRace', 'PatientMaritalStatus']).reset_index(drop=True)
| PatientRace | PatientMaritalStatus |
---|---|---|
0 | African American | Married |
1 | African American | Separated |
2 | African American | Single |
3 | African American | Unknown |
4 | Asian | Divorced |
5 | Asian | Married |
6 | Asian | Separated |
7 | Asian | Single |
8 | Asian | Unknown |
9 | Unknown | Divorced |
10 | Unknown | Married |
11 | Unknown | Single |
12 | White | Divorced |
13 | White | Married |
14 | White | Separated |
15 | White | Single |
16 | White | Unknown |
17 | White | Widowed |
15.3.1.8. Derived table#
SQL cross-reference: Derived table
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'AdmissionsDiagnosesCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
pd.DataFrame(
    df[['PrimaryDiagnosisCode', 'PrimaryDiagnosisDescription']].apply(
        lambda row: f"({row['PrimaryDiagnosisCode']}) {row['PrimaryDiagnosisDescription']}", axis=1
    ),
    columns=['CodeWDescription']
)
| CodeWDescription |
---|---|
0 | (M01.X) Direct infection of joint in infectiou... |
1 | (D65) Disseminated intravascular coagulation [... |
2 | (C92.1) Chronic myeloid leukemia, BCR/ABL-posi... |
3 | (M05.51) Rheumatoid polyneuropathy with rheuma... |
4 | (C91.00) Acute lymphoblastic leukemia not havi... |
... | ... |
367 | (H15) Disorders of sclera |
368 | (M05.27) Rheumatoid vasculitis with rheumatoid... |
369 | (N16) Renal tubulo-interstitial disorders in d... |
370 | (D35.2) Benign neoplasm of pituitary gland |
371 | (F06.1) Catatonic disorder due to known physio... |
372 rows × 1 columns
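The apply call above evaluates a Python lambda once per row; a vectorized sketch of the same derived column uses plain string concatenation on whole columns, which sidesteps row-wise access entirely and is usually faster.
import pandas as pd
df = pd.read_csv('AdmissionsDiagnosesCorePopulatedTable.txt', delimiter='\t')
# Vectorized sketch: concatenate the two columns directly instead of a per-row lambda
code_w_desc = '(' + df['PrimaryDiagnosisCode'] + ') ' + df['PrimaryDiagnosisDescription']
code_w_desc.to_frame(name='CodeWDescription')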
15.3.1.9. Filtering Data#
SQL cross-reference: The WHERE Clause
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
mask = (
(((df['PatientRace'] == 'White') & (df['PatientMaritalStatus'] == 'Married')) |
((df['PatientRace'] == 'African American') & (df['PatientMaritalStatus'] == 'Married'))) &
(df['PatientPopulationPercentageBelowPoverty'] > 15)
)
df_new = df[mask][['PatientID', 'PatientRace', 'PatientMaritalStatus', 'PatientPopulationPercentageBelowPoverty']].reset_index(drop=True)
df_new
| PatientID | PatientRace | PatientMaritalStatus | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|
0 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | White | Married | 16.09 |
1 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | White | Married | 18.20 |
2 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | White | Married | 15.31 |
3 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | White | Married | 93.60 |
4 | 81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE | White | Married | 19.22 |
5 | 69B5D2A0-12FD-46EF-A5FF-B29C4BAFBE49 | White | Married | 18.65 |
6 | 9BBF3A51-443D-438B-9289-B98B8E0577C0 | White | Married | 16.76 |
7 | 49DADA25-F2C2-42BB-8210-D78E6C7B0D48 | African American | Married | 19.50 |
8 | 2EE42DEF-37CA-4694-827E-FA4EAF882BFC | White | Married | 18.91 |
9 | C65A4ADE-112E-49E4-B72A-0DED22C242ED | White | Married | 19.66 |
10 | 80AC01B2-BD55-4BE0-A59A-4024104CF4E9 | African American | Married | 19.74 |
11 | 0E0EADE8-5592-4E0B-9F88-D7596E32EE08 | White | Married | 18.80 |
12 | BC44CE19-9FC5-4AC9-A296-9EBC5E3D03AE | African American | Married | 15.04 |
13 | 3E462A8F-7B90-43A1-A8B6-AD82CB5002C9 | White | Married | 18.33 |
df.query("((PatientRace == 'White' & PatientMaritalStatus == 'Married') | (PatientRace == 'African American' & PatientMaritalStatus == 'Married')) & PatientPopulationPercentageBelowPoverty > 15")
15.3.1.10. Sort Values#
SQL cross-reference: The ORDER BY Clauses
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df_new = df[
['PatientMaritalStatus', 'PatientPopulationPercentageBelowPoverty']
].sort_values(['PatientPopulationPercentageBelowPoverty']).reset_index(drop=True)
display(df_new)
| PatientMaritalStatus | PatientPopulationPercentageBelowPoverty |
---|---|---|
0 | Single | 1.70 |
1 | Married | 3.30 |
2 | Married | 4.34 |
3 | Married | 4.51 |
4 | Married | 5.94 |
... | ... | ... |
95 | Single | 93.23 |
96 | Married | 93.60 |
97 | Single | 94.00 |
98 | Married | 94.06 |
99 | Separated | 98.40 |
100 rows × 2 columns
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
df_new = df[
['PatientMaritalStatus', 'PatientPopulationPercentageBelowPoverty']
].sort_values(['PatientMaritalStatus', 'PatientPopulationPercentageBelowPoverty'], ascending=[True, False]).reset_index(drop=True)
display(df_new)
| PatientMaritalStatus | PatientPopulationPercentageBelowPoverty |
---|---|---|
0 | Divorced | 88.65 |
1 | Divorced | 83.75 |
2 | Divorced | 19.52 |
3 | Divorced | 19.49 |
4 | Divorced | 19.41 |
... | ... | ... |
95 | Unknown | 18.95 |
96 | Unknown | 16.89 |
97 | Unknown | 15.02 |
98 | Unknown | 12.38 |
99 | Widowed | 88.66 |
100 rows × 2 columns
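When only the top or bottom few rows are wanted, nsmallest and nlargest are a compact sketch of ORDER BY combined with a LIMIT.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
cols = ['PatientMaritalStatus', 'PatientPopulationPercentageBelowPoverty']
# Lowest and highest five poverty percentages, without an explicit sort + head
print(df[cols].nsmallest(5, 'PatientPopulationPercentageBelowPoverty'))
print(df[cols].nlargest(5, 'PatientPopulationPercentageBelowPoverty'))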
15.3.2. Filtering#
15.3.2.1. Conditional Evaluation#
SQL cross-reference: sql:filtering:basic
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = (df['PatientGender'] == 'Male') & (df['PatientDateOfBirth'] < '1950-01-01')
df_new = df[cond].reset_index(drop=True).sort_values('PatientDateOfBirth')
display(df_new)
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
14 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | Male | 1921-03-26 14:38:51.803 | White | Single | English | 18.41 |
10 | 53B9FFDD-F80B-43BE-93CF-C34A023EE7E9 | Male | 1921-08-22 19:17:09.227 | White | Divorced | Icelandic | 18.17 |
2 | DA6CECFF-DE13-4C4C-919F-64E1A2B76C9D | Male | 1924-06-27 19:37:58.823 | Asian | Divorced | English | 83.75 |
3 | 7C788499-7798-484B-A027-9FCDC4C0DADB | Male | 1926-08-13 10:22:16.247 | White | Married | English | 11.89 |
4 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | Male | 1926-08-20 00:21:38.870 | White | Married | English | 93.60 |
13 | 4C201C71-CCED-40D1-9642-F9C8C485B854 | Male | 1926-09-22 09:17:14.713 | African American | Married | English | 10.30 |
8 | A0A976C8-9B30-4492-B8C4-5B25095B9192 | Male | 1931-05-26 14:54:15.847 | Asian | Single | English | 19.52 |
15 | 1A220558-5996-43E1-AE5D-7B96180FED35 | Male | 1937-09-07 22:23:53.143 | Asian | Married | English | 15.11 |
12 | 21792512-2D40-4326-BEA2-A40127EB24FF | Male | 1938-03-24 19:25:53.980 | White | Single | Unknown | 89.44 |
11 | C5D09468-574F-4802-B56F-DB38F4EB1687 | Male | 1939-07-07 19:39:49.753 | African American | Married | Icelandic | 10.87 |
16 | FB909FAE-72DD-4F6F-9828-D92183DF185F | Male | 1940-07-15 12:18:41.080 | White | Single | Spanish | 14.90 |
5 | FFCDECD6-4048-4DCB-B910-1218160005B3 | Male | 1941-05-06 14:56:42.687 | White | Single | English | 14.49 |
7 | 9BBF3A51-443D-438B-9289-B98B8E0577C0 | Male | 1944-11-25 06:12:56.860 | White | Married | English | 16.76 |
9 | E483DE6E-D4E6-47FD-905B-22EE86EC7ACE | Male | 1945-11-18 04:14:31.573 | African American | Single | English | 16.88 |
6 | 967987B9-FFEF-4776-85CF-AE05CA81F583 | Male | 1947-02-13 10:26:55.520 | White | Widowed | English | 88.66 |
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Male | 1947-12-28 02:45:40.547 | Unknown | Married | Icelandic | 18.08 |
1 | 1A40AF35-C6D4-4D46-B475-A15D84E8A9D5 | Male | 1949-01-10 15:37:35.543 | White | Married | English | 11.25 |
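The comparison above works only because PatientDateOfBirth is stored as an ISO-formatted string; a slightly safer sketch parses the column with pd.to_datetime first and compares against a real Timestamp.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
# Parse the birth dates so the filter compares datetimes rather than strings
df['PatientDateOfBirth'] = pd.to_datetime(df['PatientDateOfBirth'])
cond = (df['PatientGender'] == 'Male') & (df['PatientDateOfBirth'] < pd.Timestamp('1950-01-01'))
df[cond].sort_values('PatientDateOfBirth').reset_index(drop=True)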
15.3.2.2. Using Parenthesis#
SQL cross-reference: sql:filtering:parenthesis
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = ((df['PatientGender'] == 'Male') | (df['PatientRace'] == 'White')) & (df['PatientDateOfBirth'] < '1950-01-01')
df_new = df[cond].reset_index(drop=True).sort_values('PatientDateOfBirth')
display(df_new)
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
20 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | Male | 1921-03-26 14:38:51.803 | White | Single | English | 18.41 |
1 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | Female | 1921-04-11 11:39:49.197 | White | Married | English | 18.20 |
15 | 53B9FFDD-F80B-43BE-93CF-C34A023EE7E9 | Male | 1921-08-22 19:17:09.227 | White | Divorced | Icelandic | 18.17 |
5 | DA6CECFF-DE13-4C4C-919F-64E1A2B76C9D | Male | 1924-06-27 19:37:58.823 | Asian | Divorced | English | 83.75 |
6 | 7C788499-7798-484B-A027-9FCDC4C0DADB | Male | 1926-08-13 10:22:16.247 | White | Married | English | 11.89 |
7 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | Male | 1926-08-20 00:21:38.870 | White | Married | English | 93.60 |
18 | 4C201C71-CCED-40D1-9642-F9C8C485B854 | Male | 1926-09-22 09:17:14.713 | African American | Married | English | 10.30 |
8 | 81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE | Female | 1930-05-28 02:59:42.857 | White | Married | Icelandic | 19.22 |
13 | A0A976C8-9B30-4492-B8C4-5B25095B9192 | Male | 1931-05-26 14:54:15.847 | Asian | Single | English | 19.52 |
3 | FA157FA5-F488-4884-BF87-E144630D595C | Female | 1932-11-01 06:19:56.577 | White | Single | English | 16.32 |
2 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | Female | 1935-11-03 21:07:09.040 | White | Married | English | 15.31 |
23 | 1A220558-5996-43E1-AE5D-7B96180FED35 | Male | 1937-09-07 22:23:53.143 | Asian | Married | English | 15.11 |
12 | 80D356B4-F974-441F-A5F2-F95986D119A2 | Female | 1938-03-06 18:24:18.297 | White | Single | English | 18.88 |
17 | 21792512-2D40-4326-BEA2-A40127EB24FF | Male | 1938-03-24 19:25:53.980 | White | Single | Unknown | 89.44 |
16 | C5D09468-574F-4802-B56F-DB38F4EB1687 | Male | 1939-07-07 19:39:49.753 | African American | Married | Icelandic | 10.87 |
24 | FB909FAE-72DD-4F6F-9828-D92183DF185F | Male | 1940-07-15 12:18:41.080 | White | Single | Spanish | 14.90 |
9 | FFCDECD6-4048-4DCB-B910-1218160005B3 | Male | 1941-05-06 14:56:42.687 | White | Single | English | 14.49 |
19 | 0E0EADE8-5592-4E0B-9F88-D7596E32EE08 | Female | 1942-07-13 21:15:37.517 | White | Married | Spanish | 18.80 |
22 | 66154E24-D3EE-4311-89DB-6195278F9B3C | Female | 1944-08-26 13:03:24.297 | White | Divorced | English | 19.49 |
11 | 9BBF3A51-443D-438B-9289-B98B8E0577C0 | Male | 1944-11-25 06:12:56.860 | White | Married | English | 16.76 |
21 | E250799D-F6DE-4914-ADB4-B08A6E5029B9 | Female | 1945-08-04 19:03:00.757 | White | Single | Unknown | 12.86 |
14 | E483DE6E-D4E6-47FD-905B-22EE86EC7ACE | Male | 1945-11-18 04:14:31.573 | African American | Single | English | 16.88 |
10 | 967987B9-FFEF-4776-85CF-AE05CA81F583 | Male | 1947-02-13 10:26:55.520 | White | Widowed | English | 88.66 |
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Male | 1947-12-28 02:45:40.547 | Unknown | Married | Icelandic | 18.08 |
4 | 1A40AF35-C6D4-4D46-B475-A15D84E8A9D5 | Male | 1949-01-10 15:37:35.543 | White | Married | English | 11.25 |
15.3.2.3. Range condition#
SQL cross-reference: sql:filtering:range
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = (df['PatientDateOfBirth'] > '1920-01-01') & (df['PatientDateOfBirth'] < '1950-01-01')
df_new = df[cond].reset_index(drop=True).sort_values('PatientDateOfBirth')
display(df_new)
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
21 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | Male | 1921-03-26 14:38:51.803 | White | Single | English | 18.41 |
1 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | Female | 1921-04-11 11:39:49.197 | White | Married | English | 18.20 |
9 | 0BC491C5-5A45-4067-BD11-A78BEA00D3BE | Female | 1921-04-18 01:56:01.807 | Unknown | Married | English | 18.05 |
16 | 53B9FFDD-F80B-43BE-93CF-C34A023EE7E9 | Male | 1921-08-22 19:17:09.227 | White | Divorced | Icelandic | 18.17 |
5 | DA6CECFF-DE13-4C4C-919F-64E1A2B76C9D | Male | 1924-06-27 19:37:58.823 | Asian | Divorced | English | 83.75 |
6 | 7C788499-7798-484B-A027-9FCDC4C0DADB | Male | 1926-08-13 10:22:16.247 | White | Married | English | 11.89 |
7 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | Male | 1926-08-20 00:21:38.870 | White | Married | English | 93.60 |
19 | 4C201C71-CCED-40D1-9642-F9C8C485B854 | Male | 1926-09-22 09:17:14.713 | African American | Married | English | 10.30 |
25 | EA9B67E2-15F0-450B-A8A6-14F6E4AE3D12 | Female | 1930-04-08 20:59:31.057 | Asian | Separated | Icelandic | 98.40 |
8 | 81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE | Female | 1930-05-28 02:59:42.857 | White | Married | Icelandic | 19.22 |
14 | A0A976C8-9B30-4492-B8C4-5B25095B9192 | Male | 1931-05-26 14:54:15.847 | Asian | Single | English | 19.52 |
3 | FA157FA5-F488-4884-BF87-E144630D595C | Female | 1932-11-01 06:19:56.577 | White | Single | English | 16.32 |
2 | B39DC5AC-E003-4E6A-91B6-FC07625A1285 | Female | 1935-11-03 21:07:09.040 | White | Married | English | 15.31 |
27 | 1A220558-5996-43E1-AE5D-7B96180FED35 | Male | 1937-09-07 22:23:53.143 | Asian | Married | English | 15.11 |
13 | 80D356B4-F974-441F-A5F2-F95986D119A2 | Female | 1938-03-06 18:24:18.297 | White | Single | English | 18.88 |
18 | 21792512-2D40-4326-BEA2-A40127EB24FF | Male | 1938-03-24 19:25:53.980 | White | Single | Unknown | 89.44 |
17 | C5D09468-574F-4802-B56F-DB38F4EB1687 | Male | 1939-07-07 19:39:49.753 | African American | Married | Icelandic | 10.87 |
31 | FB909FAE-72DD-4F6F-9828-D92183DF185F | Male | 1940-07-15 12:18:41.080 | White | Single | Spanish | 14.90 |
10 | FFCDECD6-4048-4DCB-B910-1218160005B3 | Male | 1941-05-06 14:56:42.687 | White | Single | English | 14.49 |
29 | 2A5251B1-0945-47FA-A65C-7A6381562591 | Female | 1942-01-07 16:45:33.060 | Unknown | Married | Spanish | 18.05 |
24 | CC9CDA72-B37A-4F8F-AFE4-B08F56A183BE | Female | 1942-04-14 14:01:01.130 | Asian | Married | Spanish | 14.15 |
20 | 0E0EADE8-5592-4E0B-9F88-D7596E32EE08 | Female | 1942-07-13 21:15:37.517 | White | Married | Spanish | 18.80 |
30 | 6623F5D6-D581-4268-9F9B-21612FBBF7B5 | Female | 1943-02-17 15:36:13.787 | Asian | Single | Spanish | 14.49 |
26 | 98F593D2-8894-49BB-93B9-5A0E2CF85E2E | Female | 1944-07-15 19:04:11.487 | African American | Married | English | 9.10 |
23 | 66154E24-D3EE-4311-89DB-6195278F9B3C | Female | 1944-08-26 13:03:24.297 | White | Divorced | English | 19.49 |
12 | 9BBF3A51-443D-438B-9289-B98B8E0577C0 | Male | 1944-11-25 06:12:56.860 | White | Married | English | 16.76 |
28 | 7A7332AD-88B1-4848-9356-E5260E477C59 | Female | 1944-12-01 06:30:01.543 | Unknown | Married | English | 19.46 |
22 | E250799D-F6DE-4914-ADB4-B08A6E5029B9 | Female | 1945-08-04 19:03:00.757 | White | Single | Unknown | 12.86 |
15 | E483DE6E-D4E6-47FD-905B-22EE86EC7ACE | Male | 1945-11-18 04:14:31.573 | African American | Single | English | 16.88 |
11 | 967987B9-FFEF-4776-85CF-AE05CA81F583 | Male | 1947-02-13 10:26:55.520 | White | Widowed | English | 88.66 |
0 | FB2ABB23-C9D0-4D09-8464-49BF0B982F0F | Male | 1947-12-28 02:45:40.547 | Unknown | Married | Icelandic | 18.08 |
4 | 1A40AF35-C6D4-4D46-B475-A15D84E8A9D5 | Male | 1949-01-10 15:37:35.543 | White | Married | English | 11.25 |
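Series.between is a closer analogue of SQL's BETWEEN; note that it is inclusive on both ends by default, so this sketch passes inclusive='neither' (available in newer pandas) to match the strict inequalities used above.
import pandas as pd
df = pd.read_csv('PatientCorePopulatedTable.txt', delimiter='\t')
# between() expresses the range test in one call; 'neither' keeps both ends exclusive
cond = df['PatientDateOfBirth'].between('1920-01-01', '1950-01-01', inclusive='neither')
df[cond].sort_values('PatientDateOfBirth').reset_index(drop=True)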
15.3.2.4. String condition#
SQL cross-reference: sql:filtering:string
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'PatientCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = (df['PatientID'] > '2A') & (df['PatientID'] < '53')
df_new = df[cond].reset_index(drop=True).sort_values('PatientID')
display(df_new)
| PatientID | PatientGender | PatientDateOfBirth | PatientRace | PatientMaritalStatus | PatientLanguage | PatientPopulationPercentageBelowPoverty |
---|---|---|---|---|---|---|---|
11 | 2A5251B1-0945-47FA-A65C-7A6381562591 | Female | 1942-01-07 16:45:33.060 | Unknown | Married | Spanish | 18.05 |
7 | 2A8772FE-61DB-483E-B6BF-6C0A74BA9C2A | Female | 1966-10-14 15:31:48.427 | Asian | Single | English | 11.15 |
8 | 2E26695A-EFB0-4C7F-9318-E3030B154E39 | Female | 1963-06-28 05:37:36.843 | Asian | Married | English | 14.20 |
4 | 2EE42DEF-37CA-4694-827E-FA4EAF882BFC | Male | 1964-04-27 00:41:40.410 | White | Married | English | 18.91 |
3 | 3231F930-2978-4F50-8234-755449851E7B | Male | 1979-05-26 04:58:10.627 | White | Single | English | 18.36 |
2 | 35FE7491-1A1D-48CB-810C-8DC2599AB3DD | Male | 1969-11-02 06:34:34.527 | White | Married | English | 4.51 |
5 | 36775002-9EC3-4889-AD4F-80DC6855C8D8 | Female | 1963-07-16 22:16:48.477 | Asian | Single | Spanish | 15.56 |
10 | 36E2F89E-777A-4D77-9D95-0D70A8AB416F | Male | 1980-05-30 13:23:50.703 | African American | Separated | English | 19.36 |
1 | 3B11D6B3-A36A-4B69-A437-C29BF425A941 | Female | 1954-05-18 10:19:19.110 | Asian | Separated | English | 10.80 |
9 | 3E462A8F-7B90-43A1-A8B6-AD82CB5002C9 | Female | 1986-12-20 16:41:34.793 | White | Married | Spanish | 18.33 |
0 | 49DADA25-F2C2-42BB-8210-D78E6C7B0D48 | Male | 1962-04-08 10:18:26.263 | African American | Married | English | 19.50 |
6 | 4C201C71-CCED-40D1-9642-F9C8C485B854 | Male | 1926-09-22 09:17:14.713 | African American | Married | English | 10.30 |
15.3.2.5. Wildcard Matches#
SQL cross-reference: sql:filtering:startswith
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'AdmissionsDiagnosesCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = (df['PrimaryDiagnosisCode'].str.startswith('M'))
df_new = df[cond].reset_index(drop=True).sort_values('PrimaryDiagnosisCode')
display(df_new)
| PatientID | AdmissionID | PrimaryDiagnosisCode | PrimaryDiagnosisDescription |
---|---|---|---|---|
0 | 80AC01B2-BD55-4BE0-A59A-4024104CF4E9 | 2 | M01.X | Direct infection of joint in infectious and pa... |
78 | 4C201C71-CCED-40D1-9642-F9C8C485B854 | 5 | M01.X1 | Direct infection of shoulder joint in infectio... |
16 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | 1 | M01.X5 | Direct infection of hip in infectious and para... |
71 | 21792512-2D40-4326-BEA2-A40127EB24FF | 4 | M01.X72 | Direct infection of left ankle and foot in inf... |
13 | DA6CECFF-DE13-4C4C-919F-64E1A2B76C9D | 2 | M02.35 | Reiter's disease, hip |
... | ... | ... | ... | ... |
43 | 9E18822E-7D13-45C7-B50E-F95CFF92BC3E | 2 | M90.6 | Osteitis deformans in neoplastic diseases |
55 | 8856096E-E59C-4156-A767-C091AF799C80 | 1 | M90.6 | Osteitis deformans in neoplastic diseases |
53 | 135C831F-7DA5-46C0-959C-EBCBD8810B43 | 4 | M90.632 | Osteitis deformans in neoplastic diseases, lef... |
70 | 21792512-2D40-4326-BEA2-A40127EB24FF | 3 | M90.642 | Osteitis deformans in neoplastic diseases, lef... |
61 | C60FE675-CA52-4C55-A233-F4B27E94987F | 3 | M90.86 | Osteopathy in diseases classified elsewhere, l... |
81 rows × 4 columns
SQL cross-reference: sql:filtering:endswith
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'AdmissionsDiagnosesCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
cond = (df['PrimaryDiagnosisCode'].str.endswith('4'))
df_new = df[cond].reset_index(drop=True).sort_values('PrimaryDiagnosisCode')
display(df_new)
| PatientID | AdmissionID | PrimaryDiagnosisCode | PrimaryDiagnosisDescription |
---|---|---|---|---|
21 | 868E700E-3C56-458F-A477-078D671DCB20 | 2 | B33.4 | Hantavirus (cardio)-pulmonary syndrome [HPS] [... |
0 | 66154E24-D3EE-4311-89DB-6195278F9B3C | 4 | B97.34 | Human T-cell lymphotrophic virus, type II [HTL... |
1 | DA6CECFF-DE13-4C4C-919F-64E1A2B76C9D | 3 | C71.4 | Malignant neoplasm of occipital lobe |
19 | 6E70D84D-C75F-477C-BC37-9177C3698C66 | 2 | D13.4 | Benign neoplasm of liver |
7 | 79A7BA2A-D35A-4CB8-A835-6BAA13B0058C | 3 | D14 | Benign neoplasm of middle ear and respiratory ... |
8 | 6D8008ED-D623-4BE4-B93B-335F9797C170 | 3 | D41.4 | Neoplasm of uncertain behavior of bladder |
14 | 35FE7491-1A1D-48CB-810C-8DC2599AB3DD | 2 | D48.4 | Neoplasm of uncertain behavior of peritoneum |
3 | 25B786AF-0F99-478C-9CFA-0EA607E45834 | 5 | E08.64 | Diabetes mellitus due to underlying condition ... |
11 | 016A590E-D093-4667-A5DA-D68EA6987D93 | 2 | E11.64 | Type 2 diabetes mellitus with hypoglycemia |
15 | EA9B67E2-15F0-450B-A8A6-14F6E4AE3D12 | 1 | E72.4 | Disorders of ornithine metabolism |
22 | CC12B481-B516-455B-884F-4CA900B29F2E | 2 | E87.4 | Mixed disorder of acid-base balance |
2 | 0A9BA3E4-CF3C-49C4-9774-5EEA2EE7D123 | 1 | F31.4 | Bipolar disorder, current episode depressed, s... |
18 | 64182B95-EB72-4E2B-BE77-8050B71498CE | 2 | F45.4 | Pain disorders related to psychological factors |
9 | 80D356B4-F974-441F-A5F2-F95986D119A2 | 1 | H34 | Retinal vascular occlusions |
5 | 81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE | 1 | I25.84 | Coronary atherosclerosis due to calcified coro... |
6 | 220C8D43-1322-4A9D-B890-D426942A3649 | 3 | J14 | Pneumonia due to Hemophilus influenzae |
10 | 80D356B4-F974-441F-A5F2-F95986D119A2 | 2 | K50.014 | Crohn's disease of small intestine with abscess |
20 | FA157FA5-F488-4884-BF87-E144630D595C | 2 | M06.34 | Rheumatoid nodule, hand |
16 | 6D5DCAC1-17FE-4D7C-923B-806EFBA3E6DF | 3 | M49.84 | Spondylopathy in diseases classified elsewhere... |
17 | 2A5251B1-0945-47FA-A65C-7A6381562591 | 4 | M84 | Disorder of continuity of bone |
4 | F0B53A2C-98CA-415D-B928-E3FD0E52B22A | 3 | N18.4 | Chronic kidney disease, stage 4 (severe) |
13 | 53B9FFDD-F80B-43BE-93CF-C34A023EE7E9 | 2 | Q60.4 | Renal hypoplasia, bilateral |
12 | 3B11D6B3-A36A-4B69-A437-C29BF425A941 | 2 | T82.4 | Mechanical complication of vascular dialysis c... |
SQL cross-reference: sql:filtering:contains
from IPython.display import display
import pandas as pd
import numpy as np
filename = 'AdmissionsDiagnosesCorePopulatedTable.txt'
df = pd.read_csv(filename, delimiter='\t')
# Note: str.contains() treats the pattern as a regex by default, so '5.' matches a 5 followed by any character
cond = (df['PrimaryDiagnosisCode'].str.contains('5.'))
df_new = df[cond].reset_index(drop=True).sort_values('PrimaryDiagnosisCode')
display(df_new)
| PatientID | AdmissionID | PrimaryDiagnosisCode | PrimaryDiagnosisDescription |
---|---|---|---|---|
11 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | 4 | A52.0 | Cardiovascular and cerebrovascular syphilis |
33 | 967987B9-FFEF-4776-85CF-AE05CA81F583 | 2 | B95.1 | Streptococcus, group B, as the cause of diseas... |
9 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | 2 | B95.62 | Methicillin resistant Staphylococcus aureus in... |
10 | DDC0BC57-7A4E-4E02-9282-177750B74FBC | 3 | C05.1 | Malignant neoplasm of soft palate |
24 | 1A8791E3-A61C-455A-8DEE-763EB90C9B2C | 3 | C15.8 | Malignant neoplasm of overlapping sites of eso... |
... | ... | ... | ... | ... |
87 | 21792512-2D40-4326-BEA2-A40127EB24FF | 1 | O99.351 | Diseases of the nervous system complicating pr... |
85 | 714823AF-C52C-414C-B53B-C43EACD194C3 | 3 | O9A.513 | Psychological abuse complicating pregnancy, th... |
49 | 53B9FFDD-F80B-43BE-93CF-C34A023EE7E9 | 3 | O9A.53 | Psychological abuse complicating the puerperium |
70 | 65A7FBE0-EA9F-49E9-9824-D8F3AD98DAC0 | 2 | S35.415 | Laceration of left renal vein |
76 | C8556CC0-32FC-4CA5-A8CD-9CCF38816167 | 2 | Z95.810 | Presence of automatic (implantable) cardiac de... |
94 rows × 4 columns
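Because str.contains treats its pattern as a regular expression by default, '5.' above matches a 5 followed by any character. If a literal "5." is what is wanted, a sketch with regex=False makes that explicit.
import pandas as pd
df = pd.read_csv('AdmissionsDiagnosesCorePopulatedTable.txt', delimiter='\t')
# Literal match: keep only codes that really contain the two characters "5."
cond = df['PrimaryDiagnosisCode'].str.contains('5.', regex=False)
df[cond].sort_values('PrimaryDiagnosisCode').reset_index(drop=True)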
15.3.2.6. Simple Grouping Examples#
import pandas as pd
filename = "LabsCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
# df.set_index("PatientID", inplace=True)
count = df.groupby("PatientID")["LabName"].count().sort_values(ascending=False)
display(count)
count[count>2000]
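Counting rows per patient can also be sketched with value_counts, which returns the counts already sorted in descending order; this matches the groupby count above as long as LabName has no missing values.
import pandas as pd
df = pd.read_csv("LabsCorePopulatedTable.txt", delimiter="\t")
# One row per lab result, so counting PatientID occurrences mirrors the groupby count
count = df["PatientID"].value_counts()
print(count)
print(count[count > 2000])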
import pandas as pd
import numpy as np
filename = "LabsCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
df.set_index("PatientID", inplace=True)
df = df[df["LabName"] == "URINALYSIS: RED BLOOD CELLS"]
count = df.groupby("PatientID").agg(
    {
        "LabValue": ["count", "max", "min", "sum", "mean"]
    }
)
count[('LabValue', 'mean')] = count[('LabValue', 'mean')].round(2)
count.sort_values(('LabValue', 'mean'), ascending=False)
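A hedged alternative to the dictionary aggregation above is named aggregation, which produces flat single-level column names instead of the ('LabValue', 'mean') MultiIndex.
import pandas as pd
df = pd.read_csv("LabsCorePopulatedTable.txt", delimiter="\t")
df = df[df["LabName"] == "URINALYSIS: RED BLOOD CELLS"]
# Each keyword becomes one flat output column
summary = df.groupby("PatientID").agg(
    lab_count=("LabValue", "count"),
    lab_max=("LabValue", "max"),
    lab_min=("LabValue", "min"),
    lab_sum=("LabValue", "sum"),
    lab_mean=("LabValue", "mean"),
)
summary["lab_mean"] = summary["lab_mean"].round(2)
summary.sort_values("lab_mean", ascending=False)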
import pandas as pd
import numpy as np
filename = "LabsCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
df.set_index("PatientID", inplace=True)
count = df.groupby(["PatientID", "LabName"]).agg((
{
"LabValue": ["count", "max", "min", "sum", "mean"]
}
))
count[('LabValue', 'mean')] = count[('LabValue', 'mean')].round(2)
count.sort_values(["PatientID", "LabName", ('LabValue', 'count')], ascending=(False, True, False))
# count.loc["9E18822E-7D13-45C7-B50E-F95CFF92BC3E"].sort_values(('LabValue', 'count'), ascending=False)
import pandas as pd
import numpy as np
filename = "AdmissionsCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
df.set_index("PatientID", inplace=True)
df['AdmissionStartDate'] = pd.to_datetime(df['AdmissionStartDate'])
df['AdmissionEndDate'] = pd.to_datetime(df['AdmissionEndDate'])
df["StayDuration"] = (df['AdmissionEndDate'] - df['AdmissionStartDate']).dt.days
df.groupby("PatientID").max().sort_values('StayDuration', ascending=False)
# df[df["StayDuration"] > 20]
15.3.2.7. Using a value from a subquery#
import pandas as pd
import numpy as np
filename = "AdmissionsCorePopulatedTable.txt"
df_adm = pd.read_csv(filename, delimiter="\t")
df_adm.set_index("PatientID", inplace=True)
df_adm['AdmissionStartDate'] = pd.to_datetime(df_adm['AdmissionStartDate'])
df_adm['AdmissionEndDate'] = pd.to_datetime(df_adm['AdmissionEndDate'])
df_adm["StayDuration"] = (df_adm['AdmissionEndDate'] - df_adm['AdmissionStartDate']).dt.days
df_adm = df_adm.groupby("PatientID").max().sort_values('StayDuration', ascending=False)
df_adm = df_adm[df_adm["StayDuration"] >= 19]["StayDuration"]
df_adm
filename = "PatientCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
df.set_index("PatientID", inplace=True)
new_df = df.join(df_adm)
new_df[new_df["StayDuration"].notna()]
15.3.2.8. IN and NOT IN examples#
filename = "LabsCorePopulatedTable.txt"
labs = pd.read_csv(filename, delimiter="\t")
labs.set_index("PatientID", inplace=True)
filename = "PatientCorePopulatedTable.txt"
pt = pd.read_csv(filename, delimiter="\t")
pt.set_index("PatientID", inplace=True)
pt
labs.loc[pt[pt["PatientLanguage"].isin(('Icelandic', 'Spanish'))].index].loc["81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE"]
filename = "LabsCorePopulatedTable.txt"
labs = pd.read_csv(filename, delimiter="\t")
labs.set_index("PatientID", inplace=True)
filename = "PatientCorePopulatedTable.txt"
pt = pd.read_csv(filename, delimiter="\t")
pt.set_index("PatientID", inplace=True)
pt
labs.loc[pt[~pt["PatientLanguage"].isin(('Icelandic', 'Spanish'))].index][:10]
15.3.2.9. Conditionals#
import pandas as pd
import numpy as np
filename = "AdmissionsCorePopulatedTable.txt"
df_adm = pd.read_csv(filename, delimiter="\t", usecols =["AdmissionStartDate"])
df_adm['AdmissionMonth'] = pd.to_datetime(df_adm['AdmissionStartDate']).dt.month_name()
df_adm
df_adm = df_adm.groupby("AdmissionMonth").count()
df_adm.columns = ["AdmissionStartCount"]
df_adm.sort_values("AdmissionStartCount", ascending=False)
from datetime import datetime
filename = "PatientCorePopulatedTable.txt"
pt = pd.read_csv(filename, delimiter="\t")
pt.set_index("PatientID", inplace=True)
pt['PatientDateOfBirth'] = pd.to_datetime(pt['PatientDateOfBirth'])
pt["Age"] = ((datetime.now() - pt['PatientDateOfBirth']).dt.days / 365.25).astype(int)
pt["AgeCategory"] = pd.cut(pt["Age"], bins=[-np.inf, 17, 35, 55, np.inf], labels=['YOUTH', 'YOUNG ADULT', 'ADULT', 'SENIOR'])
new_pt = pt[["Age", "AgeCategory"]].sort_values("Age")
new_pt.groupby("AgeCategory", observed=True).count()
import pandas as pd
filename = "PatientCorePopulatedTable.txt"
pt = pd.read_csv(filename, delimiter="\t")
pt.set_index("PatientID", inplace=True)
(
len(pt.query("PatientGender == 'Male' & PatientMaritalStatus == 'Married'")) /
len(pt.query("PatientGender == 'Female' & PatientMaritalStatus == 'Married'"))
)
import pandas as pd
filename = "LabsCorePopulatedTable.txt"
df = pd.read_csv(filename, delimiter="\t")
urinalysis_exams = [
'URINALYSIS: SPECIFIC GRAVITY',
'URINALYSIS: PH',
'URINALYSIS: RED BLOOD CELLS',
'URINALYSIS: WHITE BLOOD CELLS'
]
exam_check = pd.crosstab(
df['PatientID'],
df['LabName']
)[urinalysis_exams].astype(bool)
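To show what the boolean crosstab is good for, here is a small self-contained sketch that keeps only the patients who have all four urinalysis exams on file, using DataFrame.all across the columns.
import pandas as pd
df = pd.read_csv("LabsCorePopulatedTable.txt", delimiter="\t")
urinalysis_exams = [
    'URINALYSIS: SPECIFIC GRAVITY',
    'URINALYSIS: PH',
    'URINALYSIS: RED BLOOD CELLS',
    'URINALYSIS: WHITE BLOOD CELLS'
]
# True where a patient has at least one record of that exam
exam_check = pd.crosstab(df['PatientID'], df['LabName'])[urinalysis_exams].astype(bool)
# Patients with every one of the four exams
exam_check[exam_check.all(axis=1)]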
import pandas as pd
# Sample DataFrame (kept under a separate name so it does not overwrite the labs df used in the next cell)
sample_df = pd.DataFrame({
    'Gender': ['M', 'F', 'M', 'F', 'M'],
    'Category': ['A', 'B', 'A', 'A', 'B']
})
# Create a cross-tabulation
result = pd.crosstab(sample_df['Gender'], sample_df['Category'])
print(result)
result = pd.crosstab(
df['PatientID'],
df['LabName'],
values=df['LabValue'],
aggfunc='mean' # or 'sum', 'min', 'max', etc.
)
result
import pandas as pd
filename = "LabsCorePopulatedTable.txt"
lab = pd.read_csv(filename, delimiter="\t")
lab.set_index("PatientID", inplace=True)
lab = lab[lab['LabName']=='METABOLIC: CREATININE'][['LabName', 'LabValue', 'LabUnits']]
filename = "PatientCorePopulatedTable.txt"
pt = pd.read_csv(filename, delimiter="\t")
pt.set_index("PatientID", inplace=True)
pt = pt[['PatientGender']]
lab = lab.join(pt)
def interpret_creatinine_result(row):
    if row['PatientGender'] == 'Male' and 0.7 <= row['LabValue'] <= 1.3:
        return 'Normal'
    elif row['PatientGender'] == 'Female' and 0.6 <= row['LabValue'] <= 1.1:
        return 'Normal'
    else:
        return 'Out of Range'
lab['Interpretation'] = lab.apply(lambda row: interpret_creatinine_result(row), axis=1)
lab = lab[['PatientGender', 'LabName', 'LabValue', 'LabUnits', 'Interpretation']]
lab.sort_values('LabValue', ascending=False)
lab.loc['81C5B13B-F6B2-4E57-9593-6E7E4C13B2CE']['Interpretation'].value_counts()
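The row-wise apply above can also be sketched in vectorized form with numpy.select, which evaluates both gender-specific ranges at once; the cut-offs are the same ones used in interpret_creatinine_result.
import numpy as np
import pandas as pd
lab = pd.read_csv("LabsCorePopulatedTable.txt", delimiter="\t").set_index("PatientID")
lab = lab[lab['LabName'] == 'METABOLIC: CREATININE'][['LabName', 'LabValue', 'LabUnits']]
pt = pd.read_csv("PatientCorePopulatedTable.txt", delimiter="\t").set_index("PatientID")[['PatientGender']]
lab = lab.join(pt)
# np.select picks 'Normal' where either range matches, 'Out of Range' otherwise
conditions = [
    (lab['PatientGender'] == 'Male') & lab['LabValue'].between(0.7, 1.3),
    (lab['PatientGender'] == 'Female') & lab['LabValue'].between(0.6, 1.1),
]
lab['Interpretation'] = np.select(conditions, ['Normal', 'Normal'], default='Out of Range')
lab.sort_values('LabValue', ascending=False)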
15.3.3. Analytic Functions#
import pandas as pd
df = pd.read_csv('bakery_sales.csv')
df['sale_date'] = pd.to_datetime(df['sale_date'])
new_df = df.copy()
new_df['Quarter'] = new_df['sale_date'].dt.quarter
new_df['Month'] = new_df['sale_date'].dt.month_name()
new_df = new_df[['Quarter', 'Month', 'total']]
result = new_df.groupby(['Quarter', 'Month']).sum()
def month_sort(x):
    month_order = {
        'January': 1,
        'February': 2,
        'March': 3,
        'April': 4,
        'May': 5,
        'June': 6,
        'July': 7,
        'August': 8,
        'September': 9,
        'October': 10,
        'November': 11,
        'December': 12
    }
    return x.map(month_order)
result.sort_values('Month', key=lambda x: month_sort(x))
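Another way to keep months in calendar order is an ordered Categorical, which lets sort_values (and groupby) order the labels without a custom key; a sketch using the same bakery_sales.csv columns.
import pandas as pd
df = pd.read_csv('bakery_sales.csv')
df['sale_date'] = pd.to_datetime(df['sale_date'])
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
new_df = df.assign(
    Quarter=df['sale_date'].dt.quarter,
    # Ordered categorical: months sort in calendar order, not alphabetically
    Month=pd.Categorical(df['sale_date'].dt.month_name(), categories=month_order, ordered=True)
)[['Quarter', 'Month', 'total']]
new_df.groupby(['Quarter', 'Month'], observed=True)['total'].sum().reset_index()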
15.3.3.1. More Examples#
import pandas as pd
import numpy as np
# Create the sample DataFrame
data = {
'Quarter': ['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3', 'Q4', 'Q4', 'Q4'],
'Month': ['January', 'February', 'March', 'April', 'May', 'July', 'August',
'September', 'October', 'November', 'December'],
'Monthly_Sales': [4582500, 6423700, 6445100, 4893700, 308400, 4076500,
6100500, 4895500, 3959100, 4543000, 5009500]
}
df = pd.DataFrame(data)
15.3.3.1.1. 1. Basic Window Operations#
# Running total of sales
df['Cumulative_Sales'] = df['Monthly_Sales'].cumsum()
# Average sales by quarter
df['Avg_Quarter_Sales'] = df.groupby('Quarter')['Monthly_Sales'].transform('mean')
# Sales rank within quarter
df['Quarter_Sales_Rank'] = df.groupby('Quarter')['Monthly_Sales'].rank(method='dense')
print("\nBasic Window Operations:")
df
15.3.3.1.2. 2. Advanced Window Calculations#
# Rolling 3-month sales average
df['Rolling_3Month_Avg'] = df['Monthly_Sales'].rolling(window=3, min_periods=1).mean()
# Cumulative sum within quarter
df['Quarter_Cumulative'] = df.groupby('Quarter')['Monthly_Sales'].transform('cumsum')
# Percentage of quarterly total
df['Pct_of_Quarter'] = df.groupby('Quarter')['Monthly_Sales'].transform(
lambda x: x / x.sum() * 100
)
print("\nAdvanced Window Calculations:")
df
15.3.3.1.3. 3. Ranking Functions#
# Different ranking methods within quarter
df['Dense_Rank'] = df.groupby('Quarter')['Monthly_Sales'].rank(method='dense') #ascending=False
df['Regular_Rank'] = df.groupby('Quarter')['Monthly_Sales'].rank(method='min')
df['Percent_Rank'] = df.groupby('Quarter')['Monthly_Sales'].rank(pct=True)
print("\nRanking Functions:")
print(df[['Quarter', 'Month', 'Monthly_Sales', 'Dense_Rank', 'Regular_Rank', 'Percent_Rank']])
15.3.3.1.4. 4. Lead/Lag Analysis#
# Next month's sales
df['Next_Month_Sales'] = df['Monthly_Sales'].shift(-1)
# Previous month's sales
df['Prev_Month_Sales'] = df['Monthly_Sales'].shift(1)
# Sales difference from previous month
df['Month_Over_Month_Change'] = df['Monthly_Sales'] - df['Prev_Month_Sales']
print("\nLead/Lag Analysis:")
df[['Month', 'Monthly_Sales', 'Next_Month_Sales', 'Prev_Month_Sales', 'Month_Over_Month_Change']]
15.3.3.1.5. 5. Complex Analysis#
# Comprehensive analysis
df_analyzed = df.assign(
# Quarter total
Quarter_Total=lambda x: x.groupby('Quarter')['Monthly_Sales'].transform('sum'),
# Percentage of annual sales
Pct_of_Annual=lambda x: x['Monthly_Sales'] / x['Monthly_Sales'].sum() * 100,
# Quarter-over-Quarter growth
QoQ_Growth=lambda x: x.groupby('Quarter')['Monthly_Sales'].transform('sum').pct_change() * 100,
# Running total within quarter
Quarter_Running_Total=lambda x: x.groupby('Quarter')['Monthly_Sales'].transform('cumsum'),
# Difference from quarter average
Diff_From_Quarter_Avg=lambda x: x['Monthly_Sales'] -
x.groupby('Quarter')['Monthly_Sales'].transform('mean')
)
print("\nComprehensive Analysis:")
df_analyzed
15.3.3.1.6. 6. Summary Statistics#
# Summary by quarter
quarterly_summary = df.groupby('Quarter').agg({
'Monthly_Sales': ['count', 'sum', 'mean', 'min', 'max', 'std']
}).round(2)
print("\nQuarterly Summary:")
display(quarterly_summary)
# Year-to-date calculations
df['YTD_Sales'] = df['Monthly_Sales'].cumsum()
df['YTD_Avg'] = df['Monthly_Sales'].expanding().mean()
print("\nYear-to-Date Analysis:")
df[['Month', 'Monthly_Sales', 'YTD_Sales', 'YTD_Avg']]
15.3.3.1.7. 7. Performance Metrics#
# Calculate various performance metrics
performance_metrics = pd.DataFrame({
'Month': df['Month'],
'Sales': df['Monthly_Sales'],
'Pct_of_Year': df['Monthly_Sales'] / df['Monthly_Sales'].sum() * 100,
'Cumulative_Pct': (df['Monthly_Sales'].cumsum() / df['Monthly_Sales'].sum() * 100),
'Rolling_3M_Avg': df['Monthly_Sales'].rolling(3, min_periods=1).mean(),
'Rolling_3M_Std': df['Monthly_Sales'].rolling(3, min_periods=1).std()
}).round(2)
print("\nPerformance Metrics:")
performance_metrics
15.3.3.1.8. 8. Change#
import pandas as pd
# Create the sample DataFrame
data = {
'Quarter': ['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3', 'Q4', 'Q4', 'Q4'],
'Month': ['January', 'February', 'March', 'April', 'May', 'July', 'August',
'September', 'October', 'November', 'December'],
'Monthly_Sales': [4582500, 6423700, 6445100, 4893700, 308400, 4076500,
6100500, 4895500, 3959100, 4543000, 5009500]
}
df = pd.DataFrame(data)
# Calculate change from previous month
df['Change_From_Previous'] = df['Monthly_Sales'].diff()
# Add percentage change
df['Pct_Change'] = df['Monthly_Sales'].pct_change() * 100
# Format the results
result = df.copy()
result['Change_From_Previous'] = result['Change_From_Previous'].round(2)
result['Pct_Change'] = result['Pct_Change'].round(2)
# Add trend indicator
result['Trend'] = result['Change_From_Previous'].apply(
# lambda x: '↑' if x > 0 else '↓' if x < 0 else '→'
lambda x: "\U0001F600" if x > 0 else "\U0001F61E" if x < 0 else "\U0001F610"
)
print("Sales Analysis with Month-over-Month Changes:")
result
15.3.3.2. 9. Various Rankings#
rank: returns the same ranking in case of a tie, with gaps in the rankings
dense_rank: returns the same ranking in case of a tie, with no gaps in the rankings
row_number: returns a unique number for each row, with rankings arbitrarily assigned in case of a tie
Pandas offers other types of ranking as well.
import pandas as pd
import numpy as np
# Create sample data
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
'Score': [85, 92, 92, 78, 85, 95]
}
# Create DataFrame
df = pd.DataFrame(data)
# Sort by Score in descending order first
df_sorted = df.sort_values('Score', ascending=False)
# Add different types of ranks
df_sorted['Rank'] = df_sorted['Score'].rank(method='min', ascending=False)
df_sorted['DenseRank'] = df_sorted['Score'].rank(method='dense', ascending=False)
df_sorted['RowNumber'] = range(1, len(df_sorted) + 1) # Add row numbers after sorting
print("\nDataFrame sorted by Score (descending):")
df_sorted
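To tie this back to SQL, ROW_NUMBER-style numbering can also come from rank(method='first'), and adding a groupby approximates PARTITION BY. The Team column below is purely illustrative and not part of the original sample data.
import pandas as pd
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'Score': [85, 92, 92, 78, 85, 95],
    'Team': ['X', 'X', 'Y', 'Y', 'X', 'Y']  # hypothetical column, added only for the partition example
})
# method='first' breaks ties by order of appearance, like ROW_NUMBER()
df['RowNumberLike'] = df['Score'].rank(method='first', ascending=False)
# Ranking within each team approximates RANK() OVER (PARTITION BY Team ORDER BY Score DESC)
df['RankWithinTeam'] = df.groupby('Team')['Score'].rank(method='min', ascending=False)
df.sort_values(['Team', 'Score'], ascending=[True, False])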
15.3.4. Pivot Tables#
15.3.4.1. Basic Example#
import pandas as pd
# Create sample data
data = {
'Date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02', '2024-01-03'],
'Product': ['Laptop', 'Phone', 'Laptop', 'Phone', 'Laptop'],
'Region': ['North', 'South', 'North', 'North', 'South'],
'Sales': [1000, 500, 1200, 600, 900]
}
# Create DataFrame
df = pd.DataFrame(data)
# Create a pivot table
pivot_table = pd.pivot_table(
df,
values='Sales',
index='Product',
columns='Region',
aggfunc='sum', # 'mean', 'count', 'max'
fill_value=0 # fill_value=0 replaces any NaN values with 0
)
print("\nOriginal Data:")
display(df)
print("\nPivot Table:")
display(pivot_table)
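For comparison, a roughly equivalent result can be sketched with groupby plus unstack; pivot_table is usually the more readable choice, but seeing both makes the reshaping explicit.
import pandas as pd
df = pd.DataFrame({
    'Date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02', '2024-01-03'],
    'Product': ['Laptop', 'Phone', 'Laptop', 'Phone', 'Laptop'],
    'Region': ['North', 'South', 'North', 'North', 'South'],
    'Sales': [1000, 500, 1200, 600, 900]
})
# groupby + unstack yields the same Product-by-Region layout as the pivot table above
df.groupby(['Product', 'Region'])['Sales'].sum().unstack(fill_value=0)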
15.3.4.2. Multiindices#
import pandas as pd
# Create sample data
data = {
'Date': ['2024-01-01', '2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02',
'2024-01-02', '2024-01-03', '2024-01-03', '2024-01-03'],
'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone',
'Tablet', 'Laptop', 'Phone', 'Tablet'],
'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics',
'Electronics', 'Electronics', 'Electronics', 'Electronics'],
'Region': ['North', 'South', 'North', 'North', 'South',
'South', 'North', 'North', 'South'],
'Sales': [1000, 500, 300, 1200, 600, 400, 900, 700, 350],
'Units': [5, 10, 6, 6, 12, 8, 4, 14, 7]
}
# Create DataFrame
df = pd.DataFrame(data)
# Create a pivot table with multiple indices
pivot_table = pd.pivot_table(
df,
values=['Sales', 'Units'], # Multiple values
index=['Category', 'Product'], # Multiple indices
columns=['Region'],
aggfunc={'Sales': 'sum', 'Units': 'mean'}, # Different aggregations for different values
fill_value=0,
margins=True # Add totals
)
print("\nOriginal Data:")
display(df)
print("\nPivot Table with Multiple Indices:")
display(pivot_table)
# To make it more readable, we can also flatten the column headers
pivot_table.columns = [f'{col[0]}_{col[1]}' for col in pivot_table.columns]
print("\nPivot Table with Flattened Headers:")
display(pivot_table)
15.3.4.3. Multiple Aggregations#
pivot_table = pd.pivot_table(
df,
values=['Sales', 'Units'],
index=['Category', 'Product'],
columns=['Region'],
aggfunc={
'Sales': ['sum', 'mean'], # Multiple aggregations for Sales
'Units': ['mean', 'max'] # Multiple aggregations for Units
},
fill_value=0
)
pivot_table