13.12. India Export Database#

import pandas as pd
import sqlite3

df = pd.read_csv('india_export.csv')
display(df)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import pandas as pd
      2 import sqlite3
      4 df = pd.read_csv('india_export.csv')

ModuleNotFoundError: No module named 'pandas'
conn = sqlite3.connect('india_export.db')
cur = conn.cursor()
cur.execute('''DROP TABLE IF EXISTS export''')
df.to_sql('export', conn, if_exists='replace', index=False) # - writes the pd.df to SQLIte DB
conn.commit()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 conn = sqlite3.connect('india_export.db')
      2 cur = conn.cursor()
      3 cur.execute('''DROP TABLE IF EXISTS export''')

NameError: name 'sqlite3' is not defined
pd.read_sql('select * from export', conn)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 pd.read_sql('select * from export', conn)

NameError: name 'pd' is not defined
conn.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 conn.close()

NameError: name 'conn' is not defined

13.13. Steps to normalizing India export database#

  • Inspect table and determine how many tables into convert the database into

    • Four tables:

      • year

      • country

      • commodity

      • a table that joins country, year, commmodity, and value

  • Steps to create year table

    • list of unique years

    • create statement

    • insert function

import pandas as pd
import sqlite3
from sqlite3 import Error

def create_connection(db_file, delete_db=False):
    import os
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)

    return conn


def create_table(conn, create_table_sql):
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def execute_sql_statement(sql_statement, conn):
    cur = conn.cursor()
    cur.execute(sql_statement)

    rows = cur.fetchall()

    return rows
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[5], line 1
----> 1 import pandas as pd
      2 import sqlite3
      3 from sqlite3 import Error

ModuleNotFoundError: No module named 'pandas'
conn = create_connection('india_export.db')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 conn = create_connection('india_export.db')

NameError: name 'create_connection' is not defined
# Get list of unique years

sql_statement = "SELECT DISTINCT year from export ORDER BY year"

years = execute_sql_statement(sql_statement, conn)
years = list(map(lambda row: int(row[0]), years))
print(years)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 5
      1 # Get list of unique years
      3 sql_statement = "SELECT DISTINCT year from export ORDER BY year"
----> 5 years = execute_sql_statement(sql_statement, conn)
      6 years = list(map(lambda row: int(row[0]), years))
      7 print(years)

NameError: name 'execute_sql_statement' is not defined
create_table_sql = """CREATE TABLE [Year] (
    [YEAR] INTEGER NOT NULL PRIMARY KEY
);
"""
# conn_norm.close()

conn_norm = create_connection('india_export_norm.db', True)
create_table(conn_norm, create_table_sql)
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
conn_norm.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 7
      1 create_table_sql = """CREATE TABLE [Year] (
      2     [YEAR] INTEGER NOT NULL PRIMARY KEY
      3 );
      4 """
      5 # conn_norm.close()
----> 7 conn_norm = create_connection('india_export_norm.db', True)
      8 create_table(conn_norm, create_table_sql)
      9 sql_statement = "SELECT * FROM Year"

NameError: name 'create_connection' is not defined
conn_norm = create_connection('india_export_norm.db')
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
      2 sql_statement = "SELECT * FROM Year"
      3 df = pd.read_sql_query(sql_statement, conn_norm)

NameError: name 'create_connection' is not defined
(2022,)
(2022,)
def insert_year(conn, values):
    sql = ''' INSERT INTO YEAR(YEAR)
              VALUES(?) '''
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid

with conn_norm:
    for year in years:
        insert_year(conn_norm, (year, ))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 8
      5     cur.execute(sql, values)
      6     return cur.lastrowid
----> 8 with conn_norm:
      9     for year in years:
     10         insert_year(conn_norm, (year, ))

NameError: name 'conn_norm' is not defined
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 2
      1 sql_statement = "SELECT * FROM Year"
----> 2 df = pd.read_sql_query(sql_statement, conn_norm)
      3 display(df)

NameError: name 'pd' is not defined
conn_norm.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 conn_norm.close()

NameError: name 'conn_norm' is not defined
conn_norm = create_connection('india_export_norm.db')
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
      2 sql_statement = "SELECT * FROM Year"
      3 df = pd.read_sql_query(sql_statement, conn_norm)

NameError: name 'create_connection' is not defined
# Get list of unique countries

sql_statement = "SELECT DISTINCT country from export ORDER BY country"

countries = execute_sql_statement(sql_statement, conn)
countries = list(map(lambda row: row[0], countries))
print(countries)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 5
      1 # Get list of unique countries
      3 sql_statement = "SELECT DISTINCT country from export ORDER BY country"
----> 5 countries = execute_sql_statement(sql_statement, conn)
      6 countries = list(map(lambda row: row[0], countries))
      7 print(countries)

NameError: name 'execute_sql_statement' is not defined
def insert_country(conn, values):
    sql = ''' INSERT INTO COUNTRY(COUNTRY)
              VALUES(?) '''
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid

create_table_sql = """CREATE TABLE IF NOT EXISTS [Country] (
    [Country] TEXT NOT NULL PRIMARY KEY
);
"""
conn_norm = create_connection('india_export_norm.db')

create_table(conn_norm, create_table_sql)

with conn_norm:
    for country in countries:
        insert_country(conn_norm, (country, ))
        
sql_statement = "SELECT * FROM country"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 12
      6     return cur.lastrowid
      8 create_table_sql = """CREATE TABLE IF NOT EXISTS [Country] (
      9     [Country] TEXT NOT NULL PRIMARY KEY
     10 );
     11 """
---> 12 conn_norm = create_connection('india_export_norm.db')
     14 create_table(conn_norm, create_table_sql)
     16 with conn_norm:

NameError: name 'create_connection' is not defined
sql_statement = "SELECT DISTINCT COMMODITY FROM export order by COMMODITY"
commodities = execute_sql_statement(sql_statement, conn)
print(commodities)
print(len(commodities))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 2
      1 sql_statement = "SELECT DISTINCT COMMODITY FROM export order by COMMODITY"
----> 2 commodities = execute_sql_statement(sql_statement, conn)
      3 print(commodities)
      4 print(len(commodities))

NameError: name 'execute_sql_statement' is not defined
def insert_commodity(conn, values):
    sql = ''' INSERT INTO COMMODITY (COMMODITY)
              VALUES(?) '''
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid

create_table_sql = """CREATE TABLE IF NOT EXISTS [COMMODITY] (
    [HSCODE] INTEGER NOT NULL PRIMARY KEY, 
    [COMMODITY] TEXT NOT NULL
);
"""
create_table(conn_norm, create_table_sql)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 13
      6     return cur.lastrowid
      8 create_table_sql = """CREATE TABLE IF NOT EXISTS [COMMODITY] (
      9     [HSCODE] INTEGER NOT NULL PRIMARY KEY, 
     10     [COMMODITY] TEXT NOT NULL
     11 );
     12 """
---> 13 create_table(conn_norm, create_table_sql)

NameError: name 'create_table' is not defined
with conn_norm:
    for commodity in commodities:
        insert_commodity(conn_norm, commodity)
        
sql_statement = "SELECT * FROM Commodity"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 with conn_norm:
      2     for commodity in commodities:
      3         insert_commodity(conn_norm, commodity)

NameError: name 'conn_norm' is not defined
# Table 4 which is going link all the tables with values

def insert_value(conn, values):
    sql = ''' INSERT INTO Export (HSCODE, COUNTRY, YEAR, VALUE)
              VALUES(?, ?, ?, ?) '''
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid

create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
    [VALUEID] INTEGER NOT NULL PRIMARY KEY,
    [HSCODE] INTEGER NOT NULL, 
    [COUNTRY] TEXT NOT NULL,
    [YEAR] INTEGER NOT NULL,
    [VALUE] REAL,
    FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
    FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
    FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 21
      8     return cur.lastrowid
     10 create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
     11     [VALUEID] INTEGER NOT NULL PRIMARY KEY,
     12     [HSCODE] INTEGER NOT NULL, 
   (...)     19 );
     20 """
---> 21 create_table(conn_norm, create_table_sql)

NameError: name 'create_table' is not defined
sql_statement = 'SELECT Commodity, country, year, value from export'
values = execute_sql_statement(sql_statement, conn)
print(values[:10])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 2
      1 sql_statement = 'SELECT Commodity, country, year, value from export'
----> 2 values = execute_sql_statement(sql_statement, conn)
      3 print(values[:10])

NameError: name 'execute_sql_statement' is not defined
sql_statement = 'SELECT HSCODE, COMMODITY from COMMODITY'
values = execute_sql_statement(sql_statement, conn_norm)
print(values[:10])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 2
      1 sql_statement = 'SELECT HSCODE, COMMODITY from COMMODITY'
----> 2 values = execute_sql_statement(sql_statement, conn_norm)
      3 print(values[:10])

NameError: name 'execute_sql_statement' is not defined
commodity_lookup = {}
for hscode, commodity in values:
    commodity_lookup[commodity] = hscode
    
print(commodity_lookup)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 2
      1 commodity_lookup = {}
----> 2 for hscode, commodity in values:
      3     commodity_lookup[commodity] = hscode
      5 print(commodity_lookup)

NameError: name 'values' is not defined
commodity_lookup['MEAT AND EDIBLE MEAT OFFAL.']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[24], line 1
----> 1 commodity_lookup['MEAT AND EDIBLE MEAT OFFAL.']

KeyError: 'MEAT AND EDIBLE MEAT OFFAL.'
# create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
#     [VALUEID] INTEGER NOT NULL PRIMARY KEY,
#     [HSCODE] INTEGER NOT NULL, 
#     [COUNTRY] TEXT NOT NULL,
#     [YEAR] INTEGER NOT NULL,
#     [VALUE] REAL,
#     FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
#     FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
#     FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
# );
# """

sql_statement = 'SELECT Commodity, country, year, value from export'
values = execute_sql_statement(sql_statement, conn)
# with conn_norm:
count = 0
with conn_norm:
    for value in values:
        text = value[0]
        hscode = commodity_lookup[text]
        insert_tuple = (hscode, value[1], value[2], value[3])
        insert_value(conn_norm, insert_tuple)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 14
      1 # create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
      2 #     [VALUEID] INTEGER NOT NULL PRIMARY KEY,
      3 #     [HSCODE] INTEGER NOT NULL, 
   (...)     10 # );
     11 # """
     13 sql_statement = 'SELECT Commodity, country, year, value from export'
---> 14 values = execute_sql_statement(sql_statement, conn)
     15 # with conn_norm:
     16 count = 0

NameError: name 'execute_sql_statement' is not defined
sql_statement = """SELECT 
export.hscode,
Commodity.COMMODITY,
export.country,
export.year,
export.value
FROM Export
JOIN Commodity ON Commodity.hscode = export.hscode
"""
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 10
      1 sql_statement = """SELECT 
      2 export.hscode,
      3 Commodity.COMMODITY,
   (...)      8 JOIN Commodity ON Commodity.hscode = export.hscode
      9 """
---> 10 df = pd.read_sql_query(sql_statement, conn_norm)
     11 display(df)

NameError: name 'pd' is not defined
conn_norm.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 conn_norm.close()

NameError: name 'conn_norm' is not defined
import time
time.monotonic() 
10317.868688047
conn_norm = create_connection('india_export_norm.db')
cur = conn_norm.cursor()
cur.execute('DROP TABLE EXPORT')
create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
    [VALUEID] INTEGER NOT NULL PRIMARY KEY,
    [HSCODE] INTEGER NOT NULL, 
    [COUNTRY] TEXT NOT NULL,
    [YEAR] INTEGER NOT NULL,
    [VALUE] REAL,
    FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
    FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
    FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)

with conn_norm:
    start_time = time.monotonic() 
    for value in values:
        text = value[0]
        hscode = commodity_lookup[text]
        insert_tuple = (hscode, value[1], value[2], value[3])
        insert_value(conn_norm, insert_tuple)
end_time = time.monotonic()     


print(end_time-start_time)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
      2 cur = conn_norm.cursor()
      3 cur.execute('DROP TABLE EXPORT')

NameError: name 'create_connection' is not defined
insert_tuples = []

for value in values:
    text = value[0]
    hscode = commodity_lookup[text]
    insert_tuple = (hscode, value[1], value[2], value[3])
    insert_tuples.append(insert_tuple)
print(insert_tuples[:100])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 3
      1 insert_tuples = []
----> 3 for value in values:
      4     text = value[0]
      5     hscode = commodity_lookup[text]

NameError: name 'values' is not defined
conn_norm = create_connection('india_export_norm.db')
cur = conn_norm.cursor()
cur.execute('DROP TABLE EXPORT')
create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
    [VALUEID] INTEGER NOT NULL PRIMARY KEY,
    [HSCODE] INTEGER NOT NULL, 
    [COUNTRY] TEXT NOT NULL,
    [YEAR] INTEGER NOT NULL,
    [VALUE] REAL,
    FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
    FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
    FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)
sql = ''' INSERT INTO Export (HSCODE, COUNTRY, YEAR, VALUE)
              VALUES(?, ?, ?, ?) '''
cur = conn_norm.cursor()
start_time = time.monotonic() 
cur.executemany(sql, insert_tuples)
conn_norm.commit()
end_time = time.monotonic()     
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
      2 cur = conn_norm.cursor()
      3 cur.execute('DROP TABLE EXPORT')

NameError: name 'create_connection' is not defined
print(end_time-start_time)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 print(end_time-start_time)

NameError: name 'end_time' is not defined
x = 'abcd'
x.upper
<function str.upper()>
conn_norm.commit
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 conn_norm.commit

NameError: name 'conn_norm' is not defined