13.12. India Export Database#
import pandas as pd
import sqlite3
df = pd.read_csv('india_export.csv')
display(df)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import pandas as pd
2 import sqlite3
4 df = pd.read_csv('india_export.csv')
ModuleNotFoundError: No module named 'pandas'
conn = sqlite3.connect('india_export.db')
cur = conn.cursor()
cur.execute('''DROP TABLE IF EXISTS export''')
df.to_sql('export', conn, if_exists='replace', index=False) # - writes the pd.df to SQLIte DB
conn.commit()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[2], line 1
----> 1 conn = sqlite3.connect('india_export.db')
2 cur = conn.cursor()
3 cur.execute('''DROP TABLE IF EXISTS export''')
NameError: name 'sqlite3' is not defined
pd.read_sql('select * from export', conn)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 pd.read_sql('select * from export', conn)
NameError: name 'pd' is not defined
conn.close()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 conn.close()
NameError: name 'conn' is not defined
13.13. Steps to normalizing India export database#
Inspect table and determine how many tables into convert the database into
Four tables:
year
country
commodity
a table that joins country, year, commmodity, and value
Steps to create year table
list of unique years
create statement
insert function
import pandas as pd
import sqlite3
from sqlite3 import Error
def create_connection(db_file, delete_db=False):
import os
if delete_db and os.path.exists(db_file):
os.remove(db_file)
conn = None
try:
conn = sqlite3.connect(db_file)
conn.execute("PRAGMA foreign_keys = 1")
except Error as e:
print(e)
return conn
def create_table(conn, create_table_sql):
try:
c = conn.cursor()
c.execute(create_table_sql)
except Error as e:
print(e)
def execute_sql_statement(sql_statement, conn):
cur = conn.cursor()
cur.execute(sql_statement)
rows = cur.fetchall()
return rows
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[5], line 1
----> 1 import pandas as pd
2 import sqlite3
3 from sqlite3 import Error
ModuleNotFoundError: No module named 'pandas'
conn = create_connection('india_export.db')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 conn = create_connection('india_export.db')
NameError: name 'create_connection' is not defined
# Get list of unique years
sql_statement = "SELECT DISTINCT year from export ORDER BY year"
years = execute_sql_statement(sql_statement, conn)
years = list(map(lambda row: int(row[0]), years))
print(years)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 5
1 # Get list of unique years
3 sql_statement = "SELECT DISTINCT year from export ORDER BY year"
----> 5 years = execute_sql_statement(sql_statement, conn)
6 years = list(map(lambda row: int(row[0]), years))
7 print(years)
NameError: name 'execute_sql_statement' is not defined
create_table_sql = """CREATE TABLE [Year] (
[YEAR] INTEGER NOT NULL PRIMARY KEY
);
"""
# conn_norm.close()
conn_norm = create_connection('india_export_norm.db', True)
create_table(conn_norm, create_table_sql)
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
conn_norm.close()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 7
1 create_table_sql = """CREATE TABLE [Year] (
2 [YEAR] INTEGER NOT NULL PRIMARY KEY
3 );
4 """
5 # conn_norm.close()
----> 7 conn_norm = create_connection('india_export_norm.db', True)
8 create_table(conn_norm, create_table_sql)
9 sql_statement = "SELECT * FROM Year"
NameError: name 'create_connection' is not defined
conn_norm = create_connection('india_export_norm.db')
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
2 sql_statement = "SELECT * FROM Year"
3 df = pd.read_sql_query(sql_statement, conn_norm)
NameError: name 'create_connection' is not defined
(2022,)
(2022,)
def insert_year(conn, values):
sql = ''' INSERT INTO YEAR(YEAR)
VALUES(?) '''
cur = conn.cursor()
cur.execute(sql, values)
return cur.lastrowid
with conn_norm:
for year in years:
insert_year(conn_norm, (year, ))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 8
5 cur.execute(sql, values)
6 return cur.lastrowid
----> 8 with conn_norm:
9 for year in years:
10 insert_year(conn_norm, (year, ))
NameError: name 'conn_norm' is not defined
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 2
1 sql_statement = "SELECT * FROM Year"
----> 2 df = pd.read_sql_query(sql_statement, conn_norm)
3 display(df)
NameError: name 'pd' is not defined
conn_norm.close()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 1
----> 1 conn_norm.close()
NameError: name 'conn_norm' is not defined
conn_norm = create_connection('india_export_norm.db')
sql_statement = "SELECT * FROM Year"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
2 sql_statement = "SELECT * FROM Year"
3 df = pd.read_sql_query(sql_statement, conn_norm)
NameError: name 'create_connection' is not defined
# Get list of unique countries
sql_statement = "SELECT DISTINCT country from export ORDER BY country"
countries = execute_sql_statement(sql_statement, conn)
countries = list(map(lambda row: row[0], countries))
print(countries)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 5
1 # Get list of unique countries
3 sql_statement = "SELECT DISTINCT country from export ORDER BY country"
----> 5 countries = execute_sql_statement(sql_statement, conn)
6 countries = list(map(lambda row: row[0], countries))
7 print(countries)
NameError: name 'execute_sql_statement' is not defined
def insert_country(conn, values):
sql = ''' INSERT INTO COUNTRY(COUNTRY)
VALUES(?) '''
cur = conn.cursor()
cur.execute(sql, values)
return cur.lastrowid
create_table_sql = """CREATE TABLE IF NOT EXISTS [Country] (
[Country] TEXT NOT NULL PRIMARY KEY
);
"""
conn_norm = create_connection('india_export_norm.db')
create_table(conn_norm, create_table_sql)
with conn_norm:
for country in countries:
insert_country(conn_norm, (country, ))
sql_statement = "SELECT * FROM country"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 12
6 return cur.lastrowid
8 create_table_sql = """CREATE TABLE IF NOT EXISTS [Country] (
9 [Country] TEXT NOT NULL PRIMARY KEY
10 );
11 """
---> 12 conn_norm = create_connection('india_export_norm.db')
14 create_table(conn_norm, create_table_sql)
16 with conn_norm:
NameError: name 'create_connection' is not defined
sql_statement = "SELECT DISTINCT COMMODITY FROM export order by COMMODITY"
commodities = execute_sql_statement(sql_statement, conn)
print(commodities)
print(len(commodities))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 2
1 sql_statement = "SELECT DISTINCT COMMODITY FROM export order by COMMODITY"
----> 2 commodities = execute_sql_statement(sql_statement, conn)
3 print(commodities)
4 print(len(commodities))
NameError: name 'execute_sql_statement' is not defined
def insert_commodity(conn, values):
sql = ''' INSERT INTO COMMODITY (COMMODITY)
VALUES(?) '''
cur = conn.cursor()
cur.execute(sql, values)
return cur.lastrowid
create_table_sql = """CREATE TABLE IF NOT EXISTS [COMMODITY] (
[HSCODE] INTEGER NOT NULL PRIMARY KEY,
[COMMODITY] TEXT NOT NULL
);
"""
create_table(conn_norm, create_table_sql)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 13
6 return cur.lastrowid
8 create_table_sql = """CREATE TABLE IF NOT EXISTS [COMMODITY] (
9 [HSCODE] INTEGER NOT NULL PRIMARY KEY,
10 [COMMODITY] TEXT NOT NULL
11 );
12 """
---> 13 create_table(conn_norm, create_table_sql)
NameError: name 'create_table' is not defined
with conn_norm:
for commodity in commodities:
insert_commodity(conn_norm, commodity)
sql_statement = "SELECT * FROM Commodity"
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 1
----> 1 with conn_norm:
2 for commodity in commodities:
3 insert_commodity(conn_norm, commodity)
NameError: name 'conn_norm' is not defined
# Table 4 which is going link all the tables with values
def insert_value(conn, values):
sql = ''' INSERT INTO Export (HSCODE, COUNTRY, YEAR, VALUE)
VALUES(?, ?, ?, ?) '''
cur = conn.cursor()
cur.execute(sql, values)
return cur.lastrowid
create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
[VALUEID] INTEGER NOT NULL PRIMARY KEY,
[HSCODE] INTEGER NOT NULL,
[COUNTRY] TEXT NOT NULL,
[YEAR] INTEGER NOT NULL,
[VALUE] REAL,
FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 21
8 return cur.lastrowid
10 create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
11 [VALUEID] INTEGER NOT NULL PRIMARY KEY,
12 [HSCODE] INTEGER NOT NULL,
(...) 19 );
20 """
---> 21 create_table(conn_norm, create_table_sql)
NameError: name 'create_table' is not defined
sql_statement = 'SELECT Commodity, country, year, value from export'
values = execute_sql_statement(sql_statement, conn)
print(values[:10])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 2
1 sql_statement = 'SELECT Commodity, country, year, value from export'
----> 2 values = execute_sql_statement(sql_statement, conn)
3 print(values[:10])
NameError: name 'execute_sql_statement' is not defined
sql_statement = 'SELECT HSCODE, COMMODITY from COMMODITY'
values = execute_sql_statement(sql_statement, conn_norm)
print(values[:10])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 2
1 sql_statement = 'SELECT HSCODE, COMMODITY from COMMODITY'
----> 2 values = execute_sql_statement(sql_statement, conn_norm)
3 print(values[:10])
NameError: name 'execute_sql_statement' is not defined
commodity_lookup = {}
for hscode, commodity in values:
commodity_lookup[commodity] = hscode
print(commodity_lookup)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 2
1 commodity_lookup = {}
----> 2 for hscode, commodity in values:
3 commodity_lookup[commodity] = hscode
5 print(commodity_lookup)
NameError: name 'values' is not defined
commodity_lookup['MEAT AND EDIBLE MEAT OFFAL.']
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[24], line 1
----> 1 commodity_lookup['MEAT AND EDIBLE MEAT OFFAL.']
KeyError: 'MEAT AND EDIBLE MEAT OFFAL.'
# create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
# [VALUEID] INTEGER NOT NULL PRIMARY KEY,
# [HSCODE] INTEGER NOT NULL,
# [COUNTRY] TEXT NOT NULL,
# [YEAR] INTEGER NOT NULL,
# [VALUE] REAL,
# FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
# FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
# FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
# );
# """
sql_statement = 'SELECT Commodity, country, year, value from export'
values = execute_sql_statement(sql_statement, conn)
# with conn_norm:
count = 0
with conn_norm:
for value in values:
text = value[0]
hscode = commodity_lookup[text]
insert_tuple = (hscode, value[1], value[2], value[3])
insert_value(conn_norm, insert_tuple)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 14
1 # create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
2 # [VALUEID] INTEGER NOT NULL PRIMARY KEY,
3 # [HSCODE] INTEGER NOT NULL,
(...) 10 # );
11 # """
13 sql_statement = 'SELECT Commodity, country, year, value from export'
---> 14 values = execute_sql_statement(sql_statement, conn)
15 # with conn_norm:
16 count = 0
NameError: name 'execute_sql_statement' is not defined
sql_statement = """SELECT
export.hscode,
Commodity.COMMODITY,
export.country,
export.year,
export.value
FROM Export
JOIN Commodity ON Commodity.hscode = export.hscode
"""
df = pd.read_sql_query(sql_statement, conn_norm)
display(df)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 10
1 sql_statement = """SELECT
2 export.hscode,
3 Commodity.COMMODITY,
(...) 8 JOIN Commodity ON Commodity.hscode = export.hscode
9 """
---> 10 df = pd.read_sql_query(sql_statement, conn_norm)
11 display(df)
NameError: name 'pd' is not defined
conn_norm.close()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 1
----> 1 conn_norm.close()
NameError: name 'conn_norm' is not defined
import time
time.monotonic()
10317.868688047
conn_norm = create_connection('india_export_norm.db')
cur = conn_norm.cursor()
cur.execute('DROP TABLE EXPORT')
create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
[VALUEID] INTEGER NOT NULL PRIMARY KEY,
[HSCODE] INTEGER NOT NULL,
[COUNTRY] TEXT NOT NULL,
[YEAR] INTEGER NOT NULL,
[VALUE] REAL,
FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)
with conn_norm:
start_time = time.monotonic()
for value in values:
text = value[0]
hscode = commodity_lookup[text]
insert_tuple = (hscode, value[1], value[2], value[3])
insert_value(conn_norm, insert_tuple)
end_time = time.monotonic()
print(end_time-start_time)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
2 cur = conn_norm.cursor()
3 cur.execute('DROP TABLE EXPORT')
NameError: name 'create_connection' is not defined
insert_tuples = []
for value in values:
text = value[0]
hscode = commodity_lookup[text]
insert_tuple = (hscode, value[1], value[2], value[3])
insert_tuples.append(insert_tuple)
print(insert_tuples[:100])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[30], line 3
1 insert_tuples = []
----> 3 for value in values:
4 text = value[0]
5 hscode = commodity_lookup[text]
NameError: name 'values' is not defined
conn_norm = create_connection('india_export_norm.db')
cur = conn_norm.cursor()
cur.execute('DROP TABLE EXPORT')
create_table_sql = """CREATE TABLE IF NOT EXISTS [Export] (
[VALUEID] INTEGER NOT NULL PRIMARY KEY,
[HSCODE] INTEGER NOT NULL,
[COUNTRY] TEXT NOT NULL,
[YEAR] INTEGER NOT NULL,
[VALUE] REAL,
FOREIGN KEY(COUNTRY) REFERENCES Country(COUNTRY),
FOREIGN KEY(YEAR) REFERENCES YEAR(YEAR),
FOREIGN KEY(HSCODE) REFERENCES Commodity(HSCODE)
);
"""
create_table(conn_norm, create_table_sql)
sql = ''' INSERT INTO Export (HSCODE, COUNTRY, YEAR, VALUE)
VALUES(?, ?, ?, ?) '''
cur = conn_norm.cursor()
start_time = time.monotonic()
cur.executemany(sql, insert_tuples)
conn_norm.commit()
end_time = time.monotonic()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[31], line 1
----> 1 conn_norm = create_connection('india_export_norm.db')
2 cur = conn_norm.cursor()
3 cur.execute('DROP TABLE EXPORT')
NameError: name 'create_connection' is not defined
print(end_time-start_time)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[32], line 1
----> 1 print(end_time-start_time)
NameError: name 'end_time' is not defined
x = 'abcd'
x.upper
<function str.upper()>
conn_norm.commit
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[34], line 1
----> 1 conn_norm.commit
NameError: name 'conn_norm' is not defined