Soccer Database

13.14. Soccer Database#

import pandas as pd
import sqlite3


df = pd.read_csv('all_players.csv')
df['Club'].isna().sum()
np.int64(5493)
conn = sqlite3.connect('all_players.db')
cur = conn.cursor()
df.to_sql('players', conn, if_exists='replace', index=False) # - writes the all_players.df to SQLIte DB
conn.commit()
conn.close()
import pandas as pd
import sqlite3
from sqlite3 import Error


def create_connection(db_file, delete_db=False):
    import os
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        conn.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)

    return conn


def create_table(conn, create_table_sql):
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def execute_sql_statement(sql_statement, conn):
    cur = conn.cursor()
    cur.execute(sql_statement)
    rows = cur.fetchall()

    return rows
display(df)
Player Club POS GP GS MINS Year Season
0 Roy Lassiter TB F 30 30 2580 1996 reg
1 Raul Diaz Arce DC F 28 28 2351 1996 reg
2 Eduardo Hurtado LA F 26 26 2323 1996 reg
3 Preki KC M 32 32 2880 1996 reg
4 Brian McBride CLB F 28 28 2307 1996 reg
... ... ... ... ... ... ... ... ...
15762 Alejandro Pozuelo NaN M 1 1 120 2020 post
15763 Franco Jara NaN F 2 2 210 2020 post
15764 Andy Polo NaN M 1 1 105 2020 post
15765 Cristian Roldan NaN M 4 4 360 2020 post
15766 Joevin Jones NaN D-M 4 4 260 2020 post

15767 rows × 8 columns

13.15. Tables#

  • Club

  • Position

  • Year

  • Season

  • Player

  • PlayerStats

conn = sqlite3.connect('all_players.db')
sql_statement = "SELECT DISTINCT CLUB FROM players"
rows = execute_sql_statement(sql_statement, conn)
rows = filter(lambda row: row[0], rows)
clubs = list(map(lambda row: row[0].strip(), rows))
clubs = list(set(clubs))
from collections import Counter

counter = Counter(clubs)
print(counter)
Counter({'ECU': 1, 'VAN': 1, 'COL': 1, 'ROC': 1, 'CHI': 1, 'LAFC': 1, 'USA': 1, 'HAI': 1, 'RSL': 1, 'SJ': 1, 'CHV': 1, 'MTL': 1, 'PAN': 1, 'CIN': 1, 'NSH': 1, 'POR': 1, 'DC': 1, 'MIN': 1, 'HOU': 1, 'SEA': 1, 'KC': 1, 'NYC': 1, 'TOR': 1, 'LFC': 1, 'ORL': 1, 'RBNY': 1, 'MET': 1, 'SKC': 1, 'GHA': 1, 'NYR': 1, 'TB': 1, 'LA': 1, 'PHI': 1, 'CIV': 1, 'SLV': 1, 'MCF': 1, 'CAN': 1, 'MTQ': 1, 'MIA': 1, 'JAM': 1, 'DAL': 1, 'NE': 1, 'HON': 1, 'CLB': 1, 'ATL': 1, 'NY': 1})
# write create statement
# execute create statement
# fetch unique values
# write insert function
# feed unique values into insert function
conn_norm.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 conn_norm.close()

NameError: name 'conn_norm' is not defined
conn_norm = create_connection('players_norm.db', True)
create_table_sql = """CREATE TABLE IF NOT EXISTS[Clubs] (
    [Club] TEXT NOT NULL PRIMARY KEY
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
def insert_club(conn, values):
    sql = """INSERT INTO Clubs(Club)
                VALUES(?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid


with conn_norm:
    for club in clubs:
        insert_club(conn_norm, (club,))
sql_statement = "SELECT DISTINCT CLUB FROM Clubs"
rows = execute_sql_statement(sql_statement, conn_norm)
print(rows)
[('ECU',), ('VAN',), ('COL',), ('ROC',), ('CHI',), ('LAFC',), ('USA',), ('HAI',), ('RSL',), ('SJ',), ('CHV',), ('MTL',), ('PAN',), ('CIN',), ('NSH',), ('POR',), ('DC',), ('MIN',), ('HOU',), ('SEA',), ('KC',), ('NYC',), ('TOR',), ('LFC',), ('ORL',), ('RBNY',), ('MET',), ('SKC',), ('GHA',), ('NYR',), ('TB',), ('LA',), ('PHI',), ('CIV',), ('SLV',), ('MCF',), ('CAN',), ('MTQ',), ('MIA',), ('JAM',), ('DAL',), ('NE',), ('HON',), ('CLB',), ('ATL',), ('NY',)]
sql_statement = "SELECT DISTINCT POS FROM players"
rows = execute_sql_statement(sql_statement, conn)
rows = filter(lambda row: row[0], rows)
positions = list(map(lambda row: row[0].strip(), rows))
positions = list(set(positions))
create_table_sql = """CREATE TABLE IF NOT EXISTS[Positions] (
    [Position] TEXT NOT NULL PRIMARY KEY
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
def insert_position(conn, values):
    sql = """INSERT INTO Positions(Position)
                VALUES(?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid


with conn_norm:
    for position in positions:
        insert_position(conn_norm, (position,))
sql_statement = "SELECT DISTINCT Year FROM players"
rows = execute_sql_statement(sql_statement, conn)
rows = filter(lambda row: row[0], rows)
years = list(map(lambda row: int(row[0]), rows))
years = list(set(years))
create_table_sql = """CREATE TABLE IF NOT EXISTS[Years] (
    [Year] INTEGER NOT NULL PRIMARY KEY
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
def insert_year(conn, values):
    sql = """INSERT INTO Years(Year)
                VALUES(?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid


with conn_norm:
    for year in years:
        insert_year(conn_norm, (year,))
sql_statement = "SELECT DISTINCT season FROM players"
rows = execute_sql_statement(sql_statement, conn)
rows = filter(lambda row: row[0], rows)
seasons = list(map(lambda row: row[0].strip(), rows))
seasons = list(set(seasons))
create_table_sql = """CREATE TABLE IF NOT EXISTS[Seasons] (
    [Season] TEXT NOT NULL PRIMARY KEY
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
def insert_season(conn, values):
    sql = """INSERT INTO Seasons(season)
                VALUES(?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid


with conn_norm:
    for season in seasons:
        insert_season(conn_norm, (season,))
sql_statement = "SELECT DISTINCT player FROM players"
rows = execute_sql_statement(sql_statement, conn)
players = []
for row in rows:
    row = row[0].strip()
    name = row.split(' ', 1)
    if len(name) == 2:
        first, last = name
        first = first.strip()
        last = last.strip()
        players.append((first, last))
    elif len(name) == 1:
        first = name[0]
        players.append((first, None))
create_table_sql = """CREATE TABLE IF NOT EXISTS[Players] (
    [PlayerID] INTEGER NOT NULL PRIMARY KEY,
    [FirstName] TEXT NOT NULL,
    [LastName] TEXT
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
def insert_player(conn, values):
    sql = """INSERT INTO Players(FirstName, LastName)
                VALUES(?, ?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid


with conn_norm:
    for player in players:
        insert_player(conn_norm, player)
cur = conn_norm.cursor()
cur.execute('DROP TABLE PlayerStats')
---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
Cell In[26], line 2
      1 cur = conn_norm.cursor()
----> 2 cur.execute('DROP TABLE PlayerStats')

OperationalError: no such table: PlayerStats
create_table_sql = """CREATE TABLE IF NOT EXISTS[PlayerStats] (
    [StatID] INTEGER NOT NULL PRIMARY KEY,
    [PlayerID] INTEGER NOT NULL,
    [Club] TEXT,
    [Position] TEXT NOT NULL,
    [GP] INTEGER NOT NULL,
    [GS] INTEGER NOT NULL,
    [MIN] INTEGER NOT NULL,
    [Year] INTEGER NOT NULL,
    [Season] TEXT NOT NULL,
    FOREIGN KEY(PlayerID) REFERENCES Players(PlayerID),
    FOREIGN KEY(Club) REFERENCES Clubs(Club),
    FOREIGN KEY(Year) REFERENCES Years(Year),
    FOREIGN KEY(Season) REFERENCES Seasons(Season)     
);
"""

create_table(conn_norm, create_table_sql) # doesn't require commit
sql_statement = "SELECT DISTINCT * FROM players"
rows = execute_sql_statement(sql_statement, conn)
rows[:10]
[('Roy Lassiter', 'TB ', 'F', 30, 30, 2580, 1996, 'reg'),
 ('Raul Diaz Arce', 'DC ', 'F', 28, 28, 2351, 1996, 'reg'),
 ('Eduardo Hurtado', 'LA ', 'F', 26, 26, 2323, 1996, 'reg'),
 ('Preki', 'KC ', 'M', 32, 32, 2880, 1996, 'reg'),
 ('Brian McBride', 'CLB', 'F', 28, 28, 2307, 1996, 'reg'),
 ('Steve Rammel', 'DC ', 'F', 26, 20, 1649, 1996, 'reg'),
 ('Vitalis Takawira', 'KC ', 'F', 28, 25, 2233, 1996, 'reg'),
 ('Paul Bravo', 'SJ ', 'M', 31, 31, 2654, 1996, 'reg'),
 ('Jason Kreis', 'DAL', 'M', 31, 29, 2564, 1996, 'reg'),
 ('Giovanni Savarese', 'MET', 'F', 26, 15, 1576, 1996, 'reg')]
sql_statement = "SELECT * FROM players"
rows = execute_sql_statement(sql_statement, conn_norm)
player_lookup_dict = {}
for row in rows:
    pid, first, last = row
    if last:
        name = f'{first} {last}'
        player_lookup_dict[name] = pid
    else:
        player_lookup_dict[first] = pid
        
sql_statement = "SELECT DISTINCT * FROM players"
rows = execute_sql_statement(sql_statement, conn)


def insert_playerstats(conn, values):
    sql = """INSERT INTO PlayerStats(PlayerID, Club, position, gp, gs, min, year, season)
                VALUES(?, ?, ?, ?, ?, ?, ?,?)"""
    cur = conn.cursor()
    cur.execute(sql, values)
    return cur.lastrowid

with conn_norm:
    for row in rows:
        row = list(row)
        pid = player_lookup_dict[row[0].strip()]            
        row[0] = pid
        if not row[1]:
            continue
        row[1] = row[1].strip()
        insert_playerstats(conn_norm, row)