ArcticDB_pythagorean_won_loss_formula_notebook

No description has been provided for this image

Pythagorean Won Loss Formula Notebook¶

Initial setup

In [ ]:

Copied!





try:
    import google.colab
    import requests
    exec(requests.get(f"https://raw.githubusercontent.com/man-group/ArcticDB/master/docs/mkdocs/docs/notebooks/styling.py").text)
    dataFile = f'https://raw.githubusercontent.com/man-group/ArcticDB/master/docs/mkdocs/docs/notebooks/data/2025-08-26-sports.csv'
except ImportError:
    from styling import *
    dataFile = "data/2025-08-26-sports.csv"
try:
    import google.colab
    import requests
    exec(requests.get(f"https://raw.githubusercontent.com/man-group/ArcticDB/master/docs/mkdocs/docs/notebooks/styling.py").text)
    dataFile = f'https://raw.githubusercontent.com/man-group/ArcticDB/master/docs/mkdocs/docs/notebooks/data/2025-08-26-sports.csv'
except ImportError:
    from styling import *
    dataFile = "data/2025-08-26-sports.csv"

In [2]:

Copied!

%pip -q install arcticdb pandas numpy statsmodels plotly scikit-learn
import arcticdb as adb
adb.__version__
%pip -q install arcticdb pandas numpy statsmodels plotly scikit-learn
import arcticdb as adb
adb.__version__

Note: you may need to restart the kernel to use updated packages.

Out[2]:

'6.1.0'

In [3]:

Copied!





import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

colors = setup_plotly_theme()
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

colors = setup_plotly_theme()

In [4]:

Copied!

df = pd.read_csv(dataFile)
df = pd.read_csv(dataFile)

In [5]:

Copied!





df = df.drop(columns="Unnamed: 0")
df = df.rename(columns={'FREQ_RUNS_S': 'PS',
                        'FREQ_RUNS_A': 'PA' })

df = df.sort_values(['LEAGUE','YEAR_ID','TEAM_ID'])
styled = style_table(df.head())
export_table(styled, "basic_data")
styled
df = df.drop(columns="Unnamed: 0")
df = df.rename(columns={'FREQ_RUNS_S': 'PS',
                        'FREQ_RUNS_A': 'PA' })

df = df.sort_values(['LEAGUE','YEAR_ID','TEAM_ID'])
styled = style_table(df.head())
export_table(styled, "basic_data")
styled

Exported: basic_data.csv

Out[5]:

YEAR_ID	TEAM_ID	OPP_TEAM_ID	PS	PA	date	LEAGUE
1897	Carlton	Fitzroy	16	49	1897-05-08	AFL
1897	Carlton	South Melbourne	36	40	1897-05-15	AFL
1897	Carlton	Essendon	41	78	1897-05-24	AFL
1897	Carlton	Geelong	22	44	1897-05-29	AFL
1897	Carlton	Melbourne	26	107	1897-06-05	AFL

In [6]:

Copied!





# Create a head/tail display of the full dataset
styled_head_tail = style_head_tail_table(df, head_rows=10, tail_rows=10)
export_table(styled_head_tail, "head_tail_data", "Sports Data - First 10 and Last 10 Rows")
styled_head_tail
# Create a head/tail display of the full dataset
styled_head_tail = style_head_tail_table(df, head_rows=10, tail_rows=10)
export_table(styled_head_tail, "head_tail_data", "Sports Data - First 10 and Last 10 Rows")
styled_head_tail

Exported: head_tail_data.csv

Out[6]:

853152 rows × 9 columns
YEAR_ID	TEAM_ID	OPP_TEAM_ID	PS	PA	date	home	wi	LEAGUE
1897	Carlton	Fitzroy	16	49	1897-05-08	0	0.000000	AFL
1897	Carlton	South Melbourne	36	40	1897-05-15	0	0.000000	AFL
1897	Carlton	Essendon	41	78	1897-05-24	0	0.000000	AFL
1897	Carlton	Geelong	22	44	1897-05-29	0	0.000000	AFL
1897	Carlton	Melbourne	26	107	1897-06-05	0	0.000000	AFL
1897	Carlton	St Kilda	35	24	1897-06-19	0	1.000000	AFL
1897	Carlton	Collingwood	35	41	1897-06-22	1	0.000000	AFL
1897	Carlton	Fitzroy	9	47	1897-06-26	1	0.000000	AFL
1897	Carlton	South Melbourne	28	74	1897-07-03	1	0.000000	AFL
1897	Carlton	Essendon	13	42	1897-07-10	1	0.000000	AFL
...	...	...	...	...	...	...	...	...
2019	Waratahs	Blues	29	32	2019-04-06	0	0.000000	SUP
2019	Waratahs	Rebels	23	20	2019-04-20	1	1.000000	SUP
2019	Waratahs	Sharks	15	23	2019-04-27	1	0.000000	SUP
2019	Waratahs	Bulls	21	28	2019-05-04	0	0.000000	SUP
2019	Waratahs	Lions	28	29	2019-05-11	0	0.000000	SUP
2019	Waratahs	Reds	40	32	2019-05-18	0	1.000000	SUP
2019	Waratahs	Jaguares	15	23	2019-05-25	1	0.000000	SUP
2019	Waratahs	Rebels	20	15	2019-05-31	0	1.000000	SUP
2019	Waratahs	Brumbies	24	35	2019-06-08	1	0.000000	SUP
2019	Waratahs	Highlanders	12	49	2019-06-14	0	0.000000	SUP

In [7]:

Copied!





arctic = adb.Arctic('lmdb://arcticdb_sports')
lib = arctic.get_library('leagues', create_if_missing=True)
for league in df['LEAGUE'].unique():
    lib.write(league, df[df['LEAGUE']==league])
lib.list_symbols()
arctic = adb.Arctic('lmdb://arcticdb_sports')
lib = arctic.get_library('leagues', create_if_missing=True)
for league in df['LEAGUE'].unique():
    lib.write(league, df[df['LEAGUE']==league])
lib.list_symbols()

Out[7]:

['MLB', 'NFL', 'AFL', 'EPL', 'SUP', 'NHL', 'LAX', 'IPL', 'NBA']

In [8]:

Copied!

style_head_tail_table(lib.read('LAX').data, head_rows=5, tail_rows=5)
style_head_tail_table(lib.read('LAX').data, head_rows=5, tail_rows=5)

Out[8]:

10006 rows × 9 columns
YEAR_ID	TEAM_ID	OPP_TEAM_ID	PS	PA	date	home	wi	LEAGUE
2010	Air Force	St. John's (NY)	7	10	2010-02-28	1	0.000000	LAX
2010	Air Force	Lehigh	13	14	2010-03-06	1	0.000000	LAX
2010	Air Force	Penn	7	8	2010-03-07	1	0.000000	LAX
2010	Air Force	Army West Point	8	7	2010-03-13	1	1.000000	LAX
2010	Air Force	Loyola Maryland	3	18	2010-03-20	0	0.000000	LAX
...	...	...	...	...	...	...	...	...
2019	Yale	Penn	11	12	2019-05-05	1	0.000000	LAX
2019	Yale	Georgetown	19	16	2019-05-11	1	1.000000	LAX
2019	Yale	Penn	19	18	2019-05-19	1	1.000000	LAX
2019	Yale	Penn St.	21	17	2019-05-25	1	1.000000	LAX
2019	Yale	Virginia	9	13	2019-05-27	1	0.000000	LAX

In [9]:

Copied!





def ols_coef (df, xcol, ycol):
    """simple OLS function used for lambda calcs -- assumes univariate regression"""
    model=sm.OLS(df[ycol],sm.add_constant(df[xcol])).fit()    

    outdf = pd.DataFrame({'adj.'    : model.params.iloc[0],
                          'adj. se' : model.bse.iloc[0],
                          'λ'    : model.params.iloc[1],
                          'λ se' : model.bse.iloc[1],
                          'nobs' : model.nobs,
                          'adj. $R^2$': model.rsquared_adj}, index=[0])    

    return outdf
def ols_coef (df, xcol, ycol):
    """simple OLS function used for lambda calcs -- assumes univariate regression"""
    model=sm.OLS(df[ycol],sm.add_constant(df[xcol])).fit()    

    outdf = pd.DataFrame({'adj.'    : model.params.iloc[0],
                          'adj. se' : model.bse.iloc[0],
                          'λ'    : model.params.iloc[1],
                          'λ se' : model.bse.iloc[1],
                          'nobs' : model.nobs,
                          'adj. $R^2$': model.rsquared_adj}, index=[0])    

    return outdf

Our study period is from 2010 to 2019

In [10]:

Copied!

st_year_ = 2010
end_year_ = 2019
st_year_ = 2010
end_year_ = 2019

Connect to ArcticDB and get list of leagues¶

In [11]:

Copied!

league_data = arctic.get_library('leagues')
leagues = league_data.list_symbols()
leagues
league_data = arctic.get_library('leagues')
leagues = league_data.list_symbols()
leagues

Out[11]:

['MLB', 'NFL', 'AFL', 'EPL', 'SUP', 'NHL', 'LAX', 'IPL', 'NBA']

In [ ]:

Copied!





# Get points scored data for all games, 2010-2019 seasons  
lazy_dfs = league_data.read_batch(leagues, lazy=True)
lazy_dfs = lazy_dfs[(lazy_dfs['YEAR_ID']>=st_year_) & (lazy_dfs['YEAR_ID']<=end_year_)]
lazy_dfs = adb.concat(lazy_dfs)
points_data = lazy_dfs.collect().data[['LEAGUE', 'PS']]

# Define league order to match Figure 1 layout (3x3 grid, alphabetical)
league_order = ['AFL', 'EPL', 'IPL', 'LAX', 'MLB', 'NBA', 'NFL', 'NHL', 'SUP']

# Create and display interactive Plotly version with adaptive binning
fig = create_plotly_histplot(
    data=points_data,
    x_col='PS',
    col_col='LEAGUE',
    col_order=league_order,  # Same order as seaborn
    col_wrap=3,  # 3x3 grid layout
    adaptive_bins=True  # Use adaptive binning to fix EPL/NHL gaps
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "figure1_points_distributions", "Figure 1 - Points per game distributions")
# Get points scored data for all games, 2010-2019 seasons  
lazy_dfs = league_data.read_batch(leagues, lazy=True)
lazy_dfs = lazy_dfs[(lazy_dfs['YEAR_ID']>=st_year_) & (lazy_dfs['YEAR_ID']<=end_year_)]
lazy_dfs = adb.concat(lazy_dfs)
points_data = lazy_dfs.collect().data[['LEAGUE', 'PS']]

# Define league order to match Figure 1 layout (3x3 grid, alphabetical)
league_order = ['AFL', 'EPL', 'IPL', 'LAX', 'MLB', 'NBA', 'NFL', 'NHL', 'SUP']

# Create and display interactive Plotly version with adaptive binning
fig = create_plotly_histplot(
    data=points_data,
    x_col='PS',
    col_col='LEAGUE',
    col_order=league_order,  # Same order as seaborn
    col_wrap=3,  # 3x3 grid layout
    adaptive_bins=True  # Use adaptive binning to fix EPL/NHL gaps
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "figure1_points_distributions", "Figure 1 - Points per game distributions")

Exported: exports/figure1_points_distributions.json

In [13]:

Copied!





# points scored for all games
lazy_dfs = league_data.read_batch(leagues, lazy=True)
lazy_dfs = lazy_dfs[(lazy_dfs['YEAR_ID']>=st_year_) & (lazy_dfs['YEAR_ID']<=end_year_)]
lazy_dfs = adb.concat(lazy_dfs)
points_scored = lazy_dfs.collect().data[['LEAGUE', 'PS']]
style_table(points_scored)
# points scored for all games
lazy_dfs = league_data.read_batch(leagues, lazy=True)
lazy_dfs = lazy_dfs[(lazy_dfs['YEAR_ID']>=st_year_) & (lazy_dfs['YEAR_ID']<=end_year_)]
lazy_dfs = adb.concat(lazy_dfs)
points_scored = lazy_dfs.collect().data[['LEAGUE', 'PS']]
style_table(points_scored)

Out[13]:

LEAGUE	PS
MLB	6
MLB	3
MLB	2
MLB	1
MLB	4
MLB	4
MLB	4
MLB	5
MLB	5
MLB	2
MLB	7
MLB	6
MLB	3
MLB	2
MLB	6
MLB	3
MLB	4
MLB	6
MLB	1
MLB	8

Figure 1 calculate points-scored stats¶

check positive skew comment in text
note need help to plot via python like r panel plot

In [14]:

Copied!





point_stats = points_scored.groupby(['LEAGUE']).agg(Mean=('PS', 'mean'),
                                             Std=('PS', 'std'),
                                             Skew=('PS', 'skew'),
                                             nobs=('PS', 'count'))
point_stats['IR']=point_stats['Mean']/point_stats['Std']
point_stats = point_stats.sort_values(['IR'])
style_table(point_stats.round(2), hide_index=False)
point_stats = points_scored.groupby(['LEAGUE']).agg(Mean=('PS', 'mean'),
                                             Std=('PS', 'std'),
                                             Skew=('PS', 'skew'),
                                             nobs=('PS', 'count'))
point_stats['IR']=point_stats['Mean']/point_stats['Std']
point_stats = point_stats.sort_values(['IR'])
style_table(point_stats.round(2), hide_index=False)

Out[14]:

	Mean	Std	Skew	nobs	IR
LEAGUE
EPL	1.380000	1.260000	1.030000	7600	1.090000
MLB	4.390000	3.100000	1.000000	48594	1.420000
NHL	2.810000	1.640000	0.420000	23744	1.710000
SUP	25.100000	11.970000	0.880000	2402	2.100000
NFL	22.640000	10.180000	0.290000	5120	2.230000
LAX	10.260000	3.980000	0.500000	10006	2.580000
AFL	88.300000	27.750000	0.480000	3892	3.180000
IPL	163.270000	31.420000	0.160000	1274	5.200000
NBA	102.240000	12.660000	0.180000	24118	8.080000

In [ ]:

Copied!





# Create and display interactive Plotly version with same layout
fig = create_plotly_catplot(
    data=point_stats.reset_index().melt(['LEAGUE']),
    x_col="LEAGUE",
    y_col="value",
    hue_col="LEAGUE",
    col_col="variable",
    col_wrap=3,  # Same as seaborn version
    sharey=False
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "point_stats_catplot", "Points Scored Statistics")
# Create and display interactive Plotly version with same layout
fig = create_plotly_catplot(
    data=point_stats.reset_index().melt(['LEAGUE']),
    x_col="LEAGUE",
    y_col="value",
    hue_col="LEAGUE",
    col_col="variable",
    col_wrap=3,  # Same as seaborn version
    sharey=False
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "point_stats_catplot", "Points Scored Statistics")

Exported: exports/point_stats_catplot.json

Save into ArcticDB¶

In [16]:

Copied!

analysis = arctic.get_library('analysis', create_if_missing=True)
analysis.write('point_stats', point_stats)
analysis = arctic.get_library('analysis', create_if_missing=True)
analysis.write('point_stats', point_stats)

Out[16]:

VersionedItem(symbol='point_stats', library='analysis', data=n/a, version=0, metadata=None, host='LMDB(path=/home/nick/source/customers/man-group/ArcticDB/docs/mkdocs/docs/notebooks/arcticdb_sports)', timestamp=1756225340782814206)

In [17]:

Copied!





lazy_dfs = league_data.read_batch(leagues, lazy=True)
df = adb.concat(lazy_dfs).collect().data[['LEAGUE','YEAR_ID','TEAM_ID','PS','PA','wi']]
df_team = df.groupby(['LEAGUE','YEAR_ID','TEAM_ID']).agg(PS=('PS', 'mean'),
                                                         PA=('PA', 'mean'),
                                                         wi=('wi', 'mean'),
                                                         nobs=('wi', 'count'))
style_table(df_team.round(2).head(),hide_index=False)
lazy_dfs = league_data.read_batch(leagues, lazy=True)
df = adb.concat(lazy_dfs).collect().data[['LEAGUE','YEAR_ID','TEAM_ID','PS','PA','wi']]
df_team = df.groupby(['LEAGUE','YEAR_ID','TEAM_ID']).agg(PS=('PS', 'mean'),
                                                         PA=('PA', 'mean'),
                                                         wi=('wi', 'mean'),
                                                         nobs=('wi', 'count'))
style_table(df_team.round(2).head(),hide_index=False)

Out[17]:

			PS	PA	wi	nobs
LEAGUE	YEAR_ID	TEAM_ID
AFL	1897	Carlton	26.860000	52.640000	0.140000	14
		Collingwood	36.360000	31.790000	0.640000	14
		Essendon	50.430000	31.790000	0.790000	14
		Fitzroy	36.360000	34.640000	0.320000	14
		Geelong	50.290000	27.360000	0.790000	14

Figure 2 -- NBA and EPL¶

In [ ]:

Copied!





# prepare data for regression
df_reg = df_team.copy()
df_reg = df_reg.loc[(df_reg['PS']>0) & (df_reg['PA']>0) & (df_reg['wi']>0) & (df_reg['wi']<1)]
df_reg['lnwi']=np.log(df_reg['wi']/(1-df_reg['wi']))
df_reg['lnPSPA']=np.log(df_reg['PS']/df_reg['PA'])
df_reg = df_reg.reset_index()

# Create static seaborn plot
NBA_and_EPL = df_reg[df_reg['LEAGUE'].isin(['NBA', 'EPL'])]

# Create and display interactive Plotly version
fig = create_plotly_scatterplot(
    data=NBA_and_EPL,
    x_col='lnPSPA',
    y_col='lnwi', 
    hue_col='LEAGUE',
    show_regression=True
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "team_season_plot", "Team-Season Analysis")
# prepare data for regression
df_reg = df_team.copy()
df_reg = df_reg.loc[(df_reg['PS']>0) & (df_reg['PA']>0) & (df_reg['wi']>0) & (df_reg['wi']<1)]
df_reg['lnwi']=np.log(df_reg['wi']/(1-df_reg['wi']))
df_reg['lnPSPA']=np.log(df_reg['PS']/df_reg['PA'])
df_reg = df_reg.reset_index()

# Create static seaborn plot
NBA_and_EPL = df_reg[df_reg['LEAGUE'].isin(['NBA', 'EPL'])]

# Create and display interactive Plotly version
fig = create_plotly_scatterplot(
    data=NBA_and_EPL,
    x_col='lnPSPA',
    y_col='lnwi', 
    hue_col='LEAGUE',
    show_regression=True
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "team_season_plot", "Team-Season Analysis")

Exported: exports/team_season_plot.json

In [19]:

Copied!





# Compute standard regression results by team-season for 2010-19 period
df_ols = df_reg.loc[(df_reg['YEAR_ID']>=st_year_) & (df_reg['YEAR_ID']<=end_year_)]
ols=df_ols.groupby(['LEAGUE']).apply(ols_coef,xcol=['lnPSPA'],ycol=['lnwi'])
ols = ols.droplevel(1).sort_values(['λ']).reset_index().round(2)
style_table(ols, hide_index=False)
# Compute standard regression results by team-season for 2010-19 period
df_ols = df_reg.loc[(df_reg['YEAR_ID']>=st_year_) & (df_reg['YEAR_ID']<=end_year_)]
ols=df_ols.groupby(['LEAGUE']).apply(ols_coef,xcol=['lnPSPA'],ycol=['lnwi'])
ols = ols.droplevel(1).sort_values(['λ']).reset_index().round(2)
style_table(ols, hide_index=False)

/tmp/ipykernel_42612/1625533100.py:3: FutureWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[19]:

	LEAGUE	adj.	adj. se	λ	λ se	nobs	adj. $R^2$
0	EPL	-0.150000	0.010000	1.240000	0.020000	200.000000	0.940000
1	MLB	-0.000000	0.010000	1.780000	0.040000	300.000000	0.890000
2	NHL	0.240000	0.010000	2.060000	0.040000	302.000000	0.910000
3	SUP	-0.030000	0.030000	2.600000	0.090000	154.000000	0.840000
4	NFL	0.020000	0.020000	2.840000	0.080000	319.000000	0.810000
5	LAX	-0.020000	0.010000	3.170000	0.050000	655.000000	0.840000
6	AFL	-0.020000	0.020000	3.570000	0.090000	177.000000	0.900000
7	IPL	-0.020000	0.040000	6.790000	0.650000	84.000000	0.570000
8	NBA	-0.000000	0.010000	14.320000	0.210000	300.000000	0.940000

Save into ArcticDB¶

In [20]:

Copied!

analysis.write('ols', ols);
analysis.write('ols', ols);

Figure 3a -- 2010 to 2019¶

In [ ]:

Copied!





# Prepare data for interactive Plotly version with error bars
ols_melted = ols.melt(['LEAGUE'])
# Add error column - only λ has error bars (2 * standard error)
ols_melted['error'] = ols_melted.apply(lambda row: 
    ols.loc[ols['LEAGUE'] == row['LEAGUE'], 'λ se'].iloc[0] * 2 
    if row['variable'] == 'λ' else None, axis=1)

# Create and display interactive Plotly version
fig = create_plotly_catplot(
    data=ols_melted,
    x_col="LEAGUE",
    y_col="value",
    hue_col="LEAGUE",
    col_col="variable",
    col_order=["λ", "nobs", "adj. $R^2$"],
    sharey=False,
    error_col="error",
    add_value_labels=True  # Show values on R^2 bars like the seaborn version
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "ols_results_catplot", "OLS Results by League")
# Prepare data for interactive Plotly version with error bars
ols_melted = ols.melt(['LEAGUE'])
# Add error column - only λ has error bars (2 * standard error)
ols_melted['error'] = ols_melted.apply(lambda row: 
    ols.loc[ols['LEAGUE'] == row['LEAGUE'], 'λ se'].iloc[0] * 2 
    if row['variable'] == 'λ' else None, axis=1)

# Create and display interactive Plotly version
fig = create_plotly_catplot(
    data=ols_melted,
    x_col="LEAGUE",
    y_col="value",
    hue_col="LEAGUE",
    col_col="variable",
    col_order=["λ", "nobs", "adj. $R^2$"],
    sharey=False,
    error_col="error",
    add_value_labels=True  # Show values on R^2 bars like the seaborn version
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "ols_results_catplot", "OLS Results by League")

Exported: exports/ols_results_catplot.json

In [ ]:

Copied!





# Create and display interactive Plotly versions

# 1. Lambda estimates with error bars
fig1 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='λ',
    title='Figure 3a. Estimates of λ by League, 2010-2019 Seasons',
    error_col='λ se',  # Error bars (will be multiplied by 2 in the function)
    add_value_labels=False
)
# Multiply error bars by 2 to match matplotlib version
fig1.data[0].error_y.array = fig1.data[0].error_y.array * 2
fig1.show()
export_plot(fig1, "lambda_estimates", "Lambda Estimates by League")

# 2. Number of observations
fig2 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='nobs',
    title='Figure 3a. Number of Obs by League, 2010-2019 Seasons',
    add_value_labels=False
)
# Use secondary color
fig2.data[0].marker.color = colors['secondary_color']
fig2.show()
export_plot(fig2, "number_observations", "Number of Observations by League")

# 3. Adjusted R-squared with value labels
fig3 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='adj. $R^2$',
    add_value_labels=True
)
# Use accent color
fig3.data[0].marker.color = colors['accent_color']
fig3.show()
export_plot(fig3, "adjusted_rsquared", "Adjusted R-squared by League")

# Create and display interactive Plotly versions

# 1. Lambda estimates with error bars
fig1 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='λ',
    title='Figure 3a. Estimates of λ by League, 2010-2019 Seasons',
    error_col='λ se',  # Error bars (will be multiplied by 2 in the function)
    add_value_labels=False
)
# Multiply error bars by 2 to match matplotlib version
fig1.data[0].error_y.array = fig1.data[0].error_y.array * 2
fig1.show()
export_plot(fig1, "lambda_estimates", "Lambda Estimates by League")

# 2. Number of observations
fig2 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='nobs',
    title='Figure 3a. Number of Obs by League, 2010-2019 Seasons',
    add_value_labels=False
)
# Use secondary color
fig2.data[0].marker.color = colors['secondary_color']
fig2.show()
export_plot(fig2, "number_observations", "Number of Observations by League")

# 3. Adjusted R-squared with value labels
fig3 = create_plotly_bar_chart(
    data=ols,
    x_col='LEAGUE',
    y_col='adj. $R^2$',
    add_value_labels=True
)
# Use accent color
fig3.data[0].marker.color = colors['accent_color']
fig3.show()
export_plot(fig3, "adjusted_rsquared", "Adjusted R-squared by League")

Exported: exports/lambda_estimates.json

Exported: exports/number_observations.json

Exported: exports/adjusted_rsquared.json

In [ ]:

Copied!





# Figure 3a - Single Lambda Plot with Error Bars and N Values
# Sort by lambda values (ascending order)
ols_sorted = ols.sort_values('λ').copy()


# Create interactive Plotly version
fig_plotly = create_plotly_bar_chart(
    data=ols_sorted,
    x_col='LEAGUE',
    y_col='λ',
    error_col='λ se',
    add_value_labels=True
)

# Multiply error bars by 2 to show ±2 standard errors
fig_plotly.data[0].error_y.array = fig_plotly.data[0].error_y.array * 2

# Update x-axis labels to include N values
x_labels_plotly = [f"{row['LEAGUE']}<br>N = {int(row['nobs'])}" for _, row in ols_sorted.iterrows()]
fig_plotly.update_xaxes(
    tickmode='array',
    tickvals=list(range(len(ols_sorted))),
    ticktext=x_labels_plotly
)

# Update layout for better spacing
fig_plotly.update_layout(
    yaxis_title="Fitted λ",
    height=600,
    margin=dict(b=100)  # More bottom margin for N values
)

fig_plotly.show()

# Export the plot
export_plot(fig_plotly, "lambda_estimates_complete", "Lambda Estimates with Error Bars")
# Figure 3a - Single Lambda Plot with Error Bars and N Values
# Sort by lambda values (ascending order)
ols_sorted = ols.sort_values('λ').copy()


# Create interactive Plotly version
fig_plotly = create_plotly_bar_chart(
    data=ols_sorted,
    x_col='LEAGUE',
    y_col='λ',
    error_col='λ se',
    add_value_labels=True
)

# Multiply error bars by 2 to show ±2 standard errors
fig_plotly.data[0].error_y.array = fig_plotly.data[0].error_y.array * 2

# Update x-axis labels to include N values
x_labels_plotly = [f"{row['LEAGUE']}
N = {int(row['nobs'])}" for _, row in ols_sorted.iterrows()]
fig_plotly.update_xaxes(
    tickmode='array',
    tickvals=list(range(len(ols_sorted))),
    ticktext=x_labels_plotly
)

# Update layout for better spacing
fig_plotly.update_layout(
    yaxis_title="Fitted λ",
    height=600,
    margin=dict(b=100)  # More bottom margin for N values
)

fig_plotly.show()

# Export the plot
export_plot(fig_plotly, "lambda_estimates_complete", "Lambda Estimates with Error Bars")

Exported: exports/lambda_estimates_complete.json

Source: Various (listed in Appendix) as at January 2025.

Key: Australian Football League (AFL), English Premier League (EPL), Indian Premier League (IPL), NCAA Men's Division I Lacrosse (LAX), Major League Baseball (MLB), National Basketball Association (NBA), National Football League (NFL), National Hockey League (NHL), Super Rugby Pacific (SUP)

Note: Error bars represent ±2 standard errors around the coefficient estimate. N is the number of team-seasons used to estimate the model.

The λ estimates and errors are shown in Figure 3a. As we've noted, the EPL and NBA sit at the extremes. The model fits all sports well with statistically significant coefficients. The confidence bands around our estimate for the IPL is wider (also adjusted R² is 0.57 for the IPL compared to a range of 0.81 to 0.94 for the other leagues). For those that are interested, the poorer fit for the IPL we believe is due to the limited number of teams each season (8–10) and the truncated game effect in Twenty20 Cricket, discussed in Appendix A, which we can only partially adjust for.

In [ ]:

Copied!





# calculate mean league IRs group by league-season
df_leagueM = df[['LEAGUE','YEAR_ID','PS','PA','wi']].groupby(['LEAGUE','YEAR_ID']).mean()
df_leagueS = df[['LEAGUE','YEAR_ID','PS','PA','wi']].groupby(['LEAGUE','YEAR_ID']).std()
df_leagueIR = df_leagueM['PS']/df_leagueS['PS']
df_leagueIR = df_leagueIR.reset_index().rename(columns={'PS': 'IR'})

# calc 10 year average for 2010-2019
df_leagueIR10 = df_leagueIR.loc[(df_leagueIR['YEAR_ID']>=st_year_) & (df_leagueIR['YEAR_ID']<=end_year_)]
df_leagueIR10 = df_leagueIR10[['LEAGUE','IR']].groupby(['LEAGUE']).mean()
df_leagueIR10 = df_leagueIR10.reset_index().rename(columns={'IR': 'IR10'})
df_leagueIR10 = df_leagueIR10.merge(ols, how='left', on=['LEAGUE'])


# Create and display interactive Plotly version
fig = create_plotly_scatterplot(
    data=df_leagueIR10,
    x_col='IR10',
    y_col='λ',
    show_regression=True,
    text_col='LEAGUE',  # Add league labels to data points
    xlim=(0, 9),        # Match matplotlib axis limits
    ylim=(0, 16)
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "lambda_vs_information_ratio", "Lambda vs Information Ratio")
# calculate mean league IRs group by league-season
df_leagueM = df[['LEAGUE','YEAR_ID','PS','PA','wi']].groupby(['LEAGUE','YEAR_ID']).mean()
df_leagueS = df[['LEAGUE','YEAR_ID','PS','PA','wi']].groupby(['LEAGUE','YEAR_ID']).std()
df_leagueIR = df_leagueM['PS']/df_leagueS['PS']
df_leagueIR = df_leagueIR.reset_index().rename(columns={'PS': 'IR'})

# calc 10 year average for 2010-2019
df_leagueIR10 = df_leagueIR.loc[(df_leagueIR['YEAR_ID']>=st_year_) & (df_leagueIR['YEAR_ID']<=end_year_)]
df_leagueIR10 = df_leagueIR10[['LEAGUE','IR']].groupby(['LEAGUE']).mean()
df_leagueIR10 = df_leagueIR10.reset_index().rename(columns={'IR': 'IR10'})
df_leagueIR10 = df_leagueIR10.merge(ols, how='left', on=['LEAGUE'])


# Create and display interactive Plotly version
fig = create_plotly_scatterplot(
    data=df_leagueIR10,
    x_col='IR10',
    y_col='λ',
    show_regression=True,
    text_col='LEAGUE',  # Add league labels to data points
    xlim=(0, 9),        # Match matplotlib axis limits
    ylim=(0, 16)
)

# Display interactive plot in notebook
fig.show()

# Export both HTML and JSON versions
export_plot(fig, "lambda_vs_information_ratio", "Lambda vs Information Ratio")

Exported: exports/lambda_vs_information_ratio.json

In [25]:

Copied!





# Rolling 10-Year Lambda Analysis
# First, let's check what columns are available in the league data
print("Checking available columns in league data...")
sample_league = league_data.read('MLB').data
print(f"Available columns: {list(sample_league.columns)}")
print(f"Sample data shape: {sample_league.shape}")
print("\nFirst few rows:")
print(sample_league.head())

def calculate_rolling_lambdas(league_name, window_years=10, min_observations=50):
    """
    Calculate rolling lambda estimates for a specific league.
    
    Args:
        league_name: Name of the league
        window_years: Size of rolling window in years
        min_observations: Minimum observations required for regression
    
    Returns:
        DataFrame with year, lambda, std_error, adj_r2, nobs
    """
    # Get league data from ArcticDB
    league_df = league_data.read(league_name).data
    
    # Check if the columns exist, if not use the renamed columns from processed data
    if 'FREQ_RUNS_S' in league_df.columns:
        ps_col = 'FREQ_RUNS_S'
        pa_col = 'FREQ_RUNS_A'
    else:
        ps_col = 'PS'
        pa_col = 'PA'
    
    # Filter for team-season data (same processing as before)
    team_season = league_df.groupby(['YEAR_ID','TEAM_ID']).agg(
        PS=(ps_col, 'mean'),
        PA=(pa_col, 'mean'),
        wi=('wi', 'mean'),
        nobs=('wi', 'count')
    ).reset_index()
    
    # Filter valid observations
    team_season = team_season.loc[
        (team_season['PS'] > 0) & 
        (team_season['PA'] > 0) & 
        (team_season['wi'] > 0) & 
        (team_season['wi'] < 1)
    ]
    
    # Create log variables
    team_season['lnwi'] = np.log(team_season['wi'] / (1 - team_season['wi']))
    team_season['lnPSPA'] = np.log(team_season['PS'] / team_season['PA'])
    
    # Get year range
    years = sorted(team_season['YEAR_ID'].unique())
    start_year = min(years) + window_years - 1  # First complete window
    end_year = max(years)
    
    results = []
    
    for year in range(start_year, end_year + 1):
        # Define window
        window_start = year - window_years + 1
        window_data = team_season[
            (team_season['YEAR_ID'] >= window_start) & 
            (team_season['YEAR_ID'] <= year)
        ]
        
        if len(window_data) >= min_observations:
            try:
                # Run OLS regression
                model = sm.OLS(window_data['lnwi'], 
                              sm.add_constant(window_data['lnPSPA'])).fit()
                
                results.append({
                    'year': year,
                    'lambda': model.params.iloc[1],
                    'lambda_se': model.bse.iloc[1],
                    'adj_r2': model.rsquared_adj,
                    'nobs': len(window_data)
                })
            except:
                # Skip if regression fails
                continue
    
    return pd.DataFrame(results)

# Calculate rolling lambdas for leagues shown in the reference chart
target_leagues = ['AFL', 'EPL', 'MLB', 'NBA', 'NFL', 'NHL']  # Matching the 2x3 layout
rolling_results = {}

print("\nCalculating rolling 10-year lambda estimates...")
for league in target_leagues:
    print(f"Processing {league}...")
    rolling_results[league] = calculate_rolling_lambdas(league)
    print(f"  {league}: {len(rolling_results[league])} data points")

print("Rolling lambda calculations complete!")
# Rolling 10-Year Lambda Analysis
# First, let's check what columns are available in the league data
print("Checking available columns in league data...")
sample_league = league_data.read('MLB').data
print(f"Available columns: {list(sample_league.columns)}")
print(f"Sample data shape: {sample_league.shape}")
print("\nFirst few rows:")
print(sample_league.head())

def calculate_rolling_lambdas(league_name, window_years=10, min_observations=50):
    """
    Calculate rolling lambda estimates for a specific league.
    
    Args:
        league_name: Name of the league
        window_years: Size of rolling window in years
        min_observations: Minimum observations required for regression
    
    Returns:
        DataFrame with year, lambda, std_error, adj_r2, nobs
    """
    # Get league data from ArcticDB
    league_df = league_data.read(league_name).data
    
    # Check if the columns exist, if not use the renamed columns from processed data
    if 'FREQ_RUNS_S' in league_df.columns:
        ps_col = 'FREQ_RUNS_S'
        pa_col = 'FREQ_RUNS_A'
    else:
        ps_col = 'PS'
        pa_col = 'PA'
    
    # Filter for team-season data (same processing as before)
    team_season = league_df.groupby(['YEAR_ID','TEAM_ID']).agg(
        PS=(ps_col, 'mean'),
        PA=(pa_col, 'mean'),
        wi=('wi', 'mean'),
        nobs=('wi', 'count')
    ).reset_index()
    
    # Filter valid observations
    team_season = team_season.loc[
        (team_season['PS'] > 0) & 
        (team_season['PA'] > 0) & 
        (team_season['wi'] > 0) & 
        (team_season['wi'] < 1)
    ]
    
    # Create log variables
    team_season['lnwi'] = np.log(team_season['wi'] / (1 - team_season['wi']))
    team_season['lnPSPA'] = np.log(team_season['PS'] / team_season['PA'])
    
    # Get year range
    years = sorted(team_season['YEAR_ID'].unique())
    start_year = min(years) + window_years - 1  # First complete window
    end_year = max(years)
    
    results = []
    
    for year in range(start_year, end_year + 1):
        # Define window
        window_start = year - window_years + 1
        window_data = team_season[
            (team_season['YEAR_ID'] >= window_start) & 
            (team_season['YEAR_ID'] <= year)
        ]
        
        if len(window_data) >= min_observations:
            try:
                # Run OLS regression
                model = sm.OLS(window_data['lnwi'], 
                              sm.add_constant(window_data['lnPSPA'])).fit()
                
                results.append({
                    'year': year,
                    'lambda': model.params.iloc[1],
                    'lambda_se': model.bse.iloc[1],
                    'adj_r2': model.rsquared_adj,
                    'nobs': len(window_data)
                })
            except:
                # Skip if regression fails
                continue
    
    return pd.DataFrame(results)

# Calculate rolling lambdas for leagues shown in the reference chart
target_leagues = ['AFL', 'EPL', 'MLB', 'NBA', 'NFL', 'NHL']  # Matching the 2x3 layout
rolling_results = {}

print("\nCalculating rolling 10-year lambda estimates...")
for league in target_leagues:
    print(f"Processing {league}...")
    rolling_results[league] = calculate_rolling_lambdas(league)
    print(f"  {league}: {len(rolling_results[league])} data points")

print("Rolling lambda calculations complete!")

Checking available columns in league data...
Available columns: ['YEAR_ID', 'TEAM_ID', 'OPP_TEAM_ID', 'PS', 'PA', 'date', 'home', 'wi', 'LEAGUE']
Sample data shape: (436786, 9)

First few rows:
        YEAR_ID TEAM_ID OPP_TEAM_ID  PS  PA        date  home   wi LEAGUE
138750     1871     ATL         WS3  20  18  1871-05-05     0  1.0    MLB
138751     1871     ATL         TRO   9   5  1871-05-09     0  1.0    MLB
138752     1871     ATL         TRO  14  29  1871-05-16     1  0.0    MLB
138753     1871     ATL         PH1  11   8  1871-05-20     1  1.0    MLB
138754     1871     ATL         WS3   4   4  1871-05-24     1  0.5    MLB

Calculating rolling 10-year lambda estimates...
Processing AFL...
  AFL: 114 data points
Processing EPL...
  EPL: 122 data points
Processing MLB...
  MLB: 140 data points
Processing NBA...
  NBA: 64 data points
Processing NFL...
  NFL: 89 data points
Processing NHL...
  NHL: 93 data points
Rolling lambda calculations complete!

In [ ]:

Copied!





# Define the order to match the reference chart layout
league_layout = ['AFL', 'EPL', 'MLB', 'NBA', 'NFL', 'NHL']

# Create interactive Plotly version
from plotly.subplots import make_subplots

# Set up Plotly subplot structure
fig_plotly = make_subplots(
    rows=2, cols=3,
    subplot_titles=league_layout,
    horizontal_spacing=0.08,
    vertical_spacing=0.12
)

for i, league in enumerate(league_layout):
    row = (i // 3) + 1
    col = (i % 3) + 1
    
    if league in rolling_results and len(rolling_results[league]) > 0:
        data = rolling_results[league]
        
        # Add confidence band
        upper_bound = data['lambda'] + 2 * data['lambda_se']
        lower_bound = data['lambda'] - 2 * data['lambda_se']
        
        # Create filled area for confidence band
        years_extended = list(data['year']) + list(data['year'][::-1])
        bounds_extended = list(upper_bound) + list(lower_bound[::-1])
        
        fig_plotly.add_trace(
            dict(
                type='scatter',
                x=years_extended,
                y=bounds_extended,
                fill='toself',
                fillcolor='rgba(128,128,128,0.3)',
                line=dict(color='rgba(255,255,255,0)'),
                name='±2 SE',
                showlegend=(i == 0),
                hoverinfo='skip'
            ),
            row=row, col=col
        )
        
        # Add lambda line
        fig_plotly.add_trace(
            dict(
                type='scatter',
                x=data['year'],
                y=data['lambda'],
                mode='lines',
                line=dict(color=colors['primary_color'], width=2),
                name='λ estimate',
                showlegend=(i == 0),
                hovertemplate=f'{league}<br>Year: %{{x}}<br>λ: %{{y:.2f}}<extra></extra>'
            ),
            row=row, col=col
        )

# Update layout
fig_plotly.update_layout(
    height=700,
    template='custom_theme',
    showlegend=True,
    margin=dict(t=100, b=50)
)

# Update individual subplot y-axes to match reference chart ranges
y_ranges = {
    'AFL': [2.5, 5.0],
    'EPL': [0.9, 1.3], 
    'MLB': [1.6, 2.1],
    'NBA': [12, 16],
    'NFL': [1.0, 3.0],
    'NHL': [1.5, 2.3]
}

for i, league in enumerate(league_layout):
    row = (i // 3) + 1
    col = (i % 3) + 1
    if league in y_ranges:
        fig_plotly.update_yaxes(range=y_ranges[league], row=row, col=col)

fig_plotly.show()

# Export the plot
export_plot(fig_plotly, "rolling_lambda_time_series", "Rolling 10-Year Lambda Estimates")
# Define the order to match the reference chart layout
league_layout = ['AFL', 'EPL', 'MLB', 'NBA', 'NFL', 'NHL']

# Create interactive Plotly version
from plotly.subplots import make_subplots

# Set up Plotly subplot structure
fig_plotly = make_subplots(
    rows=2, cols=3,
    subplot_titles=league_layout,
    horizontal_spacing=0.08,
    vertical_spacing=0.12
)

for i, league in enumerate(league_layout):
    row = (i // 3) + 1
    col = (i % 3) + 1
    
    if league in rolling_results and len(rolling_results[league]) > 0:
        data = rolling_results[league]
        
        # Add confidence band
        upper_bound = data['lambda'] + 2 * data['lambda_se']
        lower_bound = data['lambda'] - 2 * data['lambda_se']
        
        # Create filled area for confidence band
        years_extended = list(data['year']) + list(data['year'][::-1])
        bounds_extended = list(upper_bound) + list(lower_bound[::-1])
        
        fig_plotly.add_trace(
            dict(
                type='scatter',
                x=years_extended,
                y=bounds_extended,
                fill='toself',
                fillcolor='rgba(128,128,128,0.3)',
                line=dict(color='rgba(255,255,255,0)'),
                name='±2 SE',
                showlegend=(i == 0),
                hoverinfo='skip'
            ),
            row=row, col=col
        )
        
        # Add lambda line
        fig_plotly.add_trace(
            dict(
                type='scatter',
                x=data['year'],
                y=data['lambda'],
                mode='lines',
                line=dict(color=colors['primary_color'], width=2),
                name='λ estimate',
                showlegend=(i == 0),
                hovertemplate=f'{league}
Year: %{{x}}
λ: %{{y:.2f}}'
            ),
            row=row, col=col
        )

# Update layout
fig_plotly.update_layout(
    height=700,
    template='custom_theme',
    showlegend=True,
    margin=dict(t=100, b=50)
)

# Update individual subplot y-axes to match reference chart ranges
y_ranges = {
    'AFL': [2.5, 5.0],
    'EPL': [0.9, 1.3], 
    'MLB': [1.6, 2.1],
    'NBA': [12, 16],
    'NFL': [1.0, 3.0],
    'NHL': [1.5, 2.3]
}

for i, league in enumerate(league_layout):
    row = (i // 3) + 1
    col = (i % 3) + 1
    if league in y_ranges:
        fig_plotly.update_yaxes(range=y_ranges[league], row=row, col=col)

fig_plotly.show()

# Export the plot
export_plot(fig_plotly, "rolling_lambda_time_series", "Rolling 10-Year Lambda Estimates")

Exported: exports/rolling_lambda_time_series.json

Source: Various (listed in Appendix) as at January 2025.

Key: Australian Football League (AFL), English Premier League (EPL), Major League Baseball (MLB), National Basketball Association (NBA), National Football League (NFL), National Hockey League (NHL)

Note: Shaded regions represent ±2 standard errors around the coefficient estimate.

The rolling 10-year lambda estimates shown in Figure 4 reveal the temporal evolution of the Pythagorean exponent across different sports leagues. Each panel shows how λ has changed over time, providing insights into the historical development of competitive balance and scoring dynamics within each league. The confidence bands help assess the statistical precision of these estimates over different time periods.

LEAGUE	PS
MLB	6
MLB	3
MLB	2
MLB	1
MLB	4
MLB	4
MLB	4
MLB	5
MLB	5
MLB	2
MLB	7
MLB	6
MLB	3
MLB	2
MLB	6
MLB	3
MLB	4
MLB	6
MLB	1
MLB	8

LEAGUE	PS
MLB	6
MLB	3
MLB	2
MLB	1
MLB	4
MLB	4
MLB	4
MLB	5
MLB	5
MLB	2
MLB	7
MLB	6
MLB	3
MLB	2
MLB	6
MLB	3
MLB	4
MLB	6
MLB	1
MLB	8

LEAGUE	PS
MLB	6
MLB	3
MLB	2
MLB	1
MLB	4
MLB	4
MLB	4
MLB	5
MLB	5
MLB	2
MLB	7
MLB	6
MLB	3
MLB	2
MLB	6
MLB	3
MLB	4
MLB	6
MLB	1
MLB	8