Statsmodels Tutorial¶
Authors: Tabitha Githinji, Najma Golicha
1. Introduction to Statsmodels¶
Statsmodels is a Python package used for statistical modeling and econometric analysis. It provides tools for estimating models, performing hypothesis tests, and interpreting relationships between variables in a statistically rigorous way.
Unlike machine learning libraries that focus primarily on prediction, Statsmodels is designed for inference and explanation. It produces detailed statistical output such as:
- Coefficients
- Standard errors
- Confidence intervals
- p-values
- Model fit statistics (R², AIC, etc.)
2. When and Why Use Statsmodels¶
Statsmodels is used when the goal is to understand relationships between variables rather than just make predictions.
Typical use cases include:
- Testing hypotheses about group differences
- Estimating the effect of variables using regression
- Building interpretable statistical models
- Analyzing time series or economic data
3. Comparison to R (stats package)¶
Statsmodels plays a similar role in Python to the stats package in R.
In R, statistical modeling tools are built into the base language and its stats package.
In Python, Statsmodels provides those statistical inference tools on top of pandas and NumPy.
Both allow:
- Linear regression
- Hypothesis testing
- Distribution-based analysis
- Statistical summaries
The key difference is ecosystem structure:
R is statistics-first; Python is general-purpose, and Statsmodels fills the statistics gap.
4. Core Functional Areas of Statsmodels¶
This section introduces the main categories of tools used in Statsmodels.
4.1 Summary Statistics and Hypothesis Testing¶
Statsmodels provides formal statistical testing tools beyond pandas descriptive summaries.
These are used when you want inference, not just description.
Common functions:
- statsmodels.stats.weightstats.ttest_ind → two-sample t-test
- statsmodels.stats.api.DescrStatsW → weighted descriptive statistics
- statsmodels.stats.proportion → tests for proportions
These allow you to test whether differences between groups are statistically significant.
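As a minimal sketch, the two-sample t-test can be run on simulated data; the group means, spread, and sample sizes below are made up purely for illustration.
# Illustrative sketch: two-sample t-test on simulated data (values are made up)
import numpy as np
from statsmodels.stats.weightstats import ttest_ind

rng = np.random.default_rng(42)
group_a = rng.normal(loc=600, scale=100, size=200)  # hypothetical group A
group_b = rng.normal(loc=630, scale=100, size=200)  # hypothetical group B

tstat, pvalue, dof = ttest_ind(group_a, group_b)  # returns (t statistic, p-value, degrees of freedom)
print(tstat, pvalue, dof)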
4.2 Linear Regression (OLS)¶
The most widely used feature in Statsmodels is Ordinary Least Squares (OLS) regression. The main function is statsmodels.formula.api.ols(), which fits an OLS regression model. You write your model as a formula, similar to R: y ~ x1 + x2. You then call .fit() to estimate the model and .summary() to get the full statistical output.
Key outputs:
- Coefficients (effect sizes)
- p-values (statistical significance)
- R-squared (model fit)
- Standard errors
- Confidence intervals
This is directly comparable to lm() in R.
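A runnable sketch of this workflow, using a small simulated DataFrame (the variable names and coefficient values are arbitrary):
# Sketch: fit an OLS model with the formula interface on simulated data
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=100), "x2": rng.normal(size=100)})
df["y"] = 2 + 1.5 * df["x1"] - 0.5 * df["x2"] + rng.normal(size=100)

results = smf.ols("y ~ x1 + x2", data=df).fit()  # formula interface, similar to lm(y ~ x1 + x2) in R
print(results.summary())                          # coefficients, p-values, R-squared, confidence intervals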
4.3 Categorical Variables¶
Statsmodels handles categorical variables using the C() function.
model = smf.ols("y ~ C(category) + x", data=df).fit()
This automatically:
- Converts categories into dummy variables
- Selects a reference group
- Avoids manual encoding
This is useful for nominal variables such as region or race: the dummy coding and the choice of baseline category happen automatically when the model is fit.
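If you want a specific baseline instead of the default (the first category in sorted order), the formula syntax also accepts a Treatment() contrast. A sketch, using the same placeholder names as above and a hypothetical level "A" as the reference:
# Optional sketch: set the baseline category explicitly with Treatment()
# (same placeholder df, category, and x as above; "A" is a hypothetical level)
model = smf.ols('y ~ C(category, Treatment(reference="A")) + x', data=df).fit()
print(model.params)  # dummy coefficients are now measured relative to the "A" group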
4.4 Logistic Regression (Binary Outcomes)¶
For binary dependent variables, Statsmodels uses logistic regression.
logit_model = smf.logit("y ~ x1 + x2", data=df).fit()
logit_model.summary()
Key features:
- Models probability of an event
- Outputs log-odds coefficients
- Provides full statistical inference output
The syntax is similar to linear regression, but the model uses a logit link function. You still get a full statistical summary, which makes it easy to interpret coefficients, odds ratios, and significance levels. This is helpful when you’re working with classification problems but still want the kind of statistical output that scikit‑learn doesn’t provide.
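A common follow-up, sketched below, is to exponentiate the coefficients so they can be read as odds ratios; this continues from the hypothetical logit_model above.
# Sketch: reading logit coefficients (log-odds) as odds ratios
import numpy as np

odds_ratios = np.exp(logit_model.params)      # exponentiate the log-odds coefficients
or_conf_int = np.exp(logit_model.conf_int())  # confidence intervals on the odds-ratio scale
print(odds_ratios)
print(or_conf_int)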
These four areas - summary stats, linear regression, categorical regression, and logistic regression - cover the majority of real‑world statistical modeling. Together, they show how Statsmodels supports both explanation and inference, which is the main reason people use it.
5. Dataset Overview: Wages Data¶
We will apply Statsmodels to the Wages dataset from The Statistical Sleuth (Ramsey & Schafer, 2002).
This dataset contains information on 25,631 full-time male workers in the United States aged 18–70.
Variables used:
- Wage: weekly wage (response variable)
- Education: years of schooling
- Experience: years of labor market experience
- Black: race indicator (Black vs Non-Black)
- Region: US census region (MW, NE, S, W)
- SMSA: Urban vs Non-urban
We are particularly interested in this dataset because it relates to labor economics and human capital theory. It allows us to examine how education and experience influence wages, as well as explore patterns of inequality and regional variation using regression analysis.
6. Research Questions¶
This analysis investigates wage disparities using regression modeling.
Some of the research questions we aim to answer are:
1. Is there evidence of a racial wage gap after controlling for education and experience?
2. Does geographic region and urban location explain additional variation in wages?
3. Does the racial wage gap differ across regions (interaction effects)?
We will use:
- OLS regression models
- Nested model comparison (F-tests)
- Coefficient interpretation
- Model diagnostics
7. Loading and Inspecting the Data¶
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.weightstats import ttest_ind
# loading the data
wage = pd.read_csv("wage.csv")
wage.head()
| | Wage | Education | Experience | Black | SMSA | Region |
|---|---|---|---|---|---|---|
| 0 | 354.94 | 7 | 45 | No | Yes | NE |
| 1 | 370.37 | 9 | 9 | No | Yes | NE |
| 2 | 754.94 | 11 | 46 | No | Yes | NE |
| 3 | 593.54 | 12 | 36 | No | Yes | NE |
| 4 | 377.23 | 16 | 22 | No | Yes | NE |
8. Data Cleaning and Encoding¶
We convert the categorical variables (Black and SMSA) into numeric format for analysis.
wage["Black"] = wage["Black"].map({"Yes": 1, "No": 0})
wage["SMSA"] = wage["SMSA"].map({"Yes": 1, "No":0})
wage.head()
| | Wage | Education | Experience | Black | SMSA | Region |
|---|---|---|---|---|---|---|
| 0 | 354.94 | 7 | 45 | 0 | 1 | NE |
| 1 | 370.37 | 9 | 9 | 0 | 1 | NE |
| 2 | 754.94 | 11 | 46 | 0 | 1 | NE |
| 3 | 593.54 | 12 | 36 | 0 | 1 | NE |
| 4 | 377.23 | 16 | 22 | 0 | 1 | NE |
9. Exploratory Data Analysis¶
wage.describe()
| | Wage | Education | Experience | Black | SMSA |
|---|---|---|---|---|---|
| count | 25631.000000 | 25631.000000 | 25631.000000 | 25631.000000 | 25631.000000 |
| mean | 640.162470 | 13.076275 | 18.586555 | 0.077562 | 0.742850 |
| std | 444.283273 | 2.904286 | 12.424661 | 0.267487 | 0.437071 |
| min | 50.390000 | 0.000000 | -4.000000 | 0.000000 | 0.000000 |
| 25% | 356.130000 | 12.000000 | 9.000000 | 0.000000 | 0.000000 |
| 50% | 567.230000 | 12.000000 | 16.000000 | 0.000000 | 1.000000 |
| 75% | 826.210000 | 16.000000 | 27.000000 | 0.000000 | 1.000000 |
| max | 18777.200000 | 18.000000 | 63.000000 | 1.000000 | 1.000000 |
Looking at the descriptive statistics, the first thing that stands out is the very large standard deviation of weekly wages (around 444). This tells us that wages vary a lot across individuals in the dataset. The minimum wage is around 50, while the maximum is over 18,000, which shows that the distribution is extremely spread out.
The mean wage is about 640, but because the standard deviation is so large and the maximum value is so extreme, the mean may not represent a “typical” worker very well. This suggests that a few very high earners pull the average upward, so we will use the median to compare groups.
# Wage difference by Region
wage.groupby("Region")["Wage"].median()
Region
MW    581.67
NE    593.54
S     498.58
W     569.80
Name: Wage, dtype: float64
When we compare median wages across regions, the South has the lowest median wage (about 498), followed by the Midwest (around 582) and the West (about 570). The Northeast has the highest median wage at roughly 594. These differences suggest that regional labor markets differ in meaningful ways, and that region may play an important role in explaining wage variation once we move into regression analysis.
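A quick visual check of these regional differences can also help; the sketch below uses the seaborn and matplotlib imports from section 7 and plots wages on the log scale so that extreme earners do not dominate the picture.
# Optional visual check of regional wage differences
wage["LogWage"] = np.log(wage["Wage"])          # log scale reduces the influence of extreme earners
sns.boxplot(x="Region", y="LogWage", data=wage)
plt.title("Log weekly wage by region")
plt.show()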
10. Hypothesis Testing¶
We test the null hypothesis that wages do not differ between urban and non-urban workers.
# Wage difference by SMSA (urban vs non-urban)
group1 = wage[wage["SMSA"] == 1]["Wage"]
group2 = wage[wage["SMSA"] == 0]["Wage"]
ttest_ind(group1, group2)
(19.984874212965273, 3.505710453796782e-88, 25629.0)
There is a statistically significant difference in wages between SMSA and non‑SMSA workers.
The t‑statistic is 19.98 and the p‑value is effectively zero (3.5e‑88), indicating that the observed difference in mean wages is far too large to be due to random chance.
We reject the null hypothesis and conclude that workers in metropolitan areas earn significantly different wages than those in non‑metropolitan areas.
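Because the wage distribution is heavily skewed and the two groups likely have unequal variances, a Welch-style version of the test (which drops the equal-variance assumption) is a reasonable robustness check. A sketch, reusing group1 and group2 from above:
# Robustness check: Welch t-test, which does not assume equal variances across groups
tstat, pvalue, dof = ttest_ind(group1, group2, usevar="unequal")
print(tstat, pvalue, dof)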
11. Regression Analysis¶
Modeling Strategy
To understand wage disparities, we estimate a sequence of nested OLS regression models. Each step adds variables to test whether they explain additional variation in wages and whether earlier relationships remain stable.
We proceed as follows:
- Model 1: Human capital baseline (education + experience)
- Model 2: Adds race (Black indicator) to test wage gap
- Model 3: Adds urban location (SMSA) to test geographic effects
- Model 4 (extension): Regional differences and interaction effects (race × region)
This stepwise structure allows us to isolate the contribution of each factor. Because wages are highly skewed, we apply a log transformation to better approximate a normal distribution and improve model interpretation.
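As a quick sanity check on that skewness claim, pandas can report the sample skew of the raw and log-transformed wages:
# Quick check of the skew that motivates the log transformation
print(wage["Wage"].skew())           # raw weekly wages
print(np.log(wage["Wage"]).skew())   # log-transformed wages, for comparison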
Model 1 - Human Capital Baseline
model1 = smf.ols("np.log(Wage) ~ Education + Experience", data=wage).fit()
print(model1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(Wage) R-squared: 0.256
Model: OLS Adj. R-squared: 0.256
Method: Least Squares F-statistic: 4414.
Date: Fri, 01 May 2026 Prob (F-statistic): 0.00
Time: 16:34:26 Log-Likelihood: -20624.
No. Observations: 25631 AIC: 4.125e+04
Df Residuals: 25628 BIC: 4.128e+04
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 4.6058 0.018 250.273 0.000 4.570 4.642
Education 0.1014 0.001 83.558 0.000 0.099 0.104
Experience 0.0184 0.000 64.944 0.000 0.018 0.019
==============================================================================
Omnibus: 1659.008 Durbin-Watson: 1.788
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3153.192
Skew: -0.470 Prob(JB): 0.00
Kurtosis: 4.438 Cond. No. 136.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The first model estimates log wages as a function of education and experience. The results show that both variables are strong and statistically significant predictors of wages. Specifically, an additional year of education is associated with approximately a 10.1% increase in wages, while an additional year of experience is associated with about a 1.8% increase, holding other factors constant.
The model explains approximately 26 percent of the variation in log wages, indicating that while human capital is important, a substantial portion of wage variation remains unexplained.
Overall, this model supports standard human capital theory, suggesting that individuals with higher levels of education and experience tend to earn higher wages. However, it also highlights that additional factors beyond human capital likely influence wage outcomes.
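For small coefficients, the coefficient in a log-wage model is approximately the proportional change in wages; the exact effect is 100·(exp(β) − 1). A short check of the exact percentages implied by Model 1:
# Exact percentage effects implied by the log-wage coefficients: 100 * (exp(beta) - 1)
pct_effects = 100 * (np.exp(model1.params) - 1)
print(pct_effects[["Education", "Experience"]])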
Model 2 - Adding Race
model2 = smf.ols("np.log(Wage) ~ Education + Experience + Black", data=wage).fit()
print(model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(Wage) R-squared: 0.266
Model: OLS Adj. R-squared: 0.266
Method: Least Squares F-statistic: 3099.
Date: Fri, 01 May 2026 Prob (F-statistic): 0.00
Time: 16:38:08 Log-Likelihood: -20451.
No. Observations: 25631 AIC: 4.091e+04
Df Residuals: 25627 BIC: 4.094e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 4.6466 0.018 252.401 0.000 4.610 4.683
Education 0.0998 0.001 82.540 0.000 0.097 0.102
Experience 0.0184 0.000 65.172 0.000 0.018 0.019
Black -0.2350 0.013 -18.675 0.000 -0.260 -0.210
==============================================================================
Omnibus: 1699.141 Durbin-Watson: 1.792
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3289.481
Skew: -0.474 Prob(JB): 0.00
Kurtosis: 4.477 Cond. No. 138.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In the second model, we include a binary indicator for race (Black) to examine whether a racial wage gap persists after controlling for education and experience. The results show that the coefficient on the Black indicator is negative and statistically significant.
Specifically, the coefficient of −0.235 implies that Black workers earn roughly 21% lower wages than non-Black workers (exp(−0.235) − 1 ≈ −0.21), holding education and experience constant. This indicates a substantial wage gap that cannot be explained by differences in human capital alone.
The inclusion of the race variable does not meaningfully alter the coefficients on education and experience, suggesting that these effects are stable across groups. While the model’s explanatory power increases slightly, the more important result is the identification of a persistent and statistically robust wage gap associated with race.
Overall, this provides evidence of structural wage inequality in the data, as differences in observable human capital characteristics do not fully account for disparities in earnings.
Model 3 - Adding Urban Location and Region
model3 = smf.ols("np.log(Wage) ~ Education + Experience + Black + SMSA + C(Region)", data=wage).fit()
print(model3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(Wage) R-squared: 0.283
Model: OLS Adj. R-squared: 0.283
Method: Least Squares F-statistic: 1443.
Date: Fri, 01 May 2026 Prob (F-statistic): 0.00
Time: 16:40:37 Log-Likelihood: -20159.
No. Observations: 25631 AIC: 4.033e+04
Df Residuals: 25623 BIC: 4.040e+04
Df Model: 7
Covariance Type: nonrobust
===================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------
Intercept 4.5764 0.020 232.281 0.000 4.538 4.615
C(Region)[T.NE] 0.0360 0.010 3.702 0.000 0.017 0.055
C(Region)[T.S] -0.0612 0.009 -6.743 0.000 -0.079 -0.043
C(Region)[T.W] -0.0018 0.010 -0.182 0.855 -0.021 0.018
Education 0.0970 0.001 80.773 0.000 0.095 0.099
Experience 0.0184 0.000 65.810 0.000 0.018 0.019
Black -0.2304 0.013 -18.206 0.000 -0.255 -0.206
SMSA 0.1578 0.008 20.467 0.000 0.143 0.173
==============================================================================
Omnibus: 1767.285 Durbin-Watson: 1.832
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3555.155
Skew: -0.479 Prob(JB): 0.00
Kurtosis: 4.553 Cond. No. 154.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Model 3 adds regional indicators and an SMSA (urban) variable to examine whether geographic factors explain wage differences beyond education, experience, and race.
The results show significant regional variation: relative to the baseline region (the Midwest), workers in the Northeast earn about 3.7% higher wages, while those in the South earn about 6% lower wages. The West shows no significant difference. In addition, the SMSA coefficient (0.1578) is positive and significant, indicating that workers in metropolitan areas earn roughly 17% higher wages, consistent with an urban wage premium.
The coefficient on Black remains negative and statistically significant (−0.2304), implying that Black workers earn about 21% lower wages even after accounting for geographic factors. Although slightly reduced, the gap persists, suggesting that location does not fully explain racial wage disparities.
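The modeling strategy calls for nested model comparisons; statsmodels provides these F-tests through anova_lm. A sketch comparing the three nested models fit so far:
# Nested F-tests: does each added block of variables significantly improve fit?
print(sm.stats.anova_lm(model1, model2, model3))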
Model 4 - Interaction Between Race and Region
model4 = smf.ols("np.log(Wage) ~ Education + Experience + Black * Region", data=wage).fit()
model4.summary()
| Dep. Variable: | np.log(Wage) | R-squared: | 0.271 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.271 |
| Method: | Least Squares | F-statistic: | 1059. |
| Date: | Fri, 01 May 2026 | Prob (F-statistic): | 0.00 |
| Time: | 20:04:55 | Log-Likelihood: | -20365. |
| No. Observations: | 25631 | AIC: | 4.075e+04 |
| Df Residuals: | 25621 | BIC: | 4.083e+04 |
| Df Model: | 9 | | |
| Covariance Type: | nonrobust | | |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Intercept | 4.6601 | 0.019 | 239.211 | 0.000 | 4.622 | 4.698 |
| Region[T.NE] | 0.0607 | 0.010 | 6.078 | 0.000 | 0.041 | 0.080 |
| Region[T.S] | -0.0548 | 0.010 | -5.751 | 0.000 | -0.073 | -0.036 |
| Region[T.W] | 0.0034 | 0.010 | 0.333 | 0.739 | -0.017 | 0.023 |
| Education | 0.0989 | 0.001 | 81.926 | 0.000 | 0.097 | 0.101 |
| Experience | 0.0183 | 0.000 | 64.992 | 0.000 | 0.018 | 0.019 |
| Black | -0.1928 | 0.031 | -6.287 | 0.000 | -0.253 | -0.133 |
| Black:Region[T.NE] | -0.0035 | 0.043 | -0.082 | 0.935 | -0.088 | 0.081 |
| Black:Region[T.S] | -0.0418 | 0.035 | -1.192 | 0.233 | -0.110 | 0.027 |
| Black:Region[T.W] | 0.0382 | 0.051 | 0.743 | 0.458 | -0.063 | 0.139 |
| Omnibus: | 1711.542 | Durbin-Watson: | 1.804 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 3339.172 |
| Skew: | -0.475 | Prob(JB): | 0.00 |
| Kurtosis: | 4.492 | Cond. No. | 507. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Model 4 introduces interaction terms between race and region to examine whether the racial wage gap varies across geographic areas. Education and experience remain strong and statistically significant predictors, and regional wage differences continue to be present.
The coefficient on Black remains negative and significant (−0.1928), implying that Black workers earn roughly 18% lower wages on average, holding other factors constant.
However, the interaction terms between Black and region are not statistically significant, indicating that the size of this wage gap does not differ meaningfully across regions.
Overall, these results suggest that while region influences wage levels, it does not moderate racial wage disparities. The inclusion of interaction terms also does not improve model fit, reinforcing the stability of the racial wage gap across locations.
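One way to formalize the claim that the interactions do not improve fit is an F-test comparing Model 4 against a reduced model without the Black × Region terms; the reduced model below is introduced only for this comparison.
# F-test for the interaction block: compare Model 4 to a reduced model without Black:Region terms
model4_reduced = smf.ols("np.log(Wage) ~ Education + Experience + Black + Region", data=wage).fit()
print(sm.stats.anova_lm(model4_reduced, model4))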
12. Models Summary¶
Across all models, several consistent patterns emerge:
- Education and experience are stable and strongly significant predictors of wages, confirming the importance of human capital.
- A persistent racial wage gap exists across all specifications, even after controlling for education, experience, geographic region, and urban location.
- Geographic factors do influence wages: workers in metropolitan areas earn more than those in non-metropolitan areas, and wages vary across regions. However, these geographic differences do not explain or eliminate racial disparities.
- The interaction analysis shows that the racial wage gap does not significantly differ across regions, suggesting that this inequality is structurally consistent rather than location-dependent.
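For presentation, statsmodels can also place the four models side by side in a single coefficient table via summary_col; this is an optional convenience, not required for the analysis.
# Side-by-side coefficient table for all four models
from statsmodels.iolib.summary2 import summary_col

print(summary_col([model1, model2, model3, model4],
                  model_names=["Model 1", "Model 2", "Model 3", "Model 4"],
                  stars=True))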
13. Findings and Conclusion¶
The analysis produces four main findings.
Human capital variables such as education and experience are strong and consistent predictors of wages. Individuals with higher education levels and more labor market experience tend to earn higher wages, which supports standard economic theory. However, these variables explain only a modest portion of total wage variation.
There is clear and persistent evidence of a racial wage gap. Across all model specifications, Black workers earn significantly less than non-Black workers, even after controlling for education, experience, and geographic factors. This suggests that observable human capital differences do not fully account for wage inequality.
Geographic location plays an important role in wage determination. Workers in metropolitan areas earn more than those in non-metropolitan areas, and wages vary across regions of the United States. However, these geographic differences do not eliminate racial disparities in wages.
The racial wage gap does not significantly vary by region. The interaction analysis shows no meaningful evidence that the effect of race on wages depends on geographic location.
In conclusion, the key result is that a statistically significant racial wage gap persists across all model specifications and remains stable even after accounting for geographic differences and labor market characteristics. The findings indicate that wage inequality is shaped by multiple factors, but observable characteristics such as education, experience, and geography are insufficient to fully explain persistent differences across racial groups.