Scikit-learn Tutorial: Predicting Happiness with Machine Learning
Author: Aliyah Ismail, Eugenie Kawera, Doreen Musahara
1. What is scikit-learn?
Scikit-learn is a Python library used for machine learning. It helps us build models to predict outcomes and understand patterns in data. It was created by a group of developers and is widely used in data science.
It is useful because it follows a clear workflow: prepare the data, train a model, make predictions, and evaluate how well the model performs. It also works well with pandas and NumPy, which makes it easy to use with tools we already know from SDS 271.
In this tutorial, we will use scikit-learn to predict a country's happiness score using variables like GDP, social support, health, freedom, generosity, and corruption.
Why use scikit-learn?
Before learning scikit-learn, we will use pandas, NumPy, and seaborn to clean data, calculate summaries, and make plots. Those tools are good for exploring data, but scikit-learn helps us go further by building models that can make predictions.
For instance, instead of manually calculating a regression using formulas in pandas or NumPy, scikit-learn gives us tools like LinearRegression() that can fit the model for us. It also gives us tools to split data, evaluate models, and compare different machine learning methods.
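As a quick taste of that workflow, here is a minimal sketch on made-up numbers (not the happiness data yet) showing how little code a fitted model takes:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Made-up data that follows y = 2x + 1 exactly
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([1.0, 3.0, 5.0, 7.0])

model = LinearRegression()
model.fit(X, y)                 # learn slope and intercept from the data

print(model.coef_[0])           # slope, approximately 2.0
print(model.intercept_)         # intercept, approximately 1.0
print(model.predict([[4.0]]))   # prediction for a new x = 4, approximately 9.0
```

The same `fit()`/`predict()` pattern applies to the happiness models we build below.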
When would you use scikit-learn?
You would use scikit-learn when you want to build a machine learning model.
Regression: when you want to predict a number, like happiness score.
Classification: when you want to predict a category, like happy vs. not happy.
Clustering: when you want to find hidden groups in data, like grouping countries with similar social and economic conditions.
For this tutorial, we will focus mostly on regression because our outcome variable, happiness score, is numeric.
Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
2. The dataset: World Happiness Report
The dataset comes from the World Happiness Report, which measures how happy people are across different countries. This edition was published in 2015 and is based on global survey data collected through the Gallup World Poll, covering 158 countries across different regions of the world. In the surveys, people rate their own life satisfaction, and the dataset pairs those ratings with economic and social indicators that might explain them.
The variables in this dataset include: Happiness Score, GDP per capita, Social support, Healthy life expectancy, Freedom, Generosity, and Trust (Government Corruption).
df = pd.read_csv("2015.csv")
df.head()
| | Country | Region | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Switzerland | Western Europe | 1 | 7.587 | 0.03411 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.41978 | 0.29678 | 2.51738 |
| 1 | Iceland | Western Europe | 2 | 7.561 | 0.04884 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.14145 | 0.43630 | 2.70201 |
| 2 | Denmark | Western Europe | 3 | 7.527 | 0.03328 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.48357 | 0.34139 | 2.49204 |
| 3 | Norway | Western Europe | 4 | 7.522 | 0.03880 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.36503 | 0.34699 | 2.46531 |
| 4 | Canada | North America | 5 | 7.427 | 0.03553 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.32957 | 0.45811 | 2.45176 |
df.columns
Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
'Standard Error', 'Economy (GDP per Capita)', 'Family',
'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
'Generosity', 'Dystopia Residual'],
dtype='object')
Variables
Happiness Score: This is the main outcome variable. It measures how people rate their overall life satisfaction on a scale.
GDP per capita: This represents the economic level of a country and shows how much income is available per person.
Social support: This measures whether people feel they have someone to rely on in times of need.
Healthy life expectancy: This reflects how long people are expected to live in good health.
Freedom to make life choices: This shows how free people feel to make decisions about their own lives.
Trust / Government corruption: This measures how much people trust their government and how they perceive corruption levels.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Country                        158 non-null    object
 1   Region                         158 non-null    object
 2   Happiness Rank                 158 non-null    int64
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB
df.shape
(158, 12)
df.describe()
| | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 |
| mean | 79.493671 | 5.375734 | 0.047885 | 0.846137 | 0.991046 | 0.630259 | 0.428615 | 0.143422 | 0.237296 | 2.098977 |
| std | 45.754363 | 1.145010 | 0.017146 | 0.403121 | 0.272369 | 0.247078 | 0.150693 | 0.120034 | 0.126685 | 0.553550 |
| min | 1.000000 | 2.839000 | 0.018480 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.328580 |
| 25% | 40.250000 | 4.526000 | 0.037268 | 0.545808 | 0.856823 | 0.439185 | 0.328330 | 0.061675 | 0.150553 | 1.759410 |
| 50% | 79.500000 | 5.232500 | 0.043940 | 0.910245 | 1.029510 | 0.696705 | 0.435515 | 0.107220 | 0.216130 | 2.095415 |
| 75% | 118.750000 | 6.243750 | 0.052300 | 1.158448 | 1.214405 | 0.811013 | 0.549092 | 0.180255 | 0.309883 | 2.462415 |
| max | 158.000000 | 7.587000 | 0.136930 | 1.690420 | 1.402230 | 1.025250 | 0.669730 | 0.551910 | 0.795880 | 3.602140 |
df.isnull().sum()
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64
Why This Dataset Matters to Us
We chose this dataset because it connects to our interests in Economics and SDS. It allows us to study how economic and social factors relate to quality of life.
This dataset is also personally meaningful to us because we are interested in seeing how our countries (Rwanda and Somaliland) are doing in terms of happiness compared to other countries in the world. We especially want to understand how factors like GDP, region (or continent), and social conditions relate to happiness in these places.
Our focus is to explore whether countries like ours follow similar patterns as others or if there are differences. This helps us better understand how different factors shape people's daily lives and well-being.
3. Exploratory data analysis
Before using scikit-learn, we first use pandas and seaborn to understand the data.
df = df.rename(columns={
"Country": "country",
"Region": "region",
"Happiness Rank": "rank",
"Happiness Score": "happiness_score",
"Standard Error": "standard_error",
"Economy (GDP per Capita)": "gdp_per_capita",
"Family": "social_support",
"Health (Life Expectancy)": "healthy_life_expectancy",
"Freedom": "freedom",
"Trust (Government Corruption)": "corruption",
"Generosity": "generosity",
"Dystopia Residual": "dystopia_residual"
})
df.head()
| | country | region | rank | happiness_score | standard_error | gdp_per_capita | social_support | healthy_life_expectancy | freedom | corruption | generosity | dystopia_residual |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Switzerland | Western Europe | 1 | 7.587 | 0.03411 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.41978 | 0.29678 | 2.51738 |
| 1 | Iceland | Western Europe | 2 | 7.561 | 0.04884 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.14145 | 0.43630 | 2.70201 |
| 2 | Denmark | Western Europe | 3 | 7.527 | 0.03328 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.48357 | 0.34139 | 2.49204 |
| 3 | Norway | Western Europe | 4 | 7.522 | 0.03880 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.36503 | 0.34699 | 2.46531 |
| 4 | Canada | North America | 5 | 7.427 | 0.03553 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.32957 | 0.45811 | 2.45176 |
Distribution of happiness scores
sns.histplot(data=df, x="happiness_score", kde=True)
plt.title("Distribution of Happiness Scores")
plt.xlabel("Happiness Score")
plt.ylabel("Count")
plt.show()
This plot shows how happiness scores are spread across countries. Most countries are in the middle range, while fewer countries have very low or very high happiness scores.
Relationship between GDP and happiness
sns.scatterplot(data=df, x="gdp_per_capita", y="happiness_score")
plt.title("GDP per Capita and Happiness Score")
plt.xlabel("GDP per Capita")
plt.ylabel("Happiness Score")
plt.show()
This plot helps us see whether countries with higher GDP per capita also tend to have higher happiness scores. The points trend upward as GDP increases, which suggests a positive relationship.
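If we want to quantify that upward trend rather than just eyeball it, we can fit a least-squares line and overlay it on the scatter. A sketch with synthetic stand-in data (the real plot above uses our `df`):

```python
import numpy as np
import matplotlib.pyplot as plt

# Synthetic stand-in for gdp vs. happiness (made-up relationship)
rng = np.random.default_rng(1)
gdp = rng.uniform(0, 1.7, size=80)
happiness = 2.2 * gdp + 3.0 + rng.normal(scale=0.5, size=80)

# Fit a degree-1 polynomial (a straight line) to quantify the trend
slope, intercept = np.polyfit(gdp, happiness, deg=1)

plt.scatter(gdp, happiness, alpha=0.6)
xs = np.linspace(gdp.min(), gdp.max(), 100)
plt.plot(xs, slope * xs + intercept, "r--", label=f"slope = {slope:.2f}")
plt.legend()
plt.show()
```

A positive slope confirms the visual impression that higher GDP goes with higher happiness.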
Correlation heatmap
numeric_df = df[["happiness_score", "gdp_per_capita", "social_support",
"healthy_life_expectancy", "freedom", "generosity", "corruption"]]
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Between Variables")
plt.show()
The heatmap shows how strongly each variable is correlated with happiness score. GDP per capita, social support, and healthy life expectancy have the highest positive correlations with happiness (above 0.7), meaning countries with higher income, stronger social networks, and better health tend to report higher happiness. Freedom also shows a moderate positive correlation. Generosity and corruption have weaker correlations, suggesting they matter less in predicting happiness on their own. We also notice that GDP per capita and healthy life expectancy are strongly correlated with each other (around 0.8). This makes sense: wealthier countries tend to have better healthcare systems. This overlap means we should be careful about over-interpreting the individual contribution of each feature in the regression model.
4. Regression with scikit-learn
Choose features and target variable
In machine learning, the features are the variables we use to make predictions. The target is the variable we are trying to predict.
Here, we want to predict happiness_score.
X = df[["gdp_per_capita", "social_support", "healthy_life_expectancy",
"freedom", "generosity", "corruption"]]
y = df["happiness_score"]
X.head()
| | gdp_per_capita | social_support | healthy_life_expectancy | freedom | generosity | corruption |
|---|---|---|---|---|---|---|
| 0 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.29678 | 0.41978 |
| 1 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.43630 | 0.14145 |
| 2 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.34139 | 0.48357 |
| 3 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.34699 | 0.36503 |
| 4 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.45811 | 0.32957 |
y.head()
0    7.587
1    7.561
2    7.527
3    7.522
4    7.427
Name: happiness_score, dtype: float64
Train-test split
We will split the data into training data and testing data.
The training set is used to teach the model.
The testing set is used to check how well the model works on new data.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
print("Training data size:", X_train.shape)
print("Testing data size:", X_test.shape)
Training data size: (126, 6)
Testing data size: (32, 6)
Fit Linear Regression Model
A linear regression model predicts a numeric value. In this case, it predicts the happiness score based on a country's social and economic variables.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
y_pred_linear[:5]
array([4.7601377 , 6.46907653, 4.61372277, 3.05290197, 4.96414597])
Important scikit-learn methods:
.fit() trains the model using the training data.
.predict() makes predictions using new data.
.score() gives a quick measure of model performance.
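To illustrate `.score()` concretely (on synthetic data, since our happiness model is evaluated in detail below): for a regressor, `.score(X, y)` is shorthand for the R^2 of its predictions, the same value `r2_score` computes.

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic regression data, made up for illustration
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 2))
y = X @ np.array([1.5, -0.5]) + rng.normal(scale=0.1, size=50)

model = LinearRegression().fit(X, y)

# .score() on a regressor returns R^2, matching r2_score on its predictions
print(model.score(X, y))
print(r2_score(y, model.predict(X)))
```

Classifiers use the same method name, but there `.score()` returns accuracy instead.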
Create a table comparing actual and predicted happiness scores:
results = pd.DataFrame({
"actual_happiness": y_test,
"predicted_happiness": y_pred_linear})
results.head()
| | actual_happiness | predicted_happiness |
|---|---|---|
| 128 | 4.307 | 4.760138 |
| 45 | 5.987 | 6.469077 |
| 134 | 4.194 | 4.613723 |
| 156 | 2.905 | 3.052902 |
| 90 | 5.057 | 4.964146 |
Evaluate the Linear Regression model
We will use two common metrics:
Mean Squared Error (MSE): measures how far predictions are from the real values. Smaller is better.
R^2 score: measures how much variation in happiness score the model explains. Closer to 1 is better.
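To demystify these metrics, here is a small check on made-up numbers showing that computing them by hand matches scikit-learn's functions:

```python
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Small made-up example of actual vs. predicted values
y_true = np.array([3.0, 5.0, 7.0, 9.0])
y_pred = np.array([2.5, 5.5, 6.5, 9.5])

# MSE: mean of the squared prediction errors
mse_manual = np.mean((y_true - y_pred) ** 2)

# R^2: 1 - (residual sum of squares / total sum of squares)
ss_res = np.sum((y_true - y_pred) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
r2_manual = 1 - ss_res / ss_tot

print(mse_manual, mean_squared_error(y_true, y_pred))  # both 0.25
print(r2_manual, r2_score(y_true, y_pred))             # both 0.95
```

Now we apply the same two metrics to our linear regression predictions.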
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_r2 = r2_score(y_test, y_pred_linear)
print("Linear regression MSE:", linear_mse)
print("Linear regression R^2:", linear_r2)
Linear regression MSE: 0.24193882833563737
Linear regression R^2: 0.8294705100069293
plt.figure(figsize=(6, 5))
plt.scatter(y_test, y_pred_linear, alpha=0.7, color="steelblue")
plt.plot([y_test.min(), y_test.max()],
[y_test.min(), y_test.max()], 'r--', label="Perfect prediction")
plt.xlabel("Actual Happiness Score")
plt.ylabel("Predicted Happiness Score")
plt.title("Linear Regression: Actual vs Predicted")
plt.legend()
plt.tight_layout()
plt.show()
Points close to the red line mean accurate predictions. The model performs well overall (R^2 = 0.83), but a few countries are notable outliers.
The R^2 = 0.83 means that about 83% of the variation in happiness scores across countries is explained by the variables in our model. This shows that factors like GDP, social support, health, freedom, generosity, and corruption do a strong job in predicting happiness.
This high value suggests that the model fits the data well, but it is not perfect. There is still about 17% of the variation that is not explained, which could be due to other factors like culture, history, or individual experiences that are not included in the dataset.
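Linear regression is also interpretable: the fitted model's `.coef_` attribute holds one weight per feature. A sketch on synthetic data (the feature names and effect sizes here are made up, not the happiness coefficients) showing how to pair coefficients with column names:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Synthetic features with known effects: +2.0 for the first, -1.0 for the second
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(100, 2)),
                 columns=["gdp_per_capita", "freedom"])
y = 2.0 * X["gdp_per_capita"] - 1.0 * X["freedom"] + 5.0

model = LinearRegression().fit(X, y)

# Pair each coefficient with its feature name for readability
coefs = pd.Series(model.coef_, index=X.columns)
print(coefs)             # recovers roughly +2.0 and -1.0
print(model.intercept_)  # roughly 5.0
```

The same pattern applied to our fitted `linear_model` would show how much each happiness predictor contributes, though the strong correlation between GDP and health noted earlier makes individual weights harder to interpret.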
Random Forest Regression
Let's try another model.
Linear regression assumes a straight-line relationship between the features and the outcome. But real-world data is not always that simple.
A Random Forest Model is more flexible. It uses many decision trees and combines their results to make a prediction. This can sometimes improve prediction accuracy.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)
print("Random Forest MSE:", rf_mse)
print("Random Forest R^2:", rf_r2)
Random Forest MSE: 0.2638721893812502
Random Forest R^2: 0.8140108795760777
Compare Two Models
model_comparison = pd.DataFrame({
"Model": ["Linear Regression", "Random Forest"],
"MSE": [linear_mse, rf_mse],
"R^2": [linear_r2, rf_r2]})
model_comparison
| | Model | MSE | R^2 |
|---|---|---|---|
| 0 | Linear Regression | 0.241939 | 0.829471 |
| 1 | Random Forest | 0.263872 | 0.814011 |
This table compares the performance of the two models. The Linear Regression model has a lower MSE (0.242) and a higher R^2 (0.829) compared to the Random Forest model, which has an MSE of 0.264 and R^2 of 0.814.
This means that Linear Regression performs slightly better for this dataset. It suggests that the relationship between happiness and the predictors is mostly linear, so a simple model is enough to explain the data. Random Forest, which is more complex, does not improve the results in this case.
Feature importance
Random Forest also allows us to look at feature importance. Feature importance tells us which variables were most useful for making predictions.
feature_importance = pd.DataFrame({
"feature": X.columns,
"importance": rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values(by="importance", ascending=False)
feature_importance
| | feature | importance |
|---|---|---|
| 0 | gdp_per_capita | 0.419622 |
| 1 | social_support | 0.193982 |
| 2 | healthy_life_expectancy | 0.186078 |
| 3 | freedom | 0.099359 |
| 5 | corruption | 0.056074 |
| 4 | generosity | 0.044885 |
sns.barplot(data=feature_importance, x="importance", y="feature")
plt.title("Feature Importance from Random Forest")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()
This plot shows which variables the Random Forest model relied on most when predicting happiness score. GDP per capita, social support, and healthy life expectancy have the highest importance, making them the strongest predictors of happiness in this dataset.
5. Clustering countries
Clustering is used when we want to find hidden groups in the data. Unlike regression, clustering does not use a target variable. Instead, it groups observations that are similar to each other.
Here, we will group countries based on their social and economic characteristics.
Standardize the data
Clustering is affected by scale, so we standardize the variables first. Standardizing means putting the variables on a similar scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
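To see concretely what standardizing does, a small check on made-up numbers:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two made-up features on very different scales
X_demo = np.array([[1.0, 100.0],
                   [2.0, 300.0],
                   [3.0, 500.0]])

X_demo_scaled = StandardScaler().fit_transform(X_demo)

# Each column now has mean 0 and standard deviation 1,
# so no single feature dominates K-Means distance calculations
print(X_demo_scaled.mean(axis=0))  # approximately [0, 0]
print(X_demo_scaled.std(axis=0))   # approximately [1, 1]
```

Without this step, the feature with the largest raw values would dominate the clustering.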
Fit K-Means clustering
K-Means is a clustering method that groups data into a chosen number of clusters.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X_scaled)
df[["country", "happiness_score", "cluster"]].head()
| | country | happiness_score | cluster |
|---|---|---|---|
| 0 | Switzerland | 7.587 | 2 |
| 1 | Iceland | 7.561 | 2 |
| 2 | Denmark | 7.527 | 2 |
| 3 | Norway | 7.522 | 2 |
| 4 | Canada | 7.427 | 2 |
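We fixed n_clusters=3 above, but that choice is ours to justify. One common heuristic is the elbow method: fit K-Means for several values of k and look for the point where the inertia (within-cluster sum of squares) stops dropping sharply. A sketch on synthetic blob data (not our happiness features):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic data with three well-separated groups
X_demo, _ = make_blobs(n_samples=150, centers=3, random_state=42)

# Inertia for k = 1..6; the "elbow" suggests a reasonable k
inertias = {}
for k in range(1, 7):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_demo)
    inertias[k] = km.inertia_

for k, inertia in inertias.items():
    print(k, round(inertia, 1))
```

Here the drop in inertia flattens out after k = 3, matching the three groups that were built into the data.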
Explore the clusters
df.groupby("cluster")[["happiness_score", "gdp_per_capita", "social_support",
"healthy_life_expectancy", "freedom", "generosity", "corruption"]].mean()
| cluster | happiness_score | gdp_per_capita | social_support | healthy_life_expectancy | freedom | generosity | corruption |
|---|---|---|---|---|---|---|---|
| 0 | 5.576582 | 0.980120 | 1.056143 | 0.732992 | 0.401858 | 0.184052 | 0.088927 |
| 1 | 4.206500 | 0.369585 | 0.737940 | 0.336673 | 0.366064 | 0.254268 | 0.132607 |
| 2 | 6.844517 | 1.302792 | 1.250101 | 0.856586 | 0.609351 | 0.353076 | 0.310519 |
This table shows the average happiness score and average predictor values for each cluster. We can use it to understand what makes the groups different. For example, one cluster may include countries with higher GDP, stronger social support, and higher happiness scores.
Visualize the clusters
sns.scatterplot(
data=df,
x="gdp_per_capita",
y="happiness_score",
hue="cluster",
palette="Set2"
)
plt.title("Country Clusters Based on Social and Economic Factors")
plt.xlabel("GDP per Capita")
plt.ylabel("Happiness Score")
plt.show()
This plot shows how countries are grouped based on their characteristics. Countries in the same cluster are more similar to each other than to countries in other clusters.
Our Focus
In this project, we focus on Rwanda and Somaliland. We want to understand how their happiness scores compare to other countries and how factors like GDP, region, and social conditions affect their well-being.
focus_countries = df[df["country"].isin(["Rwanda", "Somaliland region"])]
focus_countries
| | country | region | rank | happiness_score | standard_error | gdp_per_capita | social_support | healthy_life_expectancy | freedom | corruption | generosity | dystopia_residual | cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 90 | Somaliland region | Sub-Saharan Africa | 91 | 5.057 | 0.06161 | 0.18847 | 0.95152 | 0.43873 | 0.46582 | 0.39928 | 0.50318 | 2.11032 | 1 |
| 153 | Rwanda | Sub-Saharan Africa | 154 | 3.465 | 0.03464 | 0.22208 | 0.77370 | 0.42864 | 0.59201 | 0.55191 | 0.22628 | 0.67042 | 1 |
Rwanda's actual score (3.47) is more than a point below what the linear regression model predicts (4.86; these predictions are computed below). This gap suggests that factors outside our dataset, such as historical trauma or how respondents interpret the happiness question culturally, affect Rwanda's score. Somaliland's prediction (4.96) is very close to its actual score (5.06), meaning the model captures it well. This difference shows a key limitation of machine learning: it can only learn from the variables it is given.
global_avg = df[[
"happiness_score",
"gdp_per_capita",
"social_support",
"healthy_life_expectancy",
"freedom",
"generosity",
"corruption"]].mean()
global_avg
happiness_score            5.375734
gdp_per_capita             0.846137
social_support             0.991046
healthy_life_expectancy    0.630259
freedom                    0.428615
generosity                 0.237296
corruption                 0.143422
dtype: float64
Compare to global averages
We selected Rwanda and Somaliland because they represent our home contexts. This allows us to connect the data to real-world experiences and better understand how happiness is measured in these countries.
focus_countries[[
"country",
"happiness_score",
"gdp_per_capita",
"social_support",
"healthy_life_expectancy",
"freedom",
"generosity",
"corruption"]]
| | country | happiness_score | gdp_per_capita | social_support | healthy_life_expectancy | freedom | generosity | corruption |
|---|---|---|---|---|---|---|---|---|
| 90 | Somaliland region | 5.057 | 0.18847 | 0.95152 | 0.43873 | 0.46582 | 0.50318 | 0.39928 |
| 153 | Rwanda | 3.465 | 0.22208 | 0.77370 | 0.42864 | 0.59201 | 0.22628 | 0.55191 |
Rwanda and Somaliland have lower GDP compared to the global average, but their happiness scores are not as low as expected. This suggests that other factors like social support or freedom may also play an important role in explaining happiness.
Region / Continent comparison
df.groupby("region")[["happiness_score", "gdp_per_capita"]].mean()
| region | happiness_score | gdp_per_capita |
|---|---|---|
| Australia and New Zealand | 7.285000 | 1.291880 |
| Central and Eastern Europe | 5.332931 | 0.942438 |
| Eastern Asia | 5.626167 | 1.151780 |
| Latin America and Caribbean | 6.144682 | 0.876815 |
| Middle East and Northern Africa | 5.406900 | 1.066974 |
| North America | 7.273000 | 1.360400 |
| Southeastern Asia | 5.317444 | 0.789054 |
| Southern Asia | 4.580857 | 0.560486 |
| Sub-Saharan Africa | 4.202800 | 0.380473 |
| Western Europe | 6.689619 | 1.298596 |
df[df["country"].isin(["Rwanda", "Somaliland region"])][["country", "region"]]
| | country | region |
|---|---|---|
| 90 | Somaliland region | Sub-Saharan Africa |
| 153 | Rwanda | Sub-Saharan Africa |
focus_X = focus_countries[[
"gdp_per_capita",
"social_support",
"healthy_life_expectancy",
"freedom",
"generosity",
"corruption"]]
focus_countries = focus_countries.copy()  # work on a copy to avoid SettingWithCopyWarning
focus_countries["predicted_happiness"] = linear_model.predict(focus_X)
focus_countries[["country", "happiness_score", "predicted_happiness"]]
| | country | happiness_score | predicted_happiness |
|---|---|---|---|
| 90 | Somaliland region | 5.057 | 4.964146 |
| 153 | Rwanda | 3.465 | 4.862202 |
df[df["country"].isin(["Rwanda", "Somaliland region"])][["country", "cluster"]]
| | country | cluster |
|---|---|---|
| 90 | Somaliland region | 1 |
| 153 | Rwanda | 1 |
Rwanda and Somaliland are grouped in the same cluster (cluster 1), together with countries that have similar economic and social conditions. This suggests that they share similar patterns in factors like GDP, health, and social support.
sns.scatterplot(
data=df,
x="gdp_per_capita",
y="happiness_score",
hue="region")
sns.scatterplot(
data=focus_countries,
x="gdp_per_capita",
y="happiness_score",
color="Red",
s=100)
plt.title("Rwanda and Somaliland Compared to Global Data")
plt.show()
This plot shows where Rwanda and Somaliland fall compared to other countries. Even with lower GDP, their position shows that happiness is influenced by multiple factors, not just economic wealth.
6. Conclusion
Scikit-learn fits directly into the workflow we know from SDS 271: we used pandas to load and clean the data, seaborn to explore it, and then passed it into scikit-learn to build models. The library's consistent pattern (fit(), predict(), score()) makes it easy to swap between models like LinearRegression and RandomForestRegressor with minimal code changes, something pandas alone cannot do. Use scikit-learn when you want to go beyond exploration and actually predict or group: regression for numeric outcomes, classification for categories, clustering when you have no labels. For this dataset, it revealed that GDP, social support, and health explain about 83% of the variation in happiness globally, but also that Rwanda's lived experience falls outside what the data can capture.