Ethical Analysis on Loan Approval Dataset¶

A project by Esha Agarwal - GH1031345

GitHub Repo


This notebook presents an ethical analysis of a dataset related to loan approvals. The central objective is to investigate the decision-making process captured by the data, in order to identify and quantify potential ethical biases. As machine learning models are increasingly employed to automate high-stakes decisions, it is imperative to ensure they operate in a fair and non-discriminatory manner.

Motivation:

The motivation for this investigation is rooted in the significant real-world consequences of algorithmic decision-making, especially in critical financial sectors. Automated systems for loan approval, if biased, can perpetuate or even amplify existing societal inequities, leading to discriminatory outcomes for certain demographic groups. Consequently, auditing such systems for fairness is an ethical and practical necessity to ensure equitable access to financial opportunities and maintain regulatory compliance.

Problem Statement:

The core problem is to determine whether the loan approval dataset exhibits statistical evidence of bias, particularly concerning sensitive demographic attributes (in this case, gender). This analysis will:
  1. Explore the raw data for pre-existing disparities in financial profiles and outcomes between groups.
  2. Train predictive models to replicate the loan approval decision process.
  3. Audit these models for fairness by quantifying potential disparities in both decision outcomes (investigating Disparate Impact) and predictive accuracy (investigating Disparate Mistreatment).

The ultimate aim is to assess the system's adherence to established fairness criteria and to identify the features that may drive any observed bias.

Importing Required Libraries¶

We start by importing essential libraries for data manipulation, visualization, and model building.

In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

file_path = "loan_approval.csv"

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "anishdevedward/loan-approval-dataset",
    file_path,
)

df.shape
Out[1]:
(2000, 8)
In [2]:
df.head()
Out[2]:
              name              city  income  credit_score  loan_amount  years_employed  points  loan_approved
0     Allison Hill         East Jill  113810           389        39698              27    50.0          False
1     Brandon Hall     New Jamesside   44592           729        15446              28    55.0          False
2     Rhonda Smith      Lake Roberto   33278           584        11189              13    45.0          False
3  Gabrielle Davis  West Melanieview  127196           344        48823              29    50.0          False
4     Valerie Gray         Mariastad   66048           496        47174               4    25.0          False

1. Feature Engineering: Adding a Sensitive Attribute¶

To analyze for ethical bias, we first need a sensitive attribute to audit, such as gender or race. This dataset does not include this information.

We will engineer a gender feature by using the gender_guesser library on the applicant's first name. This is an imperfect, probabilistic method, but it provides a proxy attribute we can use to investigate potential disparities. We will then examine the head of the new dataframe and its overall statistics.

In [3]:
def extract_gender(df) -> pd.DataFrame:
    from gender_guesser import detector

    d = detector.Detector()

    first_names = df["name"].str.split().str[0]
    # The first token may be a title (Dr., Mr., Mrs., etc.); if so, use the second token as the first name
    titles = {"Dr.", "Mr.", "Mrs.", "Ms.", "Miss", "Mx."}
    first_names = first_names.where(~first_names.isin(titles), df["name"].str.split().str[1])
    df["gender"] = first_names.apply(d.get_gender).replace(
        {
            "mostly_male": "Male",
            "mostly_female": "Female",
            "male": "Male",
            "female": "Female",
            "unknown": "Unknown",
        }
    )
    return df


df = df.pipe(extract_gender)
df.head()
Out[3]:
              name              city  income  credit_score  loan_amount  years_employed  points  loan_approved  gender
0     Allison Hill         East Jill  113810           389        39698              27    50.0          False  Female
1     Brandon Hall     New Jamesside   44592           729        15446              28    55.0          False    Male
2     Rhonda Smith      Lake Roberto   33278           584        11189              13    45.0          False  Female
3  Gabrielle Davis  West Melanieview  127196           344        48823              29    50.0          False  Female
4     Valerie Gray         Mariastad   66048           496        47174               4    25.0          False  Female
In [4]:
df.describe(include='all')
Out[4]:
                name          city         income  credit_score   loan_amount  years_employed       points loan_approved gender
count           2000          2000    2000.000000   2000.000000   2000.000000     2000.000000  2000.000000          2000   2000
unique          2000          1882            NaN           NaN           NaN             NaN          NaN             2      2
top     Allison Hill  North Joseph            NaN           NaN           NaN             NaN          NaN         False   Male
freq               1             4            NaN           NaN           NaN             NaN          NaN          1121   1010
mean             NaN           NaN   90585.977000    573.946000  25308.503000       20.441000    56.680000           NaN    NaN
std              NaN           NaN   34487.874907    160.564945  14207.320147       11.777813    18.638033           NaN    NaN
min              NaN           NaN   30053.000000    300.000000   1022.000000        0.000000    10.000000           NaN    NaN
25%              NaN           NaN   61296.250000    433.000000  12748.750000       10.000000    45.000000           NaN    NaN
50%              NaN           NaN   90387.500000    576.000000  25661.500000       21.000000    55.000000           NaN    NaN
75%              NaN           NaN  120099.750000    715.000000  37380.500000       31.000000    70.000000           NaN    NaN
max              NaN           NaN  149964.000000    850.000000  49999.000000       40.000000   100.000000           NaN    NaN

2. Exploratory Data Analysis (EDA) & Bias Investigation¶

Now that we have our dataset and sensitive attribute, we'll perform an exploratory data analysis to look for initial signs of bias.

2.1. Overall Loan Approval¶

First, let's look at the overall distribution of the target variable, loan_approved. This will tell us if we are working with a balanced or imbalanced dataset.

In [5]:
plt.figure(figsize=(10, 6))
bars = sns.countplot(x="loan_approved", data=df)
for bar in bars.patches:
    height = bar.get_height() / len(df) * 100
    bars.annotate(
        f"{height:.2f}%",
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )
plt.title("Loan Approval Distribution")
plt.grid()
plt.show()
[Figure: Loan Approval Distribution]
In [6]:
plt.figure(figsize=(10, 6))
bars = sns.countplot(x="gender", hue="loan_approved", data=df)
for bar in bars.patches:
    height = bar.get_height() / len(df) * 100
    bars.annotate(
        f"{height:.2f}%",
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )
plt.title("Loan Approval by Gender")
plt.grid(axis='y')
plt.show()
[Figure: Loan Approval by Gender]
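
To put numbers on the gap in the plot above, here is a minimal sketch (using only the df already in memory) that computes the raw approval rate per detected gender and the ratio between the groups:

# Raw approval rate per detected gender, and the ratio between groups
raw_rates = df.groupby("gender")["loan_approved"].mean() * 100
print(raw_rates.round(2))
print(f"Female-to-Male approval-rate ratio: "
      f"{raw_rates['Female'] / raw_rates['Male']:.3f}")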

2.1 Observations¶

Overall Approval Balance: The initial bar chart shows the dataset is reasonably balanced between approved and denied applications (no extreme class imbalance). This reduces the need for techniques like resampling or class weighting at this stage.

Approval by Gender: While the approval rate difference between detected "Male" and "Female" groups is modest, the plot hints at a slightly higher proportion of approvals for the "Male" group. This early signal motivates a deeper fairness audit later, even though the gap is not large.

Name-Based Gender Limitations: The engineered gender attribute is derived from first names and may misclassify or oversimplify real gender identity. Therefore, any observed disparity should be interpreted cautiously as an indicator rather than definitive proof of discriminatory behavior.

Next Step: We proceed to inspect feature distributions by gender to see whether financial attributes could serve as proxy variables, enabling models to indirectly learn gender-related patterns even after we exclude the explicit gender column.

2.2. Approval Rates and Feature Distributions by Gender¶

Next, we'll look at the approval rates split by our new gender feature. This will give us the first view of potential disparate impact.

We will also create boxplots for the key financial features (income, credit_score, etc.) to see if their distributions differ significantly between genders. If they do, it could indicate that these features might act as proxies for gender, even if we remove the gender column itself from training.

In [7]:
# Set up a figure for multiple plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Looking for distribution differences by Gender', fontsize=16)

# Plot 1: Income vs. Gender
sns.boxplot(ax=axes[0, 0], x='gender', y='income', data=df)
axes[0, 0].set_title('Income by Gender')
axes[0, 0].grid(axis='y')

# Plot 2: Credit Score vs. Gender
sns.boxplot(ax=axes[0, 1], x='gender', y='credit_score', data=df)
axes[0, 1].set_title('Credit Score by Gender')
axes[0, 1].grid(axis='y')

# Plot 3: Loan Amount vs. Gender
sns.boxplot(ax=axes[1, 0], x='gender', y='loan_amount', data=df)
axes[1, 0].set_title('Loan Amount by Gender')
axes[1, 0].grid(axis='y')

# Plot 4: Years Employed vs. Gender
sns.boxplot(ax=axes[1, 1], x='gender', y='years_employed', data=df)
axes[1, 1].set_title('Years Employed by Gender')
axes[1, 1].grid(axis='y')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
[Figure: 2x2 boxplots - Income, Credit Score, Loan Amount, and Years Employed by Gender]

2.2 Observations¶

Distribution Overlap: Income, credit score, and loan amount distributions substantially overlap across detected genders, suggesting the dataset doesn't encode stark financial separation between groups.

Subtle Median Shifts: Small median shifts (e.g., slightly higher median loan amount for one group) could still become leverage points for models—especially tree ensembles—that amplify minor proxy signals.

Low Years-Employed Influence: Years employed shows narrow spread and weak separation, likely a low-signal feature (later reflected by near-zero coefficient/importance).
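
To check whether these subtle shifts are statistically meaningful rather than sampling noise, one option is a nonparametric test per feature. A minimal sketch, assuming scipy is available in the environment:

from scipy.stats import mannwhitneyu

# Mann-Whitney U test: a small p-value suggests the feature's distribution
# differs between the detected-gender groups beyond chance
for col in ["income", "credit_score", "loan_amount", "years_employed"]:
    male = df.loc[df["gender"] == "Male", col]
    female = df.loc[df["gender"] == "Female", col]
    _, p_value = mannwhitneyu(male, female, alternative="two-sided")
    print(f"{col:>15}: p = {p_value:.4f}")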

2.3. City Feature Analysis¶

The city feature has very high cardinality (1882 unique cities for 2000 applicants), which makes it unsuitable for modeling as a categorical feature. While we can see approval and gender-rate differences between cities, these are likely due to small sample sizes (e.g., cities with only 1-2 applicants).

We will drop the city column before modeling.

In [8]:
# See the exact number of unique cities
city_count = df['city'].nunique()
print(f"Number of unique cities: {city_count}")

# See the top 10 most common cities
print("\nTop 10 most common cities:")
print(df['city'].value_counts().head(10))
Number of unique cities: 1882

Top 10 most common cities:
city
North Joseph        4
Williamsstad        3
West Elizabeth      3
Port Christopher    3
East Mary           3
Leeland             3
West Melissa        3
Christinachester    3
North David         3
Port Michael        3
Name: count, dtype: int64
In [9]:
# Calculate the mean approval rate for each city (True=1, False=0)
city_approval_rates = df.groupby('city')['loan_approved'].mean().sort_values(ascending=False)

print("\n--- City vs. Loan Approval ---")

print("\nTop 5 cities with HIGHEST approval rates:")
print(city_approval_rates.head(5))

print("\nTop 5 cities with LOWEST approval rates:")
print(city_approval_rates.tail(5))
--- City vs. Loan Approval ---

Top 5 cities with HIGHEST approval rates:
city
Aaronstad           1.0
New Josephland      1.0
New Lorraineview    1.0
New Kristy          1.0
New Kimberlyport    1.0
Name: loan_approved, dtype: float64

Top 5 cities with LOWEST approval rates:
city
East Patty           0.0
New Sarah            0.0
New Sandraborough    0.0
New Samanthabury     0.0
Nelsonside           0.0
Name: loan_approved, dtype: float64
In [10]:
# This shows the % of Male/Female in each city
city_gender_dist = pd.crosstab(df['city'], df['gender'], normalize='index')

# Sort by the 'Male' percentage to find the most skewed cities
city_gender_dist_sorted = city_gender_dist.sort_values(by='Male', ascending=False)

print("\n--- City vs. Gender ---")

print("\nTop 5 most 'Male' cities:")
print(city_gender_dist_sorted.head(5))

print("\nTop 5 most 'Female' cities:")
print(city_gender_dist_sorted.tail(5))
--- City vs. Gender ---

Top 5 most 'Male' cities:
gender             Female  Male
city                           
Aaronstad             0.0   1.0
New Nicholashaven     0.0   1.0
New Sarah             0.0   1.0
New Richard           0.0   1.0
New Reginald          0.0   1.0

Top 5 most 'Female' cities:
gender            Female  Male
city                          
Maryburgh            1.0   0.0
East Edwardhaven     1.0   0.0
Maryberg             1.0   0.0
Port Dwayneberg      1.0   0.0
Keithland            1.0   0.0

2.4. Correlation Analysis¶

Next, we examine how the numerical features correlate with one another and with the target, loan_approved. An unusually strong correlation with the target can flag either a powerful predictor or a feature that leaks the decision itself.

In [11]:
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
[Figure: Correlation Matrix of Numerical Features]
In [12]:
features_corr = corr_matrix['loan_approved'].drop('loan_approved').sort_values(ascending=False)
print("\nFeature correlations with Loan Approval:")
print(features_corr)
Feature correlations with Loan Approval:
points            0.821415
credit_score      0.715788
income            0.238066
years_employed    0.104408
loan_amount      -0.157859
Name: loan_approved, dtype: float64

2.4 Observations¶

High-Risk Leakage Feature: points shows very strong correlation with loan_approved (≈0.8). This magnitude suggests it may encode a post-decision or decision-adjacent scoring artifact rather than an applicant's pre-application attribute—justifying its removal to prevent leakage.

Strong and Moderate Predictors: credit_score shows a strong positive association with approvals (≈0.72) and income a moderate one (≈0.24), while loan_amount is weakly negative (≈-0.16). Their influence may be partially replicated in the engineered ratios introduced later (loan_to_points, dti_ratio).

Low-Signal Variables: years_employed contributes comparatively little explanatory power, foreshadowing limited model importance and offering an opportunity for feature pruning if parsimony is desired.
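
The leakage call above was made by eye; the same screen can be automated with a simple threshold rule. A sketch (the 0.75 cut-off is an arbitrary choice for illustration, not an established standard), reusing the features_corr series computed above:

# Flag features whose absolute correlation with the target is suspiciously
# high; these are candidates for manual leakage review, not automatic drops
LEAKAGE_THRESHOLD = 0.75  # arbitrary cut-off for this sketch
suspects = features_corr[features_corr.abs() > LEAKAGE_THRESHOLD]
print("Possible leakage features:")
print(suspects)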

Feature Engineering¶

While the base features are useful, we can create new features that better capture an applicant's financial situation. These "interaction" features often provide a stronger signal for the model.

We will create:

  1. dti_ratio (Debt-to-Income): This common financial metric (loan_amount / income) measures an applicant's ability to manage payments.
  2. loan_to_points: This feature combines the requested loan amount with the applicant's points. Note that since points is a suspected leakage feature (see Section 3), this ratio may carry a diluted version of the same signal, which is worth keeping in mind when interpreting feature importances later.

We will then check the new correlation matrix to see how these new features relate to loan_approved.

In [13]:
# Debt to Income Ratio
df['dti_ratio'] = df['loan_amount'] / (df['income'] + 1)

plt.figure(figsize=(10, 6))
sns.boxplot(x='gender', y='dti_ratio', data=df)
plt.title('Debt to Income Ratio by Gender')
plt.grid(axis='y')
plt.show()
[Figure: Debt to Income Ratio by Gender]
In [14]:
df['loan_to_points'] = df['loan_amount'] / (df['points'] + 1)

plt.figure(figsize=(10, 6))
sns.boxplot(x='gender', y='loan_to_points', data=df)
plt.title('Loan Amount to Points by Gender')
plt.grid(axis='y')
plt.show()
[Figure: Loan Amount to Points by Gender]
In [15]:
corr_mat = df.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(24, 8))
plt.subplot(1, 2, 1)
sns.heatmap(
    corr_mat,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
)
plt.title("Correlation Matrix after Feature Engineering")

features_corr = (
    corr_mat["loan_approved"].drop("loan_approved").sort_values(ascending=False)
)

plt.subplot(1, 2, 2)
sns.barplot(y=features_corr.index, x=features_corr.values)
plt.xlabel("Pearson Correlation Coefficient")
plt.ylabel("Features")
plt.title("Feature Correlations with 'Loan Approval'")
plt.grid(axis="y")
plt.show()
[Figure: Correlation Matrix after Feature Engineering (left) and Feature Correlations with 'Loan Approval' bar chart (right)]

3. Model Preparation¶

With our features engineered and explored, we now prepare the data for modeling.

Our process will be:

  1. Define Features (X) and Target (y): We'll drop non-predictive columns (name, city), the sensitive attribute (gender), and features that could cause data leakage (credit_score, points).
    • The points feature is highly correlated (0.82) with the target and is likely a score assigned during the approval process, so training on it would amount to leaking the decision into the features.
  2. Define Sensitive Attribute: We'll store the gender column separately to use after training for our fairness audit.
  3. Train-Test Split: We'll split the data into 80% training and 20% testing sets, using stratify=y to ensure the approval/denial ratio is the same in both sets.
  4. Scaling: We will use StandardScaler on our numerical features to ensure they are on the same scale, which is important for models like SVM and Logistic Regression.
In [16]:
X = df.drop(
    columns=[
        "loan_approved",
        "name",
        "city",
        "gender",
        "credit_score",
        "points",
    ]
)
X.columns
Out[16]:
Index(['income', 'loan_amount', 'years_employed', 'dti_ratio',
       'loan_to_points'],
      dtype='object')
In [17]:
sensitive_attribute = df['gender']
In [18]:
y = df['loan_approved'].astype(int)
y.value_counts()
Out[18]:
loan_approved
0    1121
1     879
Name: count, dtype: int64
In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
f"Training set size: {X_train.shape}, Test set size: {X_test.shape}"
Out[19]:
'Training set size: (1600, 5), Test set size: (400, 5)'
In [20]:
gender_test = sensitive_attribute.loc[y_test.index]

f"Test set gender distribution: {gender_test.value_counts(normalize=True) * 100}"
Out[20]:
'Test set gender distribution: gender\nMale      51.75\nFemale    48.25\nName: proportion, dtype: float64'
In [21]:
ct = ColumnTransformer(
    transformers=[
        (
            "num",
            StandardScaler(),
            ["income", "loan_amount", "years_employed", "dti_ratio", "loan_to_points"],
        ),
    ],
    remainder="passthrough",
)
X_train = pd.DataFrame(ct.fit_transform(X_train), columns=ct.get_feature_names_out())
X_test = pd.DataFrame(ct.transform(X_test), columns=ct.get_feature_names_out())
In [22]:
X_train.head()
Out[22]:
   num__income  num__loan_amount  num__years_employed  num__dti_ratio  num__loan_to_points
0    -0.173429          0.034781            -1.153971       -0.110775            -0.139129
1    -0.229602         -1.228299             1.400357       -0.915992            -1.067289
2     0.464640          0.362460             1.485501       -0.182121             0.209131
3    -0.761176         -0.845148             1.400357       -0.490703            -0.895565
4     1.061643         -1.236909            -0.302528       -1.045114            -1.118882

4. Model Training¶

We will train three different classification models to compare their performance and fairness. We are intentionally not including the gender feature in the training data (X_train).

  1. Support Vector Machine (SVM): A powerful model that finds an optimal hyperplane to separate classes.
  2. Random Forest Classifier: An ensemble model that is robust and good at capturing non-linear relationships.
  3. Logistic Regression: A simple, interpretable linear model that serves as a great baseline.
In [23]:
svm = SVC(random_state=42)

svm.fit(X_train, y_train)
Out[23]:
SVC(random_state=42)
In [24]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
Out[24]:
RandomForestClassifier(random_state=42)
In [25]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
Out[25]:
LogisticRegression(random_state=42)

Model Evaluation¶

In [26]:
y_pred_svm = svm.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_lr = log_reg.predict(X_test)


# classification reports
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       224
           1       0.92      0.97      0.94       176

    accuracy                           0.95       400
   macro avg       0.95      0.95      0.95       400
weighted avg       0.95      0.95      0.95       400

In [27]:
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       224
           1       0.97      0.97      0.97       176

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400

In [28]:
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       224
           1       0.97      1.00      0.99       176

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400

4.1. Performance Summary¶

Based on the classification reports, all three models perform exceptionally well on this dataset:

  • SVM: 95% Accuracy
  • Random Forest: 97% Accuracy
  • Logistic Regression: 99% Accuracy

The Logistic Regression model is the clear winner on pure performance. Now, we will investigate if this high accuracy comes at the cost of fairness.

5. Bias & Fairness Analysis¶

After evaluating overall performance, we now test the models for ethical bias. We will use the gender_test data (which the models did not see during training) to compare the predictions and errors across groups.

We will measure three types of fairness:

5.1. Disparate Impact (Statistical Parity)¶

This metric answers the question: "Does the model approve (predict 1) one group more often than the other?"

A common fairness threshold is the "80% Rule," where the approval rate for the disadvantaged group should be at least 80% of the rate for the advantaged group. We'll look for any significant gaps in approval proportions.
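
The four-fifths check itself is straightforward to compute from the predictions. A minimal sketch applied to all three models, reusing gender_test and the y_pred_* arrays from the cells above:

# Disparate impact ratio: minimum group approval rate divided by maximum;
# the "80% Rule" flags ratios below 0.8
def di_ratio(y_pred, groups):
    rates = pd.Series(y_pred, index=groups.index).groupby(groups).mean()
    return rates.min() / rates.max()

for model_name, pred in zip(["SVM", "Random Forest", "Logistic Regression"],
                            [y_pred_svm, y_pred_rf, y_pred_lr]):
    ratio = di_ratio(pred, gender_test)
    verdict = "passes" if ratio >= 0.8 else "fails"
    print(f"{model_name}: DI ratio = {ratio:.3f} ({verdict} the 80% rule)")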

5.2. Disparate Mistreatment (Accuracy)¶

This metric answers the question: "Is the model more accurate for one group than for another?"

A model that is 99% accurate for one group but only 90% accurate for another is engaging in disparate mistreatment, as it fails one group more often.
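
Beyond overall accuracy, disparate mistreatment is often framed in terms of per-group false positive and false negative rates (wrongly approving vs. wrongly denying). A minimal sketch of that finer-grained view, reusing the confusion_matrix import from the first cell and shown here for the Logistic Regression predictions:

def group_error_rates(y_true, y_pred, groups):
    # Per-group false positive rate (wrongly approved) and false
    # negative rate (wrongly denied) from each group's confusion matrix
    rows = []
    for g in groups.unique():
        mask = (groups == g).values
        tn, fp, fn, tp = confusion_matrix(y_true[mask], y_pred[mask]).ravel()
        rows.append({"group": g, "FPR": fp / (fp + tn), "FNR": fn / (fn + tp)})
    return pd.DataFrame(rows)

print(group_error_rates(y_test.values, y_pred_lr, gender_test))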

5.3. Disparate Treatment (Error Rate)¶

This is the complement of the accuracy metric above: it reports the Error Rate for each group, making it easy to see which group receives incorrect predictions more often. (In the fairness literature, "disparate treatment" usually means explicit use of the sensitive attribute in the decision; since gender is excluded from training here, this view is best read as another angle on disparate mistreatment.)

In [29]:
from IPython.display import Markdown

# disparate impact analysis
def disparate_impact(y_pred, feature='gender'):
    eval_df = pd.DataFrame(
        {
            feature: gender_test,
            "Prediction": map(lambda x: 'Approved' if x == 1 else 'Denied', y_pred),
        }
    )
    disparity = (
        eval_df.groupby([feature, "Prediction"]).size().unstack(fill_value=0)
    )
    disparity["Total"] = disparity.sum(axis=1)
    disparity["Proportion Approved"] = (disparity["Approved"] / disparity["Total"]) * 100
    disparity["Proportion Denied"] = (disparity["Denied"] / disparity["Total"]) * 100
    return disparity

for name, pred in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    [y_pred_svm, y_pred_rf, y_pred_lr],
):
    display(Markdown(f"### Disparate Impact Analysis for {name}:"))
    display(disparate_impact(pred))
    print("")

Disparate Impact Analysis for SVM:¶

Prediction  Approved  Denied  Total  Proportion Approved  Proportion Denied
gender
Female            86     107    193            44.559585          55.440415
Male              99     108    207            47.826087          52.173913

Disparate Impact Analysis for Random Forest:¶

Prediction  Approved  Denied  Total  Proportion Approved  Proportion Denied
gender
Female            82     111    193            42.487047          57.512953
Male              95     112    207            45.893720          54.106280

Disparate Impact Analysis for Logistic Regression:¶

Prediction  Approved  Denied  Total  Proportion Approved  Proportion Denied
gender
Female            86     107    193            44.559585          55.440415
Male              95     112    207            45.893720          54.106280
In [30]:
def disparity_mistreatment(y_pred, feature='gender'):
    eval_df = pd.DataFrame(
        {
            feature: gender_test,
            "Prediction": y_pred,
            "Actual": y_test,
        }
    )
    accuracy = (
        eval_df.groupby(feature)
        .apply(lambda x: accuracy_score(x["Actual"], x["Prediction"]),  include_groups=False)
        .rename("Accuracy")
        .reset_index()
    )
    accuracy["Accuracy"] = accuracy["Accuracy"] * 100
    return accuracy

for name, pred in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    [y_pred_svm, y_pred_rf, y_pred_lr],
):
    display(Markdown(f"### Disparate Mistreatment (Accuracy) Analysis for {name}:"))
    display(disparity_mistreatment(pred))
    print("")

Disparate Mistreatment (Accuracy) Analysis for SVM:¶

   gender   Accuracy
0  Female  93.782383
1    Male  95.652174

Disparate Mistreatment (Accuracy) Analysis for Random Forest:¶

   gender   Accuracy
0  Female  95.854922
1    Male  98.550725

Disparate Mistreatment (Accuracy) Analysis for Logistic Regression:¶

   gender   Accuracy
0  Female  97.927461
1    Male  99.516908
In [31]:
def disparity_treatment(y_pred, feature='gender'):
    eval_df = pd.DataFrame(
        {
            feature: gender_test,
            "Prediction": y_pred,
            "Actual": y_test,
        }
    )
    error_rate = (
        eval_df.groupby(feature)
        .apply(lambda x: (x["Actual"] != x["Prediction"]).mean(), include_groups=False)
        .rename("Error Rate")
        .reset_index()
    )
    return error_rate

for name, pred in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    [y_pred_svm, y_pred_rf, y_pred_lr],
):
    display(Markdown(f"### Disparate Treatment Analysis for {name}:"))
    display(disparity_treatment(pred))
    print("")

Disparate Treatment Analysis for SVM:¶

   gender  Error Rate
0  Female    0.062176
1    Male    0.043478

Disparate Treatment Analysis for Random Forest:¶

   gender  Error Rate
0  Female    0.041451
1    Male    0.014493

Disparate Treatment Analysis for Logistic Regression:¶

   gender  Error Rate
0  Female    0.020725
1    Male    0.004831

6. Feature Importance & Bias Explanation¶

Our fairness analysis showed that all models, especially the Random Forest, still had a slight bias. This is true even though we removed the gender column from the training data.

How is this possible? This bias is likely due to proxy features. These are features that are not explicitly gender but are correlated with it in a way the model can learn.

Let's analyze the "black box" of our best (Logistic Regression) and worst (Random Forest) models to see which features they used to make their decisions. This can help explain how they are producing biased outcomes.

In [32]:
log_reg_coefficients = pd.Series(log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False)
log_reg_coefficients.plot(kind='bar', figsize=(12, 6))
plt.title('Logistic Regression Coefficients')
plt.ylabel('Coefficient Value')
plt.grid(axis='y')
plt.show()
[Figure: Logistic Regression Coefficients]
In [33]:
rf_feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
rf_feature_importances.plot(kind='bar', figsize=(12, 6))
plt.title('Random Forest Feature Importances')
plt.ylabel('Importance Score')
plt.grid(axis='y')
plt.show()
[Figure: Random Forest Feature Importances]

6.1. Analysis of Feature Importance¶

The feature importance plots reveal a critical insight:

  • Random Forest (Least Fair): This model's top feature was num__loan_to_points (~0.44 importance), followed by num__loan_amount (~0.24).
  • Logistic Regression (Most Fair): This model also found num__loan_to_points to be the most powerful predictor (coefficient of ~-12.5), followed by num__loan_amount (coefficient of ~+6.8).

Both models heavily rely on the loan_to_points and loan_amount features. Our earlier boxplots of the engineered ratios (In [13] and In [14]) showed that their distributions also differ slightly between genders.

This suggests the model is learning the proxy bias present in these engineered features. The Random Forest, being a more complex model, likely learned this proxy bias more effectively, leading to its larger fairness gap. The simpler, linear Logistic Regression model was less able to overfit to this subtle bias, resulting in a fairer outcome.
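
One way to test the proxy hypothesis directly is to check how well the model's own features predict the sensitive attribute: if detected gender is recoverable at better-than-baseline accuracy, proxy information is present. A minimal sketch, cross-validated on the training set and reusing objects defined above:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# If the training features predict detected gender better than the
# majority-class baseline, they carry proxy information about it
gender_train = sensitive_attribute.loc[y_train.index]
is_male = (gender_train == "Male").astype(int).values

proxy_clf = LogisticRegression(max_iter=1000)
proxy_acc = cross_val_score(proxy_clf, X_train, is_male, cv=5).mean()
baseline = max(is_male.mean(), 1 - is_male.mean())
print(f"Gender recoverable from model features: {proxy_acc:.3f} "
      f"cross-validated accuracy (majority baseline: {baseline:.3f})")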

🏆 Final Conclusion¶

Based on this complete analysis, the Logistic Regression model is the clear winner for this business problem.

  • Performance: It achieved the highest overall accuracy (99%).
  • Fairness: It demonstrated the lowest bias across both fairness metrics, showing the smallest gap in both approval rates (Disparate Impact) and error rates (Disparate Mistreatment).

While 99% accurate overall, it is not perfectly fair. The 1.59 percentage-point error-rate gap (2.07% for 'Female' vs. 0.48% for 'Male' applicants) indicates it is still slightly less reliable for applicants detected as female. Further work could involve bias mitigation techniques, such as re-weighting the training samples (sketched below) or adopting a fairness-aware algorithm.
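
As a starting point for that mitigation work, here is a minimal sketch of the reweighing idea (in the spirit of Kamiran & Calders): each (gender, label) cell is weighted so that the sensitive attribute and the outcome look statistically independent during training, with the weights passed through sklearn's standard sample_weight argument. Retraining and re-auditing with these weights would then follow; this is an illustrative sketch, not a complete mitigation pipeline.

from sklearn.linear_model import LogisticRegression

# Align the detected gender labels with the training rows
gender_train = sensitive_attribute.loc[y_train.index].reset_index(drop=True)
y_tr = y_train.reset_index(drop=True)

# w(g, l) = P(g) * P(l) / P(g, l): up-weights cells that are rarer than
# they would be if gender and outcome were independent
p_group = gender_train.value_counts(normalize=True)
p_label = y_tr.value_counts(normalize=True)
p_joint = pd.crosstab(gender_train, y_tr, normalize=True)
weights = [p_group[g] * p_label[l] / p_joint.loc[g, l]
           for g, l in zip(gender_train, y_tr)]

# Retrain the winning model with the fairness-motivated weights
log_reg_rw = LogisticRegression(random_state=42)
log_reg_rw.fit(X_train, y_tr, sample_weight=weights)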