MACHINE LEARNING PROJECT¶

TITLE: Attrition Unmasked - Why Employees Leave¶

Employee Attrition

An Interesting Quote I Found:¶

"Managers tend to blame their turnover problems on everything under the sun, while ignoring the crux of the matter: people don't leave jobs; they leave managers."
— Travis Bradberry

What is Attrition and What Determines It?¶

Attrition: the rate at which employees leave an organization over a given period, i.e., the organization's employee turnover rate.

This can happen for many reasons:¶

  • Employees looking for better opportunities.
  • A negative working environment.
  • Bad management.
  • Sickness of an employee (or even death).
  • Excessive working hours.
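Whatever the cause, the attrition rate itself is a simple ratio: employees who left divided by total headcount over the period. A minimal sketch (the function name is illustrative; the figures match this dataset's 237 leavers out of 1470 employees):

```python
def attrition_rate(leavers: int, headcount: int) -> float:
    """Fraction of employees who left out of the total headcount."""
    if headcount <= 0:
        raise ValueError("headcount must be positive")
    return leavers / headcount

# 237 leavers out of 1470 employees, as in this dataset
rate = attrition_rate(237, 1470)
print(f"{rate:.1%}")  # → 16.1%
```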

Structure of the Project¶

This project will be structured in the following way:

  • Questions: Questions will be asked prior to the visualization to ensure that the visualizations shown in this project are insightful.
  • Summary: After each section, a summary will be provided to understand what we learned from the visualizations.
  • Recommendations: Suggestions will be made to the organization to help reduce the attrition rate.

Table of Contents¶

I. General Information¶

  • Summary of our Data
  • Distribution of our Labels

II. Gender Analysis¶

  • Age Distribution by Gender
  • Job Satisfaction Distribution by Gender
  • Monthly Income by Gender
  • Presence by Department

III. Analysis by Education¶

  • Understanding Attrition by Education

IV. The Impact of Income Towards Attrition¶

  • Average Income by Department
  • Determining Satisfaction by Income
  • Income and the Levels of Attrition
  • Level of Attrition by Overtime

V. Working Environment¶

  • Average Environment Satisfaction

VI. Other Factors¶

  • Other Factors that Could Influence Attrition

VII. Feature Engineering¶

  • Mapping Categorical Values to Numerical Values for Correlation Matrix
  • Dropping all Object-dtype Columns for the Correlation Matrix
  • Plotting the Correlation Matrix
  • Checking the fields correlated to attrition

VIII. Data Preprocessing¶

  • Defining Features and Target Variable for Model Training
  • Splitting the Data into Training and Testing Sets
  • Balancing the Training Data using SMOTE
  • Calculating Class Weights

IX. Analysis and Models¶

  • Defining the Models for Training
  • Training and Evaluating the Models
  • Model Evaluation Results
  • Confusion Matrices

X. Fine-Tuning¶

  • Fine-tuning Random Forest Model with GridSearchCV
  • Evaluating the Performance of the Fine-Tuned Random Forest Model
  • Visualizing Feature Importances of the Fine-Tuned Random Forest Model

XI. Conclusion¶

  • Top Reasons Why Employees Leave the Organization

Importing libraries¶

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import sklearn
In [2]:
import plotly.io as pio
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)
pio.renderers.keys()
# pio.renderers.default = 'svg'
pio.renderers.default = 'iframe_connected'
In [3]:
df = pd.read_csv("./Data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()
Out[3]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

Summary of our Data¶

Before we get into the deeper visualizations, let's get a sense of what our data looks like.

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [5]:
df.describe()
Out[5]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 ... 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 ... 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 ... 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 ... 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 ... 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 ... 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 ... 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 ... 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000

8 rows × 26 columns

In [6]:
print(df.columns)
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
In [7]:
df.dtypes
Out[7]:
Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object
In [8]:
df.isnull().sum()
Out[8]:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
In [9]:
df.isna().sum()  # isna() is an alias of isnull(), so this mirrors the previous check
Out[9]:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

Summary¶

  • Dataset Structure: 1470 observations (rows), 35 features (variables).
  • Missing Data: Luckily, there is no missing data! This will make it easier to work with the dataset.
  • Data Type: We only have two data types in this dataset: strings and integers.
  • Label: Attrition is the label in our dataset, and we would like to find out why employees are leaving the organization!
  • Imbalanced Dataset: 1233 employees (83.9% of cases) did not leave the organization, while 237 (16.1% of cases) did.
    • This makes our dataset imbalanced, since far more people stay in the organization than leave.

Distribution of our Labels¶

An important aspect of this dataset, which we address further below, is class imbalance: 83.9% of employees did not quit the organization, while 16.1% did.

Knowing that we are dealing with an imbalanced dataset will help us determine the best approach to implement our predictive model.
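One precaution this imbalance calls for is stratifying the train/test split, so both sets keep the same 84/16 label ratio (the preprocessing section later also applies SMOTE and class weights). A sketch on synthetic data with the same class ratio, not the real DataFrame:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy stand-in for the real data: 84% "No", 16% "Yes" (synthetic, for illustration)
toy = pd.DataFrame({
    "MonthlyIncome": range(100),
    "Attrition": ["No"] * 84 + ["Yes"] * 16,
})

X = toy[["MonthlyIncome"]]
y = toy["Attrition"]

# stratify=y keeps the Yes/No ratio identical in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print(y_train.value_counts(normalize=True))  # ~0.84 No / ~0.16 Yes
```

Without `stratify`, a random split on a small minority class can leave the test set with too few "Yes" cases to evaluate reliably.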

In [10]:
attrition_counts = df["Attrition"].value_counts()

# go.Pie normalizes its values automatically, so raw counts can be passed directly
fig = go.Figure(
    data=[
        go.Pie(
            labels=attrition_counts.index,
            values=attrition_counts.values,
        ),
    ]
)
fig.show()

Gender Analysis¶

We will try to see if there are any discrepancies between males and females in the organization. Also, we will look at other basic information such as age, level of job satisfaction, and average salary by gender.

Questions to ask Ourselves:¶

  • What is the age distribution between males and females? Are there any significant discrepancies?
  • What is the average job satisfaction? Is one gender more dissatisfied than the other?
  • What is the average salary by gender? How many employees of each gender work in each department?

Age Distribution by Gender¶

In [11]:
average_age_by_gender = df.groupby("Gender")["Age"].mean()

print("\nAverage age by Gender:")
print("================================")
print(average_age_by_gender)
Average age by Gender:
================================
Gender
Female    37.329932
Male      36.653061
Name: Age, dtype: float64
In [12]:
df.groupby("Gender")["Age"].describe()
Out[12]:
count mean std min 25% 50% 75% max
Gender
Female 588.0 37.329932 9.266083 18.0 31.0 36.0 44.0 60.0
Male 882.0 36.653061 9.042329 18.0 30.0 35.0 42.0 60.0
In [13]:
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=("Female Employees", "Male Employees", "Overall Employees"),
)

# Female employees
female_df = df[df["Gender"] == "Female"]
female_hist = px.histogram(female_df, x="Age")
female_hist.update_traces(showlegend=True)
mean_age_female = female_df["Age"].mean()

# Male employees
male_df = df[df["Gender"] == "Male"]
male_hist = px.histogram(male_df, x="Age")
male_hist.update_traces(showlegend=True)
mean_age_male = male_df["Age"].mean()

# Overall employees
overall_hist = px.histogram(df, x="Age")
overall_hist.update_traces(showlegend=True)
mean_age_overall = df["Age"].mean()

subplot_specs = [
    (female_hist, mean_age_female, 1, 1),
    (male_hist, mean_age_male, 1, 2),
    (overall_hist, mean_age_overall, 2, 1),
]

for hist, mean_age, row, col in subplot_specs:
    for trace in hist.data:
        fig.add_trace(trace, row=row, col=col)
    # Axis titles and the mean-age marker only need to be set once per subplot,
    # not once per trace
    fig.update_xaxes(title_text="Age", row=row, col=col)
    fig.update_yaxes(title_text="Count", row=row, col=col)
    fig.add_vline(
        x=mean_age,
        line_width=2,
        line_dash="dash",
        line_color="red",
        annotation_text="Mean Age",
        annotation_position="top right",
        row=row,
        col=col,
    )

fig.update_layout(title_text="Age Distribution of Employees", height=800)

fig.show()

Job Satisfaction Distribution by Gender¶

In [14]:
job_satisfaction_by_gender = df.groupby("Gender")["JobSatisfaction"].mean()

print("\nMean Job Satisfaction by Gender:")
print("================================")
print(job_satisfaction_by_gender)
Mean Job Satisfaction by Gender:
================================
Gender
Female    2.683673
Male      2.758503
Name: JobSatisfaction, dtype: float64
In [15]:
df.groupby("Gender")["JobSatisfaction"].describe()
Out[15]:
count mean std min 25% 50% 75% max
Gender
Female 588.0 2.683673 1.096038 1.0 2.0 3.0 4.0 4.0
Male 882.0 2.758503 1.106970 1.0 2.0 3.0 4.0 4.0
In [16]:
job_satisfaction_pct = pd.crosstab(df['Gender'], df['JobSatisfaction'], normalize='index') * 100

fig = go.Figure()

for level in job_satisfaction_pct.columns:
    fig.add_trace(go.Bar(
        name=f'Level {level}',
        x=job_satisfaction_pct.index,
        y=job_satisfaction_pct[level],
        text=job_satisfaction_pct[level].round(1).astype(str) + '%',
        textposition='inside'
    ))

fig.update_layout(
    barmode='stack',
    title='Job Satisfaction Distribution by Gender (%)',
    xaxis_title="Gender",
    yaxis_title="Percentage (%)",
    showlegend=True,
    legend_title="Job Satisfaction Level"
)

fig.show()

Monthly Income by Gender¶

In [17]:
average_salary_by_gender = df.groupby("Gender")["MonthlyIncome"].mean()

print("\nAverage Salary by Gender:")
print("================================")
display(average_salary_by_gender)
Average Salary by Gender:
================================
Gender
Female    6686.566327
Male      6380.507937
Name: MonthlyIncome, dtype: float64
In [18]:
fig = px.strip(
    df,
    x="Gender",
    y="MonthlyIncome",
    title="Monthly Income by Gender",
    hover_data=["MonthlyIncome", "JobSatisfaction"],
    color="Gender",
)
fig.show()

Presence by Department¶

In [19]:
grouped_df = (
    df.groupby(["Gender", "Department"])["Department"]
    .count()
    .reset_index(name="Count")
)

print("\nEmployee Count by Gender and Department:")
print("=======================================")
display(grouped_df)

print("\nTotal Employees per Department:")
print("============================")
dept_totals = grouped_df.groupby("Department")["Count"].sum()
display(dept_totals)
Employee Count by Gender and Department:
=======================================
Gender Department Count
0 Female Human Resources 20
1 Female Research & Development 379
2 Female Sales 189
3 Male Human Resources 43
4 Male Research & Development 582
5 Male Sales 257
Total Employees per Department:
============================
Department
Human Resources            63
Research & Development    961
Sales                     446
Name: Count, dtype: int64
In [20]:
department_counts = df["Department"].value_counts().reset_index()
department_counts.columns = ["Department", "Count"]

fig = px.bar_polar(
    department_counts,
    r="Count",
    theta="Department",
    color="Department",
    title="Total Employees per Department",
)

fig.show()

Summary:¶

  • Age by Gender: The average age of females is 37.33 and of males 36.65, and both distributions are similar.
  • Job Satisfaction by Gender: Females report slightly lower average job satisfaction (2.68) than males (2.76).
  • Salaries: The average salaries for both genders are practically the same, with males averaging 6,380.51 and females 6,686.57.
  • Departments: Males outnumber females in all three departments; for both genders, Research & Development is by far the largest department.

Analysis by Education and Attrition by Level of Education¶

In [21]:
df["Education"].value_counts()
Out[21]:
Education
3    572
4    398
2    282
1    170
5     48
Name: count, dtype: int64
In [22]:
df["EducationLevel"] = df["Education"].map(
    {1: "School", 2: "College", 3: "Bachelor", 4: "Master", 5: "PhD"}
)

education_percentages = df["EducationLevel"].value_counts().reset_index()
education_percentages.columns = ["EducationLevel", "Count"]

attrition_percentages = df.groupby(["EducationLevel", "Attrition"]).size().reset_index(name="Count")
attrition_percentages["Percentage"] = attrition_percentages.groupby("EducationLevel")["Count"].transform(
    lambda x: (x / x.sum()) * 100)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Education Level Distribution - Percentage",
        "Attrition Percentage by Education Level"
    ),
    specs=[[{"type": "pie"}], [{"type": "xy"}]],
    vertical_spacing=0.2
)

fig.add_trace(
    go.Pie(
        labels=education_percentages["EducationLevel"],
        values=education_percentages["Count"],
        textinfo="label+percent",
        hoverinfo="label+percent",
        texttemplate="%{label}: %{percent}",
        insidetextorientation="radial"
    ),
    row=1, col=1
)
fig.update_traces(
    textposition="inside"
)

fig_bar_percent = px.bar(
    attrition_percentages,
    x="EducationLevel",
    y="Percentage",
    color="Attrition",
    barmode="group",
    text=attrition_percentages["Percentage"].round(1)
)
fig_bar_percent.update_traces(
    textposition="outside",
    texttemplate="%{text}%"
)
for trace in fig_bar_percent.data:
    fig.add_trace(trace, row=2, col=1)  

fig.update_layout(
    height=900, 
    width=900,
    title_text="Education Level Analysis",
    showlegend=True
)

fig.update_xaxes(title_text="Education Level", row=2, col=1)
fig.update_yaxes(title_text="Percentage (%)", row=2, col=1)

fig.show()

Summary:¶

Attrition by Level of Education: Education appears to play a meaningful role in employee attrition. Employees with higher education levels may have greater career mobility and are more likely to seek opportunities that align with their qualifications. In this dataset, employees with a bachelor's degree account for the largest number of leavers, consistent with the common observation that younger, degree-holding workers change jobs more frequently.

  • Bachelor’s Degree: Employees with a bachelor's degree account for the largest share of leavers, possibly due to the higher career expectations and job opportunities available at this level of education.
  • Master’s Degree: Employees with a master's degree tend to have slightly lower attrition, indicating they may be more likely to stay in positions that align with their higher qualifications, or may have greater job security.
  • PhD or Doctorate: Employees with a PhD or doctorate tend to have the lowest turnover rates, possibly because these individuals often occupy specialized roles and may have fewer career options or may be more committed to their long-term career path.

This suggests that organizations could focus on retaining employees with a bachelor's degree by offering more growth opportunities or adjusting compensation to meet their expectations.
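When comparing education levels, the attrition rate per level (row-normalized) is a fairer yardstick than the raw count of leavers, since group sizes differ widely. A minimal sketch on made-up rows (`toy` is a hypothetical stand-in; the real notebook would pass `df` with its `EducationLevel` column instead):

```python
import pandas as pd

# Synthetic example rows (not the real dataset) to show the rate calculation
toy = pd.DataFrame({
    "EducationLevel": ["Bachelor"] * 5 + ["Master"] * 5,
    "Attrition":      ["Yes", "No", "No", "No", "No",
                       "No", "No", "No", "No", "Yes"],
})

# normalize="index" makes each row sum to 100%: the "Yes" column is the
# attrition *rate* per level, independent of how large each level is
rates = pd.crosstab(toy["EducationLevel"], toy["Attrition"], normalize="index") * 100
print(rates)
```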

The Impact of Income towards Attrition¶

I wonder how much importance each employee gives to the income they earn in the organization. Here, we will find out if it is true that money is really everything!

Questions to Ask Ourselves¶

  • What is the average monthly income by department? Are there any significant differences between individuals who quit and didn't quit?
  • Are there significant changes in the level of income by Job Satisfaction? Are individuals with a lower satisfaction getting much less income than the ones who are more satisfied?
  • Do employees who quit the organization have a much lower income than people who didn't quit the organization?
  • Do employees with a higher performance rating earn more than those with a lower performance rating? Is the difference significant by Attrition status?

Average Income by Department and Attrition Status¶

In [23]:
# Determine the average monthly income by department
average_income_by_department = (
    df.groupby("Department")["MonthlyIncome"].mean().reset_index(name="MeanIncome")
)

# Determine the average monthly income by department and attrition status
average_income_by_department_attrition = (
    df.groupby(["Department", "Attrition"])["MonthlyIncome"]
    .mean()
    .reset_index(name="MeanIncome")
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Average Monthly Income by Department",
        "Average Monthly Income by Department and Attrition Status",
    ),
)

fig1 = px.bar(
    average_income_by_department,
    x="Department",
    y="MeanIncome",
    title="Average Monthly Income by Department",
    color="Department",
)

# Each group's share of the summed group means (used only as bar-label text;
# note this is a share of averages, not of total payroll)
average_income_by_department_attrition["AttritionPercentage"] = (
    average_income_by_department_attrition.groupby("Department")[
        "MeanIncome"
    ].transform(lambda x: (x / x.sum()) * 100)
)

fig2 = px.bar(
    average_income_by_department_attrition,
    x="Department",
    y="MeanIncome",
    color="Attrition",
    barmode="group",
    title="Average Monthly Income by Department and Attrition Status",
    text="AttritionPercentage",
)
fig2.update_traces(
    texttemplate="%{text:.2f}%",
    textposition="outside",
)

for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)

fig.update_layout(height=800, title_text="Average Monthly Income Analysis")

fig.show()
In [24]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Determine the average monthly income by department
average_income_by_department = (
    df.groupby("Department")["MonthlyIncome"].mean().reset_index(name="MeanIncome")
)

# Calculate percentage of total income for each department
total_income = average_income_by_department["MeanIncome"].sum()
average_income_by_department["Percentage"] = (
    average_income_by_department["MeanIncome"] / total_income
) * 100

# Determine the average monthly income by department and attrition status
average_income_by_department_attrition = (
    df.groupby(["Department", "Attrition"])["MonthlyIncome"]
    .mean()
    .reset_index(name="MeanIncome")
)

# Calculate the percentage distribution of salary within each department by attrition status
dept_totals = average_income_by_department_attrition.groupby("Department")["MeanIncome"].sum().reset_index()
dept_totals.columns = ["Department", "TotalDeptIncome"]

# Merge the department totals back
average_income_by_department_attrition = average_income_by_department_attrition.merge(
    dept_totals, on="Department", how="left"
)

# Calculate the percentage of each attrition group within its department
average_income_by_department_attrition["AttritionIncomePercentage"] = (
    average_income_by_department_attrition["MeanIncome"] / 
    average_income_by_department_attrition["TotalDeptIncome"]
) * 100

# Create subplots
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        "Average Monthly Income by Department",
        "Department Income Distribution (%)",
        "Average Monthly Income by Department and Attrition",
    ),
    specs=[[{"type": "xy"}, {"type": "pie"}], [{"type": "xy", "colspan": 2}, None]],
)

# Top Left: Bar chart of average income by department
fig1 = px.bar(
    average_income_by_department,
    x="Department",
    y="MeanIncome",
    color="Department",
    text=average_income_by_department["MeanIncome"].round(0),
)
fig1.update_traces(textposition="outside", texttemplate="%{text:,.0f}")

# Top Right: Pie chart of income distribution by department
fig_pie = px.pie(
    average_income_by_department,
    names="Department",
    values="MeanIncome",
    title="Department Income Distribution",
    hover_data=["Percentage"],
    labels={"MeanIncome": "Average Income"},
)
fig_pie.update_traces(
    texttemplate="%{label}: %{percent}",
    textposition="inside",
)

# Bottom Left: Bar chart of income by department and attrition
fig2 = px.bar(
    average_income_by_department_attrition,
    x="Department",
    y="MeanIncome",
    color="Attrition",
    barmode="group",
    text=average_income_by_department_attrition["MeanIncome"].round(0),
)
fig2.update_traces(textposition="outside", texttemplate="%{text:,.0f}")

# Add all traces to the subplots
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig_pie.data:
    fig.add_trace(trace, row=1, col=2)

for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)

# Update layout
fig.update_layout(height=1000, width=1200, title_text="Monthly Income Analysis")

# Update axes titles
fig.update_xaxes(title_text="Department", row=1, col=1)
fig.update_yaxes(title_text="Average Monthly Income ($)", row=1, col=1)
fig.update_xaxes(title_text="Department", row=2, col=1)
fig.update_yaxes(title_text="Average Monthly Income ($)", row=2, col=1)

fig.show()

Determining Satisfaction by Income¶

In [25]:
fig = px.box(
    df,
    x="Attrition",
    y="MonthlyIncome",
    color="JobSatisfaction",
    title="Distribution of Monthly Income by Job Satisfaction and Attrition",
)
fig.show()

Income and its Impact on Attrition¶

In [26]:
# Group by Attrition and calculate average MonthlyIncome
df_grouped = df.groupby("Attrition", as_index=False)["MonthlyIncome"].mean()
df_grouped['MonthlyIncomePercentage'] = (df_grouped['MonthlyIncome'] / df_grouped['MonthlyIncome'].sum()) * 100

fig = px.bar(
    df_grouped,
    x="Attrition",
    y="MonthlyIncome",
    title="Income and its Impact on Attrition",
    labels={"MonthlyIncome": "Average Monthly Income"},
    color="Attrition",
    text="MonthlyIncomePercentage"
)

fig.update_traces(
    textposition="inside",
    texttemplate="%{text:.2f}%",
)

fig.show()

Level of Attrition by Overtime Status¶

In [27]:
# Count of employees by OverTime and Attrition
df_grouped = df.groupby(["OverTime", "Attrition"], as_index=False).size()
df_grouped['EmployeePercentage'] = (df_grouped['size'] / df_grouped['size'].sum()) * 100

fig = px.bar(
    df_grouped,
    x="OverTime",
    y="size",
    color="Attrition",
    title="Attrition Count by Overtime Status",
    labels={"size": "Number of Employees", "OverTime": "Overtime Status"},
    barmode="group",
    text="EmployeePercentage"
)

fig.update_traces(
    textposition="inside",
    texttemplate="%{text:.2f}%",
)

fig.show()

Summary:¶

  • Income by Departments: Wow! We can see huge differences in each department by attrition status.
  • Income by Job Satisfaction: Hmm. It seems the lower the job satisfaction, the wider the gap by attrition status in the levels of income.
  • Attrition Sample Population: I would say that most of this sample population has had a salary increase of less than 15% and a monthly income of less than 7,000.
  • Exhaustion at Work: Over 54% of workers who left the organization worked overtime! Could this be a reason why employees are leaving?
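The overtime observation can be checked directly by asking what fraction of leavers worked overtime. A small sketch on synthetic rows (swap in the real `df` to reproduce the 54% figure):

```python
import pandas as pd

# Synthetic stand-in rows; pass the real df instead to get the actual figures
toy = pd.DataFrame({
    "OverTime":  ["Yes", "Yes", "Yes", "No", "No", "No", "No", "No"],
    "Attrition": ["Yes", "Yes", "No",  "No", "No", "Yes", "No", "No"],
})

# Among employees who left, what share worked overtime?
leavers = toy[toy["Attrition"] == "Yes"]
share_overtime = (leavers["OverTime"] == "Yes").mean()
print(f"{share_overtime:.0%} of leavers worked overtime")
```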

Working Environment¶

In this section, we will explore the working environment of the organization.

Question to ask Ourselves¶

  • Working Environment by Job Role: What's the working environment by job role?
In [28]:
df.WorkLifeBalance.value_counts()
Out[28]:
WorkLifeBalance
3    893
2    344
4    153
1     80
Name: count, dtype: int64
In [29]:
df["WorkLifeBalanceLevel"] = df.WorkLifeBalance.map(
    {1: "Bad", 2: "Good", 3: "Better", 4: "Best"}
)
df["WorkLifeBalanceLevel"].value_counts()
Out[29]:
WorkLifeBalanceLevel
Better    893
Good      344
Best      153
Bad        80
Name: count, dtype: int64
In [30]:
# Group by WorkLifeBalance and Attrition
df_grouped = (
    df.groupby(["WorkLifeBalanceLevel", "WorkLifeBalance", "Attrition"], as_index=False)
    .size()
    .sort_values(by="WorkLifeBalance")
)
df_grouped["sizePercentage"] = (
    df_grouped.groupby("WorkLifeBalanceLevel")["size"]
    .transform(lambda x: (x / x.sum()) * 100)
)

fig = px.bar(
    df_grouped,
    x="WorkLifeBalanceLevel",
    y="size",
    color="Attrition",
    title="Is there a Work Life Balance Environment?",
    labels={
        "size": "Number of Employees",
        "WorkLifeBalanceLevel": "Work-Life Balance Rating",
    },
    barmode="group",
    text="sizePercentage",
)

fig.update_traces(
    textposition="outside",
    texttemplate="%{text:.2f}%",
)

fig.show()

Summary:¶

  • Employees rating their work-life balance as "Bad" have the highest attrition rate: 25 of 80 (about 31%) left.
  • Employees rating it "Better" have the lowest attrition rate: 127 of 893 (about 14%) left, even though this group contributes the largest raw number of leavers.
  • The "Good" and "Best" groups sit in between, at roughly 17% (58 of 344) and 18% (27 of 153) respectively.
  • In short, a better work-life balance correlates with a lower rate of employees leaving; raw counts alone are misleading here because the group sizes differ so much.
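Because the four rating groups differ so much in size, comparing attrition as a within-group rate rather than a raw count is the safer reading; a minimal sketch on toy data (not the project's df):

```python
import pandas as pd

# Toy stand-in for the real dataset; the numbers are illustrative only
toy = pd.DataFrame({
    "WorkLifeBalanceLevel": ["Bad"] * 4 + ["Better"] * 10,
    "Attrition": ["Yes", "Yes", "No", "No"] + ["Yes"] * 3 + ["No"] * 7,
})

# Share of leavers within each group, as a percentage
rates = (
    toy.groupby("WorkLifeBalanceLevel")["Attrition"]
    .apply(lambda s: (s == "Yes").mean() * 100)
)
# "Better" has more leavers in absolute terms (3 vs 2),
# yet a lower attrition rate (30% vs 50%)
```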

Other Factors that could Influence Attrition¶

In this section, we will analyze other external factors that could influence individuals leaving the organization. One such factor is:

  • Home Distance from Work

Question to Ask Ourselves:¶

  • Distance from Work: Is distance from work a huge factor in terms of quitting the organization?
In [31]:
# Group by DistanceFromHome and Attrition, and count employees
df_grouped = df.groupby(["DistanceFromHome", "Attrition"], as_index=False).size()

fig = px.bar(
    df_grouped,
    x="DistanceFromHome",
    y="size",
    color="Attrition",
    title="Attrition Rate by Distance from Home",
    labels={"size": "Number of Employees"},
)

fig.show()
In [32]:
fig = px.histogram(
    df,
    x="JobLevel",
    color="Attrition",
    title="Attrition Count by Job Level",
    labels={"JobLevel": "Job Level"},
    barmode="group",
)

fig.show()
In [33]:
job_attrition = pd.crosstab(df['JobLevel'], df['Attrition'], normalize='index') * 100
job_attrition = job_attrition.reset_index()
job_attrition = pd.melt(job_attrition, id_vars=['JobLevel'], var_name='Attrition', value_name='Percentage')

fig = px.bar(
    job_attrition,
    x="JobLevel",
    y="Percentage",
    color="Attrition",
    title="Attrition Rate by Job Level",
    labels={"JobLevel": "Job Level", "Percentage": "Percentage (%)"},
    barmode="group",
    text="Percentage"
)

fig.update_traces(
    texttemplate='%{text:.1f}%',
    textposition='outside'
)

fig.update_layout(
    yaxis_title="Percentage (%)",
    yaxis=dict(range=[0, 100]), 
    legend_title="Attrition Status",
    bargap=0.2
)

fig.show()

Summary:¶

  • Attrition Rate by Distance from Home:

    • In raw counts, most leavers live close to the workplace, largely because most employees live close to the workplace.
    • The rate of leaving does not fall with distance; in fact, distance from home correlates slightly positively with attrition.
  • Attrition Rate by Job Level:

    • Junior (Level 1) employees have the highest attrition rate by a wide margin.
    • Attrition drops steadily at higher job levels, suggesting that career growth may be a bigger factor for junior employees.
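One way to untangle "more leavers live nearby" from "nearby employees leave more often" is to bin the commute and look at the rate inside each bin; a sketch on toy data (hypothetical distances, not the project's df):

```python
import pandas as pd

toy = pd.DataFrame({
    "DistanceFromHome": [1, 2, 3, 5, 8, 12, 15, 20, 25, 28],
    "Attrition": ["No", "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "No"],
})

# Bucket the commute, then compute the leaver share per bucket
toy["DistanceBand"] = pd.cut(
    toy["DistanceFromHome"], bins=[0, 5, 15, 30], labels=["Near", "Mid", "Far"]
)
rates = toy.groupby("DistanceBand", observed=True)["Attrition"].apply(
    lambda s: (s == "Yes").mean()
)
# Rates can rise with distance even while raw leaver counts fall,
# because most employees live in the "Near" band
```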

Feature Engineering¶

In [34]:
df.dtypes
Out[34]:
Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
EducationLevel              object
WorkLifeBalanceLevel        object
dtype: object
In [35]:
df.Department.value_counts()
Out[35]:
Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

Mapping Categorical Values to Numerical Values for Correlation Matrix¶

In [36]:
df['DepartmentValue'] = df['Department'].map({'Sales': 1, 'Research & Development': 2, 'Human Resources': 3})
In [37]:
df['GenderValue'] = df.Gender.map({'Male': True, 'Female': False})
In [38]:
df.select_dtypes(include=["object"]).columns
Out[38]:
Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18', 'OverTime', 'EducationLevel',
       'WorkLifeBalanceLevel'],
      dtype='object')
In [39]:
df.MaritalStatus.value_counts()
Out[39]:
MaritalStatus
Married     673
Single      470
Divorced    327
Name: count, dtype: int64
In [40]:
df.select_dtypes(exclude=["object"]).columns
Out[40]:
Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'DepartmentValue',
       'GenderValue'],
      dtype='object')
In [41]:
df.BusinessTravel.value_counts()
Out[41]:
BusinessTravel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: count, dtype: int64
In [42]:
df['BusinessTravelValue'] = df.BusinessTravel.map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})
In [43]:
df['AttritionValue'] = df['Attrition'].map({'Yes': True, 'No': False})
In [44]:
df['OverTimeValue'] = df['OverTime'].map({'Yes': True, 'No': False})
In [45]:
df.EducationLevel.value_counts()
Out[45]:
EducationLevel
Bachelor    572
Master      398
College     282
School      170
PhD          48
Name: count, dtype: int64
In [46]:
df.StandardHours.value_counts()
Out[46]:
StandardHours
80    1470
Name: count, dtype: int64

Dropping all of the object-dtype Columns for the Correlation Matrix¶

In [47]:
correlation = df.drop(
    [
        "EmployeeCount",
        "EmployeeNumber",
        "Over18",
        "HourlyRate",
        "MaritalStatus",
        "Attrition",
        "EducationField",
        "Department",
        "Gender",
        'OverTime',
        'EducationLevel',
        'WorkLifeBalanceLevel',
        "JobRole",
        "BusinessTravel",
        'StandardHours'
    ], axis='columns'
).corr()
correlation
Out[47]:
Age DailyRate DistanceFromHome Education EnvironmentSatisfaction JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate ... WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager DepartmentValue GenderValue BusinessTravelValue AttritionValue OverTimeValue
Age 1.000000 0.010661 -0.001686 0.208034 0.010146 0.029820 0.509604 -0.004892 0.497855 0.028051 ... -0.021490 0.311309 0.212901 0.216513 0.202089 0.031882 -0.036311 -0.011807 -0.159205 0.028062
DailyRate 0.010661 1.000000 -0.004985 -0.016806 0.018355 0.046135 0.002966 0.030571 0.007707 -0.032182 ... -0.037848 -0.034055 0.009932 -0.033229 -0.026363 -0.007109 -0.011716 -0.015539 -0.056652 0.009135
DistanceFromHome -0.001686 -0.004985 1.000000 0.021042 -0.016075 0.008783 0.005303 -0.003669 -0.017014 0.027473 ... -0.026556 0.009508 0.018845 0.010029 0.014406 -0.017225 -0.001851 -0.009696 0.077924 0.025514
Education 0.208034 -0.016806 0.021042 1.000000 -0.027128 0.042438 0.101589 -0.011296 0.094961 -0.026084 ... 0.009819 0.069114 0.060236 0.054254 0.069065 -0.007996 -0.016547 -0.008670 -0.031373 -0.020322
EnvironmentSatisfaction 0.010146 0.018355 -0.016075 -0.027128 1.000000 -0.008278 0.001212 -0.006784 -0.006259 0.037600 ... 0.027627 0.001458 0.018007 0.016194 -0.004999 0.019395 0.000508 -0.011310 -0.103369 0.070132
JobInvolvement 0.029820 0.046135 0.008783 0.042438 -0.008278 1.000000 -0.012630 -0.021476 -0.015271 -0.016322 ... -0.014617 -0.021355 0.008717 -0.024184 0.025976 0.024586 0.017960 0.029300 -0.130016 -0.003507
JobLevel 0.509604 0.002966 0.005303 0.101589 0.001212 -0.012630 1.000000 -0.001944 0.950300 0.039563 ... 0.037818 0.534739 0.389447 0.353885 0.375281 -0.101963 -0.039403 -0.011696 -0.169105 0.000544
JobSatisfaction -0.004892 0.030571 -0.003669 -0.011296 -0.006784 -0.021476 -0.001944 1.000000 -0.007157 0.000644 ... -0.019459 -0.003803 -0.002305 -0.018214 -0.027656 -0.021001 0.033252 0.008666 -0.103481 0.024539
MonthlyIncome 0.497855 0.007707 -0.017014 0.094961 -0.006259 -0.015271 0.950300 -0.007157 1.000000 0.034814 ... 0.030683 0.514285 0.363818 0.344978 0.344079 -0.053130 -0.031858 -0.013450 -0.159840 0.006089
MonthlyRate 0.028051 -0.032182 0.027473 -0.026084 0.037600 -0.016322 0.039563 0.000644 0.034814 1.000000 ... 0.007963 -0.023655 -0.012815 0.001567 -0.036746 -0.023642 -0.041482 -0.008440 0.015170 0.021431
NumCompaniesWorked 0.299635 0.038153 -0.029251 0.126317 0.012594 0.015012 0.142501 -0.055699 0.149515 0.017521 ... -0.008366 -0.118421 -0.090754 -0.036814 -0.110319 0.035882 -0.039147 -0.030743 0.043494 -0.020786
PercentSalaryHike 0.003634 0.022704 0.040235 -0.011111 -0.031701 -0.017205 -0.034730 0.020002 -0.027269 -0.006429 ... -0.003280 -0.035991 -0.001520 -0.022154 -0.011985 0.007840 0.002733 -0.025727 -0.013478 -0.005433
PerformanceRating 0.001904 0.000473 0.027110 -0.024539 -0.029548 -0.029071 -0.021222 0.002297 -0.017120 -0.009811 ... 0.002572 0.003435 0.034986 0.017896 0.022827 0.024604 -0.013859 0.001683 0.002889 0.004369
RelationshipSatisfaction 0.053535 0.007846 0.006557 -0.009118 0.007665 0.034297 0.021642 -0.012454 0.025873 -0.004085 ... 0.019604 0.019367 -0.015123 0.033493 -0.000867 0.022414 0.022868 0.008926 -0.045872 0.048493
StockOptionLevel 0.037510 0.042143 0.044872 0.018422 0.003432 0.021523 0.013984 0.010690 0.005408 -0.034323 ... 0.004129 0.015058 0.050818 0.014352 0.024698 0.012193 0.012716 -0.028257 -0.137145 -0.000449
TotalWorkingYears 0.680381 0.014515 0.004628 0.148280 -0.002693 -0.005533 0.782208 -0.020185 0.772893 0.026442 ... 0.001008 0.628133 0.460365 0.404858 0.459188 0.015762 -0.046881 0.007972 -0.171063 0.012754
TrainingTimesLastYear -0.019621 0.002453 -0.036942 -0.025100 -0.019359 -0.015338 -0.018191 -0.005779 -0.021736 0.001467 ... 0.028072 0.003569 -0.005738 -0.002067 -0.004096 -0.036875 -0.038787 0.016357 -0.059478 -0.079113
WorkLifeBalance -0.021490 -0.037848 -0.026556 0.009819 0.027627 -0.014617 0.037818 -0.019459 0.030683 0.007963 ... 1.000000 0.012089 0.049856 0.008941 0.002759 -0.026383 -0.002753 0.004209 -0.063939 -0.027092
YearsAtCompany 0.311309 -0.034055 0.009508 0.069114 0.001458 -0.021355 0.534739 -0.003803 0.514285 -0.023655 ... 0.012089 1.000000 0.758754 0.618409 0.769212 -0.022920 -0.029747 0.005212 -0.134392 -0.011687
YearsInCurrentRole 0.212901 0.009932 0.018845 0.060236 0.018007 0.008717 0.389447 -0.002305 0.363818 -0.012815 ... 0.049856 0.758754 1.000000 0.548056 0.714365 -0.056315 -0.041483 -0.005336 -0.160545 -0.029758
YearsSinceLastPromotion 0.216513 -0.033229 0.010029 0.054254 0.016194 -0.024184 0.353885 -0.018214 0.344978 0.001567 ... 0.008941 0.618409 0.548056 1.000000 0.510224 -0.040061 -0.026985 0.005222 -0.033019 -0.012239
YearsWithCurrManager 0.202089 -0.026363 0.014406 0.069065 -0.004999 0.025976 0.375281 -0.027656 0.344079 -0.036746 ... 0.002759 0.769212 0.714365 0.510224 1.000000 -0.034282 -0.030599 -0.000229 -0.156199 -0.041586
DepartmentValue 0.031882 -0.007109 -0.017225 -0.007996 0.019395 0.024586 -0.101963 -0.021001 -0.053130 -0.023642 ... -0.026383 -0.022920 -0.056315 -0.040061 -0.034282 1.000000 0.041583 0.002640 -0.063991 -0.007481
GenderValue -0.036311 -0.011716 -0.001851 -0.016547 0.000508 0.017960 -0.039403 0.033252 -0.031858 -0.041482 ... -0.002753 -0.029747 -0.041483 -0.026985 -0.030599 0.041583 1.000000 -0.044896 0.029453 -0.041924
BusinessTravelValue -0.011807 -0.015539 -0.009696 -0.008670 -0.011310 0.029300 -0.011696 0.008666 -0.013450 -0.008440 ... 0.004209 0.005212 -0.005336 0.005222 -0.000229 0.002640 -0.044896 1.000000 0.127006 0.042752
AttritionValue -0.159205 -0.056652 0.077924 -0.031373 -0.103369 -0.130016 -0.169105 -0.103481 -0.159840 0.015170 ... -0.063939 -0.134392 -0.160545 -0.033019 -0.156199 -0.063991 0.029453 0.127006 1.000000 0.246118
OverTimeValue 0.028062 0.009135 0.025514 -0.020322 0.070132 -0.003507 0.000544 0.024539 0.006089 0.021431 ... -0.027092 -0.011687 -0.029758 -0.012239 -0.041586 -0.007481 -0.041924 0.042752 0.246118 1.000000

27 rows × 27 columns

Plotting the Correlation Matrix¶

In [48]:
fig = px.imshow(
    correlation,
    color_continuous_scale="Viridis",
    title="Correlation Heatmap",
)
fig.show()

Summary:¶

  • Employees with more total working years tend to have a higher monthly income.
  • A larger salary increase percentage is typically associated with a higher performance rating.
  • Employees who have been with their current manager for a longer period generally have more years since their last promotion.
  • Older employees generally earn a higher monthly income.

Checking the fields correlated to attrition¶

These are the features we will use to predict the Attrition value¶

In [49]:
correlation['AttritionValue'].sort_values(ascending=False).drop('AttritionValue')
Out[49]:
OverTimeValue               0.246118
BusinessTravelValue         0.127006
DistanceFromHome            0.077924
NumCompaniesWorked          0.043494
GenderValue                 0.029453
MonthlyRate                 0.015170
PerformanceRating           0.002889
PercentSalaryHike          -0.013478
Education                  -0.031373
YearsSinceLastPromotion    -0.033019
RelationshipSatisfaction   -0.045872
DailyRate                  -0.056652
TrainingTimesLastYear      -0.059478
WorkLifeBalance            -0.063939
DepartmentValue            -0.063991
EnvironmentSatisfaction    -0.103369
JobSatisfaction            -0.103481
JobInvolvement             -0.130016
YearsAtCompany             -0.134392
StockOptionLevel           -0.137145
YearsWithCurrManager       -0.156199
Age                        -0.159205
MonthlyIncome              -0.159840
YearsInCurrentRole         -0.160545
JobLevel                   -0.169105
TotalWorkingYears          -0.171063
Name: AttritionValue, dtype: float64
In [50]:
features = correlation['AttritionValue'].sort_values(ascending=False).drop('AttritionValue')

fig = px.bar(
    x=features.index,
    y=features.values,
    title="Feature Correlation with Attrition",
    labels={"x": "Feature", "y": "Correlation"},
    color=features.values,
)

for i in range(len(features)):
    fig.add_annotation(
        x=features.index[i],
        y=features.values[i],
        text=f"{features.values[i]:.2f}",
        yshift=10 if features.values[i] > 0 else -10,
        showarrow=False,
    )

fig.show()
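This project keeps every feature, but if we wanted a shorter list, a simple heuristic is an absolute-correlation cutoff; a sketch using a few of the values from the output above (the 0.1 threshold is an arbitrary choice):

```python
import pandas as pd

# A few of the attrition correlations computed above
features = pd.Series({
    "OverTimeValue": 0.246118,
    "DistanceFromHome": 0.077924,
    "PerformanceRating": 0.002889,
    "TotalWorkingYears": -0.171063,
})

# Keep only features whose correlation magnitude clears the cutoff
selected = features[features.abs() > 0.1].index.tolist()
print(selected)  # ['OverTimeValue', 'TotalWorkingYears']
```

Note that a correlation filter only catches linear, one-variable relationships; the models below can still exploit weaker features in combination, which is why the notebook keeps them all.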

Data Preprocessing¶

Importing the Libraries¶

In [51]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

Defining Features and Target Variable for Model Training¶

In [52]:
training_features = [
    "OverTimeValue",
    "BusinessTravelValue",
    "DistanceFromHome",
    "NumCompaniesWorked",
    "GenderValue",
    "MonthlyRate",
    "PerformanceRating",
    "PercentSalaryHike",
    "Education",
    "YearsSinceLastPromotion",
    "RelationshipSatisfaction",
    "DailyRate",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "DepartmentValue",
    "EnvironmentSatisfaction",
    "JobSatisfaction",
    "JobInvolvement",
    "YearsAtCompany",
    "StockOptionLevel",
    "YearsWithCurrManager",
    "Age",
    "MonthlyIncome",
    "YearsInCurrentRole",
    "JobLevel",
    "TotalWorkingYears",
]
X = df[training_features]
y = df["Attrition"]

f"X.shape: {X.shape}, y.shape: {y.shape}"
Out[52]:
'X.shape: (1470, 26), y.shape: (1470,)'

Splitting the Data into Training and Testing Sets¶

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=500)
f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}"
Out[53]:
'X_train.shape: (1176, 26), X_test.shape: (294, 26), y_train.shape: (1176,), y_test.shape: (294,)'

Balancing the Training Data using SMOTE¶

We applied SMOTE (Synthetic Minority Over-sampling Technique) because the dataset is imbalanced, with far fewer employees who left the company (Attrition = Yes) than employees who stayed.
SMOTE generates synthetic samples for the minority class, helping to balance the training set and improve the model's ability to predict both classes accurately.
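Under the hood, SMOTE builds each synthetic sample by interpolating between a minority-class point and one of its k nearest minority neighbours; a hand-rolled sketch of that single step (illustrative only, not imblearn's actual implementation):

```python
import numpy as np

rng = np.random.default_rng(0)

# Two minority-class ("Attrition = Yes") samples that are nearest neighbours
x_i = np.array([2.0, 10.0])
x_nn = np.array([4.0, 14.0])

# SMOTE draws lam ~ U(0, 1) and places the synthetic point
# on the line segment between the two samples
lam = rng.uniform(0.0, 1.0)
x_new = x_i + lam * (x_nn - x_i)

# The synthetic sample lies between the two originals in every dimension
assert np.all(x_new >= x_i) and np.all(x_new <= x_nn)
```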

In [54]:
smote = SMOTE(random_state=69)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
f"X_train_balanced.shape: {X_train_balanced.shape}, y_train_balanced.shape: {y_train_balanced.shape}"
Out[54]:
'X_train_balanced.shape: (1974, 26), y_train_balanced.shape: (1974,)'
In [55]:
print("After SMOTE:")
display(pd.Series(y_train_balanced).value_counts())
After SMOTE:
Attrition
Yes    987
No     987
Name: count, dtype: int64

Preprocessing the Data and Calculating Class Weights¶

In [56]:
# calculate class weights (for reference; the pipelines below balance classes with SMOTE instead)
class_weights = dict(zip(
    [0, 1],
    compute_class_weight('balanced', classes=np.unique(y), y=y_train_balanced)
))

# create pipeline for preprocessing: scale numeric columns, one-hot encode any object columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X.select_dtypes(exclude=["object"]).columns),
        ("cat", OneHotEncoder(drop='first'), X.select_dtypes(include=["object"]).columns),
    ]
).set_output(transform='pandas')

# encode the target once, reusing the same encoder for train and test ('No' -> 0, 'Yes' -> 1)
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

Defining the Models for Training¶

In [57]:
models = {
    "Logistic Regression": LogisticRegression(random_state=100),
    "Random Forest": RandomForestClassifier(random_state=100),
    "Gradient Boosting": GradientBoostingClassifier(random_state=100),
    "SVM": SVC(random_state=100),
    "Decision Tree": DecisionTreeClassifier(random_state=100),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}

Training and Evaluating the Models¶

In [58]:
results = {}
confusion_matrix_results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("sampler", SMOTE(random_state=69)),
        ("classifier", model),
    ]).set_output(transform='pandas')
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }
    confusion_matrix_results[name] = confusion_matrix(y_test, y_pred)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print("=====================================")
Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.93      0.80      0.86       246
           1       0.40      0.69      0.51        48

    accuracy                           0.78       294
   macro avg       0.67      0.74      0.68       294
weighted avg       0.84      0.78      0.80       294

=====================================
Model: Random Forest
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       246
           1       0.65      0.42      0.51        48

    accuracy                           0.87       294
   macro avg       0.77      0.69      0.71       294
weighted avg       0.85      0.87      0.86       294

=====================================
Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       246
           1       0.61      0.40      0.48        48

    accuracy                           0.86       294
   macro avg       0.75      0.67      0.70       294
weighted avg       0.84      0.86      0.85       294

=====================================
Model: SVM
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       246
           1       0.53      0.52      0.53        48

    accuracy                           0.85       294
   macro avg       0.72      0.72      0.72       294
weighted avg       0.85      0.85      0.85       294

=====================================
Model: Decision Tree
              precision    recall  f1-score   support

           0       0.87      0.80      0.84       246
           1       0.27      0.38      0.32        48

    accuracy                           0.73       294
   macro avg       0.57      0.59      0.58       294
weighted avg       0.77      0.73      0.75       294

=====================================
Model: KNN
              precision    recall  f1-score   support

           0       0.92      0.68      0.78       246
           1       0.30      0.71      0.42        48

    accuracy                           0.68       294
   macro avg       0.61      0.69      0.60       294
weighted avg       0.82      0.68      0.72       294

=====================================
Model: Naive Bayes
              precision    recall  f1-score   support

           0       0.91      0.55      0.68       246
           1       0.23      0.71      0.35        48

    accuracy                           0.57       294
   macro avg       0.57      0.63      0.52       294
weighted avg       0.80      0.57      0.63       294

=====================================

Model Evaluation Results¶

In [59]:
for model, result in results.items():
    print(f"Model: {model}")
    for metric, value in result.items():
        print(f"{metric}: {value}")
    print("\n")
Model: Logistic Regression
Recall: 0.6875
Precision: 0.4024390243902439
F1 Score: 0.5076923076923077


Model: Random Forest
Recall: 0.4166666666666667
Precision: 0.6451612903225806
F1 Score: 0.5063291139240507


Model: Gradient Boosting
Recall: 0.3958333333333333
Precision: 0.6129032258064516
F1 Score: 0.4810126582278481


Model: SVM
Recall: 0.5208333333333334
Precision: 0.5319148936170213
F1 Score: 0.5263157894736842


Model: Decision Tree
Recall: 0.375
Precision: 0.2727272727272727
F1 Score: 0.3157894736842105


Model: KNN
Recall: 0.7083333333333334
Precision: 0.3008849557522124
F1 Score: 0.422360248447205


Model: Naive Bayes
Recall: 0.7083333333333334
Precision: 0.23448275862068965
F1 Score: 0.35233160621761656


In [60]:
results_df = pd.DataFrame(results).T.reset_index(names="Model")

fig = px.bar(
    results_df,
    x="Model",
    y=["Recall", "Precision", "F1 Score"],
    title="Evaluation Metrics by Model",
    barmode="group",
    text_auto=True,
)

fig.show(renderer="vscode")

Model Evaluation Summary¶

  • Logistic Regression showed a strong ability to identify positives but struggled with false positives, leading to a relatively low precision. This indicates it can detect most positive cases but also misclassifies a significant number of negative cases as positives.

  • Random Forest delivered the best precision of the group, but its recall was modest: it missed more than half of the employees who actually left. Overall performance was solid, with a clear bias toward avoiding false positives.

  • Gradient Boosting demonstrated similar characteristics to Random Forest, offering strong precision but a comparable weakness on false negatives.

  • SVM struck the most even balance, with recall and precision both just over 0.5. It neither misses as many leavers as the tree ensembles nor produces as many false positives as KNN.

  • Decision Tree performed worst overall, with both low recall (0.38) and low precision (0.27): it misses most leavers while also misclassifying many stayers as leavers.

  • KNN excelled at identifying positives, resulting in a high recall. However, it struggled with precision, capturing a large number of false positives. This resulted in a less favorable overall balance between precision and recall.

  • Naive Bayes also showed high recall, similar to KNN, but had the lowest precision of all models (0.23), meaning it flagged a large share of stayers as leavers.

Conclusion:¶

The models varied in their performance, with some prioritizing recall and others focusing more on precision. Models like KNN and Naive Bayes were able to capture a large number of positives but also misclassified a lot of negatives. Random Forest and Gradient Boosting offered a better balance between recall and precision, though there is still potential to improve performance by reducing false positives and increasing recall.

The confusion matrices for each model are summarized below:¶

In [61]:
for model, cm in confusion_matrix_results.items():
    print(f"Confusion Matrix for {model}:")
    print(cm)
    print("\n")
Confusion Matrix for Logistic Regression:
[[197  49]
 [ 15  33]]


Confusion Matrix for Random Forest:
[[235  11]
 [ 28  20]]


Confusion Matrix for Gradient Boosting:
[[234  12]
 [ 29  19]]


Confusion Matrix for SVM:
[[224  22]
 [ 23  25]]


Confusion Matrix for Decision Tree:
[[198  48]
 [ 30  18]]


Confusion Matrix for KNN:
[[167  79]
 [ 14  34]]


Confusion Matrix for Naive Bayes:
[[135 111]
 [ 14  34]]


In [62]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=4, subplot_titles=list(confusion_matrix_results.keys()))

for i, (model, cm) in enumerate(confusion_matrix_results.items()):
    fig.add_trace(
        go.Heatmap(
            z=cm,
            x=["No", "Yes"],
            y=["No", "Yes"],
            colorscale="Viridis",
            showscale=False,
        ),
        row=(i // 4) + 1,
        col=(i % 4) + 1,
    )

fig.update_layout(title="Confusion Matrix for all Models", height=800)

Confusion Matrix Summary¶

  • Logistic Regression has a relatively high number of false positives, suggesting that while it is good at identifying positive instances, it also misclassifies a significant number of negative instances as positive.

  • Random Forest keeps false positives low (11) but misses more than half of the leavers (28 false negatives), trading recall for precision.

  • Gradient Boosting performs much like Random Forest, with very few false positives (12) but a similar number of missed leavers (29 false negatives).

  • SVM is the most balanced model here, with nearly equal false positives (22) and false negatives (23), correctly identifying about half of the employees who left.

  • Decision Tree struggles on both fronts, with 48 false positives and 30 false negatives, and it recovers the fewest true positives (18) of any model.

  • KNN captures most of the leavers (34 true positives, only 14 false negatives), but at the cost of 79 false positives; its false positive count far exceeds its false negatives.

  • Naive Bayes exhibits a pattern similar to KNN, with a higher number of false positives and fewer false negatives. This suggests that it can identify positive instances well but has trouble avoiding false positives.

Conclusion:¶

Overall, the models vary in how they trade false positives against false negatives. Random Forest and Gradient Boosting offer the most conservative balance, while KNN and Naive Bayes catch more leavers but produce large numbers of false positives that would need further refinement.
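The recall and precision figures reported earlier follow directly from these matrices; a quick sanity check against the Logistic Regression matrix (rows are actual No/Yes, columns are predicted No/Yes):

```python
import numpy as np

cm = np.array([[197, 49],   # actual No:  197 true negatives, 49 false positives
               [ 15, 33]])  # actual Yes: 15 false negatives, 33 true positives

tn, fp, fn, tp = cm.ravel()
recall = tp / (tp + fn)      # 33 / 48
precision = tp / (tp + fp)   # 33 / 82
print(round(recall, 4), round(precision, 4))  # 0.6875 0.4024
```

These match the Recall (0.6875) and Precision (0.4024) reported for Logistic Regression in the evaluation results.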

Since the Random Forest model provided the most balanced results, correctly classifying nearly all negatives while still capturing some positives, I am proceeding with fine-tuning the RandomForestClassifier to further improve its performance and enhance its prediction capabilities.¶

Fine-tuning Random Forest Model with GridSearchCV¶

In [63]:
rf_model = RandomForestClassifier(random_state=100)

# GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 15, 20],
}

rf_model_tuned = GridSearchCV(
    estimator=rf_model,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
tuning_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("sampler", SMOTE(random_state=69)),
    ("classifier", rf_model_tuned),
]).set_output(transform='pandas').fit(X_train, y_train)

y_pred_tuned = tuning_pipeline.predict(X_test)

print(classification_report(y_test, y_pred_tuned))
              precision    recall  f1-score   support

           0       0.89      0.96      0.93       246
           1       0.69      0.42      0.52        48

    accuracy                           0.87       294
   macro avg       0.79      0.69      0.72       294
weighted avg       0.86      0.87      0.86       294

Evaluating the Performance of the Fine-Tuned Random Forest Model¶

In [64]:
evaluation_results = pd.DataFrame({
    "Recall": [recall_score(y_test, y_pred), recall_score(y_test, y_pred_tuned)],
    "Precision": [precision_score(y_test, y_pred), precision_score(y_test, y_pred_tuned)],
    "F1 Score": [f1_score(y_test, y_pred), f1_score(y_test, y_pred_tuned)],
}, index=["Random Forest Original", "Random Forest Tuned"])


fig = px.bar(
    evaluation_results,
    x=evaluation_results.index,
    y=evaluation_results.columns,
    title="Model Evaluation",
    labels={"value": "Score", "index": "Model"},
    barmode="group",
    text_auto=True,
)

fig.show()
In [65]:
improvement = evaluation_results.diff().iloc[1]

fig = px.bar(
    improvement,
    x=improvement.index,
    y=improvement.values,
    title="Improvement in Model Performance",
    labels={"y": "Improvement", "index": "Metric"},
    color=improvement.values,
)

for i in range(len(improvement)):
    fig.add_annotation(
        x=improvement.index[i],
        y=improvement.values[i],
        text=f"{improvement.values[i]:.2f}",
        yshift=10 if improvement.values[i] > 0 else -10,
        showarrow=False,
    )

fig.show()
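The improvement row above comes from `.diff().iloc[1]`: `.diff()` subtracts each row from the one below it, and `.iloc[1]` selects the resulting delta row. A minimal sketch with made-up scores (not the notebook's actual numbers):

```python
import pandas as pd

# How the improvement row is derived: .diff() subtracts the first row
# from the second, and .iloc[1] selects that delta row.
scores = pd.DataFrame(
    {"Recall": [0.40, 0.42], "Precision": [0.60, 0.69]},
    index=["Random Forest Original", "Random Forest Tuned"],
)
delta = scores.diff().iloc[1]
print(delta.round(2).to_dict())  # {'Recall': 0.02, 'Precision': 0.09}
```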

Analysis of Updated Model Results¶

Random Forest (Best Overall)¶

  • Best F1 Score and Best Precision
  • Shows a good balance between precision and recall.
  • High performance in correctly classifying the negative class, but struggles a bit with the positive class.

Gradient Boosting (Second Best)¶

  • Similar performance to Random Forest with a slight variation in precision and recall.
  • Provides a balanced approach, similar to Random Forest.

High Recall but Low Precision:¶

  • Logistic Regression:
    • High recall but low precision, meaning it identifies many positive cases but incorrectly classifies many negatives as positives.

Balanced Performance:¶

  • SVM:
    • Shows a balanced F1 score with a more even distribution between precision and recall.

Poor Performance:¶

  • Naive Bayes and KNN:
    • Both show high recall but at the cost of low precision, resulting in many false positives.

Best Model Choice:¶

  • Random Forest or Gradient Boosting
  • Both provide a better balance between precision and recall, making them the best choices for this imbalanced dataset.

Visualizing Feature Importances of the Fine-Tuned Random Forest Model¶

In [66]:
f"Best parameters: {rf_model_tuned.best_params_}"
Out[66]:
"Best parameters: {'max_depth': 20, 'n_estimators': 200}"
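Beyond `best_params_`, a fitted `GridSearchCV` also exposes the winner's mean cross-validation score (`best_score_`) and the full per-candidate results table (`cv_results_`). A small self-contained sketch on synthetic data (the dataset and grid here are stand-ins, not the HR data):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Tiny synthetic stand-in for the HR data (hypothetical, for illustration only).
X_demo, y_demo = make_classification(n_samples=200, n_features=8, random_state=0)

grid = GridSearchCV(
    RandomForestClassifier(random_state=100),
    param_grid={"n_estimators": [50, 100], "max_depth": [5, 10]},
    cv=3,
    n_jobs=-1,
)
grid.fit(X_demo, y_demo)

# best_score_ is the mean CV accuracy of the best candidate;
# cv_results_ holds one entry per parameter combination (2 x 2 = 4 here).
print(grid.best_params_)
print(round(grid.best_score_, 3))
print(len(grid.cv_results_["mean_test_score"]))  # 4 candidates
```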
In [67]:
# Rebuild the post-encoding feature names: numeric columns pass through
# unchanged, and each categorical column expands to one column per category.
feature_names = list(X.select_dtypes(exclude=["object"]).columns)

for i, col in enumerate(X.select_dtypes(include=["object"]).columns):
    categories = preprocessor.named_transformers_["one_hot_encoder"].categories_[i]
    for category in categories:
        feature_names.append(f"{col}_{category}")

feature_importance = pd.DataFrame(
    {
        "Feature": feature_names,
        "Importance": rf_model_tuned.best_estimator_.feature_importances_,
    }
)

feature_importance = feature_importance.sort_values("Importance", ascending=False).reset_index(drop=True)

fig = px.bar(
    feature_importance,
    x="Importance",
    y="Feature",
    title="Feature Importance",
    color="Feature",
    labels={"Importance": "Importance", "Feature": "Feature"},
    text_auto=True,
    hover_data=["Feature", "Importance"],
)

fig.show()

Conclusion¶

Top Reasons Why Employees Leave the Organization¶

  • Overtime (11.82%):
    • The most significant factor in employee attrition.
    • Employees who work overtime are more likely to leave, suggesting potential burnout issues.
  • Stock Option Level (10.26%):
    • The second most important factor, indicating that equity compensation plays a crucial role in retention.
  • Job Level (7.20%):
    • Employees at certain job levels show higher attrition rates, possibly due to career advancement concerns.
  • Job Satisfaction (5.80%):
    • A significant predictor of attrition, highlighting the importance of employee engagement and job fulfillment.
  • Environment Satisfaction (4.67%):
    • Workplace environment and culture significantly impact employee retention decisions.
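Taken together, these five features account for roughly 40% of the model's total feature importance. A quick pure-Python check of the percentages quoted above:

```python
# Importances quoted above (in percent); summing shows the top five
# features together explain close to 40% of the model's importance.
top5 = {
    "OverTime": 11.82,
    "StockOptionLevel": 10.26,
    "JobLevel": 7.20,
    "JobSatisfaction": 5.80,
    "EnvironmentSatisfaction": 4.67,
}
print(round(sum(top5.values()), 2))  # → 39.75
```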

HR Strategic Recommendations:

  1. Review and optimize overtime policies to prevent employee burnout
  2. Improve stock option programs, especially for key positions
  3. Provide career advancement opportunities and clear career progression frameworks
  4. Implement regular job satisfaction surveys and improvement initiatives
  5. Focus on improving the workplace environment and culture
  6. Monitor years at company metrics (4.37%) and years in current role (4.09%) to proactively address potential attrition risks

Secondary Factors to Consider:

  • Age (4.08%)
  • Years with Current Manager (3.92%)
  • Monthly income (3.63%)
  • Total Working Years (3.63%)

Using these data-driven insights, organizations can develop targeted retention strategies to reduce employee churn.
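In practice, the fitted pipeline could be reused to score current employees and flag the highest attrition risks for HR follow-up. A hypothetical sketch on synthetic data (the threshold and dataset are illustrative assumptions, not the project's pipeline):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Hypothetical sketch: rank employees by predicted attrition probability
# so HR can focus retention efforts on the highest-risk group.
X_demo, y_demo = make_classification(n_samples=300, weights=[0.85], random_state=0)
model = RandomForestClassifier(random_state=100).fit(X_demo, y_demo)

risk = model.predict_proba(X_demo)[:, 1]   # P(attrition) per employee
high_risk = int((risk >= 0.5).sum())       # 0.5 cutoff is an assumption
print(high_risk, "employees flagged as high attrition risk")
```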