Loan Default Predictive Modeling

This project is my solution for Task 3 of the J.P. Morgan Quantitative Researcher job simulation on Forage. The task is to train a predictive model on a dataset of loan borrowers and determine the probability that a borrower will default. This probability will be used to calculate the expected loss on a loan. The models below were trained using pandas, numpy, and scikit-learn, with plotly used to visualize the dataset.
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from datetime import datetime
from tabulate import tabulate

This code block loads the dataset into a Pandas dataframe. The models will predict which category each borrower belongs to: 1 for those who default on their loan and 0 for those who do not. Therefore, I will convert the target variable (default) to a category datatype. The remaining columns are the feature values, the inputs for the predictive models. As customer_id is unique for each borrower, I expect it has little correlation with a borrower's likelihood to default. I will visualize the other features to understand how the model makes its predictions.

In [4]:
loan_df = pd.read_csv('Loan_Data.csv')
loan_df['default'] = loan_df['default'].astype('category')
loan_df.head()
Out[4]:
customer_id credit_lines_outstanding loan_amt_outstanding total_debt_outstanding income years_employed fico_score default
0 8153374 0 5221.545193 3915.471226 78039.38546 5 605 0
1 7442532 5 1958.928726 8228.752520 26648.43525 2 572 1
2 2256073 0 3363.009259 2027.830850 65866.71246 4 602 0
3 4885975 0 4766.648001 2501.730397 74356.88347 5 612 0
4 4700614 1 1345.827718 1768.826187 23448.32631 6 631 0

I will use Pandas' describe function to calculate the summary statistics for all numerical features (excluding customer_id).

In [6]:
loan_df.iloc[:, 1:].describe()
Out[6]:
credit_lines_outstanding loan_amt_outstanding total_debt_outstanding income years_employed fico_score
count 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000
mean 1.461200 4159.677034 8718.916797 70039.901401 4.552800 637.557700
std 1.743846 1421.399078 6627.164762 20072.214143 1.566862 60.657906
min 0.000000 46.783973 31.652732 1000.000000 0.000000 408.000000
25% 0.000000 3154.235371 4199.836020 56539.867903 3.000000 597.000000
50% 1.000000 4052.377228 6732.407217 70085.826330 5.000000 638.000000
75% 2.000000 5052.898103 11272.263740 83429.166133 6.000000 679.000000
max 5.000000 10750.677810 43688.784100 148412.180500 10.000000 850.000000

The credit_lines_outstanding feature measures how many lines of credit a borrower has opened and not repaid. On average, each borrower has 1-2 lines of credit, though some have as many as 5. Using a histogram, I visualized the percentage of borrowers with x lines of credit for each target class. More outstanding lines of credit clearly correlate with a greater probability of default: over 90% of borrowers with 4 or more credit lines outstanding defaulted on their loan.

In [8]:
fig = px.histogram(
    loan_df,
    x = 'credit_lines_outstanding',
    color = 'default',
    title = 'Default Percentages vs Total Credit Lines',
    barmode = 'group',
    histnorm = 'percent',
    template = 'plotly_white',
    category_orders = {'default': [0, 1]}
)

fig.show()
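The 90% figure can also be checked numerically with `pandas.crosstab` rather than read off the chart. A minimal sketch on synthetic stand-in data (the real `loan_df` would drop in unchanged):

```python
import pandas as pd

# Synthetic stand-in for loan_df (illustrative values, not the real dataset)
df = pd.DataFrame({
    'credit_lines_outstanding': [0, 0, 1, 2, 4, 4, 5],
    'default':                  [0, 0, 0, 1, 1, 1, 1],
})

# Rows are credit-line counts, columns are default classes; normalize='index'
# makes each row sum to 1, giving the default rate per credit-line count
rates = pd.crosstab(df['credit_lines_outstanding'], df['default'], normalize='index')
print(rates)
```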

The documentation does not clearly distinguish between loan_amt_outstanding and total_debt_outstanding, but my assumption is that loan_amt_outstanding refers to the unpaid principal on the loan, while total_debt_outstanding also includes additional fees and accrued interest.

The boxplots below compare outstanding loan amounts and total debt outstanding between borrowers who defaulted and those who repaid their loans. The medians and quartiles of the two classes' outstanding loan amounts are very similar, and the difference between the two distributions does not appear to be statistically significant. In contrast, the total debt outstanding distributions differ sharply; borrowers who defaulted tended to accumulate much more debt. Notably, the 25th percentile of debt among defaulters exceeds the 75th percentile among repayers, i.e. the top 75% of defaulters owed more than the bottom 75% of repayers.

In [10]:
#Creates 1 row of 2 subplots for Outstanding Loan Amount and Total Debt Outstanding
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Outstanding Loan Amount", "Total Debt Outstanding"),
)

#The two features to plot, one per subplot, and the marker colors for each target class
var = ['loan_amt_outstanding', 'total_debt_outstanding']
marker_colors = ['blue', 'red']

#This for-loop creates four boxplot traces on 2 subplots. One for each target class' loan_amt_outstanding and total_debt_outstanding features
#Target values are 0 (did not default) and 1 (defaulted on loan)
for target in range(2):
    #Tracks which subplot the current trace should be plotted on
    for col in range(2):
        fig.add_trace(
            go.Box(
                x = loan_df[loan_df['default'] == target]['default'],
                y = loan_df[loan_df['default'] == target][var[col]],
                marker_color = marker_colors[target]
            ), 
            row = 1, 
            col = col + 1
        )

#Applies the template to both plots
fig.update_layout(template = "plotly_white", showlegend = False)
fig.show()
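The quartile comparison behind that claim can be computed directly with a groupby rather than read off the boxplots. A sketch on synthetic numbers, where repayers cluster low and defaulters high:

```python
import pandas as pd

# Synthetic debts (illustrative): repayers cluster low, defaulters high
df = pd.DataFrame({
    'default': [0]*5 + [1]*5,
    'total_debt_outstanding': [3000, 4000, 5000, 6000, 7000,
                               15000, 17000, 19000, 21000, 25000],
})

# Quartiles per class; unstack puts the quantile levels in columns
q = df.groupby('default')['total_debt_outstanding'].quantile([0.25, 0.75]).unstack()

# "Top 75% of defaulters owe more than the bottom 75% of repayers" is
# equivalent to: defaulters' 25th percentile > repayers' 75th percentile
print(q.loc[1, 0.25] > q.loc[0, 0.75])
```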
In [11]:
fig = px.histogram(
    loan_df,
    x = 'total_debt_outstanding',
    color = 'default',
    title = "Loan Borrowers' Outstanding Debt by Default Group",
    barmode = 'overlay',
    histnorm = 'percent',
    template = 'plotly_white'
)

fig.show()

The income distributions for defaulters and repayers are very similar. Both appear approximately normal, with a mean around $70k. A borrower's debt relative to their income is likely more useful in predicting default risk.

In [13]:
fig = px.histogram(
    loan_df,
    x = 'income',
    color = 'default',
    title = 'Loan Borrower Income by Default Group',
    barmode = 'overlay',
    histnorm = 'percent',
    template = 'plotly_white'
)

fig.show()

While income has low correlation with probability of default, debt relative to income appears to be highly correlated. When a borrower's outstanding debt reaches about 20% of their income, they become much more likely to default. J.P. Morgan could use this information to predict a borrower's likelihood of default over time; a borrower whose debt approaches the 20%-of-income threshold should be flagged as a high default risk.

In [15]:
fig = go.Figure()

#Plots debt vs income scatterplot for borrowers who did not default (0)
fig.add_trace(go.Scatter(
    x = loan_df[loan_df['default'] == 0]['income'],
    y = loan_df[loan_df['default'] == 0]['total_debt_outstanding'],
    mode = 'markers',
    marker_color = 'blue',
    name = '0',
    opacity = 0.5,
    zorder = 0
))

#Plots debt vs income scatterplot for borrowers who did default (1)
fig.add_trace(go.Scatter(
    x = loan_df[loan_df['default'] == 1]['income'],
    y = loan_df[loan_df['default'] == 1]['total_debt_outstanding'],
    mode = 'markers',
    marker_color = 'red',
    name = '1',
    opacity = 0.5,
    zorder = 0
))

#Based on the data, I estimate the decision boundary is y=0.2x
#This code uses numpy to generate 20 points for that line
x_line = np.linspace(0, 150000, num = 20)
y_line = x_line * 0.20

#Plots the estimated decision boundary as a line
fig.add_trace(go.Scatter(
    x=x_line, 
    y=y_line, 
    mode='lines',
    name = 'y=0.2x',
    line=dict(color='black', width=2), 
    zorder=1
))

#Applies template and adds a title
fig.update_layout(template = "plotly_white", title = 'Outstanding Debt vs Income')
fig.show()
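The estimated y = 0.2x boundary can also be evaluated numerically: compute each borrower's debt-to-income ratio and compare default rates on either side of 0.2. A minimal sketch with synthetic rows (the real `loan_df` would drop in unchanged):

```python
import pandas as pd

# Synthetic borrowers (illustrative): debt-to-income above 0.2 flags risk
df = pd.DataFrame({
    'income':                 [50000, 60000, 80000, 40000, 70000, 55000],
    'total_debt_outstanding': [5000,  9000,  8000,  12000, 21000, 16500],
    'default':                [0,     0,     0,     1,     1,     1],
})

# Default rate on each side of the estimated y = 0.2x boundary
ratio = df['total_debt_outstanding'] / df['income']
above = df.loc[ratio > 0.2, 'default'].mean()
below = df.loc[ratio <= 0.2, 'default'].mean()
print(above, below)
```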

The value for years_employed ranges from 0 to 10, and the average value is roughly 4.5 years. To determine the impact of years_employed on default probability, I graphed the proportion of borrowers who defaulted by years employed. As years employed increases, this proportion decreases. I will create a pivot table to understand this trend.

In [17]:
employed_default_prop = loan_df.groupby('years_employed')['default'].value_counts(normalize = True).reset_index()

fig = px.bar(
    employed_default_prop,
    x = 'years_employed',
    y = 'proportion',
    color = 'default',
    title = 'Default Proportions by Years Employed',
    template = 'plotly_white',
    category_orders = {'default': [0, 1]}
)

fig.show()

There are two key trends in the pivot table when comparing the two groups. On average, borrowers who default have more credit lines outstanding and much more total debt outstanding. This confirms the insights from the other visualizations. Additionally, borrowers who repaid their loans had higher average fico_score values, but this trend is less reliable as years_employed increases.

In [19]:
value_cols = ['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'fico_score']

years_employed_default_df = pd.pivot_table(
    loan_df, 
    index = ['default', 'years_employed'], 
    values = value_cols, 
    observed = False
).reset_index()

years_employed_default_df.sort_values(by = ['years_employed', 'default'])
Out[19]:
default years_employed credit_lines_outstanding fico_score income loan_amt_outstanding total_debt_outstanding
0 0 0 0.500000 608.800000 63938.530390 4483.093431 6667.892335
11 1 0 3.636364 580.818182 69312.865828 4739.948908 17589.244442
1 0 1 0.535211 640.323944 71336.091945 4648.299663 6370.618752
12 1 1 4.107527 559.182796 71417.095009 4666.486285 19129.876586
2 0 2 0.286652 636.625821 69867.031199 4438.563836 5584.963443
13 1 2 4.406926 591.627706 71765.473616 4764.868233 19404.746082
3 0 3 0.323849 628.968201 69483.620639 4300.498560 5833.240296
14 1 3 4.628521 593.862676 70230.702943 4540.397929 19220.221603
4 0 4 0.522919 642.849819 69817.025081 4207.398307 6036.797102
15 1 4 4.799127 590.748908 71419.363512 4479.799450 20204.005841
5 0 5 0.845201 650.050862 70158.665028 4063.709107 6532.574802
16 1 5 4.693291 613.738019 69812.078978 4195.175035 18649.037444
6 0 6 0.901042 650.798828 69082.822538 3901.290777 6422.289949
17 1 6 4.567164 607.253731 70985.887712 4050.556868 18146.229257
7 0 7 1.265528 667.043478 70846.610828 3853.235682 6907.015686
18 1 7 5.000000 638.050000 69334.110872 3876.485605 18873.024796
8 0 8 1.615970 671.064639 71418.159414 3708.359463 7449.145197
19 1 8 5.000000 676.250000 68301.588226 3575.121841 17048.861106
9 0 9 1.697674 666.906977 74008.684770 3765.281520 7608.307502
10 0 10 2.545455 710.727273 66387.171349 3042.684203 9006.955532

I used boxplots to visualize the difference between fico scores for the two default values. The quartile values for defaulters are lower than those of repayers, but these boxplots have considerable overlap. FICO score should be considered when predicting default likelihood, but it has a lower correlation than I expected.

In [21]:
fig = px.box(
    loan_df, 
    x = 'fico_score', 
    color = 'default',
    title = 'FICO Score by Default Group',
    template = 'plotly_white',
    category_orders = {'default': [0, 1]}
)

fig.show()

The barplot below visualizes the proportion of defaulters for each credit range group. I notice that a greater proportion of borrowers with Poor credit defaulted on their loan, and that the proportion of defaulters decreases as credit scores improve.

In [23]:
#List storing the five credit range values
credit_range = ['Poor', 'Fair', 'Good', 'Very Good', 'Exceptional']

#Assigns each borrower a credit range value based on their fico score
loan_df['credit_range'] = np.select(
    [
        loan_df['fico_score'].between(300, 579),
        loan_df['fico_score'].between(580, 669),
        loan_df['fico_score'].between(670, 739),
        loan_df['fico_score'].between(740, 799),
        loan_df['fico_score'].between(800, 850)
    ],
    credit_range
)

#Calculates the proportion of borrowers who defaulted for each credit range value
credit_range_proportions = loan_df.groupby('credit_range')['default'].value_counts(normalize = True).reset_index()

#Plots grouped bar charts for each credit range value showing the proportion of borrowers in that group who defaulted
fig = px.bar(credit_range_proportions, x = 'credit_range', y = 'proportion', color = 'default', title = 'Credit Range Proportion by Default Group',
             category_orders = {'credit_range': credit_range}, barmode = 'group', template = 'plotly_white')
fig.show()

#Converts the categorical column to a factor for use in training predictive models
loan_df['credit_range'] = pd.factorize(loan_df['credit_range'])[0]

#Reorders the columns to keep default as the last column in the dataframe
col_reorder = list(loan_df.columns[:-2]) + [loan_df.columns[-1], loan_df.columns[-2]]
loan_df = loan_df[col_reorder]
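One caveat: `pd.factorize` assigns integer codes in order of first appearance in the data, so the resulting codes are not guaranteed to follow the Poor-to-Exceptional ordering. If an ordinal encoding matters to a model, mapping each label to its position in the credit_range list preserves the order; a sketch on a small example column:

```python
import pandas as pd

credit_range = ['Poor', 'Fair', 'Good', 'Very Good', 'Exceptional']

# Example column; the real loan_df['credit_range'] would drop in unchanged
s = pd.Series(['Fair', 'Poor', 'Exceptional', 'Fair'])

# factorize codes by first appearance: Fair -> 0, Poor -> 1, Exceptional -> 2
appearance_codes = pd.factorize(s)[0]

# Mapping by list position keeps the ordinal Poor < Fair < ... ordering
ordinal_codes = s.map({name: i for i, name in enumerate(credit_range)})
print(list(appearance_codes), list(ordinal_codes))
```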

This correlation matrix summarizes the findings from the above analysis. credit_lines_outstanding and total_debt_outstanding have strong positive correlation with default likelihood. years_employed and fico_score are negatively correlated with default, while the loan amount and income both have low correlation with default.

In [25]:
fig = px.imshow(
    loan_df.iloc[:, 1:].corr(),
    template = 'plotly_white'
)

fig.show()
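Since default was cast to a category dtype earlier, note that depending on the pandas version, `.corr()` may silently drop the category column (older versions defaulted to numeric_only=True) or raise an error. Casting it back to int keeps it in the matrix; a minimal sketch:

```python
import pandas as pd

# Minimal example: a category-typed target alongside a numeric feature
df = pd.DataFrame({
    'feature': [1.0, 2.0, 3.0, 4.0],
    'default': pd.Series([0, 0, 1, 1], dtype='category'),
})

# Cast the category column back to int so .corr() includes it
corr = df.assign(default=df['default'].astype(int)).corr()
print(corr.loc['feature', 'default'])
```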

I will use scikit-learn to train the machine learning models. My approach for model selection will be similar to scikit-learn's Classifier comparison example. This is a classification task because the feature columns are used to determine each borrower's default value: 0 for repayers and 1 for defaulters. I will use a for-loop to train multiple models - Decision Tree, Naive Bayes, K-Nearest Neighbors, Neural Network, Random Forest, and Gradient Boosting - to compare their accuracy and training time. To account for potential overfitting, I will use five-fold cross-validation and report accuracy as the mean accuracy across the five folds.

In [27]:
#Features are all columns except the target column (default) and customer_id, which is unique to each borrower
features = loan_df.columns.tolist()
features.remove('default')
features.remove('customer_id')

#Models in order: Decision Tree, Naive Bayes, K-Nearest Neighbors (K=3), Neural Network, Random Forest, Gradient Boosting
models = [
    DecisionTreeClassifier(), 
    GaussianNB(), 
    KNeighborsClassifier(n_neighbors=3),
    MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 5), max_iter = 150, random_state=26),
    RandomForestClassifier(n_estimators=50, max_features=1, random_state=26), 
    GradientBoostingClassifier(random_state=26)
]

#Stores model names, accuracy, and training time for output purposes
model_name = ['Decision Tree', 'Naive Bayes', 'K Nearest Neighbors', 'Neural Network', 'Random Forest', 'Gradient Boosting']
model_accuracy = []
training_time = []

#Records the start time used to measure each model's training time
start = datetime.now()

for model in models:
    #Calculates accuracy of 5-fold cross validation
    avg_cross_val_score = cross_val_score(model, loan_df[features], loan_df['default'], cv = 5).mean()
    model_accuracy.append(round(avg_cross_val_score, 5))
    #Training time is the elapsed time since start; reset the timer for the next model
    training_time.append(datetime.now() - start)
    start = datetime.now()

#Outputs the model data, organized with tabulate
print(tabulate([model_name, model_accuracy, training_time]))
--------------  --------------  -------------------  --------------  --------------  -----------------
Decision Tree   Naive Bayes     K Nearest Neighbors  Neural Network  Random Forest   Gradient Boosting
0.9954          0.9694          0.9791               0.9788          0.9929          0.9962
0:00:00.087078  0:00:00.023020  0:00:00.481423       0:00:06.091359  0:00:01.082952  0:00:05.655976
--------------  --------------  -------------------  --------------  --------------  -----------------
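As an aside, `datetime.now` works for coarse timing, but `time.perf_counter` is the idiomatic monotonic clock for measuring elapsed time. A sketch of the same timing pattern on a small synthetic classification problem (a stand-in for the loan data):

```python
import time
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Small synthetic classification problem as a stand-in for the loan data
X, y = make_classification(n_samples=300, n_features=7, random_state=26)

# perf_counter is a monotonic high-resolution clock, unaffected by
# system clock adjustments, unlike datetime.now
start = time.perf_counter()
score = cross_val_score(DecisionTreeClassifier(random_state=26), X, y, cv=5).mean()
elapsed = time.perf_counter() - start
print(round(score, 3), elapsed)
```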

The Gradient Boosting model reported the highest accuracy. Despite this, I would recommend the Decision Tree model, as it achieved only slightly lower accuracy while requiring substantially less training time. I will use this model to predict the probability of default for each borrower. Using the formula expected_loss = default_probability * loan_amount * (1 - recovery_rate), I will calculate the expected loss on each loan.
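As a quick sanity check of the formula before applying it to the dataset, a worked example with hypothetical numbers: a 30% default probability on a $5,000 loan with a 10% recovery rate.

```python
# Hypothetical inputs (not from the dataset)
default_probability = 0.3
loan_amount = 5000.0
recovery_rate = 0.1

# expected_loss = default_probability * loan_amount * (1 - recovery_rate)
expected_loss = default_probability * loan_amount * (1 - recovery_rate)
print(expected_loss)  # 0.3 * 5000 * 0.9 = 1350.0
```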

In [29]:
#Sets the recovery rate to the given value
recovery_rate = 0.1

#Splits the dataset into training and testing (20% for testing)
x_train, x_test, y_train, y_test = train_test_split(loan_df[features], loan_df['default'], test_size = 0.2, random_state = 26)

#Trains a Decision Tree Classifier using the training data
tree_model = DecisionTreeClassifier()
tree_model.fit(x_train.values, y_train.values)

#Calculates model accuracy to ensure the result matches the cross_val_score
accuracy = tree_model.score(x_test.values, y_test.values)
print('Model Accuracy:', accuracy)

#Calculates the probability of each borrower belonging to either default class (0 for repay, 1 for default)
default_probability = tree_model.predict_proba(loan_df[features].values)

#Retrieves the probability of each borrower defaulting from the predict_proba results
loan_df['default_probability'] = [i[1] for i in default_probability]
#Calculates expected loss as default_probability * loan_amt_outstanding * (1 - recovery_rate)
loan_df['expected_loss'] = loan_df['default_probability'] * loan_df['loan_amt_outstanding'] * (1 - recovery_rate)

#Outputs the result
loan_df[['customer_id', 'loan_amt_outstanding', 'default_probability', 'expected_loss']]
Model Accuracy: 0.995
Out[29]:
customer_id loan_amt_outstanding default_probability expected_loss
0 8153374 5221.545193 0.0 0.000000
1 7442532 1958.928726 1.0 1763.035853
2 2256073 3363.009259 0.0 0.000000
3 4885975 4766.648001 0.0 0.000000
4 4700614 1345.827718 0.0 0.000000
... ... ... ... ...
9995 3972488 3033.647103 0.0 0.000000
9996 6184073 4146.239304 0.0 0.000000
9997 6694516 3088.223727 0.0 0.000000
9998 3942961 3288.901666 0.0 0.000000
9999 5533570 1917.652480 0.0 0.000000

10000 rows × 4 columns

Putting it all together, I will use the probability that each borrower defaults on their loan and a recovery rate of 10% to calculate the expected loss on each borrower's loan. The code for this function is below:

In [36]:
#Function parameters are all model features + recovery_rate
def calculate_expected_loss(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score, credit_range, recovery_rate):
    #Creates numpy array for loan data
    loan_data = np.array([credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score, credit_range])
    #Calculates the default probability using the decision tree model trained above
    #predict_proba requires the data to be reshaped when predicting probability for a single observation/row
    default_probability = tree_model.predict_proba(loan_data.reshape(1, -1))[0][1]
    #Calculates and returns expected loss
    expected_loss = default_probability * loan_amt_outstanding * (1 - recovery_rate)
    return round(expected_loss, 2)

#Tests the function using the second row of the dataset, where expected_loss > 0
borrower_data = list(loan_df.iloc[1, 1:-3])
calculate_expected_loss(*borrower_data, 0.1)
Out[36]:
1763.04