## Importing necessary libraries for this assignment.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#Reading the CSV file for analysis and storing it in a dataframe named bank.
bank = pd.read_csv("Bank_Personal_Loan_Modelling.csv")
#Looking at first 5 observations in dataset.
bank.head()
#Data type of variables in the dataset.
bank.dtypes
#Descriptive statistics of variables
bank.describe().transpose()
#Shape of data.
bank.shape
#Seeing if there are any variables with null values
bank.info()
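#Quick sketch of a direct null check (complements info() above): isnull().sum() counts missing values per column.
print(bank.isnull().sum())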
#Checking the unique values in Age, Experience, Income, Family, CCAvg, Education, Mortgage,
#Personal Loan, Securities Account, CD Account, Online, CreditCard and ZIP Code.
for col in ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage',
            'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard', 'ZIP Code']:
    print(col)
    print(np.unique(bank[col].values))
    print("")
#Number of unique values in each column.
bank.nunique()
#Subset the rows where Mortgage == 0 into a new dataframe and count the zeroes.
print("Number of Zeroes in Mortgage:")
bank_mortgage = bank[bank['Mortgage'] == 0]
print(bank_mortgage['Mortgage'].value_counts().sum())
#Double check
print("")
print("Double check")
print(bank['Mortgage'].value_counts())
#Subset the rows where CreditCard == 0 into a new dataframe and count the zeroes.
print("Number of Zeroes in CreditCard:")
bank_credit_card = bank[bank['CreditCard'] == 0]
print(bank_credit_card['CreditCard'].value_counts().sum())
#Double check
print("")
print("Double Check")
print(bank['CreditCard'].value_counts())
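#Sketch of a more direct way to get the same zero counts: summing a boolean mask avoids
#building the intermediate subset dataframes.
print("Zeroes in Mortgage (boolean mask):", (bank['Mortgage'] == 0).sum())
print("Zeroes in CreditCard (boolean mask):", (bank['CreditCard'] == 0).sum())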
#Categorical variables in the dataset include: Family, Education, Personal Loan, Securities Account,
#CD Account, Online and CreditCard (ZIP Code counts are shown as well).
for col in ['Family', 'Education', 'Personal Loan', 'Securities Account', 'CD Account',
            'Online', 'CreditCard', 'ZIP Code']:
    print(col)
    print(bank[col].value_counts())
    print("")
#Univariate analysis
columns = list(bank)  # List of all column names
bank[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2)); # Histogram of every column
# Pairplot gives us a rough idea of how the variables could be related.
sns.pairplot(bank[['Experience', 'Income', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'CreditCard']]);
# Correlation
# A better way to evaluate the relationship between 2 variables is using correlation.
bank.corr()
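#Sketch: to focus on the outcome, the Personal Loan column of the correlation matrix can be
#sorted on its own (the full matrix and the annotated heatmap below are unchanged).
print(bank.corr()['Personal Loan'].sort_values(ascending=False))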
#Correlation matrix plotted as a heatmap, with each coefficient annotated.
def plot_corr(bank, size=18):
    corr = bank.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    for (i, j), z in np.ndenumerate(corr):
        ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')

plot_corr(bank)
#I will only evaluate the mean for the numerical continuous variables: Age, Experience, Income, CCAvg, and Mortgage (the groupby below returns every column; a version restricted to these columns is sketched after the median).
bank.groupby(["Personal Loan"]).mean()
#I will likewise only evaluate the median for the numerical continuous variables: Age, Experience, Income, CCAvg, and Mortgage.
bank.groupby(["Personal Loan"]).median()
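#Sketch: restricting the groupby to just the continuous variables listed above keeps the output
#focused (the column list here simply mirrors the comment).
continuous_cols = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
print(bank.groupby('Personal Loan')[continuous_cols].mean())
print(bank.groupby('Personal Loan')[continuous_cols].median())
#Bivariate analysis: crosstabs of each categorical variable against Personal Loan, normalized by column.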
pd.crosstab(bank['Family'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['Education'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['Online'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['Securities Account'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['CD Account'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['CreditCard'],bank['Personal Loan'],normalize='columns')
pd.crosstab(bank['ZIP Code'],bank['Personal Loan'],normalize='columns')
# Getting the data model-ready: 1. Replace the negative values in Experience with the mean years of Experience.
# Sidenote: Mortgage has an excessive number of zeroes for a continuous variable. If I were using this dataset as a small sample to predict outcomes for a larger population, I would impute only a fraction of the zeroes in Mortgage so that it matched the share of the population with no mortgage. Since that is not the goal here, I will not impute that variable and will drop it from the model altogether.
#Impute the mean of Experience for each of the negative values (-3, -2, -1), one value at a time.
from sklearn.impute import SimpleImputer
cols = ['Experience']
for neg_value in (-3, -2, -1):
    imputer = SimpleImputer(missing_values=neg_value, strategy="mean")
    bank[cols] = imputer.fit_transform(bank[cols])
#Double check
print("Experience")
print(np.unique(bank[['Experience']].values))
print(bank['Experience'].value_counts())
bank['Experience'].mean()
#The mean changes slightly depending on how the negative values are removed from the dataset, but the resulting means are very close to one another, ranging from roughly 20.12 to 20.33.
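#Sketch, for comparison: the mean of Experience with the negative values simply excluded in one
#step (re-reading the raw file into bank_raw, since the negatives in bank have already been imputed above).
bank_raw = pd.read_csv("Bank_Personal_Loan_Modelling.csv")
print(bank_raw.loc[bank_raw['Experience'] >= 0, 'Experience'].mean())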
from sklearn.model_selection import train_test_split
# I will drop Age, ID, ZIP Code and Mortgage from my list of independent variable predictors.
# Reasons: 1. Age is very highly correlated with Experience in the correlation matrix (Pearson's r ≈ 1), so it adds no independent information (see the quick check below).
# 2. ID is just an identifier: it is different for every case and holds no predictive value for the model.
# 3. ZIP Code should not be in the model because there are too few cases in each ZIP code for the per-code statistics to mean anything; in the bivariate analysis the proportions were all close to 0 and hard to interpret.
# 4. Mortgage is not a reliable variable: it has excessive zeroes and large outliers.
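# Quick check of reason 1 (a sketch; the exact value depends on whether Experience has already
# been imputed above).
print(bank[['Age', 'Experience']].corr())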
X = bank.drop(['Personal Loan', 'Age', 'ID', 'ZIP Code', 'Mortgage'], axis=1) # Separate predictor variables (independent variables) from outcome (dependent variable)
Y = bank['Personal Loan'] # Predicted class (1=True, 0=False)
#Make dummy variables out of the categorical predictor variables. Note: pd.get_dummies only
#encodes object/category columns by default, and every column here is integer-coded, so this
#call leaves X unchanged; the model below is fit on the integer-coded features.
X = pd.get_dummies(X, drop_first=True)
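#Sketch only (not used downstream, so the fitted model and the coefficients reported later are
#unchanged): explicitly dummy-encoding the multi-level categoricals would look like this; the
#choice of columns is an assumption for illustration.
X_dummies_example = pd.get_dummies(X, columns=['Family', 'Education'], drop_first=True)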
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1) #Splitting 70:30
# random_state=1 fixes the random seed so the split is reproducible (any integer would do)
x_train.head()
#Verifying if 70% is training data and 30% is test data
print("{0:0.2f}% data is in training set".format((len(x_train)/len(bank.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test)/len(bank.index)) * 100))
#Seeing the distribution of values after data splitting
print("Original Personal Loan True Values : {0} ({1:0.2f}%)".format(len(bank.loc[bank['Personal Loan'] == 1]), (len(bank.loc[bank['Personal Loan'] == 1])/len(bank.index)) * 100))
print("Original Personal Loan False Values : {0} ({1:0.2f}%)".format(len(bank.loc[bank['Personal Loan'] == 0]), (len(bank.loc[bank['Personal Loan'] == 0])/len(bank.index)) * 100))
print("")
print("Training Personal Loan True Values : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train)) * 100))
print("Training Personal Loan False Values : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train)) * 100))
print("")
print("Test Personal Loan True Values : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test)) * 100))
print("Test Personal Loan False Values : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test)) * 100))
print("")
#Importing LogisticRegression and the metrics module from sklearn
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Fit the model on train
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
#predict on test
y_predict = model.predict(x_test)
#Looking at the model_score
model_score = model.score(x_test, y_test)
print(model_score)
#Finding the actual true positive, true negative, false positive and false negative values
cm=metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
print("Numbers from Confusion Matrix")
print(cm)
df_cm = pd.DataFrame(cm, index=["Actual 1", "Actual 0"],
                     columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='d')  # fmt='d' keeps the counts as plain integers
print("")
#Confusion Matrix
print("Confusion Matrix")
def draw_cm(actual, predicted):
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
print("Trainig accuracy",model.score(x_train,y_train))
print()
print("Testing accuracy",model.score(x_test, y_test))
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
## Feature Importance or Coefficients
fi = pd.DataFrame()
fi['Col'] = x_train.columns
fi['Coeff'] = np.round(abs(model.coef_[0]),2)
fi.sort_values(by='Coeff',ascending=False)
import statsmodels.api as sm
logitmodel=sm.Logit(y_train, x_train)
result=logitmodel.fit()
print(result.summary())
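#Sketch (not applied, to keep the reported summary unchanged): unlike sklearn's LogisticRegression,
#statsmodels does not add an intercept automatically, so a version with a constant term would be:
#result_with_const = sm.Logit(y_train, sm.add_constant(x_train)).fit()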
predict= x_test.copy()
predict['Observed Personal Loan Status'] = y_test
predict['Predicted Personal Loan Status'] = y_predict
# Showing rows where predicted does not equal observed.
pd.set_option("display.max_rows", None, "display.max_columns", None)
predict.loc[predict['Observed Personal Loan Status'] != predict['Predicted Personal Loan Status']]
Logistic regression may not be the most suitable model for this data. Here is the reasoning:

According to the confusion matrix, 77 individuals were true positives: customers the model predicts will accept a personal loan offer and who do take it, based on the data from the last campaign. We also anticipate 13 false positives, customers predicted to accept the offered loan who actually refuse it. There were 1,338 true negatives, customers predicted to decline the loan who do decline it, and 72 false negatives, customers predicted to decline the loan who end up accepting it.

Important features of this logistic regression model include CD Account (3.15), Education (1.18), CreditCard (1.01) and Securities Account (0.94). These are the top predictor variables (with their absolute coefficients) driving the model's predictions. The remaining variables were less influential: Online (0.62), Family (0.46), CCAvg (0.13), Income (0.4) and Experience (0.1).

Important metrics: for the bank to limit its losses, recall and precision are the metrics to rely on. Recall measures, of all the customers who actually accept the loan, how many the model predicts correctly. Here recall is 0.517, so 1 - recall (0.483) reflects the share of acceptors the model misses, i.e. the severity of the false negatives. A large false-negative count is bad for the bank, because these are customers the model said would not take the personal loan but who end up accepting it, so the bank has to fund loans it never anticipated. In short, the bank stands to lose money on these 72 false negatives.

At least with this model there are more true positives (77) than false negatives (72).

Precision is also an important metric: of all the customers the model predicts will accept the personal loan, it measures how many actually accept it. Precision here is 0.8556, so 1 - precision (0.1444) captures where the bank saves money: these are customers predicted to take a loan who end up not taking it, so the bank does not have to take a gamble on them. There are 13 such individuals (false positives), and the bank saves money on them.

According to this model, the bank would still lose some money overall, because there are more false negatives than false positives: it would have to fund roughly 59 additional, unanticipated personal loans (false negatives - false positives = 72 - 13).
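As a worked check of the arithmetic above, here is a minimal sketch that recomputes the metrics from the counts reported in the confusion matrix:

TP, FN, FP, TN = 77, 72, 13, 1338  # counts taken from the confusion matrix above
recall = TP / (TP + FN)            # 77 / 149 ≈ 0.517
precision = TP / (TP + FP)         # 77 / 90 ≈ 0.856
print("Recall:", round(recall, 3), " 1 - Recall:", round(1 - recall, 3))
print("Precision:", round(precision, 4), " 1 - Precision:", round(1 - precision, 4))
print("Net unanticipated loans (FN - FP):", FN - FP)  # 72 - 13 = 59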