import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
df_loans.info()
nRow, nCol = df_loans.shape
print(f'There are {nRow} rows and {nCol} columns')
print(df_loans.dtypes)
df_loans.head(5)
df_loans.dtypes
df_loans.isnull().sum()
df_loans.describe(include=object)  # np.object was removed in newer NumPy; use the builtin object
df_loans.describe()
df_loans.describe(include=[object, np.number])
#graph number1 (loan purpose pie chart)
df_loans.loan_purpose.describe()
loanpurpose = df_loans['loan_purpose'].value_counts()
#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:12]
plt.gca().axis("equal")
pie = plt.pie(loanpurpose, startangle=0, autopct='%1.0f%%', pctdistance=0.9, radius=3, colors=colors)
labels = loanpurpose.index  # value_counts() already returns unique categories
plt.title('Loan Purpose Pie Chart', weight='bold', size=16, y=2, pad=-20)
plt.legend(pie[0],labels, bbox_to_anchor=(1.5,0.5), loc="center right", fontsize=10,
bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.85)
plt.savefig('Images/pie.png', dpi=300,bbox_inches = 'tight')
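# Optional alternative (not part of the original figures): with this many categories the pie
# percentages are hard to compare, so a horizontal bar chart of the same counts may read better.
# Minimal sketch using the loanpurpose counts computed above; the 'Images/' path and file name
# are illustrative, matching the directory used for the other figures.
plt.figure(figsize=(10, 6))
plt.barh(loanpurpose.index[::-1], loanpurpose.values[::-1],
         color=sns.color_palette('pastel', len(loanpurpose)))
plt.xlabel('Number of Loans')
plt.title('Loan Purpose (Bar Chart)')
plt.savefig('Images/loan_purpose_bar.png', dpi=300, bbox_inches='tight')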
#graph number2 (state data)
state_counts = df_loans['state'].value_counts()
plt.figure(figsize=(20,10))
# top 10 states by number of loans
ax = sns.countplot(x='state', data=df_loans, palette="Set2", order=state_counts.index[0:10])
ax.set(xlabel='State', ylabel='Count')
plt.title("Bar Graph of Top 10 States by Loan Count")
plt.savefig('Images/bar.png', dpi=300,bbox_inches = 'tight')
#graph number3 (interest rate distribution by application type)
# sns.displot is figure-level, so size it via height/aspect rather than plt.figure
g = sns.displot(data=df_loans, x="interest_rate", hue="application_type", kind="kde", fill=True, height=8, aspect=1.8)
g.set(xlabel='Interest Rate', ylabel='Density')
g.fig.suptitle("Interest Rate by Application Type", y=1.02)
g.savefig('Images/displot.png', dpi=300, bbox_inches='tight')
#graph number4 (interest rate by homeownership)
plt.figure(figsize = (30,16))
ax = sns.boxplot(x="homeownership", y="interest_rate", data=df_loans)
ax.set(xlabel='Homeownership', ylabel='Interest Rate')
plt.title("Interest Rate based on Homeownership", fontsize=14)
plt.savefig('Images/box.png', dpi=300,bbox_inches = 'tight')
#graph number5 (interest rate vs annual income, colored by grade)
plt.figure(figsize = (30,16))
p = sns.scatterplot(data=df_loans, x="annual_income", y="interest_rate", hue="grade")
p.set(xlabel='Annual Income', ylabel='Interest Rate')
plt.title("Interest Rate based on Annual Income and Corresponding Grade", fontsize=14)
plt.savefig('Images/scatter.png', dpi=300,bbox_inches = 'tight')
#graph number6 (compare loans with a recent delinquency to those without)
# True = no delinquencies in the last 2 years
df_loans['no_delinq_2y'] = df_loans['delinq_2y'] == 0
# lmplot is figure-level, so size it via height rather than plt.figure
p = sns.lmplot(x="debt_to_income_joint", y="interest_rate", col="issue_month", hue="no_delinq_2y",
               data=df_loans, markers=["o", "x"], palette="Set1", height=8)
p.set(xlabel='Debt to Income (Joint)', ylabel='Interest Rate')
fig = p.fig
fig.suptitle("Interest Rate based on Debt to Income Ratio", fontsize=12, y=1.08)
plt.savefig('Images/lmplot.png', dpi=300,bbox_inches = 'tight')
#Create a feature set and create a model which predicts interest rate using at least 2 algorithms.
#Describe any data cleansing that must be performed and analysis when examining the data.
#Also describe assumptions you made and your approach.
#Visualize the test results and propose enhancements to the model, what would you do if you had more time.
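# Approach (summary of the modeling cells below): scale the numeric columns and rank them with a
# univariate F-test, fit gradient-boosting regressors on a few hand-picked feature sets, then try
# XGBoost and CatBoost on broader feature sets for comparison.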
from sklearn.model_selection import train_test_split
#cleaning and feature set
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
my_list = df_loans.columns.values.tolist()
# my_list.remove('months_since_90d_late')
# my_list.remove('months_since_last_delinq')
# my_list.remove('debt_to_income_joint')
# my_list.remove('verification_income_joint')
# my_list.remove('annual_income_joint')
# my_list.remove('emp_title')
# my_list.remove('emp_length')
# my_list.remove('months_since_last_credit_inquiry')
# my_list.remove('num_accounts_120d_past_due')
# my_list.remove('state')
# drop non-numeric (object) columns from the feature list
objects = df_loans.select_dtypes(include=['object']).columns
for i in objects:
    my_list.remove(i)
# drop rows with any missing values in the selected (numeric) columns
df_loans2 = df_loans[my_list].dropna(how='any', axis=0)
df_loans2 = df_loans2.fillna(df_loans2.mean())  # safeguard; dropna above already removed NaNs
y = df_loans2['interest_rate']
my_list.remove('interest_rate')
df2 = df_loans2[my_list]
X = df2.values  # feature matrix as a numpy array
# scale features to [0, 1] before univariate feature selection
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
x = pd.DataFrame(x_scaled, columns=df2.columns)
# configure to select all features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression  # interest_rate is continuous, so use the regression F-test
from matplotlib import pyplot
# feature selection
def select_features(X_train, y_train, X_test):
    # configure to select all features
    fs = SelectKBest(score_func=f_regression, k='all')
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
    X_train_fs = fs.transform(X_train)
    # transform test input data
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
# for i in range(len(fs.scores_)):
# print('Feature %d: %f' % (i, fs.scores_[i]))
# # plot the scores
# pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
# pyplot.show()
#create dataframe
#create dataframe of feature scores
scores = pd.DataFrame(list(zip(fs.scores_, df2.columns)),
                      columns=['FS Score', 'FeatureName'])
scores = scores.sort_values(by=['FS Score'], ascending=False)
scores = scores.set_index('FeatureName')
plt.figure(figsize=(30,10))
# top 10 features by univariate F-score
ax = sns.barplot(x=scores.index, y="FS Score", palette="Set2", data=scores, order=scores.index[0:10])
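# Label and save the feature-score chart in the same style as the earlier figures
# (assumes the same Images/ output directory; the file name is illustrative).
ax.set(xlabel='Feature', ylabel='F-score')
plt.xticks(rotation=45, ha='right')
plt.savefig('Images/feature_scores.png', dpi=300, bbox_inches='tight')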
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
y = df_loans['interest_rate']
x = df_loans[['sub_grade','grade','homeownership','loan_purpose']]
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12,
test_size=0.15)
# tuned parameters (kept for reference; the default model below is the one actually fit)
# gbr = GradientBoostingRegressor(n_estimators=600,
#                                 max_depth=5,
#                                 learning_rate=0.01,
#                                 min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()
gbr.fit(xtrain, ytrain)
ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)
x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
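# MSE alone is scale-dependent; as an optional check, R^2 and MAE for the same predictions give a
# more interpretable picture (uses sklearn.metrics, already a dependency above).
from sklearn.metrics import r2_score, mean_absolute_error
print("R^2: %.3f" % r2_score(ytest, ypred))
print("MAE: %.3f" % mean_absolute_error(ytest, ypred))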
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
y = df_loans['interest_rate']
# keep only the categorical (object) columns and one-hot encode them
# note: high-cardinality columns such as emp_title produce a very wide dummy matrix
df = df_loans.drop(df_loans.select_dtypes(include=['int64','float64']).columns, axis=1)
x = pd.get_dummies(df, columns=df.select_dtypes(include=['object']).columns)
# print(x)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12,
test_size=0.15)
# tuned parameters (kept for reference; the default model below is the one actually fit)
# gbr = GradientBoostingRegressor(n_estimators=600,
#                                 max_depth=5,
#                                 learning_rate=0.01,
#                                 min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()
gbr.fit(xtrain, ytrain)
ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)
x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
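# Optional: impurity-based feature importances from the fitted model show which dummy-encoded
# categorical levels drive the predictions (attribute available on any fitted GradientBoostingRegressor).
importances = pd.Series(gbr.feature_importances_, index=x.columns).sort_values(ascending=False)
print(importances.head(10))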
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
y = df_loans['interest_rate']
# note: paid_interest is only observed after the loan is issued, so it likely leaks target information
x = df_loans[['paid_interest','public_record_bankrupt','sub_grade','grade']]
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12,
test_size=0.15)
# tuned parameters (kept for reference; the default model below is the one actually fit)
# gbr = GradientBoostingRegressor(n_estimators=600,
#                                 max_depth=5,
#                                 learning_rate=0.01,
#                                 min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()
gbr.fit(xtrain, ytrain)
ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)
x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
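# A single train/test split can be noisy; as a sanity check, 5-fold cross-validation of the same
# default model gives a more stable RMSE estimate (scoring name requires scikit-learn >= 0.22).
from sklearn.model_selection import cross_val_score
cv_rmse = -cross_val_score(GradientBoostingRegressor(), x, y, cv=5,
                           scoring='neg_root_mean_squared_error')
print("CV RMSE: %.2f (+/- %.2f)" % (cv_rmse.mean(), cv_rmse.std()))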
# print(scores.index[0:10])
# paid_interest
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
y = df_loans['interest_rate']
# impute missing numeric values with the column mean (numeric_only avoids errors on object columns)
df_loans = df_loans.fillna(df_loans.mean(numeric_only=True))
x = df_loans.drop('interest_rate',axis=1)
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
data_dmatrix = xgb.DMatrix(data=x,label=y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=123)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
from catboost import CatBoostRegressor
my_list = df_loans.columns.values.tolist()
# my_list.remove('months_since_90d_late')
# my_list.remove('months_since_last_delinq')
# my_list.remove('debt_to_income_joint')
my_list.remove('verification_income_joint')
my_list.remove('annual_income_joint')
my_list.remove('emp_title')
# my_list.remove('emp_length')
# my_list.remove('months_since_last_credit_inquiry')
# my_list.remove('num_accounts_120d_past_due')
my_list.remove('state')
# my_list.remove('homeownership')
my_list.remove('verified_income')
# my_list.remove('loan_purpose')
# my_list.remove('application_type')
# my_list.remove('issue_month')
# my_list.remove('loan_status')
# my_list.remove('initial_listing_status')
# my_list.remove('disbursement_method')
# drop rows with any missing values in the selected columns
df_loans2 = df_loans[my_list].dropna(how='any', axis=0)
df_loans2 = df_loans2.fillna(df_loans2.mean(numeric_only=True))  # safeguard; dropna above already removed NaNs
# target must come from the same row-filtered frame as the features
y = df_loans2['interest_rate']
my_list.remove('interest_rate')
X = df_loans2[my_list]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)
CAT_FEATURES = ['grade', 'sub_grade','homeownership','loan_purpose','application_type','issue_month','loan_status','initial_listing_status','disbursement_method'] #list of your categorical features
# set up the model
catboost_model = CatBoostRegressor(n_estimators=50,
loss_function = 'RMSE',
eval_metric = 'RMSE',
cat_features = CAT_FEATURES)
# fit model
catboost_model.fit(X_train, y_train,
eval_set = (X_test, y_test),
use_best_model = True,
plot = True)
preds = catboost_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
x_ax = range(len(y_test))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, y_test, s=5, color="blue", label="original")
plt.plot(x_ax, preds, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
print("RMSE: %f" % (rmse))