In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
In [4]:
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None)
df_loans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   emp_title                         9167 non-null   object 
 1   emp_length                        9183 non-null   float64
 2   state                             10000 non-null  object 
 3   homeownership                     10000 non-null  object 
 4   annual_income                     10000 non-null  float64
 5   verified_income                   10000 non-null  object 
 6   debt_to_income                    9976 non-null   float64
 7   annual_income_joint               1495 non-null   float64
 8   verification_income_joint         1455 non-null   object 
 9   debt_to_income_joint              1495 non-null   float64
 10  delinq_2y                         10000 non-null  int64  
 11  months_since_last_delinq          4342 non-null   float64
 12  earliest_credit_line              10000 non-null  int64  
 13  inquiries_last_12m                10000 non-null  int64  
 14  total_credit_lines                10000 non-null  int64  
 15  open_credit_lines                 10000 non-null  int64  
 16  total_credit_limit                10000 non-null  int64  
 17  total_credit_utilized             10000 non-null  int64  
 18  num_collections_last_12m          10000 non-null  int64  
 19  num_historical_failed_to_pay      10000 non-null  int64  
 20  months_since_90d_late             2285 non-null   float64
 21  current_accounts_delinq           10000 non-null  int64  
 22  total_collection_amount_ever      10000 non-null  int64  
 23  current_installment_accounts      10000 non-null  int64  
 24  accounts_opened_24m               10000 non-null  int64  
 25  months_since_last_credit_inquiry  8729 non-null   float64
 26  num_satisfactory_accounts         10000 non-null  int64  
 27  num_accounts_120d_past_due        9682 non-null   float64
 28  num_accounts_30d_past_due         10000 non-null  int64  
 29  num_active_debit_accounts         10000 non-null  int64  
 30  total_debit_limit                 10000 non-null  int64  
 31  num_total_cc_accounts             10000 non-null  int64  
 32  num_open_cc_accounts              10000 non-null  int64  
 33  num_cc_carrying_balance           10000 non-null  int64  
 34  num_mort_accounts                 10000 non-null  int64  
 35  account_never_delinq_percent      10000 non-null  float64
 36  tax_liens                         10000 non-null  int64  
 37  public_record_bankrupt            10000 non-null  int64  
 38  loan_purpose                      10000 non-null  object 
 39  application_type                  10000 non-null  object 
 40  loan_amount                       10000 non-null  int64  
 41  term                              10000 non-null  int64  
 42  interest_rate                     10000 non-null  float64
 43  installment                       10000 non-null  float64
 44  grade                             10000 non-null  object 
 45  sub_grade                         10000 non-null  object 
 46  issue_month                       10000 non-null  object 
 47  loan_status                       10000 non-null  object 
 48  initial_listing_status            10000 non-null  object 
 49  disbursement_method               10000 non-null  object 
 50  balance                           10000 non-null  float64
 51  paid_total                        10000 non-null  float64
 52  paid_principal                    10000 non-null  float64
 53  paid_interest                     10000 non-null  float64
 54  paid_late_fees                    10000 non-null  float64
dtypes: float64(17), int64(25), object(13)
memory usage: 4.2+ MB
In [8]:
nRow, nCol = df_loans.shape
print(f'There are {nRow} rows and {nCol} columns')
print(df_loans.dtypes)
There are 10000 rows and 55 columns
emp_title                            object
emp_length                          float64
state                                object
homeownership                        object
annual_income                       float64
verified_income                      object
debt_to_income                      float64
annual_income_joint                 float64
verification_income_joint            object
debt_to_income_joint                float64
delinq_2y                             int64
months_since_last_delinq            float64
earliest_credit_line                  int64
inquiries_last_12m                    int64
total_credit_lines                    int64
open_credit_lines                     int64
total_credit_limit                    int64
total_credit_utilized                 int64
num_collections_last_12m              int64
num_historical_failed_to_pay          int64
months_since_90d_late               float64
current_accounts_delinq               int64
total_collection_amount_ever          int64
current_installment_accounts          int64
accounts_opened_24m                   int64
months_since_last_credit_inquiry    float64
num_satisfactory_accounts             int64
num_accounts_120d_past_due          float64
num_accounts_30d_past_due             int64
num_active_debit_accounts             int64
total_debit_limit                     int64
num_total_cc_accounts                 int64
num_open_cc_accounts                  int64
num_cc_carrying_balance               int64
num_mort_accounts                     int64
account_never_delinq_percent        float64
tax_liens                             int64
public_record_bankrupt                int64
loan_purpose                         object
application_type                     object
loan_amount                           int64
term                                  int64
interest_rate                       float64
installment                         float64
grade                                object
sub_grade                            object
issue_month                          object
loan_status                          object
initial_listing_status               object
disbursement_method                  object
balance                             float64
paid_total                          float64
paid_principal                      float64
paid_interest                       float64
paid_late_fees                      float64
dtype: object
In [7]:
df_loans.head(5)
Out[7]:
emp_title emp_length state homeownership annual_income verified_income debt_to_income annual_income_joint verification_income_joint debt_to_income_joint delinq_2y months_since_last_delinq earliest_credit_line inquiries_last_12m total_credit_lines open_credit_lines total_credit_limit total_credit_utilized num_collections_last_12m num_historical_failed_to_pay months_since_90d_late current_accounts_delinq total_collection_amount_ever current_installment_accounts accounts_opened_24m months_since_last_credit_inquiry num_satisfactory_accounts num_accounts_120d_past_due num_accounts_30d_past_due num_active_debit_accounts total_debit_limit num_total_cc_accounts num_open_cc_accounts num_cc_carrying_balance num_mort_accounts account_never_delinq_percent tax_liens public_record_bankrupt loan_purpose application_type loan_amount term interest_rate installment grade sub_grade issue_month loan_status initial_listing_status disbursement_method balance paid_total paid_principal paid_interest paid_late_fees
0 global config engineer 3.0 NJ MORTGAGE 90000.0 Verified 18.01 NaN NaN NaN 0 38.0 2001 6 28 10 70795 38767 0 0 38.0 0 1250 2 5 5.0 10 0.0 0 2 11100 14 8 6 1 92.9 0 0 moving individual 28000 60 14.07 652.53 C C3 Mar-2018 Current whole Cash 27015.86 1999.33 984.14 1015.19 0.0
1 warehouse office clerk 10.0 HI RENT 40000.0 Not Verified 5.04 NaN NaN NaN 0 NaN 1996 1 30 14 28800 4321 0 1 NaN 0 0 0 11 8.0 14 0.0 0 3 16500 24 14 4 0 100.0 0 1 debt_consolidation individual 5000 36 12.61 167.54 C C1 Feb-2018 Current whole Cash 4651.37 499.12 348.63 150.49 0.0
2 assembly 3.0 WI RENT 40000.0 Source Verified 21.15 NaN NaN NaN 0 28.0 2006 4 31 10 24193 16000 0 0 28.0 0 432 1 13 7.0 10 0.0 0 3 4300 14 8 6 0 93.5 0 0 other individual 2000 36 17.09 71.40 D D1 Feb-2018 Current fractional Cash 1824.63 281.80 175.37 106.43 0.0
3 customer service 1.0 PA RENT 30000.0 Not Verified 10.16 NaN NaN NaN 0 NaN 2007 0 4 4 25400 4997 0 1 NaN 0 0 1 1 15.0 4 0.0 0 2 19400 3 3 2 0 100.0 1 0 debt_consolidation individual 21600 36 6.72 664.19 A A3 Jan-2018 Current whole Cash 18853.26 3312.89 2746.74 566.15 0.0
4 security supervisor 10.0 CA RENT 35000.0 Verified 57.96 57000.0 Verified 37.66 0 NaN 2008 7 22 16 69839 52722 0 0 NaN 0 0 1 6 4.0 16 0.0 0 10 32700 20 15 13 0 100.0 0 0 credit_card joint 23000 36 14.07 786.87 C C3 Mar-2018 Current whole Cash 21430.15 2324.65 1569.85 754.80 0.0
In [9]:
df_loans.dtypes
Out[9]:
emp_title                            object
emp_length                          float64
state                                object
homeownership                        object
annual_income                       float64
verified_income                      object
debt_to_income                      float64
annual_income_joint                 float64
verification_income_joint            object
debt_to_income_joint                float64
delinq_2y                             int64
months_since_last_delinq            float64
earliest_credit_line                  int64
inquiries_last_12m                    int64
total_credit_lines                    int64
open_credit_lines                     int64
total_credit_limit                    int64
total_credit_utilized                 int64
num_collections_last_12m              int64
num_historical_failed_to_pay          int64
months_since_90d_late               float64
current_accounts_delinq               int64
total_collection_amount_ever          int64
current_installment_accounts          int64
accounts_opened_24m                   int64
months_since_last_credit_inquiry    float64
num_satisfactory_accounts             int64
num_accounts_120d_past_due          float64
num_accounts_30d_past_due             int64
num_active_debit_accounts             int64
total_debit_limit                     int64
num_total_cc_accounts                 int64
num_open_cc_accounts                  int64
num_cc_carrying_balance               int64
num_mort_accounts                     int64
account_never_delinq_percent        float64
tax_liens                             int64
public_record_bankrupt                int64
loan_purpose                         object
application_type                     object
loan_amount                           int64
term                                  int64
interest_rate                       float64
installment                         float64
grade                                object
sub_grade                            object
issue_month                          object
loan_status                          object
initial_listing_status               object
disbursement_method                  object
balance                             float64
paid_total                          float64
paid_principal                      float64
paid_interest                       float64
paid_late_fees                      float64
dtype: object
In [10]:
df_loans.isnull().sum()
Out[10]:
emp_title                            833
emp_length                           817
state                                  0
homeownership                          0
annual_income                          0
verified_income                        0
debt_to_income                        24
annual_income_joint                 8505
verification_income_joint           8545
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installment_accounts           0
accounts_opened_24m                    0
months_since_last_credit_inquiry    1271
num_satisfactory_accounts              0
num_accounts_120d_past_due           318
num_accounts_30d_past_due              0
num_active_debit_accounts              0
total_debit_limit                      0
num_total_cc_accounts                  0
num_open_cc_accounts                   0
num_cc_carrying_balance                0
num_mort_accounts                      0
account_never_delinq_percent           0
tax_liens                              0
public_record_bankrupt                 0
loan_purpose                           0
application_type                       0
loan_amount                            0
term                                   0
interest_rate                          0
installment                            0
grade                                  0
sub_grade                              0
issue_month                            0
loan_status                            0
initial_listing_status                 0
disbursement_method                    0
balance                                0
paid_total                             0
paid_principal                         0
paid_interest                          0
paid_late_fees                         0
dtype: int64
In [13]:
df_loans.describe(include=np.object)
Out[13]:
emp_title state homeownership verified_income verification_income_joint loan_purpose application_type grade sub_grade issue_month loan_status initial_listing_status disbursement_method
count 9167 10000 10000 10000 1455 10000 10000 10000 10000 10000 10000 10000 10000
unique 4741 50 3 3 3 12 2 7 32 3 6 2 2
top manager CA MORTGAGE Source Verified Not Verified debt_consolidation individual B B1 Mar-2018 Current whole Cash
freq 218 1330 4789 4116 611 5144 8505 3037 647 3617 9375 8206 9284
In [17]:
df_loans.describe()
Out[17]:
emp_length annual_income debt_to_income annual_income_joint debt_to_income_joint delinq_2y months_since_last_delinq earliest_credit_line inquiries_last_12m total_credit_lines open_credit_lines total_credit_limit total_credit_utilized num_collections_last_12m num_historical_failed_to_pay months_since_90d_late current_accounts_delinq total_collection_amount_ever current_installment_accounts accounts_opened_24m months_since_last_credit_inquiry num_satisfactory_accounts num_accounts_120d_past_due num_accounts_30d_past_due num_active_debit_accounts total_debit_limit num_total_cc_accounts num_open_cc_accounts num_cc_carrying_balance num_mort_accounts account_never_delinq_percent tax_liens public_record_bankrupt loan_amount term interest_rate installment balance paid_total paid_principal paid_interest paid_late_fees
count 9183.000000 1.000000e+04 9976.000000 1.495000e+03 1495.000000 10000.00000 4342.000000 10000.00000 10000.00000 10000.000000 10000.00000 1.000000e+04 10000.000000 10000.000000 10000.000000 2285.000000 10000.0000 10000.000000 10000.000000 10000.000000 8729.000000 10000.000000 9682.0 10000.0000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000
mean 5.930306 7.922215e+04 19.308192 1.279146e+05 19.979304 0.21600 36.760709 2001.29000 1.95820 22.679600 11.40440 1.836062e+05 51049.063100 0.013800 0.167100 46.112473 0.0001 184.334000 2.663600 4.376100 7.340703 11.379600 0.0 0.0001 3.595300 27357.410300 13.030000 8.095000 5.230500 1.383100 94.648990 0.043300 0.123800 16361.922500 43.272000 12.427524 476.205323 14458.916610 2494.234773 1894.448466 599.666781 0.119516
std 3.703734 6.473429e+04 15.004851 7.016838e+04 8.054781 0.68366 21.634939 7.79551 2.38013 11.885439 5.86828 1.876327e+05 53636.731172 0.126535 0.687768 21.748021 0.0100 2206.389859 2.935387 3.158583 5.981590 5.859705 0.0 0.0100 2.434689 26570.164081 7.874713 4.906606 3.327623 1.717274 9.155059 0.593852 0.337172 10301.956759 11.029877 5.001105 294.851627 9964.561865 3958.230365 3884.407175 517.328062 1.813468
min 0.000000 0.000000e+00 0.000000 1.920000e+04 0.320000 0.00000 1.000000 1963.00000 0.00000 2.000000 0.00000 0.000000e+00 0.000000 0.000000 0.000000 2.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0000 0.000000 0.000000 2.000000 0.000000 0.000000 0.000000 14.300000 0.000000 0.000000 1000.000000 36.000000 5.310000 30.750000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 4.500000e+04 11.057500 8.683350e+04 14.160000 0.00000 19.000000 1997.00000 0.00000 14.000000 7.00000 5.159375e+04 19185.500000 0.000000 0.000000 29.000000 0.0000 0.000000 1.000000 2.000000 2.000000 7.000000 0.0 0.0000 2.000000 10000.000000 7.000000 5.000000 3.000000 0.000000 92.600000 0.000000 0.000000 8000.000000 36.000000 9.430000 256.040000 6679.065000 928.700000 587.100000 221.757500 0.000000
50% 6.000000 6.500000e+04 17.570000 1.130000e+05 19.720000 0.00000 34.000000 2003.00000 1.00000 21.000000 10.00000 1.146670e+05 36927.000000 0.000000 0.000000 47.000000 0.0000 0.000000 2.000000 4.000000 6.000000 10.000000 0.0 0.0000 3.000000 19500.000000 11.000000 7.000000 5.000000 1.000000 100.000000 0.000000 0.000000 14500.000000 36.000000 11.980000 398.420000 12379.495000 1563.300000 984.990000 446.140000 0.000000
75% 10.000000 9.500000e+04 25.002500 1.515455e+05 25.500000 0.00000 53.000000 2006.00000 3.00000 29.000000 14.00000 2.675500e+05 65421.000000 0.000000 0.000000 63.000000 0.0000 0.000000 3.000000 6.000000 11.000000 14.000000 0.0 0.0000 5.000000 36100.000000 17.000000 10.000000 7.000000 2.000000 100.000000 0.000000 0.000000 24000.000000 60.000000 15.050000 644.690000 20690.182500 2616.005000 1694.555000 825.420000 0.000000
max 10.000000 2.300000e+06 469.090000 1.100000e+06 39.980000 13.00000 118.000000 2015.00000 29.00000 87.000000 51.00000 3.386034e+06 942456.000000 3.000000 52.000000 128.000000 1.0000 199308.000000 35.000000 29.000000 24.000000 51.000000 0.0 1.0000 32.000000 386700.000000 66.000000 46.000000 43.000000 14.000000 100.000000 52.000000 3.000000 40000.000000 60.000000 30.940000 1566.590000 40000.000000 41630.443684 40000.000000 4216.440000 52.980000
In [16]:
df_loans.describe(include=[np.object,np.number])
Out[16]:
emp_title emp_length state homeownership annual_income verified_income debt_to_income annual_income_joint verification_income_joint debt_to_income_joint delinq_2y months_since_last_delinq earliest_credit_line inquiries_last_12m total_credit_lines open_credit_lines total_credit_limit total_credit_utilized num_collections_last_12m num_historical_failed_to_pay months_since_90d_late current_accounts_delinq total_collection_amount_ever current_installment_accounts accounts_opened_24m months_since_last_credit_inquiry num_satisfactory_accounts num_accounts_120d_past_due num_accounts_30d_past_due num_active_debit_accounts total_debit_limit num_total_cc_accounts num_open_cc_accounts num_cc_carrying_balance num_mort_accounts account_never_delinq_percent tax_liens public_record_bankrupt loan_purpose application_type loan_amount term interest_rate installment grade sub_grade issue_month loan_status initial_listing_status disbursement_method balance paid_total paid_principal paid_interest paid_late_fees
count 9167 9183.000000 10000 10000 1.000000e+04 10000 9976.000000 1.495000e+03 1455 1495.000000 10000.00000 4342.000000 10000.00000 10000.00000 10000.000000 10000.00000 1.000000e+04 10000.000000 10000.000000 10000.000000 2285.000000 10000.0000 10000.000000 10000.000000 10000.000000 8729.000000 10000.000000 9682.0 10000.0000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000 10000 10000.000000 10000.000000 10000.000000 10000.000000 10000 10000 10000 10000 10000 10000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000
unique 4741 NaN 50 3 NaN 3 NaN NaN 3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 12 2 NaN NaN NaN NaN 7 32 3 6 2 2 NaN NaN NaN NaN NaN
top manager NaN CA MORTGAGE NaN Source Verified NaN NaN Not Verified NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN debt_consolidation individual NaN NaN NaN NaN B B1 Mar-2018 Current whole Cash NaN NaN NaN NaN NaN
freq 218 NaN 1330 4789 NaN 4116 NaN NaN 611 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5144 8505 NaN NaN NaN NaN 3037 647 3617 9375 8206 9284 NaN NaN NaN NaN NaN
mean NaN 5.930306 NaN NaN 7.922215e+04 NaN 19.308192 1.279146e+05 NaN 19.979304 0.21600 36.760709 2001.29000 1.95820 22.679600 11.40440 1.836062e+05 51049.063100 0.013800 0.167100 46.112473 0.0001 184.334000 2.663600 4.376100 7.340703 11.379600 0.0 0.0001 3.595300 27357.410300 13.030000 8.095000 5.230500 1.383100 94.648990 0.043300 0.123800 NaN NaN 16361.922500 43.272000 12.427524 476.205323 NaN NaN NaN NaN NaN NaN 14458.916610 2494.234773 1894.448466 599.666781 0.119516
std NaN 3.703734 NaN NaN 6.473429e+04 NaN 15.004851 7.016838e+04 NaN 8.054781 0.68366 21.634939 7.79551 2.38013 11.885439 5.86828 1.876327e+05 53636.731172 0.126535 0.687768 21.748021 0.0100 2206.389859 2.935387 3.158583 5.981590 5.859705 0.0 0.0100 2.434689 26570.164081 7.874713 4.906606 3.327623 1.717274 9.155059 0.593852 0.337172 NaN NaN 10301.956759 11.029877 5.001105 294.851627 NaN NaN NaN NaN NaN NaN 9964.561865 3958.230365 3884.407175 517.328062 1.813468
min NaN 0.000000 NaN NaN 0.000000e+00 NaN 0.000000 1.920000e+04 NaN 0.320000 0.00000 1.000000 1963.00000 0.00000 2.000000 0.00000 0.000000e+00 0.000000 0.000000 0.000000 2.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0000 0.000000 0.000000 2.000000 0.000000 0.000000 0.000000 14.300000 0.000000 0.000000 NaN NaN 1000.000000 36.000000 5.310000 30.750000 NaN NaN NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000 0.000000
25% NaN 2.000000 NaN NaN 4.500000e+04 NaN 11.057500 8.683350e+04 NaN 14.160000 0.00000 19.000000 1997.00000 0.00000 14.000000 7.00000 5.159375e+04 19185.500000 0.000000 0.000000 29.000000 0.0000 0.000000 1.000000 2.000000 2.000000 7.000000 0.0 0.0000 2.000000 10000.000000 7.000000 5.000000 3.000000 0.000000 92.600000 0.000000 0.000000 NaN NaN 8000.000000 36.000000 9.430000 256.040000 NaN NaN NaN NaN NaN NaN 6679.065000 928.700000 587.100000 221.757500 0.000000
50% NaN 6.000000 NaN NaN 6.500000e+04 NaN 17.570000 1.130000e+05 NaN 19.720000 0.00000 34.000000 2003.00000 1.00000 21.000000 10.00000 1.146670e+05 36927.000000 0.000000 0.000000 47.000000 0.0000 0.000000 2.000000 4.000000 6.000000 10.000000 0.0 0.0000 3.000000 19500.000000 11.000000 7.000000 5.000000 1.000000 100.000000 0.000000 0.000000 NaN NaN 14500.000000 36.000000 11.980000 398.420000 NaN NaN NaN NaN NaN NaN 12379.495000 1563.300000 984.990000 446.140000 0.000000
75% NaN 10.000000 NaN NaN 9.500000e+04 NaN 25.002500 1.515455e+05 NaN 25.500000 0.00000 53.000000 2006.00000 3.00000 29.000000 14.00000 2.675500e+05 65421.000000 0.000000 0.000000 63.000000 0.0000 0.000000 3.000000 6.000000 11.000000 14.000000 0.0 0.0000 5.000000 36100.000000 17.000000 10.000000 7.000000 2.000000 100.000000 0.000000 0.000000 NaN NaN 24000.000000 60.000000 15.050000 644.690000 NaN NaN NaN NaN NaN NaN 20690.182500 2616.005000 1694.555000 825.420000 0.000000
max NaN 10.000000 NaN NaN 2.300000e+06 NaN 469.090000 1.100000e+06 NaN 39.980000 13.00000 118.000000 2015.00000 29.00000 87.000000 51.00000 3.386034e+06 942456.000000 3.000000 52.000000 128.000000 1.0000 199308.000000 35.000000 29.000000 24.000000 51.000000 0.0 1.0000 32.000000 386700.000000 66.000000 46.000000 43.000000 14.000000 100.000000 52.000000 3.000000 NaN NaN 40000.000000 60.000000 30.940000 1566.590000 NaN NaN NaN NaN NaN NaN 40000.000000 41630.443684 40000.000000 4216.440000 52.980000
In [8]:
#graph number1 (loan purpose bar graph)
# import seaborn as sns
# count = df_loans['Sector'].value_counts()
# plt.figure(figsize=(15,10))
# ax = sns.countplot(x='Sector', data=df, palette="Set2", order=count.index[0:10])
# ax.set(xlabel='Sectors', ylabel='Number of Companies')
# plt.title("Bar Graph of Sectors")
df_loans.loan_purpose.describe()
loanpurpose = df_loans['loan_purpose'].value_counts()
x = loanpurpose.index
y = loanpurpose.values
#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:12]
plt.gca().axis("equal")
pie = plt.pie(loanpurpose, startangle=0, autopct='%1.0f%%', pctdistance=0.9, radius=3, colors=colors)
labels=loanpurpose.index.unique()
plt.title('Loan Purpose Pie Chart', weight='bold', size=16, y=2, pad=-20)
plt.legend(pie[0],labels, bbox_to_anchor=(1.5,0.5), loc="center right", fontsize=10, 
           bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.85)
plt.savefig('Images/pie.png', dpi=300,bbox_inches = 'tight')
In [111]:
#graph number2 (state data)
df = df_loans['state'].value_counts()
# df2 = pd.DataFrame(np.array([df.index,df.values]), columns=['State','Count'])
# df = df.reset_index()
# df.rename( columns={0:'Count','state':'State'}, inplace=True )
df2 = df.index[0:50]
plt.figure(figsize=(20,10))
ax = sns.countplot(x='state', data=df_loans, palette="Set2", order=df.index[0:10])
ax.set(xlabel='State', ylabel='Count')
plt.title("Bar Graph of States")
plt.savefig('Images/bar.png', dpi=300,bbox_inches = 'tight')
In [112]:
#graph number3 
# sns.scatterplot(data=df_loans, x="debt_to_income", y="interest_rate")
plt.figure(figsize = (30,16))
ax = sns.displot(data=df_loans, x="interest_rate", hue="application_type", kind="kde", fill=True)
ax.set(xlabel='Interest Rate', ylabel='Density')
plt.title("Interest Rate based on Application type")
plt.savefig('Images/displot.png', dpi=300,bbox_inches = 'tight')
<Figure size 2160x1152 with 0 Axes>
In [127]:
#graph number3 
# sns.scatterplot(data=df_loans, x="debt_to_income", y="interest_rate")
plt.figure(figsize = (30,16))
ax = sns.boxplot(x="homeownership", y="interest_rate", data=df_loans)
ax.set(xlabel='Homeownership', ylabel='Interest Rate')
plt.title("Interest Rate based on Homeownership", fontsize=14)
plt.savefig('Images/box.png', dpi=300,bbox_inches = 'tight')
In [126]:
#graph number4 (emp_length to annual_income)
plt.figure(figsize = (30,16))
p = sns.scatterplot(data=df_loans, x="annual_income", y="interest_rate", hue="grade")
p.set(xlabel='Annual Income', ylabel='Interest Rate')
plt.title("Interest Rate based on Annual Income and Corresponding Grade",  fontsize=14)
plt.savefig('Images/scatter.png', dpi=300,bbox_inches = 'tight')
In [124]:
#graph number5 (compare those with public_record_bankrupt and those without)
df_loans['delinq'] = np.where(df_loans['delinq_2y']== 0, True, False)
# print(df_loans['bankruptcy'])
plt.figure(figsize = (60,60))
p=sns.lmplot(x="debt_to_income_joint", y="interest_rate", col="issue_month", hue="delinq", data=df_loans,
           markers=["o", "x"], palette="Set1");
p.set(xlabel='Debt to Income', ylabel='Interest Rate')
fig = p.fig 
fig.suptitle("Interest Rate based on Debt to Income Ratio", fontsize=12, y=1.08)
plt.savefig('Images/lmplot.png', dpi=300,bbox_inches = 'tight')
<Figure size 4320x4320 with 0 Axes>
In [290]:
#Create a feature set and create a model which predicts interest rate using at least 2 algorithms. 

#Describe any data cleansing that must be performed and analysis when examining the data.
#Also describe assumptions you made and your approach.
#Visualize the test results and propose enhancements to the model, what would you do if you had more time. 
from sklearn.model_selection import train_test_split
#cleaning and feature set
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None) 
my_list = df_loans.columns.values.tolist()
# my_list.remove('months_since_90d_late')
# my_list.remove('months_since_last_delinq')
# my_list.remove('debt_to_income_joint')
# my_list.remove('verification_income_joint')
# my_list.remove('annual_income_joint')
# my_list.remove('emp_title')
# my_list.remove('emp_length')
# my_list.remove('months_since_last_credit_inquiry')
# my_list.remove('num_accounts_120d_past_due')
# my_list.remove('state')
objects = df_loans.select_dtypes(include=['object']).columns
for i in objects:
    my_list.remove(i)
df_loans2 = df_loans[my_list].dropna(how='any',axis=0)
df_loans2 = df_loans2.fillna(df_loans2.mean())
y = df_loans2['interest_rate']
my_list.remove('interest_rate')
df2 = df_loans2[my_list]
# df2 = pd.get_dummies(df2, columns=df2.select_dtypes(include=['object']).columns)
# X = df_loans2[my_list]
X = df2
X = X.values #returns a numpy array
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
df = pd.DataFrame(x_scaled)
x = df
# configure to select all features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from matplotlib import pyplot
# feature selection
def select_features(X_train, y_train, X_test):
    # configure to select all features
    fs = SelectKBest(score_func=f_classif, k='all')
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
    X_train_fs = fs.transform(X_train)
    # transform test input data
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
# for i in range(len(fs.scores_)):
# 	print('Feature %d: %f' % (i, fs.scores_[i]))
# # plot the scores
# pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
# pyplot.show()
#create dataframe
scores = pd.DataFrame(list(zip(fs.scores_, df2.columns)),
               columns =['FS Score', 'FeatureName'])
scores = scores.sort_values(by=['FS Score'], ascending=False)
scores = scores.set_index('FeatureName')
plt.figure(figsize=(30,10))
# print(scores.FeatureName.values)
x = sns.barplot(x=scores.index, y="FS Score", palette="Set2", data=scores,order=scores.index[0:10])
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:114: UserWarning: Features [16 22 23] are constant.
  warnings.warn("Features %s are constant." % constant_features_idx,
/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:116: RuntimeWarning: invalid value encountered in true_divide
  f = msb / msw
In [288]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None) 
y = df_loans['interest_rate']
x = df_loans[['sub_grade','grade','homeownership','loan_purpose']]
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12, 
             test_size=0.15)
# with new parameters
gbr = GradientBoostingRegressor(n_estimators=600, 
    max_depth=5, 
    learning_rate=0.01, 
    min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()

gbr.fit(xtrain, ytrain)

ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)

x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
MSE: 0.06
In [287]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None) 
y = df_loans['interest_rate']
df = df_loans.drop(df_loans.select_dtypes(include=['int64','float64']).columns, axis=1)
x = df
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
# print(x)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12, 
             test_size=0.15)
# with new parameters
gbr = GradientBoostingRegressor(n_estimators=600, 
    max_depth=5, 
    learning_rate=0.01, 
    min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()

gbr.fit(xtrain, ytrain)

ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)

x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
MSE: 0.06
In [301]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None) 
y = df_loans['interest_rate']
x = df_loans[['paid_interest','public_record_bankrupt','sub_grade','grade']]
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
xtrain, xtest, ytrain, ytest=train_test_split(x, y, random_state=12, 
             test_size=0.15)
# with new parameters
gbr = GradientBoostingRegressor(n_estimators=600, 
    max_depth=5, 
    learning_rate=0.01, 
    min_samples_split=3)
# with default parameters
gbr = GradientBoostingRegressor()

gbr.fit(xtrain, ytrain)

ypred = gbr.predict(xtest)
mse = mean_squared_error(ytest,ypred)
print("MSE: %.2f" % mse)

x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
# print(scores.index[0:10])
# paid_interest
MSE: 0.05
In [330]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
df_loans = pd.read_csv('loans_full_schema.csv')
pd.set_option("display.max.columns", None) 
y = df_loans['interest_rate']
df_loans = df_loans.fillna(df_loans.mean())
x = df_loans.drop('interest_rate',axis=1)
x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
data_dmatrix = xgb.DMatrix(data=x,label=y)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=123)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
RMSE: 0.524121
/opt/anaconda3/lib/python3.8/site-packages/xgboost/data.py:112: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
  warnings.warn(
In [382]:
from catboost import CatBoostRegressor
my_list = df_loans.columns.values.tolist()
# my_list.remove('months_since_90d_late')
# my_list.remove('months_since_last_delinq')
# my_list.remove('debt_to_income_joint')
my_list.remove('verification_income_joint')
my_list.remove('annual_income_joint')
my_list.remove('emp_title')
# my_list.remove('emp_length')
# my_list.remove('months_since_last_credit_inquiry')
# my_list.remove('num_accounts_120d_past_due')
my_list.remove('state')
# my_list.remove('homeownership')
my_list.remove('verified_income')
# my_list.remove('loan_purpose')
# my_list.remove('application_type')
# my_list.remove('issue_month')
# my_list.remove('loan_status')
# my_list.remove('initial_listing_status')
# my_list.remove('disbursement_method')
objects = df_loans.select_dtypes(include=['object']).columns
df_loans2 = df_loans[my_list].dropna(how='any',axis=0)
df_loans2 = df_loans2.fillna(df_loans2.mean())
y = df_loans2['interest_rate']
my_list.remove('interest_rate')
df2 = df_loans2[my_list]
X = df2
# x = X.values #returns a numpy array
pd.set_option("display.max.columns", None) 
y = df_loans['interest_rate']
# df_loans = df_loans.fillna(df_loans.mean())
# x = df_loans.drop('interest_rate',axis=1)
# x = pd.get_dummies(x, columns=x.select_dtypes(include=['object']).columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)
CAT_FEATURES = ['grade', 'sub_grade','homeownership','loan_purpose','application_type','issue_month','loan_status','initial_listing_status','disbursement_method'] #list of your categorical features
# set up the model
catboost_model = CatBoostRegressor(n_estimators=50,
                                   loss_function = 'RMSE',
                                   eval_metric = 'RMSE',
                                   cat_features = CAT_FEATURES)
# fit model
catboost_model.fit(X_train, y_train, 
                   eval_set = (X_test, y_test),
                   use_best_model = True,
                   plot = True)
preds = catboost_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
x_ax = range(len(ytest))
plt.figure(figsize=(15,10))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
print("RMSE: %f" % (rmse))
Learning rate set to 0.434419
0:	learn: 3.0290727	test: 2.9858375	best: 2.9858375 (0)	total: 3.92ms	remaining: 192ms
1:	learn: 1.9742155	test: 1.9312472	best: 1.9312472 (1)	total: 7.11ms	remaining: 171ms
2:	learn: 1.4796737	test: 1.4405998	best: 1.4405998 (2)	total: 9.54ms	remaining: 149ms
3:	learn: 1.2577354	test: 1.2277346	best: 1.2277346 (3)	total: 12.3ms	remaining: 141ms
4:	learn: 0.9903951	test: 0.9600714	best: 0.9600714 (4)	total: 15.3ms	remaining: 137ms
5:	learn: 0.8793019	test: 0.8551476	best: 0.8551476 (5)	total: 17.7ms	remaining: 130ms
6:	learn: 0.7506535	test: 0.7210207	best: 0.7210207 (6)	total: 20.6ms	remaining: 126ms
7:	learn: 0.6593646	test: 0.6180749	best: 0.6180749 (7)	total: 23.4ms	remaining: 123ms
8:	learn: 0.6186267	test: 0.5652070	best: 0.5652070 (8)	total: 26ms	remaining: 118ms
9:	learn: 0.6173373	test: 0.5652487	best: 0.5652070 (8)	total: 28.7ms	remaining: 115ms
10:	learn: 0.6075676	test: 0.5548688	best: 0.5548688 (10)	total: 31.5ms	remaining: 112ms
11:	learn: 0.6067630	test: 0.5547227	best: 0.5547227 (11)	total: 33.9ms	remaining: 107ms
12:	learn: 0.6005232	test: 0.5474828	best: 0.5474828 (12)	total: 36.9ms	remaining: 105ms
13:	learn: 0.5899092	test: 0.5325066	best: 0.5325066 (13)	total: 39.5ms	remaining: 102ms
14:	learn: 0.5863593	test: 0.5285854	best: 0.5285854 (14)	total: 41.5ms	remaining: 96.9ms
15:	learn: 0.5835324	test: 0.5276009	best: 0.5276009 (15)	total: 44.3ms	remaining: 94.1ms
16:	learn: 0.5794176	test: 0.5256546	best: 0.5256546 (16)	total: 46.6ms	remaining: 90.5ms
17:	learn: 0.5794173	test: 0.5256556	best: 0.5256546 (16)	total: 47.8ms	remaining: 84.9ms
18:	learn: 0.5751275	test: 0.5247105	best: 0.5247105 (18)	total: 50.8ms	remaining: 82.9ms
19:	learn: 0.5728119	test: 0.5238224	best: 0.5238224 (19)	total: 53.3ms	remaining: 79.9ms
20:	learn: 0.5727735	test: 0.5238242	best: 0.5238224 (19)	total: 55.1ms	remaining: 76ms
21:	learn: 0.5689593	test: 0.5222888	best: 0.5222888 (21)	total: 57.5ms	remaining: 73.2ms
22:	learn: 0.5641473	test: 0.5220487	best: 0.5220487 (22)	total: 60.7ms	remaining: 71.2ms
23:	learn: 0.5526374	test: 0.5084134	best: 0.5084134 (23)	total: 62.9ms	remaining: 68.2ms
24:	learn: 0.5525604	test: 0.5084188	best: 0.5084134 (23)	total: 65ms	remaining: 65ms
25:	learn: 0.5510476	test: 0.5081869	best: 0.5081869 (25)	total: 67.4ms	remaining: 62.2ms
26:	learn: 0.5498335	test: 0.5081699	best: 0.5081699 (26)	total: 69.5ms	remaining: 59.2ms
27:	learn: 0.5453937	test: 0.5082106	best: 0.5081699 (26)	total: 72.1ms	remaining: 56.6ms
28:	learn: 0.5420939	test: 0.5074699	best: 0.5074699 (28)	total: 75ms	remaining: 54.3ms
29:	learn: 0.5372371	test: 0.5071988	best: 0.5071988 (29)	total: 77.8ms	remaining: 51.9ms
30:	learn: 0.5308965	test: 0.5066908	best: 0.5066908 (30)	total: 80.4ms	remaining: 49.3ms
31:	learn: 0.5273909	test: 0.5058569	best: 0.5058569 (31)	total: 83.4ms	remaining: 46.9ms
32:	learn: 0.5243017	test: 0.5040859	best: 0.5040859 (32)	total: 85.7ms	remaining: 44.1ms
33:	learn: 0.5205391	test: 0.5042567	best: 0.5040859 (32)	total: 87.8ms	remaining: 41.3ms
34:	learn: 0.5177222	test: 0.5044089	best: 0.5040859 (32)	total: 90.1ms	remaining: 38.6ms
35:	learn: 0.5162744	test: 0.5041497	best: 0.5040859 (32)	total: 92.3ms	remaining: 35.9ms
36:	learn: 0.5137202	test: 0.5033154	best: 0.5033154 (36)	total: 94.7ms	remaining: 33.3ms
37:	learn: 0.5116704	test: 0.5021077	best: 0.5021077 (37)	total: 97.5ms	remaining: 30.8ms
38:	learn: 0.5061969	test: 0.5024547	best: 0.5021077 (37)	total: 99.8ms	remaining: 28.1ms
39:	learn: 0.5058026	test: 0.5028111	best: 0.5021077 (37)	total: 102ms	remaining: 25.4ms
40:	learn: 0.5037612	test: 0.5042516	best: 0.5021077 (37)	total: 104ms	remaining: 22.9ms
41:	learn: 0.4983204	test: 0.5024636	best: 0.5021077 (37)	total: 107ms	remaining: 20.4ms
42:	learn: 0.4972772	test: 0.5019073	best: 0.5019073 (42)	total: 109ms	remaining: 17.8ms
43:	learn: 0.4952416	test: 0.5004255	best: 0.5004255 (43)	total: 112ms	remaining: 15.2ms
44:	learn: 0.4919655	test: 0.5005430	best: 0.5004255 (43)	total: 114ms	remaining: 12.7ms
45:	learn: 0.4881841	test: 0.5022669	best: 0.5004255 (43)	total: 116ms	remaining: 10.1ms
46:	learn: 0.4867624	test: 0.5031545	best: 0.5004255 (43)	total: 119ms	remaining: 7.59ms
47:	learn: 0.4833233	test: 0.5006676	best: 0.5004255 (43)	total: 122ms	remaining: 5.08ms
48:	learn: 0.4819493	test: 0.4995150	best: 0.4995150 (48)	total: 125ms	remaining: 2.54ms
49:	learn: 0.4779636	test: 0.4958292	best: 0.4958292 (49)	total: 127ms	remaining: 0us

bestTest = 0.4958292269
bestIteration = 49

RMSE: 0.495829
In [ ]: