import getpass
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics
import sklearn
import imblearn
import webdav4.client
import io
import collections
import types
import warnings
import colorsys
import IPython.display
import sklearn
import xgboost
#from imblearn.over_sampling import SMOTE
import imblearn
# Render all DataFrame floats with two decimal places in the displays below.
pd.set_option('display.float_format', lambda x: f'{x:.2f}')
def checkfornan(x):
    """Return the names of columns in DataFrame *x* that contain any NaN."""
    return x.columns[x.isna().any()].tolist()
# Connect to the WebDAV share; credentials are prompted interactively
# (getpass keeps the password out of the notebook text).
client=webdav4.client.Client(base_url='https://webdav.critchley.biz', auth=(input('User:'), getpass.getpass('Password:')))
User:john Password:········
# List the dataset directory on the share.  The sort_values call is
# display-only (its result is not assigned back to df_ls).
df_ls=pd.DataFrame(client.ls('/aws/H'))
df_ls.sort_values('created')
name | href | content_length | created | modified | content_language | content_type | etag | type | display_name | |
---|---|---|---|---|---|---|---|---|---|---|
2 | aws/H/Sample_Submission_(1).csv | /aws/H/Sample_Submission_(1).csv | 468235 | 2023-12-23 22:16:18+00:00 | 2023-12-18 21:09:01+00:00 | None | text/csv | "7250b-60ccf28b69457" | file | None |
3 | aws/H/Data_Dictionary_(1)_(1)_(1)_(3).csv | /aws/H/Data_Dictionary_(1)_(1)_(1)_(3).csv | 1337 | 2023-12-23 22:16:18+00:00 | 2023-12-18 21:09:41+00:00 | None | text/csv | "539-60ccf2b1edabe" | file | None |
0 | aws/H/Test_set_(1).csv | /aws/H/Test_set_(1).csv | 6096186 | 2023-12-23 22:16:21+00:00 | 2023-12-18 21:08:23+00:00 | None | text/csv | "5d053a-60ccf2673fdf5" | file | None |
1 | aws/H/Train_set_(1).csv | /aws/H/Train_set_(1).csv | 14415377 | 2023-12-23 22:16:27+00:00 | 2023-12-18 21:07:50+00:00 | None | text/csv | "dbf611-60ccf2488fc75" | file | None |
# Download the test/train CSVs over WebDAV into in-memory buffers and parse
# them.  `f` keeps the raw BytesIO buffers (later cells re-read them without
# re-downloading); `df` holds the parsed DataFrames.
# Improvement: the original repeated the download/seek/read_csv/print sequence
# verbatim for each dataset; this loop does it once per dataset.
f=types.SimpleNamespace()
df=types.SimpleNamespace()
for dataset, row in (('test', 0), ('train', 1)):
    buf = io.BytesIO()
    # NOTE(review): rows 0/1 of df_ls are assumed to be Test_set/Train_set —
    # this relies on the listing order returned by client.ls; confirm.
    client.download_fileobj(df_ls['href'][row], buf)
    buf.seek(0)  # rewind before parsing
    setattr(f, dataset, buf)
    frame = pd.read_csv(buf, low_memory=False)
    setattr(df, dataset, frame)
    print(frame.shape)
#del f
(39933, 22) (93174, 23)
# Re-parse both frames from the cached in-memory buffers.  This duplicates the
# cell above; it lets downstream state be rebuilt without re-downloading.
f.test.seek(0)
df.test=pd.read_csv(f.test, low_memory=False)
print(df.test.shape)
f.train.seek(0)
df.train=pd.read_csv(f.train, low_memory=False)
print(df.train.shape)
(39933, 22) (93174, 23)
# Peek at the test set.
df.test
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | debt_to_income | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 3 years | 15.31 | C | C2 | <5 Years | MORTGAGE | 73400.00 | Not Verified | ... | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | INDIVIDUAL | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 3 years | 7.26 | A | A4 | 10+ years | MORTGAGE | 105000.00 | Not Verified | ... | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | INDIVIDUAL | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 3 years | 14.33 | C | C1 | 6-10 years | MORTGAGE | 50000.00 | Verified | ... | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | INDIVIDUAL | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 3 years | 9.99 | B | B3 | 6-10 years | RENT | 37000.00 | Source Verified | ... | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | INDIVIDUAL | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 3 years | 11.53 | B | B5 | <5 Years | RENT | 75000.00 | Source Verified | ... | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | INDIVIDUAL | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39928 | 57779318 | 5000 | 3 years | 8.18 | B | B1 | 10+ years | MORTGAGE | 65000.00 | Source Verified | ... | 13.09 | 0.00 | 5.00 | 6731 | 24.00 | 187.27 | INDIVIDUAL | 26.00 | 14422.00 | 26100.00 |
39929 | 59742362 | 1800 | 3 years | 11.53 | B | B5 | 6-10 years | MORTGAGE | 55000.00 | Source Verified | ... | 25.44 | 0.00 | 1.00 | 11092 | 38.00 | 81.24 | INDIVIDUAL | 26.00 | 243469.00 | 29200.00 |
39930 | 72657145 | 15200 | 3 years | 13.44 | C | C3 | 10+ years | MORTGAGE | 78000.00 | Not Verified | ... | 19.40 | 0.00 | 0.00 | 19688 | 26.00 | 0.00 | INDIVIDUAL | NaN | 145370.00 | 45400.00 |
39931 | 15220189 | 14425 | 5 years | 18.92 | D | D4 | <5 Years | MORTGAGE | 38000.00 | Not Verified | ... | 17.40 | 0.00 | 1.00 | 10805 | 29.00 | 4268.80 | INDIVIDUAL | 96.00 | 106449.00 | 19700.00 |
39932 | 21810584 | 27000 | 5 years | 20.20 | E | E3 | <5 Years | MORTGAGE | 62000.00 | Source Verified | ... | 24.12 | 1.00 | 0.00 | 17795 | 17.00 | 7436.41 | INDIVIDUAL | 78.00 | 55787.00 | 33500.00 |
39933 rows × 22 columns
# Peek at the training set (it carries the extra 'default' target column).
df.train
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 72199369 | 9000 | 3 years | 9.17 | B | B2 | <5 Years | OWN | 85000.00 | Not Verified | ... | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | INDIVIDUAL | 4.00 | 95493.00 | 84100.00 | 0 |
1 | 14257956 | 18000 | 3 years | 13.65 | C | C1 | <5 Years | OWN | 64000.00 | Verified | ... | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | INDIVIDUAL | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 66216451 | 16000 | 3 years | 7.26 | A | A4 | <5 Years | MORTGAGE | 150000.00 | Source Verified | ... | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | INDIVIDUAL | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 46974169 | 25000 | 3 years | 13.99 | C | C4 | NaN | MORTGAGE | 59800.00 | Verified | ... | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | INDIVIDUAL | 17.00 | 183208.00 | 55400.00 | 0 |
4 | 46725961 | 17000 | 3 years | 6.39 | A | A2 | 10+ years | MORTGAGE | 72000.00 | Source Verified | ... | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | INDIVIDUAL | 39.00 | 23990.00 | 81300.00 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 65577252 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | ... | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 | 0 |
93170 | 836021 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | ... | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN | 1 |
93171 | 33058720 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | ... | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 | 0 |
93172 | 4060472 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | ... | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 | 1 |
93173 | 3628127 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | ... | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 | 0 |
93174 rows × 23 columns
# Name of the target column, plus a combined test+train frame (target removed)
# for distribution checks over the whole population.
dep_var = 'default'
features_only = df.train.drop(columns=dep_var)
df.all = pd.concat([df.test, features_only], ignore_index=True)
df.all
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | debt_to_income | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 3 years | 15.31 | C | C2 | <5 Years | MORTGAGE | 73400.00 | Not Verified | ... | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | INDIVIDUAL | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 3 years | 7.26 | A | A4 | 10+ years | MORTGAGE | 105000.00 | Not Verified | ... | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | INDIVIDUAL | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 3 years | 14.33 | C | C1 | 6-10 years | MORTGAGE | 50000.00 | Verified | ... | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | INDIVIDUAL | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 3 years | 9.99 | B | B3 | 6-10 years | RENT | 37000.00 | Source Verified | ... | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | INDIVIDUAL | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 3 years | 11.53 | B | B5 | <5 Years | RENT | 75000.00 | Source Verified | ... | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | INDIVIDUAL | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
133102 | 65577252 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | ... | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 |
133103 | 836021 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | ... | 22.64 | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN |
133104 | 33058720 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | ... | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 |
133105 | 4060472 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | ... | 17.22 | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 |
133106 | 3628127 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | ... | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 |
133107 rows × 22 columns
# Per-column summary of df.all.  Despite the names, BOTH columns are
# percentages of the row count: 'nunique' = % distinct values,
# 'hasna' = % missing values.
pd_summary=pd.DataFrame(dict(nunique=df.all.nunique()*100.0/len(df.all),hasna=df.all.isna().sum()*100.0/len(df.all)))
pd_summary
nunique | hasna | |
---|---|---|
ID | 100.00 | 0.00 |
loan_amnt | 1.01 | 0.00 |
loan_term | 0.00 | 0.00 |
interest_rate | 0.38 | 0.00 |
loan_grade | 0.01 | 0.00 |
loan_subgrade | 0.03 | 0.00 |
job_experience | 0.00 | 5.10 |
home_ownership | 0.00 | 0.00 |
annual_income | 8.64 | 0.00 |
income_verification_status | 0.00 | 0.00 |
loan_purpose | 0.00 | 0.00 |
state_code | 0.04 | 0.00 |
debt_to_income | 3.01 | 0.00 |
delinq_2yrs | 0.02 | 0.00 |
public_records | 0.01 | 0.00 |
revolving_balance | 30.94 | 0.00 |
total_acc | 0.08 | 0.00 |
interest_receive | 69.15 | 0.00 |
application_type | 0.00 | 0.00 |
last_week_pay | 0.07 | 2.05 |
total_current_balance | 72.83 | 7.98 |
total_revolving_limit | 4.23 | 7.98 |
# Friendlier column names, applied below to every frame in the df namespace.
renames={'interest_receive':'interest_received',
'loan_amnt':'loan_amount',
'delinq_2yrs':'delinquent_for_2y',
'total_acc': 'credit_lines',
'public_records': 'legal_cases',
'last_week_pay': 'months_paid'
}
# Rename in place on each DataFrame (df.test/df.train/df.all).  The guard
# skips names that are already renamed, so the cell is safe to re-run.
for dfi, dfd in df.__dict__.items():
    dfd.rename({oldname:newname for oldname,newname in renames.items() if oldname in dfd.columns and newname not in dfd.columns}, axis=1, inplace=True)
# Boolean Series flagging the high-cardinality (>10 distinct values) columns.
criteria=(df.all.nunique()>10)
# Show only those columns of the test set.
df.test[criteria.index[criteria]]
ID | loan_amount | interest_rate | loan_subgrade | annual_income | state_code | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 15.31 | C2 | 73400.00 | CA | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 7.26 | A4 | 105000.00 | VA | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 14.33 | C1 | 50000.00 | TX | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 9.99 | B3 | 37000.00 | NJ | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 11.53 | B5 | 75000.00 | CA | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39928 | 57779318 | 5000 | 8.18 | B1 | 65000.00 | MI | 13.09 | 0.00 | 5.00 | 6731 | 24.00 | 187.27 | 26.00 | 14422.00 | 26100.00 |
39929 | 59742362 | 1800 | 11.53 | B5 | 55000.00 | AZ | 25.44 | 0.00 | 1.00 | 11092 | 38.00 | 81.24 | 26.00 | 243469.00 | 29200.00 |
39930 | 72657145 | 15200 | 13.44 | C3 | 78000.00 | TX | 19.40 | 0.00 | 0.00 | 19688 | 26.00 | 0.00 | NaN | 145370.00 | 45400.00 |
39931 | 15220189 | 14425 | 18.92 | D4 | 38000.00 | IN | 17.40 | 0.00 | 1.00 | 10805 | 29.00 | 4268.80 | 96.00 | 106449.00 | 19700.00 |
39932 | 21810584 | 27000 | 20.20 | E3 | 62000.00 | FL | 24.12 | 1.00 | 0.00 | 17795 | 17.00 | 7436.41 | 78.00 | 55787.00 | 33500.00 |
39933 rows × 15 columns
# Dtypes and non-null counts for the training set.
df.train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 93174 entries, 0 to 93173 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 93174 non-null int64 1 loan_amount 93174 non-null int64 2 loan_term 93174 non-null object 3 interest_rate 93174 non-null float64 4 loan_grade 93174 non-null object 5 loan_subgrade 93174 non-null object 6 job_experience 88472 non-null object 7 home_ownership 93174 non-null object 8 annual_income 93173 non-null float64 9 income_verification_status 93174 non-null object 10 loan_purpose 93174 non-null object 11 state_code 93174 non-null object 12 debt_to_income 93174 non-null float64 13 delinquent_for_2y 93172 non-null float64 14 legal_cases 93172 non-null float64 15 revolving_balance 93174 non-null int64 16 credit_lines 93172 non-null float64 17 interest_received 93174 non-null float64 18 application_type 93174 non-null object 19 months_paid 91250 non-null float64 20 total_current_balance 85788 non-null float64 21 total_revolving_limit 85788 non-null float64 22 default 93174 non-null int64 dtypes: float64(10), int64(4), object(9) memory usage: 16.3+ MB
# Dtypes and non-null counts for the test set.
df.test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39933 entries, 0 to 39932 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 39933 non-null int64 1 loan_amount 39933 non-null int64 2 loan_term 39933 non-null object 3 interest_rate 39933 non-null float64 4 loan_grade 39933 non-null object 5 loan_subgrade 39933 non-null object 6 job_experience 37844 non-null object 7 home_ownership 39933 non-null object 8 annual_income 39933 non-null float64 9 income_verification_status 39933 non-null object 10 loan_purpose 39933 non-null object 11 state_code 39933 non-null object 12 debt_to_income 39933 non-null float64 13 delinquent_for_2y 39932 non-null float64 14 legal_cases 39932 non-null float64 15 revolving_balance 39933 non-null int64 16 credit_lines 39932 non-null float64 17 interest_received 39933 non-null float64 18 application_type 39933 non-null object 19 months_paid 39127 non-null float64 20 total_current_balance 36703 non-null float64 21 total_revolving_limit 36703 non-null float64 dtypes: float64(10), int64(3), object(9) memory usage: 6.7+ MB
# Dtypes and non-null counts for the combined frame.
df.all.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 133107 entries, 0 to 133106 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 133107 non-null int64 1 loan_amount 133107 non-null int64 2 loan_term 133107 non-null object 3 interest_rate 133107 non-null float64 4 loan_grade 133107 non-null object 5 loan_subgrade 133107 non-null object 6 job_experience 126316 non-null object 7 home_ownership 133107 non-null object 8 annual_income 133106 non-null float64 9 income_verification_status 133107 non-null object 10 loan_purpose 133107 non-null object 11 state_code 133107 non-null object 12 debt_to_income 133107 non-null float64 13 delinquent_for_2y 133104 non-null float64 14 legal_cases 133104 non-null float64 15 revolving_balance 133107 non-null int64 16 credit_lines 133104 non-null float64 17 interest_received 133107 non-null float64 18 application_type 133107 non-null object 19 months_paid 130377 non-null float64 20 total_current_balance 122491 non-null float64 21 total_revolving_limit 122491 non-null float64 dtypes: float64(10), int64(3), object(9) memory usage: 22.3+ MB
# Scratch copy of the training data without the ID column.
d=df.train.drop('ID', axis=1).copy()
d
loan_amount | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | loan_purpose | ... | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | application_type | months_paid | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9000 | 3 years | 9.17 | B | B2 | <5 Years | OWN | 85000.00 | Not Verified | debt_consolidation | ... | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | INDIVIDUAL | 4.00 | 95493.00 | 84100.00 | 0 |
1 | 18000 | 3 years | 13.65 | C | C1 | <5 Years | OWN | 64000.00 | Verified | debt_consolidation | ... | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | INDIVIDUAL | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 16000 | 3 years | 7.26 | A | A4 | <5 Years | MORTGAGE | 150000.00 | Source Verified | debt_consolidation | ... | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | INDIVIDUAL | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 25000 | 3 years | 13.99 | C | C4 | NaN | MORTGAGE | 59800.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | INDIVIDUAL | 17.00 | 183208.00 | 55400.00 | 0 |
4 | 17000 | 3 years | 6.39 | A | A2 | 10+ years | MORTGAGE | 72000.00 | Source Verified | credit_card | ... | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | INDIVIDUAL | 39.00 | 23990.00 | 81300.00 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | debt_consolidation | ... | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 | 0 |
93170 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | other | ... | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN | 1 |
93171 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | credit_card | ... | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 | 0 |
93172 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 | 1 |
93173 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 | 0 |
93174 rows × 22 columns
# Distribution of loan terms, ascending by count.
d['loan_term'].value_counts().sort_values()
loan_term 5 years 27963 3 years 65211 Name: count, dtype: int64
del d
# Quick-look histograms (with KDE overlays) for every high-cardinality column;
# a richer per-column plotting pass follows later in the notebook.
dims = (11.7, 8.27)
dfi = df.all
for colname in dfi.columns:
    if colname in {'ID'}:
        continue
    if dfi[colname].nunique() >= 10:
        print(colname)
        fig, ax = plt.subplots(figsize=dims)
        sns.histplot(dfi[colname], kde=True, ax=ax)
        ax.set(title=colname)
        plt.show()
# Small value container for the distribution markers drawn on the KDE plots.
Statistics = collections.namedtuple('Statistics', ['mean', 'median', 'percentile_25', 'percentile_75'])
def stats(data):
    """Return mean/median/quartiles of *data* as a Statistics tuple.

    NaN behaviour is asymmetric by design: pandas mean()/median() skip NaNs,
    while np.percentile propagates them (callers guard with np.isnan).
    """
    lower_q, upper_q = np.percentile(data, [25, 75])
    return Statistics(data.mean(), data.median(), lower_q, upper_q)
# Class-balance pie chart for the target variable.
class_counts = df.train[dep_var].value_counts()

def _pct_label(value):
    # Slice label: percentage with two decimals.
    return f'{value:.2f}%'

plt.title(f'Initial ratio of {dep_var}')
plt.pie(class_counts,
        labels=class_counts.index,
        autopct=_pct_label,
        colors=['#ff6060', '#60ff60'],
        explode=[0.005, 0.005])
plt.show()
def mkjumptable(grp, labels):
    """Render an HTML jump table: an anchor target plus one link per label.

    *grp* names the index (embedded in the anchor ids); *labels* supplies the
    link texts.  Labels in {'ID'} are skipped.
    """
    parts = [f'<h2><a id="{grp}_index"></a>Go to:</h2>', '<ul>']
    parts.extend(
        f'<li><a href="#{colname}_{grp}">{colname}</a></li>'
        for colname in labels
        if colname not in {'ID'}
    )
    parts.append('</ul>')
    IPython.display.display(IPython.display.HTML('\n'.join(parts)))
def placelabel(grp, label):
    """Emit the anchor target for *label* under index *grp*, with a back-link."""
    markup = (
        f'<a id="{label}_{grp}" href="#{grp}_index" '
        'style="text-decoration:none">🠝</a>'
        f'<h2>{label}</h2>'
    )
    IPython.display.display(IPython.display.HTML(markup))
# Full EDA pass over df.all: a jump table, then one chart per column —
# bar chart for low-cardinality columns, KDE with mean/median/quartile
# markers for high-cardinality numerics.  Figures with NaNs get a red border.
dims = (11.7, 8.27)   # figure size in inches
thechange=50          # cardinality threshold: <= 50 distinct -> bar chart, else KDE
narep=0               # replacement for numeric NaNs in the bar-chart counts
dfi=df.all
mkjumptable('plot', dfi.columns)
for colname in dfi.columns:
    if colname in {'ID'}: continue
    placelabel('plot', colname)
    col=dfi[colname]
    if col.nunique()<=thechange:
        if col.dtype.type == np.object_:
            print("Categorical")
            c=col.fillna('Unknown').astype('category').value_counts()#.sort_values()
        else:
            c=col.fillna(narep).value_counts()#.sort_values(ascending=True)
        print(c)
        fig, ax = plt.subplots(figsize=dims)
        # ax.set_title(colname)
        sns.barplot(y=c, x=c.index, ax=ax)
    else:
        if col.nunique()>thechange:
            if col.dtype.type == np.object_:
                print("XXXXXX")   # high-cardinality text column: nothing sensible to plot
                continue
        print('LargeNumeric')
        # NOTE(review): `.all()` replaces NaNs only when the WHOLE column is
        # NaN; otherwise np.percentile propagates NaN (guarded by the isnan
        # checks below).  Possibly `.any()` was intended — confirm.
        c=col.fillna(narep) if col.isna().all() else col
        s=stats(c)
        fig, ax = plt.subplots(figsize=dims)
        # ax.set_title(colname)
        sns.kdeplot(x=col, ax=ax)
        plt.axvline(s.mean, color='g', linestyle='--', label=f'Mean: {s.mean:.2f}')
        if not np.isnan(s.percentile_25):
            plt.axvline(s.percentile_25, color='r', linestyle=':', label=f'25th Percentile: {s.percentile_25:.2f}')
        plt.axvline(s.median, color='r', linestyle='-', label=f'Median: {s.median:.2f}')
        if not np.isnan(s.percentile_75):
            plt.axvline(s.percentile_75, color='r', linestyle=':', label=f'75th Percentile: {s.percentile_75:.2f}')
        plt.legend()
    if col.isna().any():
        fig.patch.set_edgecolor('red') # Put a red box around any which have NaNs
        fig.patch.set_linewidth(2)
        print("NaN:", col.isna().sum())
    plt.show()
LargeNumeric
Categorical loan_term 3 years 93321 5 years 39786 Name: count, dtype: int64
LargeNumeric
Categorical loan_grade B 38416 C 36735 A 22298 D 20952 E 10536 F 3374 G 796 Name: count, dtype: int64
Categorical loan_subgrade B3 8426 B4 8398 C1 7879 C2 7850 C3 7490 B2 7375 B5 7342 C4 7266 B1 6875 A5 6792 C5 6250 D1 5292 A4 5190 D2 4612 D3 3934 D4 3858 A3 3524 A1 3444 A2 3348 D5 3256 E1 2727 E2 2491 E3 2165 E4 1747 E5 1406 F1 1057 F2 799 F3 660 F4 502 F5 356 G1 245 G2 218 G3 151 G5 97 G4 85 Name: count, dtype: int64
Categorical job_experience <5 Years 57900 10+ years 43508 6-10 years 24908 Unknown 6791 Name: count, dtype: int64 NaN: 6791
Categorical home_ownership MORTGAGE 66453 RENT 53387 OWN 13233 OTHER 24 NONE 10 Name: count, dtype: int64
LargeNumeric NaN: 1
Categorical income_verification_status Source Verified 49267 Verified 43645 Not Verified 40195 Name: count, dtype: int64
Categorical loan_purpose debt_consolidation 78714 credit_card 30954 other 15664 home_improvement 7775 Name: count, dtype: int64
Categorical state_code CA 19675 NY 11191 TX 10630 FL 9074 IL 5252 NJ 4977 PA 4712 OH 4476 GA 4349 VA 3987 NC 3732 MI 3416 MD 3214 MA 3177 AZ 3032 WA 2946 CO 2793 MN 2346 MO 2135 IN 1997 CT 1989 TN 1922 NV 1870 AL 1713 WI 1680 SC 1602 OR 1581 LA 1570 KY 1347 OK 1216 KS 1146 AR 1009 UT 938 NM 721 WV 673 HI 660 NH 659 RI 568 MS 547 DC 370 DE 369 MT 366 AK 331 WY 294 SD 263 VT 262 NE 184 ME 79 ND 65 ID 2 Name: count, dtype: int64
LargeNumeric
delinquent_for_2y 0.00 107638 1.00 16846 2.00 5019 3.00 1869 4.00 786 5.00 396 6.00 213 7.00 113 8.00 78 9.00 43 10.00 28 12.00 24 11.00 17 13.00 12 14.00 11 17.00 3 15.00 3 18.00 3 16.00 2 19.00 1 21.00 1 22.00 1 Name: count, dtype: int64 NaN: 3
legal_cases 0.00 112785 1.00 16997 2.00 2164 3.00 665 4.00 244 5.00 112 6.00 73 7.00 26 8.00 15 9.00 9 10.00 7 11.00 4 12.00 2 21.00 1 23.00 1 13.00 1 49.00 1 Name: count, dtype: int64 NaN: 3
LargeNumeric
LargeNumeric NaN: 3
LargeNumeric
Categorical application_type INDIVIDUAL 133027 JOINT 80 Name: count, dtype: int64
LargeNumeric NaN: 2730
LargeNumeric NaN: 10616
LargeNumeric NaN: 10616
Location = collections.namedtuple('Location', ['longitude', 'latitude'])
# Geographic centroids of the US states plus DC, keyed by postal abbreviation.
# BUG FIX: the original dict paired abbreviations sorted alphabetically with
# centroid values sorted by FULL state name (and omitted IA and WY entirely),
# so almost every key carried another state's coordinates — e.g. 'AK' held
# Alabama's centroid, 'KS' held Iowa's, 'WY' held Wisconsin's, DC/DE were
# swapped.  Values are re-paired correctly below; Iowa is restored from the
# orphaned value and Wyoming's centroid is supplied.
state_centroids = {'AK': Location(longitude=-152.2782, latitude=64.0685),
 'AL': Location(longitude=-86.8287, latitude=32.7794),
 'AR': Location(longitude=-92.4426, latitude=34.8938),
 'AZ': Location(longitude=-111.6602, latitude=34.2744),
 'CA': Location(longitude=-119.4696, latitude=37.1841),
 'CO': Location(longitude=-105.5478, latitude=38.9972),
 'CT': Location(longitude=-72.7273, latitude=41.6219),
 'DC': Location(longitude=-77.0147, latitude=38.9101),
 'DE': Location(longitude=-75.505, latitude=38.9896),
 'FL': Location(longitude=-82.4497, latitude=28.6305),
 'GA': Location(longitude=-83.4426, latitude=32.6415),
 'HI': Location(longitude=-156.3737, latitude=20.2927),
 'IA': Location(longitude=-93.496, latitude=42.0751),
 'ID': Location(longitude=-114.613, latitude=44.3509),
 'IL': Location(longitude=-89.1965, latitude=40.0417),
 'IN': Location(longitude=-86.2816, latitude=39.8942),
 'KS': Location(longitude=-98.3804, latitude=38.4937),
 'KY': Location(longitude=-85.3021, latitude=37.5347),
 'LA': Location(longitude=-91.9968, latitude=31.0689),
 'MA': Location(longitude=-71.8083, latitude=42.2596),
 'MD': Location(longitude=-76.7909, latitude=39.055),
 'ME': Location(longitude=-69.2428, latitude=45.3695),
 'MI': Location(longitude=-85.4102, latitude=44.3467),
 'MN': Location(longitude=-94.3053, latitude=46.2807),
 'MO': Location(longitude=-92.458, latitude=38.3566),
 'MS': Location(longitude=-89.6678, latitude=32.7364),
 'MT': Location(longitude=-109.6333, latitude=47.0527),
 'NC': Location(longitude=-79.3877, latitude=35.5557),
 'ND': Location(longitude=-100.4659, latitude=47.4501),
 'NE': Location(longitude=-99.6809, latitude=41.5),
 'NH': Location(longitude=-71.5811, latitude=43.6805),
 'NJ': Location(longitude=-74.6728, latitude=40.1907),
 'NM': Location(longitude=-106.1126, latitude=34.4071),
 'NV': Location(longitude=-116.6312, latitude=39.3289),
 'NY': Location(longitude=-75.5268, latitude=42.9538),
 'OH': Location(longitude=-82.7937, latitude=40.2862),
 'OK': Location(longitude=-97.4943, latitude=35.5889),
 'OR': Location(longitude=-120.5583, latitude=43.9336),
 'PA': Location(longitude=-77.7996, latitude=40.8781),
 'RI': Location(longitude=-71.5562, latitude=41.6762),
 'SC': Location(longitude=-80.8964, latitude=33.9169),
 'SD': Location(longitude=-100.2263, latitude=44.4443),
 'TN': Location(longitude=-86.3505, latitude=35.858),
 'TX': Location(longitude=-99.3312, latitude=31.4757),
 'UT': Location(longitude=-111.6703, latitude=39.3055),
 'VA': Location(longitude=-78.8537, latitude=37.5215),
 'VT': Location(longitude=-72.6658, latitude=44.0687),
 'WA': Location(longitude=-120.4472, latitude=47.3826),
 'WI': Location(longitude=-89.9941, latitude=44.6243),
 'WV': Location(longitude=-80.6227, latitude=38.6409),
 'WY': Location(longitude=-107.5512, latitude=42.9957)}
# Display the centroid lookup table.
state_centroids
{'AK': Location(longitude=-86.8287, latitude=32.7794), 'AL': Location(longitude=-152.2782, latitude=64.0685), 'AR': Location(longitude=-111.6602, latitude=34.2744), 'AZ': Location(longitude=-92.4426, latitude=34.8938), 'CA': Location(longitude=-119.4696, latitude=37.1841), 'CO': Location(longitude=-105.5478, latitude=38.9972), 'CT': Location(longitude=-72.7273, latitude=41.6219), 'DC': Location(longitude=-75.505, latitude=38.9896), 'DE': Location(longitude=-77.0147, latitude=38.9101), 'FL': Location(longitude=-82.4497, latitude=28.6305), 'GA': Location(longitude=-83.4426, latitude=32.6415), 'HI': Location(longitude=-156.3737, latitude=20.2927), 'ID': Location(longitude=-114.613, latitude=44.3509), 'IL': Location(longitude=-89.1965, latitude=40.0417), 'IN': Location(longitude=-86.2816, latitude=39.8942), 'KS': Location(longitude=-93.496, latitude=42.0751), 'KY': Location(longitude=-98.3804, latitude=38.4937), 'LA': Location(longitude=-85.3021, latitude=37.5347), 'MA': Location(longitude=-91.9968, latitude=31.0689), 'MD': Location(longitude=-69.2428, latitude=45.3695), 'ME': Location(longitude=-76.7909, latitude=39.055), 'MI': Location(longitude=-71.8083, latitude=42.2596), 'MN': Location(longitude=-85.4102, latitude=44.3467), 'MO': Location(longitude=-94.3053, latitude=46.2807), 'MS': Location(longitude=-89.6678, latitude=32.7364), 'MT': Location(longitude=-92.458, latitude=38.3566), 'NC': Location(longitude=-109.6333, latitude=47.0527), 'ND': Location(longitude=-99.6809, latitude=41.5), 'NE': Location(longitude=-116.6312, latitude=39.3289), 'NH': Location(longitude=-71.5811, latitude=43.6805), 'NJ': Location(longitude=-74.6728, latitude=40.1907), 'NM': Location(longitude=-106.1126, latitude=34.4071), 'NV': Location(longitude=-75.5268, latitude=42.9538), 'NY': Location(longitude=-79.3877, latitude=35.5557), 'OH': Location(longitude=-100.4659, latitude=47.4501), 'OK': Location(longitude=-82.7937, latitude=40.2862), 'OR': Location(longitude=-97.4943, 
latitude=35.5889), 'PA': Location(longitude=-120.5583, latitude=43.9336), 'RI': Location(longitude=-77.7996, latitude=40.8781), 'SC': Location(longitude=-71.5562, latitude=41.6762), 'SD': Location(longitude=-80.8964, latitude=33.9169), 'TN': Location(longitude=-100.2263, latitude=44.4443), 'TX': Location(longitude=-86.3505, latitude=35.858), 'UT': Location(longitude=-99.3312, latitude=31.4757), 'VA': Location(longitude=-111.6703, latitude=39.3055), 'VT': Location(longitude=-72.6658, latitude=44.0687), 'WA': Location(longitude=-78.8537, latitude=37.5215), 'WI': Location(longitude=-120.4472, latitude=47.3826), 'WV': Location(longitude=-80.6227, latitude=38.6409), 'WY': Location(longitude=-89.9941, latitude=44.6243)}
# Numeric summary statistics for the training set.
df.train.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ID | 93174.00 | 35050211.39 | 24149262.07 | 70735.00 | 10859832.50 | 37107507.00 | 58598949.50 | 73519746.00 |
loan_amount | 93174.00 | 14733.86 | 8428.18 | 500.00 | 8000.00 | 13000.00 | 20000.00 | 35000.00 |
interest_rate | 93174.00 | 13.23 | 4.37 | 5.32 | 9.99 | 12.99 | 16.20 | 28.99 |
annual_income | 93173.00 | 75028.26 | 69454.78 | 1200.00 | 45000.00 | 64000.00 | 90000.00 | 9500000.00 |
debt_to_income | 93174.00 | 18.13 | 8.56 | 0.00 | 11.93 | 17.64 | 23.89 | 672.52 |
delinquent_for_2y | 93172.00 | 0.32 | 0.88 | 0.00 | 0.00 | 0.00 | 0.00 | 22.00 |
legal_cases | 93172.00 | 0.20 | 0.58 | 0.00 | 0.00 | 0.00 | 0.00 | 49.00 |
revolving_balance | 93174.00 | 16854.47 | 23689.07 | 0.00 | 6433.00 | 11856.00 | 20745.00 | 2560703.00 |
credit_lines | 93172.00 | 25.25 | 11.86 | 1.00 | 17.00 | 24.00 | 32.00 | 119.00 |
interest_received | 93174.00 | 1747.26 | 2088.24 | 0.00 | 439.88 | 1070.76 | 2219.61 | 23172.31 |
months_paid | 91250.00 | 58.15 | 44.33 | 0.00 | 22.00 | 48.00 | 83.00 | 291.00 |
total_current_balance | 85788.00 | 139252.92 | 157686.79 | 0.00 | 29642.00 | 79363.50 | 207160.00 | 8000078.00 |
total_revolving_limit | 85788.00 | 32085.90 | 47052.51 | 0.00 | 14000.00 | 23700.00 | 39700.00 | 9999999.00 |
default | 93174.00 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
# Numeric summary statistics for the test set.
df.test.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ID | 39933.00 | 34811489.37 | 24217502.30 | 74301.00 | 10751721.00 | 36805086.00 | 58459985.00 | 73519693.00 |
loan_amount | 39933.00 | 14781.99 | 8427.85 | 500.00 | 8000.00 | 13000.00 | 20000.00 | 35000.00 |
interest_rate | 39933.00 | 13.22 | 4.39 | 5.32 | 9.99 | 12.99 | 16.20 | 28.99 |
annual_income | 39933.00 | 75264.80 | 56556.66 | 3300.00 | 45262.00 | 65000.00 | 90000.00 | 5000000.00 |
debt_to_income | 39933.00 | 18.12 | 8.49 | 0.00 | 11.96 | 17.61 | 23.88 | 380.53 |
delinquent_for_2y | 39932.00 | 0.31 | 0.85 | 0.00 | 0.00 | 0.00 | 0.00 | 17.00 |
legal_cases | 39932.00 | 0.19 | 0.56 | 0.00 | 0.00 | 0.00 | 0.00 | 23.00 |
revolving_balance | 39933.00 | 17019.28 | 26129.74 | 0.00 | 6482.00 | 11949.00 | 20928.00 | 2568995.00 |
credit_lines | 39932.00 | 25.26 | 11.77 | 1.00 | 17.00 | 24.00 | 32.00 | 114.00 |
interest_received | 39933.00 | 1764.74 | 2095.44 | 0.00 | 441.47 | 1077.72 | 2260.99 | 21811.29 |
months_paid | 39127.00 | 58.42 | 44.48 | 0.00 | 22.00 | 48.00 | 83.00 | 278.00 |
total_current_balance | 36703.00 | 140462.51 | 156575.93 | 0.00 | 29807.00 | 80590.00 | 209590.50 | 3881449.00 |
total_revolving_limit | 36703.00 | 32453.21 | 61835.45 | 0.00 | 14000.00 | 23900.00 | 40100.00 | 9999999.00 |
# Pairwise correlations of the numeric training columns (ID dropped).
corr=df.train.drop(['ID'],axis=1).select_dtypes(exclude=['object']).corr()
correlation_limit=0.75
too_correlated=list()
seen=set()
# Visit each unordered pair exactly once: (y,x) is recorded before (x,y) is
# tested, so the mirror pair and the diagonal are skipped automatically.
for x in corr.columns:
    for y in corr.index:
        seen.add((y,x))
        if (x,y) not in seen:
            if abs(corr[x][y])>correlation_limit:
                too_correlated.append(y)
del seen
plt.figure(figsize=(10, 8)) # You can adjust the size as needed
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
# Show the plot
plt.show()
# Columns whose |correlation| with some other feature exceeds the limit.
for c in too_correlated:
    print(c)
total_revolving_limit
# Bivariate views: categorical variables (v1s) against numeric variables
# (v2s), split by the target (v3).
v1s=['loan_term', 'loan_purpose', 'income_verification_status', 'job_experience']
v2s=['loan_amount', 'interest_rate']
v3='default'
def mk_colour_patches(n):
    """Yield 2*n hex colour strings ('#rrggbb').

    Hues are sampled at n points over [0, 1.5) of the HSV wheel; each hue is
    emitted twice, once strongly saturated (2/3) and once pale (1/3), always
    at value 0.5.
    """
    value = 0.5
    for hue in np.linspace(0, 1, n, endpoint=False) * 1.5:
        for saturation in (2/3, 1/3):
            channels = np.array(colorsys.hsv_to_rgb(hue, saturation, value)) * 255
            channels = channels.round().astype(dtype=np.uint8)
            yield '#' + ''.join(f'{channel:02x}' for channel in channels)
# Separator used in the jump-table labels for each (categorical, numeric) pair.
sep=' 🙼 '
# mkjumptable/placelabel are notebook helpers defined in an earlier cell.
mkjumptable('bivariant', [f'{v1}{sep}{v2}' for v1 in v1s for v2 in v2s ])
colour_patches=list(mk_colour_patches(len(v1s)*len(v2s)))
for v1 in v1s:
    for v2 in v2s:
        # pop() takes patches from the END of the list, so colours are consumed
        # in reverse generation order (presumably intentional styling choice).
        cp1,cp2=colour_patches.pop(), colour_patches.pop()
        placelabel('bivariant', f"{v1}{sep}{v2}", )
        plt.xticks(rotation=45, horizontalalignment='right')
        # Split violins: one half per target class (hue=v3), one colour each.
        ax=sns.violinplot(data=df.train, x=v1, y=v2, hue=v3, split=True,
                          palette=[ cp1,cp2 ], cut=1.5, bw=0.4) #, order=df.train['annual_income'].value_counts().index
        plt.show()
# Numeric-only view of the training data, used for outlier-trimmed plots.
df.pltdata = df.train.drop(['ID'], axis=1).select_dtypes(include=['number'])
# Per-column boolean masks keeping rows BELOW the upper Tukey fence
# (Q3 + 1.5*IQR).  NOTE(review): only the upper fence is applied, so low-side
# outliers are kept; NaN comparisons are False, so NaN rows are masked out too
# — confirm both are intended.
df_tmp = pd.DataFrame()
for col in df.pltdata.columns:
    firstQ, thirdQ = df.pltdata[col].quantile([.25, .75])
    iqr = thirdQ - firstQ
    sele = df.pltdata[col] < 1.5 * iqr + thirdQ
    if not sele.any():
        # Degenerate columns (e.g. the 0/1 target where Q3 == 0) would mask
        # out every row; skip them instead of wiping the whole frame.
        continue
    df_tmp[col] = sele
    # (A stray no-op statement `df_tmp.all(axis=1)` was removed from this loop.)
del firstQ, thirdQ, iqr, sele  # ,df_tmp
# Cap legal_cases at 8 for plotting.  np.where (rather than clip) means NaN
# values become 8 as well, since NaN < 8 is False.
df.pltdata['legal_cases'] = np.where(df.train['legal_cases'] < 8, df.train['legal_cases'], 8)
df.pltdata[df_tmp.all(axis=1)]
loan_amount | interest_rate | annual_income | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 18000 | 13.65 | 64000.00 | 31.67 | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 16000 | 7.26 | 150000.00 | 19.70 | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 25000 | 13.99 | 59800.00 | 37.39 | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | 17.00 | 183208.00 | 55400.00 | 0 |
5 | 10000 | 12.69 | 56000.00 | 16.16 | 0.00 | 1.00 | 6643 | 48.00 | 590.50 | 26.00 | 24054.00 | 25167.00 | 0 |
6 | 15000 | 11.14 | 100000.00 | 10.51 | 0.00 | 1.00 | 14088 | 14.00 | 1020.57 | 35.00 | 14519.00 | 15200.00 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93163 | 19000 | 5.32 | 120000.00 | 16.25 | 0.00 | 0.00 | 8712 | 18.00 | 467.14 | 26.00 | 220697.00 | 57200.00 | 0 |
93167 | 7800 | 12.29 | 21000.00 | 12.97 | 0.00 | 0.00 | 8197 | 8.00 | 479.81 | 31.00 | 93130.00 | 17200.00 | 0 |
93169 | 3200 | 7.26 | 85000.00 | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | 13.00 | 64635.00 | 47600.00 | 0 |
93171 | 8000 | 13.98 | 148531.50 | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | 65.00 | 94596.00 | 6500.00 | 0 |
93173 | 10000 | 15.80 | 60000.00 | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | 104.00 | 25594.00 | 12300.00 | 0 |
67390 rows × 13 columns
# The pairplot is slow; only draw it when the notebook defines a truthy
# `printslow` flag.  (At module level vars() and globals() are the same dict.)
if globals().get('printslow'):
    if 'warnings' in globals():
        warnings.filterwarnings('ignore', category=UserWarning, module='seaborn.axisgrid')
    sns.pairplot(data=df.pltdata[df_tmp.all(axis=1)], diag_kind="kde")
    plt.show()
# Rebuild the feature matrix X and target y from scratch on re-runs.
if 'X' in df.__dict__:
    del df.X
if 'y' in df.__dict__:
    del df.y
df.y=df.train[dep_var]
# Categorical columns one-hot encoded in a later cell.
dummyize=[
    'job_experience',
    'loan_grade',
    'loan_term',
    'income_verification_status',
    'loan_purpose',
    'application_type',
    'home_ownership',
]
drops=[
    dep_var, # this is going into y
    'ID', # id is unique and probably adds no value
    'state_code', # will be converted to lat/long
    'loan_subgrade', # will be converted to number (integer)
]
# Numeric features (NaN -> 0) plus the state-centroid coordinates.
df.X=pd.concat(
    (
        df.train.drop(drops+too_correlated+dummyize, axis=1).fillna(0),
        pd.DataFrame(df.train['state_code'].apply(lambda x: state_centroids[x]).tolist())
    ), axis=1)
# BUG FIX: the two assignments below were fused onto one physical line (a
# syntax error in the export); split into separate statements.
# NOTE(review): they appear to rewrite the centroid columns created by the
# concat above with the same values — confirm whether both steps are needed.
df.X['longitude']=df.train['state_code'].apply(lambda x: state_centroids[x].longitude)
df.X['latitude']=df.train['state_code'].apply(lambda x: state_centroids[x].latitude)
# Indicator columns (<col>_nan) recording which cells were NaN before fillna(0).
nans=df.train.loc[:,df.train.isna().any()].isna().astype(np.int8)
nans.columns+='_nan'
# Sanity check: fillna(0) above must have removed every NaN from X.
assert not checkfornan(df.X),f"NaN values found in columns: {checkfornan(df.X)}"
df.X = pd.concat([df.X, nans], axis=1)
df.X
loan_amount | interest_rate | annual_income | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | ... | longitude | latitude | job_experience_nan | annual_income_nan | delinquent_for_2y_nan | legal_cases_nan | credit_lines_nan | months_paid_nan | total_current_balance_nan | total_revolving_limit_nan | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9000 | 9.17 | 85000.00 | 26.68 | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | 4.00 | ... | -109.63 | 47.05 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 18000 | 13.65 | 64000.00 | 31.67 | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | 95.00 | ... | -71.56 | 41.68 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 16000 | 7.26 | 150000.00 | 19.70 | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | 13.00 | ... | -86.35 | 35.86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 25000 | 13.99 | 59800.00 | 37.39 | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | 17.00 | ... | -94.31 | 46.28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 17000 | 6.39 | 72000.00 | 8.92 | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | 39.00 | ... | -86.35 | 35.86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 3200 | 7.26 | 85000.00 | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | 13.00 | ... | -82.45 | 28.63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93170 | 3500 | 5.42 | 57550.00 | 22.64 | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | 161.00 | ... | -82.45 | 28.63 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
93171 | 8000 | 13.98 | 148531.50 | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | 65.00 | ... | -86.83 | 32.78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93172 | 35000 | 17.77 | 100000.00 | 17.22 | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | 56.00 | ... | -119.47 | 37.18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93173 | 10000 | 15.80 | 60000.00 | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | 104.00 | ... | -74.67 | 40.19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93174 rows × 21 columns
# Manual one-hot encoding of the categorical columns.  # Or use pd.get_dummies
for colname in dummyize:
    all_colvals=df.train[colname].unique()
    # Skipping the first unique value mirrors get_dummies' drop_first.
    # NOTE(review): unique() can include NaN, and `== NaN` is always False,
    # producing an all-zero dummy column for it — confirm intended.
    for colval in all_colvals[1:]: # the [1:] is like drop_first param of get_dummies
        df.X[f'{colname}_{colval}']=(df.train[colname]==colval).astype(np.int8)
df.X.columns
Index(['loan_amount', 'interest_rate', 'annual_income', 'debt_to_income', 'delinquent_for_2y', 'legal_cases', 'revolving_balance', 'credit_lines', 'interest_received', 'months_paid', 'total_current_balance', 'longitude', 'latitude', 'job_experience_nan', 'annual_income_nan', 'delinquent_for_2y_nan', 'legal_cases_nan', 'credit_lines_nan', 'months_paid_nan', 'total_current_balance_nan', 'total_revolving_limit_nan', 'job_experience_10+ years', 'job_experience_6-10 years', 'loan_grade_C', 'loan_grade_A', 'loan_grade_E', 'loan_grade_D', 'loan_grade_F', 'loan_grade_G', 'loan_term_5 years', 'income_verification_status_Verified', 'income_verification_status_Source Verified', 'loan_purpose_credit_card', 'loan_purpose_other', 'loan_purpose_home_improvement', 'application_type_JOINT', 'home_ownership_MORTGAGE', 'home_ownership_RENT', 'home_ownership_NONE', 'home_ownership_OTHER'], dtype='object')
def _subgrade_to_score(subgrade):
    """Map a loan subgrade like 'B3' to an integer: 5 * grade-letter-index + (digit - 1)."""
    return (ord(subgrade[0]) - ord('A')) * 5 + ord(subgrade[1]) - ord('1')
# Ordinal-encode loan_subgrade ('A1'..'G5' -> 0..34).
df.X['loan_score'] = df.train['loan_subgrade'].apply(_subgrade_to_score)
df.X['loan_score']
0 6 1 10 2 3 3 13 4 1 .. 93169 3 93170 0 93171 12 93172 15 93173 12 Name: loan_score, Length: 93174, dtype: int64
dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=dims)
# One unit-width bin per integer score.  BUG FIX: the bin edges previously ran
# range(min+1, max), which silently excluded the lowest and highest scores from
# the histogram; edges must span min .. max+1 inclusive to cover every value.
sns.histplot(df.X, x='loan_score', ax=ax,
             bins=range(df.X['loan_score'].min(),
                        df.X['loan_score'].max() + 2))
<Axes: xlabel='loan_score', ylabel='Count'>
df.X.info()  # dtype/memory overview of the engineered feature matrix
<class 'pandas.core.frame.DataFrame'> RangeIndex: 93174 entries, 0 to 93173 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 loan_amount 93174 non-null int64 1 interest_rate 93174 non-null float64 2 annual_income 93174 non-null float64 3 debt_to_income 93174 non-null float64 4 delinquent_for_2y 93174 non-null float64 5 legal_cases 93174 non-null float64 6 revolving_balance 93174 non-null int64 7 credit_lines 93174 non-null float64 8 interest_received 93174 non-null float64 9 months_paid 93174 non-null float64 10 total_current_balance 93174 non-null float64 11 longitude 93174 non-null float64 12 latitude 93174 non-null float64 13 job_experience_nan 93174 non-null int8 14 annual_income_nan 93174 non-null int8 15 delinquent_for_2y_nan 93174 non-null int8 16 legal_cases_nan 93174 non-null int8 17 credit_lines_nan 93174 non-null int8 18 months_paid_nan 93174 non-null int8 19 total_current_balance_nan 93174 non-null int8 20 total_revolving_limit_nan 93174 non-null int8 21 job_experience_10+ years 93174 non-null int8 22 job_experience_6-10 years 93174 non-null int8 23 loan_grade_C 93174 non-null int8 24 loan_grade_A 93174 non-null int8 25 loan_grade_E 93174 non-null int8 26 loan_grade_D 93174 non-null int8 27 loan_grade_F 93174 non-null int8 28 loan_grade_G 93174 non-null int8 29 loan_term_5 years 93174 non-null int8 30 income_verification_status_Verified 93174 non-null int8 31 income_verification_status_Source Verified 93174 non-null int8 32 loan_purpose_credit_card 93174 non-null int8 33 loan_purpose_other 93174 non-null int8 34 loan_purpose_home_improvement 93174 non-null int8 35 application_type_JOINT 93174 non-null int8 36 home_ownership_MORTGAGE 93174 non-null int8 37 home_ownership_RENT 93174 non-null int8 38 home_ownership_NONE 93174 non-null int8 39 home_ownership_OTHER 93174 non-null int8 40 loan_score 93174 non-null int64 dtypes: float64(11), int64(3), int8(27) memory usage: 12.4 MB
# Preview swatch: eight HSV-derived colours (4 hues x 2 saturations) rendered
# as inline HTML squares.
colour_patches = (
    '#' + ''.join(f'{value:02x}'
                  for value in (np.array(colorsys.hsv_to_rgb(hue, sat, 0.5)) * 255)
                                .round().astype(dtype=np.uint8))
    for hue in [0.0, .25, .5, .75]
    for sat in [2/3, 1/3]
)
IPython.display.HTML('\n'.join(
    [f'<div style="width:25px; height:25px; background-color:{colour_patch};display:inline-block"></div>'
     for colour_patch in colour_patches]))
def markstartof(my_marker):
    """Render an HTML anchor plus an <h2> heading so jump-table links can target it."""
    anchor = f'<a id="{my_marker}"></a><h2>{my_marker}</h2>'
    IPython.display.display(IPython.display.HTML(anchor))
# 60/40 train/validation split of the engineered features.
df.X_train, df.X_test, df.y_train, df.y_test = sklearn.model_selection.train_test_split(df.X, df.y, test_size=0.4, random_state=1)
models=dict()  # fitted estimators keyed by model name
scores=dict()  # metric records keyed by model name
# Record type bundling the five evaluation metrics tracked for every model.
score = collections.namedtuple(
    'score',
    'accuracy recall precision f1 roc_auc',
)
def mkscore(pred, y):
    """Build a `score` record for predictions *pred* against truth *y*.

    Each field name of the `score` namedtuple maps to the sklearn metric of the
    same name suffixed with '_score' (accuracy_score, recall_score, ...), each
    called as metric(y_true, y_pred).
    """
    values = []
    for field in score._fields:
        metric_fn = getattr(sklearn.metrics, f'{field}_{score.__name__}')
        values.append(metric_fn(y, pred))
    return score(*values)
def plot_confusion(y, ŷ):
    """Plot a 2x2 confusion-matrix heatmap annotated with counts and percentages."""
    cm = sklearn.metrics.confusion_matrix(y, ŷ)
    flat = cm.flatten()
    total = flat.sum()
    # Each cell shows "count\npercent-of-all-samples".
    cell_texts = []
    for item in flat:
        cell_texts.append(["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)])
    labels = np.asarray(cell_texts).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
def tree_graph(model, feature_names, fix_arrows=False):
    """Draw a fitted decision tree.

    When fix_arrows is True, the edge arrows are recoloured black and the
    figure is shown; otherwise the function returns right after plotting.
    """
    plt.figure(figsize=(20, 60))
    annotations = sklearn.tree.plot_tree(
        model,
        feature_names=feature_names,
        filled=True,
        fontsize=9,
        node_ids=True,
        class_names=True,
    )
    if not fix_arrows:
        return
    for annotation in annotations:
        arrow = annotation.arrow_patch
        if arrow is None:
            continue
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
    plt.show()
feature_names = list(df.X_test)  # column names of the feature matrix (iterating a DataFrame yields columns)
print(feature_names)
['loan_amount', 'interest_rate', 'annual_income', 'debt_to_income', 'delinquent_for_2y', 'legal_cases', 'revolving_balance', 'credit_lines', 'interest_received', 'months_paid', 'total_current_balance', 'longitude', 'latitude', 'job_experience_nan', 'annual_income_nan', 'delinquent_for_2y_nan', 'legal_cases_nan', 'credit_lines_nan', 'months_paid_nan', 'total_current_balance_nan', 'total_revolving_limit_nan', 'job_experience_10+ years', 'job_experience_6-10 years', 'loan_grade_C', 'loan_grade_A', 'loan_grade_E', 'loan_grade_D', 'loan_grade_F', 'loan_grade_G', 'loan_term_5 years', 'income_verification_status_Verified', 'income_verification_status_Source Verified', 'loan_purpose_credit_card', 'loan_purpose_other', 'loan_purpose_home_improvement', 'application_type_JOINT', 'home_ownership_MORTGAGE', 'home_ownership_RENT', 'home_ownership_NONE', 'home_ownership_OTHER', 'loan_score']
# Class balance of the target before any resampling.
counts = df.train[dep_var].value_counts()
plt.title(f'Initial ratio of {dep_var}')
plt.pie(
    counts,
    labels=counts.index,
    autopct=lambda pct: f'{pct:.2f}%',
    colors=['#ff6060', '#60ff60'],
    explode=[0.005, .005],
)
plt.show()
# Fresh result containers.  NOTE(review): this re-creates the dicts initialised
# a few cells earlier, discarding anything stored since.
scores = dict()
ŷs = dict()
%%time
# Fit six baseline classifiers with default hyper-parameters on the raw
# (imbalanced) training split and record their test-set metrics.
random_state=1
input_models = [
    sklearn.tree.DecisionTreeClassifier(random_state=random_state),
    sklearn.ensemble.BaggingClassifier(random_state=random_state),
    sklearn.ensemble.RandomForestClassifier(random_state=random_state),
    sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
    sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
    xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss")
]
# mkjumptable/placelabel are notebook navigation helpers from an earlier cell.
mkjumptable('model', [type(model).__name__ for model in input_models]) # add patches
for model in input_models:
    mname=type(model).__name__
    placelabel('model', mname)
    model.fit(df.X_train, df.y_train)
    IPython.display.display(model)
    ŷs[mname] = model.predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
    # if 'printslow' in vars() and printslow:
    #     tree_graph(model, feature_names)
    # Can't do this on all types; do we want to do this (or something else) as appropriate?
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
score(accuracy=0.7823718808693318, recall=0.5667804323094425, precision=0.5365643511039311, f1=0.5512586445366529, roc_auc=0.7078459745816876)
BaggingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(random_state=1)
score(accuracy=0.8517842768983096, recall=0.4987485779294653, precision=0.7968011632133769, f1=0.6134900643716764, roc_auc=0.7297464799759686)
RandomForestClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=1)
score(accuracy=0.8275288435739201, recall=0.32992036405005687, precision=0.8435136707388017, f1=0.474321229964017, roc_auc=0.6555149573059272)
GradientBoostingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(random_state=1)
score(accuracy=0.8309095787496646, recall=0.3502844141069397, precision=0.8389645776566758, f1=0.4942215088282504, roc_auc=0.6647665048062789)
AdaBoostClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(random_state=1)
score(accuracy=0.8198819425811644, recall=0.3078498293515358, precision=0.8113943028485757, f1=0.4463505154639176, roc_auc=0.6428820775971164)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8752079420445399, recall=0.6022753128555176, precision=0.8209024654985269, f1=0.6947962464728656, roc_auc=0.7808602687873093)
CPU times: total: 1min 11s Wall time: 44.5 s
pd.DataFrame.from_dict(scores, orient='index')  # one row of metrics per fitted model
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
# Balance the training split by randomly undersampling the majority class.
# NOTE(review): replacement=True draws the kept rows *with* replacement, which
# is unusual for undersampling — confirm this is intended.
rus = imblearn.under_sampling.RandomUnderSampler(random_state=0, replacement=True)
df.X_resampled, df.y_resampled = rus.fit_resample(df.X_train, df.y_train)
counts=df.y_resampled.value_counts()
plt.title(f'Resampled ratio of {dep_var}')
plt.pie(counts, labels=counts.index, autopct=lambda x:f'{x:.2f}%', colors=['#ff6060','#60ff60'], explode=[0.005,.005])
plt.show()
%%time
# Re-train the same six model families on the undersampled (balanced) split
# and record metrics under "<Model>_undersampling" keys.
if 'scores' not in globals():
    scores=dict()
if 'ŷs' not in globals():
    ŷs=dict()
random_state=1
# NOTE(review): "urdersample" is a typo for "undersample" (left as-is; renaming
# would be a code change, not a comment change).
input_models_urdersample = [
    sklearn.tree.DecisionTreeClassifier(random_state=random_state),
    sklearn.ensemble.BaggingClassifier(random_state=random_state),
    sklearn.ensemble.RandomForestClassifier(random_state=random_state),
    sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
    sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
    xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss")
]
mkjumptable('model', [type(model).__name__+'_undersampling' for model in input_models_urdersample]) # add patches
for model in input_models_urdersample:
    mname=type(model).__name__+'_undersampling'
    placelabel('model', mname)
    model.fit(df.X_resampled, df.y_resampled)  # train on the balanced split
    IPython.display.display(model)
    ŷs[mname] = model.predict(df.X_test)  # evaluate on the untouched test split
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
    # if 'printslow' in vars() and printslow:
    #     tree_graph(model, feature_names)
    # Can't do this on all types; do we want to do this (or something else) as appropriate?
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
score(accuracy=0.6675073785886773, recall=0.7072810011376565, precision=0.3876901970566226, f1=0.5008458873761379, roc_auc=0.6812563713553451)
BaggingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(random_state=1)
score(accuracy=0.7943654413737591, recall=0.7006825938566553, precision=0.5503037884203003, f1=0.6164548093283956, roc_auc=0.7619810441193391)
RandomForestClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=1)
score(accuracy=0.7504158840890797, recall=0.7212741751990899, precision=0.4805942995755003, f1=0.5768355927577109, roc_auc=0.7403421437793203)
GradientBoostingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(random_state=1)
score(accuracy=0.7857526160450765, recall=0.7036405005688282, precision=0.5348032857760484, f1=0.6077130926062393, roc_auc=0.7573680030934028)
AdaBoostClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(random_state=1)
score(accuracy=0.7495841159109203, recall=0.6502844141069397, precision=0.4773277661795407, f1=0.550541777028654, roc_auc=0.7152580778399868)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8425006707807888, recall=0.7952218430034129, precision=0.6320072332730561, f1=0.7042821158690177, roc_auc=0.8261572698163133)
CPU times: total: 35.5 s Wall time: 21.3 s
# Combined metrics table: baseline + undersampling runs.
df_val_results=pd.DataFrame.from_dict(scores, orient='index') #.sort_values('accuracy')
df_val_results
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
DecisionTreeClassifier_undersampling | 0.67 | 0.71 | 0.39 | 0.50 | 0.68 |
BaggingClassifier_undersampling | 0.79 | 0.70 | 0.55 | 0.62 | 0.76 |
RandomForestClassifier_undersampling | 0.75 | 0.72 | 0.48 | 0.58 | 0.74 |
GradientBoostingClassifier_undersampling | 0.79 | 0.70 | 0.53 | 0.61 | 0.76 |
AdaBoostClassifier_undersampling | 0.75 | 0.65 | 0.48 | 0.55 | 0.72 |
XGBClassifier_undersampling | 0.84 | 0.80 | 0.63 | 0.70 | 0.83 |
params=dict()  # best hyper-parameters found per grid-searched model, keyed by model name
%%time
# 5-fold GridSearchCV over each model family.  Each entry below is
# (estimator, parameter_grid, enabled_flag); set the flag False to skip one.
random_state=1
input_models = [
    (sklearn.tree.DecisionTreeClassifier(random_state=random_state),
     dict(
         criterion= ["gini", "entropy"],
         max_depth= [None]+np.arange(2, 50, 4).tolist(),
         max_leaf_nodes= [None]+np.arange(2, 20, 4).tolist(),
     ),
     True),
    (sklearn.ensemble.BaggingClassifier(random_state=random_state),
     dict(
         n_estimators= np.arange(8, 14, 2),
         bootstrap= [True, False],
         bootstrap_features= [True, False],
     ),
     True
    ),
    (sklearn.ensemble.RandomForestClassifier(random_state=random_state),
     dict(
         criterion= ['entropy', 'log_loss', 'gini'],
         max_depth= [None]+np.arange(2, 50, 4).tolist(),
         max_leaf_nodes= [None]+np.arange(2, 20, 4).tolist(),
     ),
     True),
    (sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
     dict(
         # loss=['log_loss', 'exponential'],
         # learning_rate=np.linspace(0.15,0.3,3),
         learning_rate=[0.15],
         n_estimators=[10000],
         # n_estimators=[100000],
         # n_estimators=10**np.arange(1, 7, 2),
         # subsample = [0.95, 1.0],
         max_depth=[11],
         # max_depth=[None]+np.arange(10, 20, 10).tolist(),
         # criterion=['friedman_mse', 'squared_error'],
         # max_leaf_nodes=[None]+np.arange(10, 20, 5).tolist(),
     ), True),
    (sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
     dict(
         learning_rate=2.0**np.arange(-1,2,1),
         # n_estimators=10**np.arange(1, 6, 3),
         n_estimators=[10000],
         # algorithm= ['SAMME', 'SAMME.R']
     ),
     True
    ),
    (xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss"),
     dict(
         # grow_policy=['depthwise','lossguide'],
         # tree_method = ['hist', 'approx'],
         # max_leaves = np.arange(0,10,2)**2,
         booster =['gbtree', 'gblinear', 'dart']
     ),True
    ),
]
# Keep only the (estimator, grid) pairs whose enabled flag is True.
input_models=[(m[0], m[1]) for m in input_models if len(m)==3 and m[2]]
n_jobs=-1
mkjumptable('GridSearchCV', [type(model).__name__+'_grid_search' for model, _ in input_models]) # add patches
for (estimator,parameters) in input_models:
    mname=type(estimator).__name__+'_grid_search'
    placelabel('GridSearchCV', mname)
    # Grid of parameters to choose from
    # Type of scoring used to compare parameter combinations
    # NOTE(review): scoring is accuracy; the trailing "f1_score" comment
    # suggests f1 may have been intended — confirm which metric should drive
    # the search on this imbalanced target.
    scorer = sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score # f1_score
                                        )
    # Run the grid search
    grid_obj = sklearn.model_selection.GridSearchCV(estimator, parameters, scoring=scorer, cv=5, n_jobs=n_jobs)
    grid_obj = grid_obj.fit(df.X_train, df.y_train)
    # get the best combination of parameters
    models[mname] = grid_obj.best_estimator_
    # Fit the best algorithm to the data.
    # NOTE(review): with the default refit=True, best_estimator_ is already
    # fitted on the full X_train, so this extra fit appears redundant.
    models[mname].fit(df.X_train, df.y_train)
    IPython.display.display(grid_obj.best_estimator_)
    ŷs[mname] = models[mname].predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    params[mname]=grid_obj.best_params_
    plot_confusion(df.y_test, ŷs[mname])
DecisionTreeClassifier(criterion='entropy', max_depth=14, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=14, random_state=1)
score(accuracy=0.8287362489938288, recall=0.41547212741751993, precision=0.7457627118644068, f1=0.5336450646598963, roc_auc=0.6858786198885353)
BaggingClassifier(n_estimators=12, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(n_estimators=12, random_state=1)
score(accuracy=0.8562382613361953, recall=0.5133105802047782, precision=0.8068669527896996, f1=0.6274509803921569, roc_auc=0.7376946159450858)
RandomForestClassifier(criterion='entropy', max_depth=30, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(criterion='entropy', max_depth=30, random_state=1)
score(accuracy=0.8286557552991682, recall=0.3328782707622298, precision=0.8486078886310905, f1=0.47818270959307074, roc_auc=0.6572748095384183)
GradientBoostingClassifier(learning_rate=0.15, max_depth=11, n_estimators=10000, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(learning_rate=0.15, max_depth=11, n_estimators=10000, random_state=1)
score(accuracy=0.8829085055004025, recall=0.6581342434584755, precision=0.8097704367301232, f1=0.726120246014811, roc_auc=0.8052082734146311)
AdaBoostClassifier(n_estimators=10000, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(n_estimators=10000, random_state=1)
score(accuracy=0.8476254360075127, recall=0.4805460750853242, precision=0.7914558740865655, f1=0.5980038224676152, roc_auc=0.7207330094527744)
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8752079420445399, recall=0.6022753128555176, precision=0.8209024654985269, f1=0.6947962464728656, roc_auc=0.7808602687873093)
CPU times: total: 1h 4min 6s Wall time: 1h 57min 7s
# Full metrics table: baseline + undersampling + grid-search runs.
df_val_results=pd.DataFrame.from_dict(scores, orient='index') #.sort_values('accuracy')
df_val_results
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
DecisionTreeClassifier_undersampling | 0.67 | 0.71 | 0.39 | 0.50 | 0.68 |
BaggingClassifier_undersampling | 0.79 | 0.70 | 0.55 | 0.62 | 0.76 |
RandomForestClassifier_undersampling | 0.75 | 0.72 | 0.48 | 0.58 | 0.74 |
GradientBoostingClassifier_undersampling | 0.79 | 0.70 | 0.53 | 0.61 | 0.76 |
AdaBoostClassifier_undersampling | 0.75 | 0.65 | 0.48 | 0.55 | 0.72 |
XGBClassifier_undersampling | 0.84 | 0.80 | 0.63 | 0.70 | 0.83 |
DecisionTreeClassifier_grid_search | 0.83 | 0.42 | 0.75 | 0.53 | 0.69 |
BaggingClassifier_grid_search | 0.86 | 0.51 | 0.81 | 0.63 | 0.74 |
RandomForestClassifier_grid_search | 0.83 | 0.33 | 0.85 | 0.48 | 0.66 |
GradientBoostingClassifier_grid_search | 0.88 | 0.66 | 0.81 | 0.73 | 0.81 |
AdaBoostClassifier_grid_search | 0.85 | 0.48 | 0.79 | 0.60 | 0.72 |
XGBClassifier_grid_search | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
# Long-format view of the metrics table for grouped bar plotting.
melted_df_val_results = df_val_results.reset_index(names=['Name']).melt(
    id_vars=["Name"], var_name="Metric", value_name="Value")
plt.figure(figsize=(12, 7))
sns.barplot(data=melted_df_val_results, x='Name', y='Value', hue='Metric')
plt.title('Metrics by Name')
plt.ylabel('Value')
plt.xlabel('Name')
plt.xticks(rotation=30, ha='right')  # Rotate x-axis labels by 30 degrees and align to the
plt.legend(title='Metric', loc='upper left', bbox_to_anchor=(1, 1))  # Move legend outside
plt.show()
models['XGBClassifier_grid_search']
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
n_jobs=-1
foo='re_search'
# Re-fit the best grid-search GBM configuration, this time on the
# undersampled (balanced) split.
redo_models=[sklearn.ensemble.GradientBoostingClassifier(random_state=random_state,learning_rate=0.15, max_depth=11, n_estimators=10000)]
# Pass display names rather than estimator objects, matching the other
# mkjumptable calls in this notebook.
mkjumptable(foo, [type(m).__name__+'_'+foo for m in redo_models]) # add patches
for estimator in redo_models:
    mname=type(estimator).__name__+'_'+foo
    placelabel(foo, mname)
    # BUG FIX: this loop previously called `model.fit(...)` / `model.predict(...)`,
    # silently reusing the leftover `model` variable from an earlier cell
    # instead of the estimator defined here (the cell output even showed an
    # XGBClassifier repr).  Use the loop variable throughout.
    estimator.fit(df.X_resampled, df.y_resampled)
    IPython.display.display(estimator)
    ŷs[mname] = estimator.predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8425006707807888, recall=0.7952218430034129, precision=0.6320072332730561, f1=0.7042821158690177, roc_auc=0.8261572698163133)