import getpass
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics
import sklearn
import imblearn
import webdav4.client
import io
import collections
import types
import warnings
import colorsys
import IPython.display
import sklearn
import xgboost
#from imblearn.over_sampling import SMOTE
import imblearn
# Render all DataFrame floats with two decimal places in the displays below.
pd.set_option('display.float_format', lambda x: f'{x:.2f}')
def checkfornan(x):
    """Return the names of columns in DataFrame *x* that contain any NaN."""
    return x.columns[x.isna().any()].tolist()
# Connect to the WebDAV share; credentials are prompted interactively
# (getpass keeps the password out of the notebook text).
client=webdav4.client.Client(base_url='https://webdav.critchley.biz', auth=(input('User:'), getpass.getpass('Password:')))
User:john Password:········
# List the dataset directory on the share.  The sort_values call is
# display-only (its result is not assigned back to df_ls).
df_ls=pd.DataFrame(client.ls('/aws/H'))
df_ls.sort_values('created')
name | href | content_length | created | modified | content_language | content_type | etag | type | display_name | |
---|---|---|---|---|---|---|---|---|---|---|
2 | aws/H/Sample_Submission_(1).csv | /aws/H/Sample_Submission_(1).csv | 468235 | 2023-12-23 22:16:18+00:00 | 2023-12-18 21:09:01+00:00 | None | text/csv | "7250b-60ccf28b69457" | file | None |
3 | aws/H/Data_Dictionary_(1)_(1)_(1)_(3).csv | /aws/H/Data_Dictionary_(1)_(1)_(1)_(3).csv | 1337 | 2023-12-23 22:16:18+00:00 | 2023-12-18 21:09:41+00:00 | None | text/csv | "539-60ccf2b1edabe" | file | None |
0 | aws/H/Test_set_(1).csv | /aws/H/Test_set_(1).csv | 6096186 | 2023-12-23 22:16:21+00:00 | 2023-12-18 21:08:23+00:00 | None | text/csv | "5d053a-60ccf2673fdf5" | file | None |
1 | aws/H/Train_set_(1).csv | /aws/H/Train_set_(1).csv | 14415377 | 2023-12-23 22:16:27+00:00 | 2023-12-18 21:07:50+00:00 | None | text/csv | "dbf611-60ccf2488fc75" | file | None |
# Download the test/train CSVs over WebDAV into in-memory buffers and parse
# them.  `f` keeps the raw BytesIO buffers (later cells re-read them without
# re-downloading); `df` holds the parsed DataFrames.
# Improvement: the original repeated the download/seek/read_csv/print sequence
# verbatim for each dataset; this loop does it once per dataset.
f=types.SimpleNamespace()
df=types.SimpleNamespace()
for dataset, row in (('test', 0), ('train', 1)):
    buf = io.BytesIO()
    # NOTE(review): rows 0/1 of df_ls are assumed to be Test_set/Train_set —
    # this relies on the listing order returned by client.ls; confirm.
    client.download_fileobj(df_ls['href'][row], buf)
    buf.seek(0)  # rewind before parsing
    setattr(f, dataset, buf)
    frame = pd.read_csv(buf, low_memory=False)
    setattr(df, dataset, frame)
    print(frame.shape)
#del f
(39933, 22) (93174, 23)
# Re-parse both frames from the cached in-memory buffers.  This duplicates the
# cell above; it lets downstream state be rebuilt without re-downloading.
f.test.seek(0)
df.test=pd.read_csv(f.test, low_memory=False)
print(df.test.shape)
f.train.seek(0)
df.train=pd.read_csv(f.train, low_memory=False)
print(df.train.shape)
(39933, 22) (93174, 23)
# Peek at the test set.
df.test
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | debt_to_income | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 3 years | 15.31 | C | C2 | <5 Years | MORTGAGE | 73400.00 | Not Verified | ... | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | INDIVIDUAL | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 3 years | 7.26 | A | A4 | 10+ years | MORTGAGE | 105000.00 | Not Verified | ... | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | INDIVIDUAL | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 3 years | 14.33 | C | C1 | 6-10 years | MORTGAGE | 50000.00 | Verified | ... | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | INDIVIDUAL | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 3 years | 9.99 | B | B3 | 6-10 years | RENT | 37000.00 | Source Verified | ... | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | INDIVIDUAL | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 3 years | 11.53 | B | B5 | <5 Years | RENT | 75000.00 | Source Verified | ... | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | INDIVIDUAL | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39928 | 57779318 | 5000 | 3 years | 8.18 | B | B1 | 10+ years | MORTGAGE | 65000.00 | Source Verified | ... | 13.09 | 0.00 | 5.00 | 6731 | 24.00 | 187.27 | INDIVIDUAL | 26.00 | 14422.00 | 26100.00 |
39929 | 59742362 | 1800 | 3 years | 11.53 | B | B5 | 6-10 years | MORTGAGE | 55000.00 | Source Verified | ... | 25.44 | 0.00 | 1.00 | 11092 | 38.00 | 81.24 | INDIVIDUAL | 26.00 | 243469.00 | 29200.00 |
39930 | 72657145 | 15200 | 3 years | 13.44 | C | C3 | 10+ years | MORTGAGE | 78000.00 | Not Verified | ... | 19.40 | 0.00 | 0.00 | 19688 | 26.00 | 0.00 | INDIVIDUAL | NaN | 145370.00 | 45400.00 |
39931 | 15220189 | 14425 | 5 years | 18.92 | D | D4 | <5 Years | MORTGAGE | 38000.00 | Not Verified | ... | 17.40 | 0.00 | 1.00 | 10805 | 29.00 | 4268.80 | INDIVIDUAL | 96.00 | 106449.00 | 19700.00 |
39932 | 21810584 | 27000 | 5 years | 20.20 | E | E3 | <5 Years | MORTGAGE | 62000.00 | Source Verified | ... | 24.12 | 1.00 | 0.00 | 17795 | 17.00 | 7436.41 | INDIVIDUAL | 78.00 | 55787.00 | 33500.00 |
39933 rows × 22 columns
# Peek at the training set (it carries the extra 'default' target column).
df.train
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 72199369 | 9000 | 3 years | 9.17 | B | B2 | <5 Years | OWN | 85000.00 | Not Verified | ... | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | INDIVIDUAL | 4.00 | 95493.00 | 84100.00 | 0 |
1 | 14257956 | 18000 | 3 years | 13.65 | C | C1 | <5 Years | OWN | 64000.00 | Verified | ... | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | INDIVIDUAL | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 66216451 | 16000 | 3 years | 7.26 | A | A4 | <5 Years | MORTGAGE | 150000.00 | Source Verified | ... | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | INDIVIDUAL | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 46974169 | 25000 | 3 years | 13.99 | C | C4 | NaN | MORTGAGE | 59800.00 | Verified | ... | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | INDIVIDUAL | 17.00 | 183208.00 | 55400.00 | 0 |
4 | 46725961 | 17000 | 3 years | 6.39 | A | A2 | 10+ years | MORTGAGE | 72000.00 | Source Verified | ... | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | INDIVIDUAL | 39.00 | 23990.00 | 81300.00 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 65577252 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | ... | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 | 0 |
93170 | 836021 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | ... | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN | 1 |
93171 | 33058720 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | ... | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 | 0 |
93172 | 4060472 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | ... | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 | 1 |
93173 | 3628127 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | ... | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 | 0 |
93174 rows × 23 columns
# Name of the target column, plus a combined test+train frame (target removed)
# for distribution checks over the whole population.
dep_var = 'default'
features_only = df.train.drop(columns=dep_var)
df.all = pd.concat([df.test, features_only], ignore_index=True)
df.all
ID | loan_amnt | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | ... | debt_to_income | delinq_2yrs | public_records | revolving_balance | total_acc | interest_receive | application_type | last_week_pay | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 3 years | 15.31 | C | C2 | <5 Years | MORTGAGE | 73400.00 | Not Verified | ... | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | INDIVIDUAL | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 3 years | 7.26 | A | A4 | 10+ years | MORTGAGE | 105000.00 | Not Verified | ... | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | INDIVIDUAL | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 3 years | 14.33 | C | C1 | 6-10 years | MORTGAGE | 50000.00 | Verified | ... | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | INDIVIDUAL | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 3 years | 9.99 | B | B3 | 6-10 years | RENT | 37000.00 | Source Verified | ... | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | INDIVIDUAL | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 3 years | 11.53 | B | B5 | <5 Years | RENT | 75000.00 | Source Verified | ... | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | INDIVIDUAL | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
133102 | 65577252 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | ... | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 |
133103 | 836021 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | ... | 22.64 | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN |
133104 | 33058720 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | ... | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 |
133105 | 4060472 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | ... | 17.22 | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 |
133106 | 3628127 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | ... | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 |
133107 rows × 22 columns
# Per-column summary of df.all.  Despite the names, BOTH columns are
# percentages of the row count: 'nunique' = % distinct values,
# 'hasna' = % missing values.
pd_summary=pd.DataFrame(dict(nunique=df.all.nunique()*100.0/len(df.all),hasna=df.all.isna().sum()*100.0/len(df.all)))
pd_summary
nunique | hasna | |
---|---|---|
ID | 100.00 | 0.00 |
loan_amnt | 1.01 | 0.00 |
loan_term | 0.00 | 0.00 |
interest_rate | 0.38 | 0.00 |
loan_grade | 0.01 | 0.00 |
loan_subgrade | 0.03 | 0.00 |
job_experience | 0.00 | 5.10 |
home_ownership | 0.00 | 0.00 |
annual_income | 8.64 | 0.00 |
income_verification_status | 0.00 | 0.00 |
loan_purpose | 0.00 | 0.00 |
state_code | 0.04 | 0.00 |
debt_to_income | 3.01 | 0.00 |
delinq_2yrs | 0.02 | 0.00 |
public_records | 0.01 | 0.00 |
revolving_balance | 30.94 | 0.00 |
total_acc | 0.08 | 0.00 |
interest_receive | 69.15 | 0.00 |
application_type | 0.00 | 0.00 |
last_week_pay | 0.07 | 2.05 |
total_current_balance | 72.83 | 7.98 |
total_revolving_limit | 4.23 | 7.98 |
# Friendlier column names, applied below to every frame in the df namespace.
renames={'interest_receive':'interest_received',
'loan_amnt':'loan_amount',
'delinq_2yrs':'delinquent_for_2y',
'total_acc': 'credit_lines',
'public_records': 'legal_cases',
'last_week_pay': 'months_paid'
}
# Rename in place on each DataFrame (df.test/df.train/df.all).  The guard
# skips names that are already renamed, so the cell is safe to re-run.
for dfi, dfd in df.__dict__.items():
    dfd.rename({oldname:newname for oldname,newname in renames.items() if oldname in dfd.columns and newname not in dfd.columns}, axis=1, inplace=True)
# Boolean Series flagging the high-cardinality (>10 distinct values) columns.
criteria=(df.all.nunique()>10)
# Show only those columns of the test set.
df.test[criteria.index[criteria]]
ID | loan_amount | interest_rate | loan_subgrade | annual_income | state_code | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | total_current_balance | total_revolving_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4855329 | 12000 | 15.31 | C2 | 73400.00 | CA | 14.62 | 0.00 | 0.00 | 22156 | 30.00 | 2290.24 | 87.00 | 128098.00 | 25800.00 |
1 | 66862420 | 12000 | 7.26 | A4 | 105000.00 | VA | 11.38 | 0.00 | 0.00 | 7592 | 14.00 | 202.68 | 13.00 | 269396.00 | 23600.00 |
2 | 3637416 | 15000 | 14.33 | C1 | 50000.00 | TX | 28.15 | 0.00 | 1.00 | 17983 | 19.00 | 1166.24 | 30.00 | 220135.00 | 34100.00 |
3 | 53682249 | 12000 | 9.99 | B3 | 37000.00 | NJ | 34.32 | 0.00 | 0.00 | 12262 | 18.00 | 635.06 | 35.00 | 39436.00 | 21700.00 |
4 | 53937165 | 20150 | 11.53 | B5 | 75000.00 | CA | 26.74 | 1.00 | 0.00 | 8251 | 11.00 | 1232.84 | 31.00 | 52764.00 | 12000.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39928 | 57779318 | 5000 | 8.18 | B1 | 65000.00 | MI | 13.09 | 0.00 | 5.00 | 6731 | 24.00 | 187.27 | 26.00 | 14422.00 | 26100.00 |
39929 | 59742362 | 1800 | 11.53 | B5 | 55000.00 | AZ | 25.44 | 0.00 | 1.00 | 11092 | 38.00 | 81.24 | 26.00 | 243469.00 | 29200.00 |
39930 | 72657145 | 15200 | 13.44 | C3 | 78000.00 | TX | 19.40 | 0.00 | 0.00 | 19688 | 26.00 | 0.00 | NaN | 145370.00 | 45400.00 |
39931 | 15220189 | 14425 | 18.92 | D4 | 38000.00 | IN | 17.40 | 0.00 | 1.00 | 10805 | 29.00 | 4268.80 | 96.00 | 106449.00 | 19700.00 |
39932 | 21810584 | 27000 | 20.20 | E3 | 62000.00 | FL | 24.12 | 1.00 | 0.00 | 17795 | 17.00 | 7436.41 | 78.00 | 55787.00 | 33500.00 |
39933 rows × 15 columns
# Dtypes and non-null counts for the training set.
df.train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 93174 entries, 0 to 93173 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 93174 non-null int64 1 loan_amount 93174 non-null int64 2 loan_term 93174 non-null object 3 interest_rate 93174 non-null float64 4 loan_grade 93174 non-null object 5 loan_subgrade 93174 non-null object 6 job_experience 88472 non-null object 7 home_ownership 93174 non-null object 8 annual_income 93173 non-null float64 9 income_verification_status 93174 non-null object 10 loan_purpose 93174 non-null object 11 state_code 93174 non-null object 12 debt_to_income 93174 non-null float64 13 delinquent_for_2y 93172 non-null float64 14 legal_cases 93172 non-null float64 15 revolving_balance 93174 non-null int64 16 credit_lines 93172 non-null float64 17 interest_received 93174 non-null float64 18 application_type 93174 non-null object 19 months_paid 91250 non-null float64 20 total_current_balance 85788 non-null float64 21 total_revolving_limit 85788 non-null float64 22 default 93174 non-null int64 dtypes: float64(10), int64(4), object(9) memory usage: 16.3+ MB
# Dtypes and non-null counts for the test set.
df.test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39933 entries, 0 to 39932 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 39933 non-null int64 1 loan_amount 39933 non-null int64 2 loan_term 39933 non-null object 3 interest_rate 39933 non-null float64 4 loan_grade 39933 non-null object 5 loan_subgrade 39933 non-null object 6 job_experience 37844 non-null object 7 home_ownership 39933 non-null object 8 annual_income 39933 non-null float64 9 income_verification_status 39933 non-null object 10 loan_purpose 39933 non-null object 11 state_code 39933 non-null object 12 debt_to_income 39933 non-null float64 13 delinquent_for_2y 39932 non-null float64 14 legal_cases 39932 non-null float64 15 revolving_balance 39933 non-null int64 16 credit_lines 39932 non-null float64 17 interest_received 39933 non-null float64 18 application_type 39933 non-null object 19 months_paid 39127 non-null float64 20 total_current_balance 36703 non-null float64 21 total_revolving_limit 36703 non-null float64 dtypes: float64(10), int64(3), object(9) memory usage: 6.7+ MB
# Dtypes and non-null counts for the combined frame.
df.all.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 133107 entries, 0 to 133106 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 133107 non-null int64 1 loan_amount 133107 non-null int64 2 loan_term 133107 non-null object 3 interest_rate 133107 non-null float64 4 loan_grade 133107 non-null object 5 loan_subgrade 133107 non-null object 6 job_experience 126316 non-null object 7 home_ownership 133107 non-null object 8 annual_income 133106 non-null float64 9 income_verification_status 133107 non-null object 10 loan_purpose 133107 non-null object 11 state_code 133107 non-null object 12 debt_to_income 133107 non-null float64 13 delinquent_for_2y 133104 non-null float64 14 legal_cases 133104 non-null float64 15 revolving_balance 133107 non-null int64 16 credit_lines 133104 non-null float64 17 interest_received 133107 non-null float64 18 application_type 133107 non-null object 19 months_paid 130377 non-null float64 20 total_current_balance 122491 non-null float64 21 total_revolving_limit 122491 non-null float64 dtypes: float64(10), int64(3), object(9) memory usage: 22.3+ MB
# Scratch copy of the training data without the ID column.
d=df.train.drop('ID', axis=1).copy()
d
loan_amount | loan_term | interest_rate | loan_grade | loan_subgrade | job_experience | home_ownership | annual_income | income_verification_status | loan_purpose | ... | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | application_type | months_paid | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9000 | 3 years | 9.17 | B | B2 | <5 Years | OWN | 85000.00 | Not Verified | debt_consolidation | ... | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | INDIVIDUAL | 4.00 | 95493.00 | 84100.00 | 0 |
1 | 18000 | 3 years | 13.65 | C | C1 | <5 Years | OWN | 64000.00 | Verified | debt_consolidation | ... | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | INDIVIDUAL | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 16000 | 3 years | 7.26 | A | A4 | <5 Years | MORTGAGE | 150000.00 | Source Verified | debt_consolidation | ... | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | INDIVIDUAL | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 25000 | 3 years | 13.99 | C | C4 | NaN | MORTGAGE | 59800.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | INDIVIDUAL | 17.00 | 183208.00 | 55400.00 | 0 |
4 | 17000 | 3 years | 6.39 | A | A2 | 10+ years | MORTGAGE | 72000.00 | Source Verified | credit_card | ... | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | INDIVIDUAL | 39.00 | 23990.00 | 81300.00 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 3200 | 3 years | 7.26 | A | A4 | <5 Years | RENT | 85000.00 | Not Verified | debt_consolidation | ... | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | INDIVIDUAL | 13.00 | 64635.00 | 47600.00 | 0 |
93170 | 3500 | 3 years | 5.42 | A | A1 | NaN | MORTGAGE | 57550.00 | Not Verified | other | ... | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | INDIVIDUAL | 161.00 | NaN | NaN | 1 |
93171 | 8000 | 3 years | 13.98 | C | C3 | 10+ years | RENT | 148531.50 | Source Verified | credit_card | ... | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | INDIVIDUAL | 65.00 | 94596.00 | 6500.00 | 0 |
93172 | 35000 | 3 years | 17.77 | D | D1 | <5 Years | RENT | 100000.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | INDIVIDUAL | 56.00 | 33759.00 | 34900.00 | 1 |
93173 | 10000 | 3 years | 15.80 | C | C3 | <5 Years | RENT | 60000.00 | Verified | debt_consolidation | ... | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | INDIVIDUAL | 104.00 | 25594.00 | 12300.00 | 0 |
93174 rows × 22 columns
# Distribution of loan terms, ascending by count.
d['loan_term'].value_counts().sort_values()
loan_term 5 years 27963 3 years 65211 Name: count, dtype: int64
del d
# Quick-look histograms (with KDE overlays) for every high-cardinality column;
# a richer per-column plotting pass follows later in the notebook.
dims = (11.7, 8.27)
dfi = df.all
for colname in dfi.columns:
    if colname in {'ID'}:
        continue
    if dfi[colname].nunique() >= 10:
        print(colname)
        fig, ax = plt.subplots(figsize=dims)
        sns.histplot(dfi[colname], kde=True, ax=ax)
        ax.set(title=colname)
        plt.show()
# Small value container for the distribution markers drawn on the KDE plots.
Statistics = collections.namedtuple('Statistics', ['mean', 'median', 'percentile_25', 'percentile_75'])
def stats(data):
    """Return mean/median/quartiles of *data* as a Statistics tuple.

    NaN behaviour is asymmetric by design: pandas mean()/median() skip NaNs,
    while np.percentile propagates them (callers guard with np.isnan).
    """
    lower_q, upper_q = np.percentile(data, [25, 75])
    return Statistics(data.mean(), data.median(), lower_q, upper_q)
# Class-balance pie chart for the target variable.
class_counts = df.train[dep_var].value_counts()

def _pct_label(value):
    # Slice label: percentage with two decimals.
    return f'{value:.2f}%'

plt.title(f'Initial ratio of {dep_var}')
plt.pie(class_counts,
        labels=class_counts.index,
        autopct=_pct_label,
        colors=['#ff6060', '#60ff60'],
        explode=[0.005, 0.005])
plt.show()
def mkjumptable(grp, labels):
    """Render an HTML jump table: an anchor target plus one link per label.

    *grp* names the index (embedded in the anchor ids); *labels* supplies the
    link texts.  Labels in {'ID'} are skipped.
    """
    parts = [f'<h2><a id="{grp}_index"></a>Go to:</h2>', '<ul>']
    parts.extend(
        f'<li><a href="#{colname}_{grp}">{colname}</a></li>'
        for colname in labels
        if colname not in {'ID'}
    )
    parts.append('</ul>')
    IPython.display.display(IPython.display.HTML('\n'.join(parts)))
def placelabel(grp, label):
    """Emit the anchor target for *label* under index *grp*, with a back-link."""
    markup = (
        f'<a id="{label}_{grp}" href="#{grp}_index" '
        'style="text-decoration:none">🠝</a>'
        f'<h2>{label}</h2>'
    )
    IPython.display.display(IPython.display.HTML(markup))
# Full EDA pass over df.all: a jump table, then one chart per column —
# bar chart for low-cardinality columns, KDE with mean/median/quartile
# markers for high-cardinality numerics.  Figures with NaNs get a red border.
dims = (11.7, 8.27)   # figure size in inches
thechange=50          # cardinality threshold: <= 50 distinct -> bar chart, else KDE
narep=0               # replacement for numeric NaNs in the bar-chart counts
dfi=df.all
mkjumptable('plot', dfi.columns)
for colname in dfi.columns:
    if colname in {'ID'}: continue
    placelabel('plot', colname)
    col=dfi[colname]
    if col.nunique()<=thechange:
        if col.dtype.type == np.object_:
            print("Categorical")
            c=col.fillna('Unknown').astype('category').value_counts()#.sort_values()
        else:
            c=col.fillna(narep).value_counts()#.sort_values(ascending=True)
        print(c)
        fig, ax = plt.subplots(figsize=dims)
        # ax.set_title(colname)
        sns.barplot(y=c, x=c.index, ax=ax)
    else:
        if col.nunique()>thechange:
            if col.dtype.type == np.object_:
                print("XXXXXX")   # high-cardinality text column: nothing sensible to plot
                continue
        print('LargeNumeric')
        # NOTE(review): `.all()` replaces NaNs only when the WHOLE column is
        # NaN; otherwise np.percentile propagates NaN (guarded by the isnan
        # checks below).  Possibly `.any()` was intended — confirm.
        c=col.fillna(narep) if col.isna().all() else col
        s=stats(c)
        fig, ax = plt.subplots(figsize=dims)
        # ax.set_title(colname)
        sns.kdeplot(x=col, ax=ax)
        plt.axvline(s.mean, color='g', linestyle='--', label=f'Mean: {s.mean:.2f}')
        if not np.isnan(s.percentile_25):
            plt.axvline(s.percentile_25, color='r', linestyle=':', label=f'25th Percentile: {s.percentile_25:.2f}')
        plt.axvline(s.median, color='r', linestyle='-', label=f'Median: {s.median:.2f}')
        if not np.isnan(s.percentile_75):
            plt.axvline(s.percentile_75, color='r', linestyle=':', label=f'75th Percentile: {s.percentile_75:.2f}')
        plt.legend()
    if col.isna().any():
        fig.patch.set_edgecolor('red') # Put a red box around any which have NaNs
        fig.patch.set_linewidth(2)
        print("NaN:", col.isna().sum())
    plt.show()
LargeNumeric
Categorical loan_term 3 years 93321 5 years 39786 Name: count, dtype: int64
LargeNumeric
Categorical loan_grade B 38416 C 36735 A 22298 D 20952 E 10536 F 3374 G 796 Name: count, dtype: int64
Categorical loan_subgrade B3 8426 B4 8398 C1 7879 C2 7850 C3 7490 B2 7375 B5 7342 C4 7266 B1 6875 A5 6792 C5 6250 D1 5292 A4 5190 D2 4612 D3 3934 D4 3858 A3 3524 A1 3444 A2 3348 D5 3256 E1 2727 E2 2491 E3 2165 E4 1747 E5 1406 F1 1057 F2 799 F3 660 F4 502 F5 356 G1 245 G2 218 G3 151 G5 97 G4 85 Name: count, dtype: int64
Categorical job_experience <5 Years 57900 10+ years 43508 6-10 years 24908 Unknown 6791 Name: count, dtype: int64 NaN: 6791
Categorical home_ownership MORTGAGE 66453 RENT 53387 OWN 13233 OTHER 24 NONE 10 Name: count, dtype: int64
LargeNumeric NaN: 1
Categorical income_verification_status Source Verified 49267 Verified 43645 Not Verified 40195 Name: count, dtype: int64
Categorical loan_purpose debt_consolidation 78714 credit_card 30954 other 15664 home_improvement 7775 Name: count, dtype: int64
Categorical state_code CA 19675 NY 11191 TX 10630 FL 9074 IL 5252 NJ 4977 PA 4712 OH 4476 GA 4349 VA 3987 NC 3732 MI 3416 MD 3214 MA 3177 AZ 3032 WA 2946 CO 2793 MN 2346 MO 2135 IN 1997 CT 1989 TN 1922 NV 1870 AL 1713 WI 1680 SC 1602 OR 1581 LA 1570 KY 1347 OK 1216 KS 1146 AR 1009 UT 938 NM 721 WV 673 HI 660 NH 659 RI 568 MS 547 DC 370 DE 369 MT 366 AK 331 WY 294 SD 263 VT 262 NE 184 ME 79 ND 65 ID 2 Name: count, dtype: int64
LargeNumeric
delinquent_for_2y 0.00 107638 1.00 16846 2.00 5019 3.00 1869 4.00 786 5.00 396 6.00 213 7.00 113 8.00 78 9.00 43 10.00 28 12.00 24 11.00 17 13.00 12 14.00 11 17.00 3 15.00 3 18.00 3 16.00 2 19.00 1 21.00 1 22.00 1 Name: count, dtype: int64 NaN: 3
legal_cases 0.00 112785 1.00 16997 2.00 2164 3.00 665 4.00 244 5.00 112 6.00 73 7.00 26 8.00 15 9.00 9 10.00 7 11.00 4 12.00 2 21.00 1 23.00 1 13.00 1 49.00 1 Name: count, dtype: int64 NaN: 3
LargeNumeric
LargeNumeric NaN: 3
LargeNumeric
Categorical application_type INDIVIDUAL 133027 JOINT 80 Name: count, dtype: int64
LargeNumeric NaN: 2730
LargeNumeric NaN: 10616
LargeNumeric NaN: 10616
Location = collections.namedtuple('Location', ['longitude', 'latitude'])
# Geographic centroids of the US states plus DC, keyed by postal abbreviation.
# BUG FIX: the original dict paired abbreviations sorted alphabetically with
# centroid values sorted by FULL state name (and omitted IA and WY entirely),
# so almost every key carried another state's coordinates — e.g. 'AK' held
# Alabama's centroid, 'KS' held Iowa's, 'WY' held Wisconsin's, DC/DE were
# swapped.  Values are re-paired correctly below; Iowa is restored from the
# orphaned value and Wyoming's centroid is supplied.
state_centroids = {'AK': Location(longitude=-152.2782, latitude=64.0685),
 'AL': Location(longitude=-86.8287, latitude=32.7794),
 'AR': Location(longitude=-92.4426, latitude=34.8938),
 'AZ': Location(longitude=-111.6602, latitude=34.2744),
 'CA': Location(longitude=-119.4696, latitude=37.1841),
 'CO': Location(longitude=-105.5478, latitude=38.9972),
 'CT': Location(longitude=-72.7273, latitude=41.6219),
 'DC': Location(longitude=-77.0147, latitude=38.9101),
 'DE': Location(longitude=-75.505, latitude=38.9896),
 'FL': Location(longitude=-82.4497, latitude=28.6305),
 'GA': Location(longitude=-83.4426, latitude=32.6415),
 'HI': Location(longitude=-156.3737, latitude=20.2927),
 'IA': Location(longitude=-93.496, latitude=42.0751),
 'ID': Location(longitude=-114.613, latitude=44.3509),
 'IL': Location(longitude=-89.1965, latitude=40.0417),
 'IN': Location(longitude=-86.2816, latitude=39.8942),
 'KS': Location(longitude=-98.3804, latitude=38.4937),
 'KY': Location(longitude=-85.3021, latitude=37.5347),
 'LA': Location(longitude=-91.9968, latitude=31.0689),
 'MA': Location(longitude=-71.8083, latitude=42.2596),
 'MD': Location(longitude=-76.7909, latitude=39.055),
 'ME': Location(longitude=-69.2428, latitude=45.3695),
 'MI': Location(longitude=-85.4102, latitude=44.3467),
 'MN': Location(longitude=-94.3053, latitude=46.2807),
 'MO': Location(longitude=-92.458, latitude=38.3566),
 'MS': Location(longitude=-89.6678, latitude=32.7364),
 'MT': Location(longitude=-109.6333, latitude=47.0527),
 'NC': Location(longitude=-79.3877, latitude=35.5557),
 'ND': Location(longitude=-100.4659, latitude=47.4501),
 'NE': Location(longitude=-99.6809, latitude=41.5),
 'NH': Location(longitude=-71.5811, latitude=43.6805),
 'NJ': Location(longitude=-74.6728, latitude=40.1907),
 'NM': Location(longitude=-106.1126, latitude=34.4071),
 'NV': Location(longitude=-116.6312, latitude=39.3289),
 'NY': Location(longitude=-75.5268, latitude=42.9538),
 'OH': Location(longitude=-82.7937, latitude=40.2862),
 'OK': Location(longitude=-97.4943, latitude=35.5889),
 'OR': Location(longitude=-120.5583, latitude=43.9336),
 'PA': Location(longitude=-77.7996, latitude=40.8781),
 'RI': Location(longitude=-71.5562, latitude=41.6762),
 'SC': Location(longitude=-80.8964, latitude=33.9169),
 'SD': Location(longitude=-100.2263, latitude=44.4443),
 'TN': Location(longitude=-86.3505, latitude=35.858),
 'TX': Location(longitude=-99.3312, latitude=31.4757),
 'UT': Location(longitude=-111.6703, latitude=39.3055),
 'VA': Location(longitude=-78.8537, latitude=37.5215),
 'VT': Location(longitude=-72.6658, latitude=44.0687),
 'WA': Location(longitude=-120.4472, latitude=47.3826),
 'WI': Location(longitude=-89.9941, latitude=44.6243),
 'WV': Location(longitude=-80.6227, latitude=38.6409),
 'WY': Location(longitude=-107.5512, latitude=42.9957)}
# Display the centroid lookup table.
state_centroids
{'AK': Location(longitude=-86.8287, latitude=32.7794), 'AL': Location(longitude=-152.2782, latitude=64.0685), 'AR': Location(longitude=-111.6602, latitude=34.2744), 'AZ': Location(longitude=-92.4426, latitude=34.8938), 'CA': Location(longitude=-119.4696, latitude=37.1841), 'CO': Location(longitude=-105.5478, latitude=38.9972), 'CT': Location(longitude=-72.7273, latitude=41.6219), 'DC': Location(longitude=-75.505, latitude=38.9896), 'DE': Location(longitude=-77.0147, latitude=38.9101), 'FL': Location(longitude=-82.4497, latitude=28.6305), 'GA': Location(longitude=-83.4426, latitude=32.6415), 'HI': Location(longitude=-156.3737, latitude=20.2927), 'ID': Location(longitude=-114.613, latitude=44.3509), 'IL': Location(longitude=-89.1965, latitude=40.0417), 'IN': Location(longitude=-86.2816, latitude=39.8942), 'KS': Location(longitude=-93.496, latitude=42.0751), 'KY': Location(longitude=-98.3804, latitude=38.4937), 'LA': Location(longitude=-85.3021, latitude=37.5347), 'MA': Location(longitude=-91.9968, latitude=31.0689), 'MD': Location(longitude=-69.2428, latitude=45.3695), 'ME': Location(longitude=-76.7909, latitude=39.055), 'MI': Location(longitude=-71.8083, latitude=42.2596), 'MN': Location(longitude=-85.4102, latitude=44.3467), 'MO': Location(longitude=-94.3053, latitude=46.2807), 'MS': Location(longitude=-89.6678, latitude=32.7364), 'MT': Location(longitude=-92.458, latitude=38.3566), 'NC': Location(longitude=-109.6333, latitude=47.0527), 'ND': Location(longitude=-99.6809, latitude=41.5), 'NE': Location(longitude=-116.6312, latitude=39.3289), 'NH': Location(longitude=-71.5811, latitude=43.6805), 'NJ': Location(longitude=-74.6728, latitude=40.1907), 'NM': Location(longitude=-106.1126, latitude=34.4071), 'NV': Location(longitude=-75.5268, latitude=42.9538), 'NY': Location(longitude=-79.3877, latitude=35.5557), 'OH': Location(longitude=-100.4659, latitude=47.4501), 'OK': Location(longitude=-82.7937, latitude=40.2862), 'OR': Location(longitude=-97.4943, 
latitude=35.5889), 'PA': Location(longitude=-120.5583, latitude=43.9336), 'RI': Location(longitude=-77.7996, latitude=40.8781), 'SC': Location(longitude=-71.5562, latitude=41.6762), 'SD': Location(longitude=-80.8964, latitude=33.9169), 'TN': Location(longitude=-100.2263, latitude=44.4443), 'TX': Location(longitude=-86.3505, latitude=35.858), 'UT': Location(longitude=-99.3312, latitude=31.4757), 'VA': Location(longitude=-111.6703, latitude=39.3055), 'VT': Location(longitude=-72.6658, latitude=44.0687), 'WA': Location(longitude=-78.8537, latitude=37.5215), 'WI': Location(longitude=-120.4472, latitude=47.3826), 'WV': Location(longitude=-80.6227, latitude=38.6409), 'WY': Location(longitude=-89.9941, latitude=44.6243)}
# Numeric summary statistics for the training set.
df.train.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ID | 93174.00 | 35050211.39 | 24149262.07 | 70735.00 | 10859832.50 | 37107507.00 | 58598949.50 | 73519746.00 |
loan_amount | 93174.00 | 14733.86 | 8428.18 | 500.00 | 8000.00 | 13000.00 | 20000.00 | 35000.00 |
interest_rate | 93174.00 | 13.23 | 4.37 | 5.32 | 9.99 | 12.99 | 16.20 | 28.99 |
annual_income | 93173.00 | 75028.26 | 69454.78 | 1200.00 | 45000.00 | 64000.00 | 90000.00 | 9500000.00 |
debt_to_income | 93174.00 | 18.13 | 8.56 | 0.00 | 11.93 | 17.64 | 23.89 | 672.52 |
delinquent_for_2y | 93172.00 | 0.32 | 0.88 | 0.00 | 0.00 | 0.00 | 0.00 | 22.00 |
legal_cases | 93172.00 | 0.20 | 0.58 | 0.00 | 0.00 | 0.00 | 0.00 | 49.00 |
revolving_balance | 93174.00 | 16854.47 | 23689.07 | 0.00 | 6433.00 | 11856.00 | 20745.00 | 2560703.00 |
credit_lines | 93172.00 | 25.25 | 11.86 | 1.00 | 17.00 | 24.00 | 32.00 | 119.00 |
interest_received | 93174.00 | 1747.26 | 2088.24 | 0.00 | 439.88 | 1070.76 | 2219.61 | 23172.31 |
months_paid | 91250.00 | 58.15 | 44.33 | 0.00 | 22.00 | 48.00 | 83.00 | 291.00 |
total_current_balance | 85788.00 | 139252.92 | 157686.79 | 0.00 | 29642.00 | 79363.50 | 207160.00 | 8000078.00 |
total_revolving_limit | 85788.00 | 32085.90 | 47052.51 | 0.00 | 14000.00 | 23700.00 | 39700.00 | 9999999.00 |
default | 93174.00 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
# Numeric summary statistics for the test set.
df.test.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
ID | 39933.00 | 34811489.37 | 24217502.30 | 74301.00 | 10751721.00 | 36805086.00 | 58459985.00 | 73519693.00 |
loan_amount | 39933.00 | 14781.99 | 8427.85 | 500.00 | 8000.00 | 13000.00 | 20000.00 | 35000.00 |
interest_rate | 39933.00 | 13.22 | 4.39 | 5.32 | 9.99 | 12.99 | 16.20 | 28.99 |
annual_income | 39933.00 | 75264.80 | 56556.66 | 3300.00 | 45262.00 | 65000.00 | 90000.00 | 5000000.00 |
debt_to_income | 39933.00 | 18.12 | 8.49 | 0.00 | 11.96 | 17.61 | 23.88 | 380.53 |
delinquent_for_2y | 39932.00 | 0.31 | 0.85 | 0.00 | 0.00 | 0.00 | 0.00 | 17.00 |
legal_cases | 39932.00 | 0.19 | 0.56 | 0.00 | 0.00 | 0.00 | 0.00 | 23.00 |
revolving_balance | 39933.00 | 17019.28 | 26129.74 | 0.00 | 6482.00 | 11949.00 | 20928.00 | 2568995.00 |
credit_lines | 39932.00 | 25.26 | 11.77 | 1.00 | 17.00 | 24.00 | 32.00 | 114.00 |
interest_received | 39933.00 | 1764.74 | 2095.44 | 0.00 | 441.47 | 1077.72 | 2260.99 | 21811.29 |
months_paid | 39127.00 | 58.42 | 44.48 | 0.00 | 22.00 | 48.00 | 83.00 | 278.00 |
total_current_balance | 36703.00 | 140462.51 | 156575.93 | 0.00 | 29807.00 | 80590.00 | 209590.50 | 3881449.00 |
total_revolving_limit | 36703.00 | 32453.21 | 61835.45 | 0.00 | 14000.00 | 23900.00 | 40100.00 | 9999999.00 |
# Pairwise correlations of the numeric training columns (ID dropped).
corr=df.train.drop(['ID'],axis=1).select_dtypes(exclude=['object']).corr()
correlation_limit=0.75
too_correlated=list()
seen=set()
# Visit each unordered pair exactly once: (y,x) is recorded before (x,y) is
# tested, so the mirror pair and the diagonal are skipped automatically.
for x in corr.columns:
    for y in corr.index:
        seen.add((y,x))
        if (x,y) not in seen:
            if abs(corr[x][y])>correlation_limit:
                too_correlated.append(y)
del seen
plt.figure(figsize=(10, 8)) # You can adjust the size as needed
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
# Show the plot
plt.show()
# Columns whose |correlation| with some other feature exceeds the limit.
for c in too_correlated:
    print(c)
total_revolving_limit
# Bivariate views: categorical variables (v1s) against numeric variables
# (v2s), split by the target (v3).
v1s=['loan_term', 'loan_purpose', 'income_verification_status', 'job_experience']
v2s=['loan_amount', 'interest_rate']
v3='default'
def mk_colour_patches(n):
    """Yield 2*n hex colour strings ('#rrggbb').

    Hues are sampled at n points over [0, 1.5) of the HSV wheel; each hue is
    emitted twice, once strongly saturated (2/3) and once pale (1/3), always
    at value 0.5.
    """
    value = 0.5
    for hue in np.linspace(0, 1, n, endpoint=False) * 1.5:
        for saturation in (2/3, 1/3):
            channels = np.array(colorsys.hsv_to_rgb(hue, saturation, value)) * 255
            channels = channels.round().astype(dtype=np.uint8)
            yield '#' + ''.join(f'{channel:02x}' for channel in channels)
# Separator used in the jump-table labels for each (categorical, numeric) pair.
sep=' 🙼 '
# mkjumptable/placelabel are notebook helpers defined in an earlier cell.
mkjumptable('bivariant', [f'{v1}{sep}{v2}' for v1 in v1s for v2 in v2s ])
colour_patches=list(mk_colour_patches(len(v1s)*len(v2s)))
for v1 in v1s:
    for v2 in v2s:
        # pop() takes patches from the END of the list, so colours are consumed
        # in reverse generation order (presumably intentional styling choice).
        cp1,cp2=colour_patches.pop(), colour_patches.pop()
        placelabel('bivariant', f"{v1}{sep}{v2}", )
        plt.xticks(rotation=45, horizontalalignment='right')
        # Split violins: one half per target class (hue=v3), one colour each.
        ax=sns.violinplot(data=df.train, x=v1, y=v2, hue=v3, split=True,
                          palette=[ cp1,cp2 ], cut=1.5, bw=0.4) #, order=df.train['annual_income'].value_counts().index
        plt.show()
# Numeric-only view of the training data, used for outlier-trimmed plots.
df.pltdata = df.train.drop(['ID'], axis=1).select_dtypes(include=['number'])
# Per-column boolean masks keeping rows BELOW the upper Tukey fence
# (Q3 + 1.5*IQR).  NOTE(review): only the upper fence is applied, so low-side
# outliers are kept; NaN comparisons are False, so NaN rows are masked out too
# — confirm both are intended.
df_tmp = pd.DataFrame()
for col in df.pltdata.columns:
    firstQ, thirdQ = df.pltdata[col].quantile([.25, .75])
    iqr = thirdQ - firstQ
    sele = df.pltdata[col] < 1.5 * iqr + thirdQ
    if not sele.any():
        # Degenerate columns (e.g. the 0/1 target where Q3 == 0) would mask
        # out every row; skip them instead of wiping the whole frame.
        continue
    df_tmp[col] = sele
    # (A stray no-op statement `df_tmp.all(axis=1)` was removed from this loop.)
del firstQ, thirdQ, iqr, sele  # ,df_tmp
# Cap legal_cases at 8 for plotting.  np.where (rather than clip) means NaN
# values become 8 as well, since NaN < 8 is False.
df.pltdata['legal_cases'] = np.where(df.train['legal_cases'] < 8, df.train['legal_cases'], 8)
df.pltdata[df_tmp.all(axis=1)]
loan_amount | interest_rate | annual_income | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | total_current_balance | total_revolving_limit | default | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 18000 | 13.65 | 64000.00 | 31.67 | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | 95.00 | 185433.00 | 13500.00 | 0 |
2 | 16000 | 7.26 | 150000.00 | 19.70 | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | 13.00 | 180519.00 | 19300.00 | 0 |
3 | 25000 | 13.99 | 59800.00 | 37.39 | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | 17.00 | 183208.00 | 55400.00 | 0 |
5 | 10000 | 12.69 | 56000.00 | 16.16 | 0.00 | 1.00 | 6643 | 48.00 | 590.50 | 26.00 | 24054.00 | 25167.00 | 0 |
6 | 15000 | 11.14 | 100000.00 | 10.51 | 0.00 | 1.00 | 14088 | 14.00 | 1020.57 | 35.00 | 14519.00 | 15200.00 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93163 | 19000 | 5.32 | 120000.00 | 16.25 | 0.00 | 0.00 | 8712 | 18.00 | 467.14 | 26.00 | 220697.00 | 57200.00 | 0 |
93167 | 7800 | 12.29 | 21000.00 | 12.97 | 0.00 | 0.00 | 8197 | 8.00 | 479.81 | 31.00 | 93130.00 | 17200.00 | 0 |
93169 | 3200 | 7.26 | 85000.00 | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | 13.00 | 64635.00 | 47600.00 | 0 |
93171 | 8000 | 13.98 | 148531.50 | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | 65.00 | 94596.00 | 6500.00 | 0 |
93173 | 10000 | 15.80 | 60000.00 | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | 104.00 | 25594.00 | 12300.00 | 0 |
67390 rows × 13 columns
# The pairplot is slow; only draw it when the notebook defines a truthy
# `printslow` flag.  (At module level vars() and globals() are the same dict.)
if globals().get('printslow'):
    if 'warnings' in globals():
        warnings.filterwarnings('ignore', category=UserWarning, module='seaborn.axisgrid')
    sns.pairplot(data=df.pltdata[df_tmp.all(axis=1)], diag_kind="kde")
    plt.show()
# Rebuild the feature matrix X and target y from scratch on re-runs.
if 'X' in df.__dict__:
    del df.X
if 'y' in df.__dict__:
    del df.y
df.y=df.train[dep_var]
# Categorical columns one-hot encoded in a later cell.
dummyize=[
    'job_experience',
    'loan_grade',
    'loan_term',
    'income_verification_status',
    'loan_purpose',
    'application_type',
    'home_ownership',
]
drops=[
    dep_var, # this is going into y
    'ID', # id is unique and probably adds no value
    'state_code', # will be converted to lat/long
    'loan_subgrade', # will be converted to number (integer)
]
# Numeric features (NaN -> 0) plus the state-centroid coordinates.
df.X=pd.concat(
    (
        df.train.drop(drops+too_correlated+dummyize, axis=1).fillna(0),
        pd.DataFrame(df.train['state_code'].apply(lambda x: state_centroids[x]).tolist())
    ), axis=1)
# BUG FIX: the two assignments below were fused onto one physical line (a
# syntax error in the export); split into separate statements.
# NOTE(review): they appear to rewrite the centroid columns created by the
# concat above with the same values — confirm whether both steps are needed.
df.X['longitude']=df.train['state_code'].apply(lambda x: state_centroids[x].longitude)
df.X['latitude']=df.train['state_code'].apply(lambda x: state_centroids[x].latitude)
# Indicator columns (<col>_nan) recording which cells were NaN before fillna(0).
nans=df.train.loc[:,df.train.isna().any()].isna().astype(np.int8)
nans.columns+='_nan'
# Sanity check: fillna(0) above must have removed every NaN from X.
assert not checkfornan(df.X),f"NaN values found in columns: {checkfornan(df.X)}"
df.X = pd.concat([df.X, nans], axis=1)
df.X
loan_amount | interest_rate | annual_income | debt_to_income | delinquent_for_2y | legal_cases | revolving_balance | credit_lines | interest_received | months_paid | ... | longitude | latitude | job_experience_nan | annual_income_nan | delinquent_for_2y_nan | legal_cases_nan | credit_lines_nan | months_paid_nan | total_current_balance_nan | total_revolving_limit_nan | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9000 | 9.17 | 85000.00 | 26.68 | 0.00 | 0.00 | 39519 | 20.00 | 59.60 | 4.00 | ... | -109.63 | 47.05 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 18000 | 13.65 | 64000.00 | 31.67 | 0.00 | 1.00 | 9783 | 24.00 | 3348.25 | 95.00 | ... | -71.56 | 41.68 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 16000 | 7.26 | 150000.00 | 19.70 | 2.00 | 0.00 | 13641 | 27.00 | 276.69 | 13.00 | ... | -86.35 | 35.86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 25000 | 13.99 | 59800.00 | 37.39 | 0.00 | 0.00 | 35020 | 35.00 | 1106.72 | 17.00 | ... | -94.31 | 46.28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 17000 | 6.39 | 72000.00 | 8.92 | 0.00 | 0.00 | 23990 | 26.00 | 725.29 | 39.00 | ... | -86.35 | 35.86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93169 | 3200 | 7.26 | 85000.00 | 17.11 | 0.00 | 0.00 | 7924 | 38.00 | 55.34 | 13.00 | ... | -82.45 | 28.63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93170 | 3500 | 5.42 | 57550.00 | 22.64 | 0.00 | 0.00 | 10174 | 24.00 | 299.67 | 161.00 | ... | -82.45 | 28.63 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
93171 | 8000 | 13.98 | 148531.50 | 13.04 | 1.00 | 0.00 | 5391 | 25.00 | 1150.58 | 65.00 | ... | -86.83 | 32.78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93172 | 35000 | 17.77 | 100000.00 | 17.22 | 0.00 | 0.00 | 24609 | 45.00 | 5764.58 | 56.00 | ... | -119.47 | 37.18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93173 | 10000 | 15.80 | 60000.00 | 11.83 | 0.00 | 0.00 | 11285 | 7.00 | 2279.36 | 104.00 | ... | -74.67 | 40.19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93174 rows × 21 columns
# Manual one-hot encoding of the categorical columns.  # Or use pd.get_dummies
for colname in dummyize:
    all_colvals=df.train[colname].unique()
    # Skipping the first unique value mirrors get_dummies' drop_first.
    # NOTE(review): unique() can include NaN, and `== NaN` is always False,
    # producing an all-zero dummy column for it — confirm intended.
    for colval in all_colvals[1:]: # the [1:] is like drop_first param of get_dummies
        df.X[f'{colname}_{colval}']=(df.train[colname]==colval).astype(np.int8)
df.X.columns
Index(['loan_amount', 'interest_rate', 'annual_income', 'debt_to_income', 'delinquent_for_2y', 'legal_cases', 'revolving_balance', 'credit_lines', 'interest_received', 'months_paid', 'total_current_balance', 'longitude', 'latitude', 'job_experience_nan', 'annual_income_nan', 'delinquent_for_2y_nan', 'legal_cases_nan', 'credit_lines_nan', 'months_paid_nan', 'total_current_balance_nan', 'total_revolving_limit_nan', 'job_experience_10+ years', 'job_experience_6-10 years', 'loan_grade_C', 'loan_grade_A', 'loan_grade_E', 'loan_grade_D', 'loan_grade_F', 'loan_grade_G', 'loan_term_5 years', 'income_verification_status_Verified', 'income_verification_status_Source Verified', 'loan_purpose_credit_card', 'loan_purpose_other', 'loan_purpose_home_improvement', 'application_type_JOINT', 'home_ownership_MORTGAGE', 'home_ownership_RENT', 'home_ownership_NONE', 'home_ownership_OTHER'], dtype='object')
def _subgrade_to_score(subgrade):
    """Map a loan subgrade like 'B3' to an integer: 5 * grade-letter-index + (digit - 1)."""
    return (ord(subgrade[0]) - ord('A')) * 5 + ord(subgrade[1]) - ord('1')
# Ordinal-encode loan_subgrade ('A1'..'G5' -> 0..34).
df.X['loan_score'] = df.train['loan_subgrade'].apply(_subgrade_to_score)
df.X['loan_score']
0 6 1 10 2 3 3 13 4 1 .. 93169 3 93170 0 93171 12 93172 15 93173 12 Name: loan_score, Length: 93174, dtype: int64
dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=dims)
# One unit-width bin per integer score.  BUG FIX: the bin edges previously ran
# range(min+1, max), which silently excluded the lowest and highest scores from
# the histogram; edges must span min .. max+1 inclusive to cover every value.
sns.histplot(df.X, x='loan_score', ax=ax,
             bins=range(df.X['loan_score'].min(),
                        df.X['loan_score'].max() + 2))
<Axes: xlabel='loan_score', ylabel='Count'>
df.X.info()  # dtype/memory overview of the engineered feature matrix
<class 'pandas.core.frame.DataFrame'> RangeIndex: 93174 entries, 0 to 93173 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 loan_amount 93174 non-null int64 1 interest_rate 93174 non-null float64 2 annual_income 93174 non-null float64 3 debt_to_income 93174 non-null float64 4 delinquent_for_2y 93174 non-null float64 5 legal_cases 93174 non-null float64 6 revolving_balance 93174 non-null int64 7 credit_lines 93174 non-null float64 8 interest_received 93174 non-null float64 9 months_paid 93174 non-null float64 10 total_current_balance 93174 non-null float64 11 longitude 93174 non-null float64 12 latitude 93174 non-null float64 13 job_experience_nan 93174 non-null int8 14 annual_income_nan 93174 non-null int8 15 delinquent_for_2y_nan 93174 non-null int8 16 legal_cases_nan 93174 non-null int8 17 credit_lines_nan 93174 non-null int8 18 months_paid_nan 93174 non-null int8 19 total_current_balance_nan 93174 non-null int8 20 total_revolving_limit_nan 93174 non-null int8 21 job_experience_10+ years 93174 non-null int8 22 job_experience_6-10 years 93174 non-null int8 23 loan_grade_C 93174 non-null int8 24 loan_grade_A 93174 non-null int8 25 loan_grade_E 93174 non-null int8 26 loan_grade_D 93174 non-null int8 27 loan_grade_F 93174 non-null int8 28 loan_grade_G 93174 non-null int8 29 loan_term_5 years 93174 non-null int8 30 income_verification_status_Verified 93174 non-null int8 31 income_verification_status_Source Verified 93174 non-null int8 32 loan_purpose_credit_card 93174 non-null int8 33 loan_purpose_other 93174 non-null int8 34 loan_purpose_home_improvement 93174 non-null int8 35 application_type_JOINT 93174 non-null int8 36 home_ownership_MORTGAGE 93174 non-null int8 37 home_ownership_RENT 93174 non-null int8 38 home_ownership_NONE 93174 non-null int8 39 home_ownership_OTHER 93174 non-null int8 40 loan_score 93174 non-null int64 dtypes: float64(11), int64(3), int8(27) memory usage: 12.4 MB
# Preview swatch: eight HSV-derived colours (4 hues x 2 saturations) rendered
# as inline HTML squares.
colour_patches = (
    '#' + ''.join(f'{value:02x}'
                  for value in (np.array(colorsys.hsv_to_rgb(hue, sat, 0.5)) * 255)
                                .round().astype(dtype=np.uint8))
    for hue in [0.0, .25, .5, .75]
    for sat in [2/3, 1/3]
)
IPython.display.HTML('\n'.join(
    [f'<div style="width:25px; height:25px; background-color:{colour_patch};display:inline-block"></div>'
     for colour_patch in colour_patches]))
def markstartof(my_marker):
    """Render an HTML anchor plus an <h2> heading so jump-table links can target it."""
    anchor = f'<a id="{my_marker}"></a><h2>{my_marker}</h2>'
    IPython.display.display(IPython.display.HTML(anchor))
# 60/40 train/validation split of the engineered features.
df.X_train, df.X_test, df.y_train, df.y_test = sklearn.model_selection.train_test_split(df.X, df.y, test_size=0.4, random_state=1)
models=dict()  # fitted estimators keyed by model name
scores=dict()  # metric records keyed by model name
# Record type bundling the five evaluation metrics tracked for every model.
score = collections.namedtuple(
    'score',
    'accuracy recall precision f1 roc_auc',
)
def mkscore(pred, y):
    """Build a `score` record for predictions *pred* against truth *y*.

    Each field name of the `score` namedtuple maps to the sklearn metric of the
    same name suffixed with '_score' (accuracy_score, recall_score, ...), each
    called as metric(y_true, y_pred).
    """
    values = []
    for field in score._fields:
        metric_fn = getattr(sklearn.metrics, f'{field}_{score.__name__}')
        values.append(metric_fn(y, pred))
    return score(*values)
def plot_confusion(y, ŷ):
    """Plot a 2x2 confusion-matrix heatmap annotated with counts and percentages."""
    cm = sklearn.metrics.confusion_matrix(y, ŷ)
    flat = cm.flatten()
    total = flat.sum()
    # Each cell shows "count\npercent-of-all-samples".
    cell_texts = []
    for item in flat:
        cell_texts.append(["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)])
    labels = np.asarray(cell_texts).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
def tree_graph(model, feature_names, fix_arrows=False):
    """Draw a fitted decision tree.

    When fix_arrows is True, the edge arrows are recoloured black and the
    figure is shown; otherwise the function returns right after plotting.
    """
    plt.figure(figsize=(20, 60))
    annotations = sklearn.tree.plot_tree(
        model,
        feature_names=feature_names,
        filled=True,
        fontsize=9,
        node_ids=True,
        class_names=True,
    )
    if not fix_arrows:
        return
    for annotation in annotations:
        arrow = annotation.arrow_patch
        if arrow is None:
            continue
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
    plt.show()
feature_names = list(df.X_test)  # column names of the feature matrix (iterating a DataFrame yields columns)
print(feature_names)
['loan_amount', 'interest_rate', 'annual_income', 'debt_to_income', 'delinquent_for_2y', 'legal_cases', 'revolving_balance', 'credit_lines', 'interest_received', 'months_paid', 'total_current_balance', 'longitude', 'latitude', 'job_experience_nan', 'annual_income_nan', 'delinquent_for_2y_nan', 'legal_cases_nan', 'credit_lines_nan', 'months_paid_nan', 'total_current_balance_nan', 'total_revolving_limit_nan', 'job_experience_10+ years', 'job_experience_6-10 years', 'loan_grade_C', 'loan_grade_A', 'loan_grade_E', 'loan_grade_D', 'loan_grade_F', 'loan_grade_G', 'loan_term_5 years', 'income_verification_status_Verified', 'income_verification_status_Source Verified', 'loan_purpose_credit_card', 'loan_purpose_other', 'loan_purpose_home_improvement', 'application_type_JOINT', 'home_ownership_MORTGAGE', 'home_ownership_RENT', 'home_ownership_NONE', 'home_ownership_OTHER', 'loan_score']
# Class balance of the target before any resampling.
counts = df.train[dep_var].value_counts()
plt.title(f'Initial ratio of {dep_var}')
plt.pie(
    counts,
    labels=counts.index,
    autopct=lambda pct: f'{pct:.2f}%',
    colors=['#ff6060', '#60ff60'],
    explode=[0.005, .005],
)
plt.show()
# Fresh result containers.  NOTE(review): this re-creates the dicts initialised
# a few cells earlier, discarding anything stored since.
scores = dict()
ŷs = dict()
%%time
# Fit six baseline classifiers with default hyper-parameters on the raw
# (imbalanced) training split and record their test-set metrics.
random_state=1
input_models = [
    sklearn.tree.DecisionTreeClassifier(random_state=random_state),
    sklearn.ensemble.BaggingClassifier(random_state=random_state),
    sklearn.ensemble.RandomForestClassifier(random_state=random_state),
    sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
    sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
    xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss")
]
# mkjumptable/placelabel are notebook navigation helpers from an earlier cell.
mkjumptable('model', [type(model).__name__ for model in input_models]) # add patches
for model in input_models:
    mname=type(model).__name__
    placelabel('model', mname)
    model.fit(df.X_train, df.y_train)
    IPython.display.display(model)
    ŷs[mname] = model.predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
    # if 'printslow' in vars() and printslow:
    #     tree_graph(model, feature_names)
    # Can't do this on all types; do we want to do this (or something else) as appropriate?
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
score(accuracy=0.7823718808693318, recall=0.5667804323094425, precision=0.5365643511039311, f1=0.5512586445366529, roc_auc=0.7078459745816876)
BaggingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(random_state=1)
score(accuracy=0.8517842768983096, recall=0.4987485779294653, precision=0.7968011632133769, f1=0.6134900643716764, roc_auc=0.7297464799759686)
RandomForestClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=1)
score(accuracy=0.8275288435739201, recall=0.32992036405005687, precision=0.8435136707388017, f1=0.474321229964017, roc_auc=0.6555149573059272)
GradientBoostingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(random_state=1)
score(accuracy=0.8309095787496646, recall=0.3502844141069397, precision=0.8389645776566758, f1=0.4942215088282504, roc_auc=0.6647665048062789)
AdaBoostClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(random_state=1)
score(accuracy=0.8198819425811644, recall=0.3078498293515358, precision=0.8113943028485757, f1=0.4463505154639176, roc_auc=0.6428820775971164)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8752079420445399, recall=0.6022753128555176, precision=0.8209024654985269, f1=0.6947962464728656, roc_auc=0.7808602687873093)
CPU times: total: 1min 11s Wall time: 44.5 s
pd.DataFrame.from_dict(scores, orient='index')  # one row of metrics per fitted model
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
# Balance the training split by randomly undersampling the majority class.
# NOTE(review): replacement=True draws the kept rows *with* replacement, which
# is unusual for undersampling — confirm this is intended.
rus = imblearn.under_sampling.RandomUnderSampler(random_state=0, replacement=True)
df.X_resampled, df.y_resampled = rus.fit_resample(df.X_train, df.y_train)
counts=df.y_resampled.value_counts()
plt.title(f'Resampled ratio of {dep_var}')
plt.pie(counts, labels=counts.index, autopct=lambda x:f'{x:.2f}%', colors=['#ff6060','#60ff60'], explode=[0.005,.005])
plt.show()
%%time
# Re-train the same six model families on the undersampled (balanced) split
# and record metrics under "<Model>_undersampling" keys.
if 'scores' not in globals():
    scores=dict()
if 'ŷs' not in globals():
    ŷs=dict()
random_state=1
# NOTE(review): "urdersample" is a typo for "undersample" (left as-is; renaming
# would be a code change, not a comment change).
input_models_urdersample = [
    sklearn.tree.DecisionTreeClassifier(random_state=random_state),
    sklearn.ensemble.BaggingClassifier(random_state=random_state),
    sklearn.ensemble.RandomForestClassifier(random_state=random_state),
    sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
    sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
    xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss")
]
mkjumptable('model', [type(model).__name__+'_undersampling' for model in input_models_urdersample]) # add patches
for model in input_models_urdersample:
    mname=type(model).__name__+'_undersampling'
    placelabel('model', mname)
    model.fit(df.X_resampled, df.y_resampled)  # train on the balanced split
    IPython.display.display(model)
    ŷs[mname] = model.predict(df.X_test)  # evaluate on the untouched test split
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
    # if 'printslow' in vars() and printslow:
    #     tree_graph(model, feature_names)
    # Can't do this on all types; do we want to do this (or something else) as appropriate?
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
score(accuracy=0.6675073785886773, recall=0.7072810011376565, precision=0.3876901970566226, f1=0.5008458873761379, roc_auc=0.6812563713553451)
BaggingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(random_state=1)
score(accuracy=0.7943654413737591, recall=0.7006825938566553, precision=0.5503037884203003, f1=0.6164548093283956, roc_auc=0.7619810441193391)
RandomForestClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=1)
score(accuracy=0.7504158840890797, recall=0.7212741751990899, precision=0.4805942995755003, f1=0.5768355927577109, roc_auc=0.7403421437793203)
GradientBoostingClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(random_state=1)
score(accuracy=0.7857526160450765, recall=0.7036405005688282, precision=0.5348032857760484, f1=0.6077130926062393, roc_auc=0.7573680030934028)
AdaBoostClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(random_state=1)
score(accuracy=0.7495841159109203, recall=0.6502844141069397, precision=0.4773277661795407, f1=0.550541777028654, roc_auc=0.7152580778399868)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8425006707807888, recall=0.7952218430034129, precision=0.6320072332730561, f1=0.7042821158690177, roc_auc=0.8261572698163133)
CPU times: total: 35.5 s Wall time: 21.3 s
# Combined metrics table: baseline + undersampling runs.
df_val_results=pd.DataFrame.from_dict(scores, orient='index') #.sort_values('accuracy')
df_val_results
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
DecisionTreeClassifier_undersampling | 0.67 | 0.71 | 0.39 | 0.50 | 0.68 |
BaggingClassifier_undersampling | 0.79 | 0.70 | 0.55 | 0.62 | 0.76 |
RandomForestClassifier_undersampling | 0.75 | 0.72 | 0.48 | 0.58 | 0.74 |
GradientBoostingClassifier_undersampling | 0.79 | 0.70 | 0.53 | 0.61 | 0.76 |
AdaBoostClassifier_undersampling | 0.75 | 0.65 | 0.48 | 0.55 | 0.72 |
XGBClassifier_undersampling | 0.84 | 0.80 | 0.63 | 0.70 | 0.83 |
params=dict()  # best hyper-parameters found per grid-searched model, keyed by model name
%%time
# 5-fold GridSearchCV over each model family.  Each entry below is
# (estimator, parameter_grid, enabled_flag); set the flag False to skip one.
random_state=1
input_models = [
    (sklearn.tree.DecisionTreeClassifier(random_state=random_state),
     dict(
         criterion= ["gini", "entropy"],
         max_depth= [None]+np.arange(2, 50, 4).tolist(),
         max_leaf_nodes= [None]+np.arange(2, 20, 4).tolist(),
     ),
     True),
    (sklearn.ensemble.BaggingClassifier(random_state=random_state),
     dict(
         n_estimators= np.arange(8, 14, 2),
         bootstrap= [True, False],
         bootstrap_features= [True, False],
     ),
     True
    ),
    (sklearn.ensemble.RandomForestClassifier(random_state=random_state),
     dict(
         criterion= ['entropy', 'log_loss', 'gini'],
         max_depth= [None]+np.arange(2, 50, 4).tolist(),
         max_leaf_nodes= [None]+np.arange(2, 20, 4).tolist(),
     ),
     True),
    (sklearn.ensemble.GradientBoostingClassifier(random_state=random_state),
     dict(
         # loss=['log_loss', 'exponential'],
         # learning_rate=np.linspace(0.15,0.3,3),
         learning_rate=[0.15],
         n_estimators=[10000],
         # n_estimators=[100000],
         # n_estimators=10**np.arange(1, 7, 2),
         # subsample = [0.95, 1.0],
         max_depth=[11],
         # max_depth=[None]+np.arange(10, 20, 10).tolist(),
         # criterion=['friedman_mse', 'squared_error'],
         # max_leaf_nodes=[None]+np.arange(10, 20, 5).tolist(),
     ), True),
    (sklearn.ensemble.AdaBoostClassifier(random_state=random_state),
     dict(
         learning_rate=2.0**np.arange(-1,2,1),
         # n_estimators=10**np.arange(1, 6, 3),
         n_estimators=[10000],
         # algorithm= ['SAMME', 'SAMME.R']
     ),
     True
    ),
    (xgboost.XGBClassifier(random_state=random_state, eval_metric="logloss"),
     dict(
         # grow_policy=['depthwise','lossguide'],
         # tree_method = ['hist', 'approx'],
         # max_leaves = np.arange(0,10,2)**2,
         booster =['gbtree', 'gblinear', 'dart']
     ),True
    ),
]
# Keep only the (estimator, grid) pairs whose enabled flag is True.
input_models=[(m[0], m[1]) for m in input_models if len(m)==3 and m[2]]
n_jobs=-1
mkjumptable('GridSearchCV', [type(model).__name__+'_grid_search' for model, _ in input_models]) # add patches
for (estimator,parameters) in input_models:
    mname=type(estimator).__name__+'_grid_search'
    placelabel('GridSearchCV', mname)
    # Grid of parameters to choose from
    # Type of scoring used to compare parameter combinations
    # NOTE(review): scoring is accuracy; the trailing "f1_score" comment
    # suggests f1 may have been intended — confirm which metric should drive
    # the search on this imbalanced target.
    scorer = sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score # f1_score
                                        )
    # Run the grid search
    grid_obj = sklearn.model_selection.GridSearchCV(estimator, parameters, scoring=scorer, cv=5, n_jobs=n_jobs)
    grid_obj = grid_obj.fit(df.X_train, df.y_train)
    # get the best combination of parameters
    models[mname] = grid_obj.best_estimator_
    # Fit the best algorithm to the data.
    # NOTE(review): with the default refit=True, best_estimator_ is already
    # fitted on the full X_train, so this extra fit appears redundant.
    models[mname].fit(df.X_train, df.y_train)
    IPython.display.display(grid_obj.best_estimator_)
    ŷs[mname] = models[mname].predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    params[mname]=grid_obj.best_params_
    plot_confusion(df.y_test, ŷs[mname])
DecisionTreeClassifier(criterion='entropy', max_depth=14, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=14, random_state=1)
score(accuracy=0.8287362489938288, recall=0.41547212741751993, precision=0.7457627118644068, f1=0.5336450646598963, roc_auc=0.6858786198885353)
BaggingClassifier(n_estimators=12, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BaggingClassifier(n_estimators=12, random_state=1)
score(accuracy=0.8562382613361953, recall=0.5133105802047782, precision=0.8068669527896996, f1=0.6274509803921569, roc_auc=0.7376946159450858)
RandomForestClassifier(criterion='entropy', max_depth=30, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(criterion='entropy', max_depth=30, random_state=1)
score(accuracy=0.8286557552991682, recall=0.3328782707622298, precision=0.8486078886310905, f1=0.47818270959307074, roc_auc=0.6572748095384183)
GradientBoostingClassifier(learning_rate=0.15, max_depth=11, n_estimators=10000, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(learning_rate=0.15, max_depth=11, n_estimators=10000, random_state=1)
score(accuracy=0.8829085055004025, recall=0.6581342434584755, precision=0.8097704367301232, f1=0.726120246014811, roc_auc=0.8052082734146311)
AdaBoostClassifier(n_estimators=10000, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(n_estimators=10000, random_state=1)
score(accuracy=0.8476254360075127, recall=0.4805460750853242, precision=0.7914558740865655, f1=0.5980038224676152, roc_auc=0.7207330094527744)
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8752079420445399, recall=0.6022753128555176, precision=0.8209024654985269, f1=0.6947962464728656, roc_auc=0.7808602687873093)
CPU times: total: 1h 4min 6s Wall time: 1h 57min 7s
# Full metrics table: baseline + undersampling + grid-search runs.
df_val_results=pd.DataFrame.from_dict(scores, orient='index') #.sort_values('accuracy')
df_val_results
accuracy | recall | precision | f1 | roc_auc | |
---|---|---|---|---|---|
DecisionTreeClassifier | 0.78 | 0.57 | 0.54 | 0.55 | 0.71 |
BaggingClassifier | 0.85 | 0.50 | 0.80 | 0.61 | 0.73 |
RandomForestClassifier | 0.83 | 0.33 | 0.84 | 0.47 | 0.66 |
GradientBoostingClassifier | 0.83 | 0.35 | 0.84 | 0.49 | 0.66 |
AdaBoostClassifier | 0.82 | 0.31 | 0.81 | 0.45 | 0.64 |
XGBClassifier | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
DecisionTreeClassifier_undersampling | 0.67 | 0.71 | 0.39 | 0.50 | 0.68 |
BaggingClassifier_undersampling | 0.79 | 0.70 | 0.55 | 0.62 | 0.76 |
RandomForestClassifier_undersampling | 0.75 | 0.72 | 0.48 | 0.58 | 0.74 |
GradientBoostingClassifier_undersampling | 0.79 | 0.70 | 0.53 | 0.61 | 0.76 |
AdaBoostClassifier_undersampling | 0.75 | 0.65 | 0.48 | 0.55 | 0.72 |
XGBClassifier_undersampling | 0.84 | 0.80 | 0.63 | 0.70 | 0.83 |
DecisionTreeClassifier_grid_search | 0.83 | 0.42 | 0.75 | 0.53 | 0.69 |
BaggingClassifier_grid_search | 0.86 | 0.51 | 0.81 | 0.63 | 0.74 |
RandomForestClassifier_grid_search | 0.83 | 0.33 | 0.85 | 0.48 | 0.66 |
GradientBoostingClassifier_grid_search | 0.88 | 0.66 | 0.81 | 0.73 | 0.81 |
AdaBoostClassifier_grid_search | 0.85 | 0.48 | 0.79 | 0.60 | 0.72 |
XGBClassifier_grid_search | 0.88 | 0.60 | 0.82 | 0.69 | 0.78 |
# Long-format view of the metrics table for grouped bar plotting.
melted_df_val_results = df_val_results.reset_index(names=['Name']).melt(
    id_vars=["Name"], var_name="Metric", value_name="Value")
plt.figure(figsize=(12, 7))
sns.barplot(data=melted_df_val_results, x='Name', y='Value', hue='Metric')
plt.title('Metrics by Name')
plt.ylabel('Value')
plt.xlabel('Name')
plt.xticks(rotation=30, ha='right')  # Rotate x-axis labels by 30 degrees and align to the
plt.legend(title='Metric', loc='upper left', bbox_to_anchor=(1, 1))  # Move legend outside
plt.show()
models['XGBClassifier_grid_search']
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster='gbtree', callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
n_jobs=-1
foo='re_search'
# Re-fit the best grid-search GBM configuration, this time on the
# undersampled (balanced) split.
redo_models=[sklearn.ensemble.GradientBoostingClassifier(random_state=random_state,learning_rate=0.15, max_depth=11, n_estimators=10000)]
# Pass display names rather than estimator objects, matching the other
# mkjumptable calls in this notebook.
mkjumptable(foo, [type(m).__name__+'_'+foo for m in redo_models]) # add patches
for estimator in redo_models:
    mname=type(estimator).__name__+'_'+foo
    placelabel(foo, mname)
    # BUG FIX: this loop previously called `model.fit(...)` / `model.predict(...)`,
    # silently reusing the leftover `model` variable from an earlier cell
    # instead of the estimator defined here (the cell output even showed an
    # XGBClassifier repr).  Use the loop variable throughout.
    estimator.fit(df.X_resampled, df.y_resampled)
    IPython.display.display(estimator)
    ŷs[mname] = estimator.predict(df.X_test)
    sc=mkscore(ŷs[mname], df.y_test)
    print(sc)
    scores[mname]=sc
    plot_confusion(df.y_test, ŷs[mname])
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric='logloss', feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=1, ...)
score(accuracy=0.8425006707807888, recall=0.7952218430034129, precision=0.6320072332730561, f1=0.7042821158690177, roc_auc=0.8261572698163133)