Imports¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import os
# imbalanced-learn import
import imblearn as im
# sklearn imports
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
train_test_split,
cross_validate,
GridSearchCV)
from sklearn.metrics import (
fbeta_score,
make_scorer,
confusion_matrix,
ConfusionMatrixDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import (
RandomForestClassifier,
HistGradientBoostingClassifier,
AdaBoostClassifier)
from sklearn.neural_network import MLPClassifier
# scipy import
from scipy.stats import mannwhitneyu
data = pd.read_csv('creditcard.csv')
target = ['Class']
exclude_cols = ['Time']
Exploratory Data Analysis¶
The dataset contains transactions made by European cardholders over a two-day period in September 2013. The numerical input variables V1 through V28 are the result of a PCA transformation.
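For context, components like these are obtained by projecting standardized raw features onto their principal axes. A minimal sketch on hypothetical stand-in data (the raw features behind V1–V28 are confidential and not released):
# Sketch only: illustrates the kind of transform behind V1..V28 on made-up data.
from sklearn.decomposition import PCA
raw = np.random.default_rng(0).normal(size=(1000,28))   # hypothetical raw features
V = PCA(n_components=28).fit_transform(StandardScaler().fit_transform(raw))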
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
There are no missing values in the dataset.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
with pd.option_context('display.float_format',lambda x: '%.2f' % x):
display(data.describe())
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | ... | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 |
mean | 94813.86 | 0.00 | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | ... | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | -0.00 | 88.35 | 0.00 |
std | 47488.15 | 1.96 | 1.65 | 1.52 | 1.42 | 1.38 | 1.33 | 1.24 | 1.19 | 1.10 | ... | 0.73 | 0.73 | 0.62 | 0.61 | 0.52 | 0.48 | 0.40 | 0.33 | 250.12 | 0.04 |
min | 0.00 | -56.41 | -72.72 | -48.33 | -5.68 | -113.74 | -26.16 | -43.56 | -73.22 | -13.43 | ... | -34.83 | -10.93 | -44.81 | -2.84 | -10.30 | -2.60 | -22.57 | -15.43 | 0.00 | 0.00 |
25% | 54201.50 | -0.92 | -0.60 | -0.89 | -0.85 | -0.69 | -0.77 | -0.55 | -0.21 | -0.64 | ... | -0.23 | -0.54 | -0.16 | -0.35 | -0.32 | -0.33 | -0.07 | -0.05 | 5.60 | 0.00 |
50% | 84692.00 | 0.02 | 0.07 | 0.18 | -0.02 | -0.05 | -0.27 | 0.04 | 0.02 | -0.05 | ... | -0.03 | 0.01 | -0.01 | 0.04 | 0.02 | -0.05 | 0.00 | 0.01 | 22.00 | 0.00 |
75% | 139320.50 | 1.32 | 0.80 | 1.03 | 0.74 | 0.61 | 0.40 | 0.57 | 0.33 | 0.60 | ... | 0.19 | 0.53 | 0.15 | 0.44 | 0.35 | 0.24 | 0.09 | 0.08 | 77.16 | 0.00 |
max | 172792.00 | 2.45 | 22.06 | 9.38 | 16.88 | 34.80 | 73.30 | 120.59 | 20.01 | 15.59 | ... | 27.20 | 10.50 | 22.53 | 4.58 | 7.52 | 3.52 | 31.61 | 33.85 | 25691.16 | 1.00 |
8 rows × 31 columns
Negative class (0) (non-fraudulent transactions): 99.83%
Positive class (1) (fraudulent transactions): 0.17%
data[target].value_counts(normalize=True)
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64
Over the period of data collection, fraudulent transactions accounted for €60,128 of the total transaction value of €25,162,590, which amounts to 0.24% of the total value transacted.
fraud_amount = data[data[target[0]] == 1]['Amount'].sum()
total_amount = data['Amount'].sum()
loss_ratio = fraud_amount / total_amount
fraud_amount, total_amount, loss_ratio
(60127.97, 25162590.009999998, 0.002389577939953885)
There is no need to check pairwise correlations: V1–V28 come from PCA and are therefore uncorrelated by construction.
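As a quick numerical sanity check (an optional addition), the largest absolute off-diagonal correlation among V1–V28 can be inspected; it should be close to zero:
# Sanity check: principal components should be (close to) mutually uncorrelated.
pca_cols = [f'V{i}' for i in range(1,29)]
corr = data[pca_cols].corr().abs().to_numpy()
np.fill_diagonal(corr, 0)                        # ignore the trivial self-correlations
print(f'Max absolute pairwise correlation: {corr.max():.4f}')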
Univariate plots¶
def plot_univariate(data,columns,factor=5,**args):
    '''
    Makes a grid of axes populated by kdeplots of `columns`.
    The size of the grid is scaled by `factor`; additional named
    arguments are passed through to `sns.kdeplot()`.
    '''
    assert type(columns) is list
    n = len(columns)
    nrows = int(np.ceil(n/3))
    _, axs = plt.subplots(nrows,3,figsize=[3 * factor,nrows * factor])
    for i,feature in enumerate(columns):
        sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)
    plt.tight_layout()
desired_cols = [col for col in data.columns if col not in exclude_cols]
plot_univariate(data,desired_cols)
Scatterplots can be deceptive here because of heavy overplotting, so a bivariate histogram is shown alongside one for comparison.
_, axs = plt.subplots(1,2,figsize=[10,5])
sns.histplot(data,x='V9',y='V10',bins=50,cbar=True,ax=axs[0])
sns.scatterplot(data,x='V9',y='V10',ax=axs[1])
plt.tight_layout()
All the V features are orthogonal since they are derived from PCA, so bivariate plots do not reveal any significant correlation between them.
sns.pairplot(data[['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','Class']],hue='Class',corner=True,kind='hist',plot_kws={'bins':30})
<seaborn.axisgrid.PairGrid at 0x168f7af00>
From a visual inspection of the graphs below, V6, V13, V15, V22, V24, V25 and V26 appear to have very similar density distributions across fraudulent and non-fraudulent cases.
plot_univariate(data,desired_cols,hue='Class',common_norm=False)
/var/folders/5b/0xnktdbn1cl34pj3c4bwxrlr0000gn/T/ipykernel_47480/4109673558.py:12: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)
(The warning arises from plotting the Class column itself, which is constant within each hue level.)
Difference in means¶
For some features the class means differ markedly, while for others they are quite close.
data.groupby('Class').mean()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Class | |||||||||||||||||||||
0 | 94838.202258 | 0.008258 | -0.006271 | 0.012171 | -0.007860 | 0.005453 | 0.002419 | 0.009637 | -0.000987 | 0.004467 | ... | -0.000644 | -0.001235 | -0.000024 | 0.000070 | 0.000182 | -0.000072 | -0.000089 | -0.000295 | -0.000131 | 88.291022 |
1 | 80746.806911 | -4.771948 | 3.623778 | -7.033281 | 4.542029 | -3.151225 | -1.397737 | -5.568731 | 0.570636 | -2.581123 | ... | 0.372319 | 0.713588 | 0.014049 | -0.040308 | -0.105130 | 0.041449 | 0.051648 | 0.170575 | 0.075667 | 122.211321 |
2 rows × 30 columns
The Mann-Whitney U test is a non-parametric test of whether two populations are identically distributed.
The test is not significant for the columns listed below, so they are added to the list of excluded columns, as they are unlikely to provide discriminating information.
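As a minimal illustration on synthetic data (not part of the notebook's analysis), a small p-value argues against identical distributions:
# Illustration only: two synthetic samples whose locations differ.
rng = np.random.default_rng(0)
a = rng.normal(loc=0.0,size=200)
b = rng.normal(loc=0.5,size=200)
stat, p = mannwhitneyu(a,b)
print(f'U = {stat:.1f}, p = {p:.2e}')   # p << 0.05: reject identical distributions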
def multiple_mannwu(data,target,exclude=None,pvalue_thresh=0.01):
    '''
    Compares the p-value from `mannwhitneyu()` with a given threshold `pvalue_thresh`.
    Returns the columns in `data`, except `target` and `exclude`, whose p-value is above
    the threshold, indicating that the null hypothesis (the two groups split by `target`
    are identically distributed) cannot be rejected.
    '''
if type(exclude) is list:
exclude = exclude + target
elif exclude is None:
exclude = target
else:
exclude = target + [exclude]
d_col = [col for col in data.columns if col not in exclude]
mask = data[target[0]] == 0
nosig = []
for col in d_col:
x = data[mask][col]
y = data[~mask][col]
if mannwhitneyu(x,y).pvalue > pvalue_thresh:
nosig.append(col)
return nosig
_features = multiple_mannwu(data,target,pvalue_thresh=0.05)
exclude = exclude_cols + _features
display(_features)
['V13', 'V15', 'V22']
Modelling¶
Split the data 80:20 between train and test sets in a stratified manner. The test set is set aside for final evaluation.
train,test = train_test_split(data,test_size=0.2,stratify=data[target],random_state=9876)
train.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
230184 | 146226.0 | 2.255889 | -1.505012 | -0.808560 | -1.668634 | -1.308710 | -0.195650 | -1.432749 | 0.093177 | -1.171014 | ... | -0.124897 | 0.060801 | 0.310999 | 0.647424 | -0.323789 | -0.222878 | 0.008770 | -0.054068 | 7.15 | 0 |
204689 | 135394.0 | 2.106139 | -0.798143 | -1.189310 | -0.720381 | -0.758720 | -0.553299 | -1.070078 | 0.075041 | -0.020218 | ... | 0.264418 | 0.778379 | 0.068968 | -0.521369 | -0.186387 | -0.078634 | 0.021085 | -0.027298 | 19.95 | 0 |
28599 | 35071.0 | -1.185866 | 1.226142 | 0.804271 | 0.958248 | -0.001800 | -0.885471 | 0.460888 | 0.078149 | -0.717416 | ... | 0.194333 | 0.600367 | 0.122303 | 0.425851 | -0.172097 | -0.357723 | -0.272225 | 0.069364 | 10.50 | 0 |
119959 | 75650.0 | -1.406713 | 1.493848 | 1.225097 | -1.349293 | 0.449902 | -0.655615 | 0.760328 | 0.305292 | -0.640502 | ... | -0.241368 | -0.860797 | -0.279210 | -0.452110 | 0.584831 | -0.094257 | -0.051081 | 0.036349 | 2.50 | 0 |
174824 | 122085.0 | -0.518872 | 0.323049 | -0.530168 | -0.989712 | 1.634748 | -0.437116 | 1.404158 | -0.431957 | 0.051537 | ... | 0.183203 | 0.837499 | -0.067642 | 0.198096 | -0.381740 | -0.747842 | -0.139885 | -0.003166 | 46.13 | 0 |
5 rows × 31 columns
The train set consists of 227,845 samples and the test set of 56,962 samples.
print('No. of train instances : ',train.shape[0])
print('No. of test instances : ', test.shape[0])
No. of train instances :  227845
No. of test instances :  56962
The distribution of classes in the train set is similar to the overall dataset.
display(train[target].value_counts())
display(train[target].value_counts(normalize=True))
Class
0    227451
1       394
Name: count, dtype: int64
Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64
def get_xy(data,target,exclude=None,size=None):
    '''
    Splits `data` into X and y on the basis of `target` while excluding columns in `exclude`.
    If `size` is provided as a float in (0,1), returns a stratified resampled subset of the data.
    '''
    assert type(target) is list
    exclude = exclude if exclude is not None else []   # avoid a mutable default argument
    assert type(exclude) is list
    if size is not None and 0 < size < 1:
        data = resample(data,
                        replace=False,
                        n_samples=int(np.ceil(size*data.shape[0])),
                        stratify=data[target],
                        random_state=9876)
    return data[[col for col in data.columns if col not in [target[0],*exclude]]], data[target[0]]
get_xy(train,target)[1].value_counts()
Class
0    227451
1       394
Name: count, dtype: int64
Evaluation metric¶
Selected evaluation metrics: Average Precision (a step-wise summary of the area under the precision-recall curve) and the F2 score.
Reason: We need high recall in this situation in order to detect as many truly fraudulent transactions as possible. However, due to the precision-recall tradeoff, high recall will adversely impact precision, which translates to poor user experience as many genuine transactions would be classified as fraudulent. So we need an evaluation metric that takes both precision and recall into account and that focuses on the positive class.
The F1 score and PR AUC are two such metrics. F1 favours precision and recall taking similar values, but no such balance is required in this scenario: we are aiming for higher recall. Instead of the F1 score, we can use the F2 score, which weights recall more heavily than precision. PR AUC serves as a summary metric that accounts for precision and recall across all thresholds.
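To see how the beta parameter shifts the balance, consider synthetic labels with recall 0.9 but precision 0.5; F2 rewards the high recall far more than F1 does:
# Toy example with synthetic labels: 10 positives, recall 0.9, precision 0.5.
y_true = [1]*10 + [0]*90
y_pred = [1]*9 + [0] + [1]*9 + [0]*81               # 9 TP, 1 FN, 9 FP, 81 TN
print('F1:', fbeta_score(y_true, y_pred, beta=1))   # ~0.64, dragged down by precision
print('F2:', fbeta_score(y_true, y_pred, beta=2))   # ~0.78, rewards the high recall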
f2 = make_scorer(fbeta_score,beta=2)
chosen_scoring = {'average precision': 'average_precision','f2':f2}
Balancing¶
oversampler = im.over_sampling.SMOTE(sampling_strategy=0.01,random_state=9876)
undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)
The positive class has now been increased to ~2% of total instances.
# steps = [('over',oversampler),('under',undersampler)]
# rebalance = im.pipeline.Pipeline(steps)
# rebalance.fit_resample(*get_xy(train,target,exclude_cols))[1].value_counts()
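Run uncommented, the pipeline above reports the resampled class counts. The same arithmetic can also be sketched directly from the train-split counts (approximate, since imblearn rounds its target counts):
# Back-of-envelope check of the resampling arithmetic (counts from the train split).
n_neg, n_pos = 227451, 394
n_pos_smote = int(0.01 * n_neg)                 # SMOTE targets a pos/neg ratio of 0.01
n_neg_rus = int(n_pos_smote / 0.02)             # RUS targets a pos/neg ratio of 0.02
share = n_pos_smote / (n_pos_smote + n_neg_rus)
print(f'{n_pos_smote} positives vs {n_neg_rus} negatives ({share:.2%} positive)')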
Testing Models¶
# Helper functions
def test_model(model,x,y,scoring,**args):
'''
    Returns the results of cross-validating `model` on the metrics in `scoring`, along with train scores.
    Additional named arguments are passed through to `cross_validate()`.
'''
assert type(scoring) is dict
cv_scores = cross_validate(model,x,y,scoring=scoring,return_train_score=True,**args)
return cv_scores
def summary(results,agg=False):
    '''
    Returns the mean value of the columns in `results`.
    If `agg` is True, `results` is a dictionary of results and a dataframe
    collating their means is returned.
    '''
    if agg is True:
        assert type(results) is dict
        df = pd.DataFrame()
        for key in results.keys():
            df[key] = pd.DataFrame(results[key]).mean(axis=0)
        return df
    else:
        return pd.DataFrame(results).mean(axis=0)
def test_over_models(models,x,y,scoring,**args):
'''
Tests models passed as dictionary in `models` on data `x` and `y`
'''
assert type(models) is dict
results = {}
for key in models.keys():
results[key] = test_model(models[key],x,y,scoring,**args)
display(summary(results,agg=True))
def test_over_data(model,datas,scoring,**args):
'''
Tests model passed as `model` on different datasets passed as a list of lists in `datas`
The elements of `datas` need to be formatted as ['name','x','y']
'''
results = {}
assert type(datas) is list
for data in datas:
name = data[0]
_x = data[1]
_y = data[2]
results[name] = test_model(model,_x,_y,scoring,**args)
display(summary(results,agg=True))
This section tests the models individually. Each model is wrapped in an imblearn pipeline so that resampling is applied only to the training folds during cross-validation, avoiding leakage into the validation folds.
models = {
'Logistic': make_pipeline(StandardScaler(),LogisticRegression()),
'LDA': LinearDiscriminantAnalysis(),
'KNN': KNeighborsClassifier(),
'Tree': DecisionTreeClassifier(),
'SVC': LinearSVC(),
'RF': RandomForestClassifier(),
'HGB': HistGradientBoostingClassifier(),
'Ada': AdaBoostClassifier(),
'MLP': MLPClassifier()
}
pipe_models = { key : im.pipeline.make_pipeline(oversampler,undersampler,model) for key,model in models.items() }
train_x, train_y = get_xy(train,target,exclude_cols)
results = {}
results['Logistic'] = test_model(pipe_models['Logistic'],train_x,train_y,chosen_scoring)
summary(results['Logistic'])
fit_time                    0.379676
score_time                  0.025833
test_average precision      0.775992
train_average precision     0.783169
test_f2                     0.811099
train_f2                    0.816886
dtype: float64
results['KNN'] = test_model(pipe_models['KNN'],train_x,train_y,chosen_scoring)
summary(results['KNN'])
fit_time                    0.117326
score_time                  7.054832
test_average precision      0.726229
train_average precision     0.920933
test_f2                     0.748187
train_f2                    0.893922
dtype: float64
results['Tree'] = test_model(pipe_models['Tree'],train_x,train_y,chosen_scoring)
summary(results['Tree'])
fit_time                    4.390193
score_time                  0.014634
test_average precision      0.357940
train_average precision     0.633491
test_f2                     0.694900
train_f2                    0.896062
dtype: float64
results['SVC'] = test_model(pipe_models['SVC'],train_x,train_y,chosen_scoring)
summary(results['SVC'])
fit_time                    0.578071
score_time                  0.030805
test_average precision      0.786126
train_average precision     0.788145
test_f2                     0.811447
train_f2                    0.816411
dtype: float64
results['RF'] = test_model(pipe_models['RF'],train_x,train_y,chosen_scoring)
summary(results['RF'])
fit_time                   49.517888
score_time                  0.395611
test_average precision      0.851992
train_average precision     0.996217
test_f2                     0.842551
train_f2                    0.988834
dtype: float64
results['HGB'] = test_model(pipe_models['HGB'],train_x,train_y,chosen_scoring)
summary(results['HGB'])
fit_time                    2.093622
score_time                  0.123978
test_average precision      0.852891
train_average precision     0.988732
test_f2                     0.838377
train_f2                    0.981290
dtype: float64
results['Ada'] = test_model(pipe_models['Ada'],train_x,train_y,chosen_scoring)
summary(results['Ada'])
fit_time                   16.489126
score_time                  0.201321
test_average precision      0.759975
train_average precision     0.778394
test_f2                     0.760541
train_f2                    0.755231
dtype: float64
results['MLP'] = test_model(pipe_models['MLP'],train_x,train_y,chosen_scoring)
summary(results['MLP'])
fit_time                   39.928020
score_time                  0.069507
test_average precision      0.808602
train_average precision     0.955041
test_f2                     0.799826
train_f2                    0.939518
dtype: float64
The best-performing models with default configuration are the tree-based ensemble methods.
Histogram-based Gradient Boosting provides good performance with low training and inference times, but both it and Random Forest overfit (quantified after the summary table below).
summary(results,agg = True)
Logistic | KNN | Tree | SVC | RF | HGB | Ada | MLP | |
---|---|---|---|---|---|---|---|---|
fit_time | 0.379676 | 0.117326 | 4.390193 | 0.578071 | 49.517888 | 2.093622 | 16.489126 | 39.928020 |
score_time | 0.025833 | 7.054832 | 0.014634 | 0.030805 | 0.395611 | 0.123978 | 0.201321 | 0.069507 |
test_average precision | 0.775992 | 0.726229 | 0.357940 | 0.786126 | 0.851992 | 0.852891 | 0.759975 | 0.808602 |
train_average precision | 0.783169 | 0.920933 | 0.633491 | 0.788145 | 0.996217 | 0.988732 | 0.778394 | 0.955041 |
test_f2 | 0.811099 | 0.748187 | 0.694900 | 0.811447 | 0.842551 | 0.838377 | 0.760541 | 0.799826 |
train_f2 | 0.816886 | 0.893922 | 0.896062 | 0.816411 | 0.988834 | 0.981290 | 0.755231 | 0.939518 |
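To make the overfitting concrete, the train-test F2 gap per model can be computed from the collated results (a hypothetical follow-on reusing `results` from above):
# Hypothetical follow-on: train-minus-test F2 gap per model (larger = more overfit).
df_cv = summary(results,agg=True)
gap = (df_cv.loc['train_f2'] - df_cv.loc['test_f2']).sort_values(ascending=False)
display(gap)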
The following tests are performed on 10% of the original train set across different models.
test_over_models(pipe_models,*get_xy(train,target,exclude_cols,size=0.1),chosen_scoring)
Logistic | LDA | KNN | Tree | SVC | RF | HGB | Ada | MLP | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 0.034858 | 0.079543 | 0.016641 | 0.192856 | 0.081436 | 2.127398 | 1.545502 | 1.408812 | 5.634064 |
score_time | 0.005146 | 0.009446 | 0.200080 | 0.004202 | 0.015375 | 0.026604 | 0.029841 | 0.028889 | 0.013227 |
test_average precision | 0.764215 | 0.764178 | 0.616071 | 0.327894 | 0.754806 | 0.819435 | 0.811073 | 0.737595 | 0.787113 |
train_average precision | 0.848495 | 0.752214 | 0.860910 | 0.589516 | 0.852941 | 0.968123 | 0.970338 | 0.901878 | 0.914766 |
test_f2 | 0.759132 | 0.856624 | 0.693362 | 0.668273 | 0.777703 | 0.848291 | 0.831575 | 0.730343 | 0.824123 |
train_f2 | 0.831018 | 0.833318 | 0.834748 | 0.871039 | 0.838366 | 0.975333 | 0.982594 | 0.884723 | 0.921345 |
The following tests a given model over different fractions of the train set.
ds_data = []
for size in np.arange(0.1,1,0.1):
    size = round(size,1)   # avoid float artefacts such as 0.30000000000000004 in labels
    ds_data.append([str(size),*get_xy(train,target,exclude_cols,size=size)])
test_over_data(pipe_models['HGB'],ds_data,chosen_scoring)
0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 1.549482 | 1.220542 | 1.127654 | 1.317820 | 1.792807 | 1.663364 | 1.976792 | 2.203707 | 2.221403 |
score_time | 0.029732 | 0.027938 | 0.030928 | 0.043338 | 0.067005 | 0.070172 | 0.098156 | 0.120267 | 0.136374 |
test_average precision | 0.811073 | 0.838364 | 0.827606 | 0.837979 | 0.860979 | 0.830672 | 0.832617 | 0.835914 | 0.838150 |
train_average precision | 0.970338 | 0.939075 | 0.979897 | 0.978999 | 0.991391 | 0.986564 | 0.983307 | 0.989744 | 0.983047 |
test_f2 | 0.831575 | 0.816265 | 0.827197 | 0.836423 | 0.844497 | 0.839662 | 0.833661 | 0.831518 | 0.829214 |
train_f2 | 0.982594 | 0.973856 | 0.975021 | 0.979435 | 0.981353 | 0.977319 | 0.971988 | 0.979300 | 0.974785 |
test_over_data(pipe_models['RF'],ds_data,chosen_scoring)
0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 2.147401 | 6.021093 | 11.053385 | 15.909114 | 21.025546 | 27.100481 | 31.314481 | 38.253374 | 44.598699 |
score_time | 0.028112 | 0.055147 | 0.092112 | 0.134303 | 0.189957 | 0.211098 | 0.265522 | 0.315061 | 0.355359 |
test_average precision | 0.813307 | 0.785515 | 0.808419 | 0.853890 | 0.836636 | 0.834036 | 0.826098 | 0.817533 | 0.830270 |
train_average precision | 0.982358 | 0.980441 | 0.981132 | 0.991165 | 0.995313 | 0.991519 | 0.992421 | 0.988512 | 0.993725 |
test_f2 | 0.852137 | 0.849467 | 0.858357 | 0.861843 | 0.849332 | 0.846081 | 0.843803 | 0.827664 | 0.831152 |
train_f2 | 0.982628 | 0.983196 | 0.984989 | 0.986288 | 0.988220 | 0.981101 | 0.983069 | 0.990094 | 0.987086 |
Hyperparameter Tuning¶
def save_results(results,filename,folder="."):
    '''
    Saves `results` as CSV to `filename` inside `folder`, creating the folder if needed.
    '''
folderpath = Path(folder)
filepath = Path(folder,filename)
os.makedirs(folderpath,exist_ok=True)
results.to_csv(filepath)
For tuning purposes, 50% of the original train set is used in order to speed up training.
tuning_x, tuning_y = get_xy(train,target,exclude_cols,size=0.5)
tuning_y.value_counts()
Class
0    113726
1       197
Name: count, dtype: int64
Histogram Based Gradient Boosting Classifier¶
# hgb = im.pipeline.Pipeline([
# ('smote',oversampler),
# ('rus', undersampler),
# ('hgb', HistGradientBoostingClassifier(scoring='average_precision'))
# ])
# summary(test_model(hgb,tuning_x,tuning_y,scoring=chosen_scoring))
The major problem is overfitting, so parameters are tuned while also monitoring mean_train_score to keep the train and validation scores close.
# param_dist_hgb = [{
#     'smote__sampling_strategy': [samp_strat / 2],
#     'rus__sampling_strategy': [samp_strat],
# 'hgb__max_bins': [50,100],
# 'hgb__max_depth': np.arange(2,5,1),
# 'hgb__learning_rate': [0.1,0.2],
# # 'hgb__l2_regularization': [0,100,500,1000],
# 'hgb__max_features': [0.4,0.5,0.6],
# # 'hgb__min_samples_leaf': np.arange(3000,8001,2000),
# # 'hgb__max_iter' : [100,500,1000]
# } for samp_strat in [0.02,0.05,0.1]]
# param_dist_hgb
# gsc_hgb = GridSearchCV(hgb,param_dist_hgb,scoring='average_precision',cv=10,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
# gsc_results_hgb = pd.DataFrame(gsc_hgb.cv_results_)
# with pd.option_context('display.max_colwidth',None,'display.max_rows',None):
# display(gsc_results_hgb.sort_values(by='rank_test_score')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']])
# save_results(gsc_results_hgb,'maxbins,maxdepth,learningrate,maxfeatures,samplingstrategy',Path('results/hgb'))
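The full search above was run offline; a minimal runnable sketch of the same pattern, with an assumed two-point grid, looks like this:
# Minimal sketch of the tuning pattern with an assumed, deliberately tiny grid.
sketch_pipe = im.pipeline.Pipeline([
    ('smote', oversampler),
    ('rus', undersampler),
    ('hgb', HistGradientBoostingClassifier(random_state=9876))])
sketch_grid = {'hgb__max_depth': [2,3]}
gsc = GridSearchCV(sketch_pipe,sketch_grid,scoring='average_precision',
                   cv=3,return_train_score=True).fit(tuning_x,tuning_y)
# A small gap between mean_train_score and mean_test_score indicates less overfitting.
cols = ['params','mean_test_score','mean_train_score']
display(pd.DataFrame(gsc.cv_results_).sort_values(by='rank_test_score')[cols])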
pipe_hgb = im.pipeline.make_pipeline(oversampler,undersampler,HistGradientBoostingClassifier())
pipe_hgb.set_params(
    smote__sampling_strategy = 0.025,
    randomundersampler__sampling_strategy = 0.05,
    histgradientboostingclassifier__max_bins = 50,
    histgradientboostingclassifier__scoring = 'average_precision',
    histgradientboostingclassifier__max_depth = 2,
    histgradientboostingclassifier__learning_rate = 0.1,
    histgradientboostingclassifier__max_features = 0.4)
summary(test_model(pipe_hgb,train_x,train_y,scoring=chosen_scoring))
fit_time                    0.806242
score_time                  0.055328
test_average precision      0.797309
train_average precision     0.829837
test_f2                     0.798474
train_f2                    0.818594
dtype: float64
Random Forest Classifier¶
# rf = im.pipeline.Pipeline([
# ('smote',oversampler),
# ('rus', undersampler),
# ('rf',RandomForestClassifier(n_estimators=10))
# ])
# summary(test_model(rf,train_x,train_y,scoring=chosen_scoring))
# summary(test_model(rf,tuning_x,tuning_y,scoring=chosen_scoring))
# param_dist_rf = [{
# 'smote__sampling_strategy' : [samp_strat / 2],
# 'rus__sampling_strategy' : [samp_strat],
# 'rf__n_estimators': [100],
# 'rf__criterion': ['entropy'],
# 'rf__max_samples': np.arange(0.4,0.7,0.1),
# 'rf__bootstrap': [True],
# 'rf__max_leaf_nodes': [50,100,150],
# 'rf__max_features': [0.4,0.5,0.6],
# # 'rf__min_samples_leaf': np.linspace(1,250,9,dtype=int),
# # 'rf__min_samples_split': np.geomspace(2,10000,6,dtype=int),
# 'rf__max_depth' : [5,6,7]
# } for samp_strat in [0.02,0.05,0.1]]
# param_dist_rf
# gsc_rf = GridSearchCV(rf,param_dist_rf,scoring='average_precision',cv=5,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
# gsc_results_rf = pd.DataFrame(gsc_rf.cv_results_)
# with pd.option_context('display.max_colwidth',None,'display.max_rows',None):
# display(
# gsc_results_rf.sort_values(by='rank_test_score')
# .query('param_rf__max_depth in [5,6,7]')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']]
# )
# save_results(gsc_results_rf,'maxdepth,maxfeatures,maxsamples,maxleafnodes,criterion,estimators,samplingstrategy',Path('results/rf'))
pipe_rf = im.pipeline.make_pipeline(oversampler,undersampler,RandomForestClassifier())
pipe_rf.set_params(
    randomforestclassifier__n_estimators = 100,
    randomforestclassifier__max_depth = 5,
    randomforestclassifier__criterion = 'entropy',
    randomforestclassifier__bootstrap = True,
    randomforestclassifier__max_samples = 0.5,
    randomforestclassifier__max_features = 0.5,
    randomforestclassifier__max_leaf_nodes = 100,
    smote__sampling_strategy = 0.01,
    randomundersampler__sampling_strategy = 0.02
)
summary(test_model(pipe_rf,train_x,train_y,scoring=chosen_scoring))
fit_time                   36.088854
score_time                  0.163479
test_average precision      0.837601
train_average precision     0.873818
test_f2                     0.831567
train_f2                    0.847607
dtype: float64
Test set evaluation¶
test_x,test_y = get_xy(test,target,exclude_cols)
Histogram-based Gradient Boosting provides better results on the test set.
Histogram Based Gradient Boosting Classifier¶
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.025,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.05,random_state=9876)
tuned_hgb = HistGradientBoostingClassifier(max_bins = 50,
scoring = 'average_precision',
max_depth = 2,
learning_rate = 0.1,
max_features = 0.4,
random_state=9876)
tuned_pipeline_hgb = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_hgb)
tuned_pipeline_hgb.fit(train_x,train_y)
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.025)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.05)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(max_bins=50, max_depth=2,
                                                max_features=0.4, random_state=9876,
                                                scoring='average_precision'))])
pred_y = tuned_pipeline_hgb.predict(test_x)
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()
amount_save_ratio = amount_fraud_identified / amount_fraud_total
num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()
num_save_ratio = num_fraud_identified / num_fraud_total
print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio*100:.2f}% (€{amount_fraud_identified:.2f}) of the value previously lost to fraud.')
84.69% of fraudulent transactions were correctly identified, saving 70.46% (€8593.05) of the value previously lost to fraud.
pred_y_train = tuned_pipeline_hgb.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);
Random Forest Classifier¶
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.01,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)
tuned_rf = RandomForestClassifier(n_estimators = 100,
max_depth = 5,
criterion = 'entropy',
bootstrap = True,
max_samples = 0.5,
max_features = 0.5,
max_leaf_nodes = 100,
random_state=9876)
tuned_pipeline_rf = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_rf)
tuned_pipeline_rf.fit(train_x,train_y)
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.01)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.02)),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=5,
                                        max_features=0.5, max_leaf_nodes=100,
                                        max_samples=0.5, random_state=9876))])
pred_y = tuned_pipeline_rf.predict(test_x)
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()
amount_save_ratio = amount_fraud_identified / amount_fraud_total
num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()
num_save_ratio = num_fraud_identified / num_fraud_total
print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio*100:.2f}% (€{amount_fraud_identified:.2f}) of the value previously lost to fraud.')
80.61% of fraudulent transactions were correctly identified, saving 67.74% (€8261.48) of the value previously lost to fraud.
pred_y_train = tuned_pipeline_rf.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);