Imports¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import os

# imbalanced-learn import
import imblearn as im

# sklearn imports
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
    train_test_split, 
    cross_validate, 
    GridSearchCV)
from sklearn.metrics import (
    fbeta_score, 
    make_scorer,
    confusion_matrix,
    ConfusionMatrixDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import (
    RandomForestClassifier, 
    HistGradientBoostingClassifier, 
    AdaBoostClassifier)
from sklearn.neural_network import MLPClassifier

# scipy import
from scipy.stats import mannwhitneyu
In [2]:
data = pd.read_csv('creditcard.csv')
In [3]:
target = ['Class']
exclude_cols = ['Time']

Exploratory Data Analysis¶

The dataset contains transactions made by European cardholders over a two-day period in September 2013. The numerical input variables V1 to V28 are the result of a PCA transformation.
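For context, a PCA feature matrix like V1–V28 is typically produced along these lines (a sketch only; the raw features behind this dataset were never released, so `raw` below is hypothetical):

# Hypothetical illustration of how V1..V28-style features could be derived.
# `raw` stands in for the undisclosed original transaction features.
# from sklearn.decomposition import PCA
# pca_features = PCA(n_components=28).fit_transform(StandardScaler().fit_transform(raw))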

In [4]:
data.head()
Out[4]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

There are no missing values in the dataset.

In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64  
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
In [6]:
with pd.option_context('display.float_format',lambda x: '%.2f' % x):
    display(data.describe())
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 ... 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00
mean 94813.86 0.00 0.00 -0.00 0.00 0.00 0.00 -0.00 0.00 -0.00 ... 0.00 -0.00 0.00 0.00 0.00 0.00 -0.00 -0.00 88.35 0.00
std 47488.15 1.96 1.65 1.52 1.42 1.38 1.33 1.24 1.19 1.10 ... 0.73 0.73 0.62 0.61 0.52 0.48 0.40 0.33 250.12 0.04
min 0.00 -56.41 -72.72 -48.33 -5.68 -113.74 -26.16 -43.56 -73.22 -13.43 ... -34.83 -10.93 -44.81 -2.84 -10.30 -2.60 -22.57 -15.43 0.00 0.00
25% 54201.50 -0.92 -0.60 -0.89 -0.85 -0.69 -0.77 -0.55 -0.21 -0.64 ... -0.23 -0.54 -0.16 -0.35 -0.32 -0.33 -0.07 -0.05 5.60 0.00
50% 84692.00 0.02 0.07 0.18 -0.02 -0.05 -0.27 0.04 0.02 -0.05 ... -0.03 0.01 -0.01 0.04 0.02 -0.05 0.00 0.01 22.00 0.00
75% 139320.50 1.32 0.80 1.03 0.74 0.61 0.40 0.57 0.33 0.60 ... 0.19 0.53 0.15 0.44 0.35 0.24 0.09 0.08 77.16 0.00
max 172792.00 2.45 22.06 9.38 16.88 34.80 73.30 120.59 20.01 15.59 ... 27.20 10.50 22.53 4.58 7.52 3.52 31.61 33.85 25691.16 1.00

8 rows × 31 columns

Negative class (0) (non-fraudulent transactions): 99.83%

Positive class (1) (fraudulent transactions): 0.17%

In [7]:
data[target].value_counts(normalize=True)
Out[7]:
Class
0        0.998273
1        0.001727
Name: proportion, dtype: float64

Over the period of data collection, out of a total transaction value of €25,162,590, €60,128 was involved in fraudulent transactions, which amounts to 0.24% of the total transaction value.

In [8]:
fraud_amount = data[data[target[0]] == 1]['Amount'].sum()
total_amount = data['Amount'].sum()
loss_ratio = fraud_amount / total_amount

fraud_amount, total_amount, loss_ratio
Out[8]:
(60127.97, 25162590.009999998, 0.002389577939953885)

There is no need to check correlations among the V features: they come from PCA and are orthogonal by construction.
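As an optional sanity check (assuming the V columns are the raw PCA outputs), the largest absolute off-diagonal correlation can be computed directly:

pca_cols = [f'V{i}' for i in range(1, 29)]
corr = data[pca_cols].corr().to_numpy()
np.fill_diagonal(corr, 0)     # ignore the trivial self-correlations
print(np.abs(corr).max())     # expected to be ~0 for PCA-derived features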

Univariate plots¶

In [9]:
def plot_univariate(data,columns,factor=5,**args):
    ''' 
    Makes a grid of axes which will be populated by kdeplots of `columns`
    The size of the grid is scaled by `factor`
    '''
    assert type(columns) is list
    
    n = len(columns)
    _, axs = plt.subplots(int(np.ceil(n/3)),3,figsize=[3 * factor,int(np.ceil(n/3) * factor)])

    for i,feature in enumerate(columns):
        sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)

    plt.tight_layout()
In [10]:
desired_cols = [col for col in data.columns if col not in exclude_cols]
plot_univariate(data,desired_cols)
[Figure: univariate KDE plots for each feature except Time]

Scatterplots can be deceptive when many points overlap, so a bivariate histogram is used alongside for comparison.

In [11]:
_, axs = plt.subplots(1,2,figsize=[10,5])
sns.histplot(data,x='V9',y='V10',bins=50,cbar=True,ax=axs[0])
sns.scatterplot(data,x='V9',y='V10',ax=axs[1])
plt.tight_layout()
[Figure: bivariate histogram (left) vs scatterplot (right) of V9 against V10]

All the features in the dataset are orthogonal as they are derived from PCA, so the bivariate plots do not reveal any significant correlation.

In [12]:
sns.pairplot(data[['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','Class']],hue='Class',corner=True,kind='hist',plot_kws={'bins':30})
Out[12]:
<seaborn.axisgrid.PairGrid at 0x168f7af00>
[Figure: corner pairplot of V1–V10, histograms coloured by Class]

From a visual inspection of the graphs below, V6, V13, V15, V22, V24, V25 and V26 seem to have very similar density distributions across fraudulent and non-fraudulent cases.

In [13]:
plot_univariate(data,desired_cols,hue='Class',common_norm=False)
/var/folders/5b/0xnktdbn1cl34pj3c4bwxrlr0000gn/T/ipykernel_47480/4109673558.py:12: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)
[Figure: class-conditional KDE plots for each feature]

Difference in means¶

For some features the class means appear quite different, while for others they are quite close.

In [14]:
data.groupby('Class').mean()
Out[14]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount
Class
0 94838.202258 0.008258 -0.006271 0.012171 -0.007860 0.005453 0.002419 0.009637 -0.000987 0.004467 ... -0.000644 -0.001235 -0.000024 0.000070 0.000182 -0.000072 -0.000089 -0.000295 -0.000131 88.291022
1 80746.806911 -4.771948 3.623778 -7.033281 4.542029 -3.151225 -1.397737 -5.568731 0.570636 -2.581123 ... 0.372319 0.713588 0.014049 -0.040308 -0.105130 0.041449 0.051648 0.170575 0.075667 122.211321

2 rows × 30 columns

The Mann-Whitney U test is a non-parametric test of whether two populations are identically distributed.

The test result is not significant for the columns listed below. Hence, they are added to the list of excluded columns, as they are unlikely to provide discriminating information.

In [15]:
def multiple_mannwu(data,target,exclude=None,pvalue_thresh=0.01):
    '''
    Compares the p-value from `mannwhitneyu()` with a given threshold `pvalue_thresh`
    Returns the columns in `data`, except `target`, whose p-value is above the threshold,
    indicating that the null hypothesis (the two groups of a column, split by `target`,
    come from the same distribution) cannot be rejected
    '''
    if type(exclude) is list:
        exclude = exclude + target
    elif exclude is None:
        exclude = target
    else:
        exclude = target + [exclude]

    d_col = [col for col in data.columns if col not in exclude]
    
    mask = data[target[0]] == 0
    
    nosig = []
    
    for col in d_col:
        x = data[mask][col]
        y = data[~mask][col]
        if mannwhitneyu(x,y).pvalue > pvalue_thresh:
            nosig.append(col)
    
    return nosig
In [16]:
_features = multiple_mannwu(data,target,pvalue_thresh=0.05)
exclude = exclude_cols + _features
display(_features)
['V13', 'V15', 'V22']

Modelling¶

Split the data 80:20 between train and test set in a stratified manner. The test set is set aside for final evaluation.

In [17]:
train,test = train_test_split(data,test_size=0.2,stratify=data[target],random_state=9876)
In [18]:
train.head()
Out[18]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
230184 146226.0 2.255889 -1.505012 -0.808560 -1.668634 -1.308710 -0.195650 -1.432749 0.093177 -1.171014 ... -0.124897 0.060801 0.310999 0.647424 -0.323789 -0.222878 0.008770 -0.054068 7.15 0
204689 135394.0 2.106139 -0.798143 -1.189310 -0.720381 -0.758720 -0.553299 -1.070078 0.075041 -0.020218 ... 0.264418 0.778379 0.068968 -0.521369 -0.186387 -0.078634 0.021085 -0.027298 19.95 0
28599 35071.0 -1.185866 1.226142 0.804271 0.958248 -0.001800 -0.885471 0.460888 0.078149 -0.717416 ... 0.194333 0.600367 0.122303 0.425851 -0.172097 -0.357723 -0.272225 0.069364 10.50 0
119959 75650.0 -1.406713 1.493848 1.225097 -1.349293 0.449902 -0.655615 0.760328 0.305292 -0.640502 ... -0.241368 -0.860797 -0.279210 -0.452110 0.584831 -0.094257 -0.051081 0.036349 2.50 0
174824 122085.0 -0.518872 0.323049 -0.530168 -0.989712 1.634748 -0.437116 1.404158 -0.431957 0.051537 ... 0.183203 0.837499 -0.067642 0.198096 -0.381740 -0.747842 -0.139885 -0.003166 46.13 0

5 rows × 31 columns

The train set consists of 227,845 samples and test set consists of 56,962 samples.

In [19]:
print('No. of train instances : ',train.shape[0])
print('No. of test instances : ', test.shape[0])
No. of train instances :  227845
No. of test instances :  56962

The distribution of classes in the train set is similar to the overall dataset.

In [20]:
display(train[target].value_counts())
display(train[target].value_counts(normalize=True))
Class
0        227451
1           394
Name: count, dtype: int64
Class
0        0.998271
1        0.001729
Name: proportion, dtype: float64
In [21]:
def get_xy(data,target,exclude=None,size=None):
    '''
    Splits `data` into X and y on the basis of `target` while excluding columns in `exclude`
    If `size` is provided as a float in (0,1), returns a stratified resampled subset of the data.
    '''
    assert type(target) is list
    exclude = [] if exclude is None else exclude   # avoid a mutable default argument
    assert type(exclude) is list

    if size is not None and 0 < size < 1:
        data = resample(data,
                        replace=False,
                        n_samples=int(np.ceil(size*data.shape[0])),
                        stratify=data[target],
                        random_state=9876)
    
    return data[[col for col in data.columns if col not in [target[0],*exclude]]], data[target[0]]
In [22]:
get_xy(train,target)[1].value_counts()
Out[22]:
Class
0    227451
1       394
Name: count, dtype: int64

Evaluation metric¶

Selected evaluation metrics: Average Precision (a discretized version of PR AUC) and the F2 score.

Reason: We need high recall in this situation in order to catch as many truly fraudulent transactions as possible. However, due to the precision-recall tradeoff, high recall will adversely impact precision, which would translate into a poor user experience as many genuine transactions would be flagged as fraudulent. So we need an evaluation metric that takes both precision and recall into account while focusing on the positive class.

F1 score and PR AUC are two such metrics. F1 favours precision and recall being similar, but no such balance is required in this scenario: we are aiming for higher recall. Instead of the F1 score, we can use the F2 score, which gives more weight to recall than to precision. PR AUC is a summary metric that accounts for all thresholds and the resulting precision and recall.
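As a quick illustration with toy numbers (not from this dataset): a classifier with precision 0.6 and recall 0.9 gets both weighted equally by F1, while F2 rewards the high recall, since F-beta weights recall beta² times as much as precision.

y_true = np.array([1]*10 + [0]*90)
y_pred = np.array([1]*9 + [0] + [1]*6 + [0]*84)   # precision 0.6, recall 0.9

print(fbeta_score(y_true,y_pred,beta=1))   # F1 = 0.72
print(fbeta_score(y_true,y_pred,beta=2))   # F2 ≈ 0.82, pulled towards recall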

In [23]:
f2 = make_scorer(fbeta_score,beta=2)
chosen_scoring = {'average precision': 'average_precision','f2':f2}

Balancing¶

In [24]:
oversampler = im.over_sampling.SMOTE(sampling_strategy=0.01,random_state=9876)
undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)

After resampling, the positive class rises to roughly 2% of total instances; the commented-out cell below would verify this, and a quick back-of-the-envelope check follows it.

In [25]:
# steps = [('over',oversampler),('under',undersampler)]
# rebalance = im.pipeline.Pipeline(steps)
# rebalance.fit_resample(*get_xy(train,target,exclude_cols))[1].value_counts()
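The arithmetic behind the ~2% figure (assuming `sampling_strategy` given as a float is the minority-to-majority ratio after each step, as imbalanced-learn defines it for binary targets):

n_neg, n_pos = 227451, 394                      # class counts in the train set
n_pos_smote = int(0.01 * n_neg)                 # SMOTE grows the minority to 1% of the majority
n_neg_rus = int(n_pos_smote / 0.02)             # undersampling shrinks the majority to a 2% ratio
print(n_pos_smote / (n_pos_smote + n_neg_rus))  # ~0.0196, i.e. roughly 2% positives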

Testing Models¶

In [26]:
# Helper functions

def test_model(model,x,y,scoring,**args):
    '''
    Returns the results of cross-validating `model` on metrics `scoring`, along with train scores
    Additional named arguments are passed through to `cross_validate()`
    '''
    assert type(scoring) is dict

    cv_scores = cross_validate(model,x,y,scoring=scoring,return_train_score=True,**args)  

    return cv_scores

def summary(results,agg=False):
    '''
    Returns the mean value of columns in `results`
    If `agg` is True, `results` is a dictionary of cross-validation results and a
    dataframe collating their means is returned
    '''
    if agg is True:
        assert type(results) is dict
        df = pd.DataFrame()
        for key in results.keys():
            df[key] = pd.DataFrame(pd.DataFrame(results[key]).mean(axis=0))
        return df
    else:
        return pd.DataFrame(results).mean(axis=0)
            
def test_over_models(models,x,y,scoring,**args):
    '''
    Tests models passed as dictionary in `models` on data `x` and `y`
    '''
    assert type(models) is dict

    results = {}
    for key in models.keys():
        results[key] = test_model(models[key],x,y,scoring,**args)

    display(summary(results,agg=True))

def test_over_data(model,datas,scoring,**args):
    '''
    Tests model passed as `model` on different datasets passed as a list of lists in `datas`
    The elements of `datas` need to be formatted as ['name','x','y']
    '''
    results = {}
    assert type(datas) is list

    for data in datas:
        name = data[0]
        _x = data[1]
        _y = data[2]
        results[name] = test_model(model,_x,_y,scoring,**args)
    
    display(summary(results,agg=True))

This section is for individual testing of models.

In [27]:
models = {
    'Logistic': make_pipeline(StandardScaler(),LogisticRegression()),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'Tree': DecisionTreeClassifier(),
    'SVC': LinearSVC(),
    'RF': RandomForestClassifier(),
    'HGB': HistGradientBoostingClassifier(),
    'Ada': AdaBoostClassifier(),
    'MLP': MLPClassifier()
}

pipe_models = { key : im.pipeline.make_pipeline(oversampler,undersampler,models[key]) for key in models.keys() } 
In [28]:
train_x, train_y = get_xy(train,target,exclude_cols)
In [29]:
results = {}
In [30]:
results['Logistic'] = test_model(pipe_models['Logistic'],train_x,train_y,chosen_scoring)
summary(results['Logistic'])
Out[30]:
fit_time                   0.379676
score_time                 0.025833
test_average precision     0.775992
train_average precision    0.783169
test_f2                    0.811099
train_f2                   0.816886
dtype: float64
In [31]:
results['KNN'] = test_model(pipe_models['KNN'],train_x,train_y,chosen_scoring)
summary(results['KNN'])
Out[31]:
fit_time                   0.117326
score_time                 7.054832
test_average precision     0.726229
train_average precision    0.920933
test_f2                    0.748187
train_f2                   0.893922
dtype: float64
In [32]:
results['Tree'] = test_model(pipe_models['Tree'],train_x,train_y,chosen_scoring)
summary(results['Tree'])
Out[32]:
fit_time                   4.390193
score_time                 0.014634
test_average precision     0.357940
train_average precision    0.633491
test_f2                    0.694900
train_f2                   0.896062
dtype: float64
In [33]:
results['SVC'] = test_model(pipe_models['SVC'],train_x,train_y,chosen_scoring)
summary(results['SVC'])
Out[33]:
fit_time                   0.578071
score_time                 0.030805
test_average precision     0.786126
train_average precision    0.788145
test_f2                    0.811447
train_f2                   0.816411
dtype: float64
In [34]:
results['RF'] = test_model(pipe_models['RF'],train_x,train_y,chosen_scoring)
summary(results['RF'])
Out[34]:
fit_time                   49.517888
score_time                  0.395611
test_average precision      0.851992
train_average precision     0.996217
test_f2                     0.842551
train_f2                    0.988834
dtype: float64
In [35]:
results['HGB'] = test_model(pipe_models['HGB'],train_x,train_y,chosen_scoring)
summary(results['HGB'])
Out[35]:
fit_time                   2.093622
score_time                 0.123978
test_average precision     0.852891
train_average precision    0.988732
test_f2                    0.838377
train_f2                   0.981290
dtype: float64
In [36]:
results['Ada'] = test_model(pipe_models['Ada'],train_x,train_y,chosen_scoring)
summary(results['Ada'])
Out[36]:
fit_time                   16.489126
score_time                  0.201321
test_average precision      0.759975
train_average precision     0.778394
test_f2                     0.760541
train_f2                    0.755231
dtype: float64
In [37]:
results['MLP'] = test_model(pipe_models['MLP'],train_x,train_y,chosen_scoring)
summary(results['MLP'])
Out[37]:
fit_time                   39.928020
score_time                  0.069507
test_average precision      0.808602
train_average precision     0.955041
test_f2                     0.799826
train_f2                    0.939518
dtype: float64

The best-performing models with default configurations are the tree-based ensemble methods.

Histogram Gradient Boosting provides good performance with low train and inference times, but both Histogram-based Gradient Boosting and Random Forest are overfitting.

In [38]:
summary(results,agg = True)
Out[38]:
Logistic KNN Tree SVC RF HGB Ada MLP
fit_time 0.379676 0.117326 4.390193 0.578071 49.517888 2.093622 16.489126 39.928020
score_time 0.025833 7.054832 0.014634 0.030805 0.395611 0.123978 0.201321 0.069507
test_average precision 0.775992 0.726229 0.357940 0.786126 0.851992 0.852891 0.759975 0.808602
train_average precision 0.783169 0.920933 0.633491 0.788145 0.996217 0.988732 0.778394 0.955041
test_f2 0.811099 0.748187 0.694900 0.811447 0.842551 0.838377 0.760541 0.799826
train_f2 0.816886 0.893922 0.896062 0.816411 0.988834 0.981290 0.755231 0.939518

The following tests are performed on 10% of the original train set across different models.

In [39]:
test_over_models(pipe_models,*get_xy(train,target,exclude_cols,size=0.1),chosen_scoring)
Logistic LDA KNN Tree SVC RF HGB Ada MLP
fit_time 0.034858 0.079543 0.016641 0.192856 0.081436 2.127398 1.545502 1.408812 5.634064
score_time 0.005146 0.009446 0.200080 0.004202 0.015375 0.026604 0.029841 0.028889 0.013227
test_average precision 0.764215 0.764178 0.616071 0.327894 0.754806 0.819435 0.811073 0.737595 0.787113
train_average precision 0.848495 0.752214 0.860910 0.589516 0.852941 0.968123 0.970338 0.901878 0.914766
test_f2 0.759132 0.856624 0.693362 0.668273 0.777703 0.848291 0.831575 0.730343 0.824123
train_f2 0.831018 0.833318 0.834748 0.871039 0.838366 0.975333 0.982594 0.884723 0.921345

The following tests a given model over different sizes of the dataset.

In [40]:
ds_data = []
for size in np.arange(0.1,1,0.1):
    ds_data.append([f'{size:.1f}',*get_xy(train,target,exclude_cols,size=size)])
In [41]:
test_over_data(pipe_models['HGB'],ds_data,chosen_scoring)
0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
fit_time 1.549482 1.220542 1.127654 1.317820 1.792807 1.663364 1.976792 2.203707 2.221403
score_time 0.029732 0.027938 0.030928 0.043338 0.067005 0.070172 0.098156 0.120267 0.136374
test_average precision 0.811073 0.838364 0.827606 0.837979 0.860979 0.830672 0.832617 0.835914 0.838150
train_average precision 0.970338 0.939075 0.979897 0.978999 0.991391 0.986564 0.983307 0.989744 0.983047
test_f2 0.831575 0.816265 0.827197 0.836423 0.844497 0.839662 0.833661 0.831518 0.829214
train_f2 0.982594 0.973856 0.975021 0.979435 0.981353 0.977319 0.971988 0.979300 0.974785
In [42]:
test_over_data(pipe_models['RF'],ds_data,chosen_scoring)
0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
fit_time 2.147401 6.021093 11.053385 15.909114 21.025546 27.100481 31.314481 38.253374 44.598699
score_time 0.028112 0.055147 0.092112 0.134303 0.189957 0.211098 0.265522 0.315061 0.355359
test_average precision 0.813307 0.785515 0.808419 0.853890 0.836636 0.834036 0.826098 0.817533 0.830270
train_average precision 0.982358 0.980441 0.981132 0.991165 0.995313 0.991519 0.992421 0.988512 0.993725
test_f2 0.852137 0.849467 0.858357 0.861843 0.849332 0.846081 0.843803 0.827664 0.831152
train_f2 0.982628 0.983196 0.984989 0.986288 0.988220 0.981101 0.983069 0.990094 0.987086

Hyperparameter Tuning¶

In [43]:
def save_results(results,filename,folder="."):
    ''' Save the results to a given file and in a given folder
    '''
    folderpath = Path(folder)
    filepath = Path(folder,filename)
    
    os.makedirs(folderpath,exist_ok=True)
    
    results.to_csv(filepath)

For tuning purposes, 50% of the original train set is used in order to speed up training.

In [44]:
tuning_x, tuning_y = get_xy(train,target,exclude_cols,size=0.5)
In [45]:
tuning_y.value_counts()
Out[45]:
Class
0    113726
1       197
Name: count, dtype: int64

Histogram Based Gradient Boosting Classifier¶

In [46]:
# hgb = im.pipeline.Pipeline([
#     ('smote',oversampler),
#     ('rus', undersampler),
#     ('hgb', HistGradientBoostingClassifier(scoring='average_precision'))
# ])
In [47]:
# summary(test_model(hgb,tuning_x,tuning_y,scoring=chosen_scoring))

The major problem is overfitting, so parameters are also tuned with an eye on mean_train_score, keeping it close to mean_test_score.
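One way to keep that gap visible when inspecting grid-search candidates (a sketch matching the commented-out search below; it assumes a fitted `gsc_hgb` with `return_train_score=True`):

# res = pd.DataFrame(gsc_hgb.cv_results_)
# res['gap'] = res['mean_train_score'] - res['mean_test_score']
# res.sort_values(['mean_test_score','gap'],ascending=[False,True]).head()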

In [48]:
# param_dist_hgb = [{
#               'smote__sampling_strategy': [samp_strat / 2],
#               'rus__sampling_strategy': [samp_strat],
#               'hgb__max_bins': [50,100],
#               'hgb__max_depth': np.arange(2,5,1),
#               'hgb__learning_rate': [0.1,0.2],
#               # 'hgb__l2_regularization': [0,100,500,1000],
#               'hgb__max_features': [0.4,0.5,0.6],
#               # 'hgb__min_samples_leaf': np.arange(3000,8001,2000),
#               # 'hgb__max_iter' : [100,500,1000]
#             } for samp_strat in [0.02,0.05,0.1]]
# param_dist_hgb
In [49]:
# gsc_hgb = GridSearchCV(hgb,param_dist_hgb,scoring='average_precision',cv=10,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
In [50]:
# gsc_results_hgb = pd.DataFrame(gsc_hgb.cv_results_)

# with pd.option_context('display.max_colwidth',None,'display.max_row',None):
#     display(gsc_results_hgb.sort_values(by='rank_test_score')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']])
In [51]:
# save_results(gsc_results_hgb,'maxbins,maxdepth,learningrate,maxfeatures,samplingstrategy',Path('results/hgb'))
In [52]:
# Note: set_params on the pipeline also updates the shared oversampler/undersampler objects in place.
candidate = im.pipeline.make_pipeline(oversampler,undersampler,HistGradientBoostingClassifier())
candidate.set_params(
              smote__sampling_strategy = 0.025,
              randomundersampler__sampling_strategy = 0.05,
              histgradientboostingclassifier__max_bins = 50,
              histgradientboostingclassifier__scoring = 'average_precision',
              histgradientboostingclassifier__max_depth = 2,
              histgradientboostingclassifier__learning_rate = 0.1,
              histgradientboostingclassifier__max_features = 0.4)

summary(test_model(candidate,train_x,train_y,scoring=chosen_scoring))
Out[52]:
fit_time                   0.806242
score_time                 0.055328
test_average precision     0.797309
train_average precision    0.829837
test_f2                    0.798474
train_f2                   0.818594
dtype: float64

Random Forest Classifier¶

In [53]:
# rf = im.pipeline.Pipeline([
#     ('smote',oversampler),
#     ('rus', undersampler),
#     ('rf',RandomForestClassifier(n_estimators=10))
# ])
In [54]:
# summary(test_model(rf,train_x,train_y,scoring=chosen_scoring))
In [55]:
# summary(test_model(rf,tuning_x,tuning_y,scoring=chosen_scoring))
In [56]:
# param_dist_rf = [{
#               'smote__sampling_strategy' : [samp_strat / 2],
#               'rus__sampling_strategy' : [samp_strat],
#               'rf__n_estimators': [100],
#               'rf__criterion': ['entropy'],
#               'rf__max_samples': np.arange(0.4,0.7,0.1),
#               'rf__bootstrap': [True],
#               'rf__max_leaf_nodes': [50,100,150],
#               'rf__max_features': [0.4,0.5,0.6],  
#               # 'rf__min_samples_leaf': np.linspace(1,250,9,dtype=int),
#               # 'rf__min_samples_split': np.geomspace(2,10000,6,dtype=int),
#               'rf__max_depth' : [5,6,7]
#             } for samp_strat in [0.02,0.05,0.1]]
          
# param_dist_rf
In [57]:
# gsc_rf = GridSearchCV(rf,param_dist_rf,scoring='average_precision',cv=5,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
In [58]:
# gsc_results_rf = pd.DataFrame(gsc_rf.cv_results_)

# with pd.option_context('display.max_colwidth',None,'display.max_row',None):
#     display(
#         gsc_results_rf.sort_values(by='rank_test_score')
#         .query('param_rf__max_depth in [5,6,7]')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']]
#         )
In [59]:
# save_results(gsc_results_rf,'maxdepth,maxfeatures,maxsamples,maxleafnodes,criterion,estimators,samplingstrategy',Path('results/rf'))
In [60]:
candidate = im.pipeline.make_pipeline(oversampler,undersampler,RandomForestClassifier())
candidate.set_params(
              randomforestclassifier__n_estimators = 100,
              randomforestclassifier__max_depth = 5,
              randomforestclassifier__criterion = 'entropy',
              randomforestclassifier__bootstrap = True,
              randomforestclassifier__max_samples = 0.5,
              randomforestclassifier__max_features = 0.5,
              randomforestclassifier__max_leaf_nodes = 100,
              smote__sampling_strategy = 0.01,
              randomundersampler__sampling_strategy = 0.02
            )

summary(test_model(candidate,train_x,train_y,scoring=chosen_scoring))
Out[60]:
fit_time                   36.088854
score_time                  0.163479
test_average precision      0.837601
train_average precision     0.873818
test_f2                     0.831567
train_f2                    0.847607
dtype: float64

Test set evaluation¶

In [61]:
test_x,test_y = get_xy(test,target,exclude_cols)

Histogram Gradient Boosting provides better results on the test set.

Histogram Based Gradient Boosting Classifier¶

In [62]:
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.025,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.05,random_state=9876)
tuned_hgb = HistGradientBoostingClassifier(max_bins = 50, 
              scoring = 'average_precision',
              max_depth =  2,
              learning_rate = 0.1,
              max_features = 0.4,
              random_state=9876)

tuned_pipeline_hgb = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_hgb)
In [63]:
tuned_pipeline_hgb.fit(train_x,train_y)
Out[63]:
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.025)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.05)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(max_bins=50, max_depth=2,
                                                max_features=0.4,
                                                random_state=9876,
                                                scoring='average_precision'))])
In [64]:
pred_y = tuned_pipeline_hgb.predict(test_x)
In [65]:
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
[Figure: confusion matrix of the tuned HGB pipeline on the test set]
In [66]:
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()

amount_save_ratio = amount_fraud_identified / amount_fraud_total

num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()

num_save_ratio = num_fraud_identified / num_fraud_total

print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio * 100:.2f}% (€{amount_fraud_identified:.2f}) of the value that would otherwise have been lost to fraud.')
84.69% of fraudulent transactions were correctly identified, saving 70.46% (€8593.05) of the value that would otherwise have been lost to fraud.
In [67]:
pred_y_train = tuned_pipeline_hgb.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);
[Figure: confusion matrix of the tuned HGB pipeline on the train set]

Random Forest Classifier¶

In [68]:
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.01,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)
tuned_rf = RandomForestClassifier(n_estimators = 100,
              max_depth = 5,
              criterion = 'entropy',
              bootstrap = True,
              max_samples = 0.5,
              max_features = 0.5,
              max_leaf_nodes = 100,
              random_state=9876)

tuned_pipeline_rf = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_rf)
In [69]:
tuned_pipeline_rf.fit(train_x,train_y)
Out[69]:
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.01)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.02)),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=5,
                                        max_features=0.5, max_leaf_nodes=100,
                                        max_samples=0.5, random_state=9876))])
In [70]:
pred_y = tuned_pipeline_rf.predict(test_x)
In [71]:
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
[Figure: confusion matrix of the tuned RF pipeline on the test set]
In [72]:
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()

amount_save_ratio = amount_fraud_identified / amount_fraud_total

num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()

num_save_ratio = num_fraud_identified / num_fraud_total

print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio * 100:.2f}% (€{amount_fraud_identified:.2f}) of the value that would otherwise have been lost to fraud.')
80.61% of fraudulent transactions were correctly identified, saving 67.74% (€8261.48) of the value that would otherwise have been lost to fraud.
In [73]:
pred_y_train = tuned_pipeline_rf.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);
[Figure: confusion matrix of the tuned RF pipeline on the train set]
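To put numbers on the comparison between the two tuned pipelines, both can be scored on the held-out set with the chosen metrics (a sketch; note that average_precision_score needs probability scores rather than hard labels):

from sklearn.metrics import average_precision_score

for name, pipe in [('HGB', tuned_pipeline_hgb), ('RF', tuned_pipeline_rf)]:
    proba = pipe.predict_proba(test_x)[:,1]                     # positive-class scores
    f2_val = fbeta_score(test_y, pipe.predict(test_x), beta=2)  # hard-label F2
    print(name, average_precision_score(test_y, proba), f2_val)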