Imports¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import os
# imbalanced-learn import
import imblearn as im
# sklearn imports
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
train_test_split,
cross_validate,
GridSearchCV)
from sklearn.metrics import (
fbeta_score,
make_scorer,
confusion_matrix,
ConfusionMatrixDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import (
RandomForestClassifier,
HistGradientBoostingClassifier,
AdaBoostClassifier)
from sklearn.neural_network import MLPClassifier
# scipy import
from scipy.stats import mannwhitneyu
data = pd.read_csv('creditcard.csv')
target = ['Class']
exclude_cols = ['Time']
Exploratory Data Analysis¶
The dataset contains transactions made by European cardholders over a two-day period in September 2013. The numerical input variables V1 through V28 are the result of a PCA transformation.
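For context, components like these are obtained by projecting standardized raw features onto their principal axes. A minimal sketch on hypothetical stand-in data (the raw features behind V1–V28 are confidential and not released):
# Sketch only: illustrates the kind of transform behind V1..V28 on made-up data.
from sklearn.decomposition import PCA
raw = np.random.default_rng(0).normal(size=(1000,28))   # hypothetical raw features
V = PCA(n_components=28).fit_transform(StandardScaler().fit_transform(raw))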
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
There are no missing values in the dataset.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
with pd.option_context('display.float_format',lambda x: '%.2f' % x):
display(data.describe())
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | ... | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 |
mean | 94813.86 | 0.00 | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | ... | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | -0.00 | 88.35 | 0.00 |
std | 47488.15 | 1.96 | 1.65 | 1.52 | 1.42 | 1.38 | 1.33 | 1.24 | 1.19 | 1.10 | ... | 0.73 | 0.73 | 0.62 | 0.61 | 0.52 | 0.48 | 0.40 | 0.33 | 250.12 | 0.04 |
min | 0.00 | -56.41 | -72.72 | -48.33 | -5.68 | -113.74 | -26.16 | -43.56 | -73.22 | -13.43 | ... | -34.83 | -10.93 | -44.81 | -2.84 | -10.30 | -2.60 | -22.57 | -15.43 | 0.00 | 0.00 |
25% | 54201.50 | -0.92 | -0.60 | -0.89 | -0.85 | -0.69 | -0.77 | -0.55 | -0.21 | -0.64 | ... | -0.23 | -0.54 | -0.16 | -0.35 | -0.32 | -0.33 | -0.07 | -0.05 | 5.60 | 0.00 |
50% | 84692.00 | 0.02 | 0.07 | 0.18 | -0.02 | -0.05 | -0.27 | 0.04 | 0.02 | -0.05 | ... | -0.03 | 0.01 | -0.01 | 0.04 | 0.02 | -0.05 | 0.00 | 0.01 | 22.00 | 0.00 |
75% | 139320.50 | 1.32 | 0.80 | 1.03 | 0.74 | 0.61 | 0.40 | 0.57 | 0.33 | 0.60 | ... | 0.19 | 0.53 | 0.15 | 0.44 | 0.35 | 0.24 | 0.09 | 0.08 | 77.16 | 0.00 |
max | 172792.00 | 2.45 | 22.06 | 9.38 | 16.88 | 34.80 | 73.30 | 120.59 | 20.01 | 15.59 | ... | 27.20 | 10.50 | 22.53 | 4.58 | 7.52 | 3.52 | 31.61 | 33.85 | 25691.16 | 1.00 |
8 rows × 31 columns
Negative class (0) (non-fraudulent transactions): 99.83%
Positive class (1) (fraudulent transactions): 0.17%
data[target].value_counts(normalize=True)
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64
Over the period of data collection, fraudulent transactions accounted for €60,128 of the total transaction value of €25,162,590, which amounts to 0.24% of the total value transacted.
fraud_amount = data[data[target[0]] == 1]['Amount'].sum()
total_amount = data['Amount'].sum()
loss_ratio = fraud_amount / total_amount
fraud_amount, total_amount, loss_ratio
(60127.97, 25162590.009999998, 0.002389577939953885)
There is no need to check pairwise correlations: V1–V28 come from PCA and are therefore uncorrelated by construction.
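As a quick numerical sanity check (an optional addition), the largest absolute off-diagonal correlation among V1–V28 can be inspected; it should be close to zero:
# Sanity check: principal components should be (close to) mutually uncorrelated.
pca_cols = [f'V{i}' for i in range(1,29)]
corr = data[pca_cols].corr().abs().to_numpy()
np.fill_diagonal(corr, 0)                        # ignore the trivial self-correlations
print(f'Max absolute pairwise correlation: {corr.max():.4f}')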
Univariate plots¶
def plot_univariate(data,columns,factor=5,**args):
    '''
    Makes a grid of axes populated by kdeplots of `columns`.
    The size of the grid is scaled by `factor`; additional named
    arguments are passed through to `sns.kdeplot()`.
    '''
    assert type(columns) is list
    n = len(columns)
    nrows = int(np.ceil(n/3))
    _, axs = plt.subplots(nrows,3,figsize=[3 * factor,nrows * factor])
    for i,feature in enumerate(columns):
        sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)
    plt.tight_layout()
desired_cols = [col for col in data.columns if col not in exclude_cols]
plot_univariate(data,desired_cols)
Scatterplots can be deceptive here because of heavy overplotting, so a bivariate histogram is shown alongside one for comparison.
_, axs = plt.subplots(1,2,figsize=[10,5])
sns.histplot(data,x='V9',y='V10',bins=50,cbar=True,ax=axs[0])
sns.scatterplot(data,x='V9',y='V10',ax=axs[1])
plt.tight_layout()
All the V features are orthogonal since they are derived from PCA, so bivariate plots do not reveal any significant correlation between them.
sns.pairplot(data[['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','Class']],hue='Class',corner=True,kind='hist',plot_kws={'bins':30})
<seaborn.axisgrid.PairGrid at 0x168f7af00>
From a visual inspection of the graphs below, V6, V13, V15, V22, V24, V25 and V26 appear to have very similar density distributions across fraudulent and non-fraudulent cases.
plot_univariate(data,desired_cols,hue='Class',common_norm=False)
/var/folders/5b/0xnktdbn1cl34pj3c4bwxrlr0000gn/T/ipykernel_47480/4109673558.py:12: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.kdeplot(data,x=feature,ax=axs.flat[i],**args)
(The warning arises from plotting the Class column itself, which is constant within each hue level.)
Difference in means¶
For some features the class means differ markedly, while for others they are quite close.
data.groupby('Class').mean()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Class | |||||||||||||||||||||
0 | 94838.202258 | 0.008258 | -0.006271 | 0.012171 | -0.007860 | 0.005453 | 0.002419 | 0.009637 | -0.000987 | 0.004467 | ... | -0.000644 | -0.001235 | -0.000024 | 0.000070 | 0.000182 | -0.000072 | -0.000089 | -0.000295 | -0.000131 | 88.291022 |
1 | 80746.806911 | -4.771948 | 3.623778 | -7.033281 | 4.542029 | -3.151225 | -1.397737 | -5.568731 | 0.570636 | -2.581123 | ... | 0.372319 | 0.713588 | 0.014049 | -0.040308 | -0.105130 | 0.041449 | 0.051648 | 0.170575 | 0.075667 | 122.211321 |
2 rows × 30 columns
The Mann-Whitney U test is a non-parametric test of whether two populations are identically distributed.
The test is not significant for the columns listed below, so they are added to the list of excluded columns, as they are unlikely to provide discriminating information.
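As a minimal illustration on synthetic data (not part of the notebook's analysis), a small p-value argues against identical distributions:
# Illustration only: two synthetic samples whose locations differ.
rng = np.random.default_rng(0)
a = rng.normal(loc=0.0,size=200)
b = rng.normal(loc=0.5,size=200)
stat, p = mannwhitneyu(a,b)
print(f'U = {stat:.1f}, p = {p:.2e}')   # p << 0.05: reject identical distributions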
def multiple_mannwu(data,target,exclude=None,pvalue_thresh=0.01):
    '''
    Compares the p-value from `mannwhitneyu()` with a given threshold `pvalue_thresh`.
    Returns the columns in `data`, except `target` and `exclude`, whose p-value is above
    the threshold, indicating that the null hypothesis (the two groups split by `target`
    are identically distributed) cannot be rejected.
    '''
if type(exclude) is list:
exclude = exclude + target
elif exclude is None:
exclude = target
else:
exclude = target + [exclude]
d_col = [col for col in data.columns if col not in exclude]
mask = data[target[0]] == 0
nosig = []
for col in d_col:
x = data[mask][col]
y = data[~mask][col]
if mannwhitneyu(x,y).pvalue > pvalue_thresh:
nosig.append(col)
return nosig
_features = multiple_mannwu(data,target,pvalue_thresh=0.05)
exclude = exclude_cols + _features
display(_features)
['V13', 'V15', 'V22']
Modelling¶
Split the data 80:20 between train and test sets in a stratified manner. The test set is set aside for final evaluation.
train,test = train_test_split(data,test_size=0.2,stratify=data[target],random_state=9876)
train.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
230184 | 146226.0 | 2.255889 | -1.505012 | -0.808560 | -1.668634 | -1.308710 | -0.195650 | -1.432749 | 0.093177 | -1.171014 | ... | -0.124897 | 0.060801 | 0.310999 | 0.647424 | -0.323789 | -0.222878 | 0.008770 | -0.054068 | 7.15 | 0 |
204689 | 135394.0 | 2.106139 | -0.798143 | -1.189310 | -0.720381 | -0.758720 | -0.553299 | -1.070078 | 0.075041 | -0.020218 | ... | 0.264418 | 0.778379 | 0.068968 | -0.521369 | -0.186387 | -0.078634 | 0.021085 | -0.027298 | 19.95 | 0 |
28599 | 35071.0 | -1.185866 | 1.226142 | 0.804271 | 0.958248 | -0.001800 | -0.885471 | 0.460888 | 0.078149 | -0.717416 | ... | 0.194333 | 0.600367 | 0.122303 | 0.425851 | -0.172097 | -0.357723 | -0.272225 | 0.069364 | 10.50 | 0 |
119959 | 75650.0 | -1.406713 | 1.493848 | 1.225097 | -1.349293 | 0.449902 | -0.655615 | 0.760328 | 0.305292 | -0.640502 | ... | -0.241368 | -0.860797 | -0.279210 | -0.452110 | 0.584831 | -0.094257 | -0.051081 | 0.036349 | 2.50 | 0 |
174824 | 122085.0 | -0.518872 | 0.323049 | -0.530168 | -0.989712 | 1.634748 | -0.437116 | 1.404158 | -0.431957 | 0.051537 | ... | 0.183203 | 0.837499 | -0.067642 | 0.198096 | -0.381740 | -0.747842 | -0.139885 | -0.003166 | 46.13 | 0 |
5 rows × 31 columns
The train set consists of 227,845 samples and the test set of 56,962 samples.
print('No. of train instances : ',train.shape[0])
print('No. of test instances : ', test.shape[0])
No. of train instances :  227845
No. of test instances :  56962
The distribution of classes in the train set is similar to the overall dataset.
display(train[target].value_counts())
display(train[target].value_counts(normalize=True))
Class
0    227451
1       394
Name: count, dtype: int64
Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64
def get_xy(data,target,exclude=None,size=None):
    '''
    Splits `data` into X and y on the basis of `target` while excluding columns in `exclude`.
    If `size` is provided as a float in (0,1), returns a stratified resampled subset of the data.
    '''
    assert type(target) is list
    exclude = exclude if exclude is not None else []   # avoid a mutable default argument
    assert type(exclude) is list
    if size is not None and 0 < size < 1:
        data = resample(data,
                        replace=False,
                        n_samples=int(np.ceil(size*data.shape[0])),
                        stratify=data[target],
                        random_state=9876)
    return data[[col for col in data.columns if col not in [target[0],*exclude]]], data[target[0]]
get_xy(train,target)[1].value_counts()
Class
0    227451
1       394
Name: count, dtype: int64
Evaluation metric¶
Selected evaluation metrics: Average Precision (a step-wise summary of the area under the precision-recall curve) and the F2 score.
Reason: We need high recall in this situation in order to detect as many truly fraudulent transactions as possible. However, due to the precision-recall tradeoff, high recall will adversely impact precision, which translates to poor user experience as many genuine transactions would be classified as fraudulent. So we need an evaluation metric that takes both precision and recall into account and that focuses on the positive class.
The F1 score and PR AUC are two such metrics. F1 favours precision and recall taking similar values, but no such balance is required in this scenario: we are aiming for higher recall. Instead of the F1 score, we can use the F2 score, which weights recall more heavily than precision. PR AUC serves as a summary metric that accounts for precision and recall across all thresholds.
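To see how the beta parameter shifts the balance, consider synthetic labels with recall 0.9 but precision 0.5; F2 rewards the high recall far more than F1 does:
# Toy example with synthetic labels: 10 positives, recall 0.9, precision 0.5.
y_true = [1]*10 + [0]*90
y_pred = [1]*9 + [0] + [1]*9 + [0]*81               # 9 TP, 1 FN, 9 FP, 81 TN
print('F1:', fbeta_score(y_true, y_pred, beta=1))   # ~0.64, dragged down by precision
print('F2:', fbeta_score(y_true, y_pred, beta=2))   # ~0.78, rewards the high recall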
f2 = make_scorer(fbeta_score,beta=2)
chosen_scoring = {'average precision': 'average_precision','f2':f2}
Balancing¶
oversampler = im.over_sampling.SMOTE(sampling_strategy=0.01,random_state=9876)
undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)
The positive class has now been increased to ~2% of total instances.
# steps = [('over',oversampler),('under',undersampler)]
# rebalance = im.pipeline.Pipeline(steps)
# rebalance.fit_resample(*get_xy(train,target,exclude_cols))[1].value_counts()
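Run uncommented, the pipeline above reports the resampled class counts. The same arithmetic can also be sketched directly from the train-split counts (approximate, since imblearn rounds its target counts):
# Back-of-envelope check of the resampling arithmetic (counts from the train split).
n_neg, n_pos = 227451, 394
n_pos_smote = int(0.01 * n_neg)                 # SMOTE targets a pos/neg ratio of 0.01
n_neg_rus = int(n_pos_smote / 0.02)             # RUS targets a pos/neg ratio of 0.02
share = n_pos_smote / (n_pos_smote + n_neg_rus)
print(f'{n_pos_smote} positives vs {n_neg_rus} negatives ({share:.2%} positive)')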
Testing Models¶
# Helper functions
def test_model(model,x,y,scoring,**args):
'''
    Returns the results of cross-validating `model` on the metrics in `scoring`, along with train scores.
    Additional named arguments are passed through to `cross_validate()`.
'''
assert type(scoring) is dict
cv_scores = cross_validate(model,x,y,scoring=scoring,return_train_score=True,**args)
return cv_scores
def summary(results,agg=False):
    '''
    Returns the mean value of the columns in `results`.
    If `agg` is True, `results` is a dictionary of results and a dataframe
    collating their means is returned.
    '''
    if agg is True:
        assert type(results) is dict
        df = pd.DataFrame()
        for key in results.keys():
            df[key] = pd.DataFrame(results[key]).mean(axis=0)
        return df
    else:
        return pd.DataFrame(results).mean(axis=0)
def test_over_models(models,x,y,scoring,**args):
'''
Tests models passed as dictionary in `models` on data `x` and `y`
'''
assert type(models) is dict
results = {}
for key in models.keys():
results[key] = test_model(models[key],x,y,scoring,**args)
display(summary(results,agg=True))
def test_over_data(model,datas,scoring,**args):
'''
Tests model passed as `model` on different datasets passed as a list of lists in `datas`
The elements of `datas` need to be formatted as ['name','x','y']
'''
results = {}
assert type(datas) is list
for data in datas:
name = data[0]
_x = data[1]
_y = data[2]
results[name] = test_model(model,_x,_y,scoring,**args)
display(summary(results,agg=True))
This section tests the models individually. Each model is wrapped in an imblearn pipeline so that resampling is applied only to the training folds during cross-validation, avoiding leakage into the validation folds.
models = {
'Logistic': make_pipeline(StandardScaler(),LogisticRegression()),
'LDA': LinearDiscriminantAnalysis(),
'KNN': KNeighborsClassifier(),
'Tree': DecisionTreeClassifier(),
'SVC': LinearSVC(),
'RF': RandomForestClassifier(),
'HGB': HistGradientBoostingClassifier(),
'Ada': AdaBoostClassifier(),
'MLP': MLPClassifier()
}
pipe_models = { key : im.pipeline.make_pipeline(oversampler,undersampler,model) for key,model in models.items() }
train_x, train_y = get_xy(train,target,exclude_cols)
results = {}
results['Logistic'] = test_model(pipe_models['Logistic'],train_x,train_y,chosen_scoring)
summary(results['Logistic'])
fit_time                    0.379676
score_time                  0.025833
test_average precision      0.775992
train_average precision     0.783169
test_f2                     0.811099
train_f2                    0.816886
dtype: float64
results['KNN'] = test_model(pipe_models['KNN'],train_x,train_y,chosen_scoring)
summary(results['KNN'])
fit_time                    0.117326
score_time                  7.054832
test_average precision      0.726229
train_average precision     0.920933
test_f2                     0.748187
train_f2                    0.893922
dtype: float64
results['Tree'] = test_model(pipe_models['Tree'],train_x,train_y,chosen_scoring)
summary(results['Tree'])
fit_time                    4.390193
score_time                  0.014634
test_average precision      0.357940
train_average precision     0.633491
test_f2                     0.694900
train_f2                    0.896062
dtype: float64
results['SVC'] = test_model(pipe_models['SVC'],train_x,train_y,chosen_scoring)
summary(results['SVC'])
fit_time                    0.578071
score_time                  0.030805
test_average precision      0.786126
train_average precision     0.788145
test_f2                     0.811447
train_f2                    0.816411
dtype: float64
results['RF'] = test_model(pipe_models['RF'],train_x,train_y,chosen_scoring)
summary(results['RF'])
fit_time                   49.517888
score_time                  0.395611
test_average precision      0.851992
train_average precision     0.996217
test_f2                     0.842551
train_f2                    0.988834
dtype: float64
results['HGB'] = test_model(pipe_models['HGB'],train_x,train_y,chosen_scoring)
summary(results['HGB'])
fit_time                    2.093622
score_time                  0.123978
test_average precision      0.852891
train_average precision     0.988732
test_f2                     0.838377
train_f2                    0.981290
dtype: float64
results['Ada'] = test_model(pipe_models['Ada'],train_x,train_y,chosen_scoring)
summary(results['Ada'])
fit_time                   16.489126
score_time                  0.201321
test_average precision      0.759975
train_average precision     0.778394
test_f2                     0.760541
train_f2                    0.755231
dtype: float64
results['MLP'] = test_model(pipe_models['MLP'],train_x,train_y,chosen_scoring)
summary(results['MLP'])
fit_time                   39.928020
score_time                  0.069507
test_average precision      0.808602
train_average precision     0.955041
test_f2                     0.799826
train_f2                    0.939518
dtype: float64
The best-performing models with default configuration are the tree-based ensemble methods.
Histogram-based Gradient Boosting provides good performance with low training and inference times, but both it and Random Forest overfit (quantified after the summary table below).
summary(results,agg = True)
Logistic | KNN | Tree | SVC | RF | HGB | Ada | MLP | |
---|---|---|---|---|---|---|---|---|
fit_time | 0.379676 | 0.117326 | 4.390193 | 0.578071 | 49.517888 | 2.093622 | 16.489126 | 39.928020 |
score_time | 0.025833 | 7.054832 | 0.014634 | 0.030805 | 0.395611 | 0.123978 | 0.201321 | 0.069507 |
test_average precision | 0.775992 | 0.726229 | 0.357940 | 0.786126 | 0.851992 | 0.852891 | 0.759975 | 0.808602 |
train_average precision | 0.783169 | 0.920933 | 0.633491 | 0.788145 | 0.996217 | 0.988732 | 0.778394 | 0.955041 |
test_f2 | 0.811099 | 0.748187 | 0.694900 | 0.811447 | 0.842551 | 0.838377 | 0.760541 | 0.799826 |
train_f2 | 0.816886 | 0.893922 | 0.896062 | 0.816411 | 0.988834 | 0.981290 | 0.755231 | 0.939518 |
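To make the overfitting concrete, the train-test F2 gap per model can be computed from the collated results (a hypothetical follow-on reusing `results` from above):
# Hypothetical follow-on: train-minus-test F2 gap per model (larger = more overfit).
df_cv = summary(results,agg=True)
gap = (df_cv.loc['train_f2'] - df_cv.loc['test_f2']).sort_values(ascending=False)
display(gap)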
The following tests are performed on 10% of the original train set across different models.
test_over_models(pipe_models,*get_xy(train,target,exclude_cols,size=0.1),chosen_scoring)
Logistic | LDA | KNN | Tree | SVC | RF | HGB | Ada | MLP | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 0.034858 | 0.079543 | 0.016641 | 0.192856 | 0.081436 | 2.127398 | 1.545502 | 1.408812 | 5.634064 |
score_time | 0.005146 | 0.009446 | 0.200080 | 0.004202 | 0.015375 | 0.026604 | 0.029841 | 0.028889 | 0.013227 |
test_average precision | 0.764215 | 0.764178 | 0.616071 | 0.327894 | 0.754806 | 0.819435 | 0.811073 | 0.737595 | 0.787113 |
train_average precision | 0.848495 | 0.752214 | 0.860910 | 0.589516 | 0.852941 | 0.968123 | 0.970338 | 0.901878 | 0.914766 |
test_f2 | 0.759132 | 0.856624 | 0.693362 | 0.668273 | 0.777703 | 0.848291 | 0.831575 | 0.730343 | 0.824123 |
train_f2 | 0.831018 | 0.833318 | 0.834748 | 0.871039 | 0.838366 | 0.975333 | 0.982594 | 0.884723 | 0.921345 |
The following tests a given model over different fractions of the train set.
ds_data = []
for size in np.arange(0.1,1,0.1):
    size = round(size,1)   # avoid float artefacts such as 0.30000000000000004 in labels
    ds_data.append([str(size),*get_xy(train,target,exclude_cols,size=size)])
test_over_data(pipe_models['HGB'],ds_data,chosen_scoring)
0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 1.549482 | 1.220542 | 1.127654 | 1.317820 | 1.792807 | 1.663364 | 1.976792 | 2.203707 | 2.221403 |
score_time | 0.029732 | 0.027938 | 0.030928 | 0.043338 | 0.067005 | 0.070172 | 0.098156 | 0.120267 | 0.136374 |
test_average precision | 0.811073 | 0.838364 | 0.827606 | 0.837979 | 0.860979 | 0.830672 | 0.832617 | 0.835914 | 0.838150 |
train_average precision | 0.970338 | 0.939075 | 0.979897 | 0.978999 | 0.991391 | 0.986564 | 0.983307 | 0.989744 | 0.983047 |
test_f2 | 0.831575 | 0.816265 | 0.827197 | 0.836423 | 0.844497 | 0.839662 | 0.833661 | 0.831518 | 0.829214 |
train_f2 | 0.982594 | 0.973856 | 0.975021 | 0.979435 | 0.981353 | 0.977319 | 0.971988 | 0.979300 | 0.974785 |
test_over_data(pipe_models['RF'],ds_data,chosen_scoring)
0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | |
---|---|---|---|---|---|---|---|---|---|
fit_time | 2.147401 | 6.021093 | 11.053385 | 15.909114 | 21.025546 | 27.100481 | 31.314481 | 38.253374 | 44.598699 |
score_time | 0.028112 | 0.055147 | 0.092112 | 0.134303 | 0.189957 | 0.211098 | 0.265522 | 0.315061 | 0.355359 |
test_average precision | 0.813307 | 0.785515 | 0.808419 | 0.853890 | 0.836636 | 0.834036 | 0.826098 | 0.817533 | 0.830270 |
train_average precision | 0.982358 | 0.980441 | 0.981132 | 0.991165 | 0.995313 | 0.991519 | 0.992421 | 0.988512 | 0.993725 |
test_f2 | 0.852137 | 0.849467 | 0.858357 | 0.861843 | 0.849332 | 0.846081 | 0.843803 | 0.827664 | 0.831152 |
train_f2 | 0.982628 | 0.983196 | 0.984989 | 0.986288 | 0.988220 | 0.981101 | 0.983069 | 0.990094 | 0.987086 |
Hyperparameter Tuning¶
def save_results(results,filename,folder="."):
    '''
    Saves `results` as CSV to `filename` inside `folder`, creating the folder if needed.
    '''
folderpath = Path(folder)
filepath = Path(folder,filename)
os.makedirs(folderpath,exist_ok=True)
results.to_csv(filepath)
For tuning purposes, 50% of the original train set is used in order to speed up training.
tuning_x, tuning_y = get_xy(train,target,exclude_cols,size=0.5)
tuning_y.value_counts()
Class
0    113726
1       197
Name: count, dtype: int64
Histogram Based Gradient Boosting Classifier¶
# hgb = im.pipeline.Pipeline([
# ('smote',oversampler),
# ('rus', undersampler),
# ('hgb', HistGradientBoostingClassifier(scoring='average_precision'))
# ])
# summary(test_model(hgb,tuning_x,tuning_y,scoring=chosen_scoring))
The major problem is overfitting, so parameters are tuned while also monitoring mean_train_score to keep the train and validation scores close.
# param_dist_hgb = [{
#     'smote__sampling_strategy': [samp_strat / 2],
#     'rus__sampling_strategy': [samp_strat],
# 'hgb__max_bins': [50,100],
# 'hgb__max_depth': np.arange(2,5,1),
# 'hgb__learning_rate': [0.1,0.2],
# # 'hgb__l2_regularization': [0,100,500,1000],
# 'hgb__max_features': [0.4,0.5,0.6],
# # 'hgb__min_samples_leaf': np.arange(3000,8001,2000),
# # 'hgb__max_iter' : [100,500,1000]
# } for samp_strat in [0.02,0.05,0.1]]
# param_dist_hgb
# gsc_hgb = GridSearchCV(hgb,param_dist_hgb,scoring='average_precision',cv=10,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
# gsc_results_hgb = pd.DataFrame(gsc_hgb.cv_results_)
# with pd.option_context('display.max_colwidth',None,'display.max_rows',None):
# display(gsc_results_hgb.sort_values(by='rank_test_score')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']])
# save_results(gsc_results_hgb,'maxbins,maxdepth,learningrate,maxfeatures,samplingstrategy',Path('results/hgb'))
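The full search above was run offline; a minimal runnable sketch of the same pattern, with an assumed two-point grid, looks like this:
# Minimal sketch of the tuning pattern with an assumed, deliberately tiny grid.
sketch_pipe = im.pipeline.Pipeline([
    ('smote', oversampler),
    ('rus', undersampler),
    ('hgb', HistGradientBoostingClassifier(random_state=9876))])
sketch_grid = {'hgb__max_depth': [2,3]}
gsc = GridSearchCV(sketch_pipe,sketch_grid,scoring='average_precision',
                   cv=3,return_train_score=True).fit(tuning_x,tuning_y)
# A small gap between mean_train_score and mean_test_score indicates less overfitting.
cols = ['params','mean_test_score','mean_train_score']
display(pd.DataFrame(gsc.cv_results_).sort_values(by='rank_test_score')[cols])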
pipe_hgb = im.pipeline.make_pipeline(oversampler,undersampler,HistGradientBoostingClassifier())
pipe_hgb.set_params(
    smote__sampling_strategy = 0.025,
    randomundersampler__sampling_strategy = 0.05,
    histgradientboostingclassifier__max_bins = 50,
    histgradientboostingclassifier__scoring = 'average_precision',
    histgradientboostingclassifier__max_depth = 2,
    histgradientboostingclassifier__learning_rate = 0.1,
    histgradientboostingclassifier__max_features = 0.4)
summary(test_model(pipe_hgb,train_x,train_y,scoring=chosen_scoring))
fit_time                    0.806242
score_time                  0.055328
test_average precision      0.797309
train_average precision     0.829837
test_f2                     0.798474
train_f2                    0.818594
dtype: float64
Random Forest Classifier¶
# rf = im.pipeline.Pipeline([
# ('smote',oversampler),
# ('rus', undersampler),
# ('rf',RandomForestClassifier(n_estimators=10))
# ])
# summary(test_model(rf,train_x,train_y,scoring=chosen_scoring))
# summary(test_model(rf,tuning_x,tuning_y,scoring=chosen_scoring))
# param_dist_rf = [{
# 'smote__sampling_strategy' : [samp_strat / 2],
# 'rus__sampling_strategy' : [samp_strat],
# 'rf__n_estimators': [100],
# 'rf__criterion': ['entropy'],
# 'rf__max_samples': np.arange(0.4,0.7,0.1),
# 'rf__bootstrap': [True],
# 'rf__max_leaf_nodes': [50,100,150],
# 'rf__max_features': [0.4,0.5,0.6],
# # 'rf__min_samples_leaf': np.linspace(1,250,9,dtype=int),
# # 'rf__min_samples_split': np.geomspace(2,10000,6,dtype=int),
# 'rf__max_depth' : [5,6,7]
# } for samp_strat in [0.02,0.05,0.1]]
# param_dist_rf
# gsc_rf = GridSearchCV(rf,param_dist_rf,scoring='average_precision',cv=5,return_train_score=True,verbose=1).fit(tuning_x,tuning_y)
# gsc_results_rf = pd.DataFrame(gsc_rf.cv_results_)
# with pd.option_context('display.max_colwidth',None,'display.max_rows',None):
# display(
# gsc_results_rf.sort_values(by='rank_test_score')
# .query('param_rf__max_depth in [5,6,7]')[['params','mean_test_score','std_test_score','mean_train_score','std_train_score']]
# )
# save_results(gsc_results_rf,'maxdepth,maxfeatures,maxsamples,maxleafnodes,criterion,estimators,samplingstrategy',Path('results/rf'))
pipe_rf = im.pipeline.make_pipeline(oversampler,undersampler,RandomForestClassifier())
pipe_rf.set_params(
    randomforestclassifier__n_estimators = 100,
    randomforestclassifier__max_depth = 5,
    randomforestclassifier__criterion = 'entropy',
    randomforestclassifier__bootstrap = True,
    randomforestclassifier__max_samples = 0.5,
    randomforestclassifier__max_features = 0.5,
    randomforestclassifier__max_leaf_nodes = 100,
    smote__sampling_strategy = 0.01,
    randomundersampler__sampling_strategy = 0.02
)
summary(test_model(pipe_rf,train_x,train_y,scoring=chosen_scoring))
fit_time                   36.088854
score_time                  0.163479
test_average precision      0.837601
train_average precision     0.873818
test_f2                     0.831567
train_f2                    0.847607
dtype: float64
Test set evaluation¶
test_x,test_y = get_xy(test,target,exclude_cols)
Histogram-based Gradient Boosting provides better results on the test set.
Histogram Based Gradient Boosting Classifier¶
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.025,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.05,random_state=9876)
tuned_hgb = HistGradientBoostingClassifier(max_bins = 50,
scoring = 'average_precision',
max_depth = 2,
learning_rate = 0.1,
max_features = 0.4,
random_state=9876)
tuned_pipeline_hgb = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_hgb)
tuned_pipeline_hgb.fit(train_x,train_y)
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.025)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.05)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(max_bins=50, max_depth=2,
                                                max_features=0.4, random_state=9876,
                                                scoring='average_precision'))])
pred_y = tuned_pipeline_hgb.predict(test_x)
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()
amount_save_ratio = amount_fraud_identified / amount_fraud_total
num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()
num_save_ratio = num_fraud_identified / num_fraud_total
print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio*100:.2f}% (€{amount_fraud_identified:.2f}) of the value previously lost to fraud.')
84.69% of fraudulent transactions were correctly identified, saving 70.46% (€8593.05) of the value previously lost to fraud.
pred_y_train = tuned_pipeline_hgb.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);
Random Forest Classifier¶
tuned_oversampler = im.over_sampling.SMOTE(sampling_strategy = 0.01,random_state=9876)
tuned_undersampler = im.under_sampling.RandomUnderSampler(sampling_strategy=0.02,random_state=9876)
tuned_rf = RandomForestClassifier(n_estimators = 100,
max_depth = 5,
criterion = 'entropy',
bootstrap = True,
max_samples = 0.5,
max_features = 0.5,
max_leaf_nodes = 100,
random_state=9876)
tuned_pipeline_rf = im.pipeline.make_pipeline(tuned_oversampler,tuned_undersampler,tuned_rf)
tuned_pipeline_rf.fit(train_x,train_y)
Pipeline(steps=[('smote', SMOTE(random_state=9876, sampling_strategy=0.01)),
                ('randomundersampler',
                 RandomUnderSampler(random_state=9876, sampling_strategy=0.02)),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=5,
                                        max_features=0.5, max_leaf_nodes=100,
                                        max_samples=0.5, random_state=9876))])
pred_y = tuned_pipeline_rf.predict(test_x)
ConfusionMatrixDisplay.from_predictions(test_y,pred_y);
amount_fraud_total = test_x[test_y == 1]['Amount'].sum()
amount_fraud_identified = test_x[(pred_y==1) & (test_y == 1)]['Amount'].sum()
amount_save_ratio = amount_fraud_identified / amount_fraud_total
num_fraud_total = (test_y == 1).sum()
num_fraud_identified = ((test_y == 1) & (pred_y == 1)).sum()
num_save_ratio = num_fraud_identified / num_fraud_total
print(f'{num_save_ratio*100:.2f}% of fraudulent transactions were correctly identified, saving {amount_save_ratio*100:.2f}% (€{amount_fraud_identified:.2f}) of the value previously lost to fraud.')
80.61% of fraudulent transactions were correctly identified, saving 67.74% (€8261.48) of the value previously lost to fraud.
pred_y_train = tuned_pipeline_rf.predict(train_x)
ConfusionMatrixDisplay.from_predictions(train_y,pred_y_train);