Transaction Fraud Detection with LightGBM
1. Data Preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings("ignore")
%%time
# Load the training and test datasets
train_transaction = pd.read_csv("train_transaction.csv")
print("train_transaction shape : ", train_transaction.shape)
train_identity = pd.read_csv("train_identity.csv")
print("train_identity shape : ", train_identity.shape)
test_transaction = pd.read_csv("test_transaction.csv")
print("test_transaction shape : ", test_transaction.shape)
test_identity = pd.read_csv("test_identity.csv")
print("test_identity shape : ", test_identity.shape)
train_transaction shape : (590540, 394)
train_identity shape : (144233, 41)
test_transaction shape : (506691, 393)
test_identity shape : (141907, 41)
Wall time: 21.5 s
# Show the first 5 rows (default)
train_transaction.head()
| | TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 | card3 | card4 | card5 | ... | V330 | V331 | V332 | V333 | V334 | V335 | V336 | V337 | V338 | V339 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2987000 | 0 | 86400 | 68.5 | W | 13926 | NaN | 150.0 | discover | 142.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2987001 | 0 | 86401 | 29.0 | W | 2755 | 404.0 | 150.0 | mastercard | 102.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2987002 | 0 | 86469 | 59.0 | W | 4663 | 490.0 | 150.0 | visa | 166.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2987003 | 0 | 86499 | 50.0 | W | 18132 | 567.0 | 150.0 | mastercard | 117.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2987004 | 0 | 86506 | 50.0 | H | 4497 | 514.0 | 150.0 | mastercard | 102.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 394 columns
# Inspect the DataFrame info
train_transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB
# Count NaNs per column
train_transaction.isnull().sum()
TransactionID 0
isFraud 0
TransactionDT 0
TransactionAmt 0
ProductCD 0
...
V335 508189
V336 508189
V337 508189
V338 508189
V339 508189
Length: 394, dtype: int64
Inspect the data types, summary info, and null counts of train_identity, test_transaction, and test_identity in the same way, e.g. with a loop like the sketch below.
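A minimal sketch of that inspection (not in the original notebook; the DataFrame names follow the loading cell above):
for name, df in [("train_identity", train_identity),
                 ("test_transaction", test_transaction),
                 ("test_identity", test_identity)]:
    print(name)
    df.info()                 # dtypes and memory usage
    print(df.isnull().sum())  # NaN count per column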
Next, merge train_transaction with train_identity, and test_transaction with test_identity, on TransactionID.
# From the shapes above, not every transaction has an associated identity record.
# Compute the share that does:
train_count = np.sum(train_transaction['TransactionID'].isin(train_identity['TransactionID']))
print("train_transaction rows with a matching train_identity TransactionID : ", train_count)
train_ratio = train_count / len(train_transaction)
print("Share of train_transaction rows with identity data : {:.2f}%".format(train_ratio * 100))
train_transaction rows with a matching train_identity TransactionID :  144233
Share of train_transaction rows with identity data : 24.42%
test_count = np.sum(test_transaction['TransactionID'].isin(test_identity['TransactionID']))
print("test_transaction rows with a matching test_identity TransactionID : ", test_count)
test_ratio = test_count / len(test_transaction)
print("Share of test_transaction rows with identity data : {:.2f}%".format(test_ratio * 100))
test_transaction rows with a matching test_identity TransactionID :  141907
Share of test_transaction rows with identity data : 28.01%
# Merge (left joins keep every transaction, with or without identity data)
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
train.shape:(590540, 434)
test.shape:(506691, 433)
The training data has one more column than the test data: the isFraud label.
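A quick sanity check (a small addition, not in the original notebook) confirms the label is the only difference:
print(set(train.columns) - set(test.columns))  # expected output: {'isFraud'}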
2. Exploratory Data Analysis
Class distribution (normal vs. fraud) in the training data:
sns.countplot(x='isFraud', data=train)
plt.title("Normal VS Fraud")
plt.show()
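The plot shows a strong class imbalance; the exact rate can also be printed directly (a small supplementary check; frauds make up roughly 3.5% of this dataset):
print(train['isFraud'].value_counts(normalize=True))  # fraud rate ≈ 0.035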
Distribution of transaction amounts:
train['TransactionAmt'].apply(np.log).plot(kind='hist', bins=100, figsize=(15, 5), title='Distribution of Transaction Amount')
Transaction amount distribution, normal vs. fraud
To make the comparison easier to read, the raw and log-transformed amounts are plotted separately for each class:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 6))  # 2x2 grid of axes
train.loc[train['isFraud'] == 1]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                      bins=100,
                                                                      title='Log Transaction Amount isFraud = 1',
                                                                      ax=ax1)
train.loc[train['isFraud'] == 1]['TransactionAmt'].plot(kind='hist',
                                                        bins=100,
                                                        title='Transaction Amount isFraud = 1',
                                                        ax=ax2)
train.loc[train['isFraud'] == 0]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                      bins=100,
                                                                      title='Log Transaction Amount isFraud = 0',
                                                                      ax=ax3)
train.loc[train['isFraud'] == 0]['TransactionAmt'].plot(kind='hist',
                                                        bins=100,
                                                        title='Transaction Amount isFraud = 0',
                                                        ax=ax4)
plt.show()
Analyzing ProductCD
train.groupby('ProductCD')['TransactionID'].count().plot(kind='barh',
                                                          figsize=(15, 6),
                                                          title='ProductCD TransactionID')
plt.show()
train.groupby('ProductCD')['isFraud'].mean().plot(kind='barh',
                                                   figsize=(15, 6),
                                                   title='ProductCD isFraud')
plt.show()
Encoding categorical variables with LabelEncoder
LabelEncoder maps discrete values to integers from 0 to n − 1, where n is the number of distinct values taken by the feature.
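A minimal illustration on toy data (the values are made up, not taken from the dataset):
le = LabelEncoder()
le.fit(["visa", "mastercard", "discover", "visa"])
print(list(le.classes_))                   # ['discover', 'mastercard', 'visa'] (sorted)
print(le.transform(["visa", "discover"]))  # [2 0]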
# Print the columns of object dtype
for col in train.columns:
    if train[col].dtype == "object":
        print(col)
ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
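The loop above is fine for a quick look; an equivalent one-liner (a minor idiomatic alternative, not from the original notebook) uses pandas' select_dtypes:
object_cols = train.select_dtypes(include='object').columns.tolist()  # same list as printed above
print(object_cols)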
# Fraud rate by DeviceType
train.groupby("DeviceType")["isFraud"].mean().plot(kind='barh',
                                                   figsize=(15, 5),
                                                   title="Fraud Rate by DeviceType")
plt.show()
Count the NaNs in each column; drop any column where more than 70% of the values are missing.
def clean_nan(df):
    temp_columns = []
    for col in df.columns:
        # NaN count for this column
        counter = df[col].isnull().sum()
        # share of NaNs in the column
        ratio = counter / len(df[col])
        if ratio > 0.7:
            # mark the column for removal
            temp_columns.append(col)
    # drop the marked columns
    new_df = df.drop(temp_columns, axis=1)
    return new_df
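As a side note, the same filter can be written without the explicit loop; a minimal vectorized sketch (clean_nan_vectorized is a hypothetical helper, equivalent to clean_nan above):
def clean_nan_vectorized(df, threshold=0.7):
    # keep only the columns whose NaN share does not exceed the threshold
    return df.loc[:, df.isnull().mean() <= threshold]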
print("原始的train shape : ", train.shape)
train = clean_nan(train)
print("清洗后的train shape : ", train.shape)
原始的train shape : (590540, 434)
清洗后的train shape : (590540, 226)
训练数据清洗掉208个特征
print("原始的test shape : ", test.shape)
test = clean_nan(test)
print("清洗后的test shape : ", test.shape)
原始的test shape : (506691, 433)
清洗后的test shape : (506691, 225)
测试数据清洗掉208个特征
Drop columns in which a single value accounts for more than 90% of the rows:
def clean_top_cols(df):
    return [col for col in df.columns
            if df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
train_cols = clean_top_cols(train)
test_cols = clean_top_cols(test)
cols_to_drop = list(set(train_cols + test_cols))  # deduplicate via a set
'isFraud' in cols_to_drop  # True: ~96.5% of labels are 0, so the label itself matched the filter
cols_to_drop.remove('isFraud')  # keep the label column
# Drop these columns from train
print("Original train shape : ", train.shape)
train = train.drop(cols_to_drop, axis=1)
print("Cleaned train shape : ", train.shape)
Original train shape :  (590540, 226)
Cleaned train shape :  (590540, 156)
# Drop these columns from test
print("Original test shape : ", test.shape)
test = test.drop(cols_to_drop, axis=1)
print("Cleaned test shape : ", test.shape)
Original test shape :  (506691, 225)
Cleaned test shape :  (506691, 155)
# LabelEncoder over every object column, fit on the union of train and test values
# (tqdm_notebook adds a progress bar)
for col in tqdm_notebook(train.columns):
    if train[col].dtype == "object":
        encoder = LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))
train = train.reset_index(drop=True)  # reset the index (equivalent to reset_index() plus deleting the 'index' column)
test = test.reset_index(drop=True)
train.shape:(590540, 156)
test.shape:(506691, 155)
# Separate features and label, sorting by TransactionDT so the unshuffled KFold splits below follow the time order of the transactions
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
X.shape:(590540, 153)
y.shape:(590540,)
# Test set: drop the columns that are not features
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)
test = test[['TransactionDT', 'TransactionID']]  # keep only the IDs for later use
X_test.shape:(506691, 153)
test.shape:(506691, 2)
3. Modeling with LightGBM
Hyperparameter settings (params):
params = {'num_leaves': 491,          # number of leaves per tree
          'min_data_in_leaf': 106,    # minimum samples per leaf
          'objective': 'binary',      # binary classification task
          'max_depth': -1,            # -1: no depth limit
          "boosting_type": "gbdt",    # alternatives: 'dart', 'goss', 'rf'
          "metric": 'auc',            # evaluation metric
          "verbosity": -1,            # silence log output
          'random_state': 66,         # random seed
          }
Create a DataFrame to store per-fold feature importances:
feature_importances = pd.DataFrame(index=None)
feature_importances['features'] = X.columns
feature_importances
| | features |
|---|---|
| 0 | TransactionAmt |
| 1 | ProductCD |
| 2 | card1 |
| 3 | card2 |
| 4 | card3 |
| ... | ... |
| 148 | V312 |
| 149 | V313 |
| 150 | V314 |
| 151 | V315 |
| 152 | V317 |

153 rows × 1 columns
5-fold cross-validation
folds = KFold(n_splits=5)  # no shuffle: X is sorted by TransactionDT, so each fold is a contiguous time block
splits = folds.split(X, y)  # generator of (train indices, validation indices) pairs
next(iter(splits))  # peek at the first pair of index arrays
(array([118108, 118109, 118110, ..., 590537, 590538, 590539]),
 array([     0,      1,      2, ..., 118105, 118106, 118107]))
Note that splits is a generator, so next(iter(splits)) consumes the first fold. The training loop below therefore iterates over the remaining four folds, which is why only fold_1 through fold_4 appear in the feature-importance table later and the average is taken over n_splits - 1 folds.
best_auc = 0
best_model = None
for k, (train_indices, val_indices) in enumerate(splits):
    print("Fold %d\n" % k)
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices]  # training / validation features
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]            # training / validation labels
    # print("X_train_data shape : \t", X_train_data.shape, "X_val_data shape : \t", X_val_data.shape)
    train_dataset = lgb.Dataset(X_train_data, label=y_train)  # training set
    val_dataset = lgb.Dataset(X_val_data, label=y_val)        # validation set
    lgb_model = lgb.train(params=params,               # hyperparameter settings
                          train_set=train_dataset,     # training data
                          num_boost_round=10000,       # maximum number of boosting rounds
                          valid_sets=val_dataset,      # validation data
                          valid_names='validation',    # name of the validation set
                          early_stopping_rounds=200)   # stop after 200 rounds without improvement (LightGBM < 4.0 API; newer versions use callbacks=[lgb.early_stopping(200)])
    # record this fold's feature importances
    feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()
    # predict on the validation fold
    y_val_pred = lgb_model.predict(X_val_data)
    # compute ROC AUC on the validation fold
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f" Fold {k + 1} | AUC_ROC : {roc_auc * 100}%")
    # keep the model with the best validation AUC
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model
feature_importances['average'] = feature_importances[[f'fold_{k+1}' for k in range(folds.n_splits-1)]].mean(axis=1)  # average over the 4 recorded folds (see the note above)
feature_importances.head()
| | features | fold_1 | fold_2 | fold_3 | fold_4 | average |
|---|---|---|---|---|---|---|
| 0 | TransactionAmt | 10758 | 2337 | 2571 | 3088 | 4688.5 |
| 1 | ProductCD | 478 | 148 | 189 | 179 | 248.5 |
| 2 | card1 | 10241 | 2791 | 3168 | 3450 | 4912.5 |
| 3 | card2 | 8222 | 2517 | 2606 | 3039 | 4096.0 |
| 4 | card3 | 559 | 214 | 263 | 310 | 336.5 |
# Visualize the 50 most important features
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50),
            x='average',
            y='features')
plt.title("Top 50 feature importances averaged over {} folds".format(folds.n_splits - 1))
print("The best roc_auc : ", roc_auc)
The best roc_auc : 0.9187823659441293
joblib.dump(best_model, "best_model.pkl")  # persist the best model
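The persisted model can be restored in a later session with joblib.load, e.g.:
restored_model = joblib.load("best_model.pkl")  # returns the trained Booster
# restored_model.predict(X_test) then reproduces the predictions below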
Predicting on the test set
y_test_pred = best_model.predict(X_test)
labels = np.round(y_test_pred)  # np.round(): threshold the probabilities at 0.5
from collections import Counter
Counter(labels)
Counter({0.0: 498254, 1.0: 8437})
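These CSVs match the Kaggle IEEE-CIS Fraud Detection competition, whose submissions pair each TransactionID with a fraud probability. A sketch of building one from the `test` frame kept earlier (the output file name is an assumption):
submission = pd.DataFrame({
    'TransactionID': test['TransactionID'],
    'isFraud': y_test_pred,  # submit probabilities, not rounded labels
})
submission.to_csv("submission.csv", index=False)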
Source: https://www.cnblogs.com/wkfvawl/p/16629316.html