https://tianchi.aliyun.com/competition/entrance/531830/information
# 导入标准库 import io, os, sys, types, time, datetime, math, random, requests, subprocess,io, tempfile, math # 导入第三方库 # 数据处理 import numpy as np import pandas as pd # 数据可视化 import matplotlib.pyplot as plt import missingno import seaborn as sns # from pandas.tools.plotting import scatter_matrix # No module named 'pandas.tools' from mpl_toolkits.mplot3d import Axes3D # plt.style.use('seaborn') # 改变图像风格 plt.rcParams['font.family'] = ['Arial Unicode MS', 'Microsoft Yahei', 'SimHei', 'sans-serif'] # 解决中文乱码 plt.rcParams['axes.unicode_minus'] = False # simhei黑体字 负号乱码 解决 # 特征选择和编码 from sklearn.feature_selection import RFE, RFECV from sklearn.svm import SVR from sklearn.decomposition import PCA from sklearn import preprocessing from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize # Imputer # from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute # 机器学习 import sklearn.ensemble as ske from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier from sklearn.tree import DecisionTreeClassifier # 网格搜索、随机搜索 import scipy.stats as st from scipy.stats import randint as sp_randint from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import train_test_split # 模型度量(分类) from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc # 警告处理 import warnings warnings.filterwarnings('ignore') # 在Jupyter上画图 %matplotlib inline # 数据预处理 import numpy as np import scipy as sc import sklearn as sk import matplotlib.pyplot as plt # 绘图工具包 import seaborn as sns import pyecharts.options as opts from pyecharts.charts import Line, Grid
赛题以预测用户贷款是否违约为任务,数据集报名后可见并可下载。该数据来自某信贷平台的贷款记录,总数据量超过120万条,包含47列变量信息,其中15列为匿名变量。为了保证比赛的公平性,将会从中抽取80万条作为训练集,20万条作为测试集A,20万条作为测试集B,同时会对employmentTitle、purpose、postCode和title等信息进行脱敏。
# Dataset locations: file names plus the directory that holds them.
train_path = 'train.csv'
test_path = 'testA.csv'
dataset_path = './'

# os.path.join inserts a separator only when one is needed, so dataset_path
# works with or without a trailing slash (plain `+` silently produced a wrong
# path such as 'datatrain.csv' when the slash was missing).
data_train_path = os.path.join(dataset_path, train_path)
data_test_path = os.path.join(dataset_path, test_path)

# 2. Read the training and test CSV files into DataFrames.
train = pd.read_csv(data_train_path)
test = pd.read_csv(data_test_path)
# Print the per-column dtype / non-null summary, training set first, then test.
for frame in (train, test):
    frame.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 800000 entries, 0 to 799999 Data columns (total 47 columns): id 800000 non-null int64 loanAmnt 800000 non-null float64 term 800000 non-null int64 interestRate 800000 non-null float64 installment 800000 non-null float64 grade 800000 non-null object subGrade 800000 non-null object employmentTitle 799999 non-null float64 employmentLength 753201 non-null object homeOwnership 800000 non-null int64 annualIncome 800000 non-null float64 verificationStatus 800000 non-null int64 issueDate 800000 non-null object isDefault 800000 non-null int64 purpose 800000 non-null int64 postCode 799999 non-null float64 regionCode 800000 non-null int64 dti 799761 non-null float64 delinquency_2years 800000 non-null float64 ficoRangeLow 800000 non-null float64 ficoRangeHigh 800000 non-null float64 openAcc 800000 non-null float64 pubRec 800000 non-null float64 pubRecBankruptcies 799595 non-null float64 revolBal 800000 non-null float64 revolUtil 799469 non-null float64 totalAcc 800000 non-null float64 initialListStatus 800000 non-null int64 applicationType 800000 non-null int64 earliesCreditLine 800000 non-null object title 799999 non-null float64 policyCode 800000 non-null float64 n0 759730 non-null float64 n1 759730 non-null float64 n2 759730 non-null float64 n3 759730 non-null float64 n4 766761 non-null float64 n5 759730 non-null float64 n6 759730 non-null float64 n7 759730 non-null float64 n8 759729 non-null float64 n9 759730 non-null float64 n10 766761 non-null float64 n11 730248 non-null float64 n12 759730 non-null float64 n13 759730 non-null float64 n14 759730 non-null float64 dtypes: float64(33), int64(9), object(5) memory usage: 286.9+ MB <class 'pandas.core.frame.DataFrame'> RangeIndex: 200000 entries, 0 to 199999 Data columns (total 46 columns): id 200000 non-null int64 loanAmnt 200000 non-null float64 term 200000 non-null int64 interestRate 200000 non-null float64 installment 200000 non-null float64 grade 200000 non-null object 
subGrade 200000 non-null object employmentTitle 200000 non-null float64 employmentLength 188258 non-null object homeOwnership 200000 non-null int64 annualIncome 200000 non-null float64 verificationStatus 200000 non-null int64 issueDate 200000 non-null object purpose 200000 non-null int64 postCode 200000 non-null float64 regionCode 200000 non-null int64 dti 199939 non-null float64 delinquency_2years 200000 non-null float64 ficoRangeLow 200000 non-null float64 ficoRangeHigh 200000 non-null float64 openAcc 200000 non-null float64 pubRec 200000 non-null float64 pubRecBankruptcies 199884 non-null float64 revolBal 200000 non-null float64 revolUtil 199873 non-null float64 totalAcc 200000 non-null float64 initialListStatus 200000 non-null int64 applicationType 200000 non-null int64 earliesCreditLine 200000 non-null object title 200000 non-null float64 policyCode 200000 non-null float64 n0 189889 non-null float64 n1 189889 non-null float64 n2 189889 non-null float64 n3 189889 non-null float64 n4 191606 non-null float64 n5 189889 non-null float64 n6 189889 non-null float64 n7 189889 non-null float64 n8 189889 non-null float64 n9 189889 non-null float64 n10 191606 non-null float64 n11 182425 non-null float64 n12 189889 non-null float64 n13 189889 non-null float64 n14 189889 non-null float64 dtypes: float64(33), int64(8), object(5) memory usage: 70.2+ MB
# Dataset dimensions as ((train_rows, train_cols), (test_rows, test_cols)).
tuple(frame.shape for frame in (train, test))
((800000, 47), (200000, 46))
# Preview the first 20 training rows for column positions 13-25
# (isDefault through revolUtil). Use the slice `:13` instead to look at the
# leading columns.
train.iloc[:20, 13:26]
| | isDefault | purpose | postCode | regionCode | dti | delinquency_2years | ficoRangeLow | ficoRangeHigh | openAcc | pubRec | pubRecBankruptcies | revolBal | revolUtil |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 137.0 | 32 | 17.05 | 0.0 | 730.0 | 734.0 | 7.0 | 0.0 | 0.0 | 24178.0 | 48.9 |
1 | 0 | 0 | 156.0 | 18 | 27.83 | 0.0 | 700.0 | 704.0 | 13.0 | 0.0 | 0.0 | 15096.0 | 38.9 |
2 | 0 | 0 | 337.0 | 14 | 22.77 | 0.0 | 675.0 | 679.0 | 11.0 | 0.0 | 0.0 | 4606.0 | 51.8 |
3 | 0 | 4 | 148.0 | 11 | 17.21 | 0.0 | 685.0 | 689.0 | 9.0 | 0.0 | 0.0 | 9948.0 | 52.6 |
4 | 0 | 10 | 301.0 | 21 | 32.16 | 0.0 | 690.0 | 694.0 | 12.0 | 0.0 | 0.0 | 2942.0 | 32.0 |
5 | 0 | 9 | 512.0 | 21 | 17.14 | 0.0 | 730.0 | 734.0 | 19.0 | 0.0 | 0.0 | 4047.0 | 31.1 |
6 | 0 | 0 | 517.0 | 14 | 17.49 | 0.0 | 755.0 | 759.0 | 12.0 | 0.0 | 0.0 | 3111.0 | 8.5 |
7 | 0 | 0 | 100.0 | 4 | 32.60 | 0.0 | 665.0 | 669.0 | 8.0 | 1.0 | 1.0 | 14021.0 | 59.7 |
8 | 1 | 0 | 792.0 | 13 | 19.22 | 0.0 | 690.0 | 694.0 | 15.0 | 0.0 | 0.0 | 27176.0 | 46.0 |
9 | 0 | 0 | 59.0 | 11 | 24.39 | 0.0 | 725.0 | 729.0 | 7.0 | 0.0 | 0.0 | 2936.0 | 30.6 |
10 | 0 | 4 | 134.0 | 8 | 14.21 | 0.0 | 665.0 | 669.0 | 13.0 | 0.0 | 0.0 | 8653.0 | 47.5 |
11 | 0 | 0 | 893.0 | 49 | 34.63 | 0.0 | 710.0 | 714.0 | 10.0 | 0.0 | 0.0 | 16343.0 | 80.9 |
12 | 0 | 0 | 195.0 | 38 | 7.58 | 0.0 | 680.0 | 684.0 | 12.0 | 0.0 | 0.0 | 18866.0 | 35.7 |
13 | 0 | 2 | 134.0 | 8 | 5.68 | 0.0 | 690.0 | 694.0 | 7.0 | 0.0 | 0.0 | 4334.0 | 68.8 |
14 | 0 | 4 | 167.0 | 8 | 38.95 | 0.0 | 710.0 | 714.0 | 9.0 | 0.0 | 0.0 | 19023.0 | 60.8 |
15 | 0 | 2 | 194.0 | 38 | 17.27 | 0.0 | 660.0 | 664.0 | 16.0 | 1.0 | 1.0 | 220.0 | 3.6 |
16 | 0 | 2 | 492.0 | 36 | 21.02 | 0.0 | 705.0 | 709.0 | 16.0 | 0.0 | 0.0 | 36609.0 | 61.1 |
17 | 1 | 4 | 56.0 | 8 | 17.14 | 0.0 | 695.0 | 699.0 | 5.0 | 0.0 | 0.0 | 5463.0 | 76.9 |
18 | 1 | 3 | 140.0 | 8 | 28.95 | 3.0 | 660.0 | 664.0 | 6.0 | 0.0 | 0.0 | 6804.0 | 84.0 |
19 | 0 | 0 | 305.0 | 15 | 15.55 | 0.0 | 700.0 | 704.0 | 10.0 | 0.0 | 0.0 | 22859.0 | 57.0 |
# Visualise how complete each training-set column is with missingno's bar chart.
missingno.bar(df=train)
<matplotlib.axes._subplots.AxesSubplot at 0x20f802ad588>
# Frequency of each distinct value in the employmentLength column.
train.loc[:, 'employmentLength'].value_counts()
10+ years 262753 2 years 72358 < 1 year 64237 3 years 64152 1 year 52489 5 years 50102 4 years 47985 6 years 37254 8 years 36192 7 years 35407 9 years 30272 Name: employmentLength, dtype: int64
提交结果为每个测试样本是1的概率,也就是y为1的概率。评价方法为AUC评估模型效果(越大越好)。
分类常用的评估指标是:
本次学习赛使用的评估指标是AUC。
提交前请确保预测结果的格式与sample_submit.csv中的格式一致,以及提交文件后缀名为csv。