# Show what each field of the train/test tables means, as described on the
# 'train' sheet of the data dictionary workbook.
df = pd.read_excel(
    datapath + 'Data_Dictionary.xlsx',
    sheet_name='train',
    skiprows=2,  # skip the first two rows of the sheet (presumably a title block)
)

# Each row is printed as "name, description"; this assumes the first two
# columns hold the field name and its description.
for row in df.values:
    column_name, description = row[0], row[1]
    print('Column:', column_name, 'Description:', description)
1 2 3 4
# Show what each field of history_transactions.csv means, as described on the
# 'history' sheet of the data dictionary workbook.
df2 = pd.read_excel(
    datapath + 'Data_Dictionary.xlsx',
    sheet_name='history',
    skiprows=2,  # skip the first two rows of the sheet (presumably a title block)
)

# Iterate over df2 (the 'history' sheet just loaded). The original looped over
# `df`, which re-printed the 'train' sheet instead of the history fields.
# Assumes the first two columns hold the field name and its description.
for va in df2.values:
    print('Column:', va[0], 'Description:', va[1])
## 数据分布差异
1 2 3 4 5 6 7 8 9 10 11 12
# Compare train and test feature by feature: for each feature, draw the
# per-value occurrence counts of both tables on the same figure.
# Assumes `train` and `test` are pandas DataFrames loaded earlier.
features = ['first_active_month', 'feature_1', 'feature_2', 'feature_3']

# Row totals of each table; not used by the count plots below.
train_count, test_count = train.shape[0], test.shape[0]

for feature in features:
    # Plot train first, then test, so the curves match the legend order.
    for frame in (train, test):
        counts = frame[feature].value_counts()
        counts.sort_index().plot()
    plt.xlabel(feature)
    plt.legend(['train', 'test'])
    plt.ylabel('count')
    plt.show()

# Author's conclusion: the absolute count distributions of train and test have
# very similar shapes for every single variable; the relative (ratio)
# distributions still need to be checked.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Compare train and test feature by feature using relative frequencies: each
# value's occurrence count is divided by the total row count of its table, so
# the two curves are on the same scale regardless of table size.
# Assumes `train` and `test` are pandas DataFrames loaded earlier.
features = ['first_active_month', 'feature_1', 'feature_2', 'feature_3']
train_count, test_count = train.shape[0], test.shape[0]

for feature in features:
    # Plot train first, then test, so the curves match the legend order.
    for frame, row_total in ((train, train_count), (test, test_count)):
        ratio = frame[feature].value_counts().sort_index() / row_total
        ratio.plot()
    plt.legend(['train', 'test'])
    plt.xlabel(feature)
    plt.ylabel('ratio')
    plt.show()