{"id":326605,"date":"2021-07-18T09:00:17","date_gmt":"2021-07-18T09:00:17","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=326605"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=326605","title":{"rendered":"\u041a\u0430\u043a \u043f\u043e\u0431\u0435\u0434\u0438\u0442\u044c \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u043e\u0441\u0442\u044c \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430: \u043c\u0435\u0442\u043e\u0434 upsampling\u00a0data"},"content":{"rendered":"\n<div class=\"post__text post__text_v2\" id=\"post-content-body\">\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/e9b\/c27\/2e6\/e9bc272e621c74e5181067153987edcb.jpg\" alt=\"Upsampling data\" title=\"Upsampling data\" width=\"600\" height=\"343\"><figcaption>Upsampling data<\/figcaption><\/figure>\n<p>\u0414\u0430\u043d\u043d\u0430\u044f \u0441\u0442\u0430\u0442\u044c\u044f \u0440\u0430\u0441\u0441\u0447\u0438\u0442\u0430\u043d\u0430 \u0434\u043b\u044f \u043d\u043e\u0432\u0438\u0447\u043a\u043e\u0432 \u0432 \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u043c \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0438. 
\u0418\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u044e\u0442\u0441\u044f \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0435 \u0438\u043d\u0442\u0441\u0442\u0440\u0443\u043c\u0435\u043d\u0442\u044b:<\/p>\n<ul>\n<li>\n<p>Python<\/p>\n<\/li>\n<li>\n<p>Random forest classifier<\/p>\n<\/li>\n<li>\n<p>Google Colab<\/p>\n<\/li>\n<li>\n<p>Upsampling data<\/p>\n<\/li>\n<\/ul>\n<p>\u041a\u0430\u0436\u0434\u044b\u0439 \u0434\u0430\u0442\u0430 \u0441\u0430\u0435\u043d\u0442\u0438\u0441\u0442 \u0445\u043e\u0442\u044c \u0440\u0430\u0437 \u0441\u0442\u0430\u043b\u043a\u0438\u0432\u0430\u043b\u0441\u044f \u0441 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u043e\u0439 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u043e\u0441\u0442\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 \u0434\u043b\u044f \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438: \u043a\u0430\u043a\u043e\u0439-\u0442\u043e \u043a\u043b\u0430\u0441\u0441 \u043f\u0440\u0435\u0432\u043e\u0441\u0445\u043e\u0434\u0438\u0442 \u0434\u0440\u0443\u0433\u0438\u0435. \u0421\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442 \u0434\u0430\u043b\u0435\u043a\u043e \u043d\u0435 \u043e\u0434\u0438\u043d \u0441\u043f\u043e\u0441\u043e\u0431 \u0431\u043e\u0440\u044c\u0431\u044b \u0441 \u044d\u0442\u043e\u0439 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u043e\u0439. 
\u041d\u0430\u0438\u0431\u043e\u043b\u044c\u0448\u0443\u044e \u0438\u0437\u0432\u0435\u0441\u0442\u043d\u043e\u0441\u0442\u044c \u0438\u043c\u0435\u0435\u0442 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u0435 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440:<\/p>\n<ul>\n<li>\n<p>class_weight, \u043d\u043e \u0435\u0433\u043e \u043c\u043e\u0436\u043d\u043e \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u043f\u0440\u0438 \u043d\u0435\u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u043e\u0441\u0442\u0438: \u0441\u043e\u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 \u0440\u0430\u0437\u043d\u044b\u0445 \u043a\u043b\u0430\u0441\u0441\u043e\u0432, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440, 4:3 (\u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043c\u043e\u0436\u043d\u043e \u043f\u0440\u043e\u0447\u0435\u0441\u0442\u044c \u0442\u0443\u0442: <a href=\"https:\/\/datascience.stackexchange.com\/questions\/11564\/how-does-class-weights-work-in-randomforestclassifier\" rel=\"noopener noreferrer nofollow\">https:\/\/datascience.stackexchange.com\/questions\/11564\/how-does-class-weights-work-in-randomforestclassifier<\/a>)<\/p>\n<\/li>\n<li>\n<p>warm_start, \u043a\u043e\u0442\u043e\u0440\u044b\u0439 \u043f\u043e\u0437\u0432\u043e\u043b\u044f\u0435\u0442 \u0431\u0430\u0442\u0447\u0430\u043c\u0438 (\u0447\u0430\u0441\u0442\u044f\u043c\u0438 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430) \u043e\u0431\u0443\u0447\u0430\u0442\u044c \u043c\u043e\u0434\u0435\u043b\u044c (\u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043c\u043e\u0436\u043d\u043e \u043f\u0440\u043e\u0447\u0435\u0441\u0442\u044c \u0442\u0443\u0442: <a 
href=\"https:\/\/stackoverflow.com\/questions\/42757892\/how-to-use-warm-start\/42763502\" rel=\"noopener noreferrer nofollow\">https:\/\/stackoverflow.com\/questions\/42757892\/how-to-use-warm-start\/42763502<\/a>)<\/p>\n<\/li>\n<\/ul>\n<p>\u041e\u0434\u043d\u0430\u043a\u043e \u0432 \u0434\u0430\u043d\u043d\u043e\u0439 \u0441\u0442\u0430\u0442\u044c\u0435 \u043c\u044b \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043c\u0435\u0442\u043e\u0434, \u043d\u0435 \u0441\u0432\u044f\u0437\u0430\u043d\u043d\u044b\u0439 \u0441 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0430\u043c\u0438 \u043c\u043e\u0434\u0435\u043b\u0438: upsampling&nbsp;data. \u041c\u044b \u0443\u0432\u0435\u043b\u0438\u0447\u0438\u043c \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0438\u0445 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0435\u0449\u0451 \u0434\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043e\u0434\u0435\u043b\u0438: \u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u0443\u0435\u043c n (\u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0430 \u043f\u0440\u0435\u043e\u0431\u043b\u0430\u0434\u0430\u044e\u0449\u0435\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0430 \u043a \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u0443\u044e\u0449\u0435\u043c\u0443) \u0440\u0430\u0437 \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0438\u0439 \u043a\u043b\u0430\u0441\u0441.<\/p>\n<p>\u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0434\u0430\u043d\u043d\u044b\u0445 \u0432\u044b\u0431\u0440\u0430\u043d\u044b \u0434\u0430\u043d\u043d\u044b\u0435 \u0441\u043e\u0440\u0435\u0432\u043d\u043e\u0432\u0430\u043d\u0438\u044f \u043d\u0430 kaggle: <a href=\"https:\/\/www.kaggle.com\/arashnic\/banking-loan-prediction\" rel=\"noopener noreferrer 
nofollow\">https:\/\/www.kaggle.com\/arashnic\/banking-loan-prediction<\/a>. \u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c\u0430 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u0432\u043e\u0437\u044c\u043c\u0451\u043c \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441. \u041d\u0430\u0447\u043d\u0451\u043c!<\/p>\n<p>\u0418\u043c\u043f\u043e\u0440\u0442\u0438\u0440\u0443\u0435\u043c \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u044b\u0435 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438:<\/p>\n<pre><code class=\"python\">import seaborn as sns import pandas as pd import numpy as np import matplotlib.pyplot as plt from numpy import nan from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import precision_recall_curve from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve, auc from sklearn import metrics import copy from tune_sklearn import TuneSearchCV import scipy from ray import tune<\/code><\/pre>\n<p>\u0417\u0430\u0433\u0440\u0443\u0436\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435 (\u0432 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0441\u0440\u0435\u0434\u044b \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u043c\u043d\u043e\u0439 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0441\u044f Google Colab, \u0430 \u0434\u0430\u043d\u043d\u044b\u0435 \u0440\u0430\u0441\u043f\u043e\u043b\u0430\u0433\u0430\u043b\u0438\u0441\u044c \u043d\u0430 Google Drive):<\/p>\n<pre><code class=\"python\">from google.colab import drive 
drive.mount('\/content\/drive') train = pd.read_csv('\/content\/drive\/MyDrive\/\u043f\u043e\u0440\u0442\u0444\u043e\u043b\u0438\u043e\/Project \"Help to increase customer acquisition\"\/train.csv') test = pd.read_csv('\/content\/drive\/MyDrive\/\u043f\u043e\u0440\u0442\u0444\u043e\u043b\u0438\u043e\/Project \"Help to increase customer acquisition\"\/test.csv')<\/code><\/pre>\n<p>\u041f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u0438\u0441\u0445\u043e\u0434\u043d\u044b\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 train (\u0438\u0445 \u043c\u044b \u0431\u0443\u0434\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u0442\u0440\u0435\u043d\u0438\u0440\u043e\u0432\u043a\u0438 \u0438 \u0442\u0435\u0441\u0442\u0430, \u0432 \u0434\u0430\u043d\u043d\u044b\u0445 test \u043e\u0442\u0441\u0443\u0442\u0441\u0442\u0432\u0443\u0435\u0442 \u0446\u0435\u043b\u0435\u0432\u043e\u0439 \u0441\u0442\u043e\u043b\u0431\u0435\u0446)<\/p>\n<pre><code class=\"python\">train<\/code><\/pre>\n<div class=\"table\">\n<table>\n<tbody>\n<tr>\n<th>\n<p align=\"center\">\n<\/th>\n<th data-colwidth=\"50\" 
width=\"50\"><\/th>\n<th>\n<p><strong>Gender<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>DOB<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Lead_Creation_Date<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>City_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>City_Category<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Category1<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Category2<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Monthly_Income<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Customer_Existing_Primary_Bank_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Primary_Bank_Type<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Contacted<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Source<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Source_Category<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Existing_EMI<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Loan_Amount<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Loan_Period<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Interest_Rate<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>EMI<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Var1<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Approved<\/strong><\/p>\n<\/th>\n<\/tr>\n<tr>\n<th>\n<p><strong>0<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPC90493171225<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>23\/07\/79<\/p>\n<\/td>\n<td>\n<p>15\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10001<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0044082<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>2000.0<\/p>\n<\/td>\n<td>\n<p>B001<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>1<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPD40611263344<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>07\/12\/86<\/p>\n<\/td>\n<td>\n<p>04\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10003<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000002<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>3500.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>20000.0<\/p>\n<\/td>\n<td>\n<p>2.0<\/p>\n<\/td>\n<td>\n<p>13.25<\/p>\n<\/td>\n<td>\n<p>953.0<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>2<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPE70289249423<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>10\/12\/82<\/p>\n<\/td>\n<td>\n<p>19\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10125<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0005267<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>2250.0<\/p>\n<\/td>\n<td>\n<p>B003<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S143<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>45000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>3<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPF80273865537<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>30\/01\/89<\/p>\n<\/td>\n<td>\n<p>09\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10477<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0004143<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>3500.0<\/p>\n<\/td>\n<td>\n<p>B003<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S143<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>92000.0<\/p>\n<\/td>\n<td>\n<p>5.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>7<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>4<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPG60994436641<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>19\/04\/85<\/p>\n<\/td>\n<td>\n<p>20\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10002<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0001781<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>10000.0<\/p>\n<\/td>\n<td>\n<p>B001<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S134<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>2500.0<\/p>\n<\/td>\n<td>\n<p>50000.0<\/p>\n<\/td>\n<td>\n<p>2.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>&#8230;<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69708<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPU90955789628<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>31\/07\/83<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10006<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000010<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>4900.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69709<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPV80989824738<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>27\/01\/71<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10116<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0045789<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>7190.1<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>1450.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>7<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69710<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPW50697209842<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>01\/02\/92<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10022<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>COM0013284<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>1600.0<\/p>\n<\/td>\n<td>\n<p>B030<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>24000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>35.50<\/p>\n<\/td>\n<td>\n<p>943.0<\/p>\n<\/td>\n<td>\n<p>2<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69711<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPY50870035036<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>27\/06\/78<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10002<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000098<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>3.0<\/p>\n<\/td>\n<td>\n<p>9893.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>1366.0<\/p>\n<\/td>\n<td>\n<p>80000.0<\/p>\n<\/td>\n<td>\n<p>5.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69712<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPZ60733046119<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>31\/12\/89<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10003<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000056<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>4230.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>69000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>13.99<\/p>\n<\/td>\n<td>\n<p>1885.0<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<p>69713 rows \u00d7 22 columns<\/p>\n<p>\u041a\u0430\u043a \u043c\u043e\u0436\u043d\u043e \u0437\u0430\u043c\u0435\u0442\u0438\u0442\u044c, \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u043e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u043f\u0435\u0440\u0435\u0434 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435\u043c:<\/p>\n<ol>\n<li>\n<p>\u0441\u043e\u0437\u0434\u0430\u0442\u044c \u043d\u043e\u0432\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u043d\u0430 
\u043e\u0441\u043d\u043e\u0432\u0435 \u0441\u0442\u0430\u0440\u044b\u0445: \u0432\u043e\u0437\u0440\u0430\u0441\u0442 \u0432\u043c\u0435\u0441\u0442\u043e \u0434\u0430\u0442\u044b \u0440\u043e\u0436\u0434\u0435\u043d\u0438\u044f, \u0434\u0435\u043d\u044c \u0432 \u0433\u043e\u0434\u0443 \u0432\u043c\u0435\u0441\u0442\u043e \u0434\u0430\u0442\u044b \u0437\u0430\u044f\u0432\u043a\u0438 \u043d\u0430 \u0437\u0430\u0451\u043c (\u0442\u043e\u043b\u044c\u043a\u043e \u0438\u044e\u043b\u044c-\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044c 2016)<\/p>\n<\/li>\n<li>\n<p>\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c nan: \u0437\u0430\u043c\u0435\u043d\u0438\u0442\u044c nan \u043d\u0430 \u043c\u043e\u0434\u044b (\u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438) \u0438 \u043c\u0435\u0434\u0438\u0430\u043d\u044b (\u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438)<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u0442\u044c \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u0432 \u0447\u0438\u0441\u043b\u043e\u0432\u044b\u0435 (\u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441 \u043e\u0431\u0443\u0447\u0430\u0435\u0442\u0441\u044f \u043d\u0430 integer, float, boolean)<\/p>\n<\/li>\n<\/ol>\n<p>\u0414\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u043e\u0431\u043e\u0438\u0445 \u043d\u0430\u0431\u043e\u0440\u043e\u0432 \u0434\u0430\u043d\u043d\u044b\u0445 \u0441\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u0444\u0443\u043d\u043a\u0446\u0438\u044e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438:<\/p>\n<pre><code class=\"python\">def data_preprocessing(df):   # 
\u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043f\u043e\u043b\u044f Gender: Female - 0, Male - 1   df.loc[(df['Gender'] == 'Female'), 'Gender'] = 0   df.loc[(df['Gender'] != 0), 'Gender'] = 1   # \u0434\u043e\u0431\u0430\u0432\u0438\u043c \u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0432\u043e\u0437\u0440\u0430\u0441\u0442   df['DOB_year'] = nan   df.loc[df['DOB'].notnull(), 'DOB_year'] = 121 - df['DOB'].loc[df['DOB'].notnull()].str[-2:].astype(int)   df['DOB_year'] = df['DOB_year'].fillna(df['DOB_year'].median())   # \u0434\u043e\u0431\u0430\u0432\u0438\u043c \u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0434\u043d\u0435\u0439 \u0441 \u043d\u0430\u0447\u0430\u043b\u0430 \u0433\u043e\u0434\u0430 \u043e\u0442 \u0434\u0430\u0442\u044b \u0437\u0430\u0451\u043c\u0430 (\u0432 \u0434\u0430\u043d\u043d\u044b\u0445 \u0438\u044e\u043b\u044c-\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044c 2016)   df['Lead_Creation_Date'] = df['Lead_Creation_Date'].str.replace(r'(..\\\/..\\\/)(..)', r'\\1 20\\2')   df['Lead_Creation_Date'] = pd.to_datetime(df['Lead_Creation_Date'], format=\"%d\/%m\/ %Y\")   df['Lead_Creation_Date_day'] = (df['Lead_Creation_Date']-pd.to_datetime('1\/1\/2016')).astype('timedelta64[h]')\/24    #\u0443\u0434\u0430\u043b\u044f\u0435\u043c \u043f\u0435\u0440\u0432\u044b\u0439 \u0441\u0438\u043c\u0432\u043e\u043b \u0434\u0430\u043d\u043d\u044b\u0445 \u0441\u0442\u043e\u043b\u0431\u0446\u043e\u0432 (\u043e\u043d\u0438 \u043e\u0434\u0438\u043d\u0430\u043a\u043e\u0432\u044b), \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0432 int,   #\u0437\u0430\u043c\u0435\u043d\u044f\u0435\u043c nan \u043d\u0430 \u043c\u043e\u0434\u0443 (\u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0431\u044b\u043b \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u043c)   first_drop_cols = ['City_Code', 'Source', 'Customer_Existing_Primary_Bank_Code']   for i in 
first_drop_cols:     df[i] = df[i].loc[df[i].notnull()].str[1:].astype(int)     df[i] = df[i].fillna(df[i].mode()[0]) \t# \u0443\u0434\u0430\u043b\u044f\u0435\u043c \u043f\u0435\u0440\u0432\u044b\u0435 3 \u0441\u0438\u043c\u0432\u043e\u043b\u0430 \u0438 \u0434\u0430\u043b\u0435\u0435 \u0430\u043d\u0430\u043b\u043e\u0433\u0438\u0447\u043d\u043e \u0432\u0435\u0440\u0445\u043d\u0435\u043c\u0443   df['Employer_Code'] = df['Employer_Code'].loc[df['Employer_Code'].notnull()].str[3:].astype(int)   df['Employer_Code'] = df['Employer_Code'].fillna(df['Employer_Code'].mode()[0]) \t# \u0437\u0430\u043f\u043e\u043b\u043d\u044f\u0435\u043c nan \u043c\u0435\u0434\u0438\u0430\u043d\u043e\u0439   amount_cols = ['Employer_Category2', 'Monthly_Income', 'Existing_EMI', 'Loan_Amount',                'Loan_Period', 'Interest_Rate', 'EMI', 'Var1']    df[amount_cols] = df[amount_cols].fillna(df[amount_cols].median()) \t# \u0437\u0430\u043f\u043e\u043b\u043d\u044f\u0435\u043c nan \u043c\u043e\u0434\u043e\u0439 \u0438 \u043a\u043e\u0434\u0438\u0440\u0443\u0435\u043c \u0441\u0442\u043e\u043b\u0431\u0446\u044b (\u043f\u0435\u0440\u0435\u0432\u043e\u0434\u0438\u043c \u0432 \u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435)   str_cols = ['City_Category', 'Employer_Category1', 'Primary_Bank_Type', 'Contacted', 'Source_Category']   str_dict = dict(enumerate(str_cols))   for i in str_cols:     df[i] = df[i].fillna(df[i].mode()[0])   le = LabelEncoder()   df[str_cols] = df[str_cols].apply(le.fit_transform)   return df<\/code><\/pre>\n<pre><code class=\"python\">train = data_preprocessing(train) test = data_preprocessing(test)<\/code><\/pre>\n<pre><code class=\"python\"># \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0432 float not_float_cols = ['ID', 'DOB', 'Lead_Creation_Date'] train[train.columns.difference(not_float_cols)] = train[train.columns.difference(not_float_cols)].astype(float)<\/code><\/pre>\n<pre><code 
class=\"python\">#\u043f\u0440\u043e\u0432\u0435\u0440\u0438\u043c \u043d\u0435\u0442 \u043b\u0438 NaN \u0432 \u0441\u0442\u043e\u043b\u0431\u0446\u0430\u0445 for i in train.columns.difference(unused_cols):   print('{} {}'.format(i, train[i].notnull().unique()))<\/code><\/pre>\n<figure class=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/ad0\/039\/dd9\/ad0039dd9f6b53e2f7486c694a5a57b9.png\" width=\"427\" height=\"445\"><figcaption><\/figcaption><\/figure>\n<p>Nan \u043d\u0435\u0442, \u0430 \u0447\u0442\u043e \u0436\u0435 \u0441 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435\u043c \u043a\u043b\u0430\u0441\u0441\u043e\u0432?<\/p>\n<pre><code class=\"python\"># \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0441\u0442\u0440\u043e\u043a \u0441 \u0440\u0430\u0437\u043d\u044b\u043c\u0438 \u043a\u043b\u0430\u0441\u0441\u0430\u043c\u0438: \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043e\u0447\u0435\u043d\u044c \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d \u0432 \u0441\u0442\u043e\u0440\u043e\u043d\u0443 0 (\u0432 \u043e\u043a\u043e\u043b\u043e 68 \u0440\u0430\u0437) train['Approved'].value_counts()<\/code><\/pre>\n<figure class=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/13f\/659\/93d\/13f65993d96c8ac8ac8a48cd319e0714.png\" width=\"281\" height=\"73\"><figcaption><\/figcaption><\/figure>\n<pre><code class=\"python\"># \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0430 \u0441\u0442\u0440\u043e\u043a \u0441 0 \u043a 1 Approved rat = len(train.loc[train['Approved']==0])\/\/len(train.loc[train['Approved']==1]) rat<\/code><\/pre>\n<figure class=\"\"><img loading=\"lazy\" decoding=\"async\" 
src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/045\/b9c\/6b5\/045b9c6b53a0d41cbcb455906bceee30.png\" alt=\"\" title=\"\" width=\"36\" height=\"34\"><figcaption><\/figcaption><\/figure>\n<p>\u0421\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u043d\u043e\u0432\u044b\u0439 train \u0434\u0430\u0442\u0430\u0441\u0435\u0442 <strong>\u043c\u0435\u0442\u043e\u0434\u043e\u043c upsampling<\/strong>:<\/p>\n<ol>\n<li>\n<p>\u0432\u043e\u0437\u044c\u043c\u0451\u043c \u0432\u0441\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 \u0441 \u043a\u043b\u0430\u0441\u0441\u043e\u043c 1<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u0443\u0435\u043c \u0435\u0433\u043e rat \u0440\u0430\u0437<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u0438\u0441\u043e\u0435\u0434\u0438\u043d\u0438\u043c \u043a \u0434\u0430\u043d\u043d\u044b\u043c \u043a\u043b\u0430\u0441\u0441\u0430 0 \u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u043a\u043b\u0430\u0441\u0441 1 \u0438 \u043f\u0435\u0440\u0435\u043c\u0435\u0448\u0430\u0435\u043c<\/p>\n<\/li>\n<\/ol>\n<pre><code class=\"python\">df_1 = train.loc[train['Approved']==1] df_1 = df_1.loc[df_1.index.repeat(rat)] train_n = pd.concat([train.loc[train['Approved']==0], df_1]).sample(frac=1)<\/code><\/pre>\n<p>\u041f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u043d\u043e\u0432\u043e\u0435 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432:<\/p>\n<pre><code class=\"python\">train_n['Approved'].value_counts()<\/code><\/pre>\n<figure class=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/a21\/ea9\/548\/a21ea95484eb1fe7de8089fb541c6f98.png\" alt=\"\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0432 \u043d\u043e\u0432\u043e\u043c 
\u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0435\" title=\"\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0432 \u043d\u043e\u0432\u043e\u043c \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0435\" width=\"284\" height=\"69\"><figcaption>\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0432 \u043d\u043e\u0432\u043e\u043c \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0435<\/figcaption><\/figure>\n<p>\u041f\u0440\u0438\u0441\u0442\u0443\u043f\u0430\u0435\u043c \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e. \u0411\u0443\u0434\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441, \u0430 \u0442\u0430\u043a\u0436\u0435 \u0435\u0433\u043e \u0442\u044e\u043d\u0438\u043d\u0433 (\u043f\u043e\u0434\u0431\u043e\u0440 \u043d\u0430\u0438\u0431\u043e\u043b\u0435\u0435 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u044b\u0445 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432)<\/p>\n<pre><code class=\"python\"># \u0434\u0435\u043b\u0438\u043c \u043d\u0430 \u0442\u0440\u0435\u043d\u0438\u0440\u043e\u0432\u043e\u0447\u043d\u0443\u044e \u0438 \u0442\u0435\u0441\u0442\u043e\u0432\u0443\u044e X =  train_n[train_n.columns.difference(['Approved'])] y = train_n['Approved'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # \u0437\u0430\u0434\u0430\u0451\u043c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b, \u0438\u0437 \u0434\u0438\u0430\u043f\u0430\u0437\u043e\u043d\u0430 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u043a\u043e\u0442\u043e\u0440\u044b\u0445 \u043d\u0430\u0434\u043e \u0432\u044b\u0431\u0440\u0430\u0442\u044c \u043b\u0443\u0447\u0448\u0435\u0435 # https:\/\/github.com\/ray-project\/tune-sklearn param_dists = {     
'criterion': tune.choice(['gini', 'entropy']),     'max_depth': tune.choice([i for i in range(2, 17)]),     'max_features': tune.choice(['log2', 'sqrt']),      'min_samples_leaf': tune.choice([i for i in range(2, 33)]),     'min_samples_split': tune.choice([i for i in range(2, 17)]),     'random_state': tune.choice([23]) }  hyperopt_tune_search = TuneSearchCV(RandomForestClassifier(),     param_distributions=param_dists,     n_trials=2,     early_stopping=True,     max_iters=10,     search_optimization=\"hyperopt\" )  hts = hyperopt_tune_search.fit(X_train, y_train)<\/code><\/pre>\n<pre><code class=\"python\">y_pred = hts.predict(X_test) print(confusion_matrix(y_test, y_pred)) print(precision_recall_fscore_support(y_test, y_pred)) print(roc_auc_score(y_test, y_pred, average='weighted'))<\/code><\/pre>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/ef0\/3e8\/a25\/ef03e8a253984edd1aa9764c4af0b59d.png\" alt=\"\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043c\u0435\u0442\u0440\u0438\u043a\" title=\"\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043c\u0435\u0442\u0440\u0438\u043a\" width=\"1187\" height=\"95\"><figcaption>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f 
\u043c\u0435\u0442\u0440\u0438\u043a<\/figcaption><\/figure>\n<p>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f&nbsp;\u043c\u0435\u0442\u0440\u0438\u043a&nbsp;f1&nbsp;\u043f\u043e\u043b\u0443\u0447\u0438\u043b\u0438\u0441\u044c&nbsp;\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e&nbsp;\u0432\u044b\u0441\u043e\u043a\u0438\u0435&nbsp;(90%+),&nbsp;\u0447\u0442\u043e&nbsp;\u043c\u043e\u0436\u0435\u0442&nbsp;\u0433\u043e\u0432\u043e\u0440\u0438\u0442\u044c&nbsp;\u043e&nbsp;\u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0441\u0442\u0438&nbsp;\u043c\u043e\u0434\u0435\u043b\u0438&nbsp;\u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438.<\/p>\n<p>\u0422\u0430\u043a\u0438\u043c \u043e\u0431\u0440\u0430\u0437\u043e\u043c, \u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u0441 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u043c\u0438 \u0434\u0430\u043d\u043d\u044b\u043c\u0438 \u043c\u043e\u0436\u043d\u043e \u043d\u0435 \u0442\u043e\u043b\u044c\u043a\u043e \u0447\u0435\u0440\u0435\u0437 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b, \u043d\u043e \u0438 \u043c\u0435\u0442\u043e\u0434\u043e\u043c upsampling. 
\u041e\u043d \u043f\u043e\u0437\u0432\u043e\u043b\u044f\u0435\u0442 \u043c\u0435\u0442\u0440\u0438\u043a\u0430\u043c \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0435\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0430 \u043e\u0442 0.0 \u0434\u043e\u0441\u0442\u0438\u0447\u044c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u0431\u043e\u043b\u0435\u0435 0.8<\/p>\n<p>\u041f\u043e\u043b\u043d\u044b\u0439 \u043a\u043e\u0434 \u043c\u043e\u0436\u043d\u043e \u0441\u043a\u0430\u0447\u0430\u0442\u044c \u0437\u0434\u0435\u0441\u044c: <a href=\"https:\/\/github.com\/sivovaalex\/for_magazines\/blob\/master\/Banking_Marketing_Leads_Conversion_Data\/Project.ipynb\" rel=\"noopener noreferrer nofollow\">https:\/\/github.com\/sivovaalex\/for_magazines\/blob\/master\/Banking_Marketing_Leads_Conversion_Data\/Project.ipynb<\/a><\/p>\n<\/div>\n<p> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/post\/568266\/\"> https:\/\/habr.com\/ru\/post\/568266\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"\n<div class=\"post__text post__text_v2\" id=\"post-content-body\">\n<figure class=\"full-width\"><figcaption>Upsampling data<\/figcaption><\/figure>\n<p>\u0414\u0430\u043d\u043d\u0430\u044f \u0441\u0442\u0430\u0442\u044c\u044f \u0440\u0430\u0441\u0441\u0447\u0438\u0442\u0430\u043d\u0430 \u0434\u043b\u044f \u043d\u043e\u0432\u0438\u0447\u043a\u043e\u0432 \u0432 \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u043c \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0438. 
\u0418\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u044e\u0442\u0441\u044f \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0435 \u0438\u043d\u0441\u0442\u0440\u0443\u043c\u0435\u043d\u0442\u044b:<\/p>\n<ul>\n<li>\n<p>Python<\/p>\n<\/li>\n<li>\n<p>Random forest classifier<\/p>\n<\/li>\n<li>\n<p>Google Colab<\/p>\n<\/li>\n<li>\n<p>Upsampling data<\/p>\n<\/li>\n<\/ul>\n<p>\u041a\u0430\u0436\u0434\u044b\u0439 \u0434\u0430\u0442\u0430 \u0441\u0430\u0435\u043d\u0442\u0438\u0441\u0442 \u0445\u043e\u0442\u044c \u0440\u0430\u0437 \u0441\u0442\u0430\u043b\u043a\u0438\u0432\u0430\u043b\u0441\u044f \u0441 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u043e\u0439 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u043e\u0441\u0442\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 \u0434\u043b\u044f \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438: \u043a\u0430\u043a\u043e\u0439-\u0442\u043e \u043a\u043b\u0430\u0441\u0441 \u043f\u0440\u0435\u0432\u043e\u0441\u0445\u043e\u0434\u0438\u0442 \u0434\u0440\u0443\u0433\u0438\u0435. \u0421\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442 \u0434\u0430\u043b\u0435\u043a\u043e \u043d\u0435 \u043e\u0434\u0438\u043d \u0441\u043f\u043e\u0441\u043e\u0431 \u0431\u043e\u0440\u044c\u0431\u044b \u0441 \u044d\u0442\u043e\u0439 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u043e\u0439. 
\u041d\u0430\u0438\u0431\u043e\u043b\u044c\u0448\u0443\u044e \u0438\u0437\u0432\u0435\u0441\u0442\u043d\u043e\u0441\u0442\u044c \u0438\u043c\u0435\u0435\u0442 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u0435 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440:<\/p>\n<ul>\n<li>\n<p>class_weight, \u043d\u043e \u0435\u0433\u043e \u043c\u043e\u0436\u043d\u043e \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u043f\u0440\u0438 \u043d\u0435\u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u043e\u0441\u0442\u0438: \u0441\u043e\u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 \u0440\u0430\u0437\u043d\u044b\u0445 \u043a\u043b\u0430\u0441\u0441\u043e\u0432, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440, 4:3 (\u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043c\u043e\u0436\u043d\u043e \u043f\u0440\u043e\u0447\u0435\u0441\u0442\u044c \u0442\u0443\u0442: <a href=\"https:\/\/datascience.stackexchange.com\/questions\/11564\/how-does-class-weights-work-in-randomforestclassifier\" rel=\"noopener noreferrer nofollow\">https:\/\/datascience.stackexchange.com\/questions\/11564\/how-does-class-weights-work-in-randomforestclassifier<\/a>)<\/p>\n<\/li>\n<li>\n<p>warm_start, \u043a\u043e\u0442\u043e\u0440\u044b\u0439 \u043f\u043e\u0437\u0432\u043e\u043b\u044f\u0435\u0442 \u0431\u0430\u0442\u0447\u0430\u043c\u0438 (\u0447\u0430\u0441\u0442\u044f\u043c\u0438 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430) \u043e\u0431\u0443\u0447\u0430\u0442\u044c \u0434\u0430\u043d\u043d\u044b\u0435 (\u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043c\u043e\u0436\u043d\u043e \u043f\u0440\u043e\u0447\u0435\u0441\u0442\u044c \u0442\u0443\u0442: <a 
href=\"https:\/\/stackoverflow.com\/questions\/42757892\/how-to-use-warm-start\/42763502\" rel=\"noopener noreferrer nofollow\">https:\/\/stackoverflow.com\/questions\/42757892\/how-to-use-warm-start\/42763502<\/a>)<\/p>\n<\/li>\n<\/ul>\n<p>\u041e\u0434\u043d\u0430\u043a\u043e \u0432 \u0434\u0430\u043d\u043d\u043e\u0439 \u0441\u0442\u0430\u0442\u044c\u0435 \u043c\u044b \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043c\u0435\u0442\u043e\u0434, \u043d\u0435 \u0441\u0432\u044f\u0437\u0430\u043d\u043d\u044b\u0439 \u0441 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0430\u043c\u0438 \u043c\u043e\u0434\u0435\u043b\u0438: upsampling&nbsp;data. \u041c\u044b \u0443\u0432\u0435\u043b\u0438\u0447\u0438\u043c \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0438\u0445 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0435\u0449\u0451 \u0434\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043e\u0434\u0435\u043b\u0438: \u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u0443\u0435\u043c n (\u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0430 \u043f\u0440\u0435\u043e\u0431\u043b\u0430\u0434\u0430\u044e\u0449\u0435\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0430 \u043a \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u0443\u044e\u0449\u0435\u043c\u0443) \u0440\u0430\u0437 \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0438\u0439 \u043a\u043b\u0430\u0441\u0441.<\/p>\n<p>\u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0434\u0430\u043d\u043d\u044b\u0445 \u0432\u044b\u0431\u0440\u0430\u043d\u044b \u0434\u0430\u043d\u043d\u044b\u0435 \u0441\u043e\u0440\u0435\u0432\u043d\u043e\u0432\u0430\u043d\u0438\u044f \u043d\u0430 kaggle: <a href=\"https:\/\/www.kaggle.com\/arashnic\/banking-loan-prediction\" rel=\"noopener noreferrer 
nofollow\">https:\/\/www.kaggle.com\/arashnic\/banking-loan-prediction<\/a>. \u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c\u0430 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u0432\u043e\u0437\u044c\u043c\u0451\u043c \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441. \u041d\u0430\u0447\u043d\u0451\u043c!<\/p>\n<p>\u0418\u043c\u043f\u043e\u0440\u0442\u0438\u0440\u0443\u0435\u043c \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u044b\u0435 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438:<\/p>\n<pre><code class=\"python\">import seaborn as sns import pandas as pd import numpy as np import matplotlib.pyplot as plt from numpy import nan from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import precision_recall_curve from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve, auc from sklearn import metrics import copy from tune_sklearn import TuneSearchCV import scipy from ray import tune<\/code><\/pre>\n<p>\u0417\u0430\u0433\u0440\u0443\u0436\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435 (\u0432 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 \u0441\u0440\u0435\u0434\u044b \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u043c\u043d\u043e\u0439 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0441\u044f Google Colab, \u0430 \u0434\u0430\u043d\u043d\u044b\u0435 \u0440\u0430\u0441\u043f\u043e\u043b\u0430\u0433\u0430\u043b\u0438\u0441\u044c \u043d\u0430 Google Drive):<\/p>\n<pre><code class=\"python\">from google.colab import drive 
drive.mount('\/content\/drive') train = pd.read_csv('\/content\/drive\/MyDrive\/\u043f\u043e\u0440\u0442\u0444\u043e\u043b\u0438\u043e\/Project \"Help to increase customer acquisition\"\/train.csv') test = pd.read_csv('\/content\/drive\/MyDrive\/\u043f\u043e\u0440\u0442\u0444\u043e\u043b\u0438\u043e\/Project \"Help to increase customer acquisition\"\/test.csv')<\/code><\/pre>\n<p>\u041f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u0438\u0441\u0445\u043e\u0434\u043d\u044b\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 train (\u0438\u0445 \u043c\u044b \u0431\u0443\u0434\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u0442\u0440\u0435\u043d\u0438\u0440\u043e\u0432\u043a\u0438 \u0438 \u0442\u0435\u0441\u0442\u0430, \u0432 \u0434\u0430\u043d\u043d\u044b\u0445 test \u043e\u0442\u0441\u0443\u0442\u0441\u0442\u0432\u0443\u0435\u0442 \u0442\u0430\u0440\u0433\u0435\u0442\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u0441\u0442\u043e\u043b\u0431\u0435\u0446)<\/p>\n<pre><code class=\"python\">train<\/code><\/pre>\n<div class=\"table\">\n<table>\n<tbody>\n<tr>\n<th>\n<p align=\"center\">\n<\/th>\n<th data-colwidth=\"50\" 
width=\"50\"><\/th>\n<th>\n<p><strong>Gender<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>DOB<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Lead_Creation_Date<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>City_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>City_Category<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Category1<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Employer_Category2<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Monthly_Income<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Customer_Existing_Primary_Bank_Code<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Primary_Bank_Type<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Contacted<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Source<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Source_Category<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Existing_EMI<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Loan_Amount<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Loan_Period<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Interest_Rate<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>EMI<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Var1<\/strong><\/p>\n<\/th>\n<th>\n<p><strong>Approved<\/strong><\/p>\n<\/th>\n<\/tr>\n<tr>\n<th>\n<p><strong>0<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPC90493171225<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>23\/07\/79<\/p>\n<\/td>\n<td>\n<p>15\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10001<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0044082<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>2000.0<\/p>\n<\/td>\n<td>\n<p>B001<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>1<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPD40611263344<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>07\/12\/86<\/p>\n<\/td>\n<td>\n<p>04\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10003<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000002<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>3500.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>20000.0<\/p>\n<\/td>\n<td>\n<p>2.0<\/p>\n<\/td>\n<td>\n<p>13.25<\/p>\n<\/td>\n<td>\n<p>953.0<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>2<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPE70289249423<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>10\/12\/82<\/p>\n<\/td>\n<td>\n<p>19\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10125<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0005267<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>2250.0<\/p>\n<\/td>\n<td>\n<p>B003<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S143<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>45000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>3<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPF80273865537<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>30\/01\/89<\/p>\n<\/td>\n<td>\n<p>09\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10477<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0004143<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>3500.0<\/p>\n<\/td>\n<td>\n<p>B003<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S143<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>92000.0<\/p>\n<\/td>\n<td>\n<p>5.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>7<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>4<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPG60994436641<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>19\/04\/85<\/p>\n<\/td>\n<td>\n<p>20\/07\/16<\/p>\n<\/td>\n<td>\n<p>C10002<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0001781<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>10000.0<\/p>\n<\/td>\n<td>\n<p>B001<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S134<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>2500.0<\/p>\n<\/td>\n<td>\n<p>50000.0<\/p>\n<\/td>\n<td>\n<p>2.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>&#8230;<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<td>\n<p>&#8230;<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69708<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPU90955789628<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>31\/07\/83<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10006<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000010<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>4900.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69709<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPV80989824738<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>27\/01\/71<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10116<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>COM0045789<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>7190.1<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>N<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>1450.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>7<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69710<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPW50697209842<\/p>\n<\/td>\n<td>\n<p>Female<\/p>\n<\/td>\n<td>\n<p>01\/02\/92<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10022<\/p>\n<\/td>\n<td>\n<p>B<\/p>\n<\/td>\n<td>\n<p>COM0013284<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>1600.0<\/p>\n<\/td>\n<td>\n<p>B030<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>24000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>35.50<\/p>\n<\/td>\n<td>\n<p>943.0<\/p>\n<\/td>\n<td>\n<p>2<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69711<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" 
width=\"50\">\n<p>APPY50870035036<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>27\/06\/78<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10002<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000098<\/p>\n<\/td>\n<td>\n<p>C<\/p>\n<\/td>\n<td>\n<p>3.0<\/p>\n<\/td>\n<td>\n<p>9893.0<\/p>\n<\/td>\n<td>\n<p>B002<\/p>\n<\/td>\n<td>\n<p>P<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>1366.0<\/p>\n<\/td>\n<td>\n<p>80000.0<\/p>\n<\/td>\n<td>\n<p>5.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<tr>\n<th>\n<p><strong>69712<\/strong><\/p>\n<\/th>\n<td data-colwidth=\"50\" width=\"50\">\n<p>APPZ60733046119<\/p>\n<\/td>\n<td>\n<p>Male<\/p>\n<\/td>\n<td>\n<p>31\/12\/89<\/p>\n<\/td>\n<td>\n<p>30\/09\/16<\/p>\n<\/td>\n<td>\n<p>C10003<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>COM0000056<\/p>\n<\/td>\n<td>\n<p>A<\/p>\n<\/td>\n<td>\n<p>1.0<\/p>\n<\/td>\n<td>\n<p>4230.0<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>NaN<\/p>\n<\/td>\n<td>\n<p>Y<\/p>\n<\/td>\n<td>\n<p>S122<\/p>\n<\/td>\n<td>\n<p>G<\/p>\n<\/td>\n<td>\n<p>0.0<\/p>\n<\/td>\n<td>\n<p>69000.0<\/p>\n<\/td>\n<td>\n<p>4.0<\/p>\n<\/td>\n<td>\n<p>13.99<\/p>\n<\/td>\n<td>\n<p>1885.0<\/p>\n<\/td>\n<td>\n<p>10<\/p>\n<\/td>\n<td>\n<p>0<\/p>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<p>69713 rows \u00d7 22 columns<\/p>\n<p>\u041a\u0430\u043a \u043c\u043e\u0436\u043d\u043e \u0437\u0430\u043c\u0435\u0442\u0438\u0442\u044c, \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u043e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u043f\u0435\u0440\u0435\u0434 \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435\u043c:<\/p>\n<ol>\n<li>\n<p>\u0441\u043e\u0437\u0434\u0430\u0442\u044c \u043d\u043e\u0432\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u043d\u0430 
\u043e\u0441\u043d\u043e\u0432\u0435 \u0441\u0442\u0430\u0440\u044b\u0445: \u0432\u043e\u0437\u0440\u0430\u0441\u0442 \u0432\u043c\u0435\u0441\u0442\u043e \u0434\u0430\u0442\u044b \u0440\u043e\u0436\u0434\u0435\u043d\u0438\u044f, \u0434\u0435\u043d\u044c \u0432 \u0433\u043e\u0434\u0443 \u0432\u043c\u0435\u0441\u0442\u043e \u0434\u0430\u0442\u044b \u0437\u0430\u044f\u0432\u043a\u0438 \u043d\u0430 \u0437\u0430\u0451\u043c (\u0442\u043e\u043b\u044c\u043a\u043e \u0438\u044e\u043b\u044c-\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044c 2016)<\/p>\n<\/li>\n<li>\n<p>\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c nan: \u0437\u0430\u043c\u0435\u043d\u0438\u0442\u044c nan \u043d\u0430 \u043c\u043e\u0434\u044b (\u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438) \u0438 \u043c\u0435\u0434\u0438\u0430\u043d\u044b (\u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438)<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u0442\u044c \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u0432 \u0447\u0438\u0441\u043b\u043e\u0432\u044b\u0435 (\u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441 \u043e\u0431\u0443\u0447\u0430\u0435\u0442\u0441\u044f \u043d\u0430 integer, float, boolean)<\/p>\n<\/li>\n<\/ol>\n<p>\u0414\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u043e\u0431\u043e\u0438\u0445 \u043d\u0430\u0431\u043e\u0440\u043e\u0432 \u0434\u0430\u043d\u043d\u044b\u0445 \u0441\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u0444\u0443\u043d\u043a\u0446\u0438\u044e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438:<\/p>\n<pre><code class=\"python\">def data_preprocessing(df):   # 
\u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043f\u043e\u043b\u044f Gender: Female - 0, Male - 1   df.loc[(df['Gender'] == 'Female'), 'Gender'] = 0   df.loc[(df['Gender'] != 0), 'Gender'] = 1   # \u0434\u043e\u0431\u0430\u0432\u0438\u043c \u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0432\u043e\u0437\u0440\u0430\u0441\u0442   df['DOB_year'] = nan   df.loc[df['DOB'].notnull(), 'DOB_year'] = 121 - df['DOB'].loc[df['DOB'].notnull()].str[-2:].astype(int)   df['DOB_year'] = df['DOB_year'].fillna(df['DOB_year'].median())   # \u0434\u043e\u0431\u0430\u0432\u0438\u043c \u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0434\u043d\u0435\u0439 \u0441 \u043d\u0430\u0447\u0430\u043b\u0430 \u0433\u043e\u0434\u0430 \u043e\u0442 \u0434\u0430\u0442\u044b \u0437\u0430\u0451\u043c\u0430 (\u0432 \u0434\u0430\u043d\u043d\u044b\u0445 \u0438\u044e\u043b\u044c-\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044c 2016)   df['Lead_Creation_Date'] = df['Lead_Creation_Date'].str.replace(r'(..\\\/..\\\/)(..)', r'\\1 20\\2')   df['Lead_Creation_Date'] = pd.to_datetime(df['Lead_Creation_Date'], format=\"%d\/%m\/ %Y\")   df['Lead_Creation_Date_day'] = (df['Lead_Creation_Date']-pd.to_datetime('1\/1\/2016')).astype('timedelta64[h]')\/24    #\u0443\u0434\u0430\u043b\u044f\u0435\u043c \u043f\u0435\u0440\u0432\u044b\u0439 \u0441\u0438\u043c\u0432\u043e\u043b \u0434\u0430\u043d\u043d\u044b\u0445 \u0441\u0442\u043e\u043b\u0431\u0446\u043e\u0432 (\u043e\u043d\u0438 \u043e\u0434\u0438\u043d\u0430\u043a\u043e\u0432\u044b), \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0432 int,   #\u0437\u0430\u043c\u0435\u043d\u044f\u0435\u043c nan \u043d\u0430 \u043c\u043e\u0434\u0443 (\u043f\u0440\u0438\u0437\u043d\u0430\u043a \u0431\u044b\u043b \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u043c)   first_drop_cols = ['City_Code', 'Source', 'Customer_Existing_Primary_Bank_Code']   for i in 
first_drop_cols:     df[i] = df[i].loc[df[i].notnull()].str[1:].astype(int)     df[i] = df[i].fillna(df[i].mode()[0]) \t# \u0443\u0434\u0430\u043b\u044f\u0435\u043c \u043f\u0435\u0440\u0432\u044b\u0435 3 \u0441\u0438\u043c\u0432\u043e\u043b\u0430 \u0438 \u0434\u0430\u043b\u0435\u0435 \u0430\u043d\u0430\u043b\u043e\u0433\u0438\u0447\u043d\u043e \u0432\u0435\u0440\u0445\u043d\u0435\u043c\u0443   df['Employer_Code'] = df['Employer_Code'].loc[df['Employer_Code'].notnull()].str[3:].astype(int)   df['Employer_Code'] = df['Employer_Code'].fillna(df['Employer_Code'].mode()[0]) \t# \u0437\u0430\u043f\u043e\u043b\u043d\u044f\u0435\u043c nan \u043c\u0435\u0434\u0438\u0430\u043d\u043e\u0439   amount_cols = ['Employer_Category2', 'Monthly_Income', 'Existing_EMI', 'Loan_Amount',                'Loan_Period', 'Interest_Rate', 'EMI', 'Var1']    df[amount_cols] = df[amount_cols].fillna(df[amount_cols].median()) \t# \u0437\u0430\u043f\u043e\u043b\u043d\u044f\u0435\u043c nan \u043c\u043e\u0434\u043e\u0439 \u0438 \u043a\u043e\u0434\u0438\u0440\u0443\u0435\u043c \u0441\u0442\u043e\u043b\u0431\u0446\u044b (\u043f\u0435\u0440\u0435\u0432\u043e\u0434\u0438\u043c \u0432 \u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435)   str_cols = ['City_Category', 'Employer_Category1', 'Primary_Bank_Type', 'Contacted', 'Source_Category']   str_dict = dict(enumerate(str_cols))   for i in str_cols:     df[i] = df[i].fillna(df[i].mode()[0])   le = LabelEncoder()   df[str_cols] = df[str_cols].apply(le.fit_transform)   return df<\/code><\/pre>\n<pre><code class=\"python\">train = data_preprocessing(train) test = data_preprocessing(test)<\/code><\/pre>\n<pre><code class=\"python\"># \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0432 float not_float_cols = ['ID', 'DOB', 'Lead_Creation_Date'] train[train.columns.difference(not_float_cols)] = train[train.columns.difference(not_float_cols)].astype(float)<\/code><\/pre>\n<pre><code 
class=\"python\">#\u043f\u0440\u043e\u0432\u0435\u0440\u0438\u043c \u043d\u0435\u0442 \u043b\u0438 NaN \u0432 \u0441\u0442\u043e\u043b\u0431\u0446\u0430\u0445 for i in train.columns.difference(unused_cols):   print('{} {}'.format(i, train[i].notnull().unique()))<\/code><\/pre>\n<figure class=\"\"><figcaption><\/figcaption><\/figure>\n<p>Nan \u043d\u0435\u0442, \u0430 \u0447\u0442\u043e \u0436\u0435 \u0441 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435\u043c \u043a\u043b\u0430\u0441\u0441\u043e\u0432?<\/p>\n<pre><code class=\"python\"># \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0441\u0442\u0440\u043e\u043a \u0441 \u0440\u0430\u0437\u043d\u044b\u043c\u0438 \u043a\u043b\u0430\u0441\u0441\u0430\u043c\u0438: \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043e\u0447\u0435\u043d\u044c \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d \u0432 \u0441\u0442\u043e\u0440\u043e\u043d\u0443 0 (\u0432 \u043e\u043a\u043e\u043b\u043e 68 \u0440\u0430\u0437) train['Approved'].value_counts()<\/code><\/pre>\n<figure class=\"\"><figcaption><\/figcaption><\/figure>\n<pre><code class=\"python\"># \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0430 \u0441\u0442\u0440\u043e\u043a \u0441 0 \u043a 1 Approved rat = len(train.loc[train['Approved']==0])\/\/len(train.loc[train['Approved']==1]) rat<\/code><\/pre>\n<figure class=\"\"><figcaption><\/figcaption><\/figure>\n<p>\u0421\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u043d\u043e\u0432\u044b\u0439 train \u0434\u0430\u0442\u0430\u0441\u0435\u0442 <strong>\u043c\u0435\u0442\u043e\u0434\u043e\u043c upsampling<\/strong>:<\/p>\n<ol>\n<li>\n<p>\u0432\u043e\u0437\u044c\u043c\u0451\u043c \u0432\u0441\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 \u0441 \u043a\u043b\u0430\u0441\u0441\u043e\u043c 
1<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u0443\u0435\u043c \u0435\u0433\u043e rat \u0440\u0430\u0437<\/p>\n<\/li>\n<li>\n<p>\u043f\u0440\u0438\u0441\u043e\u0435\u0434\u0438\u043d\u0438\u043c \u043a \u0434\u0430\u043d\u043d\u044b\u043c \u043a\u043b\u0430\u0441\u0441\u0430 0 \u043f\u0440\u043e\u0434\u0443\u0431\u043b\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u043a\u043b\u0430\u0441\u0441 1 \u0438 \u043f\u0435\u0440\u0435\u043c\u0435\u0448\u0430\u0435\u043c<\/p>\n<\/li>\n<\/ol>\n<pre><code class=\"python\">df_1 = train.loc[train['Approved']==1] df_1 = df_1.loc[df_1.index.repeat(rat)] train_n = pd.concat([train.loc[train['Approved']==0], df_1]).sample(frac=1)<\/code><\/pre>\n<p>\u041f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043d\u0430 \u043d\u043e\u0432\u043e\u0435 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432:<\/p>\n<pre><code class=\"python\">train_n['Approved'].value_counts()<\/code><\/pre>\n<figure class=\"\"><figcaption>\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 \u0432 \u043d\u043e\u0432\u043e\u043c \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0435<\/figcaption><\/figure>\n<p>\u041f\u0440\u0438\u0441\u0442\u0443\u043f\u0430\u0435\u043c \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e. 
\u0411\u0443\u0434\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0439 \u043b\u0435\u0441, \u0430 \u0442\u0430\u043a\u0436\u0435 \u0435\u0433\u043e \u0442\u044e\u043d\u0438\u043d\u0433 (\u043f\u043e\u0434\u0431\u043e\u0440 \u043d\u0430\u0438\u0431\u043e\u043b\u0435\u0435 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u044b\u0445 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432)<\/p>\n<pre><code class=\"python\"># \u0434\u0435\u043b\u0438\u043c \u043d\u0430 \u0442\u0440\u0435\u043d\u0438\u0440\u043e\u0432\u043e\u0447\u043d\u0443\u044e \u0438 \u0442\u0435\u0441\u0442\u043e\u0432\u0443\u044e X =  train_n[train_n.columns.difference(['Approved'])] y = train_n['Approved'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # \u0437\u0430\u0434\u0430\u0451\u043c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b, \u0438\u0437 \u0434\u0438\u0430\u043f\u0430\u0437\u043e\u043d\u0430 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u043a\u043e\u0442\u043e\u0440\u044b\u0445 \u043d\u0430\u0434\u043e \u0432\u044b\u0431\u0440\u0430\u0442\u044c \u043b\u0443\u0447\u0448\u0435\u0435 # https:\/\/github.com\/ray-project\/tune-sklearn param_dists = {     'criterion': tune.choice(['gini', 'entropy']),     'max_depth': tune.choice([i for i in range(2, 17)]),     'max_features': tune.choice(['log2', 'sqrt']),      'min_samples_leaf': tune.choice([i for i in range(2, 33)]),     'min_samples_split': tune.choice([i for i in range(2, 17)]),     'random_state': tune.choice([23]) }  hyperopt_tune_search = TuneSearchCV(RandomForestClassifier(),     param_distributions=param_dists,     n_trials=2,     early_stopping=True,     max_iters=10,     search_optimization=\"hyperopt\" )  hts = hyperopt_tune_search.fit(X_train, y_train)<\/code><\/pre>\n<pre><code class=\"python\">y_pred = hts.predict(X_test) print(confusion_matrix(y_test, 
y_pred)) print(precision_recall_fscore_support(y_test, y_pred)) print(roc_auc_score(y_test, y_pred, average='weighted'))<\/code><\/pre>\n<figure class=\"full-width\"><figcaption>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043c\u0435\u0442\u0440\u0438\u043a<\/figcaption><\/figure>\n<p>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u044f&nbsp;\u043c\u0435\u0442\u0440\u0438\u043a&nbsp;f1&nbsp;\u043f\u043e\u043b\u0443\u0447\u0438\u043b\u0438\u0441\u044c&nbsp;\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e&nbsp;\u0432\u044b\u0441\u043e\u043a\u0438\u0435&nbsp;(90%+),&nbsp;\u0447\u0442\u043e&nbsp;\u043c\u043e\u0436\u0435\u0442&nbsp;\u0433\u043e\u0432\u043e\u0440\u0438\u0442\u044c&nbsp;\u043e&nbsp;\u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0441\u0442\u0438&nbsp;\u043c\u043e\u0434\u0435\u043b\u0438&nbsp;\u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438.<\/p>\n<p>\u0422\u0430\u043a\u0438\u043c \u043e\u0431\u0440\u0430\u0437\u043e\u043c, \u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u0441 \u043d\u0435\u0441\u0431\u0430\u043b\u0430\u043d\u0441\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u043c\u0438 \u0434\u0430\u043d\u043d\u044b\u043c\u0438 \u043c\u043e\u0436\u043d\u043e \u043d\u0435 \u0442\u043e\u043b\u044c\u043a\u043e \u0447\u0435\u0440\u0435\u0437 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b, \u043d\u043e \u0438 \u043c\u0435\u0442\u043e\u0434\u043e\u043c upsampling. 
\u041e\u043d \u043f\u043e\u0437\u0432\u043e\u043b\u044f\u0435\u0442 \u043c\u0435\u0442\u0440\u0438\u043a\u0430\u043c \u043d\u0430\u0438\u043c\u0435\u043d\u044c\u0448\u0435\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0430 \u043e\u0442 0.0 \u0434\u043e\u0441\u0442\u0438\u0447\u044c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u0431\u043e\u043b\u0435\u0435 0.8.<\/p>\n<p>\u041f\u043e\u043b\u043d\u044b\u0439 \u043a\u043e\u0434 \u043c\u043e\u0436\u043d\u043e \u0441\u043a\u0430\u0447\u0430\u0442\u044c \u0437\u0434\u0435\u0441\u044c: <a href=\"https:\/\/github.com\/sivovaalex\/for_magazines\/blob\/master\/Banking_Marketing_Leads_Conversion_Data\/Project.ipynb\" rel=\"noopener noreferrer nofollow\">https:\/\/github.com\/sivovaalex\/for_magazines\/blob\/master\/Banking_Marketing_Leads_Conversion_Data\/Project.ipynb<\/a><\/p>\n<\/div>\n<p> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/post\/568266\/\"> https:\/\/habr.com\/ru\/post\/568266\/<\/a><br 
\/><br \/><br \/><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-326605","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/326605","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=326605"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/326605\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=326605"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=326605"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=326605"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}