{"id":426692,"date":"2024-07-19T21:00:52","date_gmt":"2024-07-19T21:00:52","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=426692"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=426692","title":{"rendered":"<span>\u0410\u043d\u0430\u043b\u0438\u0437 \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 TRUE CRIME<\/span>"},"content":{"rendered":"<div><!--[--><!--]--><\/div>\n<div id=\"post-content-body\">\n<div>\n<div class=\"article-formatted-body article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w780q1\/getpro\/habr\/upload_files\/4df\/1ee\/cf5\/4df1eecf5af2805933cd023319247905.jpg\" alt=\"\" title=\"\" width=\"1363\" height=\"858\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/4df\/1ee\/cf5\/4df1eecf5af2805933cd023319247905.jpg\" data-blurred=\"true\"\/><\/figure>\n<p>\u041f\u0440\u0438\u0432\u0435\u0442! 
\u042f \u0442\u0443\u0442 \u0430\u043a\u0442\u0438\u0432\u043d\u043e \u043f\u044b\u0442\u0430\u044e\u0441\u044c \u043e\u0445\u0432\u0430\u0442\u0438\u0442\u044c \u0440\u0430\u0437\u043d\u044b\u0435 \u043e\u0431\u043b\u0430\u0441\u0442\u0438 \u0432 \u0441\u0444\u0435\u0440\u0435 Data Science \u0438 \u0440\u0435\u0448\u0438\u043b\u0430, \u0447\u0442\u043e \u0431\u044b\u043b\u043e \u0431\u044b \u043a\u043b\u0430\u0441\u0441\u043d\u043e \u043f\u043e\u043a\u043e\u043f\u0430\u0442\u044c\u0441\u044f \u0441 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u043e\u0439 \u0435\u0441\u0442\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0433\u043e \u044f\u0437\u044b\u043a\u0430 (<em>NLP<\/em>) \u043d\u0430 \u043f\u0440\u0438\u043c\u0435\u0440\u0435 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 YouTube. \u0422\u0430\u043a \u043a\u0430\u043a \u043f\u043e\u0441\u043b\u0435 \u0440\u0430\u0431\u043e\u0442\u044b \u044f \u0447\u0430\u0441\u0442\u043e \u0441\u043c\u043e\u0442\u0440\u044e \u0432\u0438\u0434\u0435\u043e <a href=\"https:\/\/www.youtube.com\/@sasha_sulim_channel\" rel=\"noopener noreferrer nofollow\">\u0421\u0430\u0448\u0438 \u0421\u0443\u043b\u0438\u043c<\/a>, \u044f \u0437\u0430\u0434\u0430\u043b\u0430\u0441\u044c \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u043c: &#171;\u0418\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u043e, \u0430 \u0435\u0441\u0442\u044c \u043b\u0438 \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f \u0432 \u043e\u0446\u0435\u043d\u043a\u0435 \u0437\u0440\u0438\u0442\u0435\u043b\u044f\u043c\u0438 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u043c\u0430\u043d\u044c\u044f\u043a\u043e\u0432 \u0432 \u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0441\u0442\u0438 \u043e\u0442 \u043f\u043e\u043b\u0430?! 
\u0418\u043b\u0438 \u043d\u0430\u043c \u043d\u0435 \u0432\u0430\u0436\u043d\u043e, \u043a\u0442\u043e \u0431\u044b\u043b \u0443\u0431\u0438\u0439\u0446\u0435\u0439 &#8212; \u043c\u0443\u0436\u0447\u0438\u043d\u0430\/\u0436\u0435\u043d\u0449\u0438\u043d\u0430?&#187; <\/p>\n<p>\u0422\u0430\u043a \u044f \u043f\u0440\u0438\u0448\u043b\u0430 \u043a \u0442\u043e\u043c\u0443, \u0447\u0442\u043e \u043c\u043e\u0433\u0443 \u0432\u0437\u044f\u0442\u044c \u0437\u0430\u0434\u0430\u0447\u043a\u0443 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043f\u043e \u043e\u0446\u0435\u043d\u043a\u0435 \u0438\u0445 \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u043e\u0441\u0442\u0438 \u0432 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 pet-\u043f\u0440\u043e\u0435\u043a\u0442\u0430. \u0422\u043e, \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u044d\u0442\u043e \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u043e\u0441\u044c, \u043f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u044e \u043e\u0446\u0435\u043d\u0438\u0442\u044c \u0432\u0430\u043c. 
<\/p>\n<p>\u0412\u0435\u0441\u044c \u043a\u043e\u0434 \u043c\u043e\u0436\u043d\u043e \u043d\u0430\u0439\u0442\u0438 \u0432\u00a0<a href=\"https:\/\/github.com\/makarstasia\/Pet-projects\/tree\/main\/Toxic_comments\" rel=\"noopener noreferrer nofollow\">github<\/a>, \u0430 \u0432 \u0440\u0430\u043c\u043a\u0430\u0445 \u0434\u0430\u043d\u043d\u043e\u0439 \u0441\u0442\u0430\u0442\u044c\u0438 \u044f \u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043e\u043f\u0438\u0448\u0443 \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u0438\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u043d\u0438\u044f \u0434\u0430\u043d\u043d\u043e\u0439 \u0442\u0435\u043c\u044b.<\/p>\n<h3>Dataset<\/h3>\n<p>\u0414\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043d\u043e\u044e \u0431\u044b\u043b \u0432\u044b\u0431\u0440\u0430\u043d <a href=\"https:\/\/www.kaggle.com\/datasets\/blackmoon\/russian-language-toxic-comments\" rel=\"noopener noreferrer nofollow\">\u0434\u0430\u0442\u0430\u0441\u0435\u0442<\/a> \u0441 Kaggle \u0438\u0437 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432, \u0441\u043e\u0431\u0440\u0430\u043d\u043d\u044b\u0445 \u0441 \u0441\u0430\u0439\u0442\u0430 <a href=\"https:\/\/2ch.hk\" rel=\"noopener noreferrer nofollow\">2ch.hk<\/a> \u0438 <a href=\"https:\/\/pikabu.ru\" rel=\"noopener noreferrer nofollow\">pikabu.ru<\/a>. \u0421\u0440\u0435\u0434\u043d\u0435\u0441\u0442\u0430\u0442\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 \u0438\u043c\u0435\u0435\u0442 \u0434\u043b\u0438\u043d\u0443 175 \u0441\u0438\u043c\u0432\u043e\u043b\u043e\u0432, \u043c\u0438\u043d\u0438\u043c\u0430\u043b\u044c\u043d\u0430\u044f \u0434\u043b\u0438\u043d\u0430 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f &#8212; 21 \u0441\u0438\u043c\u0432\u043e\u043b, \u043c\u0430\u043a\u0441\u0438\u043c\u0430\u043b\u044c\u043d\u0430\u044f &#8212; 7 403. 
<\/p>\n<h4>EDA (Exploratory Data Analysis)<\/h4>\n<p>\u0414\u043b\u044f \u043d\u0430\u0447\u0430\u043b\u0430 \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c, \u0447\u0442\u043e \u0438\u0437 \u0441\u0435\u0431\u044f \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u043d\u0430\u0448 \u0434\u0430\u0442\u0430\u0441\u0435\u0442. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043c \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 \u0430\u043d\u0430\u043b\u0438\u0437:<\/p>\n<pre><code class=\"python\">df = pd.read_csv(\".\/data\/labeled.csv\", sep=',') df.shape >>> (14412, 2)  # \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043a\u043e\u043b\u043e\u043d\u043a\u0438 \u00abtoxic\u00bb \u043a \u0442\u0438\u043f\u0443 (int) \u0434\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 df[\"toxic\"] = df[\"toxic\"].apply(int)  df[\"toxic\"].value_counts() >>> 0    9586 >>> 1    4826  # \u043f\u0440\u043e\u0432\u0435\u0440\u0438\u043c, \u0447\u0442\u043e \u043d\u0435\u0442 \u043f\u0443\u0441\u0442\u044b\u0445 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 df[df[\"toxic\"] == 0][\"comment\"].isna().sum() >>> 0<\/code><\/pre>\n<p>\u0418\u0442\u0430\u043a, \u043c\u044b \u0432\u044b\u044f\u0441\u043d\u0438\u043b\u0438, \u0447\u0442\u043e \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 14 412 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432. 
\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u0432 \u0434\u0430\u043d\u043d\u043e\u043c \u043d\u0430\u0431\u043e\u0440\u0435 \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0435\u0435: 4 826 &#8212; \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0435, 9 586 &#8212; \u043d\u0435\u0439\u0442\u0440\u0430\u043b\u044c\u043d\u044b\u0435.<\/p>\n<h3>Text preprocessing<\/h3>\n<p>\u041b\u044e\u0431\u044b\u0435 \u0441\u044b\u0440\u044b\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0443\u0436\u043d\u043e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u0435\u0441\u0442\u044c \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0432\u0430\u0436\u043d\u044b\u0445 \u044d\u0442\u0430\u043f\u043e\u0432: \u0442\u043e\u043a\u0435\u043d\u0438\u0437\u0430\u0446\u0438\u044f, \u0443\u0434\u0430\u043b\u0435\u043d\u0438\u0435 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f-\u0441\u043b\u043e\u0432, \u0430 \u0442\u0430\u043a\u0436\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433. 
\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043f\u0440\u0438\u0441\u0442\u0443\u043f\u0438\u043c!<\/p>\n<pre><code class=\"python\"># \u0432\u043e\u0437\u044c\u043c\u0435\u043c \u0434\u043b\u044f \u043f\u0440\u0438\u043c\u0435\u0440\u0430 \u043e\u0434\u0438\u043d \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 example = df.iloc[1][\"comment\"] print(f\"\u0418\u0441\u0445\u043e\u0434\u043d\u044b\u0439 \u0442\u0435\u043a\u0441\u0442: {example}\") >>> \u0418\u0441\u0445\u043e\u0434\u043d\u044b\u0439 \u0442\u0435\u043a\u0441\u0442: \u0425\u043e\u0445\u043b\u044b, \u044d\u0442\u043e \u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430 \u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e \u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430, \u043c\u043e\u043b, \u0432\u043e\u043d, \u0430 \u0443 \u0445\u043e\u0445\u043b\u043e\u0432 \u0435\u0449\u0435 \u0445\u0443\u0436\u0435. \u0415\u0441\u043b\u0438 \u0431\u044b \u0445\u043e\u0445\u043b\u043e\u0432 \u043d\u0435 \u0431\u044b\u043b\u043e, \u043a\u0438\u0441\u0435\u043b\u044c \u0438\u0445 \u0431\u044b \u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b.  
# \u0440\u0430\u0437\u043e\u0431\u044c\u0435\u043c \u043d\u0430 \u0442\u043e\u043a\u0435\u043d\u044b tokens = word_tokenize(example, language=\"russian\") print(f\"\u0422\u043e\u043a\u0435\u043d\u044b: {tokens}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b: ['\u0425\u043e\u0445\u043b\u044b', ',', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', ',', '\u043c\u043e\u043b', ',', '\u0432\u043e\u043d', ',', '\u0430', '\u0443', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0435\u0449\u0435', '\u0445\u0443\u0436\u0435', '.', '\u0415\u0441\u043b\u0438', '\u0431\u044b', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043d\u0435', '\u0431\u044b\u043b\u043e', ',', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u0438\u0445', '\u0431\u044b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b', '.']  # \u0443\u0431\u0435\u0440\u0435\u043c \u0432\u0441\u044e \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u044e \u0438 \u0441\u0442\u043e\u043f-\u0441\u043b\u043e\u0432\u0430 tokens_without_punct = [i for i in tokens if i not in string.punctuation] stop_words = stopwords.words(\"russian\") print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438: {tokens_without_punct}\") print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: {tokens_without_punct_and_stopwords}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438: ['\u0425\u043e\u0445\u043b\u044b', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', '\u043c\u043e\u043b', 
'\u0432\u043e\u043d', '\u0430', '\u0443', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0435\u0449\u0435', '\u0445\u0443\u0436\u0435', '\u0415\u0441\u043b\u0438', '\u0431\u044b', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043d\u0435', '\u0431\u044b\u043b\u043e', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u0438\u0445', '\u0431\u044b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b'] >>> \u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: ['\u0425\u043e\u0445\u043b\u044b', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', '\u043c\u043e\u043b', '\u0432\u043e\u043d', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0445\u0443\u0436\u0435', '\u0415\u0441\u043b\u0438', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b']  # \u0434\u0430\u043b\u0435\u0435 \u0421\u0442\u0435\u043c\u043c\u0438\u043d\u0433 - \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u043f\u0440\u0438\u0432\u0435\u0434\u0435\u043d\u0438\u044f \u0441\u043b\u043e\u0432 \u043a \u0438\u0445 \u0431\u0430\u0437\u043e\u0432\u043e\u0439\/\u043a\u043e\u0440\u043d\u0435\u0432\u043e\u0439 \u0444\u043e\u0440\u043c\u0435.  
tokens_without_punct_and_stopwords = [i for i in tokens_without_punct if i not in stop_words] snowball = SnowballStemmer(language=\"russian\") stemmed_tokens = [snowball.stem(i) for i in tokens_without_punct_and_stopwords] print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u043f\u043e\u0441\u043b\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433\u0430: {stemmed_tokens}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b \u043f\u043e\u0441\u043b\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433\u0430: ['\u0445\u043e\u0445\u043b', '\u044d\u0442', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d', '\u043c\u043e\u043b', '\u0432\u043e\u043d', '\u0445\u043e\u0445\u043b', '\u0445\u0443\u0436', '\u0435\u0441\u043b', '\u0445\u043e\u0445\u043b', '\u043a\u0438\u0441\u0435\u043b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430']<\/code><\/pre>\n<p>\u0422\u0430\u043a \u043a\u0430\u043a \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u0431\u0443\u0434\u0435\u0442 \u043f\u043e\u0432\u0442\u043e\u0440\u044f\u0442\u044c\u0441\u044f &#8212; \u0441\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u0434\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 \u0444\u0443\u043d\u043a\u0446\u0438\u044e, \u043f\u043e\u0432\u0442\u043e\u0440\u044f\u044e\u0449\u0443\u044e \u0432\u0441\u0435 \u0432\u044b\u0448\u0435\u043f\u0435\u0440\u0435\u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f.<\/p>\n<pre><code class=\"python\">snowball = SnowballStemmer(language=\"russian\") russian_stop_words = stopwords.words(\"russian\")  def tokenize_sentence(sentence: str, remove_stop_words: bool = True):     tokens = word_tokenize(sentence, language=\"russian\")     tokens = [i for i in tokens if i not in string.punctuation]     if remove_stop_words:    
     tokens = [i for i in tokens if i not in russian_stop_words]     tokens = [snowball.stem(i) for i in tokens]     return tokens<\/code><\/pre>\n<p>\u041e\u0442\u043b\u0438\u0447\u043d\u043e, \u0442\u0435\u043f\u0435\u0440\u044c \u0440\u0430\u0437\u0434\u0435\u043b\u0438\u043c \u043d\u0430\u0448 \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043d\u0430 \u043e\u0431\u0443\u0447\u0430\u044e\u0449\u0443\u044e \u0438 \u0442\u0435\u0441\u0442\u043e\u0432\u0443\u044e \u0432\u044b\u0431\u043e\u0440\u043a\u0443 \u0438 \u0441\u0440\u0430\u0432\u043d\u0438\u043c \u0438\u0445 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435. <\/p>\n<pre><code class=\"python\">train_df, test_df = train_test_split(df, test_size = 500, random_state=234) print(train_df.shape) print(test_df.shape) >>> (13912, 2) >>> (500, 2)  # \u0441\u0440\u0430\u0432\u043d\u0438\u043c \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u0446\u0435\u043b\u0435\u0432\u043e\u0433\u043e \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0430 for sample in [train_df, test_df]:     print(sample[sample['toxic'] == 1].shape[0] \/ sample.shape[0]) >>> 0.3356095457159287 >>> 0.314<\/code><\/pre>\n<p>\u041f\u043e\u043b\u0443\u0447\u0438\u043b\u0438 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435: <\/p>\n<div>\n<div class=\"table\">\n<table>\n<tbody>\n<tr>\n<td>\n<p align=\"left\">\u041e\u0431\u0443\u0447\u0430\u044e\u0449\u0430\u044f \u0432\u044b\u0431\u043e\u0440\u043a\u0430<\/p>\n<\/td>\n<td>\n<p align=\"left\">33.56% \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432<\/p>\n<\/td>\n<\/tr>\n<tr>\n<td>\n<p align=\"left\">\u0422\u0435\u0441\u0442\u043e\u0432\u0430\u044f \u0432\u044b\u0431\u043e\u0440\u043a\u0430<\/p>\n<\/td>\n<td>\n<p align=\"left\">31.4% \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u044b\u0445 
\u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432<\/p>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<\/div>\n<p>\u0414\u0430\u043d\u043d\u044b\u0435 \u0440\u0430\u0432\u043d\u043e\u043c\u0435\u0440\u043d\u043e \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u044b \u043f\u043e \u0432\u044b\u0431\u043e\u0440\u043a\u0430\u043c, \u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043d\u0430\u0448\u0430 \u0431\u0443\u0434\u0443\u0449\u0430\u044f \u043c\u043e\u0434\u0435\u043b\u044c \u0434\u043e\u043b\u0436\u043d\u0430 \u0430\u0434\u0435\u043a\u0432\u0430\u0442\u043d\u043e \u043e\u0446\u0435\u043d\u0438\u0432\u0430\u0442\u044c\u0441\u044f \u043d\u0430 \u0442\u0435\u0441\u0442\u043e\u0432\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445.<\/p>\n<h4>TF-IDF <\/h4>\n<p>\u041f\u0440\u0435\u0436\u0434\u0435 \u0447\u0435\u043c \u043f\u0440\u0438\u0441\u0442\u0443\u043f\u0438\u0442\u044c \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e \u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u043c\u044b \u0434\u043e\u043b\u0436\u043d\u044b \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u0442\u044c \u043d\u0430\u0448\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u0432 \u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043c\u0430\u0441\u0441\u0438\u0432\u044b. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u0432\u043e\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u0441\u044f TF-IDF \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0435\u0439.<\/p>\n<p><strong>TF<\/strong> \u0438\u0437\u043c\u0435\u0440\u044f\u0435\u0442 \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0447\u0430\u0441\u0442\u043e \u0442\u0435\u0440\u043c\u0438\u043d (\u0441\u043b\u043e\u0432\u043e) \u0432\u0441\u0442\u0440\u0435\u0447\u0430\u0435\u0442\u0441\u044f \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 TF: <\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"formula\" source=\"\\text{TF}(t, d) = \\frac{f(t, d)}{N_d}\" alt=\"\\text{TF}(t, d) = \\frac{f(t, d)}{N_d}\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/5a5\/257\/6c7\/5a52576c7332df127f940f1dd37df8c3.svg\" width=\"153\" height=\"48\"\/><\/p>\n<p>\u0433\u0434\u0435\u00a0f(t,d)\u00a0\u2014 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0432\u0445\u043e\u0436\u0434\u0435\u043d\u0438\u0439 \u0442\u0435\u0440\u043c\u0438\u043d\u0430\u00a0t \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u00a0d , \u0430\u00a0N<sub>d<\/sub>\u00a0\u2014 \u043e\u0431\u0449\u0435\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0442\u0435\u0440\u043c\u0438\u043d\u043e\u0432 \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435\u00a0d.<\/p>\n<p><strong>IDF<\/strong> \u0438\u0437\u043c\u0435\u0440\u044f\u0435\u0442 \u0432\u0430\u0436\u043d\u043e\u0441\u0442\u044c \u0442\u0435\u0440\u043c\u0438\u043d\u0430 \u043f\u043e \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u044e \u043a\u043e \u0432\u0441\u0435\u043c\u0443 \u043a\u043e\u0440\u043f\u0443\u0441\u0443 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432. \u0427\u0435\u043c \u0440\u0435\u0436\u0435 \u0442\u0435\u0440\u043c\u0438\u043d \u0432\u0441\u0442\u0440\u0435\u0447\u0430\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043f\u0443\u0441\u0435, \u0442\u0435\u043c \u0432\u044b\u0448\u0435 \u0435\u0433\u043e IDF. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 IDF:<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"formula\" source=\" \\text{IDF}(t, D) = \\log \\left( \\frac{N}{\\left|\\{d \\in D : t \\in d\\}\\right|} \\right) \" alt=\" \\text{IDF}(t, D) = \\log \\left( \\frac{N}{\\left|\\{d \\in D : t \\in d\\}\\right|} \\right) \" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/b49\/a4b\/166\/b49a4b1666de291c33c696281da1d880.svg\" width=\"307\" height=\"51\"\/><\/p>\n<p>\u0433\u0434\u0435\u00a0N\u00a0\u2014 \u043e\u0431\u0449\u0435\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u0432 \u043a\u043e\u0440\u043f\u0443\u0441\u0435\u00a0D, \u0430\u00a0\u2223{d\u2208D:t\u2208d}\u2223\u00a0\u2014 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432, \u0441\u043e\u0434\u0435\u0440\u0436\u0430\u0449\u0438\u0445 \u0442\u0435\u0440\u043c\u0438\u043d\u00a0t.<\/p>\n<p><strong>TF-IDF<\/strong> \u043e\u0431\u044a\u0435\u0434\u0438\u043d\u044f\u0435\u0442 TF \u0438 IDF \u0434\u043b\u044f \u043e\u0446\u0435\u043d\u043a\u0438 \u0432\u0430\u0436\u043d\u043e\u0441\u0442\u0438 \u0442\u0435\u0440\u043c\u0438\u043d\u0430 \u0432 \u043a\u043e\u043d\u043a\u0440\u0435\u0442\u043d\u043e\u043c \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 TF-IDF:<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"formula\" source=\" \\text{TF-IDF}(t, d, D) = \\text{TF}(t, d) \\times \\text{IDF}(t, D) \" alt=\" \\text{TF-IDF}(t, d, D) = \\text{TF}(t, d) \\times \\text{IDF}(t, D) \" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/924\/007\/201\/924007201210c0e6e7b60a37c456934d.svg\" width=\"334\" height=\"22\"\/><\/p>\n<p>\u0414\u043b\u044f \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043d\u0438\u044f TF-IDF \u043f\u0440\u0438\u043c\u0435\u043d\u0438\u043c \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0443\u00a0<code>scikit-learn<\/code>.\u00a0<\/p>\n<pre><code class=\"python\"># \u0438\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c \u0432\u0435\u043a\u0442\u043e\u0440\u0430\u0439\u0437\u0435\u0440 \u0438 \u043f\u0440\u0438\u043c\u0435\u043d\u0438\u043c \u043a \u043d\u0430\u0448\u0438\u043c \u0432\u044b\u0431\u043e\u0440\u043a\u0430\u043c count_idf_1 = TfidfVectorizer(ngram_range = (1,1), tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True)) tf_idf_base_1 = count_idf_1.fit(df['comment']) tf_idf_train_base_1 = count_idf_1.transform(train_df['comment']) tf_idf_test_base_1 = count_idf_1.transform(test_df['comment'])  # \u0432\u044b\u0432\u0435\u0434\u0435\u043c \u0440\u0430\u0437\u043c\u0435\u0440\u044b \u043c\u0430\u0442\u0440\u0438\u0446, \u0447\u0442\u043e\u0431\u044b \u0443\u0431\u0435\u0434\u0438\u0442\u044c\u0441\u044f \u0432 \u043a\u043e\u0440\u0440\u0435\u043a\u0442\u043d\u043e\u0441\u0442\u0438: print(tf_idf_train_base_1.shape) print(tf_idf_test_base_1.shape) >>> (13912, 36122) >>> (500, 36122)<\/code><\/pre>\n<p>\u0414\u043b\u044f \u043f\u0440\u0438\u043c\u0435\u0440\u0430 \u0434\u0430\u0432\u0430\u0439\u0442\u0435 \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043a\u0430\u043a 
\u043f\u0440\u043e\u0438\u0441\u0445\u043e\u0434\u0438\u0442 TF-IDF\u00a0\u043d\u0430 \u043e\u0434\u043d\u043e\u043c \u0438\u0437 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432.<\/p>\n<pre><code class=\"python\">sample = test_df.sample(n=1)['comment'] sample_tf_idf = count_idf_1.transform(sample) sample_tf_idf.shape >>> (1, 36122)  array = sample_tf_idf.toarray() array >>> array([[0., 0., 0., ..., 0., 0., 0.]])  # \u043a\u0430\u043a \u0432\u044b\u0433\u043b\u044f\u0434\u0438\u0442 \u043d\u0430\u0448 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 \u0434\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 sample >>> 12391    \u0427\u0442\u043e \u043a\u0430\u0441\u0430\u0435\u0442\u0441\u044f 3 \u043c\u043b\u043d, \u0443 \u041a\u0438\u044f \u0441\u0430\u043c\u0430\u044f \u0434\u043e\u0440\u043e\u0433\u0430\u044f \u043c\u0430\u0448\u0438\u043d\u0430...  # \u0438\u0437\u0432\u043b\u0435\u043a\u0430\u0435\u043c \u0438 \u0432\u044b\u0432\u043e\u0434\u0438\u043c \u043d\u0435\u043d\u0443\u043b\u0435\u0432\u044b\u0435 \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0443\u044e\u0442 \u0437\u043d\u0430\u0447\u0438\u043c\u044b\u043c \u0441\u043b\u043e\u0432\u0430\u043c: array[array!= 0] >>> array([0.27552192, 0.25845753, 0.24785363, 0.19574676, 0.13724815,            0.25845753, 0.13854953, 0.21636683, 0.18436214, 0.2040751 ,            0.25845753, 0.23449431, 0.13459448, 0.37887959, 0.20099479,            0.14063173, 0.15832929, 0.10074052, 0.11669742, 0.25845753,            0.25845753, 0.06473031]) <\/code><\/pre>\n<p>\u0422\u0435\u043f\u0435\u0440\u044c, \u043a\u043e\u0433\u0434\u0430 \u043d\u0430\u0448\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u0438\u043c\u0435\u044e\u0442 \u0432\u0435\u043a\u0442\u043e\u0440\u043d\u043e\u0435 
\u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435, \u043c\u044b \u043c\u043e\u0436\u0435\u043c \u043f\u0435\u0440\u0435\u0439\u0442\u0438 \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e \u043c\u043e\u0434\u0435\u043b\u0438.<\/p>\n<h4>\u041e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u043c\u043e\u0434\u0435\u043b\u0438<\/h4>\n<p>\u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 baseline \u044f \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0430 \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0443\u044e \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044e, \u0442\u0430\u043a \u043a\u0430\u043a \u043e\u043d\u0430 \u0445\u043e\u0440\u043e\u0448\u043e \u043f\u043e\u0434\u0445\u043e\u0434\u0438\u0442 \u0434\u043b\u044f \u0437\u0430\u0434\u0430\u0447\u0438 \u0431\u0438\u043d\u0430\u0440\u043d\u043e\u0439 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438.<\/p>\n<p>\u0415\u0441\u043b\u0438 \u0432\u044b \u0435\u0449\u0435 \u043d\u0435 \u0437\u043d\u0430\u043a\u043e\u043c\u044b \u0441 \u0434\u0430\u043d\u043d\u043e\u0439 \u043c\u043e\u0434\u0435\u043b\u044c\u044e, \u043d\u043e \u0443\u0436\u0435 \u0441\u043b\u044b\u0448\u0430\u043b\u0438 \u043f\u0440\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u0443\u044e \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044e, \u0442\u043e \u043c\u043e\u0436\u043d\u043e \u0441\u043a\u0430\u0437\u0430\u0442\u044c, \u0447\u0442\u043e \u0432\u044b \u043f\u043e\u0447\u0442\u0438 \u0437\u043d\u0430\u0442\u043e\u043a. 
\u0414\u0435\u043b\u043e \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044f \u043f\u043e \u0441\u0443\u0442\u0438 \u044d\u0442\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u0430\u044f \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044f, \u043a \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0443 \u043a\u043e\u0442\u043e\u0440\u043e\u0439 \u0432 \u043a\u043e\u043d\u0446\u0435 \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u0442\u0441\u044f \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u0444\u0443\u043d\u043a\u0446\u0438\u044f (\u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440, \u0441\u0438\u0433\u043c\u043e\u0438\u0434\u0430). <\/p>\n<p>\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0441\u0438\u0433\u043c\u043e\u0438\u0434\u043d\u043e\u0439 \u0444\u0443\u043d\u043a\u0446\u0438\u0438:<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"formula\" source=\"\\sigma(z) = \\frac{1}{1 + e^{-z}}\" alt=\"\\sigma(z) = \\frac{1}{1 + e^{-z}}\" src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/f2c\/986\/3f5\/f2c9863f54c618e1eee8e5409685c662.svg\" width=\"131\" height=\"46\"\/><\/p>\n<p>\u0433\u0434\u0435\u00a0z\u2014 \u043b\u0438\u043d\u0435\u0439\u043d\u0430\u044f \u043a\u043e\u043c\u0431\u0438\u043d\u0430\u0446\u0438\u044f \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u043e\u0432 \u0438 \u0438\u0445 \u0432\u0435\u0441\u043e\u0432:\u00a0z = \u03b2<sub>0<\/sub>+\u03b2<sub>1<\/sub>x<sub>1<\/sub>+\u03b2<sub>2<\/sub>x<sub>2<\/sub>+\u2026+\u03b2<sub>n<\/sub>x<sub>n<\/sub>.<\/p>\n<p>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u0435\u00a0\u03c3(z)\u00a0\u043b\u0435\u0436\u0438\u0442 \u043c\u0435\u0436\u0434\u0443 0 \u0438 1, \u0447\u0442\u043e \u0438\u043d\u0442\u0435\u0440\u043f\u0440\u0435\u0442\u0438\u0440\u0443\u0435\u0442\u0441\u044f \u043a\u0430\u043a 
\u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c.<\/p>\n<pre><code class=\"python\"># \u0438\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c model_lr_base_1 = LogisticRegression(solver='lbfgs', random_state=234, max_iter= 10000, n_jobs= -1)  # \u043e\u0431\u0443\u0447\u0438\u043c \u043c\u043e\u0434\u0435\u043b\u044c model_lr_base_1.fit(tf_idf_train_base_1, train_df['toxic'])  # \u043f\u043e\u043b\u0443\u0447\u0438\u043c \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u0435\u0439 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 predict_lr_base_proba = model_lr_base_1.predict_proba(tf_idf_test_base_1) predict_lr_base_proba >>> array([[0.85603587, 0.14396413],            [0.29448938, 0.70551062],            [0.41543358, 0.58456642],            [0.77011541, 0.22988459],            [0.62820949, 0.37179051],            ...            [0.82299013, 0.17700987]])<\/code><\/pre>\n<p>\u041a\u0430\u0436\u0434\u0430\u044f \u0441\u0442\u0440\u043e\u043a\u0430 <code>predict_lr_base_proba<\/code> \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 \u043f\u0430\u0440\u0443 \u0447\u0438\u0441\u0435\u043b: \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c \u043d\u0435 \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u043e\u0433\u043e \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f (\u043f\u0435\u0440\u0432\u043e\u0435 \u0447\u0438\u0441\u043b\u043e) \u0438 \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u043e\u0433\u043e \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f (\u0432\u0442\u043e\u0440\u043e\u0435 \u0447\u0438\u0441\u043b\u043e) \u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0435\u043d\u043d\u043e. 
<\/p>\n<h3>\u041e\u0446\u0435\u043d\u043a\u0430 \u043c\u043e\u0434\u0435\u043b\u0438<\/h3>\n<p>\u041f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u044e \u0435\u0449\u0435 \u0441\u0440\u0430\u0432\u043d\u0438\u0442\u044c \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u043e \u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0441 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u043c \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u043e\u043c.<\/p>\n<pre><code class=\"python\">def coin_classifier(X:np.array) -> np.array:     predict = np.random.uniform(0.0, 1.0, X.shape[0])     return predict coin_predict = coin_classifier(tf_idf_test_base_1)<\/code><\/pre>\n<p>\u0412\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c ROC-\u043a\u0440\u0438\u0432\u044b\u0435 \u0438 \u0432\u044b\u0432\u0435\u0434\u0435\u043c \u043c\u0430\u0442\u0440\u0438\u0446\u0443 \u043e\u0448\u0438\u0431\u043e\u043a.<\/p>\n<pre><code class=\"python\"># \u0434\u043b\u044f \u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0439 \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u0438 fpr_base, tpr_base, _ = roc_curve(test_df['toxic'], predict_lr_base_proba[:, 1]) roc_auc_base = auc(fpr_base, tpr_base)  # \u0434\u043b\u044f \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u0430  fpr_coin, tpr_coin, _ = roc_curve(test_df['toxic'], coin_predict) roc_auc_coin = auc(fpr_base, tpr_base)  fig = make_subplots(1,1,                     subplot_titles = [\"Receiver operating characteristic\"],                     x_title=\"False Positive Rate\",                     y_title = \"True Positive Rate\"                    ) fig.add_trace(go.Scatter(     x = fpr_base,     y = tpr_base,     #fill = 'tozeroy',     name = \"ROC base (area = %0.3f)\" % roc_auc_base,     )) 
fig.add_trace(go.Scatter(     x = fpr_coin,     y = tpr_coin,     mode = 'lines',     line = dict(dash = 'dash'),     name = 'Coin classifier (area = 0.5)'     )) fig.update_layout(     height = 600,     width = 800,     xaxis_showgrid=False,     xaxis_zeroline=False,     template = 'plotly_dark',     font_color = 'rgba(212, 210, 210, 1)'     )  # \u043c\u0430\u0442\u0440\u0438\u0446\u0430 \u043e\u0448\u0438\u0431\u043e\u043a confusion_matrix(test_df['toxic'],                  (predict_lr_base_proba[:, 1] > 0.5).astype('float'),                  normalize='true',                 ) >>> array([[0.97959184, 0.02040816],        [0.35031847, 0.64968153]]) <\/code><\/pre>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/04b\/c7d\/fe5\/04bc7dfe5bf72595804bc9968e6ece58.png\" width=\"800\" height=\"600\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/04b\/c7d\/fe5\/04bc7dfe5bf72595804bc9968e6ece58.png\"\/><\/figure>\n<ul>\n<li>\n<p>AUC \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u0430 \u0431\u043b\u0438\u0437\u043e\u043a \u043a 0.5, \u0447\u0442\u043e \u0441\u0432\u0438\u0434\u0435\u0442\u0435\u043b\u044c\u0441\u0442\u0432\u0443\u0435\u0442 \u043e \u0442\u043e\u043c, \u0447\u0442\u043e \u044d\u0442\u043e\u0442 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440 \u043d\u0435\u0441\u043f\u043e\u0441\u043e\u0431\u0435\u043d \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e \u0440\u0430\u0437\u043b\u0438\u0447\u0430\u0442\u044c \u043a\u043b\u0430\u0441\u0441\u044b.<\/p>\n<\/li>\n<li>\n<p>\u041c\u043e\u0434\u0435\u043b\u044c \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0439 \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 
\u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e \u043b\u0443\u0447\u0448\u0438\u0435 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u043f\u043e \u0441\u0440\u0430\u0432\u043d\u0435\u043d\u0438\u044e \u0441 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u043c \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u043e\u043c, \u0447\u0442\u043e \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0436\u0434\u0430\u0435\u0442 \u0435\u0435 \u0446\u0435\u043d\u043d\u043e\u0441\u0442\u044c \u0432 \u0437\u0430\u0434\u0430\u0447\u0435 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432.<\/p>\n<\/li>\n<\/ul>\n<h3>\u041f\u0430\u0440\u0441\u0438\u043d\u0433 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 <\/h3>\n<p>\u041d\u0430\u043a\u043e\u043d\u0435\u0446, \u043f\u0435\u0440\u0435\u0439\u0434\u0435\u043c \u043a \u0437\u0430\u043a\u043b\u044e\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u0447\u0430\u0441\u0442\u0438 &#8212; \u043a \u043d\u0430\u0448\u0438\u043c \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u043c \u043f\u043e\u0434 \u0432\u0438\u0434\u0435\u043e \u0421\u0430\u0448\u0438 \u0421\u0443\u043b\u0438\u043c! 
\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u0434\u043b\u044f \u043d\u0430\u0447\u0430\u043b\u0430 \u0441\u043f\u0430\u0440\u0441\u0438\u043c \u0432\u0441\u0435 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u0441 <a href=\"https:\/\/www.youtube.com\/watch?v=Bru4DtUe_CE&amp;t=4s\" rel=\"noopener noreferrer nofollow\">\u0432\u0438\u0434\u0435\u043e<\/a> \u043f\u0440\u043e \u0436\u0435\u043d\u0449\u0438\u043d-\u043c\u0430\u043d\u044c\u044f\u043a\u043e\u0432.<\/p>\n<pre><code class=\"python\"># \u0438\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c Chrome WebDriver \u0441 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043d\u0438\u0435\u043c chromedriver-py driver = webdriver.Chrome(executable_path=binary_path)  # \u0441\u043e\u0437\u0434\u0430\u0435\u043c \u0441\u043f\u0438\u0441\u043e\u043a \u0434\u043b\u044f \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u043e\u0432 \u043f\u0430\u0440\u0441\u0438\u043d\u0433\u0430 scrapped = []  # \u0443\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u043c \u0432\u0440\u0435\u043c\u044f \u043e\u0436\u0438\u0434\u0430\u043d\u0438\u044f \u0432 \u0441\u0435\u043a\u0443\u043d\u0434\u0430\u0445 \u0438 URL \u0432\u0438\u0434\u0435\u043e wait = WebDriverWait(driver, 10) driver.get(\"https:\/\/www.youtube.com\/watch?v=Bru4DtUe_CE&amp;t=4s\")  # \u0437\u0430\u0434\u0430\u0435\u043c \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u043f\u0440\u043e\u043a\u0440\u0443\u0442\u043e\u043a \u0434\u043b\u044f \u0437\u0430\u0433\u0440\u0443\u0437\u043a\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 for item in tqdm(range(200)):     wait.until(EC.visibility_of_element_located((By.TAG_NAME, \"body\"))).send_keys(Keys.END)     time.sleep(2)  # \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u043c \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u043f\u043e \u0442\u044d\u0433\u0443 \"#content\" for comment in 
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, \"#content\"))):     scrapped.append(comment.text)  # \u0417\u0430\u043a\u0440\u044b\u0432\u0430\u0435\u043c \u0431\u0440\u0430\u0443\u0437\u0435\u0440 driver.quit()<\/code><\/pre>\n<p>\u0422\u0435\u043f\u0435\u0440\u044c \u043e\u0442\u0447\u0438\u0441\u0442\u0438\u043c \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u043e\u0442 \u043b\u0438\u0448\u043d\u0435\u0433\u043e \u0438 \u0441\u043e\u0445\u0440\u0430\u043d\u0438\u043c \u0438\u0445 \u0441\u0435\u0431\u0435.<\/p>\n<pre><code class=\"python\">comments = [] for part in scrapped[0].split('\u043d\u0430\u0437\u0430\u0434'):     split_part = part.split('\\n\u041e\u0422\u0412\u0415\u0422\u0418\u0422\u042c')[0].split('\\n')     if len(split_part) > 1:         comments.append(split_part[1]) comments = comments[3:]  # \u0443\u0434\u0430\u043b\u0438\u043c \u043b\u0438\u0448\u043d\u0438\u0435  comments_woman = comments + scrapped[1:] comments_woman_df = pd.DataFrame({'comment':comments_woman})  comments_woman_df.to_csv('\/Users\/amakarshina\/Desktop\/Toxic_comments\/Pet-projects\/Toxic_comments\/data\/' + 'comments_woman.csv') comments_woman_df = comments_woman_df[comments_woman_df['comment'].str.len() > 0] comments_woman_df<\/code><\/pre>\n<div class=\"floating-image\">\n<figure class=\"float full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/d1a\/b5a\/159\/d1ab5a1596e36e29fe3c1468f2091e08.png\" alt=\"\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u0443\u0431\u0438\u0439\u0446-\u0436\u0435\u043d\u0449\u0438\u043d.\" title=\"\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e 
\u0443\u0431\u0438\u0439\u0446-\u0436\u0435\u043d\u0449\u0438\u043d.\" width=\"754\" height=\"548\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/d1a\/b5a\/159\/d1ab5a1596e36e29fe3c1468f2091e08.png\"\/><\/p>\n<div><figcaption>\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u0443\u0431\u0438\u0439\u0446-\u0436\u0435\u043d\u0449\u0438\u043d.<\/figcaption><\/div>\n<\/figure>\n<p>\u0412\u0441\u0435\u0433\u043e \u043f\u043e\u0434 \u0432\u0438\u0434\u0435\u043e \u043e \u0436\u0435\u043d\u0449\u0438\u043d\u0430\u0445-\u0443\u0431\u0438\u0439\u0446\u0430\u0445 \u043d\u0430 \u043c\u043e\u043c\u0435\u043d\u0442 \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u0438\u044f \u044d\u0442\u043e\u0433\u043e \u043f\u0440\u043e\u0435\u043a\u0442\u0430 \u0431\u044b\u043b\u043e 2 358 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432. <\/p>\n<\/p>\n<\/div>\n<p>\u0422\u0435\u043f\u0435\u0440\u044c \u043f\u043e\u0432\u0442\u043e\u0440\u0438\u043c \u043f\u0430\u0440\u0441\u0438\u043d\u0433 \u0434\u043b\u044f <a href=\"https:\/\/www.youtube.com\/watch?v=_8bXHh3pOvA&amp;t=156s\" rel=\"noopener noreferrer nofollow\">\u0432\u0438\u0434\u0435\u043e<\/a> \u043f\u0440\u043e \u043c\u0430\u043d\u044c\u044f\u043a\u0430-\u043c\u0443\u0436\u0447\u0438\u043d\u0443. 
<\/p>\n<pre><code class=\"python\">driver = webdriver.Chrome(executable_path=binary_path) scrapped_man = [] wait = WebDriverWait(driver, 10) driver.get(\"https:\/\/www.youtube.com\/watch?v=_8bXHh3pOvA&amp;t=156s\") for item in tqdm(range(200)):     wait.until(EC.visibility_of_element_located((By.TAG_NAME, \"body\"))).send_keys(Keys.END)     time.sleep(2) for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, \"#content\"))):     scrapped_man.append(comment.text) driver.quit() # \u043e\u0442\u0447\u0438\u0441\u0442\u0438\u043c \u043e\u0442 \u043b\u0438\u0448\u043d\u0435\u0433\u043e comments_man = [] for part in scrapped_man[0].split('\u043d\u0430\u0437\u0430\u0434'):     split_part = part.split('\\n\u041e\u0422\u0412\u0415\u0422\u0418\u0422\u042c')[0].split('\\n')     if len(split_part) > 1:         comments_man.append(split_part[1]) # \u0441\u043e\u0445\u0440\u0430\u043d\u0438\u043c comments_man = comments_man + scrapped[1:] comments_man_df = pd.DataFrame({'comment':comments_man})  comments_man_df.to_csv('\/Users\/amakarshina\/Desktop\/Toxic_comments\/Pet-projects\/Toxic_comments\/data\/' + 'comments_man.csv') comments_man_df = comments_man_df[comments_man_df['comment'].str.len() > 0]<\/code><\/pre>\n<div class=\"floating-image\">\n<figure class=\"float full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/962\/34d\/353\/96234d3530d26399eb362a5881351932.png\" alt=\"\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u0443\u0431\u0438\u0439\u0446\u0443-\u043c\u0443\u0436\u0447\u0438\u043d\u0443.\" title=\"\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e 
\u0443\u0431\u0438\u0439\u0446\u0443-\u043c\u0443\u0436\u0447\u0438\u043d\u0443.\" width=\"738\" height=\"536\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/962\/34d\/353\/96234d3530d26399eb362a5881351932.png\"\/><\/p>\n<div><figcaption>\u041f\u0440\u0438\u043c\u0435\u0440 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0438\u0437 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u0443\u0431\u0438\u0439\u0446\u0443-\u043c\u0443\u0436\u0447\u0438\u043d\u0443.<\/figcaption><\/div>\n<\/figure>\n<p>\u041f\u043e\u0434 \u0440\u043e\u043b\u0438\u043a\u043e\u043c \u043f\u0440\u043e \u0414\u0436\u0435\u043a\u0430-\u043f\u043e\u0442\u0440\u043e\u0448\u0438\u0442\u0435\u043b\u044f \u043d\u0430 \u043c\u043e\u043c\u0435\u043d\u0442 \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u0438\u044f \u044d\u0442\u043e\u0433\u043e \u043f\u0440\u043e\u0435\u043a\u0442\u0430 \u0431\u044b\u043b\u043e 2 323 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f. 
<\/p>\n<\/div>\n<h4>\u041a\u043b\u044e\u0447\u0435\u0432\u044b\u0435 \u0441\u043b\u043e\u0432\u0430 <\/h4>\n<p>\u0414\u043b\u044f \u0431\u043e\u043b\u044c\u0448\u0435\u0439 \u043d\u0430\u0433\u043b\u044f\u0434\u043d\u043e\u0441\u0442\u0438 \u0432\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c \u043a\u043b\u044e\u0447\u0435\u0432\u044b\u0435 \u0441\u043b\u043e\u0432\u0430, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0447\u0430\u0449\u0435 \u0432\u0441\u0435\u0433\u043e \u0432\u0441\u0442\u0440\u0435\u0447\u0430\u044e\u0442\u0441\u044f \u0432 \u043d\u0430\u0448\u0438\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u0445.<\/p>\n<pre><code class=\"python\">man_counter = CountVectorizer(ngram_range=(1, 1)) woman_counter = CountVectorizer(ngram_range=(1, 1))  # \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u043c \u0441\u0447\u0435\u0442\u0447\u0438\u043a\u0438 \u043a \u0442\u0435\u043a\u0441\u0442\u0430\u043c man_count = man_counter.fit_transform(comments_man_df['text_clear']) woman_count = woman_counter.fit_transform(comments_woman_df['text_clear'])  # \u0441\u043e\u0437\u0434\u0430\u0435\u043c DataFrame \u0441 \u0447\u0430\u0441\u0442\u043e\u0442\u0430\u043c\u0438 \u0441\u043b\u043e\u0432 man_frequence = pd.DataFrame(     {'word': man_counter.get_feature_names_out(),      'frequency': man_count.toarray().sum(axis=0)} ).sort_values(by='frequency', ascending=False)  woman_frequence = pd.DataFrame(     {'word': woman_counter.get_feature_names_out(),      'frequency': woman_count.toarray().sum(axis=0)} ).sort_values(by='frequency', ascending=False) display(man_frequence.shape[0]) display(woman_frequence.shape[0])  # \u0444\u0438\u043b\u044c\u0442\u0440\u0443\u0435\u043c \u0443\u043d\u0438\u043a\u0430\u043b\u044c\u043d\u044b\u0435 \u0441\u043b\u043e\u0432\u0430 man_frequence_filtered = man_frequence.query('word not in @woman_frequence.word')[:100] woman_frequence_filtered = woman_frequence.query('word not in 
@man_frequence.word')[:100]  # \u0421\u043e\u0437\u0434\u0430\u0435\u043c \u043e\u0431\u043b\u0430\u043a\u043e \u0441\u043b\u043e\u0432 wordcloud_man = WordCloud(     background_color=\"black\",     colormap='Blues',     max_words=200,     width=1600,     height=1600 ).generate_from_frequencies(dict(man_frequence_filtered.values))  # \u0441\u043e\u0437\u0434\u0430\u0435\u043c \u043e\u0431\u043b\u0430\u043a\u043e \u0441\u043b\u043e\u0432 wordcloud_woman = WordCloud(     background_color=\"black\",     colormap='Oranges',     max_words=200,     width=1600,     height=1600 ).generate_from_frequencies(dict(woman_frequence.values))  # \u0412\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c fig, ax = plt.subplots(1, 2, figsize=(20, 12))  ax[0].imshow(wordcloud_man, interpolation='bilinear') ax[1].imshow(wordcloud_woman, interpolation='bilinear')  ax[0].set_title(     f'\u0422\u043e\u043f 100 \u0441\u043b\u043e\u0432 \u043d\u0430\u0438\u0431\u043e\u043b\u0435\u0435 \u0447\u0430\u0441\u0442\u043e\u0442\u043d\u044b\u0445,\\n \u0443\u043d\u0438\u043a\u0430\u043b\u044c\u043d\u044b\u0445 \u0441\u043b\u043e\u0432 \u0432 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u0445 \u043c\u0443\u0436\u0447\u0438\u043d',     fontsize=20 ) ax[1].set_title(     f'\u0422\u043e\u043f 100 \u0441\u043b\u043e\u0432 \u043d\u0430\u0438\u0431\u043e\u043b\u0435\u0435 \u0447\u0430\u0441\u0442\u043e\u0442\u043d\u044b\u0445,\\n \u0443\u043d\u0438\u043a\u0430\u043b\u044c\u043d\u044b\u0445 \u0441\u043b\u043e\u0432 \u0432 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u0445 \u0436\u0435\u043d\u0449\u0438\u043d',     fontsize=20 )  ax[0].axis(\"off\") ax[1].axis(\"off\")  plt.show()<\/code><\/pre>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/53c\/a41\/a98\/53ca41a982fbeec91bd5eca7fdf9a1bf.png\" width=\"1569\" height=\"787\" 
data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/53c\/a41\/a98\/53ca41a982fbeec91bd5eca7fdf9a1bf.png\"\/><\/figure>\n<h3>\u041e\u0446\u0435\u043d\u043a\u0430 \u043c\u043e\u0434\u0435\u043b\u0438 \u043d\u0430 \u043d\u0430\u0448\u0438\u0445 \u0432\u0438\u0434\u0435\u043e <\/h3>\n<p>\u041f\u0435\u0440\u0435\u0439\u0434\u0435\u043c \u043a \u0437\u0430\u043a\u043b\u044e\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043e\u0446\u0435\u043d\u043a\u0435: \u043d\u0430\u0439\u0434\u0435\u043c \u0434\u043e\u043b\u0438 \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043f\u0440\u0438 \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u043e\u043c \u043f\u043e\u0440\u043e\u0433\u043e\u0432\u043e\u043c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0438.<\/p>\n<pre><code class=\"python\">woman_share_neg = (comments_woman_df['negative_proba'] >  0.575758).sum() \/ comments_woman_df.shape[0] woman_share_neg >>> 0.766156462585034  man_share_neg = (comments_man_df['negative_proba'] >  0.575758).sum() \/ comments_man_df.shape[0] man_share_neg >>> 0.7492447129909365<\/code><\/pre>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/407\/aff\/f18\/407afff18ced7209b77a9a099b7a2149.png\" width=\"631\" height=\"500\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/407\/aff\/f18\/407afff18ced7209b77a9a099b7a2149.png\"\/><\/figure>\n<h4>\u0412\u044b\u0432\u043e\u0434\u044b<\/h4>\n<ul>\n<li>\n<p><strong>\u0412\u044b\u0441\u043e\u043a\u0430\u044f \u0434\u043e\u043b\u044f \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432<\/strong>: \u041e\u0431\u0430 \u0432\u0438\u0434\u0435\u043e \u0438\u043c\u0435\u044e\u0442 \u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u0443\u044e 
\u0434\u043e\u043b\u044e \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u043e \u043e\u043a\u0440\u0430\u0448\u0435\u043d\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432, \u043f\u0440\u0435\u0432\u044b\u0448\u0430\u044e\u0449\u0443\u044e 70%. \u042d\u0442\u043e \u0443\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 \u043d\u0430 \u0442\u043e, \u0447\u0442\u043e \u043f\u043e\u0434 TRUE CRIME \u0440\u043e\u043b\u0438\u043a\u0430\u043c\u0438 \u0431\u043e\u0301\u043b\u044c\u0448\u0430\u044f \u0447\u0430\u0441\u0442\u044c \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u0434\u0435\u0439\u0441\u0442\u0432\u0438\u0442\u0435\u043b\u044c\u043d\u043e \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u0430\u044f.<\/p>\n<\/li>\n<li>\n<p><strong>\u041d\u0435\u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0435 \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u0435 \u043c\u0435\u0436\u0434\u0443 \u043f\u043e\u043b\u0430\u043c\u0438<\/strong>: \u0414\u043e\u043b\u044f \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043f\u043e\u0434 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u0443\u0431\u0438\u0439\u0446-\u0436\u0435\u043d\u0449\u0438\u043d \u043d\u0435\u043c\u043d\u043e\u0433\u043e \u043f\u0440\u0435\u0432\u044b\u0448\u0430\u0435\u0442 \u0434\u043e\u043b\u044e \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043f\u043e\u0434 \u0440\u043e\u043b\u0438\u043a\u043e\u043c \u043f\u0440\u043e \u043c\u0430\u043d\u044c\u044f\u043a\u043e\u0432-\u043c\u0443\u0436\u0447\u0438\u043d (0.766 \u043f\u0440\u043e\u0442\u0438\u0432 0.749). 
\u042d\u0442\u043e \u0443\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442 \u043d\u0430 \u0442\u043e, \u0447\u0442\u043e \u0432 \u0446\u0435\u043b\u043e\u043c \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f \u0432 \u0442\u043e\u043d\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043c\u0435\u0436\u0434\u0443 \u044d\u0442\u0438\u043c\u0438 \u0434\u0432\u0443\u043c\u044f \u0442\u0438\u043f\u0430\u043c\u0438 \u0432\u0438\u0434\u0435\u043e \u043d\u0435\u0437\u043d\u0430\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u044b.<\/p>\n<\/li>\n<\/ul>\n<p>\u041d\u0430\u0434\u0435\u044e\u0441\u044c, \u0447\u0442\u043e \u044d\u0442\u043e \u043d\u0435\u0431\u043e\u043b\u044c\u0448\u043e\u0435 \u0438\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u043d\u0438\u0435 \u0431\u044b\u043b\u043e \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u043e \u0434\u043b\u044f \u0432\u0430\u0441, \u0431\u0443\u0434\u0443 \u0440\u0430\u0434\u0430, \u0435\u0441\u043b\u0438 \u043f\u043e\u0434\u043f\u0438\u0448\u0435\u0442\u0435\u0441\u044c \u043d\u0430 \u043c\u0435\u043d\u044f \u0442\u0443\u0442 \u0438\u043b\u0438 \u043d\u0430\u00a0<a href=\"https:\/\/t.me\/neuronwet\" rel=\"noopener noreferrer nofollow\">telegram-\u043a\u0430\u043d\u0430\u043b<\/a>, \u0432 \u043a\u043e\u0442\u043e\u0440\u043e\u043c \u043f\u0438\u0448\u0443 \u043f\u0440\u043e \u0441\u0432\u043e\u0435 \u0440\u0430\u0437\u0432\u0438\u0442\u0438\u0435 \u0432 \u043e\u0431\u043b\u0430\u0441\u0442\u0438 Data Science \u0438 \u0434\u0435\u043b\u044e\u0441\u044c \u043f\u0440\u043e\u0433\u0440\u0435\u0441\u0441\u043e\u043c. 
\u0412\u0441\u0435\u043c \u0436\u0435\u043b\u0430\u044e \u043a\u043b\u0430\u0441\u0441\u043d\u044b\u0445 \u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0432!<\/p>\n<\/p>\n<\/div>\n<\/div>\n<\/div>\n<p><!----><!----><\/div>\n<p><!----><!----><br \/> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/articles\/830100\/\"> https:\/\/habr.com\/ru\/articles\/830100\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<div><!--[--><!--]--><\/div>\n<div id=\"post-content-body\">\n<div>\n<div class=\"article-formatted-body article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<figure class=\"full-width\"><\/figure>\n<p>\u041f\u0440\u0438\u0432\u0435\u0442! \u042f \u0442\u0443\u0442 \u0430\u043a\u0442\u0438\u0432\u043d\u043e \u043f\u044b\u0442\u0430\u044e\u0441\u044c \u043e\u0445\u0432\u0430\u0442\u0438\u0442\u044c \u0440\u0430\u0437\u043d\u044b\u0435 \u043e\u0431\u043b\u0430\u0441\u0442\u0438 \u0432 \u0441\u0444\u0435\u0440\u0435 Data Science \u0438 \u0440\u0435\u0448\u0438\u043b\u0430, \u0447\u0442\u043e \u0431\u044b\u043b\u043e \u0431\u044b \u043a\u043b\u0430\u0441\u0441\u043d\u043e \u043f\u043e\u043a\u043e\u043f\u0430\u0442\u044c\u0441\u044f c \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u043e\u0439 \u0435\u0441\u0442\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0433\u043e \u044f\u0437\u044b\u043a\u0430 (<em>NLP<\/em>) \u043d\u0430 \u043f\u0440\u0438\u043c\u0435\u0440\u0435 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 YouTube. 
\u0422\u0430\u043a \u043a\u0430\u043a \u043f\u043e\u0441\u043b\u0435 \u0440\u0430\u0431\u043e\u0442\u044b \u044f \u0447\u0430\u0441\u0442\u043e \u0441\u043c\u043e\u0442\u0440\u044e \u0432\u0438\u0434\u0435\u043e <a href=\"https:\/\/www.youtube.com\/@sasha_sulim_channel\" rel=\"noopener noreferrer nofollow\">\u0421\u0430\u0448\u0438 \u0421\u0443\u043b\u0438\u043c<\/a>, \u044f \u0437\u0430\u0434\u0430\u043b\u0430\u0441\u044c \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u043c: &#171;\u0418\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u043e, \u0430 \u0435\u0441\u0442\u044c \u043b\u0438 \u0440\u0430\u0437\u043b\u0438\u0447\u0438\u044f \u0432 \u043e\u0446\u0435\u043d\u043a\u0435 \u0437\u0440\u0438\u0442\u0435\u043b\u044f\u043c\u0438 \u0432\u0438\u0434\u0435\u043e \u043f\u0440\u043e \u043c\u0430\u043d\u044c\u044f\u043a\u043e\u0432 \u0432 \u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0441\u0442\u0438 \u043e\u0442 \u043f\u043e\u043b\u0430!? \u0418\u043b\u0438 \u043d\u0430\u043c \u043d\u0435 \u0432\u0430\u0436\u043d\u043e, \u043a\u0442\u043e \u0431\u044b\u043b \u0443\u0431\u0438\u0439\u0446\u0435\u0439 &#8212; \u043c\u0443\u0436\u0447\u0438\u043d\u0430\/\u0436\u0435\u043d\u0449\u0438\u043d\u0430?&#187; <\/p>\n<p>\u0422\u0430\u043a \u044f \u043f\u0440\u0438\u0448\u043b\u0430 \u043a \u0442\u043e\u043c\u0443, \u0447\u0442\u043e \u043c\u043e\u0433\u0443 \u0432\u0437\u044f\u0442\u044c \u0437\u0430\u0434\u0430\u0447\u043a\u0443 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432 \u043f\u043e \u043e\u0446\u0435\u043d\u043a\u0435 \u0438\u0445 \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u043e\u0441\u0442\u0438 \u0432 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 pet-\u043f\u0440\u043e\u0435\u043a\u0442\u0430. 
\u0422\u043e, \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u044d\u0442\u043e \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u043e\u0441\u044c, \u043f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u044e \u043e\u0446\u0435\u043d\u0438\u0442\u044c \u0432\u0430\u043c. <\/p>\n<p>\u0412\u0435\u0441\u044c \u043a\u043e\u0434 \u043c\u043e\u0436\u043d\u043e \u043d\u0430\u0439\u0442\u0438 \u0432\u00a0<a href=\"https:\/\/github.com\/makarstasia\/Pet-projects\/tree\/main\/Toxic_comments\" rel=\"noopener noreferrer nofollow\">github<\/a>, \u0430 \u0432 \u0440\u0430\u043c\u043a\u0430\u0445 \u0434\u0430\u043d\u043d\u043e\u0439 \u0441\u0442\u0430\u0442\u044c\u0438 \u044f \u043f\u043e\u0434\u0440\u043e\u0431\u043d\u0435\u0435 \u043e\u043f\u0438\u0448\u0443 \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u0438\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u043d\u0438\u044f \u0434\u0430\u043d\u043d\u043e\u0439 \u0442\u0435\u043c\u044b.<\/p>\n<h3>Dataset<\/h3>\n<p>\u0414\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043d\u043e\u044e \u0431\u044b\u043b \u0432\u044b\u0431\u0440\u0430\u043d <a href=\"https:\/\/www.kaggle.com\/datasets\/blackmoon\/russian-language-toxic-comments\" rel=\"noopener noreferrer nofollow\">\u0434\u0430\u0442\u0430\u0441\u0435\u0442<\/a> \u0441 Kaggle \u0438\u0437 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432, \u0441\u043e\u0431\u0440\u0430\u043d\u043d\u044b\u0445 \u0441 \u0441\u0430\u0439\u0442\u0430 <a href=\"https:\/\/2ch.hk\" rel=\"noopener noreferrer nofollow\">2ch.hk<\/a> \u0438 <a href=\"https:\/\/pikabu.ru\" rel=\"noopener noreferrer nofollow\">pikabu.ru<\/a>. 
\u0421\u0440\u0435\u0434\u043d\u0435\u0441\u0442\u0430\u0442\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 \u0438\u043c\u0435\u0435\u0442 \u0434\u043b\u0438\u043d\u0443 175 \u0441\u0438\u043c\u0432\u043e\u043b\u043e\u0432, \u043c\u0438\u043d\u0438\u043c\u0430\u043b\u044c\u043d\u0430\u044f \u0434\u043b\u0438\u043d\u0430 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f &#8212; 21 \u0441\u0438\u043c\u0432\u043e\u043b, \u043c\u0430\u043a\u0441\u0438\u043c\u0430\u043b\u044c\u043d\u0430\u044f &#8212; 7 403. <\/p>\n<h4>EDA (Exploratory Data Analysis)<\/h4>\n<p>\u0414\u043b\u044f \u043d\u0430\u0447\u0430\u043b\u0430 \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u0447\u0442\u043e \u0438\u0437 \u0441\u0435\u0431\u044f \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u043d\u0430\u0448 \u0434\u0430\u0442\u0430\u0441\u0435\u0442. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043c \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 \u0430\u043d\u0430\u043b\u0438\u0437:<\/p>\n<pre><code class=\"python\">df = pd.read_csv(\".\/data\/labeled.csv\", sep=',') df.shape >>> (14412, 2)  # \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u0443\u0435\u043c \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u044f \u043a\u043e\u043b\u043e\u043d\u043a\u0438 \u00abtoxic\u00bb \u043a \u0442\u0438\u043f\u0443 (int) \u0434\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 df[\"toxic\"] = df[\"toxic\"].apply(int)  df[\"toxic\"].value_counts() >>> 0    9586 >>> 1    4826  # \u043f\u0440\u043e\u0432\u0435\u0440\u0438\u043c, \u0447\u0442\u043e \u043d\u0435\u0442 \u043f\u0443\u0441\u0442\u044b\u0445 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 df[df[\"toxic\"] == 0][\"comment\"].isna().sum() >>> 0<\/code><\/pre>\n<p>\u0418\u0442\u0430\u043a, \u043c\u044b 
\u0432\u044b\u044f\u0441\u043d\u0438\u043b\u0438, \u0447\u0442\u043e \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 14 412 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432. \u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u0432 \u0434\u0430\u043d\u043d\u043e\u043c \u043d\u0430\u0431\u043e\u0440\u0435 \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0435\u0435: 4 826 &#8212; \u043d\u0435\u0433\u0430\u0442\u0438\u0432\u043d\u044b\u0435, 9 586 &#8212; \u043d\u0435\u0439\u0442\u0440\u0430\u043b\u044c\u043d\u044b\u0435.<\/p>\n<h3>Text preprocessing<\/h3>\n<p>\u041b\u044e\u0431\u044b\u0435 \u0441\u044b\u0440\u044b\u0435 \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0443\u0436\u043d\u043e \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u0435\u0441\u0442\u044c \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0432\u0430\u0436\u043d\u044b\u0445 \u044d\u0442\u0430\u043f\u043e\u0432: \u0442\u043e\u043a\u0435\u043d\u0438\u0437\u0430\u0446\u0438\u044f, \u0443\u0434\u0430\u043b\u0435\u043d\u0438\u0435 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f-\u0441\u043b\u043e\u0432, \u0430 \u0442\u0430\u043a\u0436\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433. 
\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043f\u0440\u0438\u0441\u0442\u0443\u043f\u0438\u043c!<\/p>\n<pre><code class=\"python\"># \u0432\u043e\u0437\u044c\u043c\u0435\u043c \u0434\u043b\u044f \u043f\u0440\u0438\u043c\u0435\u0440\u0430 \u043e\u0434\u0438\u043d \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 example = df.iloc[1][\"comment\"] print(f\"\u0418\u0441\u0445\u043e\u0434\u043d\u044b\u0439 \u0442\u0435\u043a\u0441\u0442: {example}\") >>> \u0418\u0441\u0445\u043e\u0434\u043d\u044b\u0439 \u0442\u0435\u043a\u0441\u0442: \u0425\u043e\u0445\u043b\u044b, \u044d\u0442\u043e \u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430 \u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e \u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430, \u043c\u043e\u043b, \u0432\u043e\u043d, \u0430 \u0443 \u0445\u043e\u0445\u043b\u043e\u0432 \u0435\u0449\u0435 \u0445\u0443\u0436\u0435. \u0415\u0441\u043b\u0438 \u0431\u044b \u0445\u043e\u0445\u043b\u043e\u0432 \u043d\u0435 \u0431\u044b\u043b\u043e, \u043a\u0438\u0441\u0435\u043b\u044c \u0438\u0445 \u0431\u044b \u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b.  
# \u0440\u0430\u0437\u043e\u0431\u044c\u0435\u043c \u043d\u0430 \u0442\u043e\u043a\u0435\u043d\u044b tokens = word_tokenize(example, language=\"russian\") print(f\"\u0422\u043e\u043a\u0435\u043d\u044b: {tokens}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b: ['\u0425\u043e\u0445\u043b\u044b', ',', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', ',', '\u043c\u043e\u043b', ',', '\u0432\u043e\u043d', ',', '\u0430', '\u0443', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0435\u0449\u0435', '\u0445\u0443\u0436\u0435', '.', '\u0415\u0441\u043b\u0438', '\u0431\u044b', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043d\u0435', '\u0431\u044b\u043b\u043e', ',', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u0438\u0445', '\u0431\u044b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b', '.']  # \u0443\u0431\u0435\u0440\u0435\u043c \u0432\u0441\u044e \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u044e \u0438 \u0441\u0442\u043e\u043f-\u0441\u043b\u043e\u0432\u0430 tokens_without_punct = [i for i in tokens if i not in string.punctuation] stop_words = stopwords.words(\"russian\") tokens_without_punct_and_stopwords = [i for i in tokens_without_punct if i not in stop_words] print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438: {tokens_without_punct}\") print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: {tokens_without_punct_and_stopwords}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438: ['\u0425\u043e\u0445\u043b\u044b', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', '\u043c\u043e\u043b', 
'\u0432\u043e\u043d', '\u0430', '\u0443', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0435\u0449\u0435', '\u0445\u0443\u0436\u0435', '\u0415\u0441\u043b\u0438', '\u0431\u044b', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043d\u0435', '\u0431\u044b\u043b\u043e', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u0438\u0445', '\u0431\u044b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b'] >>> \u0422\u043e\u043a\u0435\u043d\u044b \u0431\u0435\u0437 \u043f\u0443\u043d\u043a\u0442\u0443\u0430\u0446\u0438\u0438 \u0438 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: ['\u0425\u043e\u0445\u043b\u044b', '\u044d\u0442\u043e', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d\u0430', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d\u043e\u0433\u043e', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d\u0430', '\u043c\u043e\u043b', '\u0432\u043e\u043d', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u0445\u0443\u0436\u0435', '\u0415\u0441\u043b\u0438', '\u0445\u043e\u0445\u043b\u043e\u0432', '\u043a\u0438\u0441\u0435\u043b\u044c', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430\u043b']  # \u0434\u0430\u043b\u0435\u0435 \u0421\u0442\u0435\u043c\u043c\u0438\u043d\u0433 - \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u043f\u0440\u0438\u0432\u0435\u0434\u0435\u043d\u0438\u044f \u0441\u043b\u043e\u0432 \u043a \u0438\u0445 \u0431\u0430\u0437\u043e\u0432\u043e\u0439\/\u043a\u043e\u0440\u043d\u0435\u0432\u043e\u0439 \u0444\u043e\u0440\u043c\u0435.  
snowball = SnowballStemmer(language=\"russian\") stemmed_tokens = [snowball.stem(i) for i in tokens_without_punct_and_stopwords] print(f\"\u0422\u043e\u043a\u0435\u043d\u044b \u043f\u043e\u0441\u043b\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433\u0430: {stemmed_tokens}\") >>> \u0422\u043e\u043a\u0435\u043d\u044b \u043f\u043e\u0441\u043b\u0435 \u0441\u0442\u0435\u043c\u043c\u0438\u043d\u0433\u0430: ['\u0445\u043e\u0445\u043b', '\u044d\u0442', '\u043e\u0442\u0434\u0443\u0448\u0438\u043d', '\u0437\u0430\u0442\u044e\u043a\u0430\u043d', '\u0440\u043e\u0441\u0441\u0438\u044f\u043d\u0438\u043d', '\u043c\u043e\u043b', '\u0432\u043e\u043d', '\u0445\u043e\u0445\u043b', '\u0445\u0443\u0436', '\u0435\u0441\u043b', '\u0445\u043e\u0445\u043b', '\u043a\u0438\u0441\u0435\u043b', '\u043f\u0440\u0438\u0434\u0443\u043c\u0430']<\/code><\/pre>\n<p>\u0422\u0430\u043a \u043a\u0430\u043a \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u043f\u0440\u0435\u0434\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u0431\u0443\u0434\u0435\u0442 \u043f\u043e\u0432\u0442\u043e\u0440\u044f\u0442\u044c\u0441\u044f &#8212; \u0441\u043e\u0437\u0434\u0430\u0434\u0438\u043c \u0434\u043b\u044f \u0443\u0434\u043e\u0431\u0441\u0442\u0432\u0430 \u0444\u0443\u043d\u043a\u0446\u0438\u044e, \u043f\u043e\u0432\u0442\u043e\u0440\u044f\u044e\u0449\u0443\u044e \u0432\u0441\u0435 \u0432\u044b\u0448\u0435\u043f\u0435\u0440\u0435\u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f.<\/p>\n<pre><code class=\"python\">snowball = SnowballStemmer(language=\"russian\") russian_stop_words = stopwords.words(\"russian\")  def tokenize_sentence(sentence: str, remove_stop_words: bool = True):     tokens = word_tokenize(sentence, language=\"russian\")     tokens = [i for i in tokens if i not in string.punctuation]     if remove_stop_words:    
     tokens = [i for i in tokens if i not in russian_stop_words]     tokens = [snowball.stem(i) for i in tokens]     return tokens<\/code><\/pre>\n<p>\u041e\u0442\u043b\u0438\u0447\u043d\u043e, \u0442\u0435\u043f\u0435\u0440\u044c \u0440\u0430\u0437\u0434\u0435\u043b\u0438\u043c \u043d\u0430\u0448 \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043d\u0430 \u043e\u0431\u0443\u0447\u0430\u044e\u0449\u0443\u044e \u0438 \u0442\u0435\u0441\u0442\u043e\u0432\u0443\u044e \u0432\u044b\u0431\u043e\u0440\u043a\u0443 \u0438 \u0441\u0440\u0430\u0432\u043d\u0438\u043c \u0438\u0445 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435. <\/p>\n<pre><code class=\"python\">train_df, test_df = train_test_split(df, test_size = 500, random_state=234) print(train_df.shape) print(test_df.shape) >>> (13912, 2) >>> (500, 2)  # \u0441\u0440\u0430\u0432\u043d\u0438\u043c \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u0446\u0435\u043b\u0435\u0432\u043e\u0433\u043e \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0430 for sample in [train_df, test_df]:     print(sample[sample['toxic'] == 1].shape[0] \/ sample.shape[0]) >>> 0.3356095457159287 >>> 0.314<\/code><\/pre>\n<p>\u041f\u043e\u043b\u0443\u0447\u0438\u043b\u0438 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435: <\/p>\n<div>\n<div class=\"table\">\n<table>\n<tbody>\n<tr>\n<td>\n<p align=\"left\">\u041e\u0431\u0443\u0447\u0430\u044e\u0449\u0430\u044f \u0432\u044b\u0431\u043e\u0440\u043a\u0430<\/p>\n<\/td>\n<td>\n<p align=\"left\">33.56% \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u044b\u0445 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432<\/p>\n<\/td>\n<\/tr>\n<tr>\n<td>\n<p align=\"left\">\u0422\u0435\u0441\u0442\u043e\u0432\u0430\u044f \u0432\u044b\u0431\u043e\u0440\u043a\u0430<\/p>\n<\/td>\n<td>\n<p align=\"left\">31.4% \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u044b\u0445 
\u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432<\/p>\n<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<\/div>\n<\/div>\n<p>\u0414\u0430\u043d\u043d\u044b\u0435 \u0440\u0430\u0432\u043d\u043e\u043c\u0435\u0440\u043d\u043e \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u044b \u043f\u043e \u0432\u044b\u0431\u043e\u0440\u043a\u0430\u043c, \u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043d\u0430\u0448\u0430 \u0431\u0443\u0434\u0443\u0449\u0430\u044f \u043c\u043e\u0434\u0435\u043b\u044c \u0434\u043e\u043b\u0436\u043d\u0430 \u0430\u0434\u0435\u043a\u0432\u0430\u0442\u043d\u043e \u043e\u0446\u0435\u043d\u0438\u0432\u0430\u0442\u044c\u0441\u044f \u043d\u0430 \u0442\u0435\u0441\u0442\u043e\u0432\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445.<\/p>\n<h4>TF-IDF <\/h4>\n<p>\u041f\u0440\u0435\u0436\u0434\u0435 \u0447\u0435\u043c \u043f\u0440\u0438\u0441\u0442\u0443\u043f\u0438\u0442\u044c \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e \u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u043c\u044b \u0434\u043e\u043b\u0436\u043d\u044b \u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u0442\u044c \u043d\u0430\u0448\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u0432 \u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u043c\u0430\u0441\u0441\u0438\u0432\u044b. \u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u0432\u043e\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u0441\u044f TF-IDF \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0435\u0439.<\/p>\n<p><strong>TF<\/strong> \u0438\u0437\u043c\u0435\u0440\u044f\u0435\u0442 \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0447\u0430\u0441\u0442\u043e \u0442\u0435\u0440\u043c\u0438\u043d (\u0441\u043b\u043e\u0432\u043e) \u0432\u0441\u0442\u0440\u0435\u0447\u0430\u0435\u0442\u0441\u044f \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 TF: <\/p>\n<p>TF(t,d) = f(t,d) \/ N<sub>d<\/sub>,<\/p>\n<p>\u0433\u0434\u0435\u00a0f(t,d)\u00a0\u2014 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0432\u0445\u043e\u0436\u0434\u0435\u043d\u0438\u0439 \u0442\u0435\u0440\u043c\u0438\u043d\u0430\u00a0t \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u00a0d, \u0430\u00a0N<sub>d<\/sub>\u00a0\u2014 \u043e\u0431\u0449\u0435\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0442\u0435\u0440\u043c\u0438\u043d\u043e\u0432 \u0432 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435\u00a0d.<\/p>\n<p><strong>IDF<\/strong> \u0438\u0437\u043c\u0435\u0440\u044f\u0435\u0442 \u0432\u0430\u0436\u043d\u043e\u0441\u0442\u044c \u0442\u0435\u0440\u043c\u0438\u043d\u0430 \u043f\u043e \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u044e \u043a\u043e \u0432\u0441\u0435\u043c\u0443 \u043a\u043e\u0440\u043f\u0443\u0441\u0443 \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432. \u0427\u0435\u043c \u0440\u0435\u0436\u0435 \u0442\u0435\u0440\u043c\u0438\u043d \u0432\u0441\u0442\u0440\u0435\u0447\u0430\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043f\u0443\u0441\u0435, \u0442\u0435\u043c \u0432\u044b\u0448\u0435 \u0435\u0433\u043e IDF. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 IDF:<\/p>\n<p>IDF(t,D) = log(N \/ \u2223{d\u2208D:t\u2208d}\u2223),<\/p>\n<p>\u0433\u0434\u0435\u00a0N\u00a0\u2014 \u043e\u0431\u0449\u0435\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432 \u0432 \u043a\u043e\u0440\u043f\u0443\u0441\u0435\u00a0D, \u0430\u00a0\u2223{d\u2208D:t\u2208d}\u2223\u00a0\u2014 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u043e\u0432, \u0441\u043e\u0434\u0435\u0440\u0436\u0430\u0449\u0438\u0445 \u0442\u0435\u0440\u043c\u0438\u043d\u00a0t.<\/p>\n<p><strong>TF-IDF<\/strong> \u043e\u0431\u044a\u0435\u0434\u0438\u043d\u044f\u0435\u0442 TF \u0438 IDF \u0434\u043b\u044f \u043e\u0446\u0435\u043d\u043a\u0438 \u0432\u0430\u0436\u043d\u043e\u0441\u0442\u0438 \u0442\u0435\u0440\u043c\u0438\u043d\u0430 \u0432 \u043a\u043e\u043d\u043a\u0440\u0435\u0442\u043d\u043e\u043c \u0434\u043e\u043a\u0443\u043c\u0435\u043d\u0442\u0435. 
\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0434\u043b\u044f \u0440\u0430\u0441\u0447\u0435\u0442\u0430 TF-IDF:<\/p>\n<p>TF-IDF(t,d,D) = TF(t,d) \u00d7 IDF(t,D)<\/p>\n<p>\u0414\u043b\u044f \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043d\u0438\u044f TF-IDF \u043f\u0440\u0438\u043c\u0435\u043d\u0438\u043c \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0443\u00a0<code>scikit-learn<\/code>.\u00a0<\/p>\n<pre><code class=\"python\"># \u0438\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c \u0432\u0435\u043a\u0442\u043e\u0440\u0430\u0439\u0437\u0435\u0440 \u0438 \u043f\u0440\u0438\u043c\u0435\u043d\u0438\u043c \u043a \u043d\u0430\u0448\u0438\u043c \u0432\u044b\u0431\u043e\u0440\u043a\u0430\u043c count_idf_1 = TfidfVectorizer(ngram_range = (1,1), tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True)) tf_idf_base_1 = count_idf_1.fit(df['comment']) tf_idf_train_base_1 = count_idf_1.transform(train_df['comment']) tf_idf_test_base_1 = count_idf_1.transform(test_df['comment'])  # \u0432\u044b\u0432\u0435\u0434\u0435\u043c \u0440\u0430\u0437\u043c\u0435\u0440\u044b \u043c\u0430\u0442\u0440\u0438\u0446, \u0447\u0442\u043e\u0431\u044b \u0443\u0431\u0435\u0434\u0438\u0442\u044c\u0441\u044f \u0432 \u043a\u043e\u0440\u0440\u0435\u043a\u0442\u043d\u043e\u0441\u0442\u0438: print(tf_idf_train_base_1.shape) print(tf_idf_test_base_1.shape) >>> (13912, 36122) >>> (500, 36122)<\/code><\/pre>\n<p>\u0414\u043b\u044f \u043f\u0440\u0438\u043c\u0435\u0440\u0430 \u0434\u0430\u0432\u0430\u0439\u0442\u0435 \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u043a\u0430\u043a \u043f\u0440\u043e\u0438\u0441\u0445\u043e\u0434\u0438\u0442 TF-IDF\u00a0\u043d\u0430 \u043e\u0434\u043d\u043e\u043c \u0438\u0437 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0435\u0432.<\/p>\n<pre><code class=\"python\">sample = test_df.sample(n=1)['comment'] sample_tf_idf = count_idf_1.transform(sample) sample_tf_idf.shape >>> (1, 36122)  array = sample_tf_idf.toarray() array >>> 
array([[0., 0., 0., ..., 0., 0., 0.]])  # \u043a\u0430\u043a \u0432\u044b\u0433\u043b\u044f\u0434\u0438\u0442 \u043d\u0430\u0448 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0439 \u0434\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 sample >>> 12391    \u0427\u0442\u043e \u043a\u0430\u0441\u0430\u0435\u0442\u0441\u044f 3 \u043c\u043b\u043d, \u0443 \u041a\u0438\u044f \u0441\u0430\u043c\u0430\u044f \u0434\u043e\u0440\u043e\u0433\u0430\u044f \u043c\u0430\u0448\u0438\u043d\u0430...  # \u0438\u0437\u0432\u043b\u0435\u043a\u0430\u0435\u043c \u0438 \u0432\u044b\u0432\u043e\u0434\u0438\u043c \u043d\u0435\u043d\u0443\u043b\u0435\u0432\u044b\u0435 \u044d\u043b\u0435\u043c\u0435\u043d\u0442\u044b, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0443\u044e\u0442 \u0437\u043d\u0430\u0447\u0438\u043c\u044b\u043c \u0441\u043b\u043e\u0432\u0430\u043c: array[array!= 0] >>> array([0.27552192, 0.25845753, 0.24785363, 0.19574676, 0.13724815,            0.25845753, 0.13854953, 0.21636683, 0.18436214, 0.2040751 ,            0.25845753, 0.23449431, 0.13459448, 0.37887959, 0.20099479,            0.14063173, 0.15832929, 0.10074052, 0.11669742, 0.25845753,            0.25845753, 0.06473031]) <\/code><\/pre>\n<p>\u0422\u0435\u043f\u0435\u0440\u044c, \u043a\u043e\u0433\u0434\u0430 \u043d\u0430\u0448\u0438 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438 \u0438\u043c\u0435\u044e\u0442 \u0432\u0435\u043a\u0442\u043e\u0440\u043d\u043e\u0435 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435, \u043c\u044b \u043c\u043e\u0436\u0435\u043c \u043f\u0435\u0440\u0435\u0439\u0442\u0438 \u043a \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044e \u043c\u043e\u0434\u0435\u043b\u0438.<\/p>\n<h4>\u041e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u043c\u043e\u0434\u0435\u043b\u0438<\/h4>\n<p>\u0412 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435 baseline \u044f 
\u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0430 \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0443\u044e \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044e, \u0442.\u043a. \u043e\u043d\u0430 \u0445\u043e\u0440\u043e\u0448\u043e \u043f\u043e\u0434\u0445\u043e\u0434\u0438\u0442 \u0434\u043b\u044f \u0437\u0430\u0434\u0430\u0447\u0438 \u0431\u0438\u043d\u0430\u0440\u043d\u043e\u0439 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438.<\/p>\n<p>\u0415\u0441\u043b\u0438 \u0432\u044b \u0435\u0449\u0435 \u043d\u0435 \u0437\u043d\u0430\u043a\u043e\u043c\u044b \u0441 \u0434\u0430\u043d\u043d\u043e\u0439 \u043c\u043e\u0434\u0435\u043b\u044c\u044e, \u043d\u043e \u0443\u0436\u0435 \u0441\u043b\u044b\u0448\u0430\u043b\u0438 \u043f\u0440\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u0443\u044e \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044e, \u0442\u043e \u043c\u043e\u0436\u043d\u043e \u0441\u043a\u0430\u0437\u0430\u0442\u044c, \u0447\u0442\u043e \u0432\u044b \u043f\u043e\u0447\u0442\u0438 \u0437\u043d\u0430\u0442\u043e\u043a. \u0414\u0435\u043b\u043e \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044f \u043f\u043e \u0441\u0443\u0442\u0438 \u044d\u0442\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u0430\u044f \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u044f, \u043a \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0443 \u043a\u043e\u0442\u043e\u0440\u043e\u0439 \u0432 \u043a\u043e\u043d\u0446\u0435 \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u0442\u0441\u044f \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u0444\u0443\u043d\u043a\u0446\u0438\u044f (\u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440, \u0441\u0438\u0433\u043c\u043e\u0438\u0434\u0430). 
<\/p>\n<p>\u0424\u043e\u0440\u043c\u0443\u043b\u0430 \u0441\u0438\u0433\u043c\u043e\u0438\u0434\u043d\u043e\u0439 \u0444\u0443\u043d\u043a\u0446\u0438\u0438:<\/p>\n<p>\u03c3(z) = 1 \/ (1 + e<sup>\u2212z<\/sup>),<\/p>\n<p>\u0433\u0434\u0435\u00a0z\u00a0\u2014 \u043b\u0438\u043d\u0435\u0439\u043d\u0430\u044f \u043a\u043e\u043c\u0431\u0438\u043d\u0430\u0446\u0438\u044f \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u043e\u0432 \u0438 \u0438\u0445 \u0432\u0435\u0441\u043e\u0432:\u00a0z = \u03b2<sub>0<\/sub>+\u03b2<sub>1<\/sub>x<sub>1<\/sub>+\u03b2<sub>2<\/sub>x<sub>2<\/sub>+\u2026+\u03b2<sub>n<\/sub>x<sub>n<\/sub>.<\/p>\n<p>\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u0435\u00a0\u03c3(z)\u00a0\u043b\u0435\u0436\u0438\u0442 \u043c\u0435\u0436\u0434\u0443 0 \u0438 1, \u0447\u0442\u043e \u0438\u043d\u0442\u0435\u0440\u043f\u0440\u0435\u0442\u0438\u0440\u0443\u0435\u0442\u0441\u044f \u043a\u0430\u043a \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c.<\/p>\n<pre><code class=\"python\"># \u0438\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c model_lr_base_1 = LogisticRegression(solver='lbfgs', random_state=234, max_iter= 10000, n_jobs= -1)  # \u043e\u0431\u0443\u0447\u0438\u043c \u043c\u043e\u0434\u0435\u043b\u044c model_lr_base_1.fit(tf_idf_train_base_1, train_df['toxic'])  # \u043f\u043e\u043b\u0443\u0447\u0438\u043c \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u0435\u0439 \u043a\u043b\u0430\u0441\u0441\u043e\u0432 predict_lr_base_proba = model_lr_base_1.predict_proba(tf_idf_test_base_1) predict_lr_base_proba >>> array([[0.85603587, 0.14396413],            [0.29448938, 0.70551062],            [0.41543358, 0.58456642],            [0.77011541, 0.22988459],            [0.62820949, 0.37179051],            ...            
[0.82299013, 0.17700987]])<\/code><\/pre>\n<p>\u041a\u0430\u0436\u0434\u0430\u044f \u0441\u0442\u0440\u043e\u043a\u0430 <code>predict_lr_base_proba<\/code> \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 \u043f\u0430\u0440\u0443 \u0447\u0438\u0441\u0435\u043b: \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c \u043d\u0435 \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u043e\u0433\u043e \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f (\u043f\u0435\u0440\u0432\u043e\u0435 \u0447\u0438\u0441\u043b\u043e) \u0438 \u0432\u0435\u0440\u043e\u044f\u0442\u043d\u043e\u0441\u0442\u044c \u0442\u043e\u043a\u0441\u0438\u0447\u043d\u043e\u0433\u043e \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f (\u0432\u0442\u043e\u0440\u043e\u0435 \u0447\u0438\u0441\u043b\u043e) \u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0435\u043d\u043d\u043e. <\/p>\n<h3>\u041e\u0446\u0435\u043d\u043a\u0430 \u043c\u043e\u0434\u0435\u043b\u0438<\/h3>\n<p>\u041f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u044e \u0435\u0449\u0435 \u0441\u0440\u0430\u0432\u043d\u0438\u0442\u044c \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u043e \u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0441 \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u043c \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u043e\u043c.<\/p>\n<pre><code class=\"python\">def coin_classifier(X:np.array) -> np.array:     predict = np.random.uniform(0.0, 1.0, X.shape[0])     return predict coin_predict = coin_classifier(tf_idf_test_base_1)<\/code><\/pre>\n<p>\u0412\u0438\u0437\u0443\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u0435\u043c ROC-\u043a\u0440\u0438\u0432\u044b\u0435 \u0438 \u0432\u044b\u0432\u0435\u0434\u0435\u043c \u043c\u0430\u0442\u0440\u0438\u0446\u0443 \u043e\u0448\u0438\u0431\u043e\u043a.<\/p>\n<pre><code class=\"python\"># \u0434\u043b\u044f 
\u043d\u0430\u0448\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0439 \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u0438 fpr_base, tpr_base, _ = roc_curve(test_df['toxic'], predict_lr_base_proba[:, 1]) roc_auc_base = auc(fpr_base, tpr_base)  # \u0434\u043b\u044f \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u0430  fpr_coin, tpr_coin, _ = roc_curve(test_df['toxic'], coin_predict) roc_auc_coin = auc(fpr_coin, tpr_coin)  fig = make_subplots(1,1,                     subplot_titles = [\"Receiver operating characteristic\"],                     x_title=\"False Positive Rate\",                     y_title = \"True Positive Rate\"                    ) fig.add_trace(go.Scatter(     x = fpr_base,     y = tpr_base,     #fill = 'tozeroy',     name = \"ROC base (area = %0.3f)\" % roc_auc_base,     )) fig.add_trace(go.Scatter(     x = fpr_coin,     y = tpr_coin,     mode = 'lines',     line = dict(dash = 'dash'),     name = 'Coin classifier (area = 0.5)'     )) fig.update_layout(     height = 600,     width = 800,     xaxis_showgrid=False,     xaxis_zeroline=False,     template = 'plotly_dark',     font_color = 'rgba(212, 210, 210, 1)'     )  # \u043c\u0430\u0442\u0440\u0438\u0446\u0430 \u043e\u0448\u0438\u0431\u043e\u043a confusion_matrix(test_df['toxic'],                  (predict_lr_base_proba[:, 1] > 0.5).astype('float'),                  normalize='true',                 ) >>> array([[0.97959184, 0.02040816],        [0.35031847, 0.64968153]]) <\/code><\/pre>\n<figure class=\"full-width\"><\/figure>\n<ul>\n<li>\n<p>AUC \u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e\u0433\u043e \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440\u0430 \u0431\u043b\u0438\u0437\u043e\u043a \u043a 0.5, \u0447\u0442\u043e 
\u0441\u0432\u0438\u0434\u0435\u0442\u0435\u043b\u044c\u0441\u0442\u0432\u0443\u0435\u0442 \u043e \u0442\u043e\u043c, \u0447\u0442\u043e \u044d\u0442\u043e\u0442 \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0442\u043e\u0440 \u043d\u0435\u0441\u043f\u043e\u0441\u043e\u0431\u0435\u043d \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e \u0440\u0430\u0437\u043b\u0438\u0447\u0430\u0442\u044c \u043a\u043b\u0430\u0441\u0441\u044b.<\/p>\n<\/li>\n<li>\n<p>\u041c\u043e\u0434\u0435\u043b\u044c \u043b\u043e\u0433\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0439 \u0440\u0435\u0433\u0440\u0435\u0441\u0441\u0438\u0438<\/p>\n<\/li>\n<\/ul>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-426692","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/426692","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=426692"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/426692\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=426692"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=426692"},{"taxonomy":"post_tag","embeddable":true,"href":
"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=426692"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}