{"id":308420,"date":"2020-08-13T15:00:26","date_gmt":"2020-08-13T15:00:26","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=308420"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=308420","title":{"rendered":"\u041e\u0431\u0437\u043e\u0440 \u043c\u0435\u0442\u043e\u0434\u043e\u0432 \u0441\u043e\u0437\u0434\u0430\u043d\u0438\u044f \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u043e\u0432 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439, \u0427\u0430\u0441\u0442\u044c 1"},"content":{"rendered":"\n<div class=\"post__text post__text-html post__text_v1\" id=\"post-content-body\" data-io-article-url=\"https:\/\/habr.com\/ru\/post\/515036\/\">\n<p>\u041f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u0441\u0435\u0431\u0435, \u043a\u0430\u043a \u0431\u044b\u043b\u043e \u0431\u044b \u0443\u0434\u043e\u0431\u043d\u043e, \u043d\u0430\u043f\u0438\u0441\u0430\u0442\u044c \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0438 \u043d\u0430\u0439\u0442\u0438 \u043f\u043e\u0445\u043e\u0436\u0435\u0435 \u043a \u043d\u0435\u043c\u0443 \u043f\u043e \u0441\u043c\u044b\u0441\u043b\u0443. 
\u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u043d\u0443\u0436\u043d\u043e \u0443\u043c\u0435\u0442\u044c \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u043e\u0432\u0430\u0442\u044c \u0432\u0441\u0451 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435, \u0447\u0442\u043e \u043c\u043e\u0436\u0435\u0442 \u0431\u044b\u0442\u044c \u043e\u0447\u0435\u043d\u044c \u043d\u0435 \u0442\u0440\u0438\u0432\u0438\u0430\u043b\u044c\u043d\u043e\u0439 \u0437\u0430\u0434\u0430\u0447\u0435\u0439.<br \/>  \u041f\u043e \u0441\u043f\u0435\u0446\u0438\u0444\u0438\u043a\u0435 \u0441\u0432\u043e\u0435\u0439 \u0440\u0430\u0431\u043e\u0442\u044b, \u044f \u0434\u043e\u043b\u0436\u0435\u043d \u0438\u0441\u043a\u0430\u0442\u044c \u043f\u043e\u0445\u043e\u0436\u0438\u0435 \u0437\u0430\u043f\u0440\u043e\u0441\u044b \u0432 \u0441\u043b\u0443\u0436\u0431\u0443 \u043f\u043e\u0434\u0434\u0435\u0440\u0436\u043a\u0438 \u0438 \u0434\u0430\u0436\u0435 \u0438\u043c\u0435\u044f \u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u0431\u043e\u043b\u044c\u0448\u0443\u044e \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0443, \u0431\u044b\u0432\u0430\u0435\u0442 \u0442\u044f\u0436\u0435\u043b\u043e \u0441\u043e\u0431\u0440\u0430\u0442\u044c \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u043e\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0439 \u043f\u043e\u0434\u0445\u043e\u0434\u044f\u0449\u0438\u0445 \u043f\u043e \u0442\u0435\u043c\u0430\u0442\u0438\u043a\u0435, \u043d\u043e \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u043d\u044b\u0445 \u0434\u0440\u0443\u0433\u0438\u043c\u0438 \u0441\u043b\u043e\u0432\u0430\u043c\u0438.<br \/>  \u041d\u0438\u0436\u0435 \u043e\u0431\u0437\u043e\u0440\u043d\u043e\u0435 \u0438\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 \u0441\u043f\u043e\u0441\u043e\u0431\u044b \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 
\u0432\u0441\u0435\u0433\u043e \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u044f \u0438 \u043d\u0435 \u043f\u0440\u043e\u0441\u0442\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438, \u0430 \u043f\u043e\u043f\u044b\u0442\u043a\u0430 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u043e\u0432\u0430\u0442\u044c \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0441 \u0443\u0447\u0451\u0442\u043e\u043c \u0435\u0433\u043e \u0441\u043c\u044b\u0441\u043b\u0430.<br \/>  \u041d\u0430\u043f\u0440\u0438\u043c\u0435\u0440 \u0434\u0432\u0435 \u0444\u0440\u0430\u0437\u044b <strong>&#8216;\u044d\u043f\u043b \u043b\u0443\u0447\u0448\u0435 \u0441\u0430\u043c\u0441\u0443\u043d\u0433&#8217;<\/strong> \u043e\u0442 <strong>&#8216;\u0441\u0430\u043c\u0441\u0443\u043d\u0433 \u043b\u0443\u0447\u0448\u0435 \u044d\u043f\u043b&#8217;<\/strong>, \u0434\u043e\u043b\u0436\u043d\u044b \u0431\u044b\u0442\u044c \u043d\u0430 \u043f\u0440\u043e\u0442\u0438\u0432\u043e\u043f\u043e\u043b\u043e\u0436\u043d\u043e\u043c \u043a\u043e\u043d\u0446\u0435 \u043f\u043e \u043e\u0434\u043d\u043e\u043c\u0443 \u0438\u0437 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u0432\u0435\u043a\u0442\u043e\u0440\u0430, \u043d\u043e \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u0441\u043e\u0432\u043f\u0430\u0434\u0430\u0442\u044c \u043f\u043e \u0434\u0440\u0443\u0433\u0438\u043c.<br \/>  \u041c\u043e\u0436\u043d\u043e \u043f\u0440\u0438\u0432\u0435\u0441\u0442\u0438 \u0430\u043d\u0430\u043b\u043e\u0433\u0438\u044e \u0441 \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u043e\u0439 \u043d\u0438\u0436\u0435. 
\u041f\u043e \u0448\u043a\u0430\u043b\u0435 \u043e\u0442 \u043a\u0435\u043a\u0441\u0430 \u0434\u043e \u0441\u043e\u0431\u0430\u043a\u0438 \u043e\u043d\u0438 \u043d\u0430\u0445\u043e\u0434\u044f\u0442\u0441\u044f \u043d\u0430 \u0440\u0430\u0437\u043d\u044b\u0445 \u043a\u043e\u043d\u0446\u0430\u0445, \u0430 \u043f\u043e \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0443 \u0447\u0451\u0440\u043d\u044b\u0445 \u0442\u043e\u0447\u0435\u043a \u0438 \u0446\u0432\u0435\u0442\u0443 \u043e\u0431\u044a\u0435\u043a\u0442\u0430 \u043d\u0430 \u043e\u0434\u043d\u043e\u043c.<\/p>\n<p>  <\/p>\n<p><img decoding=\"async\" src=\"https:\/\/cdn-media-1.freecodecamp.org\/images\/1*bt-E2YcPafjiPbZFDMMmNQ.jpeg\" alt=\"https:\/\/cdn-media-1.freecodecamp.org\/images\/1*bt-E2YcPafjiPbZFDMMmNQ.jpeg\"\/><\/p>\n<p>  <\/p>\n<p><a href=\"https:\/\/paperswithcode.com\/task\/sentence-embedding\" rel=\"nofollow\">\u0412\u043e\u0442 \u0442\u0443\u0442 \u0441\u0431\u043e\u0440\u043d\u0438\u043a \u0441\u0442\u0430\u0442\u0435\u0439 \u043f\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439<\/a> <\/p>\n<p><a name=\"habracut\"><\/a>  <\/p>\n<p>\u041c\u0435\u0442\u043e\u0434\u044b \u0432 \u0441\u0442\u0430\u0442\u044c\u044f\u0445 \u043e\u0447\u0435\u043d\u044c \u043d\u0435 \u0442\u0440\u0438\u0432\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u0438 \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u044b \u0434\u043b\u044f \u0438\u0437\u0443\u0447\u0435\u043d\u0438\u044f, \u043d\u043e \u043c\u0438\u043d\u0443\u0441\u044b \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e:<\/p>\n<p>  <\/p>\n<ol>\n<li>\u043e\u043d\u0438 \u0438\u0441\u043f\u044b\u0442\u044b\u0432\u0430\u043b\u0438\u0441\u044c \u043d\u0430 \u0430\u043d\u0433\u043b\u0438\u0439\u0441\u043a\u043e\u043c \u044f\u0437\u044b\u043a\u0435 <\/li>\n<li>\u0432 \u043a\u0430\u0436\u0434\u043e\u0439 \u0441\u0442\u0430\u0442\u044c\u0435 
\u043d\u0430\u043f\u0438\u0441\u0430\u043d\u043e, \u0447\u0442\u043e \u043e\u043d\u0438 \u043f\u0440\u0435\u0432\u0437\u043e\u0448\u043b\u0438 \u043f\u0440\u0435\u0434\u0448\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u0438\u043a\u043e\u0432, \u043d\u043e \u0441\u0440\u0430\u0432\u043d\u0435\u043d\u0438\u044f \u043f\u0440\u043e\u0432\u043e\u0434\u0438\u043b\u0438\u0441\u044c \u043d\u0430 \u0440\u0430\u0437\u043d\u044b\u0445 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430\u0445 \u0438 \u043d\u0435\u0442 \u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0441\u0442\u0438 \u0441\u0434\u0435\u043b\u0430\u0442\u044c \u0440\u0435\u0439\u0442\u0438\u043d\u0433<\/li>\n<\/ol>\n<p>  <\/p>\n<p>\u041f\u043e\u044d\u0442\u043e\u043c\u0443 \u043d\u0438\u0436\u0435 \u043e\u0431\u0437\u043e\u0440\u043d\u043e \u2014 \u0441\u0440\u0430\u0432\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0439 \u0430\u043d\u0430\u043b\u0438\u0437 7 \u0440\u0430\u0437\u043d\u044b\u0445 \u043c\u0435\u0442\u043e\u0434\u043e\u0432 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439 \u043d\u0430 \u043e\u0434\u043d\u043e\u043c \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0435.<\/p>\n<p>  <\/p>\n<h2 id=\"soderzhanie\">\u0421\u043e\u0434\u0435\u0440\u0436\u0430\u043d\u0438\u0435<\/h2>\n<p>  <\/p>\n<p>\u041f\u043e\u0434\u0433\u043e\u0442\u043e\u0432\u043a\u0430<br \/>  \u0424\u0443\u043d\u043a\u0446\u0438\u044f \u043e\u0446\u0435\u043d\u043a\u0438 <\/p>\n<p>  <\/p>\n<ol>\n<li>\u041c\u0435\u0442\u043e\u0434\u044b BOW<br \/>  1.1. \u043f\u0440\u043e\u0441\u0442\u043e\u0439 BOW<br \/>  1.2. BOW c \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0441\u043b\u043e\u0432<br \/>  1.3. BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0438 \u043e\u0447\u0438\u0441\u0442\u043a\u043e\u0439 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432<br \/>  1.4. 
LDA<\/li>\n<li>\u041c\u0435\u0442\u043e\u0434\u044b, \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u044e\u0449\u0438\u0435 \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0438 \u0442\u043e\u043a\u0435\u043d\u043e\u0432<br \/>  2.1 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0432\u0441\u0435\u0445 \u0441\u043b\u043e\u0432<br \/>  2.2 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0441 \u043e\u0447\u0438\u0441\u0442\u043a\u043e\u0439 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432<br \/>  2.3 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0441 \u0432\u0435\u0441\u0430\u043c\u0438 tf-idf <\/li>\n<li>Languade Models<br \/>  3.1 Language Model on embedings<br \/>  3.2 Language Model on index <\/li>\n<li>BERT<br \/>  4.1 rubert_cased_L-12_H-768_A-12_pt<br \/>  4.2 ru_conversational_cased_L-12_H-768_A-12_pt<br \/>  4.3 sentence_ru_cased_L-12_H-768_A-12_pt<br \/>  4.4 elmo_ru-news_wmt11-16_1.5M_steps.tar.gz<\/li>\n<li>\u0410\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440\u044b<br \/>  5.1 \u0410\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440 embedings -&gt; embedings<br \/>  5.2 \u0410\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440 embedings -&gt; indexes<br \/>  5.3 \u0410\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440 \u0430\u0440\u0445\u0438\u0442\u0435\u043a\u0442\u0443\u0440\u0430 LSTM -&gt; LSTM<br \/>  5.4 \u0410\u0432\u0442\u043e\u044d\u043d\u043a\u043e\u0434\u0435\u0440 \u0430\u0440\u0445\u0438\u0442\u0435\u043a\u0442\u0443\u0440\u0430 LSTM -&gt; LSTM -&gt; indexes <\/li>\n<li>\u042d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0438 \u043d\u0430 Transfer Learning<br \/>  6.1 \u042d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0438 \u043d\u0430 BOW<br \/>  6.2 \u042d\u043c\u0431\u0435\u0434\u0438\u043d\u0433 
\u043d\u0430 LSTM + MaxPooling<br \/>  6.3 \u042d\u043c\u0431\u0435\u0434\u0438\u043d\u0433 \u043d\u0430 LSTM + Conv1D + AveragePooling<br \/>  6.4 \u042d\u043c\u0431\u0435\u0434\u0438\u043d\u0433 \u043d\u0430 LSTM + Inception + Attention <\/li>\n<li>Triplet loss<br \/>  7.1 Triplet loss \u043d\u0430 BOW<br \/>  7.2 Triplet loss \u043d\u0430 embedings <\/li>\n<\/ol>\n<p>  <\/p>\n<h1 id=\"podgotovka\">\u041f\u043e\u0434\u0433\u043e\u0442\u043e\u0432\u043a\u0430<\/h1>\n<p>  <\/p>\n<pre><code class=\"python\">import pandas as pd import numpy as np from collections import defaultdict, Counter import random from tqdm.notebook import tqdm from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_distances, euclidean_distances from sklearn.decomposition import LatentDirichletAllocation  from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.utils import to_categorical  from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, MaxPooling1D, AveragePooling1D, Conv1D from tensorflow.keras.layers import Flatten, Reshape, Concatenate, Permute, Activation, Dropout, multiply from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.models import Model from tensorflow.keras.losses import cosine_similarity from tensorflow.keras import regularizers import tensorflow.keras.backend as K import tensorflow as tf  import pymorphy2 import nltk nltk.download('stopwords') from nltk.corpus import stopwords import numpy as np import matplotlib.pyplot as plt import pickle import os import re from conllu import parse_incr<\/code><\/pre>\n<p>  <\/p>\n<h2 id=\"baza-znaniy\">\u0411\u0430\u0437\u0430 \u0437\u043d\u0430\u043d\u0438\u0439<\/h2>\n<p>  <\/p>\n<p><a href=\"https:\/\/github.com\/UniversalDependencies\/UD_Russian-SynTagRus\" rel=\"nofollow\">\u0418\u0441\u0442\u043e\u0447\u043d\u0438\u043a 
\u0437\u043d\u0430\u043d\u0438\u0439 \u0438 \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0438 \u043f\u043e \u0442\u0435\u043c\u0430\u0442\u0438\u043a\u0430\u043c<\/a><br \/>  <a href=\"https:\/\/ru.wikipedia.org\/wiki\/%D0%93%D0%BB%D1%83%D0%B1%D0%BE%D0%BA%D0%BE_%D0%B0%D0%BD%D0%BD%D0%BE%D1%82%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%BD%D1%8B%D0%B9_%D0%BA%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%BE%D0%B3%D0%BE_%D1%8F%D0%B7%D1%8B%D0%BA%D0%B0#%D0%94%D0%BE%D1%81%D1%82%D1%83%D0%BF\" rel=\"nofollow\">\u041e\u0431\u044f\u0441\u043d\u0435\u043d\u0438\u0435 \u043d\u0430 \u0412\u0438\u043a\u0438<\/a><\/p>\n<p>  <\/p>\n<pre><code class=\"python\">files = {'train': 'ru_syntagrus-ud-train.conllu',          'test':  'ru_syntagrus-ud-test.conllu',          'dev':   'ru_syntagrus-ud-dev.conllu'} database = {} for data_type in files:     filename = files[data_type]     database = {}     with open(os.path.join('UD_Russian-SynTagRus-master', filename), encoding='utf-8') as f:         parsed = parse_incr(f)         for token_list in parsed:             topic_name = token_list.metadata['sent_id'].split('.')[0]             # \u0443\u0431\u0440\u0451\u043c \u0446\u0438\u0444\u0440\u044b \u0438\u0437 \u043d\u0430\u0437\u0432\u0430\u043d\u0438\u0439 \u0442\u0435\u043c\u044b             topic_name = re.sub(r'\\d+', '', topic_name)             if topic_name not in database:                 database[topic_name] = []             sentence = ' '.join([token['form'] for token in token_list]).lower()             database[topic_name].append(sentence)<\/code><\/pre>\n<p>  <\/p>\n<p>\u0412\u044b\u0431\u0438\u0440\u0430\u0435\u043c \u0438\u0437 \u0431\u0430\u0437\u044b \u0437\u043d\u0430\u043d\u0438\u0439 \u0442\u0440\u0438 \u0442\u0435\u043c\u044b \u0441 \u043f\u0440\u0438\u043c\u0435\u0440\u043d\u043e \u0440\u0430\u0432\u043d\u044b\u043c \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0439 \u0432 \u043d\u0438\u0445, 
\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0431\u0443\u0434\u0443\u0442 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c\u0441\u044f \u0434\u043b\u044f \u043e\u0446\u0435\u043d\u043a\u0438 \u043c\u0435\u0442\u043e\u0434\u043e\u0432 \u0438 \u0443\u0434\u0430\u043b\u0438\u043c \u0438\u0445 \u0438\u0437 \u0431\u0430\u0437\u044b \u0437\u043d\u0430\u043d\u0438\u0439.<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">choosen_for_evaluation = ['I_slepye_prozreyut',                           'Interviyu_Mariny_Astvatsaturyan',                           'Byudzhet'] texts_for_evaluation = {} texts_for_training = {} for topic in database:     if topic in choosen_for_evaluation:         texts_for_evaluation[topic] = database[topic]     else:         texts_for_training[topic] = database[topic]  TEXTS_CORPUS = [sentence for topic in texts_for_training for sentence in texts_for_training[topic]]  # \u043d\u0430\u043f\u0435\u0447\u0430\u0442\u0430\u0435\u043c \u043f\u0440\u0438\u043c\u0435\u0440\u044b for topic in texts_for_evaluation:     print(topic, len(texts_for_evaluation[topic]))     for index, sentence in enumerate(texts_for_evaluation[topic]):         print('\\t', sentence[:100])         if index &gt; 5:             break     print('\\n')<\/code><\/pre>\n<p>  <\/p>\n<p>\u041f\u0440\u0438\u043c\u0435\u0440\u044b \u0432\u044b\u0432\u043e\u0434\u043e\u0432<\/p>\n<p>  <\/p>\n<pre><code class=\"plaintext\">Byudzhet 70      \u043a\u0430\u043d\u0434\u0438\u0434\u0430\u0442 \u0438\u0441\u0442\u043e\u0440\u0438\u0447\u0435\u0441\u043a\u0438\u0445 \u043d\u0430\u0443\u043a \u043d. \u043c\u0438\u0442\u0440\u043e\u0444\u0430\u043d\u043e\u0432 .      \u0432\u0441\u0435 \u043c\u0435\u0434\u043d\u044b\u0435 \u0438 \u0441\u0435\u0440\u0435\u0431\u0440\u044f\u043d\u044b\u0435 \u043d\u0430 \u0438\u043c\u043f\u0435\u0440\u0430\u0442\u043e\u0440\u0441\u043a\u043e\u043c \u043a\u043e\u043d\u0443 .      
\u0435\u043a\u0430\u0442\u0435\u0440\u0438\u043d\u0430 ii \u0438 \u0431\u044e\u0434\u0436\u0435\u0442 .      \u043f\u0440\u0438\u0445\u043e\u0434\u0438\u0442 \u0434\u0435\u043d\u044c , \u0438 \u043d\u0430 \u0432\u0441\u0435 \u043b\u0430\u0434\u044b \u043e\u0442\u043e\u0432\u0441\u044e\u0434\u0443 \u0437\u0432\u0443\u0447\u0438\u0442 \u0441\u043b\u043e\u0432\u043e &quot; \u0431\u044e\u0434\u0436\u0435\u0442 &quot; .      \u0433\u0430\u0437\u0435\u0442\u044b \u0440\u0430\u0441\u0448\u0438\u0444\u0440\u043e\u0432\u044b\u0432\u0430\u044e\u0442 \u0441\u043c\u044b\u0441\u043b \u0435\u0433\u043e \u043e\u0441\u043d\u043e\u0432\u043e\u043f\u043e\u043b\u0430\u0433\u0430\u044e\u0449\u0438\u0445 \u0441\u0442\u0430\u0442\u0435\u0439 , \u0441\u0440\u0430\u0432\u043d\u0438\u0432\u0430\u044f \u043f\u0440\u043e\u0448\u043b\u043e\u0433\u043e\u0434\u043d\u0438\u0435 \u043d\u0430\u043c\u0435\u0442\u043a\u0438 \u0441 \u043d\u044b\u043d\u0435\u0448\u043d\u0438\u043c\u0438      \u043c\u043d\u043e\u0433\u0438\u0435 \u0432\u043e\u0437\u043c\u0443\u0449\u0430\u044e\u0442\u0441\u044f \u043f\u043e \u043f\u043e\u0432\u043e\u0434\u0443 \u043c\u0430\u043b\u043e\u0433\u043e \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u044f \u043a \u043a\u0443\u043b\u044c\u0442\u0443\u0440\u0435 , \u043e\u0431\u043e\u0440\u043e\u043d\u0435 , \u0430\u0440\u043c\u0438\u0438 \u2026 - \u043f\u0440\u043e\u0434\u043e\u043b\u0436\u0430\u0439\u0442\u0435 \u043f\u0435\u0440\u0435\u0447\u0435\u043d\u044c \u0432      \u043a\u043e\u043d\u0444\u043e\u0440\u043c\u0438\u0441\u0442\u044b \u043f\u0440\u0438\u043d\u0438\u043c\u0430\u044e\u0442\u0441\u044f \u0440\u0430\u0437\u043e\u0431\u043b\u0430\u0447\u0430\u0442\u044c &quot; \u0432\u044b\u0441\u043a\u043e\u0447\u0435\u043a &quot; , \u0443\u043b\u0438\u0447\u0430\u044f \u0438\u0445 \u0432 \u043f\u0435\u0440\u0435\u0434\u0435\u0440\u0433\u0438\u0432\u0430\u043d\u0438\u0438 \u043f\u0440\u043e\u0446\u0435\u043d\u0442\u043d\u044b\u0445 \u0434\u043e\u043b\u0435\u0439 , \u0432 \u0431  
Interviyu_Mariny_Astvatsaturyan 72      &quot; \u043c\u043e\u044f \u043d\u0430\u0443\u0447\u043d\u0430\u044f \u0436\u0443\u0440\u043d\u0430\u043b\u0438\u0441\u0442\u0438\u043a\u0430 \u0441\u0442\u043e\u0438\u0442 \u043d\u0430 \u0446\u0430\u0440\u0441\u043a\u0438\u0445 \u043a\u043e\u0441\u0442\u044f\u0445 &quot; .      - \u043a\u0430\u043a \u0432\u044b , \u0431\u0438\u043e\u043b\u043e\u0433 , \u043f\u0440\u0438\u0448\u043b\u0438 \u0432 \u043d\u0430\u0443\u0447\u043d\u0443\u044e \u0436\u0443\u0440\u043d\u0430\u043b\u0438\u0441\u0442\u0438\u043a\u0443 ? \u0441 \u0447\u0435\u0433\u043e \u0432\u0441\u0451 \u043d\u0430\u0447\u0438\u043d\u0430\u043b\u043e\u0441\u044c ?      - \u043d\u0430\u0447\u0430\u043b\u043e\u0441\u044c \u043f\u043e \u0432\u043e\u043b\u0435 \u0441\u043b\u0443\u0447\u0430\u044f , \u0432 1993-\u043c .      \u043e\u0434\u043d\u043e\u043c\u0443 \u0438\u0437 \u0442\u043e\u0433\u0434\u0430\u0448\u043d\u0438\u0445 \u043c\u0443\u0437\u044b\u043a\u0430\u043b\u044c\u043d\u044b\u0445 \u0432\u0435\u0434\u0443\u0449\u0438\u0445 &quot; \u044d\u0445\u0430 \u043c\u043e\u0441\u043a\u0432\u044b &quot; \u043f\u043e\u043d\u0430\u0434\u043e\u0431\u0438\u043b\u0438\u0441\u044c \u0437\u0430\u043f\u0438\u0441\u0438 \u0438\u0432\u0430 \u043c\u043e\u043d\u0442\u0430\u043d\u0430 , \u0438 \u043e\u043d \u043e\u0431\u0440\u0430\u0442\u0438      \u0443 \u043d\u0430\u0441 \u0434\u043e\u043c\u0430 , \u0441\u043a\u043e\u043b\u044c\u043a\u043e \u044f \u0441\u0435\u0431\u044f \u043f\u043e\u043c\u043d\u044e , \u043b\u0435\u0436\u0430\u043b\u0438 \u0432\u0438\u043d\u0438\u043b\u043e\u0432\u044b\u0435 \u043f\u043b\u0430\u0441\u0442\u0438\u043d\u043a\u0438 - \u043a\u043e\u043d\u0446\u0435\u0440\u0442\u043d\u044b\u0435 \u0437\u0430\u043f\u0438\u0441\u0438 \u0438\u0432\u0430 \u043c\u043e\u043d\u0442\u0430\u043d\u0430 , \u043f\u0440\u0438      \u044f \u043f\u0440\u0435\u0434\u043e\u0441\u0442\u0430\u0432\u0438\u043b\u0430 \u0438\u0445 \u0434\u043b\u044f 
\u043f\u043e\u0434\u0433\u043e\u0442\u043e\u0432\u043a\u0438 \u0440\u0430\u0434\u0438\u043e\u043f\u0435\u0440\u0435\u0434\u0430\u0447\u0438 \u043e\u0431 \u0438\u0432\u0435 \u043c\u043e\u043d\u0442\u0430\u043d\u0435 .      \u0442\u0430\u043a \u044f \u043f\u043e\u0437\u043d\u0430\u043a\u043e\u043c\u0438\u043b\u0430\u0441\u044c \u0441 \u0442\u043e\u0433\u0434\u0430\u0448\u043d\u0435\u0439 \u0440\u0435\u0434\u0430\u043a\u0446\u0438\u0435\u0439 &quot; \u044d\u0445\u0430 &quot; , \u043a\u043e\u0442\u043e\u0440\u0430\u044f \u0441\u0438\u0434\u0435\u043b\u0430 \u043d\u0430 \u043d\u0438\u043a\u043e\u043b\u044c\u0441\u043a\u043e\u0439 , \u043d\u0430\u043f\u0440\u043e\u0442\u0438\u0432 \u0433\u0443\u043c\u0430 .  I_slepye_prozreyut 72      \u0438 \u0441\u043b\u0435\u043f\u044b\u0435 \u043f\u0440\u043e\u0437\u0440\u0435\u044e\u0442 \u2026      \u043e\u043f\u044b\u0442\u044b , \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d\u043d\u044b\u0435 \u0441\u043f\u0435\u0446\u0438\u0430\u043b\u0438\u0441\u0442\u0430\u043c\u0438 \u0438\u043d\u0441\u0442\u0438\u0442\u0443\u0442\u0430 \u043c\u043e\u0437\u0433\u0430 \u0447\u0435\u043b\u043e\u0432\u0435\u043a\u0430 \u0440\u0430\u043d , \u043f\u043e\u0434\u0442\u0432\u0435\u0440\u0434\u0438\u043b\u0438 \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u043e\u0432\u0430\u043d\u0438\u0435 \u0443 homo sa      \u0442\u0440\u0438 \u0433\u043e\u0434\u0430 \u043d\u0430\u0437\u0430\u0434 \u043c\u043e\u0441\u043a\u043e\u0432\u0441\u043a\u0438\u0439 \u0443\u0447\u0435\u043d\u044b\u0439 \u0432\u044f\u0447\u0435\u0441\u043b\u0430\u0432 \u0431\u0440\u043e\u043d\u043d\u0438\u043a\u043e\u0432 \u043d\u0430\u0447\u0430\u043b \u0443\u0447\u0438\u0442\u044c \u0441\u043b\u0435\u043f\u044b\u0445 \u0432\u0438\u0434\u0435\u0442\u044c .      
\u043e\u043d \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0430\u043b \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b\u044c\u043d\u0443\u044e \u043c\u0435\u0442\u043e\u0434\u0438\u043a\u0443 , \u043f\u043e\u0437\u0432\u043e\u043b\u044f\u044e\u0449\u0443\u044e \u0440\u0435\u0437\u043a\u043e \u0430\u043a\u0442\u0438\u0432\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0434\u0435\u044f\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c \u043f\u0440\u0430\u0432\u043e\u0433\u043e \u043f\u043e\u043b\u0443\u0448\u0430\u0440\u0438      \u0432 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0435 \u0437\u0430 \u0434\u0435\u0441\u044f\u0442\u044c \u0434\u043d\u0435\u0439 \u0437\u0430\u043d\u044f\u0442\u0438\u0439 \u0431\u0440\u043e\u043d\u043d\u0438\u043a\u043e\u0432 \u0440\u0430\u0437\u0432\u0438\u0432\u0430\u043b \u0443 \u0441\u0432\u043e\u0438\u0445 \u043f\u043e\u0434\u043e\u043f\u0435\u0447\u043d\u044b\u0445 \u043d\u0430\u0432\u044b\u043a\u0438 \u0442\u0430\u043a \u043d\u0430\u0437\u044b\u0432\u0430\u0435\u043c\u043e\u0433\u043e \u043f\u0440\u044f      \u0434\u0435\u0442\u0438 \u0441 \u043f\u043e\u0440\u043e\u043a\u0430\u043c\u0438 \u0437\u0440\u0435\u043d\u0438\u044f \u043f\u043e\u0441\u043b\u0435 \u0441\u043f\u0435\u0446\u0438\u0430\u043b\u044c\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043e\u0433\u043b\u0438 \u043a\u0430\u0442\u0430\u0442\u044c\u0441\u044f \u043d\u0430 \u0432\u0435\u043b\u043e\u0441\u0438\u043f\u0435\u0434\u0430\u0445 , \u0438\u0433\u0440\u0430\u0442\u044c \u0432 \u0448\u0430\u0445\u043c\u0430\u0442\u044b       \u0441\u043b\u0435\u043f\u043e\u0439 \u0447\u0435\u043b\u043e\u0432\u0435\u043a , \u043a\u0430\u043a \u0443\u0442\u0432\u0435\u0440\u0436\u0434\u0430\u044e\u0442 \u043c\u0435\u0434\u0438\u043a\u0438 , \u0432\u0438\u0434\u0438\u0442 \u043f\u0435\u0440\u0435\u0434 \u0441\u043e\u0431\u043e\u0439 \u043f\u0435\u043b\u0435\u043d\u0443 .<\/code><\/pre>\n<p>  <\/p>\n<p>\u0414\u043b\u044f 
\u043e\u0446\u0435\u043d\u043a\u0438 \u043d\u0443\u0436\u043d\u0430 \u0442\u043e\u043b\u044c\u043a\u043e \u043e\u0434\u043d\u0430 \u0444\u0443\u043d\u043a\u0446\u0438\u044f \u043e\u0442 \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u043c\u0435\u0442\u043e\u0434\u0430 <strong>most similar<\/strong>, \u043a\u043e\u0442\u043e\u0440\u0430\u044f \u0431\u0443\u0434\u0435\u0442 \u043f\u0440\u0438\u043d\u0438\u043c\u0430\u0442\u044c \u0446\u0435\u043b\u0435\u0432\u043e\u0435 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0435, \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u044f, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043d\u0430\u0434\u043e \u0440\u0430\u0441\u0441\u0442\u0430\u0432\u0438\u0442\u044c \u0432 \u043f\u043e\u0440\u044f\u0434\u043a\u0435 \u0443\u0431\u044b\u0432\u0430\u043d\u0438\u044f \u0431\u043b\u0438\u0437\u043e\u0441\u0442\u0438 \u0438 \u0438\u043d\u0434\u0435\u043a\u0441\u044b \u044d\u0442\u0438\u0445 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0439, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043d\u0430\u0434\u043e \u0440\u0430\u0441\u0441\u0442\u0430\u0432\u0438\u0442\u044c \u043f\u043e \u0442\u043e\u043c\u0443 \u0436\u0435 \u043f\u043e\u0440\u044f\u0434\u043a\u0443, \u0447\u0442\u043e \u0438 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u044f.<br \/>  \u0411\u0430\u043b\u043b\u044b \u0431\u0443\u0434\u0443\u0442 \u0441\u0447\u0438\u0442\u0430\u0442\u044c\u0441\u044f \u0442\u0430\u043a: \u0432\u0441\u0435\u0433\u043e 3 \u0442\u0435\u043c\u044b \u0441 70, 72 \u0438 72 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u044f\u043c\u0438 \u0432 \u043a\u0430\u0436\u0434\u043e\u043c. 
\u0427\u0435\u043c \u0431\u043b\u0438\u0436\u0435 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0435 \u0440\u0430\u0441\u043f\u043e\u043b\u0430\u0433\u0430\u0435\u0442\u0441\u044f \u043f\u043e \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u044e \u043a \u043e\u0441\u0442\u0430\u043b\u044c\u043d\u044b\u043c, \u0442\u0435\u043c \u0431\u043e\u043b\u044c\u0448\u0435 \u0431\u0430\u043b\u043b\u043e\u0432 \u0437\u0430 \u043d\u0435\u0433\u043e \u043d\u0430\u0447\u0438\u0441\u043b\u044f\u0435\u0442\u0441\u044f. \u0423\u0431\u044b\u0432\u0430\u043d\u0438\u0435 \u0446\u0435\u043d\u043d\u043e\u0441\u0442\u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u044f \u0438\u0434\u0451\u0442 \u043f\u0440\u043e\u043f\u043e\u0440\u0446\u0438\u043e\u043d\u0430\u043b\u044c\u043d\u043e \u0438\u043d\u0434\u0435\u043a\u0441\u0443. \u0422.\u0435. \u0435\u0441\u043b\u0438 \u0441\u0430\u043c\u043e\u0435 \u0431\u043b\u0438\u0436\u0430\u0439\u0448\u0435\u0435 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0435 \u0438\u0437 \u0442\u043e\u0439 \u0436\u0435 \u0442\u0435\u043c\u044b, \u0442\u043e +214, \u0435\u0441\u043b\u0438 \u0432\u0442\u043e\u0440\u043e\u0435 \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0435 \u0438\u0437 \u0434\u0440\u0443\u0433\u043e\u0439 \u0442\u0435\u043c\u044b, \u0442\u043e -213 \u0438 \u0442.\u0434. 
\u0434\u043e \u043f\u043e\u0441\u043b\u0435\u0434\u043d\u0435\u0433\u043e.<\/p>\n<p>  <\/p>\n<p>\u0422\u0435\u043e\u0440\u0435\u0442\u0438\u0447\u0435\u0441\u043a\u0438 \u043c\u0430\u043a\u0441\u0438\u043c\u0430\u043b\u044c\u043d\u043e \u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0435 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0435 = sum(214\u2026 214 \u2014 72) \u2014 sum(214-72\u2026 0) + 7626 = 10395<br \/>  \u0422\u0435\u043e\u0440\u0435\u0442\u0438\u0447\u0435\u0441\u043a\u0438 \u043c\u0438\u043d\u0438\u043c\u0430\u043b\u044c\u043d\u043e \u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0435 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0435 = -sum(214\u2026 72) + sum(72\u2026 0) + 7626 = -10053<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">np.random.seed(42) random.seed(777)  index2topic = {} index2text = {} index = 0 for topic in texts_for_evaluation:     for sentence in texts_for_evaluation[topic]:         index2topic[index] = topic         index2text[index] = sentence         index += 1  def get_similarity_values(sentences):     return np.random.rand(len(sentences), len(sentences))  chart_methods = {} bottom_minimum = -7626.2336448598135  def evaluate(get_similarity_values, method_name=None, add_to_chart=True):     test_messages = [index2text[index] for index in range(len(index2text))]     distances_each_to_each = get_similarity_values(test_messages)     evaluations = []     for target_index in index2topic:         distances = distances_each_to_each[target_index]         distances_indexes = sorted(zip(distances, range(len(index2topic))), key=lambda x: x[0])         evaluation_result = 0         for i, (distance, index) in enumerate(distances_indexes):             if index2topic[index] == index2topic[target_index]:                 evaluation_result += len(test_messages) - i             else:                 evaluation_result -= len(test_messages) - i         evaluations.append(evaluation_result)             # \u0441\u0434\u0435\u043b\u0430\u0435\u043c 
\u0441\u043b\u0443\u0447\u0430\u0439\u043d\u043e\u0435 \u0440\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u0438\u0441\u043a\u0443\u0441\u0441\u0442\u0432\u0435\u043d\u043d\u044b\u043c \u043d\u0443\u043b\u0451\u043c (baseline)     result = round(np.mean(evaluations) - bottom_minimum, 1)     if add_to_chart:         #\u0434\u043e\u0431\u0430\u0432\u043b\u044f\u0435\u043c \u043d\u0430 \u0433\u0440\u0430\u0444\u0438\u043a, \u0442\u043e\u043b\u044c\u043a\u043e \u043b\u0443\u0447\u0448\u0438\u0439 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043f\u043e \u0432\u0441\u0435\u043c \u043f\u0440\u043e\u0433\u043e\u043d\u0430\u043c         if method_name not in chart_methods or result &gt; chart_methods[method_name][0]:             chart_methods[method_name] = (result, np.std(evaluations))     return f'{method_name}: {str(result)}'  def parse_result(result):     return float(result.split(': ')[1])  evaluate(get_similarity_values, 'random arrange')<\/code><\/pre>\n<p>  <\/p>\n<p><code>'random arrange: 0.0'<\/code><\/p>\n<p>  <\/p>\n<h1 id=\"1-metody-bow\">1. 
\u041c\u0435\u0442\u043e\u0434\u044b BOW<\/h1>\n<p>  <\/p>\n<h3 id=\"11-bow\">1.1 BOW<\/h3>\n<p>  <\/p>\n<p>\u0411\u0443\u0434\u0435\u043c \u043e\u043f\u0440\u0435\u0434\u0435\u043b\u044f\u0442\u044c \u0440\u0430\u0441\u0441\u0442\u043e\u044f\u043d\u0438 \u043c\u0435\u0436\u0434\u0443 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u043c\u0438 \u043f\u043e \u0441\u043b\u043e\u0432\u0430\u043c, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043d\u0430\u0445\u043e\u0434\u044f\u0442\u0441\u044f \u0432 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0438 \u0431\u0435\u0437 \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 (\u0441\u043e\u0445\u0440\u0430\u043d\u044f\u044f \u0432\u0441\u0435 \u0437\u043d\u0430\u043a\u0438 \u043f\u0440\u0435\u043f\u0438\u043d\u0430\u043d\u0438\u044f).<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">count_vectorizer = CountVectorizer() corpus = TEXTS_CORPUS count_vectorizer.fit(corpus)  def get_similarity_values(sentences):     sentences_bow = count_vectorizer.transform(sentences)     distances = cosine_distances(sentences_bow, sentences_bow)     return distances  evaluate(get_similarity_values, 'BOW')<\/code><\/pre>\n<p>  <\/p>\n<p><code>'BOW: 693.1'<\/code><\/p>\n<p>  <\/p>\n<h3 id=\"12-bow-s-lemmami-slov\">1.2 BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0441\u043b\u043e\u0432<\/h3>\n<p>  <\/p>\n<p>\u0422\u043e\u0442 \u0436\u0435 \u0430\u043b\u0433\u043e\u0440\u0438\u0442\u043c, \u043d\u043e \u0442\u0435\u043f\u0435\u0440\u044c \u0431\u0443\u0434\u0443\u0442 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c\u0441\u044f \u043b\u0435\u043c\u043c\u044b \u0441\u043b\u043e\u0432.<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">morph = pymorphy2.MorphAnalyzer()  def lemmatize(corpus, verbose=False):     clear_corpus = []     if verbose:         iterator = tqdm(corpus, leave=False)     else:     
    iterator = corpus     for sentence in iterator:         tokens = sentence.split() # \u0440\u0430\u0437\u0431\u0438\u0432\u0430\u0435\u043c \u0442\u0435\u043a\u0441\u0442 \u043d\u0430 \u0441\u043b\u043e\u0432\u0430         res = []         for token in tokens:             p = morph.parse(token)[0]             res.append(p.normal_form)         clear_corpus.append(' '.join(res))     return clear_corpus  count_vectorizer = CountVectorizer() corpus = lemmatize(TEXTS_CORPUS, True) count_vectorizer.fit(corpus)  def get_similarity_values(sentences):     sentences_bow = count_vectorizer.transform(lemmatize(sentences))     distances = cosine_distances(sentences_bow, sentences_bow)     return distances  evaluate(get_similarity_values, 'BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0441\u043b\u043e\u0432', False)<\/code><\/pre>\n<p>  <\/p>\n<p><code>'BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0441\u043b\u043e\u0432: 1645.8'<\/code><\/p>\n<p>  <\/p>\n<h3 id=\"13-bow-s-lemmami-s-ochistkoy-stop-slov--i-znakov-prepinaniya\">1.3 BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0441 \u043e\u0447\u0438\u0441\u0442\u043a\u043e\u0439 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432 \u0438 \u0437\u043d\u0430\u043a\u043e\u0432 \u043f\u0440\u0435\u043f\u0438\u043d\u0430\u043d\u0438\u044f<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">ru_stopwords = stopwords.words('russian') ru_stopwords += ['.', ',', '&quot;', '!',                  '?','(', ')', '-',                  ':', ';', '_', '\\\\']  def delete_stopwords(corpus, verbose=False):     clear_corpus = []     if verbose:         iterator = tqdm(corpus, leave=False)     else:         iterator = corpus     for sentence in iterator:         tokens = sentence.split() # \u0440\u0430\u0437\u0431\u0438\u0432\u0430\u0435\u043c \u0442\u0435\u043a\u0441\u0442 \u043d\u0430 \u0441\u043b\u043e\u0432\u0430         res = []         without_stopwords = [token for token in tokens if token not in ru_stopwords]         
clear_corpus.append(' '.join(without_stopwords))     return clear_corpus  count_vectorizer = CountVectorizer() corpus = lemmatize(delete_stopwords(TEXTS_CORPUS), True) count_vectorizer.fit(corpus)  def get_similarity_values(sentences):     sentences_bow = count_vectorizer.transform(lemmatize(delete_stopwords(sentences)))     distances = cosine_distances(sentences_bow, sentences_bow)     return distances  evaluate(get_similarity_values, 'BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0438 \u0431\u0435\u0437 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432')<\/code><\/pre>\n<p>  <\/p>\n<p><code>'BOW \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0438 \u0431\u0435\u0437 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: 1917.6'<\/code><\/p>\n<p>  <\/p>\n<h3 id=\"14-lda\">1.4 LDA<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">def similarity_values_wrapper(lda, count_vectorizer, do_lemmatize=False, do_delete_stopwords=False):     def get_similarity_values(sentences):         if do_delete_stopwords:             sentences = delete_stopwords(sentences)         if do_lemmatize:             sentences = lemmatize(sentences)          sent_vector = count_vectorizer.transform(sentences)         sent_vector = lda.transform(sent_vector)         distances = cosine_distances(sent_vector, sent_vector)         return distances     return get_similarity_values  lda = LatentDirichletAllocation(n_components=300) corpus = TEXTS_CORPUS count_vectorizer = CountVectorizer().fit(corpus) corpus = count_vectorizer.transform(corpus) lda.fit(corpus) get_similarity_values = similarity_values_wrapper(lda, count_vectorizer)  print(evaluate(get_similarity_values, 'LDA', False))  lda = LatentDirichletAllocation(n_components=300) corpus = lemmatize(TEXTS_CORPUS, True) count_vectorizer = CountVectorizer().fit(corpus) corpus = count_vectorizer.transform(corpus) lda.fit(corpus) get_similarity_values = similarity_values_wrapper(lda, count_vectorizer, do_lemmatize=True)  
print(evaluate(get_similarity_values, 'LDA \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438', True))  lda = LatentDirichletAllocation(n_components=300) corpus = lemmatize(delete_stopwords(TEXTS_CORPUS), True) count_vectorizer = CountVectorizer().fit(corpus) corpus = count_vectorizer.transform(corpus) lda.fit(corpus) get_similarity_values = similarity_values_wrapper(lda, count_vectorizer, do_lemmatize=True, do_delete_stopwords=True)  print(evaluate(get_similarity_values, 'LDA \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0438 \u0431\u0435\u0437 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432', False))<\/code><\/pre>\n<p>  <\/p>\n<p><code>LDA: 344.7<\/code><br \/>  <code>LDA \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438: 1092.1<\/code><br \/>  <code>LDA \u0441 \u043b\u0435\u043c\u043c\u0430\u043c\u0438 \u0438 \u0431\u0435\u0437 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432: 1077.2<\/code><\/p>\n<p>  <\/p>\n<pre><code class=\"python\">%matplotlib inline def plot_results():     methods = sorted(chart_methods.items(), key=lambda x: x[1][0])      labels = [m[0] for m in methods]     x_pos = np.arange(len(labels))     mean = [m[1][0] for m in methods]     std = [m[1][1] for m in methods]      # Build the plot     fig, ax = plt.subplots(figsize=(12,8))     ax.bar(x_pos,            mean,            yerr=std,            align='center',            alpha=0.5,            ecolor='black',            capsize=10)     ax.set_ylabel('\u0411\u0430\u043b\u043b\u044b \u0438 \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u043e\u0435 \u043e\u0442\u043a\u043b\u043e\u043d\u0435\u043d\u0438\u0435')      ax.set_xticks(x_pos)     ax.set_xticklabels(labels, rotation=20, ha='right')     ax.set_title('\u0421\u0440\u0430\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0439 \u0433\u0440\u0430\u0444\u0438\u043a \u043c\u0435\u0442\u043e\u0434\u043e\u0432')     ax.yaxis.grid(True)     plt.show()  plot_results()<\/code><\/pre>\n<p>  <\/p>\n<p><img decoding=\"async\" 
src=\"https:\/\/habrastorage.org\/webt\/x8\/m3\/nx\/x8m3nxb3d4qrppthhbbcd4ryj4m.png\" alt=\"png\"\/><\/p>\n<p>  <\/p>\n<h1 id=\"2-metody-ispolzuyuschie-embedingi-tokenov\">2. \u041c\u0435\u0442\u043e\u0434\u044b, \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u044e\u0449\u0438\u0435 \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0438 \u0442\u043e\u043a\u0435\u043d\u043e\u0432<\/h1>\n<p>  <\/p>\n<p>\u0411\u0443\u0434\u0435\u043c \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u043f\u0440\u0435\u0434\u043e\u0431\u0443\u0447\u0435\u043d\u043d\u044b\u0435 \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0438 \u0438 \u0434\u043e\u0431\u0430\u0432\u0438\u043c \u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0441\u0442\u044c \u0432\u044b\u0431\u0438\u0440\u0430\u0442\u044c \u043c\u0435\u0442\u043e\u0434 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 \u0438 \u0444\u0443\u043d\u043a\u0446\u0438\u044e \u0440\u0430\u0441\u0441\u0442\u043e\u044f\u043d\u0438\u044f.<br \/>  <a href=\"https:\/\/fasttext.cc\/\" rel=\"nofollow\">\u0417\u0434\u0435\u0441\u044c<\/a> \u043c\u043e\u0436\u043d\u043e \u0441\u043a\u0430\u0447\u0430\u0442\u044c fasttext<br \/>  <a href=\"https:\/\/wikipedia2vec.github.io\/wikipedia2vec\/pretrained\/\" rel=\"nofollow\">\u0410 \u0432\u043e\u0442 \u0442\u0443\u0442<\/a> \u0441\u0441\u044b\u043b\u043a\u0430 \u0434\u043b\u044f \u0441\u043a\u0430\u0447\u0438\u0432\u0430\u043d\u0438\u044f \u0438 \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u044f \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043d\u0438\u044f gensim \u043c\u043e\u0434\u0435\u043b\u0438 word2vec<\/p>\n<p>  <\/p>\n<pre><code class=\"python\"># \u0421\u043a\u0430\u0447\u0438\u0432\u0430\u043d\u0438\u0435 \u043f\u0440\u0435\u0434\u043e\u0431\u0443\u0447\u0435\u043d\u043d\u044b\u0445 \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u043e\u0432 import fasttext.util from wikipedia2vec import Wikipedia2Vec 
fasttext.util.download_model('ru', if_exists='ignore')  wiki2vec = Wikipedia2Vec.load('ruwiki_20180420_300d.pkl') ft = fasttext.load_model('cc.ru.300.bin')<\/code><\/pre>\n<p>  <\/p>\n<h3 id=\"21-srednee-po-embedingu-slov\">2.1 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0441\u043b\u043e\u0432<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">def vectorize(token, use_word2vec=True, use_fasttext=True):     assert use_word2vec or use_fasttext     if use_fasttext:         try:             fast_text_vector = ft.get_word_vector(token)         except KeyError:             fast_text_vector = np.zeros((ft.get_dimension()))      if use_word2vec:         try:             word2vec_vector = wiki2vec.get_word_vector(token)         except KeyError:             word2vec_vector = np.zeros((len(wiki2vec.get_word_vector('the'))))      if use_fasttext and use_word2vec:         return np.concatenate([word2vec_vector, fast_text_vector])     elif use_fasttext:         return np.array(fast_text_vector)     elif use_word2vec:         return np.array(word2vec_vector)     else:         return 'something went wrong on vectorisation'  print(np.shape(vectorize('any_token')))<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):     def get_similarity_values(sentences):         sent_vector = []         for sentence in sentences:             sentence_vector = []             for token in sentence.split():                 sentence_vector.append(vectorize(token, use_word2vec, use_fasttext))             sent_vector.append(np.mean(sentence_vector, axis=0))         distances = distance_function(sent_vector, sent_vector)         return distances     return get_similarity_values  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=euclidean_distances) 
print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 euclidean_distances \u0441 word2vec + fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True, distance_function=euclidean_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 euclidean_distances \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False, distance_function=euclidean_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 euclidean_distances \u0441 word2vec'))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 word2vec + fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True, distance_function=cosine_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False, distance_function=cosine_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 word2vec'))<\/code><\/pre>\n<p>  <\/p>\n<p>\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 euclidean_distances \u0441 word2vec + fast_text: 1833.6<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 euclidean_distances \u0441 fast_text: 913.5<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings 
\u0441 euclidean_distances \u0441 word2vec: 1941.6<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 word2vec + fast_text: 2278.1<br \/>  c\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 fast_text: 829.2<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 cosine_distance \u0441 word2vec: 2437.7<\/p>\n<p>  <\/p>\n<h3 id=\"22-srednee-po-embedingu-s-predvaritelnoy-ochistkoy-stop-slov\">2.2 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0441 \u043f\u0440\u0435\u0434\u0432\u0430\u0440\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439 \u043e\u0447\u0438\u0441\u0442\u043a\u043e\u0439 \u0441\u0442\u043e\u043f \u0441\u043b\u043e\u0432<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):     def get_similarity_values(sentences):         sentences = delete_stopwords(sentences)         sent_vector = []         for sentence in sentences:             sentence_vector = []             for token in sentence.split():                 sentence_vector.append(vectorize(token, use_word2vec, use_fasttext))             sent_vector.append(np.mean(sentence_vector, axis=0))         distances = distance_function(sent_vector, sent_vector)         return distances     return get_similarity_values  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=euclidean_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 word2vec + fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True, distance_function=euclidean_distances) 
print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False, distance_function=euclidean_distances) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 word2vec'))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 word2vec + fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False) print(evaluate(get_similarity_values, '\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 word2vec'))<\/code><\/pre>\n<p>  <\/p>\n<p>\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 word2vec + fast_text: 2116.9<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 fast_text: 1314.5<br \/>  
\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 euclidean_distances \u0441 word2vec: 2159.1<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 word2vec + fast_text: 2779.7<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 fast_text: 2199.0<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0431\u0435\u0437 \u0441\u0442\u043e\u043f\u0441\u043b\u043e\u0432 \u0441 cosine_distance \u0441 word2vec: 2814.4<\/p>\n<p>  <\/p>\n<h3 id=\"23-srednee-po-embedingu-s-vesami-tf-idf\">2.3 \u0421\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u0443 \u0441 \u0432\u0435\u0441\u0430\u043c\u0438 tf-idf<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">tf_idf_vectorizer = TfidfVectorizer() tf_idf_vectorizer.fit(TEXTS_CORPUS) vocab = tf_idf_vectorizer.get_feature_names()<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):     def get_similarity_values(sentences):         sent_vector = [[]]*len(sentences)         weights_data = tf_idf_vectorizer.transform(sentences).tocoo()         for row, col, weight in zip(weights_data.row, weights_data.col, weights_data.data):             sent_vector[row].append(weight*vectorize(vocab[col], use_word2vec, use_fasttext))          for row in range(len(sent_vector)):             if not sent_vector[row]:                 sent_vector.append((len(vectorize('zoros_vector'))))         sent_vector = np.sum(sent_vector, axis=1)         distances = distance_function(sent_vector, sent_vector)         return distances     return get_similarity_values  
get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 word2vec + fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 word2vec', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=euclidean_distances) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 word2vec + fast_text', add_to_chart=True))  get_similarity_values = similarity_values_wrapper(use_word2vec=False, use_fasttext=True, distance_function=euclidean_distances) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 fast_text', add_to_chart=False))  get_similarity_values = similarity_values_wrapper(use_word2vec=True, use_fasttext=False, distance_function=euclidean_distances) print(evaluate(get_similarity_values,'\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 word2vec', add_to_chart=False))<\/code><\/pre>\n<p>  <\/p>\n<p>\u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 word2vec + fast_text: -133.6<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 
\u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 fast_text: 9.0<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 cosine_distance \u0441 word2vec: -133.6<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 word2vec + fast_text: 6.4<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 fast_text: -133.6<br \/>  \u0441\u0440\u0435\u0434\u043d\u0435\u0435 \u043f\u043e embedings \u0441 tf-idf \u0441 euclidian_distance \u0441 word2vec: -133.6<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">plot_results()<\/code><\/pre>\n<p>  <\/p>\n<p><img decoding=\"async\" src=\"https:\/\/habrastorage.org\/webt\/h_\/ta\/rq\/h_tarq9one97n2zp_afji7rtui0.png\" alt=\"png\"\/><\/p>\n<p>  <\/p>\n<h1 id=\"metody-bez-uchitelya\">\u041c\u0435\u0442\u043e\u0434\u044b \u0431\u0435\u0437 \u0443\u0447\u0438\u0442\u0435\u043b\u044f<\/h1>\n<p>  <\/p>\n<p>\u0421\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0435 \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u043c\u043e\u0434\u0435\u043b\u0435\u0439 \u043f\u043e\u0442\u0440\u0435\u0431\u0443\u044e\u0442 \u043f\u043e\u0442\u043e\u043a\u043e\u0432\u043e\u0439 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0434\u0430\u043d\u043d\u044b\u0445, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u0441\u0434\u0435\u043b\u0430\u0435\u043c \u0443\u043d\u0438\u0432\u0435\u0440\u0441\u0430\u043b\u044c\u043d\u044b\u0435 \u0433\u0435\u043d\u0435\u0440\u0430\u0442\u043e\u0440\u044b.<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">max_len = 20 min_len = 5 embedding_size = len(vectorize('any token'))  class EmbedingsDataGenerator():     def __init__(self, texts_corpus=TEXTS_CORPUS, min_len=5, max_len=20, batch_size=32, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):         self.texts = texts_corpus         self.min_len = min_len         self.max_len = 
max_len         self.batch_size = batch_size         self.batches_per_epoch = batches_per_epoch         self.use_word2vec = use_word2vec         self.use_fasttext = use_fasttext         self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))      def vectorize(self, sentences):         vectorized_sentences = []         for text in sentences:             text_vec = []             tokens = str(text).split()             for token in tokens:                 text_vec.append(vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))             vectorized_sentences.append(text_vec)         vectorized_sentences = pad_sequences(vectorized_sentences, maxlen=self.max_len, dtype='float32')         return vectorized_sentences      def __iter__(self):         for _ in tqdm(range(self.batches_per_epoch), leave=False):             X_batch = []             y_batch = []             finished_batch = False             while not finished_batch:                 text = random.choice(self.texts)                 tokens = str(text).split()                 if len(tokens) &lt; self.min_len:                     continue                 x_vec = []                 for token in tokens:                     token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)                     if len(x_vec) &gt;= self.min_len:                         X_batch.append(x_vec)                         y_batch.append(token_vec)                         if len(X_batch) == self.batch_size:                             X_batch = pad_sequences(X_batch, maxlen=self.max_len, dtype='float32')                             yield np.array(X_batch), np.array(y_batch)                             finished_batch = True                             break                     x_vec.append(token_vec)  class IndexesDataGenerator(EmbedingsDataGenerator):     def __init__(self, *args, **kwargs):         super().__init__(*args, 
**kwargs)         self.token2index = {}         index = 0         for text in self.texts:             tokens = str(text).split()             for token in tokens:                 if token not in self.token2index:                     self.token2index[token] = index                     index += 1      def __iter__(self):         for _ in tqdm(range(self.batches_per_epoch), leave=False):             X_batch = []             X_batch_indexes = []             y_batch = []             finished_batch = False             while not finished_batch:                 text = random.choice(self.texts)                 tokens = str(text).split()                 if len(tokens) &lt; self.min_len:                     continue                 x_vec = []                 x_tokens = []                 for token in tokens:                     token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)                     if len(x_vec) &gt;= self.min_len:                         X_batch.append(x_vec)                         X_batch_indexes.append(to_categorical(x_tokens, num_classes=len(self.token2index)))                         y_batch.append(self.token2index[token])                         if len(X_batch) == self.batch_size:                             X_batch = pad_sequences(X_batch, maxlen=self.max_len, dtype='float32')                             X_batch_indexes = pad_sequences(X_batch_indexes, maxlen=self.max_len, dtype='int32')                             y_batch = to_categorical(y_batch, num_classes=len(self.token2index))                             yield np.array(X_batch), np.array(X_batch_indexes), np.array(y_batch)                             finished_batch = True                             break                     x_vec.append(token_vec)                     x_tokens.append(self.token2index[token])<\/code><\/pre>\n<p>  <\/p>\n<p>\u0415\u0441\u043b\u0438 \u043a\u0430\u0436\u0442\u0441\u044f, \u0447\u0442\u043e 
\u043f\u043e\u0442\u043e\u043a\u043e\u0432\u0430\u044f \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u044f \u0441\u043b\u0438\u0448\u043a\u043e\u043c \u0434\u043e\u0440\u043e\u0433\u0430\u044f, \u0442\u043e \u043e\u0446\u0435\u043d\u0438\u0442\u0435 \u0432\u0440\u0435\u043c\u044f, \u043a\u043e\u0442\u043e\u0440\u043e\u0435 \u0437\u0430\u043d\u0438\u043c\u0430\u0435\u0442 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u044f 100 \u0431\u0430\u0442\u0447\u0435\u0439 \u0441 \u0440\u0430\u0437\u043c\u0435\u0440\u043e\u043c \u0431\u0430\u0442\u0447\u0430 32, \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u0442\u0441\u044f:<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">data_generator = EmbedingsDataGenerator()<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">%%timeit for x, y in data_generator:     pass<\/code><\/pre>\n<p>  <\/p>\n<p>448 ms \u00b1 65.6 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">data_generator = IndexesDataGenerator()<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">%%timeit for x_e, x_i, y_i in data_generator:     pass<\/code><\/pre>\n<p>  <\/p>\n<p>5.77 s \u00b1 115 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)<\/p>\n<p>  <\/p>\n<h1 id=\"3-languade-models\">3. Language Models<\/h1>\n<p>  <\/p>\n<p>\u041d\u0435\u0439\u0440\u043e\u043d\u043d\u0430\u044f \u0441\u0435\u0442\u044c \u0443\u0433\u0430\u0434\u044b\u0432\u0430\u0435\u0442 \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0435\u0435 \u0441\u043b\u043e\u0432\u043e \u0432 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0438. 
\u041f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u0430\u043d\u0438\u0435 \u043f\u043e \u0432\u0441\u0435\u0439 \u0434\u043b\u0438\u043d\u0435 \u0442\u0435\u043a\u0441\u0442\u0430, \u044f\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u044d\u043c\u0431\u0435\u0434\u0438\u043d\u0433\u043e\u043c \u043f\u0440\u0435\u0434\u043e\u0436\u0435\u043d\u0438\u044f.<br \/>  \u0423\u0433\u0430\u0434\u044b\u0432\u0430\u0442\u044c \u0431\u0443\u0434\u0435\u043c \u043d\u0430 \u043e\u0441\u043d\u043e\u0432\u0430\u043d\u0438\u0438 \u043f\u0440\u0435\u0434\u044b\u0434\u0443\u0449\u0438\u0445 \u0441\u043b\u043e\u0432: \u043c\u0430\u043a\u0441\u0438\u043c\u0443\u043c 20 \u0438 \u043c\u0438\u043d\u0438\u043c\u0443\u043c 5.<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">def similarity_values_wrapper(embedder, vectorizer, distance_function=cosine_distances):     def get_similarity_values(sentences):         sent_vec = vectorizer(sentences)         sent_embedings = embedder(sent_vec)         distances = distance_function(sent_embedings, sent_embedings)         return distances     return get_similarity_values<\/code><\/pre>\n<p>  <\/p>\n<h3 id=\"31-language-model-on-embedings\">3.1 Language Model on embedings<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">def model_builder(data_generator):     complexity = 500     inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))     X = inp     X = LSTM(complexity, return_sequences=True)(X)     X = LSTM(complexity)(X)     X = Dense(complexity, activation='elu')(X)     X = Dense(complexity, activation='elu')(X)     X = Dense(data_generator.embedding_size, activation='linear')(X)     model = Model(inputs=inp, outputs=X)     model.compile(loss=cosine_similarity, optimizer='adam')     model.summary()     return model  data_generator = EmbedingsDataGenerator(use_fasttext=False) next_word_model = model_builder(data_generator) get_similarity_values = similarity_values_wrapper(next_word_model.predict, 
data_generator.vectorize)<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">new_result = -10e5 for i in tqdm(range(1000)):     if i%3==0:         previous_result = new_result         new_result = evaluate(get_similarity_values, 'Language Model on embedings')         new_result = parse_result(new_result)         print(i, new_result)         # stopping condition         if new_result &lt; previous_result and i &gt; 20:             break     for x, y in data_generator:         next_word_model.train_on_batch(x, y)<\/code><\/pre>\n<p>  <\/p>\n<p>0 1644.6<br \/>  3 148.7<br \/>  6 274.8<br \/>  9 72.3<br \/>  12 186.8<br \/>  15 183.7<br \/>  18 415.8<br \/>  21 138.9<\/p>\n<p>  <\/p>\n<h3 id=\"32-language-model-on-token-index\">3.2 Language Model on token index<\/h3>\n<p>  <\/p>\n<pre><code class=\"python\">def model_builder(data_generator):     complexity = 200     inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))     X = inp     X = LSTM(complexity, return_sequences=True)(X)     X = LSTM(complexity)(X)     X = Dense(complexity, activation='linear', name='embedding_output')(X)     X = Dense(complexity, activation='elu')(X)     X = Dense(len(data_generator.token2index), activation='softmax')(X)     model = Model(inputs=inp, outputs=X)     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])     model.summary()     embedder = Model(inputs=inp, outputs=model.get_layer('embedding_output').output)     return model, embedder    data_generator = IndexesDataGenerator() next_word_model, embedder = model_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)<\/code><\/pre>\n<p>  <\/p>\n<pre><code class=\"python\">new_result = -10e5 for i in tqdm(range(1000)):     if i%3==0:         previous_result = new_result         new_result = evaluate(get_similarity_values, 'Language Model on token index')         new_result = parse_result(new_result)         print(i, 
new_result)         if new_result &lt; previous_result and i &gt; 20:             break     for x_e, x_i, y in data_generator:         next_word_model.train_on_batch(x_e, y)<\/code><\/pre>\n<p>  <\/p>\n<p>0 1700.6<br \/>  3 404.7<br \/>  6 255.3<br \/>  9 379.8<br \/>  12 195.2<br \/>  15 160.1<br \/>  18 530.7<br \/>  21 701.9<br \/>  24 536.9<\/p>\n<p>  <\/p>\n<pre><code class=\"python\">plot_results()<\/code><\/pre>\n<p>  <\/p>\n<p><img decoding=\"async\" src=\"https:\/\/habrastorage.org\/webt\/ad\/ln\/j4\/adlnj4dlmxf641ev74rda-sr1j0.png\" alt=\"png\"\/><\/p>\n<p>  <\/p>\n<h1 id=\"konec-pervoy-chasti\">\u041a\u043e\u043d\u0435\u0446 \u043f\u0435\u0440\u0432\u043e\u0439 \u0447\u0430\u0441\u0442\u0438<\/h1>\n<p>  <\/p>\n<p>\u043f\u0440\u043e\u0434\u043e\u043b\u0436\u0435\u043d\u0438\u0435 \u0441\u043b\u0435\u0434\u0443\u0435\u0442 &#8230;<\/p>\n<\/div>\n<p> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/post\/515036\/\"> https:\/\/habr.com\/ru\/post\/515036\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"\n<div class=\"post__text post__text-html post__text_v1\" id=\"post-content-body\" data-io-article-url=\"https:\/\/habr.com\/ru\/post\/515036\/\">\n<p>\u041f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u0442\u0435 \u0441\u0435\u0431\u0435, \u043a\u0430\u043a \u0431\u044b\u043b\u043e \u0431\u044b \u0443\u0434\u043e\u0431\u043d\u043e, \u043d\u0430\u043f\u0438\u0441\u0430\u0442\u044c \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0438 \u043d\u0430\u0439\u0442\u0438 \u043f\u043e\u0445\u043e\u0436\u0435\u0435 \u043a \u043d\u0435\u043c\u0443 \u043f\u043e \u0441\u043c\u044b\u0441\u043b\u0443. 
\u0414\u043b\u044f \u044d\u0442\u043e\u0433\u043e \u043d\u0443\u0436\u043d\u043e \u0443\u043c\u0435\u0442\u044c \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u043e\u0432\u0430\u0442\u044c \u0432\u0441\u0451 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435, \u0447\u0442\u043e \u043c\u043e\u0436\u0435\u0442 \u0431\u044b\u0442\u044c \u043e\u0447\u0435\u043d\u044c \u043d\u0435\u0442\u0440\u0438\u0432\u0438\u0430\u043b\u044c\u043d\u043e\u0439 \u0437\u0430\u0434\u0430\u0447\u0435\u0439.<br \/>  \u041f\u043e \u0441\u043f\u0435\u0446\u0438\u0444\u0438\u043a\u0435 \u0441\u0432\u043e\u0435\u0439 \u0440\u0430\u0431\u043e\u0442\u044b, \u044f \u0434\u043e\u043b\u0436\u0435\u043d \u0438\u0441\u043a\u0430\u0442\u044c \u043f\u043e\u0445\u043e\u0436\u0438\u0435 \u0437\u0430\u043f\u0440\u043e\u0441\u044b \u0432 \u0441\u043b\u0443\u0436\u0431\u0443 \u043f\u043e\u0434\u0434\u0435\u0440\u0436\u043a\u0438 \u0438 \u0434\u0430\u0436\u0435 \u0438\u043c\u0435\u044f \u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u0431\u043e\u043b\u044c\u0448\u0443\u044e \u0440\u0430\u0437\u043c\u0435\u0442\u043a\u0443, \u0431\u044b\u0432\u0430\u0435\u0442 \u0442\u044f\u0436\u0435\u043b\u043e \u0441\u043e\u0431\u0440\u0430\u0442\u044c \u043d\u0435\u043e\u0431\u0445\u043e\u0434\u0438\u043c\u043e\u0435 \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0441\u043e\u043e\u0431\u0449\u0435\u043d\u0438\u0439 \u043f\u043e\u0434\u0445\u043e\u0434\u044f\u0449\u0438\u0445 \u043f\u043e \u0442\u0435\u043c\u0430\u0442\u0438\u043a\u0435, \u043d\u043e \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u043d\u044b\u0445 \u0434\u0440\u0443\u0433\u0438\u043c\u0438 \u0441\u043b\u043e\u0432\u0430\u043c\u0438.<br \/>  \u041d\u0438\u0436\u0435 \u043e\u0431\u0437\u043e\u0440\u043d\u043e\u0435 \u0438\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u043d\u0438\u0435 \u0441\u043f\u043e\u0441\u043e\u0431\u043e\u0432 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 
\u0432\u0441\u0435\u0433\u043e \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u044f \u0438 \u043d\u0435 \u043f\u0440\u043e\u0441\u0442\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438, \u0430 \u043f\u043e\u043f\u044b\u0442\u043a\u0430 \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u043e\u0432\u0430\u0442\u044c \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0441 \u0443\u0447\u0451\u0442\u043e\u043c \u0435\u0433\u043e \u0441\u043c\u044b\u0441\u043b\u0430.<br \/>  \u041d\u0430\u043f\u0440\u0438\u043c\u0435\u0440, \u0434\u0432\u0435 \u0444\u0440\u0430\u0437\u044b <strong>&#8216;\u044d\u043f\u043b \u043b\u0443\u0447\u0448\u0435 \u0441\u0430\u043c\u0441\u0443\u043d\u0433&#8217;<\/strong> \u0438 <strong>&#8216;\u0441\u0430\u043c\u0441\u0443\u043d\u0433 \u043b\u0443\u0447\u0448\u0435 \u044d\u043f\u043b&#8217;<\/strong> \u0434\u043e\u043b\u0436\u043d\u044b \u0431\u044b\u0442\u044c \u043d\u0430 \u043f\u0440\u043e\u0442\u0438\u0432\u043e\u043f\u043e\u043b\u043e\u0436\u043d\u043e\u043c \u043a\u043e\u043d\u0446\u0435 \u043f\u043e \u043e\u0434\u043d\u043e\u043c\u0443 \u0438\u0437 \u0437\u043d\u0430\u0447\u0435\u043d\u0438\u0439 \u0432\u0435\u043a\u0442\u043e\u0440\u0430, \u043d\u043e \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u0441\u043e\u0432\u043f\u0430\u0434\u0430\u0442\u044c \u043f\u043e \u0434\u0440\u0443\u0433\u0438\u043c.<br \/>  \u041c\u043e\u0436\u043d\u043e \u043f\u0440\u0438\u0432\u0435\u0441\u0442\u0438 \u0430\u043d\u0430\u043b\u043e\u0433\u0438\u044e \u0441 \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u043e\u0439 \u043d\u0438\u0436\u0435. 
\u041f\u043e \u0448\u043a\u0430\u043b\u0435 \u043e\u0442 \u043a\u0435\u043a\u0441\u0430 \u0434\u043e \u0441\u043e\u0431\u0430\u043a\u0438 \u043e\u043d\u0438 \u043d\u0430\u0445\u043e\u0434\u044f\u0442\u0441\u044f \u043d\u0430 \u0440\u0430\u0437\u043d\u044b\u0445 \u043a\u043e\u043d\u0446\u0430\u0445, \u0430 \u043f\u043e \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u0443 \u0447\u0451\u0440\u043d\u044b\u0445 \u0442\u043e\u0447\u0435\u043a \u0438 \u0446\u0432\u0435\u0442\u0443 \u043e\u0431\u044a\u0435\u043a\u0442\u0430 \u043d\u0430 \u043e\u0434\u043d\u043e\u043c.<\/p>\n<p>  <\/p>\n<p><img decoding=\"async\" src=\"https:\/\/cdn-media-1.freecodecamp.org\/images\/1*bt-E2YcPafjiPbZFDMMmNQ.jpeg\" alt=\"https:\/\/cdn-media-1.freecodecamp.org\/images\/1*bt-E2YcPafjiPbZFDMMmNQ.jpeg\"\/><\/p>\n<p>  <\/p>\n<p><a href=\"https:\/\/paperswithcode.com\/task\/sentence-embedding\" rel=\"nofollow\">\u0412\u043e\u0442 \u0442\u0443\u0442 \u0441\u0431\u043e\u0440\u043d\u0438\u043a \u0441\u0442\u0430\u0442\u0435\u0439 \u043f\u043e \u0432\u0435\u043a\u0442\u043e\u0440\u0438\u0437\u0430\u0446\u0438\u0438 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0439<\/a> 
<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-308420","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/308420","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=308420"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/308420\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=308420"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=308420"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=308420"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}