{"id":473305,"date":"2025-09-02T16:07:31","date_gmt":"2025-09-02T16:07:31","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=473305"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=473305","title":{"rendered":"<span>\u0421\u0435\u0433\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u044f \u043a\u043b\u0438\u0435\u043d\u0442\u043e\u0432 \u043c\u0435\u0442\u043e\u0434\u043e\u043c K-Means \u043d\u0430 \u0441\u0442\u043e\u0440\u043e\u043d\u0435 \u0411\u0414<\/span>"},"content":{"rendered":"<div><!--[--><!--]--><\/div>\n<div id=\"post-content-body\">\n<div>\n<div class=\"article-formatted-body article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<p>\u041f\u0440\u0438\u0432\u0435\u0442, \u0425\u0430\u0431\u0440! <\/p>\n<p>\u0421\u0435\u0433\u043e\u0434\u043d\u044f \u043c\u044b \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c, \u043a\u0430\u043a \u0437\u0430\u0441\u0442\u0430\u0432\u0438\u0442\u044c PostgreSQL \u0441\u0430\u043c\u043e\u0441\u0442\u043e\u044f\u0442\u0435\u043b\u044c\u043d\u043e \u043a\u0440\u0443\u0442\u0438\u0442\u044c K-Means \u0434\u043b\u044f \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u0438 \u043a\u043b\u0438\u0435\u043d\u0442\u043e\u0432, \u043d\u0435 \u0432\u044b\u0442\u0430\u0441\u043a\u0438\u0432\u0430\u044f \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0430\u0440\u0443\u0436\u0443. 
\u041f\u0440\u043e\u0439\u0434\u0435\u043c\u0441\u044f \u043f\u043e \u0446\u0438\u043a\u043b\u0443: \u043d\u043e\u0440\u043c\u0430\u043b\u0438\u0437\u0443\u0435\u043c \u0444\u0438\u0447\u0438 \u0432 materialized view, \u043d\u0430\u043f\u0438\u0448\u0435\u043c \u0444\u0443\u043d\u043a\u0446\u0438\u044e PL\/PythonU, \u043a\u043e\u0442\u043e\u0440\u0430\u044f \u0434\u0435\u0440\u0433\u0430\u0435\u0442 scikit-learn, \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u043c cluster_id \u043e\u0431\u0440\u0430\u0442\u043d\u043e \u0432 \u0442\u0430\u0431\u043b\u0438\u0446\u0443 \u0438 \u0437\u0430\u043a\u0440\u044b\u0432\u0430\u0435\u043c \u0433\u0435\u0448\u0442\u0430\u043b\u044c\u0442 \u043e\u0442\u0447\u0451\u0442\u043e\u043c \u00ab\u0434\u043e\u0445\u043e\u0434 \u043f\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0443\u00bb \u0447\u0438\u0441\u0442\u044b\u043c SQL. <\/p>\n<h3>\u0421\u0445\u0435\u043c\u0430 \u0434\u0430\u043d\u043d\u044b\u0445<\/h3>\n<p>\u0414\u043e\u043f\u0443\u0441\u0442\u0438\u043c, \u0435\u0441\u0442\u044c \u0442\u0440\u0430\u043d\u0437\u0430\u043a\u0446\u0438\u0438 \u0438 \u0431\u0430\u0437\u043e\u0432\u0430\u044f \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u044f \u043e \u043a\u043b\u0438\u0435\u043d\u0442\u0430\u0445:<\/p>\n<pre><code class=\"sql\">CREATE TABLE public.customers (     customer_id      bigint PRIMARY KEY,     registered_at    timestamptz,     email            text UNIQUE );  CREATE TABLE public.orders (     order_id         bigint PRIMARY KEY,     customer_id      bigint REFERENCES public.customers,     order_dt         timestamptz NOT NULL,     order_amount     numeric(12,2) NOT NULL );<\/code><\/pre>\n<h3>\u0413\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u044f \u0444\u0438\u0447\u0435\u0439 \u0438 \u043d\u043e\u0440\u043c\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f \u0432 Materialized View<\/h3>\n<p>\u041d\u0443\u0436\u043d\u044b \u0447\u0438\u0441\u043b\u043e\u0432\u044b\u0435 
\u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u043d\u0430 \u043e\u0434\u043d\u043e\u0433\u043e \u043a\u043b\u0438\u0435\u043d\u0442\u0430: <code>orders_cnt<\/code>, <code>days_since_last<\/code>, <code>mean_amount<\/code>, <code>total_amount<\/code>. \u0421\u0440\u0430\u0437\u0443 \u0434\u0435\u043b\u0430\u0435\u043c z-score, \u0447\u0442\u043e\u0431\u044b K-Means \u043d\u0435 \u0441\u0442\u0440\u0430\u0434\u0430\u043b \u043e\u0442 \u0440\u0430\u0437\u043d\u044b\u0445 \u043c\u0430\u0441\u0448\u0442\u0430\u0431\u043e\u0432. \u0412\u0441\u0451 \u0432 \u043e\u0434\u043d\u043e\u043c \u0437\u0430\u043f\u0440\u043e\u0441\u0435, \u0430 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043a\u0435\u0448\u0438\u0440\u0443\u0435\u043c \u043c\u0430\u0442\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u043c \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435\u043c:<\/p>\n<pre><code class=\"sql\">CREATE MATERIALIZED VIEW ds.mv_customer_features AS WITH stats AS (     SELECT         avg(orders_cnt)::numeric  AS avg_orders_cnt,         stddev_samp(orders_cnt)   AS sd_orders_cnt,         avg(days_since_last)      AS avg_days_last,         stddev_samp(days_since_last) AS sd_days_last,         avg(mean_amount)          AS avg_mean_amount,         stddev_samp(mean_amount)  AS sd_mean_amount,         avg(total_amount)         AS avg_total_amount,         stddev_samp(total_amount) AS sd_total_amount     FROM (         SELECT             c.customer_id,             COUNT(o.*)                      AS orders_cnt,             EXTRACT(EPOCH FROM (now() - MAX(o.order_dt))) \/ 86400 AS days_since_last,             AVG(o.order_amount)             AS mean_amount,             SUM(o.order_amount)             AS total_amount         FROM public.customers c         LEFT JOIN public.orders o USING (customer_id)         GROUP BY c.customer_id     ) sub ), base AS (     SELECT         c.customer_id,         COUNT(o.*)                   
   AS orders_cnt,         EXTRACT(EPOCH FROM (now() - MAX(o.order_dt))) \/ 86400 AS days_since_last,         AVG(o.order_amount)             AS mean_amount,         SUM(o.order_amount)             AS total_amount     FROM public.customers c     LEFT JOIN public.orders o USING (customer_id)     GROUP BY c.customer_id ), z AS (     SELECT         b.customer_id,         (b.orders_cnt    - s.avg_orders_cnt)   \/ NULLIF(s.sd_orders_cnt,0)   AS z_orders_cnt,         (b.days_since_last - s.avg_days_last)  \/ NULLIF(s.sd_days_last,0)    AS z_days_last,         (b.mean_amount   - s.avg_mean_amount)  \/ NULLIF(s.sd_mean_amount,0)  AS z_mean_amount,         (b.total_amount  - s.avg_total_amount) \/ NULLIF(s.sd_total_amount,0) AS z_total_amount     FROM base b CROSS JOIN stats s ) SELECT * FROM z;<\/code><\/pre>\n<p>\u041c\u0430\u0442\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u0432\u0438\u0434 \u0445\u043e\u0440\u043e\u0448 \u0442\u0435\u043c, \u0447\u0442\u043e \u0435\u0433\u043e \u043c\u043e\u0436\u043d\u043e \u043e\u0441\u0432\u0435\u0436\u0430\u0442\u044c \u043f\u043e \u0440\u0430\u0441\u043f\u0438\u0441\u0430\u043d\u0438\u044e (<code>REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;<\/code>) \u0438 \u043e\u043d \u0434\u0430\u0451\u0442 \u0438\u043d\u0434\u0435\u043a\u0441\u044b, \u0435\u0441\u043b\u0438 \u043f\u043e\u043d\u0430\u0434\u043e\u0431\u0438\u0442\u0441\u044f. \u041f\u043e\u0434 \u043a\u0430\u043f\u043e\u0442\u043e\u043c \u0432\u0441\u0451 \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 SQL, \u043d\u0438\u043a\u0430\u043a\u0438\u0445 \u043a\u043e\u0441\u0442\u044b\u043b\u0435\u0439. 
<\/p>\n<h3>\u041f\u043e\u0434\u043a\u043b\u044e\u0447\u0430\u0435\u043c PL\/PythonU<\/h3>\n<pre><code class=\"sql\">CREATE EXTENSION IF NOT EXISTS plpython3u;<\/code><\/pre>\n<p>\u041f\u0440\u0430\u0432\u0430 \u0432\u044b\u0434\u0430\u0451\u043c \u0442\u043e\u043b\u044c\u043a\u043e \u043d\u0430 \u0447\u0442\u0435\u043d\u0438\u0435 <code>ds.mv_customer_features<\/code> \u0438 \u043d\u0430 \u0437\u0430\u043f\u0438\u0441\u044c \u0432 <code>public.customers.cluster_id<\/code>. \u041a\u043e\u043b\u043e\u043d\u043a\u0438 <code>cluster_id<\/code> \u0432 \u0441\u0445\u0435\u043c\u0435 \u0432\u044b\u0448\u0435 \u043d\u0435\u0442, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u0435\u0451 \u043d\u0443\u0436\u043d\u043e \u0434\u043e\u0431\u0430\u0432\u0438\u0442\u044c \u0437\u0430\u0440\u0430\u043d\u0435\u0435: <code>ALTER TABLE public.customers ADD COLUMN cluster_id int;<\/code><\/p>\n<h3>\u0425\u0440\u0430\u043d\u0438\u043c\u0430\u044f \u0444\u0443\u043d\u043a\u0446\u0438\u044f K-Means<\/h3>\n<p>\u0424\u0443\u043d\u043a\u0446\u0438\u044f \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u0442 \u0436\u0435\u043b\u0430\u0435\u043c\u043e\u0435 \u0447\u0438\u0441\u043b\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u043e\u0432 (<code>k<\/code>), \u043e\u0431\u0443\u0447\u0430\u0435\u0442 K-Means \u043d\u0430 \u043d\u043e\u0440\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0445 \u0444\u0438\u0447\u0430\u0445, \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u0442 \u043c\u043e\u0434\u0435\u043b\u044c \u0432 JSON (\u0434\u043b\u044f \u0438\u0441\u0442\u043e\u0440\u0438\u0438) \u0438 \u043f\u0438\u0448\u0435\u0442 \u043d\u043e\u043c\u0435\u0440 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430 \u043a\u043b\u0438\u0435\u043d\u0442\u0443 \u0432 \u0442\u0430\u0431\u043b\u0438\u0446\u0443.<\/p>\n<pre><code class=\"sql\">CREATE OR REPLACE FUNCTION ds.build_customer_clusters(k int DEFAULT 5) RETURNS void LANGUAGE plpython3u SECURITY DEFINER AS $$ import json from sklearn.cluster import KMeans from sklearn.exceptions import ConvergenceWarning import warnings  plpy.execute(\"SET search_path TO ds, public\")  # 1. 
\u0417\u0430\u0431\u0438\u0440\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435 rows = plpy.execute(\"\"\"     SELECT customer_id,            ARRAY[z_orders_cnt, z_days_last, z_mean_amount, z_total_amount] AS f     FROM ds.mv_customer_features     WHERE z_orders_cnt IS NOT NULL \"\"\")  if len(rows) &lt; k:     plpy.error(f\"Not enough data points ({len(rows)}) for k={k}\")  cust_ids = [r['customer_id'] for r in rows] X = [r['f'] for r in rows]  # 2. \u041e\u0431\u0443\u0447\u0430\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c warnings.filterwarnings(\"ignore\", category=ConvergenceWarning) model = KMeans(n_clusters=k, n_init='auto', random_state=42) model.fit(X) labels = model.labels_  # 3. \u0417\u0430\u043f\u0438\u0441\u044b\u0432\u0430\u0435\u043c \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430 tuples = [{'customer_id': cid, 'cluster_id': int(lbl)} for cid, lbl in zip(cust_ids, labels)] plpy.execute(\"CREATE TEMP TABLE _tmp_cluster (customer_id bigint, cluster_id int) ON COMMIT DROP\") plpy.execute(\"INSERT INTO _tmp_cluster VALUES \" +               \", \".join(f\"({t['customer_id']}, {t['cluster_id']})\" for t in tuples))  plpy.execute(\"\"\"     UPDATE public.customers c     SET cluster_id = t.cluster_id     FROM _tmp_cluster t     WHERE t.customer_id = c.customer_id \"\"\")  # 4. 
\u0421\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u0443\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c (\u043e\u043f\u0446\u0438\u043e\u043d\u0430\u043b\u044c\u043d\u043e) plpy.execute(\"\"\"     INSERT INTO ds.model_registry(model_name, trained_at, params, inertia)     VALUES ('customer_kmeans', now(), $1, $2) \"\"\", [json.dumps(model.get_params()), float(model.inertia_)]) $$;<\/code><\/pre>\n<p>\u0424\u0443\u043d\u043a\u0446\u0438\u044f \u0441 <code><strong>SECURITY DEFINER<\/strong><\/code> \u0438\u0441\u043f\u043e\u043b\u043d\u044f\u0435\u0442\u0441\u044f \u0441 \u043f\u0440\u0430\u0432\u0430\u043c\u0438 \u0432\u043b\u0430\u0434\u0435\u043b\u044c\u0446\u0430, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u0438 \u043d\u0435 \u043f\u043e\u043b\u0443\u0447\u0430\u044e\u0442 \u043b\u0438\u0448\u043d\u0438\u0445 \u043f\u0440\u0438\u0432\u0438\u043b\u0435\u0433\u0438\u0439. <strong>\u0422\u0435\u043c\u043f-\u0442\u0430\u0431\u043b\u0438\u0446\u0430<\/strong> \u0434\u043b\u044f \u043c\u0430\u0441\u0441\u043e\u0432\u043e\u0433\u043e \u0430\u043f\u0434\u0435\u0439\u0442\u0430 \u0431\u044b\u0441\u0442\u0440\u0435\u0435, \u0447\u0435\u043c <code>UPDATE \u2026 FROM (VALUES \u2026)<\/code> \u043d\u0430 \u0442\u044b\u0441\u044f\u0447\u0438 \u0441\u0442\u0440\u043e\u043a. 
<code><strong>random_state<\/strong><\/code> \u0444\u0438\u043a\u0441\u0438\u0440\u0443\u0435\u043c, \u0447\u0442\u043e\u0431\u044b \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u0432\u043e\u0441\u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0438\u043b\u0438\u0441\u044c.<\/p>\n<h3>\u0417\u0430\u043f\u0443\u0441\u043a \u0438 \u0440\u0430\u0441\u043f\u0438\u0441\u0430\u043d\u0438\u0435<\/h3>\n<p>\u0420\u0430\u0437\u043e\u0432\u043e:<\/p>\n<pre><code class=\"sql\">SELECT ds.build_customer_clusters(6); REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;<\/code><\/pre>\n<p>\u0415\u0449\u0435 \u0443\u0434\u043e\u0431\u043d\u043e \u0434\u0435\u043b\u0430\u0442\u044c <code>REFRESH<\/code> \u0438 \u043f\u0435\u0440\u0435\u0441\u0442\u0440\u043e\u0439\u043a\u0443 \u043c\u043e\u0434\u0435\u043b\u0438 \u043d\u043e\u0447\u044c\u044e cron-\u0434\u0436\u043e\u0431\u043e\u0439 \u0438\u043b\u0438 pg_cron:<\/p>\n<pre><code class=\"sql\">SELECT cron.schedule('0 3 * * *', $$REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;                        SELECT ds.build_customer_clusters(6);$$);<\/code><\/pre>\n<h3>\u041e\u0442\u0447\u0451\u0442 \u00ab\u0434\u043e\u0445\u043e\u0434 \u043f\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0443\u00bb<\/h3>\n<p>\u041d\u0438\u043a\u0430\u043a\u043e\u0433\u043e Python \u2014 \u0442\u043e\u043b\u044c\u043a\u043e SQL:<\/p>\n<pre><code class=\"sql\">WITH rev AS (     SELECT         c.cluster_id,         SUM(o.order_amount) AS revenue     FROM public.customers c     JOIN public.orders o USING (customer_id)     GROUP BY c.cluster_id ) SELECT     cluster_id,     revenue,     ROUND(revenue * 100.0 \/ SUM(revenue) OVER (), 2) AS revenue_pct FROM rev ORDER BY revenue DESC;<\/code><\/pre>\n<p>\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u0441\u0440\u0430\u0437\u0443 \u0433\u043e\u0442\u043e\u0432 \u043a \u0434\u0430\u0448\u0431\u043e\u0440\u0434\u0443: \u0432\u0438\u0434\u043d\u043e 
\u0434\u043e\u043b\u044e \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430 \u0432 \u043e\u0431\u0449\u0435\u0439 \u0432\u044b\u0440\u0443\u0447\u043a\u0435.<\/p>\n<h3>\u0418\u0442\u043e\u0433<\/h3>\n<p>\u0415\u0441\u043b\u0438 \u0443 \u0432\u0430\u0441 \u0443\u0436\u0435 \u0435\u0441\u0442\u044c \u043e\u043f\u044b\u0442 \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u0438 \u043d\u0430 \u0441\u0442\u043e\u0440\u043e\u043d\u0435 \u0411\u0414 \u2014 \u0434\u0435\u043b\u0438\u0442\u0435\u0441\u044c \u0432 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u0445: \u043a\u0430\u043a \u043c\u0430\u0441\u0448\u0442\u0430\u0431\u0438\u0440\u043e\u0432\u0430\u043b\u0438, \u043a\u0430\u043a\u0438\u0435 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u044b \u043b\u043e\u0432\u0438\u043b\u0438, \u0433\u0434\u0435 K-Means \u043d\u0435 \u0437\u0430\u0448\u0451\u043b. \u0427\u0435\u043c \u0431\u043e\u043b\u044c\u0448\u0435 \u043f\u0440\u0438\u043c\u0435\u0440\u043e\u0432, \u0442\u0435\u043c \u043f\u043e\u043b\u0435\u0437\u043d\u0435\u0435 \u0441\u0442\u0430\u0442\u044c\u044f \u0434\u043b\u044f \u0432\u0441\u0435\u0445.  
<\/p>\n<p><em>\u041f\u043e\u0433\u0440\u0443\u0437\u0438\u0442\u0435\u0441\u044c \u0432 \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u041f\u041e \u0441 \u043d\u0443\u043b\u044f: \u043d\u0430\u0443\u0447\u0438\u0442\u0435\u0441\u044c \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u0442\u044c \u0446\u0435\u043b\u0438 \u0431\u0438\u0437\u043d\u0435\u0441\u0430 \u0438 \u0444\u043e\u0440\u043c\u0443\u043b\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0442\u0435\u0445\u043d\u0438\u0447\u0435\u0441\u043a\u0438\u0435 \u0442\u0440\u0435\u0431\u043e\u0432\u0430\u043d\u0438\u044f \u043a \u043f\u0440\u043e\u0434\u0443\u043a\u0442\u0443 <\/em><a href=\"https:\/\/otus.pw\/r6W4\/\" rel=\"noopener noreferrer nofollow\"><em>\u043d\u0430 \u0431\u0430\u0437\u043e\u0432\u043e\u043c \u043a\u0443\u0440\u0441\u0435 \u00ab\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0439 \u0430\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u00bb.<\/em><\/a><\/p>\n<p><em>\u0427\u0442\u043e\u0431\u044b \u043e\u0441\u0442\u0430\u0432\u0430\u0442\u044c\u0441\u044f \u0432 \u043a\u0443\u0440\u0441\u0435 \u0430\u043a\u0442\u0443\u0430\u043b\u044c\u043d\u044b\u0445 \u0442\u0435\u0445\u043d\u043e\u043b\u043e\u0433\u0438\u0439 \u0438 \u0442\u0440\u0435\u043d\u0434\u043e\u0432, \u043f\u043e\u0434\u043f\u0438\u0441\u044b\u0432\u0430\u0439\u0442\u0435\u0441\u044c \u043d\u0430 <\/em><a href=\"https:\/\/t.me\/+dxHyB6AgI99kNDUy\" rel=\"noopener noreferrer nofollow\"><em>Telegram-\u043a\u0430\u043d\u0430\u043b OTUS.<\/em><\/a><\/p>\n<\/div>\n<\/div>\n<\/div>\n<p><!----><!----><\/div>\n<p><!----><!----><br \/> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/articles\/930506\/\"> https:\/\/habr.com\/ru\/articles\/930506\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<div><!--[--><!--]--><\/div>\n<div id=\"post-content-body\">\n<div>\n<div 
class=\"article-formatted-body article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<p>\u041f\u0440\u0438\u0432\u0435\u0442, \u0425\u0430\u0431\u0440! <\/p>\n<p>\u0421\u0435\u0433\u043e\u0434\u043d\u044f \u043c\u044b \u0440\u0430\u0441\u0441\u043c\u043e\u0442\u0440\u0438\u043c, \u043a\u0430\u043a \u0437\u0430\u0441\u0442\u0430\u0432\u0438\u0442\u044c PostgreSQL \u0441\u0430\u043c\u043e\u0441\u0442\u043e\u044f\u0442\u0435\u043b\u044c\u043d\u043e \u043a\u0440\u0443\u0442\u0438\u0442\u044c K-Means \u0434\u043b\u044f \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u0438 \u043a\u043b\u0438\u0435\u043d\u0442\u043e\u0432, \u043d\u0435 \u0432\u044b\u0442\u0430\u0441\u043a\u0438\u0432\u0430\u044f \u0434\u0430\u043d\u043d\u044b\u0435 \u043d\u0430\u0440\u0443\u0436\u0443. \u041f\u0440\u043e\u0439\u0434\u0435\u043c\u0441\u044f \u043f\u043e \u0446\u0438\u043a\u043b\u0443: \u043d\u043e\u0440\u043c\u0430\u043b\u0438\u0437\u0443\u0435\u043c \u0444\u0438\u0447\u0438 \u0432 materialized view, \u043d\u0430\u043f\u0438\u0448\u0435\u043c \u0444\u0443\u043d\u043a\u0446\u0438\u044e PL\/PythonU, \u043a\u043e\u0442\u043e\u0440\u0430\u044f \u0434\u0435\u0440\u0433\u0430\u0435\u0442 scikit-learn, \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u043c cluster_id \u043e\u0431\u0440\u0430\u0442\u043d\u043e \u0432 \u0442\u0430\u0431\u043b\u0438\u0446\u0443 \u0438 \u0437\u0430\u043a\u0440\u044b\u0432\u0430\u0435\u043c \u0433\u0435\u0448\u0442\u0430\u043b\u044c\u0442 \u043e\u0442\u0447\u0451\u0442\u043e\u043c \u00ab\u0434\u043e\u0445\u043e\u0434 \u043f\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0443\u00bb \u0447\u0438\u0441\u0442\u044b\u043c SQL. 
<\/p>\n<h3>\u0421\u0445\u0435\u043c\u0430 \u0434\u0430\u043d\u043d\u044b\u0445<\/h3>\n<p>\u0414\u043e\u043f\u0443\u0441\u0442\u0438\u043c, \u0435\u0441\u0442\u044c \u0442\u0440\u0430\u043d\u0437\u0430\u043a\u0446\u0438\u0438 \u0438 \u0431\u0430\u0437\u043e\u0432\u0430\u044f \u0438\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0438\u044f \u043e \u043a\u043b\u0438\u0435\u043d\u0442\u0430\u0445:<\/p>\n<pre><code class=\"sql\">CREATE TABLE public.customers (     customer_id      bigint PRIMARY KEY,     registered_at    timestamptz,     email            text UNIQUE );  CREATE TABLE public.orders (     order_id         bigint PRIMARY KEY,     customer_id      bigint REFERENCES public.customers,     order_dt         timestamptz NOT NULL,     order_amount     numeric(12,2) NOT NULL );<\/code><\/pre>\n<h3>\u0413\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u044f \u0444\u0438\u0447\u0435\u0439 \u0438 \u043d\u043e\u0440\u043c\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f \u0432 Materialized View<\/h3>\n<p>\u041d\u0443\u0436\u043d\u044b \u0447\u0438\u0441\u043b\u043e\u0432\u044b\u0435 \u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438 \u043d\u0430 \u043e\u0434\u043d\u043e\u0433\u043e \u043a\u043b\u0438\u0435\u043d\u0442\u0430: <code>orders_cnt<\/code>, <code>days_since_last<\/code>, <code>mean_amount<\/code>, <code>total_amount<\/code>. \u0421\u0440\u0430\u0437\u0443 \u0434\u0435\u043b\u0430\u0435\u043c z-score, \u0447\u0442\u043e\u0431\u044b K-Means \u043d\u0435 \u0441\u0442\u0440\u0430\u0434\u0430\u043b \u043e\u0442 \u0440\u0430\u0437\u043d\u044b\u0445 \u043c\u0430\u0441\u0448\u0442\u0430\u0431\u043e\u0432. 
\u0412\u0441\u0451 \u0432 \u043e\u0434\u043d\u043e\u043c \u0437\u0430\u043f\u0440\u043e\u0441\u0435, \u0430 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043a\u0435\u0448\u0438\u0440\u0443\u0435\u043c \u043c\u0430\u0442\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u043c \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435\u043c:<\/p>\n<pre><code class=\"sql\">CREATE MATERIALIZED VIEW ds.mv_customer_features AS WITH stats AS (     SELECT         avg(orders_cnt)::numeric  AS avg_orders_cnt,         stddev_samp(orders_cnt)   AS sd_orders_cnt,         avg(days_since_last)      AS avg_days_last,         stddev_samp(days_since_last) AS sd_days_last,         avg(mean_amount)          AS avg_mean_amount,         stddev_samp(mean_amount)  AS sd_mean_amount,         avg(total_amount)         AS avg_total_amount,         stddev_samp(total_amount) AS sd_total_amount     FROM (         SELECT             c.customer_id,             COUNT(o.*)                      AS orders_cnt,             EXTRACT(EPOCH FROM (now() - MAX(o.order_dt))) \/ 86400 AS days_since_last,             AVG(o.order_amount)             AS mean_amount,             SUM(o.order_amount)             AS total_amount         FROM public.customers c         LEFT JOIN public.orders o USING (customer_id)         GROUP BY c.customer_id     ) sub ), base AS (     SELECT         c.customer_id,         COUNT(o.*)                      AS orders_cnt,         EXTRACT(EPOCH FROM (now() - MAX(o.order_dt))) \/ 86400 AS days_since_last,         AVG(o.order_amount)             AS mean_amount,         SUM(o.order_amount)             AS total_amount     FROM public.customers c     LEFT JOIN public.orders o USING (customer_id)     GROUP BY c.customer_id ), z AS (     SELECT         b.customer_id,         (b.orders_cnt    - s.avg_orders_cnt)   \/ NULLIF(s.sd_orders_cnt,0)   AS z_orders_cnt,         (b.days_since_last - s.avg_days_last)  \/ NULLIF(s.sd_days_last,0)  
  AS z_days_last,         (b.mean_amount   - s.avg_mean_amount)  \/ NULLIF(s.sd_mean_amount,0)  AS z_mean_amount,         (b.total_amount  - s.avg_total_amount) \/ NULLIF(s.sd_total_amount,0) AS z_total_amount     FROM base b CROSS JOIN stats s ) SELECT * FROM z;<\/code><\/pre>\n<p>\u041c\u0430\u0442\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u0439 \u0432\u0438\u0434 \u0445\u043e\u0440\u043e\u0448 \u0442\u0435\u043c, \u0447\u0442\u043e \u0435\u0433\u043e \u043c\u043e\u0436\u043d\u043e \u043e\u0441\u0432\u0435\u0436\u0430\u0442\u044c \u043f\u043e \u0440\u0430\u0441\u043f\u0438\u0441\u0430\u043d\u0438\u044e (<code>REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;<\/code>) \u0438 \u043e\u043d \u0434\u0430\u0451\u0442 \u0438\u043d\u0434\u0435\u043a\u0441\u044b, \u0435\u0441\u043b\u0438 \u043f\u043e\u043d\u0430\u0434\u043e\u0431\u0438\u0442\u0441\u044f. \u041f\u043e\u0434 \u043a\u0430\u043f\u043e\u0442\u043e\u043c \u0432\u0441\u0451 \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 SQL, \u043d\u0438\u043a\u0430\u043a\u0438\u0445 \u043a\u043e\u0441\u0442\u044b\u043b\u0435\u0439. 
<\/p>\n<h3>\u041f\u043e\u0434\u043a\u043b\u044e\u0447\u0430\u0435\u043c PL\/PythonU<\/h3>\n<pre><code class=\"sql\">CREATE EXTENSION IF NOT EXISTS plpython3u;<\/code><\/pre>\n<p>\u041f\u0440\u0430\u0432\u0430 \u0432\u044b\u0434\u0430\u0451\u043c \u0442\u043e\u043b\u044c\u043a\u043e \u043d\u0430 \u0447\u0442\u0435\u043d\u0438\u0435 <code>ds.mv_customer_features<\/code> \u0438 \u043d\u0430 \u0437\u0430\u043f\u0438\u0441\u044c \u0432 <code>public.customers.cluster_id<\/code>. \u041a\u043e\u043b\u043e\u043d\u043a\u0438 <code>cluster_id<\/code> \u0432 \u0441\u0445\u0435\u043c\u0435 \u0432\u044b\u0448\u0435 \u043d\u0435\u0442, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u0435\u0451 \u043d\u0443\u0436\u043d\u043e \u0434\u043e\u0431\u0430\u0432\u0438\u0442\u044c \u0437\u0430\u0440\u0430\u043d\u0435\u0435: <code>ALTER TABLE public.customers ADD COLUMN cluster_id int;<\/code><\/p>\n<h3>\u0425\u0440\u0430\u043d\u0438\u043c\u0430\u044f \u0444\u0443\u043d\u043a\u0446\u0438\u044f K-Means<\/h3>\n<p>\u0424\u0443\u043d\u043a\u0446\u0438\u044f \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u0442 \u0436\u0435\u043b\u0430\u0435\u043c\u043e\u0435 \u0447\u0438\u0441\u043b\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u043e\u0432 (<code>k<\/code>), \u043e\u0431\u0443\u0447\u0430\u0435\u0442 K-Means \u043d\u0430 \u043d\u043e\u0440\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0445 \u0444\u0438\u0447\u0430\u0445, \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u0442 \u043c\u043e\u0434\u0435\u043b\u044c \u0432 JSON (\u0434\u043b\u044f \u0438\u0441\u0442\u043e\u0440\u0438\u0438) \u0438 \u043f\u0438\u0448\u0435\u0442 \u043d\u043e\u043c\u0435\u0440 \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430 \u043a\u043b\u0438\u0435\u043d\u0442\u0443 \u0432 \u0442\u0430\u0431\u043b\u0438\u0446\u0443.<\/p>\n<pre><code class=\"sql\">CREATE OR REPLACE FUNCTION ds.build_customer_clusters(k int DEFAULT 5) RETURNS void LANGUAGE plpython3u SECURITY DEFINER AS $$ import json from sklearn.cluster import KMeans from sklearn.exceptions import ConvergenceWarning import warnings  plpy.execute(\"SET search_path TO ds, public\")  # 1. 
\u0417\u0430\u0431\u0438\u0440\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435 rows = plpy.execute(\"\"\"     SELECT customer_id,            ARRAY[z_orders_cnt, z_days_last, z_mean_amount, z_total_amount] AS f     FROM ds.mv_customer_features     WHERE z_orders_cnt IS NOT NULL \"\"\")  if len(rows) &lt; k:     plpy.error(f\"Not enough data points ({len(rows)}) for k={k}\")  cust_ids = [r['customer_id'] for r in rows] X = [r['f'] for r in rows]  # 2. \u041e\u0431\u0443\u0447\u0430\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c warnings.filterwarnings(\"ignore\", category=ConvergenceWarning) model = KMeans(n_clusters=k, n_init='auto', random_state=42) model.fit(X) labels = model.labels_  # 3. \u0417\u0430\u043f\u0438\u0441\u044b\u0432\u0430\u0435\u043c \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0430 tuples = [{'customer_id': cid, 'cluster_id': int(lbl)} for cid, lbl in zip(cust_ids, labels)] plpy.execute(\"CREATE TEMP TABLE _tmp_cluster (customer_id bigint, cluster_id int) ON COMMIT DROP\") plpy.execute(\"INSERT INTO _tmp_cluster VALUES \" +               \", \".join(f\"({t['customer_id']}, {t['cluster_id']})\" for t in tuples))  plpy.execute(\"\"\"     UPDATE public.customers c     SET cluster_id = t.cluster_id     FROM _tmp_cluster t     WHERE t.customer_id = c.customer_id \"\"\")  # 4. 
\u0421\u0435\u0440\u0438\u0430\u043b\u0438\u0437\u0443\u0435\u043c \u043c\u043e\u0434\u0435\u043b\u044c (\u043e\u043f\u0446\u0438\u043e\u043d\u0430\u043b\u044c\u043d\u043e) plpy.execute(\"\"\"     INSERT INTO ds.model_registry(model_name, trained_at, params, inertia)     VALUES ('customer_kmeans', now(), $1, $2) \"\"\", [json.dumps(model.get_params()), float(model.inertia_)]) $$;<\/code><\/pre>\n<p>\u0424\u0443\u043d\u043a\u0446\u0438\u044f \u0441 <code><strong>SECURITY DEFINER<\/strong><\/code> \u0438\u0441\u043f\u043e\u043b\u043d\u044f\u0435\u0442\u0441\u044f \u0441 \u043f\u0440\u0430\u0432\u0430\u043c\u0438 \u0432\u043b\u0430\u0434\u0435\u043b\u044c\u0446\u0430, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u0438 \u043d\u0435 \u043f\u043e\u043b\u0443\u0447\u0430\u044e\u0442 \u043b\u0438\u0448\u043d\u0438\u0445 \u043f\u0440\u0438\u0432\u0438\u043b\u0435\u0433\u0438\u0439. <strong>\u0422\u0435\u043c\u043f-\u0442\u0430\u0431\u043b\u0438\u0446\u0430<\/strong> \u0434\u043b\u044f \u043c\u0430\u0441\u0441\u043e\u0432\u043e\u0433\u043e \u0430\u043f\u0434\u0435\u0439\u0442\u0430 \u0431\u044b\u0441\u0442\u0440\u0435\u0435, \u0447\u0435\u043c <code>UPDATE \u2026 FROM (VALUES \u2026)<\/code> \u043d\u0430 \u0442\u044b\u0441\u044f\u0447\u0438 \u0441\u0442\u0440\u043e\u043a. 
<code><strong>random_state<\/strong><\/code> \u0444\u0438\u043a\u0441\u0438\u0440\u0443\u0435\u043c, \u0447\u0442\u043e\u0431\u044b \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b \u0432\u043e\u0441\u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0438\u043b\u0438\u0441\u044c.<\/p>\n<h3>\u0417\u0430\u043f\u0443\u0441\u043a \u0438 \u0440\u0430\u0441\u043f\u0438\u0441\u0430\u043d\u0438\u0435<\/h3>\n<p>\u0420\u0430\u0437\u043e\u0432\u043e:<\/p>\n<pre><code class=\"sql\">SELECT ds.build_customer_clusters(6); REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;<\/code><\/pre>\n<p>\u0415\u0449\u0435 \u0443\u0434\u043e\u0431\u043d\u043e \u0434\u0435\u043b\u0430\u0442\u044c <code>REFRESH<\/code> \u0438 \u043f\u0435\u0440\u0435\u0441\u0442\u0440\u043e\u0439\u043a\u0443 \u043c\u043e\u0434\u0435\u043b\u0438 \u043d\u043e\u0447\u044c\u044e cron-\u0434\u0436\u043e\u0431\u043e\u0439 \u0438\u043b\u0438 pg_cron:<\/p>\n<pre><code class=\"sql\">SELECT cron.schedule('0 3 * * *', $$REFRESH MATERIALIZED VIEW CONCURRENTLY ds.mv_customer_features;                        SELECT ds.build_customer_clusters(6);$$);<\/code><\/pre>\n<h3>\u041e\u0442\u0447\u0451\u0442 \u00ab\u0434\u043e\u0445\u043e\u0434 \u043f\u043e \u043a\u043b\u0430\u0441\u0442\u0435\u0440\u0443\u00bb<\/h3>\n<p>\u041d\u0438\u043a\u0430\u043a\u043e\u0433\u043e Python \u2014 \u0442\u043e\u043b\u044c\u043a\u043e SQL:<\/p>\n<pre><code class=\"sql\">WITH rev AS (     SELECT         c.cluster_id,         SUM(o.order_amount) AS revenue     FROM public.customers c     JOIN public.orders o USING (customer_id)     GROUP BY c.cluster_id ) SELECT     cluster_id,     revenue,     ROUND(revenue * 100.0 \/ SUM(revenue) OVER (), 2) AS revenue_pct FROM rev ORDER BY revenue DESC;<\/code><\/pre>\n<p>\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u0441\u0440\u0430\u0437\u0443 \u0433\u043e\u0442\u043e\u0432 \u043a \u0434\u0430\u0448\u0431\u043e\u0440\u0434\u0443: \u0432\u0438\u0434\u043d\u043e 
\u0434\u043e\u043b\u044e \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430 \u0432 \u043e\u0431\u0449\u0435\u0439 \u0432\u044b\u0440\u0443\u0447\u043a\u0435.<\/p>\n<h3>\u0418\u0442\u043e\u0433<\/h3>\n<p>\u0415\u0441\u043b\u0438 \u0443 \u0432\u0430\u0441 \u0443\u0436\u0435 \u0435\u0441\u0442\u044c \u043e\u043f\u044b\u0442 \u0441\u0435\u0433\u043c\u0435\u043d\u0442\u0430\u0446\u0438\u0438 \u043d\u0430 \u0441\u0442\u043e\u0440\u043e\u043d\u0435 \u0411\u0414 \u2014 \u0434\u0435\u043b\u0438\u0442\u0435\u0441\u044c \u0432 \u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u044f\u0445: \u043a\u0430\u043a \u043c\u0430\u0441\u0448\u0442\u0430\u0431\u0438\u0440\u043e\u0432\u0430\u043b\u0438, \u043a\u0430\u043a\u0438\u0435 \u043f\u0440\u043e\u0431\u043b\u0435\u043c\u044b \u043b\u043e\u0432\u0438\u043b\u0438, \u0433\u0434\u0435 K-Means \u043d\u0435 \u0437\u0430\u0448\u0451\u043b. \u0427\u0435\u043c \u0431\u043e\u043b\u044c\u0448\u0435 \u043f\u0440\u0438\u043c\u0435\u0440\u043e\u0432, \u0442\u0435\u043c \u043f\u043e\u043b\u0435\u0437\u043d\u0435\u0435 \u0441\u0442\u0430\u0442\u044c\u044f \u0434\u043b\u044f \u0432\u0441\u0435\u0445.  
<\/p>\n<p><em>\u041f\u043e\u0433\u0440\u0443\u0437\u0438\u0442\u0435\u0441\u044c \u0432 \u043f\u0440\u043e\u0446\u0435\u0441\u0441 \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u041f\u041e \u0441 \u043d\u0443\u043b\u044f: \u043d\u0430\u0443\u0447\u0438\u0442\u0435\u0441\u044c \u0443\u0447\u0438\u0442\u044b\u0432\u0430\u0442\u044c \u0446\u0435\u043b\u0438 \u0431\u0438\u0437\u043d\u0435\u0441\u0430 \u0438 \u0444\u043e\u0440\u043c\u0443\u043b\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0442\u0435\u0445\u043d\u0438\u0447\u0435\u0441\u043a\u0438\u0435 \u0442\u0440\u0435\u0431\u043e\u0432\u0430\u043d\u0438\u044f \u043a \u043f\u0440\u043e\u0434\u0443\u043a\u0442\u0443 <\/em><a href=\"https:\/\/otus.pw\/r6W4\/\" rel=\"noopener noreferrer nofollow\"><em>\u043d\u0430 \u0431\u0430\u0437\u043e\u0432\u043e\u043c \u043a\u0443\u0440\u0441\u0435 \u00ab\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0439 \u0430\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u00bb.<\/em><\/a><\/p>\n<p><em>\u0427\u0442\u043e\u0431\u044b \u043e\u0441\u0442\u0430\u0432\u0430\u0442\u044c\u0441\u044f \u0432 \u043a\u0443\u0440\u0441\u0435 \u0430\u043a\u0442\u0443\u0430\u043b\u044c\u043d\u044b\u0445 \u0442\u0435\u0445\u043d\u043e\u043b\u043e\u0433\u0438\u0439 \u0438 \u0442\u0440\u0435\u043d\u0434\u043e\u0432, \u043f\u043e\u0434\u043f\u0438\u0441\u044b\u0432\u0430\u0439\u0442\u0435\u0441\u044c \u043d\u0430 <\/em><a href=\"https:\/\/t.me\/+dxHyB6AgI99kNDUy\" rel=\"noopener noreferrer nofollow\"><em>Telegram-\u043a\u0430\u043d\u0430\u043b OTUS.<\/em><\/a><\/p>\n<\/div>\n<\/div>\n<\/div>\n<p><!----><!----><\/div>\n<p><!----><!----><br \/> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/articles\/930506\/\"> https:\/\/habr.com\/ru\/articles\/930506\/<\/a><br 
\/><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-473305","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/473305","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=473305"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/473305\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=473305"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=473305"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=473305"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}