{"id":329918,"date":"2022-02-21T09:01:25","date_gmt":"2022-02-21T09:01:25","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=329918"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=329918","title":{"rendered":"<span>\u0423\u0447\u0438\u043c \u0433\u0438\u043f\u0435\u0440\u043c\u043e\u0434\u0430\u043b\u044c\u043d\u044b\u0439 \u0442\u0440\u0430\u043d\u0441\u0444\u043e\u0440\u043c\u0435\u0440 \u043f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u044b\u0432\u0430\u0442\u044c \u043a\u0430\u043b\u043e\u0440\u0438\u0439\u043d\u043e\u0441\u0442\u044c \u0431\u043e\u0440\u0449\u0430<\/span>"},"content":{"rendered":"<div><\/div>\n<div id=\"post-content-body\">\n<div>\n<div class=\"article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/215\/576\/862\/2155768622ab20489e96eb1aca345d96.png\" width=\"1556\" height=\"922\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/215\/576\/862\/2155768622ab20489e96eb1aca345d96.png\"\/><figcaption><\/figcaption><\/figure>\n<details class=\"spoiler\">\n<summary>\u041f\u043e\u043f\u0440\u043e\u0431\u043e\u0432\u0430\u0442\u044c \u043c\u043e\u0436\u043d\u043e \u0442\u0443\u0442<\/summary>\n<div class=\"spoiler__content\">\n<p>\u0422\u044b\u043a\u0430\u0442\u044c<a href=\"https:\/\/huggingface.co\/spaces\/AlexWortega\/food_calories\" rel=\"noopener noreferrer nofollow\"> \u0442\u0443\u0442<\/a><\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/75a\/e94\/89a\/75ae9489a6f0f8ce93f061cedd55c72e.png\" width=\"2650\" height=\"928\" 
data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/75a\/e94\/89a\/75ae9489a6f0f8ce93f061cedd55c72e.png\"\/><figcaption><\/figcaption><\/figure>\n<\/div>\n<\/details>\n<p>\u041f\u0440\u0438\u0432\u0435\u0442 \u0425\u0430\u0431\u0440, \u0441\u0435\u0433\u043e\u0434\u043d\u044f \u043c\u044b \u043f\u043e\u0433\u043e\u0432\u043e\u0440\u0438\u043c \u043e \u0442\u0430\u043a\u043e\u0439 \u0432\u0430\u0436\u043d\u043e\u0439 \u0442\u0435\u043c\u0435 \u043a\u0430\u043a \u043c\u0443\u043b\u044c\u0442\u0438\u043c\u043e\u0434\u0430\u043b\u044c\u043d\u044b\u0435 \u0442\u0440\u0430\u043d\u0441\u0444\u043e\u0440\u043c\u0435\u0440\u044b.<\/p>\n<p>\u0427\u0442\u043e \u0436\u0435 \u044d\u0442\u043e \u0442\u0430\u043a\u043e\u0435 \u0432 \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0435 \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f &#8212; \u044d\u0442\u043e \u0441\u043f\u043e\u0441\u043e\u0431\u043d\u043e\u0441\u0442\u044c \u043e\u0434\u043d\u043e\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u0441\u0440\u0430\u0437\u0443 \u0441 \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u0438\u043c\u0438 \u0432\u0438\u0434\u0430\u043c\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 &#8212; \u0442\u0435\u043a\u0441\u0442\u043e\u043c,  \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u0430\u043c\u0438, \u0437\u0432\u0443\u043a\u043e\u043c, \u0432\u044b\u0442\u0430\u0441\u043a\u0438\u0432\u0430\u0442\u044c \u0438\u0437 \u043d\u0438\u0445 \u0444\u0438\u0447\u0438 \u0432 \u0435\u0434\u0438\u043d\u043e\u0435 \u0432\u0435\u043a\u0442\u043e\u0440\u043d\u043e\u0435 \u043f\u0440\u043e\u0441\u0442\u0440\u0430\u043d\u0441\u0442\u0432\u043e \u0438 \u043c\u0430\u043d\u0438\u043f\u0443\u043b\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u043a\u043e\u043d\u0442\u0435\u043d\u0442\u043e\u043c \u043d\u0430 \u0432\u0445\u043e\u0434\u0435 \u0438 \u0432\u044b\u0445\u043e\u0434\u0435. 
\u042d\u0442\u0430 \u0438\u0434\u0435\u044f \u043f\u043e\u044f\u0432\u0438\u043b\u0430\u0441\u044c \u0435\u0449\u0435 \u043d\u0430 \u0437\u0430\u0440\u0435 \u0442\u0440\u0430\u043d\u0441\u0444\u043e\u0440\u043c\u0435\u0440\u043e\u0432 \u0432 <a href=\"https:\/\/arxiv.org\/pdf\/1706.05137.pdf\" rel=\"noopener noreferrer nofollow\">\u0441\u0442\u0430\u0442\u044c\u0435<\/a>  <a href=\"https:\/\/arxiv.org\/pdf\/1706.05137.pdf\" rel=\"noopener noreferrer nofollow\">One Model To Learn Them All<\/a><\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/b46\/aa4\/0e1\/b46aa40e1a9485cb2049891b4ec66b08.png\" width=\"1280\" height=\"685\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/b46\/aa4\/0e1\/b46aa40e1a9485cb2049891b4ec66b08.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u0410\u0432\u0442\u043e\u0440\u044b \u0438\u0437 \u0433\u0443\u0433\u043b \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0438 \u0435\u0434\u0438\u043d\u044b\u0439 \u0431\u0430\u0439\u0442\u043e\u0432\u044b\u0439 \u044d\u043d\u043a\u043e\u0434\u0435\u0440 \u0434\u043b\u044f \u043b\u044e\u0431\u044b\u0445 \u0432\u0445\u043e\u0434\u043d\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445, MoE \u0434\u043b\u044f \u0443\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u0438\u044f \u043c\u043e\u0434\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u044f\u043c\u0438 \u0438 \u0435\u0434\u0438\u043d\u044b\u0439 \u0434\u0435\u043a\u043e\u0434\u0435\u0440 \u0434\u043b\u044f \u0432\u0441\u0435\u0445 \u0437\u0430\u0434\u0430\u0447. 
<\/p>\n<p>\u041d\u043e \u0432\u0435\u0440\u043d\u0435\u043c\u0441\u044f \u0432 \u043d\u0430\u0448\u0435 \u0432\u0440\u0435\u043c\u044f \u043a <a href=\"https:\/\/github.com\/sberbank-ai\/fusion_brain_aij2021\" rel=\"noopener noreferrer nofollow\">\u0440\u0435\u0441\u0435\u0440\u0447\u0443<\/a> \u043f\u0440\u043e\u0432\u043e\u0434\u0438\u043c\u043e\u043c\u0443 \u0432 \u0440\u0430\u043c\u043a\u0430\u0445 fusion brain challenge.<\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/9ce\/111\/c10\/9ce111c10a4177eeea3b93892040db8b.png\" width=\"2152\" height=\"1074\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/9ce\/111\/c10\/9ce111c10a4177eeea3b93892040db8b.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u0422\u0443\u0442 \u0430\u0432\u0442\u043e\u0440\u044b \u0438\u0441\u0445\u043e\u0434\u044f\u0442 \u0438\u0437 \u043a\u043e\u043d\u0446\u0435\u043f\u0446\u0438\u0438 &#8212; \u0431\u043e\u043b\u044c\u0448\u0438\u0435 \u044f\u0437\u044b\u043a\u043e\u0432\u044b\u0435 \u043c\u043e\u0434\u0435\u043b\u0438 \u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u0443\u043c\u043d\u044b \u0447\u0442\u043e\u0431\u044b \u0440\u0435\u0448\u0430\u0442\u044c \u043b\u044e\u0431\u044b\u0435 \u0437\u0430\u0434\u0430\u0447\u0438, \u043f\u0440\u0438 \u0443\u0441\u043b\u043e\u0432\u0438\u0438 \u043f\u0440\u0430\u0432\u0438\u043b\u044c\u043d\u043e\u0433\u043e \u0444\u043e\u0440\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f \u0432\u0445\u043e\u0434\u043d\u043e\u0439 \u043f\u043e\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u0438 \u043c\u043e\u0434\u0435\u043b\u0438 &#8212; \u0432\u043d\u0443\u0442\u0440\u0435\u043d\u043d\u044f\u044f \u043c\u043e\u0434\u0435\u043b\u044c (Decoder only(GPT\/Dalle) \/ Full transformer (T5\/Bart) ) \u0437\u0430\u043c\u043e\u0440\u0430\u0436\u0438\u0432\u0430\u0435\u0442\u0441\u044f,  
\u0443\u0447\u0438\u0442\u0441\u044f \u0442\u043e\u043b\u044c\u043a\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u044b\u0439 \u0441\u043b\u043e\u0439 \u043d\u0430 \u0432\u0445\u043e\u0434\u0435 \u0438 \u043d\u0430 \u0432\u044b\u0445\u043e\u0434\u0435. \u0412 \u0442\u0430\u043a\u043e\u043c \u0432\u0430\u0440\u0438\u0430\u043d\u0442\u0435 \u043c\u043e\u0434\u0435\u043b\u044c \u043c\u043e\u0436\u0435\u0442 \u0440\u0435\u0448\u0430\u0442\u044c VQA,Image captioning, Code2code, etc \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u043d\u0435 \u0443\u0447\u0430 \u043a\u0430\u0436\u0434\u044b\u0439 \u0440\u0430\u0437 \u0440\u0430\u0437\u043d\u0443\u044e \u043c\u043e\u0434\u0435\u043b\u044c. \u0412 \u0434\u0430\u043d\u043d\u043e\u043c \u0441\u043b\u0443\u0447\u0430\u0435 \u043e\u0434\u043d\u0430 \u043c\u043e\u0434\u0435\u043b\u044c \u0440\u0435\u0448\u0430\u0435\u0442 \u0432\u0441\u0435 \u0437\u0430\u0434\u0430\u0447\u0438, \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u0434\u043e\u0432\u043e\u043b\u044c\u043d\u043e \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e \u0445\u043e\u0442\u044f \u0438 \u043d\u0435 \u0431\u044c\u0435\u0442 SOTA \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b.<\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/8ca\/db4\/ffe\/8cadb4ffe6115c736656fcc1c1680277.png\" alt=\" \" title=\" \" width=\"934\" height=\"320\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/8ca\/db4\/ffe\/8cadb4ffe6115c736656fcc1c1680277.png\"\/><figcaption> <\/figcaption><\/figure>\n<p><a href=\"https:\/\/arxiv.org\/pdf\/2111.10974.pdf\" rel=\"noopener noreferrer nofollow\">\u041f\u0440\u0435\u043f\u0440\u0438\u043d\u0442 \u0441\u0442\u0430\u0442\u044c\u0438<\/a> <a href=\"https:\/\/github.com\/sberbank-ai\/fusion_brain_aij2021\" rel=\"noopener noreferrer nofollow\">Github<\/a><\/p>\n<h3>\u0427\u0442\u043e \u0442\u043e \u044f 
\u0437\u0430\u0433\u043e\u0432\u043e\u0440\u0438\u043b\u0441\u044f \u043f\u0440\u043e \u0431\u043b\u0438\u0437\u043a\u0443\u044e \u043c\u043d\u0435 \u0442\u0435\u043c\u0443, \u0430 \u043f\u0440\u043e \u0431\u043e\u0440\u0449\u0438 \u043d\u0438 \u0441\u043b\u043e\u0432\u0430<\/h3>\n<p>\u0412 \u043d\u043e\u044f\u0431\u0440\u0435 \u044d\u0442\u043e\u0433\u043e \u0433\u043e\u0434\u0430 \u0421\u0431\u0435\u0440 \u0432\u044b\u043b\u043e\u0436\u0438\u043b \u0441\u0443\u043f\u0435\u0440 \u0432\u0430\u0436\u043d\u0443\u044e \u043c\u043e\u0434\u0435\u043b\u044c &#8212; ruDall-e. \u0414\u0430\u043b\u0435\u043a\u0438\u0439 \u043e\u0442 \u0440\u0435\u0441\u0435\u0440\u0447\u0430 \u0447\u0435\u043b\u043e\u0432\u0435\u043a \u0441\u0435\u0439\u0447\u0430\u0441 \u043c\u043e\u0436\u0435\u0442 \u0437\u0430\u0434\u0430\u0442\u044c\u0441\u044f \u043b\u043e\u0433\u0438\u0447\u043d\u044b\u043c \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u043c &#8212; \u0430 \u0432 \u0447\u0435\u043c \u0446\u0435\u043d\u043d\u043e\u0441\u0442\u044c? \u041f\u043e\u0447\u0435\u043c\u0443 \u0441\u043c\u0435\u0448\u043d\u044b\u0435 \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u0438 \u043a\u043e\u0442\u043e\u0432 \u0442\u0430\u043a \u0432\u0430\u0436\u043d\u044b? 
<\/p>\n<p>\u041e\u0442\u0432\u0435\u0442 \u043f\u0440\u043e\u0441\u0442 &#8212; \u044d\u0442\u043e \u0432\u0430\u0436\u043d\u044b\u0439 \u0448\u0430\u0433 \u043a \u043f\u043e\u0441\u0442\u0440\u043e\u0435\u043d\u0438\u044e \u0431\u043e\u043b\u0435\u0435 \u0441\u043b\u043e\u0436\u043d\u044b\u0445 \u043c\u043e\u0434\u0435\u043b\u0435\u0439, \u043f\u043e \u0441\u0443\u0442\u0438, Dalle-like \u0432\u044b\u043f\u043e\u043b\u043d\u044f\u0435\u0442 \u0440\u043e\u043b\u044c \u043f\u0435\u0440\u0435\u0432\u043e\u0434\u0447\u0438\u043a\u0430 \u043c\u0435\u0436\u0434\u0443 \u0442\u0435\u043a\u0441\u0442\u043e\u0432\u043e\u0439 \u0438 \u043a\u0430\u0440\u0442\u0438\u043d\u043e\u0447\u043d\u043e\u0439 \u043c\u043e\u0434\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u044c\u044e. \u041d\u0435 \u0437\u043d\u0430\u044f \u043d\u0430\u043f\u0440\u044f\u043c\u0443\u044e \u0444\u0430\u043a\u0442\u043e\u0432 \u0438\u0437 \u0440\u0435\u0430\u043b\u044c\u043d\u043e\u0433\u043e \u043c\u0438\u0440\u0430, \u043c\u043e\u0434\u0435\u043b\u044c \u0442\u0435\u043c \u043d\u0435 \u043c\u0435\u043d\u0435\u0435 \u0438\u0445 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442. 
<\/p>\n<p>\u041d\u043e, \u043a\u0430\u043a \u0432\u044b \u043f\u043e\u043d\u044f\u043b\u0438, \u0440\u0435\u0447\u044c \u043f\u043e\u0439\u0434\u0435\u0442 \u043e \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0421\u0431\u0435\u0440\u0430: Rudolph &#8212; One Hyper-Modal transformer can be creative as DALL-E and smart as CLIP<\/p>\n<p>\u0422\u0443\u0442 \u0438\u0434\u0435\u044f \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e\u0431\u044b \u043d\u0430 \u0432\u0445\u043e\u0434 \u0432 \u0434\u0435\u043a\u043e\u0434\u0435\u0440 \u043f\u043e\u0434\u0430\u0432\u0430\u0442\u044c \u043d\u0435 \u043f\u0440\u043e\u0441\u0442\u043e \u0422\u0435\u043a\u0441\u0442-\u041a\u0430\u0440\u0442\u0438\u043d\u043a\u0430 \u043a\u0430\u043a \u0432 DALL-E\/Nuwa\/CogView, \u0430 \u0442\u0440\u0438\u043f\u043b\u0435\u0442 \u0432\u0438\u0434\u0430 l-Text-Image-r-Text<\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/44d\/139\/f6c\/44d139f6cd4754f1cd21025cfcf7ea7e.png\" width=\"1051\" height=\"817\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/44d\/139\/f6c\/44d139f6cd4754f1cd21025cfcf7ea7e.png\"\/><figcaption><\/figcaption><\/figure>\n<p> \u043f\u0440\u0438 \u044d\u0442\u043e\u043c, \u043c\u043e\u0434\u0435\u043b\u044c \u0443\u0447\u0438\u0442\u0441\u044f \u0441 \u0440\u0430\u0437\u043d\u044b\u043c\u0438 \u043c\u0430\u0441\u043a\u0430\u043c\u0438 \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u044f, \u043a\u0430\u0436\u0434\u0430\u044f \u0438\u0437 \u043a\u043e\u0442\u043e\u0440\u044b\u0445 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442\u0441\u044f \u0434\u043b\u044f \u0440\u0435\u0448\u0435\u043d\u0438\u044f \u0440\u0430\u0437\u043d\u044b\u0445 \u0437\u0430\u0434\u0430\u0447. 
\u0422.\u0435. \u043e\u0434\u043d\u0430 \u043c\u043e\u0434\u0435\u043b\u044c \u043c\u043e\u0436\u0435\u0442 \u0440\u0435\u0448\u0430\u0442\u044c \u0438 \u0437\u0430\u0434\u0430\u0447\u0443 \u043f\u0440\u043e\u0434\u043e\u043b\u0436\u0435\u043d\u0438\u044f \u0442\u0435\u043a\u0441\u0442\u0430 \u043a\u0430\u043a GPT, \u0437\u0430\u0434\u0430\u0447\u0443 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u0439 \u0438 \u0437\u0430\u0434\u0430\u0447\u0443 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u043f\u043e\u0434\u043f\u0438\u0441\u0438 \u043a \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f\u043c. <a href=\"https:\/\/github.com\/sberbank-ai\/ru-dolph\" rel=\"noopener noreferrer nofollow\">\u0420\u0435\u043f\u043e\u0437\u0438\u0442\u043e\u0440\u0438\u0439<\/a> <\/p>\n<h3>\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043f\u0435\u0440\u0435\u0439\u0434\u0435\u043c \u043a \u0442\u0435\u043c\u0435 \u0441\u0442\u0430\u0442\u044c\u0438 \u0438 \u043e\u0431\u0443\u0447\u0438\u043c \u043c\u043e\u0434\u0435\u043b\u044c<\/h3>\n<p><a href=\"https:\/\/colab.research.google.com\/drive\/12YRRzhl5cHER_U2F-buQxif8GlhMPWq3?usp=sharing\" rel=\"noopener noreferrer nofollow\">\u041a\u043e\u043b\u043b\u0430\u0431 \u043d\u043e\u0443\u0442\u0431\u0443\u043a \u0434\u043b\u044f \u0441\u0430\u043c\u044b\u0445 \u043d\u0435\u0442\u0435\u0440\u043f\u0435\u043b\u0438\u0432\u044b\u0445<\/a><\/p>\n<p>\u041f\u0440\u043e\u043f\u0443\u0441\u0442\u0438\u043c \u043a\u0440\u0430\u0439\u043d\u0435 \u043d\u0435\u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u044b\u0439 \u0441\u0431\u043e\u0440 \u043a\u0430\u0440\u0442\u0438\u043d\u043e\u043a \u0441 \u043f\u043e\u0434\u043f\u0438\u0441\u044f\u043c\u0438, \u0443 \u043c\u0435\u043d\u044f \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u0441\u044f \u0442\u0430\u043a\u043e\u0439 \u0441\u0435\u0442 \u043d\u0430 500 \u043f\u0440\u0438\u043c\u0435\u0440\u043e\u0432 \u0432 
\u0442\u0430\u043a\u043e\u043c \u0444\u043e\u0440\u043c\u0430\u0442\u0435<\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/9ec\/1b1\/e78\/9ec1b1e78bd458581c34246110b294dc.png\" width=\"1272\" height=\"580\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/9ec\/1b1\/e78\/9ec1b1e78bd458581c34246110b294dc.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u0418 \u0432\u043e\u0442 \u0442\u0430\u043a\u043e\u0435 \u043e\u0431\u043b\u0430\u043a\u043e \u0441\u043b\u043e\u0432 \u0434\u043b\u044f \u043d\u0430\u0448\u0435\u0433\u043e \u0441\u0435\u0442\u0430<\/p>\n<figure class=\"\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/79e\/9fd\/91d\/79e9fd91db7cda9319dfccfa510b13c8.png\" width=\"404\" height=\"209\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/79e\/9fd\/91d\/79e9fd91db7cda9319dfccfa510b13c8.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u0423\u0441\u0442\u0430\u043d\u043e\u0432\u0438\u043c \u0432\u0441\u0435 \u0447\u0442\u043e \u043d\u0430\u043c \u043f\u043e\u043d\u0430\u0434\u043e\u0431\u0438\u0442\u0441\u044f, \u0441\u043a\u0430\u0447\u0430\u0435\u043c \u0438 \u0440\u0430\u0437\u0436\u0438\u043c\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435<\/p>\n<pre><code>!pip install rudolph==0.0.1rc4 > \/dev\/null !pip install bitsandbytes-cuda111 > \/dev\/null !pip install wandb > \/dev\/null !gdown https:\/\/drive.google.com\/uc?id=17bPt7G3N_vGKCCxppIOPbPlhv1qUnv0o !unzip -qn food.zip > \/dev\/null<\/code><\/pre>\n<p>\u0413\u0443\u0433\u043b \u0434\u0438\u0441\u043a \u043a\u0430\u0436\u0435\u0442\u0441\u044f \u0438\u0437\u043c\u0435\u043d\u0438\u043b\u0438 \u043f\u043e\u043b\u0438\u0442\u0438\u043a\u0443 \u0440\u0430\u0441\u043f\u0440\u043e\u0441\u0442\u0440\u0430\u043d\u0435\u043d\u0438\u044f \u0444\u0430\u0439\u043b\u043e\u0432, 
\u0441\u043a\u0430\u0447\u0430\u0439\u0442\u0435 \u0440\u0443\u043a\u0430\u043c\u0438 \u0444\u0430\u0439\u043b \u0441 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u043e\u043c \u0438 \u043f\u043e\u0433\u0440\u0443\u0437\u0438\u0442\u0435 \u0432 \u0441\u0440\u0435\u0434\u0443 \u0432\u044b\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f<\/p>\n<p>\u0418\u043c\u043f\u043e\u0440\u0442\u0438\u0440\u0443\u0435\u043c \u0432\u0441\u0435 \u0447\u0442\u043e \u043d\u0443\u0436\u043d\u043e<\/p>\n<pre><code>import os import sys import random from collections import Counter  import PIL import torch import numpy as np import pandas as pd import bitsandbytes as bnb import torchvision.transforms as T import torchvision.transforms.functional as TF from tqdm import tqdm from wordcloud import WordCloud from matplotlib import pyplot as plt from torch.utils.data import Dataset, DataLoader from rudalle import get_tokenizer, get_vae from rudalle.utils import seed_everything  from rudolph.model.utils import get_attention_mask from rudolph.model import get_rudolph_model, ruDolphModel, FP16Module from rudolph.pipelines import generate_codebooks, self_reranking_by_image, self_reranking_by_text, show, generate_captions, generate_texts, zs_clf from rudolph import utils  device = 'cuda'  model = get_rudolph_model('350M',  fp16=True, device='cuda') tokenizer = get_tokenizer() vae = get_vae(dwt=False).to(device)<\/code><\/pre>\n<p>\u041e\u043f\u0438\u0448\u0435\u043c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b \u0434\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f, \u043a\u0443\u0434\u0430 \u0431\u0443\u0434\u0435\u043c \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0442\u044c \u043c\u043e\u0434\u0435\u043b\u044c \u0438 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b \u043e\u043f\u0442\u0438\u043c\u0430\u0439\u0437\u0435\u0440\u0430<\/p>\n<pre><code>class Args():     def __init__(self, model):         self.device = model.get_param('device')     
    self.l_text_seq_length = model.get_param('l_text_seq_length')         self.r_text_seq_length = model.get_param('r_text_seq_length')         self.image_tokens_per_dim = model.get_param('image_tokens_per_dim')         self.image_seq_length = model.get_param('image_seq_length')         self.epochs = 5         self.save_path='checkpoints\/'         self.model_name = 'awesomemodel_'         self.save_every = 500         self.bs = 10         self.clip = 1.0         self.lr = 2e-5         self.wandb = False         self.lt_loss_weight = 0.01         self.img_loss_weight = 1         self.rt_loss_weight = 7         self.image_size = self.image_tokens_per_dim * 8  args = Args(model) if not os.path.exists(args.save_path):     os.makedirs(args.save_path)<\/code><\/pre>\n<p>\u0421\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 \u043a\u043b\u0430\u0441\u0441 Dataset \u0434\u043b\u044f image2text text2image \u0437\u0430\u0434\u0430\u0447<\/p>\n<pre><code>class FoodDataset(Dataset):     def __init__(self, file_path, csv_path, tokenizer, shuffle=True):         self.tokenizer = tokenizer         self.samples = []         self.image_transform = T.Compose([             T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),             T.RandomResizedCrop(args.image_size, scale=(1., 1.), ratio=(1., 1.)),             T.ToTensor()         ])          df = pd.read_csv(csv_path)         df.columns = ['index', 'belok', 'fats', 'uglevod', 'kkal', 'name', 'path']          for belok, fats, uglevod, kkal, caption, f_path in zip(             df['belok'],df['fats'], df['uglevod'], df['kkal'], df['name'], df['path']         ):             caption = f'\u0431\u043b\u044e\u0434\u043e: {caption}; \u0431\u0435\u043b\u043a\u043e\u0432: {belok}; \u0436\u0438\u0440\u043e\u0432: {fats}; \u0443\u0433\u043b\u0435\u0432\u043e\u0434\u043e\u0432: {uglevod}; \u043a\u043a\u0430\u043b: {kkal};'             if len(caption)>10 and len(caption)&lt;100 and 
os.path.isfile(f'{file_path}\/{f_path}'):                 self.samples.append([file_path, f_path, caption.lower()])         if shuffle:             np.random.shuffle(self.samples)             print('Shuffled')      def __len__(self):         return len(self.samples)      def load_image(self, file_path, img_name):         return PIL.Image.open(f'{file_path}\/{img_name}')      def __getitem__(self, item):         item = item % len(self.samples)         file_path, img_name, text = self.samples[item]          try:             image = self.load_image(file_path, img_name)             image = self.image_transform(image)         except Exception as err:               print(err)             random_item = random.randint(0, len(self.samples) - 1)             return self.__getitem__(random_item)                  text = text.lower().strip()         encoded = self.tokenizer.encode_text(text, text_seq_length=args.r_text_seq_length)                return encoded, image<\/code><\/pre>\n<p>\u041e\u0431\u0440\u0430\u0442\u0438\u0442\u0435 \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u0435 \u0447\u0442\u043e \u0440\u0430\u0437\u043c\u0435\u0440 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f 128*128, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u043e \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f \u0431\u0443\u0434\u0435\u0442 \u043d\u0435 \u043e\u0447\u0435\u043d\u044c \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u044b\u043c, \u0434\u043b\u044f \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u0439 \u043b\u0443\u0447\u0448\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c ruDall-e, \u0442\u0430\u043c \u0438 \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432 \u0432 5 \u0440\u0430\u0437 \u0431\u043e\u043b\u044c\u0448\u0435 \u0438 
\u0440\u0430\u0437\u043c\u0435\u0440 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f 256.<\/p>\n<p>\u041e\u043f\u0438\u0448\u0435\u043c \u043a\u043b\u0430\u0441\u0441 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430 \u0438 \u0437\u0430\u0433\u0440\u0443\u0437\u0438\u043c \u0432 DataLoader <\/p>\n<pre><code>dataset = FoodDataset(file_path='\/content\/food' ,csv_path ='\/content\/food\/food.csv',tokenizer=tokenizer) train_dataloader = DataLoader(dataset, batch_size=args.bs, shuffle=True, drop_last=True)<\/code><\/pre>\n<p>\u0412\u044b\u0441\u0442\u0430\u0432\u0438\u043c \u043b\u043e\u0433\u0438 \u043d\u0430 Wandb \u0435\u0441\u043b\u0438 \u0432\u044b \u0437\u0430\u043b\u043e\u0433\u0438\u043d\u0438\u043b\u0438\u0441\u044c<\/p>\n<pre><code>try:     if args.wandb:         import wandb         wandb.init(project = args.model_name) except:     args.wandb = False     print('If you want to use wandb logs pls login via wandb -login')<\/code><\/pre>\n<p>\u0417\u0430\u043c\u043e\u0440\u043e\u0437\u0438\u043c \u0447\u0430\u0441\u0442\u044c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432 \u043c\u043e\u0434\u0435\u043b\u0438 \u0434\u043b\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0438 \u043f\u0430\u043c\u044f\u0442\u0438 \u0438 \u0432\u043e\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u0441\u044f 8-\u0431\u0438\u0442\u043d\u044b\u043c \u043e\u043f\u0442\u0438\u043c\u0430\u0439\u0437\u0435\u0440\u043e\u043c \u0434\u043b\u044f \u0431\u043e\u043b\u0435\u0435 \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e\u0433\u043e \u0444\u0430\u0439\u043d\u0442\u044e\u043d\u0430<\/p>\n<pre><code>def freeze(     model,     freeze_emb=False,     freeze_ln=False,     freeze_attn=True,     freeze_ff=True,     freeze_other=False, ):     for name, p in model.module.named_parameters():         name = name.lower()         if 'ln' in name or 'norm' in name:             p.requires_grad = not freeze_ln         elif 'embeddings' in name:           
  p.requires_grad = not freeze_emb         elif 'mlp' in name:             p.requires_grad = not freeze_ff         elif 'attn' in name:             p.requires_grad = not freeze_attn         else:             p.requires_grad = not freeze_other     return model model.train() optimizer = bnb.optim.Adam8bit(model.parameters(), lr=args.lr)  scheduler = torch.optim.lr_scheduler.OneCycleLR(     optimizer, max_lr=args.lr, final_div_factor=500,      steps_per_epoch=len(train_dataloader), epochs=args.epochs  )<\/code><\/pre>\n<p>\u0418 \u0444\u0443\u043d\u043a\u0446\u0438\u044e \u0434\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043c\u043e\u0434\u0435\u043b\u0438<\/p>\n<pre><code>def train(model,args: Args, train_dataloader: FoodDataset):   \"\"\"   args - arguments for training    train_dataloader - RuDalleDataset class with text - image pair in batch   \"\"\"    loss_logs = []   try:     progress = tqdm(total=len(train_dataloader)*args.epochs, desc='finetuning goes brrr??\u2603\ufe0f')          save_counter = 0      for epoch in range(args.epochs):              for text, images in train_dataloader:                  save_counter+=1          model.zero_grad()          total_seq_length = args.l_text_seq_length + args.image_seq_length + args.r_text_seq_length                           masks = torch.ones(args.bs, args.r_text_seq_length, dtype=torch.int32)          attention_mask = get_attention_mask(masks, args.bs, args.l_text_seq_length, args.image_tokens_per_dim,                                                     args.r_text_seq_length, device)                  image_input_ids = vae.get_codebook_indices(images.to(device))                  r_text = text.to(device)          l_text = torch.zeros((args.bs, args.l_text_seq_length), device=device, dtype=torch.long)          input_ids = torch.cat((l_text, image_input_ids, r_text), dim=1)           loss, loss_values = model.forward(input_ids, attention_mask, lt_loss_weight=args.lt_loss_weight,         
img_loss_weight=args.img_loss_weight,rt_loss_weight=args.rt_loss_weight,  return_loss=True)          loss.backward()                  torch.nn.utils.clip_grad_norm_(model.parameters(),args.clip)         optimizer.step()         scheduler.step()         optimizer.zero_grad()         if save_counter % args.save_every == 0:             print(f'Saveing checkpoint here {args.model_name}_dalle_{save_counter}.pt')             plt.plot(loss_logs)             plt.show()             torch.save(                 model.state_dict(),                 os.path.join(args.save_path,f\"{args.model_name}_dalle_{save_counter}.pt\")             )          if args.wandb:             wandb.log({\"loss\":  loss.item()})         loss_logs+=[loss.item()]         progress.update()         progress.set_postfix({\"loss\": loss.item()})          print(f'Complitly tuned and saved here  {args.model_name}__dalle_last.pt')     plt.plot(loss_logs)     plt.show()     torch.save(         model.state_dict(),         os.path.join(args.save_path,f\"{args.model_name}dalle_last.pt\")     )      except KeyboardInterrupt:               print(f'What for did you stopped? 
Please change model_path to \/{args.save_path}\/{args.model_name}_rudolf_Failed_train')     plt.plot(loss_logs)     plt.show()          torch.save(                 model.state_dict(),                 os.path.join(args.save_path,f\"{args.model_name}_rudolf_Failed_train.pt\")                 )   except Exception as err:     print(f'Failed with {err}') model = freeze(     model=model,     freeze_emb=False,     freeze_ln=False,     freeze_attn=True,     freeze_ff=True,     freeze_other=False, )  train(model, args, train_dataloader) <\/code><\/pre>\n<p>\u0417\u0430\u043f\u0443\u0441\u0442\u0438\u043c \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 <\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/a7d\/4f1\/1a6\/a7d4f11a6fc5a206cc34f0e66795a9e4.png\" width=\"1092\" height=\"548\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/a7d\/4f1\/1a6\/a7d4f11a6fc5a206cc34f0e66795a9e4.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u0417\u0430\u043f\u0443\u0441\u0442\u0438\u043c \u0438\u043d\u0444\u0435\u0440\u0435\u043d\u0441 \u0438 \u043f\u0440\u043e\u0432\u0435\u0440\u0438\u043c \u0447\u0442\u043e \u0432\u0441\u0435 \u0440\u0430\u0431\u043e\u0442\u0430\u0435\u0442 \u0445\u043e\u0440\u043e\u0448\u043e <\/p>\n<pre><code>template = '\u0431\u043b\u044e\u0434\u043e:'  import requests from PIL import Image import torch   img_by_url = 'https:\/\/kulinarenok.ru\/img\/steps\/31445\/1-7.jpg' #@param {type:\"string\"}  img_by_url = Image.open(requests.get(img_by_url, stream=True).raw).resize((128, 128)) #@markdown number of images captions_num = 4 #@param{type:'slider'} display(img_by_url)  texts = generate_captions(img_by_url, tokenizer, model, vae, template=template,                            top_k=16, captions_num=captions_num, bs=16, top_p=0.6, seed=43,                            temperature=0.8, limit_eos=False) ppl_text, ppl_image = self_reranking_by_image(texts, 
img_by_url, tokenizer, model, vae, bs=16, seed=42) for idx in ppl_image.argsort()[:8]:     print(texts[idx])<\/code><\/pre>\n<p>\u041f\u0440\u043e\u0432\u0435\u0440\u0438\u043c \u043d\u0430 \u0448\u0430\u0443\u0440\u043c\u0435 \u0441\u043f\u043e\u0441\u043e\u0431\u043d\u043e\u0441\u0442\u0438 \u043c\u043e\u0434\u0435\u043b\u0438 \u043a ZeroShot(\u0441\u043f\u043e\u0441\u043e\u0431\u043d\u043e\u0441\u0442\u0438 \u043c\u043e\u0434\u0435\u043b\u0438 \u043f\u0440\u0435\u0434\u0441\u043a\u0430\u0437\u044b\u0432\u0430\u0442\u044c \u0434\u0430\u043d\u043d\u044b\u0435 \u043a\u043e\u0442\u043e\u0440\u044b\u0445 \u043d\u0435 \u0431\u044b\u043b\u043e \u0432 \u043e\u0431\u0443\u0447\u0430\u044e\u0449\u0435\u0439 \u0432\u044b\u0431\u043e\u0440\u043a\u0435)<\/p>\n<figure class=\"full-width\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/habrastorage.org\/r\/w1560\/getpro\/habr\/upload_files\/aa6\/7a2\/8bf\/aa67a28bf7ccae0855ed020bbe081a51.png\" width=\"1532\" height=\"578\" data-src=\"https:\/\/habrastorage.org\/getpro\/habr\/upload_files\/aa6\/7a2\/8bf\/aa67a28bf7ccae0855ed020bbe081a51.png\"\/><figcaption><\/figcaption><\/figure>\n<p>\u041d\u0443 \u0438 \u0432 \u0446\u0435\u043b\u043e\u043c \u043f\u043e\u043b\u0443\u0447\u0430\u0435\u043c \u0432\u0435\u0440\u043d\u044b\u0439 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442, \u0430 \u0438\u043c\u0435\u043d\u043d\u043e \u043b\u0430\u0432\u0430\u0448 \u0441 \u0432\u0435\u0442\u0447\u0438\u043d\u043e\u0439 \u0438 \u0441\u044b\u0440\u043e\u043c.<\/p>\n<h3>\u041f\u043e\u043b\u0435\u0437\u043d\u044b\u0435 \u0441\u0441\u044b\u043b\u043a\u0438<\/h3>\n<p><a href=\"https:\/\/github.com\/sberbank-ai\/ru-dolph\" rel=\"noopener noreferrer nofollow\">Rudolph<\/a><\/p>\n<p><a href=\"https:\/\/github.com\/sberbank-ai\/ru-dalle\" rel=\"noopener noreferrer nofollow\">Rudalle<\/a><\/p>\n<p><a href=\"https:\/\/github.com\/sberbank-ai\/fusion_brain_aij2021\" rel=\"noopener noreferrer nofollow\">FusionBrain<\/a> <a 
href=\"https:\/\/arxiv.org\/pdf\/2111.10974.pdf\" rel=\"noopener noreferrer nofollow\">arxiv preprint<\/a><\/p>\n<h3>\u0411\u043b\u0430\u0433\u043e\u0434\u0430\u0440\u043d\u043e\u0441\u0442\u0438 \u0438 \u0441\u0441\u044b\u043b\u043a\u0438<\/h3>\n<p>\u041e\u0442\u0434\u0435\u043b\u044c\u043d\u043e\u0435 \u0441\u043f\u0430\u0441\u0438\u0431\u043e <a href=\"https:\/\/t.me\/mishin_learning\" rel=\"noopener noreferrer nofollow\">\u041c\u0438\u0448\u0438\u043d \u041b\u0435\u0440\u043d\u0438\u043d\u0433<\/a> \u0438 \u0414\u0435\u043d\u0438\u0441\u0443 \u0414\u0438\u043c\u0438\u0442\u0440\u043e\u0432\u0443 \u0437\u0430 \u0440\u0435\u0434\u0430\u043a\u0442\u0443\u0440\u0443 \u0438 \u043c\u043e\u0442\u0438\u0432\u0430\u0446\u0438\u044e \u0434\u043b\u044f \u043d\u0430\u043f\u0438\u0441\u0430\u043d\u0438\u044f \u0441\u0442\u0430\u0442\u044c\u0438.<\/p>\n<p><a href=\"https:\/\/t.me\/AIAbuZZer\" rel=\"noopener noreferrer nofollow\">\u0422\u0435\u043b\u0435\u0433\u0440\u0430\u043c \u0441 \u043a\u0440\u0443\u0442\u044b\u043c\u0438 \u043f\u0440\u0438\u043c\u0435\u0440\u0430\u043c\u0438 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 Rudall-e \u0438 Rudolph<\/a> <\/p>\n<p><a href=\"https:\/\/t.me\/lovedeathtransformers\" rel=\"noopener noreferrer nofollow\">\u041c\u043e\u0439 \u0442\u0435\u043b\u0435\u0433\u0440\u0430\u043c \u043a\u0430\u043d\u0430\u043b<\/a> <\/p>\n<\/div>\n<\/div>\n<\/div>\n<div class=\"v-portal\" style=\"display:none;\"><\/div>\n<\/div>\n<p> <!----> <!----><br \/> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"https:\/\/habr.com\/ru\/post\/652347\/\"> https:\/\/habr.com\/ru\/post\/652347\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<div><\/div>\n<div id=\"post-content-body\">\n<div>\n<div class=\"article-formatted-body article-formatted-body_version-2\">\n<div xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\">\n<figure 
class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<details class=\"spoiler\">\n<summary>\u041f\u043e\u043f\u0440\u043e\u0431\u043e\u0432\u0430\u0442\u044c \u043c\u043e\u0436\u043d\u043e \u0442\u0443\u0442<\/summary>\n<div class=\"spoiler__content\">\n<p>\u0422\u044b\u043a\u0430\u0442\u044c<a href=\"https:\/\/huggingface.co\/spaces\/AlexWortega\/food_calories\" rel=\"noopener noreferrer nofollow\"> \u0442\u0443\u0442<\/a><\/p>\n<figure class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<\/div>\n<\/details>\n<p>\u041f\u0440\u0438\u0432\u0435\u0442 \u0425\u0430\u0431\u0440, \u0441\u0435\u0433\u043e\u0434\u043d\u044f \u043c\u044b \u043f\u043e\u0433\u043e\u0432\u043e\u0440\u0438\u043c \u043e \u0442\u0430\u043a\u043e\u0439 \u0432\u0430\u0436\u043d\u043e\u0439 \u0442\u0435\u043c\u0435 \u043a\u0430\u043a \u043c\u0443\u043b\u044c\u0442\u0438\u043c\u043e\u0434\u0430\u043b\u044c\u043d\u044b\u0435 \u0442\u0440\u0430\u043d\u0441\u0444\u043e\u0440\u043c\u0435\u0440\u044b.<\/p>\n<p>\u0427\u0442\u043e \u0436\u0435 \u044d\u0442\u043e \u0442\u0430\u043a\u043e\u0435 \u0432 \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0435 \u043c\u0430\u0448\u0438\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f &#8212; \u044d\u0442\u043e \u0441\u043f\u043e\u0441\u043e\u0431\u043d\u043e\u0441\u0442\u044c \u043e\u0434\u043d\u043e\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0440\u0430\u0431\u043e\u0442\u0430\u0442\u044c \u0441\u0440\u0430\u0437\u0443 \u0441 \u043d\u0435\u0441\u043a\u043e\u043b\u044c\u043a\u0438\u043c\u0438 \u0432\u0438\u0434\u0430\u043c\u0438 \u0434\u0430\u043d\u043d\u044b\u0445 &#8212; \u0442\u0435\u043a\u0441\u0442\u043e\u043c,  \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u0430\u043c\u0438, \u0437\u0432\u0443\u043a\u043e\u043c, \u0432\u044b\u0442\u0430\u0441\u043a\u0438\u0432\u0430\u0442\u044c \u0438\u0437 \u043d\u0438\u0445 \u0444\u0438\u0447\u0438 \u0432 \u0435\u0434\u0438\u043d\u043e\u0435 
\u0432\u0435\u043a\u0442\u043e\u0440\u043d\u043e\u0435 \u043f\u0440\u043e\u0441\u0442\u0440\u0430\u043d\u0441\u0442\u0432\u043e \u0438 \u043c\u0430\u043d\u0438\u043f\u0443\u043b\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u043a\u043e\u043d\u0442\u0435\u043d\u0442\u043e\u043c \u043d\u0430 \u0432\u0445\u043e\u0434\u0435 \u0438 \u0432\u044b\u0445\u043e\u0434\u0435. \u042d\u0442\u0430 \u0438\u0434\u0435\u044f \u043f\u043e\u044f\u0432\u0438\u043b\u0430\u0441\u044c \u0435\u0449\u0435 \u043d\u0430 \u0437\u0430\u0440\u0435 \u0442\u0440\u0430\u043d\u0441\u0444\u043e\u0440\u043c\u0435\u0440\u043e\u0432 \u0432 <a href=\"https:\/\/arxiv.org\/pdf\/1706.05137.pdf\" rel=\"noopener noreferrer nofollow\">\u0441\u0442\u0430\u0442\u044c\u0435<\/a>  <a href=\"https:\/\/arxiv.org\/pdf\/1706.05137.pdf\" rel=\"noopener noreferrer nofollow\">One Model To Learn Them All<\/a><\/p>\n<figure class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<p>\u0410\u0432\u0442\u043e\u0440\u044b \u0438\u0437 \u0433\u0443\u0433\u043b \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u043b\u0438 \u0435\u0434\u0438\u043d\u044b\u0439 \u0431\u0430\u0439\u0442\u043e\u0432\u044b\u0439 \u044d\u043d\u043a\u043e\u0434\u0435\u0440 \u0434\u043b\u044f \u043b\u044e\u0431\u044b\u0445 \u0432\u0445\u043e\u0434\u043d\u044b\u0445 \u0434\u0430\u043d\u043d\u044b\u0445, MoE \u0434\u043b\u044f \u0443\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u0438\u044f \u043c\u043e\u0434\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u044f\u043c\u0438 \u0438 \u0435\u0434\u0438\u043d\u044b\u0439 \u0434\u0435\u043a\u043e\u0434\u0435\u0440 \u0434\u043b\u044f \u0432\u0441\u0435\u0445 \u0437\u0430\u0434\u0430\u0447. 
<\/p>\n<p>\u041d\u043e \u0432\u0435\u0440\u043d\u0435\u043c\u0441\u044f \u0432 \u043d\u0430\u0448\u0435 \u0432\u0440\u0435\u043c\u044f \u043a <a href=\"https:\/\/github.com\/sberbank-ai\/fusion_brain_aij2021\" rel=\"noopener noreferrer nofollow\">\u0440\u0435\u0441\u0435\u0440\u0447\u0443<\/a> \u043f\u0440\u043e\u0432\u043e\u0434\u0438\u043c\u043e\u043c\u0443 \u0432 \u0440\u0430\u043c\u043a\u0430\u0445 fusion brain challenge.<\/p>\n<figure class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<p>\u0422\u0443\u0442 \u0430\u0432\u0442\u043e\u0440\u044b \u0438\u0441\u0445\u043e\u0434\u044f\u0442 \u0438\u0437 \u043a\u043e\u043d\u0446\u0435\u043f\u0446\u0438\u0438 &#8212; \u0431\u043e\u043b\u044c\u0448\u0438\u0435 \u044f\u0437\u044b\u043a\u043e\u0432\u044b\u0435 \u043c\u043e\u0434\u0435\u043b\u0438 \u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u0443\u043c\u043d\u044b \u0447\u0442\u043e\u0431\u044b \u0440\u0435\u0448\u0430\u0442\u044c \u043b\u044e\u0431\u044b\u0435 \u0437\u0430\u0434\u0430\u0447\u0438, \u043f\u0440\u0438 \u0443\u0441\u043b\u043e\u0432\u0438\u0438 \u043f\u0440\u0430\u0432\u0438\u043b\u044c\u043d\u043e\u0433\u043e \u0444\u043e\u0440\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f \u0432\u0445\u043e\u0434\u043d\u043e\u0439 \u043f\u043e\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u0438 \u043c\u043e\u0434\u0435\u043b\u0438 &#8212; \u0432\u043d\u0443\u0442\u0440\u0435\u043d\u043d\u044f\u044f \u043c\u043e\u0434\u0435\u043b\u044c (Decoder only(GPT\/Dalle) \/ Full transformer (T5\/Bart) ) \u0437\u0430\u043c\u043e\u0440\u0430\u0436\u0438\u0432\u0430\u0435\u0442\u0441\u044f,  \u0443\u0447\u0438\u0442\u0441\u044f \u0442\u043e\u043b\u044c\u043a\u043e \u043b\u0438\u043d\u0435\u0439\u043d\u044b\u0439 \u0441\u043b\u043e\u0439 \u043d\u0430 \u0432\u0445\u043e\u0434\u0435 \u0438 \u043d\u0430 \u0432\u044b\u0445\u043e\u0434\u0435. 
\u0412 \u0442\u0430\u043a\u043e\u043c \u0432\u0430\u0440\u0438\u0430\u043d\u0442\u0435 \u043c\u043e\u0434\u0435\u043b\u044c \u043c\u043e\u0436\u0435\u0442 \u0440\u0435\u0448\u0430\u0442\u044c VQA, Image captioning, Code2code, etc., \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u043d\u0435 \u0443\u0447\u0430 \u043a\u0430\u0436\u0434\u044b\u0439 \u0440\u0430\u0437 \u0440\u0430\u0437\u043d\u0443\u044e \u043c\u043e\u0434\u0435\u043b\u044c. \u0412 \u0434\u0430\u043d\u043d\u043e\u043c \u0441\u043b\u0443\u0447\u0430\u0435 \u043e\u0434\u043d\u0430 \u043c\u043e\u0434\u0435\u043b\u044c \u0440\u0435\u0448\u0430\u0435\u0442 \u0432\u0441\u0435 \u0437\u0430\u0434\u0430\u0447\u0438, \u043f\u0440\u0438 \u044d\u0442\u043e\u043c \u0434\u043e\u0432\u043e\u043b\u044c\u043d\u043e \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e, \u0445\u043e\u0442\u044f \u0438 \u043d\u0435 \u0431\u044c\u0435\u0442 SOTA \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b.<\/p>\n<figure class=\"full-width\"><figcaption> <\/figcaption><\/figure>\n<p><a href=\"https:\/\/arxiv.org\/pdf\/2111.10974.pdf\" rel=\"noopener noreferrer nofollow\">\u041f\u0440\u0435\u043f\u0440\u0438\u043d\u0442 \u0441\u0442\u0430\u0442\u044c\u0438<\/a> <a href=\"https:\/\/github.com\/sberbank-ai\/fusion_brain_aij2021\" rel=\"noopener noreferrer nofollow\">Github<\/a><\/p>\n<h3>\u0427\u0442\u043e-\u0442\u043e \u044f \u0437\u0430\u0433\u043e\u0432\u043e\u0440\u0438\u043b\u0441\u044f \u043f\u0440\u043e \u0431\u043b\u0438\u0437\u043a\u0443\u044e \u043c\u043d\u0435 \u0442\u0435\u043c\u0443, \u0430 \u043f\u0440\u043e \u0431\u043e\u0440\u0449\u0438 \u043d\u0438 \u0441\u043b\u043e\u0432\u0430<\/h3>\n<p>\u0412 \u043d\u043e\u044f\u0431\u0440\u0435 \u044d\u0442\u043e\u0433\u043e \u0433\u043e\u0434\u0430 \u0421\u0431\u0435\u0440 \u0432\u044b\u043b\u043e\u0436\u0438\u043b \u0441\u0443\u043f\u0435\u0440 \u0432\u0430\u0436\u043d\u0443\u044e \u043c\u043e\u0434\u0435\u043b\u044c &#8212; ruDall-e. 
\u0414\u0430\u043b\u0435\u043a\u0438\u0439 \u043e\u0442 \u0440\u0435\u0441\u0435\u0440\u0447\u0430 \u0447\u0435\u043b\u043e\u0432\u0435\u043a \u0441\u0435\u0439\u0447\u0430\u0441 \u043c\u043e\u0436\u0435\u0442 \u0437\u0430\u0434\u0430\u0442\u044c\u0441\u044f \u043b\u043e\u0433\u0438\u0447\u043d\u044b\u043c \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u043c &#8212; \u0430 \u0432 \u0447\u0435\u043c \u0446\u0435\u043d\u043d\u043e\u0441\u0442\u044c? \u041f\u043e\u0447\u0435\u043c\u0443 \u0441\u043c\u0435\u0448\u043d\u044b\u0435 \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u0438 \u043a\u043e\u0442\u043e\u0432 \u0442\u0430\u043a \u0432\u0430\u0436\u043d\u044b? <\/p>\n<p>\u041e\u0442\u0432\u0435\u0442 \u043f\u0440\u043e\u0441\u0442 &#8212; \u044d\u0442\u043e \u0432\u0430\u0436\u043d\u044b\u0439 \u0448\u0430\u0433 \u043a \u043f\u043e\u0441\u0442\u0440\u043e\u0435\u043d\u0438\u044e \u0431\u043e\u043b\u0435\u0435 \u0441\u043b\u043e\u0436\u043d\u044b\u0445 \u043c\u043e\u0434\u0435\u043b\u0435\u0439, \u043f\u043e \u0441\u0443\u0442\u0438 Dalle-like  \u0432\u044b\u043f\u043e\u043b\u043d\u044f\u0435\u0442 \u0440\u043e\u043b\u044c \u043f\u0435\u0440\u0435\u0432\u043e\u0434\u0447\u0438\u043a\u0430 \u043c\u0435\u0436\u0434\u0443 \u0442\u0435\u043a\u0441\u0442\u043e\u0432\u043e\u0439 \u0438 \u043a\u0430\u0440\u0442\u0438\u043d\u043e\u0447\u043d\u043e\u0439 \u043c\u043e\u0434\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u044c\u044e. \u041d\u0435 \u0437\u043d\u0430\u044f \u043d\u0430\u043f\u0440\u044f\u043c\u0443\u044e \u0444\u0430\u043a\u0442\u043e\u0432 \u0438\u0437 \u0440\u0435\u0430\u043b\u044c\u043d\u043e\u0433\u043e \u043c\u0438\u0440\u0430 \u043c\u043e\u0434\u0435\u043b\u044c \u0442\u0435\u043c \u043d\u0435 \u043c\u0435\u043d\u0435\u0435 \u0438\u0445 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442. 
<\/p>\n<p>\u041d\u043e, \u043a\u0430\u043a \u0432\u044b \u043f\u043e\u043d\u044f\u043b\u0438, \u0440\u0435\u0447\u044c \u043f\u043e\u0439\u0434\u0435\u0442 \u043e \u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0435\u0439 \u043c\u043e\u0434\u0435\u043b\u0438 \u0421\u0431\u0435\u0440\u0430: Rudolph &#8212; One Hyper-Modal transformer can be creative as DALL-E and smart as CLIP<\/p>\n<p>\u0422\u0443\u0442 \u0438\u0434\u0435\u044f \u0432 \u0442\u043e\u043c, \u0447\u0442\u043e\u0431\u044b \u043d\u0430 \u0432\u0445\u043e\u0434 \u0432 \u0434\u0435\u043a\u043e\u0434\u0435\u0440 \u043f\u043e\u0434\u0430\u0432\u0430\u0442\u044c \u043d\u0435 \u043f\u0440\u043e\u0441\u0442\u043e \u0422\u0435\u043a\u0441\u0442-\u041a\u0430\u0440\u0442\u0438\u043d\u043a\u0430, \u043a\u0430\u043a \u0432 DALL-E\/Nuwa\/CogView, \u0430 \u0442\u0440\u0438\u043f\u043b\u0435\u0442 \u0432\u0438\u0434\u0430 l-Text-Image-r-Text<\/p>\n<figure class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<p>\u041f\u0440\u0438 \u044d\u0442\u043e\u043c \u043c\u043e\u0434\u0435\u043b\u044c \u0443\u0447\u0438\u0442\u0441\u044f \u0441 \u0440\u0430\u0437\u043d\u044b\u043c\u0438 \u043c\u0430\u0441\u043a\u0430\u043c\u0438 \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u044f, \u043a\u0430\u0436\u0434\u0430\u044f \u0438\u0437 \u043a\u043e\u0442\u043e\u0440\u044b\u0445 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442\u0441\u044f \u0434\u043b\u044f \u0440\u0435\u0448\u0435\u043d\u0438\u044f \u0440\u0430\u0437\u043d\u044b\u0445 \u0437\u0430\u0434\u0430\u0447. 
\u0422\u043e \u0435\u0441\u0442\u044c \u043e\u0434\u043d\u0430 \u043c\u043e\u0434\u0435\u043b\u044c \u043c\u043e\u0436\u0435\u0442 \u0440\u0435\u0448\u0430\u0442\u044c \u0438 \u0437\u0430\u0434\u0430\u0447\u0443 \u043f\u0440\u043e\u0434\u043e\u043b\u0436\u0435\u043d\u0438\u044f \u0442\u0435\u043a\u0441\u0442\u0430, \u043a\u0430\u043a GPT, \u0437\u0430\u0434\u0430\u0447\u0443 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u0439 \u0438 \u0437\u0430\u0434\u0430\u0447\u0443 \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u043f\u043e\u0434\u043f\u0438\u0441\u0438 \u043a \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f\u043c. <a href=\"https:\/\/github.com\/sberbank-ai\/ru-dolph\" rel=\"noopener noreferrer nofollow\">\u0420\u0435\u043f\u043e\u0437\u0438\u0442\u043e\u0440\u0438\u0439<\/a> <\/p>\n<h3>\u0414\u0430\u0432\u0430\u0439\u0442\u0435 \u043f\u0435\u0440\u0435\u0439\u0434\u0435\u043c \u043a \u0442\u0435\u043c\u0435 \u0441\u0442\u0430\u0442\u044c\u0438 \u0438 \u043e\u0431\u0443\u0447\u0438\u043c \u043c\u043e\u0434\u0435\u043b\u044c<\/h3>\n<p><a href=\"https:\/\/colab.research.google.com\/drive\/12YRRzhl5cHER_U2F-buQxif8GlhMPWq3?usp=sharing\" rel=\"noopener noreferrer nofollow\">\u041a\u043e\u043b\u043b\u0430\u0431 \u043d\u043e\u0443\u0442\u0431\u0443\u043a \u0434\u043b\u044f \u0441\u0430\u043c\u044b\u0445 \u043d\u0435\u0442\u0435\u0440\u043f\u0435\u043b\u0438\u0432\u044b\u0445<\/a><\/p>\n<p>\u041f\u0440\u043e\u043f\u0443\u0441\u0442\u0438\u043c \u043a\u0440\u0430\u0439\u043d\u0435 \u043d\u0435\u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u044b\u0439 \u0441\u0431\u043e\u0440 \u043a\u0430\u0440\u0442\u0438\u043d\u043e\u043a \u0441 \u043f\u043e\u0434\u043f\u0438\u0441\u044f\u043c\u0438, \u0443 \u043c\u0435\u043d\u044f \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u0441\u044f \u0442\u0430\u043a\u043e\u0439 \u0441\u0435\u0442 \u043d\u0430 500 \u043f\u0440\u0438\u043c\u0435\u0440\u043e\u0432 \u0432 
\u0442\u0430\u043a\u043e\u043c \u0444\u043e\u0440\u043c\u0430\u0442\u0435<\/p>\n<figure class=\"full-width\"><figcaption><\/figcaption><\/figure>\n<p>\u0418 \u0432\u043e\u0442 \u0442\u0430\u043a\u043e\u0435 \u043e\u0431\u043b\u0430\u043a\u043e \u0441\u043b\u043e\u0432 \u0434\u043b\u044f \u043d\u0430\u0448\u0435\u0433\u043e \u0441\u0435\u0442\u0430<\/p>\n<figure class=\"\"><figcaption><\/figcaption><\/figure>\n<p>\u0423\u0441\u0442\u0430\u043d\u043e\u0432\u0438\u043c \u0432\u0441\u0435 \u0447\u0442\u043e \u043d\u0430\u043c \u043f\u043e\u043d\u0430\u0434\u043e\u0431\u0438\u0442\u0441\u044f, \u0441\u043a\u0430\u0447\u0430\u0435\u043c \u0438 \u0440\u0430\u0437\u0436\u0438\u043c\u0430\u0435\u043c \u0434\u0430\u043d\u043d\u044b\u0435<\/p>\n<pre><code>!pip install rudolph==0.0.1rc4 > \/dev\/null !pip install bitsandbytes-cuda111 > \/dev\/null !pip install wandb > \/dev\/null !gdown https:\/\/drive.google.com\/uc?id=17bPt7G3N_vGKCCxppIOPbPlhv1qUnv0o !unzip -qn food.zip > \/dev\/null<\/code><\/pre>\n<p>\u0413\u0443\u0433\u043b \u0434\u0438\u0441\u043a \u043a\u0430\u0436\u0435\u0442\u0441\u044f \u0438\u0437\u043c\u0435\u043d\u0438\u043b\u0438 \u043f\u043e\u043b\u0438\u0442\u0438\u043a\u0443 \u0440\u0430\u0441\u043f\u0440\u043e\u0441\u0442\u0440\u0430\u043d\u0435\u043d\u0438\u044f \u0444\u0430\u0439\u043b\u043e\u0432, \u0441\u043a\u0430\u0447\u0430\u0439\u0442\u0435 \u0440\u0443\u043a\u0430\u043c\u0438 \u0444\u0430\u0439\u043b \u0441 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u043e\u043c \u0438 \u043f\u043e\u0433\u0440\u0443\u0437\u0438\u0442\u0435 \u0432 \u0441\u0440\u0435\u0434\u0443 \u0432\u044b\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f<\/p>\n<p>\u0418\u043c\u043f\u043e\u0440\u0442\u0438\u0440\u0443\u0435\u043c \u0432\u0441\u0435 \u0447\u0442\u043e \u043d\u0443\u0436\u043d\u043e<\/p>\n<pre><code>import os import sys import random from collections import Counter  import PIL import torch import numpy as np import pandas as pd import bitsandbytes as bnb import 
torchvision.transforms as T import torchvision.transforms.functional as TF from tqdm import tqdm from wordcloud import WordCloud from matplotlib import pyplot as plt from torch.utils.data import Dataset, DataLoader from rudalle import get_tokenizer, get_vae from rudalle.utils import seed_everything  from rudolph.model.utils import get_attention_mask from rudolph.model import get_rudolph_model, ruDolphModel, FP16Module from rudolph.pipelines import generate_codebooks, self_reranking_by_image, self_reranking_by_text, show, generate_captions, generate_texts, zs_clf from rudolph import utils  device = 'cuda'  model = get_rudolph_model('350M',  fp16=True, device='cuda') tokenizer = get_tokenizer() vae = get_vae(dwt=False).to(device)<\/code><\/pre>\n<p>\u041e\u043f\u0438\u0448\u0435\u043c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b \u0434\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f, \u043a\u0443\u0434\u0430 \u0431\u0443\u0434\u0435\u043c \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0442\u044c \u043c\u043e\u0434\u0435\u043b\u044c \u0438 \u0433\u0438\u043f\u0435\u0440\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b \u043e\u043f\u0442\u0438\u043c\u0430\u0439\u0437\u0435\u0440\u0430<\/p>\n<pre><code>class Args():     def __init__(self, model):         self.device = model.get_param('device')         self.l_text_seq_length = model.get_param('l_text_seq_length')         self.r_text_seq_length = model.get_param('r_text_seq_length')         self.image_tokens_per_dim = model.get_param('image_tokens_per_dim')         self.image_seq_length = model.get_param('image_seq_length')         self.epochs = 5         self.save_path='checkpoints\/'         self.model_name = 'awesomemodel_'         self.save_every = 500         self.bs = 10         self.clip = 1.0         self.lr = 2e-5         self.wandb = False         self.lt_loss_weight = 0.01         self.img_loss_weight = 1         self.rt_loss_weight = 7         self.image_size = 
self.image_tokens_per_dim * 8  args = Args(model) if not os.path.exists(args.save_path):     os.makedirs(args.save_path)<\/code><\/pre>\n<p>\u0421\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u043d\u044b\u0439 \u043a\u043b\u0430\u0441\u0441 Dataset \u0434\u043b\u044f image2text text2image \u0437\u0430\u0434\u0430\u0447<\/p>\n<pre><code>class FoodDataset(Dataset):     def __init__(self, file_path, csv_path, tokenizer, shuffle=True):         self.tokenizer = tokenizer         self.samples = []         self.image_transform = T.Compose([             T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),             T.RandomResizedCrop(args.image_size, scale=(1., 1.), ratio=(1., 1.)),             T.ToTensor()         ])          df = pd.read_csv(csv_path)         df.columns = ['index', 'belok', 'fats', 'uglevod', 'kkal', 'name', 'path']          for belok, fats, uglevod, kkal, caption, f_path in zip(             df['belok'],df['fats'], df['uglevod'], df['kkal'], df['name'], df['path']         ):             caption = f'\u0431\u043b\u044e\u0434\u043e: {caption}; \u0431\u0435\u043b\u043a\u043e\u0432: {belok}; \u0436\u0438\u0440\u043e\u0432: {fats}; \u0443\u0433\u043b\u0435\u0432\u043e\u0434\u043e\u0432: {uglevod}; \u043a\u043a\u0430\u043b: {kkal};'             if len(caption)>10 and len(caption)&lt;100 and os.path.isfile(f'{file_path}\/{f_path}'):                 self.samples.append([file_path, f_path, caption.lower()])         if shuffle:             np.random.shuffle(self.samples)             print('Shuffled')      def __len__(self):         return len(self.samples)      def load_image(self, file_path, img_name):         return PIL.Image.open(f'{file_path}\/{img_name}')      def __getitem__(self, item):         item = item % len(self.samples)         file_path, img_name, text = self.samples[item]          try:             image = self.load_image(file_path, img_name)             image = self.image_transform(image)         except Exception as err:            
   print(err)             random_item = random.randint(0, len(self.samples) - 1)             return self.__getitem__(random_item)                  text = text.lower().strip()         encoded = self.tokenizer.encode_text(text, text_seq_length=args.r_text_seq_length)                return encoded, image<\/code><\/pre>\n<p>\u041e\u0431\u0440\u0430\u0442\u0438\u0442\u0435 \u0432\u043d\u0438\u043c\u0430\u043d\u0438\u0435 \u0447\u0442\u043e \u0440\u0430\u0437\u043c\u0435\u0440 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f 128*128, \u043f\u043e\u044d\u0442\u043e\u043c\u0443 \u043a\u0430\u0447\u0435\u0441\u0442\u0432\u043e \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f \u0431\u0443\u0434\u0435\u0442 \u043d\u0435 \u043e\u0447\u0435\u043d\u044c \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u043d\u044b\u043c, \u0434\u043b\u044f \u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u0439 \u043b\u0443\u0447\u0448\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c ruDall-e, \u0442\u0430\u043c \u0438 \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432 \u0432 5 \u0440\u0430\u0437 \u0431\u043e\u043b\u044c\u0448\u0435 \u0438 \u0440\u0430\u0437\u043c\u0435\u0440 \u0438\u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u0438\u044f 256.<\/p>\n<p>\u041e\u043f\u0438\u0448\u0435\u043c \u043a\u043b\u0430\u0441\u0441 \u0434\u0430\u0442\u0430\u0441\u0435\u0442\u0430 \u0438 \u0437\u0430\u0433\u0440\u0443\u0437\u0438\u043c \u0432 DataLoader <\/p>\n<pre><code>dataset = FoodDataset(file_path='\/content\/food' ,csv_path ='\/content\/food\/food.csv',tokenizer=tokenizer) train_dataloader = DataLoader(dataset, batch_size=args.bs, shuffle=True, drop_last=True)<\/code><\/pre>\n<p>\u0412\u044b\u0441\u0442\u0430\u0432\u0438\u043c \u043b\u043e\u0433\u0438 \u043d\u0430 Wandb \u0435\u0441\u043b\u0438 \u0432\u044b 
\u0437\u0430\u043b\u043e\u0433\u0438\u043d\u0438\u043b\u0438\u0441\u044c<\/p>\n<pre><code>try:     if args.wandb:         import wandb         wandb.init(project = args.model_name) except:     args.wandb = False     print('If you want to use wandb logs pls login via wandb -login')<\/code><\/pre>\n<p>\u0417\u0430\u043c\u043e\u0440\u043e\u0437\u0438\u043c \u0447\u0430\u0441\u0442\u044c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u043e\u0432 \u043c\u043e\u0434\u0435\u043b\u0438 \u0434\u043b\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0438 \u043f\u0430\u043c\u044f\u0442\u0438 \u0438 \u0432\u043e\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u0441\u044f 8-\u0431\u0438\u0442\u043d\u044b\u043c \u043e\u043f\u0442\u0438\u043c\u0430\u0439\u0437\u0435\u0440\u043e\u043c \u0434\u043b\u044f \u0431\u043e\u043b\u0435\u0435 \u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e\u0433\u043e \u0444\u0430\u0439\u043d\u0442\u044e\u043d\u0430<\/p>\n<pre><code>def freeze(     model,     freeze_emb=False,     freeze_ln=False,     freeze_attn=True,     freeze_ff=True,     freeze_other=False, ):     for name, p in model.module.named_parameters():         name = name.lower()         if 'ln' in name or 'norm' in name:             p.requires_grad = not freeze_ln         elif 'embeddings' in name:             p.requires_grad = not freeze_emb         elif 'mlp' in name:             p.requires_grad = not freeze_ff         elif 'attn' in name:             p.requires_grad = not freeze_attn         else:             p.requires_grad = not freeze_other     return model model.train() optimizer = bnb.optim.Adam8bit(model.parameters(), lr=args.lr)  scheduler = torch.optim.lr_scheduler.OneCycleLR(     optimizer, max_lr=args.lr, final_div_factor=500,      steps_per_epoch=len(train_dataloader), epochs=args.epochs  )<\/code><\/pre>\n<p>\u0418 \u0444\u0443\u043d\u043a\u0446\u0438\u044e \u0434\u043b\u044f \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f 
\u043c\u043e\u0434\u0435\u043b\u0438<\/p>\n<pre><code>def train(model,args: Args, train_dataloader: FoodDataset):   \"\"\"   args - arguments for training    train_dataloader - RuDalleDataset class with text - image pair in batch   \"\"\"    loss_logs = []   try:     progress = tqdm(total=len(train_dataloader)*args.epochs, desc='finetuning goes brrr??\u2603\ufe0f')          save_counter = 0      for epoch in range(args.epochs):              for text, images in train_dataloader:                  save_counter+=1          model.zero_grad()          total_seq_length = args.l_text_seq_length + args.image_seq_length + args.r_text_seq_length                           masks = torch.ones(args.bs, args.r_text_seq_length, dtype=torch.int32)          attention_mask = get_attention_mask(masks, args.bs, args.l_text_seq_length, args.image_tokens_per_dim,                                                     args.r_text_seq_length, device)                  image_input_ids = vae.get_codebook_indices(images.to(device))                  r_text = text.to(device)          l_text = torch.zeros((args.bs, args.l_text_seq_length), device=device, dtype=torch.long)          input_ids = torch.cat((l_text, image_input_ids, r_text), dim=1)           loss, loss_values = model.forward(input_ids, attention_mask, lt_loss_weight=args.lt_loss_weight,         img_loss_weight=args.img_loss_weight,rt_loss_weight=args.rt_loss_weight,  return_loss=True)          loss.backward()                  torch.nn.utils.clip_grad_norm_(model.parameters(),args.clip)         optimizer.step()         scheduler.step()         optimizer.zero_grad()         if save_counter % args.save_every == 0:             print(f'Saveing checkpoint 
here<\/code><\/pre>\n<\/div>\n<\/div>\n<\/div>\n<\/div>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-329918","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/329918","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=329918"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/329918\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=329918"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=329918"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=329918"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}