{"id":171447,"date":"2013-03-03T22:07:03","date_gmt":"2013-03-03T18:07:03","guid":{"rendered":"http:\/\/savepearlharbor.com\/?p=171447"},"modified":"-0001-11-30T00:00:00","modified_gmt":"-0001-11-29T21:00:00","slug":"","status":"publish","type":"post","link":"https:\/\/savepearlharbor.com\/?p=171447","title":{"rendered":"<span class=\"post_title\">\u0421\u0430\u043c\u044b\u0439 \u0431\u044b\u0441\u0442\u0440\u044b\u0439 SAX-\u043f\u0430\u0440\u0441\u0435\u0440 \u0434\u043b\u044f python<\/span>"},"content":{"rendered":"<div class=\"content html_format\">   \t\u0412\u043d\u0435\u0437\u0430\u043f\u043d\u043e \u0437\u0430\u0445\u043e\u0442\u0435\u043b\u043e\u0441\u044c \u043f\u0435\u0440\u0435\u0441\u0447\u0438\u0442\u0430\u0442\u044c \u0432\u0441\u0435 xml-\u0442\u0435\u0433\u0438 \u0432 240 \u0442\u044b\u0441\u044f\u0447\u0430\u0445 xml-\u0444\u0430\u0439\u043b\u043e\u0432 \u043e\u0431\u0449\u0438\u043c \u0432\u0435\u0441\u043e\u043c 180 GB. \u041f\u0438\u0442\u043e\u043d\u043e\u043c \u2014 \u0438 \u043f\u043e\u0431\u044b\u0441\u0442\u0440\u0435\u0435.<a name=\"habracut\"><\/a>  <\/p>\n<h4>\u0417\u0430\u0434\u0430\u0447\u0430<\/h4>\n<p>  \u041d\u0430 \u0441\u0430\u043c\u043e\u043c \u0434\u0435\u043b\u0435 \u0437\u0430\u0445\u043e\u0442\u0435\u043b\u043e\u0441\u044c \u043f\u0440\u0438\u043a\u0438\u043d\u0443\u0442\u044c \u2014 \u043d\u0430\u0441\u043a\u043e\u043b\u044c\u043a\u043e \u0440\u0435\u0430\u043b\u044c\u043d\u043e \u043f\u0435\u0440\u0435\u0433\u043d\u0430\u0442\u044c \u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0443, \u0427\u044c\u0435 \u0418\u043c\u044f \u041d\u0435\u043b\u044c\u0437\u044f \u041f\u0440\u043e\u0438\u0437\u043d\u043e\u0441\u0438\u0442\u044c \u0412\u0441\u043b\u0443\u0445, \u0438\u0437 fb2 \u0432 docbook. 
\u0412 \u0441\u0432\u044f\u0437\u0438 \u0441\u043e \u00ab\u0441\u043f\u0435\u0446\u0438\u0444\u0438\u0447\u043d\u043e\u0441\u0442\u044c\u044e\u00bb FB2 \u043d\u0430\u0434\u043e \u043f\u0440\u0438\u043a\u0438\u043d\u0443\u0442\u044c \u2014 \u043a\u0430\u043a\u0438\u0435 \u0442\u0435\u0433\u0438 \u043c\u043e\u0436\u043d\u043e \u043f\u0440\u043e\u0441\u0442\u043e \u043f\u0440\u043e\u043f\u0443\u0441\u0442\u0438\u0442\u044c \u0432\u0432\u0438\u0434\u0443 \u0440\u0435\u0434\u043a\u043e\u0441\u0442\u0438. \u0422.\u0435. \u043f\u0440\u043e\u0441\u0442\u043e \u043f\u0435\u0440\u0435\u0441\u0447\u0438\u0442\u0430\u0442\u044c \u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e \u0432\u0445\u043e\u0436\u0434\u0435\u043d\u0438\u044f \u043a\u0430\u0436\u0434\u043e\u0433\u043e \u0442\u0435\u0433\u0430 \u0432\u043e \u0432\u0441\u0435 \u0444\u0430\u0439\u043b\u044b.<br \/>  \u041f\u043e \u0434\u043e\u0440\u043e\u0433\u0435 \u043f\u043b\u0430\u043d\u0438\u0440\u043e\u0432\u0430\u043b\u043e\u0441\u044c \u0441\u0440\u0430\u0432\u043d\u0438\u0442\u044c \u0440\u0430\u0437\u043d\u044b\u0435 sax-\u043f\u0430\u0440\u0441\u0435\u0440\u044b. \u041a \u0441\u043e\u0436\u0430\u043b\u0435\u043d\u0438\u044e \u2014 \u0442\u0435\u0441\u0442\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f \u043d\u0435 \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u043e\u0441\u044c, \u0442.\u043a. \u0438 xml.sax \u0438 lxml \u043d\u0430 \u043f\u0435\u0440\u0432\u043e\u043c \u0436\u0435 fb2 \u043f\u043e\u043b\u043e\u043c\u0430\u043b\u0438\u0441\u044c. \u0412 \u0438\u0442\u043e\u0433\u0435 \u043e\u0441\u0442\u0430\u043b\u0441\u044f xml.parsers.expat.<br \/>  \u0414\u0430, \u0438 \u0435\u0449\u0435 \u2014 \u0444\u0430\u0439\u043b\u044b *.fb2 \u0443\u043f\u0430\u043a\u043e\u0432\u0430\u043d\u044b \u0432 zip-\u0430\u0440\u0445\u0438\u0432\u044b.  
<\/p>\n<h4>\u0418\u0441\u0445\u043e\u0434\u043d\u044b\u0435 \u0434\u0430\u043d\u043d\u044b\u0435<\/h4>\n<p>  \u0418\u0441\u0445\u043e\u0434\u043d\u044b\u043c\u0438 \u0434\u0430\u043d\u043d\u044b\u043c\u0438 \u044f\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0441\u043d\u0430\u043f\u0448\u043e\u0442 \u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438 \u043f\u043e \u0441\u043e\u0441\u0442\u043e\u044f\u043d\u0438\u044e \u043d\u0430 2013.02.01, \u0446\u0435\u043b\u044c\u043d\u043e\u0442\u044f\u043d\u0443\u0442\u044b\u0439 \u0438\u0437 <s>\u0442\u043e\u0440<\/s> \u0418\u043d\u0442\u0435\u0440\u043d\u0435\u0442\u043e\u0432: 242525 \u0444\u0430\u0439\u043b\u043e\u0432 *.fb2 \u043e\u0431\u0449\u0438\u043c \u0432\u0435\u0441\u043e\u043c 183909288096 \u0431\u0430\u0439\u0442, \u0443\u043f\u0430\u043a\u043e\u0432\u0430\u043d\u043d\u044b\u0445 \u0432 56 zip-\u0430\u0440\u0445\u0438\u0432\u043e\u0432 \u043e\u0431\u0449\u0438\u043c \u0432\u0435\u0441\u043e\u043c 82540008 \u0431\u0430\u0439\u0442.<br \/>  \u041f\u043b\u0430\u0442\u0444\u043e\u0440\u043c\u0430: Asus X5DIJ (Pentium DualCore T4500 (2&#215;2.30), 2GB RAM); Fedora 18, python 2.7.  
<\/p>\n<h4>\u041a\u043e\u0434<\/h4>\n<p>  \u041d\u0430\u043f\u0438\u0441\u0430\u043d\u043e \u043d\u0430 \u0441\u043a\u043e\u0440\u0443\u044e \u0440\u0443\u043a\u0443, \u0441 \u043f\u0440\u0435\u0442\u0435\u043d\u0437\u0438\u0435\u0439 \u043d\u0430 \u0443\u043d\u0438\u0432\u0435\u0440\u0441\u0430\u043b\u044c\u043d\u043e\u0441\u0442\u044c:  <\/p>\n<pre><code>#!\/bin\/env python # -*- coding: utf-8 -*- ''' '''  import sys, os, zipfile, hashlib, pprint import xml.parsers.expat, magic  mime = magic.open(magic.MIME_TYPE) mime.load() tags = dict() files = 0  reload(sys) sys.setdefaultencoding('utf-8')  def start_element(name, attrs): \ttags[name] = tags[name] + 1 if name in tags else 1  def\tparse_dir(fn): \tdirlist = os.listdir(fn) \tdirlist.sort() \tfor i in dirlist: \t\tparse_file(os.path.join(fn, i))  def\tparse_file(fn): \tm = mime.file(fn) \tif (m == 'application\/zip'): \t\tparse_zip(fn) \telif (m == 'application\/xml'): \t\tparse_fb2(fn) \telse: \t\tprint &gt;&gt; sys.stderr, 'Unknown mime type (%s) of file %s' % (m, fn)  def\tparse_zip(fn): \tprint &gt;&gt; sys.stderr, 'Zip:', os.path.basename(fn) \tz = zipfile.ZipFile(fn, 'r') \tfilelist = z.namelist() \tfilelist.sort() \tfor n in filelist: \t\ttry: \t\t\tparse_fb2(z.open(n)) \t\t\tprint &gt;&gt; sys.stderr, n \t\texcept: \t\t\tprint &gt;&gt; sys.stderr, 'X:', n  def\tparse_fb2(fn): \tglobal files \tif isinstance(fn, str): \t\tfn = open(fn) \tparser = xml.parsers.expat.ParserCreate() \tparser.StartElementHandler = start_element \tparser.Parse(fn.read(), True) \tfiles += 1  def\tprint_result(): \tout = open('result.txt', 'w') \tfor k, v in tags.iteritems(): \t\tout.write(u'%s\\t%d\\n' % (k, v)) \tprint 'Files:', files  if (__name__ == '__main__'): \tif len(sys.argv) != 2: \t\tprint &gt;&gt; sys.stderr, 'Usage: %s &lt;xmlfile|zipfile|folder&gt;' % sys.argv[0] \t\tsys.exit(1) \tsrc = sys.argv[1] \tif (os.path.isdir(src)): \t\tparse_dir(src) \telse: \t\tparse_file(src) \tprint_result()  <\/code><\/pre>\n<p>  
<\/p>\n<h4>\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u044b<\/h4>\n<p>  \u0417\u0430\u0440\u044f\u0436\u0430\u0435\u043c:  <\/p>\n<pre><code>time nice .\/thisfile.py ~\/Torrent\/....ec &gt; out.txt 2&gt;err.txt<\/code><\/pre>\n<p>  \u041f\u043e\u043b\u0443\u0447\u0430\u0435\u043c:<br \/>  * \u0412\u0440\u0435\u043c\u044f \u0432\u044b\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f \u2014 74&#8217;15..45&quot; (\u043f\u0430\u0440\u0430\u043b\u043b\u0435\u043b\u044c\u043d\u043e \u0432\u044b\u043f\u043e\u043b\u043d\u044f\u043b\u0430\u0441\u044c \u043d\u0435\u0431\u043e\u043b\u044c\u0448\u0430\u044f \u0440\u0430\u0431\u043e\u0442\u0430 \u0438 \u0441\u043b\u0443\u0448\u0430\u043b\u0430\u0441\u044c \u043c\u0443\u0437\u044b\u043a\u0430, \u0435\u0441\u0441\u043d\u043e);<br \/>  * \u041f\u043e\u043b\u0443\u0447\u0438\u043b\u043e\u0441\u044c, \u0447\u0442\u043e \u0441\u043a\u043e\u0440\u043e\u0441\u0442\u044c \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u2014 ~40 MB\/s (\u0438\u043b\u0438 58 \u0442\u0430\u043a\u0442\u043e\u0432\/\u0431\u0430\u0439\u0442);<br \/>  * \u041e\u0442\u0431\u0440\u043e\u0448\u0435\u043d\u043e 2584 \u0444\u0430\u0439\u043b\u0430 *.fb2 (expat \u0445\u043e\u0442\u044c \u0438 non-validating parser \u2014 \u043d\u043e \u043d\u0435 \u0434\u043e \u0442\u0430\u043a\u043e\u0439 \u0436\u0435 \u0441\u0442\u0435\u043f\u0435\u043d\u0438&#8230;) \u2014 ~1%;<br \/>  * \u0432 \u0444\u0430\u0439\u043b\u0435 result.txt \u2014 \u0447\u0435\u0433\u043e \u0442\u043e\u043b\u044c\u043a\u043e \u043d\u0435\u0442\u2026<\/p>\n<p>  \u0410 \u0431\u044b\u0441\u0442\u0440\u0435\u0435 \u2014 \u043c\u043e\u0436\u043d\u043e? \u041d\u0430 python.    
\t \t\t   \t<\/p>\n<div class=\"clear\"><\/div>\n<\/p><\/div>\n<p> \u0441\u0441\u044b\u043b\u043a\u0430 \u043d\u0430 \u043e\u0440\u0438\u0433\u0438\u043d\u0430\u043b \u0441\u0442\u0430\u0442\u044c\u0438 <a href=\"http:\/\/habrahabr.ru\/post\/171447\/\"> http:\/\/habrahabr.ru\/post\/171447\/<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<div class=\"content html_format\">   \t\u0412\u043d\u0435\u0437\u0430\u043f\u043d\u043e \u0437\u0430\u0445\u043e\u0442\u0435\u043b\u043e\u0441\u044c \u043f\u0435\u0440\u0435\u0441\u0447\u0438\u0442\u0430\u0442\u044c \u0432\u0441\u0435 xml-\u0442\u0435\u0433\u0438 \u0432 240 \u0442\u044b\u0441\u044f\u0447\u0430\u0445 xml-\u0444\u0430\u0439\u043b\u043e\u0432 \u043e\u0431\u0449\u0438\u043c \u0432\u0435\u0441\u043e\u043c 180 GB. \u041f\u0438\u0442\u043e\u043d\u043e\u043c \u2014 \u0438 \u043f\u043e\u0431\u044b\u0441\u0442\u0440\u0435\u0435.<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-171447","post","type-post","status-publish","format-standard","hentry"],"_links":{"self":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/171447","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=171447"}],"version-history":[{"count":0,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=\/wp\/v2\/posts\/171447\/revisions"}],"wp:attachment":[{"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=171447"}],"wp:term":
[{"taxonomy":"category","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=171447"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/savepearlharbor.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=171447"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}