html_image_url_extractor module (OCA#624)

* Image extractor from HTML fields. (OCA#354) * [8.0][html_image_url_extractor] Image extractor from HTML fields. This technical utility allows the developer to get a list of image URLs from any piece of HTML. You can use it for example, to get the cover image from a blog post (upcoming module), or to create a slider with all images from it. * [9.0] [MIG] html_image_url_extractor * Updated README.rst
tarteo · Jan 9, 2020 · 8580954 · 8580954
1 parent efbe824
commit 8580954
Show file tree

Hide file tree

Showing 8 changed files with 260 additions and 0 deletions.
diff --git a/html_image_url_extractor/README.rst b/html_image_url_extractor/README.rst
@@ -0,0 +1,78 @@
+.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg
+ :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+ :alt: License: AGPL-3
+
+==========================
+Image URLs from HTML field
+==========================
+
+This module includes a method that extracts image URLs from any chunk of HTML,
+in order of appearance.
+
+Usage
+=====
+
+This module just adds a technical utility, but nothing for the end user.
+
+If you are a developer and need this utility for your module, see these
+examples and read the docs inside the code.
+
+Python example::
+
+    @api.multi
+    def some_method(self):
+        # Get images from an HTML field
+        imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field)
+        for url in imgs:
+            # Do stuff with those URLs
+            pass
+
+QWeb example::
+
+    <!-- Extract first image from a blog post -->
+    <t t-foreach="env['ir.fields.converter']
+                  .imgs_from_html(blog_post.content, 1)"
+       t-as="url">
+        <img t-att-src="url"/>
+    </t>
+
+.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas
+ :alt: Try me on Runbot
+ :target: https://runbot.odoo-community.org/runbot/149/9.0
+
+Known issues / Roadmap
+======================
+
+* The regexp to find the URL could be better.
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues
+<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please
+check there if your issue has already been reported. If you spotted it first,
+help us smash it by providing detailed and welcome feedback.
+
+Credits
+=======
+
+Contributors
+------------
+
+* Jairo Llopis <[email protected]>
+* Vicent Cubells <[email protected]>
+
+Maintainer
+----------
+
+.. image:: https://odoo-community.org/logo.png
+ :alt: Odoo Community Association
+ :target: https://odoo-community.org
+
+This module is maintained by the OCA.
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+To contribute to this module, please visit https://odoo-community.org.
diff --git a/html_image_url_extractor/__init__.py b/html_image_url_extractor/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import models
diff --git a/html_image_url_extractor/__openerp__.py b/html_image_url_extractor/__openerp__.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
# Odoo module manifest: a single dict literal describing the addon.
{
    # Human-readable identification
    "name": "Image URLs from HTML field",
    "summary": "Extract images found in any HTML field",
    "category": "Tools",
    "version": "9.0.1.0.0",
    # Authorship and licensing
    "author": "Tecnativa, Odoo Community Association (OCA)",
    "website": "https://tecnativa.com",
    "license": "AGPL-3",
    # Installation flags: a technical add-on, not a standalone application
    "installable": True,
    "application": False,
    # Odoo modules and Python packages this add-on relies on
    "depends": [
        "base",
    ],
    "external_dependencies": {
        "python": [
            "lxml.html",
        ],
    },
}
diff --git a/html_image_url_extractor/models/__init__.py b/html_image_url_extractor/models/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import ir_fields_converter
diff --git a/html_image_url_extractor/models/ir_fields_converter.py b/html_image_url_extractor/models/ir_fields_converter.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# Copyright 2016 Tecnativa - Vicent Cubells
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import re
+import logging
+from lxml import etree, html
+from openerp import api, models
+
+_logger = logging.getLogger(__name__)
+
+
class IrFieldsConverter(models.Model):
    """Add an image URL extractor for HTML chunks to ``ir.fields.converter``."""

    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all images in order from an HTML field in a generator.

        URLs are taken from the ``src`` attribute of ``<img>`` elements and
        from every ``url(...)`` found in ``background`` or
        ``background-image`` declarations of inline ``style`` attributes,
        following document order.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only get up to this number of image URLs. Any falsy value
            (``None``, ``0``) means no limit.

        :param bool fail:
            If ``True``, parsing exceptions will be raised. Otherwise they
            are logged and nothing is yielded.

        :return:
            Generator yielding each image URL as found in the HTML.

        :raise TypeError, lxml.etree.XMLSyntaxError, lxml.etree.ParserError:
            Only when ``fail`` is ``True`` and the HTML cannot be parsed.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s", html_content)
            return

        # Required tools: elements that are either an <img> with a source or
        # carry an inline style mentioning both "background" and "url("
        # (case-insensitively, hence the translate() calls).
        query = """
            //img[@src] |
            //*[contains(translate(@style, "BACKGROUND", "background"),
                         'background')]
               [contains(translate(@style, "URL", "url"), 'url(')]
        """
        rgx = re.compile(
            r"""
            url\(\s*            # Start function
            (?P<url>[^)]*)      # URL string
            \s*\)               # End function
            """,
            re.IGNORECASE | re.VERBOSE)
        background_properties = ("background", "background-image")

        # Count yielded URLs, not visited elements: a styled element may
        # provide zero, one or several images.
        found = 0

        # Loop through possible image URLs
        for element in doc.xpath(query):
            if element.tag == "img":
                urls = [element.attrib["src"]]
            else:
                urls = []
                # NOTE: splitting on ";" cannot cope with values that embed
                # a semicolon themselves (e.g. ``data:`` URIs).
                for rule in element.attrib["style"].split(";"):
                    # Malformed rules without ":" leave ``value`` empty, so
                    # they simply produce no match.
                    name, _sep, value = rule.partition(":")
                    if name.strip().lower() not in background_properties:
                        continue
                    # A declaration may list several comma-separated images
                    for match in rgx.finditer(value):
                        # The greedy URL group swallows trailing blanks, so
                        # strip whitespace before removing the quotes.
                        urls.append(
                            match.group("url").strip().strip("\"'"))

            for url in urls:
                yield url
                found += 1
                if limit and found >= limit:
                    return
diff --git a/html_image_url_extractor/static/description/icon.png b/html_image_url_extractor/static/description/icon.png
diff --git a/html_image_url_extractor/tests/__init__.py b/html_image_url_extractor/tests/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import test_extractor
diff --git a/html_image_url_extractor/tests/test_extractor.py b/html_image_url_extractor/tests/test_extractor.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from lxml import etree
+from openerp.tests.common import TransactionCase
+
+
class ExtractorCase(TransactionCase):
    """Tests for ``imgs_from_html`` of the ``ir.fields.converter`` model."""

    def setUp(self):
        super(ExtractorCase, self).setUp()

        # Shortcut
        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html

    def test_mixed_images_found(self):
        """Images correctly found in <img> elements and backgrounds."""
        content = u"""
            <div>
                <!-- src-less img -->
                <img/>
                <p/>
                <img src="/path/0"/>
                <img src="/path/1"/>
                <img src="/path/2"/>
                <img src="/path/3"/>
                <section style="background : URL('/path/4');;background;ö;">
                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
                        <p style="background:uRl(&quot;/path/6&quot;)">
                            <img src="/path/7"/>
                        </p>
                    </div>
                </section>
            </div>
            """
        expected = ["/path/%d" % n for n in range(8)]

        # Read all images; comparing whole lists also catches the case where
        # nothing (or too much) is yielded.
        self.assertEqual(list(self.imgs_from_html(content)), expected)

        # Read only first image
        self.assertEqual(list(self.imgs_from_html(content, 1)), expected[:1])

    def test_empty_html(self):
        """Empty HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("")), [])

        # NOTE(review): which of these two lxml raises for an empty document
        # appears to depend on the installed lxml version; the method under
        # test handles both, so accept either. Confirm against the target
        # lxml release.
        with self.assertRaises((etree.XMLSyntaxError, etree.ParserError)):
            list(self.imgs_from_html("", fail=True))

    def test_false_html(self):
        """``False`` HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html(False)), [])

        with self.assertRaises(TypeError):
            list(self.imgs_from_html(False, fail=True))

    def test_bad_html(self):
        """Bad HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("<<bad>")), [])

        with self.assertRaises(etree.ParserError):
            list(self.imgs_from_html("<<bad>", fail=True))