forked from OCA/server-tools
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
html_image_url_extractor module (OCA#624)
* Image extractor from HTML fields. (OCA#354) * [8.0][html_image_url_extractor] Image extractor from HTML fields. This technical utility allows the developer to get a list of image URLs from any piece of HTML. You can use it for example, to get the cover image from a blog post (upcoming module), or to create a slider with all images from it. * [9.0] [MIG] html_image_url_extractor * Updated README.rst
- Loading branch information
Showing
8 changed files
with
260 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
.. image:: https://img.shields.io/badge/licence-AGPL--3-blue.svg | ||
:target: http://www.gnu.org/licenses/agpl-3.0-standalone.html | ||
:alt: License: AGPL-3 | ||
|
||
========================== | ||
Image URLs from HTML field | ||
========================== | ||
|
||
This module includes a method that extracts image URLs from any chunk of HTML, | ||
in order of appearance. | ||
|
||
Usage | ||
===== | ||
|
||
This module just adds a technical utility, but nothing for the end user. | ||
|
||
If you are a developer and need this utility for your module, see these | ||
examples and read the docs inside the code. | ||
|
||
Python example:: | ||
|
||
@api.multi | ||
def some_method(self): | ||
# Get images from an HTML field | ||
imgs = self.env["ir.fields.converter"].imgs_from_html(self.html_field) | ||
for url in imgs: | ||
# Do stuff with those URLs | ||
pass | ||
|
||
QWeb example:: | ||
|
||
<!-- Extract first image from a blog post --> | ||
<t t-foreach="env['ir.fields.converter'] | ||
.imgs_from_html(blog_post.content, 1)" | ||
t-as="url"> | ||
<img t-att-src="url"/> | ||
</t> | ||
|
||
.. image:: https://odoo-community.org/website/image/ir.attachment/5784_f2813bd/datas | ||
:alt: Try me on Runbot | ||
:target: https://runbot.odoo-community.org/runbot/149/9.0 | ||
|
||
Known issues / Roadmap | ||
====================== | ||
|
||
* The regexp to find the URL could be better. | ||
|
||
Bug Tracker | ||
=========== | ||
|
||
Bugs are tracked on `GitHub Issues | ||
<https://github.com/OCA/server-tools/issues>`_. In case of trouble, please | ||
check there if your issue has already been reported. If you spotted it first, | ||
help us smash it by providing detailed and welcome feedback. | ||
|
||
Credits | ||
======= | ||
|
||
Contributors | ||
------------ | ||
|
||
* Jairo Llopis <[email protected]> | ||
* Vicent Cubells <[email protected]> | ||
|
||
Maintainer | ||
---------- | ||
|
||
.. image:: https://odoo-community.org/logo.png | ||
:alt: Odoo Community Association | ||
:target: https://odoo-community.org | ||
|
||
This module is maintained by the OCA. | ||
|
||
OCA, or the Odoo Community Association, is a nonprofit organization whose | ||
mission is to support the collaborative development of Odoo features and | ||
promote its widespread use. | ||
|
||
To contribute to this module, please visit https://odoo-community.org. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# Copyright 2016 Tecnativa - Vicent Cubells | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
|
||
from . import models |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# Copyright 2016 Tecnativa - Vicent Cubells | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
{ | ||
"name": "Image URLs from HTML field", | ||
"summary": "Extract images found in any HTML field", | ||
"version": "9.0.1.0.0", | ||
"category": "Tools", | ||
"website": "https://tecnativa.com", | ||
"author": "Tecnativa, " | ||
"Odoo Community Association (OCA)", | ||
"license": "AGPL-3", | ||
"application": False, | ||
"installable": True, | ||
"external_dependencies": { | ||
"python": [ | ||
"lxml.html", | ||
], | ||
}, | ||
"depends": [ | ||
"base", | ||
], | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# Copyright 2016 Tecnativa - Vicent Cubells | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
|
||
from . import ir_fields_converter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# Copyright 2016 Tecnativa - Vicent Cubells | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
|
||
import re | ||
import logging | ||
from lxml import etree, html | ||
from openerp import api, models | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class IrFieldsConverter(models.Model):
    """Add an HTML image URL extractor to ``ir.fields.converter``."""
    _inherit = "ir.fields.converter"

    @api.model
    def imgs_from_html(self, html_content, limit=None, fail=False):
        """Extract all image URLs in order from a chunk of HTML.

        URLs are taken from the ``src`` attribute of ``<img>`` elements and
        from ``url(...)`` values found in ``background`` or
        ``background-image`` rules of inline ``style`` attributes.

        This is a generator: parsing (and any parsing error) only happens
        once it is iterated.

        :param str html_content:
            HTML contents from where to extract the images.

        :param int limit:
            Only yield up to this number of URLs. A falsy value (``None`` or
            ``0``) means no limit.

        :param bool fail:
            If ``True``, parsing exceptions are re-raised. Otherwise they are
            logged and nothing is yielded.

        :raise TypeError, lxml.etree.XMLSyntaxError, lxml.etree.ParserError:
            Only when ``fail`` is ``True`` and the HTML cannot be parsed.
        """
        # Parse HTML
        try:
            doc = html.fromstring(html_content)
        except (TypeError, etree.XMLSyntaxError, etree.ParserError):
            if fail:
                raise
            _logger.exception("Failure parsing this HTML:\n%s",
                              html_content)
            return

        # Elements that have a ``src`` (images) or whose inline style
        # mentions both "background" and "url(" (case-insensitively)
        query = """
            //img[@src] |
            //*[contains(translate(@style, "BACKGROUND", "background"),
                         'background')]
               [contains(translate(@style, "URL", "url"), 'url(')]
        """
        rgx = re.compile(
            r"""
                url\(\s*            # Start function
                (?P<url>[^)]*)      # URL string
                \s*\)               # End function
            """,
            re.IGNORECASE | re.VERBOSE)
        background_rules = {"background", "background-image"}

        # ``limit`` applies to yielded URLs, not to visited elements: a
        # styled element may yield zero or several URLs.
        found = 0
        for element in doc.xpath(query):
            if element.tag == "img":
                urls = [element.attrib["src"]]
            else:
                urls = []
                # NOTE: splitting on ";" is a simplification; a URL that
                # itself contains ";" is cut and will not match.
                for rule in element.attrib["style"].split(";"):
                    name, _sep, value = rule.partition(":")
                    if name.strip().lower() not in background_rules:
                        continue
                    match = rgx.search(value)
                    if match is None:
                        # Malformed CSS or no URL in this rule
                        continue
                    # Drop padding inside the parentheses before removing
                    # the optional surrounding quotes
                    urls.append(match.group("url").strip().strip("\"'"))

            for url in urls:
                yield url
                found += 1
                if limit and found >= limit:
                    return
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# -*- coding: utf-8 -*- | ||
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
|
||
from . import test_extractor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# -*- coding: utf-8 -*- | ||
# © 2016 Grupo ESOC Ingeniería de Servicios, S.L.U. - Jairo Llopis | ||
# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html). | ||
|
||
from lxml import etree | ||
from openerp.tests.common import TransactionCase | ||
|
||
|
||
class ExtractorCase(TransactionCase):
    """Tests for ``ir.fields.converter``'s ``imgs_from_html`` method."""

    def setUp(self):
        super(ExtractorCase, self).setUp()

        # Shortcut
        self.imgs_from_html = self.env["ir.fields.converter"].imgs_from_html

    def test_mixed_images_found(self):
        """Images correctly found in <img> elements and backgrounds."""
        # Quotes nested inside a double-quoted ``style`` attribute must be
        # written as ``&quot;`` or the attribute would end prematurely.
        content = u"""
            <div>
                <!-- src-less img -->
                <img/>
                <p/>
                <img src="/path/0"/>
                <img src="/path/1"/>
                <img src="/path/2"/>
                <img src="/path/3"/>
                <section style="background : URL('/path/4');;background;ö;">
                    <div style='BACKGROUND-IMAGE:url(/path/5)'>
                        <p style="background:uRl(&quot;/path/6&quot;)">
                            <img src="/path/7"/>
                        </p>
                    </div>
                </section>
            </div>
            """

        # Read all images; comparing whole lists checks order and count at
        # once and still fails cleanly if nothing is yielded
        self.assertEqual(
            list(self.imgs_from_html(content)),
            ["/path/%d" % n for n in range(8)])

        # Read only first image
        self.assertEqual(
            list(self.imgs_from_html(content, 1)),
            ["/path/0"])

    def test_empty_html(self):
        """Empty HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("")), [])

        # NOTE(review): the exception type raised for empty input may
        # depend on the installed lxml version - confirm.
        with self.assertRaises(etree.XMLSyntaxError):
            list(self.imgs_from_html("", fail=True))

    def test_false_html(self):
        """``False`` HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html(False)), [])

        with self.assertRaises(TypeError):
            list(self.imgs_from_html(False, fail=True))

    def test_bad_html(self):
        """Bad HTML handled correctly."""
        self.assertEqual(list(self.imgs_from_html("<<bad>")), [])

        with self.assertRaises(etree.ParserError):
            list(self.imgs_from_html("<<bad>", fail=True))