From e9d3496e02e7f6c64a8457bd099980f8f2d9f336 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Fri, 4 Feb 2022 01:30:38 +0100 Subject: [PATCH 1/4] Update codebase to support Readability 2 The dom property is initialized by Readability::loadHtml() which has been moved from __construct() to init() in v2; thus we must call init() early. We also remove the previous init() call to prevent any issues as it is not idempotent. Signed-off-by: Kevin Decherf --- src/Extractor/ContentExtractor.php | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 7101605..7078514 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -158,6 +158,7 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig $this->logger->info('Attempting to parse HTML with {parser}', ['parser' => $parser]); $this->readability = $this->getReadability($html, $url, $parser, $this->siteConfig->tidy() && $smartTidy); + $success = $this->readability->init(); $tidied = $this->readability->tidied; $this->logger->info('Body size after Readability: {length}', ['length' => \strlen((string) $this->readability->dom->saveXML($this->readability->dom->documentElement))]); @@ -471,17 +472,6 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig 'Date found (datetime marked time element): {date}' ); - // still missing title or body, so we detect using Readability - $success = false; - if ($detectTitle || $detectBody) { - $this->logger->info('Using Readability'); - // clone body if we're only using Readability for title (otherwise it may interfere with body element) - if (isset($this->body)) { - $this->body = $this->body->cloneNode(true); - } - $success = $this->readability->init(); - } - if ($detectTitle && $this->readability->getTitle()->textContent) { $this->title = trim($this->readability->getTitle()->textContent); $this->logger->info('Detected title: {title}', ['title' => $this->title]); From 782db14880e6bd3dec715044fc2fac6d4f44889d Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Fri, 4 Feb 2022 01:35:17 +0100 Subject: [PATCH 2/4] Remove redundant invisible elements stripping The stripping of invisible elements is now done by Readability 2 Signed-off-by: Kevin Decherf --- src/Extractor/ContentExtractor.php | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 7078514..6001656 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -322,12 +322,6 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig $this->removeElements($elems, 'Stripping {length} .entry-unrelated,.instapaper_ignore elements'); - // strip elements that contain style 'display: none' or 'visibility:hidden' - // @todo: inline style are convert to