diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 9dc04754..23430752 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -292,7 +292,9 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy = } // use JSON-LD to retrieve information - $this->extractJsonLdInformation($html); + if (false === $this->siteConfig->skip_json_ld) { + $this->extractJsonLdInformation($html); + } // strip elements (using xpath expressions) foreach ($this->siteConfig->strip as $pattern) { diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php index 7c80e803..a9d023d5 100644 --- a/src/SiteConfig/ConfigBuilder.php +++ b/src/SiteConfig/ConfigBuilder.php @@ -281,7 +281,7 @@ public function mergeConfig(SiteConfig $currentConfig, SiteConfig $newConfig) // check for single statement commands // we do not overwrite existing non null values - foreach (['tidy', 'prune', 'parser', 'autodetect_on_failure', 'requires_login'] as $var) { + foreach (['tidy', 'prune', 'parser', 'autodetect_on_failure', 'requires_login', 'skip_json_ld'] as $var) { if ($currentConfig->$var === null) { $currentConfig->$var = $newConfig->$var; } @@ -336,7 +336,7 @@ public function parseLines(array $lines) if (\in_array($command, ['title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'next_page_link', 'test_url', 'find_string', 'replace_string', 'login_extra_fields', 'native_ad_clue', 'date', 'author', 'strip_attr'], true)) { array_push($config->$command, $val); // check for single statement commands that evaluate to true or false - } elseif (\in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login'], true)) { + } elseif (\in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login', 'skip_json_ld'], true)) { $config->$command = ('yes' === $val || 'true' === $val); // check for single statement commands stored as strings } elseif 
(\in_array($command, ['parser', 'login_username_field', 'login_password_field', 'not_logged_in_xpath', 'login_uri', 'src_lazy_load_attr'], true)) { diff --git a/src/SiteConfig/SiteConfig.php b/src/SiteConfig/SiteConfig.php index 7c9447de..94701c7f 100644 --- a/src/SiteConfig/SiteConfig.php +++ b/src/SiteConfig/SiteConfig.php @@ -132,6 +132,13 @@ class SiteConfig */ public $login_extra_fields = []; + /** + * Explicitly skip getting data from JSON-LD. + * + * @var bool + */ + public $skip_json_ld = false; + protected $default_tidy = true; // used if undeclared protected $default_autodetect_on_failure = true; // used if undeclared protected $default_prune = true; // used if undeclared diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index 0f5ad75a..b1c748c4 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -939,6 +939,29 @@ public function testJsonLd() $this->assertContains('

hihi

', $content_block->ownerDocument->saveXML($content_block)); } + public function testJsonLdSkipper() + { + $contentExtractor = new ContentExtractor(self::$contentExtractorConfig); + + $config = new SiteConfig(); + $config->skip_json_ld = true; + + $res = $contentExtractor->process( + '
hello !hello !hello !hello !hello !hello !hello !

' . str_repeat('this is the best part of the show', 10) . '

', + 'https://skipjsonld.io/jsonld', + $config + ); + + $this->assertTrue($res, 'Extraction went well'); + + $content_block = $contentExtractor->getContent(); + + $this->assertEmpty($contentExtractor->getTitle()); + $this->assertNull($contentExtractor->getDate()); + $this->assertEmpty($contentExtractor->getAuthors()); + $this->assertContains('this is the best part of the show', $content_block->ownerDocument->saveXML($content_block)); + } + public function testJsonLdName() { $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);