Skip to content

Commit

Permalink
Merge pull request #170 from j0k3r/skip-json-ld
Browse files Browse the repository at this point in the history
Add ability to skip getting data from json-ld
  • Loading branch information
j0k3r authored Oct 29, 2018
2 parents f419f8e + 5fb4e24 commit 8f68263
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 3 deletions.
4 changes: 3 additions & 1 deletion src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,9 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy =
}

// use JSON-LD to retrieve information
$this->extractJsonLdInformation($html);
if (false === $this->siteConfig->skip_json_ld) {
$this->extractJsonLdInformation($html);
}

// strip elements (using xpath expressions)
foreach ($this->siteConfig->strip as $pattern) {
Expand Down
4 changes: 2 additions & 2 deletions src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ public function mergeConfig(SiteConfig $currentConfig, SiteConfig $newConfig)

// check for single statement commands
// we do not overwrite existing non null values
foreach (['tidy', 'prune', 'parser', 'autodetect_on_failure', 'requires_login'] as $var) {
foreach (['tidy', 'prune', 'parser', 'autodetect_on_failure', 'requires_login', 'skip_json_ld'] as $var) {
if ($currentConfig->$var === null) {
$currentConfig->$var = $newConfig->$var;
}
Expand Down Expand Up @@ -336,7 +336,7 @@ public function parseLines(array $lines)
if (\in_array($command, ['title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'next_page_link', 'test_url', 'find_string', 'replace_string', 'login_extra_fields', 'native_ad_clue', 'date', 'author', 'strip_attr'], true)) {
array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false
} elseif (\in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login'], true)) {
} elseif (\in_array($command, ['tidy', 'prune', 'autodetect_on_failure', 'requires_login', 'skip_json_ld'], true)) {
$config->$command = ('yes' === $val || 'true' === $val);
// check for single statement commands stored as strings
} elseif (\in_array($command, ['parser', 'login_username_field', 'login_password_field', 'not_logged_in_xpath', 'login_uri', 'src_lazy_load_attr'], true)) {
Expand Down
7 changes: 7 additions & 0 deletions src/SiteConfig/SiteConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,13 @@ class SiteConfig
*/
public $login_extra_fields = [];

/**
* Explicitly skip getting data from JSON-LD.
*
* Set from the `skip_json_ld` site config command, which ConfigBuilder::parseLines()
* stores as a boolean (true when the value is "yes" or "true", false otherwise).
* ContentExtractor::process() only calls extractJsonLdInformation() when this is
* strictly false.
*
* NOTE(review): ConfigBuilder::mergeConfig() copies this value from the new config
* only when the current value is null; with a default of false that condition may
* never be met - confirm whether the default should be null instead.
*
* @var bool
*/
public $skip_json_ld = false;

protected $default_tidy = true; // used if undeclared
protected $default_autodetect_on_failure = true; // used if undeclared
protected $default_prune = true; // used if undeclared
Expand Down
23 changes: 23 additions & 0 deletions tests/Extractor/ContentExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,29 @@ public function testJsonLd()
$this->assertContains('<p>hihi</p>', $content_block->ownerDocument->saveXML($content_block));
}

/**
 * Checks that JSON-LD metadata is ignored when the site config sets skip_json_ld.
 *
 * The fixture page embeds a JSON-LD NewsArticle (headline, datePublished, author)
 * next to a regular article body. With skip_json_ld enabled the test expects no
 * title, date or authors, while the body is still found in the extracted content.
 */
public function testJsonLdSkipper()
{
    // Site config that explicitly opts out of JSON-LD extraction.
    $siteConfig = new SiteConfig();
    $siteConfig->skip_json_ld = true;

    $html = '<html><script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><body><div>hello !hello !hello !hello !hello !hello !hello !<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>';

    $extractor = new ContentExtractor(self::$contentExtractorConfig);
    $extracted = $extractor->process($html, 'https://skipjsonld.io/jsonld', $siteConfig);

    $this->assertTrue($extracted, 'Extraction went well');

    $contentBlock = $extractor->getContent();

    // None of the metadata declared in the JSON-LD block is expected to be picked up.
    $this->assertEmpty($extractor->getTitle());
    $this->assertNull($extractor->getDate());
    $this->assertEmpty($extractor->getAuthors());

    // The article body itself is still expected in the extracted content.
    $serialized = $contentBlock->ownerDocument->saveXML($contentBlock);
    $this->assertContains('this is the best part of the show', $serialized);
}

public function testJsonLdName()
{
$contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
Expand Down

0 comments on commit 8f68263

Please sign in to comment.