diff --git a/README.md b/README.md index 0dca42ad..ebf0815c 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,7 @@ These checks are available in the package. You can add or remove checks in the c ✅ The page contains no broken links.
✅ The page contains no broken images.
✅ Length of the content is at least 2100 characters.
+✅ No more than 20% of the sentences are too long (more than 20 words).
### Meta diff --git a/resources/lang/en.json b/resources/lang/en.json index b4255720..cf7c144b 100644 --- a/resources/lang/en.json +++ b/resources/lang/en.json @@ -29,5 +29,6 @@ "failed.performance.javascript_size": "The page contains Javascript files that are too large (max :expectedValue). These files were found: :actualValue.", "failed.performance.response": "The page returned a response code other than :expectedValue. The actual response code was :actualValue.", "failed.performance.ttfb": "The page took too long to load (max :expectedValuems). The actual time was :actualValuems.", - "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page." + "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page.", + "failed.content.too_long_sentence": "The page contains :actualValue sentences that are too long." } \ No newline at end of file diff --git a/resources/lang/nl.json b/resources/lang/nl.json index 8cc4583c..8b1fcd39 100644 --- a/resources/lang/nl.json +++ b/resources/lang/nl.json @@ -27,5 +27,6 @@ "failed.performance.javascript_size": "The page contains Javascript files that are too large (max :expectedValue). These files were found: :actualValue.", "failed.performance.response": "The page returned a response code other than :expectedValue. The actual response code was :actualValue.", "failed.performance.ttfb": "The page took too long to load (max :expectedValuems). The actual time was :actualValuems.", - "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page." + "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page.", + "failed.content.too_long_sentence": "The page contains :actualValue sentences that are too long." 
/**
 * Validate that at most 20% of the page's sentences are "too long".
 *
 * Text is read from the crawler, split into sentences on dots, and every
 * sentence with more than 20 words is collected. The collected sentences are
 * stored in $this->actualValue; when they make up more than 20% of all
 * sentences, $this->failureReason is set and the check fails.
 *
 * @param Crawler $crawler Crawler holding the parsed page.
 *
 * @return bool True when the page passes, false when too many sentences are too long.
 */
public function validateContent(Crawler $crawler): bool
{
    $sentences = $this->separateSentencesByDot(
        $this->getSentencesFromCrawler($crawler)
    );

    // Computed once and reused for both the stored value and the ratio.
    $tooLongSentences = $this->calculateSentencesWithTooManyWords($sentences);

    $this->actualValue = $tooLongSentences;

    // Also protects the division below: an empty $sentences list can never
    // produce a non-empty $tooLongSentences list.
    if (count($tooLongSentences) === 0) {
        return true;
    }

    // If more than 20% of the total sentences are too long, fail.
    if (count($tooLongSentences) / count($sentences) > 0.2) {
        $this->failureReason = __('failed.content.too_long_sentence', [
            'actualValue' => count($tooLongSentences),
        ]);

        return false;
    }

    return true;
}

/**
 * Split every text fragment on "." and return the non-blank pieces.
 *
 * Each piece is trimmed, and pieces that are empty after trimming are
 * dropped, so whitespace left behind by a trailing dot is not counted as a
 * sentence. A piece consisting of the text "0" is kept.
 *
 * NOTE(review): only "." is treated as a sentence boundary; "!" and "?" are
 * not, and dots inside numbers or abbreviations also split. Confirm whether
 * that is intended.
 *
 * @param array $sentences Text fragments, expected to be strings.
 *
 * @return array Sequentially indexed list of trimmed, non-empty sentences.
 */
private function separateSentencesByDot(array $sentences): array
{
    $separated = [];

    foreach ($sentences as $sentence) {
        foreach (explode('.', (string) $sentence) as $fragment) {
            $fragment = trim($fragment);

            if ($fragment !== '') {
                $separated[] = $fragment;
            }
        }
    }

    return $separated;
}

/**
 * Collect the text of the non-blank text nodes found below the page body.
 *
 * @param Crawler $crawler Crawler holding the parsed page.
 *
 * @return array List of text strings; empty when the page has no <body>.
 */
private function getSentencesFromCrawler(Crawler $crawler): array
{
    $body = $crawler->filterXPath('//body');

    // Calling children() on an empty selection is expected to raise in
    // DomCrawler, so a page without a <body> is treated as having no text.
    if ($body->count() === 0) {
        return [];
    }

    // Get all elements that contain text
    $textNodes = $body->children()->filterXPath('//*/text()[normalize-space()]');

    return $textNodes->each(function (Crawler $node) {
        return $node->text();
    });
}

/**
 * Return the sentences that contain more than $maxWords words.
 *
 * @param array $sentences Sentences to inspect.
 * @param int   $maxWords  Highest word count that is still acceptable.
 *
 * @return array The sentences whose word count exceeds $maxWords, in input order.
 */
private function calculateSentencesWithTooManyWords(array $sentences, int $maxWords = 20): array
{
    $tooLongSentences = [];

    foreach ($sentences as $sentence) {
        if ($this->countWords((string) $sentence) > $maxWords) {
            $tooLongSentences[] = $sentence;
        }
    }

    return $tooLongSentences;
}

/**
 * Count the words in a sentence, treating non-ASCII letters as letters.
 *
 * A word starts with a letter and may continue with letters, combining
 * marks, apostrophes and hyphens. str_word_count() is byte-based and would
 * split a word such as "coördinatie" in two.
 *
 * @param string $sentence Sentence to count words in.
 *
 * @return int Number of words found.
 */
private function countWords(string $sentence): int
{
    $matched = preg_match_all('/\p{L}[\p{L}\p{M}\'\x{2019}-]*/u', $sentence);

    // preg_match_all() returns false on failure (e.g. malformed UTF-8 with
    // the /u modifier); fall back to the byte-based counter in that case.
    if ($matched === false) {
        return str_word_count($sentence);
    }

    return $matched;
}

'.$body.'

+ ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $this->assertFalse($check->check(Http::get('vormkracht10.nl'), $crawler)); +}); + +it('can perform the too long sentence check on page with no too long sentence', function () { + $check = new TooLongSentenceCheck(); + $crawler = new Crawler(); + + $body = 'One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen'; + + Http::fake([ + 'vormkracht10.nl' => Http::response( + ' + + Test + + +

'.$body.'

+ ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $check->check(Http::get('vormkracht10.nl'), $crawler); + + $this->assertTrue($check->check(Http::get('vormkracht10.nl'), $crawler)); +}); + +it('can perform the too long sentence check on page with no body', function () { + $check = new TooLongSentenceCheck(); + $crawler = new Crawler(); + + Http::fake([ + 'vormkracht10.nl' => Http::response( + ' + + Test + + ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $check->check(Http::get('vormkracht10.nl'), $crawler); + + $this->assertTrue($check->check(Http::get('vormkracht10.nl'), $crawler)); +});