Use graby ContentExtractor to clean html

It might be better to reuse some of graby's existing functionality to clean HTML instead of building a new system.
2017-05-12 07:53:21 +02:00
parent fb436e8ca0
commit 74a75f7d43
4 changed files with 66 additions and 2 deletions
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -47,6 +47,16 @@ class ContentProxy
    {
        // ensure content is a bit cleaned up
        if (!empty($content['html'])) {
+            $extractor = $this->graby->getExtractor();
+            $contentExtracted = $extractor->process($content['html'], $url);
+
+            if ($contentExtracted) {
+                $contentBlock = $extractor->getContent();
+                $contentBlock->normalize();
+
+                $content['html'] = trim($contentBlock->innerHTML);
+            }
+
            $content['html'] = htmLawed($content['html'], [
                'safe' => 1,
                // which means: do not remove iframe elements