From 0fe05664db27eb226c6c626543b89525de2c8220 Mon Sep 17 00:00:00 2001 From: Robert Kaiser Date: Sun, 17 Mar 2024 22:34:30 +0100 Subject: [PATCH] improve loading of documents into HTML5 --- classes/document.php-class | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/classes/document.php-class b/classes/document.php-class index 211ae44..ebde6ec 100755 --- a/classes/document.php-class +++ b/classes/document.php-class @@ -284,7 +284,7 @@ class ExtendedDocument extends DOMDocument { static function initHTML5($doc = null) { if (is_null($doc)) { $doc = new ExtendedDocument(); } - $doc->loadHTML5(''); // this seems to be the only way to get the DOCTYPE set properly. + $doc->loadHTML5(''."\n".''."\n".''); // this seems to be the only way to get the DOCTYPE set properly. // Created basic HTML document structure. $root = $doc->getElementsByTagName('html')->item(0); @@ -302,11 +302,18 @@ class ExtendedDocument extends DOMDocument { public function loadHTML5($source) { // Do our own handling of DOMDocument error reporting so we can ignore "unknown tags" which are usually fine in HTML5. libxml_use_internal_errors(true); - if (!preg_match('/^\s*'."\n".$source; } $result = $this->loadHTML($source); + // Set encoding directly and remove any processing node that isn't the first node + $this->encoding = 'utf-8'; + foreach ($this->childNodes as $i => $child) { + if ($i && $child->nodeType == XML_PI_NODE) { + $this->removeChild($child); + } + } // Handle DOMDocument loading errors, throw away warnings on unknown tags as HTML5 allows all kinds. $errseverity = array(LIBXML_ERR_WARNING => 'Warning', LIBXML_ERR_ERROR => 'Error', LIBXML_ERR_FATAL => 'Fatal'); foreach (libxml_get_errors() as $error) { @@ -439,7 +446,7 @@ class ExtendedDocument extends DOMDocument { // Use loadHTML5() to parse and turn the markup into proper HTML. $tmpdoc = new ExtendedDocument; // The XML line is needed to tell the parser that we need UTF-8 parsing. 
- $tmpdoc->loadHTML5(''.$htmldata.''); + $tmpdoc->loadHTML5(''."\n".''."\n".''.$htmldata.''); foreach ($tmpdoc->getElementsByTagName('body')->item(0)->childNodes as $child) { $parentNode->appendChild($this->importNode($child, true)); } -- 2.35.3