improve loading of documents into HTML5

author Robert Kaiser <kairo@kairo.at>

Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)

committer Robert Kaiser <kairo@kairo.at>

Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)
author Robert Kaiser <kairo@kairo.at>
Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)
committer Robert Kaiser <kairo@kairo.at>
Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)
diff --git a/classes/document.php-class b/classes/document.php-class

index 211ae44116cd35bd7ca3e74328591b94cad33576..ebde6ec965f8ec52f16bbfaa6f40610519351e7e 100755 (executable)
--- a/classes/document.php-class
+++ b/classes/document.php-class
@@ -284,7 +284,7 @@ class ExtendedDocument extends DOMDocument {
  
    static function initHTML5($doc = null) {
      if (is_null($doc)) { $doc = new ExtendedDocument(); }
-    $doc->loadHTML5('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html></html>'); // this seems to be the only way to get the DOCTYPE set properly.
+    $doc->loadHTML5('<?xml version="1.0" encoding="utf-8"?>'."\n".'<!DOCTYPE html>'."\n".'<html></html>'); // this seems to be the only way to get the DOCTYPE set properly.
  
      // Created basic HTML document structure.
      $root = $doc->getElementsByTagName('html')->item(0);
@@ -302,11 +302,18 @@ class ExtendedDocument extends DOMDocument {
    public function loadHTML5($source) {
      // Do our own handling of DOMDocument error reporting so we can ignore "unknown tags" which are usually fine in HTML5.
      libxml_use_internal_errors(true);
-    if (!preg_match('/^\s*<?xml /', $source)) {
+    if (!preg_match('/^\s*<\?xml /', $source)) {
        // Add an XML declaration to force DOMDocument into UTF-8 mode.
        $source = '<?xml version="1.0" encoding="utf-8"?>'."\n".$source;
      }
      $result = $this->loadHTML($source);
+    // Set encoding directly and remove any processing instruction node that isn't the first node
+    $this->encoding = 'utf-8';
+    foreach ($this->childNodes as $i => $child) {
+      if ($i && $child->nodeType == XML_PI_NODE) {
+        $this->removeChild($child);
+      }
+    }
      // Handle DOMDocument loading errors, throw away warnings on unknown tags as HTML5 allows all kinds.
      $errseverity = array(LIBXML_ERR_WARNING => 'Warning', LIBXML_ERR_ERROR => 'Error', LIBXML_ERR_FATAL => 'Fatal');
      foreach (libxml_get_errors() as $error) {
@@ -439,7 +446,7 @@ class ExtendedDocument extends DOMDocument {
      // Use loadHTML5() to parse and turn the markup into proper HTML.
      $tmpdoc = new ExtendedDocument;
      // The XML line is needed to tell the parser that we need UTF-8 parsing.
-    $tmpdoc->loadHTML5('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html><body>'.$htmldata.'</body></html>');
+    $tmpdoc->loadHTML5('<?xml version="1.0" encoding="utf-8"?>'."\n".'<!DOCTYPE html>'."\n".'<html><body>'.$htmldata.'</body></html>');
      foreach ($tmpdoc->getElementsByTagName('body')->item(0)->childNodes as $child) {
        $parentNode->appendChild($this->importNode($child, true));
      }
author	Robert Kaiser <kairo@kairo.at>
	Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)
committer	Robert Kaiser <kairo@kairo.at>
	Sun, 17 Mar 2024 21:34:30 +0000 (22:34 +0100)