static function initHTML5($doc = null) {
if (is_null($doc)) { $doc = new ExtendedDocument(); }
- $doc->loadHTML5('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html></html>'); // this seems to be the only way to get the DOCTYPE set properly.
+ $doc->loadHTML5('<?xml version="1.0" encoding="utf-8"?>'."\n".'<!DOCTYPE html>'."\n".'<html></html>'); // this seems to be the only way to get the DOCTYPE set properly.
// Created basic HTML document structure.
$root = $doc->getElementsByTagName('html')->item(0);
public function loadHTML5($source) {
// Do our own handling of DOMDocument error reporting so we can ignore "unknown tags" which are usually fine in HTML5.
libxml_use_internal_errors(true);
- if (!preg_match('/^\s*<?xml /', $source)) {
+ if (!preg_match('/^\s*<\?xml /', $source)) {
// Add an XML declaration to force DOMDocument into UTF-8 mode.
$source = '<?xml version="1.0" encoding="utf-8"?>'."\n".$source;
}
$result = $this->loadHTML($source);
+ // Set encoding directly and remove any processing instruction node that isn't the first node
+ $this->encoding = 'utf-8';
+ foreach ($this->childNodes as $i => $child) {
+ if ($i && $child->nodeType == XML_PI_NODE) {
+ $this->removeChild($child);
+ }
+ }
// Handle DOMDocument loading errors, throw away warnings on unknown tags as HTML5 allows all kinds.
$errseverity = array(LIBXML_ERR_WARNING => 'Warning', LIBXML_ERR_ERROR => 'Error', LIBXML_ERR_FATAL => 'Fatal');
foreach (libxml_get_errors() as $error) {
// Use loadHTML5() to parse and turn the markup into proper HTML.
$tmpdoc = new ExtendedDocument;
// The XML line is needed to tell the parser that we need UTF-8 parsing.
- $tmpdoc->loadHTML5('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html><body>'.$htmldata.'</body></html>');
+ $tmpdoc->loadHTML5('<?xml version="1.0" encoding="utf-8"?>'."\n".'<!DOCTYPE html>'."\n".'<html><body>'.$htmldata.'</body></html>');
foreach ($tmpdoc->getElementsByTagName('body')->item(0)->childNodes as $child) {
$parentNode->appendChild($this->importNode($child, true));
}