mailcow/src/mailcow-dockerized/data/web/inc/lib/vendor/soundasleep/html2text/src/Html2Text.php - kubeia - Gitiles

 <?php
 /******************************************************************************
  * Copyright (c) 2010 Jevon Wright and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * or
  *
  * LGPL which is available at http://www.gnu.org/licenses/lgpl.html
  *
  *
  * Contributors:
  *    Jevon Wright - initial API and implementation
  ****************************************************************************/

 namespace Html2Text;

 class Html2Text {

 	/**
 	 * Tries to convert the given HTML into a plain text format - best suited for
 	 * e-mail display, etc.
 	 *
 	 * <p>In particular, it tries to maintain the following features:
 	 * <ul>
 	 *   <li>Links are maintained, with the 'href' copied over
 	 *   <li>Information in the &lt;head&gt; is lost
 	 * </ul>
 	 *
 	 * @param string $html the input HTML
 	 * @param boolean $ignore_error Ignore xml parsing errors
 	 * @return string the HTML converted, as best as possible, to text
 	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
 	 */
 	public static function convert($html, $ignore_error = false) {
 		// replace non-breaking spaces (entity form and raw UTF-8 bytes) with plain spaces
 		$html = str_replace("&nbsp;", " ", $html);
 		$html = str_replace("\xc2\xa0", " ", $html);

 		$is_office_document = static::isOfficeDocument($html);

 		if ($is_office_document) {
 			// remove office namespace
 			$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
 		}

 		$html = static::fixNewlines($html);
 		if (mb_detect_encoding($html, "UTF-8", true)) {
 			// Turn every non-ASCII character into a numeric entity so that loadHTML()
 			// does not have to guess the charset. This replaces
 			// mb_convert_encoding(..., "HTML-ENTITIES", ...), deprecated since PHP 8.2;
 			// ASCII (including markup and existing entities) is left untouched.
 			$html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
 		}

 		$doc = static::getDocument($html, $ignore_error);

 		$output = static::iterateOverNode($doc, null, false, $is_office_document);

 		// remove leading and trailing spaces on each line
 		$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
 		$output = preg_replace("/ *\t */im", "\t", $output);

 		// unarmor pre blocks (their newlines were protected as \r by iterateOverNode)
 		$output = str_replace("\r", "\n", $output);

 		// remove unnecessary empty lines
 		$output = preg_replace("/\n\n\n*/im", "\n\n", $output);

 		// remove leading and trailing whitespace
 		return trim($output);
 	}

 	/**
 	 * Unify newlines; in particular, \r\n becomes \n, and
 	 * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
 	 * all become \ns.
 	 *
 	 * @param string $text text with any number of \r, \r\n and \n combinations
 	 * @return string the fixed text
 	 */
 	public static function fixNewlines($text) {
 		// str_replace applies the searches in order: \r\n first, then any lone \r
 		return str_replace(array("\r\n", "\r"), "\n", $text);
 	}

 	/**
 	 * Parse HTML into a DOMDocument
 	 *
 	 * @param string $html the input HTML
 	 * @param boolean $ignore_error Ignore xml parsing errors
 	 * @return \DOMDocument the parsed document tree
 	 * @throws Html2TextException if loadHTML() reports failure
 	 */
 	public static function getDocument($html, $ignore_error = false) {

 		$doc = new \DOMDocument();

 		$html = trim($html);

 		// Compare against the empty string explicitly: a plain truthiness test
 		// would also discard the perfectly valid input "0".
 		if ($html === '') {
 			// DOMDocument doesn't support empty value and throws an error
 			// Return empty document instead
 			return $doc;
 		}

 		if ($html[0] !== '<') {
 			// If HTML does not begin with a tag, we put a body tag around it.
 			// If we do not do this, PHP will insert a paragraph tag around
 			// the first block of text for some reason which can mess up
 			// the newlines. See pre.html test for an example.
 			$html = '<body>' . $html . '</body>';
 		}

 		if ($ignore_error) {
 			$doc->strictErrorChecking = false;
 			$doc->recover = true;
 			$doc->xmlStandalone = true;
 			// collect libxml errors internally for this call only, then restore the caller's setting
 			$old_internal_errors = libxml_use_internal_errors(true);
 			$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
 			libxml_use_internal_errors($old_internal_errors);
 		}
 		else {
 			$load_result = $doc->loadHTML($html);
 		}

 		if (!$load_result) {
 			throw new Html2TextException("Could not load HTML - badly formed?", $html);
 		}

 		return $doc;
 	}

 	/**
 	 * Can we guess that this HTML is generated by Microsoft Office?
 	 *
 	 * @param string $html the input HTML
 	 * @return boolean true if the Office schema URN occurs anywhere in the input
 	 */
 	public static function isOfficeDocument($html) {
 		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
 	}

 	/**
 	 * Is the given text made up solely of newlines, carriage returns, tabs and spaces?
 	 *
 	 * @param string $text the text to inspect
 	 * @return boolean true if nothing remains after stripping those characters
 	 */
 	public static function isWhitespace($text) {
 		return strlen(trim($text, "\n\r\t ")) === 0;
 	}

 	/**
 	 * Find the lower-cased node name of the next meaningful sibling of $node:
 	 * the first following sibling that is an element or a non-whitespace text node.
 	 *
 	 * @param \DOMNode $node the node whose following siblings are scanned
 	 * @return string|null the sibling's node name, or null if there is none
 	 */
 	public static function nextChildName($node) {
 		$nextNode = $node->nextSibling;
 		while ($nextNode !== null) {
 			if ($nextNode instanceof \DOMElement) {
 				break;
 			}
 			if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
 				break;
 			}
 			$nextNode = $nextNode->nextSibling;
 		}

 		// the loop only stops on an element, a non-blank text node, or the end of the list
 		if ($nextNode === null) {
 			return null;
 		}

 		return strtolower($nextNode->nodeName);
 	}

 	/**
 	 * Recursively render a DOM node and its descendants as plain text.
 	 *
 	 * Note that child nodes are removed from $node while they are processed,
 	 * so the tree passed in is consumed by this call.
 	 *
 	 * @param \DOMNode $node the node to render
 	 * @param string|null $prevName lower-cased name of the previous visible sibling, if any
 	 * @param boolean $in_pre whether $node lies inside a &lt;pre&gt; element
 	 * @param boolean $is_office_document whether the input looked like Microsoft Office HTML
 	 * @return string the text for this subtree; newlines inside pre blocks are
 	 *                armored as \r and restored by convert()
 	 */
 	public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

 		if ($node instanceof \DOMText) {
 			if ($in_pre) {
 				$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
 				// Remove trailing whitespace only
 				$text = preg_replace("/[ \t]*\n/im", "\n", $text);
 				// armor newlines with \r.
 				return str_replace("\n", "\r", $text);
 			}

 			// Replace runs of whitespace characters with a single space (equivalent to \s)
 			$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
 			if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
 				// text directly following a block element starts on its own line
 				return "\n" . $text;
 			}
 			return $text;
 		}
 		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
 			// ignore
 			return "";
 		}

 		$name = strtolower($node->nodeName);
 		// must be read before the children loop below starts removing nodes
 		$nextName = static::nextChildName($node);

 		// start whitespace
 		switch ($name) {
 			case "hr":
 				$prefix = ($prevName !== null) ? "\n" : '';
 				return $prefix . "---------------------------------------------------------------\n";

 			case "style":
 			case "head":
 			case "title":
 			case "meta":
 			case "script":
 				// ignore these tags
 				return "";

 			case "h1":
 			case "h2":
 			case "h3":
 			case "h4":
 			case "h5":
 			case "h6":
 			case "ol":
 			case "ul":
 				// add two newlines, second line is added below
 				$output = "\n";
 				break;

 			case "td":
 			case "th":
 				// add tab char to separate table fields
 				$output = "\t";
 				break;

 			case "p":
 				// Microsoft exchange emails often include HTML which, when passed through
 				// html2text, results in lots of double line returns everywhere.
 				//
 				// To fix this, for any p element with a className of `MsoNormal` (the standard
 				// classname in any Microsoft export or outlook for a paragraph that behaves
 				// like a line return) we skip the first line returns and set the name to br.
 				if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
 					$output = "";
 					$name = 'br';
 					break;
 				}
 				// add two lines
 				$output = "\n\n";
 				break;

 			case "pre":
 			case "tr":
 			case "div":
 				// add one line
 				$output = "\n";
 				break;

 			case "li":
 				$output = "- ";
 				break;

 			default:
 				// print out contents of unknown tags
 				$output = "";
 				break;
 		}

 		if (isset($node->childNodes)) {

 			$n = $node->childNodes->item(0);
 			$previousSiblingName = null;

 			while ($n != null) {

 				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);

 				// Pass current node name to next child, as previousSibling does not appear to get populated.
 				// Doctypes, processing instructions and blank text are invisible and keep the old name.
 				$is_invisible = $n instanceof \DOMDocumentType
 					|| $n instanceof \DOMProcessingInstruction
 					|| ($n instanceof \DOMText && static::isWhitespace($text));
 				if (!$is_invisible) {
 					$previousSiblingName = strtolower($n->nodeName);
 				}

 				// consume the child; the next one becomes item(0)
 				$node->removeChild($n);
 				$n = $node->childNodes->item(0);

 				// suppress last br tag inside a node list
 				if ($n != null || $previousSiblingName != 'br') {
 					$output .= $text;
 				}
 			}
 		}

 		// end whitespace
 		switch ($name) {
 			case "h1":
 			case "h2":
 			case "h3":
 			case "h4":
 			case "h5":
 			case "h6":
 				$output .= "\n";
 				break;

 			case "p":
 				// add two lines
 				$output .= "\n\n";
 				break;

 			case "pre":
 			case "br":
 				// add one line
 				$output .= "\n";
 				break;

 			case "div":
 				break;

 			case "a":
 				// links are returned in [text](link) format
 				$href = $node->getAttribute("href");
 				$title = $node->getAttribute("title");

 				$output = trim($output);

 				// remove double [[ ]] s from linking images
 				if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
 					$output = substr($output, 1, strlen($output) - 2);

 					// for linking images, the title of the <a> overrides the title of the <img>
 					if ($title !== '') {
 						$output = $title;
 					}
 				}

 				// if there is no link text, but a title attr
 				// (explicit '' checks so that the literal text "0" is not treated as missing)
 				if ($output === '' && $title !== '') {
 					$output = $title;
 				}

 				if ($href === '') {
 					// it doesn't link anywhere
 					if ($node->getAttribute("name") !== '') {
 						$output = "[$output]";
 					}
 				} elseif ($href !== $output && $href !== "mailto:$output" && $href !== "http://$output" && $href !== "https://$output") {
 					// the text differs from the target, so show both;
 					// otherwise the text already is the address and is kept as-is
 					$output = ($output !== '') ? "[$output]($href)" : $href;
 				}

 				// does the next node require additional whitespace?
 				switch ($nextName) {
 					case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
 						$output .= "\n";
 						break;
 				}
 				break;

 			case "img":
 				// images are rendered as [title], falling back to [alt], else nothing
 				$label = $node->getAttribute("title");
 				if ($label === '') {
 					$label = $node->getAttribute("alt");
 				}
 				$output = ($label !== '') ? "[" . $label . "]" : "";
 				break;

 			case "li":
 				$output .= "\n";
 				break;

 			default:
 				// do nothing
 		}

 		return $output;
 	}
 }
	<?php
	/******************************************************************************
	* Copyright (c) 2010 Jevon Wright and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* or
	*
	* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
	*
	*
	* Contributors:
	* Jevon Wright - initial API and implementation
	****************************************************************************/

	namespace Html2Text;

	class Html2Text {

		/**
		 * Tries to convert the given HTML into a plain text format - best suited for
		 * e-mail display, etc.
		 *
		 * <p>In particular, it tries to maintain the following features:
		 * <ul>
		 *   <li>Links are maintained, with the 'href' copied over
		 *   <li>Information in the &lt;head&gt; is lost
		 * </ul>
		 *
		 * @param string $html the input HTML
		 * @param boolean $ignore_error Ignore xml parsing errors
		 * @return string the HTML converted, as best as possible, to text
		 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
		 */
		public static function convert($html, $ignore_error = false) {
			// replace non-breaking spaces (entity form and raw UTF-8 bytes) with plain spaces
			$html = str_replace("&nbsp;", " ", $html);
			$html = str_replace("\xc2\xa0", " ", $html);

			$is_office_document = static::isOfficeDocument($html);

			if ($is_office_document) {
				// remove office namespace
				$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
			}

			$html = static::fixNewlines($html);
			if (mb_detect_encoding($html, "UTF-8", true)) {
				// Turn every non-ASCII character into a numeric entity so that loadHTML()
				// does not have to guess the charset. This replaces
				// mb_convert_encoding(..., "HTML-ENTITIES", ...), deprecated since PHP 8.2;
				// ASCII (including markup and existing entities) is left untouched.
				$html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
			}

			$doc = static::getDocument($html, $ignore_error);

			$output = static::iterateOverNode($doc, null, false, $is_office_document);

			// remove leading and trailing spaces on each line
			$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
			$output = preg_replace("/ *\t */im", "\t", $output);

			// unarmor pre blocks (their newlines were protected as \r by iterateOverNode)
			$output = str_replace("\r", "\n", $output);

			// remove unnecessary empty lines
			$output = preg_replace("/\n\n\n*/im", "\n\n", $output);

			// remove leading and trailing whitespace
			return trim($output);
		}

		/**
		 * Unify newlines; in particular, \r\n becomes \n, and
		 * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
		 * all become \ns.
		 *
		 * @param string $text text with any number of \r, \r\n and \n combinations
		 * @return string the fixed text
		 */
		public static function fixNewlines($text) {
			// str_replace applies the searches in order: \r\n first, then any lone \r
			return str_replace(array("\r\n", "\r"), "\n", $text);
		}

		/**
		 * Parse HTML into a DOMDocument
		 *
		 * @param string $html the input HTML
		 * @param boolean $ignore_error Ignore xml parsing errors
		 * @return \DOMDocument the parsed document tree
		 * @throws Html2TextException if loadHTML() reports failure
		 */
		public static function getDocument($html, $ignore_error = false) {

			$doc = new \DOMDocument();

			$html = trim($html);

			// Compare against the empty string explicitly: a plain truthiness test
			// would also discard the perfectly valid input "0".
			if ($html === '') {
				// DOMDocument doesn't support empty value and throws an error
				// Return empty document instead
				return $doc;
			}

			if ($html[0] !== '<') {
				// If HTML does not begin with a tag, we put a body tag around it.
				// If we do not do this, PHP will insert a paragraph tag around
				// the first block of text for some reason which can mess up
				// the newlines. See pre.html test for an example.
				$html = '<body>' . $html . '</body>';
			}

			if ($ignore_error) {
				$doc->strictErrorChecking = false;
				$doc->recover = true;
				$doc->xmlStandalone = true;
				// collect libxml errors internally for this call only, then restore the caller's setting
				$old_internal_errors = libxml_use_internal_errors(true);
				$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
				libxml_use_internal_errors($old_internal_errors);
			}
			else {
				$load_result = $doc->loadHTML($html);
			}

			if (!$load_result) {
				throw new Html2TextException("Could not load HTML - badly formed?", $html);
			}

			return $doc;
		}

		/**
		 * Can we guess that this HTML is generated by Microsoft Office?
		 *
		 * @param string $html the input HTML
		 * @return boolean true if the Office schema URN occurs anywhere in the input
		 */
		public static function isOfficeDocument($html) {
			return strpos($html, "urn:schemas-microsoft-com:office") !== false;
		}

		/**
		 * Is the given text made up solely of newlines, carriage returns, tabs and spaces?
		 *
		 * @param string $text the text to inspect
		 * @return boolean true if nothing remains after stripping those characters
		 */
		public static function isWhitespace($text) {
			return strlen(trim($text, "\n\r\t ")) === 0;
		}

		/**
		 * Find the lower-cased node name of the next meaningful sibling of $node:
		 * the first following sibling that is an element or a non-whitespace text node.
		 *
		 * @param \DOMNode $node the node whose following siblings are scanned
		 * @return string|null the sibling's node name, or null if there is none
		 */
		public static function nextChildName($node) {
			$nextNode = $node->nextSibling;
			while ($nextNode !== null) {
				if ($nextNode instanceof \DOMElement) {
					break;
				}
				if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
					break;
				}
				$nextNode = $nextNode->nextSibling;
			}

			// the loop only stops on an element, a non-blank text node, or the end of the list
			if ($nextNode === null) {
				return null;
			}

			return strtolower($nextNode->nodeName);
		}

		/**
		 * Recursively render a DOM node and its descendants as plain text.
		 *
		 * Note that child nodes are removed from $node while they are processed,
		 * so the tree passed in is consumed by this call.
		 *
		 * @param \DOMNode $node the node to render
		 * @param string|null $prevName lower-cased name of the previous visible sibling, if any
		 * @param boolean $in_pre whether $node lies inside a &lt;pre&gt; element
		 * @param boolean $is_office_document whether the input looked like Microsoft Office HTML
		 * @return string the text for this subtree; newlines inside pre blocks are
		 *                armored as \r and restored by convert()
		 */
		public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

			if ($node instanceof \DOMText) {
				if ($in_pre) {
					$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
					// Remove trailing whitespace only
					$text = preg_replace("/[ \t]*\n/im", "\n", $text);
					// armor newlines with \r.
					return str_replace("\n", "\r", $text);
				}

				// Replace runs of whitespace characters with a single space (equivalent to \s)
				$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
				if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
					// text directly following a block element starts on its own line
					return "\n" . $text;
				}
				return $text;
			}
			if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
				// ignore
				return "";
			}

			$name = strtolower($node->nodeName);
			// must be read before the children loop below starts removing nodes
			$nextName = static::nextChildName($node);

			// start whitespace
			switch ($name) {
				case "hr":
					$prefix = ($prevName !== null) ? "\n" : '';
					return $prefix . "---------------------------------------------------------------\n";

				case "style":
				case "head":
				case "title":
				case "meta":
				case "script":
					// ignore these tags
					return "";

				case "h1":
				case "h2":
				case "h3":
				case "h4":
				case "h5":
				case "h6":
				case "ol":
				case "ul":
					// add two newlines, second line is added below
					$output = "\n";
					break;

				case "td":
				case "th":
					// add tab char to separate table fields
					$output = "\t";
					break;

				case "p":
					// Microsoft exchange emails often include HTML which, when passed through
					// html2text, results in lots of double line returns everywhere.
					//
					// To fix this, for any p element with a className of `MsoNormal` (the standard
					// classname in any Microsoft export or outlook for a paragraph that behaves
					// like a line return) we skip the first line returns and set the name to br.
					if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
						$output = "";
						$name = 'br';
						break;
					}
					// add two lines
					$output = "\n\n";
					break;

				case "pre":
				case "tr":
				case "div":
					// add one line
					$output = "\n";
					break;

				case "li":
					$output = "- ";
					break;

				default:
					// print out contents of unknown tags
					$output = "";
					break;
			}

			if (isset($node->childNodes)) {

				$n = $node->childNodes->item(0);
				$previousSiblingName = null;

				while ($n != null) {

					$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);

					// Pass current node name to next child, as previousSibling does not appear to get populated.
					// Doctypes, processing instructions and blank text are invisible and keep the old name.
					$is_invisible = $n instanceof \DOMDocumentType
						|| $n instanceof \DOMProcessingInstruction
						|| ($n instanceof \DOMText && static::isWhitespace($text));
					if (!$is_invisible) {
						$previousSiblingName = strtolower($n->nodeName);
					}

					// consume the child; the next one becomes item(0)
					$node->removeChild($n);
					$n = $node->childNodes->item(0);

					// suppress last br tag inside a node list
					if ($n != null || $previousSiblingName != 'br') {
						$output .= $text;
					}
				}
			}

			// end whitespace
			switch ($name) {
				case "h1":
				case "h2":
				case "h3":
				case "h4":
				case "h5":
				case "h6":
					$output .= "\n";
					break;

				case "p":
					// add two lines
					$output .= "\n\n";
					break;

				case "pre":
				case "br":
					// add one line
					$output .= "\n";
					break;

				case "div":
					break;

				case "a":
					// links are returned in [text](link) format
					$href = $node->getAttribute("href");
					$title = $node->getAttribute("title");

					$output = trim($output);

					// remove double [[ ]] s from linking images
					if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
						$output = substr($output, 1, strlen($output) - 2);

						// for linking images, the title of the <a> overrides the title of the <img>
						if ($title !== '') {
							$output = $title;
						}
					}

					// if there is no link text, but a title attr
					// (explicit '' checks so that the literal text "0" is not treated as missing)
					if ($output === '' && $title !== '') {
						$output = $title;
					}

					if ($href === '') {
						// it doesn't link anywhere
						if ($node->getAttribute("name") !== '') {
							$output = "[$output]";
						}
					} elseif ($href !== $output && $href !== "mailto:$output" && $href !== "http://$output" && $href !== "https://$output") {
						// the text differs from the target, so show both;
						// otherwise the text already is the address and is kept as-is
						$output = ($output !== '') ? "[$output]($href)" : $href;
					}

					// does the next node require additional whitespace?
					switch ($nextName) {
						case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
							$output .= "\n";
							break;
					}
					break;

				case "img":
					// images are rendered as [title], falling back to [alt], else nothing
					$label = $node->getAttribute("title");
					if ($label === '') {
						$label = $node->getAttribute("alt");
					}
					$output = ($label !== '') ? "[" . $label . "]" : "";
					break;

				case "li":
					$output .= "\n";
					break;

				default:
					// do nothing
			}

			return $output;
		}
	}