Blame - mailcow/src/mailcow-dockerized/data/web/inc/lib/vendor/soundasleep/html2text/src/Html2Text.php - kubeia

blob: 8e1f8adcd038aaec8cd244321adfeb85c209ea71 [file] [log] [blame]

Matthias Andreas Benkard	b382b10	2021-01-02 15:32:21 +0100	[diff] [blame^]	1	<?php
				2	/******************************************************************************
				3	* Copyright (c) 2010 Jevon Wright and others.
				4	* All rights reserved. This program and the accompanying materials
				5	* are made available under the terms of the Eclipse Public License v1.0
				6	* which accompanies this distribution, and is available at
				7	* http://www.eclipse.org/legal/epl-v10.html
				8	*
				9	* or
				10	*
				11	* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
				12	*
				13	*
				14	* Contributors:
				15	* Jevon Wright - initial API and implementation
				16	****************************************************************************/
				17
				18	namespace Html2Text;
				19
				20	class Html2Text {
				21
				22	/**
				23	* Tries to convert the given HTML into a plain text format - best suited for
				24	* e-mail display, etc.
				25	*
				26	* <p>In particular, it tries to maintain the following features:
				27	* <ul>
				28	* <li>Links are maintained, with the 'href' copied over
				29	* <li>Information in the <head> is lost
				30	* </ul>
				31	*
				32	* @param string $html the input HTML
				33	* @param boolean $ignore_error Ignore xml parsing errors
				34	* @return string the HTML converted, as best as possible, to text
				35	* @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
				36	*/
				37	public static function convert($html, $ignore_error = false) {
				38	// replace   with spaces
				39	$html = str_replace(" ", " ", $html);
				40	$html = str_replace("\xc2\xa0", " ", $html);
				41
				42	$is_office_document = static::isOfficeDocument($html);
				43
				44	if ($is_office_document) {
				45	// remove office namespace
				46	$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
				47	}
				48
				49	$html = static::fixNewlines($html);
				50	if (mb_detect_encoding($html, "UTF-8", true)) {
				51	$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
				52	}
				53
				54	$doc = static::getDocument($html, $ignore_error);
				55
				56	$output = static::iterateOverNode($doc, null, false, $is_office_document);
				57
				58	// remove leading and trailing spaces on each line
				59	$output = preg_replace("/[ \t]\n[ \t]/im", "\n", $output);
				60	$output = preg_replace("/ \t /im", "\t", $output);
				61
				62	// unarmor pre blocks
				63	$output = str_replace("\r", "\n", $output);
				64
				65	// remove unnecessary empty lines
				66	$output = preg_replace("/\n\n\n*/im", "\n\n", $output);
				67
				68	// remove leading and trailing whitespace
				69	$output = trim($output);
				70
				71	return $output;
				72	}
				73
				74	/**
				75	* Unify newlines; in particular, \r\n becomes \n, and
				76	* then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
				77	* all become \ns.
				78	*
				79	* @param string $text text with any number of \r, \r\n and \n combinations
				80	* @return string the fixed text
				81	*/
				82	static function fixNewlines($text) {
				83	// replace \r\n to \n
				84	$text = str_replace("\r\n", "\n", $text);
				85	// remove \rs
				86	$text = str_replace("\r", "\n", $text);
				87
				88	return $text;
				89	}
				90
				91	/**
				92	* Parse HTML into a DOMDocument
				93	*
				94	* @param string $html the input HTML
				95	* @param boolean $ignore_error Ignore xml parsing errors
				96	* @return DOMDocument the parsed document tree
				97	*/
				98	static function getDocument($html, $ignore_error = false) {
				99
				100	$doc = new \DOMDocument();
				101
				102	$html = trim($html);
				103
				104	if (!$html) {
				105	// DOMDocument doesn't support empty value and throws an error
				106	// Return empty document instead
				107	return $doc;
				108	}
				109
				110	if ($html[0] !== '<') {
				111	// If HTML does not begin with a tag, we put a body tag around it.
				112	// If we do not do this, PHP will insert a paragraph tag around
				113	// the first block of text for some reason which can mess up
				114	// the newlines. See pre.html test for an example.
				115	$html = '<body>' . $html . '</body>';
				116	}
				117
				118	if ($ignore_error) {
				119	$doc->strictErrorChecking = false;
				120	$doc->recover = true;
				121	$doc->xmlStandalone = true;
				122	$old_internal_errors = libxml_use_internal_errors(true);
				123	$load_result = $doc->loadHTML($html, LIBXML_NOWARNING \| LIBXML_NOERROR \| LIBXML_NONET);
				124	libxml_use_internal_errors($old_internal_errors);
				125	}
				126	else {
				127	$load_result = $doc->loadHTML($html);
				128	}
				129
				130	if (!$load_result) {
				131	throw new Html2TextException("Could not load HTML - badly formed?", $html);
				132	}
				133
				134	return $doc;
				135	}
				136
				137	/**
				138	* Can we guess that this HTML is generated by Microsoft Office?
				139	*/
				140	static function isOfficeDocument($html) {
				141	return strpos($html, "urn:schemas-microsoft-com:office") !== false;
				142	}
				143
				144	static function isWhitespace($text) {
				145	return strlen(trim($text, "\n\r\t ")) === 0;
				146	}
				147
				148	static function nextChildName($node) {
				149	// get the next child
				150	$nextNode = $node->nextSibling;
				151	while ($nextNode != null) {
				152	if ($nextNode instanceof \DOMText) {
				153	if (!static::isWhitespace($nextNode->wholeText)) {
				154	break;
				155	}
				156	}
				157	if ($nextNode instanceof \DOMElement) {
				158	break;
				159	}
				160	$nextNode = $nextNode->nextSibling;
				161	}
				162	$nextName = null;
				163	if (($nextNode instanceof \DOMElement \|\| $nextNode instanceof \DOMText) && $nextNode != null) {
				164	$nextName = strtolower($nextNode->nodeName);
				165	}
				166
				167	return $nextName;
				168	}
				169
				170	static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {
				171
				172	if ($node instanceof \DOMText) {
				173	// Replace whitespace characters with a space (equivilant to \s)
				174	if ($in_pre) {
				175	$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
				176	// Remove trailing whitespace only
				177	$text = preg_replace("/[ \t]*\n/im", "\n", $text);
				178	// armor newlines with \r.
				179	return str_replace("\n", "\r", $text);
				180	} else {
				181	$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
				182	if (!static::isWhitespace($text) && ($prevName == 'p' \|\| $prevName == 'div')) {
				183	return "\n" . $text;
				184	}
				185	return $text;
				186	}
				187	}
				188	if ($node instanceof \DOMDocumentType) {
				189	// ignore
				190	return "";
				191	}
				192	if ($node instanceof \DOMProcessingInstruction) {
				193	// ignore
				194	return "";
				195	}
				196
				197	$name = strtolower($node->nodeName);
				198	$nextName = static::nextChildName($node);
				199
				200	// start whitespace
				201	switch ($name) {
				202	case "hr":
				203	$prefix = '';
				204	if ($prevName != null) {
				205	$prefix = "\n";
				206	}
				207	return $prefix . "---------------------------------------------------------------\n";
				208
				209	case "style":
				210	case "head":
				211	case "title":
				212	case "meta":
				213	case "script":
				214	// ignore these tags
				215	return "";
				216
				217	case "h1":
				218	case "h2":
				219	case "h3":
				220	case "h4":
				221	case "h5":
				222	case "h6":
				223	case "ol":
				224	case "ul":
				225	// add two newlines, second line is added below
				226	$output = "\n";
				227	break;
				228
				229	case "td":
				230	case "th":
				231	// add tab char to separate table fields
				232	$output = "\t";
				233	break;
				234
				235	case "p":
				236	// Microsoft exchange emails often include HTML which, when passed through
				237	// html2text, results in lots of double line returns everywhere.
				238	//
				239	// To fix this, for any p element with a className of `MsoNormal` (the standard
				240	// classname in any Microsoft export or outlook for a paragraph that behaves
				241	// like a line return) we skip the first line returns and set the name to br.
				242	if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
				243	$output = "";
				244	$name = 'br';
				245	break;
				246	}
				247	// add two lines
				248	$output = "\n\n";
				249	break;
				250
				251	case "pre":
				252	case "tr":
				253	case "div":
				254	// add one line
				255	$output = "\n";
				256	break;
				257
				258	case "li":
				259	$output = "- ";
				260	break;
				261
				262	default:
				263	// print out contents of unknown tags
				264	$output = "";
				265	break;
				266	}
				267
				268	// debug
				269	//$output .= "[$name,$nextName]";
				270
				271	if (isset($node->childNodes)) {
				272
				273	$n = $node->childNodes->item(0);
				274	$previousSiblingName = null;
				275
				276	while($n != null) {
				277
				278	$text = static::iterateOverNode($n, $previousSiblingName, $in_pre \|\| $name == 'pre', $is_office_document);
				279
				280	// Pass current node name to next child, as previousSibling does not appear to get populated
				281	if ($n instanceof \DOMDocumentType
				282	\|\| $n instanceof \DOMProcessingInstruction
				283	\|\| ($n instanceof \DOMText && static::isWhitespace($text))) {
				284	// Keep current previousSiblingName, these are invisible
				285	}
				286	else {
				287	$previousSiblingName = strtolower($n->nodeName);
				288	}
				289
				290	$node->removeChild($n);
				291	$n = $node->childNodes->item(0);
				292
				293	// suppress last br tag inside a node list
				294	if ($n != null \|\| $previousSiblingName != 'br') {
				295	$output .= $text;
				296	}
				297	}
				298	}
				299
				300	// end whitespace
				301	switch ($name) {
				302	case "h1":
				303	case "h2":
				304	case "h3":
				305	case "h4":
				306	case "h5":
				307	case "h6":
				308	$output .= "\n";
				309	break;
				310
				311	case "p":
				312	// add two lines
				313	$output .= "\n\n";
				314	break;
				315
				316	case "pre":
				317	case "br":
				318	// add one line
				319	$output .= "\n";
				320	break;
				321
				322	case "div":
				323	break;
				324
				325	case "a":
				326	// links are returned in [text](link) format
				327	$href = $node->getAttribute("href");
				328
				329	$output = trim($output);
				330
				331	// remove double [[ ]] s from linking images
				332	if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
				333	$output = substr($output, 1, strlen($output) - 2);
				334
				335	// for linking images, the title of the <a> overrides the title of the <img>
				336	if ($node->getAttribute("title")) {
				337	$output = $node->getAttribute("title");
				338	}
				339	}
				340
				341	// if there is no link text, but a title attr
				342	if (!$output && $node->getAttribute("title")) {
				343	$output = $node->getAttribute("title");
				344	}
				345
				346	if ($href == null) {
				347	// it doesn't link anywhere
				348	if ($node->getAttribute("name") != null) {
				349	$output = "[$output]";
				350	}
				351	} else {
				352	if ($href == $output \|\| $href == "mailto:$output" \|\| $href == "http://$output" \|\| $href == "https://$output") {
				353	// link to the same address: just use link
				354	$output;
				355	} else {
				356	// replace it
				357	if ($output) {
				358	$output = "[$output]($href)";
				359	} else {
				360	// empty string
				361	$output = $href;
				362	}
				363	}
				364	}
				365
				366	// does the next node require additional whitespace?
				367	switch ($nextName) {
				368	case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
				369	$output .= "\n";
				370	break;
				371	}
				372	break;
				373
				374	case "img":
				375	if ($node->getAttribute("title")) {
				376	$output = "[" . $node->getAttribute("title") . "]";
				377	} elseif ($node->getAttribute("alt")) {
				378	$output = "[" . $node->getAttribute("alt") . "]";
				379	} else {
				380	$output = "";
				381	}
				382	break;
				383
				384	case "li":
				385	$output .= "\n";
				386	break;
				387
				388	default:
				389	// do nothing
				390	}
				391
				392	return $output;
				393	}
				394	}