<?php
/******************************************************************************
 * Copyright (c) 2010 Jevon Wright and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * or
 *
 * LGPL which is available at http://www.gnu.org/licenses/lgpl.html
 *
 *
 * Contributors:
 * Jevon Wright - initial API and implementation
 ****************************************************************************/
17
18namespace Html2Text;
19
class Html2Text {

    /**
     * Tries to convert the given HTML into a plain text format - best suited for
     * e-mail display, etc.
     *
     * <p>In particular, it tries to maintain the following features:
     * <ul>
     *   <li>Links are maintained, with the 'href' copied over
     *   <li>Information in the &lt;head&gt; is lost
     * </ul>
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return string the HTML converted, as best as possible, to text
     * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
     */
    public static function convert($html, $ignore_error = false) {
        // replace &nbsp; (the entity and the raw UTF-8 byte sequence) with spaces
        $html = str_replace(array("&nbsp;", "\xc2\xa0"), " ", $html);

        $is_office_document = static::isOfficeDocument($html);

        if ($is_office_document) {
            // remove office namespace
            $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
        }

        $html = static::fixNewlines($html);
        if (mb_detect_encoding($html, "UTF-8", true)) {
            // Encode every non-ASCII character as a numeric entity so the parser
            // does not have to guess the charset. This replaces the
            // mb_convert_encoding(..., "HTML-ENTITIES", ...) call, which is
            // deprecated as of PHP 8.2; the parsed DOM text is the same.
            $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
        }

        $doc = static::getDocument($html, $ignore_error);

        $output = static::iterateOverNode($doc, null, false, $is_office_document);

        // remove leading and trailing spaces on each line
        $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
        $output = preg_replace("/ *\t */im", "\t", $output);

        // unarmor pre blocks (newlines inside <pre> were protected as \r)
        $output = str_replace("\r", "\n", $output);

        // remove unnecessary empty lines
        $output = preg_replace("/\n\n\n*/im", "\n\n", $output);

        // remove leading and trailing whitespace
        return trim($output);
    }

    /**
     * Unify newlines; in particular, \r\n becomes \n, and
     * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
     * all become \ns.
     *
     * @param string $text text with any number of \r, \r\n and \n combinations
     * @return string the fixed text
     */
    public static function fixNewlines($text) {
        // str_replace() applies the searches in order: first \r\n, then any remaining \r
        return str_replace(array("\r\n", "\r"), "\n", $text);
    }

    /**
     * Parse HTML into a DOMDocument
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return \DOMDocument the parsed document tree
     * @throws Html2TextException if DOMDocument::loadHTML() reports failure
     */
    public static function getDocument($html, $ignore_error = false) {

        $doc = new \DOMDocument();

        $html = trim($html);

        if ($html === '') {
            // DOMDocument doesn't support empty value and throws an error
            // Return empty document instead.
            // (Strict comparison: the string "0" is real content, not empty.)
            return $doc;
        }

        if ($html[0] !== '<') {
            // If HTML does not begin with a tag, we put a body tag around it.
            // If we do not do this, PHP will insert a paragraph tag around
            // the first block of text for some reason which can mess up
            // the newlines. See pre.html test for an example.
            $html = '<body>' . $html . '</body>';
        }

        if ($ignore_error) {
            $doc->strictErrorChecking = false;
            $doc->recover = true;
            $doc->xmlStandalone = true;
            // Collect libxml errors internally while loading, then restore
            // whatever setting the caller had.
            $old_internal_errors = libxml_use_internal_errors(true);
            $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
            libxml_use_internal_errors($old_internal_errors);
        }
        else {
            $load_result = $doc->loadHTML($html);
        }

        if (!$load_result) {
            throw new Html2TextException("Could not load HTML - badly formed?", $html);
        }

        return $doc;
    }

    /**
     * Can we guess that this HTML is generated by Microsoft Office?
     *
     * @param string $html the input HTML
     * @return boolean true if the Microsoft Office schema URN occurs in the HTML
     */
    public static function isOfficeDocument($html) {
        return strpos($html, "urn:schemas-microsoft-com:office") !== false;
    }

    /**
     * Is the given text empty once newlines, carriage returns, tabs and
     * spaces have been stripped from both ends?
     *
     * @param string $text the text to check
     * @return boolean true if nothing but whitespace is present
     */
    public static function isWhitespace($text) {
        return strlen(trim($text, "\n\r\t ")) === 0;
    }

    /**
     * Find the lower-cased node name of the next meaningful sibling of a node:
     * the first following sibling that is either an element or a text node
     * containing something other than whitespace.
     *
     * @param \DOMNode $node the node whose following siblings are inspected
     * @return string|null the lower-cased node name, or null if there is none
     */
    public static function nextChildName($node) {
        $nextNode = $node->nextSibling;
        while ($nextNode != null) {
            if ($nextNode instanceof \DOMText) {
                if (!static::isWhitespace($nextNode->wholeText)) {
                    break;
                }
            }
            if ($nextNode instanceof \DOMElement) {
                break;
            }
            $nextNode = $nextNode->nextSibling;
        }

        // instanceof is false for null, so no separate null check is needed
        if ($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) {
            return strtolower($nextNode->nodeName);
        }

        return null;
    }

    /**
     * Recursively convert a DOM node and its children to text.
     *
     * Note that child nodes are removed from $node while they are processed.
     *
     * @param \DOMNode $node the node to convert
     * @param string|null $prevName lower-cased name of the previous visible sibling, if any
     * @param boolean $in_pre true when the node is inside a &lt;pre&gt; element
     * @param boolean $is_office_document true when the HTML looks like Microsoft Office output
     * @return string the text for this node
     */
    public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

        if ($node instanceof \DOMText) {
            if ($in_pre) {
                $text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
                // Remove trailing whitespace only
                $text = preg_replace("/[ \t]*\n/im", "\n", $text);
                // armor newlines with \r so convert() does not collapse them.
                return str_replace("\n", "\r", $text);
            }

            // Replace runs of whitespace characters with a single space (equivalent to \s)
            $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
            if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
                return "\n" . $text;
            }
            return $text;
        }
        if ($node instanceof \DOMDocumentType) {
            // ignore
            return "";
        }
        if ($node instanceof \DOMProcessingInstruction) {
            // ignore
            return "";
        }

        $name = strtolower($node->nodeName);
        $nextName = static::nextChildName($node);

        // start whitespace
        switch ($name) {
            case "hr":
                $prefix = '';
                if ($prevName !== null) {
                    $prefix = "\n";
                }
                return $prefix . "---------------------------------------------------------------\n";

            case "style":
            case "head":
            case "title":
            case "meta":
            case "script":
                // ignore these tags
                return "";

            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "ol":
            case "ul":
                // add two newlines, second line is added below
                $output = "\n";
                break;

            case "td":
            case "th":
                // add tab char to separate table fields
                $output = "\t";
                break;

            case "p":
                // Microsoft exchange emails often include HTML which, when passed through
                // html2text, results in lots of double line returns everywhere.
                //
                // To fix this, for any p element with a className of `MsoNormal` (the standard
                // classname in any Microsoft export or outlook for a paragraph that behaves
                // like a line return) we skip the first line returns and set the name to br.
                if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
                    $output = "";
                    $name = 'br';
                    break;
                }
                // add two lines
                $output = "\n\n";
                break;

            case "pre":
            case "tr":
            case "div":
                // add one line
                $output = "\n";
                break;

            case "li":
                $output = "- ";
                break;

            default:
                // print out contents of unknown tags
                $output = "";
                break;
        }

        if (isset($node->childNodes)) {

            $n = $node->childNodes->item(0);
            $previousSiblingName = null;

            while ($n !== null) {

                $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name === 'pre', $is_office_document);

                // Pass current node name to next child, as previousSibling does not appear to get populated
                $is_invisible = $n instanceof \DOMDocumentType
                    || $n instanceof \DOMProcessingInstruction
                    || ($n instanceof \DOMText && static::isWhitespace($text));
                if (!$is_invisible) {
                    // invisible nodes keep the current previousSiblingName
                    $previousSiblingName = strtolower($n->nodeName);
                }

                // consume the child so that item(0) yields the next one
                $node->removeChild($n);
                $n = $node->childNodes->item(0);

                // suppress last br tag inside a node list
                if ($n !== null || $previousSiblingName !== 'br') {
                    $output .= $text;
                }
            }
        }

        // end whitespace
        switch ($name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                $output .= "\n";
                break;

            case "p":
                // add two lines
                $output .= "\n\n";
                break;

            case "pre":
            case "br":
                // add one line
                $output .= "\n";
                break;

            case "div":
                break;

            case "a":
                // links are returned in [text](link) format
                $href = $node->getAttribute("href");

                $output = trim($output);

                // remove double [[ ]] s from linking images
                if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
                    $output = substr($output, 1, strlen($output) - 2);

                    // for linking images, the title of the <a> overrides the title of the <img>
                    if ($node->getAttribute("title")) {
                        $output = $node->getAttribute("title");
                    }
                }

                // if there is no link text, but a title attr
                // (strict check: link text "0" is real text, not missing)
                if ($output === '' && $node->getAttribute("title")) {
                    $output = $node->getAttribute("title");
                }

                if ($href === '') {
                    // it doesn't link anywhere
                    if ($node->getAttribute("name") !== '') {
                        $output = "[$output]";
                    }
                } elseif ($href === $output || $href === "mailto:$output" || $href === "http://$output" || $href === "https://$output") {
                    // link to the same address: the link text already says it all
                } elseif ($output !== '') {
                    $output = "[$output]($href)";
                } else {
                    // no link text at all: fall back to the bare address
                    $output = $href;
                }

                // does the next node require additional whitespace?
                switch ($nextName) {
                    case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
                        $output .= "\n";
                        break;
                }
                break;

            case "img":
                if ($node->getAttribute("title")) {
                    $output = "[" . $node->getAttribute("title") . "]";
                } elseif ($node->getAttribute("alt")) {
                    $output = "[" . $node->getAttribute("alt") . "]";
                } else {
                    $output = "";
                }
                break;

            case "li":
                $output .= "\n";
                break;

            default:
                // do nothing
        }

        return $output;
    }
}