Matthias Andreas Benkard | b382b10 | 2021-01-02 15:32:21 +0100 | [diff] [blame^] | 1 | <?php |
| 2 | /****************************************************************************** |
| 3 | * Copyright (c) 2010 Jevon Wright and others. |
| 4 | * All rights reserved. This program and the accompanying materials |
| 5 | * are made available under the terms of the Eclipse Public License v1.0 |
| 6 | * which accompanies this distribution, and is available at |
| 7 | * http://www.eclipse.org/legal/epl-v10.html |
| 8 | * |
| 9 | * or |
| 10 | * |
| 11 | * LGPL which is available at http://www.gnu.org/licenses/lgpl.html |
| 12 | * |
| 13 | * |
| 14 | * Contributors: |
| 15 | * Jevon Wright - initial API and implementation |
| 16 | ****************************************************************************/ |
| 17 | |
| 18 | namespace Html2Text; |
| 19 | |
class Html2Text {

    /**
     * Tries to convert the given HTML into a plain text format - best suited for
     * e-mail display, etc.
     *
     * <p>In particular, it tries to maintain the following features:
     * <ul>
     *   <li>Links are maintained, with the 'href' copied over
     *   <li>Information in the <head> is lost
     * </ul>
     *
     * The conversion runs in three stages: the raw markup is normalised
     * (non-breaking spaces, Office tags, newlines, non-ASCII characters), it is
     * parsed into a DOM tree, and the tree is rendered to text whose whitespace
     * is then tidied up.
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return string the HTML converted, as best as possible, to text
     * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
     */
    public static function convert($html, $ignore_error = false) {
        // Replace non-breaking spaces with ordinary spaces before parsing, both
        // as the named entity and as the raw UTF-8 byte sequence. Otherwise the
        // parser would hand back U+00A0, which the whitespace handling below
        // does not treat as a space.
        $html = str_replace(array("&nbsp;", "\xc2\xa0"), " ", $html);

        $is_office_document = static::isOfficeDocument($html);

        if ($is_office_document) {
            // remove office namespace
            $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
        }

        $html = static::fixNewlines($html);
        if (mb_detect_encoding($html, "UTF-8", true)) {
            // Turn every non-ASCII character into a numeric character reference
            // so the parser decodes it regardless of the charset it assumes.
            // (The "HTML-ENTITIES" pseudo-encoding of mb_convert_encoding() is
            // deprecated as of PHP 8.2; this yields the same DOM.)
            $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
        }

        $doc = static::getDocument($html, $ignore_error);

        $output = static::iterateOverNode($doc, null, false, $is_office_document);

        // remove leading and trailing spaces on each line
        $output = preg_replace("/[ \t]*\n[ \t]*/", "\n", $output);
        $output = preg_replace("/ *\t */", "\t", $output);

        // unarmor pre blocks: iterateOverNode() emits "\r" for newlines inside
        // <pre> so that the two replacements above leave them alone
        $output = str_replace("\r", "\n", $output);

        // remove unnecessary empty lines
        $output = preg_replace("/\n\n\n*/", "\n\n", $output);

        // remove leading and trailing whitespace
        return trim($output);
    }

    /**
     * Unify newlines; in particular, \r\n becomes \n, and
     * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
     * all become \ns.
     *
     * @param string $text text with any number of \r, \r\n and \n combinations
     * @return string the fixed text
     */
    public static function fixNewlines($text) {
        // \r\n must be handled first, otherwise it would become two newlines
        $text = str_replace("\r\n", "\n", $text);
        // any remaining lone \r is a newline of its own
        $text = str_replace("\r", "\n", $text);

        return $text;
    }

    /**
     * Parse HTML into a DOMDocument
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return DOMDocument the parsed document tree; an empty document when
     *         $html is empty or consists only of whitespace
     * @throws Html2TextException if DOMDocument::loadHTML() reports failure
     */
    public static function getDocument($html, $ignore_error = false) {

        $doc = new \DOMDocument();

        $html = trim($html);

        // Compare against '' rather than testing truthiness: the string "0" is
        // falsy in PHP but is perfectly valid content.
        if ($html === '') {
            // DOMDocument doesn't support empty value and throws an error
            // Return empty document instead
            return $doc;
        }

        if ($html[0] !== '<') {
            // If HTML does not begin with a tag, we put a body tag around it.
            // If we do not do this, PHP will insert a paragraph tag around
            // the first block of text for some reason which can mess up
            // the newlines. See pre.html test for an example.
            $html = '<body>' . $html . '</body>';
        }

        if ($ignore_error) {
            $doc->strictErrorChecking = false;
            $doc->recover = true;
            $doc->xmlStandalone = true;
            $old_internal_errors = libxml_use_internal_errors(true);
            try {
                $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
            } finally {
                // always restore the caller's libxml error mode, even if
                // loadHTML() throws
                libxml_use_internal_errors($old_internal_errors);
            }
        }
        else {
            $load_result = $doc->loadHTML($html);
        }

        if (!$load_result) {
            throw new Html2TextException("Could not load HTML - badly formed?", $html);
        }

        return $doc;
    }

    /**
     * Can we guess that this HTML is generated by Microsoft Office?
     *
     * @param string $html the input HTML
     * @return boolean true if the Office XML namespace URN occurs anywhere in $html
     */
    public static function isOfficeDocument($html) {
        return strpos($html, "urn:schemas-microsoft-com:office") !== false;
    }

    /**
     * Is the given text empty or made up only of newlines, carriage returns,
     * tabs and spaces?
     *
     * @param string $text the text to check
     * @return boolean true if nothing remains after stripping those characters
     */
    public static function isWhitespace($text) {
        return strlen(trim($text, "\n\r\t ")) === 0;
    }

    /**
     * Find the lower-cased node name of the next sibling that is either an
     * element or a text node with non-whitespace content. Any other sibling
     * (whitespace-only text, comments, ...) is skipped.
     *
     * @param DOMNode $node the node whose following siblings are inspected
     * @return string|null the name of that sibling (text nodes are named
     *         "#text"), or null if there is no such sibling
     */
    public static function nextChildName($node) {
        $nextNode = $node->nextSibling;
        while ($nextNode !== null) {
            if ($nextNode instanceof \DOMElement) {
                break;
            }
            if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
                break;
            }
            $nextNode = $nextNode->nextSibling;
        }

        // the loop only stops on an element, a non-blank text node or null
        if ($nextNode === null) {
            return null;
        }

        return strtolower($nextNode->nodeName);
    }

    /**
     * Render a DOM node and, recursively, its children as plain text.
     *
     * Note that this consumes the tree: every child is removed from $node
     * once it has been rendered.
     *
     * @param DOMNode $node the node to render
     * @param string|null $prevName lower-cased name of the previous visible
     *        sibling, or null if there is none
     * @param boolean $in_pre whether $node is inside a <pre> element
     * @param boolean $is_office_document whether Microsoft Office specific
     *        handling (MsoNormal paragraphs) should be applied
     * @return string the text for this subtree; newlines that originate from
     *         <pre> content are returned as "\r" and are restored by convert()
     */
    public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

        if ($node instanceof \DOMText) {
            if ($in_pre) {
                $text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
                // Remove trailing whitespace only
                $text = preg_replace("/[ \t]*\n/", "\n", $text);
                // armor newlines with \r.
                return str_replace("\n", "\r", $text);
            }

            // Replace runs of whitespace characters with a single space
            $text = preg_replace("/[\\t\\n\\f\\r ]+/", " ", $node->wholeText);
            if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
                // text directly following a block element starts on its own line
                return "\n" . $text;
            }
            return $text;
        }

        if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
            // ignore
            return "";
        }

        $name = strtolower($node->nodeName);
        $nextName = static::nextChildName($node);

        // start whitespace
        switch ($name) {
            case "hr":
                // only separate the rule from a preceding sibling if there is one
                $prefix = ($prevName !== null && $prevName !== '') ? "\n" : '';
                return $prefix . "---------------------------------------------------------------\n";

            case "style":
            case "head":
            case "title":
            case "meta":
            case "script":
                // ignore these tags
                return "";

            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "ol":
            case "ul":
                // add two newlines, second line is added below
                $output = "\n";
                break;

            case "td":
            case "th":
                // add tab char to separate table fields
                $output = "\t";
                break;

            case "p":
                // Microsoft exchange emails often include HTML which, when passed through
                // html2text, results in lots of double line returns everywhere.
                //
                // To fix this, for any p element with a className of `MsoNormal` (the standard
                // classname in any Microsoft export or outlook for a paragraph that behaves
                // like a line return) we skip the first line returns and set the name to br.
                if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
                    $output = "";
                    $name = 'br';
                    break;
                }
                // add two lines
                $output = "\n\n";
                break;

            case "pre":
            case "tr":
            case "div":
                // add one line
                $output = "\n";
                break;

            case "li":
                $output = "- ";
                break;

            default:
                // print out contents of unknown tags
                $output = "";
                break;
        }

        if (isset($node->childNodes)) {

            $children_in_pre = $in_pre || $name === 'pre';
            $previousSiblingName = null;
            $n = $node->childNodes->item(0);

            while ($n !== null) {

                $text = static::iterateOverNode($n, $previousSiblingName, $children_in_pre, $is_office_document);

                // Pass current node name to next child, as previousSibling does not appear to get populated.
                // Doctypes, processing instructions and blank text are invisible,
                // so they leave the previous sibling name untouched.
                $is_invisible = $n instanceof \DOMDocumentType
                    || $n instanceof \DOMProcessingInstruction
                    || ($n instanceof \DOMText && static::isWhitespace($text));
                if (!$is_invisible) {
                    $previousSiblingName = strtolower($n->nodeName);
                }

                // consume the child; the next one then becomes item(0)
                $node->removeChild($n);
                $n = $node->childNodes->item(0);

                // suppress last br tag inside a node list: the final child's
                // text is dropped when the most recent visible sibling is a br
                if ($n !== null || $previousSiblingName !== 'br') {
                    $output .= $text;
                }
            }
        }

        // end whitespace
        switch ($name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                $output .= "\n";
                break;

            case "p":
                // add two lines
                $output .= "\n\n";
                break;

            case "pre":
            case "br":
                // add one line
                $output .= "\n";
                break;

            case "div":
                break;

            case "a":
                // links are returned in [text](link) format
                $href = $node->getAttribute("href");
                $title = $node->getAttribute("title");

                $output = trim($output);

                // remove double [[ ]] s from linking images
                if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
                    $output = substr($output, 1, strlen($output) - 2);

                    // for linking images, the title of the <a> overrides the title of the <img>
                    if ($title !== '') {
                        $output = $title;
                    }
                }

                // if there is no link text, but a title attr
                // (explicit '' checks so that a text or title of "0" is kept)
                if ($output === '' && $title !== '') {
                    $output = $title;
                }

                if ($href === '') {
                    // it doesn't link anywhere
                    if ($node->getAttribute("name") !== '') {
                        $output = "[$output]";
                    }
                } elseif ($href !== $output
                    && $href !== "mailto:$output"
                    && $href !== "http://$output"
                    && $href !== "https://$output") {
                    // The text differs from the target, so show both. Strict
                    // comparison is required here: with == two numeric strings
                    // such as "1" and "1.0" would be considered the same address.
                    if ($output !== '') {
                        $output = "[$output]($href)";
                    } else {
                        // no text at all: just use the link
                        $output = $href;
                    }
                }
                // otherwise the link points to the address shown as its text,
                // so the text alone is enough

                // does the next node require additional whitespace?
                if (in_array($nextName, array("h1", "h2", "h3", "h4", "h5", "h6"), true)) {
                    $output .= "\n";
                }
                break;

            case "img":
                // images are rendered as [title], falling back to [alt]
                $title = $node->getAttribute("title");
                $alt = $node->getAttribute("alt");
                if ($title !== '') {
                    $output = "[" . $title . "]";
                } elseif ($alt !== '') {
                    $output = "[" . $alt . "]";
                } else {
                    $output = "";
                }
                break;

            case "li":
                $output .= "\n";
                break;

            default:
                // do nothing
        }

        return $output;
    }
}