| <?php |
| /****************************************************************************** |
| * Copyright (c) 2010 Jevon Wright and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * or |
| * |
| * LGPL which is available at http://www.gnu.org/licenses/lgpl.html |
| * |
| * |
| * Contributors: |
| * Jevon Wright - initial API and implementation |
| ****************************************************************************/ |
| |
| namespace Html2Text; |
| |
class Html2Text {

    /**
     * Tries to convert the given HTML into a plain text format - best suited for
     * e-mail display, etc.
     *
     * <p>In particular, it tries to maintain the following features:
     * <ul>
     *   <li>Links are maintained, with the 'href' copied over
     *   <li>Information in the <head> is lost
     * </ul>
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return string the HTML converted, as best as possible, to text
     * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
     */
    public static function convert($html, $ignore_error = false) {
        // replace non-breaking spaces (both the &nbsp; entity and the raw
        // UTF-8 byte sequence) with ordinary spaces
        $html = str_replace("&nbsp;", " ", $html);
        $html = str_replace("\xc2\xa0", " ", $html);

        $is_office_document = static::isOfficeDocument($html);

        if ($is_office_document) {
            // remove office namespace
            $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
        }

        $html = static::fixNewlines($html);
        if (mb_detect_encoding($html, "UTF-8", true) !== false) {
            // Encode every non-ASCII character as a numeric entity before the
            // markup is handed to DOMDocument. This replaces the
            // mb_convert_encoding(..., "HTML-ENTITIES", ...) call, which is
            // deprecated as of PHP 8.2; ASCII markup is left untouched.
            $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
        }

        $doc = static::getDocument($html, $ignore_error);

        $output = static::iterateOverNode($doc, null, false, $is_office_document);

        // remove leading and trailing spaces on each line
        $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
        $output = preg_replace("/ *\t */im", "\t", $output);

        // unarmor pre blocks: iterateOverNode() emits \r instead of \n inside
        // <pre> so that the whitespace clean-up above leaves them alone
        $output = str_replace("\r", "\n", $output);

        // remove unnecessary empty lines
        $output = preg_replace("/\n\n\n*/im", "\n\n", $output);

        // remove leading and trailing whitespace
        return trim($output);
    }

    /**
     * Unify newlines; in particular, \r\n becomes \n, and
     * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
     * all become \ns.
     *
     * @param string $text text with any number of \r, \r\n and \n combinations
     * @return string the fixed text
     */
    public static function fixNewlines($text) {
        // replace \r\n to \n
        $text = str_replace("\r\n", "\n", $text);
        // turn any remaining lone \r into \n
        $text = str_replace("\r", "\n", $text);

        return $text;
    }

    /**
     * Parse HTML into a DOMDocument.
     *
     * Input that is empty after trimming yields an empty document. Input that
     * does not start with a tag is wrapped in a body element first.
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return \DOMDocument the parsed document tree
     * @throws Html2TextException if DOMDocument::loadHTML() reports failure
     */
    public static function getDocument($html, $ignore_error = false) {

        $doc = new \DOMDocument();

        $html = trim($html);

        // Compare against the empty string explicitly: a plain truthiness
        // check would also discard the perfectly valid input "0".
        if ($html === '') {
            // DOMDocument doesn't support empty value and throws an error
            // Return empty document instead
            return $doc;
        }

        if ($html[0] !== '<') {
            // If HTML does not begin with a tag, we put a body tag around it.
            // If we do not do this, PHP will insert a paragraph tag around
            // the first block of text for some reason which can mess up
            // the newlines. See pre.html test for an example.
            $html = '<body>' . $html . '</body>';
        }

        if ($ignore_error) {
            $doc->strictErrorChecking = false;
            $doc->recover = true;
            $doc->xmlStandalone = true;
            $old_internal_errors = libxml_use_internal_errors(true);
            try {
                $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
            } finally {
                // always hand the libxml error mode back to the caller,
                // even if loading throws
                libxml_use_internal_errors($old_internal_errors);
            }
        }
        else {
            $load_result = $doc->loadHTML($html);
        }

        if (!$load_result) {
            throw new Html2TextException("Could not load HTML - badly formed?", $html);
        }

        return $doc;
    }

    /**
     * Can we guess that this HTML is generated by Microsoft Office?
     *
     * @param string $html the input HTML
     * @return boolean true if the Microsoft Office schema URN occurs in the HTML
     */
    public static function isOfficeDocument($html) {
        return strpos($html, "urn:schemas-microsoft-com:office") !== false;
    }

    /**
     * Is the given text empty once newlines, carriage returns, tabs and
     * spaces have been stripped from both ends?
     *
     * @param string $text the text to inspect
     * @return boolean true if nothing but whitespace is present
     */
    public static function isWhitespace($text) {
        return strlen(trim($text, "\n\r\t ")) === 0;
    }

    /**
     * Find the lower-cased node name of the next meaningful sibling of a node.
     *
     * Siblings are skipped until either an element or a text node that is not
     * pure whitespace is found.
     *
     * @param \DOMNode $node the node whose following siblings are inspected
     * @return string|null the lower-cased node name, or null if there is no such sibling
     */
    public static function nextChildName($node) {
        $nextNode = $node->nextSibling;

        while ($nextNode !== null) {
            if ($nextNode instanceof \DOMElement) {
                break;
            }
            if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
                break;
            }
            $nextNode = $nextNode->nextSibling;
        }

        // the loop only stops on an element, a non-blank text node, or null
        if ($nextNode === null) {
            return null;
        }

        return strtolower($nextNode->nodeName);
    }

    /**
     * Recursively render a DOM node and its descendants as plain text.
     *
     * Note that this walk is destructive: every child is removed from $node
     * once it has been rendered.
     *
     * @param \DOMNode $node the node to render
     * @param string|null $prevName lower-cased name of the previous visible sibling, if any
     * @param boolean $in_pre whether the node is nested inside a pre element
     * @param boolean $is_office_document whether Microsoft Office specific handling applies
     * @return string the text for this node; inside pre blocks newlines are
     *                emitted as \r so that convert() can restore them later
     */
    public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

        if ($node instanceof \DOMText) {
            if ($in_pre) {
                $text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
                // Remove trailing whitespace only
                $text = preg_replace("/[ \t]*\n/im", "\n", $text);
                // armor newlines with \r.
                return str_replace("\n", "\r", $text);
            }

            // Replace runs of whitespace characters with a single space (equivalent to \s)
            $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
            if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
                return "\n" . $text;
            }
            return $text;
        }
        if ($node instanceof \DOMDocumentType) {
            // ignore
            return "";
        }
        if ($node instanceof \DOMProcessingInstruction) {
            // ignore
            return "";
        }

        $name = strtolower($node->nodeName);
        $nextName = static::nextChildName($node);

        // start whitespace
        switch ($name) {
            case "hr":
                $prefix = ($prevName !== null) ? "\n" : '';
                return $prefix . "---------------------------------------------------------------\n";

            case "style":
            case "head":
            case "title":
            case "meta":
            case "script":
                // ignore these tags
                return "";

            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "ol":
            case "ul":
                // add two newlines, second line is added below
                $output = "\n";
                break;

            case "td":
            case "th":
                // add tab char to separate table fields
                $output = "\t";
                break;

            case "p":
                // Microsoft exchange emails often include HTML which, when passed through
                // html2text, results in lots of double line returns everywhere.
                //
                // To fix this, for any p element with a className of `MsoNormal` (the standard
                // classname in any Microsoft export or outlook for a paragraph that behaves
                // like a line return) we skip the first line returns and set the name to br.
                if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
                    $output = "";
                    $name = 'br';
                    break;
                }
                // add two lines
                $output = "\n\n";
                break;

            case "pre":
            case "tr":
            case "div":
                // add one line
                $output = "\n";
                break;

            case "li":
                $output = "- ";
                break;

            default:
                // print out contents of unknown tags
                $output = "";
                break;
        }

        if (isset($node->childNodes)) {

            $n = $node->childNodes->item(0);
            $previousSiblingName = null;
            $children_in_pre = $in_pre || $name === 'pre';

            while ($n !== null) {

                $text = static::iterateOverNode($n, $previousSiblingName, $children_in_pre, $is_office_document);

                // Pass current node name to next child, as previousSibling does not appear to get populated
                $is_invisible = $n instanceof \DOMDocumentType
                    || $n instanceof \DOMProcessingInstruction
                    || ($n instanceof \DOMText && static::isWhitespace($text));
                if (!$is_invisible) {
                    // invisible nodes keep the current previousSiblingName
                    $previousSiblingName = strtolower($n->nodeName);
                }

                // consume the child, so that the next one is always item(0)
                $node->removeChild($n);
                $n = $node->childNodes->item(0);

                // suppress last br tag inside a node list
                if ($n !== null || $previousSiblingName !== 'br') {
                    $output .= $text;
                }
            }
        }

        // end whitespace
        switch ($name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                $output .= "\n";
                break;

            case "p":
                // add two lines
                $output .= "\n\n";
                break;

            case "pre":
            case "br":
                // add one line
                $output .= "\n";
                break;

            case "div":
                break;

            case "a":
                // links are returned in [text](link) format
                $href = $node->getAttribute("href");
                $title = $node->getAttribute("title");

                $output = trim($output);

                // remove double [[ ]] s from linking images
                if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
                    $output = substr($output, 1, strlen($output) - 2);

                    // for linking images, the title of the <a> overrides the title of the <img>
                    if ($title !== '') {
                        $output = $title;
                    }
                }

                // if there is no link text, but a title attr
                // (explicit '' comparisons so that the text "0" counts as text)
                if ($output === '' && $title !== '') {
                    $output = $title;
                }

                if ($href === '') {
                    // it doesn't link anywhere
                    if ($node->getAttribute("name") !== '') {
                        $output = "[$output]";
                    }
                } elseif ($href === $output || $href === "mailto:$output" || $href === "http://$output" || $href === "https://$output") {
                    // link to the same address: the text already is the link, keep it as is
                } elseif ($output !== '') {
                    // replace it
                    $output = "[$output]($href)";
                } else {
                    // no link text at all: fall back to the bare address
                    $output = $href;
                }

                // does the next node require additional whitespace?
                switch ($nextName) {
                    case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
                        $output .= "\n";
                        break;
                }
                break;

            case "img":
                // images are rendered as [title] or, failing that, [alt]
                $title = $node->getAttribute("title");
                $alt = $node->getAttribute("alt");
                if ($title !== '') {
                    $output = "[" . $title . "]";
                } elseif ($alt !== '') {
                    $output = "[" . $alt . "]";
                } else {
                    $output = "";
                }
                break;

            case "li":
                $output .= "\n";
                break;

            default:
                // do nothing
        }

        return $output;
    }
}