Keyvan 13 years ago
parent
commit
15782b1b8e
2 changed files with 26 additions and 8 deletions
  1. 25 7
      Readability.php
  2. 1 1
      examples/Readability.php

+ 25 - 7
Readability.php

@@ -10,7 +10,7 @@
 * More information: http://fivefilters.org/content-only/
 * License: Apache License, Version 2.0
 * Requires: PHP5
-* Date: 2010-10-29
+* Date: 2011-07-22
 * 
 * Differences between the PHP port and the original
 * ------------------------------------------------------
@@ -46,7 +46,7 @@
 require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
 
 // Alternative usage (for testing only!)
-// uncomment the lins below and call Readability.php in your browser 
+// uncomment the lines below and call Readability.php in your browser 
 // passing it the URL of the page you'd like content from, e.g.:
 // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
 
@@ -109,13 +109,13 @@ class Readability
 	function __construct($html, $url=null)
 	{
 		/* Turn all double br's into p's */
-		/* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
 		$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
 		$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
 		$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
 		$this->dom = new DOMDocument();
 		$this->dom->preserveWhiteSpace = false;
 		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+		if (trim($html) == '') $html = '<html></html>';
 		@$this->dom->loadHTML($html);
 		$this->url = $url;
 	}
@@ -150,6 +150,7 @@ class Readability
 	**/
 	public function init()
 	{
+		if (!isset($this->dom->documentElement)) return false;
 		$this->removeScripts($this->dom);
 		//die($this->getInnerHTML($this->dom->documentElement));
 		
@@ -293,7 +294,6 @@ class Readability
 			$this->body = $this->dom->createElement('body');
 			$this->dom->documentElement->appendChild($this->body);
 		}
-		
 		$this->body->setAttribute('id', 'readabilityBody');
 
 		/* Remove all style tags in head */
@@ -664,9 +664,19 @@ class Readability
 		if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
 		{
 			$topCandidate = $this->dom->createElement('div');
-			$topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML;
-			$page->innerHTML = '';
-			$page->appendChild($topCandidate);
+			if ($page instanceof DOMDocument) {
+				if (!isset($page->documentElement)) {
+					// we don't have a body either? what a mess! :)
+				} else {
+					$topCandidate->innerHTML = $page->documentElement->innerHTML;
+					$page->documentElement->innerHTML = '';
+					$page->documentElement->appendChild($topCandidate);
+				}
+			} else {
+				$topCandidate->innerHTML = $page->innerHTML;
+				$page->innerHTML = '';
+				$page->appendChild($topCandidate);
+			}
 			$this->initializeNode($topCandidate);
 		}
 
@@ -678,6 +688,10 @@ class Readability
 		$articleContent->setAttribute('id', 'readability-content');
 		$siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
 		$siblingNodes          = $topCandidate->parentNode->childNodes;
+		if (!isset($siblingNodes)) {
+			$siblingNodes = new stdClass;
+			$siblingNodes->length = 0;
+		}
 
 		for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
 		{
@@ -769,6 +783,9 @@ class Readability
 		**/
 		if (strlen($this->getInnerText($articleContent, false)) < 250)
 		{
+			// TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+			// in the meantime, we check and create an empty element if it's not there.
+			if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
 			$this->body->innerHTML = $this->bodyCache;
 			
 			if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
@@ -846,6 +863,7 @@ class Readability
 	* @return void
 	*/
 	public function cleanStyles($e) {
+		if (!is_object($e)) return;
 		$elems = $e->getElementsByTagName('*');
 		foreach ($elems as $elem) {
 			$elem->removeAttribute('style');

+ 1 - 1
examples/Readability.php

@@ -16,7 +16,7 @@ $html = file_get_contents($url);
 // This step is highly recommended - PHP's default HTML parser
 // often does a terrible job and results in strange output.
 if (function_exists('tidy_parse_string')) {
-	$tidy = tidy_parse_string($html, array('indent'=>true), 'UTF8');
+	$tidy = tidy_parse_string($html, array(), 'UTF8');
 	$tidy->cleanRepair();
 	$html = $tidy->value;
 }