Browse Source

Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds

Also adds an optional constructor parameter to indicate that HTML parsing
should be done with html5lib (although that has its own issues).
FiveFilters.org 12 years ago
parent
commit
24fb14a55f
5 changed files with 177 additions and 35 deletions
  1. 1 2
      JSLikeHTMLElement.php
  2. 64 0
      README.md
  3. 84 32
      Readability.php
  4. 27 0
      composer.json
  5. 1 1
      examples/Readability.php

+ 1 - 2
JSLikeHTMLElement.php

@@ -106,5 +106,4 @@ class JSLikeHTMLElement extends DOMElement
 	{
 		return '['.$this->tagName.']';
 	}
-}
-?>
+}

+ 64 - 0
README.md

@@ -0,0 +1,64 @@
+PHP Readability
+================
+
+This is a PHP port of Arc90's original JavaScript version of Readability. (Arc90 has since relaunched the project.)
+
+For instructions on how to use this, please see http://www.keyvan.net/2010/08/php-readability/
+
+For a more flexible and robust solution to article extraction, take a look at [Full-Text RSS](http://fivefilters.org/content-only/) - it makes use of PHP Readability, but offers much more.
+
+Feel free to fork this and change/improve it. I would love to see your results. Please do share them and I'll consider pulling them in.
+
+PHP Readability is licensed under the Apache License, Version 2.0 (the same license as the original JS version). The original JavaScript version can be found here: http://code.google.com/p/arc90labs-readability/source/browse/ (readability.js)
+
+### Simple example
+
+	<?php
+	require_once 'readability/Readability.php';
+	header('Content-Type: text/plain; charset=utf-8');
+
+	// get latest Medialens alert 
+	// (change this URL to whatever you'd like to test)
+	$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html';
+	$html = file_get_contents($url);
+
+	// Note: PHP Readability expects UTF-8 encoded content.
+	// If your content is not UTF-8 encoded, convert it 
+	// first before passing it to PHP Readability. 
+	// Both iconv() and mb_convert_encoding() can do this.
+
+	// If we've got Tidy, let's clean up input.
+	// This step is highly recommended - PHP's default HTML parser
+	// often doesn't do a great job and results in strange output.
+	if (function_exists('tidy_parse_string')) {
+		$tidy = tidy_parse_string($html, array(), 'UTF8');
+		$tidy->cleanRepair();
+		$html = $tidy->value;
+	}
+
+	// give it to Readability
+	$readability = new Readability($html, $url);
+	// print debug output? 
+	// useful to compare against Arc90's original JS version - 
+	// simply click the bookmarklet with Firebug's console window open
+	$readability->debug = false;
+	// convert links to footnotes?
+	$readability->convertLinksToFootnotes = true;
+	// process it
+	$result = $readability->init();
+	// does it look like we found what we wanted?
+	if ($result) {
+		echo "== Title =====================================\n";
+		echo $readability->getTitle()->textContent, "\n\n";
+		echo "== Body ======================================\n";
+		$content = $readability->getContent()->innerHTML;
+		// if we've got Tidy, let's clean it up for output
+		if (function_exists('tidy_parse_string')) {
+			$tidy = tidy_parse_string($content, array('indent'=>true, 'show-body-only' => true), 'UTF8');
+			$tidy->cleanRepair();
+			$content = $tidy->value;
+		}
+		echo $content;
+	} else {
+		echo 'Looks like we couldn\'t find the content. :(';
+	}

+ 84 - 32
Readability.php

@@ -2,6 +2,8 @@
 /** 
 * Arc90's Readability ported to PHP for FiveFilters.org
 * Based on readability.js version 1.7.1 (without multi-page support)
+* Updated to allow HTML5 parsing with html5lib
+* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
 * ------------------------------------------------------
 * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
 * Arc90's project URL: http://lab.arc90.com/experiments/readability/
@@ -10,7 +12,7 @@
 * More information: http://fivefilters.org/content-only/
 * License: Apache License, Version 2.0
 * Requires: PHP5
-* Date: 2011-07-22
+* Date: 2012-09-19
 * 
 * Differences between the PHP port and the original
 * ------------------------------------------------------
@@ -72,6 +74,7 @@ class Readability
 	public $dom;
 	public $url = null; // optional - URL where HTML was retrieved
 	public $debug = false;
+	public $lightClean = true; // preserves more content (experimental) added 2012-09-19
 	protected $body = null; // 
 	protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
 	protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
@@ -82,17 +85,17 @@ class Readability
 	* Defined up here so we don't instantiate them repeatedly in loops.
 	**/
 	public $regexps = array(
-		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
+		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
 		'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-		'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
-		'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+		'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+		'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
 		'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
 		'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
 		'replaceFonts' => '/<(\/?)font[^>]*>/i',
 		// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
 		'normalize' => '/\s{2,}/',
 		'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
-		'video' => '/http:\/\/(www\.)?(youtube|vimeo)\.com/i',
+		'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
 		'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
 	);	
 	
@@ -105,19 +108,24 @@ class Readability
 	* Create instance of Readability
 	* @param string UTF-8 encoded string
 	* @param string (optional) URL associated with HTML (used for footnotes)
+	* @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 	*/	
-	function __construct($html, $url=null)
+	function __construct($html, $url=null, $parser='libxml')
 	{
+		$this->url = $url;
 		/* Turn all double br's into p's */
 		$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
 		$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
 		$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
-		$this->dom = new DOMDocument();
-		$this->dom->preserveWhiteSpace = false;
-		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 		if (trim($html) == '') $html = '<html></html>';
-		@$this->dom->loadHTML($html);
-		$this->url = $url;
+		if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
+			// all good
+		} else {
+			$this->dom = new DOMDocument();
+			$this->dom->preserveWhiteSpace = false;
+			@$this->dom->loadHTML($html);
+		}
+		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 	}
 
 	/**
@@ -213,7 +221,7 @@ class Readability
 	* Debug
 	*/
 	protected function dbg($msg) {
-		if ($this->debug) echo '* ',$msg, '<br />', "\n";
+		if ($this->debug) echo '* ',$msg, "\n";
 	}
 	
 	/**
@@ -420,7 +428,7 @@ class Readability
 		* If there is only one h2, they are probably using it
 		* as a header and not a subheader, so remove it since we already have a header.
 		***/
-		if ($articleContent->getElementsByTagName('h2')->length == 1) {
+		if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
 			$this->clean($articleContent, 'h2'); 
 		}
 		$this->clean($articleContent, 'iframe');
@@ -439,8 +447,9 @@ class Readability
 			$imgCount    = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
 			$embedCount  = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
 			$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
+			$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
 			
-			if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
+			if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
 			{
 				$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
 			}
@@ -946,13 +955,15 @@ class Readability
 	* Clean a node of all elements of type "tag".
 	* (Unless it's a youtube/vimeo video. People love movies.)
 	*
+	* Updated 2012-09-18 to preserve youtube/vimeo iframes
+	*
 	* @param DOMElement $e
 	* @param string $tag
 	* @return void
 	*/
 	public function clean($e, $tag) {
 		$targetList = $e->getElementsByTagName($tag);
-		$isEmbed = ($tag == 'object' || $tag == 'embed');
+		$isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
 		
 		for ($y=$targetList->length-1; $y >= 0; $y--) {
 			/* Allow youtube and vimeo videos through as people usually want to see those. */
@@ -1017,12 +1028,19 @@ class Readability
 				$img    = $tagsList->item($i)->getElementsByTagName('img')->length;
 				$li     = $tagsList->item($i)->getElementsByTagName('li')->length-100;
 				$input  = $tagsList->item($i)->getElementsByTagName('input')->length;
+				$a 		= $tagsList->item($i)->getElementsByTagName('a')->length;
 
 				$embedCount = 0;
 				$embeds = $tagsList->item($i)->getElementsByTagName('embed');
 				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
 					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
-					$embedCount++; 
+						$embedCount++; 
+					}
+				}
+				$embeds = $tagsList->item($i)->getElementsByTagName('iframe');
+				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
+					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
+						$embedCount++; 
 					}
 				}
 
@@ -1030,23 +1048,58 @@ class Readability
 				$contentLength = strlen($this->getInnerText($tagsList->item($i)));
 				$toRemove      = false;
 
-				if ( $img > $p ) {
-					$toRemove = true;
-				} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
-					$toRemove = true;
-				} else if ( $input > floor($p/3) ) {
-					$toRemove = true; 
-				} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
-					$toRemove = true;
-				} else if($weight < 25 && $linkDensity > 0.2) {
-					$toRemove = true;
-				} else if($weight >= 25 && $linkDensity > 0.5) {
-					$toRemove = true;
-				} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
-					$toRemove = true;
+				if ($this->lightClean) {
+					$this->dbg('Light clean...');
+					if ( ($img > $p) && ($img > 4) ) {
+						$this->dbg(' more than 4 images and more image elements than paragraph elements');
+						$toRemove = true;
+					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+						$toRemove = true;
+					} else if ( $input > floor($p/3) ) {
+						$this->dbg(' too many <input> elements');
+						$toRemove = true; 
+					} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+						$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
+						$toRemove = true;
+					} else if($weight < 25 && $linkDensity > 0.2) {
+						$this->dbg(' weight smaller than 25 and link density above 0.2');
+						$toRemove = true;
+					} else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+						$this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
+						$toRemove = true;
+					} else if($embedCount > 3) {
+						$this->dbg(' more than 3 embeds');
+						$toRemove = true;
+					}
+				} else {
+					$this->dbg('Standard clean...');
+					if ( $img > $p ) {
+						$this->dbg(' more image elements than paragraph elements');
+						$toRemove = true;
+					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+						$toRemove = true;
+					} else if ( $input > floor($p/3) ) {
+						$this->dbg(' too many <input> elements');
+						$toRemove = true; 
+					} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
+						$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
+						$toRemove = true;
+					} else if($weight < 25 && $linkDensity > 0.2) {
+						$this->dbg(' weight smaller than 25 and link density above 0.2');
+						$toRemove = true;
+					} else if($weight >= 25 && $linkDensity > 0.5) {
+						$this->dbg(' weight above 25 but link density greater than 0.5');
+						$toRemove = true;
+					} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+						$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
+						$toRemove = true;
+					}
 				}
 
 				if ($toRemove) {
+					//$this->dbg('Removing: '.$tagsList->item($i)->innerHTML);
 					$tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
 				}
 			}
@@ -1081,5 +1134,4 @@ class Readability
 	public function removeFlag($flag) {
 		$this->flags = $this->flags & ~$flag;
 	}
-}
-?>
+}

+ 27 - 0
composer.json

@@ -0,0 +1,27 @@
+{
+	"name": "fivefilters/php-readability",
+	"type": "library",
+	"description": "Automatic article extraction from HTML",
+	"keywords": ["article extraction","content extraction","extraction","article","content","html"],
+	"homepage": "http://code.fivefilters.org/php-readability/",
+	"license": "Apache-2.0",
+	"authors": [
+	{
+		"name": "Keyvan Minoukadeh",
+		"email": "keyvan@keyvan.net",
+		"homepage": "http://keyvan.net",
+		"role": "Developer (ported original JS code to PHP)"
+	},
+	{
+		"name": "Arc90",
+		"homepage": "http://arc90.com",
+		"role": "Developer (original JS version)"
+	}
+	],
+	"require": {
+		"php": ">=5.2"
+	},
+	"autoload": {
+		"psr-0": { "Readability": "" }
+	}
+}

+ 1 - 1
examples/Readability.php

@@ -4,7 +4,7 @@ header('Content-Type: text/plain; charset=utf-8');
 
 // get latest Medialens alert 
 // (change this URL to whatever you'd like to test)
-$url = 'http://medialens.org/alerts/index.php';
+$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html';
 $html = file_get_contents($url);
 
 // Note: PHP Readability expects UTF-8 encoded content.