فهرست منبع

Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds

Also contains optional parameter in constructor to indicate HTML parsing
should be done with html5lib (although that has its own issues).
FiveFilters.org 12 سال پیش
والد
کامیت
24fb14a55f
5 فایلهای تغییر یافته به همراه 177 افزوده شده و 35 حذف شده
  1. 1 2
      JSLikeHTMLElement.php
  2. 64 0
      README.md
  3. 84 32
      Readability.php
  4. 27 0
      composer.json
  5. 1 1
      examples/Readability.php

+ 1 - 2
JSLikeHTMLElement.php

@@ -106,5 +106,4 @@ class JSLikeHTMLElement extends DOMElement
 	{
 		return '['.$this->tagName.']';
 	}
-}
-?>
+}

+ 64 - 0
README.md

@@ -0,0 +1,64 @@
+PHP Readability
+================
+
+This is a PHP port of Arc90's original JavaScript version of Readability. (Arc90 has since relaunched the project.)
+
+For instructions on how to use this, please see http://www.keyvan.net/2010/08/php-readability/
+
+For a more flexible and robust solution to article extraction, take a look at [Full-Text RSS](http://fivefilters.org/content-only/) - it makes use of PHP Readability, but offers much more.
+
+Feel free to fork this and change/improve it. I would love to see your results. Please do share them and I'll consider pulling them in.
+
+PHP Readability is licensed under the Apache License, Version 2.0 (the same license as the original JS version). The original JavaScript version can be found here: http://code.google.com/p/arc90labs-readability/source/browse/ (readability.js)
+
+### Simple example
+
+	<?php
+	require_once 'readability/Readability.php';
+	header('Content-Type: text/plain; charset=utf-8');
+
+	// get latest Medialens alert 
+	// (change this URL to whatever you'd like to test)
+	$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html';
+	$html = file_get_contents($url);
+
+	// Note: PHP Readability expects UTF-8 encoded content.
+	// If your content is not UTF-8 encoded, convert it 
+	// first before passing it to PHP Readability. 
+	// Both iconv() and mb_convert_encoding() can do this.
+
+	// If we've got Tidy, let's clean up input.
+	// This step is highly recommended - PHP's default HTML parser
+	// often doesn't do a great job and results in strange output.
+	if (function_exists('tidy_parse_string')) {
+		$tidy = tidy_parse_string($html, array(), 'UTF8');
+		$tidy->cleanRepair();
+		$html = $tidy->value;
+	}
+
+	// give it to Readability
+	$readability = new Readability($html, $url);
+	// print debug output? 
+	// useful to compare against Arc90's original JS version - 
+	// simply click the bookmarklet with FireBug's console window open
+	$readability->debug = false;
+	// convert links to footnotes?
+	$readability->convertLinksToFootnotes = true;
+	// process it
+	$result = $readability->init();
+	// does it look like we found what we wanted?
+	if ($result) {
+		echo "== Title =====================================\n";
+		echo $readability->getTitle()->textContent, "\n\n";
+		echo "== Body ======================================\n";
+		$content = $readability->getContent()->innerHTML;
+		// if we've got Tidy, let's clean it up for output
+		if (function_exists('tidy_parse_string')) {
+			$tidy = tidy_parse_string($content, array('indent'=>true, 'show-body-only' => true), 'UTF8');
+			$tidy->cleanRepair();
+			$content = $tidy->value;
+		}
+		echo $content;
+	} else {
+		echo 'Looks like we couldn\'t find the content. :(';
+	}

+ 84 - 32
Readability.php

@@ -2,6 +2,8 @@
 /** 
 * Arc90's Readability ported to PHP for FiveFilters.org
 * Based on readability.js version 1.7.1 (without multi-page support)
+* Updated to allow HTML5 parsing with html5lib
+* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
 * ------------------------------------------------------
 * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
 * Arc90's project URL: http://lab.arc90.com/experiments/readability/
@@ -10,7 +12,7 @@
 * More information: http://fivefilters.org/content-only/
 * License: Apache License, Version 2.0
 * Requires: PHP5
-* Date: 2011-07-22
+* Date: 2012-09-19
 * 
 * Differences between the PHP port and the original
 * ------------------------------------------------------
@@ -72,6 +74,7 @@ class Readability
 	public $dom;
 	public $url = null; // optional - URL where HTML was retrieved
 	public $debug = false;
+	public $lightClean = true; // preserves more content (experimental) added 2012-09-19
 	protected $body = null; // 
 	protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
 	protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
@@ -82,17 +85,17 @@ class Readability
 	* Defined up here so we don't instantiate them repeatedly in loops.
 	**/
 	public $regexps = array(
-		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
+		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
 		'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-		'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
-		'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+		'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+		'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
 		'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
 		'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
 		'replaceFonts' => '/<(\/?)font[^>]*>/i',
 		// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
 		'normalize' => '/\s{2,}/',
 		'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
-		'video' => '/http:\/\/(www\.)?(youtube|vimeo)\.com/i',
+		'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
 		'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
 	);	
 	
@@ -105,19 +108,24 @@ class Readability
 	* Create instance of Readability
 	* @param string UTF-8 encoded string
 	* @param string (optional) URL associated with HTML (used for footnotes)
+	* @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 	*/	
-	function __construct($html, $url=null)
+	function __construct($html, $url=null, $parser='libxml')
 	{
+		$this->url = $url;
 		/* Turn all double br's into p's */
 		$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
 		$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
 		$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
-		$this->dom = new DOMDocument();
-		$this->dom->preserveWhiteSpace = false;
-		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 		if (trim($html) == '') $html = '<html></html>';
-		@$this->dom->loadHTML($html);
-		$this->url = $url;
+		if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
+			// all good
+		} else {
+			$this->dom = new DOMDocument();
+			$this->dom->preserveWhiteSpace = false;
+			@$this->dom->loadHTML($html);
+		}
+		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 	}
 
 	/**
@@ -213,7 +221,7 @@ class Readability
 	* Debug
 	*/
 	protected function dbg($msg) {
-		if ($this->debug) echo '* ',$msg, '<br />', "\n";
+		if ($this->debug) echo '* ',$msg, "\n";
 	}
 	
 	/**
@@ -420,7 +428,7 @@ class Readability
 		* If there is only one h2, they are probably using it
 		* as a header and not a subheader, so remove it since we already have a header.
 		***/
-		if ($articleContent->getElementsByTagName('h2')->length == 1) {
+		if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
 			$this->clean($articleContent, 'h2'); 
 		}
 		$this->clean($articleContent, 'iframe');
@@ -439,8 +447,9 @@ class Readability
 			$imgCount    = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
 			$embedCount  = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
 			$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
+			$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
 			
-			if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
+			if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
 			{
 				$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
 			}
@@ -946,13 +955,15 @@ class Readability
 	* Clean a node of all elements of type "tag".
 	* (Unless it's a youtube/vimeo video. People love movies.)
 	*
+	* Updated 2012-09-18 to preserve youtube/vimeo iframes
+	*
 	* @param DOMElement $e
 	* @param string $tag
 	* @return void
 	*/
 	public function clean($e, $tag) {
 		$targetList = $e->getElementsByTagName($tag);
-		$isEmbed = ($tag == 'object' || $tag == 'embed');
+		$isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
 		
 		for ($y=$targetList->length-1; $y >= 0; $y--) {
 			/* Allow youtube and vimeo videos through as people usually want to see those. */
@@ -1017,12 +1028,19 @@ class Readability
 				$img    = $tagsList->item($i)->getElementsByTagName('img')->length;
 				$li     = $tagsList->item($i)->getElementsByTagName('li')->length-100;
 				$input  = $tagsList->item($i)->getElementsByTagName('input')->length;
+				$a 		= $tagsList->item($i)->getElementsByTagName('a')->length;
 
 				$embedCount = 0;
 				$embeds = $tagsList->item($i)->getElementsByTagName('embed');
 				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
 					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
-					$embedCount++; 
+						$embedCount++; 
+					}
+				}
+				$embeds = $tagsList->item($i)->getElementsByTagName('iframe');
+				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
+					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
+						$embedCount++; 
 					}
 				}
 
@@ -1030,23 +1048,58 @@ class Readability
 				$contentLength = strlen($this->getInnerText($tagsList->item($i)));
 				$toRemove      = false;
 
-				if ( $img > $p ) {
-					$toRemove = true;
-				} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
-					$toRemove = true;
-				} else if ( $input > floor($p/3) ) {
-					$toRemove = true; 
-				} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
-					$toRemove = true;
-				} else if($weight < 25 && $linkDensity > 0.2) {
-					$toRemove = true;
-				} else if($weight >= 25 && $linkDensity > 0.5) {
-					$toRemove = true;
-				} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
-					$toRemove = true;
+				if ($this->lightClean) {
+					$this->dbg('Light clean...');
+					if ( ($img > $p) && ($img > 4) ) {
+						$this->dbg(' more than 4 images and more image elements than paragraph elements');
+						$toRemove = true;
+					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+						$toRemove = true;
+					} else if ( $input > floor($p/3) ) {
+						$this->dbg(' too many <input> elements');
+						$toRemove = true; 
+					} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+						$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
+						$toRemove = true;
+					} else if($weight < 25 && $linkDensity > 0.2) {
+						$this->dbg(' weight smaller than 25 and link density above 0.2');
+						$toRemove = true;
+					} else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+						$this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
+						$toRemove = true;
+					} else if($embedCount > 3) {
+						$this->dbg(' more than 3 embeds');
+						$toRemove = true;
+					}
+				} else {
+					$this->dbg('Standard clean...');
+					if ( $img > $p ) {
+						$this->dbg(' more image elements than paragraph elements');
+						$toRemove = true;
+					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+						$toRemove = true;
+					} else if ( $input > floor($p/3) ) {
+						$this->dbg(' too many <input> elements');
+						$toRemove = true; 
+					} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
+						$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
+						$toRemove = true;
+					} else if($weight < 25 && $linkDensity > 0.2) {
+						$this->dbg(' weight smaller than 25 and link density above 0.2');
+						$toRemove = true;
+					} else if($weight >= 25 && $linkDensity > 0.5) {
+						$this->dbg(' weight above 25 but link density greater than 0.5');
+						$toRemove = true;
+					} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+						$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
+						$toRemove = true;
+					}
 				}
 
 				if ($toRemove) {
+					//$this->dbg('Removing: '.$tagsList->item($i)->innerHTML);
 					$tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
 				}
 			}
@@ -1081,5 +1134,4 @@ class Readability
 	public function removeFlag($flag) {
 		$this->flags = $this->flags & ~$flag;
 	}
-}
-?>
+}

+ 27 - 0
composer.json

@@ -0,0 +1,27 @@
+{
+	"name": "fivefilters/php-readability",
+	"type": "library",
+	"description": "Automatic article extraction from HTML",
+	"keywords": ["article extraction","content extraction","extraction","article","content","html"],
+	"homepage": "http://code.fivefilters.org/php-readability/",
+	"license": "Apache-2.0",
+	"authors": [
+	{
+		"name": "Keyvan Minoukadeh",
+		"email": "keyvan@keyvan.net",
+		"homepage": "http://keyvan.net",
+		"role": "Developer (ported original JS code to PHP)"
+	},
+	{
+		"name": "Arc90",
+		"homepage": "http://arc90.com",
+		"role": "Developer (original JS version)"
+	}
+	],
+	"require": {
+		"php": ">=5.2"
+	},
+	"autoload": {
+		"psr-0": { "Readability": "" }
+	}
+}

+ 1 - 1
examples/Readability.php

@@ -4,7 +4,7 @@ header('Content-Type: text/plain; charset=utf-8');
 
 // get latest Medialens alert 
 // (change this URL to whatever you'd like to test)
-$url = 'http://medialens.org/alerts/index.php';
+$url = 'http://www.medialens.org/index.php/alerts/alert-archive/alerts-2013/729-thatcher.html';
 $html = file_get_contents($url);
 
 // Note: PHP Readability expects UTF-8 encoded content.