Readability.php 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137
  1. <?php
  2. /**
  3. * Arc90's Readability ported to PHP for FiveFilters.org
  4. * Based on readability.js version 1.7.1 (without multi-page support)
  5. * Updated to allow HTML5 parsing with html5lib
  6. * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
  7. * ------------------------------------------------------
  8. * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
  9. * Arc90's project URL: http://lab.arc90.com/experiments/readability/
  10. * JS Source: http://code.google.com/p/arc90labs-readability
  11. * Ported by: Keyvan Minoukadeh, http://www.keyvan.net
  12. * More information: http://fivefilters.org/content-only/
  13. * License: Apache License, Version 2.0
  14. * Requires: PHP5
  15. * Date: 2012-09-19
  16. *
  17. * Differences between the PHP port and the original
  18. * ------------------------------------------------------
  19. * Arc90's Readability is designed to run in the browser. It works on the DOM
  20. * tree (the parsed HTML) after the page's CSS styles have been applied and
  21. * Javascript code executed. This PHP port does not run inside a browser.
  22. * We use PHP's ability to parse HTML to build our DOM tree, but we cannot
  23. * rely on CSS or Javascript support. As such, the results will not always
  24. * match Arc90's Readability. (For example, if a web page contains CSS style
  25. * rules or Javascript code which hide certain HTML elements from display,
  26. * Arc90's Readability will dismiss those from consideration but our PHP port,
  27. * unable to understand CSS or Javascript, will not know any better.)
  28. *
  29. * Another significant difference is that the aim of Arc90's Readability is
  30. * to re-present the main content block of a given web page so users can
  31. * read it more easily in their browsers. Correct identification, clean up,
  32. * and separation of the content block is only a part of this process.
  33. * This PHP port is only concerned with this part, it does not include code
  34. * that relates to presentation in the browser - Arc90 already do
  35. * that extremely well, and for PDF output there's FiveFilters.org's
  36. * PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
  37. *
  38. * Finally, this class contains methods that might be useful for developers
  39. * working on HTML document fragments. So without deviating too much from
  40. * the original code (which I don't want to do because it makes debugging
  41. * and updating more difficult), I've tried to make it a little more
  42. * developer friendly. You should be able to use the methods here on
  43. * existing DOMElement objects without passing an entire HTML document to
  44. * be parsed.
  45. */
  46. // This class allows us to do JavaScript-like assignments to innerHTML
  47. require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
  48. // Alternative usage (for testing only!)
  49. // uncomment the lines below and call Readability.php in your browser
  50. // passing it the URL of the page you'd like content from, e.g.:
  51. // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
  52. /*
  53. if (!isset($_GET['url']) || $_GET['url'] == '') {
  54. die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
  55. }
  56. $url = $_GET['url'];
  57. if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
  58. $html = file_get_contents($url);
  59. $r = new Readability($html, $url);
  60. $r->init();
  61. echo $r->articleContent->innerHTML;
  62. */
  63. class Readability
  64. {
  /* Bit values that are combined in $flags. */
  const FLAG_STRIP_UNLIKELYS = 1;
  const FLAG_WEIGHT_CLASSES = 2;
  const FLAG_CLEAN_CONDITIONALLY = 4;

  /** @var string version label of the readability.js release this port is based on */
  public $version = '1.7.1-without-multi-page';

  /** @var boolean when true, postProcessContent() calls addFootnotes() (except for wikipedia.org URLs) */
  public $convertLinksToFootnotes = false;

  /** @var boolean when true, prepArticle() calls revertReadabilityStyledElements() */
  public $revertForcedParagraphElements = true;

  /** @var DOMElement assigned by init(): the H1 element built by getArticleTitle() */
  public $articleTitle;

  /** @var DOMElement assigned by init(): the extracted (or placeholder) content element */
  public $articleContent;

  /** @var DOMDocument document built from the HTML passed to the constructor */
  public $dom;

  /** @var string|null optional - URL where HTML was retrieved */
  public $url = null;

  /** @var boolean when true, dbg() echoes its messages */
  public $debug = false;

  /** @var boolean preserves more content (experimental) added 2012-09-19 */
  public $lightClean = true;

  /** @var DOMElement|null the document's body element, resolved in init() / prepDocument() */
  protected $body = null;

  /** @var string|null cache of the body HTML in case we need to re-use it later */
  protected $bodyCache = null;

  /** @var integer active flags; 7 = 1 | 2 | 4, i.e. start with all flags set */
  protected $flags = 7;

  /** @var boolean indicates whether we were able to extract or not */
  protected $success = false;

  /**
   * Every regular expression used by readability, keyed by purpose.
   * Kept in one place so the patterns are not rebuilt inside loops.
   *
   * @var array
   */
  public $regexps = array(
      'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
      'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
      'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
      'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
      'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
      'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
      'replaceFonts' => '/<(\/?)font[^>]*>/i',
      // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
      'normalize' => '/\s{2,}/',
      'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
      'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
      'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
  );
  /**
   * Create instance of Readability.
   *
   * The raw markup is pre-processed before parsing: runs of two or more <br>
   * tags become paragraph breaks, <font> tags become <span> tags, and every
   * non-ASCII character is rewritten as a numeric character reference so the
   * parser does not have to guess the encoding.
   *
   * @param string UTF-8 encoded string
   * @param string (optional) URL associated with HTML (used for footnotes)
   * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
   */
  public function __construct($html, $url=null, $parser='libxml')
  {
      $this->url = $url;
      $html = (string) $html;

      /* Turn all double br's into p's, and font tags into spans */
      $rewrites = array(
          'replaceBrs'   => '</p><p>',
          'replaceFonts' => '<$1span>',
      );
      foreach ($rewrites as $pattern => $replacement) {
          $rewritten = preg_replace($this->regexps[$pattern], $replacement, $html);
          // preg_replace() yields null when PCRE fails (e.g. backtrack limit on
          // very large pages); keep the untouched markup rather than losing it.
          if ($rewritten !== null) {
              $html = $rewritten;
          }
      }

      // Same intent as mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'),
      // which is deprecated as of PHP 8.2: encode everything above ASCII as
      // numeric entities. Markup characters (&, <, >) are left alone.
      $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');

      if (trim($html) == '') $html = '<html></html>';

      $this->dom = null;
      // Only attempt html5lib when its parser class can actually be loaded;
      // otherwise (or if it returns nothing usable) fall back to libxml.
      if ($parser == 'html5lib' && class_exists('HTML5_Parser')) {
          $this->dom = HTML5_Parser::parse($html);
      }
      if (!$this->dom) {
          $this->dom = new DOMDocument();
          $this->dom->preserveWhiteSpace = false;
          // Collect libxml's markup warnings internally instead of muting
          // everything with @, then restore the caller's setting.
          $previousSetting = libxml_use_internal_errors(true);
          $this->dom->loadHTML($html);
          if (!$previousSetting) {
              libxml_clear_errors();
          }
          libxml_use_internal_errors($previousSetting);
      }

      $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
  }
  /**
   * Accessor for the article title element.
   *
   * @return DOMElement current value of $articleTitle (assigned by init())
   */
  public function getTitle()
  {
      $title = $this->articleTitle;

      return $title;
  }
  /**
   * Accessor for the article content element.
   *
   * @return DOMElement current value of $articleContent (assigned by init())
   */
  public function getContent()
  {
      $content = $this->articleContent;

      return $content;
  }
  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document by removing script tags, css, etc.
   *  2. Build readability's DOM tree.
   *  3. Grab the article content from the current dom tree.
   *  4. Replace the current DOM tree with the new one.
   *  5. Read peacefully.
   *
   * On return, $articleTitle and $articleContent hold the resulting elements.
   * When no content could be extracted, $articleContent is a placeholder div
   * carrying an apology message.
   *
   * @return boolean true if we found content, false otherwise
   **/
  public function init()
  {
      if (!isset($this->dom->documentElement)) {
          return false;
      }

      $this->removeScripts($this->dom);

      // Optimistic default; reset below if nothing could be extracted.
      $this->success = true;

      $bodyList = $this->dom->getElementsByTagName('body');
      if ($bodyList->length > 0) {
          $firstBody = $bodyList->item(0);
          if ($this->bodyCache == null) {
              $this->bodyCache = $firstBody->innerHTML;
          }
          if ($this->body == null) {
              $this->body = $firstBody;
          }
      }

      $this->prepDocument();

      /* Extract the two pieces of the readable document. */
      $title = $this->getArticleTitle();
      $content = $this->grabArticle();
      if (!$content) {
          $this->success = false;
          $content = $this->dom->createElement('div');
          $content->setAttribute('id', 'readability-content');
          $content->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
      }

      /* Build readability's DOM tree: overlay > inner > (title, content). */
      $inner = $this->dom->createElement('div');
      $inner->setAttribute('id', 'readInner');
      $inner->appendChild($title);
      $inner->appendChild($content);

      $wrapper = $this->dom->createElement('div');
      $wrapper->setAttribute('id', 'readOverlay');
      $wrapper->appendChild($inner);

      /* Clear the old HTML, insert the new content. */
      $this->body->innerHTML = '';
      $this->body->appendChild($wrapper);
      $this->body->removeAttribute('style');

      $this->postProcessContent($content);

      // Set title and content instance variables
      $this->articleTitle = $title;
      $this->articleContent = $content;

      return $this->success;
  }
  /**
   * Debug output: echoes the message as a "* "-prefixed line, but only
   * while $debug is switched on.
   *
   * @param string message to print
   * @return void
   */
  protected function dbg($msg)
  {
      if (!$this->debug) {
          return;
      }

      echo '* ' . $msg . "\n";
  }
  /**
   * Run any post-process modifications to article content as necessary.
   *
   * Currently this only converts links to footnotes, and only when
   * $convertLinksToFootnotes is enabled and the source URL does not
   * point at wikipedia.org.
   *
   * @param DOMElement
   * @return void
   */
  public function postProcessContent($articleContent)
  {
      if (!$this->convertLinksToFootnotes) {
          return;
      }

      // $url is optional (null by default). Test an explicit string instead of
      // handing null to preg_match() and muting the resulting notice with @.
      $sourceUrl = is_string($this->url) ? $this->url : '';
      if (preg_match('/wikipedia\.org/', $sourceUrl)) {
          return;
      }

      $this->addFootnotes($articleContent);
  }
  /**
   * Get the article title as an H1.
   *
   * Starts from the text of the document's first <title> element and tries to
   * strip site-name decoration:
   *  - "a | b" / "a - b": keep the part before the last separator, or the part
   *    after the first separator when that leaves fewer than three words;
   *  - "a: b": keep the part after the last colon, or the part after the first
   *    colon when that leaves fewer than three words;
   *  - otherwise, when the title is longer than 150 or shorter than 15
   *    characters and the document has exactly one <h1>, use that heading.
   * If the result has four words or fewer, the unmodified title is used.
   *
   * The title is inserted as a text node, so characters such as "&" or "<" in
   * the page title are kept literally and can never be parsed as markup.
   *
   * @return DOMElement
   */
  protected function getArticleTitle()
  {
      $curTitle = '';
      $origTitle = '';

      try {
          $titleElement = $this->dom->getElementsByTagName('title')->item(0);
          // Documents without a <title> simply keep the empty defaults.
          if ($titleElement !== null) {
              $curTitle = $origTitle = (string) $this->getInnerText($titleElement);
          }
      } catch (Exception $e) {
          // Best effort: continue with an empty title.
      }

      if (preg_match('/ [\|\-] /', $curTitle)) {
          $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
          if (count(explode(' ', $curTitle)) < 3) {
              $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
          }
      } elseif (strpos($curTitle, ': ') !== false) {
          $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
          if (count(explode(' ', $curTitle)) < 3) {
              $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
          }
      } else {
          // Count characters, not bytes: DOM text is UTF-8, so strlen() would
          // overstate the length of any title containing non-ASCII characters.
          $titleLength = mb_strlen($curTitle, 'UTF-8');
          if ($titleLength > 150 || $titleLength < 15) {
              $hOnes = $this->dom->getElementsByTagName('h1');
              if ($hOnes->length == 1) {
                  $curTitle = (string) $this->getInnerText($hOnes->item(0));
              }
          }
      }

      // preg_replace() may hand back null on PCRE failure; normalise to string.
      $curTitle = trim((string) $curTitle);

      if (count(explode(' ', $curTitle)) <= 4) {
          $curTitle = $origTitle;
      }

      $articleTitle = $this->dom->createElement('h1');
      if ($curTitle !== '') {
          // Plain text from an untrusted page: add it as a text node rather
          // than through innerHTML so it is never interpreted as HTML.
          $articleTitle->appendChild($this->dom->createTextNode($curTitle));
      }

      return $articleTitle;
  }
  /**
   * Prepare the HTML document for readability to scrape it.
   *
   * Makes sure a body element exists (creating and attaching one if none was
   * found), tags it with id "readabilityBody", and removes every <style>
   * element from the document.
   *
   * The double-<br> and <font> rewriting that readability.js performs at this
   * point is done in the constructor instead, because that is where the raw
   * HTML is still available as a string.
   *
   * @return void
   **/
  protected function prepDocument()
  {
      /**
       * In some cases a body element can't be found (if the HTML is totally hosed for example)
       * so we create a new body node and append it to the document.
       */
      if ($this->body == null) {
          $newBody = $this->dom->createElement('body');
          $this->body = $newBody;
          $this->dom->documentElement->appendChild($newBody);
      }
      $this->body->setAttribute('id', 'readabilityBody');

      /* Remove all style tags, walking the live list from the end. */
      $styleTags = $this->dom->getElementsByTagName('style');
      $remaining = $styleTags->length;
      while ($remaining-- > 0) {
          $styleTag = $styleTags->item($remaining);
          $styleTag->parentNode->removeChild($styleTag);
      }
  }
  /**
   * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
   * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
   *
   * Every link in $articleContent gets a numbered superscript reference placed
   * right after it, and a matching entry in a "References" list that is
   * appended to $articleContent. Links whose class contains
   * "readability-DoNotFootnote", or whose text matches the skipFootnoteLink
   * pattern, are left alone. Nothing is appended when no link qualifies.
   *
   * Link titles, link text and host names originate from the scraped page, so
   * they are inserted as text nodes and never parsed as HTML.
   *
   * @param DOMElement
   * @return void
   **/
  public function addFootnotes($articleContent)
  {
      $footnotesWrapper = $this->dom->createElement('div');
      $footnotesWrapper->setAttribute('id', 'readability-footnotes');
      $footnotesWrapper->innerHTML = '<h3>References</h3>';

      $articleFootnotes = $this->dom->createElement('ol');
      $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
      $footnotesWrapper->appendChild($articleFootnotes);

      // Live list: each reference link inserted below shows up as the next
      // item and is skipped through its readability-DoNotFootnote class.
      $articleLinks = $articleContent->getElementsByTagName('a');
      $linkCount = 0;
      for ($i = 0; $i < $articleLinks->length; $i++) {
          $articleLink = $articleLinks->item($i);
          $linkText = $this->getInnerText($articleLink);

          // Decide whether to skip before cloning or allocating anything.
          if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
              continue;
          }
          $linkCount++;

          // Clone first, so the footnote copy does not pick up the style and
          // name attributes added to the inline link further down.
          $footnoteLink = $articleLink->cloneNode(true);
          $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
          if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

          /** Add a superscript reference after the article link */
          $refLink = $this->dom->createElement('a');
          $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
          $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
          $refLink->setAttribute('class', 'readability-DoNotFootnote');
          $refLink->setAttribute('style', 'color: inherit;');
          $following = $articleLink->nextSibling;
          if ($following === null) {
              $articleLink->parentNode->appendChild($refLink);
          } else {
              $articleLink->parentNode->insertBefore($refLink, $following);
          }

          $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
          $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

          /** Build the list entry: back-reference, link, optional domain */
          $footnote = $this->dom->createElement('li');
          $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

          $label = $footnoteLink->getAttribute('title');
          if ($label == '') {
              $label = $linkText;
          }
          // Replace the clone's children with the label as literal text.
          while ($footnoteLink->firstChild) {
              $footnoteLink->removeChild($footnoteLink->firstChild);
          }
          $footnoteLink->appendChild($this->dom->createTextNode((string) $label));
          $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
          $footnote->appendChild($footnoteLink);

          if ($linkDomain) {
              $domainNote = $this->dom->createElement('small');
              $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
              $footnote->appendChild($domainNote);
          }

          $articleFootnotes->appendChild($footnote);
      }

      if ($linkCount > 0) {
          $articleContent->appendChild($footnotesWrapper);
      }
  }
  /**
   * Reverts P elements with class 'readability-styled'
   * to text nodes - which is what they were before.
   *
   * Each matching <p> below $articleContent is replaced by a text node
   * holding the paragraph's text content.
   *
   * @param DOMElement
   * @return void
   */
  public function revertReadabilityStyledElements($articleContent)
  {
      $document = $articleContent->ownerDocument;
      $xpath = new DOMXPath($document);
      $styledParagraphs = $xpath->query('.//p[@class="readability-styled"]', $articleContent);

      // Walk backwards so nested matches are replaced before their ancestors.
      for ($i = $styledParagraphs->length - 1; $i >= 0; $i--) {
          $paragraph = $styledParagraphs->item($i);
          $plainText = $document->createTextNode($paragraph->textContent);
          $paragraph->parentNode->replaceChild($plainText, $paragraph);
      }
  }
  /**
   * Prepare the article node for display. Clean out any inline styles,
   * iframes, forms, strip extraneous <p> tags, etc.
   *
   * The order of the cleaning calls matters: the conditional clean-up of
   * tables, lists and divs runs last because the earlier steps may remove
   * junk that affects it.
   *
   * @param DOMElement
   * @return void
   */
  public function prepArticle($articleContent)
  {
      $this->cleanStyles($articleContent);
      $this->killBreaks($articleContent);
      if ($this->revertForcedParagraphElements) {
          $this->revertReadabilityStyledElements($articleContent);
      }

      /* Clean out junk from the article content */
      $this->cleanConditionally($articleContent, 'form');
      $this->clean($articleContent, 'object');
      $this->clean($articleContent, 'h1');

      /**
       * If there is only one h2, they are probably using it
       * as a header and not a subheader, so remove it since we already have a header.
       ***/
      if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
          $this->clean($articleContent, 'h2');
      }
      $this->clean($articleContent, 'iframe');
      $this->cleanHeaders($articleContent);

      /* Do these last as the previous stuff may have removed junk that will affect these */
      $this->cleanConditionally($articleContent, 'table');
      $this->cleanConditionally($articleContent, 'ul');
      $this->cleanConditionally($articleContent, 'div');

      /* Remove extra paragraphs: no text and no img/embed/object/iframe inside */
      $mediaTags = array('img', 'embed', 'object', 'iframe');
      $articleParagraphs = $articleContent->getElementsByTagName('p');
      for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
          // Fetch the node once; item() on a live list is not free.
          $paragraph = $articleParagraphs->item($i);

          $hasMedia = false;
          foreach ($mediaTags as $mediaTag) {
              if ($paragraph->getElementsByTagName($mediaTag)->length > 0) {
                  $hasMedia = true;
                  break;
              }
          }

          if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
              $paragraph->parentNode->removeChild($paragraph);
          }
      }

      /* Drop any <br> that directly precedes a paragraph */
      try {
          $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
          if ($withoutBreaks === null) {
              // PCRE failure (e.g. backtrack limit). Assigning null would
              // erase the whole article, so leave the content untouched.
              $this->dbg('Cleaning innerHTML of breaks failed: preg_replace error ' . preg_last_error() . '. Ignoring.');
          } else {
              $articleContent->innerHTML = $withoutBreaks;
          }
      }
      catch (Exception $e) {
          $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
      }
  }
  /**
   * Initialize a node with the readability object. Also checks the
   * className/id for special names to add to its score.
   *
   * The score lives in a 'readability' attribute on the node. It starts at 0,
   * is adjusted by a fixed amount for certain tag names, and finally has the
   * result of getClassWeight() added.
   *
   * @param Element
   * @return void
   **/
  protected function initializeNode($node)
  {
      $scoreAttr = $this->dom->createAttribute('readability');
      $scoreAttr->value = 0; // this is our contentScore
      $node->setAttributeNode($scoreAttr);

      // Starting adjustment per tag name; tags not listed get none.
      $tagAdjustments = array(
          'DIV'        => 5,
          'PRE'        => 3,
          'TD'         => 3,
          'BLOCKQUOTE' => 3,
          'ADDRESS'    => -3,
          'OL'         => -3,
          'UL'         => -3,
          'DL'         => -3,
          'DD'         => -3,
          'DT'         => -3,
          'LI'         => -3,
          'FORM'       => -3,
          'H1'         => -5,
          'H2'         => -5,
          'H3'         => -5,
          'H4'         => -5,
          'H5'         => -5,
          'H6'         => -5,
          'TH'         => -5,
      );

      // unsure if strtoupper is needed, but using it just in case
      $tag = strtoupper($node->tagName);
      if (isset($tagAdjustments[$tag])) {
          $scoreAttr->value += $tagAdjustments[$tag];
      }

      $scoreAttr->value += $this->getClassWeight($node);
  }
  449. /***
  450. * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
  451. * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
  452. *
  453. * @return DOMElement
  454. **/
  455. protected function grabArticle($page=null) {
  456. $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
  457. if (!$page) $page = $this->dom;
  458. $allElements = $page->getElementsByTagName('*');
  459. /**
  460. * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
  461. * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
  462. *
  463. * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
  464. * TODO: Shouldn't this be a reverse traversal?
  465. **/
  466. $node = null;
  467. $nodesToScore = array();
  468. for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
  469. //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
  470. //$node = $targetList->item($nodeIndex);
  471. $tagName = strtoupper($node->tagName);
  472. /* Remove unlikely candidates */
  473. if ($stripUnlikelyCandidates) {
  474. $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
  475. if (
  476. preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
  477. !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
  478. $tagName != 'BODY'
  479. )
  480. {
  481. $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
  482. //$nodesToRemove[] = $node;
  483. $node->parentNode->removeChild($node);
  484. $nodeIndex--;
  485. continue;
  486. }
  487. }
  488. if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
  489. $nodesToScore[] = $node;
  490. }
  491. /* Turn all divs that don't have children block level elements into p's */
  492. if ($tagName == 'DIV') {
  493. if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
  494. //$this->dbg('Altering div to p');
  495. $newNode = $this->dom->createElement('p');
  496. try {
  497. $newNode->innerHTML = $node->innerHTML;
  498. //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
  499. $node->parentNode->replaceChild($newNode, $node);
  500. $nodeIndex--;
  501. $nodesToScore[] = $node; // or $newNode?
  502. }
  503. catch(Exception $e) {
  504. $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
  505. }
  506. }
  507. else
  508. {
  509. /* EXPERIMENTAL */
  510. // TODO: change these p elements back to text nodes after processing
  511. for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
  512. $childNode = $node->childNodes->item($i);
  513. if ($childNode->nodeType == 3) { // XML_TEXT_NODE
  514. //$this->dbg('replacing text node with a p tag with the same content.');
  515. $p = $this->dom->createElement('p');
  516. $p->innerHTML = $childNode->nodeValue;
  517. $p->setAttribute('style', 'display: inline;');
  518. $p->setAttribute('class', 'readability-styled');
  519. $childNode->parentNode->replaceChild($p, $childNode);
  520. }
  521. }
  522. }
  523. }
  524. }
  525. /**
  526. * Loop through all paragraphs, and assign a score to them based on how content-y they look.
  527. * Then add their score to their parent node.
  528. *
  529. * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
  530. **/
  531. $candidates = array();
  532. for ($pt=0; $pt < count($nodesToScore); $pt++) {
  533. $parentNode = $nodesToScore[$pt]->parentNode;
  534. // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
  535. $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
  536. $innerText = $this->getInnerText($nodesToScore[$pt]);
  537. if (!$parentNode || !isset($parentNode->tagName)) {
  538. continue;
  539. }
  540. /* If this paragraph is less than 25 characters, don't even count it. */
  541. if(strlen($innerText) < 25) {
  542. continue;
  543. }
  544. /* Initialize readability data for the parent. */
  545. if (!$parentNode->hasAttribute('readability'))
  546. {
  547. $this->initializeNode($parentNode);
  548. $candidates[] = $parentNode;
  549. }
  550. /* Initialize readability data for the grandparent. */
  551. if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
  552. {
  553. $this->initializeNode($grandParentNode);
  554. $candidates[] = $grandParentNode;
  555. }
  556. $contentScore = 0;
  557. /* Add a point for the paragraph itself as a base. */
  558. $contentScore++;
  559. /* Add points for any commas within this paragraph */
  560. $contentScore += count(explode(',', $innerText));
  561. /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
  562. $contentScore += min(floor(strlen($innerText) / 100), 3);
  563. /* Add the score to the parent. The grandparent gets half. */
  564. $parentNode->getAttributeNode('readability')->value += $contentScore;
  565. if ($grandParentNode) {
  566. $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
  567. }
  568. }
  569. /**
  570. * After we've calculated scores, loop through all of the possible candidate nodes we found
  571. * and find the one with the highest score.
  572. **/
  573. $topCandidate = null;
  574. for ($c=0, $cl=count($candidates); $c < $cl; $c++)
  575. {
  576. /**
  577. * Scale the final candidates score based on link density. Good content should have a
  578. * relatively small link density (5% or less) and be mostly unaffected by this operation.
  579. **/
  580. $readability = $candidates[$c]->getAttributeNode('readability');
  581. $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
  582. $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
  583. if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
  584. $topCandidate = $candidates[$c];
  585. }
  586. }
  587. /**
  588. * If we still have no top candidate, just use the body as a last resort.
  589. * We also have to copy the body node so it is something we can modify.
  590. **/
  591. if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
  592. {
  593. $topCandidate = $this->dom->createElement('div');
  594. if ($page instanceof DOMDocument) {
  595. if (!isset($page->documentElement)) {
  596. // we don't have a body either? what a mess! :)
  597. } else {
  598. $topCandidate->innerHTML = $page->documentElement->innerHTML;
  599. $page->documentElement->innerHTML = '';
  600. $page->documentElement->appendChild($topCandidate);
  601. }
  602. } else {
  603. $topCandidate->innerHTML = $page->innerHTML;
  604. $page->innerHTML = '';
  605. $page->appendChild($topCandidate);
  606. }
  607. $this->initializeNode($topCandidate);
  608. }
  609. /**
  610. * Now that we have the top candidate, look through its siblings for content that might also be related.
  611. * Things like preambles, content split by ads that we removed, etc.
  612. **/
  613. $articleContent = $this->dom->createElement('div');
  614. $articleContent->setAttribute('id', 'readability-content');
  615. $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
  616. $siblingNodes = $topCandidate->parentNode->childNodes;
  617. if (!isset($siblingNodes)) {
  618. $siblingNodes = new stdClass;
  619. $siblingNodes->length = 0;
  620. }
  621. for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
  622. {
  623. $siblingNode = $siblingNodes->item($s);
  624. $append = false;
  625. $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
  626. //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
  627. if ($siblingNode === $topCandidate)
  628. // or if ($siblingNode->isSameNode($topCandidate))
  629. {
  630. $append = true;
  631. }
  632. $contentBonus = 0;
  633. /* Give a bonus if sibling nodes and top candidates have the example same classname */
  634. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
  635. $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
  636. }
  637. if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
  638. {
  639. $append = true;
  640. }
  641. if (strtoupper($siblingNode->nodeName) == 'P') {
  642. $linkDensity = $this->getLinkDensity($siblingNode);
  643. $nodeContent = $this->getInnerText($siblingNode);
  644. $nodeLength = strlen($nodeContent);
  645. if ($nodeLength > 80 && $linkDensity < 0.25)
  646. {
  647. $append = true;
  648. }
  649. else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
  650. {
  651. $append = true;
  652. }
  653. }
  654. if ($append)
  655. {
  656. $this->dbg('Appending node: ' . $siblingNode->nodeName);
  657. $nodeToAppend = null;
  658. $sibNodeName = strtoupper($siblingNode->nodeName);
  659. if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
  660. /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
  661. $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
  662. $nodeToAppend = $this->dom->createElement('div');
  663. try {
  664. $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
  665. $nodeToAppend->innerHTML = $siblingNode->innerHTML;
  666. }
  667. catch(Exception $e)
  668. {
  669. $this->dbg('Could not alter siblingNode to div, reverting back to original.');
  670. $nodeToAppend = $siblingNode;
  671. $s--;
  672. $sl--;
  673. }
  674. } else {
  675. $nodeToAppend = $siblingNode;
  676. $s--;
  677. $sl--;
  678. }
  679. /* To ensure a node does not interfere with readability styles, remove its classnames */
  680. $nodeToAppend->removeAttribute('class');
  681. /* Append sibling and subtract from our list because it removes the node when you append to another node */
  682. $articleContent->appendChild($nodeToAppend);
  683. }
  684. }
  685. /**
  686. * So we have all of the content that we need. Now we clean it up for presentation.
  687. **/
  688. $this->prepArticle($articleContent);
  689. /**
  690. * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
  691. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
  692. * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
  693. * finding the -right- content.
  694. **/
  695. if (strlen($this->getInnerText($articleContent, false)) < 250)
  696. {
  697. // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
  698. // in the meantime, we check and create an empty element if it's not there.
  699. if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
  700. $this->body->innerHTML = $this->bodyCache;
  701. if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
  702. $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
  703. return $this->grabArticle($this->body);
  704. }
  705. else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
  706. $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
  707. return $this->grabArticle($this->body);
  708. }
  709. else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
  710. $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
  711. return $this->grabArticle($this->body);
  712. }
  713. else {
  714. return false;
  715. }
  716. }
  717. return $articleContent;
  718. }
  /**
   * Strip every <script> element found below the given node.
   *
   * The matches are copied into a plain array before anything is detached,
   * because the list returned by getElementsByTagName() is live and would
   * otherwise change while it is being walked. Nodes are removed last-first.
   *
   * @param DOMElement $doc anything exposing getElementsByTagName(), e.g. a DOMDocument or DOMElement
   * @return void
   */
  public function removeScripts($doc)
  {
      $scriptNodes = iterator_to_array($doc->getElementsByTagName('script'));
      foreach (array_reverse($scriptNodes) as $scriptNode) {
          $scriptNode->parentNode->removeChild($scriptNode);
      }
  }
  /**
   * Return the text content of a node with leading/trailing whitespace trimmed.
   *
   * When $normalizeSpaces is true, every match of the 'normalize' regular
   * expression in the trimmed text is additionally replaced by a single space.
   *
   * @param DOMElement $e node to read; a value without a non-empty textContent yields ''
   * @param boolean $normalizeSpaces (default: true)
   * @return string
   **/
  public function getInnerText($e, $normalizeSpaces=true)
  {
      $rawText = isset($e->textContent) ? $e->textContent : '';
      if ($rawText == '') {
          return '';
      }

      $trimmed = trim($rawText);

      return $normalizeSpaces
          ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
          : $trimmed;
  }
  /**
   * Count how often the string $s occurs in the inner text of node $e.
   *
   * The text is obtained through getInnerText() with its default arguments.
   *
   * @param DOMElement $e node whose text is searched
   * @param string $s what to count. Default is ","
   * @return int number of occurrences as reported by substr_count()
   **/
  public function getCharCount($e, $s=',')
  {
      $text = $this->getInnerText($e);

      return substr_count($text, $s);
  }
  /**
   * Remove the style attribute from $e itself and from every element below it.
   *
   * getElementsByTagName('*') only yields descendants, so the root is cleaned
   * explicitly. Roots that are not element nodes (for example a DOMDocument)
   * carry no attributes and are left alone; only their descendants are cleaned.
   *
   * @param DOMElement $e root of the subtree to clean; non-object values are ignored
   * @return void
   */
  public function cleanStyles($e)
  {
      if (!is_object($e)) {
          return;
      }

      // The wildcard lookup below never includes the root node itself.
      if (isset($e->nodeType) && $e->nodeType === XML_ELEMENT_NODE) {
          $e->removeAttribute('style');
      }

      foreach ($e->getElementsByTagName('*') as $elem) {
          $elem->removeAttribute('style');
      }
  }
  /**
   * Get the density of links as a fraction of the content: the length of the
   * text inside <a> descendants divided by the length of all text in the node.
   *
   * Lengths are byte counts of getInnerText(). A node without any text gives
   * the integer 0. The quotient is deliberately left uncast (plain integer
   * division result), because grabArticle() compares the value with === 0.
   *
   * @param DOMElement $e node to measure
   * @return number ratio of link text to total text, or 0 when the node has no text
   */
  public function getLinkDensity($e)
  {
      $anchors = $e->getElementsByTagName('a');
      $totalLength = strlen($this->getInnerText($e));

      $anchorLength = 0;
      foreach ($anchors as $anchor) {
          $anchorLength += strlen($this->getInnerText($anchor));
      }

      if ($totalLength <= 0) {
          return 0;
      }

      return $anchorLength / $totalLength;
  }
  /**
   * Score an element by its class and id attributes.
   *
   * Each of the two attributes, when present and non-empty, is tested against
   * the 'negative' and the 'positive' regular expressions: a negative match
   * subtracts 25, a positive match adds 25 (both may apply to one value).
   * The weight is always 0 while FLAG_WEIGHT_CLASSES is inactive.
   *
   * @param DOMElement $e element to inspect
   * @return int accumulated weight
   */
  public function getClassWeight($e)
  {
      if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
          return 0;
      }

      $score = 0;

      /* The class name is examined first, then the ID. */
      foreach (array('class', 'id') as $attrName) {
          if (!$e->hasAttribute($attrName)) {
              continue;
          }

          $attrValue = $e->getAttribute($attrName);
          if ($attrValue == '') {
              continue;
          }

          if (preg_match($this->regexps['negative'], $attrValue)) {
              $score -= 25;
          }
          if (preg_match($this->regexps['positive'], $attrValue)) {
              $score += 25;
          }
      }

      return $score;
  }
  /**
   * Remove extraneous break tags from a node.
   *
   * Rewrites the node's innerHTML, replacing every match of the 'killBreaks'
   * regular expression with a single '<br />'.
   *
   * @param DOMElement $node element exposing a readable and writable innerHTML property
   * @return void
   */
  public function killBreaks($node)
  {
      $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
  }
  /**
   * Remove every descendant of $e whose tag name is $tag.
   *
   * For 'iframe', 'object' and 'embed' an element survives when the 'video'
   * regular expression matches either its joined attribute values or its
   * innerHTML (per the original notes: youtube/vimeo videos, which people
   * usually want to keep).
   *
   * Updated 2012-09-18 to preserve youtube/vimeo iframes
   *
   * @param DOMElement $e element to clean
   * @param string $tag tag name to remove
   * @return void
   */
  public function clean($e, $tag)
  {
      $candidates = $e->getElementsByTagName($tag);
      $keepVideos = in_array($tag, array('iframe', 'object', 'embed'));

      /* Walk the live list backwards so removals do not disturb pending indexes. */
      for ($idx = $candidates->length - 1; $idx >= 0; $idx--) {
          $node = $candidates->item($idx);

          if ($keepVideos) {
              /* Join all attribute values, each followed by '|', into one haystack. */
              $joinedValues = '';
              foreach ($node->attributes as $attr) {
                  $joinedValues .= $attr->value . '|';
              }

              /* Attributes are checked first; the markup inside only if they did not match. */
              $looksLikeVideo = preg_match($this->regexps['video'], $joinedValues)
                  || preg_match($this->regexps['video'], $node->innerHTML);
              if ($looksLikeVideo) {
                  continue;
              }
          }

          $node->parentNode->removeChild($node);
      }
  }
  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is judged from class/id weight plus readability score, comma
   * count, counts of embedded <p>/<img>/<li>/<input>/<a> elements and video
   * embeds, link density and text length.
   *
   * Does nothing while FLAG_CLEAN_CONDITIONALLY is inactive. The lightClean
   * property selects between a more lenient and the standard rule set.
   *
   * TODO: Consider taking into account original contentScore here.
   *
   * @param DOMElement $e element to clean
   * @param string $tag tag name of the descendants to examine
   * @return void
   */
  public function cleanConditionally($e, $tag)
  {
      if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
          return;
      }

      $tagsList = $e->getElementsByTagName($tag);

      /* Traverse backwards so nodes can be removed without affecting the traversal. */
      for ($i = $tagsList->length - 1; $i >= 0; $i--) {
          $node = $tagsList->item($i);
          $hasScore = $node->hasAttribute('readability');

          $weight = $this->getClassWeight($node);
          $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
          $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

          /* A negative combined score is enough to drop the node outright. */
          if ($weight + $contentScore < 0) {
              $node->parentNode->removeChild($node);
              continue;
          }

          /* Ten or more commas: treated as real content, keep the node untouched. */
          if ($this->getCharCount($node, ',') >= 10) {
              continue;
          }

          /**
           * Not very many commas: gather counts of typical embedded elements and
           * remove the node if non-paragraph content dominates or other ominous
           * signs show up.
           **/
          $paragraphs = $node->getElementsByTagName('p')->length;
          $images = $node->getElementsByTagName('img')->length;
          $listItems = $node->getElementsByTagName('li')->length - 100;
          $inputs = $node->getElementsByTagName('input')->length;
          $anchors = $node->getElementsByTagName('a')->length;

          /* Only <embed> and <iframe> elements whose src matches the video pattern count. */
          $embedCount = 0;
          foreach (array('embed', 'iframe') as $embedTag) {
              foreach ($node->getElementsByTagName($embedTag) as $embed) {
                  if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                      $embedCount++;
                  }
              }
          }

          $linkDensity = $this->getLinkDensity($node);
          $contentLength = strlen($this->getInnerText($node));
          $isListTag = ($tag == 'ul' || $tag == 'ol');

          /* First matching rule wins; its text is logged and the node is removed. */
          $reason = null;
          if ($this->lightClean) {
              $this->dbg('Light clean...');
              if (($images > $paragraphs) && ($images > 4)) {
                  $reason = ' more than 4 images and more image elements than paragraph elements';
              } elseif ($listItems > $paragraphs && !$isListTag) {
                  $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
              } elseif ($inputs > floor($paragraphs / 3)) {
                  $reason = ' too many <input> elements';
              } elseif ($contentLength < 25 && ($embedCount === 0 && ($images === 0 || $images > 2))) {
                  $reason = ' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images';
              } elseif ($weight < 25 && $linkDensity > 0.2) {
                  $reason = ' weight smaller than 25 and link density above 0.2';
              } elseif ($anchors > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                  $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
              } elseif ($embedCount > 3) {
                  $reason = ' more than 3 embeds';
              }
          } else {
              $this->dbg('Standard clean...');
              if ($images > $paragraphs) {
                  $reason = ' more image elements than paragraph elements';
              } elseif ($listItems > $paragraphs && !$isListTag) {
                  $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
              } elseif ($inputs > floor($paragraphs / 3)) {
                  $reason = ' too many <input> elements';
              } elseif ($contentLength < 25 && ($images === 0 || $images > 2)) {
                  $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
              } elseif ($weight < 25 && $linkDensity > 0.2) {
                  $reason = ' weight smaller than 25 and link density above 0.2';
              } elseif ($weight >= 25 && $linkDensity > 0.5) {
                  $reason = ' weight above 25 but link density greater than 0.5';
              } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                  $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
              }
          }

          if ($reason !== null) {
              $this->dbg($reason);
              $node->parentNode->removeChild($node);
          }
      }
  }
  /**
   * Clean out spurious headers from an Element.
   *
   * Only <h1> and <h2> descendants are examined. A header is removed when its
   * class/id weight is negative or its link density exceeds 0.33.
   *
   * @param DOMElement $e element whose headers are checked
   * @return void
   */
  public function cleanHeaders($e)
  {
      foreach (array('h1', 'h2') as $headerTag) {
          $headers = $e->getElementsByTagName($headerTag);

          /* Backwards, because removing nodes shrinks the live list. */
          for ($idx = $headers->length - 1; $idx >= 0; $idx--) {
              $header = $headers->item($idx);
              $isSpurious = $this->getClassWeight($header) < 0
                  || $this->getLinkDensity($header) > 0.33;

              if ($isSpurious) {
                  $header->parentNode->removeChild($header);
              }
          }
      }
  }
  /**
   * Tell whether the given flag overlaps the currently stored flags.
   *
   * @param int $flag bit mask to test
   * @return boolean true when the bitwise AND of the stored flags and $flag is greater than 0
   */
  public function flagIsActive($flag)
  {
      $overlap = $this->flags & $flag;

      return $overlap > 0;
  }
  /**
   * Switch the given flag on by OR-ing it into the stored flags.
   *
   * @param int $flag bit mask to set
   * @return void
   */
  public function addFlag($flag)
  {
      $this->flags |= $flag;
  }
  /**
   * Switch the given flag off by AND-ing the stored flags with its complement.
   *
   * @param int $flag bit mask to clear
   * @return void
   */
  public function removeFlag($flag)
  {
      $this->flags &= ~$flag;
  }
  1008. }