0byt3m1n1
Path:
/
data
/
applications
/
aps
/
tikiwiki
/
14.1-0
/
standard
/
htdocs
/
lib
/
rss
/
[
Home
]
File: pagecontentlib.php
<?php
// (c) Copyright 2002-2015 by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id: pagecontentlib.php 56769 2015-11-24 15:46:07Z nkoth $

/**
 * Fetches a remote page, extracts its main content through the Readability
 * class (when available) and rewrites image/link URLs found in that content
 * so they are absolute.
 */
class PageContentLib
{
    /**
     * Replace $data['content'] with content fetched from $data['url'].
     *
     * Nothing is fetched unless the 'page_content_fetch' preference is 'y'
     * and $data carries a non-empty 'url'. When fetching or extraction
     * fails, $data is returned untouched.
     *
     * @param array $data Item information; 'url' is read, 'content' may be written.
     * @return array The (possibly augmented) item information.
     */
    function augmentInformation($data)
    {
        global $prefs;

        // isset() avoids an "undefined index" notice when the preference was never set.
        $enabled = isset($prefs['page_content_fetch']) && $prefs['page_content_fetch'] === 'y';
        if (! $enabled || empty($data['url'])) {
            return $data;
        }

        $new = $this->grabContent($data['url']);
        if ($new) {
            $data['content'] = $new['content'];
        }

        return $data;
    }

    /**
     * Download $url and extract its title and main content.
     *
     * @param string $url Address of the page to fetch.
     * @return array|false array('title' => ..., 'content' => ...) on success;
     *                     false when there is no response, the Readability
     *                     class is unavailable, or extraction fails.
     */
    function grabContent($url)
    {
        global $prefs;

        $tikilib = TikiLib::lib('tiki');

        $client = $tikilib->get_http_client($url);
        $response = $tikilib->http_perform_request($client);
        if (! $response) {
            return false;
        }

        // Obtain the URL after redirections
        $url = (string) $client->getUri();
        $html = $response->getBody();

        // Note: PHP Readability expects UTF-8 encoded content.
        // If your content is not UTF-8 encoded, convert it
        // first before passing it to PHP Readability.
        // Both iconv() and mb_convert_encoding() can do this.

        // If we've got Tidy, let's clean up input.
        // This step is highly recommended - PHP's default HTML parser
        // often doesn't do a great job and results in strange output.
        $html = $this->tidy($html);

        // Load the Readability implementation from the configured path, if any.
        if (! empty($prefs['page_content_fetch_readability'])
            && is_file($prefs['page_content_fetch_readability'])
        ) {
            require_once($prefs['page_content_fetch_readability']);
        }

        if (! class_exists('Readability')) {
            return false;
        }

        // give it to Readability
        $readability = new Readability($html, $url);
        if (! $readability->init()) {
            // Explicit false, consistent with the other failure paths.
            return false;
        }

        $content = $this->tidy($readability->getContent()->innerHTML);
        $content = $this->replacePaths($content, $url);

        return array(
            'title' => $readability->getTitle()->textContent,
            'content' => $content,
        );
    }

    /**
     * Run $html through Tidy's clean-and-repair when the tidy extension is loaded.
     *
     * @param string $html Markup to clean; parsed as UTF-8.
     * @return string Repaired markup, or $html unchanged when Tidy is
     *                unavailable or could not parse the input.
     */
    private function tidy($html)
    {
        if (! function_exists('tidy_parse_string')) {
            return $html;
        }

        $tidy = tidy_parse_string($html, array(), 'UTF8');
        if (! $tidy) {
            // Parsing failed; keep the input rather than calling a method on false.
            return $html;
        }

        $tidy->cleanRepair();

        return $tidy->value;
    }

    /**
     * Derive the two prefixes used to resolve relative references against $url.
     *
     * From http://stackoverflow.com/questions/21201062/using-readability-api-to-scrape-most-relavant-image-from-page
     *
     * @param string $url Absolute URL of the page the content came from.
     * @return array array($base, $relative): $base is scheme://host[:port],
     *               $relative is $base plus the directory part of the path
     *               (always ending in "/"). Both are empty strings when $url
     *               has no scheme or host, so references are left untouched.
     */
    private function getUrls($url)
    {
        // Parse URL
        $parts = parse_url($url);
        if (! is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return array('', '');
        }

        // Determine Base URL, with scheme, host, and port
        $base = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            // Port 80 is only implicit for plain http; keep it for any other scheme.
            $isDefaultHttpPort = (strtolower($parts['scheme']) === 'http' && (int) $parts['port'] === 80);
            if (! $isDefaultHttpPort) {
                $base .= ':' . $parts['port'];
            }
        }

        // A URL such as "http://example.com" has no path component at all.
        $path = isset($parts['path']) ? $parts['path'] : '/';

        // Truncate the Path using the position of the last forward slash
        $lastSlash = strrpos($path, '/');
        $directory = ($lastSlash === false) ? '/' : substr($path, 0, $lastSlash + 1);

        // Return our two URLs
        return array($base, $base . $directory);
    }

    /**
     * Rewrite the src of every <img> and the href of every <a> in $html so
     * that relative references become absolute, using $url as the base.
     *
     * References left as they are: empty values, fragment-only links ("#x"),
     * protocol-relative URLs ("//host/...") and anything that already starts
     * with a URI scheme (http:, https:, mailto:, data:, ...). Elements that
     * do not carry the attribute are not given one.
     *
     * Modified from: http://stackoverflow.com/questions/21201062/using-readability-api-to-scrape-most-relavant-image-from-page
     *
     * @param string $html Markup to process.
     * @param string $url  URL of the page the markup was taken from.
     * @return string The markup as serialised by DOMDocument::saveHTML(), or
     *                $html itself when it is empty or whitespace only.
     */
    function replacePaths($html, $url)
    {
        // DOMDocument::loadHTML() rejects empty input, so there is nothing to do.
        if (trim((string) $html) === '') {
            return $html;
        }

        // Retrieve our URLs
        list($baseUrl, $relativeUrl) = $this->getUrls($url);

        $convert = function ($link) use ($baseUrl, $relativeUrl) {
            if ($link === '' || $link[0] === '#') {
                // Nothing to resolve, or an in-page anchor
                return $link;
            }
            if (substr($link, 0, 2) === '//') {
                // Missing protocol: fine, use current
                return $link;
            }
            if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link)) {
                // Already carries a scheme (http:, mailto:, data:, ...)
                return $link;
            }
            if ($link[0] === '/') {
                // Path relative to the site root
                return $baseUrl . $link;
            }

            // Path relative to the page's directory
            return $relativeUrl . $link;
        };

        // Silence parser warnings for this parse only, then restore the caller's setting.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        // NOTE(review): loadHTML() may not treat the input as UTF-8 when the markup
        // declares no charset - confirm the encoding of what reaches this point.
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $targets = array('img' => 'src', 'a' => 'href');
        foreach ($targets as $tagName => $attribute) {
            foreach ($dom->getElementsByTagName($tagName) as $node) {
                if ($node->hasAttribute($attribute)) {
                    $node->setAttribute($attribute, $convert($node->getAttribute($attribute)));
                }
            }
        }

        return $dom->saveHTML();
    }
}