https://github.com/PressForward/pressforward
Raw File
Tip revision: dfb7c6e4217a1c80aa6025e1b6b6f04c60f404ef authored by Aram Zucker-Scharff on 23 August 2016, 17:20:26 UTC
changelog for 4.0.4
Tip revision: dfb7c6e
PF_Readability.php
<?php
namespace PressForward\Controllers;

use WP_Ajax_Response;
/**
 * Readability controller: fetches remote pages and extracts their readable content.
 */

class PF_Readability {

	/**
	 * Build a readable version of an item's text.
	 *
	 * The source URL is fetched and run through readability_object() when the
	 * supplied description is short (150 words or fewer), when the item is marked
	 * as an aggregation, or when a refresh is forced. If that produces nothing,
	 * the OpenGraph description and then the page's META description are tried.
	 * Whenever the retrieved text is shorter than the supplied description, the
	 * supplied description wins.
	 *
	 * @param array $args {
	 *     @type mixed  $force      Pass 'force' to fetch the source even when the description looks complete.
	 *     @type string $descrip    Raw-URL-encoded description/content already held for the item.
	 *     @type string $url        URL of the source item.
	 *     @type string $authorship Pass 'aggregation' to always fetch the source.
	 * }
	 * @return array Keys: 'status' (made_readable, already_readable, secured,
	 *               failed_readability, failed_readability_og or
	 *               failed_readability_og_meta), 'readable' (the resulting text)
	 *               and 'url' (the resolved URL).
	 */
	public function get_readable_text($args){
		// Read the expected keys explicitly rather than extract()ing the caller's
		// array into local scope, so that a missing key has a defined default.
		$args       = (array) $args;
		$force      = isset($args['force']) ? $args['force'] : false;
		$descrip    = isset($args['descrip']) ? $args['descrip'] : '';
		$url        = isset($args['url']) ? $args['url'] : '';
		$authorship = isset($args['authorship']) ? $args['authorship'] : '';

		set_time_limit(0);
		$url = pressforward('controller.http_tools')->resolve_full_url($url);
		$descrip = rawurldecode($descrip);
		// Magic quotes only exist before PHP 5.4 and get_magic_quotes_gpc() is gone
		// in PHP 8, so the function is only called on runtimes where it can matter.
		if (version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc()) {
			$descrip = stripslashes($descrip);
		}

		$aggregated = ($authorship == 'aggregation');
		$is_short = (str_word_count(strip_tags($descrip)) <= 150);

		// A long, non-aggregated description is used as-is unless a refresh is forced.
		if (!$is_short && !$aggregated && $force != 'force') {
			return array( 'status' => 'already_readable', 'readable' => $descrip, 'url' => $url);
		}

		$itemReadReady = $this->readability_object($url);

		// readability_object() signals a failed request with this sentinel string.
		if ('error-secured' === $itemReadReady) {
			return array( 'status' => 'secured', 'readable' => $descrip, 'url' => $url);
		}

		if ($itemReadReady) {
			$itemReadReady = $this->process_in_oembeds($url, $itemReadReady);
			return array( 'status' => 'made_readable', 'readable' => $itemReadReady, 'url' => $url);
		}

		// Readability produced nothing: fall back to page metadata.
		$read_status = 'failed_readability';
		$url = str_replace('&amp;','&', $url);

		// Try and get the OpenGraph description (fetched once and reused).
		$node = pressforward('library.opengraph')->fetch($url);
		if ($node) {
			$itemReadReady = (string) $node->description;
		} else {
			// Note the @ below. get_meta_tags() has no failure state to check
			// beforehand, it just raises a warning when the URL cannot be read.
			$metaTags = @get_meta_tags($url);
			if (is_array($metaTags)) {
				// Try and get the HEAD > META DESCRIPTION tag.
				$read_status = 'failed_readability_og';
				$itemReadReady = isset($metaTags['description']) ? (string) $metaTags['description'] : '';
			} else {
				// Nothing could be retrieved; hand back the original description.
				$read_status = 'failed_readability_og_meta';
				$itemReadReady = $descrip;
			}
		}

		// Never replace the stored text with something shorter.
		if (strlen($itemReadReady) < strlen($descrip)) {
			$itemReadReady = $descrip;
			$read_status = 'already_readable';
		}
		$itemReadReady = $this->process_in_oembeds($url, $itemReadReady);

		return array( 'status' => $read_status, 'readable' => $itemReadReady, 'url' => $url);

	}

	/**
	 * Handles a readability request via POST.
	 *
	 * Reads `read_item_id`, `post_id`, `force`, `url`, `content` and `authorship`
	 * from $_POST, builds the readable text (or loads it from a one-day
	 * transient), writes it to the post when it is longer than the submitted
	 * content, and answers with a WP_Ajax_Response before ending the request.
	 *
	 * NOTE(review): only the nonce is verified here; no capability check is made
	 * before the post is updated — confirm that is intended.
	 *
	 * @param bool $quickresponse Not used by this method.
	 * @return void The request is always terminated before returning.
	 */
	public function make_it_readable($quickresponse = false){

		// Verify nonce
		$nonce_key = PF_SLUG . '_nomination_nonce';
		$nonce = isset($_POST[$nonce_key]) ? $_POST[$nonce_key] : '';
		if ( !wp_verify_nonce($nonce, 'nomination') )
			die( __( "Nonce check failed. Please ensure you're supposed to be nominating stories.", 'pf' ) );
		ob_start();
		libxml_use_internal_errors(true);

		// Every value reported in the response gets a default up front so that
		// no code path leaves it undefined.
		$read_status = 'readable';
		$error = 'no error';
		$item_id = isset($_POST['read_item_id']) ? $_POST['read_item_id'] : '';
		$post_id = isset($_POST['post_id']) ? (int) $_POST['post_id'] : 0;
		$force = isset($_POST['force']) ? $_POST['force'] : '';
		$posted_url = isset($_POST['url']) ? $_POST['url'] : '';
		$posted_content = isset($_POST['content']) ? $_POST['content'] : '';
		$authorship = isset($_POST['authorship']) ? $_POST['authorship'] : '';
		$url = $posted_url;

		$itemReadReady = get_transient( 'item_readable_content_' . $item_id );
		if ( (false === $itemReadReady) || $force == 'force' ) {

			$args = array(
				'force' 		=> $force,
				'descrip' 		=> $posted_content,
				'url' 			=> $posted_url,
				'authorship'	=> $authorship,
				'post_id'		=> $post_id
			);

			$readable_ready = $this->get_readable_text($args);

			$read_status = $readable_ready['status'];
			$itemReadReady = $readable_ready['readable'];
			$url = $readable_ready['url'];
			// Strict comparison: a link found at offset 0 counts as present.
			if ( '' !== (string) $url && false === strpos((string) $itemReadReady, (string) $url) ){
				$itemReadReady = $this->process_in_oembeds($url, $itemReadReady);
			}

			set_transient( 'item_readable_content_' . $item_id, $itemReadReady, 60*60*24 );
		}

		$contentObj = pressforward('library.htmlchecker');
		$itemReadReady = $contentObj->closetags($itemReadReady);
		pf_log('Making readable');

		// By default the response carries the readable text itself; it is only
		// prefixed with the embed markup when the post was actually rewritten.
		$responseItemReadReady = $itemReadReady;

		# BIG FREAKING WARNING: This WILL NOT WORK if you have WP_DEBUG and WP_DEBUG_DISPLAY true and either your theme or plugins have bad functions on the save_post hook.
		if ($post_id != 0){

			$content = html_entity_decode($itemReadReady);
			$update_ready = array(
				'ID' => $post_id,
				'post_content' => $content
			);
			if ( strlen($posted_content) < strlen($content)){
				$update_check = wp_update_post($update_ready, true);
				if (!is_wp_error($update_check)){
					pressforward('controller.metas')->update_pf_meta($post_id, 'readable_status', 1);
					$error = 'no error';
				} else {
					$read_status = 'post_not_updated_readable';
					pressforward('controller.metas')->update_pf_meta($post_id, 'readable_status', 0);
					$error = $update_check->get_error_message();
				}
				$responseItemReadReady = $this->get_embed($posted_url).$itemReadReady;
			} else {
				// The post is only rewritten when the retrieved text is longer.
				$error = 'Not Updated, retrieved content is not longer than stored content.';
			}
		}

		// Collect and reset libxml's error list before responding, because the
		// response normally ends the request.
		$domDocErrors = '';
		foreach (libxml_get_errors() as $dderror){
			$domDocErrors .= ' Error: '.$dderror->code.' Line:'.$dderror->line.' '. $dderror->message;
		}
		libxml_clear_errors();

		// Take stray output out of the buffer (and close the buffer) so that it
		// is reported inside the response instead of being flushed ahead of the XML.
		$buffered = (string) ob_get_clean();

		$response = array(
			'what' => 'full_item_content',
			'action' => 'make_readable',
			'id' => $item_id,
			'data' => htmlspecialchars($responseItemReadReady),
			'supplemental' => array(
				'readable_status' => $read_status,
				'error' => $error,
				'buffered' => $buffered,
				'domDoc_errors' => $domDocErrors,
				'readable_applied_to_url' => $posted_url
			)
		);
		$xmlResponse = new WP_Ajax_Response($response);
		// send() is expected to end the request itself; die() is a safety net.
		$xmlResponse->send();
		die();
	}

	/**
	 * Runs a URL through Readability and hands back the stripped content
	 *
	 * The page is fetched, optionally cleaned with tidy, passed to the
	 * Readability library, and the extracted markup is normalised. Relative
	 * image sources are rewritten against the page's host.
	 *
	 * @since 1.7
	 * @see http://www.keyvan.net/2010/08/php-readability/
	 * @param string $url URL of the page to fetch.
	 * @return string|false The extracted markup; the string 'error-secured' when
	 *                      the request returned a WP_Error; false when nothing
	 *                      usable (at least 120 bytes) could be extracted.
	 */
	public function readability_object($url) {

		set_time_limit(0);
		$url =  pressforward('controller.http_tools')->resolve_full_url($url);

		$request = pf_de_https($url, 'wp_remote_get', array(
			'timeout' => '30',
			'user-agent' => 'AdsBot-Google (+http://www.google.com/adsbot.html)',
			'headers' => array(
				'X-PressForward' => get_site_url()
			)
		));

		if (is_wp_error($request)) {
			return 'error-secured';
		}
		if ( ! empty( $request['body'] ) ){
			$html = $request['body'];
		} elseif ( ! empty( $request ) && ( ! is_array( $request ) ) ) {
			$html = $request;
		} else {
			return false;
		}

		// Check if tidy exists to clean up the input.
		$has_tidy = function_exists('tidy_parse_string');
		if ($has_tidy) {
			$tidy = tidy_parse_string($html, array('wrap' => 0, ), 'UTF8');
			// tidy_parse_string() can fail; keep the raw HTML in that case.
			if (is_object($tidy)) {
				$tidy->cleanRepair();
				$html = $tidy->value;
			}
		}

		// Give it to Readability.
		$readabilitizer = pressforward('library.readability');
		$readability = $readabilitizer($html, $url);

		// No debug output, and links stay as links rather than footnotes.
		$readability->debug = false;
		$readability->convertLinksToFootnotes = false;

		// If Readability can't get the content, send back a FALSE to loop with.
		if ( ! $readability->init() ) {
			return false;
		}

		$content = $readability->getContent()->innerHTML;
		// If we've got tidy, let's use it.
		if ($has_tidy) {
			$tidy = tidy_parse_string($content,
				array('indent'=>true, 'show-body-only'=>true, 'wrap' => 0),
				'UTF8');
			if (is_object($tidy)) {
				$tidy->cleanRepair();
				$content = $tidy->value;
			}
		}

		$content = balanceTags($content, true);
		$content = ent2ncr($content);
		$content = convert_chars($content);

		$dom = new \DOMDocument('1.0', 'utf-8');
		$dom->preserveWhiteSpace = true;
		$dom->substituteEntities = true;
		// The fragment is remote, untrusted markup wrapped in our own root
		// element, so there is nothing legitimate for the parser to load externally.
		$dom->resolveExternals = false;
		$dom->loadXML('<fullContent>'.$content.'</fullContent>');

		// Counts the images that were rewritten or removed.
		$domRotated = 0;
		$urlArray = parse_url($url);
		$host = ( is_array($urlArray) && ! empty($urlArray['host']) ) ? $urlArray['host'] : '';

		// Copy the node list first: it is live, and removing an element while
		// iterating it directly makes the loop skip the following element.
		foreach (iterator_to_array($dom->getElementsByTagName('img')) as $image) {
			$img = $image->getAttribute('src');
			if ( '' === $host || '' === $img ) {
				continue;
			}
			// Leave absolute (http:, https:, data:, ...) and protocol-relative
			// (//host/path) sources untouched.
			if ( 0 === strpos($img, '//') || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $img) ) {
				continue;
			}
			// Root-relative sources already start with a slash; other relative
			// sources need one added after the host.
			$urlBase = 'http://' . $host;
			if ( 0 !== strpos($img, '/') ) {
				$urlBase .= '/';
			}
			// NOTE(review): only a WP_Error counts as unreachable here; the HTTP
			// status of the HEAD response is not inspected.
			if (!is_wp_error(wp_remote_head($urlBase . $img))){
				$image->setAttribute('src', $urlBase . $img);
			} elseif (!is_wp_error(wp_remote_head($url . $img))){
				$image->setAttribute('src', $url . $img);
			} else {
				$image->parentNode->removeChild($image);
			}
			$domRotated++;
		}

		if ($domRotated > 0){
			// Re-serialise only when the DOM changed, then drop the XML declaration.
			// NOTE(review): the <fullContent> wrapper element stays in the output.
			$content = $dom->saveXML();
			$content = preg_replace('/(<\?xml version="1\.0" encoding="utf-8"\?>)/is', ' ', $content);
			$content = preg_replace('/(<\?xml version="1\.0"\?>)/is', ' ', $content);
		}

		// Anything under 120 bytes is treated as a failed extraction.
		if ( 120 > strlen($content) ) {
			return false;
		}

		$contentObj = pressforward('library.htmlchecker');
		$content = $contentObj->closetags($content);

		return $this->process_in_oembeds($url, $content);
	}

	/**
	 * Prepend an item's link to its content when the link belongs to one of the
	 * oEmbed-capable providers and is not already present in the content.
	 *
	 * The link is placed on a line of its own ahead of the content.
	 *
	 * @param string $item_link    URL of the item.
	 * @param string $item_content Content to inspect and possibly prefix.
	 * @return string The content, with the link prepended when applicable.
	 */
	public function process_in_oembeds( $item_link, $item_content ){
		$providers = pressforward('schema.feed_item')->oembed_capables();

		// Nothing to add for an empty link, or when the content already contains
		// the link. strpos() is compared strictly so offset 0 counts as a match.
		if ( '' === (string) $item_link || false !== strpos((string) $item_content, (string) $item_link) ){
			return $item_content;
		}

		foreach ( (array) $providers as $provider ){
			// Skip unusable entries; an empty needle would match every link.
			if ( ! is_string($provider) || '' === $provider ){
				continue;
			}
			if ( false !== strpos($item_link, $provider) ){
				// One prepend is enough: afterwards the content contains the link.
				return "\n\n\t\t\t\t" . $item_link . "\n\n\t\t\t\t" . $item_content;
			}
		}
		return $item_content;

	}

	/**
	 * Fetch the oEmbed markup for a link, but only for links that belong to one
	 * of the oEmbed-capable providers.
	 *
	 * @param string $item_link URL of the item.
	 * @return string|false The markup returned by wp_oembed_get(), or false when
	 *                      the link matches no provider or no embed is available.
	 */
	public function get_embed( $item_link ){
		if ( '' === (string) $item_link ){
			return false;
		}

		// Check the provider list first so that the oEmbed lookup is only made
		// for links that can actually be returned.
		$providers = pressforward('schema.feed_item')->oembed_capables();
		$matches_provider = false;
		foreach ( (array) $providers as $provider ){
			// Strict comparison so a provider found at offset 0 still matches;
			// empty entries are skipped because they would match every link.
			if ( is_string($provider) && '' !== $provider && false !== strpos($item_link, $provider) ){
				$matches_provider = true;
				break;
			}
		}
		if ( ! $matches_provider ){
			return false;
		}

		$oembed = wp_oembed_get( $item_link );
		return ( false != $oembed ) ? $oembed : false;
	}
}
back to top