X-Git-Url: https://git.ao2.it/tweeper.git/blobdiff_plain/45060bbb6b4c2d5e8d2ee461c5a0716b499dc988..fae2bb79789ff07d7cdeb6b609b3fd9b43b107b8:/src/Tweeper.php diff --git a/src/Tweeper.php b/src/Tweeper.php index f34258d..10e07e3 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -6,7 +6,7 @@ namespace Tweeper; * @file * Tweeper - a Twitter to RSS web scraper. * - * Copyright (C) 2013-2018 Antonio Ospite + * Copyright (C) 2013-2020 Antonio Ospite * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,13 +36,31 @@ date_default_timezone_set('UTC'); */ class Tweeper { - private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"; + private static $userAgent = "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J)"; + private static $maxConnectionTimeout = 5; + private static $maxConnectionRetries = 5; /** - * Constructor sets up {@link $generate_enclosure}. + * Create a new Tweeper object controlling optional settings. + * + * @param bool $generate_enclosure + * Enables the creation of enclosure elements (disabled by default). + * @param bool $show_usernames + * Enables showing the username in front of the content for multi-user + * sites (enabled by default). Only some stylesheets support this + * functionality (twitter, instagram, pump.io). + * @param bool $show_multimedia + * Enables showing multimedia content (images, videos) directly in the + * item description (enabled by default). Only some stylesheets support + * this functionality (twitter, instagram, dilbert). + * @param bool $verbose_output + * Enables showing non-fatal errors like XML parsing errors. 
*/ - public function __construct($generate_enclosure = FALSE) { + public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE, $verbose_output = TRUE) { $this->generate_enclosure = $generate_enclosure; + $this->show_usernames = $show_usernames; + $this->show_multimedia = $show_multimedia; + $this->verbose_output = $verbose_output; } /** @@ -69,6 +87,46 @@ class Tweeper { } /** + * Convert Twitter mobile date to the date format expected in a RSS document. + */ + public static function twitterToRssDate($date) { + // Twitter uses relative timestamps in minutes for recent tweets ("5m" means five minutes ago), so subtract them from the current time. + if (preg_match('/^(\d+)m$/', $date, $matches)) { + $timestamp = strtotime("-" . $matches[1] . " min", time()); + if (FALSE === $timestamp) { + $timestamp = 0; + } + } + else { + /* + * In case the time is specified put it after the date, + * to make it recognized by strtotime(). + */ + if (preg_match('/(.*) - (.*)/', $date, $matches)) { + $date = $matches[2] . " " . $matches[1]; + } + + $timestamp = strtotime($date); + if (FALSE === $timestamp) { + $timestamp = 0; + } + + /* + * The twitter mobile UI usually only specifies the month and the day, so + * strtotime($date) may interpret the date as being in the future. + * + * If the date is in the future it is probably in the same day but in the + * previous year. + */ + if ($timestamp > time()) { + $timestamp = strtotime('-1 years', $timestamp); + } + } + + return Tweeper::epochToRssDate($timestamp); + } + + /** * Convert string to UpperCamelCase. */ public static function toUpperCamelCase($str, $delim = ' ') { @@ -78,24 +136,40 @@ class Tweeper { } /** + * Perform a cURL session multiple times when it fails with a timeout. + * + * @param resource $ch + * a cURL session handle. 
+ */ + private static function curlExec($ch) { + $ret = FALSE; + $attempt = 0; + do { + $ret = curl_exec($ch); + if (FALSE === $ret) { + trigger_error(curl_error($ch), E_USER_WARNING); + } + } while (curl_errno($ch) == CURLE_OPERATION_TIMEDOUT && ++$attempt < Tweeper::$maxConnectionRetries); + + return $ret; + } + + /** * Get the contents from a URL. */ private static function getUrlContents($url) { $ch = curl_init($url); - curl_setopt_array($ch, array( + curl_setopt_array($ch, [ CURLOPT_HEADER => FALSE, + CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, + CURLOPT_COOKIEFILE => "", CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_HTTPHEADER => array('Accept-language: en'), + CURLOPT_HTTPHEADER => ['Accept-language: en'], CURLOPT_USERAGENT => Tweeper::$userAgent, - )); - $contents = curl_exec($ch); - if (FALSE === $contents) { - trigger_error(curl_error($ch)); - } + ]); + $contents = Tweeper::curlExec($ch); curl_close($ch); return $contents; @@ -106,20 +180,25 @@ class Tweeper { */ private static function getUrlInfo($url) { $ch = curl_init($url); - curl_setopt_array($ch, array( + curl_setopt_array($ch, [ CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, + CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$userAgent, - )); - curl_exec($ch); + ]); + + $ret = Tweeper::curlExec($ch); + if (FALSE === $ret) { + curl_close($ch); + return FALSE; + } + $url_info = curl_getinfo($ch); if (FALSE === $url_info) { - trigger_error(curl_error($ch)); + trigger_error(curl_error($ch), E_USER_WARNING); } curl_close($ch); @@ -130,7 +209,7 @@ class Tweeper { * Generate an RSS element. 
*/ public static function generateEnclosure($url) { - $supported_content_types = array( + $supported_content_types = [ "application/octet-stream", "application/ogg", "application/pdf", @@ -149,13 +228,17 @@ class Tweeper { "video/mp4", "video/mpeg", "video/ogg", - ); + ]; $url_info = Tweeper::getUrlInfo($url); + if (FALSE === $url_info) { + trigger_error("Failed to retrieve info for URL: " . $url, E_USER_WARNING); + return ''; + } $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { - error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); + trigger_error("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url'], E_USER_WARNING); return ''; } @@ -163,10 +246,17 @@ class Tweeper { // See http://sourceforge.net/p/feedvalidator/bugs/72/ $http_url = preg_replace("/^https/", "http", $url_info['url']); + // When the server does not provide a Content-Length header, + // curl_getinfo() would return a negative value for + // "download_content_length", however RSS recommends to use 0 when the + // enclosure's size cannot be determined. 
+ // See: https://www.feedvalidator.org/docs/error/UseZeroForUnknown.html + $length = max($url_info['download_content_length'], 0); + $dom = new DOMDocument(); $enc = $dom->createElement('enclosure'); $enc->setAttribute('url', $http_url); - $enc->setAttribute('length', $url_info['download_content_length']); + $enc->setAttribute('length', $length); $enc->setAttribute('type', $url_info['content_type']); return $enc; @@ -203,7 +293,7 @@ class Tweeper { $output .= " line $error->line"; - error_log($output); + trigger_error($output, E_USER_WARNING); } /** @@ -220,17 +310,17 @@ class Tweeper { $encoder = new XmlEncoder(); $normalizer = new ObjectNormalizer(); - $serializer = new Serializer(array($normalizer), array($encoder)); + $serializer = new Serializer([$normalizer], [$encoder]); - $serializer_options = array( + $serializer_options = [ 'xml_encoding' => "UTF-8", 'xml_format_output' => TRUE, 'xml_root_node_name' => $root_node_name, - ); + ]; $xml_data = $serializer->serialize($data, 'xml', $serializer_options); if (!$xml_data) { - trigger_error("Cannot serialize data", E_USER_ERROR); + trigger_error("Cannot serialize data", E_USER_WARNING); return NULL; } @@ -245,14 +335,20 @@ class Tweeper { $json_match_expr = '/window._sharedData = (.*);/'; $ret = preg_match($json_match_expr, $html, $matches); if ($ret !== 1) { - trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_WARNING); return NULL; } + $data = json_decode($matches[1], $assoc = TRUE); + // The "qe" object contains elements which will result in invalid XML // element names, so remove it. - $data = json_decode($matches[1], $assoc = TRUE); unset($data["qe"]); + + // The "knobs" object contains elements with undefined namespaces, so + // remove it to silence an error message. 
+ unset($data["knobs"]); + $json = json_encode($data); return Tweeper::jsonToXml($json, 'instagram'); @@ -279,15 +375,17 @@ class Tweeper { // If there is a host-specific method to get the XML data, use it! $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $get_xml_host_method)) { - $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]); $xmlDoc->loadXML($xml_data); } else { $xmlDoc->loadHTML($html); } - foreach (libxml_get_errors() as $xml_error) { - Tweeper::logXmlError($xml_error); + if ($this->verbose_output) { + foreach (libxml_get_errors() as $xml_error) { + Tweeper::logXmlError($xml_error); + } } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); @@ -301,11 +399,15 @@ class Tweeper { private function loadStylesheet($host) { $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; if (FALSE === file_exists($stylesheet)) { - trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_WARNING); return NULL; } $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + if (FALSE === $stylesheet_contents) { + trigger_error("Cannot open $stylesheet", E_USER_WARNING); + return NULL; + } $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); @@ -313,6 +415,8 @@ class Tweeper { $xsltProcessor = new XSLTProcessor(); $xsltProcessor->registerPHPFunctions(); $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); + $xsltProcessor->setParameter('', 'show-usernames', $this->show_usernames); + $xsltProcessor->setParameter('', 'show-multimedia', $this->show_multimedia); $xsltProcessor->importStylesheet($xslDoc); return $xsltProcessor; @@ -321,25 +425,25 @@ class Tweeper { /** * Convert the site content to RSS. 
*/ - public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) { + public function tweep($src_url, $host = NULL, $validate_scheme = TRUE) { $url = parse_url($src_url); if (FALSE === $url) { - trigger_error("Invalid URL: $src_url", E_USER_ERROR); + trigger_error("Invalid URL: $src_url", E_USER_WARNING); return NULL; } if (TRUE === $validate_scheme) { $scheme = $url["scheme"]; - if (!in_array($scheme, array("http", "https"))) { - trigger_error("unsupported scheme: $scheme", E_USER_ERROR); + if (!in_array($scheme, ["http", "https"])) { + trigger_error("unsupported scheme: $scheme", E_USER_WARNING); return NULL; } } - // if the host is not given derive it from the URL + // If the host is not given derive it from the URL. if (NULL === $host) { if (empty($url["host"])) { - trigger_error("Invalid host in URL: $src_url", E_USER_ERROR); + trigger_error("Invalid host in URL: $src_url", E_USER_WARNING); return NULL; } // Strip the leading www. to be more forgiving on input URLs. @@ -353,12 +457,13 @@ class Tweeper { $html = Tweeper::getUrlContents($src_url); if (FALSE === $html) { + trigger_error("Failed to retrieve $src_url", E_USER_WARNING); return NULL; } $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $preprocess_html_host_method)) { - $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + $html = call_user_func_array([$this, $preprocess_html_host_method], [$html]); } $xmlDoc = $this->htmlToXml($html, $host); @@ -367,11 +472,11 @@ class Tweeper { } $output = $xsltProcessor->transformToXML($xmlDoc); - if (FALSE === $output) { - trigger_error('XSL transformation failed.', E_USER_ERROR); + trigger_error('XSL transformation failed.', E_USER_WARNING); return NULL; } + return $output; }