From: Antonio Ospite Date: Fri, 16 Nov 2018 22:21:33 +0000 (+0100) Subject: Merge tag 'v1.4.0' into debian/master X-Git-Tag: debian/1.4.0-1~2 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/b65cf8599c904992401963be960c88ab4f4ab7ba?hp=7883a9a94cac5b7eb5dd2a5906841fa516eac7f2 Merge tag 'v1.4.0' into debian/master Release v1.4.0 --- diff --git a/NEWS b/NEWS index 33d3163..29ec569 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,17 @@ +News for v1.4.0: +================ + + * Make the images adapt to the screen width in feed readers which render the + HTML data in the description. + * Indicate if there is a GIF image in a tweet. + * Add option to enable or disable showing usernames in RSS items. + * Retry multiple times to retrieve a resource before giving up. + * Fix coding style. + * Add option to enable or disable showing multimedia content in RSS items. + * Fix generating enclosures for Dilbert.com + * Make enclosure elements validate with feedvalidator.org when the server + does not provide a Content-Length header. + News for v1.3.0: ================ diff --git a/TODO b/TODO index 7b72745..3c71811 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,3 @@ -- re-evaluate the use of trigger_error() or use a custom error handler, - because right now the code exists as soon as trigger_error() gets called and - any following code is ignored. - - write better XSL stylesheets? I am not an XSL expert - evaluate the use of the RSS element - show cards directly in RSS items for twitter.com diff --git a/autoload.php b/autoload.php index d3ebc5a..d366bbb 100644 --- a/autoload.php +++ b/autoload.php @@ -1,4 +1,5 @@ elements (disabled by default). + * @param bool $show_usernames + * Enables showing the username in front of the content for multi-user + * sites (enabled by default). Only some stylesheets supports this + * functionality (twitter, instagram, pump.io). + * @param bool $show_multimedia + * Enables showing multimedia content (images, videos) directly in the + * item description (enabled by default). Only some stylesheets supports + * this functionality (twitter, instagram, dilbert). */ - public function __construct($generate_enclosure = FALSE) { + public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE) { $this->generate_enclosure = $generate_enclosure; + $this->show_usernames = $show_usernames; + $this->show_multimedia = $show_multimedia; } /** @@ -78,12 +93,32 @@ class Tweeper { } /** + * Perform a cURL session multiple times when it fails with a timeout. + * + * @param resource $ch + * a cURL session handle. + */ + private static function curlExec($ch) { + $ret = FALSE; + $attempt = 0; + do { + $ret = curl_exec($ch); + if (FALSE === $ret) { + trigger_error(curl_error($ch), E_USER_WARNING); + } + } while (curl_errno($ch) == CURLE_OPERATION_TIMEDOUT && ++$attempt < Tweeper::$maxConnectionRetries); + + return $ret; + } + + /** * Get the contents from a URL. */ private static function getUrlContents($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => FALSE, + CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, @@ -92,10 +127,7 @@ class Tweeper { CURLOPT_HTTPHEADER => array('Accept-language: en'), CURLOPT_USERAGENT => Tweeper::$userAgent, )); - $contents = curl_exec($ch); - if (FALSE === $contents) { - trigger_error(curl_error($ch)); - } + $contents = Tweeper::curlExec($ch); curl_close($ch); return $contents; @@ -109,6 +141,7 @@ class Tweeper { curl_setopt_array($ch, array( CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, + CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, @@ -116,10 +149,16 @@ class Tweeper { CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$userAgent, )); - curl_exec($ch); + + $ret = Tweeper::curlExec($ch); + if (FALSE === $ret) { + curl_close($ch); + return FALSE; + } + $url_info = curl_getinfo($ch); if (FALSE === $url_info) { - trigger_error(curl_error($ch)); + trigger_error(curl_error($ch), E_USER_WARNING); } curl_close($ch); @@ -152,10 +191,14 @@ class Tweeper { ); $url_info = Tweeper::getUrlInfo($url); + if (FALSE === $url_info) { + trigger_error("Failed to retrieve info for URL: " . $url, E_USER_WARNING); + return ''; + } $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { - error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); + trigger_error("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url'], E_USER_WARNING); return ''; } @@ -163,10 +206,17 @@ class Tweeper { // See http://sourceforge.net/p/feedvalidator/bugs/72/ $http_url = preg_replace("/^https/", "http", $url_info['url']); + // When the server does not provide a Content-Length header, + // curl_getinfo() would return a negative value for + // "download_content_length", however RSS recommends to use 0 when the + // enclosure's size cannot be determined. + // See: https://www.feedvalidator.org/docs/error/UseZeroForUnknown.html + $length = max($url_info['download_content_length'], 0); + $dom = new DOMDocument(); $enc = $dom->createElement('enclosure'); $enc->setAttribute('url', $http_url); - $enc->setAttribute('length', $url_info['download_content_length']); + $enc->setAttribute('length', $length); $enc->setAttribute('type', $url_info['content_type']); return $enc; @@ -203,7 +253,7 @@ class Tweeper { $output .= " line $error->line"; - error_log($output); + trigger_error($output, E_USER_WARNING); } /** @@ -230,7 +280,7 @@ class Tweeper { $xml_data = $serializer->serialize($data, 'xml', $serializer_options); if (!$xml_data) { - trigger_error("Cannot serialize data", E_USER_ERROR); + trigger_error("Cannot serialize data", E_USER_WARNING); return NULL; } @@ -245,14 +295,20 @@ class Tweeper { $json_match_expr = '/window._sharedData = (.*);/'; $ret = preg_match($json_match_expr, $html, $matches); if ($ret !== 1) { - trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_WARNING); return NULL; } + $data = json_decode($matches[1], $assoc = TRUE); + // The "qe" object contains elements which will result in invalid XML // element names, so remove it. - $data = json_decode($matches[1], $assoc = TRUE); unset($data["qe"]); + + // The "knobs" object contains elements with undefined namespaces, so + // remove it to silence an error message. + unset($data["knobs"]); + $json = json_encode($data); return Tweeper::jsonToXml($json, 'instagram'); @@ -301,11 +357,15 @@ class Tweeper { private function loadStylesheet($host) { $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; if (FALSE === file_exists($stylesheet)) { - trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_WARNING); return NULL; } $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + if (FALSE === $stylesheet_contents) { + trigger_error("Cannot open $stylesheet", E_USER_WARNING); + return NULL; + } $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); @@ -313,6 +373,8 @@ class Tweeper { $xsltProcessor = new XSLTProcessor(); $xsltProcessor->registerPHPFunctions(); $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); + $xsltProcessor->setParameter('', 'show-usernames', $this->show_usernames); + $xsltProcessor->setParameter('', 'show-multimedia', $this->show_multimedia); $xsltProcessor->importStylesheet($xslDoc); return $xsltProcessor; @@ -321,25 +383,25 @@ class Tweeper { /** * Convert the site content to RSS. */ - public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) { + public function tweep($src_url, $host = NULL, $validate_scheme = TRUE) { $url = parse_url($src_url); if (FALSE === $url) { - trigger_error("Invalid URL: $src_url", E_USER_ERROR); + trigger_error("Invalid URL: $src_url", E_USER_WARNING); return NULL; } if (TRUE === $validate_scheme) { $scheme = $url["scheme"]; if (!in_array($scheme, array("http", "https"))) { - trigger_error("unsupported scheme: $scheme", E_USER_ERROR); + trigger_error("unsupported scheme: $scheme", E_USER_WARNING); return NULL; } } - // if the host is not given derive it from the URL + // If the host is not given derive it from the URL. if (NULL === $host) { if (empty($url["host"])) { - trigger_error("Invalid host in URL: $src_url", E_USER_ERROR); + trigger_error("Invalid host in URL: $src_url", E_USER_WARNING); return NULL; } // Strip the leading www. to be more forgiving on input URLs. @@ -353,6 +415,7 @@ class Tweeper { $html = Tweeper::getUrlContents($src_url); if (FALSE === $html) { + trigger_error("Failed to retrieve $src_url", E_USER_WARNING); return NULL; } @@ -367,11 +430,11 @@ class Tweeper { } $output = $xsltProcessor->transformToXML($xmlDoc); - if (FALSE === $output) { - trigger_error('XSL transformation failed.', E_USER_ERROR); + trigger_error('XSL transformation failed.', E_USER_WARNING); return NULL; } + return $output; } diff --git a/src/rss_converter_dilbert.com.xsl b/src/rss_converter_dilbert.com.xsl index dcc56af..94d7fef 100644 --- a/src/rss_converter_dilbert.com.xsl +++ b/src/rss_converter_dilbert.com.xsl @@ -34,9 +34,11 @@ + + + @@ -71,11 +73,22 @@ <![CDATA[ - {$picture-title} + + {$picture-title} + ]]> - + + diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl index a735cf6..7ead3ef 100644 --- a/src/rss_converter_facebook.com.xsl +++ b/src/rss_converter_facebook.com.xsl @@ -33,7 +33,6 @@ diff --git a/src/rss_converter_instagram.com.xsl b/src/rss_converter_instagram.com.xsl index c714b1b..855ce0b 100644 --- a/src/rss_converter_instagram.com.xsl +++ b/src/rss_converter_instagram.com.xsl @@ -21,10 +21,11 @@ + + @@ -70,7 +71,12 @@ <xsl:variable name="title-length" select="140"/> - <xsl:variable name="item-content-title" select="normalize-space(concat($screen-name, ': ', $item-content-caption))"/> + <xsl:variable name="item-content-title"> + <xsl:if test="$show-usernames = 1"> + <xsl:value-of select="concat($screen-name, ': ')"/> + </xsl:if> + <xsl:value-of select="normalize-space($item-content-caption)"/> + </xsl:variable> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> <xsl:choose> <xsl:when test="string-length($item-content-title) > $title-length"> @@ -101,7 +107,9 @@ </xsl:if> <xsl:value-of select="$item-content-caption"/> </p><br /> - <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a> + <xsl:if test="$show-multimedia = 1"> + <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a> + </xsl:if> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> <xsl:if test="$generate-enclosure = 1"> diff --git a/src/rss_converter_pump.io.xsl b/src/rss_converter_pump.io.xsl index 66e73cd..42f8ac0 100644 --- a/src/rss_converter_pump.io.xsl +++ b/src/rss_converter_pump.io.xsl @@ -22,9 +22,11 @@ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" exclude-result-prefixes="php"> + <xsl:param name="generate-enclosure"/> + <xsl:param name="show-usernames"/> + <xsl:output method="xml" indent="yes"/> <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/> @@ -37,7 +39,10 @@ <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/> <item> <title> - <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> + <xsl:if test="$show-usernames = 1"> + <xsl:value-of select="concat($user-name, ': ')"/> + </xsl:if> + <xsl:value-of select="normalize-space($item-content)"/> @@ -49,8 +54,10 @@ - <![CDATA[ + + + ]]> diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl index d1514c5..1c20e70 100644 --- a/src/rss_converter_twitter.com.xsl +++ b/src/rss_converter_twitter.com.xsl @@ -21,10 +21,11 @@ + + @@ -35,6 +36,11 @@ + @@ -67,15 +73,17 @@ too instead of the t.co redirections. --> - - - - + + + + + + + - - - - + + + @@ -84,7 +92,7 @@ - + @@ -97,7 +105,7 @@ - + @@ -120,9 +128,12 @@ + - <xsl:value-of select="concat($user-name, ': ')"/> + <xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)"> + <xsl:value-of select="concat($user-name, ': ')"/> + </xsl:if> <xsl:if test="$item-has-video"> <xsl:text>(Video) </xsl:text> </xsl:if> @@ -149,17 +160,25 @@ </pubDate> <description> <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:value-of select="concat($user-name, ':')"/> - <xsl:element name="br"/> + <xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)"> + <xsl:value-of select="concat($user-name, ':')"/> + <xsl:element name="br"/> + </xsl:if> <xsl:if test="$item-has-video"> <xsl:text> (Video)</xsl:text> <xsl:element name="br"/> </xsl:if> + <xsl:if test="$item-has-gif"> + <xsl:text> (GIF)</xsl:text> + <xsl:element name="br"/> + </xsl:if> <xsl:element name="span"> <xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute> <xsl:apply-templates select="$item-content/node()"/> </xsl:element> - <xsl:apply-templates select="$item-media/node()"/> + <xsl:if test="$show-multimedia = 1"> + <xsl:apply-templates select="$item-media/node()"/> + </xsl:if> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> <xsl:if test="$generate-enclosure = 1"> diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc index 82b3a43..37e885a 100644 --- a/tweeper.1.asciidoc +++ b/tweeper.1.asciidoc @@ -45,6 +45,14 @@ OPTIONS *-e*:: show links to supported media files in the RSS <enclosure/> element +*-m <0|1>*:: + enable or disable showing multimedia content (e.g. Twitter or Instagram + pictures) directly inside the item description. Default is 1 (enable). + +*-u <0|1>*:: + enable or disable showing usernames in front of the item for hosts which + supports it (Twitter.com/Instagram.com). Default is 1 (enable). + *-h, --help*:: show the help message diff --git a/tweeper.php b/tweeper.php index b1dd021..2d5017e 100644 --- a/tweeper.php +++ b/tweeper.php @@ -1,4 +1,5 @@ <?php + /** * @file * Tweeper - a Twitter to RSS web scraper. @@ -37,10 +38,10 @@ function is_cli() { */ function usage($argv) { if (is_cli()) { - $usage = "{$argv[0]} [-e|-h|--help] <src_url>\n"; + $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-h|--help] <src_url>\n"; } else { - $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>"); + $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>"); } return "usage: $usage"; @@ -52,19 +53,41 @@ function usage($argv) { function parse_options_cli($argv, $argc) { $options = array( 'generate_enclosure' => FALSE, + 'show_usernames' => TRUE, + 'show_multimedia' => TRUE, ); if ($argc < 2) { return $options; } - $cli_options = getopt("eh", array("help")); + $cli_options = getopt("em:u:h", array("help")); foreach ($cli_options as $opt => $val) { switch ($opt) { case 'e': $options['generate_enclosure'] = TRUE; break; + case 'm': + $ret = filter_var($val, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE); + if (NULL === $ret) { + fwrite(STDERR, "Invalid argument for the -m option.\n"); + fwrite(STDERR, usage($argv)); + exit(1); + } + $options['show_multimedia'] = $val; + break; + + case 'u': + $ret = filter_var($val, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE); + if (NULL === $ret) { + fwrite(STDERR, "Invalid argument for the -u option.\n"); + fwrite(STDERR, usage($argv)); + exit(1); + } + $options['show_usernames'] = $val; + break; + case 'h': case 'help': echo usage($argv); @@ -76,7 +99,9 @@ function parse_options_cli($argv, $argc) { } } - $options['src_url'] = $argv[count($cli_options) + 1]; + // For now assume that the URL is the lest argument, in the future we could + // switch to PHP >= 7.1 and use the $optind argument of getopt(). + $options['src_url'] = array_pop($argv); return $options; } @@ -87,6 +112,8 @@ function parse_options_cli($argv, $argc) { function parse_options_query_string() { $options = array( 'generate_enclosure' => FALSE, + 'show_usernames' => TRUE, + 'show_multimedia' => TRUE, ); if (isset($_GET['src_url'])) { @@ -97,6 +124,14 @@ function parse_options_query_string() { $options['generate_enclosure'] = $_GET['generate_enclosure'] == 1; } + if (isset($_GET['show_multimedia'])) { + $options['show_multimedia'] = $_GET['show_multimedia'] != 0; + } + + if (isset($_GET['show_usernames'])) { + $options['show_usernames'] = $_GET['show_usernames'] != 0; + } + return $options; } @@ -114,7 +149,7 @@ if (!isset($options['src_url'])) { exit(1); } -$tweeper = new Tweeper($options['generate_enclosure']); +$tweeper = new Tweeper($options['generate_enclosure'], $options['show_usernames'], $options['show_multimedia']); $output = $tweeper->tweep($options['src_url']); if (is_null($output)) { exit(1);