From: Antonio Ospite Date: Wed, 10 Jun 2020 21:17:19 +0000 (+0200) Subject: Merge tag 'v1.4.2' into debian/master X-Git-Tag: debian/1.4.2-1~5 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/a9526fe6a3604ebdcd2113ff2829ebd6ff21806b?hp=4d768875a0371b25557d6fb0081fe24ac40aa843 Merge tag 'v1.4.2' into debian/master Release v1.4.2 --- diff --git a/NEWS b/NEWS index e98c9ad..a0fcc6c 100644 --- a/NEWS +++ b/NEWS @@ -1,8 +1,15 @@ +News for v1.4.2: +================ + + * Add option to enable or disable showing verbose output + * Add back partial support for twitter.com using the old twitter mobile UI + * Misc fixes to code and documentation + News for v1.4.1: ================ - * Enable cookie handling in cURL to fix scraping twitter.com - * Update User-Agent version to fix scraping hashtag pages on twitter.com + * Enable cookie handling in cURL to fix scraping twitter.com + * Update User-Agent version to fix scraping hashtag pages on twitter.com News for v1.4.0: ================ diff --git a/src/Tweeper.php b/src/Tweeper.php index 877e882..f1d579f 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -6,7 +6,7 @@ namespace Tweeper; * @file * Tweeper - a Twitter to RSS web scraper. * - * Copyright (C) 2013-2018 Antonio Ospite + * Copyright (C) 2013-2020 Antonio Ospite * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,7 +36,7 @@ date_default_timezone_set('UTC'); */ class Tweeper { - private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"; + private static $userAgent = "Mozilla/5.0"; private static $maxConnectionTimeout = 5; private static $maxConnectionRetries = 5; @@ -53,11 +53,14 @@ class Tweeper { * Enables showing multimedia content (images, videos) directly in the * item description (enabled by default). Only some stylesheets supports * this functionality (twitter, instagram, dilbert). + * @param bool $verbose_output + * Enables showing non-fatal errors like XML parsing errors. */ - public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE) { + public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE, $verbose_output = TRUE) { $this->generate_enclosure = $generate_enclosure; $this->show_usernames = $show_usernames; $this->show_multimedia = $show_multimedia; + $this->verbose_output = $verbose_output; } /** @@ -84,6 +87,46 @@ class Tweeper { } /** + * Convert Twitter mobile date to the date format expected in a RSS document. + */ + public static function twitterToRssDate($date) { + // Twitter uses relative timestamps in minutes for recent tweets. + if (preg_match('/^(\d+)m$/', $date, $matches)) { + $timestamp = strtotime("+" . $matches[1] . " min", time()); + if (FALSE === $timestamp) { + $timestamp = 0; + } + } + else { + /* + * In case the time is specified put it after the date, + * to make it recognized by strptime(). + */ + if (preg_match('/(.*) - (.*)/', $date, $matches)) { + $date = $matches[2] . " " . $matches[1]; + } + + $timestamp = strtotime($date); + if (FALSE === $timestamp) { + $timestamp = 0; + } + + /* + * The twitter mobile UI usually only specifies the month and the day, so + * strtotime($date) may interpret the date as being in the future. + * + * If the date is in the future it is probably in the same day but in the + * previous year. + */ + if ($timestamp > time()) { + $timestamp = strtotime('-1 years', $timestamp); + } + } + + return Tweeper::epochToRssDate($timestamp); + } + + /** * Convert string to UpperCamelCase. */ public static function toUpperCamelCase($str, $delim = ' ') { @@ -114,20 +157,18 @@ class Tweeper { /** * Get the contents from a URL. */ - private static function getUrlContents($url) { + private static function getUrlContents($url, $user_agent = NULL) { $ch = curl_init($url); - curl_setopt_array($ch, array( + curl_setopt_array($ch, [ CURLOPT_HEADER => FALSE, CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_COOKIEFILE => "", CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_HTTPHEADER => array('Accept-language: en'), - CURLOPT_USERAGENT => Tweeper::$userAgent, - )); + CURLOPT_HTTPHEADER => ['Accept-language: en'], + CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : Tweeper::$userAgent, + ]); $contents = Tweeper::curlExec($ch); curl_close($ch); @@ -137,19 +178,17 @@ class Tweeper { /** * Get the headers from a URL. */ - private static function getUrlInfo($url) { + private static function getUrlInfo($url, $user_agent = NULL) { $ch = curl_init($url); - curl_setopt_array($ch, array( + curl_setopt_array($ch, [ CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout, // Follow http redirects to get the real URL. CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_USERAGENT => Tweeper::$userAgent, - )); + CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : Tweeper::$userAgent, + ]); $ret = Tweeper::curlExec($ch); if (FALSE === $ret) { @@ -170,7 +209,7 @@ class Tweeper { * Generate an RSS element. */ public static function generateEnclosure($url) { - $supported_content_types = array( + $supported_content_types = [ "application/octet-stream", "application/ogg", "application/pdf", @@ -189,7 +228,7 @@ class Tweeper { "video/mp4", "video/mpeg", "video/ogg", - ); + ]; $url_info = Tweeper::getUrlInfo($url); if (FALSE === $url_info) { @@ -271,13 +310,13 @@ class Tweeper { $encoder = new XmlEncoder(); $normalizer = new ObjectNormalizer(); - $serializer = new Serializer(array($normalizer), array($encoder)); + $serializer = new Serializer([$normalizer], [$encoder]); - $serializer_options = array( + $serializer_options = [ 'xml_encoding' => "UTF-8", 'xml_format_output' => TRUE, 'xml_root_node_name' => $root_node_name, - ); + ]; $xml_data = $serializer->serialize($data, 'xml', $serializer_options); if (!$xml_data) { @@ -336,15 +375,17 @@ class Tweeper { // If there is a host-specific method to get the XML data, use it! $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $get_xml_host_method)) { - $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]); $xmlDoc->loadXML($xml_data); } else { $xmlDoc->loadHTML($html); } - foreach (libxml_get_errors() as $xml_error) { - Tweeper::logXmlError($xml_error); + if ($this->verbose_output) { + foreach (libxml_get_errors() as $xml_error) { + Tweeper::logXmlError($xml_error); + } } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); @@ -362,7 +403,7 @@ class Tweeper { return NULL; } - $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + $stylesheet_contents = file_get_contents($stylesheet); if (FALSE === $stylesheet_contents) { trigger_error("Cannot open $stylesheet", E_USER_WARNING); return NULL; @@ -393,7 +434,7 @@ class Tweeper { if (TRUE === $validate_scheme) { $scheme = $url["scheme"]; - if (!in_array($scheme, array("http", "https"))) { + if (!in_array($scheme, ["http", "https"])) { trigger_error("unsupported scheme: $scheme", E_USER_WARNING); return NULL; } @@ -414,7 +455,15 @@ class Tweeper { return NULL; } - $html = Tweeper::getUrlContents($src_url); + // Override User-Agent for twitter.com to force it to serve the mobile UI. + if ($host == "twitter.com") { + $user_agent = "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J)"; + } + else { + $user_agent = NULL; + } + + $html = Tweeper::getUrlContents($src_url, $user_agent); if (FALSE === $html) { trigger_error("Failed to retrieve $src_url", E_USER_WARNING); return NULL; @@ -422,7 +471,7 @@ class Tweeper { $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $preprocess_html_host_method)) { - $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + $html = call_user_func_array([$this, $preprocess_html_host_method], [$html]); } $xmlDoc = $this->htmlToXml($html, $host); diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl index 1c20e70..bbb3bd8 100644 --- a/src/rss_converter_twitter.com.xsl +++ b/src/rss_converter_twitter.com.xsl @@ -1,7 +1,7 @@ + + + + - - - - + - - - - + - - + + - - + + - + - - - - - + + + + + + + + + + + + + + + + + - - <xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)"> <xsl:value-of select="concat($user-name, ': ')"/> </xsl:if> + <!-- TODO twitter mobile UI does not have a way to detect this <xsl:if test="$item-has-video"> <xsl:text>(Video) </xsl:text> </xsl:if> + --> <!-- Prepend a space in front of the URLs which are not preceded by an open parenthesis, for aestethic reasons. @@ -155,8 +176,8 @@ <xsl:value-of select="$item-permalink"/> </guid> <pubDate> - <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/> - <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/> + <xsl:variable name="timestamp" select=".//td[@class='timestamp']/a|.//div[@class='metadata']/a"/> + <xsl:value-of select="php:functionString('Tweeper\Tweeper::twitterToRssDate', $timestamp)"/> </pubDate> <description> <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> @@ -164,6 +185,7 @@ <xsl:value-of select="concat($user-name, ':')"/> <xsl:element name="br"/> </xsl:if> + <!-- TODO twitter mobile UI does not support embedded media <xsl:if test="$item-has-video"> <xsl:text> (Video)</xsl:text> <xsl:element name="br"/> @@ -172,18 +194,22 @@ <xsl:text> (GIF)</xsl:text> <xsl:element name="br"/> </xsl:if> + --> <xsl:element name="span"> <xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute> <xsl:apply-templates select="$item-content/node()"/> </xsl:element> + + <!-- TODO twitter mobile UI does not support embedded media <xsl:if test="$show-multimedia = 1"> - <xsl:apply-templates select="$item-media/node()"/> + <xsl:apply-templates select="$item-media"/> </xsl:if> + --> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> <xsl:if test="$generate-enclosure = 1"> <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/> - <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/> + <xsl:apply-templates select="$item-media" mode="enclosure"/> </xsl:if> </item> </xsl:template> @@ -195,12 +221,12 @@ <xsl:value-of select="concat('Twitter / ', $screen-name)"/> </xsl:when> <xsl:otherwise> - <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/> + <xsl:value-of select="concat('Twitter / ', normalize-space(//td[@id='search']//input/@value))"/> </xsl:otherwise> </xsl:choose> </xsl:variable> <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/> - <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/> + <xsl:variable name="channel-image" select="//table[@class='profile-details' or @class='main-tweet']//td[@class='avatar']//img/@src"/> <rss version="2.0"> <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> @@ -213,9 +239,7 @@ <xsl:value-of select="$channel-link"/> </link> <description> - <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/> - <!-- The following rule should only match on hashtag URLs --> - <xsl:value-of select="normalize-space(//div[@class='SearchNavigation-textContainer'])"/> + <xsl:value-of select="normalize-space(//table[@class='profile-details' or @class='main-tweet']//td[@class='details'])"/> </description> <xsl:if test="$channel-image != ''"> <image> @@ -230,12 +254,7 @@ </url> </image> </xsl:if> - <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/> - - <!-- These rules will only match on permalink URLs --> - <xsl:apply-templates select="//div[@class='permalink-inner permalink-tweet-container']"/> - <xsl:apply-templates select="//div[@data-component-context='replies']//li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/> - + <xsl:apply-templates select="//div[contains(@class, 'timeline')]/table[@class='tweet ']|//div[@class='main-tweet-container']/table[@class='main-tweet']"/> </channel> </rss> </xsl:template> diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc index 37e885a..42a5d7e 100644 --- a/tweeper.1.asciidoc +++ b/tweeper.1.asciidoc @@ -53,6 +53,10 @@ OPTIONS enable or disable showing usernames in front of the item for hosts which supports it (Twitter.com/Instagram.com). Default is 1 (enable). +*-v <0|1>*:: + enable or disable showing verbose output like, for instance, non-fatal + errors and warnings from the XML parser. Default is 1 (enable). + *-h, --help*:: show the help message @@ -115,7 +119,7 @@ Main web site: <https://git.ao2.it/tweeper.git> COPYING ------- -Copyright \(C) 2013-2018 Antonio Ospite <ao2@ao2.it> +Copyright \(C) 2013-2020 Antonio Ospite <ao2@ao2.it> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/tweeper.php b/tweeper.php index 2d5017e..bcc85fe 100644 --- a/tweeper.php +++ b/tweeper.php @@ -4,7 +4,7 @@ * @file * Tweeper - a Twitter to RSS web scraper. * - * Copyright (C) 2013-2018 Antonio Ospite <ao2@ao2.it> + * Copyright (C) 2013-2020 Antonio Ospite <ao2@ao2.it> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,10 +38,10 @@ function is_cli() { */ function usage($argv) { if (is_cli()) { - $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-h|--help] <src_url>\n"; + $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-v <0|1>|-h|--help] <src_url>\n"; } else { - $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>"); + $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>&verbose_output=<0|1>"); } return "usage: $usage"; @@ -51,17 +51,18 @@ function usage($argv) { * Parse command line options. */ function parse_options_cli($argv, $argc) { - $options = array( + $options = [ 'generate_enclosure' => FALSE, 'show_usernames' => TRUE, 'show_multimedia' => TRUE, - ); + 'verbose_output' => TRUE, + ]; if ($argc < 2) { return $options; } - $cli_options = getopt("em:u:h", array("help")); + $cli_options = getopt("em:u:v:h", ["help"]); foreach ($cli_options as $opt => $val) { switch ($opt) { case 'e': @@ -88,6 +89,16 @@ function parse_options_cli($argv, $argc) { $options['show_usernames'] = $val; break; + case 'v': + $ret = filter_var($val, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE); + if (NULL === $ret) { + fwrite(STDERR, "Invalid argument for the -v option.\n"); + fwrite(STDERR, usage($argv)); + exit(1); + } + $options['verbose_output'] = $val; + break; + case 'h': case 'help': echo usage($argv); @@ -110,11 +121,12 @@ function parse_options_cli($argv, $argc) { * Parse options passed from a query string. */ function parse_options_query_string() { - $options = array( + $options = [ 'generate_enclosure' => FALSE, 'show_usernames' => TRUE, 'show_multimedia' => TRUE, - ); + 'verbose_output' => TRUE, + ]; if (isset($_GET['src_url'])) { $options['src_url'] = $_GET['src_url']; @@ -132,6 +144,10 @@ function parse_options_query_string() { $options['show_usernames'] = $_GET['show_usernames'] != 0; } + if (isset($_GET['verbose_output'])) { + $options['verbose_output'] = $_GET['verbose_output'] != 0; + } + return $options; } @@ -149,7 +165,7 @@ if (!isset($options['src_url'])) { exit(1); } -$tweeper = new Tweeper($options['generate_enclosure'], $options['show_usernames'], $options['show_multimedia']); +$tweeper = new Tweeper($options['generate_enclosure'], $options['show_usernames'], $options['show_multimedia'], $options['verbose_output']); $output = $tweeper->tweep($options['src_url']); if (is_null($output)) { exit(1);