From: Antonio Ospite Date: Sun, 27 Dec 2020 17:05:34 +0000 (+0100) Subject: Merge tag 'v1.4.3' into debian/master X-Git-Tag: debian/1.4.3-1~2 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/65e167aaba567ddfc1c227e7699981637dd3d1b6?hp=ebee0d3e09b8e25f371519b72b24612c63dc8e07 Merge tag 'v1.4.3' into debian/master Release v1.4.3 --- diff --git a/NEWS b/NEWS index a0fcc6c..db1b1ee 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +News for v1.4.3: +================ + + * Fix scraping twitter.com again by impersonating a Google crawler + * Add check for http response code and return failure for error codes + * Return failure when instagram.com redirects to login page + News for v1.4.2: ================ diff --git a/README b/README index 7d47e8e..d420f7a 100644 --- a/README +++ b/README @@ -56,5 +56,5 @@ Example of use with identi.ca: [2] http://lzone.de/liferea/ -Tweeper is licensed under the GPLv3. +Tweeper is licensed under the GPLv3 or later. Tweeper was written by Antonio Ospite https://ao2.it diff --git a/src/Tweeper.php b/src/Tweeper.php index f1d579f..de1f474 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -36,7 +36,7 @@ date_default_timezone_set('UTC'); */ class Tweeper { - private static $userAgent = "Mozilla/5.0"; + private static $userAgent = "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)"; private static $maxConnectionTimeout = 5; private static $maxConnectionRetries = 5; @@ -87,46 +87,6 @@ class Tweeper { } /** - * Convert Twitter mobile date to the date format expected in a RSS document. - */ - public static function twitterToRssDate($date) { - // Twitter uses relative timestamps in minutes for recent tweets. - if (preg_match('/^(\d+)m$/', $date, $matches)) { - $timestamp = strtotime("+" . $matches[1] . " min", time()); - if (FALSE === $timestamp) { - $timestamp = 0; - } - } - else { - /* - * In case the time is specified put it after the date, - * to make it recognized by strptime(). - */ - if (preg_match('/(.*) - (.*)/', $date, $matches)) { - $date = $matches[2] . " " . $matches[1]; - } - - $timestamp = strtotime($date); - if (FALSE === $timestamp) { - $timestamp = 0; - } - - /* - * The twitter mobile UI usually only specifies the month and the day, so - * strtotime($date) may interpret the date as being in the future. - * - * If the date is in the future it is probably in the same day but in the - * previous year. - */ - if ($timestamp > time()) { - $timestamp = strtotime('-1 years', $timestamp); - } - } - - return Tweeper::epochToRssDate($timestamp); - } - - /** * Convert string to UpperCamelCase. */ public static function toUpperCamelCase($str, $delim = ' ') { @@ -151,6 +111,17 @@ class Tweeper { } } while (curl_errno($ch) == CURLE_OPERATION_TIMEDOUT && ++$attempt < Tweeper::$maxConnectionRetries); + $response_code = curl_getinfo($ch, CURLINFO_RESPONSE_CODE); + if (FALSE === $response_code) { + trigger_error(curl_error($ch), E_USER_WARNING); + return FALSE; + } + + if ($response_code >= 400) { + trigger_error("HTTP reponse code $response_code", E_USER_WARNING); + return FALSE; + } + return $ret; } @@ -349,6 +320,14 @@ class Tweeper { // remove it to silence an error message. unset($data["knobs"]); + // Stop here in case Instagram redirected to the login page, this can + // happen when too many consecutive requests have been made from the same + // IP. + if (array_key_exists("LoginAndSignupPage", $data["entry_data"])) { + trigger_error("Cannot open instagram page: redirected to Login page.\n", E_USER_WARNING); + return NULL; + } + $json = json_encode($data); return Tweeper::jsonToXml($json, 'instagram'); @@ -376,6 +355,9 @@ class Tweeper { $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $get_xml_host_method)) { $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]); + if (NULL === $xml_data) { + return NULL; + } $xmlDoc->loadXML($xml_data); } else { @@ -455,15 +437,7 @@ class Tweeper { return NULL; } - // Override User-Agent for twitter.com to force it to serve the mobile UI. - if ($host == "twitter.com") { - $user_agent = "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J)"; - } - else { - $user_agent = NULL; - } - - $html = Tweeper::getUrlContents($src_url, $user_agent); + $html = Tweeper::getUrlContents($src_url); if (FALSE === $html) { trigger_error("Failed to retrieve $src_url", E_USER_WARNING); return NULL; diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl index bbb3bd8..e14eaac 100644 --- a/src/rss_converter_twitter.com.xsl +++ b/src/rss_converter_twitter.com.xsl @@ -45,11 +45,6 @@ - - - - - + + - + + + + + - + + - - - - + + - + - - - - - - - - - - - - - - - - - + + + + + + + <xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)"> <xsl:value-of select="concat($user-name, ': ')"/> </xsl:if> - <!-- TODO twitter mobile UI does not have a way to detect this <xsl:if test="$item-has-video"> <xsl:text>(Video) </xsl:text> </xsl:if> - --> <!-- Prepend a space in front of the URLs which are not preceded by an open parenthesis, for aestethic reasons. @@ -176,8 +155,8 @@ <xsl:value-of select="$item-permalink"/> </guid> <pubDate> - <xsl:variable name="timestamp" select=".//td[@class='timestamp']/a|.//div[@class='metadata']/a"/> - <xsl:value-of select="php:functionString('Tweeper\Tweeper::twitterToRssDate', $timestamp)"/> + <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/> + <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/> </pubDate> <description> <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> @@ -185,7 +164,6 @@ <xsl:value-of select="concat($user-name, ':')"/> <xsl:element name="br"/> </xsl:if> - <!-- TODO twitter mobile UI does not support embedded media <xsl:if test="$item-has-video"> <xsl:text> (Video)</xsl:text> <xsl:element name="br"/> @@ -194,22 +172,18 @@ <xsl:text> (GIF)</xsl:text> <xsl:element name="br"/> </xsl:if> - --> <xsl:element name="span"> <xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute> <xsl:apply-templates select="$item-content/node()"/> </xsl:element> - - <!-- TODO twitter mobile UI does not support embedded media <xsl:if test="$show-multimedia = 1"> - <xsl:apply-templates select="$item-media"/> + <xsl:apply-templates select="$item-media/node()"/> </xsl:if> - --> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> <xsl:if test="$generate-enclosure = 1"> <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/> - <xsl:apply-templates select="$item-media" mode="enclosure"/> + <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/> </xsl:if> </item> </xsl:template> @@ -221,12 +195,12 @@ <xsl:value-of select="concat('Twitter / ', $screen-name)"/> </xsl:when> <xsl:otherwise> - <xsl:value-of select="concat('Twitter / ', normalize-space(//td[@id='search']//input/@value))"/> + <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/> </xsl:otherwise> </xsl:choose> </xsl:variable> <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/> - <xsl:variable name="channel-image" select="//table[@class='profile-details' or @class='main-tweet']//td[@class='avatar']//img/@src"/> + <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/> <rss version="2.0"> <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> @@ -239,7 +213,9 @@ <xsl:value-of select="$channel-link"/> </link> <description> - <xsl:value-of select="normalize-space(//table[@class='profile-details' or @class='main-tweet']//td[@class='details'])"/> + <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/> + <!-- The following rule should only match on hashtag URLs --> + <xsl:value-of select="normalize-space(//div[@class='SearchNavigation-textContainer'])"/> </description> <xsl:if test="$channel-image != ''"> <image> @@ -254,7 +230,12 @@ </url> </image> </xsl:if> - <xsl:apply-templates select="//div[contains(@class, 'timeline')]/table[@class='tweet ']|//div[@class='main-tweet-container']/table[@class='main-tweet']"/> + <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/> + + <!-- These rules will only match on permalink URLs --> + <xsl:apply-templates select="//div[@class='permalink-inner permalink-tweet-container']"/> + <xsl:apply-templates select="//div[@data-component-context='replies']//li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/> + </channel> </rss> </xsl:template>