Merge tag 'v1.4.3' into debian/master
author Antonio Ospite <ao2@ao2.it>
Sun, 27 Dec 2020 17:05:34 +0000 (18:05 +0100)
committer Antonio Ospite <ao2@ao2.it>
Sun, 27 Dec 2020 17:05:34 +0000 (18:05 +0100)
Release v1.4.3

NEWS
README
src/Tweeper.php
src/rss_converter_twitter.com.xsl

diff --git a/NEWS b/NEWS
index a0fcc6c..db1b1ee 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,10 @@
+News for v1.4.3:
+================
+
+  * Fix scraping twitter.com again by impersonating a Google crawler
+  * Add check for HTTP response code and return failure for error codes
+  * Return failure when instagram.com redirects to login page
+
 News for v1.4.2:
 ================
 
diff --git a/README b/README
index 7d47e8e..d420f7a 100644 (file)
--- a/README
+++ b/README
@@ -56,5 +56,5 @@ Example of use with identi.ca:
 
 [2] http://lzone.de/liferea/
 
-Tweeper is licensed under the GPLv3.
+Tweeper is licensed under the GPLv3 or later.
 Tweeper was written by Antonio Ospite https://ao2.it
index f1d579f..de1f474 100644 (file)
@@ -36,7 +36,7 @@ date_default_timezone_set('UTC');
  */
 class Tweeper {
 
-  private static $userAgent = "Mozilla/5.0";
+  private static $userAgent = "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)";
   private static $maxConnectionTimeout = 5;
   private static $maxConnectionRetries = 5;
 
@@ -87,46 +87,6 @@ class Tweeper {
   }
 
   /**
-   * Convert Twitter mobile date to the date format expected in a RSS document.
-   */
-  public static function twitterToRssDate($date) {
-    // Twitter uses relative timestamps in minutes for recent tweets.
-    if (preg_match('/^(\d+)m$/', $date, $matches)) {
-      $timestamp = strtotime("+" . $matches[1] . " min", time());
-      if (FALSE === $timestamp) {
-        $timestamp = 0;
-      }
-    }
-    else {
-      /*
-       * In case the time is specified put it after the date,
-       * to make it recognized by strptime().
-       */
-      if (preg_match('/(.*) - (.*)/', $date, $matches)) {
-        $date = $matches[2] . " " . $matches[1];
-      }
-
-      $timestamp = strtotime($date);
-      if (FALSE === $timestamp) {
-        $timestamp = 0;
-      }
-
-      /*
-       * The twitter mobile UI usually only specifies the month and the day, so
-       * strtotime($date) may interpret the date as being in the future.
-       *
-       * If the date is in the future it is probably in the same day but in the
-       * previous year.
-       */
-      if ($timestamp > time()) {
-        $timestamp = strtotime('-1 years', $timestamp);
-      }
-    }
-
-    return Tweeper::epochToRssDate($timestamp);
-  }
-
-  /**
    * Convert string to UpperCamelCase.
    */
   public static function toUpperCamelCase($str, $delim = ' ') {
@@ -151,6 +111,17 @@ class Tweeper {
       }
     } while (curl_errno($ch) == CURLE_OPERATION_TIMEDOUT && ++$attempt < Tweeper::$maxConnectionRetries);
 
+    $response_code = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
+    if (FALSE === $response_code) {
+      trigger_error(curl_error($ch), E_USER_WARNING);
+      return FALSE;
+    }
+
+    if ($response_code >= 400) {
+      trigger_error("HTTP reponse code $response_code", E_USER_WARNING);
+      return FALSE;
+    }
+
     return $ret;
   }
 
@@ -349,6 +320,14 @@ class Tweeper {
     // remove it to silence an error message.
     unset($data["knobs"]);
 
+    // Stop here in case Instagram redirected to the login page; this can
+    // happen when too many consecutive requests have been made from the same
+    // IP.
+    if (array_key_exists("LoginAndSignupPage", $data["entry_data"])) {
+      trigger_error("Cannot open instagram page: redirected to Login page.\n", E_USER_WARNING);
+      return NULL;
+    }
+
     $json = json_encode($data);
 
     return Tweeper::jsonToXml($json, 'instagram');
@@ -376,6 +355,9 @@ class Tweeper {
     $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
     if (method_exists($this, $get_xml_host_method)) {
       $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]);
+      if (NULL === $xml_data) {
+        return NULL;
+      }
       $xmlDoc->loadXML($xml_data);
     }
     else {
@@ -455,15 +437,7 @@ class Tweeper {
       return NULL;
     }
 
-    // Override User-Agent for twitter.com to force it to serve the mobile UI.
-    if ($host == "twitter.com") {
-      $user_agent = "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J)";
-    }
-    else {
-      $user_agent = NULL;
-    }
-
-    $html = Tweeper::getUrlContents($src_url, $user_agent);
+    $html = Tweeper::getUrlContents($src_url);
     if (FALSE === $html) {
       trigger_error("Failed to retrieve $src_url", E_USER_WARNING);
       return NULL;
index bbb3bd8..e14eaac 100644 (file)
         </xsl:copy>
     </xsl:template>
 
-    <!-- Strip leading spaces in first text node of the tweet-text. -->
-    <xsl:template match="div[@class='tweet-text']/div/text()[1]">
-        <xsl:value-of select="substring-after(substring-after(., ' '), ' ')"/>
-    </xsl:template>
-
     <!--
          Anchors to external links provide the direct URL in the
          data-expanded-url attribute, so use this in the href attribute too
          http://stackoverflow.com/questions/21984867/
     -->
     <xsl:template match="a[@data-expanded-url]">
+        <!-- Prepend and append a white space for aesthetic reasons -->
+        <xsl:text> </xsl:text>
         <a>
             <xsl:attribute name="href">
                 <xsl:value-of select="@data-expanded-url"/>
             </xsl:attribute>
-            <xsl:value-of select="@data-expanded-url"/>
+            <!-- Also strip &nbsp; and &hellip; -->
+            <xsl:value-of select="translate(., '&#xA0;&#x2026;', '')"/>
         </a>
+        <xsl:text> </xsl:text>
     </xsl:template>
 
     <!--
     -->
     <xsl:template match="a[@data-pre-embedded='true']">
         <xsl:if test="$show-multimedia = 1">
+            <!-- Prepend and append a white space for aesthetic reasons -->
+            <xsl:text> </xsl:text>
             <a>
                 <xsl:attribute name="href">
-                    <xsl:value-of select="@data-url"/>
+                    <xsl:value-of select="concat('https://', .)"/>
                 </xsl:attribute>
                 <xsl:value-of select="concat('https://', .)"/>
             </a>
+            <xsl:text> </xsl:text>
         </xsl:if>
     </xsl:template>
 
     <!-- Present images in a more convenient way -->
-    <!-- TODO: not supported in mobile UI
-    <xsl:template match="a[@data-pre-embedded='true' and contains(@data-url, '/photo/')]">
-        <xsl:variable name="embedded-photo-url" select="concat('https://pbs.twimg.com/media/', @data-tco-id, '?format=jpg')"/>
+    <xsl:template match="div[@data-image-url]">
         <a>
             <xsl:attribute name="href">
-                <xsl:value-of select="$embedded-photo-url"/>
+                <xsl:value-of select="concat(@data-image-url, ':orig')"/>
             </xsl:attribute>
             <img style="max-width: 100%">
                 <xsl:attribute name="src">
-                    <xsl:value-of select="$embedded-photo-url"/>
+                    <xsl:value-of select="@data-image-url"/>
                 </xsl:attribute>
             </img>
         </a>
     </xsl:template>
-    -->
 
     <!-- Don't repeat background in embedded media content -->
-    <!-- TODO: not supported in mobile UI
     <xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
         <xsl:copy>
             <xsl:apply-templates select="@*"/>
             <xsl:apply-templates select="node()"/>
         </xsl:copy>
     </xsl:template>
-    -->
 
     <xsl:template match="a[@data-expanded-url]" mode="enclosure">
         <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', ./@data-expanded-url)"/>
     </xsl:template>
 
-    <xsl:template match="a[@data-pre-embedded='true']" mode="enclosure">
-        <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', @data-url)"/>
+    <xsl:template match="div[@data-image-url]" mode="enclosure">
+        <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
     </xsl:template>
 
-    <xsl:variable name="screen-name" select="normalize-space(substring-after(//table[@class='profile-details' or @class='main-tweet']//*[@class='username'], '@'))"/>
+    <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
 
-    <xsl:template match="//div[contains(@class, 'timeline')]/table[@class='tweet  ']|//div[@class='main-tweet-container']/table[@class='main-tweet']">
-        <xsl:variable name="user-name" select="normalize-space(.//*[@class='username']/text()[2])"/>
-        <xsl:variable name="item-content" select=".//div[@class='tweet-text']/div"/>
-        <xsl:variable name="item-media" select=".//a[@data-pre-embedded='true']"/>
-        <xsl:variable name="item-permalink">
-            <xsl:choose>
-                <xsl:when test="@href">
-                    <xsl:value-of select="concat($BaseURL, substring-before(@href, '?'))"/>
-                </xsl:when>
-                <xsl:otherwise>
-                    <!--
-                        The main tweet in permalink pages do not have a timestamp tag,
-                        just use the canonical URL as permalink.
-                    -->
-                    <xsl:value-of select="//link[@rel='canonical']/@href"/>
-                </xsl:otherwise>
-            </xsl:choose>
-        </xsl:variable>
-
-        <!-- TODO twitter mobile UI does not have a way to detect this
-        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia- -video')]"/>
-        <xsl:variable name="item-has-gif" select="$item-media//*[contains(@class, 'PlayableMedia- -gif')]"/>
-        -->
+    <xsl:template match="//div[@class='permalink-inner permalink-tweet-container'] | //li[@data-item-id and @data-item-type='tweet']">
+        <xsl:variable name="user-name" select=".//div[@data-tweet-id]/@data-screen-name"/>
+        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
+        <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
 
+        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
+        <xsl:variable name="item-has-gif" select="$item-media//*[contains(@class, 'PlayableMedia--gif')]"/>
         <item>
             <title>
                 <xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)">
                     <xsl:value-of select="concat($user-name, ': ')"/>
                 </xsl:if>
-                <!-- TODO twitter mobile UI does not have a way to detect this
                 <xsl:if test="$item-has-video">
                     <xsl:text>(Video) </xsl:text>
                 </xsl:if>
-                -->
                 <!--
                      Prepend a space in front of the URLs which are not
                      preceded by an open parenthesis, for aesthetic reasons.
                 <xsl:value-of select="$item-permalink"/>
             </guid>
             <pubDate>
-                <xsl:variable name="timestamp" select=".//td[@class='timestamp']/a|.//div[@class='metadata']/a"/>
-                <xsl:value-of select="php:functionString('Tweeper\Tweeper::twitterToRssDate', $timestamp)"/>
+                <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/>
             </pubDate>
             <description>
                 <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
                     <xsl:value-of select="concat($user-name, ':')"/>
                     <xsl:element name="br"/>
                 </xsl:if>
-                <!-- TODO twitter mobile UI does not support embedded media
                 <xsl:if test="$item-has-video">
                     <xsl:text> (Video)</xsl:text>
                     <xsl:element name="br"/>
                     <xsl:text> (GIF)</xsl:text>
                     <xsl:element name="br"/>
                 </xsl:if>
-                -->
                 <xsl:element name="span">
                     <xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute>
                     <xsl:apply-templates select="$item-content/node()"/>
                 </xsl:element>
-
-                <!-- TODO twitter mobile UI does not support embedded media
                 <xsl:if test="$show-multimedia = 1">
-                    <xsl:apply-templates select="$item-media"/>
+                    <xsl:apply-templates select="$item-media/node()"/>
                 </xsl:if>
-                -->
                 <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
             </description>
             <xsl:if test="$generate-enclosure = 1">
                 <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
-                <xsl:apply-templates select="$item-media" mode="enclosure"/>
+                <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
             </xsl:if>
         </item>
     </xsl:template>
                     <xsl:value-of select="concat('Twitter / ', $screen-name)"/>
                 </xsl:when>
                 <xsl:otherwise>
-                    <xsl:value-of select="concat('Twitter / ', normalize-space(//td[@id='search']//input/@value))"/>
+                    <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
                 </xsl:otherwise>
             </xsl:choose>
         </xsl:variable>
         <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
-        <xsl:variable name="channel-image" select="//table[@class='profile-details' or @class='main-tweet']//td[@class='avatar']//img/@src"/>
+        <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
                     <xsl:value-of select="$channel-link"/>
                 </link>
                 <description>
-                    <xsl:value-of select="normalize-space(//table[@class='profile-details' or @class='main-tweet']//td[@class='details'])"/>
+                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
+                    <!-- The following rule should only match on hashtag URLs -->
+                    <xsl:value-of select="normalize-space(//div[@class='SearchNavigation-textContainer'])"/>
                 </description>
                 <xsl:if test="$channel-image != ''">
                     <image>
                         </url>
                     </image>
                 </xsl:if>
-                <xsl:apply-templates select="//div[contains(@class, 'timeline')]/table[@class='tweet  ']|//div[@class='main-tweet-container']/table[@class='main-tweet']"/>
+                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
+
+                <!-- These rules will only match on permalink URLs -->
+                <xsl:apply-templates select="//div[@class='permalink-inner permalink-tweet-container']"/>
+                <xsl:apply-templates select="//div[@data-component-context='replies']//li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
+
             </channel>
         </rss>
     </xsl:template>