Merge tag 'v1.3.0' into debian/master
author Antonio Ospite <ao2@ao2.it>
Wed, 6 Jun 2018 14:08:18 +0000 (16:08 +0200)
committer Antonio Ospite <ao2@ao2.it>
Wed, 6 Jun 2018 14:08:18 +0000 (16:08 +0200)
Release v1.3.0

12 files changed:
INSTALL
NEWS
TODO
autoload.php
src/Tweeper.php
src/rss_converter_dilbert.com.xsl
src/rss_converter_facebook.com.xsl
src/rss_converter_instagram.com.xsl
src/rss_converter_pump.io.xsl
src/rss_converter_twitter.com.xsl
tweeper.1.asciidoc
tweeper.php

diff --git a/INSTALL b/INSTALL
index 6c19099..7c9ccc1 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -2,11 +2,17 @@ The recommended way to install tweeper globally is to install all its files
 under /usr/share/php/tweeper and then make a symlink to the wrapper script
 "tweeper" under /usr/bin
 
+NOTES FOR PACKAGERS
+
+Even though the php json extensions are used, namely json_decode(), a direct
+dependency on php-json is not usually strictly necessary, because (at least on
+Debian) php-cli already depends on it.
+
 Tweeper depends on php-symfony-serializer which is used to convert json to xml
-for some sites which provide the timeline data in json rather than in usable
-html.
+for some sites which provide the timeline data in json rather than in directly
+transformable html.
 
-NOTE: Tweeper also depends indirectly on php-symfony-property-access because
-the code relies on the ObjectNormalizer class which requires the
-PropertyAccess component, see
+Tweeper also depends (indirectly) on php-symfony-property-access because the
+code relies on the ObjectNormalizer class which requires the PropertyAccess
+component, see
 http://symfony.com/doc/current/components/serializer.html#installation
diff --git a/NEWS b/NEWS
index cd5a3bc..33d3163 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,16 @@
+News for v1.3.0:
+================
+
+  * Fix scraping instagram.com
+  * Fix scraping twitter.com
+  * Improve scraping twitter.com hashtag pages, like for example
+    https://twitter.com/hashtag/tweeper
+  * Fix getting the channel logo URL for identi.ca/pump.io
+  * Add support for scraping Instagram hashtag pages, like for example
+    https://www.instagram.com/explore/tags/marechiaro
+  * Make the RSS feed for twitter.com hashtag pages validate with
+    feedvalidator.org
+
 News for v1.2.0:
 ================
 
diff --git a/TODO b/TODO
index 51b294b..7b72745 100644 (file)
--- a/TODO
+++ b/TODO
@@ -12,5 +12,3 @@
 
 - The dependencies on the symphony components in composer.json could be more
   relaxed like ">=2.7.0", but for now sticking to "2.7.*" is good enough.
-
-- Add support for instagram tags
diff --git a/autoload.php b/autoload.php
index 4ba7832..d3ebc5a 100644 (file)
--- a/autoload.php
+++ b/autoload.php
@@ -3,7 +3,7 @@
  * @file
  * Tweeper - some logic to allow tweeper to run with or without composer.
  *
- * Copyright (C) 2016  Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2016-2018  Antonio Ospite <ao2@ao2.it>
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/src/Tweeper.php b/src/Tweeper.php
index 566decb..50ff148 100644 (file)
--- a/src/Tweeper.php
+++ b/src/Tweeper.php
@@ -6,7 +6,7 @@ namespace Tweeper;
  * @file
  * Tweeper - a Twitter to RSS web scraper.
  *
- * Copyright (C) 2013-2016  Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,7 +36,7 @@ date_default_timezone_set('UTC');
  */
 class Tweeper {
 
-  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
+  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0";
 
   /**
    * Constructor sets up {@link $generate_enclosure}.
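diff --git a/src/rss_converter_dilbert.com.xsl b/src/rss_converter_dilbert.com.xsl
index d340183..dcc56af 100644 (file)
--- a/src/rss_converter_dilbert.com.xsl
+++ b/src/rss_converter_dilbert.com.xsl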
@@ -1,7 +1,7 @@
 <!--
   Stylesheet to convert Dilbert daily strips to RSS.
 
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+  Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
 
   This file is part of tweeper.
 
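diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl
index 520d6ce..a735cf6 100644 (file)
--- a/src/rss_converter_facebook.com.xsl
+++ b/src/rss_converter_facebook.com.xsl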
@@ -1,7 +1,7 @@
 <!--
   Stylesheet to convert a Facebook public page to RSS.
 
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+  Copyright (C) 2015-2018  Antonio Ospite <ao2@ao2.it>
 
   This file is part of tweeper.
 
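diff --git a/src/rss_converter_instagram.com.xsl b/src/rss_converter_instagram.com.xsl
index a2de8b3..c714b1b 100644 (file)
--- a/src/rss_converter_instagram.com.xsl
+++ b/src/rss_converter_instagram.com.xsl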
@@ -1,7 +1,7 @@
 <!--
   Stylesheet to convert Instagram user timelines to RSS.
 
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+  Copyright (C) 2015-2018  Antonio Ospite <ao2@ao2.it>
 
   This file is part of tweeper.
 
@@ -32,7 +32,7 @@
         <xsl:text>https://instagram.com</xsl:text>
     </xsl:variable>
 
-    <xsl:variable name="user-name" select="//ProfilePage/user/username"/>
+    <xsl:variable name="user-name" select="//ProfilePage/graphql/user/username"/>
 
     <!--
          NOTE: some users do not specify the full name.
          Remember to handle this case when using it and fall-back to the plain
          user name when appropriate.
     -->
-    <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/>
+    <xsl:variable name="full-name" select="//ProfilePage/graphql/user/full_name"/>
 
-    <xsl:variable name="location-name" select="//LocationsPage/location/name"/>
+    <xsl:variable name="location-name" select="//LocationsPage/graphql/location/name"/>
+
+    <xsl:variable name="hashtag-name" select="//TagPage/graphql/hashtag/name"/>
 
     <xsl:variable name="screen-name">
         <xsl:choose>
             <xsl:when test="$location-name != ''">
-                <xsl:variable name="location-latitude" select="//LocationsPage/location/lat"/>
-                <xsl:variable name="location-longitude" select="//LocationsPage/location/lng"/>
-                <xsl:value-of select="concat($location-name, ' (', $location-latitude, ', ', $location-longitude, ')')"/>
+                <xsl:value-of select="$location-name"/>
+            </xsl:when>
+            <xsl:when test="$hashtag-name != ''">
+                <xsl:value-of select="$hashtag-name"/>
             </xsl:when>
             <xsl:when test="$full-name != ''">
                 <xsl:value-of select="$full-name"/>
         </xsl:choose>
     </xsl:variable>
 
-    <xsl:template match="//media/nodes">
-        <xsl:variable name="item-content-image" select="./display_src"/>
-        <xsl:variable name="item-content-caption" select="./caption"/>
-        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
+    <xsl:template match="//edges/node">
+        <xsl:variable name="item-content-image" select="./display_url"/>
+        <xsl:variable name="item-content-caption" select="./edge_media_to_caption/edges/node/text"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./shortcode, '/')"/>
         <item>
             <title>
                 <xsl:variable name="title-length" select="140"/>
-                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
+                <xsl:variable name="item-content-title" select="normalize-space(concat($screen-name, ': ', $item-content-caption))"/>
                 <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
                 <xsl:choose>
                     <xsl:when test="string-length($item-content-title) > $title-length">
@@ -87,7 +90,7 @@
                 <xsl:value-of select="$item-permalink"/>
             </guid>
             <pubDate>
-                <xsl:variable name="timestamp" select="./date"/>
+                <xsl:variable name="timestamp" select="./taken_at_timestamp"/>
                 <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', $timestamp)"/>
             </pubDate>
             <description>
         <xsl:variable name="channel-link">
             <xsl:choose>
                 <xsl:when test="$location-name != ''">
-                    <xsl:variable name="location-id" select="//LocationsPage/location/id"/>
+                    <xsl:variable name="location-id" select="//LocationsPage/graphql/location/id"/>
                     <xsl:value-of select="concat($BaseURL, '/explore/locations/', $location-id)"/>
                 </xsl:when>
+                <xsl:when test="$hashtag-name != ''">
+                    <xsl:value-of select="concat($BaseURL, '/explore/tags/', $hashtag-name)"/>
+                </xsl:when>
                 <xsl:otherwise>
                     <xsl:value-of select="concat($BaseURL, '/', $user-name)"/>
                 </xsl:otherwise>
             </xsl:choose>
         </xsl:variable>
-        <xsl:variable name="channel-image" select="//ProfilePage/user/profile_pic_url"/>
+        <xsl:variable name="channel-image" select="//profile_pic_url"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
                 </link>
                 <description>
                     <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
-                    <xsl:variable name="external-url" select="//user/external_url"/>
-                    <xsl:if test="$external-url != ''">
-                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
-                    </xsl:if>
+                    <xsl:choose>
+                        <xsl:when test="$location-name != ''">
+                            <xsl:variable name="location-latitude" select="//LocationsPage/graphql/location/lat"/>
+                            <xsl:variable name="location-longitude" select="//LocationsPage/graphql/location/lng"/>
+                            <xsl:value-of select="concat($location-name, ' (', $location-latitude, ', ', $location-longitude, ')')"/>
+                        </xsl:when>
+                        <xsl:when test="$hashtag-name != ''">
+                            <xsl:value-of select="concat('#', $hashtag-name)"/>
+                        </xsl:when>
+                        <xsl:otherwise>
+                            <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
+                            <xsl:variable name="external-url" select="//user/external_url"/>
+                            <xsl:if test="$external-url != ''">
+                                <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
+                            </xsl:if>
+                        </xsl:otherwise>
+                    </xsl:choose>
                     <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
                 </description>
                 <xsl:if test="$channel-image != ''">
                         </url>
                     </image>
                 </xsl:if>
-                <xsl:apply-templates select="//ProfilePage/user/media/nodes|//LocationsPage/location/media/nodes"/>
+                <xsl:apply-templates select="//ProfilePage/graphql/user/edge_owner_to_timeline_media/edges/node|//LocationsPage/graphql/location/edge_location_to_media/edges/node|//TagPage/graphql/hashtag/edge_hashtag_to_media/edges/node"/>
             </channel>
         </rss>
     </xsl:template>
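diff --git a/src/rss_converter_pump.io.xsl b/src/rss_converter_pump.io.xsl
index bf9f674..66e73cd 100644 (file)
--- a/src/rss_converter_pump.io.xsl
+++ b/src/rss_converter_pump.io.xsl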
@@ -1,7 +1,7 @@
 <!--
   Stylesheet to convert Pump.io activity streams to RSS.
 
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+  Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
 
   This file is part of tweeper.
 
@@ -89,7 +89,7 @@
                         <xsl:value-of select="$channel-link"/>
                     </link>
                     <url>
-                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
+                        <xsl:value-of select="//div[@id='profile-block']/span/img[contains(@class, 'img-rounded media-object')]/@src"/>
                     </url>
                 </image>
                 <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
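diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl
index 44a0416..d1514c5 100644 (file)
--- a/src/rss_converter_twitter.com.xsl
+++ b/src/rss_converter_twitter.com.xsl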
@@ -1,7 +1,7 @@
 <!--
   Stylesheet to convert Twitter user timelines to RSS.
 
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+  Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
 
   This file is part of tweeper.
 
             </pubDate>
             <description>
                 <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:value-of select="concat($user-name, ':')"/>
+                <xsl:element name="br"/>
                 <xsl:if test="$item-has-video">
-                    <xsl:text>(Video)</xsl:text>
+                    <xsl:text> (Video)</xsl:text>
                     <xsl:element name="br"/>
                 </xsl:if>
                 <xsl:element name="span">
             </xsl:choose>
         </xsl:variable>
         <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
+        <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
                 </link>
                 <description>
                     <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
+                    <!-- The following rule should only match on hashtag URLs -->
+                    <xsl:value-of select="normalize-space(//div[@class='SearchNavigation-textContainer'])"/>
                 </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
-                    </url>
-                </image>
+                <xsl:if test="$channel-image != ''">
+                    <image>
+                        <title>
+                            <xsl:value-of select="$channel-title"/>
+                        </title>
+                        <link>
+                            <xsl:value-of select="$channel-link"/>
+                        </link>
+                        <url>
+                            <xsl:value-of select="$channel-image"/>
+                        </url>
+                    </image>
+                </xsl:if>
                 <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
 
                 <!-- These rules will only match on permalink URLs -->
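diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc
index 019df14..82b3a43 100644 (file)
--- a/tweeper.1.asciidoc
+++ b/tweeper.1.asciidoc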
@@ -107,7 +107,7 @@ Main web site: <https://git.ao2.it/tweeper.git>
 
 COPYING
 -------
-Copyright \(C) 2013-2016  Antonio Ospite <ao2@ao2.it>
+Copyright \(C) 2013-2018  Antonio Ospite <ao2@ao2.it>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/tweeper.php b/tweeper.php
index ff98ab7..b1dd021 100644 (file)
--- a/tweeper.php
+++ b/tweeper.php
@@ -3,7 +3,7 @@
  * @file
  * Tweeper - a Twitter to RSS web scraper.
  *
- * Copyright (C) 2013-2016  Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by