Merge tag 'v0.4' into debian
author Antonio Ospite <ao2@ao2.it>
Sun, 13 Sep 2015 19:29:04 +0000 (21:29 +0200)
committer Antonio Ospite <ao2@ao2.it>
Sun, 13 Sep 2015 19:29:04 +0000 (21:29 +0200)
Release v0.4

12 files changed:
INSTALL
NEWS
README
TODO
rss_converter_dilbert.com.xsl
rss_converter_facebook.com.xsl [new file with mode: 0644]
rss_converter_howtoons.com.xsl
rss_converter_instagram.com.xsl [new file with mode: 0644]
rss_converter_pump.io.xsl
rss_converter_twitter.com.xsl
tweeper.1.asciidoc
tweeper.php

diff --git a/INSTALL b/INSTALL
index a2e602d..d575bb1 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -1,3 +1,6 @@
 The recommended way to install tweeper globally is to install all its files
 under /usr/share/php/tweeper and then make a symlink to the wrapper script
 "tweeper" under /usr/bin
+
+Tweeper depends on php-xml-serializer which is used to convert json to xml for
+some sites that provide the timeline data in json rather than in usable html.
diff --git a/NEWS b/NEWS
index b183893..d50eb50 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,13 @@
+News for v0.4:
+==============
+
+  * Make the generated RSS validate with feedvalidator.org
+  * Fix support for Dilbert.com
+  * Add support for Instagram.com
+  * Add support for public pages on Facebook.com
+  * Make tweeper work with the PHP built-in web server
+  * Misc fixes to code and documentation
+
 News for v0.3:
 ==============
 
diff --git a/README b/README
index 0ebe37c..7703e64 100644 (file)
--- a/README
+++ b/README
@@ -1,10 +1,12 @@
-Tweeper is a web scraper which extracts the most recent public tweets of
-a given user from their home page on Twitter.com and formats them in RSS, so
-the information can be conveniently accessed and collected by a feed reader.
+Tweeper is a web scraper which can be used to conveniently follow the public
+activity of social network users without the need to log in or even be
+subscribed to the social network; tweeper converts the public information to
+RSS so that it can be accessed and collected by a feed reader.
 
-Since Jun 11th 2013 Twitter.com retired their API v1.0, so it's not possible
-to access a user timeline via RSS anymore, and it's also become mandatory to
-authenticate via OAuth to access this _public_ information in JSON format:
+Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has no longer
+been possible to access a user timeline via RSS, and it has also become
+mandatory to authenticate via OAuth to access this _public_ information in
+JSON format:
 
   https://dev.twitter.com/discussions/16289
   https://dev.twitter.com/discussions/11564
@@ -24,14 +26,23 @@ whom you are friend to.
 
 [1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173
 
-Tweeper can be used via web or as a command line program, for example as
-a filter in your feed reader, by passing the URL of the user's public timeline
-as the first argument.
-
 Tweeper can easily scrape sites other than Twitter, it is just a matter of
 writing an xsl stylesheet for the transformation; an example for pump.io
 activity stream is provided in rss_converter_pump.io.xsl
 
+The currently supported sites are:
+
+  * Twitter.com
+  * Pump.io based websites, like Identi.ca
+  * Dilbert.com
+  * Howtoons.com
+  * Instagram.com
+  * Facebook.com (public pages)
+
+Tweeper can be used via web or as a command line program, for example as
+a filter in your feed reader, by passing the URL of the user's public timeline
+as the first argument.
+
 Example of use on the command line:
 
   $ php tweeper.php http://twitter.com/NSACareers
diff --git a/TODO b/TODO
index 2b0f1ae..95d08a4 100644 (file)
--- a/TODO
+++ b/TODO
@@ -2,8 +2,7 @@
 - evaluate the use of the <ttl/> RSS element.
 - use the <enclosure/> element for pump.io media objects
 - use the <enclosure/> element for images on dilbert.com
-- consider using http://www.dilbert.com/fast for dilbert.com
-- debug some duplicated entries in the tweeter feeds in liferea
+- show images (or even cards) directly in RSS items for twitter.com
 - check the encoding of the tweets when UTF is used,
   maybe solvable with mb_convert_encoding()?
   See http://php.net/manual/en/domdocument.loadhtml.php
index 82c33f2..f255be1 100644 (file)
 <xsl:stylesheet version="1.0"
     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php">
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
 
     <xsl:output method="xml" indent="yes"/>
 
-    <xsl:variable name="BaseURL" select="concat('http://', //meta[@property='og:site_name']/@content)"/>
+    <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
 
-    <xsl:template match="//a[@id='strip_zoom']">
-        <xsl:variable name="picture-id" select="substring-after(./@href, '#')"/>
-        <xsl:variable name="picture-element" select="//div[@id=$picture-id]/img"/>
-        <xsl:variable name="picture-print-url" select="php:functionString('str_replace', 'zoom', 'print', $picture-element/@src)"/>
+    <xsl:template match="//section[@class='comic-item']">
+        <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
+        <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
+        <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
         <item>
             <title>
-                <xsl:value-of select="$picture-element/@title"/>
+                <xsl:value-of select="$picture-title"/>
             </title>
             <link>
-                <xsl:value-of select="concat($BaseURL, $picture-element/@src)"/>
+                <xsl:value-of select="$item-permalink"/>
             </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
             <pubDate>
-                <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', substring-after($picture-id, 'strip_enlarged_'))"/>
+                <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', normalize-space(.//date))"/>
             </pubDate>
             <description>
                 <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <img src="{$picture-print-url}" />
+                <img src="{$picture-url}" />
                 <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
             </description>
         </item>
     </xsl:template>
 
     <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
+        <xsl:variable name="channel-link" select="$BaseURL"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
             <channel>
                 <generator>Tweeper</generator>
                 <title>
-                    <xsl:value-of select="//meta[@property='og:title']/@content"/>
+                    <xsl:value-of select="$channel-title"/>
                 </title>
                 <link>
-                    <xsl:value-of select="$BaseURL"/>
+                    <xsl:value-of select="$channel-link"/>
                 </link>
                 <description>
                     <xsl:value-of select="//meta[@property='og:description']/@content"/>
                 </description>
                 <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
                     <url>
-                        <xsl:value-of select="//meta[@property='og:image']/@content"/>
+                        <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
                     </url>
                 </image>
-                <xsl:apply-templates select="//a[@id='strip_zoom']"/>
+                <xsl:apply-templates select="//section[@class='comic-item']"/>
             </channel>
         </rss>
     </xsl:template>
diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl
new file mode 100644 (file)
index 0000000..9fbc187
--- /dev/null
@@ -0,0 +1,117 @@
+<!--
+  Stylesheet to convert a Facebook public page to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
+  https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
+
+  They suggest using the Graph API but fail to mention that it no longer
+  works without authentication, so it cannot be considered an
+  _equivalent_ solution.
+
+  Luckily we've got Tweeper!
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://facebook.com</xsl:text>
+    </xsl:variable>
+
+    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+        <xsl:variable name="item-content" select=".//div[contains(@class, 'userContent')]"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, .//a[@target='']/@href)"/>
+        <item>
+            <title>
+                <xsl:variable name="item-title" select="$item-content/p"/>
+                <xsl:variable name="title-length" select="140"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
+                <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
+            </pubDate>
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$item-content/node()"/>
+                <xsl:copy-of select=".//div[@class='mtm']/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//title"/>
+        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:copy-of select="//div[@data-id='1']/node()"/>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
index 4067065..41939f1 100644 (file)
@@ -29,7 +29,8 @@
 <xsl:stylesheet version="1.0"
     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php">
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
 
     <xsl:output method="xml" indent="yes"/>
 
     </xsl:variable>
 
     <xsl:template match="//div[contains(@id, 'post-')]">
+        <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
         <item>
             <title>
                 <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
             </title>
             <link>
-                <xsl:value-of select=".//div[@class='post-headline']//a/@href"/>
+                <xsl:value-of select="$item-permalink"/>
             </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
             <pubDate>
                 <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
                 <!-- date format is MM.DD.YY -->
     </xsl:template>
 
     <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//title"/>
+        <xsl:variable name="channel-link" select="$BaseURL"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
             <channel>
                 <generator>Tweeper</generator>
                 <title>
-                    <xsl:value-of select="//title"/>
+                    <xsl:value-of select="$channel-title"/>
                 </title>
                 <link>
-                    <xsl:value-of select="$BaseURL"/>
+                    <xsl:value-of select="$channel-link"/>
                 </link>
                 <description>
                     <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
                 </description>
                 <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
                     <url>
                         <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
                     </url>
diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl
new file mode 100644 (file)
index 0000000..5f1bb7f
--- /dev/null
@@ -0,0 +1,139 @@
+<!--
+  Stylesheet to convert Instagram user timelines to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:param name="generateEnclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://instagram.com</xsl:text>
+    </xsl:variable>
+
+    <xsl:template match="display_src">
+        <xsl:value-of disable-output-escaping="yes" select="php:function('Tweeper::generate_enclosure', string(text()))"/>
+    </xsl:template>
+
+    <xsl:variable name="user-name" select="//ProfilePage/XML_Serializer_Tag/user/username"/>
+
+    <!-- Some users do not specify the full name -->
+    <xsl:variable name="full-name" select="//ProfilePage/XML_Serializer_Tag/user/full_name"/>
+    <xsl:variable name="screen-name">
+        <xsl:choose>
+            <xsl:when test="$full-name != ''">
+                <xsl:value-of select="$full-name"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of select="$user-name"/>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+    <xsl:template match="//media/nodes/XML_Serializer_Tag">
+        <xsl:variable name="item-content-image" select="./display_src"/>
+        <xsl:variable name="item-content-caption" select="./caption"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
+        <item>
+            <title>
+                <xsl:variable name="title-length" select="140"/>
+                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-content-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-content-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select="./date"/>
+                <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
+            </pubDate>
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <p>
+                    <xsl:if test="./is_video/text() = 1">
+                        (Video) 
+                    </xsl:if>
+                    <xsl:value-of select="$item-content-caption"/>
+                </p><br />
+                <a href="{$item-permalink}"><img src="{$item-content-image}" /></a>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <xsl:if test="$generateEnclosure = 1">
+                <xsl:apply-templates select="./display_src"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
+        <xsl:variable name="channel-link" select="concat($BaseURL, //__path)"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
+                    <xsl:variable name="external-url" select="//user/external_url"/>
+                    <xsl:if test="$external-url != ''">
+                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
+                    </xsl:if>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//user/profile_pic_url"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//media/nodes/XML_Serializer_Tag"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
index 94b083b..ed99713 100644 (file)
@@ -22,7 +22,8 @@
 <xsl:stylesheet version="1.0"
     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php">
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
 
     <xsl:output method="xml" indent="yes"/>
 
     <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
 
     <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
-        <xsl:variable name="activity-text" select=".//div[@class='activity-content']"/>
+        <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
+        <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
         <item>
             <title>
-                <xsl:value-of select="concat($user-name, ': ', normalize-space($activity-text))"/>
+                <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
             </title>
             <link>
-                <xsl:value-of select=".//p[@class='muted']/small/a/@href"/>
+                <xsl:value-of select="$item-permalink"/>
             </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
             <pubDate>
                 <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', .//abbr[@class='easydate']/@title)"/>
             </pubDate>
             <description>
                 <xsl:value-of select="concat($user-name, ': ')"/>
                 <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$activity-text/node()"/>
+                <xsl:copy-of select="$item-content/node()"/>
                 <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
             </description>
         </item>
     </xsl:template>
 
     <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+        <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
             <channel>
                 <generator>Tweeper</generator>
                 <title>
-                    <xsl:value-of select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+                    <xsl:value-of select="$channel-title"/>
                 </title>
                 <link>
-                    <xsl:value-of select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
+                    <xsl:value-of select="$channel-link"/>
                 </link>
                 <description>
                     <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
                 </description>
                 <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
                     <url>
                         <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
                     </url>
index a5f3c4b..9185a54 100644 (file)
 <xsl:stylesheet version="1.0"
     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php">
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
 
     <xsl:param name="generateEnclosure"/>
 
     <xsl:output method="xml" indent="yes"/>
 
-    <xsl:variable name="twitterBaseURL">
+    <xsl:variable name="BaseURL">
         <xsl:text>https://twitter.com</xsl:text>
     </xsl:variable>
 
 
     <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
 
-    <xsl:template match="//*[@data-item-type='tweet']">
+    <xsl:template match="//li[@data-item-type='tweet']">
         <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
-        <xsl:variable name="tweet-text" select=".//p[contains(@class, 'js-tweet-text')]"/>
+        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
         <item>
             <title>
-                <xsl:value-of select="concat($user-name, ': ', $tweet-text)"/>
+                <xsl:value-of select="concat($user-name, ': ', $item-content)"/>
             </title>
             <link>
-                <xsl:value-of select="concat($twitterBaseURL, .//a[contains(@class, 'js-permalink')]/@href)"/>
+                <xsl:value-of select="$item-permalink"/>
             </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
             <pubDate>
                 <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
                 <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
             <description>
                 <xsl:value-of select="concat($user-name, ': ')"/>
                 <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$tweet-text/node()"/>
+                <xsl:copy-of select="$item-content/node()"/>
                 <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
             </description>
             <xsl:if test="$generateEnclosure = 1">
-                <xsl:apply-templates select="$tweet-text//a[@data-expanded-url]"/>
+                <xsl:apply-templates select="$item-content//a[@data-expanded-url]"/>
             </xsl:if>
         </item>
     </xsl:template>
 
     <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat('Twitter / ', $screen-name)"/>
+        <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
 
         <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$twitterBaseURL" /></xsl:attribute>
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
             <channel>
                 <generator>Tweeper</generator>
                 <title>
-                    <xsl:text>Twitter / </xsl:text><xsl:value-of select="$screen-name"/>
+                    <xsl:value-of select="$channel-title"/>
                 </title>
                 <link>
-                    <xsl:value-of select="//link[@rel='canonical']/@href"/>
+                    <xsl:value-of select="$channel-link"/>
                 </link>
                 <description>
-                    <xsl:value-of select="//meta[@name='description']/@content"/>
+                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
                 </description>
                 <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
                     <url>
                         <xsl:value-of select="//a[contains(@class, 'profile-picture media-thumbnail')]/@href"/>
                     </url>
                 </image>
-                <xsl:apply-templates select="//*[@data-item-type='tweet']"/>
+                <xsl:apply-templates select="//li[@data-item-type='tweet']"/>
             </channel>
         </rss>
     </xsl:template>
index c5c5fa9..2a4a523 100644 (file)
@@ -5,7 +5,7 @@ TWEEPER(1)
 
 NAME
 ----
-tweeper - web scraper to convert a Twitter timeline to an RSS feed
+tweeper - web scraper to convert supported websites (e.g. Twitter.com) to RSS
 
 
 SYNOPSIS
@@ -16,24 +16,28 @@ SYNOPSIS
 DESCRIPTION
 -----------
 
-tweeper(1) is a web scraper which extracts the most recent public tweets of
-a given user from their home page on Twitter.com and formats them in RSS, so
-the information can be conveniently accessed and collected by a feed reader.
+tweeper(1) is a web scraper which can be used to conveniently follow the
+public activity of social network users without the need to log in or even be
+subscribed to the social network; tweeper converts the public information to
+RSS so that it can be accessed and collected by a feed reader.
 
 tweeper started as the TWitter fEEd scraPER but support for other web sites
 has been added.
 
 The sites that tweeper is able to scrape and convert to RSS are:
-  
+
 * Twitter.com
 * Pump.io based websites, like Identi.ca
 * Dilbert.com
+* Howtoons.com
+* Instagram.com
+* Facebook.com (public pages)
 
 tweeper can be used as:
 
 1. a command line tool;
 2. a filter for feed readers;
-3. a web based tool when PHP support is available in the web server.
+3. a web based tool when used with a PHP-enabled web server.
 
 
 OPTIONS
@@ -57,10 +61,22 @@ Using tweeper as a filter for the Liferea feed reader:
 
  liferea-add-feed  "|tweeper http://twitter.com/NSAcareers"
 
-Using tweeper via web (the symlink must be created only the very first time):
+To use tweeper via web there are two options (the examples assume the
+installation directory to be `/usr/share/php/tweeper/`):
+
+1. Using the PHP built-in web server:
+
+  php -S localhost:8000 -t /usr/share/php/tweeper/
++
+and then visit 'http://localhost:8000/tweeper.php' in the web browser.
+
+2. Using a generic web server with the document root in '/var/www':
 
   sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www
   xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers
++
+The symlink needs to be created only the first time tweeper is used this
+way.
 
 
 NOTES
@@ -92,7 +108,7 @@ Main web site: <http://git.ao2.it/tweeper.git>
 
 COPYING
 -------
-Copyright \(C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+Copyright \(C) 2013-2015  Antonio Ospite <ao2@ao2.it>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
index 4be5c7d..a9fce9b 100644 (file)
@@ -1,49 +1,51 @@
 <?php
 /*
  * tweeper - a Twitter to RSS web scraper
- * 
- * Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
- * 
+ *
+ * Copyright (C) 2013-2015  Antonio Ospite <ao2@ao2.it>
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+require_once 'XML/Serializer.php';
+
 date_default_timezone_set('UTC');
 
 class Tweeper {
 
   private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
 
-  public function __construct($stylesheet, $generate_enclosure = FALSE) {
-    $stylesheet_contents = $this->get_contents($stylesheet);
-
-    $xslDoc = new DOMDocument();
-    $xslDoc->loadXML($stylesheet_contents);
-
-    $this->xsltProcessor = new XSLTProcessor();
-    $this->xsltProcessor->registerPHPFunctions();
-    $this->xsltProcessor->setParameter('', 'generateEnclosure', $generate_enclosure);
-    $this->xsltProcessor->importStylesheet($xslDoc);
+  public function __construct($generate_enclosure = FALSE) {
+    $this->generate_enclosure = $generate_enclosure;
   }
 
   public static function epoch_to_gmdate($timestamp)
   {
+    if (!is_numeric($timestamp) || is_nan($timestamp)) {
+      $timestamp = 0;
+    }
+
     return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT';
   }
 
   public static function str_to_gmdate($date)
   {
     $timestamp = strtotime($date);
+    if (FALSE === $timestamp) {
+      $timestamp = 0;
+    }
+
     return Tweeper::epoch_to_gmdate($timestamp);
   }
 
@@ -104,7 +106,11 @@ class Tweeper {
       "video/ogg",
     );
 
-    $url_info = Tweeper::get_info($url);
+    // The RSS specification says that the enclosure element url must be http.
+    // See http://sourceforge.net/p/feedvalidator/bugs/72/
+    $http_url = preg_replace("/^https/", "http", $url);
+
+    $url_info = Tweeper::get_info($http_url);
 
     $supported = in_array($url_info['content_type'], $supported_content_types);
     if (!$supported) {
@@ -152,21 +158,122 @@ class Tweeper {
     error_log($output);
   }
 
-  public function tweep($uri) {
-    $html = Tweeper::get_contents($uri);
+  private function load_stylesheet($host) {
+    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
+    if (FALSE === file_exists($stylesheet)) {
+      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
+      return NULL;
+    }
+
+    $stylesheet_contents = $this->get_contents($stylesheet);
+
+    $xslDoc = new DOMDocument();
+    $xslDoc->loadXML($stylesheet_contents);
+
+    $xsltProcessor = new XSLTProcessor();
+    $xsltProcessor->registerPHPFunctions();
+    $xsltProcessor->setParameter('', 'generateEnclosure', $this->generate_enclosure);
+    $xsltProcessor->importStylesheet($xslDoc);
+
+    return $xsltProcessor;
+  }
+
+  private function json_to_xml($html, $json_match_expr, $rootName) {
+    // pre-process, convert json to XML
+    $ret = preg_match($json_match_expr, $html, $matches);
+    if ($ret !== 1) {
+      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
+      return NULL;
+    }
+
+    $data = json_decode($matches[1]);
+    if (!$data) {
+      return NULL;
+    }
+
+    $serializer_options = array (
+      'addDecl' => TRUE,
+      'encoding' => "UTF-8",
+      'indent' => '  ',
+      'rootName' => $rootName,
+    );
+
+    $serializer = new XML_Serializer($serializer_options);
+
+    $status = $serializer->serialize($data);
+    if (PEAR::isError($status)) {
+      trigger_error($status->getMessage(), E_USER_ERROR);
+      return NULL;
+    }
+
+    return $serializer->getSerializedData();
+  }
+
+  private function get_xml_instagram_com($html) {
+    return $this->json_to_xml($html, '/window._sharedData = (.*);/', 'instagram');
+  }
+
+  private function preprocess_html_facebook_com($html) {
+    $html = str_replace('<!--', '', $html);
+    $html = str_replace('-->', '', $html);
+    return $html;
+  }
 
+  private function html_to_xml($html, $host) {
     $xmlDoc = new DOMDocument();
 
     // Handle warnings and errors when loading invalid HTML.
     $xml_errors_value = libxml_use_internal_errors(true);
-    $xmlDoc->loadHTML($html);
+
+    // If there is a host-specific method to get the xml data, use it!
+    $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host);
+    if (method_exists($this, $get_xml_host_method)) {
+      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
+      $xmlDoc->loadXML($xml_data);
+    } else {
+      $xmlDoc->loadHTML($html);
+    }
+
     foreach (libxml_get_errors() as $xml_error) {
       $this->log_xml_error($xml_error);
     }
     libxml_clear_errors();
     libxml_use_internal_errors($xml_errors_value);
 
-    $output = $this->xsltProcessor->transformToXML($xmlDoc);
+    return $xmlDoc;
+  }
+
+  public function tweep($src_url) {
+    $url = parse_url($src_url);
+    if (FALSE === $url || empty($url["host"])) {
+      trigger_error("Invalid url: $src_url", E_USER_ERROR);
+      return NULL;
+    }
+
+    // Strip the leading www. to be more forgiving on input URLs
+    $host = preg_replace('/^www\./', '', $url["host"]);
+
+    $xsltProcessor = $this->load_stylesheet($host);
+    if (NULL === $xsltProcessor) {
+      return NULL;
+    }
+
+    $html = $this->get_contents($src_url);
+    if (FALSE === $html) {
+      return NULL;
+    }
+
+    $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host);
+    if (method_exists($this, $preprocess_html_host_method)) {
+      $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+    }
+
+    $xmlDoc = $this->html_to_xml($html, $host);
+    if (NULL === $xmlDoc) {
+      return NULL;
+    }
+
+    $output = $xsltProcessor->transformToXML($xmlDoc);
 
     if (FALSE === $output) {
       trigger_error('XSL transformation failed.', E_USER_ERROR);
@@ -176,12 +283,18 @@ class Tweeper {
   }
 }
 
+function is_cli()
+{
+  return (php_sapi_name() === "cli");
+}
+
 function usage($argv)
 {
-  if (php_sapi_name() != 'cli')
-    $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>");
-  else
+  if (is_cli()) {
     $usage = "{$argv[0]} [-e|-h|--help] <src_url>\n";
+  } else {
+    $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>");
+  }
 
   return "usage: $usage";
 }
@@ -232,30 +345,18 @@ function parse_options_query_string()
 }
 
 
-if (php_sapi_name() != 'cli') {
-  $options = parse_options_query_string();
-  $ERROR_STREAM = fopen('php://output', 'w');
-} else {
+if (is_cli()) {
   $options = parse_options_cli($argv, $argc);
   $ERROR_STREAM = fopen('php://stderr', 'w');
+} else {
+  $options = parse_options_query_string();
+  $ERROR_STREAM = fopen('php://output', 'w');
 }
 
 if (!isset($options['src_url'])) {
-  fwrite($ERROR_STREAM, usage($argv));
-  exit(1);
-}
-
-$url = parse_url($options['src_url']);
-if (FALSE === $url || empty($url["host"])) {
-  fwrite($ERROR_STREAM, "Invalid url: ${options['src_url']}\n");
-  exit(1);
-}
-
-$stylesheet = "file://" . __DIR__ . "/rss_converter_" . $url["host"] . ".xsl";
-if (FALSE === file_exists($stylesheet)) {
-  fwrite($ERROR_STREAM, "Conversion to RSS not supported: {$url["host"]}\n");
+  fwrite($ERROR_STREAM, usage(is_cli() ? $argv : NULL));
   exit(1);
 }
 
-$tweeper = new Tweeper($stylesheet, $options['generate_enclosure']);
+$tweeper = new Tweeper($options['generate_enclosure']);
 echo $tweeper->tweep($options['src_url']);