Add initial support for scraping Pump.io activity streams
author Antonio Ospite <ospite@studenti.unina.it>
Sat, 27 Jul 2013 14:51:38 +0000 (16:51 +0200)
committer Antonio Ospite <ospite@studenti.unina.it>
Sat, 27 Jul 2013 15:06:18 +0000 (17:06 +0200)
Use symlinks to represent alternate sites with the same structure (i.e.
same server software).

Symlinks are handy and concise. An alternative would be to introduce an
equivalence mapping, as in the patch below, but I don't really like
that approach:

  diff --git a/tweeper.php b/tweeper.php
  index a019684..eb12af2 100755
  --- a/tweeper.php
  +++ b/tweeper.php
  @@ -101,9 +101,18 @@ $url = parse_url($src_url);
   if (FALSE === $url || empty($url["host"]))
     die("Invalid url: $url\n");

  -$stylesheet = __DIR__ . "/rss_converter_" . $url["host"] . ".xsl";
  +$equivalence_map = array(
  +  "identi.ca" => "pump.io"
  +);
  +
  +if (array_key_exists($url["host"], $equivalence_map))
  +  $host = $equivalence_map[$url["host"]];
  +else
  +  $host = $url["host"];
  +
  +$stylesheet = __DIR__ . "/rss_converter_" . $host . ".xsl";
   if (FALSE === file_exists($stylesheet))
  -  die("Conversion to RSS not supported: {$url["host"]}\n");
  +  die("Conversion to RSS not supported: {$host}\n");

   $tweeper = new Tweeper($stylesheet);
   echo $tweeper->tweep($src_url);

rss_converter_identi.ca.xsl [new symlink]
rss_converter_pump.io.xsl [new file with mode: 0644]

diff --git a/rss_converter_identi.ca.xsl b/rss_converter_identi.ca.xsl
new file mode 120000 (symlink)
index 0000000..d8042a1
--- /dev/null
@@ -0,0 +1 @@
+rss_converter_pump.io.xsl
\ No newline at end of file
diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl
new file mode 100644 (file)
index 0000000..ef4b6e8
--- /dev/null
@@ -0,0 +1,75 @@
+<!--
+  Stylesheet to convert Pump.io activity streams to RSS.
+
+  Copyright (C) 2013  Antonio Ospite <ospite@studenti.unina.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<!-- To Evan, please reconsider publishing RSS output for _public_ contents -->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    exclude-result-prefixes="php">
+    <!-- The php prefix is only used to call extension functions; keep its namespace out of the generated RSS. -->
+    <xsl:output method="xml" indent="yes"/>
+    <!-- Profile id with everything up to the first ':' removed; the templates below split it around '@'. -->
+    <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
+    <!-- One RSS item per entry of the major stream; str_to_gmdate is a PHP function defined outside this file. -->
+    <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
+        <xsl:variable name="activity-text" select=".//div[@class='activity-content']"/>
+        <item>
+            <title>
+                <xsl:value-of select="concat($user-name, ': ', normalize-space($activity-text))"/>
+            </title>
+            <link>
+                <xsl:value-of select=".//p[@class='muted']/small/a/@href"/>
+            </link>
+            <pubDate>
+                <xsl:value-of select="php:functionString('str_to_gmdate', .//abbr[@class='easydate']/@title)"/>
+            </pubDate>
+            <description>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <!-- Wrap the copied activity markup in a hand-written CDATA section (markers emitted unescaped). -->
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$activity-text/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+    <!-- Entry point: channel title and link are built from the parts of user-name after and before '@'. -->
+    <xsl:template match="/">
+        <rss version="2.0">
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+                </title>
+                <link>
+                    <xsl:value-of select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
+                </link>
+                <description>
+                    <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
+                </description>
+                <image>
+                    <url>
+                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>