Merge tag 'v1.0.0' into debian
author Antonio Ospite <ao2@ao2.it>
Sun, 11 Dec 2016 09:24:00 +0000 (10:24 +0100)
committer Antonio Ospite <ao2@ao2.it>
Sun, 11 Dec 2016 09:24:00 +0000 (10:24 +0100)
Release v1.0.0

24 files changed:
Makefile
NEWS
TODO
autoload.php [new file with mode: 0644]
composer.json [new file with mode: 0644]
rss_converter_dilbert.com.xsl [deleted file]
rss_converter_facebook.com.xsl [deleted file]
rss_converter_howtoons.com.xsl [deleted file]
rss_converter_identi.ca.xsl [deleted symlink]
rss_converter_instagram.com.xsl [deleted file]
rss_converter_pump.io.xsl [deleted file]
rss_converter_twitter.com.xsl [deleted file]
src/Tweeper.php [new file with mode: 0644]
src/rss_converter_dilbert.com.xsl [new file with mode: 0644]
src/rss_converter_facebook.com.xsl [new file with mode: 0644]
src/rss_converter_howtoons.com.xsl [new file with mode: 0644]
src/rss_converter_identi.ca.xsl [new symlink]
src/rss_converter_instagram.com.xsl [new file with mode: 0644]
src/rss_converter_pump.io.xsl [new file with mode: 0644]
src/rss_converter_twitter.com.xsl [new file with mode: 0644]
tests/test_information_leakage.sh [new file with mode: 0755]
tweeper
tweeper.1.asciidoc
tweeper.php

index 4625aa8..eff450a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -21,10 +21,11 @@ installdocs: docs
 
 install: installdocs
        install -d $(DESTDIR)$(TWEEPER_DIR)
-       install -m644 *.xsl $(DESTDIR)$(TWEEPER_DIR)
        install -m644 *.php $(DESTDIR)$(TWEEPER_DIR)
        install -m755 tweeper $(DESTDIR)$(TWEEPER_DIR)
+       install -d $(DESTDIR)$(TWEEPER_DIR)/src
+       install -m644 src/* $(DESTDIR)$(TWEEPER_DIR)/src
        install -d $(DESTDIR)$(BIN_DIR)
-       ln -sf $(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper
+       ln -rsf $(DESTDIR)$(TWEEPER_DIR)/tweeper $(DESTDIR)$(BIN_DIR)/tweeper
        @echo -e "\n\nINSTALLATION COMPLETE"
-       @echo -e "Make sure '$(PHP_SCRIPT_DIR)' is in PHP include_path!\n"
+       @echo -e "Make sure '$(DESTDIR)$(PHP_SCRIPT_DIR)' is in PHP include_path!\n"
diff --git a/NEWS b/NEWS
index d125dd5..6fccebb 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,15 @@
+News for v1.0.0:
+================
+
+  * Support "application/octet-stream" as an enclosure content type
+  * Support "application/pdf" as an enclosure content type
+  * Fix information leakage by validating the URL scheme
+  * Code restructuring to make it easier to use tweeper as a library in other
+    projects
+  * Allow installing tweeper via composer, the packagist page is at:
+    https://packagist.org/packages/ao2/tweeper
+  * Misc robustness fixes
+
 News for v0.6:
 ==============
 
diff --git a/TODO b/TODO
index b305783..3c71811 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,7 +1,10 @@
-- write a better XSL stylesheet? I am not an XSL expert.
-- evaluate the use of the <ttl/> RSS element.
+- write better XSL stylesheets? I am not an XSL expert
+- evaluate the use of the <ttl/> RSS element
 - show cards directly in RSS items for twitter.com
 - show direct links for videos in the Instagram feed
 - check the encoding of the tweets when UTF is used,
   maybe solvable with mb_convert_encoding()?
   See http://php.net/manual/en/domdocument.loadhtml.php
+
+- The dependencies on the Symfony components in composer.json could be more
+  relaxed like ">=2.7.0", but for now sticking to "2.7.*" is good enough.
diff --git a/autoload.php b/autoload.php
new file mode 100644 (file)
index 0000000..4ba7832
--- /dev/null
@@ -0,0 +1,82 @@
+<?php
+/**
+ * @file
+ * Tweeper - some logic to allow tweeper to run with or without composer.
+ *
+ * Copyright (C) 2016  Antonio Ospite <ao2@ao2.it>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+$package_name = 'ao2/tweeper';
+
+if (file_exists(__DIR__ . '/vendor/autoload.php')) {
+  /*
+   * If "composer install" has been executed, use the composer autoloader.
+   *
+   * Using __DIR__ is OK as long as this file is at the same level as the
+   * project "vendor/" directory (usually the project root directory).
+   */
+  require __DIR__ . '/vendor/autoload.php';
+}
+elseif (preg_match('/' . preg_quote('/vendor/' . $package_name, '/') . '$/', __DIR__)) {
+  /*
+   * If running from a "vendor/" directory of another project use the
+   * autoloader of the parent project.
+   *
+   * This covers the case of running from a symlink in ./vendor/bin/ because
+   * __DIR__ contains the *real path* of this file.
+   *
+   * Note that using __DIR__ here and going back two levels is OK under the
+   * assumptions that this file is in the project root directory, and that the
+   * package name has the structure VENDOR/PROJECT_NAME.
+   */
+  require __DIR__ . '/../../autoload.php';
+}
+else {
+  /*
+   * Otherwise, run without composer:
+   *
+   *  1. register our own autoloader function for the Tweeper class
+   *
+   * The implementation follows the one suggested in:
+   * http://www.php-fig.org/psr/psr-4/
+   */
+  spl_autoload_register(function ($fully_qualified_class_name) {
+    /* This matches the data defined for the PSR-4 autoloader in composer.json */
+    $namespace_prefix = 'Tweeper\\';
+    $base_directory = 'src/';
+
+    $len = strlen($namespace_prefix);
+    if (strncmp($namespace_prefix, $fully_qualified_class_name, $len) !== 0) {
+      return;
+    }
+
+    $class_relative = substr($fully_qualified_class_name, $len);
+
+    $file_path = $base_directory . str_replace('\\', '/', $class_relative) . '.php';
+
+    require_once $file_path;
+  });
+
+  /*
+   *  2. load the system-wide autoloader from php-symfony-serializer
+   *
+   * This allows running tweeper without composer, as long as the Symfony
+   * dependencies are available system-wide.
+   *
+   * For example, the Debian package takes care of that.
+   */
+  require_once 'Symfony/Component/Serializer/autoload.php';
+}
diff --git a/composer.json b/composer.json
new file mode 100644 (file)
index 0000000..d490494
--- /dev/null
@@ -0,0 +1,29 @@
+{
+    "name": "ao2/tweeper",
+    "type": "library",
+    "description": "Tweeper is a web scraper to convert popular social media sites to RSS (e.g. Twitter.com, Instagram.com).",
+    "keywords": ["Twitter", "Instagram", "Facebook", "RSS", "scraper"],
+    "homepage": "https://git.ao2.it/tweeper.git",
+    "license": "GPL-3.0+",
+    "authors": [
+        {
+            "name": "Antonio Ospite",
+            "email": "ao2@ao2.it",
+            "homepage": "https://ao2.it",
+            "role": "Developer"
+        }
+    ],
+    "require": {
+        "php": ">=5.3.0",
+        "ext-curl": "*",
+        "ext-dom": "*",
+        "ext-json": "*",
+        "ext-xsl": "*",
+        "symfony/serializer": ">=2.7.0",
+        "symfony/property-access": ">=2.7.0"
+    },
+    "autoload": {
+        "psr-4": { "Tweeper\\": "src/" }
+    },
+    "bin": ["tweeper"]
+}
diff --git a/rss_converter_dilbert.com.xsl b/rss_converter_dilbert.com.xsl
deleted file mode 100644 (file)
index b6d1975..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-<!--
-  Stylesheet to convert Dilbert daily strips to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  Since June 18, 2013 dilbert.com strips are not accessible anymore
-  directly from the RSS feed, this message is displayed instead:
-
-    Dilbert readers - Please visit Dilbert.com to read this feature. Due
-    to changes with our feeds, we are now making this RSS feed a link to
-    Dilbert.com.
-
-  How unhandy is that, was it because of a management decision?
-  Maybe a parody dilbert strip is needed about this issue...
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
-
-    <xsl:template match="//section[@class='comic-item']">
-        <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
-        <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
-        <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
-        <item>
-            <title>
-                <xsl:variable name="title-length" select="140"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($picture-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$picture-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', normalize-space(.//date))"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <img src="{$picture-url}" alt="{$picture-title}"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $picture-url)"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
-        <xsl:variable name="channel-link" select="$BaseURL"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="//meta[@property='og:description']/@content"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//section[@class='comic-item']"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl
deleted file mode 100644 (file)
index 418b3d2..0000000
+++ /dev/null
@@ -1,141 +0,0 @@
-<!--
-  Stylesheet to convert a Facebook public page to RSS.
-
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
-  https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
-
-  They suggest to use the Graph API but they fail to mention that it does not
-  work anymore without authentication, so it cannot be considered an
-  _equivalent_ solution.
-
-  Luckily we've got Tweeper!
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://facebook.com</xsl:text>
-    </xsl:variable>
-
-    <!--
-         Extract the page id from an element like:
-        <meta property="al:android:url" content="fb://page/793837197390834">
-
-        The page id will be used to build the permalink.
-    -->
-    <xsl:variable
-        name="page-id"
-        select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
-
-    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
-        <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
-        <xsl:variable
-            name="item-permalink"
-            select="concat($BaseURL, '/permalink.php?id=', $page-id, '&amp;story_fbid=', $story-id)"/>
-
-        <!-- Get only the first child in order to skip the footer of the content -->
-        <xsl:variable name="item-content" select="div[1]"/>
-
-        <item>
-            <title>
-                <xsl:variable name="item-title" select="$item-content//p"/>
-                <xsl:variable name="title-length" select="140"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($item-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$item-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
-            </pubDate>
-            <description>
-
-                <!--
-                     Get only the children starting from the one with class="userContent",
-                     this way the content header is skipped
-                -->
-                <xsl:variable
-                    name="usercontent-position"
-                    select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/>
-
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                    <xsl:copy-of select="//div[@data-id='1']/node()"/>
-                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_howtoons.com.xsl b/rss_converter_howtoons.com.xsl
deleted file mode 100644 (file)
index 403b9ac..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-<!--
-  Stylesheet to convert Howtoons.com to RSS.
-
-  Copyright (C) 2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  The RSS feed link is broken on http://howtoons.com so just work around it.
-
-  Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
-  scraping other Wordpress sites.
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>http://howtoons.com</xsl:text>
-    </xsl:variable>
-
-    <xsl:template match="//div[contains(@id, 'post-')]">
-        <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
-        <item>
-            <title>
-                <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
-                <!-- date format is MM.DD.YY -->
-                <xsl:variable name="month" select="substring($date, 1, 2)"/>
-                <xsl:variable name="day" select="substring($date, 4, 2)"/>
-                <xsl:variable name="year" select="substring($date, 7, 2)"/>
-                <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="$BaseURL"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_identi.ca.xsl b/rss_converter_identi.ca.xsl
deleted file mode 120000 (symlink)
index d8042a1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-rss_converter_pump.io.xsl
\ No newline at end of file
diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl
deleted file mode 100644 (file)
index e869d7d..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-<!--
-  Stylesheet to convert Instagram user timelines to RSS.
-
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:param name="generate-enclosure"/>
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://instagram.com</xsl:text>
-    </xsl:variable>
-
-    <xsl:variable name="user-name" select="//ProfilePage/user/username"/>
-
-    <!-- Some users do not specify the full name -->
-    <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/>
-    <xsl:variable name="screen-name">
-        <xsl:choose>
-            <xsl:when test="$full-name != ''">
-                <xsl:value-of select="$full-name"/>
-            </xsl:when>
-            <xsl:otherwise>
-                <xsl:value-of select="$user-name"/>
-            </xsl:otherwise>
-        </xsl:choose>
-    </xsl:variable>
-
-    <xsl:template match="//ProfilePage/user/media/nodes">
-        <xsl:variable name="item-content-image" select="./display_src"/>
-        <xsl:variable name="item-content-caption" select="./caption"/>
-        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
-        <item>
-            <title>
-                <xsl:variable name="title-length" select="140"/>
-                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($item-content-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$item-content-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select="./date"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <p>
-                    <xsl:if test="./is_video/text() = 1">
-                        (Video)
-                    </xsl:if>
-                    <xsl:value-of select="$item-content-caption"/>
-                </p><br />
-                <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
-        <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
-                    <xsl:variable name="external-url" select="//user/external_url"/>
-                    <xsl:if test="$external-url != ''">
-                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
-                    </xsl:if>
-                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//ProfilePage/user/profile_pic_url"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//ProfilePage/user/media/nodes"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl
deleted file mode 100644 (file)
index 1577dcf..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-<!--
-  Stylesheet to convert Pump.io activity streams to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<!-- To Evan, please reconsider publishing RSS output for _public_ contents -->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/>
-    <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/>
-
-    <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
-
-    <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
-        <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
-        <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
-        <item>
-            <title>
-                <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/>
-            </pubDate>
-            <description>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$item-content/node()"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/>
-                <xsl:if test="$image-thumb-link">
-                    <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/>
-                    <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/>
-                </xsl:if>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
-        <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl
deleted file mode 100644 (file)
index c154141..0000000
+++ /dev/null
@@ -1,208 +0,0 @@
-<!--
-  Stylesheet to convert Twitter user timelines to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:param name="generate-enclosure"/>
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://twitter.com</xsl:text>
-    </xsl:variable>
-
-    <!-- Identity transform -->
-    <xsl:template match="@*|node()">
-        <xsl:copy>
-            <xsl:apply-templates select="@*|node()"/>
-        </xsl:copy>
-    </xsl:template>
-
-    <!--
-         Anchors to external links provide the direct URL in the
-         data-expanded-url attribute, so use this in the href attribute too
-         instead of the default short URL which uses the t.co redirection
-         service.
-
-         NOTE: when creating an element, attributes must be processed _before_
-         adding the contents (either children or a value):
-         http://stackoverflow.com/questions/21984867/
-    -->
-    <xsl:template match="a[@data-expanded-url]">
-        <!-- Prepend and append a white space for aestethic reasons -->
-        <xsl:text> </xsl:text>
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="@data-expanded-url"/>
-            </xsl:attribute>
-            <!-- Also strip &nbsp; and &hellip; -->
-            <xsl:value-of select="translate(., '&#xA0;&#x2026;', '')"/>
-        </a>
-        <xsl:text> </xsl:text>
-    </xsl:template>
-
-    <!--
-         These are links to pic.twitter.com, use the direct link for those
-         too instead of the t.co redirections.
-    -->
-    <xsl:template match="a[@data-pre-embedded='true']">
-        <!-- Prepend and append a white space for aestethic reasons -->
-        <xsl:text> </xsl:text>
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="concat('https://', .)"/>
-            </xsl:attribute>
-            <xsl:value-of select="concat('https://', .)"/>
-        </a>
-        <xsl:text> </xsl:text>
-    </xsl:template>
-
-    <!-- Present images in a more convenient way -->
-    <xsl:template match="div[@data-image-url]">
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="concat(@data-image-url, ':orig')"/>
-            </xsl:attribute>
-            <img>
-                <xsl:attribute name="src">
-                    <xsl:value-of select="@data-image-url"/>
-                </xsl:attribute>
-            </img>
-        </a>
-    </xsl:template>
-
-    <!-- Don't repeat background in embedded media content -->
-    <xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
-        <xsl:copy>
-            <xsl:apply-templates select="@*"/>
-            <xsl:attribute name="style">
-                <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/>
-            </xsl:attribute>
-            <xsl:apply-templates select="node()"/>
-        </xsl:copy>
-    </xsl:template>
-
-    <xsl:template match="a[@data-expanded-url]" mode="enclosure">
-        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/>
-    </xsl:template>
-
-    <xsl:template match="div[@data-image-url]" mode="enclosure">
-        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
-    </xsl:template>
-
-    <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
-
-    <xsl:template match="//li[@data-item-id and @data-item-type='tweet']">
-        <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
-        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
-        <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
-        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
-
-        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
-        <item>
-            <title>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:if test="$item-has-video">
-                    <xsl:text>(Video) </xsl:text>
-                </xsl:if>
-                <!--
-                     Prepend a space in front of the URLs which are not
-                     preceded by an open parenthesis, for aestethic reasons.
-                     Also, regex, I know: http://xkcd.com/1171/
-                -->
-                <xsl:variable
-                    name="processed-title"
-                    select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/>
-                <!-- Also strip &nbsp; and &hellip; -->
-                <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', number($timestamp))"/>
-            </pubDate>
-            <description>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:if test="$item-has-video">
-                    <xsl:text>(Video) </xsl:text>
-                </xsl:if>
-                <xsl:apply-templates select="$item-content/node()"/>
-                <xsl:apply-templates select="$item-media/node()"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
-                <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title">
-            <xsl:choose>
-                <xsl:when test="$screen-name != ''">
-                    <xsl:value-of select="concat('Twitter / ', $screen-name)"/>
-                </xsl:when>
-                <xsl:otherwise>
-                    <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
-                </xsl:otherwise>
-            </xsl:choose>
-        </xsl:variable>
-        <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/src/Tweeper.php b/src/Tweeper.php
new file mode 100644 (file)
index 0000000..93ac9e0
--- /dev/null
@@ -0,0 +1,363 @@
+<?php
+
+namespace Tweeper;
+
+/**
+ * @file
+ * Tweeper - a Twitter to RSS web scraper.
+ *
+ * Copyright (C) 2013-2016  Antonio Ospite <ao2@ao2.it>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+use DOMDocument;
+use XSLTProcessor;
+
+use Symfony\Component\Serializer\Serializer;
+use Symfony\Component\Serializer\Encoder\XmlEncoder;
+use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
+
+date_default_timezone_set('UTC');
+
+/**
+ * Scrape supported websites and perform conversion to RSS.
+ */
+class Tweeper {
+
+  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
+
+  /**
+   * Whether RSS <enclosure/> elements should be generated.
+   *
+   * Declared explicitly: creating the property dynamically from the
+   * constructor is deprecated as of PHP 8.2.
+   *
+   * @var bool
+   */
+  private $generate_enclosure;
+
+  /**
+   * Constructor sets up {@link $generate_enclosure}.
+   *
+   * @param bool $generate_enclosure
+   *   TRUE to add <enclosure/> elements to the feed items; the value is
+   *   handed to the stylesheets as the 'generate-enclosure' parameter.
+   */
+  public function __construct($generate_enclosure = FALSE) {
+    $this->generate_enclosure = $generate_enclosure;
+  }
+
+  /**
+   * Convert numeric Epoch to the date format expected in a RSS document.
+   *
+   * @param int|float|string $timestamp
+   *   Seconds since the Unix Epoch. Anything that is not a finite number
+   *   (such as a "NaN" string, or the NAN and INF floats) is treated as 0.
+   *
+   * @return string
+   *   The date formatted as DATE_RSS, expressed in GMT.
+   */
+  public static function epochToRssDate($timestamp) {
+    // is_numeric() filters out non-numeric strings, is_finite() filters out
+    // the NAN and INF floats which cannot be represented as an integer.
+    if (!is_numeric($timestamp) || !is_finite((float) $timestamp)) {
+      $timestamp = 0;
+    }
+
+    // gmdate() expects an integer: cast explicitly so that numeric strings
+    // and floats are not subject to implicit conversions.
+    return gmdate(DATE_RSS, (int) $timestamp);
+  }
+
+  /**
+   * Convert a free-form date string to the RSS date format.
+   *
+   * The string is parsed with strtotime(); when it cannot be parsed the Unix
+   * Epoch (timestamp 0) is used instead.
+   */
+  public static function strToRssDate($date) {
+    $parsed = strtotime($date);
+
+    return self::epochToRssDate(FALSE === $parsed ? 0 : $parsed);
+  }
+
+  /**
+   * Turn a delimited string into UpperCamelCase.
+   *
+   * The first character of every word (as separated by $delim) is
+   * upper-cased, then the delimiters are removed; e.g. "instagram.com" with
+   * '.' as the delimiter becomes "InstagramCom".
+   */
+  public static function toUpperCamelCase($str, $delim = ' ') {
+    return str_replace($delim, '', ucwords($str, $delim));
+  }
+
+  /**
+   * Get the contents from a URL.
+   *
+   * @param string $url
+   *   The URL to fetch; HTTP redirects are followed.
+   *
+   * @return string|false
+   *   The response body, or FALSE on failure (the reason is reported with
+   *   trigger_error()).
+   */
+  private static function getUrlContents($url) {
+    $ch = curl_init($url);
+    if (FALSE === $ch) {
+      trigger_error("Cannot initialize cURL for URL: $url");
+      return FALSE;
+    }
+
+    curl_setopt_array($ch, array(
+      CURLOPT_HEADER => FALSE,
+      // Follow http redirects to get the real URL.
+      CURLOPT_FOLLOWLOCATION => TRUE,
+      CURLOPT_RETURNTRANSFER => TRUE,
+      // Verify the certificate chain and the host name: the scraped content
+      // ends up in the generated feed, so it must not be forgeable by
+      // whoever sits between this host and the remote site.
+      CURLOPT_SSL_VERIFYHOST => 2,
+      CURLOPT_SSL_VERIFYPEER => TRUE,
+      CURLOPT_HTTPHEADER => array('Accept-language: en'),
+      CURLOPT_USERAGENT => Tweeper::$userAgent,
+    ));
+    $contents = curl_exec($ch);
+    if (FALSE === $contents) {
+      trigger_error(curl_error($ch));
+    }
+    curl_close($ch);
+
+    return $contents;
+  }
+
+  /**
+   * Get the headers from a URL.
+   *
+   * Only the headers are requested (no body), following HTTP redirects.
+   *
+   * @param string $url
+   *   The URL to inspect.
+   *
+   * @return array
+   *   The transfer information as returned by curl_getinfo(); the 'url',
+   *   'content_type' and 'download_content_length' keys are always present.
+   *   When the request fails the reason is reported with trigger_error().
+   */
+  private static function getUrlInfo($url) {
+    $ch = curl_init($url);
+    if (FALSE === $ch) {
+      trigger_error("Cannot initialize cURL for URL: $url");
+      // Keep the shape of the result stable for the callers.
+      return array(
+        'url' => $url,
+        'content_type' => NULL,
+        'download_content_length' => -1,
+      );
+    }
+
+    curl_setopt_array($ch, array(
+      CURLOPT_HEADER => TRUE,
+      CURLOPT_NOBODY => TRUE,
+      // Follow http redirects to get the real URL.
+      CURLOPT_FOLLOWLOCATION => TRUE,
+      CURLOPT_RETURNTRANSFER => TRUE,
+      // Verify the certificate chain and the host name so that the reported
+      // headers cannot be forged by a third party.
+      CURLOPT_SSL_VERIFYHOST => 2,
+      CURLOPT_SSL_VERIFYPEER => TRUE,
+      CURLOPT_USERAGENT => Tweeper::$userAgent,
+    ));
+
+    // The failure of the request is signalled by curl_exec(), not by
+    // curl_getinfo() which returns an array anyway.
+    if (FALSE === curl_exec($ch)) {
+      trigger_error(curl_error($ch));
+    }
+    $url_info = curl_getinfo($ch);
+    curl_close($ch);
+
+    return $url_info;
+  }
+
+  /**
+   * Generate an RSS <enclosure/> element.
+   *
+   * The URL is probed with a header-only request to learn its final
+   * location, content type and size.
+   *
+   * @param string $url
+   *   The URL of the resource to enclose.
+   *
+   * @return \DOMElement|string
+   *   The <enclosure/> element, or an empty string when the content type of
+   *   the resource is not one of the supported ones.
+   */
+  public static function generateEnclosure($url) {
+    $supported_content_types = array(
+      "application/octet-stream",
+      "application/ogg",
+      "application/pdf",
+      "audio/aac",
+      "audio/mp4",
+      "audio/mpeg",
+      "audio/ogg",
+      "audio/vorbis",
+      "audio/wav",
+      "audio/webm",
+      "audio/x-midi",
+      "image/gif",
+      "image/jpeg",
+      "image/png",
+      "video/avi",
+      "video/mp4",
+      "video/mpeg",
+      "video/ogg",
+    );
+
+    $url_info = Tweeper::getUrlInfo($url);
+    $raw_content_type = isset($url_info['content_type']) ? (string) $url_info['content_type'] : '';
+
+    // Servers may append parameters to the media type, as in
+    // "image/jpeg; charset=binary": compare the bare media type only, case
+    // insensitively.
+    $content_type_parts = explode(';', $raw_content_type);
+    $content_type = strtolower(trim($content_type_parts[0]));
+
+    $supported = in_array($content_type, $supported_content_types, TRUE);
+    if (!$supported) {
+      error_log("Unsupported enclosure content type \"" . $raw_content_type . "\" for URL: " . $url_info['url']);
+      return '';
+    }
+
+    // The RSS specification says that the enclosure element URL must be http.
+    // See http://sourceforge.net/p/feedvalidator/bugs/72/
+    // Match the whole scheme, separator included, so that nothing but
+    // a leading "https:" is rewritten.
+    $http_url = preg_replace("/^https:/i", "http:", $url_info['url']);
+
+    // A negative length means that the size is unknown: the length attribute
+    // must not be negative, so fall back to 0 in that case.
+    $length = max(0, (int) $url_info['download_content_length']);
+
+    $dom = new DOMDocument();
+    $enc = $dom->createElement('enclosure');
+    $enc->setAttribute('url', $http_url);
+    $enc->setAttribute('length', (string) $length);
+    $enc->setAttribute('type', $content_type);
+
+    return $enc;
+  }
+
+  /**
+   * Mimic the message from libxml.c::php_libxml_ctx_error_level()
+   *
+   * The message is built as "<level> <code>: <message> in <file> line <n>"
+   * and sent to error_log(); the level prefix is omitted for levels other
+   * than warning, error and fatal error.
+   */
+  private static function logXmlError($error) {
+    $level_labels = array(
+      LIBXML_ERR_WARNING => "Warning",
+      LIBXML_ERR_ERROR => "Error",
+      LIBXML_ERR_FATAL => "Fatal Error",
+    );
+
+    $prefix = "";
+    if (isset($level_labels[$error->level])) {
+      $prefix = $level_labels[$error->level] . " $error->code: ";
+    }
+
+    // Errors which do not come from a file are attributed to "Entity".
+    $location = $error->file ? " in $error->file" : " in Entity,";
+
+    error_log($prefix . trim($error->message) . $location . " line $error->line");
+  }
+
+  /**
+   * Serialize a JSON string as an XML document.
+   *
+   * Returns the XML as a string, or NULL when the JSON cannot be decoded (or
+   * decodes to an empty value) or when the serialization fails.
+   */
+  private static function jsonToXml($json, $root_node_name) {
+    // Decode objects as associative arrays: apparently the ObjectNormalizer
+    // used below is not able to handle the stdClass objects that
+    // json_decode() creates by default.
+    $decoded = json_decode($json, TRUE);
+    if (!$decoded) {
+      return NULL;
+    }
+
+    $serializer = new Serializer(array(new ObjectNormalizer()), array(new XmlEncoder()));
+
+    $xml = $serializer->serialize($decoded, 'xml', array(
+      'xml_encoding' => "UTF-8",
+      'xml_format_output' => TRUE,
+      'xml_root_node_name' => $root_node_name,
+    ));
+    if ($xml) {
+      return $xml;
+    }
+
+    trigger_error("Cannot serialize data", E_USER_ERROR);
+    return NULL;
+  }
+
+  /**
+   * Convert the Instagram content to XML.
+   *
+   * The data of the page is carried by a JSON object assigned to
+   * window._sharedData inside a <script> element: extract it and convert it
+   * to XML.
+   *
+   * @param string $html
+   *   The HTML code of the page.
+   *
+   * @return string|null
+   *   The XML document as a string, NULL if the JSON data cannot be found or
+   *   converted.
+   */
+  private function getXmlInstagramCom($html) {
+    // Extract the json data from the html code.
+    //
+    // The dot in the variable name is escaped to match it literally, and the
+    // match is lazy and anchored to the end of the script element so that
+    // statements following the assignment are not captured as well.
+    $json_match_expr = '/window\._sharedData = (.*?);<\/script>/s';
+    $ret = preg_match($json_match_expr, $html, $matches);
+    if ($ret !== 1) {
+      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
+      return NULL;
+    }
+
+    return Tweeper::jsonToXml($matches[1], 'instagram');
+  }
+
+  /**
+   * Make the Facebook HTML processable.
+   *
+   * The comment delimiters are stripped (the opening ones first, then the
+   * closing ones) so that the markup they enclose becomes part of the
+   * document.
+   */
+  private function preprocessHtmlFacebookCom($html) {
+    return str_replace(array('<!--', '-->'), '', $html);
+  }
+
+  /**
+   * Convert the HTML retrieved from the site to XML.
+   *
+   * If the class has a getXml<Host>() method (the host name converted to
+   * UpperCamelCase, using '.' as the delimiter) it is used to extract the
+   * XML data from the page, otherwise the page is parsed as HTML.
+   *
+   * @param string $html
+   *   The content retrieved from the site.
+   * @param string $host
+   *   The host name of the site.
+   *
+   * @return \DOMDocument|null
+   *   The parsed document, NULL if there is nothing to parse or the parsing
+   *   fails.
+   */
+  private function htmlToXml($html, $host) {
+    $xmlDoc = new DOMDocument();
+
+    // Handle warnings and errors when loading invalid HTML.
+    $xml_errors_value = libxml_use_internal_errors(TRUE);
+
+    // If there is a host-specific method to get the XML data, use it!
+    $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
+    if (method_exists($this, $get_xml_host_method)) {
+      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
+      // The host-specific methods return NULL on failure: never pass an
+      // empty payload to the parser.
+      $loaded = is_string($xml_data) && '' !== $xml_data && $xmlDoc->loadXML($xml_data);
+    }
+    else {
+      $loaded = is_string($html) && '' !== $html && $xmlDoc->loadHTML($html);
+    }
+
+    // Log the parsing errors and restore the libxml state in any case.
+    foreach (libxml_get_errors() as $xml_error) {
+      Tweeper::logXmlError($xml_error);
+    }
+    libxml_clear_errors();
+    libxml_use_internal_errors($xml_errors_value);
+
+    if (!$loaded) {
+      trigger_error("Cannot parse the content retrieved from $host", E_USER_ERROR);
+      return NULL;
+    }
+
+    return $xmlDoc;
+  }
+
+  /**
+   * Load a stylesheet if the web site is supported.
+   *
+   * A site is supported when a rss_converter_<host>.xsl file exists in the
+   * directory of this class.
+   *
+   * @param string $host
+   *   The host name of the site.
+   *
+   * @return \XSLTProcessor|null
+   *   A processor set up with the stylesheet of the site, with the PHP
+   *   functions registered and the 'generate-enclosure' parameter set; NULL
+   *   if the site is not supported or the stylesheet cannot be loaded.
+   */
+  private function loadStylesheet($host) {
+    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
+    if (FALSE === file_exists($stylesheet)) {
+      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
+      return NULL;
+    }
+
+    // The stylesheet is a local file: read it directly, there is no need to
+    // go through the HTTP client.
+    $stylesheet_contents = file_get_contents($stylesheet);
+    if (FALSE === $stylesheet_contents) {
+      trigger_error("Cannot read stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    $xslDoc = new DOMDocument();
+    if (FALSE === $xslDoc->loadXML($stylesheet_contents)) {
+      trigger_error("Cannot parse stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    $xsltProcessor = new XSLTProcessor();
+    $xsltProcessor->registerPHPFunctions();
+    $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
+    if (FALSE === $xsltProcessor->importStylesheet($xslDoc)) {
+      trigger_error("Cannot import stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    return $xsltProcessor;
+  }
+
+  /**
+   * Convert the site content to RSS.
+   *
+   * @param string $src_url
+   *   The http or https URL of the page to convert; a leading "www." in the
+   *   host name is ignored when looking for the stylesheet of the site.
+   *
+   * @return string|null
+   *   The RSS document, NULL on failure (invalid URL, unsupported scheme or
+   *   site, content which cannot be retrieved, parsed or transformed).
+   */
+  public function tweep($src_url) {
+    $url = parse_url($src_url);
+    if (FALSE === $url || empty($url["host"])) {
+      trigger_error("Invalid URL: $src_url", E_USER_ERROR);
+      return NULL;
+    }
+
+    // parse_url() does not set the scheme for URLs like "//host/path", and
+    // schemes are case-insensitive.
+    $scheme = isset($url["scheme"]) ? strtolower($url["scheme"]) : '';
+    if (!in_array($scheme, array("http", "https"), TRUE)) {
+      trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
+      return NULL;
+    }
+
+    // Strip the leading www. to be more forgiving on input URLs; host names
+    // are case-insensitive while the stylesheet file names are lower case.
+    $host = preg_replace('/^www\./', '', strtolower($url["host"]));
+
+    $xsltProcessor = $this->loadStylesheet($host);
+    if (NULL === $xsltProcessor) {
+      return NULL;
+    }
+
+    $html = Tweeper::getUrlContents($src_url);
+    if (FALSE === $html) {
+      return NULL;
+    }
+
+    // If there is a host-specific method to preprocess the HTML, use it.
+    $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
+    if (method_exists($this, $preprocess_html_host_method)) {
+      $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+    }
+
+    $xmlDoc = $this->htmlToXml($html, $host);
+    if (NULL === $xmlDoc) {
+      return NULL;
+    }
+
+    $output = $xsltProcessor->transformToXML($xmlDoc);
+
+    if (FALSE === $output) {
+      trigger_error('XSL transformation failed.', E_USER_ERROR);
+      return NULL;
+    }
+    return $output;
+  }
+
+}
diff --git a/src/rss_converter_dilbert.com.xsl b/src/rss_converter_dilbert.com.xsl
new file mode 100644 (file)
index 0000000..d340183
--- /dev/null
@@ -0,0 +1,115 @@
+<!--
+  Stylesheet to convert Dilbert daily strips to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  Since June 18, 2013 dilbert.com strips are not accessible anymore
+  directly from the RSS feed, this message is displayed instead:
+
+    Dilbert readers - Please visit Dilbert.com to read this feature. Due
+    to changes with our feeds, we are now making this RSS feed a link to
+    Dilbert.com.
+
+  How unhandy is that, was it because of a management decision?
+  Maybe a parody dilbert strip is needed about this issue...
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!--
+         Declare the parameter referenced by the item template below. Tweeper
+         sets it with XSLTProcessor::setParameter(); when it is equal to 1
+         an <enclosure/> element is generated for each item.
+    -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!-- The canonical URL of the page, taken from its Open Graph metadata -->
+    <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
+
+    <!--
+         Convert a comic strip to an RSS item.
+
+         The title is the alternative text of the strip image, ellipsized to
+         140 characters; link and guid are the permalink of the strip; the
+         publication date is the content of the date element converted by
+         Tweeper; the description embeds the strip image.
+    -->
+    <xsl:template match="//section[@class='comic-item']">
+        <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
+        <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
+        <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
+        <item>
+            <title>
+                <xsl:variable name="title-length" select="140"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($picture-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$picture-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::strToRssDate', normalize-space(.//date))"/>
+            </pubDate>
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <img src="{$picture-url}" alt="{$picture-title}"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <!-- The enclosure is generated only when generate-enclosure is equal to 1 -->
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', $picture-url)"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+         Build the RSS channel.
+
+         Title, link and description come from the Open Graph metadata of the
+         page (og:title, og:url via BaseURL, og:description); the URL of the
+         channel image is BaseURL followed by the src attribute of the image
+         whose alternative text is 'Dilbert logo'. One item is generated for
+         each comic-item section.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
+        <xsl:variable name="channel-link" select="$BaseURL"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="//meta[@property='og:description']/@content"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//section[@class='comic-item']"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl
new file mode 100644 (file)
index 0000000..933d3d2
--- /dev/null
@@ -0,0 +1,141 @@
+<!--
+  Stylesheet to convert a Facebook public page to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
+  https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
+
+  They suggest to use the Graph API but they fail to mention that it does not
+  work anymore without authentication, so it cannot be considered an
+  _equivalent_ solution.
+
+  Luckily we've got Tweeper!
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://facebook.com</xsl:text>
+    </xsl:variable>
+
+    <!--
+         Extract the page id from an element like:
+        <meta property="al:android:url" content="fb://page/793837197390834">
+
+        The page id will be used to build the permalink.
+    -->
+    <xsl:variable
+        name="page-id"
+        select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
+
+    <!--
+         Convert a story of the page to an RSS item.
+
+         The permalink (used for both link and guid) is built from BaseURL,
+         the page id and the story id found in the 'ft_ent_identifier' input
+         element. The title is the text of the first paragraph of the story,
+         ellipsized to 140 characters; the publication date is converted by
+         Tweeper from the data-utime attribute of the abbr element.
+    -->
+    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+        <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
+        <xsl:variable
+            name="item-permalink"
+            select="concat($BaseURL, '/permalink.php?id=', $page-id, '&amp;story_fbid=', $story-id)"/>
+
+        <!-- Get only the first child in order to skip the footer of the content -->
+        <xsl:variable name="item-content" select="div[1]"/>
+
+        <item>
+            <title>
+                <!--
+                     When used as a string a node-set evaluates to the string
+                     value of its first node, so only the first paragraph
+                     ends up in the title.
+                -->
+                <xsl:variable name="item-title" select="$item-content//p"/>
+                <xsl:variable name="title-length" select="140"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', $timestamp)"/>
+            </pubDate>
+            <description>
+
+                <!--
+                     Get only the children starting from the one with class="userContent",
+                     this way the content header is skipped
+                -->
+                <xsl:variable
+                    name="usercontent-position"
+                    select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/>
+
+                <!-- The selected markup is copied verbatim inside a CDATA section -->
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+
+    <!--
+         Build the RSS channel.
+
+         The title is the title of the page; the link is the href of the
+         first anchor (in document order) selected inside the story
+         wrappers, presumably the link to the page itself (TODO confirm
+         against the current markup). The description copies the content of
+         the div with data-id='1' inside a CDATA section, the channel image
+         is the profile picture. One item is generated for each story
+         wrapper.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//title"/>
+        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:copy-of select="//div[@data-id='1']/node()"/>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_howtoons.com.xsl b/src/rss_converter_howtoons.com.xsl
new file mode 100644 (file)
index 0000000..35a6739
--- /dev/null
@@ -0,0 +1,102 @@
+<!--
+  Stylesheet to convert Howtoons.com to RSS.
+
+  Copyright (C) 2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  The RSS feed link is broken on http://howtoons.com so just work around it.
+
+  Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
+  scraping other Wordpress sites.
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!-- Site root: emitted as xml:base and reused as the channel link. -->
+    <xsl:variable name="BaseURL">
+        <xsl:text>http://howtoons.com</xsl:text>
+    </xsl:variable>
+
+    <!-- Emit one RSS item for each div whose id contains 'post-'. -->
+    <xsl:template match="//div[contains(@id, 'post-')]">
+        <xsl:variable name="headline-anchor" select=".//div[@class='post-headline']//a"/>
+        <xsl:variable name="permalink" select="$headline-anchor/@href"/>
+        <item>
+            <title><xsl:value-of select="normalize-space($headline-anchor)"/></title>
+            <link><xsl:value-of select="$permalink"/></link>
+            <guid><xsl:value-of select="$permalink"/></guid>
+            <pubDate>
+                <!-- The byline text before the first comma is the date, as MM.DD.YY -->
+                <xsl:variable name="byline-date" select="substring-before(.//div[@class='post-byline'], ',')"/>
+                <!-- Rearrange it into 20YY-MM-DD before handing it to the PHP date helper -->
+                <xsl:variable
+                    name="iso-date"
+                    select="concat('20', substring($byline-date, 7, 2), '-', substring($byline-date, 1, 2), '-', substring($byline-date, 4, 2))"/>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::strToRssDate', $iso-date)"/>
+            </pubDate>
+            <description>
+                <!-- The post paragraphs are wrapped in a hand-written CDATA section -->
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+
+    <!--
+         Root template: channel with the page title, a fixed description and
+         a fixed logo URL, followed by one item per matching post div.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="feed-title" select="//title"/>
+        <xsl:variable name="feed-link" select="$BaseURL"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title><xsl:value-of select="$feed-title"/></title>
+                <link><xsl:value-of select="$feed-link"/></link>
+                <description>
+                    <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
+                </description>
+                <image>
+                    <title><xsl:value-of select="$feed-title"/></title>
+                    <link><xsl:value-of select="$feed-link"/></link>
+                    <url>
+                        <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_identi.ca.xsl b/src/rss_converter_identi.ca.xsl
new file mode 120000 (symlink)
index 0000000..d8042a1
--- /dev/null
@@ -0,0 +1 @@
+rss_converter_pump.io.xsl
\ No newline at end of file
diff --git a/src/rss_converter_instagram.com.xsl b/src/rss_converter_instagram.com.xsl
new file mode 100644 (file)
index 0000000..609be66
--- /dev/null
@@ -0,0 +1,135 @@
+<!--
+  Stylesheet to convert Instagram user timelines to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!-- When this parameter equals 1, an enclosure is generated for each item image. -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://instagram.com</xsl:text>
+    </xsl:variable>
+
+    <xsl:variable name="user-name" select="//ProfilePage/user/username"/>
+
+    <!-- Some users do not specify the full name -->
+    <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/>
+    <!-- Name shown in the channel: the full name, or the user name as a fallback -->
+    <xsl:variable name="screen-name">
+        <xsl:choose>
+            <xsl:when test="$full-name != ''">
+                <xsl:value-of select="$full-name"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of select="$user-name"/>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+    <!--
+         Emit one RSS item for each ProfilePage/user/media/nodes element.
+
+         The title is "user-name: caption", ellipsized to 140 characters; the
+         description holds the caption and the image linked to the permalink.
+    -->
+    <xsl:template match="//ProfilePage/user/media/nodes">
+        <xsl:variable name="item-content-image" select="./display_src"/>
+        <xsl:variable name="item-content-caption" select="./caption"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
+        <item>
+            <title>
+                <xsl:variable name="title-length" select="140"/>
+                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-content-title) > $title-length">
+                        <!-- Leave room for the three dots appended below -->
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-content-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select="./date"/>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', $timestamp)"/>
+            </pubDate>
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <p>
+                    <xsl:if test="./is_video/text() = 1">
+                        <!--
+                             Use xsl:text so that only the marker and one
+                             separating space are emitted, not the stylesheet
+                             indentation around it.
+                        -->
+                        <xsl:text>(Video) </xsl:text>
+                    </xsl:if>
+                    <xsl:value-of select="$item-content-caption"/>
+                </p><br />
+                <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', $item-content-image)"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+         Root template: channel titled "Instagram / screen-name" whose
+         description is the biography plus the external URL when there is one,
+         followed by one item per media node.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
+        <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
+                    <xsl:variable name="external-url" select="//user/external_url"/>
+                    <xsl:if test="$external-url != ''">
+                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
+                    </xsl:if>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//ProfilePage/user/profile_pic_url"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//ProfilePage/user/media/nodes"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_pump.io.xsl b/src/rss_converter_pump.io.xsl
new file mode 100644 (file)
index 0000000..bf9f674
--- /dev/null
@@ -0,0 +1,99 @@
+<!--
+  Stylesheet to convert Pump.io activity streams to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<!-- To Evan, please reconsider publishing RSS output for _public_ contents -->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!--
+         When this parameter equals 1, an enclosure is generated for the image
+         of each item. It has to be declared here because the item template
+         below references it.
+    -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!--
+         Both values are cut out of the data-profile-id attribute of the
+         'profile-block' div: the domain is what follows '@', the user name is
+         what follows ':'.
+         NOTE(review): this presumably expects something like
+         "acct:user@domain"; confirm against a real page.
+    -->
+    <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/>
+    <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/>
+
+    <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
+
+    <!-- Emit one RSS item for each entry of the 'major-stream' list. -->
+    <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
+        <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
+        <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
+        <item>
+            <title>
+                <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/>
+            </pubDate>
+            <description>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$item-content/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/>
+                <xsl:if test="$image-thumb-link">
+                    <!-- Drop '_thumb' from the thumbnail URL to build the enclosure URL -->
+                    <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/>
+                    <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', $image-link)"/>
+                </xsl:if>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+         Root template: channel titled "domain / user" and linked to
+         "https://domain/user", followed by one item per stream entry.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+        <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl
new file mode 100644 (file)
index 0000000..58539ae
--- /dev/null
@@ -0,0 +1,208 @@
+<!--
+  Stylesheet to convert Twitter user timelines to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!-- When this parameter equals 1, enclosures are generated for links and images. -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://twitter.com</xsl:text>
+    </xsl:variable>
+
+    <!-- Identity transform -->
+    <xsl:template match="@*|node()">
+        <xsl:copy>
+            <xsl:apply-templates select="@*|node()"/>
+        </xsl:copy>
+    </xsl:template>
+
+    <!--
+         Anchors to external links provide the direct URL in the
+         data-expanded-url attribute, so use this in the href attribute too
+         instead of the default short URL which uses the t.co redirection
+         service.
+
+         NOTE: when creating an element, attributes must be processed _before_
+         adding the contents (either children or a value):
+         http://stackoverflow.com/questions/21984867/
+    -->
+    <xsl:template match="a[@data-expanded-url]">
+        <!-- Prepend and append a white space for aesthetic reasons -->
+        <xsl:text> </xsl:text>
+        <a>
+            <xsl:attribute name="href">
+                <xsl:value-of select="@data-expanded-url"/>
+            </xsl:attribute>
+            <!-- Also strip &nbsp; and &hellip; -->
+            <xsl:value-of select="translate(., '&#xA0;&#x2026;', '')"/>
+        </a>
+        <xsl:text> </xsl:text>
+    </xsl:template>
+
+    <!--
+         These are links to pic.twitter.com, use the direct link for those
+         too instead of the t.co redirections.
+    -->
+    <xsl:template match="a[@data-pre-embedded='true']">
+        <!-- Prepend and append a white space for aesthetic reasons -->
+        <xsl:text> </xsl:text>
+        <a>
+            <xsl:attribute name="href">
+                <xsl:value-of select="concat('https://', .)"/>
+            </xsl:attribute>
+            <xsl:value-of select="concat('https://', .)"/>
+        </a>
+        <xsl:text> </xsl:text>
+    </xsl:template>
+
+    <!-- Present images in a more convenient way -->
+    <xsl:template match="div[@data-image-url]">
+        <a>
+            <!-- The link uses the image URL with ':orig' appended, the img uses it unchanged -->
+            <xsl:attribute name="href">
+                <xsl:value-of select="concat(@data-image-url, ':orig')"/>
+            </xsl:attribute>
+            <img>
+                <xsl:attribute name="src">
+                    <xsl:value-of select="@data-image-url"/>
+                </xsl:attribute>
+            </img>
+        </a>
+    </xsl:template>
+
+    <!-- Don't repeat background in embedded media content -->
+    <xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
+        <xsl:copy>
+            <!--
+                 Copy the attributes first, then write the style attribute
+                 again with the extra declaration appended to its old value.
+            -->
+            <xsl:apply-templates select="@*"/>
+            <xsl:attribute name="style">
+                <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/>
+            </xsl:attribute>
+            <xsl:apply-templates select="node()"/>
+        </xsl:copy>
+    </xsl:template>
+
+    <!-- Enclosure mode: generate an enclosure for the expanded URL of a link -->
+    <xsl:template match="a[@data-expanded-url]" mode="enclosure">
+        <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', ./@data-expanded-url)"/>
+    </xsl:template>
+
+    <!-- Enclosure mode: generate an enclosure for the ':orig' variant of an image URL -->
+    <xsl:template match="div[@data-image-url]" mode="enclosure">
+        <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
+    </xsl:template>
+
+    <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
+
+    <!-- Emit one RSS item for each list entry marked as a tweet. -->
+    <xsl:template match="//li[@data-item-id and @data-item-type='tweet']">
+        <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
+        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
+        <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
+
+        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
+        <item>
+            <title>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:if test="$item-has-video">
+                    <xsl:text>(Video) </xsl:text>
+                </xsl:if>
+                <!--
+                     Prepend a space in front of the URLs which are not
+                     preceded by an open parenthesis, for aesthetic reasons.
+                     Also, regex, I know: http://xkcd.com/1171/
+
+                     The dots of pic.twitter.com are escaped so that they only
+                     match literal dots.
+                -->
+                <xsl:variable
+                    name="processed-title"
+                    select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic\.twitter\.com))@', ' \1', $item-content)"/>
+                <!-- Also strip &nbsp; and &hellip; -->
+                <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
+                <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/>
+            </pubDate>
+            <description>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:if test="$item-has-video">
+                    <xsl:text>(Video) </xsl:text>
+                </xsl:if>
+                <!-- Text and media go through the rewriting templates above -->
+                <xsl:apply-templates select="$item-content/node()"/>
+                <xsl:apply-templates select="$item-media/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
+                <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+         Root template: channel titled "Twitter / name", where the name is the
+         screen name when it was found and the first h1 otherwise, followed by
+         one item per tweet of the 'stream-items-id' list.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title">
+            <xsl:choose>
+                <xsl:when test="$screen-name != ''">
+                    <xsl:value-of select="concat('Twitter / ', $screen-name)"/>
+                </xsl:when>
+                <xsl:otherwise>
+                    <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
+                </xsl:otherwise>
+            </xsl:choose>
+        </xsl:variable>
+        <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/tests/test_information_leakage.sh b/tests/test_information_leakage.sh
new file mode 100755 (executable)
index 0000000..061d917
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/sh
+
+set -e
+
+# Path of the tweeper executable under test. The installed copy is used
+# unless TWEEPER is already set in the environment, e.g.:
+#   TWEEPER="./tweeper" ./tests/test_information_leakage.sh
+TWEEPER="${TWEEPER:-/usr/share/php/tweeper/tweeper}"
+
+check_result() {
+  URL="$1"
+  FILE="$2"
+  RESULT="$3"
+
+  echo "URL $URL"
+  if [ "$RESULT" ];
+  then
+    echo "--> $FILE"
+    echo "    exists"
+  else
+    echo "... $FILE"
+    echo "    does not exist"
+  fi
+  echo
+}
+
+file_exists() {
+  FILE="$1"
+  URL="file://twitter.com/$FILE"
+  OUTPUT=$($TWEEPER $URL)
+  check_result "$URL" "$FILE" "$OUTPUT"
+}
+
+file_exists_on_server() {
+  SERVER="$1"
+  FILE="$2"
+  URL="file://twitter.com/$FILE"
+  OUTPUT=$(curl $SERVER/tweeper.php?src_url=$URL 2> /dev/null)
+  check_result "$URL" "$FILE on $SERVER" "$OUTPUT"
+}
+
+# Probe one file that exists and one that should not, using the command line
+# tool. '|| true' keeps 'set -e' from aborting the script when a probe fails.
+file_exists /etc/passwd || true
+file_exists /etc/file_with_an_unlikely_name || true
+
+echo "Starting a test server"
+echo
+
+# Serve the directory that contains the tweeper executable with the PHP
+# built-in web server, in the background.
+php -S localhost:8000 -t "$(dirname "$TWEEPER")" > /dev/null 2>&1 &
+SERVER_PID=$!
+# Wait a moment before sending requests to the server just started.
+sleep 1
+
+# Same two probes, this time through tweeper.php on the test server.
+file_exists_on_server http://localhost:8000 /etc/passwd || true
+file_exists_on_server http://localhost:8000 /etc/file_with_an_unlikely_name || true
+
+echo "Shutting down the test server"
+kill "$SERVER_PID"
diff --git a/tweeper b/tweeper
index 6256e20..d4b04e3 100755 (executable)
--- a/tweeper
+++ b/tweeper
@@ -6,4 +6,17 @@
  * CLI file to run tweeper.
  */
 
-require dirname(__FILE__) . '/tweeper.php';
+if (!preg_match('/' . preg_quote('/vendor/bin', '/') . '$/', __DIR__)) {
+  /*
+   * Usual case: tweeper.php is in the same directory as this file. For the
+   * other cases look at the autoload.php required by tweeper.php.
+   */
+  require __DIR__ . '/tweeper.php';
+}
+else {
+  /*
+   * The path of this directory ends with "/vendor/bin": tweeper is running
+   * from the "vendor/bin" directory of a composer setup, with the tweeper
+   * executable _not_ being a symlink.
+   *
+   * This can happen when the filesystem does not support symlinks.
+   */
+  $package_name = 'ao2/tweeper';
+  require __DIR__ . '/../' . $package_name . '/tweeper.php';
+}
index d2f1f50..ac1fdd1 100644 (file)
@@ -108,7 +108,7 @@ Main web site: <https://git.ao2.it/tweeper.git>
 
 COPYING
 -------
-Copyright \(C) 2013-2015  Antonio Ospite <ao2@ao2.it>
+Copyright \(C) 2013-2016  Antonio Ospite <ao2@ao2.it>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
index 94ea05f..ff98ab7 100644 (file)
@@ -3,7 +3,7 @@
  * @file
  * Tweeper - a Twitter to RSS web scraper.
  *
- * Copyright (C) 2013-2015  Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2013-2016  Antonio Ospite <ao2@ao2.it>
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-require_once 'Symfony/Component/Serializer/autoload.php';
+require_once 'autoload.php';
 
-use Symfony\Component\Serializer\Serializer;
-use Symfony\Component\Serializer\Encoder\XmlEncoder;
-use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
+use Tweeper\Tweeper;
 
 date_default_timezone_set('UTC');
 
 /**
- * Scrape supported websites and perform conversion to RSS.
- */
-class Tweeper {
-
-  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
-
-  /**
-   * Constructor sets up {@link $generate_enclosure}.
-   */
-  public function __construct($generate_enclosure = FALSE) {
-    $this->generate_enclosure = $generate_enclosure;
-  }
-
-  /**
-   * Convert numeric Epoch to the date format expected in a RSS document.
-   */
-  public static function epochToRssDate($timestamp) {
-    if (!is_numeric($timestamp) || is_nan($timestamp)) {
-      $timestamp = 0;
-    }
-
-    return gmdate(DATE_RSS, $timestamp);
-  }
-
-  /**
-   * Convert generic date string to the date format expected in a RSS document.
-   */
-  public static function strToRssDate($date) {
-    $timestamp = strtotime($date);
-    if (FALSE === $timestamp) {
-      $timestamp = 0;
-    }
-
-    return Tweeper::epochToRssDate($timestamp);
-  }
-
-  /**
-   * Convert string to UpperCamelCase.
-   */
-  public static function toUpperCamelCase($str, $delim = ' ') {
-    $str_upper = ucwords($str, $delim);
-    $str_camel_case = str_replace($delim, '', $str_upper);
-    return $str_camel_case;
-  }
-
-  /**
-   * Get the contents from a URL.
-   */
-  private static function getUrlContents($url) {
-    $ch = curl_init($url);
-    curl_setopt_array($ch, array(
-      CURLOPT_HEADER => FALSE,
-      // Follow http redirects to get the real URL.
-      CURLOPT_FOLLOWLOCATION => TRUE,
-      CURLOPT_RETURNTRANSFER => TRUE,
-      CURLOPT_SSL_VERIFYHOST => FALSE,
-      CURLOPT_SSL_VERIFYPEER => FALSE,
-      CURLOPT_HTTPHEADER => array('Accept-language: en'),
-      CURLOPT_USERAGENT => Tweeper::$userAgent,
-    ));
-    $contents = curl_exec($ch);
-    curl_close($ch);
-
-    return $contents;
-  }
-
-  /**
-   * Get the headers from a URL.
-   */
-  private static function getUrlInfo($url) {
-    $ch = curl_init($url);
-    curl_setopt_array($ch, array(
-      CURLOPT_HEADER => TRUE,
-      CURLOPT_NOBODY => TRUE,
-      // Follow http redirects to get the real URL.
-      CURLOPT_FOLLOWLOCATION => TRUE,
-      CURLOPT_RETURNTRANSFER => TRUE,
-      CURLOPT_SSL_VERIFYHOST => FALSE,
-      CURLOPT_SSL_VERIFYPEER => FALSE,
-      CURLOPT_USERAGENT => Tweeper::$userAgent,
-    ));
-    curl_exec($ch);
-    $url_info = curl_getinfo($ch);
-    curl_close($ch);
-
-    return $url_info;
-  }
-
-  /**
-   * Generate an RSS <enclosure/> element.
-   */
-  public static function generateEnclosure($url) {
-    $supported_content_types = array(
-      "application/ogg",
-      "audio/aac",
-      "audio/mp4",
-      "audio/mpeg",
-      "audio/ogg",
-      "audio/vorbis",
-      "audio/wav",
-      "audio/webm",
-      "audio/x-midi",
-      "image/gif",
-      "image/jpeg",
-      "image/png",
-      "video/avi",
-      "video/mp4",
-      "video/mpeg",
-      "video/ogg",
-    );
-
-    $url_info = Tweeper::getUrlInfo($url);
-
-    $supported = in_array($url_info['content_type'], $supported_content_types);
-    if (!$supported) {
-      error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']);
-      return '';
-    }
-
-    // The RSS specification says that the enclosure element URL must be http.
-    // See http://sourceforge.net/p/feedvalidator/bugs/72/
-    $http_url = preg_replace("/^https/", "http", $url_info['url']);
-
-    $dom = new DOMDocument();
-    $enc = $dom->createElement('enclosure');
-    $enc->setAttribute('url', $http_url);
-    $enc->setAttribute('length', $url_info['download_content_length']);
-    $enc->setAttribute('type', $url_info['content_type']);
-
-    return $enc;
-  }
-
-  /**
-   * Mimic the message from libxml.c::php_libxml_ctx_error_level()
-   */
-  private static function logXmlError($error) {
-    $output = "";
-
-    switch ($error->level) {
-      case LIBXML_ERR_WARNING:
-        $output .= "Warning $error->code: ";
-        break;
-
-      case LIBXML_ERR_ERROR:
-        $output .= "Error $error->code: ";
-        break;
-
-      case LIBXML_ERR_FATAL:
-        $output .= "Fatal Error $error->code: ";
-        break;
-    }
-
-    $output .= trim($error->message);
-
-    if ($error->file) {
-      $output .= " in $error->file";
-    }
-    else {
-      $output .= " in Entity,";
-    }
-
-    $output .= " line $error->line";
-
-    error_log($output);
-  }
-
-  /**
-   * Convert json to XML.
-   */
-  private static function jsonToXml($json, $root_node_name) {
-    // Apparently the ObjectNormalizer used afterwards is not able to handle
-    // the stdClass object created by json_decode() with the default setting
-    // $assoc = false; so use $assoc = true.
-    $data = json_decode($json, $assoc = TRUE);
-    if (!$data) {
-      return NULL;
-    }
-
-    $encoder = new XmlEncoder();
-    $normalizer = new ObjectNormalizer();
-    $serializer = new Serializer(array($normalizer), array($encoder));
-
-    $serializer_options = array(
-      'xml_encoding' => "UTF-8",
-      'xml_format_output' => TRUE,
-      'xml_root_node_name' => $root_node_name,
-    );
-
-    $xml_data = $serializer->serialize($data, 'xml', $serializer_options);
-    if (!$xml_data) {
-      trigger_error("Cannot serialize data", E_USER_ERROR);
-      return NULL;
-    }
-
-    return $xml_data;
-  }
-
-  /**
-   * Convert the Instagram content to XML.
-   */
-  private function getXmlInstagramCom($html) {
-    // Extract the json data from the html code.
-    $json_match_expr = '/window._sharedData = (.*);/';
-    $ret = preg_match($json_match_expr, $html, $matches);
-    if ($ret !== 1) {
-      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
-      return NULL;
-    }
-
-    return Tweeper::jsonToXml($matches[1], 'instagram');
-  }
-
-  /**
-   * Make the Facebook HTML processable.
-   */
-  private function preprocessHtmlFacebookCom($html) {
-    $html = str_replace('<!--', '', $html);
-    $html = str_replace('-->', '', $html);
-    return $html;
-  }
-
-  /**
-   * Convert the HTML retrieved from the site to XML.
-   */
-  private function htmlToXml($html, $host) {
-    $xmlDoc = new DOMDocument();
-
-    // Handle warnings and errors when loading invalid HTML.
-    $xml_errors_value = libxml_use_internal_errors(TRUE);
-
-    // If there is a host-specific method to get the XML data, use it!
-    $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
-    if (method_exists($this, $get_xml_host_method)) {
-      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
-      $xmlDoc->loadXML($xml_data);
-    }
-    else {
-      $xmlDoc->loadHTML($html);
-    }
-
-    foreach (libxml_get_errors() as $xml_error) {
-      Tweeper::logXmlError($xml_error);
-    }
-    libxml_clear_errors();
-    libxml_use_internal_errors($xml_errors_value);
-
-    return $xmlDoc;
-  }
-
-  /**
-   * Load a stylesheet if the web site is supported.
-   */
-  private function loadStylesheet($host) {
-    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
-    if (FALSE === file_exists($stylesheet)) {
-      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
-      return NULL;
-    }
-
-    $stylesheet_contents = Tweeper::getUrlContents($stylesheet);
-
-    $xslDoc = new DOMDocument();
-    $xslDoc->loadXML($stylesheet_contents);
-
-    $xsltProcessor = new XSLTProcessor();
-    $xsltProcessor->registerPHPFunctions();
-    $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
-    $xsltProcessor->importStylesheet($xslDoc);
-
-    return $xsltProcessor;
-  }
-
-  /**
-   * Convert the site content to RSS.
-   */
-  public function tweep($src_url) {
-    $url = parse_url($src_url);
-    if (FALSE === $url || empty($url["host"])) {
-      trigger_error("Invalid URL: $src_url", E_USER_ERROR);
-      return NULL;
-    }
-
-    // Strip the leading www. to be more forgiving on input URLs.
-    $host = preg_replace('/^www\./', '', $url["host"]);
-
-    $xsltProcessor = $this->loadStylesheet($host);
-    if (NULL === $xsltProcessor) {
-      return NULL;
-    }
-
-    $html = Tweeper::getUrlContents($src_url);
-    if (FALSE === $html) {
-      return NULL;
-    }
-
-    $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
-    if (method_exists($this, $preprocess_html_host_method)) {
-      $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
-    }
-
-    $xmlDoc = $this->htmlToXml($html, $host);
-    if (NULL === $xmlDoc) {
-      return NULL;
-    }
-
-    $output = $xsltProcessor->transformToXML($xmlDoc);
-
-    if (FALSE === $output) {
-      trigger_error('XSL transformation failed.', E_USER_ERROR);
-      return NULL;
-    }
-    return $output;
-  }
-
-}
-
-/**
  * Check if the script is being run from the command line.
  */
 function is_cli() {
@@ -434,4 +115,8 @@ if (!isset($options['src_url'])) {
 }
 
 $tweeper = new Tweeper($options['generate_enclosure']);
-echo $tweeper->tweep($options['src_url']);
+$output = $tweeper->tweep($options['src_url']);
+if (is_null($output)) {
+  exit(1);
+}
+echo $output;