tweeper: move the main Tweeper class to its own file under src/

author Antonio Ospite <ao2@ao2.it>

Fri, 4 Nov 2016 12:13:54 +0000 (13:13 +0100)

committer Antonio Ospite <ao2@ao2.it>

Fri, 4 Nov 2016 15:03:13 +0000 (16:03 +0100)
author Antonio Ospite <ao2@ao2.it>
Fri, 4 Nov 2016 12:13:54 +0000 (13:13 +0100)
committer Antonio Ospite <ao2@ao2.it>
Fri, 4 Nov 2016 15:03:13 +0000 (16:03 +0100)
diff --git a/rss_converter_dilbert.com.xsl b/rss_converter_dilbert.com.xsl

deleted file mode 100644 (file)

index b6d1975..0000000
--- a/rss_converter_dilbert.com.xsl
+++ /dev/null
@@ -1,115 +0,0 @@
-<!--
-  Stylesheet to convert Dilbert daily strips to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  Since June 18, 2013 dilbert.com strips are not accessible anymore
-  directly from the RSS feed, this message is displayed instead:
-
-    Dilbert readers - Please visit Dilbert.com to read this feature. Due
-    to changes with our feeds, we are now making this RSS feed a link to
-    Dilbert.com.
-
-  How unhandy is that, was it because of a management decision?
-  Maybe a parody dilbert strip is needed about this issue...
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
-
-    <xsl:template match="//section[@class='comic-item']">
-        <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
-        <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
-        <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
-        <item>
-            <title>
-                <xsl:variable name="title-length" select="140"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($picture-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$picture-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', normalize-space(.//date))"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <img src="{$picture-url}" alt="{$picture-title}"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $picture-url)"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
-        <xsl:variable name="channel-link" select="$BaseURL"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="//meta[@property='og:description']/@content"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//section[@class='comic-item']"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl

deleted file mode 100644 (file)

index 418b3d2..0000000
--- a/rss_converter_facebook.com.xsl
+++ /dev/null
@@ -1,141 +0,0 @@
-<!--
-  Stylesheet to convert a Facebook public page to RSS.
-
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
-  https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
-
-  They suggest to use the Graph API but they fail to mention that it does not
-  work anymore without authentication, so it cannot be considered an
-  _equivalent_ solution.
-
-  Luckily we've got Tweeper!
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://facebook.com</xsl:text>
-    </xsl:variable>
-
-    <!--
-         Extract the page id from an element like:
-        <meta property="al:android:url" content="fb://page/793837197390834">
-
-        The page id will be used to build the permalink.
-    -->
-    <xsl:variable
-        name="page-id"
-        select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
-
-    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
-        <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
-        <xsl:variable
-            name="item-permalink"
-            select="concat($BaseURL, '/permalink.php?id=', $page-id, '&amp;story_fbid=', $story-id)"/>
-
-        <!-- Get only the first child in order to skip the footer of the content -->
-        <xsl:variable name="item-content" select="div[1]"/>
-
-        <item>
-            <title>
-                <xsl:variable name="item-title" select="$item-content//p"/>
-                <xsl:variable name="title-length" select="140"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($item-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$item-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
-            </pubDate>
-            <description>
-
-                <!--
-                     Get only the children starting from the one with class="userContent",
-                     this way the content header is skipped
-                -->
-                <xsl:variable
-                    name="usercontent-position"
-                    select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/>
-
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                    <xsl:copy-of select="//div[@data-id='1']/node()"/>
-                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_howtoons.com.xsl b/rss_converter_howtoons.com.xsl

deleted file mode 100644 (file)

index 403b9ac..0000000
--- a/rss_converter_howtoons.com.xsl
+++ /dev/null
@@ -1,102 +0,0 @@
-<!--
-  Stylesheet to convert Howtoons.com to RSS.
-
-  Copyright (C) 2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  The RSS feed link is broken on http://howtoons.com so just work around it.
-
-  Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
-  scraping other Wordpress sites.
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>http://howtoons.com</xsl:text>
-    </xsl:variable>
-
-    <xsl:template match="//div[contains(@id, 'post-')]">
-        <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
-        <item>
-            <title>
-                <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
-                <!-- date format is MM.DD.YY -->
-                <xsl:variable name="month" select="substring($date, 1, 2)"/>
-                <xsl:variable name="day" select="substring($date, 4, 2)"/>
-                <xsl:variable name="year" select="substring($date, 7, 2)"/>
-                <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="$BaseURL"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_identi.ca.xsl b/rss_converter_identi.ca.xsl

deleted file mode 120000 (symlink)

index d8042a1..0000000
--- a/rss_converter_identi.ca.xsl
+++ /dev/null
@@ -1 +0,0 @@
-rss_converter_pump.io.xsl
-\ No newline at end of file
diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl

deleted file mode 100644 (file)

index e869d7d..0000000
--- a/rss_converter_instagram.com.xsl
+++ /dev/null
@@ -1,135 +0,0 @@
-<!--
-  Stylesheet to convert Instagram user timelines to RSS.
-
-  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:param name="generate-enclosure"/>
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://instagram.com</xsl:text>
-    </xsl:variable>
-
-    <xsl:variable name="user-name" select="//ProfilePage/user/username"/>
-
-    <!-- Some users do not specify the full name -->
-    <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/>
-    <xsl:variable name="screen-name">
-        <xsl:choose>
-            <xsl:when test="$full-name != ''">
-                <xsl:value-of select="$full-name"/>
-            </xsl:when>
-            <xsl:otherwise>
-                <xsl:value-of select="$user-name"/>
-            </xsl:otherwise>
-        </xsl:choose>
-    </xsl:variable>
-
-    <xsl:template match="//ProfilePage/user/media/nodes">
-        <xsl:variable name="item-content-image" select="./display_src"/>
-        <xsl:variable name="item-content-caption" select="./caption"/>
-        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
-        <item>
-            <title>
-                <xsl:variable name="title-length" select="140"/>
-                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
-                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
-                <xsl:choose>
-                    <xsl:when test="string-length($item-content-title) > $title-length">
-                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
-                        <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
-                        <xsl:text>...</xsl:text>
-                    </xsl:when>
-                    <xsl:otherwise>
-                        <xsl:value-of select="$item-content-title"/>
-                    </xsl:otherwise>
-                </xsl:choose>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select="./date"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <p>
-                    <xsl:if test="./is_video/text() = 1">
-                        (Video)
-                    </xsl:if>
-                    <xsl:value-of select="$item-content-caption"/>
-                </p><br />
-                <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
-        <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
-                    <xsl:variable name="external-url" select="//user/external_url"/>
-                    <xsl:if test="$external-url != ''">
-                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
-                    </xsl:if>
-                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//ProfilePage/user/profile_pic_url"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//ProfilePage/user/media/nodes"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl

deleted file mode 100644 (file)

index 1577dcf..0000000
--- a/rss_converter_pump.io.xsl
+++ /dev/null
@@ -1,99 +0,0 @@
-<!--
-  Stylesheet to convert Pump.io activity streams to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<!-- To Evan, please reconsider publishing RSS ouput for _public_ contents -->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/>
-    <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/>
-
-    <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
-
-    <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
-        <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
-        <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
-        <item>
-            <title>
-                <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/>
-            </pubDate>
-            <description>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select="$item-content/node()"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/>
-                <xsl:if test="$image-thumb-link">
-                    <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/>
-                    <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/>
-                </xsl:if>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
-        <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl

deleted file mode 100644 (file)

index c154141..0000000
--- a/rss_converter_twitter.com.xsl
+++ /dev/null
@@ -1,208 +0,0 @@
-<!--
-  Stylesheet to convert Twitter user timelines to RSS.
-
-  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:param name="generate-enclosure"/>
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>https://twitter.com</xsl:text>
-    </xsl:variable>
-
-    <!-- Identity transform -->
-    <xsl:template match="@*|node()">
-        <xsl:copy>
-            <xsl:apply-templates select="@*|node()"/>
-        </xsl:copy>
-    </xsl:template>
-
-    <!--
-         Anchors to external links provide the direct URL in the
-         data-expanded-url attribute, so use this in the href attribute too
-         instead of the default short URL which uses the t.co redirection
-         service.
-
-         NOTE: when creating an element, attributes must be processed _before_
-         adding the contents (either children or a value):
-         http://stackoverflow.com/questions/21984867/
-    -->
-    <xsl:template match="a[@data-expanded-url]">
-        <!-- Prepend and append a white space for aestethic reasons -->
-        <xsl:text> </xsl:text>
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="@data-expanded-url"/>
-            </xsl:attribute>
-            <!-- Also strip &nbsp; and &hellip; -->
-            <xsl:value-of select="translate(., '&#xA0;&#x2026;', '')"/>
-        </a>
-        <xsl:text> </xsl:text>
-    </xsl:template>
-
-    <!--
-         These are links to pic.twitter.com, use the direct link for those
-         too instead of the t.co redirections.
-    -->
-    <xsl:template match="a[@data-pre-embedded='true']">
-        <!-- Prepend and append a white space for aestethic reasons -->
-        <xsl:text> </xsl:text>
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="concat('https://', .)"/>
-            </xsl:attribute>
-            <xsl:value-of select="concat('https://', .)"/>
-        </a>
-        <xsl:text> </xsl:text>
-    </xsl:template>
-
-    <!-- Present images in a more convenient way -->
-    <xsl:template match="div[@data-image-url]">
-        <a>
-            <xsl:attribute name="href">
-                <xsl:value-of select="concat(@data-image-url, ':orig')"/>
-            </xsl:attribute>
-            <img>
-                <xsl:attribute name="src">
-                    <xsl:value-of select="@data-image-url"/>
-                </xsl:attribute>
-            </img>
-        </a>
-    </xsl:template>
-
-    <!-- Don't repeat background in embedded media content -->
-    <xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
-        <xsl:copy>
-            <xsl:apply-templates select="@*"/>
-            <xsl:attribute name="style">
-                <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/>
-            </xsl:attribute>
-            <xsl:apply-templates select="node()"/>
-        </xsl:copy>
-    </xsl:template>
-
-    <xsl:template match="a[@data-expanded-url]" mode="enclosure">
-        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/>
-    </xsl:template>
-
-    <xsl:template match="div[@data-image-url]" mode="enclosure">
-        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
-    </xsl:template>
-
-    <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
-
-    <xsl:template match="//li[@data-item-id and @data-item-type='tweet']">
-        <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
-        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
-        <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
-        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
-
-        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
-        <item>
-            <title>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:if test="$item-has-video">
-                    <xsl:text>(Video) </xsl:text>
-                </xsl:if>
-                <!--
-                     Prepend a space in front of the URLs which are not
-                     preceded by an open parenthesis, for aestethic reasons.
-                     Also, regex, I know: http://xkcd.com/1171/
-                -->
-                <xsl:variable
-                    name="processed-title"
-                    select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/>
-                <!-- Also strip &nbsp; and &hellip; -->
-                <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
-                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', number($timestamp))"/>
-            </pubDate>
-            <description>
-                <xsl:value-of select="concat($user-name, ': ')"/>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:if test="$item-has-video">
-                    <xsl:text>(Video) </xsl:text>
-                </xsl:if>
-                <xsl:apply-templates select="$item-content/node()"/>
-                <xsl:apply-templates select="$item-media/node()"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-            <xsl:if test="$generate-enclosure = 1">
-                <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
-                <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
-            </xsl:if>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title">
-            <xsl:choose>
-                <xsl:when test="$screen-name != ''">
-                    <xsl:value-of select="concat('Twitter / ', $screen-name)"/>
-                </xsl:when>
-                <xsl:otherwise>
-                    <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
-                </xsl:otherwise>
-            </xsl:choose>
-        </xsl:variable>
-        <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
diff --git a/src/Tweeper.php b/src/Tweeper.php

new file mode 100644 (file)

index 0000000..73cbe81
--- /dev/null
+++ b/src/Tweeper.php
@@ -0,0 +1,365 @@
+<?php
+
+namespace Tweeper;
+
+/**
+ * @file
+ * Tweeper - a Twitter to RSS web scraper.
+ *
+ * Copyright (C) 2013-2015  Antonio Ospite <ao2@ao2.it>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+use DOMDocument;
+use XSLTProcessor;
+
+require_once 'Symfony/Component/Serializer/autoload.php';
+
+use Symfony\Component\Serializer\Serializer;
+use Symfony\Component\Serializer\Encoder\XmlEncoder;
+use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
+
+date_default_timezone_set('UTC');
+
+/**
+ * Scrape supported websites and perform conversion to RSS.
+ */
+class Tweeper {
+
+  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
+
+  /**
+   * Whether RSS <enclosure/> elements are generated for media links.
+   *
+   * The value is handed to the XSL stylesheets as the 'generate-enclosure'
+   * parameter. The property used to be created dynamically by the
+   * constructor (and was therefore public), so it is declared public here to
+   * keep the same visibility while avoiding dynamic property creation.
+   *
+   * @var bool
+   */
+  public $generate_enclosure = FALSE;
+
+  /**
+   * Constructor sets up {@link $generate_enclosure}.
+   *
+   * @param bool $generate_enclosure
+   *   TRUE to ask the stylesheets to emit <enclosure/> elements.
+   */
+  public function __construct($generate_enclosure = FALSE) {
+    $this->generate_enclosure = $generate_enclosure;
+  }
+
+  /**
+   * Convert numeric Epoch to the date format expected in a RSS document.
+   *
+   * The stylesheets call this through php:functionString() using the string
+   * 'Tweeper::epochToRssDate', so the value usually arrives as a string.
+   * NOTE(review): this class is declared inside the "Tweeper" namespace; the
+   * callable string used by the stylesheets may need to be namespace
+   * qualified for the call to resolve. To be confirmed against the caller
+   * that loads this file.
+   *
+   * @param mixed $timestamp
+   *   Seconds since the Unix Epoch, as a number or a numeric string.
+   *
+   * @return string
+   *   The date formatted with DATE_RSS in UTC. Values which are not finite
+   *   numbers are treated as 0 (the Epoch).
+   */
+  public static function epochToRssDate($timestamp) {
+    // Non numeric input (e.g. the "NaN" produced by XPath number()) and
+    // non-finite floats cannot be represented as a date: use the Epoch.
+    if (!is_numeric($timestamp) || !is_finite((float) $timestamp)) {
+      return gmdate(DATE_RSS, 0);
+    }
+
+    // gmdate() expects an integer; truncate explicitly instead of relying on
+    // the implicit float to int conversion, which newer PHP versions
+    // complain about for fractional values.
+    return gmdate(DATE_RSS, (int) $timestamp);
+  }
+
+  /**
+   * Convert generic date string to the date format expected in a RSS document.
+   *
+   * @param string $date
+   *   Any date representation understood by strtotime().
+   *
+   * @return string
+   *   The date in RSS format; unparsable input yields the Epoch.
+   */
+  public static function strToRssDate($date) {
+    $parsed = strtotime($date);
+
+    // strtotime() reports an unparsable string with FALSE.
+    return self::epochToRssDate(FALSE === $parsed ? 0 : $parsed);
+  }
+
+  /**
+   * Convert string to UpperCamelCase.
+   *
+   * @param string $str
+   *   The string to convert, e.g. a host name.
+   * @param string $delim
+   *   The word delimiter; it is removed from the result.
+   *
+   * @return string
+   *   The words of $str capitalized and joined together.
+   */
+  public static function toUpperCamelCase($str, $delim = ' ') {
+    // Capitalize every word first, then drop the delimiters between them.
+    return str_replace($delim, '', ucwords($str, $delim));
+  }
+
+  /**
+   * Get the contents from a URL.
+   *
+   * Besides remote pages this is also used to read the local stylesheets
+   * through file:// URLs.
+   *
+   * @param string $url
+   *   The URL to retrieve.
+   *
+   * @return string|false
+   *   The body of the response, or FALSE on failure (an error is triggered
+   *   with the cURL error message in that case).
+   */
+  private static function getUrlContents($url) {
+    $ch = curl_init($url);
+    if (FALSE === $ch) {
+      trigger_error("Cannot initialize a cURL session for URL: $url");
+      return FALSE;
+    }
+
+    curl_setopt_array($ch, array(
+      CURLOPT_HEADER => FALSE,
+      // Follow http redirects to get the real URL.
+      CURLOPT_FOLLOWLOCATION => TRUE,
+      CURLOPT_RETURNTRANSFER => TRUE,
+      // NOTE(review): TLS certificate and host name verification are
+      // disabled, which makes the transfer vulnerable to interception.
+      // Left as is to preserve the current behavior; consider enabling
+      // verification.
+      CURLOPT_SSL_VERIFYHOST => FALSE,
+      CURLOPT_SSL_VERIFYPEER => FALSE,
+      CURLOPT_HTTPHEADER => array('Accept-language: en'),
+      CURLOPT_USERAGENT => Tweeper::$userAgent,
+    ));
+    $contents = curl_exec($ch);
+    if (FALSE === $contents) {
+      trigger_error(curl_error($ch));
+    }
+    curl_close($ch);
+
+    return $contents;
+  }
+
+  /**
+   * Get the headers from a URL.
+   *
+   * A body-less request is performed (redirects are followed) and the
+   * transfer information collected by cURL is returned.
+   *
+   * @param string $url
+   *   The URL to inspect.
+   *
+   * @return array|false
+   *   The array returned by curl_getinfo() (keys such as 'url',
+   *   'content_type' and 'download_content_length'), or FALSE when the
+   *   information cannot be retrieved at all. When the transfer itself
+   *   fails an error is triggered, and the array is still returned with
+   *   whatever cURL could collect.
+   */
+  private static function getUrlInfo($url) {
+    $ch = curl_init($url);
+    if (FALSE === $ch) {
+      trigger_error("Cannot initialize a cURL session for URL: $url");
+      return FALSE;
+    }
+
+    curl_setopt_array($ch, array(
+      CURLOPT_HEADER => TRUE,
+      CURLOPT_NOBODY => TRUE,
+      // Follow http redirects to get the real URL.
+      CURLOPT_FOLLOWLOCATION => TRUE,
+      CURLOPT_RETURNTRANSFER => TRUE,
+      // NOTE(review): TLS verification is disabled here as well; consider
+      // enabling it.
+      CURLOPT_SSL_VERIFYHOST => FALSE,
+      CURLOPT_SSL_VERIFYPEER => FALSE,
+      CURLOPT_USERAGENT => Tweeper::$userAgent,
+    ));
+
+    // The outcome of the transfer is what tells whether something went
+    // wrong; curl_getinfo() returns an array even after a failed transfer.
+    if (FALSE === curl_exec($ch)) {
+      trigger_error(curl_error($ch));
+    }
+
+    $url_info = curl_getinfo($ch);
+    if (FALSE === $url_info) {
+      trigger_error(curl_error($ch));
+    }
+    curl_close($ch);
+
+    return $url_info;
+  }
+
+  /**
+   * Generate an RSS <enclosure/> element.
+   *
+   * The URL is probed with a header-only request; redirects are followed and
+   * the final URL is the one written to the element.
+   *
+   * @param string $url
+   *   URL of the candidate media file.
+   *
+   * @return \DOMElement|string
+   *   The <enclosure/> element, or an empty string when the URL cannot be
+   *   inspected or its content type is not in the supported list.
+   */
+  public static function generateEnclosure($url) {
+    $supported_content_types = array(
+      "application/octet-stream",
+      "application/ogg",
+      "application/pdf",
+      "audio/aac",
+      "audio/mp4",
+      "audio/mpeg",
+      "audio/ogg",
+      "audio/vorbis",
+      "audio/wav",
+      "audio/webm",
+      "audio/x-midi",
+      "image/gif",
+      "image/jpeg",
+      "image/png",
+      "video/avi",
+      "video/mp4",
+      "video/mpeg",
+      "video/ogg",
+    );
+
+    $url_info = Tweeper::getUrlInfo($url);
+    if (!is_array($url_info)) {
+      error_log("Cannot retrieve enclosure information for URL: " . $url);
+      return '';
+    }
+
+    // The Content-Type header may carry parameters (e.g. "; charset=..."),
+    // only the bare media type is meaningful for the comparison and for the
+    // "type" attribute.
+    $content_type_parts = explode(';', (string) $url_info['content_type']);
+    $content_type = strtolower(trim($content_type_parts[0]));
+
+    $supported = in_array($content_type, $supported_content_types, TRUE);
+    if (!$supported) {
+      error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']);
+      return '';
+    }
+
+    // The RSS specification says that the enclosure element URL must be http.
+    // See http://sourceforge.net/p/feedvalidator/bugs/72/
+    $http_url = preg_replace("/^https:/", "http:", $url_info['url']);
+
+    // cURL reports -1 when the server does not announce the size; the
+    // length attribute must not be negative, 0 stands for "unknown".
+    $length = (int) $url_info['download_content_length'];
+    if ($length < 0) {
+      $length = 0;
+    }
+
+    $dom = new DOMDocument();
+    $enc = $dom->createElement('enclosure');
+    $enc->setAttribute('url', $http_url);
+    $enc->setAttribute('length', (string) $length);
+    $enc->setAttribute('type', $content_type);
+
+    return $enc;
+  }
+
+  /**
+   * Mimic the message from libxml.c::php_libxml_ctx_error_level()
+   *
+   * @param object $error
+   *   A libxml error as returned by libxml_get_errors(); its level, code,
+   *   message, file and line properties are used to build one log line.
+   */
+  private static function logXmlError($error) {
+    // Severity prefixes; levels not listed here get no prefix at all.
+    $prefixes = array(
+      LIBXML_ERR_WARNING => "Warning $error->code: ",
+      LIBXML_ERR_ERROR => "Error $error->code: ",
+      LIBXML_ERR_FATAL => "Fatal Error $error->code: ",
+    );
+
+    $message = isset($prefixes[$error->level]) ? $prefixes[$error->level] : "";
+    $message .= trim($error->message);
+    // Errors which do not come from a file are attributed to "Entity".
+    $message .= $error->file ? " in $error->file" : " in Entity,";
+    $message .= " line $error->line";
+
+    error_log($message);
+  }
+
+  /**
+   * Convert json to XML.
+   *
+   * @param string $json
+   *   The JSON text to convert.
+   * @param string $root_node_name
+   *   The name given to the root element of the generated document.
+   *
+   * @return string|null
+   *   The serialized XML, or NULL when the JSON decodes to an empty value or
+   *   the serialization produces nothing.
+   */
+  private static function jsonToXml($json, $root_node_name) {
+    // Decode to associative arrays: apparently the ObjectNormalizer used
+    // below is not able to handle the stdClass objects that json_decode()
+    // creates by default.
+    $decoded = json_decode($json, TRUE);
+    if (!$decoded) {
+      return NULL;
+    }
+
+    $serializer = new Serializer(array(new ObjectNormalizer()), array(new XmlEncoder()));
+
+    $xml = $serializer->serialize($decoded, 'xml', array(
+      'xml_encoding' => "UTF-8",
+      'xml_format_output' => TRUE,
+      'xml_root_node_name' => $root_node_name,
+    ));
+    if ($xml) {
+      return $xml;
+    }
+
+    trigger_error("Cannot serialize data", E_USER_ERROR);
+    return NULL;
+  }
+
+  /**
+   * Convert the Instagram content to XML.
+   *
+   * The page data is looked up in the JSON assigned to window._sharedData
+   * in the HTML code, and converted to XML with an <instagram> root element.
+   *
+   * @param string $html
+   *   The HTML code of the page.
+   *
+   * @return string|null
+   *   The XML data, or NULL when the JSON cannot be found or converted.
+   */
+  private function getXmlInstagramCom($html) {
+    $json_match_expr = '/window._sharedData = (.*);/';
+
+    $captures = array();
+    if (1 === preg_match($json_match_expr, $html, $captures)) {
+      return Tweeper::jsonToXml($captures[1], 'instagram');
+    }
+
+    trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
+    return NULL;
+  }
+
+  /**
+   * Make the Facebook HTML processable.
+   *
+   * The HTML comment delimiters are removed, so the markup they enclose
+   * becomes part of the document.
+   *
+   * @param string $html
+   *   The HTML code of the page.
+   *
+   * @return string
+   *   The HTML code without comment delimiters.
+   */
+  private function preprocessHtmlFacebookCom($html) {
+    // The delimiters are replaced in order: openers first, then closers.
+    return str_replace(array('<!--', '-->'), '', $html);
+  }
+
+  /**
+   * Convert the HTML retrieved from the site to XML.
+   *
+   * When the class has a getXml<HostInCamelCase>() method it is used to
+   * extract XML data from the page, otherwise the page is parsed as HTML.
+   * Parser warnings and errors are collected and sent to the error log.
+   *
+   * @param string $html
+   *   The (possibly preprocessed) content retrieved from the site.
+   * @param string $host
+   *   The host name the content comes from, e.g. "instagram.com".
+   *
+   * @return \DOMDocument|null
+   *   The parsed document, or NULL when there is nothing to parse or the
+   *   content cannot be loaded.
+   */
+  private function htmlToXml($html, $host) {
+    $xmlDoc = new DOMDocument();
+
+    // Handle warnings and errors when loading invalid HTML.
+    $xml_errors_value = libxml_use_internal_errors(TRUE);
+
+    // If there is a host-specific method to get the XML data, use it!
+    $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
+    if (method_exists($this, $get_xml_host_method)) {
+      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
+      // The host-specific methods return NULL on failure; never hand an
+      // empty source to the parser.
+      $loaded = is_string($xml_data) && '' !== $xml_data && $xmlDoc->loadXML($xml_data);
+    }
+    else {
+      $loaded = is_string($html) && '' !== $html && $xmlDoc->loadHTML($html);
+    }
+
+    foreach (libxml_get_errors() as $xml_error) {
+      Tweeper::logXmlError($xml_error);
+    }
+    libxml_clear_errors();
+    // Restore the previous libxml error handling mode on every path.
+    libxml_use_internal_errors($xml_errors_value);
+
+    if (!$loaded) {
+      trigger_error("Cannot load the content retrieved from $host");
+      return NULL;
+    }
+
+    return $xmlDoc;
+  }
+
+  /**
+   * Load a stylesheet if the web site is supported.
+   *
+   * A site is supported when a file named rss_converter_<host>.xsl exists
+   * next to this file.
+   *
+   * @param string $host
+   *   The host name of the site, e.g. "dilbert.com".
+   *
+   * @return \XSLTProcessor|null
+   *   A processor with the stylesheet imported, PHP functions registered and
+   *   the 'generate-enclosure' parameter set; NULL when the stylesheet is
+   *   missing or cannot be loaded.
+   */
+  private function loadStylesheet($host) {
+    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
+    if (FALSE === file_exists($stylesheet)) {
+      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
+      return NULL;
+    }
+
+    $stylesheet_contents = Tweeper::getUrlContents($stylesheet);
+    if (FALSE === $stylesheet_contents || '' === $stylesheet_contents) {
+      trigger_error("Cannot read stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    $xslDoc = new DOMDocument();
+    if (FALSE === $xslDoc->loadXML($stylesheet_contents)) {
+      trigger_error("Cannot parse stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    $xsltProcessor = new XSLTProcessor();
+    $xsltProcessor->registerPHPFunctions();
+    $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
+    if (FALSE === $xsltProcessor->importStylesheet($xslDoc)) {
+      trigger_error("Cannot import stylesheet $stylesheet", E_USER_ERROR);
+      return NULL;
+    }
+
+    return $xsltProcessor;
+  }
+
+  /**
+   * Convert the site content to RSS.
+   *
+   * The host of the URL selects the stylesheet and the optional
+   * host-specific preprocessing and XML extraction methods.
+   *
+   * @param string $src_url
+   *   The http or https URL of the page to convert.
+   *
+   * @return string|null
+   *   The RSS document, or NULL when the URL is not valid, the site is not
+   *   supported, or the retrieval or conversion fails.
+   */
+  public function tweep($src_url) {
+    $url = parse_url($src_url);
+    if (FALSE === $url || empty($url["host"])) {
+      trigger_error("Invalid URL: $src_url", E_USER_ERROR);
+      return NULL;
+    }
+
+    // parse_url() does not set "scheme" for scheme-less input such as
+    // "//example.com/user"; schemes are case-insensitive.
+    $scheme = isset($url["scheme"]) ? strtolower($url["scheme"]) : "";
+    if (!in_array($scheme, array("http", "https"), TRUE)) {
+      trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
+      return NULL;
+    }
+
+    // Host names are case-insensitive while the stylesheet file names are
+    // lower case, so normalize the host before using it.
+    // Strip the leading www. to be more forgiving on input URLs.
+    $host = preg_replace('/^www\./', '', strtolower($url["host"]));
+
+    $xsltProcessor = $this->loadStylesheet($host);
+    if (NULL === $xsltProcessor) {
+      return NULL;
+    }
+
+    $html = Tweeper::getUrlContents($src_url);
+    if (FALSE === $html) {
+      return NULL;
+    }
+
+    // Give host-specific code a chance to fix up the content before parsing.
+    $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
+    if (method_exists($this, $preprocess_html_host_method)) {
+      $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+    }
+
+    $xmlDoc = $this->htmlToXml($html, $host);
+    if (NULL === $xmlDoc) {
+      return NULL;
+    }
+
+    $output = $xsltProcessor->transformToXML($xmlDoc);
+
+    if (FALSE === $output) {
+      trigger_error('XSL transformation failed.', E_USER_ERROR);
+      return NULL;
+    }
+    return $output;
+  }
+
+}
diff --git a/src/rss_converter_dilbert.com.xsl b/src/rss_converter_dilbert.com.xsl

new file mode 100644 (file)

index 0000000..b6d1975
--- /dev/null
+++ b/src/rss_converter_dilbert.com.xsl
@@ -0,0 +1,115 @@
+<!--
+  Stylesheet to convert Dilbert daily strips to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  Since June 18, 2013 dilbert.com strips are not accessible anymore
+  directly from the RSS feed, this message is displayed instead:
+
+    Dilbert readers - Please visit Dilbert.com to read this feature. Due
+    to changes with our feeds, we are now making this RSS feed a link to
+    Dilbert.com.
+
+  How unhandy is that, was it because of a management decision?
+  Maybe a parody dilbert strip is needed about this issue...
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!--
+         Set by the caller; enclosure elements are generated only when it
+         equals 1. Declared explicitly because the item template below
+         references it, an undeclared variable reference is an error in XSLT.
+    -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!-- The canonical page URL advertised by the site in its og:url meta element -->
+    <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
+
+    <!--
+         Convert one comic-item section to an RSS item.
+
+         The link and guid come from the img-comic-link anchor, the title from
+         the alt text of the comic image and the publication date from the
+         text of the date element found in the section.
+    -->
+    <xsl:template match="//section[@class='comic-item']">
+        <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
+        <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
+        <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
+        <item>
+            <title>
+                <xsl:variable name="title-length" select="140"/>
+                <!--
+                     Ellipsize, inspired from http://stackoverflow.com/questions/13622338
+                     Titles longer than title-length are cut so that the text
+                     plus the three dots is exactly title-length characters.
+                -->
+                <xsl:choose>
+                    <xsl:when test="string-length($picture-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$picture-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', normalize-space(.//date))"/>
+            </pubDate>
+            <description>
+                <!-- The image markup is wrapped in a CDATA section written out verbatim -->
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <img src="{$picture-url}" alt="{$picture-title}"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <!-- The enclosure for the comic image is optional -->
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $picture-url)"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+         Entry point: build the RSS channel from the Open Graph meta elements
+         of the page, then add one item for each comic-item section.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
+        <xsl:variable name="channel-link" select="$BaseURL"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="//meta[@property='og:description']/@content"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <!-- The logo src is appended to BaseURL, presumably because it is a relative path; to be confirmed -->
+                        <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//section[@class='comic-item']"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl

new file mode 100644 (file)

index 0000000..418b3d2
--- /dev/null
+++ b/src/rss_converter_facebook.com.xsl
@@ -0,0 +1,141 @@
+<!--
+  Stylesheet to convert a Facebook public page to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
+  https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
+
+  They suggest to use the Graph API but they fail to mention that it does not
+  work anymore without authentication, so it cannot be considered an
+  _equivalent_ solution.
+
+  Luckily we've got Tweeper!
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://facebook.com</xsl:text>
+    </xsl:variable>
+
+    <!--
+         Extract the page id from an element like:
+        <meta property="al:android:url" content="fb://page/793837197390834">
+
+        The page id will be used to build the permalink.
+    -->
+    <xsl:variable
+        name="page-id"
+        select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
+
+    <!--
+         Convert one userContentWrapper div (a story of the page) to an RSS
+         item. The permalink is built from the page id and from the story id
+         found in the ft_ent_identifier input of the story.
+    -->
+    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+        <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
+        <xsl:variable
+            name="item-permalink"
+            select="concat($BaseURL, '/permalink.php?id=', $page-id, '&amp;story_fbid=', $story-id)"/>
+
+        <!-- Get only the first child in order to skip the footer of the content -->
+        <xsl:variable name="item-content" select="div[1]"/>
+
+        <item>
+            <title>
+                <!--
+                     The variable selects all the paragraphs of the content;
+                     when a node-set is converted to a string in XSLT 1.0 only
+                     its first node is used, so the title is the text of the
+                     first paragraph.
+                -->
+                <xsl:variable name="item-title" select="$item-content//p"/>
+                <xsl:variable name="title-length" select="140"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <!-- data-utime is passed to epochToRssDate, so it is expected to hold an Epoch timestamp -->
+                <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
+                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
+            </pubDate>
+            <description>
+
+                <!--
+                     Get only the children starting from the one with class="userContent",
+                     this way the content header is skipped.
+
+                     NOTE(review): the position is computed by counting all the
+                     preceding element siblings, but it is then compared with
+                     position() among the div children only; the two agree as
+                     long as all the preceding siblings are div elements.
+                     To be confirmed against the actual markup.
+                -->
+                <xsl:variable
+                    name="usercontent-position"
+                    select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/>
+
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+
+    <!--
+         Entry point: build the RSS channel from the page title and profile
+         picture, then add one item for each userContentWrapper div.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//title"/>
+        <!-- The channel link is taken from the first anchor of the first story -->
+        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <!-- The content of the div with data-id="1" is copied verbatim inside a CDATA section -->
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:copy-of select="//div[@data-id='1']/node()"/>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_howtoons.com.xsl b/src/rss_converter_howtoons.com.xsl

new file mode 100644 (file)

index 0000000..403b9ac
--- /dev/null
+++ b/src/rss_converter_howtoons.com.xsl
@@ -0,0 +1,102 @@
+<!--
+  Stylesheet to convert Howtoons.com to RSS.
+
+  Copyright (C) 2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+  The RSS feed link is broken on http://howtoons.com so just work around it.
+
+  Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
+  scraping other Wordpress sites.
+-->
+
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <xsl:variable name="BaseURL">
+        <xsl:text>http://howtoons.com</xsl:text>
+    </xsl:variable>
+
+    <!--
+         Convert one post (a div whose id contains 'post-') to an RSS item.
+         Title and permalink come from the anchor in the post-headline div,
+         the description is made of the paragraphs of the post-bodycopy div.
+    -->
+    <xsl:template match="//div[contains(@id, 'post-')]">
+        <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
+        <item>
+            <title>
+                <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <!-- The date is the part of the byline which comes before the first comma -->
+                <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
+                <!-- date format is MM.DD.YY -->
+                <xsl:variable name="month" select="substring($date, 1, 2)"/>
+                <xsl:variable name="day" select="substring($date, 4, 2)"/>
+                <xsl:variable name="year" select="substring($date, 7, 2)"/>
+                <!-- The two digit year is always prefixed with '20' to get YYYY-MM-DD -->
+                <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/>
+                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/>
+            </pubDate>
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+        </item>
+    </xsl:template>
+
+    <!--
+         Entry point: build the RSS channel (the description and the image URL
+         are fixed strings), then add one item for each post of the page.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="//title"/>
+        <xsl:variable name="channel-link" select="$BaseURL"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_identi.ca.xsl b/src/rss_converter_identi.ca.xsl

new file mode 120000 (symlink)

index 0000000..d8042a1
--- /dev/null
+++ b/src/rss_converter_identi.ca.xsl
@@ -0,0 +1 @@
+rss_converter_pump.io.xsl
+\ No newline at end of file
diff --git a/src/rss_converter_instagram.com.xsl b/src/rss_converter_instagram.com.xsl

new file mode 100644 (file)

index 0000000..e869d7d
--- /dev/null
+++ b/src/rss_converter_instagram.com.xsl
@@ -0,0 +1,135 @@
+<!--
+  Stylesheet to convert Instagram user timelines to RSS.
+
+  Copyright (C) 2015  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!-- When set to 1, each item also gets an enclosure for its image -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!-- Site root; used for xml:base and to build permalinks and the channel link -->
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://instagram.com</xsl:text>
+    </xsl:variable>
+
+    <!-- Account handle, read from the ProfilePage/user element of the input -->
+    <xsl:variable name="user-name" select="//ProfilePage/user/username"/>
+
+    <!-- Some users do not specify the full name -->
+    <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/>
+    <!-- Display name: the full name when it is not empty, otherwise the user name -->
+    <xsl:variable name="screen-name">
+        <xsl:choose>
+            <xsl:when test="$full-name != ''">
+                <xsl:value-of select="$full-name"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of select="$user-name"/>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+    <!--
+      Item template: produces one RSS item for each media entry.
+
+      The permalink is $BaseURL + '/p/' + the entry's code + '/'. The title
+      is "user-name: caption" with whitespace normalized, shortened to at
+      most 140 characters (137 characters plus "...") when it is longer.
+      The description embeds the caption and a linked image as CDATA.
+    -->
+    <xsl:template match="//ProfilePage/user/media/nodes">
+        <xsl:variable name="item-content-image" select="./display_src"/>
+        <xsl:variable name="item-content-caption" select="./caption"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
+        <item>
+            <title>
+                <xsl:variable name="title-length" select="140"/>
+                <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
+                <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+                <xsl:choose>
+                    <xsl:when test="string-length($item-content-title) > $title-length">
+                        <xsl:variable name="truncated-length" select="$title-length - 3"/>
+                        <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
+                        <xsl:text>...</xsl:text>
+                    </xsl:when>
+                    <xsl:otherwise>
+                        <xsl:value-of select="$item-content-title"/>
+                    </xsl:otherwise>
+                </xsl:choose>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <!--
+              The entry's date value is handed to a PHP callback for
+              formatting.
+              NOTE(review): the callback is looked up by the literal string
+              'Tweeper::epochToRssDate'; if the Tweeper class lives in a PHP
+              namespace, confirm that this unqualified name still resolves.
+            -->
+            <pubDate>
+                <xsl:variable name="timestamp" select="./date"/>
+                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/>
+            </pubDate>
+            <!--
+              The CDATA delimiters are written with output escaping disabled
+              so that they reach the output verbatim around the markup.
+            -->
+            <description>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <p>
+                    <xsl:if test="./is_video/text() = 1">
+                        (Video)
+                    </xsl:if>
+                    <xsl:value-of select="$item-content-caption"/>
+                </p><br />
+                <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <!-- Optional enclosure for the image, built by a PHP callback -->
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+      Root template: builds the RSS 2.0 document.
+
+      The channel title is 'Instagram / ' followed by the screen name and
+      the channel link is $BaseURL + '/' + the user name. The description
+      is the screen name and biography, followed by a link to the user's
+      external URL when one is present, wrapped in CDATA. Items come from
+      applying templates to every media entry.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
+        <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                    <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
+                    <xsl:variable name="external-url" select="//user/external_url"/>
+                    <xsl:if test="$external-url != ''">
+                        <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
+                    </xsl:if>
+                    <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//ProfilePage/user/profile_pic_url"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//ProfilePage/user/media/nodes"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_pump.io.xsl b/src/rss_converter_pump.io.xsl

new file mode 100644 (file)

index 0000000..1577dcf
--- /dev/null
+++ b/src/rss_converter_pump.io.xsl
@@ -0,0 +1,99 @@
+<!--
+  Stylesheet to convert Pump.io activity streams to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<!-- To Evan, please reconsider publishing RSS output for _public_ contents -->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!--
+      When set to 1, items also get an enclosure element for their image.
+      Declared here so that the reference in the item template is always
+      bound, even when the caller passes no value; an unset parameter is
+      the empty string, which never equals 1.
+    -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!--
+      The profile id is split at ':' and '@' below, so it is presumably of
+      the form "scheme:user@host" (TODO confirm against a live page).
+      The part after '@' is used as the host of the site.
+    -->
+    <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/>
+    <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/>
+
+    <!-- Everything after the first ':' of the profile id -->
+    <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
+
+    <!--
+      Item template: produces one RSS item for each entry of the major
+      activity stream.
+
+      The title is "user-name: " followed by the whitespace-normalized text
+      of the activity content. The description repeats the user name and
+      then embeds a copy of the activity content nodes as CDATA. The date
+      string found in the easydate abbr title is converted by a PHP
+      callback.
+    -->
+    <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
+        <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
+        <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
+        <item>
+            <title>
+                <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/>
+            </pubDate>
+            <description>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:copy-of select="$item-content/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <!--
+              Optional enclosure: '_thumb' is removed from the thumbnail URL
+              (presumably yielding the full-size image; verify) and the
+              result is passed to the PHP enclosure helper. Nothing is
+              emitted when the entry has no object-image.
+            -->
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/>
+                <xsl:if test="$image-thumb-link">
+                    <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/>
+                    <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/>
+                </xsl:if>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+      Root template: builds the RSS 2.0 document.
+
+      $user-name is split at '@': the channel title is "after / before" and
+      the channel link is "https://after/before". The description is the
+      normalized text of the media-header h1 and the image is the profile
+      picture. Items come from the entries of the major activity stream.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+        <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl

new file mode 100644 (file)

index 0000000..c154141
--- /dev/null
+++ b/src/rss_converter_twitter.com.xsl
@@ -0,0 +1,208 @@
+<!--
+  Stylesheet to convert Twitter user timelines to RSS.
+
+  Copyright (C) 2013-2014  Antonio Ospite <ao2@ao2.it>
+
+  This file is part of tweeper.
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:php="http://php.net/xsl"
+    xsl:extension-element-prefixes="php"
+    exclude-result-prefixes="php">
+
+    <!-- When set to 1, items also get enclosure elements for their links and images -->
+    <xsl:param name="generate-enclosure"/>
+
+    <xsl:output method="xml" indent="yes"/>
+
+    <!-- Site root; used for xml:base and prepended to tweet permalink paths -->
+    <xsl:variable name="BaseURL">
+        <xsl:text>https://twitter.com</xsl:text>
+    </xsl:variable>
+
+    <!--
+      Identity transform: copies every node and attribute unchanged, and
+      recurses into children, unless a more specific template in this
+      stylesheet matches instead.
+    -->
+    <xsl:template match="@*|node()">
+        <xsl:copy>
+            <xsl:apply-templates select="@*|node()"/>
+        </xsl:copy>
+    </xsl:template>
+
+    <!--
+         Anchors to external links provide the direct URL in the
+         data-expanded-url attribute, so use this in the href attribute too
+         instead of the default short URL which uses the t.co redirection
+         service.
+
+         NOTE: when creating an element, attributes must be processed _before_
+         adding the contents (either children or a value):
+         http://stackoverflow.com/questions/21984867/
+    -->
+    <xsl:template match="a[@data-expanded-url]">
+        <!-- Link text with &nbsp; and &hellip; characters removed -->
+        <xsl:variable name="link-text" select="translate(., '&#xA0;&#x2026;', '')"/>
+        <!-- A single space goes before and after the anchor for aesthetic reasons -->
+        <xsl:text> </xsl:text>
+        <a href="{@data-expanded-url}">
+            <xsl:value-of select="$link-text"/>
+        </a>
+        <xsl:text> </xsl:text>
+    </xsl:template>
+
+    <!--
+         These are links to pic.twitter.com, use the direct link for those
+         too instead of the t.co redirections.
+    -->
+    <xsl:template match="a[@data-pre-embedded='true']">
+        <!-- The anchor text prefixed with https:// serves as both target and label -->
+        <xsl:variable name="direct-url" select="concat('https://', .)"/>
+        <!-- A single space goes before and after the anchor for aesthetic reasons -->
+        <xsl:text> </xsl:text>
+        <a href="{$direct-url}">
+            <xsl:value-of select="$direct-url"/>
+        </a>
+        <xsl:text> </xsl:text>
+    </xsl:template>
+
+    <!-- Present images in a more convenient way -->
+    <xsl:template match="div[@data-image-url]">
+        <xsl:variable name="image-url" select="@data-image-url"/>
+        <!-- The inline image links to the same URL with ':orig' appended -->
+        <a href="{concat($image-url, ':orig')}">
+            <img src="{$image-url}"/>
+        </a>
+    </xsl:template>
+
+    <!-- Don't repeat background in embedded media content -->
+    <xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
+        <xsl:copy>
+            <!--
+              Order matters: all attributes are copied first, then style is
+              written again with the extra declaration appended, replacing
+              the copied value. Children are processed only after every
+              attribute has been added.
+            -->
+            <xsl:apply-templates select="@*"/>
+            <xsl:attribute name="style">
+                <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/>
+            </xsl:attribute>
+            <xsl:apply-templates select="node()"/>
+        </xsl:copy>
+    </xsl:template>
+
+    <!--
+      Enclosure mode: copies into the output whatever the PHP enclosure
+      helper returns for the anchor's expanded URL.
+    -->
+    <xsl:template match="a[@data-expanded-url]" mode="enclosure">
+        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/>
+    </xsl:template>
+
+    <!--
+      Enclosure mode: copies into the output whatever the PHP enclosure
+      helper returns for the image URL with ':orig' appended.
+    -->
+    <xsl:template match="div[@data-image-url]" mode="enclosure">
+        <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
+    </xsl:template>
+
+    <!--
+      Screen name read from the user-actions div. The class value is
+      compared exactly, including its trailing space, so this is empty
+      whenever the page uses any other class string; the root template
+      falls back to the first h1 in that case.
+    -->
+    <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
+
+    <!--
+      Item template: produces one RSS item for each tweet list entry.
+
+      The permalink is $BaseURL followed by the entry's permalink path.
+      Title and description both start with "user-name: " and get a
+      "(Video) " marker when the media container holds a video element.
+      The description embeds the tweet text and media as CDATA, processed
+      through the other templates of this stylesheet.
+    -->
+    <xsl:template match="//li[@data-item-id and @data-item-type='tweet']">
+        <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
+        <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
+        <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
+        <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
+
+        <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
+        <item>
+            <title>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:if test="$item-has-video">
+                    <xsl:text>(Video) </xsl:text>
+                </xsl:if>
+                <!--
+                     Prepend a space in front of the URLs which are not
+                     preceded by an open parenthesis, for aesthetic reasons.
+                     Also, regex, I know: http://xkcd.com/1171/
+                -->
+                <xsl:variable
+                    name="processed-title"
+                    select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/>
+                <!-- Also strip &nbsp; and &hellip; -->
+                <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/>
+            </title>
+            <link>
+                <xsl:value-of select="$item-permalink"/>
+            </link>
+            <guid>
+                <xsl:value-of select="$item-permalink"/>
+            </guid>
+            <pubDate>
+                <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
+                <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', number($timestamp))"/>
+            </pubDate>
+            <description>
+                <xsl:value-of select="concat($user-name, ': ')"/>
+                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
+                <xsl:if test="$item-has-video">
+                    <xsl:text>(Video) </xsl:text>
+                </xsl:if>
+                <xsl:apply-templates select="$item-content/node()"/>
+                <xsl:apply-templates select="$item-media/node()"/>
+                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
+            </description>
+            <!-- Optional enclosures for expanded links in the text and for images in the media -->
+            <xsl:if test="$generate-enclosure = 1">
+                <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
+                <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
+            </xsl:if>
+        </item>
+    </xsl:template>
+
+    <!--
+      Root template: builds the RSS 2.0 document.
+
+      The channel title is 'Twitter / ' followed by $screen-name, or by the
+      normalized text of the first h1 when $screen-name is empty. The
+      channel link is the page's canonical link, the description is the
+      normalized text of the ProfileHeaderCard div and the image is the
+      href of the profile-picture anchor. Items come from the tweet entries
+      of the stream-items-id list.
+    -->
+    <xsl:template match="/">
+        <xsl:variable name="channel-title">
+            <xsl:choose>
+                <xsl:when test="$screen-name != ''">
+                    <xsl:value-of select="concat('Twitter / ', $screen-name)"/>
+                </xsl:when>
+                <xsl:otherwise>
+                    <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
+                </xsl:otherwise>
+            </xsl:choose>
+        </xsl:variable>
+        <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
+
+        <rss version="2.0">
+            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+            <channel>
+                <generator>Tweeper</generator>
+                <title>
+                    <xsl:value-of select="$channel-title"/>
+                </title>
+                <link>
+                    <xsl:value-of select="$channel-link"/>
+                </link>
+                <description>
+                    <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
+                </description>
+                <image>
+                    <title>
+                        <xsl:value-of select="$channel-title"/>
+                    </title>
+                    <link>
+                        <xsl:value-of select="$channel-link"/>
+                    </link>
+                    <url>
+                        <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
+                    </url>
+                </image>
+                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
+            </channel>
+        </rss>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/tweeper.php b/tweeper.php

index 87efd60..ba8b1d7 100644 (file)
--- a/tweeper.php
+++ b/tweeper.php
@@ -19,346 +19,13 @@
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
  
-require_once 'Symfony/Component/Serializer/autoload.php';
+require_once 'src/Tweeper.php';
  
-use Symfony\Component\Serializer\Serializer;
-use Symfony\Component\Serializer\Encoder\XmlEncoder;
-use Symfony\Component\Serializer\Normalizer\ObjectNormalizer;
+use Tweeper\Tweeper;
  
  date_default_timezone_set('UTC');
  
  /**
- * Scrape supported websites and perform conversion to RSS.
- */
-class Tweeper {
-
-  private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
-
-  /**
-   * Constructor sets up {@link $generate_enclosure}.
-   */
-  public function __construct($generate_enclosure = FALSE) {
-    $this->generate_enclosure = $generate_enclosure;
-  }
-
-  /**
-   * Convert numeric Epoch to the date format expected in a RSS document.
-   */
-  public static function epochToRssDate($timestamp) {
-    if (!is_numeric($timestamp) || is_nan($timestamp)) {
-      $timestamp = 0;
-    }
-
-    return gmdate(DATE_RSS, $timestamp);
-  }
-
-  /**
-   * Convert generic date string to the date format expected in a RSS document.
-   */
-  public static function strToRssDate($date) {
-    $timestamp = strtotime($date);
-    if (FALSE === $timestamp) {
-      $timestamp = 0;
-    }
-
-    return Tweeper::epochToRssDate($timestamp);
-  }
-
-  /**
-   * Convert string to UpperCamelCase.
-   */
-  public static function toUpperCamelCase($str, $delim = ' ') {
-    $str_upper = ucwords($str, $delim);
-    $str_camel_case = str_replace($delim, '', $str_upper);
-    return $str_camel_case;
-  }
-
-  /**
-   * Get the contents from a URL.
-   */
-  private static function getUrlContents($url) {
-    $ch = curl_init($url);
-    curl_setopt_array($ch, array(
-      CURLOPT_HEADER => FALSE,
-      // Follow http redirects to get the real URL.
-      CURLOPT_FOLLOWLOCATION => TRUE,
-      CURLOPT_RETURNTRANSFER => TRUE,
-      CURLOPT_SSL_VERIFYHOST => FALSE,
-      CURLOPT_SSL_VERIFYPEER => FALSE,
-      CURLOPT_HTTPHEADER => array('Accept-language: en'),
-      CURLOPT_USERAGENT => Tweeper::$userAgent,
-    ));
-    $contents = curl_exec($ch);
-    if (FALSE === $contents) {
-      trigger_error(curl_error($ch));
-    }
-    curl_close($ch);
-
-    return $contents;
-  }
-
-  /**
-   * Get the headers from a URL.
-   */
-  private static function getUrlInfo($url) {
-    $ch = curl_init($url);
-    curl_setopt_array($ch, array(
-      CURLOPT_HEADER => TRUE,
-      CURLOPT_NOBODY => TRUE,
-      // Follow http redirects to get the real URL.
-      CURLOPT_FOLLOWLOCATION => TRUE,
-      CURLOPT_RETURNTRANSFER => TRUE,
-      CURLOPT_SSL_VERIFYHOST => FALSE,
-      CURLOPT_SSL_VERIFYPEER => FALSE,
-      CURLOPT_USERAGENT => Tweeper::$userAgent,
-    ));
-    curl_exec($ch);
-    $url_info = curl_getinfo($ch);
-    if (FALSE === $url_info) {
-      trigger_error(curl_error($ch));
-    }
-    curl_close($ch);
-
-    return $url_info;
-  }
-
-  /**
-   * Generate an RSS <enclosure/> element.
-   */
-  public static function generateEnclosure($url) {
-    $supported_content_types = array(
-      "application/octet-stream",
-      "application/ogg",
-      "application/pdf",
-      "audio/aac",
-      "audio/mp4",
-      "audio/mpeg",
-      "audio/ogg",
-      "audio/vorbis",
-      "audio/wav",
-      "audio/webm",
-      "audio/x-midi",
-      "image/gif",
-      "image/jpeg",
-      "image/png",
-      "video/avi",
-      "video/mp4",
-      "video/mpeg",
-      "video/ogg",
-    );
-
-    $url_info = Tweeper::getUrlInfo($url);
-
-    $supported = in_array($url_info['content_type'], $supported_content_types);
-    if (!$supported) {
-      error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']);
-      return '';
-    }
-
-    // The RSS specification says that the enclosure element URL must be http.
-    // See http://sourceforge.net/p/feedvalidator/bugs/72/
-    $http_url = preg_replace("/^https/", "http", $url_info['url']);
-
-    $dom = new DOMDocument();
-    $enc = $dom->createElement('enclosure');
-    $enc->setAttribute('url', $http_url);
-    $enc->setAttribute('length', $url_info['download_content_length']);
-    $enc->setAttribute('type', $url_info['content_type']);
-
-    return $enc;
-  }
-
-  /**
-   * Mimic the message from libxml.c::php_libxml_ctx_error_level()
-   */
-  private static function logXmlError($error) {
-    $output = "";
-
-    switch ($error->level) {
-      case LIBXML_ERR_WARNING:
-        $output .= "Warning $error->code: ";
-        break;
-
-      case LIBXML_ERR_ERROR:
-        $output .= "Error $error->code: ";
-        break;
-
-      case LIBXML_ERR_FATAL:
-        $output .= "Fatal Error $error->code: ";
-        break;
-    }
-
-    $output .= trim($error->message);
-
-    if ($error->file) {
-      $output .= " in $error->file";
-    }
-    else {
-      $output .= " in Entity,";
-    }
-
-    $output .= " line $error->line";
-
-    error_log($output);
-  }
-
-  /**
-   * Convert json to XML.
-   */
-  private static function jsonToXml($json, $root_node_name) {
-    // Apparently the ObjectNormalizer used afterwards is not able to handle
-    // the stdClass object created by json_decode() with the default setting
-    // $assoc = false; so use $assoc = true.
-    $data = json_decode($json, $assoc = TRUE);
-    if (!$data) {
-      return NULL;
-    }
-
-    $encoder = new XmlEncoder();
-    $normalizer = new ObjectNormalizer();
-    $serializer = new Serializer(array($normalizer), array($encoder));
-
-    $serializer_options = array(
-      'xml_encoding' => "UTF-8",
-      'xml_format_output' => TRUE,
-      'xml_root_node_name' => $root_node_name,
-    );
-
-    $xml_data = $serializer->serialize($data, 'xml', $serializer_options);
-    if (!$xml_data) {
-      trigger_error("Cannot serialize data", E_USER_ERROR);
-      return NULL;
-    }
-
-    return $xml_data;
-  }
-
-  /**
-   * Convert the Instagram content to XML.
-   */
-  private function getXmlInstagramCom($html) {
-    // Extract the json data from the html code.
-    $json_match_expr = '/window._sharedData = (.*);/';
-    $ret = preg_match($json_match_expr, $html, $matches);
-    if ($ret !== 1) {
-      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
-      return NULL;
-    }
-
-    return Tweeper::jsonToXml($matches[1], 'instagram');
-  }
-
-  /**
-   * Make the Facebook HTML processable.
-   */
-  private function preprocessHtmlFacebookCom($html) {
-    $html = str_replace('<!--', '', $html);
-    $html = str_replace('-->', '', $html);
-    return $html;
-  }
-
-  /**
-   * Convert the HTML retrieved from the site to XML.
-   */
-  private function htmlToXml($html, $host) {
-    $xmlDoc = new DOMDocument();
-
-    // Handle warnings and errors when loading invalid HTML.
-    $xml_errors_value = libxml_use_internal_errors(TRUE);
-
-    // If there is a host-specific method to get the XML data, use it!
-    $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
-    if (method_exists($this, $get_xml_host_method)) {
-      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
-      $xmlDoc->loadXML($xml_data);
-    }
-    else {
-      $xmlDoc->loadHTML($html);
-    }
-
-    foreach (libxml_get_errors() as $xml_error) {
-      Tweeper::logXmlError($xml_error);
-    }
-    libxml_clear_errors();
-    libxml_use_internal_errors($xml_errors_value);
-
-    return $xmlDoc;
-  }
-
-  /**
-   * Load a stylesheet if the web site is supported.
-   */
-  private function loadStylesheet($host) {
-    $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
-    if (FALSE === file_exists($stylesheet)) {
-      trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
-      return NULL;
-    }
-
-    $stylesheet_contents = Tweeper::getUrlContents($stylesheet);
-
-    $xslDoc = new DOMDocument();
-    $xslDoc->loadXML($stylesheet_contents);
-
-    $xsltProcessor = new XSLTProcessor();
-    $xsltProcessor->registerPHPFunctions();
-    $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure);
-    $xsltProcessor->importStylesheet($xslDoc);
-
-    return $xsltProcessor;
-  }
-
-  /**
-   * Convert the site content to RSS.
-   */
-  public function tweep($src_url) {
-    $url = parse_url($src_url);
-    if (FALSE === $url || empty($url["host"])) {
-      trigger_error("Invalid URL: $src_url", E_USER_ERROR);
-      return NULL;
-    }
-
-    $scheme = $url["scheme"];
-    if (!in_array($scheme, array("http", "https"))) {
-      trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
-      return NULL;
-    }
-
-    // Strip the leading www. to be more forgiving on input URLs.
-    $host = preg_replace('/^www\./', '', $url["host"]);
-
-    $xsltProcessor = $this->loadStylesheet($host);
-    if (NULL === $xsltProcessor) {
-      return NULL;
-    }
-
-    $html = Tweeper::getUrlContents($src_url);
-    if (FALSE === $html) {
-      return NULL;
-    }
-
-    $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
-    if (method_exists($this, $preprocess_html_host_method)) {
-      $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
-    }
-
-    $xmlDoc = $this->htmlToXml($html, $host);
-    if (NULL === $xmlDoc) {
-      return NULL;
-    }
-
-    $output = $xsltProcessor->transformToXML($xmlDoc);
-
-    if (FALSE === $output) {
-      trigger_error('XSL transformation failed.', E_USER_ERROR);
-      return NULL;
-    }
-    return $output;
-  }
-
-}
-
-/**
   * Check if the script is being run from the command line.
   */
  function is_cli() {
author	Antonio Ospite <ao2@ao2.it>
	Fri, 4 Nov 2016 12:13:54 +0000 (13:13 +0100)
committer	Antonio Ospite <ao2@ao2.it>
	Fri, 4 Nov 2016 15:03:13 +0000 (16:03 +0100)
rss_converter_dilbert.com.xsl	[deleted file]	patch \| blob \| history
rss_converter_facebook.com.xsl	[deleted file]	patch \| blob \| history
rss_converter_howtoons.com.xsl	[deleted file]	patch \| blob \| history
rss_converter_identi.ca.xsl	[deleted symlink]	patch \| blob \| history
rss_converter_instagram.com.xsl	[deleted file]	patch \| blob \| history
rss_converter_pump.io.xsl	[deleted file]	patch \| blob \| history
rss_converter_twitter.com.xsl	[deleted file]	patch \| blob \| history
src/Tweeper.php	[new file with mode: 0644]	patch \| blob
src/rss_converter_dilbert.com.xsl	[new file with mode: 0644]	patch \| blob
src/rss_converter_facebook.com.xsl	[new file with mode: 0644]	patch \| blob
src/rss_converter_howtoons.com.xsl	[new file with mode: 0644]	patch \| blob
src/rss_converter_identi.ca.xsl	[new symlink]	patch \| blob
src/rss_converter_instagram.com.xsl	[new file with mode: 0644]	patch \| blob
src/rss_converter_pump.io.xsl	[new file with mode: 0644]	patch \| blob
src/rss_converter_twitter.com.xsl	[new file with mode: 0644]	patch \| blob
tweeper.php		patch \| blob \| history