From b187bb677361d17a468abf749332d081a194b4bd Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Fri, 4 Nov 2016 13:13:54 +0100 Subject: [PATCH 1/1] tweeper: move the main Tweeper class to its own file under src/ This matches more closely the project structure expected by composer packages. --- rss_converter_dilbert.com.xsl | 115 ------------ rss_converter_facebook.com.xsl | 141 -------------- rss_converter_howtoons.com.xsl | 102 ---------- rss_converter_identi.ca.xsl | 1 - rss_converter_instagram.com.xsl | 135 ------------- rss_converter_pump.io.xsl | 99 ---------- rss_converter_twitter.com.xsl | 208 -------------------- src/Tweeper.php | 365 ++++++++++++++++++++++++++++++++++++ src/rss_converter_dilbert.com.xsl | 115 ++++++++++++ src/rss_converter_facebook.com.xsl | 141 ++++++++++++++ src/rss_converter_howtoons.com.xsl | 102 ++++++++++ src/rss_converter_identi.ca.xsl | 1 + src/rss_converter_instagram.com.xsl | 135 +++++++++++++ src/rss_converter_pump.io.xsl | 99 ++++++++++ src/rss_converter_twitter.com.xsl | 208 ++++++++++++++++++++ tweeper.php | 337 +-------------------------------- 16 files changed, 1168 insertions(+), 1136 deletions(-) delete mode 100644 rss_converter_dilbert.com.xsl delete mode 100644 rss_converter_facebook.com.xsl delete mode 100644 rss_converter_howtoons.com.xsl delete mode 120000 rss_converter_identi.ca.xsl delete mode 100644 rss_converter_instagram.com.xsl delete mode 100644 rss_converter_pump.io.xsl delete mode 100644 rss_converter_twitter.com.xsl create mode 100644 src/Tweeper.php create mode 100644 src/rss_converter_dilbert.com.xsl create mode 100644 src/rss_converter_facebook.com.xsl create mode 100644 src/rss_converter_howtoons.com.xsl create mode 120000 src/rss_converter_identi.ca.xsl create mode 100644 src/rss_converter_instagram.com.xsl create mode 100644 src/rss_converter_pump.io.xsl create mode 100644 src/rss_converter_twitter.com.xsl diff --git a/rss_converter_dilbert.com.xsl b/rss_converter_dilbert.com.xsl 
deleted file mode 100644 index b6d1975..0000000 --- a/rss_converter_dilbert.com.xsl +++ /dev/null @@ -1,115 +0,0 @@ -<!-- - Stylesheet to convert Dilbert daily strips to RSS. - - Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. ---> - -<!-- - Since June 18, 2013 dilbert.com strips are not accessible anymore - directly from the RSS feed, this message is displayed instead: - - Dilbert readers - Please visit Dilbert.com to read this feature. Due - to changes with our feeds, we are now making this RSS feed a link to - Dilbert.com. - - How unhandy is that, was it because of a management decision? - Maybe a parody dilbert strip is needed about this issue... 
---> - -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/> - - <xsl:template match="//section[@class='comic-item']"> - <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/> - <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/> - <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/> - <item> - <title> - <xsl:variable name="title-length" select="140"/> - <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> - <xsl:choose> - <xsl:when test="string-length($picture-title) > $title-length"> - <xsl:variable name="truncated-length" select="$title-length - 3"/> - <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/> - <xsl:text>...</xsl:text> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$picture-title"/> - </xsl:otherwise> - </xsl:choose> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:value-of select="php:functionString('Tweeper::strToRssDate', normalize-space(.//date))"/> - </pubDate> - <description> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <img src="{$picture-url}" alt="{$picture-title}"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <xsl:if test="$generate-enclosure = 1"> - <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $picture-url)"/> - </xsl:if> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/> - <xsl:variable name="channel-link" select="$BaseURL"/> - - <rss version="2.0"> - 
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:value-of select="//meta[@property='og:description']/@content"/> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/> - </url> - </image> - <xsl:apply-templates select="//section[@class='comic-item']"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl deleted file mode 100644 index 418b3d2..0000000 --- a/rss_converter_facebook.com.xsl +++ /dev/null @@ -1,141 +0,0 @@ -<!-- - Stylesheet to convert a Facebook public page to RSS. - - Copyright (C) 2015 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
---> - -<!-- - Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages: - https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations - - They suggest to use the Graph API but they fail to mention that it does not - work anymore without authentication, so it cannot be considered an - _equivalent_ solution. - - Luckily we've got Tweeper! ---> - -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="BaseURL"> - <xsl:text>https://facebook.com</xsl:text> - </xsl:variable> - - <!-- - Extract the page id from an element like: - <meta property="al:android:url" content="fb://page/793837197390834"> - - The page id will be used to build the permalink. - --> - <xsl:variable - name="page-id" - select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/> - - <xsl:template match="//div[contains(@class, 'userContentWrapper')]"> - <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/> - <xsl:variable - name="item-permalink" - select="concat($BaseURL, '/permalink.php?id=', $page-id, '&story_fbid=', $story-id)"/> - - <!-- Get only the first child in order to skip the footer of the content --> - <xsl:variable name="item-content" select="div[1]"/> - - <item> - <title> - <xsl:variable name="item-title" select="$item-content//p"/> - <xsl:variable name="title-length" select="140"/> - <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> - <xsl:choose> - <xsl:when test="string-length($item-title) > $title-length"> - <xsl:variable name="truncated-length" select="$title-length - 3"/> - <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> - <xsl:text>...</xsl:text> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$item-title"/> - </xsl:otherwise> - 
</xsl:choose> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/> - <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> - </pubDate> - <description> - - <!-- - Get only the children starting from the one with class="userContent", - this way the content header is skipped - --> - <xsl:variable - name="usercontent-position" - select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/> - - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title" select="//title"/> - <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/> - - <rss version="2.0"> - <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:copy-of select="//div[@data-id='1']/node()"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:value-of select="//img[@class='profilePic img']/@src"/> - </url> - </image> - <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/rss_converter_howtoons.com.xsl b/rss_converter_howtoons.com.xsl deleted file 
mode 100644 index 403b9ac..0000000 --- a/rss_converter_howtoons.com.xsl +++ /dev/null @@ -1,102 +0,0 @@ -<!-- - Stylesheet to convert Howtoons.com to RSS. - - Copyright (C) 2014 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. ---> - -<!-- - The RSS feed link is broken on http://howtoons.com so just work around it. - - Howtoons uses Wordpress, so maybe this style sheet can be used as a base for - scraping other Wordpress sites. 
---> - -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="BaseURL"> - <xsl:text>http://howtoons.com</xsl:text> - </xsl:variable> - - <xsl:template match="//div[contains(@id, 'post-')]"> - <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/> - <item> - <title> - <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/> - <!-- date format is MM.DD.YY --> - <xsl:variable name="month" select="substring($date, 1, 2)"/> - <xsl:variable name="day" select="substring($date, 4, 2)"/> - <xsl:variable name="year" select="substring($date, 7, 2)"/> - <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/> - <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/> - </pubDate> - <description> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title" select="//title"/> - <xsl:variable name="channel-link" select="$BaseURL"/> - - <rss version="2.0"> - <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:text>The world's greatest D.I.Y. comic website! 
Tools of mass construction!</xsl:text> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text> - </url> - </image> - <xsl:apply-templates select="//div[contains(@id, 'post-')]"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/rss_converter_identi.ca.xsl b/rss_converter_identi.ca.xsl deleted file mode 120000 index d8042a1..0000000 --- a/rss_converter_identi.ca.xsl +++ /dev/null @@ -1 +0,0 @@ -rss_converter_pump.io.xsl \ No newline at end of file diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl deleted file mode 100644 index e869d7d..0000000 --- a/rss_converter_instagram.com.xsl +++ /dev/null @@ -1,135 +0,0 @@ -<!-- - Stylesheet to convert Instagram user timelines to RSS. - - Copyright (C) 2015 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
---> -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:param name="generate-enclosure"/> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="BaseURL"> - <xsl:text>https://instagram.com</xsl:text> - </xsl:variable> - - <xsl:variable name="user-name" select="//ProfilePage/user/username"/> - - <!-- Some users do not specify the full name --> - <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/> - <xsl:variable name="screen-name"> - <xsl:choose> - <xsl:when test="$full-name != ''"> - <xsl:value-of select="$full-name"/> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$user-name"/> - </xsl:otherwise> - </xsl:choose> - </xsl:variable> - - <xsl:template match="//ProfilePage/user/media/nodes"> - <xsl:variable name="item-content-image" select="./display_src"/> - <xsl:variable name="item-content-caption" select="./caption"/> - <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/> - <item> - <title> - <xsl:variable name="title-length" select="140"/> - <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/> - <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> - <xsl:choose> - <xsl:when test="string-length($item-content-title) > $title-length"> - <xsl:variable name="truncated-length" select="$title-length - 3"/> - <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/> - <xsl:text>...</xsl:text> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$item-content-title"/> - </xsl:otherwise> - </xsl:choose> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:variable name="timestamp" select="./date"/> - <xsl:value-of 
select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> - </pubDate> - <description> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <p> - <xsl:if test="./is_video/text() = 1"> - (Video) - </xsl:if> - <xsl:value-of select="$item-content-caption"/> - </p><br /> - <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <xsl:if test="$generate-enclosure = 1"> - <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/> - </xsl:if> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/> - <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/> - - <rss version="2.0"> - <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:value-of select="normalize-space(concat($screen-name, '. 
', //user/biography))"/> - <xsl:variable name="external-url" select="//user/external_url"/> - <xsl:if test="$external-url != ''"> - <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a> - </xsl:if> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:value-of select="//ProfilePage/user/profile_pic_url"/> - </url> - </image> - <xsl:apply-templates select="//ProfilePage/user/media/nodes"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl deleted file mode 100644 index 1577dcf..0000000 --- a/rss_converter_pump.io.xsl +++ /dev/null @@ -1,99 +0,0 @@ -<!-- - Stylesheet to convert Pump.io activity streams to RSS. - - Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
---> -<!-- To Evan, please reconsider publishing RSS ouput for _public_ contents --> -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/> - <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/> - - <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/> - - <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li"> - <xsl:variable name="item-content" select=".//div[@class='activity-content']"/> - <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/> - <item> - <title> - <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/> - </pubDate> - <description> - <xsl:value-of select="concat($user-name, ': ')"/> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:copy-of select="$item-content/node()"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <xsl:if test="$generate-enclosure = 1"> - <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/> - <xsl:if test="$image-thumb-link"> - <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/> - <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/> - </xsl:if> - </xsl:if> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title" 
select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/> - <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/> - - <rss version="2.0"> - <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/> - </url> - </image> - <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl deleted file mode 100644 index c154141..0000000 --- a/rss_converter_twitter.com.xsl +++ /dev/null @@ -1,208 +0,0 @@ -<!-- - Stylesheet to convert Twitter user timelines to RSS. - - Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> - - This file is part of tweeper. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. 
If not, see <http://www.gnu.org/licenses/>. ---> -<xsl:stylesheet version="1.0" - xmlns:xsl="http://www.w3.org/1999/XSL/Transform" - xmlns:php="http://php.net/xsl" - xsl:extension-element-prefixes="php" - exclude-result-prefixes="php"> - - <xsl:param name="generate-enclosure"/> - - <xsl:output method="xml" indent="yes"/> - - <xsl:variable name="BaseURL"> - <xsl:text>https://twitter.com</xsl:text> - </xsl:variable> - - <!-- Identity transform --> - <xsl:template match="@*|node()"> - <xsl:copy> - <xsl:apply-templates select="@*|node()"/> - </xsl:copy> - </xsl:template> - - <!-- - Anchors to external links provide the direct URL in the - data-expanded-url attribute, so use this in the href attribute too - instead of the default short URL which uses the t.co redirection - service. - - NOTE: when creating an element, attributes must be processed _before_ - adding the contents (either children or a value): - http://stackoverflow.com/questions/21984867/ - --> - <xsl:template match="a[@data-expanded-url]"> - <!-- Prepend and append a white space for aestethic reasons --> - <xsl:text> </xsl:text> - <a> - <xsl:attribute name="href"> - <xsl:value-of select="@data-expanded-url"/> - </xsl:attribute> - <!-- Also strip and … --> - <xsl:value-of select="translate(., ' …', '')"/> - </a> - <xsl:text> </xsl:text> - </xsl:template> - - <!-- - These are links to pic.twitter.com, use the direct link for those - too instead of the t.co redirections. 
- --> - <xsl:template match="a[@data-pre-embedded='true']"> - <!-- Prepend and append a white space for aestethic reasons --> - <xsl:text> </xsl:text> - <a> - <xsl:attribute name="href"> - <xsl:value-of select="concat('https://', .)"/> - </xsl:attribute> - <xsl:value-of select="concat('https://', .)"/> - </a> - <xsl:text> </xsl:text> - </xsl:template> - - <!-- Present images in a more convenient way --> - <xsl:template match="div[@data-image-url]"> - <a> - <xsl:attribute name="href"> - <xsl:value-of select="concat(@data-image-url, ':orig')"/> - </xsl:attribute> - <img> - <xsl:attribute name="src"> - <xsl:value-of select="@data-image-url"/> - </xsl:attribute> - </img> - </a> - </xsl:template> - - <!-- Don't repeat background in embedded media content --> - <xsl:template match="div[contains(@class, 'PlayableMedia-player')]"> - <xsl:copy> - <xsl:apply-templates select="@*"/> - <xsl:attribute name="style"> - <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/> - </xsl:attribute> - <xsl:apply-templates select="node()"/> - </xsl:copy> - </xsl:template> - - <xsl:template match="a[@data-expanded-url]" mode="enclosure"> - <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/> - </xsl:template> - - <xsl:template match="div[@data-image-url]" mode="enclosure"> - <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/> - </xsl:template> - - <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/> - - <xsl:template match="//li[@data-item-id and @data-item-type='tweet']"> - <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/> - <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/> - <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/> - <xsl:variable name="item-permalink" 
select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/> - - <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/> - <item> - <title> - <xsl:value-of select="concat($user-name, ': ')"/> - <xsl:if test="$item-has-video"> - <xsl:text>(Video) </xsl:text> - </xsl:if> - <!-- - Prepend a space in front of the URLs which are not - preceded by an open parenthesis, for aestethic reasons. - Also, regex, I know: http://xkcd.com/1171/ - --> - <xsl:variable - name="processed-title" - select="php:functionString('preg_replace', '@((?<!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/> - <!-- Also strip and … --> - <xsl:value-of select="normalize-space(translate($processed-title, ' …', ''))"/> - </title> - <link> - <xsl:value-of select="$item-permalink"/> - </link> - <guid> - <xsl:value-of select="$item-permalink"/> - </guid> - <pubDate> - <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/> - <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', number($timestamp))"/> - </pubDate> - <description> - <xsl:value-of select="concat($user-name, ': ')"/> - <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:if test="$item-has-video"> - <xsl:text>(Video) </xsl:text> - </xsl:if> - <xsl:apply-templates select="$item-content/node()"/> - <xsl:apply-templates select="$item-media/node()"/> - <xsl:text disable-output-escaping="yes">]]></xsl:text> - </description> - <xsl:if test="$generate-enclosure = 1"> - <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/> - <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/> - </xsl:if> - </item> - </xsl:template> - - <xsl:template match="/"> - <xsl:variable name="channel-title"> - <xsl:choose> - <xsl:when test="$screen-name != ''"> - <xsl:value-of select="concat('Twitter / ', $screen-name)"/> - </xsl:when> - <xsl:otherwise> - 
<xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/> - </xsl:otherwise> - </xsl:choose> - </xsl:variable> - <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/> - - <rss version="2.0"> - <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> - <channel> - <generator>Tweeper</generator> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <description> - <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/> - </description> - <image> - <title> - <xsl:value-of select="$channel-title"/> - </title> - <link> - <xsl:value-of select="$channel-link"/> - </link> - <url> - <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/> - </url> - </image> - <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/> - </channel> - </rss> - </xsl:template> -</xsl:stylesheet> diff --git a/src/Tweeper.php b/src/Tweeper.php new file mode 100644 index 0000000..73cbe81 --- /dev/null +++ b/src/Tweeper.php @@ -0,0 +1,365 @@ +<?php + +namespace Tweeper; + +/** + * @file + * Tweeper - a Twitter to RSS web scraper. + * + * Copyright (C) 2013-2015 Antonio Ospite <ao2@ao2.it> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +use DOMDocument; +use XSLTProcessor; + +require_once 'Symfony/Component/Serializer/autoload.php'; + +use Symfony\Component\Serializer\Serializer; +use Symfony\Component\Serializer\Encoder\XmlEncoder; +use Symfony\Component\Serializer\Normalizer\ObjectNormalizer; + +date_default_timezone_set('UTC'); + +/** + * Scrape supported websites and perform conversion to RSS. + */ +class Tweeper { + + private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; + + /** + * Constructor sets up {@link $generate_enclosure}. + */ + public function __construct($generate_enclosure = FALSE) { + $this->generate_enclosure = $generate_enclosure; + } + + /** + * Convert numeric Epoch to the date format expected in a RSS document. + */ + public static function epochToRssDate($timestamp) { + if (!is_numeric($timestamp) || is_nan($timestamp)) { + $timestamp = 0; + } + + return gmdate(DATE_RSS, $timestamp); + } + + /** + * Convert generic date string to the date format expected in a RSS document. + */ + public static function strToRssDate($date) { + $timestamp = strtotime($date); + if (FALSE === $timestamp) { + $timestamp = 0; + } + + return Tweeper::epochToRssDate($timestamp); + } + + /** + * Convert string to UpperCamelCase. + */ + public static function toUpperCamelCase($str, $delim = ' ') { + $str_upper = ucwords($str, $delim); + $str_camel_case = str_replace($delim, '', $str_upper); + return $str_camel_case; + } + + /** + * Get the contents from a URL. + */ + private static function getUrlContents($url) { + $ch = curl_init($url); + curl_setopt_array($ch, array( + CURLOPT_HEADER => FALSE, + // Follow http redirects to get the real URL. 
+ CURLOPT_FOLLOWLOCATION => TRUE, + CURLOPT_RETURNTRANSFER => TRUE, + CURLOPT_SSL_VERIFYHOST => FALSE, + CURLOPT_SSL_VERIFYPEER => FALSE, + CURLOPT_HTTPHEADER => array('Accept-language: en'), + CURLOPT_USERAGENT => Tweeper::$userAgent, + )); + $contents = curl_exec($ch); + if (FALSE === $contents) { + trigger_error(curl_error($ch)); + } + curl_close($ch); + + return $contents; + } + + /** + * Get the headers from a URL. + */ + private static function getUrlInfo($url) { + $ch = curl_init($url); + curl_setopt_array($ch, array( + CURLOPT_HEADER => TRUE, + CURLOPT_NOBODY => TRUE, + // Follow http redirects to get the real URL. + CURLOPT_FOLLOWLOCATION => TRUE, + CURLOPT_RETURNTRANSFER => TRUE, + CURLOPT_SSL_VERIFYHOST => FALSE, + CURLOPT_SSL_VERIFYPEER => FALSE, + CURLOPT_USERAGENT => Tweeper::$userAgent, + )); + curl_exec($ch); + $url_info = curl_getinfo($ch); + if (FALSE === $url_info) { + trigger_error(curl_error($ch)); + } + curl_close($ch); + + return $url_info; + } + + /** + * Generate an RSS <enclosure/> element. + */ + public static function generateEnclosure($url) { + $supported_content_types = array( + "application/octet-stream", + "application/ogg", + "application/pdf", + "audio/aac", + "audio/mp4", + "audio/mpeg", + "audio/ogg", + "audio/vorbis", + "audio/wav", + "audio/webm", + "audio/x-midi", + "image/gif", + "image/jpeg", + "image/png", + "video/avi", + "video/mp4", + "video/mpeg", + "video/ogg", + ); + + $url_info = Tweeper::getUrlInfo($url); + + $supported = in_array($url_info['content_type'], $supported_content_types); + if (!$supported) { + error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); + return ''; + } + + // The RSS specification says that the enclosure element URL must be http. 
+ // See http://sourceforge.net/p/feedvalidator/bugs/72/ + $http_url = preg_replace("/^https/", "http", $url_info['url']); + + $dom = new DOMDocument(); + $enc = $dom->createElement('enclosure'); + $enc->setAttribute('url', $http_url); + $enc->setAttribute('length', $url_info['download_content_length']); + $enc->setAttribute('type', $url_info['content_type']); + + return $enc; + } + + /** + * Mimic the message from libxml.c::php_libxml_ctx_error_level() + */ + private static function logXmlError($error) { + $output = ""; + + switch ($error->level) { + case LIBXML_ERR_WARNING: + $output .= "Warning $error->code: "; + break; + + case LIBXML_ERR_ERROR: + $output .= "Error $error->code: "; + break; + + case LIBXML_ERR_FATAL: + $output .= "Fatal Error $error->code: "; + break; + } + + $output .= trim($error->message); + + if ($error->file) { + $output .= " in $error->file"; + } + else { + $output .= " in Entity,"; + } + + $output .= " line $error->line"; + + error_log($output); + } + + /** + * Convert json to XML. + */ + private static function jsonToXml($json, $root_node_name) { + // Apparently the ObjectNormalizer used afterwards is not able to handle + // the stdClass object created by json_decode() with the default setting + // $assoc = false; so use $assoc = true. + $data = json_decode($json, $assoc = TRUE); + if (!$data) { + return NULL; + } + + $encoder = new XmlEncoder(); + $normalizer = new ObjectNormalizer(); + $serializer = new Serializer(array($normalizer), array($encoder)); + + $serializer_options = array( + 'xml_encoding' => "UTF-8", + 'xml_format_output' => TRUE, + 'xml_root_node_name' => $root_node_name, + ); + + $xml_data = $serializer->serialize($data, 'xml', $serializer_options); + if (!$xml_data) { + trigger_error("Cannot serialize data", E_USER_ERROR); + return NULL; + } + + return $xml_data; + } + + /** + * Convert the Instagram content to XML. + */ + private function getXmlInstagramCom($html) { + // Extract the json data from the html code. 
+ $json_match_expr = '/window._sharedData = (.*);/'; + $ret = preg_match($json_match_expr, $html, $matches); + if ($ret !== 1) { + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + return NULL; + } + + return Tweeper::jsonToXml($matches[1], 'instagram'); + } + + /** + * Make the Facebook HTML processable. + */ + private function preprocessHtmlFacebookCom($html) { + $html = str_replace('<!--', '', $html); + $html = str_replace('-->', '', $html); + return $html; + } + + /** + * Convert the HTML retrieved from the site to XML. + */ + private function htmlToXml($html, $host) { + $xmlDoc = new DOMDocument(); + + // Handle warnings and errors when loading invalid HTML. + $xml_errors_value = libxml_use_internal_errors(TRUE); + + // If there is a host-specific method to get the XML data, use it! + $get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.'); + if (method_exists($this, $get_xml_host_method)) { + $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xmlDoc->loadXML($xml_data); + } + else { + $xmlDoc->loadHTML($html); + } + + foreach (libxml_get_errors() as $xml_error) { + Tweeper::logXmlError($xml_error); + } + libxml_clear_errors(); + libxml_use_internal_errors($xml_errors_value); + + return $xmlDoc; + } + + /** + * Load a stylesheet if the web site is supported. + */ + private function loadStylesheet($host) { + $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . 
".xsl"; + if (FALSE === file_exists($stylesheet)) { + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); + return NULL; + } + + $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + + $xslDoc = new DOMDocument(); + $xslDoc->loadXML($stylesheet_contents); + + $xsltProcessor = new XSLTProcessor(); + $xsltProcessor->registerPHPFunctions(); + $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); + $xsltProcessor->importStylesheet($xslDoc); + + return $xsltProcessor; + } + + /** + * Convert the site content to RSS. + */ + public function tweep($src_url) { + $url = parse_url($src_url); + if (FALSE === $url || empty($url["host"])) { + trigger_error("Invalid URL: $src_url", E_USER_ERROR); + return NULL; + } + + $scheme = $url["scheme"]; + if (!in_array($scheme, array("http", "https"))) { + trigger_error("unsupported scheme: $scheme", E_USER_ERROR); + return NULL; + } + + // Strip the leading www. to be more forgiving on input URLs. + $host = preg_replace('/^www\./', '', $url["host"]); + + $xsltProcessor = $this->loadStylesheet($host); + if (NULL === $xsltProcessor) { + return NULL; + } + + $html = Tweeper::getUrlContents($src_url); + if (FALSE === $html) { + return NULL; + } + + $preprocess_html_host_method = 'preprocessHtml' . 
Tweeper::toUpperCamelCase($host, '.'); + if (method_exists($this, $preprocess_html_host_method)) { + $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + } + + $xmlDoc = $this->htmlToXml($html, $host); + if (NULL === $xmlDoc) { + return NULL; + } + + $output = $xsltProcessor->transformToXML($xmlDoc); + + if (FALSE === $output) { + trigger_error('XSL transformation failed.', E_USER_ERROR); + return NULL; + } + return $output; + } + +} diff --git a/src/rss_converter_dilbert.com.xsl b/src/rss_converter_dilbert.com.xsl new file mode 100644 index 0000000..b6d1975 --- /dev/null +++ b/src/rss_converter_dilbert.com.xsl @@ -0,0 +1,115 @@ +<!-- + Stylesheet to convert Dilbert daily strips to RSS. + + Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +--> + +<!-- + Since June 18, 2013 dilbert.com strips are not accessible anymore + directly from the RSS feed, this message is displayed instead: + + Dilbert readers - Please visit Dilbert.com to read this feature. Due + to changes with our feeds, we are now making this RSS feed a link to + Dilbert.com. + + How unhandy is that, was it because of a management decision? + Maybe a parody dilbert strip is needed about this issue... 
+--> + +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/> + + <xsl:template match="//section[@class='comic-item']"> + <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/> + <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/> + <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/> + <item> + <title> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($picture-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$picture-title"/> + </xsl:otherwise> + </xsl:choose> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:value-of select="php:functionString('Tweeper::strToRssDate', normalize-space(.//date))"/> + </pubDate> + <description> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <img src="{$picture-url}" alt="{$picture-title}"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + <xsl:if test="$generate-enclosure = 1"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $picture-url)"/> + </xsl:if> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/> + <xsl:variable name="channel-link" select="$BaseURL"/> + + <rss version="2.0"> + 
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:value-of select="//meta[@property='og:description']/@content"/> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/> + </url> + </image> + <xsl:apply-templates select="//section[@class='comic-item']"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl new file mode 100644 index 0000000..418b3d2 --- /dev/null +++ b/src/rss_converter_facebook.com.xsl @@ -0,0 +1,141 @@ +<!-- + Stylesheet to convert a Facebook public page to RSS. + + Copyright (C) 2015 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+--> + +<!-- + Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages: + https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations + + They suggest to use the Graph API but they fail to mention that it does not + work anymore without authentication, so it cannot be considered an + _equivalent_ solution. + + Luckily we've got Tweeper! +--> + +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL"> + <xsl:text>https://facebook.com</xsl:text> + </xsl:variable> + + <!-- + Extract the page id from an element like: + <meta property="al:android:url" content="fb://page/793837197390834"> + + The page id will be used to build the permalink. + --> + <xsl:variable + name="page-id" + select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/> + + <xsl:template match="//div[contains(@class, 'userContentWrapper')]"> + <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/> + <xsl:variable + name="item-permalink" + select="concat($BaseURL, '/permalink.php?id=', $page-id, '&story_fbid=', $story-id)"/> + + <!-- Get only the first child in order to skip the footer of the content --> + <xsl:variable name="item-content" select="div[1]"/> + + <item> + <title> + <xsl:variable name="item-title" select="$item-content//p"/> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$item-title"/> + </xsl:otherwise> + 
</xsl:choose> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/> + <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> + </pubDate> + <description> + + <!-- + Get only the children starting from the one with class="userContent", + this way the content header is skipped + --> + <xsl:variable + name="usercontent-position" + select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/> + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title" select="//title"/> + <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/> + + <rss version="2.0"> + <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:copy-of select="//div[@data-id='1']/node()"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:value-of select="//img[@class='profilePic img']/@src"/> + </url> + </image> + <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/src/rss_converter_howtoons.com.xsl b/src/rss_converter_howtoons.com.xsl new 
file mode 100644 index 0000000..403b9ac --- /dev/null +++ b/src/rss_converter_howtoons.com.xsl @@ -0,0 +1,102 @@ +<!-- + Stylesheet to convert Howtoons.com to RSS. + + Copyright (C) 2014 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +--> + +<!-- + The RSS feed link is broken on http://howtoons.com so just work around it. + + Howtoons uses Wordpress, so maybe this style sheet can be used as a base for + scraping other Wordpress sites. 
+--> + +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL"> + <xsl:text>http://howtoons.com</xsl:text> + </xsl:variable> + + <xsl:template match="//div[contains(@id, 'post-')]"> + <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/> + <item> + <title> + <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/> + <!-- date format is MM.DD.YY --> + <xsl:variable name="month" select="substring($date, 1, 2)"/> + <xsl:variable name="day" select="substring($date, 4, 2)"/> + <xsl:variable name="year" select="substring($date, 7, 2)"/> + <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/> + <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/> + </pubDate> + <description> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title" select="//title"/> + <xsl:variable name="channel-link" select="$BaseURL"/> + + <rss version="2.0"> + <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:text>The world's greatest D.I.Y. comic website! 
Tools of mass construction!</xsl:text> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text> + </url> + </image> + <xsl:apply-templates select="//div[contains(@id, 'post-')]"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/src/rss_converter_identi.ca.xsl b/src/rss_converter_identi.ca.xsl new file mode 120000 index 0000000..d8042a1 --- /dev/null +++ b/src/rss_converter_identi.ca.xsl @@ -0,0 +1 @@ +rss_converter_pump.io.xsl \ No newline at end of file diff --git a/src/rss_converter_instagram.com.xsl b/src/rss_converter_instagram.com.xsl new file mode 100644 index 0000000..e869d7d --- /dev/null +++ b/src/rss_converter_instagram.com.xsl @@ -0,0 +1,135 @@ +<!-- + Stylesheet to convert Instagram user timelines to RSS. + + Copyright (C) 2015 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+--> +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:param name="generate-enclosure"/> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL"> + <xsl:text>https://instagram.com</xsl:text> + </xsl:variable> + + <xsl:variable name="user-name" select="//ProfilePage/user/username"/> + + <!-- Some users do not specify the full name --> + <xsl:variable name="full-name" select="//ProfilePage/user/full_name"/> + <xsl:variable name="screen-name"> + <xsl:choose> + <xsl:when test="$full-name != ''"> + <xsl:value-of select="$full-name"/> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$user-name"/> + </xsl:otherwise> + </xsl:choose> + </xsl:variable> + + <xsl:template match="//ProfilePage/user/media/nodes"> + <xsl:variable name="item-content-image" select="./display_src"/> + <xsl:variable name="item-content-caption" select="./caption"/> + <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/> + <item> + <title> + <xsl:variable name="title-length" select="140"/> + <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-content-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$item-content-title"/> + </xsl:otherwise> + </xsl:choose> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:variable name="timestamp" select="./date"/> + <xsl:value-of 
select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> + </pubDate> + <description> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <p> + <xsl:if test="./is_video/text() = 1"> + (Video) + </xsl:if> + <xsl:value-of select="$item-content-caption"/> + </p><br /> + <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + <xsl:if test="$generate-enclosure = 1"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/> + </xsl:if> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/> + <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/> + + <rss version="2.0"> + <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:value-of select="normalize-space(concat($screen-name, '. 
', //user/biography))"/> + <xsl:variable name="external-url" select="//user/external_url"/> + <xsl:if test="$external-url != ''"> + <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a> + </xsl:if> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:value-of select="//ProfilePage/user/profile_pic_url"/> + </url> + </image> + <xsl:apply-templates select="//ProfilePage/user/media/nodes"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/src/rss_converter_pump.io.xsl b/src/rss_converter_pump.io.xsl new file mode 100644 index 0000000..1577dcf --- /dev/null +++ b/src/rss_converter_pump.io.xsl @@ -0,0 +1,99 @@ +<!-- + Stylesheet to convert Pump.io activity streams to RSS. + + Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+--> +<!-- To Evan, please reconsider publishing RSS ouput for _public_ contents --> +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="domain-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, '@')"/> + <xsl:variable name="BaseURL" select="concat('https://', $domain-name)"/> + + <xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/> + + <xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li"> + <xsl:variable name="item-content" select=".//div[@class='activity-content']"/> + <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/> + <item> + <title> + <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/> + </pubDate> + <description> + <xsl:value-of select="concat($user-name, ': ')"/> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:copy-of select="$item-content/node()"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + </description> + <xsl:if test="$generate-enclosure = 1"> + <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/> + <xsl:if test="$image-thumb-link"> + <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/> + </xsl:if> + </xsl:if> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title" 
select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/> + <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/> + + <rss version="2.0"> + <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:value-of select="normalize-space(//h1[@class='media-header'])"/> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/> + </url> + </image> + <xsl:apply-templates select="//div[@id='user-content-activities']//ul[@id='major-stream']/li"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl new file mode 100644 index 0000000..c154141 --- /dev/null +++ b/src/rss_converter_twitter.com.xsl @@ -0,0 +1,208 @@ +<!-- + Stylesheet to convert Twitter user timelines to RSS. + + Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see <http://www.gnu.org/licenses/>. +--> +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:param name="generate-enclosure"/> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL"> + <xsl:text>https://twitter.com</xsl:text> + </xsl:variable> + + <!-- Identity transform --> + <xsl:template match="@*|node()"> + <xsl:copy> + <xsl:apply-templates select="@*|node()"/> + </xsl:copy> + </xsl:template> + + <!-- + Anchors to external links provide the direct URL in the + data-expanded-url attribute, so use this in the href attribute too + instead of the default short URL which uses the t.co redirection + service. + + NOTE: when creating an element, attributes must be processed _before_ + adding the contents (either children or a value): + http://stackoverflow.com/questions/21984867/ + --> + <xsl:template match="a[@data-expanded-url]"> + <!-- Prepend and append a white space for aestethic reasons --> + <xsl:text> </xsl:text> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="@data-expanded-url"/> + </xsl:attribute> + <!-- Also strip and … --> + <xsl:value-of select="translate(., ' …', '')"/> + </a> + <xsl:text> </xsl:text> + </xsl:template> + + <!-- + These are links to pic.twitter.com, use the direct link for those + too instead of the t.co redirections. 
+ --> + <xsl:template match="a[@data-pre-embedded='true']"> + <!-- Prepend and append a white space for aestethic reasons --> + <xsl:text> </xsl:text> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="concat('https://', .)"/> + </xsl:attribute> + <xsl:value-of select="concat('https://', .)"/> + </a> + <xsl:text> </xsl:text> + </xsl:template> + + <!-- Present images in a more convenient way --> + <xsl:template match="div[@data-image-url]"> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="concat(@data-image-url, ':orig')"/> + </xsl:attribute> + <img> + <xsl:attribute name="src"> + <xsl:value-of select="@data-image-url"/> + </xsl:attribute> + </img> + </a> + </xsl:template> + + <!-- Don't repeat background in embedded media content --> + <xsl:template match="div[contains(@class, 'PlayableMedia-player')]"> + <xsl:copy> + <xsl:apply-templates select="@*"/> + <xsl:attribute name="style"> + <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/> + </xsl:attribute> + <xsl:apply-templates select="node()"/> + </xsl:copy> + </xsl:template> + + <xsl:template match="a[@data-expanded-url]" mode="enclosure"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/> + </xsl:template> + + <xsl:template match="div[@data-image-url]" mode="enclosure"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/> + </xsl:template> + + <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/> + + <xsl:template match="//li[@data-item-id and @data-item-type='tweet']"> + <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/> + <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/> + <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/> + <xsl:variable name="item-permalink" 
select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/> + + <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/> + <item> + <title> + <xsl:value-of select="concat($user-name, ': ')"/> + <xsl:if test="$item-has-video"> + <xsl:text>(Video) </xsl:text> + </xsl:if> + <!-- + Prepend a space in front of the URLs which are not + preceded by an open parenthesis, for aesthetic reasons. + Also, regex, I know: http://xkcd.com/1171/ + --> + <xsl:variable + name="processed-title" + select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/> + <!-- Also strip &nbsp; and &hellip; (character references keep the match independent of file encoding) --> + <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/> + </title> + <link> + <xsl:value-of select="$item-permalink"/> + </link> + <guid> + <xsl:value-of select="$item-permalink"/> + </guid> + <pubDate> + <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/> + <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', number($timestamp))"/> + </pubDate> + <description> + <xsl:value-of select="concat($user-name, ': ')"/> + <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text> + <xsl:if test="$item-has-video"> + <xsl:text>(Video) </xsl:text> + </xsl:if> + <xsl:apply-templates select="$item-content/node()"/> + <xsl:apply-templates select="$item-media/node()"/> + <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text> + </description> + <xsl:if test="$generate-enclosure = 1"> + <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/> + <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/> + </xsl:if> + </item> + </xsl:template> + + <xsl:template match="/"> + <xsl:variable name="channel-title"> + <xsl:choose> + <xsl:when test="$screen-name != ''"> + <xsl:value-of select="concat('Twitter / ', $screen-name)"/> + </xsl:when> + <xsl:otherwise> + 
<xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/> + </xsl:otherwise> + </xsl:choose> + </xsl:variable> + <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/> + + <rss version="2.0"> + <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> + <channel> + <generator>Tweeper</generator> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <description> + <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/> + </description> + <image> + <title> + <xsl:value-of select="$channel-title"/> + </title> + <link> + <xsl:value-of select="$channel-link"/> + </link> + <url> + <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/> + </url> + </image> + <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/> + </channel> + </rss> + </xsl:template> +</xsl:stylesheet> diff --git a/tweeper.php b/tweeper.php index 87efd60..ba8b1d7 100644 --- a/tweeper.php +++ b/tweeper.php @@ -19,346 +19,13 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -require_once 'Symfony/Component/Serializer/autoload.php'; +require_once 'src/Tweeper.php'; -use Symfony\Component\Serializer\Serializer; -use Symfony\Component\Serializer\Encoder\XmlEncoder; -use Symfony\Component\Serializer\Normalizer\ObjectNormalizer; +use Tweeper\Tweeper; date_default_timezone_set('UTC'); /** - * Scrape supported websites and perform conversion to RSS. - */ -class Tweeper { - - private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; - - /** - * Constructor sets up {@link $generate_enclosure}. - */ - public function __construct($generate_enclosure = FALSE) { - $this->generate_enclosure = $generate_enclosure; - } - - /** - * Convert numeric Epoch to the date format expected in a RSS document. 
- */ - public static function epochToRssDate($timestamp) { - if (!is_numeric($timestamp) || is_nan($timestamp)) { - $timestamp = 0; - } - - return gmdate(DATE_RSS, $timestamp); - } - - /** - * Convert generic date string to the date format expected in a RSS document. - */ - public static function strToRssDate($date) { - $timestamp = strtotime($date); - if (FALSE === $timestamp) { - $timestamp = 0; - } - - return Tweeper::epochToRssDate($timestamp); - } - - /** - * Convert string to UpperCamelCase. - */ - public static function toUpperCamelCase($str, $delim = ' ') { - $str_upper = ucwords($str, $delim); - $str_camel_case = str_replace($delim, '', $str_upper); - return $str_camel_case; - } - - /** - * Get the contents from a URL. - */ - private static function getUrlContents($url) { - $ch = curl_init($url); - curl_setopt_array($ch, array( - CURLOPT_HEADER => FALSE, - // Follow http redirects to get the real URL. - CURLOPT_FOLLOWLOCATION => TRUE, - CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_HTTPHEADER => array('Accept-language: en'), - CURLOPT_USERAGENT => Tweeper::$userAgent, - )); - $contents = curl_exec($ch); - if (FALSE === $contents) { - trigger_error(curl_error($ch)); - } - curl_close($ch); - - return $contents; - } - - /** - * Get the headers from a URL. - */ - private static function getUrlInfo($url) { - $ch = curl_init($url); - curl_setopt_array($ch, array( - CURLOPT_HEADER => TRUE, - CURLOPT_NOBODY => TRUE, - // Follow http redirects to get the real URL. - CURLOPT_FOLLOWLOCATION => TRUE, - CURLOPT_RETURNTRANSFER => TRUE, - CURLOPT_SSL_VERIFYHOST => FALSE, - CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_USERAGENT => Tweeper::$userAgent, - )); - curl_exec($ch); - $url_info = curl_getinfo($ch); - if (FALSE === $url_info) { - trigger_error(curl_error($ch)); - } - curl_close($ch); - - return $url_info; - } - - /** - * Generate an RSS <enclosure/> element. 
- */ - public static function generateEnclosure($url) { - $supported_content_types = array( - "application/octet-stream", - "application/ogg", - "application/pdf", - "audio/aac", - "audio/mp4", - "audio/mpeg", - "audio/ogg", - "audio/vorbis", - "audio/wav", - "audio/webm", - "audio/x-midi", - "image/gif", - "image/jpeg", - "image/png", - "video/avi", - "video/mp4", - "video/mpeg", - "video/ogg", - ); - - $url_info = Tweeper::getUrlInfo($url); - - $supported = in_array($url_info['content_type'], $supported_content_types); - if (!$supported) { - error_log("Unsupported enclosure content type \"" . $url_info['content_type'] . "\" for URL: " . $url_info['url']); - return ''; - } - - // The RSS specification says that the enclosure element URL must be http. - // See http://sourceforge.net/p/feedvalidator/bugs/72/ - $http_url = preg_replace("/^https/", "http", $url_info['url']); - - $dom = new DOMDocument(); - $enc = $dom->createElement('enclosure'); - $enc->setAttribute('url', $http_url); - $enc->setAttribute('length', $url_info['download_content_length']); - $enc->setAttribute('type', $url_info['content_type']); - - return $enc; - } - - /** - * Mimic the message from libxml.c::php_libxml_ctx_error_level() - */ - private static function logXmlError($error) { - $output = ""; - - switch ($error->level) { - case LIBXML_ERR_WARNING: - $output .= "Warning $error->code: "; - break; - - case LIBXML_ERR_ERROR: - $output .= "Error $error->code: "; - break; - - case LIBXML_ERR_FATAL: - $output .= "Fatal Error $error->code: "; - break; - } - - $output .= trim($error->message); - - if ($error->file) { - $output .= " in $error->file"; - } - else { - $output .= " in Entity,"; - } - - $output .= " line $error->line"; - - error_log($output); - } - - /** - * Convert json to XML. 
- */ - private static function jsonToXml($json, $root_node_name) { - // Apparently the ObjectNormalizer used afterwards is not able to handle - // the stdClass object created by json_decode() with the default setting - // $assoc = false; so use $assoc = true. - $data = json_decode($json, $assoc = TRUE); - if (!$data) { - return NULL; - } - - $encoder = new XmlEncoder(); - $normalizer = new ObjectNormalizer(); - $serializer = new Serializer(array($normalizer), array($encoder)); - - $serializer_options = array( - 'xml_encoding' => "UTF-8", - 'xml_format_output' => TRUE, - 'xml_root_node_name' => $root_node_name, - ); - - $xml_data = $serializer->serialize($data, 'xml', $serializer_options); - if (!$xml_data) { - trigger_error("Cannot serialize data", E_USER_ERROR); - return NULL; - } - - return $xml_data; - } - - /** - * Convert the Instagram content to XML. - */ - private function getXmlInstagramCom($html) { - // Extract the json data from the html code. - $json_match_expr = '/window._sharedData = (.*);/'; - $ret = preg_match($json_match_expr, $html, $matches); - if ($ret !== 1) { - trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); - return NULL; - } - - return Tweeper::jsonToXml($matches[1], 'instagram'); - } - - /** - * Make the Facebook HTML processable. - */ - private function preprocessHtmlFacebookCom($html) { - $html = str_replace('<!--', '', $html); - $html = str_replace('-->', '', $html); - return $html; - } - - /** - * Convert the HTML retrieved from the site to XML. - */ - private function htmlToXml($html, $host) { - $xmlDoc = new DOMDocument(); - - // Handle warnings and errors when loading invalid HTML. - $xml_errors_value = libxml_use_internal_errors(TRUE); - - // If there is a host-specific method to get the XML data, use it! - $get_xml_host_method = 'getXml' . 
Tweeper::toUpperCamelCase($host, '.'); - if (method_exists($this, $get_xml_host_method)) { - $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); - $xmlDoc->loadXML($xml_data); - } - else { - $xmlDoc->loadHTML($html); - } - - foreach (libxml_get_errors() as $xml_error) { - Tweeper::logXmlError($xml_error); - } - libxml_clear_errors(); - libxml_use_internal_errors($xml_errors_value); - - return $xmlDoc; - } - - /** - * Load a stylesheet if the web site is supported. - */ - private function loadStylesheet($host) { - $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; - if (FALSE === file_exists($stylesheet)) { - trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); - return NULL; - } - - $stylesheet_contents = Tweeper::getUrlContents($stylesheet); - - $xslDoc = new DOMDocument(); - $xslDoc->loadXML($stylesheet_contents); - - $xsltProcessor = new XSLTProcessor(); - $xsltProcessor->registerPHPFunctions(); - $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); - $xsltProcessor->importStylesheet($xslDoc); - - return $xsltProcessor; - } - - /** - * Convert the site content to RSS. - */ - public function tweep($src_url) { - $url = parse_url($src_url); - if (FALSE === $url || empty($url["host"])) { - trigger_error("Invalid URL: $src_url", E_USER_ERROR); - return NULL; - } - - $scheme = $url["scheme"]; - if (!in_array($scheme, array("http", "https"))) { - trigger_error("unsupported scheme: $scheme", E_USER_ERROR); - return NULL; - } - - // Strip the leading www. to be more forgiving on input URLs. - $host = preg_replace('/^www\./', '', $url["host"]); - - $xsltProcessor = $this->loadStylesheet($host); - if (NULL === $xsltProcessor) { - return NULL; - } - - $html = Tweeper::getUrlContents($src_url); - if (FALSE === $html) { - return NULL; - } - - $preprocess_html_host_method = 'preprocessHtml' . 
Tweeper::toUpperCamelCase($host, '.'); - if (method_exists($this, $preprocess_html_host_method)) { - $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); - } - - $xmlDoc = $this->htmlToXml($html, $host); - if (NULL === $xmlDoc) { - return NULL; - } - - $output = $xsltProcessor->transformToXML($xmlDoc); - - if (FALSE === $output) { - trigger_error('XSL transformation failed.', E_USER_ERROR); - return NULL; - } - return $output; - } - -} - -/** * Check if the script is being run from the command line. */ function is_cli() { -- 2.1.4