The recommended way to install tweeper globally is to install all its files
under /usr/share/php/tweeper and then make a symlink to the wrapper script
"tweeper" under /usr/bin
+
+Tweeper depends on php-xml-serializer which is used to convert json to xml for
+some sites that provide the timeline data in json rather than in usable html.
+News for v0.4:
+==============
+
+ * Make the generated RSS validate with feedvalidator.org
+ * Fix support for Dilbert.com
+ * Add support for Instagram.com
+ * Add support for public pages on Facebook.com
+ * Make tweeper work with the PHP built-in web server
+ * Misc fixes to code and documentation
+
News for v0.3:
==============
-Tweeper is a web scraper which extracts the most recent public tweets of
-a given user from their home page on Twitter.com and formats them in RSS, so
-the information can be conveniently accessed and collected by a feed reader.
+Tweeper is a web scraper which can be used to conveniently follow the public
+activity of social network users without the need to log in or even be
+subscribed to the social network; tweeper converts the public information to
+RSS so that it can be accessed and collected by a feed reader.
-Since Jun 11th 2013 Twitter.com retired their API v1.0, so it's not possible
-to access a user timeline via RSS anymore, and it's also become mandatory to
-authenticate via OAuth to access this _public_ information in JSON format:
+Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has no longer
+been possible to access a user timeline via RSS, and it has also become
+mandatory to authenticate via OAuth to access this _public_ information in
+JSON format:
https://dev.twitter.com/discussions/16289
https://dev.twitter.com/discussions/11564
[1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173
-Tweeper can be used via web or as a command line program, for example as
-a filter in your feed reader, by passing the URL of the user's public timeline
-as the first argument.
-
Tweeper can easily scrape sites other than Twitter: it is just a matter of
writing an XSL stylesheet for the transformation; an example for the pump.io
activity stream is provided in rss_converter_pump.io.xsl
+The currently supported sites are:
+
+ * Twitter.com
+ * Pump.io based websites, like Identi.ca
+ * Dilbert.com
+ * Howtoons.com
+ * Instagram.com
+ * Facebook.com (public pages)
+
+Tweeper can be used via web or as a command line program, for example as
+a filter in your feed reader, by passing the URL of the user's public timeline
+as the first argument.
+
Example of use on the command line:
$ php tweeper.php http://twitter.com/NSACareers
- evaluate the use of the <ttl/> RSS element.
- use the <enclosure/> element for pump.io media objects
- use the <enclosure/> element for images on dilbert.com
-- consider using http://www.dilbert.com/fast for dilbert.com
-- debug some duplicated entries in the tweeter feeds in liferea
+- show images (or even cards) directly in RSS items for twitter.com
- check the encoding of the tweets when UTF is used,
maybe solvable with mb_convert_encoding()?
See http://php.net/manual/en/domdocument.loadhtml.php
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:php="http://php.net/xsl"
- xsl:extension-element-prefixes="php">
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
<xsl:output method="xml" indent="yes"/>
- <xsl:variable name="BaseURL" select="concat('http://', //meta[@property='og:site_name']/@content)"/>
+ <xsl:variable name="BaseURL" select="//meta[@property='og:url']/@content"/>
- <xsl:template match="//a[@id='strip_zoom']">
- <xsl:variable name="picture-id" select="substring-after(./@href, '#')"/>
- <xsl:variable name="picture-element" select="//div[@id=$picture-id]/img"/>
- <xsl:variable name="picture-print-url" select="php:functionString('str_replace', 'zoom', 'print', $picture-element/@src)"/>
+ <xsl:template match="//section[@class='comic-item']">
+ <xsl:variable name="item-permalink" select=".//a[@class='img-comic-link']/@href"/>
+ <xsl:variable name="picture-url" select=".//img[@class='img-responsive img-comic']/@src"/>
+ <xsl:variable name="picture-title" select=".//img[@class='img-responsive img-comic']/@alt"/>
<item>
<title>
- <xsl:value-of select="$picture-element/@title"/>
+ <xsl:value-of select="$picture-title"/>
</title>
<link>
- <xsl:value-of select="concat($BaseURL, $picture-element/@src)"/>
+ <xsl:value-of select="$item-permalink"/>
</link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
<pubDate>
- <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', substring-after($picture-id, 'strip_enlarged_'))"/>
+ <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', normalize-space(.//date))"/>
</pubDate>
<description>
<xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
- <img src="{$picture-print-url}" />
+ <img src="{$picture-url}" />
<xsl:text disable-output-escaping="yes">]]></xsl:text>
</description>
</item>
</xsl:template>
<xsl:template match="/">
+ <xsl:variable name="channel-title" select="//meta[@property='og:title']/@content"/>
+ <xsl:variable name="channel-link" select="$BaseURL"/>
<rss version="2.0">
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<channel>
<generator>Tweeper</generator>
<title>
- <xsl:value-of select="//meta[@property='og:title']/@content"/>
+ <xsl:value-of select="$channel-title"/>
</title>
<link>
- <xsl:value-of select="$BaseURL"/>
+ <xsl:value-of select="$channel-link"/>
</link>
<description>
<xsl:value-of select="//meta[@property='og:description']/@content"/>
</description>
<image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
<url>
- <xsl:value-of select="//meta[@property='og:image']/@content"/>
+ <xsl:value-of select="concat($BaseURL, //img[@alt='Dilbert logo']/@src)"/>
</url>
</image>
- <xsl:apply-templates select="//a[@id='strip_zoom']"/>
+ <xsl:apply-templates select="//section[@class='comic-item']"/>
</channel>
</rss>
</xsl:template>
--- /dev/null
+<!--
+ Stylesheet to convert a Facebook public page to RSS.
+
+ Copyright (C) 2015 Antonio Ospite <ao2@ao2.it>
+
+ This file is part of tweeper.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+-->
+
+<!--
+ Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages:
+ https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations
+
+ They suggest using the Graph API but they fail to mention that it does not
+ work anymore without authentication, so it cannot be considered an
+ _equivalent_ solution.
+
+ Luckily we've got Tweeper!
+-->
+
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:php="http://php.net/xsl"
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
+
+ <xsl:output method="xml" indent="yes"/>
+
+ <xsl:variable name="BaseURL">
+ <xsl:text>https://facebook.com</xsl:text>
+ </xsl:variable>
+
<!--
  Emit one RSS <item> for a div whose class contains 'userContentWrapper'.

  title: string value of $item-content/p (in XSLT 1.0 the first node of the
  set), cut to 137 characters plus "..." when longer than 140.
  link and guid: $BaseURL joined with the href of a descendant <a> whose
  target attribute is the empty string.
  pubDate: the data-utime attribute of a descendant abbr[@data-shorten],
  converted by Tweeper::epoch_to_gmdate.
  description: copies of the child nodes of the content div and of any
  descendant div with class 'mtm', wrapped in a CDATA section.
-->
+ <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+ <xsl:variable name="item-content" select=".//div[contains(@class, 'userContent')]"/>
+ <xsl:variable name="item-permalink" select="concat($BaseURL, .//a[@target='']/@href)"/>
+ <item>
+ <title>
+ <xsl:variable name="item-title" select="$item-content/p"/>
+ <xsl:variable name="title-length" select="140"/>
+ <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+ <xsl:choose>
+ <xsl:when test="string-length($item-title) > $title-length">
+ <xsl:variable name="truncated-length" select="$title-length - 3"/>
+ <xsl:value-of select="substring($item-title, 1, $truncated-length)"/>
+ <xsl:text>...</xsl:text>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$item-title"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </title>
+ <link>
+ <xsl:value-of select="$item-permalink"/>
+ </link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
+ <pubDate>
+ <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/>
+ <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
+ </pubDate>
+ <description>
+ <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
+ <xsl:copy-of select="$item-content/node()"/>
+ <xsl:copy-of select=".//div[@class='mtm']/node()"/>
+ <xsl:text disable-output-escaping="yes">]]></xsl:text>
+ </description>
+ </item>
+ </xsl:template>
+
<!--
  Root template: builds the <rss> document and its single <channel>.

  The channel title is the page <title>; the channel link is the href of the
  first <a> found inside the first 'userContentWrapper' div. The same title
  and link are reused for the channel <image>, whose url is the src of the
  img with class 'profilePic img'. The channel description is a CDATA-wrapped
  copy of the children of the div with data-id='1'.
  One <item> is then produced for each 'userContentWrapper' div.

  NOTE(review): $channel-link is emitted as found in the page, without
  $BaseURL being prepended; confirm the href is absolute.
-->
+ <xsl:template match="/">
+ <xsl:variable name="channel-title" select="//title"/>
+ <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+
+ <rss version="2.0">
+ <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+ <channel>
+ <generator>Tweeper</generator>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
+ <description>
+ <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
+ <xsl:copy-of select="//div[@data-id='1']/node()"/>
+ <xsl:text disable-output-escaping="yes">]]></xsl:text>
+ </description>
+ <image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
+ <url>
+ <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+ </url>
+ </image>
+ <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+ </channel>
+ </rss>
+ </xsl:template>
+</xsl:stylesheet>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:php="http://php.net/xsl"
- xsl:extension-element-prefixes="php">
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
<xsl:output method="xml" indent="yes"/>
</xsl:variable>
<xsl:template match="//div[contains(@id, 'post-')]">
+ <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
<item>
<title>
<xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
</title>
<link>
- <xsl:value-of select=".//div[@class='post-headline']//a/@href"/>
+ <xsl:value-of select="$item-permalink"/>
</link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
<pubDate>
<xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
<!-- date format is MM.DD.YY -->
</xsl:template>
<xsl:template match="/">
+ <xsl:variable name="channel-title" select="//title"/>
+ <xsl:variable name="channel-link" select="$BaseURL"/>
<rss version="2.0">
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<channel>
<generator>Tweeper</generator>
<title>
- <xsl:value-of select="//title"/>
+ <xsl:value-of select="$channel-title"/>
</title>
<link>
- <xsl:value-of select="$BaseURL"/>
+ <xsl:value-of select="$channel-link"/>
</link>
<description>
<xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
</description>
<image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
<url>
<xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
</url>
--- /dev/null
+<!--
+ Stylesheet to convert Instagram user timelines to RSS.
+
+ Copyright (C) 2015 Antonio Ospite <ao2@ao2.it>
+
+ This file is part of tweeper.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:php="http://php.net/xsl"
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
+
+ <xsl:param name="generateEnclosure"/>
+
+ <xsl:output method="xml" indent="yes"/>
+
+ <xsl:variable name="BaseURL">
+ <xsl:text>https://instagram.com</xsl:text>
+ </xsl:variable>
+
<!--
  For a display_src element, output whatever Tweeper::generate_enclosure
  returns for the element text. Output escaping is disabled, so the returned
  string is written to the result as-is.
-->
+ <xsl:template match="display_src">
+ <xsl:value-of disable-output-escaping="yes" select="php:function('Tweeper::generate_enclosure', string(text()))"/>
+ </xsl:template>
+
+ <xsl:variable name="user-name" select="//ProfilePage/XML_Serializer_Tag/user/username"/>
+
+ <!-- Some users do not specify the full name -->
+ <xsl:variable name="full-name" select="//ProfilePage/XML_Serializer_Tag/user/full_name"/>
+ <xsl:variable name="screen-name">
+ <xsl:choose>
+ <xsl:when test="$full-name != ''">
+ <xsl:value-of select="$full-name"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$user-name"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
<!--
  Emit one RSS <item> for each media node of the serialized profile data.

  title: "<user-name>: <caption>" with whitespace normalized, cut to 137
  characters plus "..." when longer than 140.
  link and guid: $BaseURL + '/p/' + the node's code + '/'.
  pubDate: the node's date value converted by Tweeper::epoch_to_gmdate.
  description: a CDATA-wrapped paragraph holding the caption (preceded by
  "(Video)" when is_video equals 1) followed by the display_src image linked
  to the permalink.
  When the generateEnclosure parameter equals 1, templates are also applied
  to the node's display_src child.
-->
+ <xsl:template match="//media/nodes/XML_Serializer_Tag">
+ <xsl:variable name="item-content-image" select="./display_src"/>
+ <xsl:variable name="item-content-caption" select="./caption"/>
+ <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/>
+ <item>
+ <title>
+ <xsl:variable name="title-length" select="140"/>
+ <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/>
+ <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 -->
+ <xsl:choose>
+ <xsl:when test="string-length($item-content-title) > $title-length">
+ <xsl:variable name="truncated-length" select="$title-length - 3"/>
+ <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/>
+ <xsl:text>...</xsl:text>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$item-content-title"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </title>
+ <link>
+ <xsl:value-of select="$item-permalink"/>
+ </link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
+ <pubDate>
+ <xsl:variable name="timestamp" select="./date"/>
+ <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
+ </pubDate>
+ <description>
+ <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
+ <p>
+ <xsl:if test="./is_video/text() = 1">
+ (Video)
+ </xsl:if>
+ <xsl:value-of select="$item-content-caption"/>
+ </p><br />
+ <a href="{$item-permalink}"><img src="{$item-content-image}" /></a>
+ <xsl:text disable-output-escaping="yes">]]></xsl:text>
+ </description>
+ <xsl:if test="$generateEnclosure = 1">
+ <xsl:apply-templates select="./display_src"/>
+ </xsl:if>
+ </item>
+ </xsl:template>
+
<!--
  Root template: builds the <rss> document and its single <channel>.

  The channel title is 'Instagram / ' followed by $screen-name; the channel
  link is $BaseURL joined with the value of the __path element. Both are
  reused for the channel <image>, whose url is the user's profile_pic_url.
  The description is a CDATA-wrapped "<screen-name>. <biography>" string
  (whitespace normalized), followed by a link to the user's external_url
  when that value is not empty.
  One <item> is then produced for each media node.
-->
+ <xsl:template match="/">
+ <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/>
+ <xsl:variable name="channel-link" select="concat($BaseURL, //__path)"/>
+
+ <rss version="2.0">
+ <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
+ <channel>
+ <generator>Tweeper</generator>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
+ <description>
+ <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
+ <xsl:value-of select="normalize-space(concat($screen-name, '. ', //user/biography))"/>
+ <xsl:variable name="external-url" select="//user/external_url"/>
+ <xsl:if test="$external-url != ''">
+ <xsl:text> </xsl:text><a href="{$external-url}"><xsl:value-of select="$external-url"/></a>
+ </xsl:if>
+ <xsl:text disable-output-escaping="yes">]]></xsl:text>
+ </description>
+ <image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
+ <url>
+ <xsl:value-of select="//user/profile_pic_url"/>
+ </url>
+ </image>
+ <xsl:apply-templates select="//media/nodes/XML_Serializer_Tag"/>
+ </channel>
+ </rss>
+ </xsl:template>
+</xsl:stylesheet>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:php="http://php.net/xsl"
- xsl:extension-element-prefixes="php">
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
<xsl:output method="xml" indent="yes"/>
<xsl:variable name="user-name" select="substring-after(//div[@id='profile-block']/@data-profile-id, ':')"/>
<xsl:template match="//div[@id='user-content-activities']//ul[@id='major-stream']/li">
- <xsl:variable name="activity-text" select=".//div[@class='activity-content']"/>
+ <xsl:variable name="item-content" select=".//div[@class='activity-content']"/>
+ <xsl:variable name="item-permalink" select=".//p[@class='muted']/small/a/@href"/>
<item>
<title>
- <xsl:value-of select="concat($user-name, ': ', normalize-space($activity-text))"/>
+ <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/>
</title>
<link>
- <xsl:value-of select=".//p[@class='muted']/small/a/@href"/>
+ <xsl:value-of select="$item-permalink"/>
</link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
<pubDate>
<xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', .//abbr[@class='easydate']/@title)"/>
</pubDate>
<description>
<xsl:value-of select="concat($user-name, ': ')"/>
<xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
- <xsl:copy-of select="$activity-text/node()"/>
+ <xsl:copy-of select="$item-content/node()"/>
<xsl:text disable-output-escaping="yes">]]></xsl:text>
</description>
</item>
</xsl:template>
<xsl:template match="/">
+ <xsl:variable name="channel-title" select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+ <xsl:variable name="channel-link" select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
<rss version="2.0">
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<channel>
<generator>Tweeper</generator>
<title>
- <xsl:value-of select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/>
+ <xsl:value-of select="$channel-title"/>
</title>
<link>
- <xsl:value-of select="concat('https://', substring-after($user-name, '@'), '/', substring-before($user-name, '@'))"/>
+ <xsl:value-of select="$channel-link"/>
</link>
<description>
<xsl:value-of select="normalize-space(//h1[@class='media-header'])"/>
</description>
<image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
<url>
<xsl:value-of select="//div[@id='profile-block']/span/img[@class='img-rounded media-object']/@src"/>
</url>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:php="http://php.net/xsl"
- xsl:extension-element-prefixes="php">
+ xsl:extension-element-prefixes="php"
+ exclude-result-prefixes="php">
<xsl:param name="generateEnclosure"/>
<xsl:output method="xml" indent="yes"/>
- <xsl:variable name="twitterBaseURL">
+ <xsl:variable name="BaseURL">
<xsl:text>https://twitter.com</xsl:text>
</xsl:variable>
<xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
- <xsl:template match="//*[@data-item-type='tweet']">
+ <xsl:template match="//li[@data-item-type='tweet']">
<xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/>
- <xsl:variable name="tweet-text" select=".//p[contains(@class, 'js-tweet-text')]"/>
+ <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
+ <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
<item>
<title>
- <xsl:value-of select="concat($user-name, ': ', $tweet-text)"/>
+ <xsl:value-of select="concat($user-name, ': ', $item-content)"/>
</title>
<link>
- <xsl:value-of select="concat($twitterBaseURL, .//a[contains(@class, 'js-permalink')]/@href)"/>
+ <xsl:value-of select="$item-permalink"/>
</link>
+ <guid>
+ <xsl:value-of select="$item-permalink"/>
+ </guid>
<pubDate>
<xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
<xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/>
<description>
<xsl:value-of select="concat($user-name, ': ')"/>
<xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
- <xsl:copy-of select="$tweet-text/node()"/>
+ <xsl:copy-of select="$item-content/node()"/>
<xsl:text disable-output-escaping="yes">]]></xsl:text>
</description>
<xsl:if test="$generateEnclosure = 1">
- <xsl:apply-templates select="$tweet-text//a[@data-expanded-url]"/>
+ <xsl:apply-templates select="$item-content//a[@data-expanded-url]"/>
</xsl:if>
</item>
</xsl:template>
<xsl:template match="/">
+ <xsl:variable name="channel-title" select="concat('Twitter / ', $screen-name)"/>
+ <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
<rss version="2.0">
- <xsl:attribute name="xml:base"><xsl:value-of select="$twitterBaseURL" /></xsl:attribute>
+ <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<channel>
<generator>Tweeper</generator>
<title>
- <xsl:text>Twitter / </xsl:text><xsl:value-of select="$screen-name"/>
+ <xsl:value-of select="$channel-title"/>
</title>
<link>
- <xsl:value-of select="//link[@rel='canonical']/@href"/>
+ <xsl:value-of select="$channel-link"/>
</link>
<description>
- <xsl:value-of select="//meta[@name='description']/@content"/>
+ <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
</description>
<image>
+ <title>
+ <xsl:value-of select="$channel-title"/>
+ </title>
+ <link>
+ <xsl:value-of select="$channel-link"/>
+ </link>
<url>
<xsl:value-of select="//a[contains(@class, 'profile-picture media-thumbnail')]/@href"/>
</url>
</image>
- <xsl:apply-templates select="//*[@data-item-type='tweet']"/>
+ <xsl:apply-templates select="//li[@data-item-type='tweet']"/>
</channel>
</rss>
</xsl:template>
NAME
----
-tweeper - web scraper to convert a Twitter timeline to an RSS feed
+tweeper - web scraper to convert supported websites (e.g. Twitter.com) to RSS
SYNOPSIS
DESCRIPTION
-----------
-tweeper(1) is a web scraper which extracts the most recent public tweets of
-a given user from their home page on Twitter.com and formats them in RSS, so
-the information can be conveniently accessed and collected by a feed reader.
+tweeper(1) is a web scraper which can be used to conveniently follow the
+public activity of social network users without the need to log in or even be
+subscribed to the social network; tweeper converts the public information to
+RSS so that it can be accessed and collected by a feed reader.
tweeper started as the TWitter fEEd scraPER but support for other web sites
has been added.
The sites that tweeper is able to scrape and convert to RSS are:
-
+
* Twitter.com
* Pump.io based websites, like Identi.ca
* Dilbert.com
+* Howtoons.com
+* Instagram.com
+* Facebook.com (public pages)
tweeper can be used as:
1. a command line tool;
2. a filter for feed readers;
-3. a web based tool when PHP support is available in the web server.
+3. a web based tool when used with a PHP-enabled web server.
OPTIONS
liferea-add-feed "|tweeper http://twitter.com/NSAcareers"
-Using tweeper via web (the symlink must be created only the very first time):
+To use tweeper via web there are two options (the examples assume the
+installation directory to be `/usr/share/php/tweeper/`):
+
+1. Using the PHP built-in web server:
+
+ php -S localhost:8000 -t /usr/share/php/tweeper/
++
+and then visit 'http://localhost:8000/tweeper.php' in the web browser.
+
+2. Using a generic web server with the document root in '/var/www':
sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www
xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers
++
+It is enough to create the symlink only the very first time tweeper is used
+this way.
NOTES
COPYING
-------
-Copyright \(C) 2013-2014 Antonio Ospite <ao2@ao2.it>
+Copyright \(C) 2013-2015 Antonio Ospite <ao2@ao2.it>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
<?php
/*
* tweeper - a Twitter to RSS web scraper
- *
- * Copyright (C) 2013-2014 Antonio Ospite <ao2@ao2.it>
- *
+ *
+ * Copyright (C) 2013-2015 Antonio Ospite <ao2@ao2.it>
+ *
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+require_once 'XML/Serializer.php';
+
date_default_timezone_set('UTC');
class Tweeper {
private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
- public function __construct($stylesheet, $generate_enclosure = FALSE) {
- $stylesheet_contents = $this->get_contents($stylesheet);
-
- $xslDoc = new DOMDocument();
- $xslDoc->loadXML($stylesheet_contents);
-
- $this->xsltProcessor = new XSLTProcessor();
- $this->xsltProcessor->registerPHPFunctions();
- $this->xsltProcessor->setParameter('', 'generateEnclosure', $generate_enclosure);
- $this->xsltProcessor->importStylesheet($xslDoc);
/**
 * Create a scraper instance.
 *
 * The new constructor only records the enclosure preference; the removed
 * lines above show that the stylesheet used to be loaded here.
 *
 * @param bool $generate_enclosure
 *   Stored unchanged in $this->generate_enclosure, which load_stylesheet()
 *   passes to the stylesheet as the 'generateEnclosure' parameter.
 *
 * NOTE(review): no declaration of $generate_enclosure is visible in this
 * hunk; confirm the class declares the property.
 */
+ public function __construct($generate_enclosure = FALSE) {
+ $this->generate_enclosure = $generate_enclosure;
}
/**
 * Format a Unix timestamp as a date string followed by ' GMT'.
 *
 * @param mixed $timestamp
 *   Seconds since the Unix epoch. A value that is not numeric, or that is
 *   NaN, is replaced by 0.
 *
 * @return string
 *   The gmdate('D, d M Y H:i:s') rendering of the timestamp with ' GMT'
 *   appended.
 */
public static function epoch_to_gmdate($timestamp)
{
// The stylesheets call this with the result of an XPath number()
// conversion; presumably that is NaN when the source node is missing,
// which is why NaN is mapped to the epoch here (confirm).
+ if (!is_numeric($timestamp) || is_nan($timestamp)) {
+ $timestamp = 0;
+ }
+
return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT';
}
/**
 * Convert a textual date to the format produced by epoch_to_gmdate().
 *
 * @param string $date
 *   Any date description understood by strtotime().
 *
 * @return string
 *   The formatted date; when strtotime() cannot parse $date the timestamp 0
 *   is formatted instead.
 */
public static function str_to_gmdate($date)
{
$timestamp = strtotime($date);
// strtotime() signals failure with FALSE; fall back to the epoch.
+ if (FALSE === $timestamp) {
+ $timestamp = 0;
+ }
+
return Tweeper::epoch_to_gmdate($timestamp);
}
"video/ogg",
);
- $url_info = Tweeper::get_info($url);
+ // The RSS specification says that the enclosure element url must be http.
+ // See http://sourceforge.net/p/feedvalidator/bugs/72/
+ $http_url = preg_replace("/^https/", "http", $url);
+
+ $url_info = Tweeper::get_info($http_url);
$supported = in_array($url_info['content_type'], $supported_content_types);
if (!$supported) {
error_log($output);
}
- public function tweep($uri) {
- $html = Tweeper::get_contents($uri);
+ /**
+  * Build the XSLT processor for the stylesheet associated with a host.
+  *
+  * The stylesheet is looked up next to this file as
+  * "rss_converter_<host>.xsl".
+  *
+  * @param string $host
+  *   Host name of the site to convert.
+  *
+  * @return XSLTProcessor|null
+  *   A processor with PHP functions registered, the 'generateEnclosure'
+  *   parameter set and the stylesheet imported, or NULL when the stylesheet
+  *   is missing, cannot be read, is not well-formed XML or cannot be
+  *   imported.
+  */
+ private function load_stylesheet($host) {
+   $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
+   if (FALSE === file_exists($stylesheet)) {
+     trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR);
+     return NULL;
+   }
+
+   // An unreadable or empty stylesheet must not reach loadXML().
+   $stylesheet_contents = $this->get_contents($stylesheet);
+   if (!is_string($stylesheet_contents) || '' === $stylesheet_contents) {
+     trigger_error("Cannot read stylesheet $stylesheet", E_USER_ERROR);
+     return NULL;
+   }
+
+   $xslDoc = new DOMDocument();
+   if (FALSE === $xslDoc->loadXML($stylesheet_contents)) {
+     trigger_error("Cannot parse stylesheet $stylesheet", E_USER_ERROR);
+     return NULL;
+   }
+
+   $xsltProcessor = new XSLTProcessor();
+   $xsltProcessor->registerPHPFunctions();
+   $xsltProcessor->setParameter('', 'generateEnclosure', $this->generate_enclosure);
+   // importStylesheet() reports failure with FALSE; do not hand back a
+   // processor that has no stylesheet attached.
+   if (FALSE === $xsltProcessor->importStylesheet($xslDoc)) {
+     trigger_error("Cannot import stylesheet $stylesheet", E_USER_ERROR);
+     return NULL;
+   }
+
+   return $xsltProcessor;
+ }
+
+ /**
+  * Extract a JSON payload embedded in a page and serialize it as XML.
+  *
+  * @param string $html
+  *   Page contents containing the JSON data.
+  * @param string $json_match_expr
+  *   PCRE pattern whose first capture group isolates the JSON text.
+  * @param string $rootName
+  *   Name of the root element of the generated XML document.
+  *
+  * @return string|null
+  *   The serialized XML, or NULL when the pattern does not match, the JSON
+  *   cannot be decoded, or the serialization fails.
+  */
+ private function json_to_xml($html, $json_match_expr, $rootName) {
+   // pre-process, convert json to XML
+   $ret = preg_match($json_match_expr, $html, $matches);
+   // A pattern without a capture group would leave $matches[1] undefined.
+   if ($ret !== 1 || !isset($matches[1])) {
+     trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
+     return NULL;
+   }
+
+   $data = json_decode($matches[1]);
+   if (!$data) {
+     // Report the failure like every other error path of this method does,
+     // instead of silently returning NULL.
+     trigger_error("Cannot decode JSON data (json_last_error: " . json_last_error() . ")", E_USER_ERROR);
+     return NULL;
+   }
+
+   $serializer_options = array (
+     'addDecl' => TRUE,
+     'encoding' => "UTF-8",
+     'indent' => ' ',
+     'rootName' => $rootName,
+   );
+
+   $serializer = new XML_Serializer($serializer_options);
+
+   $status = $serializer->serialize($data);
+   if (PEAR::isError($status)) {
+     trigger_error($status->getMessage(), E_USER_ERROR);
+     return NULL;
+   }
+
+   return $serializer->getSerializedData();
+ }
+
+ /**
+  * Host-specific XML source for instagram.com.
+  *
+  * Converts the JSON assigned to window._sharedData in the page into an XML
+  * document rooted at <instagram>.
+  *
+  * @param string $html
+  *   Contents of the Instagram page.
+  *
+  * @return string|null
+  *   Whatever json_to_xml() returns: the XML text, or NULL on failure.
+  */
+ private function get_xml_instagram_com($html) {
+   // The dot is escaped so that only the literal "window._sharedData" matches.
+   return $this->json_to_xml($html, '/window\._sharedData = (.*);/', 'instagram');
+ }
+
+ /**
+  * Host-specific HTML pre-processing for facebook.com.
+  *
+  * Removes every HTML comment opening and closing marker from the page; the
+  * text that was between the markers is left in place.
+  *
+  * @param string $html
+  *   Contents of the Facebook page.
+  *
+  * @return string
+  *   The page with all "<!--" and "-->" sequences removed.
+  */
+ private function preprocess_html_facebook_com($html) {
+   // The search strings are applied in order: openers first, then closers.
+   $comment_markers = array('<!--', '-->');
+   return str_replace($comment_markers, '', $html);
+ }
/**
 * Parse the fetched page into a DOMDocument.
 *
 * When the class has a method named get_xml_<host> (dots in the host
 * replaced by underscores) its return value is loaded as XML; otherwise
 * $html is parsed with DOMDocument::loadHTML(). libxml errors raised while
 * loading are passed to log_xml_error() and cleared, and the previous
 * libxml_use_internal_errors() setting is restored.
 *
 * @param string $html
 *   Page contents to convert.
 * @param string $host
 *   Host name used to look up the host-specific method.
 *
 * @return DOMDocument
 *   The loaded document; this method has no path that returns NULL.
 */
+ private function html_to_xml($html, $host) {
$xmlDoc = new DOMDocument();
// Handle warnings and errors when loading invalid HTML.
$xml_errors_value = libxml_use_internal_errors(true);
- $xmlDoc->loadHTML($html);
+
+ // If there is a host-specific method to get the xml data, use it!
+ $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host);
+ if (method_exists($this, $get_xml_host_method)) {
+ $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
// NOTE(review): json_to_xml() can return NULL, and that value would be
// handed to loadXML() unchecked; confirm this case is handled.
+ $xmlDoc->loadXML($xml_data);
+ } else {
+ $xmlDoc->loadHTML($html);
+ }
+
foreach (libxml_get_errors() as $xml_error) {
$this->log_xml_error($xml_error);
}
libxml_clear_errors();
libxml_use_internal_errors($xml_errors_value);
- $output = $this->xsltProcessor->transformToXML($xmlDoc);
+ return $xmlDoc;
+ }
+
+ public function tweep($src_url) {
+ $url = parse_url($src_url);
+ if (FALSE === $url || empty($url["host"])) {
+ trigger_error("Invalid url: $src_url", E_USER_ERROR);
+ return NULL;
+ }
+
+ // Strip the leading www. to be more forgiving on input URLs
+ $host = preg_replace('/^www\./', '', $url["host"]);
+
+ $xsltProcessor = $this->load_stylesheet($host);
+ if (NULL === $xsltProcessor) {
+ return NULL;
+ }
+
+ $html = $this->get_contents($src_url);
+ if (FALSE === $html) {
+ return NULL;
+ }
+
+ $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host);
+ if (method_exists($this, $preprocess_html_host_method)) {
+ $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+ }
+
+ $xmlDoc = $this->html_to_xml($html, $host);
+ if (NULL === $xmlDoc) {
+ return NULL;
+ }
+
+ $output = $xsltProcessor->transformToXML($xmlDoc);
if (FALSE === $output) {
trigger_error('XSL transformation failed.', E_USER_ERROR);
}
}
+/**
+ * Tell whether the script is running under the command line SAPI.
+ *
+ * @return bool
+ *   TRUE when php_sapi_name() reports "cli", FALSE otherwise.
+ */
+function is_cli()
+{
+  $sapi_name = php_sapi_name();
+
+  return "cli" === $sapi_name;
+}
+
/**
 * Build the one-line usage message for the current execution mode.
 *
 * @param array|null $argv
 *   Command line arguments; only $argv[0] is read, and only when running
 *   from the command line.
 *
 * @return string
 *   "usage: " followed by the command line synopsis when is_cli() is true,
 *   or by the HTML-escaped query string synopsis built from
 *   $_SERVER['SCRIPT_NAME'] otherwise.
 */
function usage($argv)
{
- if (php_sapi_name() != 'cli')
- $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>");
- else
+ if (is_cli()) {
$usage = "{$argv[0]} [-e|-h|--help] <src_url>\n";
+ } else {
// The message is escaped because in this mode it is written to the page output.
+ $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>");
+ }
return "usage: $usage";
}
}
-if (php_sapi_name() != 'cli') {
- $options = parse_options_query_string();
- $ERROR_STREAM = fopen('php://output', 'w');
-} else {
+if (is_cli()) {
$options = parse_options_cli($argv, $argc);
$ERROR_STREAM = fopen('php://stderr', 'w');
+} else {
+ $options = parse_options_query_string();
+ $ERROR_STREAM = fopen('php://output', 'w');
}
if (!isset($options['src_url'])) {
- fwrite($ERROR_STREAM, usage($argv));
- exit(1);
-}
-
-$url = parse_url($options['src_url']);
-if (FALSE === $url || empty($url["host"])) {
- fwrite($ERROR_STREAM, "Invalid url: ${options['src_url']}\n");
- exit(1);
-}
-
-$stylesheet = "file://" . __DIR__ . "/rss_converter_" . $url["host"] . ".xsl";
-if (FALSE === file_exists($stylesheet)) {
- fwrite($ERROR_STREAM, "Conversion to RSS not supported: {$url["host"]}\n");
+ fwrite($ERROR_STREAM, usage(is_cli() ? $argv : NULL));
exit(1);
}
-$tweeper = new Tweeper($stylesheet, $options['generate_enclosure']);
+$tweeper = new Tweeper($options['generate_enclosure']);
echo $tweeper->tweep($options['src_url']);