From a41341e579712a0e6715eea8a96b864376350bfd Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Tue, 5 May 2015 09:21:48 +0200 Subject: [PATCH 01/16] tweeper.php: make date handling functions a little more robust Provide at least _some_ error checking and a fall-back value for invalid dates. --- tweeper.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tweeper.php b/tweeper.php index 37b73f5..81f5f85 100644 --- a/tweeper.php +++ b/tweeper.php @@ -32,12 +32,20 @@ class Tweeper { public static function epoch_to_gmdate($timestamp) { + if (!is_numeric($timestamp) || is_nan($timestamp)) { + $timestamp = 0; + } + return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT'; } public static function str_to_gmdate($date) { $timestamp = strtotime($date); + if (FALSE === $timestamp) { + $timestamp = 0; + } + return Tweeper::epoch_to_gmdate($timestamp); } -- 2.1.4 From 4c2e9862fdd7af1572f5a0fc90b030847b946ce8 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Tue, 5 May 2015 09:25:20 +0200 Subject: [PATCH 02/16] rss_converter_twitter.com.xsl: restrict tweet matching With new style retweets the quoted text is also matched by [@data-item-type='tweet'] but then the content is not handled, resulting in empty items in the RSS feed. Checking also for @role='listitem' allows to pick up only top-level tweets. --- rss_converter_twitter.com.xsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl index 665b71d..9efc769 100644 --- a/rss_converter_twitter.com.xsl +++ b/rss_converter_twitter.com.xsl @@ -38,7 +38,7 @@ - + @@ -96,7 +96,7 @@ - + -- 2.1.4 From 83812c773df73f3b73701e0653f42d32979a3b35 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Tue, 5 May 2015 09:28:23 +0200 Subject: [PATCH 03/16] rss_converter_twitter.com.xsl: improve matching the permalink Extract the permalink using the @data-permalink-path attribute, this works for withheld tweets too preventing them from having all the same guid. 
--- rss_converter_twitter.com.xsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl index 9efc769..77b6666 100644 --- a/rss_converter_twitter.com.xsl +++ b/rss_converter_twitter.com.xsl @@ -41,7 +41,7 @@ - + <xsl:value-of select="concat($user-name, ': ', $item-content)"/> -- 2.1.4 From e64422f4586b4ba25bbc13e56b8ac02ad74d92b7 Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Sun, 31 May 2015 19:17:28 +0200 Subject: [PATCH 04/16] rss_converter_twitter.com.xsl: update XPath of tweet content Using the role attribute to differentiate between original tweets and quoted tweet, as introduced in commit 4c2e986, does not work anymore, but the fact that original tweets are <li></li> elements while quoted tweets are <div></div> elements can be used instead. --- rss_converter_twitter.com.xsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl index 77b6666..9185a54 100644 --- a/rss_converter_twitter.com.xsl +++ b/rss_converter_twitter.com.xsl @@ -38,7 +38,7 @@ <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/> - <xsl:template match="//*[@data-item-type='tweet' and @role='listitem']"> + <xsl:template match="//li[@data-item-type='tweet']"> <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/> <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/> <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/> @@ -96,7 +96,7 @@ <xsl:value-of select="//a[contains(@class, 'profile-picture media-thumbnail')]/@href"/> </url> </image> - <xsl:apply-templates select="//*[@data-item-type='tweet' and @role='listitem']"/> + <xsl:apply-templates select="//li[@data-item-type='tweet']"/> </channel> </rss> </xsl:template> -- 2.1.4 
From 4b941ccd0115402aa2c7b0f50578f7f2a0e8d40a Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Fri, 12 Jun 2015 12:06:02 +0200 Subject: [PATCH 05/16] rss_converter_instagram.com.xsl: update to new json format The new Instagram homepage provides json data in a format different than before, update the xsl to support it. Unfortunately the data in the new format does not provide the descriptions of the items, so use some placeholder values (URL, comments count, likes count) to present at least something. --- rss_converter_instagram.com.xsl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index f42ca8b..d9dfcdd 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -36,16 +36,16 @@ <xsl:value-of disable-output-escaping="yes" select="php:function('Tweeper::generate_enclosure', string(./standard_resolution/url))"/> </xsl:template> - <xsl:variable name="screen-name" select="//UserProfile/XML_Serializer_Tag/user/full_name"/> - <xsl:variable name="user-name" select="//UserProfile/XML_Serializer_Tag/user/username"/> + <xsl:variable name="screen-name" select="//ProfilePage/XML_Serializer_Tag/user/full_name"/> + <xsl:variable name="user-name" select="//ProfilePage/XML_Serializer_Tag/user/username"/> - <xsl:template match="//userMedia/XML_Serializer_Tag"> - <xsl:variable name="item-content-title" select="./caption/text"/> - <xsl:variable name="item-content-src" select="./images/standard_resolution/url"/> - <xsl:variable name="item-permalink" select="./link"/> + <xsl:template match="//media/nodes/XML_Serializer_Tag"> + <xsl:variable name="item-content-src" select="./display_src"/> + <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/> + <xsl:variable name="item-content-title" select="concat($user-name, ': ', $item-permalink, ', comments: ', ./comments/count, ', likes: ', ./likes/count)"/> 
<item> <title> - <xsl:value-of select="concat($user-name, ': ', $item-content-title)"/> + <xsl:value-of select="$item-content-title"/> @@ -54,13 +54,13 @@ - + <![CDATA[


- + ]]>
@@ -71,7 +71,7 @@ - + @@ -84,7 +84,7 @@ - + @@ -94,10 +94,10 @@ <xsl:value-of select="$channel-link"/> </link> <url> - <xsl:value-of select="//user/profile_picture"/> + <xsl:value-of select="//user/profile_pic_url"/> </url> </image> - <xsl:apply-templates select="//userMedia/XML_Serializer_Tag"/> + <xsl:apply-templates select="//media/nodes/XML_Serializer_Tag"/> </channel> </rss> </xsl:template> -- 2.1.4 From 69e71256e0c4a30c57fe13f0ff5b3038e2c5f563 Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Wed, 1 Jul 2015 13:34:53 +0200 Subject: [PATCH 06/16] tweeper.php: make error about missing stylesheet more explicit --- tweeper.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tweeper.php b/tweeper.php index 81f5f85..178176d 100644 --- a/tweeper.php +++ b/tweeper.php @@ -161,7 +161,7 @@ class Tweeper { private function load_stylesheet($host) { $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; if (FALSE === file_exists($stylesheet)) { - trigger_error("Conversion to RSS not supported: $host", E_USER_ERROR); + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); return NULL; } -- 2.1.4 From 032dc6cebadb1bc3e1647dff131a1e99c53019ff Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Wed, 1 Jul 2015 13:35:56 +0200 Subject: [PATCH 07/16] tweeper.php: strip the leading "www." from hosts This makes tweeper more forgiving when it is passed URLs either with or without the "www" subdomain for the same host. --- tweeper.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tweeper.php b/tweeper.php index 178176d..2444a39 100644 --- a/tweeper.php +++ b/tweeper.php @@ -244,7 +244,10 @@ class Tweeper { return NULL; } - $xsltProcessor = $this->load_stylesheet($url["host"]); + // Strip the leading www. 
to be more forgiving on input URLs + $host = preg_replace('/^www\./', '', $url["host"]); + + $xsltProcessor = $this->load_stylesheet($host); if (NULL === $xsltProcessor) { return NULL; } @@ -254,7 +257,7 @@ class Tweeper { return NULL; } - $xmlDoc = $this->html_to_xml($html, $url["host"]); + $xmlDoc = $this->html_to_xml($html, $host); if (NULL === $xmlDoc) { return NULL; } -- 2.1.4 From 7097a8ad2ef040bc81a8c5f7ed7cc02e0073eaab Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Wed, 1 Jul 2015 13:37:57 +0200 Subject: [PATCH 08/16] tweeper.php: support host-specific methods for preprocessing the HTML data Some sites serve mangled HTML code, so a mechanism to clean it up before loading it as XML is needed. For instance, facebook.com puts some content inside HTML comments, and these must be stripped in order to make the content available to the HTML parser when loading the data into a DOMDocument. --- tweeper.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tweeper.php b/tweeper.php index 2444a39..efc0fd6 100644 --- a/tweeper.php +++ b/tweeper.php @@ -257,6 +257,11 @@ class Tweeper { return NULL; } + $preprocess_html_host_method = 'preprocess_html_' . 
str_replace(".", "_", $host); + if (method_exists($this, $preprocess_html_host_method)) { + $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + } + $xmlDoc = $this->html_to_xml($html, $host); if (NULL === $xmlDoc) { return NULL; -- 2.1.4 From 481f2d015d14180be9982ced2f281494e0ec3855 Mon Sep 17 00:00:00 2001 From: Antonio Ospite <ao2@ao2.it> Date: Wed, 1 Jul 2015 13:47:53 +0200 Subject: [PATCH 09/16] Add support for Facebook.com public pages --- README | 1 + rss_converter_facebook.com.xsl | 115 +++++++++++++++++++++++++++++++++++++++++ tweeper.1.asciidoc | 1 + tweeper.php | 6 +++ 4 files changed, 123 insertions(+) create mode 100644 rss_converter_facebook.com.xsl diff --git a/README b/README index 8612636..d2d200b 100644 --- a/README +++ b/README @@ -35,6 +35,7 @@ The currently supported sites are: * Dilbert.com * Howtoons.com * Instagram.com + * Facebook.com (public pages) Tweeper can be used via web or as a command line program, for example as a filter in your feed reader, by passing the URL of the user's public timeline diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl new file mode 100644 index 0000000..b50be3d --- /dev/null +++ b/rss_converter_facebook.com.xsl @@ -0,0 +1,115 @@ +<!-- + Stylesheet to convert a Facebook public page to RSS. + + Copyright (C) 2015 Antonio Ospite <ao2@ao2.it> + + This file is part of tweeper. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +--> + +<!-- + Since June 23rd, 2015 facebook.com deprecated the RSS feed endpoint for public pages: + https://developers.facebook.com/docs/apps/changelog#v2_3_90_day_deprecations + + They suggest to use the Graph API but they fail to mention that it does not + work anymore without authentication, so it cannot be considered an + _equivalent_ solution. + + Luckily we've got Tweeper! +--> + +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:php="http://php.net/xsl" + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> + + <xsl:output method="xml" indent="yes"/> + + <xsl:variable name="BaseURL"> + <xsl:text>https://facebook.com</xsl:text> + </xsl:variable> + + <xsl:template match="//div[contains(@class, 'userContentWrapper')]"> + <xsl:variable name="item-content" select=".//div[contains(@class, 'userContent')]"/> + <xsl:variable name="item-permalink" select="concat($BaseURL, .//a[@target='']/@href)"/> + <item> + <title> + <xsl:variable name="item-title" select="$item-content/p"/> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$item-title"/> + </xsl:otherwise> + </xsl:choose> + + + + + + + + + + + + + <![CDATA[ + + + ]]> + +
+
+ + + + + + + + + Tweeper + + <xsl:value-of select="$channel-title"/> + + + + + + + + + + <xsl:value-of select="$channel-title"/> + + + + + + + + + + + + + diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc index d6ced90..2782dac 100644 --- a/tweeper.1.asciidoc +++ b/tweeper.1.asciidoc @@ -30,6 +30,7 @@ The sites that tweeper is able to scrape and convert to RSS are: * Dilbert.com * Howtoons.com * Instagram.com +* Facebook.com (public pages) tweeper can be used as: diff --git a/tweeper.php b/tweeper.php index efc0fd6..5e9d242 100644 --- a/tweeper.php +++ b/tweeper.php @@ -213,6 +213,12 @@ class Tweeper { return $this->json_to_xml($html, '/window._sharedData = (.*);/', 'instagram'); } + private function preprocess_html_facebook_com($html) { + $html = str_replace('', '', $html); + return $html; + } + private function html_to_xml($html, $host) { $xmlDoc = new DOMDocument(); -- 2.1.4 From 81cbad67160cd9c338df577a28800f406615fdd6 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 12:36:58 +0200 Subject: [PATCH 10/16] rss_converter_instagram.com.xsl: use better name for the image variable --- rss_converter_instagram.com.xsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index d9dfcdd..1bb580f 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -40,7 +40,7 @@ - + @@ -60,7 +60,7 @@ <![CDATA[


- + ]]>
-- 2.1.4 From 8b8fce67efe48521ed344220c74be3d9658f703c Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 12:43:49 +0200 Subject: [PATCH 11/16] rss_converter_instagram.com.xsl: use the image caption as the item content Instagram has reintroduced serving the image caption in the json data, so use it; it is way nicer than the stats tweeper was showing before. --- rss_converter_instagram.com.xsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index 1bb580f..9ef06ef 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -41,11 +41,11 @@ + - - <xsl:value-of select="$item-content-title"/> + <xsl:value-of select="$item-content-caption"/> @@ -59,7 +59,7 @@ <![CDATA[ -


+


]]>
-- 2.1.4 From 23bd2947e8afbb21dab696d9a655c53b39c8ff55 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 12:48:30 +0200 Subject: [PATCH 12/16] rss_converter_instagram.com.xsl: ellipsize titles --- rss_converter_instagram.com.xsl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index 9ef06ef..0e1de8d 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -45,7 +45,18 @@ - <xsl:value-of select="$item-content-caption"/> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-content-caption) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="normalize-space(substring($item-content-caption, 1, $truncated-length))"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="normalize-space($item-content-caption)"/> + </xsl:otherwise> + </xsl:choose> -- 2.1.4 From e2e4404f14ad9c387a7b3c1d0c97ab289a7576b0 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 13:08:31 +0200 Subject: [PATCH 13/16] rss_converter_instagram.com.xsl: fix enclosure generation --- rss_converter_instagram.com.xsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index 0e1de8d..cc540ce 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -32,8 +32,8 @@ https://instagram.com - - + + @@ -75,7 +75,7 @@ ]]> - +
-- 2.1.4 From 0bfcdd22bd1ff2f2f1ea4e7cdec2fb8c9a5a79b2 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 15:45:33 +0200 Subject: [PATCH 14/16] rss_converter_instagram.com.xsl: add a label if the content is a video --- rss_converter_instagram.com.xsl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index cc540ce..4f1d14f 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -70,7 +70,12 @@ <![CDATA[ -


+

+ + (Video) + + +


]]>
-- 2.1.4 From 154afcec55a2d098e825befb1d690a11d0b4bc5a Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 15:50:12 +0200 Subject: [PATCH 15/16] TODO: remove item about Instagram videos Even if tweeper does not show the video itself in the RSS item content it at least tells the user that the content is a video, so consider this done. --- TODO | 1 - 1 file changed, 1 deletion(-) diff --git a/TODO b/TODO index 4492297..66df976 100644 --- a/TODO +++ b/TODO @@ -6,4 +6,3 @@ - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php -- add support for Instagram videos -- 2.1.4 From d831bbec69bfdbd12ce7774be02eb224cfe41d3b Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 25 Jul 2015 15:52:57 +0200 Subject: [PATCH 16/16] TODO: remove item about duplicated RSS items Now all the generated feeds use the element to uniquely identify items. --- TODO | 1 - 1 file changed, 1 deletion(-) diff --git a/TODO b/TODO index 66df976..9cff1c6 100644 --- a/TODO +++ b/TODO @@ -2,7 +2,6 @@ - evaluate the use of the RSS element. - use the element for pump.io media objects - use the element for images on dilbert.com -- debug some duplicated entries in the tweeter feeds in liferea - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php -- 2.1.4