From: Antonio Ospite Date: Sun, 13 Sep 2015 19:29:04 +0000 (+0200) Subject: Merge tag 'v0.4' into debian X-Git-Tag: debian/0.4-1~5 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/cfa97246db710911554532b4a6f88f6b5e5cb6b0?hp=516bc275df08ce980f955a1a862226ef15f129fe Merge tag 'v0.4' into debian Release v0.4 --- diff --git a/INSTALL b/INSTALL index a2e602d..d575bb1 100644 --- a/INSTALL +++ b/INSTALL @@ -1,3 +1,6 @@ The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin + +Tweeper depends on php-xml-serializer which is used to convert json to xml for +some sites that provide the timeline data in json rather than in usable html. diff --git a/NEWS b/NEWS index b183893..d50eb50 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +News for v0.4: +============== + + * Make the generated RSS validate with feedvalidator.org + * Fix support for Dilbert.com + * Add support for Instragram.com + * Add support for public pages on Facebook.com + * Make tweeper work with the PHP built-in web server + * Misc fixes to code and documentation + News for v0.3: ============== diff --git a/README b/README index 0ebe37c..7703e64 100644 --- a/README +++ b/README @@ -1,10 +1,12 @@ -Tweeper is a web scraper which extracts the most recent public tweets of -a given user from their home page on Twitter.com and formats them in RSS, so -the information can be conveniently accessed and collected by a feed reader. +Tweeper is a web scraper which can be used to conveniently follow the public +activity of social network users without the need to log in or even be +subscribed to the social network; tweeper converts the public information to +RSS so that it can be accessed and collected by a feed reader. -Since Jun 11th 2013 Twitter.com retired their API v1.0, so it's not possible -to access a user timeline via RSS anymore, and it's also become mandatory to -authenticate via OAuth to access this _public_ information in JSON format: +Since Jun 11th 2013, when Twitter.com retired their API v1.0, it has not been +possible anymore to access a user timeline via RSS, and it has also become +mandatory to authenticate via OAuth to access this _public_ information in the +JSON format: https://dev.twitter.com/discussions/16289 https://dev.twitter.com/discussions/11564 @@ -24,14 +26,23 @@ whom you are friend to. [1] http://www.urbandictionary.com/define.php?term=TWEEPER&defid=3743173 -Tweeper can be used via web or as a command line program, for example as -a filter in your feed reader, by passing the URL of the user's public timeline -as the first argument. - Tweeper can easily scrape sites other than Twitter, it is just a matter of writing an xsl stylesheet for the transformation; an example for pump.io activity stream is provided in rss_converter_pump.io.xsl +The currently supported sites are: + + * Twitter.com + * Pump.io based websites, like Identi.ca + * Dilbert.com + * Howtoons.com + * Instagram.com + * Facebook.com (public pages) + +Tweeper can be used via web or as a command line program, for example as +a filter in your feed reader, by passing the URL of the user's public timeline +as the first argument. + Example of use on the command line: $ php tweeper.php http://twitter.com/NSACareers diff --git a/TODO b/TODO index 2b0f1ae..95d08a4 100644 --- a/TODO +++ b/TODO @@ -2,8 +2,7 @@ - evaluate the use of the RSS element. - use the element for pump.io media objects - use the element for images on dilbert.com -- consider using http://www.dilbert.com/fast for dilbert.com -- debug some duplicated entries in the tweeter feeds in liferea +- show images (or even cards) directly in RSS items for twitter.com - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php diff --git a/rss_converter_dilbert.com.xsl b/rss_converter_dilbert.com.xsl index 82c33f2..f255be1 100644 --- a/rss_converter_dilbert.com.xsl +++ b/rss_converter_dilbert.com.xsl @@ -34,55 +34,67 @@ + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> - + - - - - + + + + - <xsl:value-of select="$picture-element/@title"/> + <xsl:value-of select="$picture-title"/> - + + + + - + <![CDATA[ - + ]]> + + Tweeper - <xsl:value-of select="//meta[@property='og:title']/@content"/> + <xsl:value-of select="$channel-title"/> - + + + <xsl:value-of select="$channel-title"/> + + + + - + - + diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl new file mode 100644 index 0000000..9fbc187 --- /dev/null +++ b/rss_converter_facebook.com.xsl @@ -0,0 +1,117 @@ + + + + + + + + + + https://facebook.com + + + + + + + + <xsl:variable name="item-title" select="$item-content/p"/> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($item-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$item-title"/> + </xsl:otherwise> + </xsl:choose> + + + + + + + + + + + + + <![CDATA[ + + + ]]> + + + + + + + + + + + + Tweeper + + <xsl:value-of select="$channel-title"/> + + + + + + <![CDATA[ + + ]]> + + + + <xsl:value-of select="$channel-title"/> + + + + + + + + + + + + + diff --git a/rss_converter_howtoons.com.xsl b/rss_converter_howtoons.com.xsl index 4067065..41939f1 100644 --- a/rss_converter_howtoons.com.xsl +++ b/rss_converter_howtoons.com.xsl @@ -29,7 +29,8 @@ + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> @@ -38,13 +39,17 @@ + <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/> - + + + + @@ -63,21 +68,29 @@ + + Tweeper - <xsl:value-of select="//title"/> + <xsl:value-of select="$channel-title"/> - + The world's greatest D.I.Y. comic website! Tools of mass construction! + + <xsl:value-of select="$channel-title"/> + + + + http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl new file mode 100644 index 0000000..5f1bb7f --- /dev/null +++ b/rss_converter_instagram.com.xsl @@ -0,0 +1,139 @@ + + + + + + + + + https://instagram.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + <xsl:variable name="title-length" select="140"/> + <xsl:variable name="item-content-title" select="normalize-space(concat($user-name, ': ', $item-content-caption))"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($item-content-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($item-content-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$item-content-title"/> + </xsl:otherwise> + </xsl:choose> + + + + + + + + + + + + + <![CDATA[ +

+ + (Video) + + +


+ + ]]> +
+ + + +
+
+ + + + + + + + + Tweeper + + <xsl:value-of select="$channel-title"/> + + + + + + <![CDATA[ + + + + + + ]]> + + + + <xsl:value-of select="$channel-title"/> + + + + + + + + + + + + +
diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl index 94b083b..ed99713 100644 --- a/rss_converter_pump.io.xsl +++ b/rss_converter_pump.io.xsl @@ -22,7 +22,8 @@ + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> @@ -32,42 +33,54 @@ - + + - <xsl:value-of select="concat($user-name, ': ', normalize-space($activity-text))"/> + <xsl:value-of select="concat($user-name, ': ', normalize-space($item-content))"/> - + + + + <![CDATA[ - + ]]> + + Tweeper - <xsl:value-of select="concat(substring-after($user-name, '@'), ' / ', substring-before($user-name, '@'))"/> + <xsl:value-of select="$channel-title"/> - + + + <xsl:value-of select="$channel-title"/> + + + + diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl index a5f3c4b..9185a54 100644 --- a/rss_converter_twitter.com.xsl +++ b/rss_converter_twitter.com.xsl @@ -21,13 +21,14 @@ + xsl:extension-element-prefixes="php" + exclude-result-prefixes="php"> - + https://twitter.com @@ -37,16 +38,20 @@ - + - + + - <xsl:value-of select="concat($user-name, ': ', $tweet-text)"/> + <xsl:value-of select="concat($user-name, ': ', $item-content)"/> - + + + + @@ -54,36 +59,44 @@ <![CDATA[ - + ]]> - + + + - + Tweeper - <xsl:text>Twitter / </xsl:text><xsl:value-of select="$screen-name"/> + <xsl:value-of select="$channel-title"/> - + - + + + <xsl:value-of select="$channel-title"/> + + + + - + diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc index c5c5fa9..2a4a523 100644 --- a/tweeper.1.asciidoc +++ b/tweeper.1.asciidoc @@ -5,7 +5,7 @@ TWEEPER(1) NAME ---- -tweeper - web scraper to convert a Twitter timeline to an RSS feed +tweeper - web scraper to convert supported websites (e.g. Twitter.com) to RSS SYNOPSIS @@ -16,24 +16,28 @@ SYNOPSIS DESCRIPTION ----------- -tweeper(1) is a web scraper which extracts the most recent public tweets of -a given user from their home page on Twitter.com and formats them in RSS, so -the information can be conveniently accessed and collected by a feed reader. +tweeper(1) is a web scraper which can be used to conveniently follow the +public activity of social network users without the need to log in or even be +subscribed to the social network; tweeper converts the public information to +RSS so that it can be accessed and collected by a feed reader. tweeper started as the TWitter fEEd scraPER but support for other web sites has been added. The sites that tweeper is able to scrape and convert to RSS are: - + * Twitter.com * Pump.io based websites, like Identi.ca * Dilbert.com +* Howtoons.com +* Instagram.com +* Facebook.com (public pages) tweeper can be used as: 1. a command line tool; 2. a filter for feed readers; -3. a web based tool when PHP support is available in the web server. +3. a web based tool when used with a PHP-enabled web server. OPTIONS @@ -57,10 +61,22 @@ Using tweeper as a filter for the Liferea feed reader: liferea-add-feed "|tweeper http://twitter.com/NSAcareers" -Using tweeper via web (the symlink must be created only the very first time): +To use tweeper via web there are two options (the examples assume the +installation directory to be `/usr/share/php/tweeper/`): + +1. Using the PHP built-in web server: + + php -S localhost:8000 -t /usr/share/php/tweeper/ ++ +and then visit 'http://localhost:8000/tweeper.php' in the web browser. + +2. Using a generic web server with the document root in '/var/www': sudo ln -s /usr/share/php/tweeper/tweeper.php /var/www xdg-open http://localhost/tweeper.php?src_url=http://twitter.com/NSAcareers ++ +It is enough to create the symlink only the very first time tweeper is used +this way. NOTES @@ -92,7 +108,7 @@ Main web site: COPYING ------- -Copyright \(C) 2013-2014 Antonio Ospite +Copyright \(C) 2013-2015 Antonio Ospite This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/tweeper.php b/tweeper.php index 4be5c7d..a9fce9b 100644 --- a/tweeper.php +++ b/tweeper.php @@ -1,49 +1,51 @@ - * + * + * Copyright (C) 2013-2015 Antonio Ospite + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +require_once 'XML/Serializer.php'; + date_default_timezone_set('UTC'); class Tweeper { private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; - public function __construct($stylesheet, $generate_enclosure = FALSE) { - $stylesheet_contents = $this->get_contents($stylesheet); - - $xslDoc = new DOMDocument(); - $xslDoc->loadXML($stylesheet_contents); - - $this->xsltProcessor = new XSLTProcessor(); - $this->xsltProcessor->registerPHPFunctions(); - $this->xsltProcessor->setParameter('', 'generateEnclosure', $generate_enclosure); - $this->xsltProcessor->importStylesheet($xslDoc); + public function __construct($generate_enclosure = FALSE) { + $this->generate_enclosure = $generate_enclosure; } public static function epoch_to_gmdate($timestamp) { + if (!is_numeric($timestamp) || is_nan($timestamp)) { + $timestamp = 0; + } + return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT'; } public static function str_to_gmdate($date) { $timestamp = strtotime($date); + if (FALSE === $timestamp) { + $timestamp = 0; + } + return Tweeper::epoch_to_gmdate($timestamp); } @@ -104,7 +106,11 @@ class Tweeper { "video/ogg", ); - $url_info = Tweeper::get_info($url); + // The RSS specification says that the enclosure element url must be http. + // See http://sourceforge.net/p/feedvalidator/bugs/72/ + $http_url = preg_replace("/^https/", "http", $url); + + $url_info = Tweeper::get_info($http_url); $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { @@ -152,21 +158,122 @@ class Tweeper { error_log($output); } - public function tweep($uri) { - $html = Tweeper::get_contents($uri); + private function load_stylesheet($host) { + $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; + if (FALSE === file_exists($stylesheet)) { + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); + return NULL; + } + + $stylesheet_contents = $this->get_contents($stylesheet); + + $xslDoc = new DOMDocument(); + $xslDoc->loadXML($stylesheet_contents); + + $xsltProcessor = new XSLTProcessor(); + $xsltProcessor->registerPHPFunctions(); + $xsltProcessor->setParameter('', 'generateEnclosure', $this->generate_enclosure); + $xsltProcessor->importStylesheet($xslDoc); + + return $xsltProcessor; + } + + private function json_to_xml($html, $json_match_expr, $rootName) { + // pre-process, convert json to XML + $ret = preg_match($json_match_expr, $html, $matches); + if ($ret !== 1) { + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + return NULL; + } + + $data = json_decode($matches[1]); + if (!$data) { + return NULL; + } + + $serializer_options = array ( + 'addDecl' => TRUE, + 'encoding' => "UTF-8", + 'indent' => ' ', + 'rootName' => $rootName, + ); + + $serializer = new XML_Serializer($serializer_options); + + $status = $serializer->serialize($data); + if (PEAR::isError($status)) { + trigger_error($status->getMessage(), E_USER_ERROR); + return NULL; + } + + return $serializer->getSerializedData(); + } + + private function get_xml_instagram_com($html) { + return $this->json_to_xml($html, '/window._sharedData = (.*);/', 'instagram'); + } + + private function preprocess_html_facebook_com($html) { + $html = str_replace('', '', $html); + return $html; + } + private function html_to_xml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(true); - $xmlDoc->loadHTML($html); + + // If there is a host-specific method to get the xml data, use it! + $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host); + if (method_exists($this, $get_xml_host_method)) { + $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xmlDoc->loadXML($xml_data); + } else { + $xmlDoc->loadHTML($html); + } + foreach (libxml_get_errors() as $xml_error) { $this->log_xml_error($xml_error); } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); - $output = $this->xsltProcessor->transformToXML($xmlDoc); + return $xmlDoc; + } + + public function tweep($src_url) { + $url = parse_url($src_url); + if (FALSE === $url || empty($url["host"])) { + trigger_error("Invalid url: $src_url", E_USER_ERROR); + return NULL; + } + + // Strip the leading www. to be more forgiving on input URLs + $host = preg_replace('/^www\./', '', $url["host"]); + + $xsltProcessor = $this->load_stylesheet($host); + if (NULL === $xsltProcessor) { + return NULL; + } + + $html = $this->get_contents($src_url); + if (FALSE === $html) { + return NULL; + } + + $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host); + if (method_exists($this, $preprocess_html_host_method)) { + $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + } + + $xmlDoc = $this->html_to_xml($html, $host); + if (NULL === $xmlDoc) { + return NULL; + } + + $output = $xsltProcessor->transformToXML($xmlDoc); if (FALSE === $output) { trigger_error('XSL transformation failed.', E_USER_ERROR); @@ -176,12 +283,18 @@ class Tweeper { } } +function is_cli() +{ + return (php_sapi_name() === "cli"); +} + function usage($argv) { - if (php_sapi_name() != 'cli') - $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); - else + if (is_cli()) { $usage = "{$argv[0]} [-e|-h|--help] \n"; + } else { + $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); + } return "usage: $usage"; } @@ -232,30 +345,18 @@ function parse_options_query_string() } -if (php_sapi_name() != 'cli') { - $options = parse_options_query_string(); - $ERROR_STREAM = fopen('php://output', 'w'); -} else { +if (is_cli()) { $options = parse_options_cli($argv, $argc); $ERROR_STREAM = fopen('php://stderr', 'w'); +} else { + $options = parse_options_query_string(); + $ERROR_STREAM = fopen('php://output', 'w'); } if (!isset($options['src_url'])) { - fwrite($ERROR_STREAM, usage($argv)); - exit(1); -} - -$url = parse_url($options['src_url']); -if (FALSE === $url || empty($url["host"])) { - fwrite($ERROR_STREAM, "Invalid url: ${options['src_url']}\n"); - exit(1); -} - -$stylesheet = "file://" . __DIR__ . "/rss_converter_" . $url["host"] . ".xsl"; -if (FALSE === file_exists($stylesheet)) { - fwrite($ERROR_STREAM, "Conversion to RSS not supported: {$url["host"]}\n"); + fwrite($ERROR_STREAM, usage(is_cli() ? $argv : NULL)); exit(1); } -$tweeper = new Tweeper($stylesheet, $options['generate_enclosure']); +$tweeper = new Tweeper($options['generate_enclosure']); echo $tweeper->tweep($options['src_url']);