From: Antonio Ospite Date: Mon, 23 May 2016 14:20:49 +0000 (+0200) Subject: Merge tag 'v0.6' into debian X-Git-Tag: debian/0.6-1~5 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/768c52191fabeee267767c74a5ffee452e15b05a?hp=dfce80ce98d895650961cfbdc19e5c097c7e4bf0 Merge tag 'v0.6' into debian Release v0.6 --- diff --git a/HACKING b/HACKING new file mode 100644 index 0000000..14fba7e --- /dev/null +++ b/HACKING @@ -0,0 +1,10 @@ +The code follows the Drupal coding standards: +https://www.drupal.org/coding-standards + +Style compliance can be checked using the Coder Sniffer extension to the PEAR +PHP_CodeSniffer project, for instructions about how to install Coder Sniffer +see https://www.drupal.org/node/1419988 + +Use this command to check the style: + + $ phpcs --standard=Drupal . diff --git a/INSTALL b/INSTALL index bb71797..6c19099 100644 --- a/INSTALL +++ b/INSTALL @@ -5,3 +5,8 @@ under /usr/share/php/tweeper and then make a symlink to the wrapper script Tweeper depends on php-symfony-serializer which is used to convert json to xml for some sites which provide the timeline data in json rather than in usable html. 
+ +NOTE: Tweeper also depends indirectly on php-symfony-property-access because +the code relies on the ObjectNormalizer class which requires the +PropertyAccess component, see +http://symfony.com/doc/current/components/serializer.html#installation diff --git a/NEWS b/NEWS index 0a4af4a..d125dd5 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,25 @@ +News for v0.6: +============== + + * Fix support for Facebook.com public pages + * Fix support for Dilbert.com + * Major code cleanup (coding style, functions naming) + * Fix indentation when generating the <enclosure> element + * Support generating enclosure for "image/png" links + * Major improvements for Twitter.com: + - embed images directly in the item description, linking to the original + versions uploaded by the user; + - use direct links instead of ones pointing to the t.co redirector; + - show explicitly if the attached media is a video; + - add enclosure element support for attached images. + * Minor improvements for Instagram.com: + - fix the channel link; + - make images adapt to the feed reader view, this avoids horizontal + scrolling if the image is too big. + * Support generating enclosure for images on Dilbert.com + * Support generating enclosure for images on Pump.io sites + * Misc fixes to code and documentation + News for v0.5: ============== diff --git a/README b/README index 7703e64..5f22618 100644 --- a/README +++ b/README @@ -45,11 +45,11 @@ as the first argument. Example of use on the command line: - $ php tweeper.php http://twitter.com/NSACareers + $ php tweeper.php https://twitter.com/NSACareers Example of use as a Liferea[2] filter: - $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php http://twitter.com/NSAcareers" + $ liferea-add-feed "|php .../path_to_tweeper/tweeper.php https://twitter.com/NSAcareers" Example of use with identi.ca: @@ -58,4 +58,4 @@ Example of use with identi.ca: [2] http://lzone.de/liferea/ Tweeper is licensed under the GPLv3. 
-Tweeper was written by Antonio Ospite http://ao2.it +Tweeper was written by Antonio Ospite https://ao2.it diff --git a/TODO b/TODO index 95d08a4..b305783 100644 --- a/TODO +++ b/TODO @@ -1,8 +1,7 @@ - write a better XSL stylesheet? I am not an XSL expert. - evaluate the use of the RSS element. -- use the element for pump.io media objects -- use the element for images on dilbert.com -- show images (or even cards) directly in RSS items for twitter.com +- show cards directly in RSS items for twitter.com +- show direct links for videos in the Instagram feed - check the encoding of the tweets when UTF is used, maybe solvable with mb_convert_encoding()? See http://php.net/manual/en/domdocument.loadhtml.php diff --git a/rss_converter_dilbert.com.xsl b/rss_converter_dilbert.com.xsl index f255be1..b6d1975 100644 --- a/rss_converter_dilbert.com.xsl +++ b/rss_converter_dilbert.com.xsl @@ -47,7 +47,18 @@ - <xsl:value-of select="$picture-title"/> + <xsl:variable name="title-length" select="140"/> + <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> + <xsl:choose> + <xsl:when test="string-length($picture-title) > $title-length"> + <xsl:variable name="truncated-length" select="$title-length - 3"/> + <xsl:value-of select="substring($picture-title, 1, $truncated-length)"/> + <xsl:text>...</xsl:text> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$picture-title"/> + </xsl:otherwise> + </xsl:choose> @@ -56,13 +67,16 @@ - + <![CDATA[ - + {$picture-title} ]]> + + + diff --git a/rss_converter_facebook.com.xsl b/rss_converter_facebook.com.xsl index 9fbc187..418b3d2 100644 --- a/rss_converter_facebook.com.xsl +++ b/rss_converter_facebook.com.xsl @@ -42,12 +42,28 @@ https://facebook.com + + + - - + + + + + + - <xsl:variable name="item-title" select="$item-content/p"/> + <xsl:variable name="item-title" select="$item-content//p"/> <xsl:variable name="title-length" select="140"/> <!-- ellipsize, inspired from http://stackoverflow.com/questions/13622338 --> 
<xsl:choose> @@ -69,12 +85,20 @@ </guid> <pubDate> <xsl:variable name="timestamp" select=".//abbr[@data-shorten]/@data-utime"/> - <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/> + <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> </pubDate> <description> + + <!-- + Get only the children starting from the one with class="userContent", + this way the content header is skipped + --> + <xsl:variable + name="usercontent-position" + select="count($item-content/div[contains(@class, 'userContent')]/preceding-sibling::*) + 1"/> + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> - <xsl:copy-of select="$item-content/node()"/> - <xsl:copy-of select=".//div[@class='mtm']/node()"/> + <xsl:copy-of select="$item-content/div[position() >= $usercontent-position]"/> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> </item> diff --git a/rss_converter_howtoons.com.xsl b/rss_converter_howtoons.com.xsl index 41939f1..403b9ac 100644 --- a/rss_converter_howtoons.com.xsl +++ b/rss_converter_howtoons.com.xsl @@ -57,7 +57,7 @@ <xsl:variable name="day" select="substring($date, 4, 2)"/> <xsl:variable name="year" select="substring($date, 7, 2)"/> <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/> - <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', $iso-date)"/> + <xsl:value-of select="php:functionString('Tweeper::strToRssDate', $iso-date)"/> </pubDate> <description> <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> diff --git a/rss_converter_instagram.com.xsl b/rss_converter_instagram.com.xsl index aaab2e8..e869d7d 100644 --- a/rss_converter_instagram.com.xsl +++ b/rss_converter_instagram.com.xsl @@ -24,7 +24,7 @@ xsl:extension-element-prefixes="php" exclude-result-prefixes="php"> - <xsl:param name="generateEnclosure"/> + <xsl:param name="generate-enclosure"/> <xsl:output method="xml" indent="yes"/> @@ -32,10 +32,6 @@ 
<xsl:text>https://instagram.com</xsl:text> </xsl:variable> - <xsl:template match="display_src"> - <xsl:value-of disable-output-escaping="yes" select="php:function('Tweeper::generate_enclosure', string(text()))"/> - </xsl:template> - <xsl:variable name="user-name" select="//ProfilePage/user/username"/> <!-- Some users do not specify the full name --> @@ -51,7 +47,7 @@ </xsl:choose> </xsl:variable> - <xsl:template match="//media/nodes"> + <xsl:template match="//ProfilePage/user/media/nodes"> <xsl:variable name="item-content-image" select="./display_src"/> <xsl:variable name="item-content-caption" select="./caption"/> <xsl:variable name="item-permalink" select="concat($BaseURL, '/p/', ./code, '/')"/> @@ -79,28 +75,28 @@ </guid> <pubDate> <xsl:variable name="timestamp" select="./date"/> - <xsl:value-of select="php:functionString('Tweeper::epoch_to_gmdate', number($timestamp))"/> + <xsl:value-of select="php:functionString('Tweeper::epochToRssDate', $timestamp)"/> </pubDate> <description> <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> <p> <xsl:if test="./is_video/text() = 1"> - (Video) + (Video) </xsl:if> <xsl:value-of select="$item-content-caption"/> </p><br /> - <a href="{$item-permalink}"><img src="{$item-content-image}" /></a> + <a href="{$item-permalink}"><img src="{$item-content-image}" style="max-width: 100%"/></a> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> - <xsl:if test="$generateEnclosure = 1"> - <xsl:apply-templates select="./display_src"/> + <xsl:if test="$generate-enclosure = 1"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $item-content-image)"/> </xsl:if> </item> </xsl:template> <xsl:template match="/"> <xsl:variable name="channel-title" select="concat('Instagram / ', $screen-name)"/> - <xsl:variable name="channel-link" select="concat($BaseURL, //__path)"/> + <xsl:variable name="channel-link" select="concat($BaseURL, '/', $user-name)"/> <rss version="2.0"> <xsl:attribute 
name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute> @@ -129,10 +125,10 @@ <xsl:value-of select="$channel-link"/> </link> <url> - <xsl:value-of select="//user/profile_pic_url"/> + <xsl:value-of select="//ProfilePage/user/profile_pic_url"/> </url> </image> - <xsl:apply-templates select="//media/nodes"/> + <xsl:apply-templates select="//ProfilePage/user/media/nodes"/> </channel> </rss> </xsl:template> diff --git a/rss_converter_pump.io.xsl b/rss_converter_pump.io.xsl index ed99713..1577dcf 100644 --- a/rss_converter_pump.io.xsl +++ b/rss_converter_pump.io.xsl @@ -46,7 +46,7 @@ <xsl:value-of select="$item-permalink"/> </guid> <pubDate> - <xsl:value-of select="php:functionString('Tweeper::str_to_gmdate', .//abbr[@class='easydate']/@title)"/> + <xsl:value-of select="php:functionString('Tweeper::strToRssDate', .//abbr[@class='easydate']/@title)"/> </pubDate> <description> <xsl:value-of select="concat($user-name, ': ')"/> @@ -54,6 +54,13 @@ <xsl:copy-of select="$item-content/node()"/> <xsl:text disable-output-escaping="yes">]]></xsl:text> </description> + <xsl:if test="$generate-enclosure = 1"> + <xsl:variable name="image-thumb-link" select=".//img[contains(@class, 'object-image')]/@src"/> + <xsl:if test="$image-thumb-link"> + <xsl:variable name="image-link" select="php:functionString('str_replace', '_thumb', '', $image-thumb-link)"/> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', $image-link)"/> + </xsl:if> + </xsl:if> </item> </xsl:template> diff --git a/rss_converter_twitter.com.xsl b/rss_converter_twitter.com.xsl index 15507f0..c154141 100644 --- a/rss_converter_twitter.com.xsl +++ b/rss_converter_twitter.com.xsl @@ -24,7 +24,7 @@ xsl:extension-element-prefixes="php" exclude-result-prefixes="php"> - <xsl:param name="generateEnclosure"/> + <xsl:param name="generate-enclosure"/> <xsl:output method="xml" indent="yes"/> @@ -32,8 +32,83 @@ <xsl:text>https://twitter.com</xsl:text> </xsl:variable> + <!-- Identity transform --> + 
<xsl:template match="@*|node()"> + <xsl:copy> + <xsl:apply-templates select="@*|node()"/> + </xsl:copy> + </xsl:template> + + <!-- + Anchors to external links provide the direct URL in the + data-expanded-url attribute, so use this in the href attribute too + instead of the default short URL which uses the t.co redirection + service. + + NOTE: when creating an element, attributes must be processed _before_ + adding the contents (either children or a value): + http://stackoverflow.com/questions/21984867/ + --> <xsl:template match="a[@data-expanded-url]"> - <xsl:value-of disable-output-escaping="yes" select="php:function('Tweeper::generate_enclosure', string(./@data-expanded-url))"/> + <!-- Prepend and append a white space for aestethic reasons --> + <xsl:text> </xsl:text> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="@data-expanded-url"/> + </xsl:attribute> + <!-- Also strip   and … --> + <xsl:value-of select="translate(., ' …', '')"/> + </a> + <xsl:text> </xsl:text> + </xsl:template> + + <!-- + These are links to pic.twitter.com, use the direct link for those + too instead of the t.co redirections. 
+ --> + <xsl:template match="a[@data-pre-embedded='true']"> + <!-- Prepend and append a white space for aestethic reasons --> + <xsl:text> </xsl:text> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="concat('https://', .)"/> + </xsl:attribute> + <xsl:value-of select="concat('https://', .)"/> + </a> + <xsl:text> </xsl:text> + </xsl:template> + + <!-- Present images in a more convenient way --> + <xsl:template match="div[@data-image-url]"> + <a> + <xsl:attribute name="href"> + <xsl:value-of select="concat(@data-image-url, ':orig')"/> + </xsl:attribute> + <img> + <xsl:attribute name="src"> + <xsl:value-of select="@data-image-url"/> + </xsl:attribute> + </img> + </a> + </xsl:template> + + <!-- Don't repeat background in embedded media content --> + <xsl:template match="div[contains(@class, 'PlayableMedia-player')]"> + <xsl:copy> + <xsl:apply-templates select="@*"/> + <xsl:attribute name="style"> + <xsl:value-of select="concat(@style, '; background-repeat: no-repeat')"/> + </xsl:attribute> + <xsl:apply-templates select="node()"/> + </xsl:copy> + </xsl:template> + + <xsl:template match="a[@data-expanded-url]" mode="enclosure"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', ./@data-expanded-url)"/> + </xsl:template> + + <xsl:template match="div[@data-image-url]" mode="enclosure"> + <xsl:copy-of select="php:functionString('Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/> </xsl:template> <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/> @@ -41,10 +116,26 @@ <xsl:template match="//li[@data-item-id and @data-item-type='tweet']"> <xsl:variable name="user-name" select=".//div[contains(@class, 'js-stream-tweet')]/@data-screen-name"/> <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/> + <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/> <xsl:variable name="item-permalink" 
select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/> + + <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/> <item> <title> - <xsl:value-of select="concat($user-name, ': ', $item-content)"/> + <xsl:value-of select="concat($user-name, ': ')"/> + <xsl:if test="$item-has-video"> + <xsl:text>(Video) </xsl:text> + </xsl:if> + <!-- + Prepend a space in front of the URLs which are not + preceded by an open parenthesis, for aestethic reasons. + Also, regex, I know: http://xkcd.com/1171/ + --> + <xsl:variable + name="processed-title" + select="php:functionString('preg_replace', '@((?<!\()(?:http[s]?://|pic.twitter.com))@', ' \1', $item-content)"/> + <!-- Also strip   and … --> + <xsl:value-of select="normalize-space(translate($processed-title, ' …', ''))"/> @@ -54,16 +145,21 @@ - + <![CDATA[ - + + (Video) + + + ]]> - - + + + @@ -102,7 +198,7 @@ - + diff --git a/tweeper b/tweeper index a13752f..6256e20 100755 --- a/tweeper +++ b/tweeper @@ -1,4 +1,9 @@ #!/usr/bin/env php +Main web site: COPYING diff --git a/tweeper.php b/tweeper.php index bed6350..94ea05f 100644 --- a/tweeper.php +++ b/tweeper.php @@ -1,6 +1,7 @@ * @@ -26,44 +27,66 @@ use Symfony\Component\Serializer\Normalizer\ObjectNormalizer; date_default_timezone_set('UTC'); +/** + * Scrape supported websites and perform conversion to RSS. + */ class Tweeper { - private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; + private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; + /** + * Constructor sets up {@link $generate_enclosure}. + */ public function __construct($generate_enclosure = FALSE) { $this->generate_enclosure = $generate_enclosure; } - public static function epoch_to_gmdate($timestamp) - { + /** + * Convert numeric Epoch to the date format expected in a RSS document. 
+ */ + public static function epochToRssDate($timestamp) { if (!is_numeric($timestamp) || is_nan($timestamp)) { $timestamp = 0; } - return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT'; + return gmdate(DATE_RSS, $timestamp); } - public static function str_to_gmdate($date) - { + /** + * Convert generic date string to the date format expected in a RSS document. + */ + public static function strToRssDate($date) { $timestamp = strtotime($date); if (FALSE === $timestamp) { $timestamp = 0; } - return Tweeper::epoch_to_gmdate($timestamp); + return Tweeper::epochToRssDate($timestamp); + } + + /** + * Convert string to UpperCamelCase. + */ + public static function toUpperCamelCase($str, $delim = ' ') { + $str_upper = ucwords($str, $delim); + $str_camel_case = str_replace($delim, '', $str_upper); + return $str_camel_case; } - private static function get_contents($url) - { + /** + * Get the contents from a URL. + */ + private static function getUrlContents($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => FALSE, - CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL + // Follow http redirects to get the real URL. + CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_HTTPHEADER => array('Accept-language: en'), - CURLOPT_USERAGENT => Tweeper::$USER_AGENT, + CURLOPT_USERAGENT => Tweeper::$userAgent, )); $contents = curl_exec($ch); curl_close($ch); @@ -71,17 +94,20 @@ class Tweeper { return $contents; } - private static function get_info($url) - { + /** + * Get the headers from a URL. + */ + private static function getUrlInfo($url) { $ch = curl_init($url); curl_setopt_array($ch, array( CURLOPT_HEADER => TRUE, CURLOPT_NOBODY => TRUE, - CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL + // Follow http redirects to get the real URL. 
+ CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_SSL_VERIFYHOST => FALSE, CURLOPT_SSL_VERIFYPEER => FALSE, - CURLOPT_USERAGENT => Tweeper::$USER_AGENT, + CURLOPT_USERAGENT => Tweeper::$userAgent, )); curl_exec($ch); $url_info = curl_getinfo($ch); @@ -90,8 +116,10 @@ class Tweeper { return $url_info; } - public static function generate_enclosure($url) - { + /** + * Generate an RSS element. + */ + public static function generateEnclosure($url) { $supported_content_types = array( "application/ogg", "audio/aac", @@ -104,17 +132,14 @@ class Tweeper { "audio/x-midi", "image/gif", "image/jpeg", + "image/png", "video/avi", "video/mp4", "video/mpeg", "video/ogg", ); - // The RSS specification says that the enclosure element url must be http. - // See http://sourceforge.net/p/feedvalidator/bugs/72/ - $http_url = preg_replace("/^https/", "http", $url); - - $url_info = Tweeper::get_info($http_url); + $url_info = Tweeper::getUrlInfo($url); $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { @@ -122,78 +147,61 @@ class Tweeper { return ''; } - $dom = new DomDocument(); + // The RSS specification says that the enclosure element URL must be http. 
+ // See http://sourceforge.net/p/feedvalidator/bugs/72/ + $http_url = preg_replace("/^https/", "http", $url_info['url']); + + $dom = new DOMDocument(); $enc = $dom->createElement('enclosure'); - $enc->setAttribute('url', $url_info['url']); + $enc->setAttribute('url', $http_url); $enc->setAttribute('length', $url_info['download_content_length']); $enc->setAttribute('type', $url_info['content_type']); - $dom->appendChild($enc); - - return $dom->saveXML($enc); + return $enc; } - /* Mimic the message from libxml.c::php_libxml_ctx_error_level() */ - private function log_xml_error($error) { + /** + * Mimic the message from libxml.c::php_libxml_ctx_error_level() + */ + private static function logXmlError($error) { $output = ""; switch ($error->level) { - case LIBXML_ERR_WARNING: - $output .= "Warning $error->code: "; - break; - case LIBXML_ERR_ERROR: - $output .= "Error $error->code: "; - break; - case LIBXML_ERR_FATAL: - $output .= "Fatal Error $error->code: "; - break; + case LIBXML_ERR_WARNING: + $output .= "Warning $error->code: "; + break; + + case LIBXML_ERR_ERROR: + $output .= "Error $error->code: "; + break; + + case LIBXML_ERR_FATAL: + $output .= "Fatal Error $error->code: "; + break; } $output .= trim($error->message); if ($error->file) { $output .= " in $error->file"; - } else { + } + else { $output .= " in Entity,"; } - $output .=" line $error->line"; + $output .= " line $error->line"; error_log($output); } - private function load_stylesheet($host) { - $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . 
".xsl"; - if (FALSE === file_exists($stylesheet)) { - trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); - return NULL; - } - - $stylesheet_contents = $this->get_contents($stylesheet); - - $xslDoc = new DOMDocument(); - $xslDoc->loadXML($stylesheet_contents); - - $xsltProcessor = new XSLTProcessor(); - $xsltProcessor->registerPHPFunctions(); - $xsltProcessor->setParameter('', 'generateEnclosure', $this->generate_enclosure); - $xsltProcessor->importStylesheet($xslDoc); - - return $xsltProcessor; - } - - private function json_to_xml($html, $json_match_expr, $rootName) { - // pre-process, convert json to XML - $ret = preg_match($json_match_expr, $html, $matches); - if ($ret !== 1) { - trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); - return NULL; - } - - // Apparenty the ObjectNormalizer used afterwards is not able to handle + /** + * Convert json to XML. + */ + private static function jsonToXml($json, $root_node_name) { + // Apparently the ObjectNormalizer used afterwards is not able to handle // the stdClass object created by json_decode() with the default setting - // $assoc = false; so use $assoc = true - $data = json_decode($matches[1], $assoc = true); + // $assoc = false; so use $assoc = true. 
+ $data = json_decode($json, $assoc = TRUE); if (!$data) { return NULL; } @@ -202,10 +210,10 @@ class Tweeper { $normalizer = new ObjectNormalizer(); $serializer = new Serializer(array($normalizer), array($encoder)); - $serializer_options = array ( + $serializer_options = array( 'xml_encoding' => "UTF-8", 'xml_format_output' => TRUE, - 'xml_root_node_name' => $rootName, + 'xml_root_node_name' => $root_node_name, ); $xml_data = $serializer->serialize($data, 'xml', $serializer_options); @@ -217,33 +225,51 @@ class Tweeper { return $xml_data; } - private function get_xml_instagram_com($html) { - return $this->json_to_xml($html, '/window._sharedData = (.*);/', 'instagram'); + /** + * Convert the Instagram content to XML. + */ + private function getXmlInstagramCom($html) { + // Extract the json data from the html code. + $json_match_expr = '/window._sharedData = (.*);/'; + $ret = preg_match($json_match_expr, $html, $matches); + if ($ret !== 1) { + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + return NULL; + } + + return Tweeper::jsonToXml($matches[1], 'instagram'); } - private function preprocess_html_facebook_com($html) { + /** + * Make the Facebook HTML processable. + */ + private function preprocessHtmlFacebookCom($html) { $html = str_replace('', '', $html); return $html; } - private function html_to_xml($html, $host) { + /** + * Convert the HTML retrieved from the site to XML. + */ + private function htmlToXml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. - $xml_errors_value = libxml_use_internal_errors(true); + $xml_errors_value = libxml_use_internal_errors(TRUE); - // If there is a host-specific method to get the xml data, use it! - $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host); + // If there is a host-specific method to get the XML data, use it! + $get_xml_host_method = 'getXml' . 
Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $get_xml_host_method)) { $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); $xmlDoc->loadXML($xml_data); - } else { + } + else { $xmlDoc->loadHTML($html); } foreach (libxml_get_errors() as $xml_error) { - $this->log_xml_error($xml_error); + Tweeper::logXmlError($xml_error); } libxml_clear_errors(); libxml_use_internal_errors($xml_errors_value); @@ -251,32 +277,58 @@ class Tweeper { return $xmlDoc; } + /** + * Load a stylesheet if the web site is supported. + */ + private function loadStylesheet($host) { + $stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl"; + if (FALSE === file_exists($stylesheet)) { + trigger_error("Conversion to RSS not supported for $host ($stylesheet not found)", E_USER_ERROR); + return NULL; + } + + $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + + $xslDoc = new DOMDocument(); + $xslDoc->loadXML($stylesheet_contents); + + $xsltProcessor = new XSLTProcessor(); + $xsltProcessor->registerPHPFunctions(); + $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); + $xsltProcessor->importStylesheet($xslDoc); + + return $xsltProcessor; + } + + /** + * Convert the site content to RSS. + */ public function tweep($src_url) { $url = parse_url($src_url); if (FALSE === $url || empty($url["host"])) { - trigger_error("Invalid url: $src_url", E_USER_ERROR); + trigger_error("Invalid URL: $src_url", E_USER_ERROR); return NULL; } - // Strip the leading www. to be more forgiving on input URLs + // Strip the leading www. to be more forgiving on input URLs. 
$host = preg_replace('/^www\./', '', $url["host"]); - $xsltProcessor = $this->load_stylesheet($host); + $xsltProcessor = $this->loadStylesheet($host); if (NULL === $xsltProcessor) { return NULL; } - $html = $this->get_contents($src_url); + $html = Tweeper::getUrlContents($src_url); if (FALSE === $html) { return NULL; } - $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host); + $preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.'); if (method_exists($this, $preprocess_html_host_method)) { $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); } - $xmlDoc = $this->html_to_xml($html, $host); + $xmlDoc = $this->htmlToXml($html, $host); if (NULL === $xmlDoc) { return NULL; } @@ -289,46 +341,57 @@ class Tweeper { } return $output; } + } -function is_cli() -{ +/** + * Check if the script is being run from the command line. + */ +function is_cli() { return (php_sapi_name() === "cli"); } -function usage($argv) -{ +/** + * Show the script usage. + */ +function usage($argv) { if (is_cli()) { $usage = "{$argv[0]} [-e|-h|--help] \n"; - } else { + } + else { $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=&generate_enclosure=<0|1>"); } return "usage: $usage"; } -function parse_options_cli($argv, $argc) -{ +/** + * Parse command line options. 
+ */ +function parse_options_cli($argv, $argc) { $options = array( - 'generate_enclosure' => FALSE + 'generate_enclosure' => FALSE, ); - if ($argc < 2) + if ($argc < 2) { return $options; + } $cli_options = getopt("eh", array("help")); foreach ($cli_options as $opt => $val) { switch ($opt) { - case 'e': - $options['generate_enclosure'] = TRUE; - break; - case 'h': - case 'help': - echo usage($argv); - exit(0); - default: - fwrite(STDERR, usage($argv)); - exit(1); + case 'e': + $options['generate_enclosure'] = TRUE; + break; + + case 'h': + case 'help': + echo usage($argv); + exit(0); + + default: + fwrite(STDERR, usage($argv)); + exit(1); } } @@ -337,32 +400,36 @@ function parse_options_cli($argv, $argc) return $options; } -function parse_options_query_string() -{ +/** + * Parse options passed from a query string. + */ +function parse_options_query_string() { $options = array( - 'generate_enclosure' => FALSE + 'generate_enclosure' => FALSE, ); - if (isset($_GET['src_url'])) + if (isset($_GET['src_url'])) { $options['src_url'] = $_GET['src_url']; + } - if (isset($_GET['generate_enclosure'])) + if (isset($_GET['generate_enclosure'])) { $options['generate_enclosure'] = $_GET['generate_enclosure'] == 1; + } return $options; } - if (is_cli()) { $options = parse_options_cli($argv, $argc); - $ERROR_STREAM = fopen('php://stderr', 'w'); -} else { + $error_stream = fopen('php://stderr', 'w'); +} +else { $options = parse_options_query_string(); - $ERROR_STREAM = fopen('php://output', 'w'); + $error_stream = fopen('php://output', 'w'); } if (!isset($options['src_url'])) { - fwrite($ERROR_STREAM, usage(is_cli() ? $argv : NULL)); + fwrite($error_stream, usage(is_cli() ? $argv : NULL)); exit(1); }