X-Git-Url: https://git.ao2.it/tweeper.git/blobdiff_plain/25c3e276f01838c8fa55e9ca59f01d2707392988..5333d3c8ec110a4349dfc3b56168a157afc70082:/src/Tweeper.php diff --git a/src/Tweeper.php b/src/Tweeper.php index 93ac9e0..22d16ab 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -6,7 +6,7 @@ namespace Tweeper; * @file * Tweeper - a Twitter to RSS web scraper. * - * Copyright (C) 2013-2016 Antonio Ospite + * Copyright (C) 2013-2018 Antonio Ospite * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,13 +36,21 @@ date_default_timezone_set('UTC'); */ class Tweeper { - private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; + private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"; /** - * Constructor sets up {@link $generate_enclosure}. + * Create a new Tweeper object controlling optional settings. + * + * @param bool $generate_enclosure + * Enables the creation of enclosure elements (disabled by default). + * @param bool $show_usernames + * Enables showing the username in front of the content for multi-user + * sites (enabled by default). Only some stylesheets support this + * functionality (twitter, instagram, pump.io). 
*/ - public function __construct($generate_enclosure = FALSE) { + public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE) { $this->generate_enclosure = $generate_enclosure; + $this->show_usernames = $show_usernames; } /** @@ -116,7 +124,14 @@ class Tweeper { CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_USERAGENT => Tweeper::$userAgent, )); - curl_exec($ch); + + $ret = curl_exec($ch); + if (FALSE === $ret) { + trigger_error(curl_error($ch)); + curl_close($ch); + return FALSE; + } + $url_info = curl_getinfo($ch); if (FALSE === $url_info) { trigger_error(curl_error($ch)); @@ -152,6 +167,10 @@ class Tweeper { ); $url_info = Tweeper::getUrlInfo($url); + if (FALSE === $url_info) { + error_log("Failed to retrieve info for URL: " . $url); + return ''; + } $supported = in_array($url_info['content_type'], $supported_content_types); if (!$supported) { @@ -249,7 +268,19 @@ class Tweeper { return NULL; } - return Tweeper::jsonToXml($matches[1], 'instagram'); + $data = json_decode($matches[1], $assoc = TRUE); + + // The "qe" object contains elements which will result in invalid XML + // element names, so remove it. + unset($data["qe"]); + + // The "knobs" object contains elements with undefined namespaces, so + // remove it to silence an error message. 
+ unset($data["knobs"]); + + $json = json_encode($data); + + return Tweeper::jsonToXml($json, 'instagram'); } /** @@ -300,6 +331,9 @@ class Tweeper { } $stylesheet_contents = Tweeper::getUrlContents($stylesheet); + if (FALSE === $stylesheet_contents) { + return NULL; + } $xslDoc = new DOMDocument(); $xslDoc->loadXML($stylesheet_contents); @@ -307,6 +341,7 @@ class Tweeper { $xsltProcessor = new XSLTProcessor(); $xsltProcessor->registerPHPFunctions(); $xsltProcessor->setParameter('', 'generate-enclosure', $this->generate_enclosure); + $xsltProcessor->setParameter('', 'show-usernames', $this->show_usernames); $xsltProcessor->importStylesheet($xslDoc); return $xsltProcessor; @@ -315,21 +350,30 @@ class Tweeper { /** * Convert the site content to RSS. */ - public function tweep($src_url) { + public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) { $url = parse_url($src_url); - if (FALSE === $url || empty($url["host"])) { + if (FALSE === $url) { trigger_error("Invalid URL: $src_url", E_USER_ERROR); return NULL; } - $scheme = $url["scheme"]; - if (!in_array($scheme, array("http", "https"))) { - trigger_error("unsupported scheme: $scheme", E_USER_ERROR); - return NULL; + if (TRUE === $validate_scheme) { + $scheme = $url["scheme"]; + if (!in_array($scheme, array("http", "https"))) { + trigger_error("unsupported scheme: $scheme", E_USER_ERROR); + return NULL; + } } - // Strip the leading www. to be more forgiving on input URLs. - $host = preg_replace('/^www\./', '', $url["host"]); + // if the host is not given derive it from the URL + if (NULL === $host) { + if (empty($url["host"])) { + trigger_error("Invalid host in URL: $src_url", E_USER_ERROR); + return NULL; + } + // Strip the leading www. to be more forgiving on input URLs. 
+ $host = preg_replace('/^www\./', '', $url["host"]); + } $xsltProcessor = $this->loadStylesheet($host); if (NULL === $xsltProcessor) { @@ -352,11 +396,11 @@ class Tweeper { } $output = $xsltProcessor->transformToXML($xmlDoc); - if (FALSE === $output) { trigger_error('XSL transformation failed.', E_USER_ERROR); return NULL; } + return $output; }