From e053ac2a08c18296cdfe329e385a8d7a6c6fa962 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 28 Feb 2015 01:50:21 +0100 Subject: [PATCH] tweeper.php: add infrastructure for sites using json data Some websites provide the timelines as json data and convert that to html in the client. Supporting these sites will be done by defining host specific methods to extract the json data to pass to json_to_xml(). --- INSTALL | 3 +++ tweeper.php | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/INSTALL b/INSTALL index a2e602d..d575bb1 100644 --- a/INSTALL +++ b/INSTALL @@ -1,3 +1,6 @@ The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin + +Tweeper depends on php-xml-serializer which is used to convert json to xml for +some sites that provide the timeline data in json rather than in usable html. diff --git a/tweeper.php b/tweeper.php index d084398..9f09936 100644 --- a/tweeper.php +++ b/tweeper.php @@ -18,6 +18,8 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ +require_once 'XML/Serializer.php'; + date_default_timezone_set('UTC'); class Tweeper { @@ -168,12 +170,52 @@ class Tweeper { return $xsltProcessor; } - private function html_to_xml($html) { + private function json_to_xml($html, $json_match_expr, $rootName) { + // pre-process, convert json to XML + $ret = preg_match($json_match_expr, $html, $matches); + if ($ret !== 1) { + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + return NULL; + } + + $data = json_decode($matches[1]); + if (!$data) { + return NULL; + } + + $serializer_options = array ( + 'addDecl' => TRUE, + 'encoding' => "UTF-8", + 'indent' => ' ', + 'rootName' => $rootName, + ); + + $serializer = new XML_Serializer($serializer_options); + + $status = $serializer->serialize($data); + if (PEAR::isError($status)) { + trigger_error($status->getMessage(), E_USER_ERROR); + return NULL; + } + + return $serializer->getSerializedData(); + } + + private function html_to_xml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(true); - $xmlDoc->loadHTML($html); + + // If there is a host-specific method to get the xml data, use it! + $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host); + if (method_exists($this, $get_xml_host_method)) { + $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xmlDoc->loadXML($xml_data); + } else { + $xmlDoc->loadHTML($html); + } + foreach (libxml_get_errors() as $xml_error) { $this->log_xml_error($xml_error); } @@ -200,7 +242,7 @@ class Tweeper { return NULL; } - $xmlDoc = $this->html_to_xml($html); + $xmlDoc = $this->html_to_xml($html, $url["host"]); if (NULL === $xmlDoc) { return NULL; } -- 2.1.4