From: Antonio Ospite Date: Sat, 28 Feb 2015 00:50:21 +0000 (+0100) Subject: tweeper.php: add infrastructure for sites using json data X-Git-Tag: v0.4~34 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/e053ac2a08c18296cdfe329e385a8d7a6c6fa962?hp=547175cc33bb2aa612c706eac783d8f506fe9e7e tweeper.php: add infrastructure for sites using json data Some websites provide the timelines as json data and convert that to html in the client. Supporting these sites will be done by defining host specific methods to extract the json data to pass to json_to_xml(). --- diff --git a/INSTALL b/INSTALL index a2e602d..d575bb1 100644 --- a/INSTALL +++ b/INSTALL @@ -1,3 +1,6 @@ The recommended way to install tweeper globally is to install all its files under /usr/share/php/tweeper and then make a symlink to the wrapper script "tweeper" under /usr/bin + +Tweeper depends on php-xml-serializer which is used to convert json to xml for +some sites that provide the timeline data in json rather than in usable html. diff --git a/tweeper.php b/tweeper.php index d084398..9f09936 100644 --- a/tweeper.php +++ b/tweeper.php @@ -18,6 +18,8 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ +require_once 'XML/Serializer.php'; + date_default_timezone_set('UTC'); class Tweeper { @@ -168,12 +170,52 @@ class Tweeper { return $xsltProcessor; } - private function html_to_xml($html) { + private function json_to_xml($html, $json_match_expr, $rootName) { + // pre-process, convert json to XML + $ret = preg_match($json_match_expr, $html, $matches); + if ($ret !== 1) { + trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR); + return NULL; + } + + $data = json_decode($matches[1]); + if (!$data) { + return NULL; + } + + $serializer_options = array ( + 'addDecl' => TRUE, + 'encoding' => "UTF-8", + 'indent' => ' ', + 'rootName' => $rootName, + ); + + $serializer = new XML_Serializer($serializer_options); + + $status = $serializer->serialize($data); + if (PEAR::isError($status)) { + trigger_error($status->getMessage(), E_USER_ERROR); + return NULL; + } + + return $serializer->getSerializedData(); + } + + private function html_to_xml($html, $host) { $xmlDoc = new DOMDocument(); // Handle warnings and errors when loading invalid HTML. $xml_errors_value = libxml_use_internal_errors(true); - $xmlDoc->loadHTML($html); + + // If there is a host-specific method to get the xml data, use it! + $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host); + if (method_exists($this, $get_xml_host_method)) { + $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html)); + $xmlDoc->loadXML($xml_data); + } else { + $xmlDoc->loadHTML($html); + } + foreach (libxml_get_errors() as $xml_error) { $this->log_xml_error($xml_error); } @@ -200,7 +242,7 @@ class Tweeper { return NULL; } - $xmlDoc = $this->html_to_xml($html); + $xmlDoc = $this->html_to_xml($html, $url["host"]); if (NULL === $xmlDoc) { return NULL; }