From e053ac2a08c18296cdfe329e385a8d7a6c6fa962 Mon Sep 17 00:00:00 2001
From: Antonio Ospite <ao2@ao2.it>
Date: Sat, 28 Feb 2015 01:50:21 +0100
Subject: [PATCH] tweeper.php: add infrastructure for sites using json data

Some websites provide the timelines as json data and convert that to
html in the client.

Supporting these sites will be done by defining host-specific methods, named
get_xml_<host> with the dots in the host name replaced by underscores, to
extract the json data to pass to json_to_xml().
---
 INSTALL     |  3 +++
 tweeper.php | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/INSTALL b/INSTALL
index a2e602d..d575bb1 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,3 +1,6 @@
 The recommended way to install tweeper globally is to install all its files
 under /usr/share/php/tweeper and then make a symlink to the wrapper script
 "tweeper" under /usr/bin
+
+Tweeper depends on php-xml-serializer, which is used to convert JSON to XML for
+some sites that provide the timeline data in JSON rather than in usable HTML.
diff --git a/tweeper.php b/tweeper.php
index d084398..9f09936 100644
--- a/tweeper.php
+++ b/tweeper.php
@@ -18,6 +18,8 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+require_once 'XML/Serializer.php';
+
 date_default_timezone_set('UTC');
 
 class Tweeper {
@@ -168,12 +170,52 @@ class Tweeper {
     return $xsltProcessor;
   }
 
-  private function html_to_xml($html) {
+  // Extract the json captured by group 1 of $json_match_expr from $html and
+  // serialize it to an XML string rooted at $rootName; returns NULL on failure.
+  private function json_to_xml($html, $json_match_expr, $rootName) {
+    $ret = preg_match($json_match_expr, $html, $matches);
+    if ($ret !== 1) {
+      trigger_error("Cannot match expression: $json_match_expr\n", E_USER_ERROR);
+      return NULL;
+    }
+
+    // json_decode() yields NULL on failure; falsy-but-valid data (e.g. an empty
+    // array) is still serialized. Warn rather than bail out without any trace.
+    $data = json_decode($matches[1]);
+    if ($data === NULL) {
+      trigger_error("Cannot decode json data\n", E_USER_WARNING);
+      return NULL;
+    }
+
+    $serializer = new XML_Serializer(array(
+      'addDecl' => TRUE,
+      'encoding' => "UTF-8",
+      'indent' => '  ',
+      'rootName' => $rootName,
+    ));
+    $status = $serializer->serialize($data);
+    if (PEAR::isError($status)) {
+      trigger_error($status->getMessage(), E_USER_ERROR);
+      return NULL;
+    }
+    return $serializer->getSerializedData();
+  }
+
+  private function html_to_xml($html, $host) {
     $xmlDoc = new DOMDocument();
 
     // Handle warnings and errors when loading invalid HTML.
     $xml_errors_value = libxml_use_internal_errors(true);
-    $xmlDoc->loadHTML($html);
+
+    // If there is a host-specific method to get the xml data, use it!
+    $get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host);
+    if (method_exists($this, $get_xml_host_method)) {
+      $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
+      $xmlDoc->loadXML($xml_data);
+    } else {
+      $xmlDoc->loadHTML($html);
+    }
+
     foreach (libxml_get_errors() as $xml_error) {
       $this->log_xml_error($xml_error);
     }
@@ -200,7 +242,7 @@ class Tweeper {
       return NULL;
     }
 
-    $xmlDoc = $this->html_to_xml($html);
+    $xmlDoc = $this->html_to_xml($html, $url["host"]);
     if (NULL === $xmlDoc) {
       return NULL;
     }
-- 
2.1.4