From 7097a8ad2ef040bc81a8c5f7ed7cc02e0073eaab Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Wed, 1 Jul 2015 13:37:57 +0200 Subject: [PATCH] tweeper.php: support host-specific methods for preprocessing the HTML data Some sites serve mangled HTML code, so a mechanism to clean it up before loading it as XML is needed. For instance, facebook.com puts some content inside HTML comments, and these must be stripped in order to make the content available to the HTML parser when loading the data into a DOMDocument. --- tweeper.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tweeper.php b/tweeper.php index 2444a39..efc0fd6 100644 --- a/tweeper.php +++ b/tweeper.php @@ -257,6 +257,11 @@ class Tweeper { return NULL; } + $preprocess_html_host_method = 'preprocess_html_' . str_replace(".", "_", $host); + if (method_exists($this, $preprocess_html_host_method)) { + $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + } + $xmlDoc = $this->html_to_xml($html, $host); if (NULL === $xmlDoc) { return NULL; -- 2.1.4