<?php
-/*
- * tweeper - a Twitter to RSS web scraper
+/**
+ * @file
+ * Tweeper - a Twitter to RSS web scraper.
*
* Copyright (C) 2013-2015 Antonio Ospite <ao2@ao2.it>
*
date_default_timezone_set('UTC');
+/**
+ * Scrape supported websites and perform conversion to RSS.
+ */
class Tweeper {
private static $USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
+ /**
+ * Constructor sets up {@link $generate_enclosure}.
+ */
public function __construct($generate_enclosure = FALSE) {
$this->generate_enclosure = $generate_enclosure;
}
+ /**
+ * Convert numeric Epoch to the date format expected in a RSS document.
+ */
public static function epoch_to_gmdate($timestamp) {
if (!is_numeric($timestamp) || is_nan($timestamp)) {
$timestamp = 0;
return gmdate('D, d M Y H:i:s', $timestamp) . ' GMT';
}
+ /**
+ * Convert generic date string to the date format expected in a RSS document.
+ */
public static function str_to_gmdate($date) {
$timestamp = strtotime($date);
if (FALSE === $timestamp) {
return Tweeper::epoch_to_gmdate($timestamp);
}
+ /**
+ * Get the contents from a URL.
+ */
private static function get_contents($url) {
$ch = curl_init($url);
curl_setopt_array($ch, array(
CURLOPT_HEADER => FALSE,
- CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL
+ // Follow http redirects to get the real URL.
+ CURLOPT_FOLLOWLOCATION => TRUE,
CURLOPT_RETURNTRANSFER => TRUE,
CURLOPT_SSL_VERIFYHOST => FALSE,
CURLOPT_SSL_VERIFYPEER => FALSE,
return $contents;
}
+ /**
+ * Get the headers from a URL.
+ */
private static function get_info($url) {
$ch = curl_init($url);
curl_setopt_array($ch, array(
CURLOPT_HEADER => TRUE,
CURLOPT_NOBODY => TRUE,
- CURLOPT_FOLLOWLOCATION => TRUE, // follow http redirects to get the real URL
+ // Follow http redirects to get the real URL.
+ CURLOPT_FOLLOWLOCATION => TRUE,
CURLOPT_RETURNTRANSFER => TRUE,
CURLOPT_SSL_VERIFYHOST => FALSE,
CURLOPT_SSL_VERIFYPEER => FALSE,
return $url_info;
}
+ /**
+ * Generate an RSS <enclosure/> element.
+ */
public static function generate_enclosure($url) {
$supported_content_types = array(
"application/ogg",
return $dom->saveXML($enc);
}
- /* Mimic the message from libxml.c::php_libxml_ctx_error_level() */
+ /**
+ * Mimic the message from libxml.c::php_libxml_ctx_error_level().
+ */
private function log_xml_error($error) {
$output = "";
switch ($error->level) {
- case LIBXML_ERR_WARNING:
- $output .= "Warning $error->code: ";
- break;
- case LIBXML_ERR_ERROR:
- $output .= "Error $error->code: ";
- break;
- case LIBXML_ERR_FATAL:
- $output .= "Fatal Error $error->code: ";
- break;
+ case LIBXML_ERR_WARNING:
+ $output .= "Warning $error->code: ";
+ break;
+
+ case LIBXML_ERR_ERROR:
+ $output .= "Error $error->code: ";
+ break;
+
+ case LIBXML_ERR_FATAL:
+ $output .= "Fatal Error $error->code: ";
+ break;
}
$output .= trim($error->message);
if ($error->file) {
$output .= " in $error->file";
- } else {
+ }
+ else {
$output .= " in Entity,";
}
- $output .=" line $error->line";
+ $output .= " line $error->line";
error_log($output);
}
+ /**
+ * Load a stylesheet if the web site is supported.
+ */
private function load_stylesheet($host) {
$stylesheet = "file://" . __DIR__ . "/rss_converter_" . $host . ".xsl";
if (FALSE === file_exists($stylesheet)) {
return $xsltProcessor;
}
+ /**
+ * Convert json to xml.
+ */
private function json_to_xml($json, $root_node_name) {
// Apparently the ObjectNormalizer used afterwards is not able to handle
// the stdClass object created by json_decode() with the default setting
- // $assoc = false; so use $assoc = true
- $data = json_decode($json, $assoc = true);
+ // $assoc = false; so use $assoc = true.
+ $data = json_decode($json, $assoc = TRUE);
if (!$data) {
return NULL;
}
$normalizer = new ObjectNormalizer();
$serializer = new Serializer(array($normalizer), array($encoder));
- $serializer_options = array (
+ $serializer_options = array(
'xml_encoding' => "UTF-8",
'xml_format_output' => TRUE,
'xml_root_node_name' => $root_node_name,
return $xml_data;
}
+ /**
+ * Convert the Instagram content to XML.
+ */
private function get_xml_instagram_com($html) {
- // extract the json data from the html code
+ // Extract the json data from the html code.
$json_match_expr = '/window._sharedData = (.*);/';
$ret = preg_match($json_match_expr, $html, $matches);
if ($ret !== 1) {
return $this->json_to_xml($matches[1], 'instagram');
}
+ /**
+ * Make the Facebook HTML processable.
+ */
private function preprocess_html_facebook_com($html) {
  // Drop the HTML comment delimiters while keeping whatever text was between
  // them. The search strings are applied in order: opening markers first,
  // then closing markers on the intermediate result.
  return str_replace(array('<!--', '-->'), '', $html);
}
+ /**
+ * Convert the HTML retrieved from the site to XML.
+ */
private function html_to_xml($html, $host) {
$xmlDoc = new DOMDocument();
// Handle warnings and errors when loading invalid HTML.
- $xml_errors_value = libxml_use_internal_errors(true);
+ $xml_errors_value = libxml_use_internal_errors(TRUE);
// If there is a host-specific method to get the xml data, use it!
$get_xml_host_method = 'get_xml_' . str_replace(".", "_", $host);
if (method_exists($this, $get_xml_host_method)) {
$xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
$xmlDoc->loadXML($xml_data);
- } else {
+ }
+ else {
$xmlDoc->loadHTML($html);
}
return $xmlDoc;
}
+ /**
+ * Convert the site content to RSS.
+ */
public function tweep($src_url) {
$url = parse_url($src_url);
if (FALSE === $url || empty($url["host"])) {
return NULL;
}
- // Strip the leading www. to be more forgiving on input URLs
+ // Strip the leading www. to be more forgiving on input URLs.
$host = preg_replace('/^www\./', '', $url["host"]);
$xsltProcessor = $this->load_stylesheet($host);
}
return $output;
}
+
}
+/**
+ * Check if the script is being run from the command line.
+ */
function is_cli() {
  // PHP_SAPI holds the same value php_sapi_name() reports: the name of the
  // interface between PHP and its host (e.g. "cli" for the command line).
  return PHP_SAPI === "cli";
}
+/**
+ * Show the script usage.
+ */
function usage($argv) {
if (is_cli()) {
$usage = "{$argv[0]} [-e|-h|--help] <src_url>\n";
- } else {
+ }
+ else {
$usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>");
}
return "usage: $usage";
}
+/**
+ * Parse command line options.
+ */
function parse_options_cli($argv, $argc) {
$options = array(
- 'generate_enclosure' => FALSE
+ 'generate_enclosure' => FALSE,
);
- if ($argc < 2)
+ if ($argc < 2) {
return $options;
+ }
$cli_options = getopt("eh", array("help"));
foreach ($cli_options as $opt => $val) {
switch ($opt) {
- case 'e':
- $options['generate_enclosure'] = TRUE;
- break;
- case 'h':
- case 'help':
- echo usage($argv);
- exit(0);
- default:
- fwrite(STDERR, usage($argv));
- exit(1);
+ case 'e':
+ $options['generate_enclosure'] = TRUE;
+ break;
+
+ case 'h':
+ case 'help':
+ echo usage($argv);
+ exit(0);
+
+ default:
+ fwrite(STDERR, usage($argv));
+ exit(1);
}
}
return $options;
}
+/**
+ * Parse options passed from a query string.
+ */
function parse_options_query_string() {
$options = array(
- 'generate_enclosure' => FALSE
+ 'generate_enclosure' => FALSE,
);
- if (isset($_GET['src_url']))
+ if (isset($_GET['src_url'])) {
$options['src_url'] = $_GET['src_url'];
+ }
- if (isset($_GET['generate_enclosure']))
+ if (isset($_GET['generate_enclosure'])) {
$options['generate_enclosure'] = $_GET['generate_enclosure'] == 1;
+ }
return $options;
}
-
if (is_cli()) {
$options = parse_options_cli($argv, $argc);
$ERROR_STREAM = fopen('php://stderr', 'w');
-} else {
+}
+else {
$options = parse_options_query_string();
$ERROR_STREAM = fopen('php://output', 'w');
}