+News for v1.4.2:
+================
+
+ * Add option to enable or disable showing verbose output
+ * Add back partial support for twitter.com using the old twitter mobile UI
+ * Misc fixes to code and documentation
+
News for v1.4.1:
================
- * Enable cookie handling in cURL to fix scraping twitter.com
- * Update User-Agent version to fix scraping hashtag pages on twitter.com
+ * Enable cookie handling in cURL to fix scraping twitter.com
+ * Update User-Agent version to fix scraping hashtag pages on twitter.com
News for v1.4.0:
================
* @file
* Tweeper - a Twitter to RSS web scraper.
*
- * Copyright (C) 2013-2018 Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2013-2020 Antonio Ospite <ao2@ao2.it>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*/
class Tweeper {
- private static $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0";
+ private static $userAgent = "Mozilla/5.0";
private static $maxConnectionTimeout = 5;
private static $maxConnectionRetries = 5;
* Enables showing multimedia content (images, videos) directly in the
* item description (enabled by default). Only some stylesheets supports
* this functionality (twitter, instagram, dilbert).
+ * @param bool $verbose_output
+ * Enables showing non-fatal errors like XML parsing errors.
*/
- public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE) {
+ public function __construct($generate_enclosure = FALSE, $show_usernames = TRUE, $show_multimedia = TRUE, $verbose_output = TRUE) {
$this->generate_enclosure = $generate_enclosure;
$this->show_usernames = $show_usernames;
$this->show_multimedia = $show_multimedia;
+ $this->verbose_output = $verbose_output;
}
/**
}
/**
+   * Convert Twitter mobile date to the date format expected in a RSS document.
+   *
+   * @param string $date
+   *   The timestamp text taken from the Twitter mobile UI: either a relative
+   *   age in minutes (digits followed by "m") or a date, optionally combined
+   *   with a time through a " - " separator.
+   *
+   * @return mixed
+   *   Whatever Tweeper::epochToRssDate() returns for the computed timestamp
+   *   (presumably the RSS date string). Timestamp 0 is used when the date
+   *   cannot be parsed.
+   */
+  public static function twitterToRssDate($date) {
+    // Twitter uses relative timestamps in minutes for recent tweets: "5m"
+    // means the tweet was posted 5 minutes ago, so the offset must be
+    // subtracted from the current time, never added to it.
+    if (preg_match('/^(\d+)m$/', $date, $matches)) {
+      $timestamp = strtotime("-" . $matches[1] . " min", time());
+      if (FALSE === $timestamp) {
+        $timestamp = 0;
+      }
+    }
+    else {
+      /*
+       * In case the time is specified put it after the date,
+       * to make it recognized by strtotime().
+       */
+      if (preg_match('/(.*) - (.*)/', $date, $matches)) {
+        $date = $matches[2] . " " . $matches[1];
+      }
+
+      $timestamp = strtotime($date);
+      if (FALSE === $timestamp) {
+        $timestamp = 0;
+      }
+
+      /*
+       * The twitter mobile UI usually only specifies the month and the day, so
+       * strtotime($date) may interpret the date as being in the future.
+       *
+       * If the date is in the future it is probably in the same day but in the
+       * previous year.
+       */
+      if ($timestamp > time()) {
+        $timestamp = strtotime('-1 years', $timestamp);
+      }
+    }
+
+    return Tweeper::epochToRssDate($timestamp);
+  }
+
+ /**
* Convert string to UpperCamelCase.
*/
public static function toUpperCamelCase($str, $delim = ' ') {
/**
* Get the contents from a URL.
*/
- private static function getUrlContents($url) {
+ private static function getUrlContents($url, $user_agent = NULL) {
$ch = curl_init($url);
- curl_setopt_array($ch, array(
+ curl_setopt_array($ch, [
CURLOPT_HEADER => FALSE,
CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout,
// Follow http redirects to get the real URL.
CURLOPT_FOLLOWLOCATION => TRUE,
CURLOPT_COOKIEFILE => "",
CURLOPT_RETURNTRANSFER => TRUE,
- CURLOPT_SSL_VERIFYHOST => FALSE,
- CURLOPT_SSL_VERIFYPEER => FALSE,
- CURLOPT_HTTPHEADER => array('Accept-language: en'),
- CURLOPT_USERAGENT => Tweeper::$userAgent,
- ));
+ CURLOPT_HTTPHEADER => ['Accept-language: en'],
+ CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : Tweeper::$userAgent,
+ ]);
$contents = Tweeper::curlExec($ch);
curl_close($ch);
/**
* Get the headers from a URL.
*/
- private static function getUrlInfo($url) {
+ private static function getUrlInfo($url, $user_agent = NULL) {
$ch = curl_init($url);
- curl_setopt_array($ch, array(
+ curl_setopt_array($ch, [
CURLOPT_HEADER => TRUE,
CURLOPT_NOBODY => TRUE,
CURLOPT_CONNECTTIMEOUT => Tweeper::$maxConnectionTimeout,
// Follow http redirects to get the real URL.
CURLOPT_FOLLOWLOCATION => TRUE,
CURLOPT_RETURNTRANSFER => TRUE,
- CURLOPT_SSL_VERIFYHOST => FALSE,
- CURLOPT_SSL_VERIFYPEER => FALSE,
- CURLOPT_USERAGENT => Tweeper::$userAgent,
- ));
+ CURLOPT_USERAGENT => isset($user_agent) ? $user_agent : Tweeper::$userAgent,
+ ]);
$ret = Tweeper::curlExec($ch);
if (FALSE === $ret) {
* Generate an RSS <enclosure/> element.
*/
public static function generateEnclosure($url) {
- $supported_content_types = array(
+ $supported_content_types = [
"application/octet-stream",
"application/ogg",
"application/pdf",
"video/mp4",
"video/mpeg",
"video/ogg",
- );
+ ];
$url_info = Tweeper::getUrlInfo($url);
if (FALSE === $url_info) {
$encoder = new XmlEncoder();
$normalizer = new ObjectNormalizer();
- $serializer = new Serializer(array($normalizer), array($encoder));
+ $serializer = new Serializer([$normalizer], [$encoder]);
- $serializer_options = array(
+ $serializer_options = [
'xml_encoding' => "UTF-8",
'xml_format_output' => TRUE,
'xml_root_node_name' => $root_node_name,
- );
+ ];
$xml_data = $serializer->serialize($data, 'xml', $serializer_options);
if (!$xml_data) {
// If there is a host-specific method to get the XML data, use it!
$get_xml_host_method = 'getXml' . Tweeper::toUpperCamelCase($host, '.');
if (method_exists($this, $get_xml_host_method)) {
- $xml_data = call_user_func_array(array($this, $get_xml_host_method), array($html));
+ $xml_data = call_user_func_array([$this, $get_xml_host_method], [$html]);
$xmlDoc->loadXML($xml_data);
}
else {
$xmlDoc->loadHTML($html);
}
- foreach (libxml_get_errors() as $xml_error) {
- Tweeper::logXmlError($xml_error);
+ if ($this->verbose_output) {
+ foreach (libxml_get_errors() as $xml_error) {
+ Tweeper::logXmlError($xml_error);
+ }
}
libxml_clear_errors();
libxml_use_internal_errors($xml_errors_value);
return NULL;
}
- $stylesheet_contents = Tweeper::getUrlContents($stylesheet);
+ $stylesheet_contents = file_get_contents($stylesheet);
if (FALSE === $stylesheet_contents) {
trigger_error("Cannot open $stylesheet", E_USER_WARNING);
return NULL;
if (TRUE === $validate_scheme) {
$scheme = $url["scheme"];
- if (!in_array($scheme, array("http", "https"))) {
+ if (!in_array($scheme, ["http", "https"])) {
trigger_error("unsupported scheme: $scheme", E_USER_WARNING);
return NULL;
}
return NULL;
}
- $html = Tweeper::getUrlContents($src_url);
+ // Override User-Agent for twitter.com to force it to serve the mobile UI.
+ if ($host == "twitter.com") {
+ $user_agent = "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J)";
+ }
+ else {
+ $user_agent = NULL;
+ }
+
+ $html = Tweeper::getUrlContents($src_url, $user_agent);
if (FALSE === $html) {
trigger_error("Failed to retrieve $src_url", E_USER_WARNING);
return NULL;
$preprocess_html_host_method = 'preprocessHtml' . Tweeper::toUpperCamelCase($host, '.');
if (method_exists($this, $preprocess_html_host_method)) {
- $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+ $html = call_user_func_array([$this, $preprocess_html_host_method], [$html]);
}
$xmlDoc = $this->htmlToXml($html, $host);
<!--
Stylesheet to convert Twitter user timelines to RSS.
- Copyright (C) 2013-2018 Antonio Ospite <ao2@ao2.it>
+ Copyright (C) 2013-2020 Antonio Ospite <ao2@ao2.it>
This file is part of tweeper.
</xsl:copy>
</xsl:template>
+ <!-- Strip leading spaces in first text node of the tweet-text. -->
+ <xsl:template match="div[@class='tweet-text']/div/text()[1]">
+ <xsl:value-of select="substring-after(substring-after(., ' '), ' ')"/>
+ </xsl:template>
+
<!--
Anchors to external links provide the direct URL in the
data-expanded-url attribute, so use this in the href attribute too
http://stackoverflow.com/questions/21984867/
-->
<xsl:template match="a[@data-expanded-url]">
- <!-- Prepend and append a white space for aestethic reasons -->
- <xsl:text> </xsl:text>
<a>
<xsl:attribute name="href">
<xsl:value-of select="@data-expanded-url"/>
</xsl:attribute>
- <!-- Also strip and … -->
- <xsl:value-of select="translate(., ' …', '')"/>
+ <xsl:value-of select="@data-expanded-url"/>
</a>
- <xsl:text> </xsl:text>
</xsl:template>
<!--
-->
<xsl:template match="a[@data-pre-embedded='true']">
<xsl:if test="$show-multimedia = 1">
- <!-- Prepend and append a white space for aestethic reasons -->
- <xsl:text> </xsl:text>
<a>
<xsl:attribute name="href">
- <xsl:value-of select="concat('https://', .)"/>
+ <xsl:value-of select="@data-url"/>
</xsl:attribute>
<xsl:value-of select="concat('https://', .)"/>
</a>
- <xsl:text> </xsl:text>
</xsl:if>
</xsl:template>
<!-- Present images in a more convenient way -->
- <xsl:template match="div[@data-image-url]">
+ <!-- TODO: not supported in mobile UI
+ <xsl:template match="a[@data-pre-embedded='true' and contains(@data-url, '/photo/')]">
+ <xsl:variable name="embedded-photo-url" select="concat('https://pbs.twimg.com/media/', @data-tco-id, '?format=jpg')"/>
<a>
<xsl:attribute name="href">
- <xsl:value-of select="concat(@data-image-url, ':orig')"/>
+ <xsl:value-of select="$embedded-photo-url"/>
</xsl:attribute>
<img style="max-width: 100%">
<xsl:attribute name="src">
- <xsl:value-of select="@data-image-url"/>
+ <xsl:value-of select="$embedded-photo-url"/>
</xsl:attribute>
</img>
</a>
</xsl:template>
+ -->
<!-- Don't repeat background in embedded media content -->
+ <!-- TODO: not supported in mobile UI
<xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
<xsl:copy>
<xsl:apply-templates select="@*"/>
<xsl:apply-templates select="node()"/>
</xsl:copy>
</xsl:template>
+ -->
<xsl:template match="a[@data-expanded-url]" mode="enclosure">
<xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', ./@data-expanded-url)"/>
</xsl:template>
- <xsl:template match="div[@data-image-url]" mode="enclosure">
- <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', concat(./@data-image-url, ':orig'))"/>
+ <xsl:template match="a[@data-pre-embedded='true']" mode="enclosure">
+ <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', @data-url)"/>
</xsl:template>
- <xsl:variable name="screen-name" select="//div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
+ <xsl:variable name="screen-name" select="normalize-space(substring-after(//table[@class='profile-details' or @class='main-tweet']//*[@class='username'], '@'))"/>
- <xsl:template match="//div[@class='permalink-inner permalink-tweet-container'] | //li[@data-item-id and @data-item-type='tweet']">
- <xsl:variable name="user-name" select=".//div[@data-tweet-id]/@data-screen-name"/>
- <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
- <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
- <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>
+ <xsl:template match="//div[contains(@class, 'timeline')]/table[@class='tweet ']|//div[@class='main-tweet-container']/table[@class='main-tweet']">
+ <xsl:variable name="user-name" select="normalize-space(.//*[@class='username']/text()[2])"/>
+ <xsl:variable name="item-content" select=".//div[@class='tweet-text']/div"/>
+ <xsl:variable name="item-media" select=".//a[@data-pre-embedded='true']"/>
+ <xsl:variable name="item-permalink">
+ <xsl:choose>
+ <xsl:when test="@href">
+ <xsl:value-of select="concat($BaseURL, substring-before(@href, '?'))"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <!--
+ The main tweet in permalink pages does not have a timestamp tag,
+ just use the canonical URL as permalink.
+ -->
+ <xsl:value-of select="//link[@rel='canonical']/@href"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- TODO twitter mobile UI does not have a way to detect this
+ <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia- -video')]"/>
+ <xsl:variable name="item-has-gif" select="$item-media//*[contains(@class, 'PlayableMedia- -gif')]"/>
+ -->
- <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
- <xsl:variable name="item-has-gif" select="$item-media//*[contains(@class, 'PlayableMedia--gif')]"/>
<item>
<title>
<xsl:if test="($show-usernames = 1) or ($screen-name != $user-name)">
<xsl:value-of select="concat($user-name, ': ')"/>
</xsl:if>
+ <!-- TODO twitter mobile UI does not have a way to detect this
<xsl:if test="$item-has-video">
<xsl:text>(Video) </xsl:text>
</xsl:if>
+ -->
<!--
Prepend a space in front of the URLs which are not
preceded by an open parenthesis, for aestethic reasons.
<xsl:value-of select="$item-permalink"/>
</guid>
<pubDate>
- <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
- <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/>
+ <xsl:variable name="timestamp" select=".//td[@class='timestamp']/a|.//div[@class='metadata']/a"/>
+ <xsl:value-of select="php:functionString('Tweeper\Tweeper::twitterToRssDate', $timestamp)"/>
</pubDate>
<description>
<xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
<xsl:value-of select="concat($user-name, ':')"/>
<xsl:element name="br"/>
</xsl:if>
+ <!-- TODO twitter mobile UI does not support embedded media
<xsl:if test="$item-has-video">
<xsl:text> (Video)</xsl:text>
<xsl:element name="br"/>
<xsl:text> (GIF)</xsl:text>
<xsl:element name="br"/>
</xsl:if>
+ -->
<xsl:element name="span">
<xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute>
<xsl:apply-templates select="$item-content/node()"/>
</xsl:element>
+
+ <!-- TODO twitter mobile UI does not support embedded media
<xsl:if test="$show-multimedia = 1">
- <xsl:apply-templates select="$item-media/node()"/>
+ <xsl:apply-templates select="$item-media"/>
</xsl:if>
+ -->
<xsl:text disable-output-escaping="yes">]]></xsl:text>
</description>
<xsl:if test="$generate-enclosure = 1">
<xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
- <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
+ <xsl:apply-templates select="$item-media" mode="enclosure"/>
</xsl:if>
</item>
</xsl:template>
<xsl:value-of select="concat('Twitter / ', $screen-name)"/>
</xsl:when>
<xsl:otherwise>
- <xsl:value-of select="concat('Twitter / ', normalize-space(//h1[1]))"/>
+ <xsl:value-of select="concat('Twitter / ', normalize-space(//td[@id='search']//input/@value))"/>
</xsl:otherwise>
</xsl:choose>
</xsl:variable>
<xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
- <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/>
+ <xsl:variable name="channel-image" select="//table[@class='profile-details' or @class='main-tweet']//td[@class='avatar']//img/@src"/>
<rss version="2.0">
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<xsl:value-of select="$channel-link"/>
</link>
<description>
- <xsl:value-of select="normalize-space(//div[@class='ProfileHeaderCard'])"/>
- <!-- The following rule should only match on hashtag URLs -->
- <xsl:value-of select="normalize-space(//div[@class='SearchNavigation-textContainer'])"/>
+ <xsl:value-of select="normalize-space(//table[@class='profile-details' or @class='main-tweet']//td[@class='details'])"/>
</description>
<xsl:if test="$channel-image != ''">
<image>
</url>
</image>
</xsl:if>
- <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
-
- <!-- These rules will only match on permalink URLs -->
- <xsl:apply-templates select="//div[@class='permalink-inner permalink-tweet-container']"/>
- <xsl:apply-templates select="//div[@data-component-context='replies']//li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
-
+ <xsl:apply-templates select="//div[contains(@class, 'timeline')]/table[@class='tweet ']|//div[@class='main-tweet-container']/table[@class='main-tweet']"/>
</channel>
</rss>
</xsl:template>
enable or disable showing usernames in front of the item for hosts which
supports it (Twitter.com/Instagram.com). Default is 1 (enable).
+*-v <0|1>*::
+ enable or disable showing verbose output, such as non-fatal errors and
+ warnings from the XML parser. Default is 1 (enable).
+
*-h, --help*::
show the help message
COPYING
-------
-Copyright \(C) 2013-2018 Antonio Ospite <ao2@ao2.it>
+Copyright \(C) 2013-2020 Antonio Ospite <ao2@ao2.it>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
* @file
* Tweeper - a Twitter to RSS web scraper.
*
- * Copyright (C) 2013-2018 Antonio Ospite <ao2@ao2.it>
+ * Copyright (C) 2013-2020 Antonio Ospite <ao2@ao2.it>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*/
function usage($argv) {
if (is_cli()) {
- $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-h|--help] <src_url>\n";
+ $usage = "{$argv[0]} [-e|-m <0|1>|-u <0|1>|-v <0|1>|-h|--help] <src_url>\n";
}
else {
- $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>");
+ $usage = htmlentities("{$_SERVER['SCRIPT_NAME']}?src_url=<src_url>&generate_enclosure=<0|1>&show_usernames=<0|1>&show_multimedia=<0|1>&verbose_output=<0|1>");
}
return "usage: $usage";
* Parse command line options.
*/
function parse_options_cli($argv, $argc) {
- $options = array(
+ $options = [
'generate_enclosure' => FALSE,
'show_usernames' => TRUE,
'show_multimedia' => TRUE,
- );
+ 'verbose_output' => TRUE,
+ ];
if ($argc < 2) {
return $options;
}
- $cli_options = getopt("em:u:h", array("help"));
+ $cli_options = getopt("em:u:v:h", ["help"]);
foreach ($cli_options as $opt => $val) {
switch ($opt) {
case 'e':
$options['show_usernames'] = $val;
break;
+      case 'v':
+        // FILTER_VALIDATE_BOOLEAN accepts "1"/"true"/"on"/"yes" as TRUE and
+        // "0"/"false"/"off"/"no" as FALSE; anything else yields NULL here.
+        $ret = filter_var($val, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
+        if (NULL === $ret) {
+          fwrite(STDERR, "Invalid argument for the -v option.\n");
+          fwrite(STDERR, usage($argv));
+          exit(1);
+        }
+        // Store the parsed boolean, not the raw string: a raw "false" or
+        // "off" is a non-empty string, hence truthy, and would silently keep
+        // verbose output enabled.
+        $options['verbose_output'] = $ret;
+        break;
+
case 'h':
case 'help':
echo usage($argv);
* Parse options passed from a query string.
*/
function parse_options_query_string() {
- $options = array(
+ $options = [
'generate_enclosure' => FALSE,
'show_usernames' => TRUE,
'show_multimedia' => TRUE,
- );
+ 'verbose_output' => TRUE,
+ ];
if (isset($_GET['src_url'])) {
$options['src_url'] = $_GET['src_url'];
$options['show_usernames'] = $_GET['show_usernames'] != 0;
}
+ if (isset($_GET['verbose_output'])) {
+ $options['verbose_output'] = $_GET['verbose_output'] != 0;
+ }
+
return $options;
}
exit(1);
}
-$tweeper = new Tweeper($options['generate_enclosure'], $options['show_usernames'], $options['show_multimedia']);
+$tweeper = new Tweeper($options['generate_enclosure'], $options['show_usernames'], $options['show_multimedia'], $options['verbose_output']);
$output = $tweeper->tweep($options['src_url']);
if (is_null($output)) {
exit(1);