PHP_CodeSniffer project, for instructions about how to install Coder Sniffer
see https://www.drupal.org/node/1419988
-Use this command to check the style:
+TL;DR: install drupal/coder and enable the Drupal coding standard in
+PHP_CodeSniffer:
+
+ $ composer global require drupal/coder
+ $ export PATH="$HOME/.config/composer/vendor/bin:$PATH"
+ $ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/
+
+And then use this command to check the style:
$ phpcs --standard=Drupal .
+News for v1.1.0:
+================
+
+ * Make scraping Facebook.com pages more robust
+ * Fix getting the channel image for Facebook.com pages
+ * Add some development tools
+ * Fix a problem with some feed readers when showing images from Twitter.com
+ by ignoring the "style" attribute in the scraped HTML
+ * Filter out promoted tweets when scraping Twitter.com
+ * Remove support for Howtoons.com, the old blog is not available anymore
+
News for v1.0.0:
================
* Twitter.com
* Pump.io based websites, like Identi.ca
* Dilbert.com
- * Howtoons.com
* Instagram.com
* Facebook.com (public pages)
+- re-evaluate the use of trigger_error() or use a custom error handler,
+ because right now the code exits as soon as trigger_error() gets called and
+ any following code is ignored.
+
- write better XSL stylesheets? I am not an XSL expert
- evaluate the use of the <ttl/> RSS element
- show cards directly in RSS items for twitter.com
/**
* Convert the site content to RSS.
*/
- public function tweep($src_url) {
+ public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) {
$url = parse_url($src_url);
- if (FALSE === $url || empty($url["host"])) {
+ if (FALSE === $url) {
trigger_error("Invalid URL: $src_url", E_USER_ERROR);
return NULL;
}
- $scheme = $url["scheme"];
- if (!in_array($scheme, array("http", "https"))) {
- trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
- return NULL;
+ if (TRUE === $validate_scheme) {
+ $scheme = $url["scheme"];
+ if (!in_array($scheme, array("http", "https"))) {
+ trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
+ return NULL;
+ }
}
- // Strip the leading www. to be more forgiving on input URLs.
- $host = preg_replace('/^www\./', '', $url["host"]);
+ // if the host is not given derive it from the URL
+ if (NULL === $host) {
+ if (empty($url["host"])) {
+ trigger_error("Invalid host in URL: $src_url", E_USER_ERROR);
+ return NULL;
+ }
+ // Strip the leading www. to be more forgiving on input URLs.
+ $host = preg_replace('/^www\./', '', $url["host"]);
+ }
$xsltProcessor = $this->loadStylesheet($host);
if (NULL === $xsltProcessor) {
name="page-id"
select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
- <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+ <xsl:template match="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')]">
<xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
<xsl:variable
name="item-permalink"
<xsl:template match="/">
<xsl:variable name="channel-title" select="//title"/>
- <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+ <xsl:variable name="channel-link" select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+ <xsl:variable name="channel-image" select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')][1]//a[1]//img/@src"/>
<rss version="2.0">
<xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
<xsl:value-of select="$channel-link"/>
</link>
<url>
- <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+ <xsl:value-of select="$channel-image"/>
</url>
</image>
- <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+ <xsl:apply-templates select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')]"/>
</channel>
</rss>
</xsl:template>
+++ /dev/null
-<!--
- Stylesheet to convert Howtoons.com to RSS.
-
- Copyright (C) 2014 Antonio Ospite <ao2@ao2.it>
-
- This file is part of tweeper.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
- The RSS feed link is broken on http://howtoons.com so just work around it.
-
- Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
- scraping other Wordpress sites.
--->
-
-<xsl:stylesheet version="1.0"
- xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
- xmlns:php="http://php.net/xsl"
- xsl:extension-element-prefixes="php"
- exclude-result-prefixes="php">
-
- <xsl:output method="xml" indent="yes"/>
-
- <xsl:variable name="BaseURL">
- <xsl:text>http://howtoons.com</xsl:text>
- </xsl:variable>
-
- <xsl:template match="//div[contains(@id, 'post-')]">
- <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
- <item>
- <title>
- <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
- </title>
- <link>
- <xsl:value-of select="$item-permalink"/>
- </link>
- <guid>
- <xsl:value-of select="$item-permalink"/>
- </guid>
- <pubDate>
- <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
- <!-- date format is MM.DD.YY -->
- <xsl:variable name="month" select="substring($date, 1, 2)"/>
- <xsl:variable name="day" select="substring($date, 4, 2)"/>
- <xsl:variable name="year" select="substring($date, 7, 2)"/>
- <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/>
- <xsl:value-of select="php:functionString('Tweeper\Tweeper::strToRssDate', $iso-date)"/>
- </pubDate>
- <description>
- <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text>
- <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
- <xsl:text disable-output-escaping="yes">]]></xsl:text>
- </description>
- </item>
- </xsl:template>
-
- <xsl:template match="/">
- <xsl:variable name="channel-title" select="//title"/>
- <xsl:variable name="channel-link" select="$BaseURL"/>
-
- <rss version="2.0">
- <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
- <channel>
- <generator>Tweeper</generator>
- <title>
- <xsl:value-of select="$channel-title"/>
- </title>
- <link>
- <xsl:value-of select="$channel-link"/>
- </link>
- <description>
- <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
- </description>
- <image>
- <title>
- <xsl:value-of select="$channel-title"/>
- </title>
- <link>
- <xsl:value-of select="$channel-link"/>
- </link>
- <url>
- <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
- </url>
- </image>
- <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
- </channel>
- </rss>
- </xsl:template>
-</xsl:stylesheet>
<!-- Identity transform -->
<xsl:template match="@*|node()">
<xsl:copy>
- <xsl:apply-templates select="@*|node()"/>
+ <xsl:apply-templates select="@*[not(name() = 'style')]|node()"/>
</xsl:copy>
</xsl:template>
<xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
</url>
</image>
- <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
+ <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
</channel>
</rss>
</xsl:template>
--- /dev/null
+#!/bin/sh
+#
+# Facebook requires a CAPTCHA most of the time, so keep fetching the URL as
+# long as needed, until the page is shown with no CAPTCHA.
+#
+# The URL to fetch is the first command line argument; the fetched page is
+# saved as facebook.html in the current directory.
+
+set -e
+
+if [ $# -lt 1 ];
+then
+  echo "usage: $0 <url>" >&2
+  exit 1
+fi
+
+USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"
+
+while true;
+do
+  # Force language to en-us to make sure that the string matching works.
+  # Because of "set -e", a wget failure aborts the script instead of retrying.
+  OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1")
+
+  # Quote the expansion so that the page is not subject to word splitting and
+  # pathname expansion, and use printf because some /bin/sh implementations of
+  # echo interpret backslash escapes, which would corrupt the saved page.
+  if ! printf '%s\n' "$OUTPUT" | grep -q "Security Check Required";
+  then
+    printf '%s\n' "$OUTPUT" > facebook.html
+    break
+  fi
+  sleep 5
+done
--- /dev/null
+diff --git a/src/Tweeper.php b/src/Tweeper.php
+index 8ac2fe3..c45aab5 100644
+--- a/src/Tweeper.php
++++ b/src/Tweeper.php
+@@ -355,6 +355,15 @@ class Tweeper {
+ $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+ }
+
++ // XXX REMOVE: instrumentation to catch promoted tweets
++ if ($host == "twitter.com") {
++ $twitter_promoted_match_expr = '/promoted/i';
++ $ret = preg_match($twitter_promoted_match_expr, $html, $matches);
++ if ($ret) {
++ file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html);
++ }
++ }
++
+ $xmlDoc = $this->htmlToXml($html, $host);
+ if (NULL === $xmlDoc) {
+ return NULL;
--- /dev/null
+#!/usr/bin/env php
+<?php
+
+/**
+ * @file
+ * Development tool: convert a locally saved page to RSS.
+ *
+ * Takes the path of a file and the host name to use for it, passes them to
+ * Tweeper::tweep() and prints the result on standard output.
+ * Exits with status 1 on bad usage, on an unreadable file, or when tweep()
+ * returns NULL.
+ */
+
+require_once __DIR__ . '/../autoload.php';
+
+use Tweeper\Tweeper;
+
+date_default_timezone_set('UTC');
+
+$usage = "{$argv[0]}: <file> <host>\n";
+
+if ($argc < 3) {
+  fwrite(STDERR, $usage);
+  exit(1);
+}
+
+// realpath() returns FALSE when the path cannot be resolved; without this
+// check the URL would silently become just "file://".
+$file_path = realpath($argv[1]);
+if (FALSE === $file_path) {
+  fwrite(STDERR, "{$argv[0]}: cannot access file: {$argv[1]}\n");
+  exit(1);
+}
+
+$file_url = 'file://' . $file_path;
+$host = $argv[2];
+
+$tweeper = new Tweeper();
+// A file:// URL has no usable host, so the host is passed explicitly, and
+// scheme validation is disabled because it only accepts http and https.
+$output = $tweeper->tweep($file_url, $host, FALSE);
+if (is_null($output)) {
+  exit(1);
+}
+echo $output;
* Twitter.com
* Pump.io based websites, like Identi.ca
* Dilbert.com
-* Howtoons.com
* Instagram.com
* Facebook.com (public pages)