Merge tag 'v1.1.0' into debian
authorAntonio Ospite <ao2@ao2.it>
Tue, 27 Jun 2017 12:32:31 +0000 (14:32 +0200)
committerAntonio Ospite <ao2@ao2.it>
Tue, 27 Jun 2017 12:32:31 +0000 (14:32 +0200)
Release v1.1.0

12 files changed:
HACKING
NEWS
README
TODO
src/Tweeper.php
src/rss_converter_facebook.com.xsl
src/rss_converter_howtoons.com.xsl [deleted file]
src/rss_converter_twitter.com.xsl
tests/fetch_facebook_page.sh [new file with mode: 0755]
tests/instument_to_catch_promoted_tweets.diff [new file with mode: 0644]
tests/tweeper_file [new file with mode: 0755]
tweeper.1.asciidoc

diff --git a/HACKING b/HACKING
index 14fba7e..8345eaf 100644 (file)
--- a/HACKING
+++ b/HACKING
@@ -5,6 +5,13 @@ Style compliance can be checked using the Coder Sniffer extension to the PEAR
 PHP_CodeSniffer project, for instructions about how to install Coder Sniffer
 see https://www.drupal.org/node/1419988
 
-Use this command to check the style:
+TL;DR: install drupal/coder and enable the Drupal coding standard in
+PHP_CodeSniffer:
+  
+  $ composer global require drupal/coder
+  $ export PATH="$HOME/.config/composer/vendor/bin:$PATH"
+  $ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/
+
+And then use this command to check the style:
 
   $ phpcs --standard=Drupal .
diff --git a/NEWS b/NEWS
index 6fccebb..59e21b7 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,14 @@
+News for v1.1.0:
+================
+
+  * Make scraping Facebook.com pages more robust
+  * Fix getting the channel image for Facebook.com pages
+  * Add some development tools
+  * Fix a problem with some feed readers when showing images from Twitter.com
+    by ignoring the "style" attribute in the scraped HTML
+  * Filter out promoted tweets when scraping Twitter.com
+  * Remove support for Howtoons.com, the old blog is not available anymore
+
 News for v1.0.0:
 ================
 
diff --git a/README b/README
index 5f22618..7d47e8e 100644 (file)
--- a/README
+++ b/README
@@ -35,7 +35,6 @@ The currently supported sites are:
   * Twitter.com
   * Pump.io based websites, like Identi.ca
   * Dilbert.com
-  * Howtoons.com
   * Instagram.com
   * Facebook.com (public pages)
 
diff --git a/TODO b/TODO
index 3c71811..7b72745 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,3 +1,7 @@
+- re-evaluate the use of trigger_error() or use a custom error handler,
+  because right now the code exits as soon as trigger_error() gets called and
+  any following code is ignored.
+
 - write better XSL stylesheets? I am not an XSL expert
 - evaluate the use of the <ttl/> RSS element
 - show cards directly in RSS items for twitter.com
index 93ac9e0..8ac2fe3 100644 (file)
@@ -315,21 +315,30 @@ class Tweeper {
   /**
    * Convert the site content to RSS.
    */
-  public function tweep($src_url) {
+  public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) {
     $url = parse_url($src_url);
-    if (FALSE === $url || empty($url["host"])) {
+    if (FALSE === $url) {
       trigger_error("Invalid URL: $src_url", E_USER_ERROR);
       return NULL;
     }
 
-    $scheme = $url["scheme"];
-    if (!in_array($scheme, array("http", "https"))) {
-      trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
-      return NULL;
+    if (TRUE === $validate_scheme) {
+      $scheme = $url["scheme"];
+      if (!in_array($scheme, array("http", "https"))) {
+        trigger_error("unsupported scheme: $scheme", E_USER_ERROR);
+        return NULL;
+      }
     }
 
-    // Strip the leading www. to be more forgiving on input URLs.
-    $host = preg_replace('/^www\./', '', $url["host"]);
+    // if the host is not given derive it from the URL
+    if (NULL === $host) {
+      if (empty($url["host"])) {
+        trigger_error("Invalid host in URL: $src_url", E_USER_ERROR);
+        return NULL;
+      }
+      // Strip the leading www. to be more forgiving on input URLs.
+      $host = preg_replace('/^www\./', '', $url["host"]);
+    }
 
     $xsltProcessor = $this->loadStylesheet($host);
     if (NULL === $xsltProcessor) {
index 933d3d2..def8e69 100644 (file)
@@ -52,7 +52,7 @@
         name="page-id"
         select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/>
 
-    <xsl:template match="//div[contains(@class, 'userContentWrapper')]">
+    <xsl:template match="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')]">
         <xsl:variable name="story-id" select=".//input[@name='ft_ent_identifier']/@value"/>
         <xsl:variable
             name="item-permalink"
 
     <xsl:template match="/">
         <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="//div[contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+        <xsl:variable name="channel-link" select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')][1]//a[1]/@href"/>
+        <xsl:variable name="channel-image" select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')][1]//a[1]//img/@src"/>
 
         <rss version="2.0">
             <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
                         <xsl:value-of select="$channel-link"/>
                     </link>
                     <url>
-                        <xsl:value-of select="//img[@class='profilePic img']/@src"/>
+                        <xsl:value-of select="$channel-image"/>
                     </url>
                 </image>
-                <xsl:apply-templates select="//div[contains(@class, 'userContentWrapper')]"/>
+                <xsl:apply-templates select="//div[contains(@class, 'fbUserContent') or contains(@class, 'userContentWrapper')]"/>
             </channel>
         </rss>
     </xsl:template>
diff --git a/src/rss_converter_howtoons.com.xsl b/src/rss_converter_howtoons.com.xsl
deleted file mode 100644 (file)
index 35a6739..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-<!--
-  Stylesheet to convert Howtoons.com to RSS.
-
-  Copyright (C) 2014  Antonio Ospite <ao2@ao2.it>
-
-  This file is part of tweeper.
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
--->
-
-<!--
-  The RSS feed link is broken on http://howtoons.com so just work around it.
-
-  Howtoons uses Wordpress, so maybe this style sheet can be used as a base for
-  scraping other Wordpress sites.
--->
-
-<xsl:stylesheet version="1.0"
-    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:php="http://php.net/xsl"
-    xsl:extension-element-prefixes="php"
-    exclude-result-prefixes="php">
-
-    <xsl:output method="xml" indent="yes"/>
-
-    <xsl:variable name="BaseURL">
-        <xsl:text>http://howtoons.com</xsl:text>
-    </xsl:variable>
-
-    <xsl:template match="//div[contains(@id, 'post-')]">
-        <xsl:variable name="item-permalink" select=".//div[@class='post-headline']//a/@href"/>
-        <item>
-            <title>
-                <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/>
-            </title>
-            <link>
-                <xsl:value-of select="$item-permalink"/>
-            </link>
-            <guid>
-                <xsl:value-of select="$item-permalink"/>
-            </guid>
-            <pubDate>
-                <xsl:variable name="date" select="substring-before(.//div[@class='post-byline'], ',')"/>
-                <!-- date format is MM.DD.YY -->
-                <xsl:variable name="month" select="substring($date, 1, 2)"/>
-                <xsl:variable name="day" select="substring($date, 4, 2)"/>
-                <xsl:variable name="year" select="substring($date, 7, 2)"/>
-                <xsl:variable name="iso-date" select="concat('20', $year, '-', $month, '-', $day)"/>
-                <xsl:value-of select="php:functionString('Tweeper\Tweeper::strToRssDate', $iso-date)"/>
-            </pubDate>
-            <description>
-                <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
-                <xsl:copy-of select=".//div[contains(@class, 'post-bodycopy')]/p"/>
-                <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
-            </description>
-        </item>
-    </xsl:template>
-
-    <xsl:template match="/">
-        <xsl:variable name="channel-title" select="//title"/>
-        <xsl:variable name="channel-link" select="$BaseURL"/>
-
-        <rss version="2.0">
-            <xsl:attribute name="xml:base"><xsl:value-of select="$BaseURL" /></xsl:attribute>
-            <channel>
-                <generator>Tweeper</generator>
-                <title>
-                    <xsl:value-of select="$channel-title"/>
-                </title>
-                <link>
-                    <xsl:value-of select="$channel-link"/>
-                </link>
-                <description>
-                    <xsl:text>The world's greatest D.I.Y. comic website! Tools of mass construction!</xsl:text>
-                </description>
-                <image>
-                    <title>
-                        <xsl:value-of select="$channel-title"/>
-                    </title>
-                    <link>
-                        <xsl:value-of select="$channel-link"/>
-                    </link>
-                    <url>
-                        <xsl:text>http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png</xsl:text>
-                    </url>
-                </image>
-                <xsl:apply-templates select="//div[contains(@id, 'post-')]"/>
-            </channel>
-        </rss>
-    </xsl:template>
-</xsl:stylesheet>
index 58539ae..e2c5125 100644 (file)
@@ -35,7 +35,7 @@
     <!-- Identity transform -->
     <xsl:template match="@*|node()">
         <xsl:copy>
-            <xsl:apply-templates select="@*|node()"/>
+            <xsl:apply-templates select="@*[not(name() = 'style')]|node()"/>
         </xsl:copy>
     </xsl:template>
 
                         <xsl:value-of select="//a[contains(@class, 'profile-picture')]/@href"/>
                     </url>
                 </image>
-                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet']"/>
+                <xsl:apply-templates select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
             </channel>
         </rss>
     </xsl:template>
diff --git a/tests/fetch_facebook_page.sh b/tests/fetch_facebook_page.sh
new file mode 100755 (executable)
index 0000000..f25966e
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+# Facebook requires a CAPTCHA most of the time, so keep fetching the URL as
+# long as needed, until the page is shown with no CAPTCHA.
+
+set -e
+
+USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0";
+
+while true;
+do
+  # Force language to en-us to make sure that the string matching works
+  OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1")
+  if echo $OUTPUT | grep -q -v "Security Check Required";
+  then
+    echo "$OUTPUT" > facebook.html
+    break
+  fi
+  sleep 5
+done
diff --git a/tests/instument_to_catch_promoted_tweets.diff b/tests/instument_to_catch_promoted_tweets.diff
new file mode 100644 (file)
index 0000000..3f27dd5
--- /dev/null
@@ -0,0 +1,20 @@
+diff --git a/src/Tweeper.php b/src/Tweeper.php
+index 8ac2fe3..c45aab5 100644
+--- a/src/Tweeper.php
++++ b/src/Tweeper.php
+@@ -355,6 +355,15 @@ class Tweeper {
+       $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html));
+     }
++    // XXX REMOVE: instrumentation to catch promoted tweets
++    if ($host == "twitter.com") {
++      $twitter_promoted_match_expr = '/promoted/i';
++      $ret = preg_match($twitter_promoted_match_expr, $html, $matches);
++      if ($ret) {
++        file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html);
++      }
++    }
++
+     $xmlDoc = $this->htmlToXml($html, $host);
+     if (NULL === $xmlDoc) {
+       return NULL;
diff --git a/tests/tweeper_file b/tests/tweeper_file
new file mode 100755 (executable)
index 0000000..15de10c
--- /dev/null
@@ -0,0 +1,25 @@
+#!/usr/bin/env php
+<?php
+
+require_once __DIR__ . '/../autoload.php';
+
+use Tweeper\Tweeper;
+
+date_default_timezone_set('UTC');
+
+$usage = "{$argv[0]}: <file> <host>\n";
+
+if ($argc < 3) {
+    fwrite(STDERR, $usage);
+    exit(1);
+}
+
+$file_url = 'file://' . realpath($argv[1]);
+$host = $argv[2];
+
+$tweeper = new Tweeper();
+$output = $tweeper->tweep($file_url, $host, false);
+if (is_null($output)) {
+    exit(1);
+}
+echo $output;
index ac1fdd1..019df14 100644 (file)
@@ -29,7 +29,6 @@ The sites that tweeper is able to scrape and convert to RSS are:
 * Twitter.com
 * Pump.io based websites, like Identi.ca
 * Dilbert.com
-* Howtoons.com
 * Instagram.com
 * Facebook.com (public pages)