From: Antonio Ospite Date: Tue, 27 Jun 2017 12:32:31 +0000 (+0200) Subject: Merge tag 'v1.1.0' into debian X-Git-Tag: debian/1.1.0-1~5 X-Git-Url: https://git.ao2.it/tweeper.git/commitdiff_plain/49a8c00316a34a003218fa272d4ff87323a76627?hp=b2f80aa1324d8735fb30bcc61f19133dfb09351f Merge tag 'v1.1.0' into debian Release v1.1.0 --- diff --git a/HACKING b/HACKING index 14fba7e..8345eaf 100644 --- a/HACKING +++ b/HACKING @@ -5,6 +5,13 @@ Style compliance can be checked using the Coder Sniffer extension to the PEAR PHP_CodeSniffer project, for instructions about how to install Coder Sniffer see https://www.drupal.org/node/1419988 -Use this command to check the style: +TL;DR: install drupal/coder and enable the Drupal coding standard in +PHP_CodeSniffer: + + $ composer global require drupal/coder + $ export PATH="$HOME/.config/composer/vendor/bin:$PATH" + $ phpcs --config-set installed_paths $HOME/.config/composer/vendor/drupal/coder/coder_sniffer/ + +And then use this command to check the style: $ phpcs --standard=Drupal . 
diff --git a/NEWS b/NEWS index 6fccebb..59e21b7 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,14 @@ +News for v1.1.0: +================ + + * Make scraping Facebook.com pages more robust + * Fix getting the channel image for Facebook.com pages + * Add some development tools + * Fix a problem with some feed readers when showing images from Twitter.com + by ignoring the "style" attribute in the scraped HTML + * Filter out promoted tweets when scraping Twitter.com + * Remove support for Howtoons.com, the old blog is not available anymore + News for v1.0.0: ================ diff --git a/README b/README index 5f22618..7d47e8e 100644 --- a/README +++ b/README @@ -35,7 +35,6 @@ The currently supported sites are: * Twitter.com * Pump.io based websites, like Identi.ca * Dilbert.com - * Howtoons.com * Instagram.com * Facebook.com (public pages) diff --git a/TODO b/TODO index 3c71811..7b72745 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,7 @@ +- re-evaluate the use of trigger_error() or use a custom error handler, + because right now the code exits as soon as trigger_error() gets called and + any following code is ignored. + - write better XSL stylesheets? I am not an XSL expert - evaluate the use of the RSS element - show cards directly in RSS items for twitter.com diff --git a/src/Tweeper.php b/src/Tweeper.php index 93ac9e0..8ac2fe3 100644 --- a/src/Tweeper.php +++ b/src/Tweeper.php @@ -315,21 +315,30 @@ class Tweeper { /** * Convert the site content to RSS. 
*/ - public function tweep($src_url) { + public function tweep($src_url, $host=NULL, $validate_scheme=TRUE) { $url = parse_url($src_url); - if (FALSE === $url || empty($url["host"])) { + if (FALSE === $url) { trigger_error("Invalid URL: $src_url", E_USER_ERROR); return NULL; } - $scheme = $url["scheme"]; - if (!in_array($scheme, array("http", "https"))) { - trigger_error("unsupported scheme: $scheme", E_USER_ERROR); - return NULL; + if (TRUE === $validate_scheme) { + $scheme = $url["scheme"]; + if (!in_array($scheme, array("http", "https"))) { + trigger_error("unsupported scheme: $scheme", E_USER_ERROR); + return NULL; + } } - // Strip the leading www. to be more forgiving on input URLs. - $host = preg_replace('/^www\./', '', $url["host"]); + // if the host is not given derive it from the URL + if (NULL === $host) { + if (empty($url["host"])) { + trigger_error("Invalid host in URL: $src_url", E_USER_ERROR); + return NULL; + } + // Strip the leading www. to be more forgiving on input URLs. 
+ $host = preg_replace('/^www\./', '', $url["host"]); + } $xsltProcessor = $this->loadStylesheet($host); if (NULL === $xsltProcessor) { diff --git a/src/rss_converter_facebook.com.xsl b/src/rss_converter_facebook.com.xsl index 933d3d2..def8e69 100644 --- a/src/rss_converter_facebook.com.xsl +++ b/src/rss_converter_facebook.com.xsl @@ -52,7 +52,7 @@ name="page-id" select="substring-after(//meta[@property='al:android:url']/@content, 'fb://page/')"/> - + - + + @@ -131,10 +132,10 @@ - + - + diff --git a/src/rss_converter_howtoons.com.xsl b/src/rss_converter_howtoons.com.xsl deleted file mode 100644 index 35a6739..0000000 --- a/src/rss_converter_howtoons.com.xsl +++ /dev/null @@ -1,102 +0,0 @@ - - - - - - - - - - http://howtoons.com - - - - - - - <xsl:value-of select="normalize-space(.//div[@class='post-headline']//a)"/> - - - - - - - - - - - - - - - - - - <![CDATA[ - - ]]> - - - - - - - - - - - - Tweeper - - <xsl:value-of select="$channel-title"/> - - - - - - The world's greatest D.I.Y. comic website! Tools of mass construction! - - - - <xsl:value-of select="$channel-title"/> - - - - - - http://www.howtoons.com/wp-content/themes/atahualpa/images/header/tuck1000.png - - - - - - - diff --git a/src/rss_converter_twitter.com.xsl b/src/rss_converter_twitter.com.xsl index 58539ae..e2c5125 100644 --- a/src/rss_converter_twitter.com.xsl +++ b/src/rss_converter_twitter.com.xsl @@ -35,7 +35,7 @@ - + @@ -201,7 +201,7 @@ - + diff --git a/tests/fetch_facebook_page.sh b/tests/fetch_facebook_page.sh new file mode 100755 index 0000000..f25966e --- /dev/null +++ b/tests/fetch_facebook_page.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Facebook requires a CAPTCHA most of the times, so keep fetching the URL as +# long as needed, until the page is shown with no CAPTCHA. 
+ +set -e + +USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0"; + +while true; +do + # Force language to en-us to make sure that the string matching works + OUTPUT=$(wget -nv --user-agent="$USER_AGENT" --header='Accept-Language: en-us' -O - -- "$1") + if echo $OUTPUT | grep -q -v "Security Check Required"; + then + echo "$OUTPUT" > facebook.html + break + fi + sleep 5 +done diff --git a/tests/instument_to_catch_promoted_tweets.diff b/tests/instument_to_catch_promoted_tweets.diff new file mode 100644 index 0000000..3f27dd5 --- /dev/null +++ b/tests/instument_to_catch_promoted_tweets.diff @@ -0,0 +1,20 @@ +diff --git a/src/Tweeper.php b/src/Tweeper.php +index 8ac2fe3..c45aab5 100644 +--- a/src/Tweeper.php ++++ b/src/Tweeper.php +@@ -355,6 +355,15 @@ class Tweeper { + $html = call_user_func_array(array($this, $preprocess_html_host_method), array($html)); + } + ++ // XXX REMOVE: instrumentation to catch promoted tweets ++ if ($host == "twitter.com") { ++ $twitter_promoted_match_expr = '/promoted/i'; ++ $ret = preg_match($twitter_promoted_match_expr, $html, $matches); ++ if ($ret) { ++ file_put_contents("/home/ao2/TWITTER_PROMOTED_DUMP.html", $html); ++ } ++ } ++ + $xmlDoc = $this->htmlToXml($html, $host); + if (NULL === $xmlDoc) { + return NULL; diff --git a/tests/tweeper_file b/tests/tweeper_file new file mode 100755 index 0000000..15de10c --- /dev/null +++ b/tests/tweeper_file @@ -0,0 +1,25 @@ +#!/usr/bin/env php + \n"; + +if ($argc < 3) { + fwrite(STDERR, $usage); + exit(1); +} + +$file_url = 'file://' . 
realpath($argv[1]); +$host = $argv[2]; + +$tweeper = new Tweeper(); +$output = $tweeper->tweep($file_url, $host, false); +if (is_null($output)) { + exit(1); +} +echo $output; diff --git a/tweeper.1.asciidoc b/tweeper.1.asciidoc index ac1fdd1..019df14 100644 --- a/tweeper.1.asciidoc +++ b/tweeper.1.asciidoc @@ -29,7 +29,6 @@ The sites that tweeper is able to scrape and convert to RSS are: * Twitter.com * Pump.io based websites, like Identi.ca * Dilbert.com -* Howtoons.com * Instagram.com * Facebook.com (public pages)