src/Tweeper.php: use a minimal User-Agent string to fix scraping twitter.com
[tweeper.git] / src / rss_converter_twitter.com.xsl
1 <!--
2   Stylesheet to convert Twitter user timelines to RSS.
3
4   Copyright (C) 2013-2018  Antonio Ospite <ao2@ao2.it>
5
6   This file is part of tweeper.
7
8   This program is free software: you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation, either version 3 of the License, or
11   (at your option) any later version.
12
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 -->
21 <xsl:stylesheet version="1.0"
22     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
23     xmlns:php="http://php.net/xsl"
24     exclude-result-prefixes="php">
25
<!--
    Stylesheet parameters. The templates in this file compare each of
    them with the number 1 to decide whether the feature is enabled.

    generate-enclosure: add the output of the enclosure mode templates
                        to each item.
    show-usernames:     always prefix the item title and description
                        with the screen name of the author.
    show-multimedia:    include links to pic.twitter.com and the media
                        container of the tweet in the description.
-->
<xsl:param name="generate-enclosure"/>
<xsl:param name="show-usernames"/>
<xsl:param name="show-multimedia"/>

<xsl:output indent="yes" method="xml"/>

<!-- Prefix for permalink paths, also used as xml:base of the feed -->
<xsl:variable name="BaseURL" select="'https://twitter.com'"/>
35
<!--
    Identity transform.

    When an element is copied its style attribute is left out because it
    may be dangerous, see:
    https://validator.w3.org/feed/docs/warning/DangerousStyleAttr.html
-->
<xsl:template match="node()|@*">
    <xsl:copy>
        <xsl:apply-templates select="node()|@*[name() != 'style']"/>
    </xsl:copy>
</xsl:template>
47
<!--
    Anchors to external links provide the direct URL in the
    data-expanded-url attribute, so put that in the href attribute
    instead of the default short URL which uses the t.co redirection
    service.

    The link text is the string value of the anchor with &nbsp; and
    &hellip; stripped. A white space is prepended and appended to the
    anchor for aesthetic reasons.
-->
<xsl:template match="a[@data-expanded-url]">
    <xsl:variable name="link-text" select="translate(., '&#xA0;&#x2026;', '')"/>
    <xsl:text> </xsl:text>
    <a href="{@data-expanded-url}">
        <xsl:value-of select="$link-text"/>
    </a>
    <xsl:text> </xsl:text>
</xsl:template>
70
<!--
    These are links to pic.twitter.com: the text of the anchor prefixed
    with https:// is used both as href and as link text, instead of the
    t.co redirection.

    Nothing is produced unless show-multimedia is 1.
-->
<xsl:template match="a[@data-pre-embedded='true']">
    <xsl:if test="$show-multimedia = 1">
        <xsl:variable name="media-url" select="concat('https://', .)"/>
        <!-- Surround the link with white spaces for aesthetic reasons -->
        <xsl:text> </xsl:text>
        <a href="{$media-url}">
            <xsl:value-of select="$media-url"/>
        </a>
        <xsl:text> </xsl:text>
    </xsl:if>
</xsl:template>
88
<!--
    Present images in a more convenient way: an img element showing
    data-image-url, wrapped in a link to the same URL with ':orig'
    appended.
-->
<xsl:template match="div[@data-image-url]">
    <xsl:variable name="image-url" select="@data-image-url"/>
    <a href="{$image-url}:orig">
        <img style="max-width: 100%" src="{$image-url}"/>
    </a>
</xsl:template>
102
<!--
    Don't repeat background in embedded media content.

    The element is copied and its attributes and children are processed
    by the other templates; the style attribute is then set to its
    original value followed by the extra background declarations.
-->
<xsl:template match="div[contains(@class, 'PlayableMedia-player')]">
    <xsl:copy>
        <xsl:apply-templates select="@*"/>
        <!-- Created after the copied attributes, so it replaces a copied style -->
        <xsl:attribute name="style">
            <xsl:value-of select="@style"/>
            <xsl:text>; background-repeat: no-repeat; background-size: 100% auto</xsl:text>
        </xsl:attribute>
        <xsl:apply-templates select="node()"/>
    </xsl:copy>
</xsl:template>
113
<!--
    Enclosure mode for external links: pass the expanded URL to
    Tweeper\Tweeper::generateEnclosure and copy its result to the output.
-->
<xsl:template match="a[@data-expanded-url]" mode="enclosure">
    <xsl:variable name="enclosure-url" select="@data-expanded-url"/>
    <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', $enclosure-url)"/>
</xsl:template>
117
<!--
    Enclosure mode for images: pass the image URL with ':orig' appended
    to Tweeper\Tweeper::generateEnclosure and copy its result to the
    output.
-->
<xsl:template match="div[@data-image-url]" mode="enclosure">
    <xsl:variable name="enclosure-url" select="concat(@data-image-url, ':orig')"/>
    <xsl:copy-of select="php:functionString('Tweeper\Tweeper::generateEnclosure', $enclosure-url)"/>
</xsl:template>
121
<!--
    Screen name read from the data-screen-name attribute of the
    'user-actions' div. The class attribute is compared as a whole,
    including its trailing space.

    It is compared with the author of each tweet to decide whether to
    show the user name, and it is used for the channel title.
-->
<xsl:variable name="screen-name" select="/descendant::div[@class='user-actions btn-group not-following ']/@data-screen-name"/>
123
<!--
    Convert a tweet to an RSS item.

    Matches the tweet container of a permalink page and the list items
    of type 'tweet'. The item is made of:

      title:       the text of the tweet on a single line, optionally
                   prefixed with the author and with a (Video) marker.
      link, guid:  BaseURL followed by the data-permalink-path of the
                   tweet.
      pubDate:     the data-time of the js-short-timestamp span,
                   converted by Tweeper\Tweeper::epochToRssDate.
      description: a CDATA section with the optional author, (Video) and
                   (GIF) markers, the processed tweet text and, when
                   show-multimedia is 1, the processed media container.

    When generate-enclosure is 1 the external links and the images of
    the tweet are also processed in enclosure mode.
-->
<xsl:template match="//div[@class='permalink-inner permalink-tweet-container'] | //li[@data-item-id and @data-item-type='tweet']">
    <xsl:variable name="user-name" select=".//div[@data-tweet-id]/@data-screen-name"/>
    <xsl:variable name="item-content" select=".//p[contains(@class, 'js-tweet-text')]"/>
    <xsl:variable name="item-media" select=".//div[contains(@class, 'AdaptiveMedia-container')]"/>
    <xsl:variable name="item-permalink" select="concat($BaseURL, .//div[@data-permalink-path]/@data-permalink-path)"/>

    <xsl:variable name="item-has-video" select="$item-media//*[contains(@class, 'PlayableMedia--video')]"/>
    <xsl:variable name="item-has-gif" select="$item-media//*[contains(@class, 'PlayableMedia--gif')]"/>

    <!--
         The author is shown when explicitly requested, or when it
         differs from the screen name found in the page. The same
         condition applies to the title and to the description.
    -->
    <xsl:variable name="show-user-name" select="($show-usernames = 1) or ($screen-name != $user-name)"/>
    <item>
        <title>
            <xsl:if test="$show-user-name">
                <xsl:value-of select="concat($user-name, ': ')"/>
            </xsl:if>
            <xsl:if test="$item-has-video">
                <xsl:text>(Video) </xsl:text>
            </xsl:if>
            <!--
                 Prepend a space in front of the URLs which are not
                 preceded by an open parenthesis, for aesthetic reasons.
                 The dots of pic.twitter.com are escaped so that they
                 only match a literal dot and not any character.
                 Also, regex, I know: http://xkcd.com/1171/
            -->
            <xsl:variable
                name="processed-title"
                select="php:functionString('preg_replace', '@((?&lt;!\()(?:http[s]?://|pic\.twitter\.com))@', ' \1', $item-content)"/>
            <!-- Also strip &nbsp; and &hellip; -->
            <xsl:value-of select="normalize-space(translate($processed-title, '&#xA0;&#x2026;', ''))"/>
        </title>
        <link>
            <xsl:value-of select="$item-permalink"/>
        </link>
        <guid>
            <xsl:value-of select="$item-permalink"/>
        </guid>
        <pubDate>
            <xsl:variable name="timestamp" select=".//span[contains(@class, 'js-short-timestamp')]/@data-time"/>
            <xsl:value-of select="php:functionString('Tweeper\Tweeper::epochToRssDate', number($timestamp))"/>
        </pubDate>
        <description>
            <!-- The CDATA delimiters are written as unescaped text around the generated markup -->
            <xsl:text disable-output-escaping="yes">&lt;![CDATA[</xsl:text>
            <xsl:if test="$show-user-name">
                <xsl:value-of select="concat($user-name, ':')"/>
                <xsl:element name="br"/>
            </xsl:if>
            <xsl:if test="$item-has-video">
                <xsl:text> (Video)</xsl:text>
                <xsl:element name="br"/>
            </xsl:if>
            <xsl:if test="$item-has-gif">
                <xsl:text> (GIF)</xsl:text>
                <xsl:element name="br"/>
            </xsl:if>
            <xsl:element name="span">
                <xsl:attribute name="style">white-space: pre-wrap;</xsl:attribute>
                <xsl:apply-templates select="$item-content/node()"/>
            </xsl:element>
            <xsl:if test="$show-multimedia = 1">
                <xsl:apply-templates select="$item-media/node()"/>
            </xsl:if>
            <xsl:text disable-output-escaping="yes">]]&gt;</xsl:text>
        </description>
        <xsl:if test="$generate-enclosure = 1">
            <xsl:apply-templates select="$item-content//a[@data-expanded-url]" mode="enclosure"/>
            <xsl:apply-templates select="$item-media//div[@data-image-url]" mode="enclosure"/>
        </xsl:if>
    </item>
</xsl:template>
190
<!--
    Entry point: build the RSS 2.0 document.

    The channel title is 'Twitter / ' followed by the screen name or,
    when that is empty, by the first heading of the page. The channel
    image is only added when a profile picture link is found. The items
    come from the tweet templates, applied first to the stream and then
    to the elements of a permalink page; promoted tweets are left out.
-->
<xsl:template match="/">
    <xsl:variable name="channel-title">
        <xsl:text>Twitter / </xsl:text>
        <xsl:choose>
            <xsl:when test="$screen-name != ''">
                <xsl:value-of select="$screen-name"/>
            </xsl:when>
            <xsl:otherwise>
                <xsl:value-of select="normalize-space(//h1[1])"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:variable>
    <xsl:variable name="channel-link" select="//link[@rel='canonical']/@href"/>
    <xsl:variable name="channel-image" select="//a[contains(@class, 'profile-picture')]/@href"/>

    <xsl:variable name="stream-tweets" select="//ol[@id='stream-items-id']/li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>
    <!-- These will only select something on permalink URLs -->
    <xsl:variable name="permalink-tweet" select="//div[@class='permalink-inner permalink-tweet-container']"/>
    <xsl:variable name="reply-tweets" select="//div[@data-component-context='replies']//li[@data-item-id and @data-item-type='tweet' and not(contains(@class, 'has-profile-promoted-tweet'))]"/>

    <rss version="2.0" xml:base="{$BaseURL}">
        <channel>
            <generator>Tweeper</generator>
            <title><xsl:value-of select="$channel-title"/></title>
            <link><xsl:value-of select="$channel-link"/></link>
            <description>
                <!-- The second argument should only be non-empty on hashtag URLs -->
                <xsl:value-of select="concat(normalize-space(//div[@class='ProfileHeaderCard']), normalize-space(//div[@class='SearchNavigation-textContainer']))"/>
            </description>
            <xsl:if test="$channel-image != ''">
                <image>
                    <title><xsl:value-of select="$channel-title"/></title>
                    <link><xsl:value-of select="$channel-link"/></link>
                    <url><xsl:value-of select="$channel-image"/></url>
                </image>
            </xsl:if>

            <xsl:apply-templates select="$stream-tweets"/>
            <xsl:apply-templates select="$permalink-tweet"/>
            <xsl:apply-templates select="$reply-tweets"/>
        </channel>
    </rss>
</xsl:template>
242 </xsl:stylesheet>