smooth-dl.py: improve getting the manifest of Clip elements
[smooth-dl.git] / smooth-dl.py
1 #!/usr/bin/env python
2 #
3 # smooth-dl - download videos served using Smooth Streaming technology
4 #
5 # Copyright (C) 2010  Antonio Ospite <ospite@studenti.unina.it>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 #
20 #
21 # TODO:
22 #  - Handle HTTP errors:
23 #       "Connection reset by peer"
24 #       "Resource not  available"
25 #       "Gateway Time-out"
#  - Support more Manifest formats:
#       WaveFormatEx attribute instead of CodecPrivateData
28 #       'd' and other attributes in chunk element ('i', 's', 'q')
29 #
30 # basically, write a proper implementation of manifest parsing and chunk
31 # downloading
32
33 import os
34 import re
35 import sys
36 import xml.etree.ElementTree as etree
37 import urllib2
38 import struct
39 import tempfile
40 from optparse import OptionParser
41 from urlparse import urlparse, urlunparse
42
# Module metadata: options_parser() passes these to OptionParser as the
# description, the --version string and the epilog of the help text.
__description__ = "Download videos served using Smooth Streaming technology"
__version__ = "0.x"
__author_info__ = "Written by Antonio Ospite http://ao2.it"
46
47
def get_chunk_data(data):
    """Extract the media payload from the raw content of a chunk.

    The chunk is read as two consecutive boxes, named 'moof' and 'mdat' in
    this code; each one starts with a big-endian 32 bit size. The 'mdat'
    box is expected to be the last thing in the chunk.

    Returns the content of the 'mdat' box without its 8 byte header.

    Raises ValueError when the chunk is too short to hold the box headers
    or when the 'mdat' size does not match the data actually present.
    """
    box_header_len = 4 + len('mdat')  # 32 bit size + 4 character tag

    if len(data) < 4:
        raise ValueError("Chunk too short to contain the 'moof' box size")
    moof_size = struct.unpack(">L", data[0:4])[0]

    if len(data) < moof_size + box_header_len:
        raise ValueError("Chunk too short to contain the 'mdat' box header")
    mdat_size = struct.unpack(">L", data[moof_size:moof_size + 4])[0]

    data_start = moof_size + box_header_len
    data_size = mdat_size - box_header_len

    # Explicit check rather than an assert: asserts are removed by "python -O"
    # and a corrupted chunk must never be silently accepted.
    payload = data[data_start:]
    if len(payload) != data_size:
        raise ValueError("Unexpected 'mdat' payload size: expected %d bytes, "
                         "found %d" % (data_size, len(payload)))

    return payload
62
63
def hexstring_to_bytes(hex_string):
    """Decode a string of hexadecimal digits, two digits per character.

    Each pair of characters of hex_string is parsed as a base 16 number and
    converted with chr(); the converted characters are returned joined in
    one string. An empty input gives an empty string.
    """
    digit_pairs = (hex_string[pos:pos + 2]
                   for pos in range(0, len(hex_string), 2))
    return "".join(chr(int(pair, 16)) for pair in digit_pairs)
70
71
def write_wav_header(out_file, fmt, codec_private_data, data_len):
    """Write a RIFF/WAVE header, up to the start of the audio data.

    out_file: file object, opened for binary writing
    fmt: dict with the keys wFormatTag, nChannels, nSamplesPerSec,
         nAvgBytesPerSec, nBlockAlign and wBitsPerSample; its 'cbSize'
         entry is overwritten with the length of the decoded extra data
    codec_private_data: hex string with the codec extra data, which is
                        appended to the 'fmt ' chunk
    data_len: size in bytes of the audio data that will follow the header
    """
    extradata = hexstring_to_bytes(codec_private_data)

    fmt['cbSize'] = len(extradata)
    # 18 bytes of fixed fields (cbSize included) plus the extra data
    fmt_len = 18 + fmt['cbSize']

    # The RIFF size covers everything after the size field itself:
    # "WAVE", the whole 'fmt ' chunk and the whole 'data' chunk, audio data
    # included.
    wave_len = (len("WAVEfmt ") + 4 + fmt_len +
                len('data') + 4 + data_len)

    out_file.write("RIFF")
    out_file.write(struct.pack('<L', wave_len))
    out_file.write("WAVEfmt ")
    out_file.write(struct.pack('<L', fmt_len))
    out_file.write(struct.pack('<H', fmt['wFormatTag']))
    out_file.write(struct.pack('<H', fmt['nChannels']))
    out_file.write(struct.pack('<L', fmt['nSamplesPerSec']))
    out_file.write(struct.pack('<L', fmt['nAvgBytesPerSec']))
    out_file.write(struct.pack('<H', fmt['nBlockAlign']))
    out_file.write(struct.pack('<H', fmt['wBitsPerSample']))
    out_file.write(struct.pack('<H', fmt['cbSize']))
    out_file.write(extradata)
    out_file.write("data")
    out_file.write(struct.pack('<L', data_len))
94
95
def download_file(src_url, dest_file, mode):
    """Download src_url and return its content.

    When dest_file is not empty the content is also written to that file,
    which is opened with the given mode; pass None for both dest_file and
    mode to only get the data back.

    Download errors are reported on stderr and then re-raised unchanged, so
    callers can still catch urllib2.HTTPError.
    """
    try:
        response = urllib2.urlopen(src_url)
        try:
            data = response.read()
        finally:
            response.close()
    except urllib2.URLError:
        # HTTPError is a subclass of URLError, so both HTTP status errors and
        # connection level failures are reported here.
        sys.stderr.write("Error while downloading URL: %s\n" % src_url)
        raise

    if dest_file:
        with open(dest_file, mode) as f:
            f.write(data)

    return data
110
111
def get_manifest(url, dest_dir):
    """Return the parsed manifest and the base content URL.

    url is either the URL of the manifest (when it does not end in
    '/manifest', '.ismc' or '.csm' then '/Manifest' is appended) or the
    path of an existing local manifest file. A remote manifest is saved as
    'Manifest' in dest_dir before being parsed.

    Returns a (manifest, base_url) tuple: manifest is always an
    xml.etree.ElementTree.ElementTree and base_url is the manifest URL
    without the trailing '/manifest'.

    Raises Exception when the manifest does not declare MajorVersion "2";
    errors while downloading the main manifest propagate from
    download_file().
    """
    # Remove the querystring if present
    manifest_url = urlunparse(urlparse(url)._replace(query=''))

    if not manifest_url.lower().endswith(('/manifest', '.ismc', '.csm')):
        manifest_url += '/Manifest'

    if os.path.exists(url):
        local_manifest_path = url
    else:
        local_manifest_path = os.path.join(dest_dir, 'Manifest')
        # Binary mode: store the manifest exactly as it was received
        download_file(manifest_url, local_manifest_path, "wb")

    manifest = etree.parse(local_manifest_path)

    # A manifest without MajorVersion is rejected like an unsupported one
    version = manifest.getroot().attrib.get('MajorVersion')
    if version != "2":
        raise Exception('Only Smooth Streaming version 2 supported')

    # if some intermediate client Manifest is used, like in Rai Replay
    # then get the final manifest
    clip = manifest.find("Clip")
    if clip is not None and "Url" in clip.attrib:
        tmp_manifest_url = clip.attrib["Url"]
        try:
            tmp_manifest_data = download_file(tmp_manifest_url, None, None)
        except urllib2.HTTPError:
            # Best effort: download_file() has already reported the error,
            # keep working with the intermediate manifest.
            pass
        else:
            # download_file() returns the raw document, it has to be parsed
            # so that callers always get an ElementTree back.
            manifest = etree.ElementTree(etree.fromstring(tmp_manifest_data))
            manifest_url = tmp_manifest_url

    manifest_pattern = re.compile("/manifest$", re.IGNORECASE)
    base_url = manifest_pattern.sub("", manifest_url)

    return (manifest, base_url)
150
151
def print_manifest_info(manifest):
    """Print the streams of the manifest with their quality levels.

    For each StreamIndex element the stream index and its Type are printed,
    followed by one line per QualityLevel; the details of a quality level
    are shown only for "video" and "audio" streams. An empty line closes
    the listing.
    """
    for stream_idx, stream in enumerate(manifest.findall('.//StreamIndex')):
        stream_type = stream.attrib["Type"]

        print("Stream: %s Type: %s" % (stream_idx, stream_type))
        print("\tQuality Levels:")

        for level_idx, level in enumerate(stream.findall("QualityLevel")):
            attrs = level.attrib
            bitrate = attrs["Bitrate"]
            fourcc = attrs["FourCC"]

            if stream_type == "video":
                size = "%sx%s" % (attrs["MaxWidth"], attrs["MaxHeight"])
                print("\t%2s: %4s %10s @ %7s bps" %
                      (level_idx, fourcc, size, bitrate))

            if stream_type == "audio":
                channels = attrs["Channels"]
                sampling_rate = attrs["SamplingRate"]
                bits_per_sample = attrs["BitsPerSample"]
                print("\t%2s: %4s %sHz %sbits %sch @ %7s bps" %
                      (level_idx, fourcc, sampling_rate, bits_per_sample,
                       channels, bitrate))

    print("")
179
180
def get_chunk_quality_string(stream, quality_level):
    """Return the quality part of the chunk URLs of a stream.

    stream is a StreamIndex element and quality_level the index of one of
    its QualityLevel children. The part of the stream "Url" template before
    the first '/' is returned, with {bitrate} and {CustomAttributes}
    replaced by the values of the selected quality level.
    """
    level = stream.findall("QualityLevel")[quality_level]
    bitrate = level.attrib["Bitrate"]

    # Custom attributes are rendered as "Name=Value" pairs separated by
    # commas; any trailing comma is dropped.
    pairs = ["%s=%s" % (attr.attrib["Name"], attr.attrib["Value"])
             for attr in level.findall("CustomAttributes/Attribute")]
    custom_attributes = ",".join(pairs).rstrip(',')

    # Assume URLs are in this form:
    # Url="QualityLevels({bitrate})/Fragments(video={start time})"
    # or
    # Url="QualityLevels({bitrate},{CustomAttributes})/Fragments(video={start time})"
    quality_template = stream.attrib["Url"].split('/')[0]

    with_bitrate = quality_template.replace("{bitrate}", bitrate)
    return with_bitrate.replace("{CustomAttributes}", custom_attributes)
202
203
def get_chunk_name_string(stream, chunk_time):
    """Return the name of the chunk of a stream starting at chunk_time.

    The name is the second '/' separated part of the stream "Url" template
    with {start time} replaced by chunk_time.
    """
    name_template = stream.attrib["Url"].split('/')[1]
    return name_template.replace("{start time}", str(chunk_time))
209
210
def download_chunks(base_url, manifest, stream_index, quality_level, dest_dir):
    """Download the chunks of a stream at the given quality level.

    The chunks of the StreamIndex element number stream_index are saved in
    a sub-directory of dest_dir named after the quality part of their URL;
    chunks which are already there are not downloaded again.
    Progress and the total size of the chunks are printed on stdout.
    """
    stream = manifest.findall('.//StreamIndex')[stream_index]

    chunks_quality = get_chunk_quality_string(stream, quality_level)

    chunks_dest_dir = os.path.join(dest_dir, chunks_quality)
    if not os.path.exists(chunks_dest_dir):
        os.mkdir(chunks_dest_dir, 0o755)

    chunks = stream.findall("c")
    data_size = 0
    print("\nDownloading Stream %d" % stream_index)
    # The progress line ends with a carriage return and no newline, so that
    # it gets overwritten by the next update.
    sys.stdout.write("\tChunks %10d/%-10d \r" % (0, len(chunks)))
    sys.stdout.flush()

    stream_duration = 0
    for i, chunk in enumerate(chunks):

        # A chunk without an explicit start time 't' starts where the
        # previous one ended.
        if "t" in chunk.attrib:
            chunk_time = chunk.attrib["t"]
        else:
            chunk_time = stream_duration

        # Track the end of this chunk even when 't' is given, so that a
        # following chunk carrying only 'd' gets the right start time.
        if "d" in chunk.attrib:
            stream_duration = int(chunk_time) + int(chunk.attrib["d"])

        chunk_name = get_chunk_name_string(stream, chunk_time)
        chunk_file = os.path.join(chunks_dest_dir, chunk_name)

        if not os.path.exists(chunk_file):
            chunk_url = base_url + '/' + chunks_quality + '/' + chunk_name
            data_size += len(download_file(chunk_url, chunk_file, "wb"))
        else:
            # Already downloaded: only its size is needed here
            data_size += os.path.getsize(chunk_file)

        sys.stdout.write("\tChunks %10d/%-10d \r" % (i + 1, len(chunks)))
        sys.stdout.flush()
    print("\tDownloaded size: %d" % data_size)
250
251
def rebuild_stream(manifest, stream_index, quality_level, src_dir,
                   dest_file_name, final_dest_file=None):
    """Join the payload of the downloaded chunks of a stream in one file.

    The chunks of the StreamIndex element number stream_index, at quality
    quality_level, are read from their sub-directory of src_dir and their
    payload is written to dest_file_name in manifest order.

    For "audio" streams final_dest_file is then written with a WAV header
    followed by the same data; final_dest_file defaults to dest_file_name.

    Progress and the size of the rebuilt data are printed on stdout.
    Errors from get_chunk_data() on malformed chunks are propagated.
    """
    if final_dest_file is None:
        final_dest_file = dest_file_name

    stream = manifest.findall('.//StreamIndex')[stream_index]

    chunks_quality = get_chunk_quality_string(stream, quality_level)

    chunks_src_dir = os.path.join(src_dir, chunks_quality)

    chunks = stream.findall("c")
    data_size = 0
    print("\nRebuilding Stream %d" % stream_index)
    # The progress line ends with a carriage return and no newline, so that
    # it gets overwritten by the next update.
    sys.stdout.write("\tChunks %10d/%-10d \r" % (0, len(chunks)))
    sys.stdout.flush()

    # The with statement closes (and flushes) the output for every stream
    # type, also when a chunk is missing or malformed.
    with open(dest_file_name, "wb") as dest_file:
        stream_duration = 0
        for i, chunk in enumerate(chunks):

            # A chunk without an explicit start time 't' starts where the
            # previous one ended.
            if "t" in chunk.attrib:
                chunk_time = chunk.attrib["t"]
            else:
                chunk_time = stream_duration

            # Track the end of this chunk even when 't' is given, so that a
            # following chunk carrying only 'd' gets the right start time.
            if "d" in chunk.attrib:
                stream_duration = int(chunk_time) + int(chunk.attrib["d"])

            chunk_name = get_chunk_name_string(stream, chunk_time)
            chunk_file = os.path.join(chunks_src_dir, chunk_name)

            with open(chunk_file, "rb") as f:
                data = get_chunk_data(f.read())
            dest_file.write(data)
            data_size += len(data)

            sys.stdout.write("\tChunks %10d/%-10d \r" % (i + 1, len(chunks)))
            sys.stdout.flush()

    # Add a nice WAV header
    if stream.attrib['Type'] == "audio":
        quality = stream.findall("QualityLevel")[quality_level]
        codec_private_data = quality.attrib['CodecPrivateData']

        fmt = {}
        fmt['wFormatTag'] = int(quality.attrib['AudioTag'])
        fmt['nChannels'] = int(quality.attrib['Channels'])
        fmt['nSamplesPerSec'] = int(quality.attrib['SamplingRate'])
        # explicit integer division, the value is packed as an integer
        fmt['nAvgBytesPerSec'] = int(quality.attrib['Bitrate']) // 8
        fmt['wBitsPerSample'] = int(quality.attrib['BitsPerSample'])
        fmt['nBlockAlign'] = int(quality.attrib['PacketSize'])
        fmt['cbSize'] = 0

        # Read the raw data back before opening the final file: the two
        # names may refer to the same file, which opening for writing
        # would truncate.
        with open(dest_file_name, "rb") as raw_file:
            raw_data = raw_file.read()

        with open(final_dest_file, "wb") as wav_file:
            write_wav_header(wav_file, fmt, codec_private_data, data_size)
            wav_file.write(raw_data)

    print("")
    print("Stream %d, actual data size: %d\n" % (stream_index, data_size))
316
317
def calc_tracks_delay(manifest, stream1_index, stream2_index):
    """Return the delay in seconds of stream 2 relative to stream 1.

    The delay is the difference between the start times ('t' attribute of
    the first 'c' element) of the two StreamIndex elements, each one
    divided by the TimeScale of its stream.

    Returns 0 when the delay cannot be calculated: a stream has no
    TimeScale attribute, has no chunks, or its first chunk has no 't'.
    """
    streams = manifest.findall('.//StreamIndex')

    s1 = streams[stream1_index]
    s2 = streams[stream2_index]

    # Look into .attrib: "in" applied to the element itself would test its
    # child elements, never its attributes.
    if "TimeScale" not in s1.attrib or "TimeScale" not in s2.attrib:
        return 0

    s1_start_chunk = s1.find("c")
    s2_start_chunk = s2.find("c")

    if s1_start_chunk is None or s2_start_chunk is None:
        return 0

    if "t" not in s1_start_chunk.attrib \
       or "t" not in s2_start_chunk.attrib:
        return 0

    s1_start_time = int(s1_start_chunk.attrib['t'])
    s2_start_time = int(s2_start_chunk.attrib['t'])

    s1_timescale = float(s1.attrib['TimeScale'])
    s2_timescale = float(s2.attrib['TimeScale'])

    # calc difference in seconds
    delay = s2_start_time / s2_timescale - \
        s1_start_time / s1_timescale

    return delay
345
346
def get_clip_duration(manifest):
    """Return the Duration attribute of the manifest root, in seconds.

    The value is converted using the default timescale of 10000000 units
    per second.
    """
    # TODO: use <Clip ClipBegin="" ClipEnd=""> if Duration is not available
    default_timescale = 10000000
    root_attributes = manifest.getroot().attrib

    return float(root_attributes['Duration']) / default_timescale
352
353
def smooth_download(url, manifest, dest_dir,
                    video_stream_index=0, audio_stream_index=1,
                    video_quality_level=0, audio_quality_level=0,
                    chunks_dir=None, download=True,
                    out_video_file='_video.vc1', out_audio_file='_audio.raw'):
    """Download and rebuild one video and one audio stream.

    The chunks of the two selected streams are downloaded from url into
    chunks_dir (dest_dir when chunks_dir is None), unless download is
    false, and then joined into out_video_file and out_audio_file inside
    dest_dir; the audio is also written with a WAV header to the audio
    file name plus '.wav'.

    Finally an ffmpeg command line for muxing the two files is printed; it
    is not executed.
    """
    if chunks_dir is None:
        chunks_dir = dest_dir

    if download:
        # video first, then audio
        for index, level in ((video_stream_index, video_quality_level),
                             (audio_stream_index, audio_quality_level)):
            download_chunks(url, manifest, index, level, chunks_dir)

    dest_video = os.path.join(dest_dir, out_video_file)
    dest_audio = os.path.join(dest_dir, out_audio_file)
    wav_audio = dest_audio + '.wav'

    rebuild_stream(manifest, video_stream_index, video_quality_level,
                   chunks_dir, dest_video)
    rebuild_stream(manifest, audio_stream_index, audio_quality_level,
                   chunks_dir, dest_audio, wav_audio)

    # duration = get_clip_duration(manifest)

    delay = calc_tracks_delay(manifest, video_stream_index,
                              audio_stream_index)

    # optionally encode audio to vorbis:
    # ffmpeg -i _audio.raw.wav -acodec libvorbis -aq 60 audio.ogg
    mux_lines = [
        "ffmpeg -i %s \\" % dest_video,
        "  -itsoffset %f -async 1 -i %s \\" % (delay, wav_audio),
        "  -vcodec copy -acodec copy ffout.mkv",
    ]

    print("\n".join(mux_lines))
388
389     print mux_command
390
391
def options_parser():
    """Build the OptionParser for the command line of the program.

    The options are added in three groups: boolean flags, directories and
    integer indexes of streams and quality levels.
    """
    parser = OptionParser(
        usage="usage: %prog [options] <manifest URL or file>",
        version="%%prog %s" % __version__,
        description=__description__,
        epilog=__author_info__)

    # (short name, long name, action, dest, default, help)
    flag_options = (
        ("-i", "--info", "store_true", "info_only", False,
         "print Manifest info and exit"),
        ("-m", "--manifest-only", "store_true", "manifest_only", False,
         "download Manifest file and exit"),
        ("-n", "--no-download", "store_false", "download", True,
         "disable downloading chunks"),
        ("-s", "--sync-delay", "store_true", "sync_delay", False,
         "show the sync delay between the given streams and exit"),
    )
    for short, long_name, action, dest, default, help_text in flag_options:
        parser.add_option(short, long_name, action=action, dest=dest,
                          default=default, help=help_text)

    # (short name, long name, dest, default, help)
    dir_options = (
        ("-d", "--dest-dir", "dest_dir", tempfile.gettempdir(),
         "destination directory"),
        ("-c", "--chunks-dir", "chunks_dir", None,
         "directory containing chunks, if different from destination dir"),
    )
    for short, long_name, dest, default, help_text in dir_options:
        parser.add_option(short, long_name, metavar="<dir>", dest=dest,
                          default=default, help=help_text)

    # (short name, long name, dest, default, help)
    index_options = (
        ("-v", "--video-stream", "video_stream_index", 0,
         "index of the video stream"),
        ("-a", "--audio-stream", "audio_stream_index", 1,
         "index of the audio stream"),
        ("-q", "--video-quality", "video_quality_level", 0,
         "index of the video quality level"),
        ("-Q", "--audio-quality", "audio_quality_level", 0,
         "index of the audio quality level"),
    )
    for short, long_name, dest, default, help_text in index_options:
        parser.add_option(short, long_name, metavar="<n>", type="int",
                          dest=dest, default=default, help=help_text)

    return parser
429
430
def main():
    """Parse the command line and run the requested action.

    Exactly one argument, the manifest URL or file, is required. After
    getting the manifest the program either exits (--manifest-only),
    prints the sync delay (--sync-delay), prints the manifest info
    (--info), or prints the info and goes on with download and rebuild.
    """
    parser = options_parser()
    options, args = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        parser.exit(1)

    dest_dir = options.dest_dir
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir, 0o755)

    manifest, base_url = get_manifest(args[0], dest_dir)

    if options.manifest_only:
        parser.exit(0)

    if options.sync_delay:
        delay = calc_tracks_delay(manifest,
                                  options.video_stream_index,
                                  options.audio_stream_index)
        print(delay)
        parser.exit(0)

    # The manifest info is shown in any case, --info only stops here
    print_manifest_info(manifest)
    if options.info_only:
        parser.exit(0)

    smooth_download(base_url, manifest, dest_dir,
                    options.video_stream_index, options.audio_stream_index,
                    options.video_quality_level, options.audio_quality_level,
                    options.chunks_dir, options.download)
464
465
# Run main() only when the file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()