smooth-dl.py: improve getting the manifest of Clip elements
[smooth-dl.git] / smooth-dl.py
index 36ba345..a496bc7 100755 (executable)
 # basically, write a proper implementation of manifest parsing and chunk
 # downloading
 
 # basically, write a proper implementation of manifest parsing and chunk
 # downloading
 
-
-__description = "Download videos served using Smooth Streaming technology"
-__version = "0.x"
-__author_info = "Written by Antonio Ospite http://ao2.it"
-
 import os
 import os
+import re
 import sys
 import xml.etree.ElementTree as etree
 import urllib2
 import struct
 import tempfile
 from optparse import OptionParser
 import sys
 import xml.etree.ElementTree as etree
 import urllib2
 import struct
 import tempfile
 from optparse import OptionParser
+from urlparse import urlparse, urlunparse
+
+__description__ = "Download videos served using Smooth Streaming technology"
+__version__ = "0.x"
+__author_info__ = "Written by Antonio Ospite http://ao2.it"
 
 
 def get_chunk_data(data):
 
 
 def get_chunk_data(data):
@@ -52,10 +53,10 @@ def get_chunk_data(data):
     data_start = moof_size + 4 + len('mdat')
     data_size = mdat_size - 4 - len('mdat')
 
     data_start = moof_size + 4 + len('mdat')
     data_size = mdat_size - 4 - len('mdat')
 
-    #print len(data[data_start:]), \
+    # print len(data[data_start:]), \
     #        len(data[data_start:data_start + data_size]), data_size
 
     #        len(data[data_start:data_start + data_size]), data_size
 
-    assert(len(data[data_start:]) == data_size)
+    assert len(data[data_start:]) == data_size
 
     return data[data_start:data_start + data_size]
 
 
     return data[data_start:data_start + data_size]
 
@@ -63,7 +64,7 @@ def get_chunk_data(data):
 def hexstring_to_bytes(hex_string):
     res = ""
     for i in range(0, len(hex_string), 2):
 def hexstring_to_bytes(hex_string):
     res = ""
     for i in range(0, len(hex_string), 2):
-            res += chr(int(hex_string[i:i + 2], 16))
+        res += chr(int(hex_string[i:i + 2], 16))
 
     return res
 
 
     return res
 
@@ -92,53 +93,68 @@ def write_wav_header(out_file, fmt, codec_private_data, data_len):
     out_file.write(struct.pack('<L', data_len))
 
 
     out_file.write(struct.pack('<L', data_len))
 
 
-def get_manifest(base_url, dest_dir=tempfile.gettempdir(),
-        manifest_file='Manifest'):
-    """Returns the manifest and the new URL if this is changed"""
+def download_file(src_url, dest_file, mode):
+    try:
+        response = urllib2.urlopen(src_url)
+        data = response.read()
+    except urllib2.HTTPError:
+        sys.stderr.write("Error while dowloading URL: %s" % src_url)
+        raise
 
 
-    if os.path.exists(dest_dir) == False:
-        os.mkdir(dest_dir, 0755)
+    if dest_file:
+        f = open(dest_file, mode)
+        f.write(data)
+        f.close()
 
 
-    if base_url.startswith('http://'):
+    return data
 
 
-        manifest_url = base_url
-        if not manifest_url.lower().endswith(('/manifest', '.ismc', '.csm')):
-            manifest_url += '/Manifest'
 
 
-        response = urllib2.urlopen(manifest_url)
-        data = response.read()
+def get_manifest(url, dest_dir):
+    """Returns the manifest element and the base content URL"""
 
 
-        manifest_path = os.path.join(dest_dir, manifest_file)
-        f = open(manifest_path, "w")
-        f.write(data)
-        f.close()
+    # Remove the querystring if present
+    manifest_url = urlunparse(urlparse(url)._replace(query=''))
+
+    if not manifest_url.lower().endswith(('/manifest', '.ismc', '.csm')):
+        manifest_url += '/Manifest'
+
+    if os.path.exists(url):
+        local_manifest_path = url
     else:
     else:
-        manifest_path = base_url
+        local_manifest_path = os.path.join(dest_dir, 'Manifest')
+        download_file(manifest_url, local_manifest_path, "w")
 
 
-    manifest = etree.parse(manifest_path)
+    manifest = etree.parse(local_manifest_path)
 
     version = manifest.getroot().attrib['MajorVersion']
     if version != "2":
         raise Exception('Only Smooth Streaming version 2 supported')
 
 
     version = manifest.getroot().attrib['MajorVersion']
     if version != "2":
         raise Exception('Only Smooth Streaming version 2 supported')
 
-    try:
-        # if some intermediate client Manifest is used, like in Rai Replay
-        clip = manifest.find("Clip")
-        actual_manifest_url = clip.attrib["Url"]
-        base_url = actual_manifest_url.lower().replace("/manifest", "")
-    except:
-        pass
+    # if some intermediate client Manifest is used, like in Rai Replay
+    # then get the final manifest
+    clip = manifest.find("Clip")
+    if clip is not None and "Url" in clip.attrib:
+        tmp_manifest_url = clip.attrib["Url"]
+        try:
+            tmp_manifest = download_file(tmp_manifest_url, None, None)
+            # set the new values only if the download succeeded
+            manifest_url = tmp_manifest_url
+            manifest = tmp_manifest
+        except urllib2.HTTPError:
+            pass
+
+    manifest_pattern = re.compile("/manifest$", re.IGNORECASE)
+    base_url = manifest_pattern.sub("", manifest_url)
 
     return (manifest, base_url)
 
 
 def print_manifest_info(manifest):
 
 
     return (manifest, base_url)
 
 
 def print_manifest_info(manifest):
 
-    streams = manifest.findall('//StreamIndex')
+    streams = manifest.findall('.//StreamIndex')
 
     for i, s in enumerate(streams):
         stream_type = s.attrib["Type"]
 
     for i, s in enumerate(streams):
         stream_type = s.attrib["Type"]
-        url = s.attrib["Url"]
 
         print "Stream: %s Type: %s" % (i, stream_type)
 
 
         print "Stream: %s Type: %s" % (i, stream_type)
 
@@ -155,29 +171,50 @@ def print_manifest_info(manifest):
                 channels = q.attrib["Channels"]
                 sampling_rate = q.attrib["SamplingRate"]
                 bits_per_sample = q.attrib["BitsPerSample"]
                 channels = q.attrib["Channels"]
                 sampling_rate = q.attrib["SamplingRate"]
                 bits_per_sample = q.attrib["BitsPerSample"]
-                print "\t%2s: %4s %sHz %sbits %sch @ %7s bps" % (i, fourcc,
-                        sampling_rate, bits_per_sample, channels, bitrate)
+                print "\t%2s: %4s %sHz %sbits %sch @ %7s bps" % \
+                    (i, fourcc, sampling_rate, bits_per_sample, channels,
+                     bitrate)
 
     print
 
 
 
     print
 
 
-def download_chunks(base_url, manifest, stream_index, quality_level, dest_dir):
-
-    if os.path.exists(dest_dir) == False:
-        os.mkdir(dest_dir, 0755)
-
-    stream = manifest.findall('//StreamIndex')[stream_index]
-
+def get_chunk_quality_string(stream, quality_level):
     quality = stream.findall("QualityLevel")[quality_level]
     bitrate = quality.attrib["Bitrate"]
 
     quality = stream.findall("QualityLevel")[quality_level]
     bitrate = quality.attrib["Bitrate"]
 
+    quality_attributes = quality.findall("CustomAttributes/Attribute")
+    custom_attributes = ""
+    for i in quality_attributes:
+        custom_attributes += "%s=%s," % (i.attrib["Name"], i.attrib["Value"])
+    custom_attributes = custom_attributes.rstrip(',')
+
     # Assume URLs are in this form:
     # Url="QualityLevels({bitrate})/Fragments(video={start time})"
     # Assume URLs are in this form:
     # Url="QualityLevels({bitrate})/Fragments(video={start time})"
+    # or
+    # Url="QualityLevels({bitrate},{CustomAttributes})/Fragments(video={start time})"
     url = stream.attrib["Url"]
 
     chunks_quality = url.split('/')[0].replace("{bitrate}", bitrate)
     url = stream.attrib["Url"]
 
     chunks_quality = url.split('/')[0].replace("{bitrate}", bitrate)
+    chunks_quality = chunks_quality.replace("{CustomAttributes}",
+                                            custom_attributes)
+
+    return chunks_quality
+
+
+def get_chunk_name_string(stream, chunk_time):
+    url = stream.attrib["Url"]
+    chunk_name = url.split('/')[1].replace("{start time}", str(chunk_time))
+
+    return chunk_name
+
+
+def download_chunks(base_url, manifest, stream_index, quality_level, dest_dir):
+    stream = manifest.findall('.//StreamIndex')[stream_index]
+
+    chunks_quality = get_chunk_quality_string(stream, quality_level)
+
     chunks_dest_dir = os.path.join(dest_dir, chunks_quality)
     chunks_dest_dir = os.path.join(dest_dir, chunks_quality)
-    if os.path.exists(chunks_dest_dir) == False:
+    if not os.path.exists(chunks_dest_dir):
         os.mkdir(chunks_dest_dir, 0755)
 
     chunks = stream.findall("c")
         os.mkdir(chunks_dest_dir, 0755)
 
     chunks = stream.findall("c")
@@ -185,20 +222,22 @@ def download_chunks(base_url, manifest, stream_index, quality_level, dest_dir):
     print "\nDownloading Stream %d" % stream_index
     print "\tChunks %10d/%-10d" % (0, len(chunks)), "\r",
     sys.stdout.flush()
     print "\nDownloading Stream %d" % stream_index
     print "\tChunks %10d/%-10d" % (0, len(chunks)), "\r",
     sys.stdout.flush()
-    for i, c in enumerate(chunks):
-        t = c.attrib["t"]
 
 
-        chunk_name = url.split('/')[1].replace("{start time}", t)
-        chunk_file = os.path.join(dest_dir,  chunks_quality, chunk_name)
+    stream_duration = 0
+    for i, chunk in enumerate(chunks):
 
 
-        if os.path.exists(chunk_file) == False:
-            chunk_url = base_url + '/' + chunks_quality + '/' + chunk_name
-            response = urllib2.urlopen(chunk_url)
-            data = response.read()
+        if "t" in chunk.attrib:
+            chunk_time = chunk.attrib["t"]
+        elif "d" in chunk.attrib:
+            chunk_time = stream_duration
+            stream_duration = chunk_time + int(chunk.attrib["d"])
 
 
-            f = open(chunk_file, "wb")
-            f.write(data)
-            f.close()
+        chunk_name = get_chunk_name_string(stream, chunk_time)
+        chunk_file = os.path.join(dest_dir, chunks_quality, chunk_name)
+
+        if not os.path.exists(chunk_file):
+            chunk_url = base_url + '/' + chunks_quality + '/' + chunk_name
+            data = download_file(chunk_url, chunk_file, "wb")
         else:
             f = open(chunk_file, "rb")
             data = f.read()
         else:
             f = open(chunk_file, "rb")
             data = f.read()
@@ -211,21 +250,15 @@ def download_chunks(base_url, manifest, stream_index, quality_level, dest_dir):
 
 
 def rebuild_stream(manifest, stream_index, quality_level, src_dir,
 
 
 def rebuild_stream(manifest, stream_index, quality_level, src_dir,
-        dest_file_name, final_dest_file=None):
+                   dest_file_name, final_dest_file=None):
 
 
-    if final_dest_file == None:
+    if final_dest_file is None:
         final_dest_file = dest_file_name
 
         final_dest_file = dest_file_name
 
-    stream = manifest.findall('//StreamIndex')[stream_index]
-
-    quality = stream.findall("QualityLevel")[quality_level]
-    bitrate = quality.attrib["Bitrate"]
+    stream = manifest.findall('.//StreamIndex')[stream_index]
 
 
-    # Assume URLs are in this form:
-    # Url="QualityLevels({bitrate})/Fragments(video={start time})"
-    url = stream.attrib["Url"]
+    chunks_quality = get_chunk_quality_string(stream, quality_level)
 
 
-    chunks_quality = url.split('/')[0].replace("{bitrate}", bitrate)
     chunks_src_dir = os.path.join(src_dir, chunks_quality)
 
     dest_file = open(dest_file_name, "wb")
     chunks_src_dir = os.path.join(src_dir, chunks_quality)
 
     dest_file = open(dest_file_name, "wb")
@@ -235,10 +268,17 @@ def rebuild_stream(manifest, stream_index, quality_level, src_dir,
     print "\nRebuilding Stream %d" % stream_index
     print "\tChunks %10d/%-10d" % (0, len(chunks)), "\r",
     sys.stdout.flush()
     print "\nRebuilding Stream %d" % stream_index
     print "\tChunks %10d/%-10d" % (0, len(chunks)), "\r",
     sys.stdout.flush()
-    for i, c in enumerate(chunks):
-        t = c.attrib["t"]
 
 
-        chunk_name = url.split('/')[1].replace("{start time}", t)
+    stream_duration = 0
+    for i, chunk in enumerate(chunks):
+
+        if "t" in chunk.attrib:
+            chunk_time = chunk.attrib["t"]
+        elif "d" in chunk.attrib:
+            chunk_time = stream_duration
+            stream_duration = chunk_time + int(chunk.attrib["d"])
+
+        chunk_name = get_chunk_name_string(stream, chunk_time)
         chunk_file = os.path.join(chunks_src_dir, chunk_name)
 
         f = open(chunk_file, "rb")
         chunk_file = os.path.join(chunks_src_dir, chunk_name)
 
         f = open(chunk_file, "rb")
@@ -251,6 +291,7 @@ def rebuild_stream(manifest, stream_index, quality_level, src_dir,
 
     # Add a nice WAV header
     if stream.attrib['Type'] == "audio":
 
     # Add a nice WAV header
     if stream.attrib['Type'] == "audio":
+        quality = stream.findall("QualityLevel")[quality_level]
         codec_private_data = quality.attrib['CodecPrivateData']
 
         fmt = {}
         codec_private_data = quality.attrib['CodecPrivateData']
 
         fmt = {}
@@ -275,14 +316,21 @@ def rebuild_stream(manifest, stream_index, quality_level, src_dir,
 
 
 def calc_tracks_delay(manifest, stream1_index, stream2_index):
 
 
 def calc_tracks_delay(manifest, stream1_index, stream2_index):
-    streams = manifest.findall('//StreamIndex')
+    streams = manifest.findall('.//StreamIndex')
 
     s1 = streams[stream1_index]
     s2 = streams[stream2_index]
 
 
     s1 = streams[stream1_index]
     s2 = streams[stream2_index]
 
+    if "TimeScale" not in s1 or "TimeScale" not in s2:
+        return 0
+
     s1_start_chunk = s1.find("c")
     s2_start_chunk = s2.find("c")
 
     s1_start_chunk = s1.find("c")
     s2_start_chunk = s2.find("c")
 
+    if "t" not in s1_start_chunk.attrib \
+       or "t" not in s2_start_chunk.attrib:
+        return 0
+
     s1_start_time = int(s1_start_chunk.attrib['t'])
     s2_start_time = int(s2_start_chunk.attrib['t'])
 
     s1_start_time = int(s1_start_chunk.attrib['t'])
     s2_start_time = int(s2_start_chunk.attrib['t'])
 
@@ -291,7 +339,7 @@ def calc_tracks_delay(manifest, stream1_index, stream2_index):
 
     # calc difference in seconds
     delay = s2_start_time / s2_timescale - \
 
     # calc difference in seconds
     delay = s2_start_time / s2_timescale - \
-            s1_start_time / s1_timescale
+        s1_start_time / s1_timescale
 
     return delay
 
 
     return delay
 
@@ -303,49 +351,49 @@ def get_clip_duration(manifest):
     return float(duration) / 10000000  # here is the default timescale
 
 
     return float(duration) / 10000000  # here is the default timescale
 
 
-def smooth_download(url, manifest, dest_dir=tempfile.gettempdir(),
-        video_stream_index=0, audio_stream_index=1,
-        video_quality_level=0, audio_quality_level=0,
-        chunks_dir=None, download=True,
-        out_video_file='_video.vc1', out_audio_file='_audio.raw'):
+def smooth_download(url, manifest, dest_dir,
+                    video_stream_index=0, audio_stream_index=1,
+                    video_quality_level=0, audio_quality_level=0,
+                    chunks_dir=None, download=True,
+                    out_video_file='_video.vc1', out_audio_file='_audio.raw'):
 
 
-        if chunks_dir == None:
-            chunks_dir = dest_dir
+    if chunks_dir is None:
+        chunks_dir = dest_dir
 
 
-        if download:
-            download_chunks(url, manifest, video_stream_index,
-                    video_quality_level, chunks_dir)
-            download_chunks(url, manifest, audio_stream_index,
-                    audio_quality_level, chunks_dir)
+    if download:
+        download_chunks(url, manifest, video_stream_index,
+                        video_quality_level, chunks_dir)
+        download_chunks(url, manifest, audio_stream_index,
+                        audio_quality_level, chunks_dir)
 
 
-        dest_video = os.path.join(dest_dir, out_video_file)
-        dest_audio = os.path.join(dest_dir, out_audio_file)
+    dest_video = os.path.join(dest_dir, out_video_file)
+    dest_audio = os.path.join(dest_dir, out_audio_file)
 
 
-        rebuild_stream(manifest, video_stream_index, video_quality_level,
-                chunks_dir, dest_video)
-        rebuild_stream(manifest, audio_stream_index, audio_quality_level,
-                chunks_dir, dest_audio, dest_audio + '.wav')
+    rebuild_stream(manifest, video_stream_index, video_quality_level,
+                   chunks_dir, dest_video)
+    rebuild_stream(manifest, audio_stream_index, audio_quality_level,
+                   chunks_dir, dest_audio, dest_audio + '.wav')
 
 
-        #duration = get_clip_duration(manifest)
+    duration = get_clip_duration(manifest)
 
 
-        delay = calc_tracks_delay(manifest, video_stream_index,
-                audio_stream_index)
+    delay = calc_tracks_delay(manifest, video_stream_index,
+                              audio_stream_index)
 
 
-        # optionally encode audio to vorbis:
-        # ffmpeg -i _audio.raw.wav -acodec libvorbis -aq 60 audio.ogg
-        mux_command = ("ffmpeg -i %s \\\n" +
-                      "  -itsoffset %f -async 1 -i %s \\\n" +
-                      "  -vcodec copy -acodec copy ffout.mkv") % \
-                      (dest_video, delay, dest_audio + '.wav')
+    # optionally encode audio to vorbis:
+    # ffmpeg -i _audio.raw.wav -acodec libvorbis -aq 60 audio.ogg
+    mux_command = ("ffmpeg -i %s \\\n" +
+                   "  -itsoffset %f -async 1 -i %s \\\n" +
+                   "  -vcodec copy -acodec copy ffout.mkv") % \
+        (dest_video, delay, dest_audio + '.wav')
 
 
-        print mux_command
+    print mux_command
 
 
 def options_parser():
 
 
 def options_parser():
-    version = "%%prog %s" % __version
+    version = "%%prog %s" % __version__
     usage = "usage: %prog [options] <manifest URL or file>"
     parser = OptionParser(usage=usage, version=version,
     usage = "usage: %prog [options] <manifest URL or file>"
     parser = OptionParser(usage=usage, version=version,
-            description=__description, epilog=__author_info)
+                          description=__description__, epilog=__author_info__)
     parser.add_option("-i", "--info",
                       action="store_true", dest="info_only",
                       default=False, help="print Manifest info and exit")
     parser.add_option("-i", "--info",
                       action="store_true", dest="info_only",
                       default=False, help="print Manifest info and exit")
@@ -364,7 +412,7 @@ def options_parser():
     parser.add_option("-c", "--chunks-dir", metavar="<dir>",
                       dest="chunks_dir", default=None,
                       help="directory containing chunks, if different from destination dir")
     parser.add_option("-c", "--chunks-dir", metavar="<dir>",
                       dest="chunks_dir", default=None,
                       help="directory containing chunks, if different from destination dir")
-    parser.add_option("-v", "--video-stream",  metavar="<n>",
+    parser.add_option("-v", "--video-stream", metavar="<n>",
                       type="int", dest="video_stream_index", default=0,
                       help="index of the video stream")
     parser.add_option("-a", "--audio-stream", metavar="<n>",
                       type="int", dest="video_stream_index", default=0,
                       help="index of the video stream")
     parser.add_option("-a", "--audio-stream", metavar="<n>",
@@ -380,8 +428,7 @@ def options_parser():
     return parser
 
 
     return parser
 
 
-if __name__ == "__main__":
-
+def main():
     parser = options_parser()
     (options, args) = parser.parse_args()
 
     parser = options_parser()
     (options, args) = parser.parse_args()
 
@@ -389,6 +436,9 @@ if __name__ == "__main__":
         parser.print_help()
         parser.exit(1)
 
         parser.print_help()
         parser.exit(1)
 
+    if not os.path.exists(options.dest_dir):
+        os.mkdir(options.dest_dir, 0755)
+
     url = args[0]
     manifest, url = get_manifest(url, options.dest_dir)
 
     url = args[0]
     manifest, url = get_manifest(url, options.dest_dir)
 
@@ -397,8 +447,8 @@ if __name__ == "__main__":
 
     if options.sync_delay:
         print calc_tracks_delay(manifest,
 
     if options.sync_delay:
         print calc_tracks_delay(manifest,
-                options.video_stream_index,
-                options.audio_stream_index)
+                                options.video_stream_index,
+                                options.audio_stream_index)
         parser.exit(0)
 
     if options.info_only:
         parser.exit(0)
 
     if options.info_only:
@@ -408,6 +458,10 @@ if __name__ == "__main__":
     print_manifest_info(manifest)
 
     smooth_download(url, manifest, options.dest_dir,
     print_manifest_info(manifest)
 
     smooth_download(url, manifest, options.dest_dir,
-            options.video_stream_index, options.audio_stream_index,
-            options.video_quality_level, options.audio_quality_level,
-            options.chunks_dir, options.download)
+                    options.video_stream_index, options.audio_stream_index,
+                    options.video_quality_level, options.audio_quality_level,
+                    options.chunks_dir, options.download)
+
+
+if __name__ == "__main__":
+    main()