# -*- coding: ascii -*-

###########################################################################
# clive, video extraction utility
# Copyright (C) 2007-2008 Toni Gundogdu
#
# This file is part of clive.
#
# clive is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# clive is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with clive.  If not, see <http://www.gnu.org/licenses/>.
###########################################################################

## Classes for parsing video page HTML

# Do not cache these imports: they are not used in any other files
import urllib
import re
from sgmllib import SGMLParser

# Only names that this module actually defines may be listed here. 'Video'
# used to be listed as well, but no such name exists in this module, which
# makes a star-import of the module fail with AttributeError.
__all__ = ['PageParser']

from clive.path import ConfigDir
from clive.unicode import tostr, touni
from clive.modules import Modules
from clive.opts import Options
from clive.error import CliveError, CliveNoMediaError

## Parses video page title
class TitleParser(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.title = ''
        self._in_title = 0

    def start_title(self, attr):
        self._in_title = 1

    def end_title(self):
        self._in_title = 0

    def handle_data(self, data):
        if self._in_title:
            self.title += data

## The class for host specific video page HTML parsing
class PageParser:

    ## Constructor
    def __init__(self):
        """Fetch the needed modules via Modules and build the host table."""
        modules = Modules()
        grabber = modules.getinst('urlgrabber')
        (self._URLGrabber, self._URLGrabError) = grabber
        self._os = modules.getinst('os')
        # Each entry is a tuple of:
        #   - substring looked up in the (lower-cased) page URL
        #   - short host id, stored as 'v_host' in the video info
        #   - method that extracts the video URL from the page
        self._supported_hosts = [
            ('youtube.com', 'ytube', self._parse_ytube),
            ('video.google.', 'vgoogle', self._parse_vgoogle),
            ('dailymotion.', 'dmotion', self._parse_dmotion),
            ('guba.com', 'guba', self._parse_guba),
            ('metacafe.', 'metac', self._parse_metacafe),
            ('sevenload.com', 'sevenl', self._parse_sevenload),
            #('myvideo.', 'myvideo', self._parse_myvideo),
            ('break.com', 'break', self._parse_break),
        ]

    ## Parses a video page data (HTML)
    def parse(self, url_data, batch, proxy):
        """Build the video info dictionary for one fetched video page.

        url_data -- dictionary describing the page. Keys read here:
                    'callb_say', 'callb_filelen', 'page_title',
                    'page_data', 'page_url', 'page_xurl', 'video_id',
                    'video_lowq' and 'file_length'
        batch    -- video info dictionaries parsed so far; only used to
                    avoid duplicate output filenames
        proxy    -- passed on as 'proxies' to URLGrabber when a parser
                    needs to fetch an additional page

        Returns a dictionary with the keys 'page_title', 'url', 'xurl',
        'v_id', 'v_host', 'length', 'length_bytes' and 'low_quality',
        plus the keys added by _get_filename().

        Raises CliveError if no extraction URL could be determined.
        """
        self._say = url_data['callb_say']
        self._opts = Options()._opts
        self._proxy = proxy
        if len(url_data['page_title']) == 0:
            p = TitleParser()
            p.feed(url_data['page_data'])
            p.close()
            title = touni(p.title)
        else:
            title = url_data['page_title']
        xurl = '' # video eXtraction URL
        v_id = ''
        v_host = ''
        low_quality = url_data['video_lowq'] # TODO: CLEANUP
        page_url = url_data['page_url']
        for (site, host_id, func) in self._supported_hosts:
            if site not in page_url.lower():
                continue
            v_host = host_id
            if len(url_data['page_xurl']) > 0:
                xurl = url_data['page_xurl'] # From cache
                v_id = url_data['video_id']
            else:
                (xurl, v_id, low_quality) = \
                    func(page_url, url_data['page_data'], low_quality)
            break
        if len(xurl) == 0:
            raise CliveError('error: extraction url not found')
        if url_data['file_length'] == -1:
            try:
                length = url_data['callb_filelen'](xurl)
            except CliveNoMediaError: # mp4 not avail., fallback to flv
                self._say('warn: no-media: switched to low-quality')
                # Remove the exact '&fmt=18' suffix only. str.strip() would
                # treat the argument as a set of characters and could eat
                # the tail of the preceding parameter value as well.
                hq_suffix = '&fmt=18'
                if xurl.endswith(hq_suffix):
                    xurl = xurl[:-len(hq_suffix)]
                length = url_data['callb_filelen'](xurl)
                low_quality = 1 # !
        else:
            l = url_data['file_length']
            length = ('%.2fMB' % (float(l) / (1024*1024)), l)
        v_info = {
            'page_title':title,
            'url':page_url,
            'xurl':xurl,
            'v_id':v_id,
            'v_host':v_host,
            'length':length[0],
            'length_bytes':length[1],
            'low_quality':low_quality}
        self._get_filename(v_info, batch)
        return v_info

    def _parse_ytube(self, url, data, low_quality):
        # More headache than use. Ignorant of non-English errors.
        """
        a = [
        ('verify you are 18','age verification: use --youtube'),
        ('no longer available','not available'),
        #('has been removed','video removed'),
        ('video is unavailable','video unavailable'),
        ('malformed video id','url contains malformed video id'),
        ('not available in your country','country restriction/censorship'),
        #('private video','private video')]
        ]
        for (lookup, errmsg) in a:
            if lookup in data.lower():
                raise CliveError('error: ' + errmsg)
        """                
        try: # &video_id param
            video_id = \
                self._parse_from_to(data, 'video_id=', '&', skip_from=1)
            video_id = video_id.replace("'", "")
            if len(video_id) == 0: raise CliveError()
        except:
            raise CliveError('error: extraction url (&video_id) not found')
        try: # &t param
            t = self._parse_from_to(data, '&t=', '&', skip_from=1)
            t = t.replace("'", "")
            if len(t) == 0: raise CliveError()
        except:
            raise CliveError('error: extraction url (&t) not found')
        url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (
            video_id, t)
        if not self._opts.enable_low_quality: url += '&fmt=18'
        return (url, video_id, low_quality)

    def _parse_vgoogle(self, url, data, low_quality):
        """Find the Google Video extraction URL in the page HTML.

        The video id is the 'docid' parameter of the page URL; a random
        id is used when the URL has none. The flv URL is always looked
        up. Unless the enable_low_quality option is set, the mp4 link is
        preferred; when the page has no mp4 link a warning is emitted and
        the flv URL is returned with low_quality set to 1.

        Returns the tuple (extraction_url, video_id, low_quality).
        """
        try:
            vid = url.split('docid=',1)[1].split('&',1)[0]
        except IndexError:
            vid = self._random_vid()
        page = tostr(data.decode('unicode-escape','ignore'))
        # flv
        flv = self._parse_from_to(page,
            'googleplayer.swf?videoUrl\x3d', '\x26', skip_from=1)
        flv = urllib.unquote(flv).split('&thumb',1)[0]
        if self._opts.enable_low_quality:
            return (flv, vid, low_quality)
        # mp4
        mp4 = self._parse_from_to(page,
            'right-click <a href="', '"', skip_from=1)
        if len(mp4) > 0:
            return (mp4, vid, low_quality)
        self._say('warn: no-media: switched to low-quality')
        return (flv, vid, 1) # !
	
    def _parse_dmotion(self, url, data, low_quality):
        try:
            vid = url.rsplit('/',1)[1].split('_',1)[0]
        except IndexError:
            vid = self._random_vid()
        batch = self._parse_from_to(data, '"video", ', ');', skip_from=1)
        if len(batch) == 0:
            ldata = data.lower()
            a = [('content deleted','video removed'),
                ('users+have+flagged','age verification: use --dmotion')]
            for (lookup, errmsg) in a:
                if lookup in ldata:
                    raise CliveError('error: ' + errmsg)
            raise CliveError('error: extraction url not found')
        batch = urllib.unquote(batch.strip('"')).split('||')
        d = {}
        for i in batch:
            s = i.split('/',4)
            d[s[3]] = i.split('@',1)[0]
        batch = sorted(d.items(), key=lambda(k,v):(v,k))
        url = 'http://dailymotion.com'
        if self._opts.enable_low_quality:
            url += dict(batch)['320x240']
        else:
            if batch[0][0] == '320x240':
                self._say('warn: no-media: switched to low-quality')
                low_quality = 1 # !
            url += batch[0][1]
        return (url, vid, low_quality)
	
    def _parse_guba(self, url, data, low_quality):
        try:
            vid = url.split('watch/',1)[1].split('?',1)[0]
        except IndexError:
            vid = self._random_vid()
        url = self._parse_from_to(data,
            'http://free.guba.com/uploaditem/', '"')
        return (url, vid, low_quality)

    def _parse_metacafe(self, url, data, low_quality):
        if 'adult confirmation' in data.lower():
            raise CliveError('error: no-support; req. age verification')
        try:
            vid = url.split('/watch/',1)[1].split('/')[0]
        except IndexError:
            vid = self._random_vid()
        cdn = self._parse_from_to(data, '"videoCDNURL":"', '"', skip_from=1)
        cdn = cdn.replace('\\','')
        media = self._parse_from_to(data, '"mediaURL":"', '"', skip_from=1)
        media = media.replace('\\','')
        try:
            url = cdn + media.split('/ItemFiles')[1]
        except IndexError:
            raise CliveError('error: extraction url not found')
        return (url, vid, low_quality)

    def _parse_sevenload(self, url, data, low_quality):
        """Find the sevenload extraction URL.

        The video page HTML itself is not used. Instead
            http://flash.sevenload.com/player?itemId=[ID from URL]
        is fetched; the result is an XML file that contains the video
        location (and a lot of other information that is discarded atm).

        Returns the tuple (extraction_url, video_id, low_quality).
        Raises CliveError if the video id cannot be parsed from the URL.
        """
        marker = '/videos/'
        if marker in url:
            tail = url.split(marker,1)[1]
            vid = tail.split('-')[0].split('/')[0]
        else:
            pieces = url.rsplit('/',1)
            if len(pieces) < 2:
                # A random string cannot stand in for the video id here:
                # the extraction below depends on the real id.
                raise CliveError('error: no-videoid: url parsing failed')
            vid = pieces[1].split('-')[0]
        player_xml = self._fetch_page(
            'http://flash.sevenload.com/player?itemId=' + vid)
        xurl = self._parse_from_to(
            player_xml, 'video url="', '"', skip_from=1)
        return (xurl, vid, low_quality)

    def _parse_myvideo(self, url, data, low_quality):
        try:
            vid = url.split('/watch/',1)[1].split('/')[0]
        except IndexError:
            vid = self._random_vid()
        url = self._parse_from_to(data, '.swf?', '&', skip_from=1)
        return (url, vid, low_quality)

    def _parse_break(self, url, data, low_quality):
        vid = self._parse_from_to(data,
            "GlobalContentID='", "'", skip_from=1)
        path = self._parse_from_to(data,
            "ContentFilePath='", "'", skip_from=1)
        name = self._parse_from_to(data,
            "FileName='", "'", skip_from=1)            
        url = 'http://media1.break.com/dnet/media/%s/%s' % (path,name)
        if low_quality: url += '.flv'
        else: url += '.wmv'
        return (url, vid, low_quality)

    def _fetch_page(self, url):
        """Fetch a page and return its (decompressed) content.

        The request uses the http_agent and http_throttle options and the
        proxy given to parse(), and announces that gzip is accepted. The
        content is decompressed if the response says it is gzip encoded.

        url -- address of the page to fetch
        """
        g = self._URLGrabber(user_agent = self._opts.http_agent,
            http_headers = (('accept-encoding', 'gzip'),),
            throttle = self._opts.http_throttle,
            proxies = self._proxy)
        o = g.urlopen(url)
        try:
            data = o.read()
            gzipped = o.hdr.get('content-encoding') == 'gzip'
        finally:
            # Release the handle even if reading the response fails
            o.close()
        if gzipped:
            m = Modules()
            gzip = m.getinst('gzip')
            StringIO = m.getinst('StringIO')
            data = gzip.GzipFile(fileobj=StringIO(data)).read()
        return data

    def _random_vid(self):
        """Return a substitute video id: 8 hex digits from a time hash.

        Used when no video id can be parsed from the page URL. The md5
        and time modules are obtained via Modules (presumably the
        standard library ones).
        """
        md5 = Modules().getinst('md5')
        time = Modules().getinst('time')
        stamp = str(time.time())
        digest = md5.new(stamp).hexdigest()
        return digest[:8]

    def _parse_from_to(self, data, _from, to, skip_from=0):
        start = data.find(_from)
        if skip_from and start != -1:
            start = start + len(_from)
        end = data.find(to, start)
        text = ''
        if start != -1 and end != -1:
            text = data[start:end]
        return text

    def _get_filename(self, v_info, batch):
        urlg_reget = None
        skip = 0
        ext = 'flv'
        offset = 0
        if v_info['v_host'] in ['ytube','dmotion','vgoogle']:
            if not v_info['low_quality']:
                ext = 'mp4'
                if 'dailymotion.' in v_info['url'].lower():
                    end = v_info['xurl'].find('?') # copy ext from page html
                    ext = v_info['xurl'][end-3:end]
        title = v_info['page_title'].replace('YouTube -', '')
        title = title.replace('GUBA -', '')
        title = title.replace(' Video - Metacafe', '')
        try:
            if 'dailymotion.' in v_info['url'].lower():
                title = title.lstrip('Video ').split('-')[0].rstrip()
        except IndexError:
            pass
        if v_info['v_host'] == 'sevenl':
            title = v_info['page_title'].replace('Video "','')
            title = title.replace('" | sevenload','')
        if v_info['v_host'] == 'myvideo':
            title = v_info['page_title'].split('-',1)[0].rstrip()
        if v_info['v_host'] == 'break':
            title = title.rstrip('Video')
            if not v_info['low_quality']:
                ext = 'wmv'
        # Apply these title exceptions to all inspite of custom filters etc.
        title = title.replace('/', '_') # Will definitely cause issues otherwise
        # Use user-defined custom method from ~/.clive/custom.py
        if self._opts.output_filter == 'custom':
            sys = Modules().getinst('sys')
            sys.path.append(ConfigDir().dir())
            try:
                from custom import custom_output_filter
                title = custom_output_filter(title)
            except ImportError, err:
                self._say('error:%s: %s' % (ConfigDir().customfile(),err[0]))
                self._say('warn: ignoring --filter=custom, disabling filter')
        else:
            if self._opts.output_filter and \
            self._opts.output_filter.lower() != 'no':
                # Treat string as a regexp for re.sub
                title = re.sub('[^%s]' % self._opts.output_filter, '', title)
        title = title.lstrip().rstrip()
        if len(title) == 0:
            title = self._random_string(insert_dash=0)
        filename = self._opts.output_format.replace('%','$')
        d = {'t':title,'i':v_info['v_id'],'h':v_info['v_host'],'e':ext}
        string = Modules().getinst('string')
        filename = tostr(string.Template(filename).substitute(d))
        if self._opts.output_file:
            filename = self._opts.output_file
        if self._opts.output_savedir:
            filename = self._os.path.join(self._opts.output_savedir, filename)
        if self._os.path.exists(filename) and not self._opts.emit_csv:
            # If local_file < remote_file
            if self._os.path.getsize(filename) < v_info['length_bytes']:
                allows_continue = 1 # Most hosts allow by default
                if ext == 'flv': # Youtube/FLV, Vgoogle/FLV exceptions
                    if v_info['v_host'] in ['ytube','vgoogle']:
                        allows_continue = 0 # Disable
                if allows_continue: # Continue partial download
                    urlg_reget = 'simple'
                    offset = self._os.path.getsize(filename)
                else: # Overwrite or rename
                    if self._opts.output_exists != 'overwrite':
                        self._say('warn: no-support; cannot continue ' +
                            'partially downloaded file')
                        # Continue getting a partially-downloaded file
                        filename = self._rename_file(filename)
            else: # If local_file >= remote_file
                if self._opts.output_exists == 'overwrite':
                    self._os.remove(filename)
                else:    
                    if self._os.path.getsize(filename) == \
                    v_info['length_bytes']:
                        skip = 1
                        self._say('warn: skipped: %s (same length) ' \
                            'exists already' % self._os.path.basename(filename))
                    else:                                
                        filename = self._rename_file(filename)
        # Make sure there are not any duplicates in the URL batch
        for vi in batch:
            if vi['output_file'] == filename:
                filename = self._rename_file(filename) # Found
                break
        v_info['output_file'] = filename
        v_info['urlg_reget'] = urlg_reget
        v_info['skip_extraction'] = skip
        v_info['offset'] = offset
	
    def _rename_file(self, filename):
        """Return filename with a random '-xxxxxxxx' tag before its extension."""
        parts = self._os.path.splitext(filename)
        return parts[0] + self._random_string() + parts[1]

    def _random_string(self, insert_dash=1):
        """Return 8 hex digits taken from a hash of the current time.

        insert_dash -- if true (the default), the result is prefixed
                       with a '-'
        """
        time = Modules().getinst('time')
        md5 = Modules().getinst('md5')
        token = md5.new(str(time.time())).hexdigest()[:8]
        if insert_dash:
            return '-' + token
        return token


