<?php
/**
 * The IMP_MIME_Viewer_html class renders out text/html message parts
 * after attempting to sanitize them for inline display.
 *
 * $Horde: imp/lib/MIME/Viewer/html.php,v 1.4.2.22 2004/11/25 14:35:42 jan Exp $
 *
 * Copyright 1999-2003 Anil Madhavapeddy <anil@recoil.org>
 * Copyright 1999-2003 Jon Parise <jon@recoil.org>
 *
 * See the enclosed file COPYING for license information (GPL). If you
 * did not receive this file, see http://www.fsf.org/copyleft/gpl.html.
 *
 * @author  Anil Madhavapeddy <anil@recoil.org>
 * @author  Jon Parise <jon@horde.org>
 * @version $Revision: 1.4.2.22 $
 * @since   IMP 3.0
 * @package horde.mime.viewer
 */
class IMP_MIME_Viewer_html extends MIME_Viewer {

    /**
     * Render out the currently set contents in HTML format.
     *
     * The contents of $this->mime_part are sanitized with _cleanHTML(),
     * "cid:" references to the parts listed in the global $MimeID array
     * are rewritten to view.php URLs, and links are made to open in a new
     * window.
     *
     * @param object &$mime  Object whose 'index' property is placed in the
     *                       generated view.php URLs.
     *
     * @return string  The processed HTML.
     */
    function render(&$mime)
    {
        /* Fetch and sanitize the HTML. */
        $data = $this->_cleanHTML($this->mime_part->getContents());

        /* Search for inlined images that we can display. */
        global $MimeID, $imp;
        if (isset($MimeID) && is_array($MimeID)) {
            foreach ($MimeID as $ref => $id) {
                /* Strip one enclosing '<' and one enclosing '>' from the
                   Content-ID. */
                $id = (string)$id;
                if (substr($id, 0, 1) == '<') {
                    $id = (string)substr($id, 1);
                }
                if (substr($id, -1) == '>') {
                    $id = (string)substr($id, 0, -1);
                }

                /* An ID that is empty (or consisted only of brackets) must
                   be skipped: searching for a bare "cid:" would rewrite
                   every cid reference in the document. */
                if (strlen($id) == 0) {
                    continue;
                }

                $url = Horde::url('view.php?actionID=' . VIEW_ATTACH . '&index=' . $mime->index . '&mailbox=' . urlencode($imp['mailbox']) . '&thismailbox=' . urlencode($imp['thismailbox']) . '&id=' . $ref);
                $data = str_replace("cid:$id", $url, $data);
            }
        }

        /* Convert links to open in new windows. Only real <a> tags are
           touched (the tag name must be followed by whitespace, '/' or
           '>'), so <abbr>, <address>, <area> etc. are left intact. Links
           pointing at an in-page "#xyz" anchor are skipped. */
        $data = preg_replace('/<a(?=[\s\/>])(?![^>]*href=["\']?#)/i',
                             '<a target="_blank"', $data);

        return $data;
    }

    /**
     * Return text/html as the content-type
     *
     * @return string  'text/html' constant.
     */
    function getType()
    {
        return 'text/html';
    }

    /**
     * Try to sanitize HTML for viewing.
     *
     * These regular expressions attempt to make html safe for
     * viewing. THEY ARE NOT PERFECT. If you enable html viewing, you
     * are opening a security hole. With the current state of the web,
     * I believe that the best we can do is to make sure that people
     * _know_ html is a security hole, clean up what we can, and leave
     * it at that.
     *
     * @param string $data  The HTML to clean.
     *
     * @return string  The cleaned HTML.
     */
    function _cleanHTML($data)
    {
        /* Deal with <base> tags in the HTML, since they will screw up our
           own relative paths. */
        $base = '';
        if (($i = stristr($data, '<base ')) && ($i = stristr($i, 'http')) &&
            ($j = strchr($i, '>'))) {
            /* Keep the text from "http" up to the next '>' and reduce it
               to scheme://host/. */
            $base = substr($i, 0, strlen($i) - strlen($j));
            $base = preg_replace('|(http.*://[^/]*/?).*|i', '\1', $base);

            if (substr($base, -1) != '/') {
                $base .= '/';
            }

            // Make sure malicious code isn't propagated through
            // $base.
            $base = $this->_cleanHTML($base);
        }

        /* Change space entities to space characters. */
        $data = preg_replace('/&#(x0*20|0*32);?/i', ' ', $data);

        /* Nuke non-printable characters (a play in three acts). */
        // Rule #1: If we have a semicolon, it's deterministically detectable
        // and fixable, without introducing collateral damage.
        $data = preg_replace('/&#x?0*([9A-D]|1[0-3]);/i', '&nbsp;', $data);

        // Rule #2: Hex numbers (usually having an x prefix) are also
        // deterministic, even if we don't have the semi.  Note that some
        // browsers will treat &#a or &#0a as a hex number even without the x
        // prefix; hence /x?/ which will cover those cases in this rule.
        $data = preg_replace('/&#x?0*[9A-D]([^0-9A-F]|$)/i', '&nbsp\\1', $data);

        // Rule #3: Decimal numbers without semi.  The problem is that
        // some browsers will interpret &#10a as "\na", some as "&#x10a" so we
        // have to clean the &#10 to be safe for the "\na" case at the expense
        // of mangling a valid entity in other cases.  (Solution for valid HTML
        // authors: always use the semicolon.)
        $data = preg_replace('/&#0*(9|1[0-3])([^0-9]|$)/i', '&nbsp\\2', $data);

        /* Remove overly long numeric entities. */
        $data = preg_replace('/&#x?0*[0-9A-F]{6,};?/i', '&nbsp;', $data);

        /* Get all attribute="javascript:foo()" tags. */
        /* This is essentially the regex /=("?)[^>]*script:/ but expanded */
        /* to catch camouflage with spaces and entities. Group 2 (the */
        /* optional opening quote) is kept in the replacement. */
        $preg = '/(&#0*61;?|&#x0*3D;?|=)\s*' .
                '(&#0*34;?|&#x0*22;?|")?' .
                '[^>]*\s*' .
                $this->_obfuscatedWordPattern('script') .
                '(:|&#0*58;?|&#x0*3a;?)/i';
        $data = preg_replace($preg, '=\2cleaned', $data);

        /* Get all on<foo>="bar()" attributes. */
        $data = preg_replace('/([\s"\']+' .
                             $this->_obfuscatedWordPattern('on', '') .
                             '\w+)\s*=/i', '\1HordeCleaned=', $data);

        /* Get all tags that might cause trouble - <object>, <embed>,
           <base>, etc. Meta refreshes and iframes, too. */
        $words = array('script', 'embed', 'base', 'meta', 'java', 'object',
                       'iframe');
        $malicious = array();
        foreach ($words as $word) {
            $pattern = '/<([^>]*)' . $this->_obfuscatedWordPattern($word);
            if ($word == 'base') {
                /* Leave "baseline" alone. A lookahead is used so that
                   the character following the match (e.g. the closing
                   '>' of <base>) is not swallowed. */
                $pattern .= '(?!line)';
            }
            $malicious[] = $pattern . '/i';
        }
        $data = preg_replace($malicious, '<cleaned_tag', $data);

        /* Comment out style/link tags. */
        $data = preg_replace('/\s+style\s*=/i', ' Cleaned=', $data);
        $data = preg_replace('|<style[^>]*>(?:\s*<\!--)*|i', '<!--', $data);
        $data = preg_replace('|(?:-->\s*)*</style>|i', '-->', $data);
        $data = preg_replace('|(<link[^>]*>)|i', '<!-- $1 -->', $data);

        /* A few other matches. */
        /* A tag containing a "&{...}" construct is replaced as a whole;
           nothing of the tag's own content is kept. */
        $data = preg_replace('|<([^>]*)&{.*}([^>]*)>|', '<&{;}>', $data);
        $data = preg_replace('|<([^>]*)mocha:([^>]*)>|i', '<cleaned\2>', $data);
        $data = preg_replace('|<([^>]*)binding:([^>]*)>|i', '<cleaned\2>', $data);

        /* Attempt to fix paths that were relying on a <base> tag. */
        if (!empty($base)) {
            /* $base ends up inside a preg_replace() replacement string, so
               escape the characters that would otherwise start a
               back-reference there. */
            $repl = str_replace(array('\\', '$'), array('\\\\', '\\$'), $base);

            /* Only paths starting with a single '/' are rewritten; a
               leading "//" is a network-path reference that carries its
               own host. */
            $data = preg_replace('|src="/(?!/)|i', 'src="' . $repl, $data);
            $data = preg_replace('|src=\'/(?!/)|i', 'src=\'' . $repl, $data);
            $data = preg_replace('|src=/(?!/)|i', 'src=' . $repl, $data);

            $data = preg_replace('|href= *"/(?!/)|i', 'href="' . $repl, $data);
            $data = preg_replace('|href= *\'/(?!/)|i', 'href=\'' . $repl, $data);
            $data = preg_replace('|href= */(?!/)|i', 'href=' . $repl, $data);
        }

        return $data;
    }

    /**
     * Build a regex fragment matching one letter either literally or as a
     * decimal (&#NN) or hexadecimal (&#xHH) numeric entity, in upper or
     * lower case, with optional leading zeros and optional semicolon.
     *
     * The fragment is a non-capturing group so that it does not disturb
     * the group numbering of the pattern it is embedded in.
     *
     * @param string $char  A single letter.
     *
     * @return string  The regex fragment (no delimiters).
     */
    function _entityCharPattern($char)
    {
        $alternatives = array(preg_quote($char, '/'));
        foreach (array(strtoupper($char), strtolower($char)) as $variant) {
            $code = ord($variant);
            $alternatives[] = '&#0*' . $code . ';?';
            $alternatives[] = '&#x0*' . dechex($code) . ';?';
        }

        return '(?:' . implode('|', $alternatives) . ')';
    }

    /**
     * Build a regex fragment matching a word whose letters may each be
     * written literally or as numeric entities.
     *
     * @param string $word       The word to match.
     * @param string $separator  Regex fragment appended after every
     *                           letter, including the last one.
     *
     * @return string  The regex fragment (no delimiters).
     */
    function _obfuscatedWordPattern($word, $separator = '\s*')
    {
        $pattern = '';
        for ($i = 0, $len = strlen($word); $i < $len; $i++) {
            $pattern .= $this->_entityCharPattern($word[$i]) . $separator;
        }

        return $pattern;
    }

}
