<?php //$Id$
//Copyright (c) 2012-2016 Pierre Pronchery <khorben@defora.org>
//This file is part of DeforaOS Web DaPortal
//
//This program is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, version 3 of the License.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program.  If not, see <http://www.gnu.org/licenses/>.



//HTML
class HTML
{
	//Filters, formats and validates HTML fragments against a whitelist of
	//elements and attributes, using the event-based XML parser.

	//public
	//methods
	//essential
	//HTML::HTML
	//Creates the XML parser matching the charset.
	//charset: character set of the content (FALSE: use the "charset"
	//         variable from the "defaults" section of the global
	//         configuration)
	//form:    whether form-related elements are whitelisted as well
	protected function __construct($charset = FALSE, $form = FALSE)
	{
		global $config;

		//for escaping
		if(!defined('ENT_HTML401'))
			define('ENT_HTML401', 0);
		$this->flags = ENT_COMPAT | ENT_HTML401;
		//for encoding
		if($charset === FALSE)
			$charset = $config->get('defaults', 'charset');
		$this->charset = $charset;
		switch(strtolower($charset))
		{
			case 'ascii':
			case 'us-ascii':
				$this->parser = xml_parser_create('US-ASCII');
				break;
			case 'iso-8859-1':
			case 'iso-8859-15':
				$this->parser = xml_parser_create('ISO-8859-1');
				//the handlers receive ISO-8859-1 data
				$this->encoding = 'ISO-8859-1';
				break;
			case 'utf-8':
				$this->parser = xml_parser_create('UTF-8');
				break;
			default:
				$this->parser = xml_parser_create('');
				break;
		}
		if($form)
		{
			$this->whitelist['button'] = array('class', 'disabled',
				'name', 'type', 'value');
			$this->whitelist['datalist'] = array('class', 'id');
			$this->whitelist['fieldset'] = array('class',
				'disabled');
			$this->whitelist['form'] = array('action', 'class',
				'enctype', 'method');
			$this->whitelist['input'] = array('checked', 'class',
				'disabled', 'id', 'list', 'name', 'readonly',
				'type', 'value');
			$this->whitelist['label'] = array('class', 'for');
			$this->whitelist['legend'] = array('class');
			$this->whitelist['option'] = array('class', 'value');
			$this->whitelist['select'] = array('class', 'disabled',
				'multiple', 'name');
			$this->whitelist['textarea'] = array('class',
				'disabled', 'name', 'rows');
		}
	}


	//HTML::~HTML
	//Releases the XML parser.
	public function __destruct()
	{
		//parsers are objects since PHP 8.0: they are released
		//automatically and xml_parser_free() has no effect on them
		if(is_resource($this->parser))
			xml_parser_free($this->parser);
	}


	//static
	//useful
	//HTML::filter
	//Returns the content with only the whitelisted elements and
	//attributes left, and with every remaining element closed.
	//engine:    used to log parsing errors
	//content:   the HTML content to filter
	//whitelist: replaces the default whitelist unless FALSE (an empty
	//           array converts the content to plain text)
	//form:      whether form-related elements are whitelisted as well
	//charset:   character set of the content (FALSE: configured default)
	//returns:   the filtered content, or FALSE if content is not a string
	static public function filter(Engine $engine, $content,
			$whitelist = FALSE, $form = FALSE, $charset = FALSE)
	{
		$from = array('<br>', '<hr>');
		$to = array('<br/>', '<hr/>');

		//check the input before allocating a parser
		if(!is_string($content))
			return FALSE;
		if(strlen($content) == 0)
			return '';
		$class = static::$class;
		$html = new $class($charset, $form);
		$start = array($html, '_filterElementStart');
		$end = array($html, '_filterElementEnd');
		$filter = array($html, '_filterCharacterData');
		if($whitelist !== FALSE)
			$html->whitelist = $whitelist;
		if(xml_set_element_handler($html->parser, $start, $end)
				!== TRUE)
			return ''; //XXX report error
		xml_set_character_data_handler($html->parser, $filter);
		//plain text conversion needs carriage returns
		if(is_array($whitelist) && count($whitelist) == 0)
			$to = "\n";
		//give it more chances to validate
		$content = str_ireplace($from, $to, $content);
		switch(strtolower($html->charset))
		{
			case 'iso-8859-1':
			case 'iso-8859-15':
				//do not rely on input charset detection
				$content = utf8_encode($content);
				break;
		}
		//load as HTML if necessary
		//the content is not trusted: entities must not be substituted
		//here (no LIBXML_NOENT) or external ones could disclose local
		//files
		$dom = new DOMDocument('1.0', 'UTF-8');
		//collect the parsing errors instead of raising PHP warnings
		$errors = libxml_use_internal_errors(TRUE);
		if($dom->loadXML($content, LIBXML_NONET) !== FALSE
				|| $dom->loadHTML($content, LIBXML_NONET)
					!== FALSE)
			$content = $dom->saveXML();
		libxml_clear_errors();
		libxml_use_internal_errors($errors);
		unset($dom);
		if(xml_parse($html->parser, $content, TRUE) != 1)
			$engine->log(LOG_DEBUG, $html->_parseError());
		//close the remaining tags
		while(($tag = array_pop($html->stack)) !== NULL)
			$html->content .= "</$tag>";
		return $html->content;
	}

	//Appends the escaped character data to the output.
	protected function _filterCharacterData($parser, $data)
	{
		//skip the contents of blacklisted tags
		if($this->blacklist_level > 0)
			return;
		//escape in the encoding output by the parser: the default one
		//may reject the data and silently drop it
		$this->content .= htmlspecialchars($data, ENT_NOQUOTES,
				$this->encoding);
	}

	//Outputs the opening tag of whitelisted elements, along with their
	//whitelisted attributes.
	protected function _filterElementStart($parser, $name,
			$attributes)
	{
		$tag = strtolower($name);
		//skip the contents of blacklisted tags
		if($this->blacklist_level > 0)
			return $this->blacklist_level++;
		if(in_array($tag, $this->blacklist, TRUE))
		{
			$this->blacklist_level = 1;
			return;
		}
		//output whitelisted tags and attributes
		if(!isset($this->whitelist[$tag]))
			return;
		$this->content .= "<$tag";
		$a = $this->whitelist[$tag];
		foreach($attributes as $k => $v)
		{
			$attr = strtolower($k);
			if(!in_array($attr, $a, TRUE))
				continue;
			$this->content .= ' '.$attr.'="' .htmlspecialchars($v,
					$this->flags, $this->encoding).'"';
		}
		//close the <br>, <hr> and <img> tags directly
		if($tag == 'br' || $tag == 'hr' || $tag == 'img')
			$this->content .= '/';
		else
			//remember which tags are opened
			$this->stack[] = $tag;
		$this->content .= '>';
	}

	//Outputs the closing tag of whitelisted elements.
	protected function _filterElementEnd($parser, $name)
	{
		$tag = strtolower($name);
		//skip the contents of blacklisted tags
		if($this->blacklist_level > 1)
			return $this->blacklist_level--;
		if($this->blacklist_level == 1 && in_array($tag,
				$this->blacklist, TRUE))
		{
			$this->blacklist_level = 0;
			return;
		}
		if(!isset($this->whitelist[$tag]))
			return;
		//the <br>, <hr> and <img> tags were already closed
		if($tag == 'br' || $tag == 'hr' || $tag == 'img')
			return;
		$this->content .= "</$tag>";
		//remember which tags were closed
		//the innermost occurrence is the one closed: removing any
		//other would close the remaining tags in the wrong order
		for($i = count($this->stack) - 1; $i >= 0; $i--)
			if($this->stack[$i] === $tag)
			{
				array_splice($this->stack, $i, 1);
				break;
			}
	}

	//Returns a description of the current parsing error and its position.
	protected function _parseError()
	{
		return xml_error_string(xml_get_error_code($this->parser))
			.' at line '
			.xml_get_current_line_number($this->parser)
			.', column '
			.xml_get_current_column_number($this->parser);
	}


	//HTML::format
	//Converts plain text to HTML: the text is escaped, links are made
	//clickable, lines starting with " * " become list items and other
	//lines starting with a space are output as preformatted content.
	//engine:  not used here
	//content: the plain text to convert
	//returns: the resulting HTML, enclosed in a <div> element
	static public function format(Engine $engine, $content)
	{
		$from = '/((ftp:\/\/|http:\/\/|https:\/\/|mailto:)'
			.'([-+a-zA-Z0-9.:\/_%?!=,;~#@()]|&amp;)+)/';
		//FIXME obfuscate e-mail addresses
		$to = '<a href="\1">\1</a>';

		$ret = '<div>';
		$lines = explode("\n", $content);
		$list = FALSE;
		foreach($lines as $l)
		{
			$l = htmlspecialchars($l, ENT_COMPAT);
			$l = preg_replace($from, $to, $l);
			if(strlen($l) > 2 && $l[0] == ' ' && $l[1] == '*'
					&& $l[2] == ' ')
			{
				//list
				if(!$list)
				{
					$ret .= '<ul>';
					$list = TRUE;
				}
				$ret .= '<li>'.substr($l, 3).'</li>';
				continue;
			}
			if($list)
			{
				//close the list before any other content
				$ret .= '</ul>';
				$list = FALSE;
			}
			if(strlen($l) > 0 && $l[0] == ' ')
				//preformatted content
				$ret .= '<span class="preformatted">'
					.substr($l, 1).'</span><br/>';
			else
				$ret .= $l.'<br/>';
		}
		//the content may end with a list
		if($list)
			$ret .= '</ul>';
		$ret .= '</div>';
		return $ret;
	}


	//HTML::validate
	//Checks that the content is well-formed and only uses whitelisted
	//elements and attributes.
	//engine:    used to log parsing errors
	//content:   the HTML content to validate
	//whitelist: replaces the default whitelist unless FALSE
	//form:      whether form-related elements are whitelisted as well
	//returns:   TRUE if the content is valid, FALSE otherwise
	static public function validate(Engine $engine, $content,
			$whitelist = FALSE, $form = FALSE)
	{
		//only strings can be parsed
		if(!is_string($content))
			return FALSE;
		$class = static::$class;
		$html = new $class(FALSE, $form);
		$start = array($html, '_validateElementStart');
		$end = array($html, '_validateElementEnd');
		if($whitelist !== FALSE)
			$html->whitelist = $whitelist;
		if(xml_set_element_handler($html->parser, $start, $end)
				!== TRUE)
			return FALSE;
		switch(strtolower($html->charset))
		{
			case 'iso-8859-1':
			case 'iso-8859-15':
				//do not rely on input charset detection
				$content = utf8_encode($content);
				break;
		}
		if(xml_parse($html->parser, $content, TRUE) != 1)
		{
			$engine->log(LOG_DEBUG, $html->_parseError());
			return FALSE;
		}
		return $html->valid;
	}

	//Invalidates the content if the element or any of its attributes is
	//not whitelisted.
	protected function _validateElementStart($parser, $name,
			$attributes)
	{
		//XXX report errors
		$tag = strtolower($name);
		if(!isset($this->whitelist[$tag]))
		{
			$this->valid = FALSE;
			return;
		}
		$a = $this->whitelist[$tag];
		foreach($attributes as $k => $v)
			if(!in_array(strtolower($k), $a, TRUE))
			{
				$this->valid = FALSE;
				return;
			}
	}

	//Nothing to check when closing elements (the handler is required).
	protected function _validateElementEnd($parser, $name)
	{
	}


	//protected
	//properties
	//character set of the content
	protected $charset = FALSE;
	//name of the class instantiated by the static methods
	static protected $class = 'HTML';
	//encoding of the data output by the parser (for escaping)
	protected $encoding = 'UTF-8';
	//flags for escaping attribute values
	protected $flags;
	protected $parser;
	//filtered output
	protected $content = '';
	//elements currently opened in the output
	protected $stack = array();
	protected $valid = TRUE;
	//elements dropped along with their whole contents
	protected $blacklist = array('script', 'style', 'title');
	//nesting depth within a blacklisted element (0: outside)
	protected $blacklist_level = 0;
	//allowed elements, with their respective allowed attributes
	protected $whitelist = array(
		'a' => array('href', 'name', 'rel', 'title'),
		'abbr' => array('class', 'title'),
		'acronym' => array('class'),
		'b' => array('class'),
		'bdo' => array('dir'),
		'big' => array('class'),
		'blockquote' => array('class'),
		'br' => array(),
		'button' => array('class', 'disabled'),
		'center' => array(),
		'cite' => array('class', 'title'),
		'code' => array('class'),
		'col' => array('class'),
		'dd' => array('class'),
		'del' => array('class'),
		'dfn' => array('class'),
		'div' => array('class'),
		'dl' => array('class'),
		'dt' => array('class'),
		'em' => array('class'),
		'h1' => array('class'),
		'h2' => array('class'),
		'h3' => array('class'),
		'h4' => array('class'),
		'h5' => array('class'),
		'h6' => array('class'),
		'hr' => array('class'),
		'i' => array('class'),
		'img' => array('alt', 'class', 'src', 'title'),
		'ins' => array('class'),
		'kbd' => array('class'),
		'li' => array('class'),
		'ol' => array('class'),
		'p' => array('class'),
		'pre' => array('class'),
		's' => array('class'),
		'samp' => array('class'),
		'small' => array('class'),
		'span' => array('class'),
		'strong' => array('class'),
		'sub' => array('class'),
		'sup' => array('class'),
		'table' => array('border', 'class'),
		'tbody' => array('class'),
		'td' => array('align', 'class', 'colspan', 'rowspan'),
		'tfoot' => array('class'),
		'th' => array('class', 'colspan', 'rowspan'),
		'tr' => array('class'),
		'tt' => array('class'),
		'u' => array('class'),
		'ul' => array('class'),
		'var' => array('class'));
}

?>
