Files
roundcubemail/program/lib/Roundcube/rcube_tnef_decoder.php

760 lines
28 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
use RtfHtmlPhp\Document;
use RtfHtmlPhp\Html\HtmlFormatter;
/*
+-----------------------------------------------------------------------+
| This file is part of the Roundcube Webmail client |
| |
| Copyright (C) The Roundcube Dev Team |
| Copyright (C) 2002-2010, The Horde Project (https://www.horde.org/) |
| |
| Licensed under the GNU General Public License version 3 or |
| any later version with exceptions for skins & plugins. |
| See the README file for a full license statement. |
| |
| PURPOSE: |
| MS-TNEF format decoder |
+-----------------------------------------------------------------------+
| Author: Jan Schneider <jan@horde.org> |
| Author: Michael Slusarz <slusarz@horde.org> |
| Author: Aleksander Machniak <alec@alec.pl> |
+-----------------------------------------------------------------------+
*/
/**
* MS-TNEF format decoder based on code by:
* Graham Norbury <gnorbury@bondcar.com>
* Original design by:
* Thomas Boll <tb@boll.ch>, Mark Simpson <damned@world.std.com>
*/
class rcube_tnef_decoder
{
public const SIGNATURE = 0x223E9F78;
public const LVL_MESSAGE = 0x01;
public const LVL_ATTACHMENT = 0x02;
public const AFROM = 0x08000;
public const ASUBJECT = 0x18004;
public const AMESSAGEID = 0x18009;
public const AFILENAME = 0x18010;
public const APARENTID = 0x1800A;
public const ACONVERSATIONID = 0x1800B;
public const ABODY = 0x2800C;
public const ADATESENT = 0x38005;
public const ADATERECEIVED = 0x38006;
public const ADATEMODIFIED = 0x38020;
public const APRIORITY = 0x4800D;
public const AOWNER = 0x60000;
public const ASENTFOR = 0x60001;
public const ASTATUS = 0x68007;
public const ATTACHDATA = 0x6800F;
public const ATTACHMETAFILE = 0x68011;
public const ATTACHCREATEDATE = 0x38012;
public const ARENDDATA = 0x69002;
public const AMAPIPROPS = 0x69003;
public const ARECIPIENTTABLE = 0x69004;
public const AMAPIATTRS = 0x69005;
public const AOEMCODEPAGE = 0x69007;
public const AORIGINALMCLASS = 0x70006;
public const AMCLASS = 0x78008;
public const AVERSION = 0x89006;
public const MAPI_TYPE_UNSET = 0x0000;
public const MAPI_NULL = 0x0001;
public const MAPI_SHORT = 0x0002;
public const MAPI_INT = 0x0003;
public const MAPI_FLOAT = 0x0004;
public const MAPI_DOUBLE = 0x0005;
public const MAPI_CURRENCY = 0x0006;
public const MAPI_APPTIME = 0x0007;
public const MAPI_ERROR = 0x000A;
public const MAPI_BOOLEAN = 0x000B;
public const MAPI_OBJECT = 0x000D;
public const MAPI_INT8BYTE = 0x0014;
public const MAPI_STRING = 0x001E;
public const MAPI_UNICODE_STRING = 0x001F;
public const MAPI_SYSTIME = 0x0040;
public const MAPI_CLSID = 0x0048;
public const MAPI_BINARY = 0x0102;
public const MAPI_BODY = 0x1000;
public const MAPI_RTF_COMPRESSED = 0x1009;
public const MAPI_BODY_HTML = 0x1013;
public const MAPI_NATIVE_BODY = 0x1016;
public const MAPI_DISPLAY_NAME = 0x3001;
public const MAPI_ADDRTYPE = 0x3002;
public const MAPI_EMAIL_ADDRESS = 0x3003;
public const MAPI_COMMENT = 0x3004;
public const MAPI_DEPTH = 0x3005;
public const MAPI_PROVIDER_DISPLAY = 0x3006;
public const MAPI_CREATION_TIME = 0x3007;
public const MAPI_LAST_MODIFICATION_TIME = 0x3008;
public const MAPI_RESOURCE_FLAGS = 0x3009;
public const MAPI_PROVIDER_DLL_NAME = 0x300A;
public const MAPI_SEARCH_KEY = 0x300B;
public const MAPI_ATTACHMENT_X400_PARAMETERS = 0x3700;
public const MAPI_ATTACH_DATA = 0x3701;
public const MAPI_ATTACH_ENCODING = 0x3702;
public const MAPI_ATTACH_EXTENSION = 0x3703;
public const MAPI_ATTACH_FILENAME = 0x3704;
public const MAPI_ATTACH_METHOD = 0x3705;
public const MAPI_ATTACH_LONG_FILENAME = 0x3707;
public const MAPI_ATTACH_PATHNAME = 0x3708;
public const MAPI_ATTACH_RENDERING = 0x3709;
public const MAPI_ATTACH_TAG = 0x370A;
public const MAPI_RENDERING_POSITION = 0x370B;
public const MAPI_ATTACH_TRANSPORT_NAME = 0x370C;
public const MAPI_ATTACH_LONG_PATHNAME = 0x370D;
public const MAPI_ATTACH_MIME_TAG = 0x370E;
public const MAPI_ATTACH_ADDITIONAL_INFO = 0x370F;
public const MAPI_ATTACH_MIME_SEQUENCE = 0x3710;
public const MAPI_ATTACH_CONTENT_ID = 0x3712;
public const MAPI_ATTACH_CONTENT_LOCATION = 0x3713;
public const MAPI_ATTACH_FLAGS = 0x3714;
public const MAPI_NAMED_TYPE_ID = 0x0000;
public const MAPI_NAMED_TYPE_STRING = 0x0001;
public const MAPI_NAMED_TYPE_NONE = 0xFF;
public const MAPI_MV_FLAG = 0x1000;
public const RTF_UNCOMPRESSED = 0x414C454D;
public const RTF_COMPRESSED = 0x75465A4C;
protected $codepage;
/**
* Decompress the data.
*
* @param string $data the data to decompress
* @param bool $as_html Return message body as HTML
*
* @return array the decompressed data
*/
public function decompress($data, $as_html = false)
{
$attachments = [];
$message = [];
if ($this->_geti($data, 32) == self::SIGNATURE) {
$this->_geti($data, 16);
// Version
$this->_geti($data, 8); // lvl_message
$this->_geti($data, 32); // idTnefVersion
$this->_getx($data, $this->_geti($data, 32));
$this->_geti($data, 16); // checksum
while (strlen($data) > 0) {
switch ($this->_geti($data, 8)) {
case self::LVL_MESSAGE:
$this->_decodeMessage($data, $message);
break;
case self::LVL_ATTACHMENT:
$this->_decodeAttachment($data, $attachments);
break;
}
}
}
// Return the message body as HTML
if ($as_html) {
// HTML body
if (!empty($message['size']) && $message['subtype'] == 'html') {
$message = $message['stream'];
}
// RTF body (converted to HTML)
// Note: RTF can contain encapsulated HTML content
elseif (!empty($message['size']) && $message['subtype'] == 'rtf'
&& function_exists('iconv')
&& class_exists('RtfHtmlPhp\Document')
) {
try {
$document = new Document($message['stream']);
$formatter = new HtmlFormatter(RCUBE_CHARSET);
$message = $formatter->format($document);
} catch (Exception $e) {
// ignore the body
$message = null;
rcube::raise_error('Failed to extract RTF/HTML content from TNEF attachment', true);
}
} else {
$message = null;
}
}
return [
'message' => $message,
'attachments' => array_reverse($attachments),
];
}
/**
* Pop specified number of bytes from the buffer.
*
* @param string &$data The data string
* @param int $bytes how many bytes to retrieve
*
* @return string Extracted data
*/
protected function _getx(&$data, $bytes)
{
$value = null;
if (strlen($data) >= $bytes) {
$value = substr($data, 0, $bytes);
$data = substr($data, $bytes);
}
return $value;
}
/**
* Pop specified number of bits from the buffer
*
* @param string &$data The data string
* @param int $bits how many bits to retrieve
*
* @return int|null
*/
protected function _geti(&$data, $bits)
{
$bytes = $bits / 8;
$value = null;
if (strlen($data) >= $bytes) {
$value = ord($data[0]);
if ($bytes >= 2) {
$value += (ord($data[1]) << 8);
}
if ($bytes >= 4) {
$value += (ord($data[2]) << 16) + (ord($data[3]) << 24);
}
$data = substr($data, $bytes);
}
return $value;
}
/**
* Decode a single attribute
*
* @param string &$data The data string
*
* @return string Extracted data
*/
protected function _decodeAttribute(&$data)
{
// Data.
$value = $this->_getx($data, $this->_geti($data, 32));
// Checksum.
$this->_geti($data, 16);
return $value;
}
/**
* TODO
*
* @param string $data the data string
* @param array &$result TODO
*/
protected function _extractMapiAttributes($data, &$result)
{
// Number of attributes.
$number = $this->_geti($data, 32);
while ((strlen($data) > 0) && $number--) {
$have_mval = false;
$num_mval = 1;
$value = null;
$attr_type = $this->_geti($data, 16);
$attr_name = $this->_geti($data, 16);
if (($attr_type & self::MAPI_MV_FLAG) != 0) {
$have_mval = true;
$attr_type &= ~self::MAPI_MV_FLAG;
}
if (($attr_name >= 0x8000) && ($attr_name < 0xFFFE)) {
$this->_getx($data, 16);
$named_type = $this->_geti($data, 32);
switch ($named_type) {
case self::MAPI_NAMED_TYPE_ID:
$attr_name = $this->_geti($data, 32);
break;
case self::MAPI_NAMED_TYPE_STRING:
$attr_name = 0x9999;
$idlen = $this->_geti($data, 32);
$name = $this->_getx($data, $idlen + ((4 - ($idlen % 4)) % 4));
// $name = $this->convertString(substr($name, 0, $idlen));
break;
case self::MAPI_NAMED_TYPE_NONE:
default:
continue 2;
}
}
if ($have_mval) {
$num_mval = $this->_geti($data, 32);
}
switch ($attr_type) {
case self::MAPI_NULL:
case self::MAPI_TYPE_UNSET:
break;
case self::MAPI_SHORT:
$value = $this->_geti($data, 16);
$this->_geti($data, 16);
break;
case self::MAPI_INT:
case self::MAPI_BOOLEAN:
for ($i = 0; $i < $num_mval; $i++) {
$value = $this->_geti($data, 32);
}
break;
case self::MAPI_FLOAT:
case self::MAPI_ERROR:
$value = $this->_getx($data, 4);
break;
case self::MAPI_DOUBLE:
case self::MAPI_APPTIME:
case self::MAPI_CURRENCY:
case self::MAPI_INT8BYTE:
case self::MAPI_SYSTIME:
$value = $this->_getx($data, 8);
break;
case self::MAPI_STRING:
case self::MAPI_UNICODE_STRING:
case self::MAPI_BINARY:
case self::MAPI_OBJECT:
$num_vals = $have_mval ? $num_mval : $this->_geti($data, 32);
for ($i = 0; $i < $num_vals; $i++) {
$length = $this->_geti($data, 32);
// Pad to next 4 byte boundary.
$datalen = $length + ((4 - ($length % 4)) % 4);
// Read and truncate to length.
$value = $this->_getx($data, $datalen);
}
if ($attr_type == self::MAPI_UNICODE_STRING) {
$value = $this->convertString($value);
}
break;
}
// Store any interesting attributes.
switch ($attr_name) {
case self::MAPI_RTF_COMPRESSED:
$result['type'] = 'application';
$result['subtype'] = 'rtf';
$result['name'] = (!empty($result['name']) ? $result['name'] : 'Untitled') . '.rtf';
$result['stream'] = $this->_decodeRTF($value);
$result['size'] = strlen($result['stream']);
break;
case self::MAPI_BODY:
case self::MAPI_BODY_HTML:
$result['type'] = 'text';
$result['subtype'] = $attr_name == self::MAPI_BODY ? 'plain' : 'html';
$result['name'] = (!empty($result['name']) ? $result['name'] : 'Untitled')
. ($attr_name == self::MAPI_BODY ? '.txt' : '.html');
$result['stream'] = $value;
$result['size'] = strlen($value);
break;
case self::MAPI_ATTACH_LONG_FILENAME:
// Used in preference to AFILENAME value.
$result['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', $value));
break;
case self::MAPI_ATTACH_MIME_TAG:
// Is this ever set, and what is format?
$value = explode('/', trim($value));
$result['type'] = $value[0];
$result['subtype'] = $value[1];
break;
case self::MAPI_ATTACH_CONTENT_ID:
$result['content-id'] = $value;
break;
case self::MAPI_ATTACH_DATA:
$this->_getx($value, 16);
$att = new self();
$res = $att->decompress($value);
$result = array_merge($result, $res['message']);
break;
}
}
}
/**
* Decodes TNEF message attributes
*
* @param string &$data The data string
* @param array &$message Message data
*/
protected function _decodeMessage(&$data, &$message)
{
$attribute = $this->_geti($data, 32);
$value = $this->_decodeAttribute($data);
switch ($attribute) {
case self::AOEMCODEPAGE:
// Find codepage of the message
$value = unpack('V', $value);
$this->codepage = $value[1];
break;
case self::AMCLASS:
$value = trim(str_replace('Microsoft Mail v3.0 ', '', $value));
// Normal message will be that with prefix 'IPM.Microsoft Mail.
break;
case self::ASUBJECT:
$message['name'] = $value;
break;
case self::AMAPIPROPS:
$this->_extractMapiAttributes($value, $message);
break;
}
}
/**
* Decodes TNEF attachment attributes
*
* @param string &$data The data string
* @param array &$attachment Attachments data
*/
protected function _decodeAttachment(&$data, &$attachment)
{
$attribute = $this->_geti($data, 32);
$size = $this->_geti($data, 32);
$value = $this->_getx($data, $size);
$this->_geti($data, 16); // checksum
switch ($attribute) {
case self::ARENDDATA:
// Add a new default data block to hold details of this
// attachment. Reverse order is easier to handle later!
array_unshift($attachment, [
'type' => 'application',
'subtype' => 'octet-stream',
'name' => 'unknown',
'stream' => '',
]);
break;
case self::AFILENAME:
$value = $this->convertString($value, true);
// Strip path
$attachment[0]['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', $value));
break;
case self::ATTACHDATA:
// The attachment itself
$attachment[0]['size'] = $size;
$attachment[0]['stream'] = $value;
break;
case self::AMAPIATTRS:
$this->_extractMapiAttributes($value, $attachment[0]);
break;
}
}
/**
* Convert string value to system charset according to defined codepage
*/
protected function convertString($str, $use_codepage = false)
{
if ($use_codepage && $this->codepage
&& ($charset = rcube_charset::$windows_codepages[$this->codepage])
) {
$str = rcube_charset::convert($str, $charset, RCUBE_CHARSET);
} elseif (($pos = strpos($str, "\0")) !== false && $pos != strlen($str) - 1) {
$str = rcube_charset::convert($str, 'UTF-16LE', RCUBE_CHARSET);
}
return trim($str);
}
/**
* Decodes TNEF RTF
*/
protected function _decodeRTF($data)
{
$c_size = $this->_geti($data, 32);
$size = $this->_geti($data, 32);
$magic = $this->_geti($data, 32);
$crc = $this->_geti($data, 32);
if ($magic == self::RTF_COMPRESSED) {
$data = $this->_decompressRTF($data, $size);
}
return $data;
}
/**
* Decompress compressed RTF. Logic taken from Horde.
*/
protected function _decompressRTF($data, $size)
{
$in = $out = $flags = $flag_count = 0;
$uncomp = '';
$preload = "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArialTimes New RomanCourier{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
$length_preload = strlen($preload);
for ($cnt = 0; $cnt < $length_preload; $cnt++) {
$uncomp .= $preload[$cnt];
$out++;
}
while ($out < ($size + $length_preload)) {
if (($flag_count++ % 8) == 0) {
$flags = ord($data[$in++]);
} else {
$flags = $flags >> 1;
}
if (($flags & 1) != 0) {
$offset = ord($data[$in++]);
$length = ord($data[$in++]);
$offset = ($offset << 4) | ($length >> 4);
$length = ($length & 0xF) + 2;
$offset = ((int) ($out / 4096)) * 4096 + $offset;
if ($offset >= $out) {
$offset -= 4096;
}
$end = $offset + $length;
while ($offset < $end) {
$uncomp .= $uncomp[$offset++];
$out++;
}
} else {
$uncomp .= $data[$in++];
$out++;
}
}
return substr($uncomp, $length_preload);
}
/**
* Parse RTF data and return the best plaintext representation we can.
* Adapted from: http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php
*
* @param string $text the RTF (uncompressed) text
*
* @return string the plain text
*/
public static function rtf2text($text)
{
$document = '';
$stack = [];
$j = -1;
// Read the data character-by- character…
for ($i = 0, $len = strlen($text); $i < $len; $i++) {
$c = $text[$i];
switch ($c) {
case '\\':
// Key Word
$nextChar = $text[$i + 1];
// If it is another backslash or nonbreaking space or hyphen,
// then the character is plain text and add it to the output stream.
if ($nextChar == '\\' && self::_rtfIsPlain($stack[$j])) {
$document .= '\\';
} elseif ($nextChar == '~' && self::_rtfIsPlain($stack[$j])) {
$document .= ' ';
} elseif ($nextChar == '_' && self::_rtfIsPlain($stack[$j])) {
$document .= '-';
} elseif ($nextChar == '*') {
// Add to the stack.
$stack[$j]['*'] = true;
} elseif ($nextChar == "'") {
// If it is a single quote, read next two characters that
// are the hexadecimal notation of a character we should add
// to the output stream.
$hex = substr($text, $i + 2, 2);
if (self::_rtfIsPlain($stack[$j])) {
$document .= html_entity_decode('&#' . hexdec($hex) . ';');
}
// Shift the pointer.
$i += 2;
} elseif ($nextChar >= 'a' && $nextChar <= 'z' || $nextChar >= 'A' && $nextChar <= 'Z') {
// Since, weve found the alphabetic character, the next
// characters are control words and, possibly, some digit
// parameter.
$word = '';
$param = null;
// Start reading characters after the backslash.
for ($k = $i + 1, $m = 0; $k < strlen($text); $k++, $m++) {
$nextChar = $text[$k];
// If the current character is a letter and there were
// no digits before it, then were still reading the
// control word. If there were digits, we should stop
// since we reach the end of the control word.
if ($nextChar >= 'a' && $nextChar <= 'z'
|| $nextChar >= 'A' && $nextChar <= 'Z') {
if (!empty($param)) {
break;
}
$word .= $nextChar;
} elseif ($nextChar >= '0' && $nextChar <= '9') {
// If it is a digit, store the parameter.
$param .= $nextChar;
} elseif ($nextChar == '-') {
// Since minus sign may occur only before a digit
// parameter, check whether $param is empty.
// Otherwise, we reach the end of the control word.
if (!empty($param)) {
break;
}
$param .= $nextChar;
} else {
break;
}
}
// Shift the pointer on the number of read characters.
$i += $m - 1;
// Start analyzing.We are interested mostly in control words
$toText = '';
switch (strtolower($word)) {
// If the control word is "u", then its parameter is
// the decimal notation of the Unicode character that
// should be added to the output stream. We need to
// check whether the stack contains \ucN control word.
// If it does, we should remove the N characters from
// the output stream.
case 'u':
$toText .= html_entity_decode('&#x' . dechex($param) . ';');
$ucDelta = @$stack[$j]['uc'];
if ($ucDelta > 0) {
$i += $ucDelta;
}
break;
case 'par':
case 'page':
case 'column':
case 'line':
case 'lbr':
$toText .= "\n";
break;
case 'emspace':
case 'enspace':
case 'qmspace':
$toText .= ' ';
break;
case 'tab':
$toText .= "\t";
break;
case 'chdate':
$toText .= date('m.d.Y');
break;
case 'chdpl':
$toText .= date('l, j F Y');
break;
case 'chdpa':
$toText .= date('D, j M Y');
break;
case 'chtime':
$toText .= date('H:i:s');
break;
case 'emdash':
$toText .= html_entity_decode('&mdash;');
break;
case 'endash':
$toText .= html_entity_decode('&ndash;');
break;
case 'bullet':
$toText .= html_entity_decode('&#149;');
break;
case 'lquote':
$toText .= html_entity_decode('&lsquo;');
break;
case 'rquote':
$toText .= html_entity_decode('&rsquo;');
break;
case 'ldblquote':
$toText .= html_entity_decode('&laquo;');
break;
case 'rdblquote':
$toText .= html_entity_decode('&raquo;');
break;
default:
$stack[$j][strtolower($word)] = empty($param) ? true : $param;
break;
}
// Add data to the output stream if required.
if (self::_rtfIsPlain($stack[$j])) {
$document .= $toText;
}
}
$i++;
break;
case '{':
// New subgroup starts, add new stack element and write the data
// from previous stack element to it.
if (!empty($stack[$j])) {
$stack[] = $stack[$j++];
} else {
$j++;
}
break;
case '}':
array_pop($stack);
$j--;
break;
case '\0':
case '\r':
case '\f':
case '\n':
// Junk
break;
default:
// Add other data to the output stream if required.
if (!empty($stack[$j]) && self::_rtfIsPlain($stack[$j])) {
$document .= $c;
}
break;
}
}
return $document;
}
/**
* Checks if an RTF element is plain text
*/
protected static function _rtfIsPlain($s)
{
$notPlain = ['*', 'fonttbl', 'colortbl', 'datastore', 'themedata', 'stylesheet'];
for ($i = 0; $i < count($notPlain); $i++) {
if (!empty($s[$notPlain[$i]])) {
return false;
}
}
return true;
}
}