| | Author: Michael Slusarz | | Author: Aleksander Machniak | +-----------------------------------------------------------------------+ */ /** * MS-TNEF format decoder based on code by: * Graham Norbury * Original design by: * Thomas Boll , Mark Simpson */ class rcube_tnef_decoder { public const SIGNATURE = 0x223E9F78; public const LVL_MESSAGE = 0x01; public const LVL_ATTACHMENT = 0x02; public const AFROM = 0x08000; public const ASUBJECT = 0x18004; public const AMESSAGEID = 0x18009; public const AFILENAME = 0x18010; public const APARENTID = 0x1800A; public const ACONVERSATIONID = 0x1800B; public const ABODY = 0x2800C; public const ADATESENT = 0x38005; public const ADATERECEIVED = 0x38006; public const ADATEMODIFIED = 0x38020; public const APRIORITY = 0x4800D; public const AOWNER = 0x60000; public const ASENTFOR = 0x60001; public const ASTATUS = 0x68007; public const ATTACHDATA = 0x6800F; public const ATTACHMETAFILE = 0x68011; public const ATTACHCREATEDATE = 0x38012; public const ARENDDATA = 0x69002; public const AMAPIPROPS = 0x69003; public const ARECIPIENTTABLE = 0x69004; public const AMAPIATTRS = 0x69005; public const AOEMCODEPAGE = 0x69007; public const AORIGINALMCLASS = 0x70006; public const AMCLASS = 0x78008; public const AVERSION = 0x89006; public const MAPI_TYPE_UNSET = 0x0000; public const MAPI_NULL = 0x0001; public const MAPI_SHORT = 0x0002; public const MAPI_INT = 0x0003; public const MAPI_FLOAT = 0x0004; public const MAPI_DOUBLE = 0x0005; public const MAPI_CURRENCY = 0x0006; public const MAPI_APPTIME = 0x0007; public const MAPI_ERROR = 0x000A; public const MAPI_BOOLEAN = 0x000B; public const MAPI_OBJECT = 0x000D; public const MAPI_INT8BYTE = 0x0014; public const MAPI_STRING = 0x001E; public const MAPI_UNICODE_STRING = 0x001F; public const MAPI_SYSTIME = 0x0040; public const MAPI_CLSID = 0x0048; public const MAPI_BINARY = 0x0102; public const MAPI_BODY = 0x1000; public const MAPI_RTF_COMPRESSED = 0x1009; public const MAPI_BODY_HTML = 0x1013; public const MAPI_NATIVE_BODY = 0x1016; public const MAPI_DISPLAY_NAME = 0x3001; public const MAPI_ADDRTYPE = 0x3002; public const MAPI_EMAIL_ADDRESS = 0x3003; public const MAPI_COMMENT = 0x3004; public const MAPI_DEPTH = 0x3005; public const MAPI_PROVIDER_DISPLAY = 0x3006; public const MAPI_CREATION_TIME = 0x3007; public const MAPI_LAST_MODIFICATION_TIME = 0x3008; public const MAPI_RESOURCE_FLAGS = 0x3009; public const MAPI_PROVIDER_DLL_NAME = 0x300A; public const MAPI_SEARCH_KEY = 0x300B; public const MAPI_ATTACHMENT_X400_PARAMETERS = 0x3700; public const MAPI_ATTACH_DATA = 0x3701; public const MAPI_ATTACH_ENCODING = 0x3702; public const MAPI_ATTACH_EXTENSION = 0x3703; public const MAPI_ATTACH_FILENAME = 0x3704; public const MAPI_ATTACH_METHOD = 0x3705; public const MAPI_ATTACH_LONG_FILENAME = 0x3707; public const MAPI_ATTACH_PATHNAME = 0x3708; public const MAPI_ATTACH_RENDERING = 0x3709; public const MAPI_ATTACH_TAG = 0x370A; public const MAPI_RENDERING_POSITION = 0x370B; public const MAPI_ATTACH_TRANSPORT_NAME = 0x370C; public const MAPI_ATTACH_LONG_PATHNAME = 0x370D; public const MAPI_ATTACH_MIME_TAG = 0x370E; public const MAPI_ATTACH_ADDITIONAL_INFO = 0x370F; public const MAPI_ATTACH_MIME_SEQUENCE = 0x3710; public const MAPI_ATTACH_CONTENT_ID = 0x3712; public const MAPI_ATTACH_CONTENT_LOCATION = 0x3713; public const MAPI_ATTACH_FLAGS = 0x3714; public const MAPI_NAMED_TYPE_ID = 0x0000; public const MAPI_NAMED_TYPE_STRING = 0x0001; public const MAPI_NAMED_TYPE_NONE = 0xFF; public const MAPI_MV_FLAG = 0x1000; public const RTF_UNCOMPRESSED = 0x414C454D; public const RTF_COMPRESSED = 0x75465A4C; protected $codepage; /** * Decompress the data. * * @param string $data the data to decompress * @param bool $as_html Return message body as HTML * * @return array the decompressed data */ public function decompress($data, $as_html = false) { $attachments = []; $message = []; if ($this->_geti($data, 32) == self::SIGNATURE) { $this->_geti($data, 16); // Version $this->_geti($data, 8); // lvl_message $this->_geti($data, 32); // idTnefVersion $this->_getx($data, $this->_geti($data, 32)); $this->_geti($data, 16); // checksum while (strlen($data) > 0) { switch ($this->_geti($data, 8)) { case self::LVL_MESSAGE: $this->_decodeMessage($data, $message); break; case self::LVL_ATTACHMENT: $this->_decodeAttachment($data, $attachments); break; } } } // Return the message body as HTML if ($as_html) { // HTML body if (!empty($message['size']) && $message['subtype'] == 'html') { $message = $message['stream']; } // RTF body (converted to HTML) // Note: RTF can contain encapsulated HTML content elseif (!empty($message['size']) && $message['subtype'] == 'rtf' && function_exists('iconv') && class_exists('RtfHtmlPhp\Document') ) { try { $document = new Document($message['stream']); $formatter = new HtmlFormatter(RCUBE_CHARSET); $message = $formatter->format($document); } catch (Exception $e) { // ignore the body $message = null; rcube::raise_error('Failed to extract RTF/HTML content from TNEF attachment', true); } } else { $message = null; } } return [ 'message' => $message, 'attachments' => array_reverse($attachments), ]; } /** * Pop specified number of bytes from the buffer. * * @param string &$data The data string * @param int $bytes how many bytes to retrieve * * @return string Extracted data */ protected function _getx(&$data, $bytes) { $value = null; if (strlen($data) >= $bytes) { $value = substr($data, 0, $bytes); $data = substr($data, $bytes); } return $value; } /** * Pop specified number of bits from the buffer * * @param string &$data The data string * @param int $bits how many bits to retrieve * * @return int|null */ protected function _geti(&$data, $bits) { $bytes = $bits / 8; $value = null; if (strlen($data) >= $bytes) { $value = ord($data[0]); if ($bytes >= 2) { $value += (ord($data[1]) << 8); } if ($bytes >= 4) { $value += (ord($data[2]) << 16) + (ord($data[3]) << 24); } $data = substr($data, $bytes); } return $value; } /** * Decode a single attribute * * @param string &$data The data string * * @return string Extracted data */ protected function _decodeAttribute(&$data) { // Data. $value = $this->_getx($data, $this->_geti($data, 32)); // Checksum. $this->_geti($data, 16); return $value; } /** * TODO * * @param string $data the data string * @param array &$result TODO */ protected function _extractMapiAttributes($data, &$result) { // Number of attributes. $number = $this->_geti($data, 32); while ((strlen($data) > 0) && $number--) { $have_mval = false; $num_mval = 1; $value = null; $attr_type = $this->_geti($data, 16); $attr_name = $this->_geti($data, 16); if (($attr_type & self::MAPI_MV_FLAG) != 0) { $have_mval = true; $attr_type &= ~self::MAPI_MV_FLAG; } if (($attr_name >= 0x8000) && ($attr_name < 0xFFFE)) { $this->_getx($data, 16); $named_type = $this->_geti($data, 32); switch ($named_type) { case self::MAPI_NAMED_TYPE_ID: $attr_name = $this->_geti($data, 32); break; case self::MAPI_NAMED_TYPE_STRING: $attr_name = 0x9999; $idlen = $this->_geti($data, 32); $name = $this->_getx($data, $idlen + ((4 - ($idlen % 4)) % 4)); // $name = $this->convertString(substr($name, 0, $idlen)); break; case self::MAPI_NAMED_TYPE_NONE: default: continue 2; } } if ($have_mval) { $num_mval = $this->_geti($data, 32); } switch ($attr_type) { case self::MAPI_NULL: case self::MAPI_TYPE_UNSET: break; case self::MAPI_SHORT: $value = $this->_geti($data, 16); $this->_geti($data, 16); break; case self::MAPI_INT: case self::MAPI_BOOLEAN: for ($i = 0; $i < $num_mval; $i++) { $value = $this->_geti($data, 32); } break; case self::MAPI_FLOAT: case self::MAPI_ERROR: $value = $this->_getx($data, 4); break; case self::MAPI_DOUBLE: case self::MAPI_APPTIME: case self::MAPI_CURRENCY: case self::MAPI_INT8BYTE: case self::MAPI_SYSTIME: $value = $this->_getx($data, 8); break; case self::MAPI_STRING: case self::MAPI_UNICODE_STRING: case self::MAPI_BINARY: case self::MAPI_OBJECT: $num_vals = $have_mval ? $num_mval : $this->_geti($data, 32); for ($i = 0; $i < $num_vals; $i++) { $length = $this->_geti($data, 32); // Pad to next 4 byte boundary. $datalen = $length + ((4 - ($length % 4)) % 4); // Read and truncate to length. $value = $this->_getx($data, $datalen); } if ($attr_type == self::MAPI_UNICODE_STRING) { $value = $this->convertString($value); } break; } // Store any interesting attributes. switch ($attr_name) { case self::MAPI_RTF_COMPRESSED: $result['type'] = 'application'; $result['subtype'] = 'rtf'; $result['name'] = (!empty($result['name']) ? $result['name'] : 'Untitled') . '.rtf'; $result['stream'] = $this->_decodeRTF($value); $result['size'] = strlen($result['stream']); break; case self::MAPI_BODY: case self::MAPI_BODY_HTML: $result['type'] = 'text'; $result['subtype'] = $attr_name == self::MAPI_BODY ? 'plain' : 'html'; $result['name'] = (!empty($result['name']) ? $result['name'] : 'Untitled') . ($attr_name == self::MAPI_BODY ? '.txt' : '.html'); $result['stream'] = $value; $result['size'] = strlen($value); break; case self::MAPI_ATTACH_LONG_FILENAME: // Used in preference to AFILENAME value. $result['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', $value)); break; case self::MAPI_ATTACH_MIME_TAG: // Is this ever set, and what is format? $value = explode('/', trim($value)); $result['type'] = $value[0]; $result['subtype'] = $value[1]; break; case self::MAPI_ATTACH_CONTENT_ID: $result['content-id'] = $value; break; case self::MAPI_ATTACH_DATA: $this->_getx($value, 16); $att = new self(); $res = $att->decompress($value); $result = array_merge($result, $res['message']); break; } } } /** * Decodes TNEF message attributes * * @param string &$data The data string * @param array &$message Message data */ protected function _decodeMessage(&$data, &$message) { $attribute = $this->_geti($data, 32); $value = $this->_decodeAttribute($data); switch ($attribute) { case self::AOEMCODEPAGE: // Find codepage of the message $value = unpack('V', $value); $this->codepage = $value[1]; break; case self::AMCLASS: $value = trim(str_replace('Microsoft Mail v3.0 ', '', $value)); // Normal message will be that with prefix 'IPM.Microsoft Mail. break; case self::ASUBJECT: $message['name'] = $value; break; case self::AMAPIPROPS: $this->_extractMapiAttributes($value, $message); break; } } /** * Decodes TNEF attachment attributes * * @param string &$data The data string * @param array &$attachment Attachments data */ protected function _decodeAttachment(&$data, &$attachment) { $attribute = $this->_geti($data, 32); $size = $this->_geti($data, 32); $value = $this->_getx($data, $size); $this->_geti($data, 16); // checksum switch ($attribute) { case self::ARENDDATA: // Add a new default data block to hold details of this // attachment. Reverse order is easier to handle later! array_unshift($attachment, [ 'type' => 'application', 'subtype' => 'octet-stream', 'name' => 'unknown', 'stream' => '', ]); break; case self::AFILENAME: $value = $this->convertString($value, true); // Strip path $attachment[0]['name'] = trim(preg_replace('/.*[\/](.*)$/', '\1', $value)); break; case self::ATTACHDATA: // The attachment itself $attachment[0]['size'] = $size; $attachment[0]['stream'] = $value; break; case self::AMAPIATTRS: $this->_extractMapiAttributes($value, $attachment[0]); break; } } /** * Convert string value to system charset according to defined codepage */ protected function convertString($str, $use_codepage = false) { if ($use_codepage && $this->codepage && ($charset = rcube_charset::$windows_codepages[$this->codepage]) ) { $str = rcube_charset::convert($str, $charset, RCUBE_CHARSET); } elseif (($pos = strpos($str, "\0")) !== false && $pos != strlen($str) - 1) { $str = rcube_charset::convert($str, 'UTF-16LE', RCUBE_CHARSET); } return trim($str); } /** * Decodes TNEF RTF */ protected function _decodeRTF($data) { $c_size = $this->_geti($data, 32); $size = $this->_geti($data, 32); $magic = $this->_geti($data, 32); $crc = $this->_geti($data, 32); if ($magic == self::RTF_COMPRESSED) { $data = $this->_decompressRTF($data, $size); } return $data; } /** * Decompress compressed RTF. Logic taken from Horde. */ protected function _decompressRTF($data, $size) { $in = $out = $flags = $flag_count = 0; $uncomp = ''; $preload = "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArialTimes New RomanCourier{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"; $length_preload = strlen($preload); for ($cnt = 0; $cnt < $length_preload; $cnt++) { $uncomp .= $preload[$cnt]; $out++; } while ($out < ($size + $length_preload)) { if (($flag_count++ % 8) == 0) { $flags = ord($data[$in++]); } else { $flags = $flags >> 1; } if (($flags & 1) != 0) { $offset = ord($data[$in++]); $length = ord($data[$in++]); $offset = ($offset << 4) | ($length >> 4); $length = ($length & 0xF) + 2; $offset = ((int) ($out / 4096)) * 4096 + $offset; if ($offset >= $out) { $offset -= 4096; } $end = $offset + $length; while ($offset < $end) { $uncomp .= $uncomp[$offset++]; $out++; } } else { $uncomp .= $data[$in++]; $out++; } } return substr($uncomp, $length_preload); } /** * Parse RTF data and return the best plaintext representation we can. * Adapted from: http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php * * @param string $text the RTF (uncompressed) text * * @return string the plain text */ public static function rtf2text($text) { $document = ''; $stack = []; $j = -1; // Read the data character-by- character… for ($i = 0, $len = strlen($text); $i < $len; $i++) { $c = $text[$i]; switch ($c) { case '\\': // Key Word $nextChar = $text[$i + 1]; // If it is another backslash or nonbreaking space or hyphen, // then the character is plain text and add it to the output stream. if ($nextChar == '\\' && self::_rtfIsPlain($stack[$j])) { $document .= '\\'; } elseif ($nextChar == '~' && self::_rtfIsPlain($stack[$j])) { $document .= ' '; } elseif ($nextChar == '_' && self::_rtfIsPlain($stack[$j])) { $document .= '-'; } elseif ($nextChar == '*') { // Add to the stack. $stack[$j]['*'] = true; } elseif ($nextChar == "'") { // If it is a single quote, read next two characters that // are the hexadecimal notation of a character we should add // to the output stream. $hex = substr($text, $i + 2, 2); if (self::_rtfIsPlain($stack[$j])) { $document .= html_entity_decode('&#' . hexdec($hex) . ';'); } // Shift the pointer. $i += 2; } elseif ($nextChar >= 'a' && $nextChar <= 'z' || $nextChar >= 'A' && $nextChar <= 'Z') { // Since, we’ve found the alphabetic character, the next // characters are control words and, possibly, some digit // parameter. $word = ''; $param = null; // Start reading characters after the backslash. for ($k = $i + 1, $m = 0; $k < strlen($text); $k++, $m++) { $nextChar = $text[$k]; // If the current character is a letter and there were // no digits before it, then we’re still reading the // control word. If there were digits, we should stop // since we reach the end of the control word. if ($nextChar >= 'a' && $nextChar <= 'z' || $nextChar >= 'A' && $nextChar <= 'Z') { if (!empty($param)) { break; } $word .= $nextChar; } elseif ($nextChar >= '0' && $nextChar <= '9') { // If it is a digit, store the parameter. $param .= $nextChar; } elseif ($nextChar == '-') { // Since minus sign may occur only before a digit // parameter, check whether $param is empty. // Otherwise, we reach the end of the control word. if (!empty($param)) { break; } $param .= $nextChar; } else { break; } } // Shift the pointer on the number of read characters. $i += $m - 1; // Start analyzing.We are interested mostly in control words $toText = ''; switch (strtolower($word)) { // If the control word is "u", then its parameter is // the decimal notation of the Unicode character that // should be added to the output stream. We need to // check whether the stack contains \ucN control word. // If it does, we should remove the N characters from // the output stream. case 'u': $toText .= html_entity_decode('&#x' . dechex($param) . ';'); $ucDelta = @$stack[$j]['uc']; if ($ucDelta > 0) { $i += $ucDelta; } break; case 'par': case 'page': case 'column': case 'line': case 'lbr': $toText .= "\n"; break; case 'emspace': case 'enspace': case 'qmspace': $toText .= ' '; break; case 'tab': $toText .= "\t"; break; case 'chdate': $toText .= date('m.d.Y'); break; case 'chdpl': $toText .= date('l, j F Y'); break; case 'chdpa': $toText .= date('D, j M Y'); break; case 'chtime': $toText .= date('H:i:s'); break; case 'emdash': $toText .= html_entity_decode('—'); break; case 'endash': $toText .= html_entity_decode('–'); break; case 'bullet': $toText .= html_entity_decode('•'); break; case 'lquote': $toText .= html_entity_decode('‘'); break; case 'rquote': $toText .= html_entity_decode('’'); break; case 'ldblquote': $toText .= html_entity_decode('«'); break; case 'rdblquote': $toText .= html_entity_decode('»'); break; default: $stack[$j][strtolower($word)] = empty($param) ? true : $param; break; } // Add data to the output stream if required. if (self::_rtfIsPlain($stack[$j])) { $document .= $toText; } } $i++; break; case '{': // New subgroup starts, add new stack element and write the data // from previous stack element to it. if (!empty($stack[$j])) { $stack[] = $stack[$j++]; } else { $j++; } break; case '}': array_pop($stack); $j--; break; case '\0': case '\r': case '\f': case '\n': // Junk break; default: // Add other data to the output stream if required. if (!empty($stack[$j]) && self::_rtfIsPlain($stack[$j])) { $document .= $c; } break; } } return $document; } /** * Checks if an RTF element is plain text */ protected static function _rtfIsPlain($s) { $notPlain = ['*', 'fonttbl', 'colortbl', 'datastore', 'themedata', 'stylesheet']; for ($i = 0; $i < count($notPlain); $i++) { if (!empty($s[$notPlain[$i]])) { return false; } } return true; } }