* @version 0.0.5
* @package phlyMail
*/
class uctc {

	/**
	 * Names of the supported encodings.
	 *
	 * Each name maps onto a pair of private methods called "<name>_ucs4array" and
	 * "ucs4array_<name>" (except 'ucs4array' itself, which is the pivot format).
	 * Not implemented: ucs4le, ucs4be, utf16, utf16le, utf16be.
	 *
	 * @var array
	 */
	private static $mechs = array(
		'ucs4',
		'ucs4array',
		'utf8',
		'utf7',
		'utf7imap',
	);

	/**
	 * Base64 alphabet without its last character; '/' (UTF-7) or ',' (IMAP) is appended on use.
	 *
	 * @var string
	 */
	private static $b64_common = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+';

	/**
	 * When true the UTF-8 decoder skips its legal-range test on the second byte.
	 *
	 * @var bool
	 */
	private static $allow_overlong = false;

	/**
	 * Whether invalid input is replaced by the safe char instead of raising an Exception.
	 *
	 * @var bool
	 */
	private static $safe_mode = false;

	/**
	 * Codepoint substituted for invalid input.
	 *
	 * @var int
	 */
	private static $safe_char = 0xFFFC;

	/**
	 * The actual conversion routine
	 *
	 * The data is first decoded into an array of codepoints (unless it already is one)
	 * and then encoded into the target format (unless the target is 'ucs4array').
	 *
	 * @param mixed  $data      The data to convert, usually a string, array when converting from UCS-4 array
	 * @param string $from      Original encoding of the data, one of the names in self::$mechs
	 * @param string $to        Target encoding of the data, one of the names in self::$mechs
	 * @param bool   $safe_mode SafeMode tries to correct invalid codepoints
	 * @param int    $safe_char Codepoint used as replacement in safe mode; anything that is not a
	 *                          positive number falls back to U+FFFC
	 * @return mixed String or array, depending on target encoding
	 * @throws Exception On an unknown format name or (outside safe mode) on malformed input
	 * @access public
	 * @since 0.0.1
	 */
	public static function convert( $data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC ) {
		self::$safe_mode = ( $safe_mode ) ? true : false;
		self::$safe_char = ( is_numeric( $safe_char ) && (int) $safe_char > 0 ) ? (int) $safe_char : 0xFFFC;
		// Set on every call: the flag is static and must not leak from a safe-mode call
		// into a later strict call.
		self::$allow_overlong = self::$safe_mode;

		if ( ! in_array( $from, self::$mechs, true ) ) {
			throw new Exception( 'Invalid input format specified' );
		}
		if ( ! in_array( $to, self::$mechs, true ) ) {
			throw new Exception( 'Invalid output format specified' );
		}
		if ( 'ucs4array' !== $from ) {
			$decoder = $from . '_ucs4array';
			$data    = self::$decoder( $data );
		}
		if ( 'ucs4array' !== $to ) {
			$encoder = 'ucs4array_' . $to;
			$data    = self::$encoder( $data );
		}
		return $data;
	}

	/**
	 * This converts an UTF-8 encoded string to its UCS-4 representation
	 *
	 * A byte-wise state machine: 'next' expects a lead byte or ASCII, 'add' expects a
	 * continuation byte, 'no' is entered after a truncated trailing sequence and makes
	 * the remaining non-ASCII bytes being ignored.
	 *
	 * @param string $input The UTF-8 string to convert
	 * @return array Array of 32bit values representing each codepoint
	 * @throws Exception On malformed input when safe mode is off
	 * @access private
	 */
	private static function utf8_ucs4array( $input ) {
		$output     = array();
		$out_len    = 0;
		$inp_len    = strlen( $input );
		$mode       = 'next';
		$test       = 'none';
		$start_byte = 0;
		$next_byte  = 0;
		for ( $k = 0; $k < $inp_len; ++$k ) {
			$v = ord( $input[ $k ] ); // Extract byte from input string

			if ( $v < 128 ) {
				// We found an ASCII char - put into string as is
				$output[ $out_len ] = $v;
				++$out_len;
				if ( 'add' === $mode ) {
					// ASCII interrupts an unfinished multi byte sequence
					if ( ! self::$safe_mode ) {
						throw new Exception( 'Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k );
					}
					// The unfinished codepoint sits right before the ASCII char just stored
					$output[ $out_len - 2 ] = self::$safe_char;
					$mode                   = 'next';
				}
				continue;
			}

			if ( 'next' === $mode ) {
				// Try to find the next start byte; determine the width of the Unicode char
				$start_byte = $v;
				$mode       = 'add';
				$test       = 'range';
				if ( 6 === ( $v >> 5 ) ) { // &110xxxxx 10xxxxx
					$next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
					$v         = ( $v - 192 ) << 6;
				} elseif ( 14 === ( $v >> 4 ) ) { // &1110xxxx 10xxxxxx 10xxxxxx
					$next_byte = 1;
					$v         = ( $v - 224 ) << 12;
				} elseif ( 30 === ( $v >> 3 ) ) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					$next_byte = 2;
					$v         = ( $v - 240 ) << 18;
				} elseif ( self::$safe_mode ) {
					$mode               = 'next';
					$output[ $out_len ] = self::$safe_char;
					++$out_len;
					continue;
				} else {
					throw new Exception( 'This might be UTF-8, but I don\'t understand it at byte ' . $k );
				}
				if ( $inp_len - $k - $next_byte < 2 ) {
					// Sequence is cut off by the end of the input. Advance the write position
					// so that a trailing ASCII byte cannot overwrite the replacement.
					$output[ $out_len ] = self::$safe_char;
					++$out_len;
					$mode = 'no';
					continue;
				}
				$output[ $out_len ] = (int) $v;
				++$out_len;
				continue;
			}

			if ( 'add' === $mode ) {
				if ( ! self::$allow_overlong && 'range' === $test ) {
					// Checked once per sequence, on the byte following the lead byte
					$test = 'none';
					if ( $start_byte < 0xC2 // C0 and C1 can only start overlong forms
						|| $start_byte > 0xF4 // F5..F7 would encode values above U+10FFFF
						|| ( $v < 0xA0 && 0xE0 === $start_byte )
						|| ( $v < 0x90 && 0xF0 === $start_byte )
						|| ( $v > 0x8F && 0xF4 === $start_byte )
					) {
						throw new Exception( 'Bogus UTF-8 character detected (out of legal range) at byte ' . $k );
					}
				}
				if ( 2 === ( $v >> 6 ) ) { // Bit mask must be 10xxxxxx
					$v                       = ( $v - 128 ) << ( $next_byte * 6 );
					$output[ $out_len - 1 ] += $v;
					--$next_byte;
				} elseif ( self::$safe_mode ) {
					// Replace the unfinished codepoint and re-read this byte as a new start byte
					$output[ $out_len - 1 ] = self::$safe_char;
					$k--;
					$mode = 'next';
					continue;
				} else {
					throw new Exception( 'Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k );
				}
				if ( $next_byte < 0 ) {
					$mode = 'next';
				}
			}
		} // for
		return $output;
	}

	/**
	 * Convert UCS-4 array into UTF-8 string
	 * See utf8_ucs4array() for details
	 *
	 * @param array $input Array of codepoints
	 * @return string The UTF-8 encoded string
	 * @throws Exception On a value needing more than 21 bits when safe mode is off
	 * @access private
	 */
	private static function ucs4array_utf8( $input ) {
		$output = '';
		foreach ( $input as $k => $v ) {
			$bytes = self::codepoint_to_utf8( $v );
			if ( null === $bytes ) {
				if ( ! self::$safe_mode ) {
					throw new Exception( 'Conversion from UCS-4 to UTF-8 failed: malformed input at byte ' . $k );
				}
				// Emit the replacement as UTF-8 bytes, not as the decimal text of its number
				$bytes = self::codepoint_to_utf8( self::$safe_char );
				if ( null === $bytes ) {
					$bytes = self::codepoint_to_utf8( 0xFFFC );
				}
			}
			$output .= $bytes;
		}
		return $output;
	}

	/**
	 * Encode one codepoint as UTF-8.
	 *
	 * @param int $v The codepoint
	 * @return string|null The byte sequence, null when the value needs more than 21 bits
	 * @access private
	 */
	private static function codepoint_to_utf8( $v ) {
		if ( $v < 128 ) { // 7bit are transferred literally
			return chr( $v );
		}
		if ( $v < ( 1 << 11 ) ) { // 2 bytes
			return chr( 192 + ( $v >> 6 ) ) . chr( 128 + ( $v & 63 ) );
		}
		if ( $v < ( 1 << 16 ) ) { // 3 bytes
			return chr( 224 + ( $v >> 12 ) ) . chr( 128 + ( ( $v >> 6 ) & 63 ) ) . chr( 128 + ( $v & 63 ) );
		}
		if ( $v < ( 1 << 21 ) ) { // 4 bytes
			return chr( 240 + ( $v >> 18 ) ) . chr( 128 + ( ( $v >> 12 ) & 63 ) ) . chr( 128 + ( ( $v >> 6 ) & 63 ) ) . chr( 128 + ( $v & 63 ) );
		}
		return null;
	}

	/**
	 * Convert an IMAP style UTF-7 string ('&' as shift char, ',' instead of '/' in base64) into UCS-4 array
	 *
	 * @param string $input The string to convert
	 * @return array Array of codepoints
	 * @access private
	 */
	private static function utf7imap_ucs4array( $input ) {
		return self::utf7_ucs4array( $input, '&', true );
	}

	/**
	 * Convert UTF-7 string into UCS-4 array
	 *
	 * A base64 run starts at the shift char and ends at the first char outside the base64
	 * alphabet or at the end of the input. A terminating '-' is swallowed, any other
	 * terminating char is kept as literal text. Shift char directly followed by '-' yields
	 * the shift char itself. Zero bytes in the input are ignored.
	 *
	 * @param string $input The string to convert
	 * @param string $sc    The shift char introducing a base64 run
	 * @param bool   $imap  True if the base64 runs use ',' in place of '/'
	 * @return array Array of codepoints
	 * @access private
	 */
	private static function utf7_ucs4array( $input, $sc = '+', $imap = false ) {
		$output   = array();
		$inp_len  = strlen( $input );
		$alphabet = self::$b64_common . ( $imap ? ',' : '/' );
		$in_b64   = false;
		$b64      = '';
		for ( $k = 0; $k < $inp_len; ++$k ) {
			$c = $input[ $k ];
			if ( 0 === ord( $c ) ) {
				continue; // Ignore zero bytes
			}
			if ( $in_b64 ) {
				if ( false !== strpos( $alphabet, $c ) ) {
					$b64 .= $c;
					continue;
				}
				// Sequence got terminated
				$in_b64 = false;
				if ( '' === $b64 ) {
					if ( '-' === $c ) {
						$output[] = ord( $sc );
						continue;
					}
				} else {
					foreach ( self::utf7_decode_run( $b64, $imap ) as $codepoint ) {
						$output[] = $codepoint;
					}
					$b64 = '';
					if ( '-' === $c ) {
						continue;
					}
				}
				// Any other terminator is ordinary text and handled below
			}
			if ( $sc === $c ) {
				$in_b64 = true;
				continue;
			}
			$output[] = ord( $c );
		}
		if ( $in_b64 && '' !== $b64 ) {
			// Input ended inside a base64 run
			foreach ( self::utf7_decode_run( $b64, $imap ) as $codepoint ) {
				$output[] = $codepoint;
			}
		}
		return $output;
	}

	/**
	 * Decode the base64 payload of one UTF-7 run into codepoints.
	 *
	 * The payload is read as big endian 16bit units; an odd trailing byte is dropped and
	 * a high surrogate followed by a low surrogate is merged into a single codepoint.
	 * Unpaired surrogates are passed through unchanged.
	 *
	 * @param string $b64  The base64 chars of the run, without shift char and terminator
	 * @param bool   $imap True if ',' stands for '/'
	 * @return array Array of codepoints
	 * @access private
	 */
	private static function utf7_decode_run( $b64, $imap ) {
		if ( $imap ) {
			$b64 = strtr( $b64, ',', '/' );
		}
		$bytes = (string) base64_decode( $b64 );
		$len   = strlen( $bytes );
		$len  -= $len % 2;
		$out   = array();
		$high  = 0; // Pending high surrogate, 0 if there is none
		for ( $i = 0; $i < $len; $i += 2 ) {
			$unit = ( ord( $bytes[ $i ] ) << 8 ) | ord( $bytes[ $i + 1 ] );
			if ( $high && $unit >= 0xDC00 && $unit <= 0xDFFF ) {
				// Replace the high surrogate stored in the previous round by the full codepoint
				$out[ count( $out ) - 1 ] = 0x10000 + ( ( $high - 0xD800 ) << 10 ) + ( $unit - 0xDC00 );
				$high                     = 0;
				continue;
			}
			$high  = ( $unit >= 0xD800 && $unit <= 0xDBFF ) ? $unit : 0;
			$out[] = $unit;
		}
		return $out;
	}

	/**
	 * Convert UCS-4 array into an IMAP style UTF-7 string ('&' as shift char, ',' instead of '/' in base64)
	 *
	 * @param array $input Array of codepoints
	 * @return string The encoded string
	 * @throws Exception See ucs4array_utf7()
	 * @access private
	 */
	private static function ucs4array_utf7imap( $input ) {
		return self::ucs4array_utf7( $input, '&', true );
	}

	/**
	 * Convert UCS-4 array into UTF-7 string
	 *
	 * Codepoints 0x20..0x7e other than the shift char are written literally, everything
	 * else is collected as big endian UTF-16 and written as one base64 run per stretch.
	 * Every run is closed with '-'.
	 *
	 * @param array  $input Array of codepoints
	 * @param string $sc    The shift char introducing a base64 run
	 * @param bool   $imap  True to write ',' in place of '/' inside base64 runs
	 * @return string The encoded string
	 * @throws Exception On a codepoint outside 0..0x10FFFF when safe mode is off
	 * @access private
	 */
	private static function ucs4array_utf7( $input, $sc = '+', $imap = false ) {
		$output  = '';
		$pending = ''; // UTF-16BE bytes waiting to be written as one base64 run
		$sc_ord  = ord( $sc );
		foreach ( $input as $k => $v ) {
			if ( $v < 0 || $v > 0x10FFFF ) {
				// Not representable in UTF-16
				if ( ! self::$safe_mode ) {
					throw new Exception( 'Conversion from UCS-4 to UTF-7 failed: codepoint out of range at index ' . $k );
				}
				$v = ( self::$safe_char <= 0x10FFFF ) ? self::$safe_char : 0xFFFC;
			}
			if ( 0x20 <= $v && $v <= 0x7e && $sc_ord != $v ) {
				$output .= self::utf7_encode_run( $pending, $sc, $imap ) . chr( $v );
				$pending = '';
				continue;
			}
			if ( $v < 0x10000 ) {
				$pending .= chr( ( $v >> 8 ) & 255 ) . chr( $v & 255 );
			} else {
				// Split into a surrogate pair
				$v       -= 0x10000;
				$high     = 0xD800 | ( $v >> 10 );
				$low      = 0xDC00 | ( $v & 0x3FF );
				$pending .= chr( $high >> 8 ) . chr( $high & 255 ) . chr( $low >> 8 ) . chr( $low & 255 );
			}
		}
		// Write out a run that is still open at the end of the input
		return $output . self::utf7_encode_run( $pending, $sc, $imap );
	}

	/**
	 * Turn collected UTF-16BE bytes into one finished UTF-7 run.
	 *
	 * @param string $pending The UTF-16BE bytes, may be empty
	 * @param string $sc      The shift char
	 * @param bool   $imap    True to write ',' in place of '/'
	 * @return string Empty string for empty input, shift char plus '-' for a lone shift char,
	 *                otherwise shift char, unpadded base64 and '-'
	 * @access private
	 */
	private static function utf7_encode_run( $pending, $sc, $imap ) {
		if ( '' === $pending ) {
			return '';
		}
		if ( chr( 0 ) . $sc === $pending ) {
			return $sc . '-';
		}
		// The '=' padding is dropped; the decoder discards the left over bits
		$b64 = rtrim( base64_encode( $pending ), '=' );
		if ( $imap ) {
			$b64 = strtr( $b64, '/', ',' );
		}
		return $sc . $b64 . '-';
	}

	/**
	 * Convert UCS-4 array into UCS-4 string (big endian, most significant byte first)
	 *
	 * @param array $input Array of codepoints
	 * @return string Four bytes per codepoint
	 * @access private
	 */
	private static function ucs4array_ucs4( $input ) {
		$output = '';
		foreach ( $input as $v ) {
			$output .= pack( 'N', $v );
		}
		return $output;
	}

	/**
	 * Convert UCS-4 string (big endian, most significant byte first) into UCS-4 array
	 *
	 * @param string $input The UCS-4 string
	 * @return array Array of codepoints
	 * @throws Exception If the input length is not a multiple of 4
	 * @access private
	 */
	private static function ucs4_ucs4array( $input ) {
		$inp_len = strlen( $input );
		// Input length must be dividable by 4
		if ( $inp_len % 4 ) {
			throw new Exception( 'Input UCS4 string is broken' );
		}
		// Empty input - return empty output
		if ( ! $inp_len ) {
			return array();
		}
		// unpack() numbers its result from 1, the callers expect a list starting at 0
		return array_values( unpack( 'N*', $input ) );
	}
}