123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- <?php
- namespace dokuwiki\Utf8;
/**
 * Methods to convert from and to UTF-8 strings
 *
 * All methods are static. Code point <-> UTF-8 conversion is delegated to
 * the sibling Unicode class; this class only deals with HTML entities and
 * UTF-16BE byte strings.
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied through literally unless $all is set.
     * Every other code point below 0x100 is written as a decimal entity
     * (&#N;), everything from 0x100 upwards as a hexadecimal entity (&#xN;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Also encode ASCII characters (code points below 0x80) as entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is kept as-is
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (decimal "&#NNN;" and hexadecimal
     * "&#xHHH;") are decoded. Pass a truthy $entities (for example true or
     * the HTML_ENTITIES constant) to decode named entities such as "&amp;"
     * and "&lt;" in the same pass.
     *
     * Because everything is replaced in a single pass, the output of one
     * replacement is never scanned again. With $entities enabled the input
     * "&amp;#38;" therefore becomes "&#38;" and not "&", which is what
     * chaining a numeric decoder and a separate named-entity decoder would
     * produce.
     *
     * Entities that cannot be decoded (unknown names, malformed numbers) are
     * left in the string untouched.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // The outer group only exists so that the capture indexes line up
            // with the pattern below: [2] = hex marker, [3] = digits.
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // [1] = '#' for numeric entities, [2] = hex marker, [3] = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for preg_replace_callback() as used by fromHtml(). Numeric
     * entities are handed to decodeNumericEntity(); named entities are looked
     * up in PHP's HTML 4.01 entity table. Unknown names are returned as-is.
     *
     * @param string[] $ent Regex match: [0] full entity, [1] '#' or '',
     *                      [2] 'x'/'X' or '', [3] digits or entity name
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // Named entity lookup table ("&name;" => UTF-8 character), built once.
        static $table = null;
        if ($table === null) {
            // Ask for UTF-8 explicitly: the table values can then be used
            // directly. Deriving a code point with ord() from the table
            // characters only looks at their first byte and breaks every
            // non-ASCII entity when the table is UTF-8 encoded.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return isset($table[$ent[0]]) ? $table[$ent[0]] : $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Callback for preg_replace_callback() as used by fromHtml(). The regex
     * there accepts any alphanumeric run as "digits", so the digits are
     * validated here; a malformed entity such as "&#abc;" or "&#xzz;" is
     * returned unchanged instead of being decoded to a bogus character.
     *
     * @param string[] $ent Regex match: [0] full entity, [2] 'x'/'X' for
     *                      hexadecimal or '' for decimal, [3] the digits
     * @return string|false whatever Unicode::toUtf8() returns for the code
     *                      point, or the untouched entity text if malformed
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (!preg_match('/\A[0-9A-Fa-f]+\z/', $digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (!preg_match('/\A[0-9]+\z/', $digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * code points returned by Unicode::fromUtf8() are packed manually; code
     * points above U+FFFF are written as surrogate pairs.
     * NOTE(review): whether supplementary code points actually reach the
     * fallback depends on Unicode::fromUtf8(), which is not visible here.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prefix the result with the big-endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';

        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit into one 16-bit unit: split into high/low surrogate
                $offset = $cp - 0x10000;
                $out .= pack('nn', 0xD800 | ($offset >> 10), 0xDC00 | ($offset & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }

        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big-endian 16-bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into the single code
     * point it encodes; all other units (including unpaired surrogates) are
     * passed on unchanged to Unicode::toUtf8().
     *
     * @param string $str UTF-16BE encoded byte string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $unpacked = unpack('n*', $str);
        if ($unpacked === false) {
            return false;
        }

        // unpack() numbers its results from 1; reindex for plain iteration
        $units = array_values($unpacked);
        $count = count($units);
        $uni = [];

        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            $isHigh = $unit >= 0xD800 && $unit <= 0xDBFF;

            if ($isHigh && $i + 1 < $count && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF) {
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed as well
                continue;
            }

            $uni[] = $unit;
        }

        return Unicode::toUtf8($uni);
    }
}
|