<?php
// Conversion.php

namespace dokuwiki\Utf8;
  3. /**
  4. * Methods to convert from and to UTF-8 strings
  5. */
  6. class Conversion
  7. {
  8. /**
  9. * Encodes UTF-8 characters to HTML entities
  10. *
  11. * @author Tom N Harris <tnharris@whoopdedo.org>
  12. * @author <vpribish at shopping dot com>
  13. * @link http://php.net/manual/en/function.utf8-decode.php
  14. *
  15. * @param string $str
  16. * @param bool $all Encode non-utf8 char to HTML as well
  17. * @return string
  18. */
  19. public static function toHtml($str, $all = false)
  20. {
  21. $ret = '';
  22. foreach (Unicode::fromUtf8($str) as $cp) {
  23. if ($cp < 0x80 && !$all) {
  24. $ret .= chr($cp);
  25. } elseif ($cp < 0x100) {
  26. $ret .= "&#$cp;";
  27. } else {
  28. $ret .= '&#x' . dechex($cp) . ';';
  29. }
  30. }
  31. return $ret;
  32. }
  33. /**
  34. * Decodes HTML entities to UTF-8 characters
  35. *
  36. * Convert any &#..; entity to a codepoint,
  37. * The entities flag defaults to only decoding numeric entities.
  38. * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
  39. * are handled as well. Avoids the problem that would occur if you
  40. * had to decode "&amp;#38;&#38;amp;#38;"
  41. *
  42. * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
  43. * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
  44. * what it should be -> "&#38;&amp#38;"
  45. *
  46. * @author Tom N Harris <tnharris@whoopdedo.org>
  47. *
  48. * @param string $str UTF-8 encoded string
  49. * @param boolean $entities decode name entities in addtition to numeric ones
  50. * @return string UTF-8 encoded string with numeric (and named) entities replaced.
  51. */
  52. public static function fromHtml($str, $entities = false)
  53. {
  54. if (!$entities) {
  55. return preg_replace_callback(
  56. '/(&#([Xx])?([0-9A-Za-z]+);)/m',
  57. [__CLASS__, 'decodeNumericEntity'],
  58. $str
  59. );
  60. }
  61. return preg_replace_callback(
  62. '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
  63. [__CLASS__, 'decodeAnyEntity'],
  64. $str
  65. );
  66. }
  67. /**
  68. * Decodes any HTML entity to it's correct UTF-8 char equivalent
  69. *
  70. * @param string $ent An entity
  71. * @return string
  72. */
  73. protected static function decodeAnyEntity($ent)
  74. {
  75. // create the named entity lookup table
  76. static $table = null;
  77. if ($table === null) {
  78. $table = get_html_translation_table(HTML_ENTITIES);
  79. $table = array_flip($table);
  80. $table = array_map(
  81. static function ($c) {
  82. return Unicode::toUtf8(array(ord($c)));
  83. },
  84. $table
  85. );
  86. }
  87. if ($ent[1] === '#') {
  88. return self::decodeNumericEntity($ent);
  89. }
  90. if (array_key_exists($ent[0], $table)) {
  91. return $table[$ent[0]];
  92. }
  93. return $ent[0];
  94. }
  95. /**
  96. * Decodes numeric HTML entities to their correct UTF-8 characters
  97. *
  98. * @param $ent string A numeric entity
  99. * @return string|false
  100. */
  101. protected static function decodeNumericEntity($ent)
  102. {
  103. switch ($ent[2]) {
  104. case 'X':
  105. case 'x':
  106. $cp = hexdec($ent[3]);
  107. break;
  108. default:
  109. $cp = intval($ent[3]);
  110. break;
  111. }
  112. return Unicode::toUtf8(array($cp));
  113. }
  114. /**
  115. * UTF-8 to UTF-16BE conversion.
  116. *
  117. * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
  118. *
  119. * @param string $str
  120. * @param bool $bom
  121. * @return string
  122. */
  123. public static function toUtf16be($str, $bom = false)
  124. {
  125. $out = $bom ? "\xFE\xFF" : '';
  126. if (UTF8_MBSTRING) {
  127. return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
  128. }
  129. $uni = Unicode::fromUtf8($str);
  130. foreach ($uni as $cp) {
  131. $out .= pack('n', $cp);
  132. }
  133. return $out;
  134. }
  135. /**
  136. * UTF-8 to UTF-16BE conversion.
  137. *
  138. * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
  139. *
  140. * @param string $str
  141. * @return false|string
  142. */
  143. public static function fromUtf16be($str)
  144. {
  145. $uni = unpack('n*', $str);
  146. return Unicode::toUtf8($uni);
  147. }
  148. }