Unicode.php 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. <?php
  2. namespace dokuwiki\Utf8;
  3. /**
  4. * Convert between UTF-8 and a list of Unicode Code Points
  5. */
  6. class Unicode
  7. {
  8. /**
  9. * Takes an UTF-8 string and returns an array of ints representing the
  10. * Unicode characters. Astral planes are supported ie. the ints in the
  11. * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  12. * are not allowed.
  13. *
  14. * If $strict is set to true the function returns false if the input
  15. * string isn't a valid UTF-8 octet sequence and raises a PHP error at
  16. * level E_USER_WARNING
  17. *
  18. * Note: this function has been modified slightly in this library to
  19. * trigger errors on encountering bad bytes
  20. *
  21. * @author <hsivonen@iki.fi>
  22. * @author Harry Fuecks <hfuecks@gmail.com>
  23. * @see unicode_to_utf8
  24. * @link http://hsivonen.iki.fi/php-utf8/
  25. * @link http://sourceforge.net/projects/phputf8/
  26. * @todo break into less complex chunks
  27. * @todo use exceptions instead of user errors
  28. *
  29. * @param string $str UTF-8 encoded string
  30. * @param boolean $strict Check for invalid sequences?
  31. * @return mixed array of unicode code points or false if UTF-8 invalid
  32. */
  33. public static function fromUtf8($str, $strict = false)
  34. {
  35. $mState = 0; // cached expected number of octets after the current octet
  36. // until the beginning of the next UTF8 character sequence
  37. $mUcs4 = 0; // cached Unicode character
  38. $mBytes = 1; // cached expected number of octets in the current sequence
  39. $out = array();
  40. $len = strlen($str);
  41. for ($i = 0; $i < $len; $i++) {
  42. $in = ord($str[$i]);
  43. if ($mState === 0) {
  44. // When mState is zero we expect either a US-ASCII character or a
  45. // multi-octet sequence.
  46. if (0 === (0x80 & $in)) {
  47. // US-ASCII, pass straight through.
  48. $out[] = $in;
  49. $mBytes = 1;
  50. } else if (0xC0 === (0xE0 & $in)) {
  51. // First octet of 2 octet sequence
  52. $mUcs4 = $in;
  53. $mUcs4 = ($mUcs4 & 0x1F) << 6;
  54. $mState = 1;
  55. $mBytes = 2;
  56. } else if (0xE0 === (0xF0 & $in)) {
  57. // First octet of 3 octet sequence
  58. $mUcs4 = $in;
  59. $mUcs4 = ($mUcs4 & 0x0F) << 12;
  60. $mState = 2;
  61. $mBytes = 3;
  62. } else if (0xF0 === (0xF8 & $in)) {
  63. // First octet of 4 octet sequence
  64. $mUcs4 = $in;
  65. $mUcs4 = ($mUcs4 & 0x07) << 18;
  66. $mState = 3;
  67. $mBytes = 4;
  68. } else if (0xF8 === (0xFC & $in)) {
  69. /* First octet of 5 octet sequence.
  70. *
  71. * This is illegal because the encoded codepoint must be either
  72. * (a) not the shortest form or
  73. * (b) outside the Unicode range of 0-0x10FFFF.
  74. * Rather than trying to resynchronize, we will carry on until the end
  75. * of the sequence and let the later error handling code catch it.
  76. */
  77. $mUcs4 = $in;
  78. $mUcs4 = ($mUcs4 & 0x03) << 24;
  79. $mState = 4;
  80. $mBytes = 5;
  81. } else if (0xFC === (0xFE & $in)) {
  82. // First octet of 6 octet sequence, see comments for 5 octet sequence.
  83. $mUcs4 = $in;
  84. $mUcs4 = ($mUcs4 & 1) << 30;
  85. $mState = 5;
  86. $mBytes = 6;
  87. } elseif ($strict) {
  88. /* Current octet is neither in the US-ASCII range nor a legal first
  89. * octet of a multi-octet sequence.
  90. */
  91. trigger_error(
  92. 'utf8_to_unicode: Illegal sequence identifier ' .
  93. 'in UTF-8 at byte ' . $i,
  94. E_USER_WARNING
  95. );
  96. return false;
  97. }
  98. } else {
  99. // When mState is non-zero, we expect a continuation of the multi-octet
  100. // sequence
  101. if (0x80 === (0xC0 & $in)) {
  102. // Legal continuation.
  103. $shift = ($mState - 1) * 6;
  104. $tmp = $in;
  105. $tmp = ($tmp & 0x0000003F) << $shift;
  106. $mUcs4 |= $tmp;
  107. /**
  108. * End of the multi-octet sequence. mUcs4 now contains the final
  109. * Unicode codepoint to be output
  110. */
  111. if (0 === --$mState) {
  112. /*
  113. * Check for illegal sequences and codepoints.
  114. */
  115. // From Unicode 3.1, non-shortest form is illegal
  116. if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
  117. ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
  118. ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
  119. (4 < $mBytes) ||
  120. // From Unicode 3.2, surrogate characters are illegal
  121. (($mUcs4 & 0xFFFFF800) === 0xD800) ||
  122. // Codepoints outside the Unicode range are illegal
  123. ($mUcs4 > 0x10FFFF)) {
  124. if ($strict) {
  125. trigger_error(
  126. 'utf8_to_unicode: Illegal sequence or codepoint ' .
  127. 'in UTF-8 at byte ' . $i,
  128. E_USER_WARNING
  129. );
  130. return false;
  131. }
  132. }
  133. if (0xFEFF !== $mUcs4) {
  134. // BOM is legal but we don't want to output it
  135. $out[] = $mUcs4;
  136. }
  137. //initialize UTF8 cache
  138. $mState = 0;
  139. $mUcs4 = 0;
  140. $mBytes = 1;
  141. }
  142. } elseif ($strict) {
  143. /**
  144. *((0xC0 & (*in) != 0x80) && (mState != 0))
  145. * Incomplete multi-octet sequence.
  146. */
  147. trigger_error(
  148. 'utf8_to_unicode: Incomplete multi-octet ' .
  149. ' sequence in UTF-8 at byte ' . $i,
  150. E_USER_WARNING
  151. );
  152. return false;
  153. }
  154. }
  155. }
  156. return $out;
  157. }
  158. /**
  159. * Takes an array of ints representing the Unicode characters and returns
  160. * a UTF-8 string. Astral planes are supported ie. the ints in the
  161. * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  162. * are not allowed.
  163. *
  164. * If $strict is set to true the function returns false if the input
  165. * array contains ints that represent surrogates or are outside the
  166. * Unicode range and raises a PHP error at level E_USER_WARNING
  167. *
  168. * Note: this function has been modified slightly in this library to use
  169. * output buffering to concatenate the UTF-8 string (faster) as well as
  170. * reference the array by it's keys
  171. *
  172. * @param array $arr of unicode code points representing a string
  173. * @param boolean $strict Check for invalid sequences?
  174. * @return string|false UTF-8 string or false if array contains invalid code points
  175. *
  176. * @author <hsivonen@iki.fi>
  177. * @author Harry Fuecks <hfuecks@gmail.com>
  178. * @see utf8_to_unicode
  179. * @link http://hsivonen.iki.fi/php-utf8/
  180. * @link http://sourceforge.net/projects/phputf8/
  181. * @todo use exceptions instead of user errors
  182. */
  183. public static function toUtf8($arr, $strict = false)
  184. {
  185. if (!is_array($arr)) return '';
  186. ob_start();
  187. foreach (array_keys($arr) as $k) {
  188. if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) {
  189. # ASCII range (including control chars)
  190. echo chr($arr[$k]);
  191. } else if ($arr[$k] <= 0x07ff) {
  192. # 2 byte sequence
  193. echo chr(0xc0 | ($arr[$k] >> 6));
  194. echo chr(0x80 | ($arr[$k] & 0x003f));
  195. } else if ($arr[$k] == 0xFEFF) {
  196. # Byte order mark (skip)
  197. // nop -- zap the BOM
  198. } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
  199. # Test for illegal surrogates
  200. // found a surrogate
  201. if ($strict) {
  202. trigger_error(
  203. 'unicode_to_utf8: Illegal surrogate ' .
  204. 'at index: ' . $k . ', value: ' . $arr[$k],
  205. E_USER_WARNING
  206. );
  207. return false;
  208. }
  209. } else if ($arr[$k] <= 0xffff) {
  210. # 3 byte sequence
  211. echo chr(0xe0 | ($arr[$k] >> 12));
  212. echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
  213. echo chr(0x80 | ($arr[$k] & 0x003f));
  214. } else if ($arr[$k] <= 0x10ffff) {
  215. # 4 byte sequence
  216. echo chr(0xf0 | ($arr[$k] >> 18));
  217. echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
  218. echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
  219. echo chr(0x80 | ($arr[$k] & 0x3f));
  220. } elseif ($strict) {
  221. trigger_error(
  222. 'unicode_to_utf8: Codepoint out of Unicode range ' .
  223. 'at index: ' . $k . ', value: ' . $arr[$k],
  224. E_USER_WARNING
  225. );
  226. // out of range
  227. return false;
  228. }
  229. }
  230. return ob_get_clean();
  231. }
  232. }