Clean.php 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. <?php
  2. namespace dokuwiki\Utf8;
  /**
   * Methods to assess and clean UTF-8 strings
   *
   * All methods are static and work on plain PHP (byte) strings.
   */
  class Clean
  {
      /**
       * Checks if a string contains 7bit ASCII only
       *
       * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
       *
       * @param string $str
       * @return bool true when no byte outside 0x00-0x7F was found
       */
      public static function isASCII($str)
      {
          return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
      }

      /**
       * Tries to detect if a string is in Unicode encoding
       *
       * This is a structural check only: every lead byte must be followed by
       * the number of continuation bytes (10bbbbbb) it announces. Lead bytes
       * announcing 5 and 6 byte sequences are accepted, and no checks for
       * overlong forms or surrogates are made.
       *
       * @author <bmorel@ssi.fr>
       * @link http://php.net/manual/en/function.utf8-encode.php
       *
       * @param string $str
       * @return bool
       */
      public static function isUtf8($str)
      {
          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $b = ord($str[$i]);
              if ($b < 0x80) {
                  continue; // 0bbbbbbb
              } elseif (($b & 0xE0) === 0xC0) {
                  $n = 1; // 110bbbbb
              } elseif (($b & 0xF0) === 0xE0) {
                  $n = 2; // 1110bbbb
              } elseif (($b & 0xF8) === 0xF0) {
                  $n = 3; // 11110bbb
              } elseif (($b & 0xFC) === 0xF8) {
                  $n = 4; // 111110bb
              } elseif (($b & 0xFE) === 0xFC) {
                  $n = 5; // 1111110b
              } else {
                  return false; // does not match any model
              }

              // n bytes matching 10bbbbbb have to follow
              for ($j = 0; $j < $n; $j++) {
                  if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                      return false;
                  }
              }
          }
          return true;
      }

      /**
       * Strips all high byte chars
       *
       * Returns a pure ASCII7 string: every byte >= 0x80 is dropped, all other
       * bytes are kept in their original order.
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       *
       * @param string $str
       * @return string
       */
      public static function strip($str)
      {
          // same character class as isASCII(); no /u modifier, so this works bytewise
          return (string) preg_replace('/[^\x00-\x7F]/', '', $str);
      }

      /**
       * Removes special characters (nonalphanumeric) from a UTF-8 string
       *
       * This function adds the controlchars 0x00 to 0x19 to the array of
       * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
       *
       * The pattern uses the /u modifier, so preg_replace() yields null instead
       * of a string when PCRE reports an error (e.g. for input that is not
       * valid UTF-8).
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       *
       * @param string $string The UTF8 string to strip of special chars
       * @param string $repl Replace special with this string
       * @param string $additional Additional chars to strip (used in regexp char class,
       *                           inserted as is - the caller is responsible for escaping)
       * @return string|null
       */
      public static function stripspecials($string, $repl = '', $additional = '')
      {
          // quoting the table is only needed once per request
          static $specials = null;
          if ($specials === null) {
              $specials = preg_quote(Table::specialChars(), '/');
          }

          return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
      }

      /**
       * Replace bad bytes with an alternative character
       *
       * ASCII character is recommended for replacement char
       *
       * PCRE Pattern to locate bad bytes in a UTF-8 string
       * Comes from W3 FAQ: Multilingual Forms
       * Note: modified to include full ASCII range including control chars
       *
       * The string is processed in a single pass: well formed sequences are
       * copied unchanged, every byte that is not part of one is replaced on
       * its own.
       *
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @see http://www.w3.org/International/questions/qa-forms-utf-8
       *
       * @param string $str to search
       * @param string $replace to replace bad bytes with (defaults to '', which removes them) - use ASCII
       * @return string the cleaned string; empty if PCRE fails internally
       */
      public static function replaceBadBytes($str, $replace = '')
      {
          // The alternatives are tried at each position in turn. Only when none of
          // the well formed sequences matches does the final group capture a single
          // (bad) byte. The s modifier makes sure '.' really matches any byte.
          $pattern = '/(?:' .
              '[\x00-\x7F]' .                        // ASCII (including control chars)
              '|[\xC2-\xDF][\x80-\xBF]' .            // non-overlong 2-byte
              '|\xE0[\xA0-\xBF][\x80-\xBF]' .        // excluding overlongs
              '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . // straight 3-byte
              '|\xED[\x80-\x9F][\x80-\xBF]' .        // excluding surrogates
              '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .     // planes 1-3
              '|[\xF1-\xF3][\x80-\xBF]{3}' .         // planes 4-15
              '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .     // plane 16
              ')|(.)/s';                             // invalid byte

          $clean = preg_replace_callback(
              $pattern,
              function ($match) use ($replace) {
                  // group 1 is only present when the invalid byte branch matched
                  return isset($match[1]) ? $replace : $match[0];
              },
              $str
          );

          // null signals an internal PCRE error; never hand back uncleaned data
          return $clean === null ? '' : $clean;
      }

      /**
       * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
       *
       * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
       * letters. Default is to deaccent both cases ($case = 0)
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       *
       * @param string $string
       * @param int $case
       * @return string
       */
      public static function deaccent($string, $case = 0)
      {
          if ($case <= 0) {
              $string = strtr($string, Table::lowerAccents());
          }
          if ($case >= 0) {
              $string = strtr($string, Table::upperAccents());
          }
          return $string;
      }

      /**
       * Romanize a non-latin string
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       *
       * @param string $string
       * @return string
       */
      public static function romanize($string)
      {
          if (self::isASCII($string)) {
              return $string; // nothing to do
          }
          return strtr($string, Table::romanization());
      }

      /**
       * adjust a byte index into a utf8 string to a utf8 character boundary
       *
       * Indexes at or below 0 yield 0, indexes at or beyond the end of the
       * string yield the string's byte length.
       *
       * @author chris smith <chris@jalakai.co.uk>
       *
       * @param string $str utf8 character string
       * @param int $i byte index into $str
       * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
       * @return int byte index into $str now pointing to a utf8 character boundary
       */
      public static function correctIdx($str, $i, $next = false)
      {
          if ($i <= 0) {
              return 0;
          }

          $limit = strlen($str);
          if ($i >= $limit) {
              return $limit;
          }

          if ($next) {
              // skip forward over continuation bytes (10bbbbbb)
              while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                  $i++;
              }
          } else {
              // walk back to the byte that starts the current character
              while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                  $i--;
              }
          }

          return $i;
      }
  }