PhpString.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. <?php
  2. namespace dokuwiki\Utf8;
  /**
   * UTF-8 aware equivalents to PHP's string functions
   *
   * All offsets and lengths accepted or returned by the methods of this class are
   * counted in UTF-8 characters, not bytes. Where the mbstring extension is enabled
   * (signalled by the UTF8_MBSTRING constant, which is defined outside this file) the
   * mb_* functions are used; otherwise PCRE based fallbacks take over.
   */
  class PhpString
  {
      /**
       * A locale independent basename() implementation
       *
       * works around a bug in PHP's basename() implementation
       *
       * @param string $path A path
       * @param string $suffix If the name component ends in suffix this will also be cut off
       * @return string
       * @link https://bugs.php.net/bug.php?id=37738
       *
       * @see basename()
       */
      public static function basename($path, $suffix = '')
      {
          // cast so that null arguments do not raise PHP 8.1 deprecations
          $path = trim((string)$path, '\\/');
          $suffix = (string)$suffix;

          // after trimming, a separator can never sit at position 0, so a falsy
          // result always means "no separator found"
          $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
          if ($rpos) {
              $path = substr($path, $rpos + 1);
          }

          $suflen = strlen($suffix);
          if ($suflen && (substr($path, -$suflen) === $suffix)) {
              $path = substr($path, 0, -$suflen);
          }

          return $path;
      }

      /**
       * Unicode aware replacement for strlen()
       *
       * Uses mbstring or iconv when available. Without either extension the
       * characters are counted by counting all bytes that are not UTF-8
       * continuation bytes (0x80-0xBF), which equals the number of characters
       * for well-formed UTF-8.
       *
       * @param string $string
       * @return int number of UTF-8 characters
       *
       * @author <chernyshevsky at hotmail dot com>
       * @see strlen()
       */
      public static function strlen($string)
      {
          $string = (string)$string;

          if (UTF8_MBSTRING) {
              return mb_strlen($string, 'UTF-8');
          }

          if (function_exists('iconv_strlen')) {
              return iconv_strlen($string, 'UTF-8');
          }

          // every character has exactly one byte that is not a continuation byte;
          // this avoids the deprecated utf8_decode() and a plain byte count
          return (int)preg_match_all('/[^\x80-\xBF]/', $string);
      }

      /**
       * UTF-8 aware alternative to substr
       *
       * Return part of a string given character offset (and optionally length)
       *
       * @param string $str
       * @param int $offset number of UTF-8 characters offset (from left)
       * @param int $length (optional) length in UTF-8 characters from offset
       * @return string the requested part, an empty string if nothing matches
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @author Chris Smith <chris@jalakai.co.uk>
       *
       */
      public static function substr($str, $offset, $length = null)
      {
          // cast parameters to appropriate types to avoid multiple notices/warnings
          $str = (string)$str;
          $offset = (int)$offset;
          if ($length !== null) $length = (int)$length;

          if (UTF8_MBSTRING) {
              // state the encoding explicitly instead of relying on mb_internal_encoding()
              return mb_substr($str, $offset, $length, 'UTF-8');
          }

          /*
           * Notes:
           *
           * no mb string support, so we'll use pcre regex's with 'u' flag
           * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
           * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
           *
           * substr documentation states false can be returned in some cases (e.g. offset > string length)
           * mb_substr never returns false, it will return an empty string instead.
           *
           * calculating the number of characters in the string is a relatively expensive operation, so
           * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
           */

          // handle trivial cases
          if ($length === 0) return '';
          if ($offset < 0 && $length < 0 && $length < $offset) return '';

          $offset_pattern = '^'; // offset == 0; just anchor the pattern
          $length_pattern = '';

          // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
          if ($offset < 0) {
              $strlen = self::strlen($str); // see notes
              $offset = max(0, $strlen + $offset);
          }

          // establish a pattern for offset, a non-captured group equal in length to offset
          if ($offset > 0) {
              $offset_pattern = '^(?:' . self::anyCharsPattern($offset) . ')';
          }

          // establish a pattern for length
          if ($length === null) {
              $length_pattern = '(.*)$'; // the rest of the string
          } else {
              if (!isset($strlen)) $strlen = self::strlen($str); // see notes
              if ($offset > $strlen) return ''; // another trivial case

              if ($length > 0) {
                  // reduce any length that would go past the end of the string
                  $length = min($strlen - $offset, $length);
                  // +ve length requires ... a captured group of length characters
                  $length_pattern = '(' . self::anyCharsPattern($length) . ')';
              } elseif ($length < 0) {
                  if ($length < ($offset - $strlen)) return '';
                  // -ve length requires ... capture everything except a group of -length characters
                  // anchored at the tail-end of the string
                  $length_pattern = '(.*)(?:' . self::anyCharsPattern(-$length) . ')$';
              }
          }

          if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
          return $match[1];
      }

      /**
       * Build a PCRE fragment that matches exactly $count arbitrary characters
       *
       * PCRE quantifiers are limited to 65535, so larger counts are expressed as
       * repeated groups of 65535 characters followed by the remainder.
       *
       * @param int $count number of characters to match (>= 0)
       * @return string pattern fragment without delimiters or capture group
       */
      private static function anyCharsPattern($count)
      {
          $blocks = (int)($count / 65535);
          $remainder = $count % 65535;

          $pattern = $blocks ? '(?:.{65535}){' . $blocks . '}' : '';
          return $pattern . '.{' . $remainder . '}';
      }

      // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
      /**
       * Unicode aware replacement for substr_replace()
       *
       * Unlike the native function, $length defaults to 0 (insert). Negative values
       * for $start and $length are interpreted like the native function does:
       * counted from the end of the string.
       *
       * @param string $string input string
       * @param string $replacement the replacement
       * @param int $start the replacing will begin at the start'th offset into string. If negative,
       *                   it is counted from the end of the string.
       * @param int $length If given and is positive, it represents the length of the portion of string which is
       *                    to be replaced. If negative, it is the number of characters from the end of string
       *                    at which to stop replacing. If length is zero then this function will have the
       *                    effect of inserting replacement into string at the given start offset.
       * @return string
       * @see substr_replace()
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       */
      public static function substr_replace($string, $replacement, $start, $length = 0)
      {
          $start = (int)$start;
          $length = (int)$length;

          if ($start < 0 || $length < 0) {
              // negative arguments are relative to the end, so the length is needed
              $strlen = self::strlen($string);

              if ($start < 0) {
                  $start = max(0, $strlen + $start);
              } elseif ($start > $strlen) {
                  $start = $strlen;
              }

              if ($length < 0) {
                  // never let the tail begin before the replaced section starts
                  $end = max($start, $strlen + $length);
              } else {
                  $end = min($strlen, $start + $length);
              }
          } else {
              // common case: no character counting required
              $end = $start + $length;
          }

          $ret = '';
          if ($start > 0) $ret .= self::substr($string, 0, $start);
          $ret .= $replacement;
          $ret .= self::substr($string, $end);
          return $ret;
      }
      // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps

      /**
       * Unicode aware replacement for ltrim()
       *
       * Every character of $charlist is taken literally; ranges like "a..z" are
       * not supported.
       *
       * @param string $str
       * @param string $charlist characters to strip, native ltrim() defaults are used when empty
       * @return string
       * @see ltrim()
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       */
      public static function ltrim($str, $charlist = '')
      {
          $str = (string)$str;
          if ($charlist === '') return ltrim($str);

          $result = preg_replace('/^' . self::charClass($charlist) . '+/u', '', $str);

          // preg_replace() yields null on PCRE errors (e.g. malformed UTF-8); keep the input then
          return $result === null ? $str : $result;
      }

      /**
       * Unicode aware replacement for rtrim()
       *
       * Every character of $charlist is taken literally; ranges like "a..z" are
       * not supported.
       *
       * @param string $str
       * @param string $charlist characters to strip, native rtrim() defaults are used when empty
       * @return string
       * @see rtrim()
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       */
      public static function rtrim($str, $charlist = '')
      {
          $str = (string)$str;
          if ($charlist === '') return rtrim($str);

          $result = preg_replace('/' . self::charClass($charlist) . '+$/u', '', $str);

          // preg_replace() yields null on PCRE errors (e.g. malformed UTF-8); keep the input then
          return $result === null ? $str : $result;
      }

      /**
       * Turn a list of characters into a PCRE character class
       *
       * preg_quote() escapes all characters that are special inside a class
       * (backslash, dash, brackets, caret) as well as the '/' delimiter.
       *
       * @param string $charlist the characters the class should match
       * @return string the character class including the surrounding brackets
       */
      private static function charClass($charlist)
      {
          return '[' . preg_quote($charlist, '/') . ']';
      }

      /**
       * Unicode aware replacement for trim()
       *
       * @param string $str
       * @param string $charlist characters to strip, native trim() defaults are used when empty
       * @return string
       * @see trim()
       *
       * @author Andreas Gohr <andi@splitbrain.org>
       */
      public static function trim($str, $charlist = '')
      {
          if ($charlist === '') return trim((string)$str);

          $rightTrimmed = self::rtrim($str, $charlist);
          return self::ltrim($rightTrimmed, $charlist);
      }

      /**
       * This is a unicode aware replacement for strtolower()
       *
       * Uses mb_string extension if available. When the intl Normalizer class is
       * loaded, the result is additionally passed through Normalizer::normalize().
       *
       * @param string $string
       * @return string
       * @see \dokuwiki\Utf8\PhpString::strtoupper()
       *
       * @author Leo Feyer <leo@typolight.org>
       * @see strtolower()
       */
      public static function strtolower($string)
      {
          if ($string === null) return ''; // pre-8.1 behaviour

          if (!UTF8_MBSTRING) {
              return strtr($string, Table::upperCaseToLowerCase());
          }

          $lower = mb_strtolower($string, 'utf-8');
          if (class_exists('Normalizer', false)) {
              $normalized = \Normalizer::normalize($lower);
              // normalize() returns false on failure; fall back to the plain lower case string
              if ($normalized !== false) return $normalized;
          }
          return $lower;
      }

      /**
       * This is a unicode aware replacement for strtoupper()
       *
       * Uses mb_string extension if available
       *
       * @param string $string
       * @return string
       * @see \dokuwiki\Utf8\PhpString::strtolower()
       *
       * @author Leo Feyer <leo@typolight.org>
       * @see strtoupper()
       */
      public static function strtoupper($string)
      {
          if ($string === null) return ''; // pre-8.1 behaviour, consistent with strtolower()

          if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');
          return strtr($string, Table::lowerCaseToUpperCase());
      }

      /**
       * UTF-8 aware alternative to ucfirst
       * Make a string's first character uppercase
       *
       * @param string $str
       * @return string with first character as upper case (if applicable)
       * @author Harry Fuecks
       *
       */
      public static function ucfirst($str)
      {
          $str = (string)$str;
          if ($str === '') return '';

          // split into first character and the remainder; this fails for malformed
          // UTF-8, in which case the input is handed back untouched
          if (!preg_match('/^(.)(.*)$/us', $str, $matches)) {
              return $str;
          }

          return self::strtoupper($matches[1]) . $matches[2];
      }

      /**
       * UTF-8 aware alternative to ucwords
       * Uppercase the first character of each word in a string
       *
       * @param string $str
       * @return string with first char of each word uppercase
       * @author Harry Fuecks
       * @see http://php.net/ucwords
       *
       */
      public static function ucwords($str)
      {
          // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
          // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
          // This corresponds to the definition of a "word" defined at http://php.net/ucwords
          $ws = '\x0c\x09\x0b\x0a\x0d\x20';

          // group 1: leading whitespace (or string start), group 2: first character of
          // the word, group 3: remainder of the word
          $pattern = '/(^|[' . $ws . ']+)([^' . $ws . '])([^' . $ws . ']*)/u';

          $result = preg_replace_callback(
              $pattern,
              function ($matches) {
                  // reassemble from the captured parts; no trimming needed, so whitespace
                  // and word characters are treated exactly as the pattern defines them
                  return $matches[1] . self::strtoupper($matches[2]) . $matches[3];
              },
              (string)$str
          );

          // null signals a PCRE error (e.g. malformed UTF-8); keep the input then
          return $result === null ? (string)$str : $result;
      }

      /**
       * This is an Unicode aware replacement for strpos
       *
       * @param string $haystack
       * @param string $needle
       * @param integer $offset number of UTF-8 characters to skip before searching
       * @return integer|false character position of the first match, false if there is none
       * @author Leo Feyer <leo@typolight.org>
       * @see strpos()
       *
       */
      public static function strpos($haystack, $needle, $offset = 0)
      {
          // $comp compensates for the difference between byte and character offsets:
          // the byte based search starts at $offset + $comp
          $comp = 0;
          $length = null;

          while ($length === null || $length < $offset) {
              $pos = strpos($haystack, $needle, $offset + $comp);
              if ($pos === false) {
                  return false;
              }

              // translate the byte position of the match into a character position
              $length = self::strlen(substr($haystack, 0, $pos));

              if ($length < $offset) {
                  // the match lies before the requested character offset: multibyte
                  // characters precede it, so push the byte offset forward and retry
                  $comp = $pos - $length;
              }
          }

          return $length;
      }
  }