Asian.php 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. <?php
  2. namespace dokuwiki\Utf8;
  /**
   * Methods and constants to handle Asian "words"
   *
   * Some Asian scripts are written without spaces between words, and a single
   * Unicode character can stand for a whole idea. This class uses a crude regular
   * expression to decide which parts of a string should be treated as words.
   */
  class Asian
  {
      /**
       * A non-capturing group for use in regular expressions that matches any Asian
       * character which needs to be treated as a word. The Unicode ranges are taken from
       * http://en.wikipedia.org/wiki/Unicode_block
       *
       * The group has three alternatives: a Thai character, a character from the
       * listed CJK/Hangul ranges, or a Hiragana/Katakana character from the first kana
       * class optionally followed by one character from the second kana class.
       *
       * The single-quoted fragments contain PCRE \x{...} escapes, the double-quoted
       * fragments are PHP escapes that expand to raw UTF-8 bytes. Patterns built from
       * this constant are meant to be used with the /u modifier, as all methods of
       * this class do.
       */
      const REGEXP =
          '(?:' .
              '[\x{0E00}-\x{0E7F}]' . // Thai
          '|' .
              '[' .
                  '\x{2E80}-\x{3040}' . // CJK -> Hangul
                  '\x{309D}-\x{30A0}' .
                  '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
                  '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs
                  '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms
                  "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
                  "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
                  "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
                  "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
              ']' .
          '|' .
              '[' . // Hiragana/Katakana (can be two characters)
                  '\x{3042}\x{3044}\x{3046}\x{3048}' .
                  '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
                  '\x{3084}\x{3086}\x{3088}-\x{308D}' .
                  '\x{308F}-\x{3094}' .
                  '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
                  '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
                  '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
                  '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
              '][' . // optional second character of a kana "word"
                  '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
                  '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
                  '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
                  '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
                  '\x{31F0}-\x{31FF}' .
              ']?' .
          ')';

      /**
       * Check if the given term contains Asian word characters
       *
       * A term that cannot be matched at all (preg_match() reports an error, e.g.
       * for invalid UTF-8) is reported as not containing Asian words.
       *
       * @param string $term
       * @return bool
       */
      public static function isAsianWords($term)
      {
          return (bool)preg_match('/' . self::REGEXP . '/u', $term);
      }

      /**
       * Surround all Asian words in the given text with the given separator
       *
       * The separator is inserted literally: backslashes and dollar signs in it are
       * not interpreted as backreferences.
       *
       * @param string $text Original text containing asian words
       * @param string $sep the separator to use
       * @return string Text with separated asian words; the unchanged text if the regexp fails
       */
      public static function separateAsianWords($text, $sep = ' ')
      {
          // handle asian chars as single words (may fail on older PHP version, hence
          // the deliberate best-effort suppression). A callback is used instead of a
          // replacement string so that "\" and "$" inside $sep are never parsed as
          // replacement references.
          $asia = @preg_replace_callback(
              '/' . self::REGEXP . '/u',
              static function ($match) use ($sep) {
                  return $sep . $match[0] . $sep;
              },
              $text
          );

          // recover from regexp failure: keep the original text
          if ($asia === null) {
              return $text;
          }
          return $asia;
      }

      /**
       * Split the given text into separate parts
       *
       * Each part is either a non-asian string, or a run of consecutive asian words.
       * Empty parts are dropped.
       *
       * @param string $term
       * @return string[] the parts in original order; if the regexp fails the term is
       *                  returned as a single part (or no part when it is empty)
       */
      public static function splitAsianWords($term)
      {
          $parts = preg_split(
              '/(' . self::REGEXP . '+)/u',
              $term,
              -1,
              PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
          );

          // preg_split() returns false on failure (e.g. invalid UTF-8 with /u);
          // callers iterate the result, so always hand back an array.
          if ($parts === false) {
              return ((string)$term === '') ? [] : [$term];
          }
          return $parts;
      }
  }