#!/usr/bin/env python3
# :Id: $Id: punctuation_chars.py 9037 2022-03-05 23:31:10Z milde $
# :Copyright: © 2011, 2017 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# This file is generated by
# ``docutils/tools/dev/generate_punctuation_chars.py``.
# ::

import sys

"""Docutils character category patterns.

Patterns for the implementation of the `inline markup recognition rules`_
in the reStructuredText parser `docutils.parsers.rst.states.py` based
on Unicode character categories.
The patterns are used inside ``[ ]`` in regular expressions.

Rule (5) requires determination of matching open/close pairs. However, the
pairing of open/close quotes is ambiguous due to different typographic
conventions in different languages. The ``match_chars`` function tests
whether two characters form an open/close pair.

The patterns are generated by
``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence
on the Python version and avoid the time-consuming generation with every
Docutils run. See there for motives and implementation details.

The category of some characters changed with the development of the
Unicode standard. The current lists are generated with the help of the
"unicodedata" module of Python 2.7.13 (based on Unicode version 5.2.0).

.. _inline markup recognition rules:
   https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
"""

  36. openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
  37. u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
  38. u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
  39. u'\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28'
  40. u'\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d'
  41. u'\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41'
  42. u'\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
  43. u'\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20'
  44. u'\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d'
  45. u'\u2e1d\u2e21\u201b\u201f')
  46. closers = (u'"\')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769'
  47. u'\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb'
  48. u'\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992'
  49. u'\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29'
  50. u'\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e'
  51. u'\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42'
  52. u'\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63'
  53. u'\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21'
  54. u'\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c'
  55. u'\u2e1c\u2e20\u201a\u201e')
  56. delimiters = (u'\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589'
  57. u'\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c'
  58. u'\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d'
  59. u'\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f'
  60. u'\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f'
  61. u'\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735'
  62. u'\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945'
  63. u'\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-'
  64. u'\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-'
  65. u'\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-'
  66. u'\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00'
  67. u'\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-'
  68. u'\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0'
  69. u'\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7'
  70. u'\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f'
  71. u'\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb'
  72. u'\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c'
  73. u'\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a'
  74. u'\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a'
  75. u'\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65')
  76. if sys.maxunicode >= 0x10FFFF: # "wide" build
  77. delimiters += (u'\U00010100\U00010101\U0001039f\U000103d0\U00010857'
  78. u'\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f'
  79. u'\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-'
  80. u'\U000110c1\U00012470-\U00012473')
  81. closing_delimiters = u'\\\\.,;!?'
  82. # Matching open/close quotes
  83. # --------------------------
  84. quote_pairs = {
  85. # open char: matching closing characters # usage example
  86. u'\xbb': u'\xbb', # » » Swedish
  87. u'\u2018': u'\u201a', # ‘ ‚ Albanian/Greek/Turkish
  88. u'\u2019': u'\u2019', # ’ ’ Swedish
  89. u'\u201a': u'\u2018\u2019', # ‚ ‘ German ‚ ’ Polish
  90. u'\u201c': u'\u201e', # “ „ Albanian/Greek/Turkish
  91. u'\u201e': u'\u201c\u201d', # „ “ German „ ” Polish
  92. u'\u201d': u'\u201d', # ” ” Swedish
  93. u'\u203a': u'\u203a', # › › Swedish
  94. }
  95. """Additional open/close quote pairs."""
  96. def match_chars(c1, c2):
  97. """Test whether `c1` and `c2` are a matching open/close character pair.
  98. Matching open/close pairs are at the same position in
  99. `punctuation_chars.openers` and `punctuation_chars.closers`.
  100. The pairing of open/close quotes is ambiguous due to different
  101. typographic conventions in different languages,
  102. so we test for additional matches stored in `quote_pairs`.
  103. """
  104. try:
  105. i = openers.index(c1)
  106. except ValueError: # c1 not in openers
  107. return False
  108. return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')