12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991 |
- #!/usr/bin/python3
- # :Id: $Id: smartquotes.py 9068 2022-06-13 12:05:08Z milde $
- # :Copyright: © 2010 Günter Milde,
- # original `SmartyPants`_: © 2003 John Gruber
- # smartypants.py: © 2004, 2007 Chad Miller
- # :Maintainer: docutils-develop@lists.sourceforge.net
- # :License: Released under the terms of the `2-Clause BSD license`_, in short:
- #
- # Copying and distribution of this file, with or without modification,
- # are permitted in any medium without royalty provided the copyright
- # notices and this notice are preserved.
- # This file is offered as-is, without any warranty.
- #
- # .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
- r"""
- =========================
- Smart Quotes for Docutils
- =========================
- Synopsis
- ========
- "SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and
- BBEdit that easily translates plain ASCII punctuation characters into "smart"
- typographic punctuation characters.
- ``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_.
- * Using Unicode instead of HTML entities for typographic punctuation
- characters, it works for any output format that supports Unicode.
- * Supports `language specific quote characters`__.
- __ https://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
- Authors
- =======
- `John Gruber`_ did all of the hard work of writing this software in Perl for
- `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_
- ported it to Python to use with Pyblosxom_.
- Adapted to Docutils_ by Günter Milde.
- Additional Credits
- ==================
- Portions of the SmartyPants original work are based on Brad Choate's nifty
- MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to
- this plug-in. Brad Choate is a fine hacker indeed.
- `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
- testing of the original SmartyPants.
- `Rael Dornfest`_ ported SmartyPants to Blosxom.
- .. _Brad Choate: http://bradchoate.com/
- .. _Jeremy Hedley: http://antipixel.com/
- .. _Charles Wiltgen: http://playbacktime.com/
- .. _Rael Dornfest: http://raelity.org/
- Copyright and License
- =====================
- SmartyPants_ license (3-Clause BSD license):
- Copyright (c) 2003 John Gruber (http://daringfireball.net/)
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- * Neither the name "SmartyPants" nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
- This software is provided by the copyright holders and contributors
- "as is" and any express or implied warranties, including, but not
- limited to, the implied warranties of merchantability and fitness for
- a particular purpose are disclaimed. In no event shall the copyright
- owner or contributors be liable for any direct, indirect, incidental,
- special, exemplary, or consequential damages (including, but not
- limited to, procurement of substitute goods or services; loss of use,
- data, or profits; or business interruption) however caused and on any
- theory of liability, whether in contract, strict liability, or tort
- (including negligence or otherwise) arising in any way out of the use
- of this software, even if advised of the possibility of such damage.
- smartypants.py license (2-Clause BSD license):
- smartypants.py is a derivative work of SmartyPants.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- This software is provided by the copyright holders and contributors
- "as is" and any express or implied warranties, including, but not
- limited to, the implied warranties of merchantability and fitness for
- a particular purpose are disclaimed. In no event shall the copyright
- owner or contributors be liable for any direct, indirect, incidental,
- special, exemplary, or consequential damages (including, but not
- limited to, procurement of substitute goods or services; loss of use,
- data, or profits; or business interruption) however caused and on any
- theory of liability, whether in contract, strict liability, or tort
- (including negligence or otherwise) arising in any way out of the use
- of this software, even if advised of the possibility of such damage.
- .. _John Gruber: http://daringfireball.net/
- .. _Chad Miller: http://web.chad.org/
- .. _Pyblosxom: http://pyblosxom.bluesock.org/
- .. _SmartyPants: http://daringfireball.net/projects/smartypants/
- .. _Movable Type: http://www.movabletype.org/
- .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
- .. _Docutils: https://docutils.sourceforge.io/
- Description
- ===========
- SmartyPants can perform the following transformations:
- - Straight quotes ( " and ' ) into "curly" quote characters
- - Backticks-style quotes (\`\`like this'') into "curly" quote characters
- - Dashes (``--`` and ``---``) into en- and em-dash entities
- - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
- This means you can write, edit, and save your posts using plain old
- ASCII straight quotes, plain dashes, and plain dots, but your published
- posts (and final HTML output) will appear with smart quotes, em-dashes,
- and proper ellipses.
- SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
- ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
- display text where smart quotes and other "smart punctuation" would not be
- appropriate, such as source code or example markup.
- Backslash Escapes
- =================
- If you need to use literal straight quotes (or plain hyphens and periods),
- `smartquotes` accepts the following backslash escape sequences to force
- ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it,
- too.
- ======== =========
- Escape Character
- ======== =========
- ``\\`` \\
- ``\\"`` \\"
- ``\\'`` \\'
- ``\\.`` \\.
- ``\\-`` \\-
- ``\\``` \\`
- ======== =========
- This is useful, for example, when you want to use straight quotes as
- foot and inch marks: 6\\'2\\" tall; a 17\\" iMac.
- Caveats
- =======
- Why You Might Not Want to Use Smart Quotes in Your Weblog
- ---------------------------------------------------------
- For one thing, you might not care.
- Most normal, mentally stable individuals do not take notice of proper
- typographic punctuation. Many design and typography nerds, however, break
- out in a nasty rash when they encounter, say, a restaurant sign that uses
- a straight apostrophe to spell "Joe's".
- If you're the sort of person who just doesn't care, you might well want to
- continue not caring. Using straight quotes -- and sticking to the 7-bit
- ASCII character set in general -- is certainly a simpler way to live.
- Even if you *do* care about accurate typography, you still might want to
- think twice before educating the quote characters in your weblog. One side
- effect of publishing curly quote characters is that it makes your
- weblog a bit harder for others to quote from using copy-and-paste. What
- happens is that when someone copies text from your blog, the copied text
- contains the 8-bit curly quote characters (as well as the 8-bit characters
- for em-dashes and ellipses, if you use these options). These characters
- are not standard across different text encoding methods, which is why they
- need to be encoded as characters.
- People copying text from your weblog, however, may not notice that you're
- using curly quotes, and they'll go ahead and paste the unencoded 8-bit
- characters copied from their browser into an email message or their own
- weblog. When pasted as raw "smart quotes", these characters are likely to
- get mangled beyond recognition.
- That said, my own opinion is that any decent text editor or email client
- makes it easy to stupefy smart quote characters into their 7-bit
- equivalents, and I don't consider it my problem if you're using an
- indecent text editor or email client.
- Algorithmic Shortcomings
- ------------------------
- One situation in which quotes will get curled the wrong way is when
- apostrophes are used at the start of leading contractions. For example::
- 'Twas the night before Christmas.
- In the case above, SmartyPants will turn the apostrophe into an opening
- secondary quote, when in fact it should be the `RIGHT SINGLE QUOTATION MARK`
- character which is also "the preferred character to use for apostrophe"
- (Unicode). I don't think this problem can be solved in the general case --
- every word processor I've tried gets this wrong as well. In such cases, it's
- best to insert the `RIGHT SINGLE QUOTATION MARK` (’) by hand.
- In English, the same character is used for apostrophe and closing secondary
- quote (both plain and "smart" ones). For other locales (French, Italian,
- Swiss, ...) "smart" secondary closing quotes differ from the curly apostrophe.
- .. class:: language-fr
- Il dit : "C'est 'super' !"
- If the apostrophe is used at the end of a word, it cannot be distinguished
- from a secondary quote by the algorithm. Therefore, a text like::
- .. class:: language-de-CH
- "Er sagt: 'Ich fass' es nicht.'"
- will get a single closing guillemet instead of an apostrophe.
- This can be prevented by use of the `RIGHT SINGLE QUOTATION MARK` in
- the source::
- - "Er sagt: 'Ich fass' es nicht.'"
- + "Er sagt: 'Ich fass’ es nicht.'"
- Version History
- ===============
- 1.9 2022-03-04
- - Code cleanup. Require Python 3.
- 1.8.1 2017-10-25
- - Use open quote after Unicode whitespace, ZWSP, and ZWNJ.
- - Code cleanup.
- 1.8: 2017-04-24
- - Command line front-end.
- 1.7.1: 2017-03-19
- - Update and extend language-dependent quotes.
- - Differentiate apostrophe from single quote.
- 1.7: 2012-11-19
- - Internationalization: language-dependent quotes.
- 1.6.1: 2012-11-06
- - Refactor code, code cleanup,
- - `educate_tokens()` generator as interface for Docutils.
- 1.6: 2010-08-26
- - Adaption to Docutils:
- - Use Unicode instead of HTML entities,
- - Remove code special to pyblosxom.
- 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
- - Fixed bug where blocks of precious unalterable text was instead
- interpreted. Thanks to Le Roux and Dirk van Oosterbosch.
- 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
- - Fix bogus magical quotation when there is no hint that the
- user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
- - Be smarter about quotes before terminating numbers in an en-dash'ed
- range.
- 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
- - Fix a date-processing bug, as reported by jacob childress.
- - Begin a test-suite for ensuring correct output.
- - Removed import of "string", since I didn't really need it.
- (This was my first ever Python program. Sue me!)
- 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
- - Abort processing if the flavour is in forbidden-list. Default of
- [ "rss" ] (Idea of Wolfgang SCHNERRING.)
- - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING.
- 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
- - Some single quotes weren't replaced properly. Diff-tesuji played
- by Benjamin GEIGER.
- 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
- - Support upcoming pyblosxom 0.9 plugin verification feature.
- 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
- - Initial release
- """
- import re
- import sys
# reStructuredText description of the `attr` values understood by
# `educate_tokens()` (kept separate from the module docstring).
options = r"""
Options
=======

Numeric values are the easiest way to configure SmartyPants' behavior:

:0:     Suppress all transformations. (Do nothing.)

:1:     Performs default SmartyPants transformations: quotes (including
        \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
        is used to signify an em-dash; there is no support for en-dashes

:2:     Same as smarty_pants="1", except that it uses the old-school typewriter
        shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
        (dash dash dash)
        for em-dashes.

:3:     Same as smarty_pants="2", but inverts the shorthand for dashes:
        "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
        en-dashes.

:-1:    Stupefy mode. Reverses the SmartyPants transformation process, turning
        the characters produced by SmartyPants into their ASCII equivalents.
        E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple
        double-quote (\"), "—" is turned into two dashes, etc.


The following single-character attribute values can be combined to toggle
individual transformations from within the smarty_pants attribute. For
example, ``"1"`` is equivalent to ``"qBde"``.

:q:     Educates normal quote characters: (") and (').

:b:     Educates \`\`backticks'' -style double quotes.

:B:     Educates \`\`backticks'' -style double quotes and \`single' quotes.

:d:     Educates em-dashes.

:D:     Educates em-dashes and en-dashes, using old-school typewriter
        shorthand: (dash dash) for en-dashes, (dash dash dash) for em-dashes.

:i:     Educates em-dashes and en-dashes, using inverted old-school typewriter
        shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.

:e:     Educates ellipses.

:w:     Translates any instance of ``&quot;`` into a normal double-quote
        character. This should be of no interest to most people, but
        of particular interest to anyone who writes their posts using
        Dreamweaver, as Dreamweaver inexplicably uses this entity to represent
        a literal double-quote character. SmartyPants only educates normal
        quotes, not entities (because ordinarily, entities are used for
        the explicit purpose of representing the specific character they
        represent). The "w" option must be used in conjunction with one (or
        both) of the other quote options ("q" or "b"). Thus, if you wish to
        apply all SmartyPants transformations (quotes, en- and em-dashes, and
        ellipses) and also translate ``&quot;`` entities into regular quotes
        so SmartyPants can educate them, you should pass the following to the
        smarty_pants attribute: ``"qDew"``
"""
class smartchars:
    """Smart quotes and dashes.

    The dash, ellipsis and apostrophe characters are class attributes and
    the same for every language.  The four quote characters are instance
    attributes selected by the `language` passed to the constructor:

    * ``opquote`` / ``cpquote``: opening / closing primary quote,
    * ``osquote`` / ``csquote``: opening / closing secondary quote.
    """

    endash = '–'      # U+2013 EN DASH
    emdash = '—'      # U+2014 EM DASH
    ellipsis = '…'    # U+2026 HORIZONTAL ELLIPSIS
    apostrophe = '’'  # U+2019 RIGHT SINGLE QUOTATION MARK

    # quote characters (language-specific, set in __init__())
    # https://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    # http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    # https://fr.wikipedia.org/wiki/Guillemet
    # http://typographisme.net/post/Les-espaces-typographiques-et-le-web
    # http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html
    # https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks
    # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf
    # [8] http://www.korrekturavdelingen.no/anforselstegn.htm
    # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530.
    # [10] http://www.typografi.org/sitat/sitatart.html
    #
    # See also configuration option "smartquote-locales".
    #
    # Each value unpacks into exactly four items
    # (opquote, cpquote, osquote, csquote): either a 4-character string or
    # a 4-tuple when a quote consists of more than one character.
    # No-break spaces are written as escapes (\u00A0 NO-BREAK SPACE,
    # \u202F NARROW NO-BREAK SPACE) because the raw characters cannot be
    # told apart from ordinary spaces and are easily lost in editing.
    quotes = {
        'af':           '“”‘’',
        'af-x-altquot': '„”‚’',
        'bg':           '„“‚‘',  # https://bg.wikipedia.org/wiki/Кавички
        'ca':           '«»“”',
        'ca-x-altquot': '“”‘’',
        'cs':           '„“‚‘',
        'cs-x-altquot': '»«›‹',
        'da':           '»«›‹',
        'da-x-altquot': '„“‚‘',
        # 'da-x-altquot2': '””’’',
        'de':           '„“‚‘',
        'de-x-altquot': '»«›‹',
        'de-ch':        '«»‹›',
        'el':           '«»“”',
        'en':           '“”‘’',
        'en-uk-x-altquot': '‘’“”',  # Attention: " → ‘ and ' → “ !
        'eo':           '“”‘’',
        'es':           '«»“”',
        'es-x-altquot': '“”‘’',
        'et':           '„“‚‘',  # no secondary quote listed in
        'et-x-altquot': '«»‹›',  # the sources above (wikipedia.org)
        'eu':           '«»‹›',
        'fi':           '””’’',
        'fi-x-altquot': '»»››',
        'fr':           ('«\u00A0', '\u00A0»', '“', '”'),  # full no-break space
        'fr-x-altquot': ('«\u202F', '\u202F»', '“', '”'),  # narrow no-break space
        'fr-ch':        '«»‹›',  # http://typoguide.ch/
        'fr-ch-x-altquot': ('«\u202F', '\u202F»', '‹\u202F', '\u202F›'),  # narrow no-break space  # noqa:E501
        'gl':           '«»“”',
        'he':           '”“»«',  # Hebrew is RTL, test position:
        'he-x-altquot': '„”‚’',  # low quotation marks are opening.
        # 'he-x-altquot': '“„‘‚',  # RTL: low quotation marks opening
        'hr':           '„”‘’',  # http://hrvatska-tipografija.com/polunavodnici/  # noqa:E501
        'hr-x-altquot': '»«›‹',
        'hsb':          '„“‚‘',
        'hsb-x-altquot': '»«›‹',
        'hu':           '„”«»',
        'is':           '„“‚‘',
        'it':           '«»“”',
        'it-ch':        '«»‹›',
        'it-x-altquot': '“”‘’',
        # 'it-x-altquot2': '“„‘‚',  # [7] in headlines
        'ja':           '「」『』',
        'ko':           '“”‘’',
        'lt':           '„“‚‘',
        'lv':           '„“‚‘',
        'mk':           '„“‚‘',  # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик  # noqa:E501
        'nl':           '“”‘’',
        'nl-x-altquot': '„”‚’',
        # 'nl-x-altquot2': '””’’',
        'nb':           '«»’’',  # Norsk bokmål (canonical form 'no')
        'nn':           '«»’’',  # Nynorsk [10]
        'nn-x-altquot': '«»‘’',  # [8], [10]
        # 'nn-x-altquot2': '«»«»',  # [9], [10]
        # 'nn-x-altquot3': '„“‚‘',  # [10]
        'no':           '«»’’',  # Norsk bokmål [10]
        'no-x-altquot': '«»‘’',  # [8], [10]
        # 'no-x-altquot2': '«»«»',  # [9], [10]
        # 'no-x-altquot3': '„“‚‘',  # [10]
        'pl':           '„”«»',
        'pl-x-altquot': '«»‚’',
        # 'pl-x-altquot2': '„”‚’',  # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w  # noqa:E501
        'pt':           '«»“”',
        'pt-br':        '“”‘’',
        'ro':           '„”«»',
        'ru':           '«»„“',
        'sh':           '„”‚’',  # Serbo-Croatian
        'sh-x-altquot': '»«›‹',
        'sk':           '„“‚‘',  # Slovak
        'sk-x-altquot': '»«›‹',
        'sl':           '„“‚‘',  # Slovenian
        'sl-x-altquot': '»«›‹',
        'sq':           '«»‹›',  # Albanian
        'sq-x-altquot': '“„‘‚',
        'sr':           '„”’’',
        'sr-x-altquot': '»«›‹',
        'sv':           '””’’',
        'sv-x-altquot': '»»››',
        'tr':           '“”‘’',
        'tr-x-altquot': '«»‹›',
        # 'tr-x-altquot2': '“„‘‚',  # [7] antiquated?
        'uk':           '«»„“',
        'uk-x-altquot': '„“‚‘',
        'zh-cn':        '“”‘’',
        'zh-tw':        '「」『』',
        }

    def __init__(self, language='en'):
        """Select the quote characters for `language`.

        `language` is looked up case-insensitively in `quotes`.
        If there is no entry for it, plain ASCII quotes (" and ') are used
        for the primary and secondary quotes.
        """
        self.language = language
        try:
            (self.opquote, self.cpquote,
             self.osquote, self.csquote) = self.quotes[language.lower()]
        except KeyError:
            # unknown language: fall back to "dumb" ASCII quotes
            self.opquote, self.cpquote, self.osquote, self.csquote = '""\'\''
# Attribute string used when the caller does not pass `attr`.
default_smartypants_attr = '1'


def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Split `text` with `tokenize()`, pass the tokens through
    `educate_tokens()` with the given `attr` and `language`,
    and return the results joined into one string.
    """
    educated = educate_tokens(tokenize(text), attr, language)
    return "".join(educated)
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    Parameter:  - text_tokens: iterable of ``(ttype, text)`` pairs.
                  Tokens of type 'tag' and tokens with empty text are
                  passed on unchanged and ignored as quote context,
                  tokens of type 'literal' are passed on unchanged but
                  provide context for the following token,
                  all other tokens are processed.
                - attr: string selecting the transformations (see below).
                - language: language tag passed on to the quote functions.
    Yields:     One string for every item of `text_tokens`.

    Values of `attr`::

        0 : do nothing
        1 : set all
        2 : set all, using old school en- and em- dash shortcuts
        3 : set all, using inverted old school en and em- dash shortcuts
       -1 : "stupefy" (convert smart characters back to ASCII)

        q : quotes
        b : backtick quotes (``double'' only)
        B : backtick quotes (``double'' and `single')
        d : dashes
        D : old school dashes
        i : inverted old school dashes
        e : ellipses
        w : convert &quot; entities to " for Dreamweaver users
    """
    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = False     # False, or 1 (default), 2 (old school), 3 (inverted)
    do_backticks = False  # False, True (double only), or 2 (double + single)
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    # if attr == "0": # pass tokens unchanged (see below).
    if attr in ('1', '2', '3'):
        # Do everything; the number selects the dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = int(attr)
        do_ellipses = True
    elif attr == '-1':  # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Combination of single-character flags.  Later tests override
        # earlier ones ('B' wins over 'b'; 'i' over 'D' over 'd').
        if 'q' in attr:
            do_quotes = True
        if 'b' in attr:
            do_backticks = True
        if 'B' in attr:
            do_backticks = 2
        if 'd' in attr:
            do_dashes = 1
        if 'D' in attr:
            do_dashes = 2
        if 'i' in attr:
            do_dashes = 3
        if 'e' in attr:
            do_ellipses = True
        if 'w' in attr:
            convert_quot = True

    prev_token_last_char = ' '
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        # Hide backslash-escaped characters from the transformations.
        text = processEscapes(text)

        if convert_quot:
            # The search string must be the entity reference; replacing
            # '"' with itself would silently disable the "w" option.
            text = text.replace('&quot;', '"')

        if do_dashes == 1:
            text = educateDashes(text)
        elif do_dashes == 2:
            text = educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = educateBackticks(text, language)

        if do_backticks == 2:
            text = educateSingleBackticks(text, language)

        if do_quotes:
            # Replace plain quotes in context to prevent conversion to
            # 2-character sequence in French.
            context = prev_token_last_char.replace('"', ';').replace("'", ';')
            # Prepend the one-character context and strip it again.
            text = educateQuotes(context+text, language)[1:]

        if do_stupefy:
            text = stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        # Put the escaped characters back as plain ASCII.
        text = processEscapes(text, restore=True)

        yield text
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string.
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    The quote characters are taken from ``smartchars(language)``.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """
    smart = smartchars(language)
    ch_classes = {'open': '[([{]',  # opening braces
                  'close': r'[^\s]',  # everything except whitespace
                  'punct': r"""[-!" #\$\%'()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]""",
                  'dash': '[-–—]'  # hyphen and em/en dashes
                          r'|&[mn]dash;|&\#8211;|&\#8212;|&\#x201[34];',
                  # Whitespace, ZWSP, ZWNJ, or a no-break space entity.
                  # The second alternative must be the entity text:
                  # a bare space would be dropped by re.VERBOSE below and
                  # leave an empty alternative that matches everywhere.
                  'sep': '[\\s\u200B\u200C]|&nbsp;',
                  }

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break. Use closing quotes.
    # TODO: example (when does this match?)
    text = re.sub(r"^'(?=%s\\B)" % ch_classes['punct'], smart.csquote, text)
    text = re.sub(r'^"(?=%s\\B)' % ch_classes['punct'], smart.cpquote, text)

    # Special case for adjacent quotes
    # like "'Quoted' words in a larger quote."
    text = re.sub('"\'(?=\\w)', smart.opquote+smart.osquote, text)
    text = re.sub('\'"(?=\\w)', smart.osquote+smart.opquote, text)

    # Special case: "opening character" followed by quote,
    # optional punctuation and space like "[", '(', or '-'.
    text = re.sub(r"(%(open)s|%(dash)s)'(?=%(punct)s? )" % ch_classes,
                  r'\1%s' % smart.csquote, text)
    text = re.sub(r'(%(open)s|%(dash)s)"(?=%(punct)s? )' % ch_classes,
                  r'\1%s' % smart.cpquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text)

    # Get most opening secondary quotes:
    opening_secondary_quotes_regex = re.compile("""
                    (# ?<=  # look behind fails: requires fixed-width pattern
                            %(sep)s     | # a whitespace char, or
                            %(open)s    | # opening brace, or
                            %(dash)s      # em/en-dash
                    )
                    '                 # the quote
                    (?=\\w|%(punct)s) # word character or punctuation
                    """ % ch_classes, re.VERBOSE)

    text = opening_secondary_quotes_regex.sub(r'\1'+smart.osquote, text)

    # In many locales, secondary closing quotes are different from apostrophe:
    if smart.csquote != smart.apostrophe:
        apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)")
        text = apostrophe_regex.sub(smart.apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Every remaining single quote that does not follow whitespace closes:
    closing_secondary_quotes_regex = re.compile(r"(?<!\s)'")
    text = closing_secondary_quotes_regex.sub(smart.csquote, text)

    # Any remaining secondary quotes should be opening ones:
    text = text.replace(r"'", smart.osquote)

    # Get most opening primary quotes:
    opening_primary_quotes_regex = re.compile("""
                    (
                            %(sep)s     | # a whitespace char, or
                            %(open)s    | # opening brace, or
                            %(dash)s      # em/en-dash
                    )
                    "                 # the quote, followed by
                    (?=\\w|%(punct)s) # a word character or punctuation
                    """ % ch_classes, re.VERBOSE)

    text = opening_primary_quotes_regex.sub(r'\1'+smart.opquote, text)

    # primary closing quotes:
    closing_primary_quotes_regex = re.compile(r"""
                    (
                    (?<!\s)" | # no whitespace before
                    "(?=\s)    # whitespace behind
                    )
                    """, re.VERBOSE)
    text = closing_primary_quotes_regex.sub(smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = text.replace(r'"', smart.opquote)

    return text
def educateBackticks(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with ``backticks'' -style double quotes
                translated into the primary quote characters
                of `language`.

    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    quote_chars = smartchars(language)
    # two backticks open, two straight single quotes close
    opened = text.replace('``', quote_chars.opquote)
    return opened.replace("''", quote_chars.cpquote)
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with `backticks' -style single quotes
                translated into the secondary quote characters
                of `language`.

    Every backtick becomes an opening and every straight single quote
    a closing secondary quote.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    quote_chars = smartchars(language)
    opened = text.replace('`', quote_chars.osquote)
    return opened.replace("'", quote_chars.csquote)
def educateDashes(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character (and each "---" to an en-dash
                character).
    """
    # The longer sequence must be handled first:
    # "---" becomes an en-dash, "--" an em-dash (yes, backwards).
    return (text.replace('---', smartchars.endash)
                .replace('--', smartchars.emdash))
def educateDashesOldSchool(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # Replace the three-hyphen form before the two-hyphen form.
    with_em = text.replace('---', smartchars.emdash)
    return with_em.replace('--', smartchars.endash)
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                educateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes.  Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    # Replace the three-hyphen form before the two-hyphen form.
    with_en = text.replace('---', smartchars.endash)   # "---" -> en-dash
    return with_en.replace('--', smartchars.emdash)    # "--"  -> em-dash
def educateEllipses(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "..." (or ". . .")
                translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # compact form first, then the spaced-out variant
    for dots in ('...', '. . .'):
        text = text.replace(dots, smartchars.ellipsis)
    return text
def stupefyEntities(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart. The quote characters that are
                translated are those of `language`.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    chars = smartchars(language)
    # (smart character, ASCII replacement), applied in this order
    ascii_equivalents = (
        (chars.endash, "-"),
        (chars.emdash, "--"),
        (chars.osquote, "'"),  # open secondary quote
        (chars.csquote, "'"),  # close secondary quote
        (chars.opquote, '"'),  # open primary quote
        (chars.cpquote, '"'),  # close primary quote
        (chars.ellipsis, '...'),
    )
    for fancy, plain in ascii_equivalents:
        text = text.replace(fancy, plain)
    return text
def processEscapes(text, restore=False):
    r"""
    Parameter:  - text: string.
                - restore: if false (the default), hide the escaped
                  characters; if true, undo the hiding.
    Returns:    The `text` after processing the following backslash
                escape sequences. This is useful if you want to force
                a "dumb" quote or other character to appear.

    With ``restore=False``, each escape sequence is replaced by a
    placeholder of the form ``&#`` + decimal code point + ``;`` that is
    not touched by the other transformations. With ``restore=True``,
    each placeholder is replaced by the plain character.

    ======  ==========  =========
    Escape  Code point  Character
    ======  ==========  =========
    ``\\``  92          backslash
    ``\"``  34          double quote
    ``\'``  39          single quote
    ``\.``  46          full stop
    ``\-``  45          hyphen
    ``\```  96          backtick
    ======  ==========  =========
    """
    # Pairs of (escape sequence, placeholder). The placeholders are
    # generated from the code points instead of being spelled out, so
    # tools that decode character references cannot turn them back
    # into the very characters they are meant to hide.
    # The backslash comes first so that an escaped backslash is consumed
    # before it can be mistaken for the start of another escape.
    replacements = tuple(('\\' + char, '&#%d;' % ord(char))
                         for char in '\\"\'.-`')
    if restore:
        for (escape, placeholder) in replacements:
            text = text.replace(placeholder, escape[1])
    else:
        for (escape, placeholder) in replacements:
            text = text.replace(escape, placeholder)
    return text
def tokenize(text):
    """
    Parameter:  String containing HTML markup.
    Returns:    An iterator over the tokens that make up the input
                string. A token is either a tag (everything from a "<"
                up to and including the next ">") or a run of text
                outside of tags. Every yielded element is a two-element
                tuple: first 'tag' or 'text', then the matched string.
                Empty text runs are not reported.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    <http://www.bradchoate.com/past/mtregex.php>
    """
    # group 1: text in front of a tag (may be empty), group 2: the tag
    tag_pattern = re.compile(r'([^<]*)(<[^>]*>)')

    consumed = 0  # index just behind the last tag seen so far
    for found in tag_pattern.finditer(text):
        leading, tag = found.groups()
        if leading:
            yield 'text', leading
        yield 'tag', tag
        consumed = found.end()

    # Whatever follows the final tag (or the whole input, if it has no
    # tag at all) is plain text.
    trailing = text[consumed:]
    if trailing:
        yield 'text', trailing
if __name__ == "__main__":
    # Command line interface: read text from stdin, print the "educated"
    # version to stdout (or show documentation / run the self-test).

    import argparse
    import itertools
    import locale

    # Derive the default language from the user's locale. This is best
    # effort only: whenever no language can be determined, use English.
    try:
        locale.setlocale(locale.LC_ALL, '')  # set to user defaults
        # getlocale() is the documented replacement for the deprecated
        # getdefaultlocale().
        defaultlanguage = locale.getlocale()[0]
    except Exception:  # deliberate catch-all: the guess must never abort
        defaultlanguage = None
    # The language code may be None when it cannot be determined;
    # without this guard the .lower() call below raises AttributeError.
    if not defaultlanguage:
        defaultlanguage = 'en'

    # Normalize and drop unsupported subtags:
    defaultlanguage = defaultlanguage.lower().replace('-', '_')
    # split (except singletons, which mark the following tag as non-standard):
    defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage)
    _subtags = defaultlanguage.split('_')
    _basetag = _subtags.pop(0)
    # Candidate tags: the base tag joined with every combination of
    # subtags, longest combinations first. The first candidate with a
    # known quote style wins; a later, shorter candidate must not override
    # it. Without any match use the bare base tag if it is known, and
    # English otherwise (this also covers a locale without subtags).
    _candidates = ('-'.join((_basetag, *tags))
                   for n in range(len(_subtags), 0, -1)
                   for tags in itertools.combinations(_subtags, n))
    _fallback = _basetag if _basetag in smartchars.quotes else 'en'
    defaultlanguage = next(
        (_tag for _tag in _candidates if _tag in smartchars.quotes),
        _fallback)

    parser = argparse.ArgumentParser(
        description='Filter <input> making ASCII punctuation "smart".')
    # TODO: require input arg or other means to print USAGE instead of waiting.
    # parser.add_argument("input", help="Input stream, use '-' for stdin.")
    parser.add_argument("-a", "--action", default="1",
                        help="what to do with the input (see --actionhelp)")
    # NOTE(review): --encoding is accepted but never applied below; stdin
    # is read as text via sys.stdin.read(). Confirm whether it should be
    # honoured or dropped.
    parser.add_argument("-e", "--encoding", default="utf-8",
                        help="text encoding")
    parser.add_argument("-l", "--language", default=defaultlanguage,
                        help="text language (BCP47 tag), "
                             f"Default: {defaultlanguage}")
    parser.add_argument("-q", "--alternative-quotes", action="store_true",
                        help="use alternative quote style")
    parser.add_argument("--doc", action="store_true",
                        help="print documentation")
    parser.add_argument("--actionhelp", action="store_true",
                        help="list available actions")
    parser.add_argument("--stylehelp", action="store_true",
                        help="list available quote styles")
    parser.add_argument("--test", action="store_true",
                        help="perform short self-test")
    args = parser.parse_args()

    if args.doc:
        print(__doc__)
    elif args.actionhelp:
        print(options)
    elif args.stylehelp:
        print()
        print("Available styles (primary open/close, secondary open/close)")
        # Header columns line up with the 14-character key column below.
        print("language tag   quotes")
        print("============   ======")
        for key, style in sorted(smartchars.quotes.items()):
            print(f"{key:<14} {style}")
    elif args.test:
        # Unit test output goes to stderr.
        import unittest

        class TestSmartypantsAllAttributes(unittest.TestCase):
            # the default attribute is "1", which means "all".
            def test_dates(self):
                self.assertEqual(smartyPants("1440-80's"), "1440-80’s")
                self.assertEqual(smartyPants("1440-'80s"), "1440-’80s")
                self.assertEqual(smartyPants("1440---'80s"), "1440–’80s")
                self.assertEqual(smartyPants("1960's"), "1960’s")
                self.assertEqual(smartyPants("one two '60s"), "one two ’60s")
                self.assertEqual(smartyPants("'60s"), "’60s")

            def test_educated_quotes(self):
                self.assertEqual(smartyPants('"Isn\'t this fun?"'),
                                 '“Isn’t this fun?”')

            def test_html_tags(self):
                text = '<a src="foo">more</a>'
                self.assertEqual(smartyPants(text), text)

        suite = unittest.TestLoader().loadTestsFromTestCase(
            TestSmartypantsAllAttributes)
        unittest.TextTestRunner().run(suite)
    else:
        # -q toggles the "-x-altquot" suffix of the language tag.
        if args.alternative_quotes:
            if '-x-altquot' in args.language:
                args.language = args.language.replace('-x-altquot', '')
            else:
                args.language += '-x-altquot'
        text = sys.stdin.read()
        print(smartyPants(text, attr=args.action, language=args.language))
|