pt.py 3.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. #!/bin/python3
  2. # -*- coding: utf-8 -*-
  3. # ##### BEGIN LICENSE BLOCK #####
  4. # Version: MPL 1.1/GPL 2.0/LGPL 2.1
  5. #
  6. # The contents of this file are subject to the Mozilla Public License Version
  7. # 1.1 (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. # http://www.mozilla.org/MPL/
  10. #
  11. # Software distributed under the License is distributed on an "AS IS" basis,
  12. # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13. # for the specific language governing rights and limitations under the
  14. # License.
  15. #
  16. # The Original Code is Mozilla Universal charset detector code.
  17. #
  18. # The Initial Developer of the Original Code is
  19. # Netscape Communications Corporation.
  20. # Portions created by the Initial Developer are Copyright (C) 2001
  21. # the Initial Developer. All Rights Reserved.
  22. #
  23. # Contributor(s):
  24. # Jehan <jehan@girinstud.io>
  25. #
  26. # Alternatively, the contents of this file may be used under the terms of
  27. # either the GNU General Public License Version 2 or later (the "GPL"), or
  28. # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29. # in which case the provisions of the GPL or the LGPL are applicable instead
  30. # of those above. If you wish to allow use of your version of this file only
  31. # under the terms of either the GPL or the LGPL, and not to allow others to
  32. # use your version of this file under the terms of the MPL, indicate your
  33. # decision by deleting the provisions above and replace them with the notice
  34. # and other provisions required by the GPL or the LGPL. If you do not delete
  35. # the provisions above, a recipient may use your version of this file under
  36. # the terms of any one of the MPL, the GPL or the LGPL.
  37. #
  38. # ##### END LICENSE BLOCK #####
  39. import re
  40. ## Mandatory Properties ##
  41. # The human name for the language, in English.
  42. name = 'Portuguese'
  43. # Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
  44. # or use another catalog as a last resort.
  45. code = 'pt'
  46. # ASCII characters are also used in Portuguese.
  47. use_ascii = True
  48. # The charsets we want to support and create data for.
  49. charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'ISO-8859-9']
  50. ## Optional Properties ##
  51. # Alphabet characters.
  52. # If use_ascii=True, there is no need to add any ASCII characters.
  53. # If case_mapping=True, there is no need to add several cases of a same
  54. # character (provided Python algorithms know the right cases).
  55. alphabet = 'áâãàçéêíóôõú'
  56. # A featured article that was highlighted on the Wikipedia main page
  57. # when this data was created.
  58. start_pages = ['Papagaio-das-mascarenhas']
  59. # Allows selecting a different language code for the Wikipedia URL.
  60. wikipedia_code = code
  61. # 'a' and 'A' will be considered the same character, and so on.
  62. # This uses Python algorithm to determine upper/lower-case of a given
  63. # character.
  64. case_mapping = True
  65. # A function to clean content returned by the `wikipedia` python lib,
  66. # in case some unwanted data has been overlooked.
  67. # Note that we are already cleaning away the '=' from the title syntax
  68. # of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
  69. # some language may return weird syntax or UI text which should be
  70. # discarded. If you encounter one of these cases, use this function.
  71. def clean_wikipedia_content(content):
  72. # Do your garbage text cleaning here.
  73. return content