1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- #!/bin/python3
- # -*- coding: utf-8 -*-
- # ##### BEGIN LICENSE BLOCK #####
- # Version: MPL 1.1/GPL 2.0/LGPL 2.1
- #
- # The contents of this file are subject to the Mozilla Public License Version
- # 1.1 (the "License"); you may not use this file except in compliance with
- # the License. You may obtain a copy of the License at
- # http://www.mozilla.org/MPL/
- #
- # Software distributed under the License is distributed on an "AS IS" basis,
- # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- # for the specific language governing rights and limitations under the
- # License.
- #
- # The Original Code is Mozilla Universal charset detector code.
- #
- # The Initial Developer of the Original Code is
- # Netscape Communications Corporation.
- # Portions created by the Initial Developer are Copyright (C) 2001
- # the Initial Developer. All Rights Reserved.
- #
- # Contributor(s):
- # Jehan <jehan@girinstud.io>
- #
- # Alternatively, the contents of this file may be used under the terms of
- # either the GNU General Public License Version 2 or later (the "GPL"), or
- # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- # in which case the provisions of the GPL or the LGPL are applicable instead
- # of those above. If you wish to allow use of your version of this file only
- # under the terms of either the GPL or the LGPL, and not to allow others to
- # use your version of this file under the terms of the MPL, indicate your
- # decision by deleting the provisions above and replace them with the notice
- # and other provisions required by the GPL or the LGPL. If you do not delete
- # the provisions above, a recipient may use your version of this file under
- # the terms of any one of the MPL, the GPL or the LGPL.
- #
- # ##### END LICENSE BLOCK #####
import re

## Mandatory Properties ##

# The human name for the language, in English.
name = 'Portuguese'
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
# or use another catalog as a last resort.
code = 'pt'
# ASCII characters are also used in Portuguese.
use_ascii = True
# The charsets we want to support and create data for.
charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'ISO-8859-9']

## Optional Properties ##

# Alphabet characters.
# If use_ascii=True, there is no need to add any ASCII characters.
# If case_mapping=True, there is no need to add several cases of the same
# character (provided Python algorithms know the right cases).
alphabet = 'áâãàçéêíóôõú'
# The featured page that was highlighted on the Wikipedia main page when
# the data was created.
start_pages = ['Papagaio-das-mascarenhas']
# Gives the possibility to select another code for the Wikipedia URL.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This uses Python's algorithm to determine the upper/lower case of a
# given character.
case_mapping = True
def clean_wikipedia_content(content):
    """Clean content returned by the `wikipedia` Python lib.

    This is a hook for discarding unwanted data that has been overlooked.
    The '=' characters of the Wikipedia title syntax and double spaces are
    already cleaned away elsewhere, but Wikipedia in some languages may
    return weird syntax or UI text which should be discarded. If such a
    case is encountered, handle it in this function.

    No extra cleaning is currently performed: the content is returned
    unchanged.
    """
    # Do your garbage text cleaning here.
    return content
|