nsSBCharSetProber.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Mozilla Universal charset detector code.
  16. *
  17. * The Initial Developer of the Original Code is
  18. * Netscape Communications Corporation.
  19. * Portions created by the Initial Developer are Copyright (C) 2001
  20. * the Initial Developer. All Rights Reserved.
  21. *
  22. * Contributor(s):
  23. * Shy Shalom <shooshX@gmail.com>
  24. *
  25. * Alternatively, the contents of this file may be used under the terms of
  26. * either the GNU General Public License Version 2 or later (the "GPL"), or
  27. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28. * in which case the provisions of the GPL or the LGPL are applicable instead
  29. * of those above. If you wish to allow use of your version of this file only
  30. * under the terms of either the GPL or the LGPL, and not to allow others to
  31. * use your version of this file under the terms of the MPL, indicate your
  32. * decision by deleting the provisions above and replace them with the notice
  33. * and other provisions required by the GPL or the LGPL. If you do not delete
  34. * the provisions above, a recipient may use your version of this file under
  35. * the terms of any one of the MPL, the GPL or the LGPL.
  36. *
  37. * ***** END LICENSE BLOCK ***** */
  38. #ifndef nsSingleByteCharSetProber_h__
  39. #define nsSingleByteCharSetProber_h__
  40. #include "nsCharSetProber.h"
  /** Codepoints **/

  /* Illegal codepoints. */
  #define ILL 255
  /* Control characters. */
  #define CTR 254
  /* Symbols and punctuation that do not belong to words. */
  #define SYM 253
  /* Return/line feeds. */
  #define RET 252
  /* Numbers 0-9. */
  #define NUM 251

  /*
   * Tuning constants. They are not referenced anywhere else in this header;
   * presumably the prober implementation consumes them (confirm in the .cpp).
   */
  #define SB_ENOUGH_REL_THRESHOLD 1024
  /*
   * The float casts are wrapped in an outer pair of parentheses so that each
   * macro expands to one self-contained expression (for example
   * `sizeof POSITIVE_SHORTCUT_THRESHOLD` parses), in line with the
   * parenthesized *_CAT macros below. The values themselves are unchanged.
   */
  #define POSITIVE_SHORTCUT_THRESHOLD ((float)0.95)
  #define NEGATIVE_SHORTCUT_THRESHOLD ((float)0.05)
  #define SYMBOL_CAT_ORDER 250

  /*
   * Sequence categories. NUMBER_OF_SEQ_CAT is the number of categories (it
   * sizes the prober's mSeqCounters array); the *_CAT values are the category
   * numbers, derived from that count except for NEGATIVE_CAT.
   */
  #define NUMBER_OF_SEQ_CAT 4
  #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
  #define PROBABLE_CAT (NUMBER_OF_SEQ_CAT-2)
  #define NEUTRAL_CAT (NUMBER_OF_SEQ_CAT-3)
  #define NEGATIVE_CAT 0
  61. typedef struct
  62. {
  63. /* [256] table mapping codepoints to chararacter orders. */
  64. const unsigned char* const charToOrderMap;
  65. /* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */
  66. const PRUint8* const precedenceMatrix;
  67. /* The count of frequent characters. */
  68. int freqCharCount;
  69. float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
  70. PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
  71. const char* const charsetName;
  72. } SequenceModel;
  73. class nsSingleByteCharSetProber : public nsCharSetProber{
  74. public:
  75. nsSingleByteCharSetProber(const SequenceModel *model)
  76. :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
  77. nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
  78. :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
  79. virtual const char* GetCharSetName();
  80. virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
  81. virtual nsProbingState GetState(void) {return mState;}
  82. virtual void Reset(void);
  83. virtual float GetConfidence(void);
  84. virtual void SetOpion() {}
  85. // This feature is not implemented yet. any current language model
  86. // contain this parameter as PR_FALSE. No one is looking at this
  87. // parameter or calling this method.
  88. // Moreover, the nsSBCSGroupProber which calls the HandleData of this
  89. // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
  90. // of the English letters.
  91. PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
  92. #ifdef DEBUG_chardet
  93. virtual void DumpStatus();
  94. #endif
  95. protected:
  96. nsProbingState mState;
  97. const SequenceModel* const mModel;
  98. const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
  99. //char order of last character
  100. unsigned char mLastOrder;
  101. PRUint32 mTotalSeqs;
  102. PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
  103. PRUint32 mTotalChar;
  104. PRUint32 mCtrlChar;
  105. //characters that fall in our sampling range
  106. PRUint32 mFreqChar;
  107. // Optional auxiliary prober for name decision. created and destroyed by the GroupProber
  108. nsCharSetProber* mNameProber;
  109. };
  110. extern const SequenceModel Windows_1256ArabicModel;
  111. extern const SequenceModel Iso_8859_6ArabicModel;
  112. extern const SequenceModel Koi8rRussianModel;
  113. extern const SequenceModel Win1251RussianModel;
  114. extern const SequenceModel Latin5RussianModel;
  115. extern const SequenceModel MacCyrillicRussianModel;
  116. extern const SequenceModel Ibm866RussianModel;
  117. extern const SequenceModel Ibm855RussianModel;
  118. extern const SequenceModel Iso_8859_7GreekModel;
  119. extern const SequenceModel Windows_1253GreekModel;
  120. extern const SequenceModel Latin5BulgarianModel;
  121. extern const SequenceModel Win1251BulgarianModel;
  122. extern const SequenceModel Iso_8859_2HungarianModel;
  123. extern const SequenceModel Windows_1250HungarianModel;
  124. extern const SequenceModel Win1255Model;
  125. extern const SequenceModel Tis_620ThaiModel;
  126. extern const SequenceModel Iso_8859_11ThaiModel;
  127. extern const SequenceModel Iso_8859_15FrenchModel;
  128. extern const SequenceModel Iso_8859_1FrenchModel;
  129. extern const SequenceModel Windows_1252FrenchModel;
  130. extern const SequenceModel Iso_8859_15SpanishModel;
  131. extern const SequenceModel Iso_8859_1SpanishModel;
  132. extern const SequenceModel Windows_1252SpanishModel;
  133. extern const SequenceModel Iso_8859_1GermanModel;
  134. extern const SequenceModel Windows_1252GermanModel;
  135. extern const SequenceModel Iso_8859_3EsperantoModel;
  136. extern const SequenceModel Iso_8859_3TurkishModel;
  137. extern const SequenceModel Iso_8859_9TurkishModel;
  138. extern const SequenceModel VisciiVietnameseModel;
  139. extern const SequenceModel Windows_1258VietnameseModel;
  140. extern const SequenceModel Iso_8859_15DanishModel;
  141. extern const SequenceModel Iso_8859_1DanishModel;
  142. extern const SequenceModel Windows_1252DanishModel;
  143. #endif /* nsSingleByteCharSetProber_h__ */