CharDistribution.h 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Mozilla Communicator client code.
  16. *
  17. * The Initial Developer of the Original Code is
  18. * Netscape Communications Corporation.
  19. * Portions created by the Initial Developer are Copyright (C) 1998
  20. * the Initial Developer. All Rights Reserved.
  21. *
  22. * Contributor(s):
  23. *
  24. * Alternatively, the contents of this file may be used under the terms of
  25. * either the GNU General Public License Version 2 or later (the "GPL"), or
  26. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27. * in which case the provisions of the GPL or the LGPL are applicable instead
  28. * of those above. If you wish to allow use of your version of this file only
  29. * under the terms of either the GPL or the LGPL, and not to allow others to
  30. * use your version of this file under the terms of the MPL, indicate your
  31. * decision by deleting the provisions above and replace them with the notice
  32. * and other provisions required by the GPL or the LGPL. If you do not delete
  33. * the provisions above, a recipient may use your version of this file under
  34. * the terms of any one of the MPL, the GPL or the LGPL.
  35. *
  36. * ***** END LICENSE BLOCK ***** */
  37. #ifndef CharDistribution_h__
  38. #define CharDistribution_h__
  39. #include "nscore.h"
  // Once more than this many characters have been counted,
  // CharDistributionAnalysis::GotEnoughData() reports that enough input was seen.
  40. #define ENOUGH_DATA_THRESHOLD 1024
  // Value CharDistributionAnalysis::Reset() stores in mDataThreshold when the
  // language is not the preferred one (0 is stored otherwise).
  41. #define MINIMUM_DATA_THRESHOLD 4
  42. class CharDistributionAnalysis
  43. {
  44. public:
  45. CharDistributionAnalysis() {Reset(PR_FALSE);}
  46. //feed a block of data and do distribution analysis
  47. void HandleData(const char* aBuf, PRUint32 aLen) {}
  48. //Feed a character with known length
  49. void HandleOneChar(const char* aStr, PRUint32 aCharLen)
  50. {
  51. PRInt32 order;
  52. //we only care about 2-bytes character in our distribution analysis
  53. order = (aCharLen == 2) ? GetOrder(aStr) : -1;
  54. if (order >= 0)
  55. {
  56. mTotalChars++;
  57. //order is valid
  58. if ((PRUint32)order < mTableSize)
  59. {
  60. if (512 > mCharToFreqOrder[order])
  61. mFreqChars++;
  62. }
  63. }
  64. }
  65. //return confidence base on existing data
  66. float GetConfidence(void);
  67. //Reset analyser, clear any state
  68. void Reset(PRBool aIsPreferredLanguage)
  69. {
  70. mDone = PR_FALSE;
  71. mTotalChars = 0;
  72. mFreqChars = 0;
  73. mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
  74. }
  75. //This function is for future extension. Caller can use this function to control
  76. //analyser's behavior
  77. void SetOpion(){}
  78. //It is not necessary to receive all data to draw conclusion. For charset detection,
  79. // certain amount of data is enough
  80. PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
  81. protected:
  82. //we do not handle character base on its original encoding string, but
  83. //convert this encoding string to a number, here called order.
  84. //This allow multiple encoding of a language to share one frequency table
  85. virtual PRInt32 GetOrder(const char* str) {return -1;}
  86. //If this flag is set to PR_TRUE, detection is done and conclusion has been made
  87. PRBool mDone;
  88. //The number of characters whose frequency order is less than 512
  89. PRUint32 mFreqChars;
  90. //Total character encounted.
  91. PRUint32 mTotalChars;
  92. //Number of hi-byte characters needed to trigger detection
  93. PRUint32 mDataThreshold;
  94. //Mapping table to get frequency order from char order (get from GetOrder())
  95. const PRInt16 *mCharToFreqOrder;
  96. //Size of above table
  97. PRUint32 mTableSize;
  98. //This is a constant value varies from language to language, it is used in
  99. //calculating confidence. See my paper for further detail.
  100. float mTypicalDistributionRatio;
  101. };
  102. class EUCTWDistributionAnalysis: public CharDistributionAnalysis
  103. {
  104. public:
  105. EUCTWDistributionAnalysis();
  106. protected:
  107. //for EUC-TW encoding, we are interested
  108. // first byte range: 0xc4 -- 0xfe
  109. // second byte range: 0xa1 -- 0xfe
  110. //no validation needed here. State machine has done that
  111. PRInt32 GetOrder(const char* str)
  112. { if ((unsigned char)*str >= (unsigned char)0xc4)
  113. return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
  114. else
  115. return -1;
  116. }
  117. };
  118. class EUCKRDistributionAnalysis : public CharDistributionAnalysis
  119. {
  120. public:
  121. EUCKRDistributionAnalysis();
  122. protected:
  123. //for euc-KR encoding, we are interested
  124. // first byte range: 0xb0 -- 0xfe
  125. // second byte range: 0xa1 -- 0xfe
  126. //no validation needed here. State machine has done that
  127. PRInt32 GetOrder(const char* str)
  128. { if ((unsigned char)*str >= (unsigned char)0xb0)
  129. return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
  130. else
  131. return -1;
  132. }
  133. };
  134. class GB2312DistributionAnalysis : public CharDistributionAnalysis
  135. {
  136. public:
  137. GB2312DistributionAnalysis();
  138. protected:
  139. //for GB2312 encoding, we are interested
  140. // first byte range: 0xb0 -- 0xfe
  141. // second byte range: 0xa1 -- 0xfe
  142. //no validation needed here. State machine has done that
  143. PRInt32 GetOrder(const char* str)
  144. { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
  145. return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
  146. else
  147. return -1;
  148. }
  149. };
  150. class Big5DistributionAnalysis : public CharDistributionAnalysis
  151. {
  152. public:
  153. Big5DistributionAnalysis();
  154. protected:
  155. //for big5 encoding, we are interested
  156. // first byte range: 0xa4 -- 0xfe
  157. // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
  158. //no validation needed here. State machine has done that
  159. PRInt32 GetOrder(const char* str)
  160. { if ((unsigned char)*str >= (unsigned char)0xa4)
  161. if ((unsigned char)str[1] >= (unsigned char)0xa1)
  162. return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
  163. else
  164. return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
  165. else
  166. return -1;
  167. }
  168. };
  169. class SJISDistributionAnalysis : public CharDistributionAnalysis
  170. {
  171. public:
  172. SJISDistributionAnalysis();
  173. protected:
  174. //for sjis encoding, we are interested
  175. // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
  176. // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
  177. //no validation needed here. State machine has done that
  178. PRInt32 GetOrder(const char* str)
  179. {
  180. PRInt32 order;
  181. if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
  182. order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
  183. else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
  184. order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
  185. else
  186. return -1;
  187. order += (unsigned char)*(str+1) - 0x40;
  188. if ((unsigned char)str[1] > (unsigned char)0x7f)
  189. order--;
  190. return order;
  191. }
  192. };
  193. class EUCJPDistributionAnalysis : public CharDistributionAnalysis
  194. {
  195. public:
  196. EUCJPDistributionAnalysis();
  197. protected:
  198. //for euc-JP encoding, we are interested
  199. // first byte range: 0xa0 -- 0xfe
  200. // second byte range: 0xa1 -- 0xfe
  201. //no validation needed here. State machine has done that
  202. PRInt32 GetOrder(const char* str)
  203. { if ((unsigned char)*str >= (unsigned char)0xa0)
  204. return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
  205. else
  206. return -1;
  207. }
  208. };
  209. #endif //CharDistribution_h__