nsMBCSGroupProber.cpp 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Mozilla Universal charset detector code.
  16. *
  17. * The Initial Developer of the Original Code is
  18. * Netscape Communications Corporation.
  19. * Portions created by the Initial Developer are Copyright (C) 2001
  20. * the Initial Developer. All Rights Reserved.
  21. *
  22. * Contributor(s):
  23. * Shy Shalom <shooshX@gmail.com>
  24. * Proofpoint, Inc.
  25. *
  26. * Alternatively, the contents of this file may be used under the terms of
  27. * either the GNU General Public License Version 2 or later (the "GPL"), or
  28. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29. * in which case the provisions of the GPL or the LGPL are applicable instead
  30. * of those above. If you wish to allow use of your version of this file only
  31. * under the terms of either the GPL or the LGPL, and not to allow others to
  32. * use your version of this file under the terms of the MPL, indicate your
  33. * decision by deleting the provisions above and replace them with the notice
  34. * and other provisions required by the GPL or the LGPL. If you do not delete
  35. * the provisions above, a recipient may use your version of this file under
  36. * the terms of any one of the MPL, the GPL or the LGPL.
  37. *
  38. * ***** END LICENSE BLOCK ***** */
  39. #include <stdio.h>
  40. #include "nsMBCSGroupProber.h"
  41. #include "nsUniversalDetector.h"
  42. #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
  43. const char *ProberName[] =
  44. {
  45. "UTF-8",
  46. "SJIS",
  47. "EUC-JP",
  48. "GB18030",
  49. "EUC-KR",
  50. "Big5",
  51. "EUC-TW",
  52. };
  53. #endif
  54. nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
  55. {
  56. for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  57. mProbers[i] = nsnull;
  58. mProbers[0] = new nsUTF8Prober();
  59. if (aLanguageFilter & NS_FILTER_JAPANESE)
  60. {
  61. mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
  62. mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
  63. }
  64. if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
  65. mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
  66. if (aLanguageFilter & NS_FILTER_KOREAN)
  67. mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
  68. if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
  69. {
  70. mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
  71. mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
  72. }
  73. Reset();
  74. }
  75. nsMBCSGroupProber::~nsMBCSGroupProber()
  76. {
  77. for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  78. {
  79. delete mProbers[i];
  80. }
  81. }
  82. const char* nsMBCSGroupProber::GetCharSetName()
  83. {
  84. if (mBestGuess == -1)
  85. {
  86. GetConfidence();
  87. if (mBestGuess == -1)
  88. mBestGuess = 0;
  89. }
  90. return mProbers[mBestGuess]->GetCharSetName();
  91. }
  92. void nsMBCSGroupProber::Reset(void)
  93. {
  94. mActiveNum = 0;
  95. for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  96. {
  97. if (mProbers[i])
  98. {
  99. mProbers[i]->Reset();
  100. mIsActive[i] = PR_TRUE;
  101. ++mActiveNum;
  102. }
  103. else
  104. mIsActive[i] = PR_FALSE;
  105. }
  106. mBestGuess = -1;
  107. mState = eDetecting;
  108. mKeepNext = 0;
  109. }
  110. nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
  111. {
  112. nsProbingState st;
  113. PRUint32 start = 0;
  114. PRUint32 keepNext = mKeepNext;
  115. //do filtering to reduce load to probers
  116. for (PRUint32 pos = 0; pos < aLen; ++pos)
  117. {
  118. if (aBuf[pos] & 0x80)
  119. {
  120. if (!keepNext)
  121. start = pos;
  122. keepNext = 2;
  123. }
  124. else if (keepNext)
  125. {
  126. if (--keepNext == 0)
  127. {
  128. for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  129. {
  130. if (!mIsActive[i])
  131. continue;
  132. st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
  133. if (st == eFoundIt)
  134. {
  135. mBestGuess = i;
  136. mState = eFoundIt;
  137. return mState;
  138. }
  139. }
  140. }
  141. }
  142. }
  143. if (keepNext) {
  144. for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
  145. {
  146. if (!mIsActive[i])
  147. continue;
  148. st = mProbers[i]->HandleData(aBuf + start, aLen - start);
  149. if (st == eFoundIt)
  150. {
  151. mBestGuess = i;
  152. mState = eFoundIt;
  153. return mState;
  154. }
  155. }
  156. }
  157. mKeepNext = keepNext;
  158. return mState;
  159. }
  160. float nsMBCSGroupProber::GetConfidence(void)
  161. {
  162. PRUint32 i;
  163. float bestConf = 0.0, cf;
  164. switch (mState)
  165. {
  166. case eFoundIt:
  167. return (float)0.99;
  168. case eNotMe:
  169. return (float)0.01;
  170. default:
  171. for (i = 0; i < NUM_OF_PROBERS; i++)
  172. {
  173. if (!mIsActive[i])
  174. continue;
  175. cf = mProbers[i]->GetConfidence();
  176. if (bestConf < cf)
  177. {
  178. bestConf = cf;
  179. mBestGuess = i;
  180. }
  181. }
  182. }
  183. return bestConf;
  184. }
  185. #ifdef DEBUG_chardet
  186. void nsMBCSGroupProber::DumpStatus()
  187. {
  188. PRUint32 i;
  189. float cf;
  190. GetConfidence();
  191. for (i = 0; i < NUM_OF_PROBERS; i++)
  192. {
  193. if (!mIsActive[i])
  194. printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
  195. else
  196. {
  197. cf = mProbers[i]->GetConfidence();
  198. printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
  199. }
  200. }
  201. }
  202. #endif
  203. #ifdef DEBUG_jgmyers
  204. void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
  205. {
  206. for (PRUint32 i = 0; i < NUM_OF_PROBERS; ++i) {
  207. states[offset].name = ProberName[i];
  208. states[offset].isActive = mIsActive[i];
  209. states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0;
  210. ++offset;
  211. }
  212. }
  213. #endif /* DEBUG_jgmyers */