nsUniversalDetector.cpp 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * The Original Code is Mozilla Universal charset detector code.
  16. *
  17. * The Initial Developer of the Original Code is
  18. * Netscape Communications Corporation.
  19. * Portions created by the Initial Developer are Copyright (C) 2001
  20. * the Initial Developer. All Rights Reserved.
  21. *
  22. * Contributor(s):
  23. * Shy Shalom <shooshX@gmail.com>
  24. *
  25. * Alternatively, the contents of this file may be used under the terms of
  26. * either the GNU General Public License Version 2 or later (the "GPL"), or
  27. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28. * in which case the provisions of the GPL or the LGPL are applicable instead
  29. * of those above. If you wish to allow use of your version of this file only
  30. * under the terms of either the GPL or the LGPL, and not to allow others to
  31. * use your version of this file under the terms of the MPL, indicate your
  32. * decision by deleting the provisions above and replace them with the notice
  33. * and other provisions required by the GPL or the LGPL. If you do not delete
  34. * the provisions above, a recipient may use your version of this file under
  35. * the terms of any one of the MPL, the GPL or the LGPL.
  36. *
  37. * ***** END LICENSE BLOCK ***** */
  38. #include "nscore.h"
  39. #include "nsUniversalDetector.h"
  40. #include "nsMBCSGroupProber.h"
  41. #include "nsSBCSGroupProber.h"
  42. #include "nsEscCharsetProber.h"
  43. #include "nsLatin1Prober.h"
  44. nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
  45. {
  46. mNbspFound = PR_FALSE;
  47. mDone = PR_FALSE;
  48. mBestGuess = -1; //illegal value as signal
  49. mInTag = PR_FALSE;
  50. mEscCharSetProber = nsnull;
  51. mStart = PR_TRUE;
  52. mDetectedCharset = nsnull;
  53. mGotData = PR_FALSE;
  54. mInputState = ePureAscii;
  55. mLastChar = '\0';
  56. mLanguageFilter = aLanguageFilter;
  57. PRUint32 i;
  58. for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
  59. mCharSetProbers[i] = nsnull;
  60. }
  61. nsUniversalDetector::~nsUniversalDetector()
  62. {
  63. for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
  64. delete mCharSetProbers[i];
  65. delete mEscCharSetProber;
  66. }
  67. void
  68. nsUniversalDetector::Reset()
  69. {
  70. mNbspFound = PR_FALSE;
  71. mDone = PR_FALSE;
  72. mBestGuess = -1; //illegal value as signal
  73. mInTag = PR_FALSE;
  74. mStart = PR_TRUE;
  75. mDetectedCharset = nsnull;
  76. mGotData = PR_FALSE;
  77. mInputState = ePureAscii;
  78. mLastChar = '\0';
  79. if (mEscCharSetProber)
  80. mEscCharSetProber->Reset();
  81. PRUint32 i;
  82. for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
  83. if (mCharSetProbers[i])
  84. mCharSetProbers[i]->Reset();
  85. }
  86. //---------------------------------------------------------------------
  /* Confidence thresholds (single-precision floats).
   *
   * SHORTCUT_THRESHOLD: not referenced in the visible part of this file;
   *   presumably a confidence high enough to stop probing early -- confirm
   *   against its users before changing it.
   * MINIMUM_THRESHOLD: DataEnd() reports the best prober's charset only when
   *   its confidence is strictly greater than this value. */
  #define SHORTCUT_THRESHOLD 0.95f
  #define MINIMUM_THRESHOLD 0.20f
  89. nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
  90. {
  91. if(mDone)
  92. return NS_OK;
  93. if (aLen > 0)
  94. mGotData = PR_TRUE;
  95. /* If the data starts with BOM, we know it is UTF. */
  96. if (mStart)
  97. {
  98. mStart = PR_FALSE;
  99. if (aLen > 2)
  100. switch (aBuf[0])
  101. {
  102. case '\xEF':
  103. if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
  104. /* EF BB BF: UTF-8 encoded BOM. */
  105. mDetectedCharset = "UTF-8";
  106. break;
  107. case '\xFE':
  108. if ('\xFF' == aBuf[1])
  109. /* FE FF: UTF-16, big endian BOM. */
  110. mDetectedCharset = "UTF-16";
  111. break;
  112. case '\xFF':
  113. if ('\xFE' == aBuf[1])
  114. {
  115. if (aLen > 3 &&
  116. aBuf[2] == '\x00' &&
  117. aBuf[3] == '\x00')
  118. {
  119. /* FF FE 00 00: UTF-32 (LE). */
  120. mDetectedCharset = "UTF-32";
  121. }
  122. else
  123. {
  124. /* FF FE: UTF-16, little endian BOM. */
  125. mDetectedCharset = "UTF-16";
  126. }
  127. }
  128. break;
  129. case '\x00':
  130. if (aLen > 3 &&
  131. aBuf[1] == '\x00' &&
  132. aBuf[2] == '\xFE' &&
  133. aBuf[3] == '\xFF')
  134. {
  135. /* 00 00 FE FF: UTF-32 (BE). */
  136. mDetectedCharset = "UTF-32";
  137. }
  138. break;
  139. }
  140. if (mDetectedCharset)
  141. {
  142. mDone = PR_TRUE;
  143. return NS_OK;
  144. }
  145. }
  146. PRUint32 i;
  147. for (i = 0; i < aLen; i++)
  148. {
  149. /* If every other character is ASCII or 0xA0, we don't run charset
  150. * probers.
  151. * 0xA0 (NBSP in a few charset) is apparently a rare exception
  152. * of non-ASCII character often contained in nearly-ASCII text. */
  153. if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
  154. {
  155. /* We got a non-ASCII byte (high-byte) */
  156. if (mInputState != eHighbyte)
  157. {
  158. //adjust state
  159. mInputState = eHighbyte;
  160. //kill mEscCharSetProber if it is active
  161. if (mEscCharSetProber) {
  162. delete mEscCharSetProber;
  163. mEscCharSetProber = nsnull;
  164. }
  165. //start multibyte and singlebyte charset prober
  166. if (nsnull == mCharSetProbers[0])
  167. {
  168. mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
  169. if (nsnull == mCharSetProbers[0])
  170. return NS_ERROR_OUT_OF_MEMORY;
  171. }
  172. if (nsnull == mCharSetProbers[1] &&
  173. (mLanguageFilter & NS_FILTER_NON_CJK))
  174. {
  175. mCharSetProbers[1] = new nsSBCSGroupProber;
  176. if (nsnull == mCharSetProbers[1])
  177. return NS_ERROR_OUT_OF_MEMORY;
  178. }
  179. if (nsnull == mCharSetProbers[2])
  180. {
  181. mCharSetProbers[2] = new nsLatin1Prober;
  182. if (nsnull == mCharSetProbers[2])
  183. return NS_ERROR_OUT_OF_MEMORY;
  184. }
  185. }
  186. }
  187. else
  188. {
  189. /* Just pure ASCII or NBSP so far. */
  190. if (aBuf[i] == '\xA0')
  191. {
  192. /* ASCII with the only exception of NBSP seems quite common.
  193. * I doubt it is really necessary to train a model here, so let's
  194. * just make an exception.
  195. */
  196. mNbspFound = PR_TRUE;
  197. }
  198. else if (mInputState == ePureAscii &&
  199. (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
  200. {
  201. /* We found an escape character or HZ "~{". */
  202. mInputState = eEscAscii;
  203. }
  204. mLastChar = aBuf[i];
  205. }
  206. }
  207. nsProbingState st;
  208. switch (mInputState)
  209. {
  210. case eEscAscii:
  211. if (nsnull == mEscCharSetProber) {
  212. mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
  213. if (nsnull == mEscCharSetProber)
  214. return NS_ERROR_OUT_OF_MEMORY;
  215. }
  216. st = mEscCharSetProber->HandleData(aBuf, aLen);
  217. if (st == eFoundIt)
  218. {
  219. mDone = PR_TRUE;
  220. mDetectedCharset = mEscCharSetProber->GetCharSetName();
  221. }
  222. else if (mNbspFound)
  223. {
  224. mDetectedCharset = "ISO-8859-1";
  225. }
  226. else
  227. {
  228. /* ASCII with the ESC character (or the sequence "~{") is still
  229. * ASCII until proven otherwise. */
  230. mDetectedCharset = "ASCII";
  231. }
  232. break;
  233. case eHighbyte:
  234. for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
  235. {
  236. if (mCharSetProbers[i])
  237. {
  238. st = mCharSetProbers[i]->HandleData(aBuf, aLen);
  239. if (st == eFoundIt)
  240. {
  241. mDone = PR_TRUE;
  242. mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
  243. return NS_OK;
  244. }
  245. }
  246. }
  247. break;
  248. default:
  249. if (mNbspFound)
  250. {
  251. /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
  252. * (though it could have been any ISO-8859 encoding). */
  253. mDetectedCharset = "ISO-8859-1";
  254. }
  255. else
  256. {
  257. /* Pure ASCII */
  258. mDetectedCharset = "ASCII";
  259. }
  260. break;
  261. }
  262. return NS_OK;
  263. }
  264. //---------------------------------------------------------------------
  265. void nsUniversalDetector::DataEnd()
  266. {
  267. if (!mGotData)
  268. {
  269. // we haven't got any data yet, return immediately
  270. // caller program sometimes call DataEnd before anything has been sent to detector
  271. return;
  272. }
  273. if (mDetectedCharset)
  274. {
  275. mDone = PR_TRUE;
  276. Report(mDetectedCharset);
  277. return;
  278. }
  279. switch (mInputState)
  280. {
  281. case eHighbyte:
  282. {
  283. float proberConfidence;
  284. float maxProberConfidence = (float)0.0;
  285. PRInt32 maxProber = 0;
  286. for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
  287. {
  288. if (mCharSetProbers[i])
  289. {
  290. proberConfidence = mCharSetProbers[i]->GetConfidence();
  291. if (proberConfidence > maxProberConfidence)
  292. {
  293. maxProberConfidence = proberConfidence;
  294. maxProber = i;
  295. }
  296. }
  297. }
  298. //do not report anything because we are not confident of it, that's in fact a negative answer
  299. if (maxProberConfidence > MINIMUM_THRESHOLD)
  300. Report(mCharSetProbers[maxProber]->GetCharSetName());
  301. }
  302. break;
  303. case eEscAscii:
  304. break;
  305. default:
  306. ;
  307. }
  308. return;
  309. }