00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "encodingdetector.h"
00029
00030 #undef DECODE_DEBUG
00031
00032
// Buffer size limit in bytes (16 KiB). The expression is parenthesized so the
// macro keeps its value when it is used inside a larger expression
// (e.g. `x / MAX_BUFFER` or `MAX_BUFFER % y`).
#define MAX_BUFFER (16*1024)
00034
00035 #include <assert.h>
00036 #include <stdlib.h>
00037
00038 #include "encodingdetector_ja_p.h"
00039
00040 #include <qregexp.h>
00041 #include <qtextcodec.h>
00042
00043 #include <kglobal.h>
00044 #include <kcharsets.h>
00045 #include <kdebug.h>
00046 #include <klocale.h>
00047
00048 #include <ctype.h>
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065 typedef struct _PangoScriptForLang {
00066 const char lang[6];
00067 EncodingDetector::AutoDetectScript scripts[1];
00068 } PangoScriptForLang;
00069
00070
00071
00072
00073
00074
00075
/*
 * Script names used by the language table below, expressed in terms of
 * EncodingDetector::AutoDetectScript values.
 *
 * First group: scripts that have no dedicated detector category here and
 * therefore all expand to EncodingDetector::None.
 */
#define PANGO_SCRIPT_ARMENIAN            EncodingDetector::None
#define PANGO_SCRIPT_BENGALI             EncodingDetector::None
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
#define PANGO_SCRIPT_CHEROKEE            EncodingDetector::None
#define PANGO_SCRIPT_DEVANAGARI          EncodingDetector::None
#define PANGO_SCRIPT_ETHIOPIC            EncodingDetector::None
#define PANGO_SCRIPT_GUJARATI            EncodingDetector::None
#define PANGO_SCRIPT_GURMUKHI            EncodingDetector::None
#define PANGO_SCRIPT_KANNADA             EncodingDetector::None
#define PANGO_SCRIPT_KHMER               EncodingDetector::None
#define PANGO_SCRIPT_LAO                 EncodingDetector::None
#define PANGO_SCRIPT_MALAYALAM           EncodingDetector::None
#define PANGO_SCRIPT_MONGOLIAN           EncodingDetector::None
#define PANGO_SCRIPT_MYANMAR             EncodingDetector::None
#define PANGO_SCRIPT_ORIYA               EncodingDetector::None
#define PANGO_SCRIPT_SINHALA             EncodingDetector::None
#define PANGO_SCRIPT_SYRIAC              EncodingDetector::None
#define PANGO_SCRIPT_TAGALOG             EncodingDetector::None
#define PANGO_SCRIPT_TAMIL               EncodingDetector::None
#define PANGO_SCRIPT_TIBETAN             EncodingDetector::None
#define PANGO_SCRIPT_TELUGU              EncodingDetector::None

/* Second group: scripts that map onto a real detector category. */
#define PANGO_SCRIPT_ARABIC              EncodingDetector::Arabic
#define PANGO_SCRIPT_CYRILLIC            EncodingDetector::Cyrillic
#define PANGO_SCRIPT_GEORGIAN            EncodingDetector::SouthEasternEurope
#define PANGO_SCRIPT_GREEK               EncodingDetector::Greek
#define PANGO_SCRIPT_HEBREW              EncodingDetector::Hebrew
#define PANGO_SCRIPT_LATIN               EncodingDetector::WesternEuropean
#define PANGO_SCRIPT_THAI                EncodingDetector::Thai
00106
00107
00108 static const PangoScriptForLang pango_script_for_lang[] = {
00109 { "aa", { PANGO_SCRIPT_LATIN } },
00110 { "ab", { PANGO_SCRIPT_CYRILLIC } },
00111 { "af", { PANGO_SCRIPT_LATIN } },
00112 { "am", { PANGO_SCRIPT_ETHIOPIC } },
00113 { "ar", { PANGO_SCRIPT_ARABIC } },
00114 { "as", { PANGO_SCRIPT_BENGALI } },
00115 { "ast", { PANGO_SCRIPT_LATIN } },
00116 { "ava", { PANGO_SCRIPT_CYRILLIC } },
00117 { "ay", { PANGO_SCRIPT_LATIN } },
00118 { "az-ir", { PANGO_SCRIPT_ARABIC } },
00119 { "az", { PANGO_SCRIPT_CYRILLIC } },
00120 { "bam", { PANGO_SCRIPT_LATIN } },
00121 { "ba", { PANGO_SCRIPT_CYRILLIC } },
00122 { "be", { PANGO_SCRIPT_CYRILLIC } },
00123 { "bg", { PANGO_SCRIPT_CYRILLIC } },
00124 { "bh", { PANGO_SCRIPT_DEVANAGARI } },
00125 { "bho", { PANGO_SCRIPT_DEVANAGARI } },
00126 { "bi", { PANGO_SCRIPT_LATIN } },
00127 { "bin", { PANGO_SCRIPT_LATIN } },
00128 { "bn", { PANGO_SCRIPT_BENGALI } },
00129 { "bo", { PANGO_SCRIPT_TIBETAN } },
00130 { "br", { PANGO_SCRIPT_LATIN } },
00131 { "bs", { PANGO_SCRIPT_LATIN } },
00132 { "bua", { PANGO_SCRIPT_CYRILLIC } },
00133 { "ca", { PANGO_SCRIPT_LATIN } },
00134 { "ce", { PANGO_SCRIPT_CYRILLIC } },
00135 { "chm", { PANGO_SCRIPT_CYRILLIC } },
00136 { "chr", { PANGO_SCRIPT_CHEROKEE } },
00137 { "ch", { PANGO_SCRIPT_LATIN } },
00138 { "co", { PANGO_SCRIPT_LATIN } },
00139 { "cs", { PANGO_SCRIPT_LATIN } },
00140 { "cu", { PANGO_SCRIPT_CYRILLIC } },
00141 { "cv", { PANGO_SCRIPT_CYRILLIC } },
00142 { "cy", { PANGO_SCRIPT_LATIN } },
00143 { "da", { PANGO_SCRIPT_LATIN } },
00144 { "de", { PANGO_SCRIPT_LATIN } },
00145 { "dz", { PANGO_SCRIPT_TIBETAN } },
00146 { "el", { PANGO_SCRIPT_GREEK } },
00147 { "en", { PANGO_SCRIPT_LATIN } },
00148 { "eo", { PANGO_SCRIPT_LATIN } },
00149 { "es", { PANGO_SCRIPT_LATIN } },
00150
00151 { "et", { EncodingDetector::Baltic } },
00152 { "eu", { PANGO_SCRIPT_LATIN } },
00153 { "fa", { PANGO_SCRIPT_ARABIC } },
00154 { "fi", { PANGO_SCRIPT_LATIN } },
00155 { "fj", { PANGO_SCRIPT_LATIN } },
00156 { "fo", { PANGO_SCRIPT_LATIN } },
00157 { "fr", { PANGO_SCRIPT_LATIN } },
00158 { "ful", { PANGO_SCRIPT_LATIN } },
00159 { "fur", { PANGO_SCRIPT_LATIN } },
00160 { "fy", { PANGO_SCRIPT_LATIN } },
00161 { "ga", { PANGO_SCRIPT_LATIN } },
00162 { "gd", { PANGO_SCRIPT_LATIN } },
00163 { "gez", { PANGO_SCRIPT_ETHIOPIC } },
00164 { "gl", { PANGO_SCRIPT_LATIN } },
00165 { "gn", { PANGO_SCRIPT_LATIN } },
00166 { "gu", { PANGO_SCRIPT_GUJARATI } },
00167 { "gv", { PANGO_SCRIPT_LATIN } },
00168 { "ha", { PANGO_SCRIPT_LATIN } },
00169 { "haw", { PANGO_SCRIPT_LATIN } },
00170 { "he", { PANGO_SCRIPT_HEBREW } },
00171 { "hi", { PANGO_SCRIPT_DEVANAGARI } },
00172 { "ho", { PANGO_SCRIPT_LATIN } },
00173 { "hr", { PANGO_SCRIPT_LATIN } },
00174 { "hu", { PANGO_SCRIPT_LATIN } },
00175 { "hy", { PANGO_SCRIPT_ARMENIAN } },
00176 { "ia", { PANGO_SCRIPT_LATIN } },
00177 { "ibo", { PANGO_SCRIPT_LATIN } },
00178 { "id", { PANGO_SCRIPT_LATIN } },
00179 { "ie", { PANGO_SCRIPT_LATIN } },
00180 { "ik", { PANGO_SCRIPT_CYRILLIC } },
00181 { "io", { PANGO_SCRIPT_LATIN } },
00182 { "is", { PANGO_SCRIPT_LATIN } },
00183 { "it", { PANGO_SCRIPT_LATIN } },
00184 { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL } },
00185
00186 { "ja", { EncodingDetector::Japanese } },
00187 { "kaa", { PANGO_SCRIPT_CYRILLIC } },
00188 { "ka", { PANGO_SCRIPT_GEORGIAN } },
00189 { "ki", { PANGO_SCRIPT_LATIN } },
00190 { "kk", { PANGO_SCRIPT_CYRILLIC } },
00191 { "kl", { PANGO_SCRIPT_LATIN } },
00192 { "km", { PANGO_SCRIPT_KHMER } },
00193 { "kn", { PANGO_SCRIPT_KANNADA } },
00194
00195 { "ko", { EncodingDetector::Korean } },
00196 { "kok", { PANGO_SCRIPT_DEVANAGARI } },
00197 { "ks", { PANGO_SCRIPT_DEVANAGARI } },
00198 { "ku-ir", { PANGO_SCRIPT_ARABIC } },
00199 { "ku", { PANGO_SCRIPT_CYRILLIC } },
00200 { "kum", { PANGO_SCRIPT_CYRILLIC } },
00201 { "kv", { PANGO_SCRIPT_CYRILLIC } },
00202 { "kw", { PANGO_SCRIPT_LATIN } },
00203 { "ky", { PANGO_SCRIPT_CYRILLIC } },
00204 { "la", { PANGO_SCRIPT_LATIN } },
00205 { "lb", { PANGO_SCRIPT_LATIN } },
00206 { "lez", { PANGO_SCRIPT_CYRILLIC } },
00207 { "ln", { PANGO_SCRIPT_LATIN } },
00208 { "lo", { PANGO_SCRIPT_LAO } },
00209
00210 { "lt", { EncodingDetector::Baltic } },
00211
00212 { "lv", { EncodingDetector::Baltic } },
00213 { "mg", { PANGO_SCRIPT_LATIN } },
00214 { "mh", { PANGO_SCRIPT_LATIN } },
00215 { "mi", { PANGO_SCRIPT_LATIN } },
00216 { "mk", { PANGO_SCRIPT_CYRILLIC } },
00217 { "ml", { PANGO_SCRIPT_MALAYALAM } },
00218 { "mn", { PANGO_SCRIPT_MONGOLIAN } },
00219 { "mo", { PANGO_SCRIPT_CYRILLIC } },
00220 { "mr", { PANGO_SCRIPT_DEVANAGARI } },
00221 { "mt", { PANGO_SCRIPT_LATIN } },
00222 { "my", { PANGO_SCRIPT_MYANMAR } },
00223 { "nb", { PANGO_SCRIPT_LATIN } },
00224 { "nds", { PANGO_SCRIPT_LATIN } },
00225 { "ne", { PANGO_SCRIPT_DEVANAGARI } },
00226 { "nl", { PANGO_SCRIPT_LATIN } },
00227 { "nn", { PANGO_SCRIPT_LATIN } },
00228 { "no", { PANGO_SCRIPT_LATIN } },
00229 { "nr", { PANGO_SCRIPT_LATIN } },
00230 { "nso", { PANGO_SCRIPT_LATIN } },
00231 { "ny", { PANGO_SCRIPT_LATIN } },
00232 { "oc", { PANGO_SCRIPT_LATIN } },
00233 { "om", { PANGO_SCRIPT_LATIN } },
00234 { "or", { PANGO_SCRIPT_ORIYA } },
00235 { "os", { PANGO_SCRIPT_CYRILLIC } },
00236 { "pa", { PANGO_SCRIPT_GURMUKHI } },
00237 { "pl", { PANGO_SCRIPT_LATIN } },
00238 { "ps-af", { PANGO_SCRIPT_ARABIC } },
00239 { "ps-pk", { PANGO_SCRIPT_ARABIC } },
00240 { "pt", { PANGO_SCRIPT_LATIN } },
00241 { "rm", { PANGO_SCRIPT_LATIN } },
00242 { "ro", { PANGO_SCRIPT_LATIN } },
00243 { "ru", { PANGO_SCRIPT_CYRILLIC } },
00244 { "sah", { PANGO_SCRIPT_CYRILLIC } },
00245 { "sa", { PANGO_SCRIPT_DEVANAGARI } },
00246 { "sco", { PANGO_SCRIPT_LATIN } },
00247 { "sel", { PANGO_SCRIPT_CYRILLIC } },
00248 { "se", { PANGO_SCRIPT_LATIN } },
00249 { "sh", { PANGO_SCRIPT_CYRILLIC } },
00250 { "si", { PANGO_SCRIPT_SINHALA } },
00251 { "sk", { PANGO_SCRIPT_LATIN } },
00252 { "sl", { PANGO_SCRIPT_LATIN } },
00253 { "sma", { PANGO_SCRIPT_LATIN } },
00254 { "smj", { PANGO_SCRIPT_LATIN } },
00255 { "smn", { PANGO_SCRIPT_LATIN } },
00256 { "sms", { PANGO_SCRIPT_LATIN } },
00257 { "sm", { PANGO_SCRIPT_LATIN } },
00258 { "so", { PANGO_SCRIPT_LATIN } },
00259 { "sq", { PANGO_SCRIPT_LATIN } },
00260 { "sr", { PANGO_SCRIPT_CYRILLIC } },
00261 { "ss", { PANGO_SCRIPT_LATIN } },
00262 { "st", { PANGO_SCRIPT_LATIN } },
00263 { "sv", { PANGO_SCRIPT_LATIN } },
00264 { "sw", { PANGO_SCRIPT_LATIN } },
00265 { "syr", { PANGO_SCRIPT_SYRIAC } },
00266 { "ta", { PANGO_SCRIPT_TAMIL } },
00267 { "te", { PANGO_SCRIPT_TELUGU } },
00268 { "tg", { PANGO_SCRIPT_CYRILLIC } },
00269 { "th", { PANGO_SCRIPT_THAI } },
00270 { "ti-er", { PANGO_SCRIPT_ETHIOPIC } },
00271 { "ti-et", { PANGO_SCRIPT_ETHIOPIC } },
00272 { "tig", { PANGO_SCRIPT_ETHIOPIC } },
00273 { "tk", { PANGO_SCRIPT_CYRILLIC } },
00274 { "tl", { PANGO_SCRIPT_TAGALOG } },
00275 { "tn", { PANGO_SCRIPT_LATIN } },
00276 { "to", { PANGO_SCRIPT_LATIN } },
00277
00278 { "tr", { EncodingDetector::Turkish } },
00279 { "ts", { PANGO_SCRIPT_LATIN } },
00280 { "tt", { PANGO_SCRIPT_CYRILLIC } },
00281 { "tw", { PANGO_SCRIPT_LATIN } },
00282 { "tyv", { PANGO_SCRIPT_CYRILLIC } },
00283 { "ug", { PANGO_SCRIPT_ARABIC } },
00284 { "uk", { PANGO_SCRIPT_CYRILLIC } },
00285 { "ur", { PANGO_SCRIPT_ARABIC } },
00286 { "uz", { PANGO_SCRIPT_CYRILLIC } },
00287 { "ven", { PANGO_SCRIPT_LATIN } },
00288 { "vi", { PANGO_SCRIPT_LATIN } },
00289 { "vot", { PANGO_SCRIPT_LATIN } },
00290 { "vo", { PANGO_SCRIPT_LATIN } },
00291 { "wa", { PANGO_SCRIPT_LATIN } },
00292 { "wen", { PANGO_SCRIPT_LATIN } },
00293 { "wo", { PANGO_SCRIPT_LATIN } },
00294 { "xh", { PANGO_SCRIPT_LATIN } },
00295 { "yap", { PANGO_SCRIPT_LATIN } },
00296 { "yi", { PANGO_SCRIPT_HEBREW } },
00297 { "yo", { PANGO_SCRIPT_LATIN } },
00298
00299 { "zh-cn", { EncodingDetector::ChineseSimplified } },
00300
00301 { "zh-hk", { EncodingDetector::ChineseTraditional } },
00302
00303 { "zh-mo", { EncodingDetector::ChineseTraditional } },
00304
00305 { "zh-sg", { EncodingDetector::ChineseSimplified } },
00306
00307 { "zh-tw", { EncodingDetector::ChineseTraditional } },
00308 { "zu", { PANGO_SCRIPT_LATIN } },
00309 { "\x00", { EncodingDetector::None } }
00310 };
00311
/**
 * MIB numbers this file compares against QTextCodec::mibEnum().
 * Listed in ascending numeric order.
 */
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
00322
00323 static bool is16Bit(QTextCodec* codec)
00324 {
00325 switch (codec->mibEnum())
00326 {
00327 case MibUtf16:
00328 case MibUtf16BE:
00329 case MibUtf16LE:
00330 case MibUcs2:
00331 return true;
00332 default:
00333 return false;
00334 }
00335 }
00336
00337 class EncodingDetectorPrivate
00338 {
00339 public:
00340 QTextCodec *m_codec;
00341 QTextDecoder *m_decoder;
00342 QTextCodec *m_defaultCodec;
00343 QCString m_storeDecoderName;
00344
00345 EncodingDetector::EncodingChoiceSource m_source;
00346 EncodingDetector::AutoDetectScript m_autoDetectLanguage;
00347
00348 bool m_visualRTL : 1;
00349 bool m_seenBody : 1;
00350 bool m_writtingHappened : 1;
00351 bool m_analyzeCalled : 1;
00352 int m_multiByte;
00353
00354 QCString m_bufferForDefferedEncDetection;
00355
00356 EncodingDetectorPrivate()
00357 : m_codec(QTextCodec::codecForMib(MibLatin1))
00358 , m_decoder(m_codec->makeDecoder())
00359 , m_defaultCodec(m_codec)
00360 , m_source(EncodingDetector::DefaultEncoding)
00361 , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
00362 , m_visualRTL(false)
00363 , m_seenBody(false)
00364 , m_writtingHappened(false)
00365 , m_analyzeCalled(false)
00366 , m_multiByte(0)
00367 {
00368 }
00369
00370 EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
00371 : m_codec(codec)
00372 , m_decoder(m_codec->makeDecoder())
00373 , m_defaultCodec(m_codec)
00374 , m_source(source)
00375 , m_autoDetectLanguage(script)
00376 , m_visualRTL(false)
00377 , m_seenBody(false)
00378 , m_writtingHappened(false)
00379 , m_analyzeCalled(false)
00380 , m_multiByte(0)
00381 {
00382 }
00383
00384 ~EncodingDetectorPrivate()
00385 {
00386 delete m_decoder;
00387 }
00388 };
00389
00390
00391 static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
00392 {
00393 for ( int i = 0; i < size; ++i ) {
00394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00398 return "cp1256";
00399 }
00400 }
00401
00402 return "iso-8859-6";
00403 }
00404
00405 static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
00406 {
00407 for ( int i = 0; i < size; ++i ) {
00408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00409 return "cp1257";
00410
00411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00412 return "iso-8859-13";
00413 }
00414
00415 return "iso-8859-13";
00416 }
00417
00418 static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00419 {
00420 QCString charset;
00421 for ( int i = 0; i < size; ++i ) {
00422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00424 return "ibm852";
00425
00426 if ( i + 1 > size )
00427 return "cp1250";
00428 else {
00429 charset = "cp1250";
00430 continue;
00431 }
00432 }
00433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00434 if ( i + 1 > size )
00435 return "iso-8859-2";
00436 else {
00437 if ( charset.isNull() )
00438 charset = "iso-8859-2";
00439 continue;
00440 }
00441 }
00442 }
00443
00444 if ( charset.isNull() )
00445 charset = "iso-8859-3";
00446
00447 return charset.data();
00448 }
00449
00450 static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00451 {
00452 #ifdef DECODE_DEBUG
00453 kWarning() << "EncodingDetector: Cyr heuristics";
00454 #endif
00455
00456
00457
00458 int utf8_mark=0;
00459 int koi_score=0;
00460 int cp1251_score=0;
00461
00462 int koi_st=0;
00463 int cp1251_st=0;
00464
00465
00466
00467
00468 int koi_o_capital=0;
00469 int koi_o=0;
00470 int cp1251_o_capital=0;
00471 int cp1251_o=0;
00472
00473 int koi_a_capital=0;
00474 int koi_a=0;
00475 int cp1251_a_capital=0;
00476 int cp1251_a=0;
00477
00478 int koi_s_capital=0;
00479 int koi_s=0;
00480 int cp1251_s_capital=0;
00481 int cp1251_s=0;
00482
00483 int koi_i_capital=0;
00484 int koi_i=0;
00485 int cp1251_i_capital=0;
00486 int cp1251_i=0;
00487
00488 int cp1251_small_range=0;
00489 int koi_small_range=0;
00490 int ibm866_small_range=0;
00491
00492 int i;
00493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00494 {
00495 if (ptr[i]>0xdf)
00496 {
00497 ++cp1251_small_range;
00498
00499 if (ptr[i]==0xee)
00500 ++cp1251_o;
00501 else if (ptr[i]==0xe0)
00502 ++cp1251_a;
00503 else if (ptr[i]==0xe8)
00504 ++cp1251_i;
00505 else if (ptr[i]==0xf1)
00506 ++cp1251_s;
00507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
00508 ++cp1251_st;
00509
00510 else if (ptr[i]==0xef)
00511 ++koi_o_capital;
00512 else if (ptr[i]==0xe1)
00513 ++koi_a_capital;
00514 else if (ptr[i]==0xe9)
00515 ++koi_i_capital;
00516 else if (ptr[i]==0xf3)
00517 ++koi_s_capital;
00518
00519 }
00520 else if (ptr[i]>0xbf)
00521 {
00522 ++koi_small_range;
00523
00524 if (ptr[i]==0xd0||ptr[i]==0xd1)
00525 ++utf8_mark;
00526 else if (ptr[i]==0xcf)
00527 ++koi_o;
00528 else if (ptr[i]==0xc1)
00529 ++koi_a;
00530 else if (ptr[i]==0xc9)
00531 ++koi_i;
00532 else if (ptr[i]==0xd3)
00533 ++koi_s;
00534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
00535 ++koi_st;
00536
00537 else if (ptr[i]==0xce)
00538 ++cp1251_o_capital;
00539 else if (ptr[i]==0xc0)
00540 ++cp1251_a_capital;
00541 else if (ptr[i]==0xc8)
00542 ++cp1251_i_capital;
00543 else if (ptr[i]==0xd1)
00544 ++cp1251_s_capital;
00545 }
00546 else if (ptr[i]>0x9f && ptr[i]<0xb0)
00547 ++ibm866_small_range;
00548
00549 }
00550
00551
00552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00553 {
00554 return "";
00555 }
00556
00557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00558 {
00559 #ifdef DECODE_DEBUG
00560 kWarning() << "Cyr Enc Detection: UTF8";
00561 #endif
00562 return "UTF-8";
00563 }
00564
00565 if (ibm866_small_range>cp1251_small_range+koi_small_range)
00566 return "ibm866";
00567
00568
00569
00570
00571 if (cp1251_st==0 && koi_st>1)
00572 koi_score+=10;
00573 else if (koi_st==0 && cp1251_st>1)
00574 cp1251_score+=10;
00575
00576 if (cp1251_st && koi_st)
00577 {
00578 if (cp1251_st/koi_st>2)
00579 cp1251_score+=20;
00580 else if (koi_st/cp1251_st>2)
00581 koi_score+=20;
00582 }
00583
00584 if (cp1251_a>koi_a)
00585 cp1251_score+=10;
00586 else if (cp1251_a || koi_a)
00587 koi_score+=10;
00588
00589 if (cp1251_o>koi_o)
00590 cp1251_score+=10;
00591 else if (cp1251_o || koi_o)
00592 koi_score+=10;
00593
00594 if (cp1251_i>koi_i)
00595 cp1251_score+=10;
00596 else if (cp1251_i || koi_i)
00597 koi_score+=10;
00598
00599 if (cp1251_s>koi_s)
00600 cp1251_score+=10;
00601 else if (cp1251_s || koi_s)
00602 koi_score+=10;
00603
00604 if (cp1251_a_capital>koi_a_capital)
00605 cp1251_score+=9;
00606 else if (cp1251_a_capital || koi_a_capital)
00607 koi_score+=9;
00608
00609 if (cp1251_o_capital>koi_o_capital)
00610 cp1251_score+=9;
00611 else if (cp1251_o_capital || koi_o_capital)
00612 koi_score+=9;
00613
00614 if (cp1251_i_capital>koi_i_capital)
00615 cp1251_score+=9;
00616 else if (cp1251_i_capital || koi_i_capital)
00617 koi_score+=9;
00618
00619 if (cp1251_s_capital>koi_s_capital)
00620 cp1251_score+=9;
00621 else if (cp1251_s_capital || koi_s_capital)
00622 koi_score+=9;
00623 #ifdef DECODE_DEBUG
00624 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00625 #endif
00626 if (abs(koi_score-cp1251_score)<10)
00627 {
00628
00629 cp1251_score=cp1251_small_range;
00630 koi_score=koi_small_range;
00631 }
00632 if (cp1251_score>koi_score)
00633 return "cp1251";
00634 else
00635 return "koi8-u";
00636
00637
00638
00639
00640
00641
00642
00643
00644 }
00645
00646 static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
00647 {
00648 for ( int i = 0; i < size; ++i ) {
00649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00652 return "cp1253";
00653 }
00654 }
00655
00656 return "iso-8859-7";
00657 }
00658
00659 static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
00660 {
00661 for ( int i = 0; i < size; ++i ) {
00662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00665 return "cp1255";
00666 }
00667
00668 if ( ptr[ i ] == 0xDF )
00669 return "iso-8859-8-i";
00670 }
00671
00672 return "iso-8859-8-i";
00673 }
00674
00675 static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
00676 {
00677 JapaneseCode kc;
00678
00679 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00680 case JapaneseCode::JIS:
00681 return "jis7";
00682 case JapaneseCode::EUC:
00683 return "eucjp";
00684 case JapaneseCode::SJIS:
00685 return "sjis";
00686 case JapaneseCode::UTF8:
00687 return "utf8";
00688 default:
00689 break;
00690 }
00691
00692 return "";
00693 }
00694
00695 static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
00696 {
00697 for ( int i = 0; i < size; ++i ) {
00698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00699 return "cp1254";
00700 }
00701 }
00702
00703 return "iso-8859-9";
00704 }
00705
00706 static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00707 {
00708 uint nonansi_count=0;
00709 for (int i=0; i<size; ++i)
00710 {
00711 if (ptr[i]>0x79)
00712 {
00713 ++nonansi_count;
00714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00715 {
00716 return "UTF-8";
00717 }
00718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
00719 {
00720 return "cp1252";
00721 }
00722 }
00723
00724 }
00725
00726 if (nonansi_count>0)
00727 return "iso-8859-15";
00728
00729 return "";
00730 }
00731
00732 bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
00733 {
00734 if (d->m_codec->mibEnum()!=MibUtf8)
00735 return false;
00736
00737
00738
00739
00740
00741 static const unsigned char highest1Bits = 0x80;
00742 static const unsigned char highest2Bits = 0xC0;
00743 static const unsigned char highest3Bits = 0xE0;
00744 static const unsigned char highest4Bits = 0xF0;
00745 static const unsigned char highest5Bits = 0xF8;
00746
00747 for (int i=0; i<length; ++i)
00748 {
00749 unsigned char c = data[i];
00750
00751 if (d->m_multiByte>0)
00752 {
00753 if ((c & highest2Bits) == 0x80)
00754 {
00755 --(d->m_multiByte);
00756 continue;
00757 }
00758 #ifdef DECODE_DEBUG
00759 kWarning() << "EncDetector: Broken UTF8";
00760 #endif
00761 return true;
00762 }
00763
00764
00765 if ((c & highest1Bits) == 0x00)
00766 continue;
00767
00768
00769 if ((c & highest3Bits) == 0xC0)
00770 {
00771 d->m_multiByte = 1;
00772 continue;
00773 }
00774
00775
00776 if ((c & highest4Bits) == 0xE0)
00777 {
00778 d->m_multiByte = 2;
00779 continue;
00780 }
00781
00782
00783 if ((c & highest5Bits) == 0xF0)
00784 {
00785 d->m_multiByte = 3;
00786 continue;
00787 }
00788 #ifdef DECODE_DEBUG
00789 kWarning() << "EncDetector:_Broken UTF8";
00790 #endif
00791 return true;
00792 }
00793 return false;
00794 }
00795
00796 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
00797 {
00798 }
00799
00800 EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00801 d(new EncodingDetectorPrivate(codec,source,script))
00802 {
00803 }
00804
00805 EncodingDetector::~EncodingDetector()
00806 {
00807 delete d;
00808 }
00809
00810 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
00811 {
00812 d->m_autoDetectLanguage=lang;
00813 }
00814 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
00815 {
00816 return d->m_autoDetectLanguage;
00817 }
00818
00819 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
00820 {
00821 return d->m_source;
00822 }
00823
00824 const char* EncodingDetector::encoding() const
00825 {
00826 d->m_storeDecoderName = d->m_codec->name();
00827 return d->m_storeDecoderName.data();
00828 }
00829
00830 bool EncodingDetector::visuallyOrdered() const
00831 {
00832 return d->m_visualRTL;
00833 }
00834
00835
00836
00837
00838
00839
00840 QTextDecoder* EncodingDetector::decoder()
00841 {
00842 return d->m_decoder;
00843 }
00844
00845 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00846 {
00847 QTextCodec *codec;
00848 QCString enc(_encoding);
00849 if(enc.isEmpty())
00850 {
00851 if (type==DefaultEncoding)
00852 codec=d->m_defaultCodec;
00853 else
00854 return false;
00855 }
00856 else
00857 {
00858
00859
00860 enc = enc.lower();
00861
00862 if(enc=="visual")
00863 enc="iso8859-8";
00864 bool b;
00865 codec = KGlobal::charsets()->codecForName(enc, b);
00866 if (!b)
00867 return false;
00868 }
00869
00870 if (d->m_codec->mibEnum()==codec->mibEnum())
00871 return true;
00872
00873 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00874 {
00875
00876
00877 return false;
00878 }
00879
00880 if (codec->mibEnum() == Mib8859_8)
00881 {
00882
00883 codec = QTextCodec::codecForName("iso8859-8-i");
00884
00885
00886 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00887 d->m_visualRTL = true;
00888 }
00889
00890 d->m_codec = codec;
00891 d->m_source = type;
00892 delete d->m_decoder;
00893 d->m_decoder = d->m_codec->makeDecoder();
00894 #ifdef DECODE_DEBUG
00895 kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
00896 #endif
00897 return true;
00898 }
00899
00900 bool EncodingDetector::analyze(const QByteArray &data)
00901 {
00902 return analyze( data.data(), data.size() );
00903 }
00904
00905 bool EncodingDetector::analyze(const char *data, int len)
00906 {
00907
00908
00909
00910 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00911 {
00912
00913 const uchar *udata = (const uchar *)data;
00914 uchar c1 = *udata++;
00915 uchar c2 = *udata++;
00916 uchar c3 = *udata++;
00917
00918
00919 const char *autoDetectedEncoding;
00920 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
00921 {
00922 autoDetectedEncoding = "ISO-10646-UCS-2";
00923 }
00924 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
00925 {
00926 autoDetectedEncoding = "UTF-8";
00927 }
00928 else if (c1 == 0x00 || c2 == 0x00)
00929 {
00930 uchar c4 = *udata++;
00931 uchar c5 = *udata++;
00932 uchar c6 = *udata++;
00933 uchar c7 = *udata++;
00934 uchar c8 = *udata++;
00935 uchar c9 = *udata++;
00936 uchar c10 = *udata++;
00937
00938 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
00939 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
00940 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
00941 autoDetectedEncoding = "ISO-10646-UCS-2";
00942 else
00943 autoDetectedEncoding = 0;
00944 }
00945 else
00946 {
00947 autoDetectedEncoding = 0;
00948 }
00949
00950
00951 if (autoDetectedEncoding != 0)
00952 {
00953 d->m_source = BOM;
00954 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
00955 assert(d->m_codec);
00956
00957 delete d->m_decoder;
00958 d->m_decoder = d->m_codec->makeDecoder();
00959 #ifdef DECODE_DEBUG
00960 kWarning() << "Detection by BOM";
00961 #endif
00962 if (is16Bit(d->m_codec) && c2==0x00)
00963 {
00964
00965 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
00966 d->m_decoder->toUnicode(reverseUtf16, 2);
00967 }
00968 return true;
00969 }
00970 }
00971
00972
00973 if (d->m_source==UserChosenEncoding)
00974 {
00975 #ifdef DECODE_DEBUG
00976 kWarning() << "EncodingDetector: UserChosenEncoding exit ";
00977 #endif
00978
00979 if (errorsIfUtf8(data, len))
00980 setEncoding("",DefaultEncoding);
00981 return true;
00982 }
00983 #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
00984 if (!d->m_seenBody)
00985 {
00986
00987
00988
00989 const char *ptr = data;
00990 const char *pEnd = data+len;
00991
00992 while(ptr != pEnd)
00993 {
00994 if(*ptr!='<')
00995 {
00996 ++ptr;
00997 continue;
00998 }
00999 ++ptr;
01000
01001 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
01002 {
01003 ptr += 3;
01004 skipComment(ptr, pEnd);
01005 continue;
01006 }
01007
01008
01009 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
01010 {
01011 const char *end = ptr;
01012 while (*end != '>' && end < pEnd)
01013 end++;
01014 if (*end == '\0' || end == pEnd)
01015 break;
01016 QCString str(ptr, end - ptr + 1);
01017 int length;
01018 int pos = findXMLEncoding(str, length);
01019
01020 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
01021 {
01022 return true;
01023 }
01024 }
01025
01026
01027 while (
01028 !((*ptr >= 'a') && (*ptr <= 'z') ||
01029 (*ptr >= 'A') && (*ptr <= 'Z'))
01030 && ptr < pEnd
01031 )
01032 ++ptr;
01033
01034 char tmp[5];
01035 int length=0;
01036 const char* max=ptr+4;
01037 if (pEnd<max)
01038 max=pEnd;
01039 while (
01040 ((*ptr >= 'a') && (*ptr <= 'z') ||
01041 (*ptr >= 'A') && (*ptr <= 'Z') ||
01042 (*ptr >= '0') && (*ptr <= '9'))
01043 && ptr < max
01044 )
01045 {
01046 tmp[length] = tolower( *ptr );
01047 ++ptr;
01048 ++length;
01049 }
01050 tmp[length] = 0;
01051 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01052 {
01053
01054 const char* end = ptr;
01055 while(*end != '>' && *end != '\0' && end<pEnd)
01056 end++;
01057
01058 QCString str( ptr, (end-ptr)+1);
01059 str = str.lower();
01060 int pos=0;
01061
01062
01063 if( (pos = str.find("charset")) == -1)
01064 continue;
01065 pos+=6;
01066
01067 if( (pos = str.find('=', pos)) == -1)
01068 continue;
01069
01070
01071 while (pos < (int)str.length() && str[pos] <= ' ')
01072 ++pos;
01073 if ( pos == (int)str.length())
01074 continue;
01075
01076 int endpos = pos;
01077 while( endpos < str.length() &&
01078 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01079 && str[endpos] != ';' && str[endpos] != '>') )
01080 ++endpos;
01081 #ifdef DECODE_DEBUG
01082 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01083 #endif
01084 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01085 return true;
01086 }
01087 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01088 {
01089 d->m_seenBody=true;
01090 break;
01091 }
01092 }
01093 }
01094
01095 if (d->m_source==EncodingFromHTTPHeader)
01096 return true;
01097 #endif
01098
01099 if (len < 1)
01100 {
01101 setEncoding("",DefaultEncoding);
01102 return false;
01103 }
01104 #ifdef DECODE_DEBUG
01105 kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
01106 #endif
01107
01108 switch ( d->m_autoDetectLanguage )
01109 {
01110 case EncodingDetector::Arabic:
01111 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01112
01113 case EncodingDetector::Baltic:
01114 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01115
01116 case EncodingDetector::CentralEuropean:
01117 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01118 break;
01119 case EncodingDetector::Cyrillic:
01120 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01121
01122 case EncodingDetector::Greek:
01123 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01124
01125 case EncodingDetector::Hebrew:
01126 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01127
01128 case EncodingDetector::Japanese:
01129 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01130
01131 case EncodingDetector::Turkish:
01132 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01133
01134 case EncodingDetector::WesternEuropean:
01135 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01136 return true;
01137 else if (d->m_defaultCodec->mibEnum()==MibLatin1)
01138 {
01139 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01140 }
01141 else
01142 {
01143 return setEncoding("",DefaultEncoding);
01144 }
01145
01146 case EncodingDetector::SemiautomaticDetection:
01147 case EncodingDetector::ChineseSimplified:
01148 case EncodingDetector::ChineseTraditional:
01149 case EncodingDetector::Korean:
01150 case EncodingDetector::Thai:
01151 case EncodingDetector::Unicode:
01152 case EncodingDetector::NorthernSaami:
01153 case EncodingDetector::SouthEasternEurope:
01154 case EncodingDetector::None:
01155
01156
01157 break;
01158 }
01159
01160 setEncoding("",DefaultEncoding);
01161 return true;
01162 }
01163
01164
01165 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
01166 {
01167 if (lang.isEmpty())
01168 return EncodingDetector::None;
01169 else if (lang==i18n("@item Text character set", "Unicode"))
01170 return EncodingDetector::Unicode;
01171 else if (lang==i18n("@item Text character set", "Cyrillic"))
01172 return EncodingDetector::Cyrillic;
01173 else if (lang==i18n("@item Text character set", "Western European"))
01174 return EncodingDetector::WesternEuropean;
01175 else if (lang==i18n("@item Text character set", "Central European"))
01176 return EncodingDetector::CentralEuropean;
01177 else if (lang==i18n("@item Text character set", "Greek"))
01178 return EncodingDetector::Greek;
01179 else if (lang==i18n("@item Text character set", "Hebrew"))
01180 return EncodingDetector::Hebrew;
01181 else if (lang==i18n("@item Text character set", "Turkish"))
01182 return EncodingDetector::Turkish;
01183 else if (lang==i18n("@item Text character set", "Japanese"))
01184 return EncodingDetector::Japanese;
01185 else if (lang==i18n("@item Text character set", "Baltic"))
01186 return EncodingDetector::Baltic;
01187 else if (lang==i18n("@item Text character set", "Arabic"))
01188 return EncodingDetector::Arabic;
01189
01190 return EncodingDetector::None;
01191 }
01192
01193 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
01194 {
01195 switch (script)
01196 {
01197 case EncodingDetector::Arabic:
01198 return true;
01199 case EncodingDetector::Baltic:
01200 return true;
01201 case EncodingDetector::CentralEuropean:
01202 return true;
01203 case EncodingDetector::Cyrillic:
01204 return true;
01205 case EncodingDetector::Greek:
01206 return true;
01207 case EncodingDetector::Hebrew:
01208 return true;
01209 case EncodingDetector::Japanese:
01210 return true;
01211 case EncodingDetector::Turkish:
01212 return true;
01213 case EncodingDetector::WesternEuropean:
01214 return true;
01215 case EncodingDetector::ChineseTraditional:
01216 return true;
01217 case EncodingDetector::ChineseSimplified:
01218 return true;
01219 case EncodingDetector::Unicode:
01220 return true;
01221 break;
01222 default:
01223 return false;
01224 }
01225 }
01226
01227 QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
01228 {
01229 switch (script)
01230 {
01231 case EncodingDetector::Arabic:
01232 return i18n("@item Text character set", "Arabic");
01233 break;
01234 case EncodingDetector::Baltic:
01235 return i18n("@item Text character set", "Baltic");
01236 break;
01237 case EncodingDetector::CentralEuropean:
01238 return i18n("@item Text character set", "Central European");
01239 break;
01240 case EncodingDetector::Cyrillic:
01241 return i18n("@item Text character set", "Cyrillic");
01242 break;
01243 case EncodingDetector::Greek:
01244 return i18n("@item Text character set", "Greek");
01245 break;
01246 case EncodingDetector::Hebrew:
01247 return i18n("@item Text character set", "Hebrew");
01248 break;
01249 case EncodingDetector::Japanese:
01250 return i18n("@item Text character set", "Japanese");
01251 break;
01252 case EncodingDetector::Turkish:
01253 return i18n("@item Text character set", "Turkish");
01254 break;
01255 case EncodingDetector::WesternEuropean:
01256 return i18n("@item Text character set", "Western European");
01257 break;
01258 case EncodingDetector::ChineseTraditional:
01259 return i18n("@item Text character set", "Chinese Traditional");
01260 break;
01261 case EncodingDetector::ChineseSimplified:
01262 return i18n("@item Text character set", "Chinese Simplified");
01263 break;
01264 case EncodingDetector::Korean:
01265 return i18n("@item Text character set", "Korean");
01266 break;
01267 case EncodingDetector::Thai:
01268 return i18n("@item Text character set", "Thai");
01269 break;
01270 case EncodingDetector::Unicode:
01271 return i18n("@item Text character set", "Unicode");
01272 break;
01273
01274 default:
01275 return QString();
01276
01277 }
01278 }
01279
01280 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
01281 {
01282
01283
01284 const char *langStr = pango_script_for_lang[0].lang;
01285
01286 for ( int i = 0; langStr; i++ ) {
01287 langStr = pango_script_for_lang[i].lang;
01288
01289 if ( lc.startsWith( QString::fromAscii( langStr ) ) )
01290 return pango_script_for_lang[i].scripts[0];
01291 }
01292 return None;
01293 }
01294
01295 #undef DECODE_DEBUG
01296