1 // tinygettext - A gettext replacement that works directly on .po files
2 // Copyright (C) 2006 Ingo Ruhnke <grumbel@gmx.de>
4 // This program is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU General Public License
6 // as published by the Free Software Foundation; either version 2
7 // of the License, or (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include "language.hpp"

#include <ctype.h>
#include <stddef.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>
24 namespace tinygettext {
/**
 * One row of the built-in language table: a language code with an
 * optional country and modifier, plus a human readable name.
 */
struct LanguageSpec {
  /** Language code: "de", "en", ... */
  const char* language;

  /** Country code: "BR", "DE", ..., can be 0 */
  const char* country;

  /** Modifier/Variant: "Latn", "ije", "latin"..., can be 0 */
  const char* modifier;

  /** Language name: "German", "English", "French", ... */
  const char* name;
};

/**
 * Language Definitions
 *
 * The table is terminated by an entry whose language is NULL;
 * Language::from_spec() walks the array until it reaches that entry.
 * Entry order is significant: when several entries score equally in
 * Language::match(), the one listed first wins.
 */
LanguageSpec languages[] = {
  { "aa", 0, 0, "Afar" },
  { "af", 0, 0, "Afrikaans" },
  { "af", "ZA", 0, "Afrikaans (South Africa)" },
  { "am", 0, 0, "Amharic" },
  { "ar", 0, 0, "Arabic" },
  { "ar", "AR", 0, "Arabic (Argentina)" },
  { "ar", "OM", 0, "Arabic (Oman)" },
  { "ar", "SA", 0, "Arabic (Saudi Arabia)" },
  { "ar", "SY", 0, "Arabic (Syrian Arab Republic)" },
  { "ar", "TN", 0, "Arabic (Tunisia)" },
  { "as", 0, 0, "Assamese" },
  { "ast", 0, 0, "Asturian" },
  { "ay", 0, 0, "Aymara" },
  { "az", 0, 0, "Azerbaijani" },
  { "az", "IR", 0, "Azerbaijani (Iran)" },
  { "be", 0, 0, "Belarusian" },
  { "be", 0, "latin", "Belarusian" },
  { "bg", 0, 0, "Bulgarian" },
  { "bg", "BG", 0, "Bulgarian (Bulgaria)" },
  { "bn", 0, 0, "Bengali" },
  { "bn", "BD", 0, "Bengali (Bangladesh)" },
  { "bn", "IN", 0, "Bengali (India)" },
  { "bo", 0, 0, "Tibetan" },
  { "br", 0, 0, "Breton" },
  { "bs", 0, 0, "Bosnian" },
  { "bs", "BA", 0, "Bosnian (Bosnia/Herzegovina)" },
  { "bs", "BS", 0, "Bosnian (Bahamas)" },
  { "ca", "ES", "valencia", "Catalan (valencia)" },
  { "ca", "ES", 0, "Catalan (Spain)" },
  { "ca", 0, "valencia", "Catalan (valencia)" },
  { "cmn", 0, 0, "Mandarin" },
  { "ca", 0, 0, "Catalan" },
  { "co", 0, 0, "Corsican" },
  { "cs", 0, 0, "Czech" },
  { "cs", "CZ", 0, "Czech (Czech Republic)" },
  { "cy", 0, 0, "Welsh" },
  { "cy", "GB", 0, "Welsh (Great Britain)" },
  { "cz", 0, 0, "Unknown language" },
  { "da", 0, 0, "Danish" },
  { "da", "DK", 0, "Danish (Denmark)" },
  { "de", 0, 0, "German" },
  { "de", "AT", 0, "German (Austria)" },
  { "de", "CH", 0, "German (Switzerland)" },
  { "de", "DE", 0, "German (Germany)" },
  { "dk", 0, 0, "Unknown language" },
  { "dz", 0, 0, "Dzongkha" },
  { "el", 0, 0, "Greek" },
  { "el", "GR", 0, "Greek (Greece)" },
  { "en", 0, 0, "English" },
  { "en", "AU", 0, "English (Australia)" },
  { "en", "CA", 0, "English (Canada)" },
  { "en", "GB", 0, "English (Great Britain)" },
  { "en", "US", 0, "English (United States)" },
  { "en", "ZA", 0, "English (South Africa)" },
  { "en", 0, "boldquot", "English" },
  { "en", 0, "quot", "English" },
  { "en", "US", "piglatin", "English" },
  { "eo", 0, 0, "Esperanto" },
  { "es", 0, 0, "Spanish" },
  { "es", "AR", 0, "Spanish (Argentina)" },
  { "es", "CL", 0, "Spanish (Chile)" },
  { "es", "CO", 0, "Spanish (Colombia)" },
  { "es", "CR", 0, "Spanish (Costa Rica)" },
  { "es", "DO", 0, "Spanish (Dominican Republic)" },
  { "es", "EC", 0, "Spanish (Ecuador)" },
  { "es", "ES", 0, "Spanish (Spain)" },
  { "es", "GT", 0, "Spanish (Guatemala)" },
  { "es", "HN", 0, "Spanish (Honduras)" },
  { "es", "LA", 0, "Spanish (Laos)" },
  { "es", "MX", 0, "Spanish (Mexico)" },
  { "es", "NI", 0, "Spanish (Nicaragua)" },
  { "es", "PA", 0, "Spanish (Panama)" },
  { "es", "PE", 0, "Spanish (Peru)" },
  { "es", "PR", 0, "Spanish (Puerto Rico)" },
  { "es", "SV", 0, "Spanish (El Salvador)" },
  { "es", "UY", 0, "Spanish (Uruguay)" },
  { "es", "VE", 0, "Spanish (Venezuela)" },
  { "et", 0, 0, "Estonian" },
  { "et", "EE", 0, "Estonian (Estonia)" },
  { "et", "ET", 0, "Estonian (Ethiopia)" },
  { "eu", 0, 0, "Basque" },
  { "eu", "ES", 0, "Basque (Spain)" },
  { "fa", 0, 0, "Persian" },
  { "fa", "AF", 0, "Persian (Afghanistan)" },
  { "fa", "IR", 0, "Persian (Iran)" },
  { "fi", 0, 0, "Finnish" },
  { "fi", "FI", 0, "Finnish (Finland)" },
  { "fo", 0, 0, "Faroese" },
  { "fo", "FO", 0, "Faeroese (Faroe Islands)" },
  { "fr", 0, 0, "French" },
  { "fr", "CA", 0, "French (Canada)" },
  { "fr", "CH", 0, "French (Switzerland)" },
  { "fr", "FR", 0, "French (France)" },
  { "fr", "LU", 0, "French (Luxembourg)" },
  { "fy", 0, 0, "Frisian" },
  { "ga", 0, 0, "Irish" },
  { "gd", 0, 0, "Gaelic Scots" },
  { "gl", 0, 0, "Galician" },
  { "gl", "ES", 0, "Galician (Spain)" },
  { "gn", 0, 0, "Guarani" },
  { "gu", 0, 0, "Gujarati" },
  { "gv", 0, 0, "Manx" },
  { "ha", 0, 0, "Hausa" },
  { "he", 0, 0, "Hebrew" },
  { "he", "IL", 0, "Hebrew (Israel)" },
  { "hi", 0, 0, "Hindi" },
  { "hr", 0, 0, "Croatian" },
  { "hr", "HR", 0, "Croatian (Croatia)" },
  { "hu", 0, 0, "Hungarian" },
  { "hu", "HU", 0, "Hungarian (Hungary)" },
  { "hy", 0, 0, "Armenian" },
  { "ia", 0, 0, "Interlingua" },
  { "id", 0, 0, "Indonesian" },
  { "id", "ID", 0, "Indonesian (Indonesia)" },
  { "is", 0, 0, "Icelandic" },
  { "is", "IS", 0, "Icelandic (Iceland)" },
  { "it", 0, 0, "Italian" },
  { "it", "CH", 0, "Italian (Switzerland)" },
  { "it", "IT", 0, "Italian (Italy)" },
  { "iu", 0, 0, "Inuktitut" },
  { "ja", 0, 0, "Japanese" },
  { "ja", "JP", 0, "Japanese (Japan)" },
  { "ka", 0, 0, "Georgian" },
  { "kk", 0, 0, "Kazakh" },
  { "kl", 0, 0, "Kalaallisut" },
  { "km", 0, 0, "Khmer" },
  { "km", "KH", 0, "Khmer (Cambodia)" },
  { "kn", 0, 0, "Kannada" },
  { "ko", 0, 0, "Korean" },
  { "ko", "KR", 0, "Korean (Korea)" },
  { "ku", 0, 0, "Kurdish" },
  { "kw", 0, 0, "Cornish" },
  { "ky", 0, 0, "Kirghiz" },
  { "la", 0, 0, "Latin" },
  { "lo", 0, 0, "Lao" },
  { "lt", 0, 0, "Lithuanian" },
  { "lt", "LT", 0, "Lithuanian (Lithuania)" },
  { "lv", 0, 0, "Latvian" },
  { "lv", "LV", 0, "Latvian (Latvia)" },
  { "mg", 0, 0, "Malagasy" },
  { "mi", 0, 0, "Maori" },
  { "mk", 0, 0, "Macedonian" },
  { "mk", "MK", 0, "Macedonian (Macedonia)" },
  { "ml", 0, 0, "Malayalam" },
  { "mn", 0, 0, "Mongolian" },
  { "mr", 0, 0, "Marathi" },
  { "ms", 0, 0, "Malay" },
  { "ms", "MY", 0, "Malay (Malaysia)" },
  { "mt", 0, 0, "Maltese" },
  { "my", 0, 0, "Burmese" },
  { "my", "MM", 0, "Burmese (Myanmar)" },
  { "nb", 0, 0, "Norwegian Bokmal" },
  // "\xc3\xa5" is U+00E5 (a with ring above) spelled as UTF-8 bytes so the
  // literal does not depend on the encoding of this source file.
  { "nb", "NO", 0, "Norwegian Bokm\xc3\xa5l (Norway)" },
  { "ne", 0, 0, "Nepali" },
  { "nl", 0, 0, "Dutch" },
  { "nl", "BE", 0, "Dutch (Belgium)" },
  { "nl", "NL", 0, "Dutch (Netherlands)" },
  { "nn", 0, 0, "Norwegian Nynorsk" },
  { "nn", "NO", 0, "Norwegian Nynorsk (Norway)" },
  { "no", 0, 0, "Norwegian" },
  { "no", "NO", 0, "Norwegian (Norway)" },
  { "no", "NY", 0, "Norwegian (NY)" },
  { "nr", 0, 0, "Ndebele, South" },
  { "oc", 0, 0, "Occitan post 1500" },
  { "om", 0, 0, "Oromo" },
  { "or", 0, 0, "Oriya" },
  { "pa", 0, 0, "Punjabi" },
  { "pl", 0, 0, "Polish" },
  { "pl", "PL", 0, "Polish (Poland)" },
  { "ps", 0, 0, "Pashto" },
  { "pt", 0, 0, "Portuguese" },
  { "pt", "BR", 0, "Brazilian" },
  { "pt", "PT", 0, "Portuguese (Portugal)" },
  { "qu", 0, 0, "Quechua" },
  { "rm", 0, 0, "Rhaeto-Romance" },
  { "ro", 0, 0, "Romanian" },
  { "ro", "RO", 0, "Romanian (Romania)" },
  { "ru", 0, 0, "Russian" },
  { "ru", "RU", 0, "Russian (Russia)" },
  { "rw", 0, 0, "Kinyarwanda" },
  { "sa", 0, 0, "Sanskrit" },
  { "sd", 0, 0, "Sindhi" },
  { "se", 0, 0, "Sami" },
  { "se", "NO", 0, "Sami (Norway)" },
  { "si", 0, 0, "Sinhalese" },
  { "sk", 0, 0, "Slovak" },
  { "sk", "SK", 0, "Slovak (Slovakia)" },
  { "sl", 0, 0, "Slovenian" },
  { "sl", "SI", 0, "Slovenian (Slovenia)" },
  { "sl", "SL", 0, "Slovenian (Sierra Leone)" },
  { "sm", 0, 0, "Samoan" },
  { "so", 0, 0, "Somali" },
  { "sp", 0, 0, "Unknown language" },
  { "sq", 0, 0, "Albanian" },
  { "sq", "AL", 0, "Albanian (Albania)" },
  { "sr", 0, 0, "Serbian" },
  { "sr", "YU", 0, "Serbian (Yugoslavia)" },
  { "sr", 0, "ije", "Serbian" },
  { "sr", 0, "latin", "Serbian" },
  { "sr", 0, "Latn", "Serbian" },
  { "ss", 0, 0, "Swati" },
  { "st", 0, 0, "Sotho" },
  { "sv", 0, 0, "Swedish" },
  { "sv", "SE", 0, "Swedish (Sweden)" },
  { "sv", "SV", 0, "Swedish (El Salvador)" },
  { "sw", 0, 0, "Swahili" },
  { "ta", 0, 0, "Tamil" },
  { "te", 0, 0, "Telugu" },
  { "tg", 0, 0, "Tajik" },
  { "th", 0, 0, "Thai" },
  { "th", "TH", 0, "Thai (Thailand)" },
  { "ti", 0, 0, "Tigrinya" },
  { "tk", 0, 0, "Turkmen" },
  { "tl", 0, 0, "Tagalog" },
  { "to", 0, 0, "Tonga" },
  { "tr", 0, 0, "Turkish" },
  { "tr", "TR", 0, "Turkish (Turkey)" },
  { "ts", 0, 0, "Tsonga" },
  { "tt", 0, 0, "Tatar" },
  { "ug", 0, 0, "Uighur" },
  { "uk", 0, 0, "Ukrainian" },
  { "uk", "UA", 0, "Ukrainian (Ukraine)" },
  { "ur", 0, 0, "Urdu" },
  { "ur", "PK", 0, "Urdu (Pakistan)" },
  { "uz", 0, 0, "Uzbek" },
  { "uz", 0, "cyrillic", "Uzbek" },
  { "vi", 0, 0, "Vietnamese" },
  { "vi", "VN", 0, "Vietnamese (Vietnam)" },
  { "wa", 0, 0, "Walloon" },
  { "wo", 0, 0, "Wolof" },
  { "xh", 0, 0, "Xhosa" },
  { "yi", 0, 0, "Yiddish" },
  { "yo", 0, 0, "Yoruba" },
  { "zh", 0, 0, "Chinese" },
  { "zh", "CN", 0, "Chinese (simplified)" },
  { "zh", "HK", 0, "Chinese (Hong Kong)" },
  { "zh", "TW", 0, "Chinese (traditional)" },
  { "zu", 0, 0, "Zulu" },
  { NULL, 0, 0, NULL } // sentinel: end of table
};
/**
 * Builds the alias table consulted by resolve_language_alias().
 *
 * Keys are compared against a lowercased copy of the requested name.
 */
static std::map<std::string, std::string>
make_language_aliases()
{
  std::map<std::string, std::string> language_aliases;

  // FIXME: Many of those are not useful for us, since we leave
  // encoding to the app, not to the language, we could/should
  // also match against all language names, not just aliases from
  // locale.alias

  // NOTE(review): the keys containing upper case letters ("ja_JP",
  // "ko_KR", "no_NO", ...) can never be found, because the lookup key
  // is lowercased first. They are kept unchanged so that the result
  // for those names stays what it has always been.

  // Aliases taken from /etc/locale.alias
  language_aliases["bokmal"]           = "nb_NO.ISO-8859-1";
  // Non-ASCII keys are spelled as UTF-8 byte escapes so that they do
  // not depend on the encoding of this source file.
  language_aliases["bokm\xc3\xa5l"]    = "nb_NO.ISO-8859-1";
  language_aliases["catalan"]          = "ca_ES.ISO-8859-1";
  language_aliases["croatian"]         = "hr_HR.ISO-8859-2";
  language_aliases["czech"]            = "cs_CZ.ISO-8859-2";
  language_aliases["danish"]           = "da_DK.ISO-8859-1";
  language_aliases["dansk"]            = "da_DK.ISO-8859-1";
  language_aliases["deutsch"]          = "de_DE.ISO-8859-1";
  language_aliases["dutch"]            = "nl_NL.ISO-8859-1";
  language_aliases["eesti"]            = "et_EE.ISO-8859-1";
  language_aliases["estonian"]         = "et_EE.ISO-8859-1";
  language_aliases["finnish"]          = "fi_FI.ISO-8859-1";
  // Literal is split so that the following 'a' is not read as part of
  // the hexadecimal escape.
  language_aliases["fran\xc3\xa7" "ais"] = "fr_FR.ISO-8859-1";
  language_aliases["french"]           = "fr_FR.ISO-8859-1";
  language_aliases["galego"]           = "gl_ES.ISO-8859-1";
  language_aliases["galician"]         = "gl_ES.ISO-8859-1";
  language_aliases["german"]           = "de_DE.ISO-8859-1";
  language_aliases["greek"]            = "el_GR.ISO-8859-7";
  language_aliases["hebrew"]           = "he_IL.ISO-8859-8";
  language_aliases["hrvatski"]         = "hr_HR.ISO-8859-2";
  language_aliases["hungarian"]        = "hu_HU.ISO-8859-2";
  language_aliases["icelandic"]        = "is_IS.ISO-8859-1";
  language_aliases["italian"]          = "it_IT.ISO-8859-1";
  language_aliases["japanese"]         = "ja_JP.eucJP";
  language_aliases["japanese.euc"]     = "ja_JP.eucJP";
  language_aliases["ja_JP"]            = "ja_JP.eucJP";
  language_aliases["ja_JP.ujis"]       = "ja_JP.eucJP";
  language_aliases["japanese.sjis"]    = "ja_JP.SJIS";
  language_aliases["korean"]           = "ko_KR.eucKR";
  language_aliases["korean.euc"]       = "ko_KR.eucKR";
  language_aliases["ko_KR"]            = "ko_KR.eucKR";
  language_aliases["lithuanian"]       = "lt_LT.ISO-8859-13";
  language_aliases["no_NO"]            = "nb_NO.ISO-8859-1";
  language_aliases["no_NO.ISO-8859-1"] = "nb_NO.ISO-8859-1";
  language_aliases["norwegian"]        = "nb_NO.ISO-8859-1";
  language_aliases["nynorsk"]          = "nn_NO.ISO-8859-1";
  language_aliases["polish"]           = "pl_PL.ISO-8859-2";
  language_aliases["portuguese"]       = "pt_PT.ISO-8859-1";
  language_aliases["romanian"]         = "ro_RO.ISO-8859-2";
  language_aliases["russian"]          = "ru_RU.ISO-8859-5";
  language_aliases["slovak"]           = "sk_SK.ISO-8859-2";
  language_aliases["slovene"]          = "sl_SI.ISO-8859-2";
  language_aliases["slovenian"]        = "sl_SI.ISO-8859-2";
  language_aliases["spanish"]          = "es_ES.ISO-8859-1";
  language_aliases["swedish"]          = "sv_SE.ISO-8859-1";
  language_aliases["thai"]             = "th_TH.TIS-620";
  language_aliases["turkish"]          = "tr_TR.ISO-8859-9";

  return language_aliases;
}

/**
 * Translates a language alias such as "german" or "Deutsch" into a
 * locale string such as "de_DE.ISO-8859-1".
 *
 * @param name  alias to look up; letter case is ignored
 * @return the locale string registered for the alias, or @p name
 *         itself, unchanged, when no alias matches
 */
std::string
resolve_language_alias(const std::string& name)
{
  typedef std::map<std::string, std::string> Aliases;

  // Built once on first use and never modified afterwards.
  static const Aliases language_aliases = make_language_aliases();

  std::string name_lowercase(name);
  for(std::string::size_type i = 0; i < name_lowercase.size(); ++i)
  {
    // tolower() requires a value representable as unsigned char (or
    // EOF); a plain char holding a byte >= 0x80 may be negative.
    name_lowercase[i] = static_cast<char>(tolower(static_cast<unsigned char>(name_lowercase[i])));
  }

  const Aliases::const_iterator i = language_aliases.find(name_lowercase);
  if (i != language_aliases.end())
  {
    return i->second;
  }
  else
  {
    return name;
  }
}
364 Language::from_spec(const std::string& language, const std::string& country, const std::string& modifier)
366 static std::map<std::string, std::vector<LanguageSpec*> > language_map;
368 if (language_map.empty())
369 { // Init language_map
370 for(int i = 0; languages[i].language != NULL; ++i)
371 language_map[languages[i].language].push_back(&languages[i]);
374 std::map<std::string, std::vector<LanguageSpec*> >::iterator i = language_map.find(language);
375 if (i != language_map.end())
377 std::vector<LanguageSpec*>& lst = i->second;
379 LanguageSpec tmpspec;
380 tmpspec.language = language.c_str();
381 tmpspec.country = country.c_str();
382 tmpspec.modifier = modifier.c_str();
383 Language tmplang(&tmpspec);
385 LanguageSpec* best_match = 0;
386 int best_match_score = 0;
387 for(std::vector<LanguageSpec*>::iterator j = lst.begin(); j != lst.end(); ++j)
388 { // Search for the language that best matches the given spec, value country more then modifier
389 int score = Language::match(Language(*j), tmplang);
391 if (score > best_match_score)
394 best_match_score = score;
398 return Language(best_match);
407 Language::from_name(const std::string& spec_str)
409 return from_env(resolve_language_alias(spec_str));
413 Language::from_env(const std::string& env)
415 // Split LANGUAGE_COUNTRY.CODESET@MODIFIER into parts
416 std::string::size_type ln = env.find('_');
417 std::string::size_type dt = env.find('.');
418 std::string::size_type at = env.find('@');
420 std::string language;
423 std::string modifier;
425 //std::cout << ln << " " << dt << " " << at << std::endl;
427 language = env.substr(0, std::min(std::min(ln, dt), at));
429 if (ln != std::string::npos && ln+1 < env.size()) // _
431 country = env.substr(ln+1, (std::min(dt, at) == std::string::npos) ? std::string::npos : std::min(dt, at) - (ln+1));
434 if (dt != std::string::npos && dt+1 < env.size()) // .
436 codeset = env.substr(dt+1, (at == std::string::npos) ? std::string::npos : (at - (dt+1)));
439 if (at != std::string::npos && at+1 < env.size()) // @
441 modifier = env.substr(at+1);
444 return from_spec(language, country, modifier);
447 Language::Language(LanguageSpec* language_spec_)
448 : language_spec(language_spec_)
458 Language::match(const Language& lhs, const Language& rhs)
460 if (lhs.get_language() != rhs.get_language())
466 static int match_tbl[3][3] = {
467 // modifier match, wildchard, miss
468 { 9, 8, 5 }, // country match
469 { 7, 6, 3 }, // country wildcard
470 { 4, 2, 1 }, // country miss
474 if (lhs.get_country() == rhs.get_country())
476 else if (lhs.get_country().empty() || rhs.get_country().empty())
482 if (lhs.get_modifier() == rhs.get_modifier())
484 else if (lhs.get_modifier().empty() || rhs.get_modifier().empty())
489 return match_tbl[c][m];
494 Language::get_language() const
497 return language_spec->language;
503 Language::get_country() const
505 if (language_spec && language_spec->country)
506 return language_spec->country;
512 Language::get_modifier() const
514 if (language_spec && language_spec->modifier)
515 return language_spec->modifier;
521 Language::get_name() const
524 return language_spec->name;
530 Language::str() const
535 var += language_spec->language;
536 if (language_spec->country)
539 var += language_spec->country;
542 if (language_spec->modifier)
545 var += language_spec->modifier;
556 Language::operator==(const Language& rhs)
558 return language_spec == rhs.language_spec;
562 Language::operator!=(const Language& rhs)
564 return language_spec != rhs.language_spec;
567 } // namespace tinygettext