regluit/utils/lang.py

82 lines
1.9 KiB
Python
Raw Normal View History

2019-03-28 01:22:37 +00:00
import re
from django.conf.global_settings import LANGUAGES
2019-03-28 01:46:25 +00:00
# Lower-cased language name -> language code, built from Django's default
# LANGUAGES list of (code, name) pairs.
lang2code = {name.lower(): code for code, name in LANGUAGES}

# Language code -> language name, as listed in LANGUAGES.
code2lang = {code: name for code, name in LANGUAGES}

# A bare token of two or three lowercase ASCII letters.
iso639 = re.compile(r'^[a-z][a-z][a-z]?$')
def get_language_code(language):
    """Return a language code for a free-form language string.

    The input is lower-cased, stripped and split on the module-level ``sep``
    pattern; the first non-empty token is then resolved, in this order:

    1. a code already present in ``code2lang`` is returned as is;
    2. a language name present in ``lang2code`` is mapped to its code;
    3. a misspelling or alternate name in ``EXTRA_LANG_MAP`` is mapped
       to its code;
    4. any other token matching ``iso639`` (two or three lowercase
       letters) is accepted unchanged.

    Returns '' when the input is empty/None or nothing matches.
    """
    if not language:
        return ''

    # Take the first non-empty token rather than blindly taking element 0:
    # input that starts with a separator (e.g. "[en]" or "'english'")
    # splits into a leading '' which would otherwise discard the value.
    tokens = (token.strip() for token in sep.split(language.lower().strip()))
    language = next((token for token in tokens if token), '')
    if not language:
        return ''

    # already a known code
    if language in code2lang:
        return language

    # language names (english)
    if language in lang2code:
        return lang2code[language]

    # misspellings and other language names
    if language in EXTRA_LANG_MAP:
        return EXTRA_LANG_MAP[language]

    # accept 2 and 3 letter codes
    if iso639.match(language):
        return language

    return ''
2019-03-28 01:46:25 +00:00
# Map the DOAB language strings onto the language codes used here.
# Mostly, this just handles misspellings;
# also null -> xx
# One or more separator characters between items of a multi-valued language
# string: space, ; ^ , / | ' " ] [ tab, newline, carriage return and hyphen.
sep = re.compile(r'[ ;^,/\|\'\"\]\[\t\n\r\-]+')

# Two lowercase letters, a hyphen, two uppercase letters (e.g. en-US).
lang_and_locale = re.compile(r'^[a-z][a-z]\-[A-Z][A-Z]$')

# Misspellings, abbreviations and non-English language names, keyed by the
# lower-cased input token and mapped to the language code to use.
EXTRA_LANG_MAP = {
    'chinese': 'zh',  # was mapped to 'de' (German), which is wrong
    'deutsch': 'de',
    'eng': 'en',
    'engli': 'en',
    'englilsh': 'en',
    'englisch': 'en',
    'espanol': 'es',
    'ger': 'de',
    'fra': 'fr',
    'fre': 'fr',
    'francese': 'fr',
    'ita': 'it',
    'itali': 'it',
    'italiano': 'it',
    'norwegian': 'no',
    'por': 'pt',
    'portugese': 'pt',
    'slovene': 'sl',
    'spa': 'es',
    'spagnolo': 'es',
    'un': 'xx',
}
def lang_to_language_code(lang):
    """Return a language code for a raw language value, or '' if none is found.

    * ``None`` returns ''.
    * A value that, once stripped, matches ``lang_and_locale`` (e.g. ``en-US``)
      is returned unchanged.
    * Otherwise the value is split on ``sep`` and the FIRST non-empty item is
      passed to ``get_language_code``; its result (or '') is returned.
    """
    if lang is None:
        return ''
    lang = lang.strip()

    # get codes like en-US
    if lang_and_locale.match(lang):
        return lang

    # Get the first item of lists. The earlier loop used `continue`, which
    # kept overwriting the value and so selected the LAST item: "en-us"
    # resolved to "us" and "pt-br" to "br". If every item is empty the
    # stripped input is passed through unchanged.
    first_item = next((item for item in sep.split(lang) if item), lang)

    return get_language_code(first_item) or ''