better handling of language codes
parent
e433c13108
commit
a6039e4015
|
@ -8,63 +8,40 @@ import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from regluit.utils.lang import get_language_code
|
||||||
from .utils import get_soup
|
from .utils import get_soup
|
||||||
|
|
||||||
# utility functions for converting lists of individual items into individual items
|
# utility functions for converting lists of individual items into individual items
|
||||||
|
|
||||||
# let's do a mapping of the DOAB languages into the language codes used
|
# let's do a mapping of the DOAB languages into the language codes used
|
||||||
# by Google Books API and unglue.it
|
# mostly, we just handle misspellings
|
||||||
# https://en.wikipedia.org/wiki/ISO_639-1 (2 letter)
|
|
||||||
# http://stackoverflow.com/questions/1665667/python-list-filtering-and-transformation
|
|
||||||
|
|
||||||
# from looking at the DOAB languages that don't map obviously to ISO 639.1, I looked
|
|
||||||
# at a few examples. The pattern I discern and conjecture as holding for the remainder
|
|
||||||
# is that ^ means a separator for languages in
|
|
||||||
# works with multiple languages. I will map to the first language.
|
|
||||||
|
|
||||||
# also null -> xx
|
# also null -> xx
|
||||||
|
|
||||||
LANG_MAP = dict([
|
EXTRA_LANG_MAP = dict([
|
||||||
('English', 'en'),
|
(u'deutsch', 'de'),
|
||||||
('German', 'de'),
|
(u'eng', 'en'),
|
||||||
('ger', 'de'),
|
(u'englilsh', 'en'),
|
||||||
('de', 'de'),
|
(u'englilsh', 'en'),
|
||||||
('fr', 'fr'),
|
(u'englisch', 'en'),
|
||||||
('Italian', 'it'),
|
(u'espanol', 'es'),
|
||||||
('Dutch', 'nt'),
|
(u'ger', 'de'),
|
||||||
('english', 'en'),
|
(u'norwegian', 'no'),
|
||||||
('en', 'en'),
|
(u'por', 'pt'),
|
||||||
('French', 'fr'),
|
(u'portugese', 'pt'),
|
||||||
('En', 'en'),
|
(u'spa', 'es'),
|
||||||
('italian', 'it'),
|
|
||||||
('de^it^rm', 'de'), # de /it/rm http://www.doabooks.org/doab?func=search&template=&query=Sprachatlas+des+Dolomitenladinischen+und+angrenzender+Gebiete
|
|
||||||
('de^English', 'de'), # eg., http://www.doabooks.org/doab?func=search&template=&query=Robert+Neumann%3A+Mit+eigener+Feder
|
|
||||||
('german', 'de'),
|
|
||||||
('Czech', 'cs'),
|
|
||||||
('Deutsch', 'de'),
|
|
||||||
('Italian / English', 'it'), # http://www.doabooks.org/doab?func=search&template=&query=JLIS
|
|
||||||
('English; French', 'en'),
|
|
||||||
('Englilsh ; Cree', 'en'),
|
|
||||||
('English; French; Cree; Michif; Chinese; Ukrainian', 'en'),
|
|
||||||
('Englisch', 'en'),
|
|
||||||
('Spanish', 'es'),
|
|
||||||
('English;', 'en'),
|
|
||||||
('de^la', 'de'),
|
|
||||||
('de^English^fr^es', 'de'),
|
|
||||||
('English; Italian', 'en'),
|
|
||||||
('Espanol', 'es'),
|
|
||||||
('Welsh', 'cy'),
|
|
||||||
('English; Czech', 'en'),
|
|
||||||
('Englilsh', 'en'),
|
|
||||||
('German;', 'de'),
|
|
||||||
('German; English', 'de'),
|
|
||||||
('Russian;', 'ru')
|
|
||||||
])
|
])
|
||||||
|
|
||||||
|
sep = re.compile(r'[ \-;^,/]+')
|
||||||
def doab_lang_to_iso_639_1(lang):
|
def doab_lang_to_iso_639_1(lang):
|
||||||
if lang is None or not lang:
|
if lang is None or not lang:
|
||||||
return "xx"
|
return "xx"
|
||||||
return LANG_MAP.get(lang, 'xx')
|
else:
|
||||||
|
lang = sep.split(lang)[0]
|
||||||
|
code = get_language_code(lang)
|
||||||
|
if code:
|
||||||
|
return code
|
||||||
|
else:
|
||||||
|
return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
|
||||||
|
|
||||||
|
|
||||||
DOMAIN_TO_PROVIDER = dict([
|
DOMAIN_TO_PROVIDER = dict([
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
from django.conf.global_settings import LANGUAGES
|
from django.conf.global_settings import LANGUAGES
|
||||||
|
|
||||||
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
|
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
|
||||||
|
code2lang = dict(LANGUAGES)
|
||||||
|
|
||||||
def get_language_code(language):
|
def get_language_code(language):
|
||||||
return lang2code.get(language.lower().strip(), '')
|
language = language.lower().strip()
|
||||||
|
if language in code2lang:
|
||||||
|
return language
|
||||||
|
return lang2code.get(language, '')
|
||||||
|
|
Loading…
Reference in New Issue