better handling of language codes

pull/91/head
eric 2018-04-13 14:39:03 -04:00
parent e433c13108
commit a6039e4015
2 changed files with 27 additions and 46 deletions

@@ -8,63 +8,40 @@ import urlparse
 import requests
 
+from regluit.utils.lang import get_language_code
 from .utils import get_soup
 
 # utility functions for converting lists of individual items into individual items
 
 # let's do a mapping of the DOAB languages into the language codes used
-# by Google Books API and unglue.it
-# https://en.wikipedia.org/wiki/ISO_639-1 (2 letter)
-# http://stackoverflow.com/questions/1665667/python-list-filtering-and-transformation
-# from looking at the DOAB languages that don't map obviously to ISO 639.1, I looked
-# at a few example. the pattern I discern and conjecture as holding for the remainder
-# is that ^ means a separator for languages in
-# works with multiple languages. I will map to the first language.
+# mostly, we just handle mispellings
 # also null -> xx
-LANG_MAP = dict([
-    ('English', 'en'),
-    ('German', 'de'),
-    ('ger', 'de'),
-    ('de', 'de'),
-    ('fr', 'fr'),
-    ('Italian', 'it'),
-    ('Dutch', 'nt'),
-    ('english', 'en'),
-    ('en', 'en'),
-    ('French', 'fr'),
-    ('En', 'en'),
-    ('italian', 'it'),
-    ('de^it^rm', 'de'), # de /it/rm http://www.doabooks.org/doab?func=search&template=&query=Sprachatlas+des+Dolomitenladinischen+und+angrenzender+Gebiete
-    ('de^English', 'de'), # eg., http://www.doabooks.org/doab?func=search&template=&query=Robert+Neumann%3A+Mit+eigener+Feder
-    ('german', 'de'),
-    ('Czech', 'cs'),
-    ('Deutsch', 'de'),
-    ('Italian / English', 'it'), # http://www.doabooks.org/doab?func=search&template=&query=JLIS
-    ('English; French', 'en'),
-    ('Englilsh ; Cree', 'en'),
-    ('English; French; Cree; Michif; Chinese; Ukrainian', 'en'),
-    ('Englisch', 'en'),
-    ('Spanish', 'es'),
-    ('English;', 'en'),
-    ('de^la', 'de'),
-    ('de^English^fr^es', 'de'),
-    ('English; Italian', 'en'),
-    ('Espanol', 'es'),
-    ('Welsh', 'cy'),
-    ('English; Czech', 'en'),
-    ('Englilsh', 'en'),
-    ('German;', 'de'),
-    ('German; English', 'de'),
-    ('Russian;', 'ru')
+EXTRA_LANG_MAP = dict([
+    (u'deutsch', 'de'),
+    (u'eng', 'en'),
+    (u'englilsh', 'en'),
+    (u'englilsh', 'en'),
+    (u'englisch', 'en'),
+    (u'espanol', 'es'),
+    (u'ger', 'de'),
+    (u'norwegian', 'no'),
+    (u'por', 'pt'),
+    (u'portugese', 'pt'),
+    (u'spa', 'es'),
 ])
 
+sep = re.compile(r'[ \-;^,/]+')
 def doab_lang_to_iso_639_1(lang):
     if lang is None or not lang:
         return "xx"
-    return LANG_MAP.get(lang, 'xx')
+    else:
+        lang = sep.split(lang)[0]
+        code = get_language_code(lang)
+        if code:
+            return code
+        else:
+            return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
 
 DOMAIN_TO_PROVIDER = dict([
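
Editor's note, for illustration only (not part of the commit): a rough sketch of how the rewritten doab_lang_to_iso_639_1 is expected to behave, assuming Django's global LANGUAGES list supplies the usual ('en', 'English'), ('de', 'German'), and ('cy', 'Welsh') entries. The module path in the import is a guess, not shown in this diff.

# illustrative expectations only; module path assumed
from regluit.core.loaders.doab_utils import doab_lang_to_iso_639_1

doab_lang_to_iso_639_1(None)               # 'xx' -- null/empty still maps to 'xx'
doab_lang_to_iso_639_1('de^it^rm')         # 'de' -- split on '^', keep the first token, already a code
doab_lang_to_iso_639_1('English; French')  # 'en' -- first token is a name Django's LANGUAGES knows
doab_lang_to_iso_639_1('Englilsh ; Cree')  # 'en' -- misspelling caught by EXTRA_LANG_MAP
doab_lang_to_iso_639_1('Welsh')            # 'cy' -- no longer needs its own hand-written entry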

@@ -1,6 +1,10 @@
 from django.conf.global_settings import LANGUAGES
 
 lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
+code2lang = dict(LANGUAGES)
 
 def get_language_code(language):
-    return lang2code.get(language.lower().strip(), '')
+    language = language.lower().strip()
+    if language in code2lang:
+        return language
+    return lang2code.get(language, '')