make check_metas case insensitive for name

pull/46/head
eric 2018-01-03 11:54:48 -05:00
parent 3a11a408d3
commit c8837c3c74
1 changed file with 15 additions and 17 deletions

View File

@ -92,8 +92,7 @@ class BaseScraper(object):
value = '' value = ''
list_mode = attrs.pop('list_mode', 'longest') list_mode = attrs.pop('list_mode', 'longest')
for meta_name in meta_list: for meta_name in meta_list:
attrs['name'] = meta_name attrs['name'] = re.compile(meta_name, flags=re.I)
metas = self.doc.find_all('meta', attrs=attrs) metas = self.doc.find_all('meta', attrs=attrs)
if len(metas) == 0: if len(metas) == 0:
# some sites put schema.org metadata in metas # some sites put schema.org metadata in metas
@ -151,24 +150,23 @@ class BaseScraper(object):
# #
def get_genre(self): def get_genre(self):
value = self.check_metas(['DC.Type', 'dc.type', 'og:type']) value = self.check_metas([r'dc\.type', 'og:type'])
if value and value in ('Text.Book', 'book'): if value and value in ('Text.Book', 'book'):
self.set('genre', 'book') self.set('genre', 'book')
def get_title(self): def get_title(self):
value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'og:title', 'title']) value = self.check_metas([r'dc\.title', 'citation_title', 'og:title', 'title'])
if not value: if not value:
value = self.fetch_one_el_content('title') value = self.fetch_one_el_content('title')
self.set('title', value) self.set('title', value)
def get_language(self): def get_language(self):
value = self.check_metas(['DC.Language', 'dc.language', 'language', 'inLanguage']) value = self.check_metas([r'dc\.language', 'language', 'inLanguage'])
self.set('language', value) self.set('language', value)
def get_description(self): def get_description(self):
value = self.check_metas([ value = self.check_metas([
'DC.Description', r'dc\.description',
'dc.description',
'og:description', 'og:description',
'description' 'description'
]) ])
@ -196,14 +194,14 @@ class BaseScraper(object):
return isbns return isbns
def get_identifiers(self): def get_identifiers(self):
value = self.check_metas(['DC.Identifier.URI']) value = self.check_metas([r'DC\.Identifier\.URI'])
if not value: if not value:
value = self.doc.select_one('link[rel=canonical]') value = self.doc.select_one('link[rel=canonical]')
value = value['href'] if value else None value = value['href'] if value else None
value = identifier_cleaner('http', quiet=True)(value) value = identifier_cleaner('http', quiet=True)(value)
if value: if value:
self.identifiers['http'] = value self.identifiers['http'] = value
value = self.check_metas(['DC.Identifier.DOI', 'citation_doi']) value = self.check_metas([r'DC\.Identifier\.DOI', 'citation_doi'])
value = identifier_cleaner('doi', quiet=True)(value) value = identifier_cleaner('doi', quiet=True)(value)
if value: if value:
self.identifiers['doi'] = value self.identifiers['doi'] = value
@ -247,7 +245,7 @@ class BaseScraper(object):
self.set('subjects', re.split(' *[;,] *', value)) self.set('subjects', re.split(' *[;,] *', value))
def get_publisher(self): def get_publisher(self):
value = self.check_metas(['citation_publisher', 'DC.Source']) value = self.check_metas(['citation_publisher', r'DC\.Source'])
if value: if value:
self.set('publisher', value) self.set('publisher', value)
@ -255,20 +253,20 @@ class BaseScraper(object):
value = self.get_itemprop('datePublished', list_mode='one_item') value = self.get_itemprop('datePublished', list_mode='one_item')
if not value: if not value:
value = self.check_metas([ value = self.check_metas([
'citation_publication_date', 'DC.Date.issued', 'datePublished', 'citation_publication_date', r'DC\.Date\.issued', 'datePublished',
'books:release_date', 'book:release_date' 'books:release_date', 'book:release_date'
]) ])
if value: if value:
self.set('publication_date', value) self.set('publication_date', value)
def get_author_list(self): def get_author_list(self):
value_list = self.check_metas([ value_list = self.get_itemprop('author')
'DC.Creator.PersonalName',
'citation_author',
'author',
], list_mode='list')
if not value_list: if not value_list:
value_list = self.get_itemprop('author') value_list = self.check_metas([
r'DC\.Creator\.PersonalName',
'citation_author',
'author',
], list_mode='list')
if not value_list: if not value_list:
return [] return []
return value_list return value_list