make check_metas case insensitive for name
parent
3a11a408d3
commit
c8837c3c74
|
@ -92,8 +92,7 @@ class BaseScraper(object):
|
||||||
value = ''
|
value = ''
|
||||||
list_mode = attrs.pop('list_mode', 'longest')
|
list_mode = attrs.pop('list_mode', 'longest')
|
||||||
for meta_name in meta_list:
|
for meta_name in meta_list:
|
||||||
attrs['name'] = meta_name
|
attrs['name'] = re.compile(meta_name, flags=re.I)
|
||||||
|
|
||||||
metas = self.doc.find_all('meta', attrs=attrs)
|
metas = self.doc.find_all('meta', attrs=attrs)
|
||||||
if len(metas) == 0:
|
if len(metas) == 0:
|
||||||
# some sites put schema.org metadata in metas
|
# some sites put schema.org metadata in metas
|
||||||
|
@ -151,24 +150,23 @@ class BaseScraper(object):
|
||||||
#
|
#
|
||||||
|
|
||||||
def get_genre(self):
|
def get_genre(self):
|
||||||
value = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
|
value = self.check_metas([r'dc\.type', 'og:type'])
|
||||||
if value and value in ('Text.Book', 'book'):
|
if value and value in ('Text.Book', 'book'):
|
||||||
self.set('genre', 'book')
|
self.set('genre', 'book')
|
||||||
|
|
||||||
def get_title(self):
|
def get_title(self):
|
||||||
value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'og:title', 'title'])
|
value = self.check_metas([r'dc\.title', 'citation_title', 'og:title', 'title'])
|
||||||
if not value:
|
if not value:
|
||||||
value = self.fetch_one_el_content('title')
|
value = self.fetch_one_el_content('title')
|
||||||
self.set('title', value)
|
self.set('title', value)
|
||||||
|
|
||||||
def get_language(self):
|
def get_language(self):
|
||||||
value = self.check_metas(['DC.Language', 'dc.language', 'language', 'inLanguage'])
|
value = self.check_metas([r'dc\.language', 'language', 'inLanguage'])
|
||||||
self.set('language', value)
|
self.set('language', value)
|
||||||
|
|
||||||
def get_description(self):
|
def get_description(self):
|
||||||
value = self.check_metas([
|
value = self.check_metas([
|
||||||
'DC.Description',
|
r'dc\.description',
|
||||||
'dc.description',
|
|
||||||
'og:description',
|
'og:description',
|
||||||
'description'
|
'description'
|
||||||
])
|
])
|
||||||
|
@ -196,14 +194,14 @@ class BaseScraper(object):
|
||||||
return isbns
|
return isbns
|
||||||
|
|
||||||
def get_identifiers(self):
|
def get_identifiers(self):
|
||||||
value = self.check_metas(['DC.Identifier.URI'])
|
value = self.check_metas([r'DC\.Identifier\.URI'])
|
||||||
if not value:
|
if not value:
|
||||||
value = self.doc.select_one('link[rel=canonical]')
|
value = self.doc.select_one('link[rel=canonical]')
|
||||||
value = value['href'] if value else None
|
value = value['href'] if value else None
|
||||||
value = identifier_cleaner('http', quiet=True)(value)
|
value = identifier_cleaner('http', quiet=True)(value)
|
||||||
if value:
|
if value:
|
||||||
self.identifiers['http'] = value
|
self.identifiers['http'] = value
|
||||||
value = self.check_metas(['DC.Identifier.DOI', 'citation_doi'])
|
value = self.check_metas([r'DC\.Identifier\.DOI', 'citation_doi'])
|
||||||
value = identifier_cleaner('doi', quiet=True)(value)
|
value = identifier_cleaner('doi', quiet=True)(value)
|
||||||
if value:
|
if value:
|
||||||
self.identifiers['doi'] = value
|
self.identifiers['doi'] = value
|
||||||
|
@ -247,7 +245,7 @@ class BaseScraper(object):
|
||||||
self.set('subjects', re.split(' *[;,] *', value))
|
self.set('subjects', re.split(' *[;,] *', value))
|
||||||
|
|
||||||
def get_publisher(self):
|
def get_publisher(self):
|
||||||
value = self.check_metas(['citation_publisher', 'DC.Source'])
|
value = self.check_metas(['citation_publisher', r'DC\.Source'])
|
||||||
if value:
|
if value:
|
||||||
self.set('publisher', value)
|
self.set('publisher', value)
|
||||||
|
|
||||||
|
@ -255,20 +253,20 @@ class BaseScraper(object):
|
||||||
value = self.get_itemprop('datePublished', list_mode='one_item')
|
value = self.get_itemprop('datePublished', list_mode='one_item')
|
||||||
if not value:
|
if not value:
|
||||||
value = self.check_metas([
|
value = self.check_metas([
|
||||||
'citation_publication_date', 'DC.Date.issued', 'datePublished',
|
'citation_publication_date', r'DC\.Date\.issued', 'datePublished',
|
||||||
'books:release_date', 'book:release_date'
|
'books:release_date', 'book:release_date'
|
||||||
])
|
])
|
||||||
if value:
|
if value:
|
||||||
self.set('publication_date', value)
|
self.set('publication_date', value)
|
||||||
|
|
||||||
def get_author_list(self):
|
def get_author_list(self):
|
||||||
value_list = self.check_metas([
|
value_list = self.get_itemprop('author')
|
||||||
'DC.Creator.PersonalName',
|
|
||||||
'citation_author',
|
|
||||||
'author',
|
|
||||||
], list_mode='list')
|
|
||||||
if not value_list:
|
if not value_list:
|
||||||
value_list = self.get_itemprop('author')
|
value_list = self.check_metas([
|
||||||
|
r'DC\.Creator\.PersonalName',
|
||||||
|
'citation_author',
|
||||||
|
'author',
|
||||||
|
], list_mode='list')
|
||||||
if not value_list:
|
if not value_list:
|
||||||
return []
|
return []
|
||||||
return value_list
|
return value_list
|
||||||
|
|
Loading…
Reference in New Issue