Compare commits

...

4 Commits

Author SHA1 Message Date
Eric Hellman e86b7d5ba4
Merge pull request #1066 from Gluejar/maintenance-2024
Maintenance 2024
2024-11-22 15:32:21 -05:00
eric fc0d32e541 add some indexes
5% improvement
2024-11-22 15:31:06 -05:00
eric 7bd6fba096 fix bookloader when a bookdata is empty 2024-11-22 14:34:17 -05:00
eric 37a5486abe fix mit, add redliclibros, nai010 bookshop 2024-11-22 14:33:33 -05:00
7 changed files with 53 additions and 20 deletions

View File

@ -1113,13 +1113,14 @@ def add_from_bookdatas(bookdatas):
editions = [] editions = []
for bookdata in bookdatas: for bookdata in bookdatas:
edition = work = None edition = work = None
loader = BasePandataLoader(bookdata.base) if bookdata and bookdata.metadata:
pandata = Pandata() loader = BasePandataLoader(bookdata.base)
pandata.metadata = bookdata.metadata pandata = Pandata()
for metadata in pandata.get_edition_list(): pandata.metadata = bookdata.metadata
edition = loader.load_from_pandata(metadata, work) for metadata in pandata.get_edition_list():
work = edition.work edition = loader.load_from_pandata(metadata, work)
loader.load_ebooks(pandata, edition) work = edition.work
if edition: loader.load_ebooks(pandata, edition)
editions.append(edition) if edition:
editions.append(edition)
return editions return editions

View File

@ -85,6 +85,7 @@ STOREPROVIDERS = [
'manchesteruniversitypress.co.uk', 'manchesteruniversitypress.co.uk',
'mitpress.mit.edu', 'mitpress.mit.edu',
'munishop.muni.cz', 'munishop.muni.cz',
'nai010.com',
'nomos-shop.de', 'nomos-shop.de',
'palgrave.com', 'palgrave.com',
'placedeslibraires.fr', 'placedeslibraires.fr',

View File

@ -138,6 +138,7 @@ CMPPROVIDERS = [
'omp.ub.rub.de', 'omp.ub.rub.de',
'penerbit.brin.go.id', 'penerbit.brin.go.id',
'press.uni.lodz.pl', 'press.uni.lodz.pl',
'redliclibros.com',
'Scholars Portal', 'Scholars Portal',
'teiresias-supplements.mcgill.ca', 'teiresias-supplements.mcgill.ca',
'textbooks.open.tudelft.nl', 'textbooks.open.tudelft.nl',
@ -938,9 +939,11 @@ def harvest_muse(ebook):
def harvest_mitpress(ebook): def harvest_mitpress(ebook):
def selector(doc):
return doc.select('a.book-pdfLink[href]')
def chap_selector(doc): def chap_selector(doc):
return doc.select('a.section-pdfLink[href]') return doc.select('a.section-pdfLink[href]')
return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=0) return harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0)
def harvest_ios(ebook): def harvest_ios(ebook):

View File

@ -65,13 +65,16 @@ class BaseScraper(object):
for review in self.doc.find_all(itemtype="http://schema.org/Review"): for review in self.doc.find_all(itemtype="http://schema.org/Review"):
review.clear() review.clear()
self.get_all() self.get_all()
if not self.metadata.get('title', None): if not self.metadata.get('title', None):
self.set('title', '!!! missing title !!!') self.set('title', '!!! missing title !!!')
if not self.metadata.get('language', None): if not self.metadata.get('language', None):
self.set('language', 'en') self.set('language', 'en')
self.metadata['identifiers'] = self.identifiers
else:
self.metadata = None
else: else:
self.metadata = {} self.metadata = None
self.metadata['identifiers'] = self.identifiers
# #
# utilities # utilities

View File

@ -146,9 +146,9 @@ class SpringerScraper(BaseScraper):
self.set('rights_url', lic_url) self.set('rights_url', lic_url)
def get_pubdate(self): def get_pubdate(self):
pubinfo = self.doc.select_one('#copyright-info') pubinfo = self.doc.find(attrs={"data-test": "electronic_isbn_publication_date"})
if not pubinfo: if not pubinfo:
pubinfo = self.doc.select_one('header .c-article-identifiers') pubinfo = self.doc.find(attrs={"data-test": "softcover_isbn_publication_date"})
if pubinfo: if pubinfo:
for yearstring in pubinfo.stripped_strings: for yearstring in pubinfo.stripped_strings:
yearmatch = HAS_YEAR.search(yearstring) yearmatch = HAS_YEAR.search(yearstring)

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.29 on 2024-11-22 15:25
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add database indexes to ``Subject.name`` and ``Work.is_free``.

    Each operation is an ``AlterField`` that redeclares an existing field of
    the ``core`` app with ``db_index=True``.
    """

    # Applied on top of the preceding migration of the same app.
    dependencies = [
        ('core', '0028_auto_20240819_1450'),
    ]

    operations = [
        # NOTE(review): this field is also declared ``unique=True``; Django's
        # field reference says a unique field already gets an index, so the
        # extra ``db_index=True`` looks redundant -- confirm before relying on
        # it. It is kept here so the migration keeps mirroring the model.
        migrations.AlterField(
            model_name='subject',
            name='name',
            field=models.CharField(
                db_index=True,
                max_length=200,
                unique=True,
            ),
        ),
        # Index the boolean flag on Work.
        migrations.AlterField(
            model_name='work',
            name='is_free',
            field=models.BooleanField(
                db_index=True,
                default=False,
            ),
        ),
    ]

View File

@ -130,7 +130,7 @@ class Work(models.Model):
# repurposed earliest_publication to actually be publication range # repurposed earliest_publication to actually be publication range
publication_range = models.CharField(max_length=50, null=True, blank=True) publication_range = models.CharField(max_length=50, null=True, blank=True)
featured = models.DateTimeField(null=True, blank=True, db_index=True,) featured = models.DateTimeField(null=True, blank=True, db_index=True,)
is_free = models.BooleanField(default=False) is_free = models.BooleanField(default=False, db_index=True)
related = models.ManyToManyField('self', symmetrical=False, blank=True, through='WorkRelation', related_name='reverse_related') related = models.ManyToManyField('self', symmetrical=False, blank=True, through='WorkRelation', related_name='reverse_related')
age_level = models.CharField(max_length=5, choices=AGE_LEVEL_CHOICES, default='', blank=True) age_level = models.CharField(max_length=5, choices=AGE_LEVEL_CHOICES, default='', blank=True)
@ -787,7 +787,7 @@ AUTHMATCH = re.compile(r'\s*!([a-z]+):?\s+(.*)')
class Subject(models.Model): class Subject(models.Model):
created = models.DateTimeField(auto_now_add=True) created = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=200, unique=True) name = models.CharField(max_length=200, unique=True, db_index=True)
works = models.ManyToManyField("Work", related_name="subjects") works = models.ManyToManyField("Work", related_name="subjects")
is_visible = models.BooleanField(default=True) is_visible = models.BooleanField(default=True)
authority = models.CharField(max_length=10, blank=False, default="") authority = models.CharField(max_length=10, blank=False, default="")