From 1ce4323bc4c49371b33425305da0d81a6c2395dc Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 15 Sep 2017 15:55:37 -0400 Subject: [PATCH] precheck every new subject fix bug with '/' in subject interpret ';' as list delimiter add cleaner script --- core/bookloader.py | 35 ++-------------- core/facets.py | 2 +- core/loaders/doab.py | 5 ++- core/management/commands/clean_subjects.py | 29 +++++++++++++ core/models/bibmodels.py | 36 ++++++++++++++++ core/tests.py | 48 ++++++++++++++-------- core/validation.py | 21 ++++++++++ 7 files changed, 126 insertions(+), 50 deletions(-) create mode 100644 core/management/commands/clean_subjects.py diff --git a/core/bookloader.py b/core/bookloader.py index c45fe541..e3567f17 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -649,35 +649,11 @@ def add_openlibrary(work, hard_refresh = False): # add the subjects to the Work for s in subjects: - if valid_subject(s): - logger.info("adding subject %s to work %s", s, work.id) - subject, created = models.Subject.objects.get_or_create(name=s) - work.subjects.add(subject) + logger.info("adding subject %s to work %s", s, work.id) + subject, created = models.Subject.set_by_name(s, work=work) work.save() -def valid_xml_char_ordinal(c): - codepoint = ord(c) - # conditions ordered by presumed frequency - return ( - 0x20 <= codepoint <= 0xD7FF or - codepoint in (0x9, 0xA, 0xD) or - 0xE000 <= codepoint <= 0xFFFD or - 0x10000 <= codepoint <= 0x10FFFF - ) - -def valid_subject( subject_name ): - num_commas = 0 - for c in subject_name: - if not valid_xml_char_ordinal(c): - return False - if c == ',': - num_commas += 1 - if num_commas > 2: - return False - return True - - def _get_json(url, params={}, type='gb'): # TODO: should X-Forwarded-For change based on the request from client? @@ -909,11 +885,8 @@ class BasePandataLoader(object): (authority, heading) = ( '', yaml_subject) else: continue - (subject, created) = models.Subject.objects.get_or_create(name=heading) - if not subject.authority and authority: - subject.authority = authority - subject.save() - subject.works.add(work) + subject = models.Subject.set_by_name(heading, work=work, authority=authority) + # the default edition uses the first cover in covers. for cover in metadata.covers: if cover.get('image_path', False): diff --git a/core/facets.py b/core/facets.py index 2757efff..20272808 100644 --- a/core/facets.py +++ b/core/facets.py @@ -221,7 +221,7 @@ class KeywordFacetGroup(FacetGroup): def set_name(self): self.facet_name=facet_name # facet_names of the form 'kw.SUBJECT' and SUBJECT is therefore the 4th character on - self.keyword=self.facet_name[3:] + self.keyword=self.facet_name[3:].replace(';', '/') def get_query_set(self): return self._get_query_set().filter(subjects__name=self.keyword) def template(self): diff --git a/core/loaders/doab.py b/core/loaders/doab.py index 3bf70bfd..b6e887e1 100644 --- a/core/loaders/doab.py +++ b/core/loaders/doab.py @@ -18,6 +18,7 @@ from regluit.core import models, tasks from regluit.core import bookloader from regluit.core.bookloader import add_by_isbn, merge_works from regluit.core.isbn import ISBN +from regluit.core.validation import valid_subject logger = logging.getLogger(__name__) @@ -100,8 +101,8 @@ def attach_more_doab_metadata(edition, description, subjects, # update subjects for s in subjects: - if bookloader.valid_subject(s): - work.subjects.add(models.Subject.objects.get_or_create(name=s)[0]) + if valid_subject(s): + models.Subject.set_by_name(s, work=work) # set reading level of work if it's empty; doab is for adults. if not work.age_level: diff --git a/core/management/commands/clean_subjects.py b/core/management/commands/clean_subjects.py new file mode 100644 index 00000000..2b6a96a9 --- /dev/null +++ b/core/management/commands/clean_subjects.py @@ -0,0 +1,29 @@ +from django.core.management.base import BaseCommand + +from regluit.core.models import Subject + + + +class Command(BaseCommand): + '''Have observed that if the subject has more than two commas in it, it probably means something else''' + help = "reprocess subjects containing ';' or starting with 'nyt:' or 'award:'" + + def handle(self, **options): + semicolon_subjects = Subject.objects.filter(name__contains=";") + + for subject in semicolon_subjects: + for work in subject.works.all(): + Subject.set_by_name(subject.name, work=work) + subject.delete() + + nyt_subjects = Subject.objects.filter(name__startswith="nyt:") + for subject in nyt_subjects: + for work in subject.works.all(): + Subject.set_by_name(subject.name, work=work) + subject.delete() + + award_subjects = Subject.objects.filter(name__startswith="award:") + for subject in award_subjects: + for work in subject.works.all(): + Subject.set_by_name(subject.name, work=work) + subject.delete() diff --git a/core/models/bibmodels.py b/core/models/bibmodels.py index b222e3f6..0ce8dc41 100644 --- a/core/models/bibmodels.py +++ b/core/models/bibmodels.py @@ -30,6 +30,7 @@ from regluit.core import mobi import regluit.core.cc as cc from regluit.core.epub import test_epub from regluit.core.links import id_url +from regluit.core.validation import valid_subject from regluit.core.parameters import ( AGE_LEVEL_CHOICES, @@ -759,7 +760,42 @@ class Subject(models.Model): class Meta: ordering = ['name'] + + @classmethod + def set_by_name(cls, subject, work=None, authority=None): + ''' use this method whenever you would be creating a new subject!''' + subject = subject.strip() + + # make sure it's not a ; delineated list + subjects = subject.split(';') + for additional_subject in subjects[1:]: + cls.set_by_name(additional_subject, work, authority) + subject = subjects[0] + # make sure there's no heading + headingmatch = re.match(r'^!(.+):(.+)', subject) + if headingmatch: + subject = headingmatch.group(2).strip() + authority = headingmatch.group(1).strip() + elif subject.startswith('nyt:'): + subject = subject[4:].split('=')[0].replace('_', ' ').strip().capitalize() + subject = 'NYT Bestseller - {}'.format(subject) + authority = 'nyt' + elif subject.startswith('award:'): + subject = subject[6:].split('=')[0].replace('_', ' ').strip().capitalize() + subject = 'Award Winner - {}'.format(subject) + authority = 'award' + if valid_subject(subject): + (subject_obj, created) = cls.objects.get_or_create(name=subject) + if not subject_obj.authority and authority: + subject_obj.authority = authority + subject_obj.save() + + subject_obj.works.add(work) + return subject_obj + else: + return None + def __unicode__(self): return self.name diff --git a/core/tests.py b/core/tests.py index abd4295b..2bd644d5 100755 --- a/core/tests.py +++ b/core/tests.py @@ -64,7 +64,8 @@ from regluit.core.models import ( ) from regluit.libraryauth.models import Library from regluit.core.parameters import TESTING, LIBRARY, RESERVE -from regluit.core.loaders.utils import (load_from_books, loaded_book_ok) +from regluit.core.loaders.utils import (load_from_books, loaded_book_ok, ) +from regluit.core.validation import valid_subject from regluit.frontend.views import safe_get_work from regluit.payment.models import Transaction from regluit.payment.parameters import PAYMENT_TYPE_AUTHORIZATION @@ -107,11 +108,6 @@ class BookLoaderTests(TestCase): f = ebook.get_archive() self.assertTrue(EbookFile.objects.all().count()>num_ebf) - def test_valid_subject(self): - self.assertTrue(bookloader.valid_subject('A, valid, suj\xc3t')) - self.assertFalse(bookloader.valid_subject('A, valid, suj\xc3t, ')) - self.assertFalse(bookloader.valid_subject('A valid suj\xc3t \x01')) - def test_add_by_isbn_mock(self): with requests_mock.Mocker(real_http=True) as m: with open(os.path.join(TESTDIR, 'gb_hamilton.json')) as gb: @@ -879,17 +875,37 @@ class SafeGetWorkTest(TestCase): self.assertRaises(Http404, safe_get_work, 3) class WorkTests(TestCase): + def setUp(self): + self.w1 = models.Work.objects.create() + self.w2 = models.Work.objects.create() + def test_preferred_edition(self): - w1 = models.Work.objects.create() - w2 = models.Work.objects.create() - ww = models.WasWork.objects.create(work=w1, was= w2.id) - e1 = models.Edition.objects.create(work=w1) - self.assertEqual(e1, w1.preferred_edition) - e2 = models.Edition.objects.create(work=w1) - w1.selected_edition=e2 - w1.save() - self.assertEqual(e2, w1.preferred_edition) - self.assertEqual(e2, w2.preferred_edition) + ww = models.WasWork.objects.create(work=self.w1, was= self.w2.id) + e1 = models.Edition.objects.create(work=self.w1) + self.assertEqual(e1, self.w1.preferred_edition) + e2 = models.Edition.objects.create(work=self.w1) + self.w1.selected_edition=e2 + self.w1.save() + self.assertEqual(e2, self.w1.preferred_edition) + self.assertEqual(e2, self.w2.preferred_edition) + + def test_valid_subject(self): + self.assertTrue(valid_subject('A, valid, suj\xc3t')) + self.assertFalse(valid_subject('A, valid, suj\xc3t, ')) + self.assertFalse(valid_subject('A valid suj\xc3t \x01')) + Subject.set_by_name('A, valid, suj\xc3t; A, valid, suj\xc3t, ', work=self.w1) + self.assertEqual(1, self.w1.subjects.count()) + sub = Subject.set_by_name('nyt:hardcover_advice=2011-06-18', work=self.w1) + self.assertEqual(sub.name, 'NYT Bestseller - Hardcover advice') + self.assertEqual(2, self.w1.subjects.count()) + sub2 = Subject.set_by_name('!lcsh: Something', work=self.w1) + self.assertEqual(sub2.name, 'Something') + self.assertEqual(3, self.w1.subjects.count()) + sub3 = Subject.set_by_name('Something', work=self.w1) + self.assertEqual(sub3.name, 'Something') + self.assertEqual(3, self.w1.subjects.count()) + self.assertEqual(sub3.authority, 'lcsh') + class DownloadPageTest(TestCase): fixtures = ['initial_data.json'] diff --git a/core/validation.py b/core/validation.py index 68ecbf66..aec554f5 100644 --- a/core/validation.py +++ b/core/validation.py @@ -108,3 +108,24 @@ def test_file(the_file, fformat): raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) ) return True +def valid_xml_char_ordinal(c): + codepoint = ord(c) + # conditions ordered by presumed frequency + return ( + 0x20 <= codepoint <= 0xD7FF or + codepoint in (0x9, 0xA, 0xD) or + 0xE000 <= codepoint <= 0xFFFD or + 0x10000 <= codepoint <= 0x10FFFF + ) + +def valid_subject( subject_name ): + num_commas = 0 + for c in subject_name: + if not valid_xml_char_ordinal(c): + return False + if c == ',': + num_commas += 1 + if num_commas > 2: + return False + return True +