precheck every new subject

fix bug with '/' in subject
interpret ';' as list delimiter
add cleaner script
pull/43/head
eric 2017-09-15 15:55:37 -04:00
parent ec39eff244
commit 1ce4323bc4
7 changed files with 126 additions and 50 deletions

View File

@ -649,35 +649,11 @@ def add_openlibrary(work, hard_refresh = False):
# add the subjects to the Work
for s in subjects:
if valid_subject(s):
logger.info("adding subject %s to work %s", s, work.id)
subject, created = models.Subject.objects.get_or_create(name=s)
work.subjects.add(subject)
logger.info("adding subject %s to work %s", s, work.id)
subject, created = models.Subject.set_by_name(s, work=work)
work.save()
def valid_xml_char_ordinal(c):
codepoint = ord(c)
# conditions ordered by presumed frequency
return (
0x20 <= codepoint <= 0xD7FF or
codepoint in (0x9, 0xA, 0xD) or
0xE000 <= codepoint <= 0xFFFD or
0x10000 <= codepoint <= 0x10FFFF
)
def valid_subject( subject_name ):
num_commas = 0
for c in subject_name:
if not valid_xml_char_ordinal(c):
return False
if c == ',':
num_commas += 1
if num_commas > 2:
return False
return True
def _get_json(url, params={}, type='gb'):
# TODO: should X-Forwarded-For change based on the request from client?
@ -909,11 +885,8 @@ class BasePandataLoader(object):
(authority, heading) = ( '', yaml_subject)
else:
continue
(subject, created) = models.Subject.objects.get_or_create(name=heading)
if not subject.authority and authority:
subject.authority = authority
subject.save()
subject.works.add(work)
subject = models.Subject.set_by_name(heading, work=work, authority=authority)
# the default edition uses the first cover in covers.
for cover in metadata.covers:
if cover.get('image_path', False):

View File

@ -221,7 +221,7 @@ class KeywordFacetGroup(FacetGroup):
def set_name(self):
self.facet_name=facet_name
# facet_names of the form 'kw.SUBJECT' and SUBJECT is therefore the 4th character on
self.keyword=self.facet_name[3:]
self.keyword=self.facet_name[3:].replace(';', '/')
def get_query_set(self):
return self._get_query_set().filter(subjects__name=self.keyword)
def template(self):

View File

@ -18,6 +18,7 @@ from regluit.core import models, tasks
from regluit.core import bookloader
from regluit.core.bookloader import add_by_isbn, merge_works
from regluit.core.isbn import ISBN
from regluit.core.validation import valid_subject
logger = logging.getLogger(__name__)
@ -100,8 +101,8 @@ def attach_more_doab_metadata(edition, description, subjects,
# update subjects
for s in subjects:
if bookloader.valid_subject(s):
work.subjects.add(models.Subject.objects.get_or_create(name=s)[0])
if valid_subject(s):
models.Subject.set_by_name(s, work=work)
# set reading level of work if it's empty; doab is for adults.
if not work.age_level:

View File

@ -0,0 +1,29 @@
from django.core.management.base import BaseCommand
from regluit.core.models import Subject
class Command(BaseCommand):
'''Have observed that if the subject has more than two commas in it, it probably means something else'''
help = "reprocess subjects containing ';' or starting with 'nyt:' or 'award:'"
def handle(self, **options):
semicolon_subjects = Subject.objects.filter(name__contains=";")
for subject in semicolon_subjects:
for work in subject.works.all():
Subject.set_by_name(subject.name, work=work)
subject.delete()
nyt_subjects = Subject.objects.filter(name__startswith="nyt:")
for subject in nyt_subjects:
for work in subject.works.all():
Subject.set_by_name(subject.name, work=work)
subject.delete()
award_subjects = Subject.objects.filter(name__startswith="award:")
for subject in award_subjects:
for work in subject.works.all():
Subject.set_by_name(subject.name, work=work)
subject.delete()

View File

@ -30,6 +30,7 @@ from regluit.core import mobi
import regluit.core.cc as cc
from regluit.core.epub import test_epub
from regluit.core.links import id_url
from regluit.core.validation import valid_subject
from regluit.core.parameters import (
AGE_LEVEL_CHOICES,
@ -759,7 +760,42 @@ class Subject(models.Model):
class Meta:
ordering = ['name']
@classmethod
def set_by_name(cls, subject, work=None, authority=None):
''' use this method whenever you would be creating a new subject!'''
subject = subject.strip()
# make sure it's not a ; delineated list
subjects = subject.split(';')
for additional_subject in subjects[1:]:
cls.set_by_name(additional_subject, work, authority)
subject = subjects[0]
# make sure there's no heading
headingmatch = re.match(r'^!(.+):(.+)', subject)
if headingmatch:
subject = headingmatch.group(2).strip()
authority = headingmatch.group(1).strip()
elif subject.startswith('nyt:'):
subject = subject[4:].split('=')[0].replace('_', ' ').strip().capitalize()
subject = 'NYT Bestseller - {}'.format(subject)
authority = 'nyt'
elif subject.startswith('award:'):
subject = subject[6:].split('=')[0].replace('_', ' ').strip().capitalize()
subject = 'Award Winner - {}'.format(subject)
authority = 'award'
if valid_subject(subject):
(subject_obj, created) = cls.objects.get_or_create(name=subject)
if not subject_obj.authority and authority:
subject_obj.authority = authority
subject_obj.save()
subject_obj.works.add(work)
return subject_obj
else:
return None
def __unicode__(self):
return self.name

View File

@ -64,7 +64,8 @@ from regluit.core.models import (
)
from regluit.libraryauth.models import Library
from regluit.core.parameters import TESTING, LIBRARY, RESERVE
from regluit.core.loaders.utils import (load_from_books, loaded_book_ok)
from regluit.core.loaders.utils import (load_from_books, loaded_book_ok, )
from regluit.core.validation import valid_subject
from regluit.frontend.views import safe_get_work
from regluit.payment.models import Transaction
from regluit.payment.parameters import PAYMENT_TYPE_AUTHORIZATION
@ -107,11 +108,6 @@ class BookLoaderTests(TestCase):
f = ebook.get_archive()
self.assertTrue(EbookFile.objects.all().count()>num_ebf)
def test_valid_subject(self):
self.assertTrue(bookloader.valid_subject('A, valid, suj\xc3t'))
self.assertFalse(bookloader.valid_subject('A, valid, suj\xc3t, '))
self.assertFalse(bookloader.valid_subject('A valid suj\xc3t \x01'))
def test_add_by_isbn_mock(self):
with requests_mock.Mocker(real_http=True) as m:
with open(os.path.join(TESTDIR, 'gb_hamilton.json')) as gb:
@ -879,17 +875,37 @@ class SafeGetWorkTest(TestCase):
self.assertRaises(Http404, safe_get_work, 3)
class WorkTests(TestCase):
def setUp(self):
self.w1 = models.Work.objects.create()
self.w2 = models.Work.objects.create()
def test_preferred_edition(self):
w1 = models.Work.objects.create()
w2 = models.Work.objects.create()
ww = models.WasWork.objects.create(work=w1, was= w2.id)
e1 = models.Edition.objects.create(work=w1)
self.assertEqual(e1, w1.preferred_edition)
e2 = models.Edition.objects.create(work=w1)
w1.selected_edition=e2
w1.save()
self.assertEqual(e2, w1.preferred_edition)
self.assertEqual(e2, w2.preferred_edition)
ww = models.WasWork.objects.create(work=self.w1, was= self.w2.id)
e1 = models.Edition.objects.create(work=self.w1)
self.assertEqual(e1, self.w1.preferred_edition)
e2 = models.Edition.objects.create(work=self.w1)
self.w1.selected_edition=e2
self.w1.save()
self.assertEqual(e2, self.w1.preferred_edition)
self.assertEqual(e2, self.w2.preferred_edition)
def test_valid_subject(self):
self.assertTrue(valid_subject('A, valid, suj\xc3t'))
self.assertFalse(valid_subject('A, valid, suj\xc3t, '))
self.assertFalse(valid_subject('A valid suj\xc3t \x01'))
Subject.set_by_name('A, valid, suj\xc3t; A, valid, suj\xc3t, ', work=self.w1)
self.assertEqual(1, self.w1.subjects.count())
sub = Subject.set_by_name('nyt:hardcover_advice=2011-06-18', work=self.w1)
self.assertEqual(sub.name, 'NYT Bestseller - Hardcover advice')
self.assertEqual(2, self.w1.subjects.count())
sub2 = Subject.set_by_name('!lcsh: Something', work=self.w1)
self.assertEqual(sub2.name, 'Something')
self.assertEqual(3, self.w1.subjects.count())
sub3 = Subject.set_by_name('Something', work=self.w1)
self.assertEqual(sub3.name, 'Something')
self.assertEqual(3, self.w1.subjects.count())
self.assertEqual(sub3.authority, 'lcsh')
class DownloadPageTest(TestCase):
fixtures = ['initial_data.json']

View File

@ -108,3 +108,24 @@ def test_file(the_file, fformat):
raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) )
return True
def valid_xml_char_ordinal(c):
codepoint = ord(c)
# conditions ordered by presumed frequency
return (
0x20 <= codepoint <= 0xD7FF or
codepoint in (0x9, 0xA, 0xD) or
0xE000 <= codepoint <= 0xFFFD or
0x10000 <= codepoint <= 0x10FFFF
)
def valid_subject( subject_name ):
num_commas = 0
for c in subject_name:
if not valid_xml_char_ordinal(c):
return False
if c == ',':
num_commas += 1
if num_commas > 2:
return False
return True