precheck every new subject
fix bug with '/' in subject interpret ';' as list delimiter add cleaner scriptpull/43/head
parent
ec39eff244
commit
1ce4323bc4
|
@ -649,35 +649,11 @@ def add_openlibrary(work, hard_refresh = False):
|
|||
|
||||
# add the subjects to the Work
|
||||
for s in subjects:
|
||||
if valid_subject(s):
|
||||
logger.info("adding subject %s to work %s", s, work.id)
|
||||
subject, created = models.Subject.objects.get_or_create(name=s)
|
||||
work.subjects.add(subject)
|
||||
logger.info("adding subject %s to work %s", s, work.id)
|
||||
subject, created = models.Subject.set_by_name(s, work=work)
|
||||
|
||||
work.save()
|
||||
|
||||
def valid_xml_char_ordinal(c):
|
||||
codepoint = ord(c)
|
||||
# conditions ordered by presumed frequency
|
||||
return (
|
||||
0x20 <= codepoint <= 0xD7FF or
|
||||
codepoint in (0x9, 0xA, 0xD) or
|
||||
0xE000 <= codepoint <= 0xFFFD or
|
||||
0x10000 <= codepoint <= 0x10FFFF
|
||||
)
|
||||
|
||||
def valid_subject( subject_name ):
|
||||
num_commas = 0
|
||||
for c in subject_name:
|
||||
if not valid_xml_char_ordinal(c):
|
||||
return False
|
||||
if c == ',':
|
||||
num_commas += 1
|
||||
if num_commas > 2:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
def _get_json(url, params={}, type='gb'):
|
||||
# TODO: should X-Forwarded-For change based on the request from client?
|
||||
|
@ -909,11 +885,8 @@ class BasePandataLoader(object):
|
|||
(authority, heading) = ( '', yaml_subject)
|
||||
else:
|
||||
continue
|
||||
(subject, created) = models.Subject.objects.get_or_create(name=heading)
|
||||
if not subject.authority and authority:
|
||||
subject.authority = authority
|
||||
subject.save()
|
||||
subject.works.add(work)
|
||||
subject = models.Subject.set_by_name(heading, work=work, authority=authority)
|
||||
|
||||
# the default edition uses the first cover in covers.
|
||||
for cover in metadata.covers:
|
||||
if cover.get('image_path', False):
|
||||
|
|
|
@ -221,7 +221,7 @@ class KeywordFacetGroup(FacetGroup):
|
|||
def set_name(self):
|
||||
self.facet_name=facet_name
|
||||
# facet_names of the form 'kw.SUBJECT' and SUBJECT is therefore the 4th character on
|
||||
self.keyword=self.facet_name[3:]
|
||||
self.keyword=self.facet_name[3:].replace(';', '/')
|
||||
def get_query_set(self):
|
||||
return self._get_query_set().filter(subjects__name=self.keyword)
|
||||
def template(self):
|
||||
|
|
|
@ -18,6 +18,7 @@ from regluit.core import models, tasks
|
|||
from regluit.core import bookloader
|
||||
from regluit.core.bookloader import add_by_isbn, merge_works
|
||||
from regluit.core.isbn import ISBN
|
||||
from regluit.core.validation import valid_subject
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -100,8 +101,8 @@ def attach_more_doab_metadata(edition, description, subjects,
|
|||
|
||||
# update subjects
|
||||
for s in subjects:
|
||||
if bookloader.valid_subject(s):
|
||||
work.subjects.add(models.Subject.objects.get_or_create(name=s)[0])
|
||||
if valid_subject(s):
|
||||
models.Subject.set_by_name(s, work=work)
|
||||
|
||||
# set reading level of work if it's empty; doab is for adults.
|
||||
if not work.age_level:
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
|
||||
from regluit.core.models import Subject
|
||||
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
'''Have observed that if the subject has more than two commas in it, it probably means something else'''
|
||||
help = "reprocess subjects containing ';' or starting with 'nyt:' or 'award:'"
|
||||
|
||||
def handle(self, **options):
|
||||
semicolon_subjects = Subject.objects.filter(name__contains=";")
|
||||
|
||||
for subject in semicolon_subjects:
|
||||
for work in subject.works.all():
|
||||
Subject.set_by_name(subject.name, work=work)
|
||||
subject.delete()
|
||||
|
||||
nyt_subjects = Subject.objects.filter(name__startswith="nyt:")
|
||||
for subject in nyt_subjects:
|
||||
for work in subject.works.all():
|
||||
Subject.set_by_name(subject.name, work=work)
|
||||
subject.delete()
|
||||
|
||||
award_subjects = Subject.objects.filter(name__startswith="award:")
|
||||
for subject in award_subjects:
|
||||
for work in subject.works.all():
|
||||
Subject.set_by_name(subject.name, work=work)
|
||||
subject.delete()
|
|
@ -30,6 +30,7 @@ from regluit.core import mobi
|
|||
import regluit.core.cc as cc
|
||||
from regluit.core.epub import test_epub
|
||||
from regluit.core.links import id_url
|
||||
from regluit.core.validation import valid_subject
|
||||
|
||||
from regluit.core.parameters import (
|
||||
AGE_LEVEL_CHOICES,
|
||||
|
@ -760,6 +761,41 @@ class Subject(models.Model):
|
|||
class Meta:
|
||||
ordering = ['name']
|
||||
|
||||
@classmethod
|
||||
def set_by_name(cls, subject, work=None, authority=None):
|
||||
''' use this method whenever you would be creating a new subject!'''
|
||||
subject = subject.strip()
|
||||
|
||||
# make sure it's not a ; delineated list
|
||||
subjects = subject.split(';')
|
||||
for additional_subject in subjects[1:]:
|
||||
cls.set_by_name(additional_subject, work, authority)
|
||||
subject = subjects[0]
|
||||
# make sure there's no heading
|
||||
headingmatch = re.match(r'^!(.+):(.+)', subject)
|
||||
if headingmatch:
|
||||
subject = headingmatch.group(2).strip()
|
||||
authority = headingmatch.group(1).strip()
|
||||
elif subject.startswith('nyt:'):
|
||||
subject = subject[4:].split('=')[0].replace('_', ' ').strip().capitalize()
|
||||
subject = 'NYT Bestseller - {}'.format(subject)
|
||||
authority = 'nyt'
|
||||
elif subject.startswith('award:'):
|
||||
subject = subject[6:].split('=')[0].replace('_', ' ').strip().capitalize()
|
||||
subject = 'Award Winner - {}'.format(subject)
|
||||
authority = 'award'
|
||||
|
||||
if valid_subject(subject):
|
||||
(subject_obj, created) = cls.objects.get_or_create(name=subject)
|
||||
if not subject_obj.authority and authority:
|
||||
subject_obj.authority = authority
|
||||
subject_obj.save()
|
||||
|
||||
subject_obj.works.add(work)
|
||||
return subject_obj
|
||||
else:
|
||||
return None
|
||||
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
|
|
|
@ -64,7 +64,8 @@ from regluit.core.models import (
|
|||
)
|
||||
from regluit.libraryauth.models import Library
|
||||
from regluit.core.parameters import TESTING, LIBRARY, RESERVE
|
||||
from regluit.core.loaders.utils import (load_from_books, loaded_book_ok)
|
||||
from regluit.core.loaders.utils import (load_from_books, loaded_book_ok, )
|
||||
from regluit.core.validation import valid_subject
|
||||
from regluit.frontend.views import safe_get_work
|
||||
from regluit.payment.models import Transaction
|
||||
from regluit.payment.parameters import PAYMENT_TYPE_AUTHORIZATION
|
||||
|
@ -107,11 +108,6 @@ class BookLoaderTests(TestCase):
|
|||
f = ebook.get_archive()
|
||||
self.assertTrue(EbookFile.objects.all().count()>num_ebf)
|
||||
|
||||
def test_valid_subject(self):
|
||||
self.assertTrue(bookloader.valid_subject('A, valid, suj\xc3t'))
|
||||
self.assertFalse(bookloader.valid_subject('A, valid, suj\xc3t, '))
|
||||
self.assertFalse(bookloader.valid_subject('A valid suj\xc3t \x01'))
|
||||
|
||||
def test_add_by_isbn_mock(self):
|
||||
with requests_mock.Mocker(real_http=True) as m:
|
||||
with open(os.path.join(TESTDIR, 'gb_hamilton.json')) as gb:
|
||||
|
@ -879,17 +875,37 @@ class SafeGetWorkTest(TestCase):
|
|||
self.assertRaises(Http404, safe_get_work, 3)
|
||||
|
||||
class WorkTests(TestCase):
|
||||
def setUp(self):
|
||||
self.w1 = models.Work.objects.create()
|
||||
self.w2 = models.Work.objects.create()
|
||||
|
||||
def test_preferred_edition(self):
|
||||
w1 = models.Work.objects.create()
|
||||
w2 = models.Work.objects.create()
|
||||
ww = models.WasWork.objects.create(work=w1, was= w2.id)
|
||||
e1 = models.Edition.objects.create(work=w1)
|
||||
self.assertEqual(e1, w1.preferred_edition)
|
||||
e2 = models.Edition.objects.create(work=w1)
|
||||
w1.selected_edition=e2
|
||||
w1.save()
|
||||
self.assertEqual(e2, w1.preferred_edition)
|
||||
self.assertEqual(e2, w2.preferred_edition)
|
||||
ww = models.WasWork.objects.create(work=self.w1, was= self.w2.id)
|
||||
e1 = models.Edition.objects.create(work=self.w1)
|
||||
self.assertEqual(e1, self.w1.preferred_edition)
|
||||
e2 = models.Edition.objects.create(work=self.w1)
|
||||
self.w1.selected_edition=e2
|
||||
self.w1.save()
|
||||
self.assertEqual(e2, self.w1.preferred_edition)
|
||||
self.assertEqual(e2, self.w2.preferred_edition)
|
||||
|
||||
def test_valid_subject(self):
|
||||
self.assertTrue(valid_subject('A, valid, suj\xc3t'))
|
||||
self.assertFalse(valid_subject('A, valid, suj\xc3t, '))
|
||||
self.assertFalse(valid_subject('A valid suj\xc3t \x01'))
|
||||
Subject.set_by_name('A, valid, suj\xc3t; A, valid, suj\xc3t, ', work=self.w1)
|
||||
self.assertEqual(1, self.w1.subjects.count())
|
||||
sub = Subject.set_by_name('nyt:hardcover_advice=2011-06-18', work=self.w1)
|
||||
self.assertEqual(sub.name, 'NYT Bestseller - Hardcover advice')
|
||||
self.assertEqual(2, self.w1.subjects.count())
|
||||
sub2 = Subject.set_by_name('!lcsh: Something', work=self.w1)
|
||||
self.assertEqual(sub2.name, 'Something')
|
||||
self.assertEqual(3, self.w1.subjects.count())
|
||||
sub3 = Subject.set_by_name('Something', work=self.w1)
|
||||
self.assertEqual(sub3.name, 'Something')
|
||||
self.assertEqual(3, self.w1.subjects.count())
|
||||
self.assertEqual(sub3.authority, 'lcsh')
|
||||
|
||||
|
||||
class DownloadPageTest(TestCase):
|
||||
fixtures = ['initial_data.json']
|
||||
|
|
|
@ -108,3 +108,24 @@ def test_file(the_file, fformat):
|
|||
raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) )
|
||||
return True
|
||||
|
||||
def valid_xml_char_ordinal(c):
|
||||
codepoint = ord(c)
|
||||
# conditions ordered by presumed frequency
|
||||
return (
|
||||
0x20 <= codepoint <= 0xD7FF or
|
||||
codepoint in (0x9, 0xA, 0xD) or
|
||||
0xE000 <= codepoint <= 0xFFFD or
|
||||
0x10000 <= codepoint <= 0x10FFFF
|
||||
)
|
||||
|
||||
def valid_subject( subject_name ):
|
||||
num_commas = 0
|
||||
for c in subject_name:
|
||||
if not valid_xml_char_ordinal(c):
|
||||
return False
|
||||
if c == ',':
|
||||
num_commas += 1
|
||||
if num_commas > 2:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
|
Loading…
Reference in New Issue