change doab ids to doab handles

pull/94/head
eric 2021-02-28 17:23:35 -05:00
parent 97ce6bfa49
commit c7e2ae7b25
9 changed files with 47 additions and 33 deletions

View File

@ -338,7 +338,7 @@ def loaded_book_ok(book, work, edition):
ID_URLPATTERNS = {
'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
'doab': re.compile(r'([\./]directory\.doabooks\.org/handle/)(?P<id>20\.500\.12854/\d{5,8})'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),

View File

@ -2,15 +2,13 @@ from django.core.management.base import BaseCommand
import re
import requests
from bs4 import BeautifulSoup
from regluit.core.models import Edition
from regluit.core.loaders.doab import store_doab_cover
to_fix = ['16198', '16199', '16201', '16202', '16204', '16205', '16206', '16207', '16208', '16209', '16210', '16213', '16279', '16287', '16302', '17116', '17117', '17121', '17129', '17149', '17154', '19501', '20186', '20238', '20280', '20395', '20447', '20504', '20706', '20750', '21317', '21319', '21332', '21338', '21343', '21345', '21348', '21356', '21643', '21814', '23509', '23510', '23516', '23517', '23518', '23521', '23523', '23593', '23596', '24216', '24282', '24587', '24865', '24867', '25496', '25497', '25655', '26181', '26308', '26508', '26509', '26510', '26511', '26512', '26513', '26514', '26515', '26516', '26517', '26518', '26523', '26524', '26710', '26995', '31899', '31902', '31908', '31924', '31930', '31933', '32327', '44061']
COVER_PATTERN = re.compile(r'doabooks.org/doab\?func=cover\&rid=(\d+)')
to_fix = []
class Command(BaseCommand):
""" To repair covers, will need a new refresh_cover method"""
help = "fix bad covers for doab"
def add_arguments(self, parser):
@ -26,28 +24,23 @@ class Command(BaseCommand):
def fix_doab_cover(self, doab):
    """Replace, or failing that clear, the stored cover for one DOAB id.

    Looks up every Edition whose ``cover_image`` points at the
    ``amazonaws.com/doab/<doab>/cover`` location, asks ``refresh_cover`` for
    a replacement URL and writes it to each of those editions.

    Args:
        doab: the DOAB identifier embedded in the stored cover URL.

    Returns:
        True if a replacement URL was obtained and every updated edition
        produced both a small cover and a thumbnail; False otherwise.
    """
    eds = Edition.objects.filter(cover_image__contains='amazonaws.com/doab/%s/cover' % doab)
    cover_url = self.refresh_cover(doab)
    if cover_url:
        for e in eds:
            e.cover_image = cover_url
            e.save()
            if e.cover_image_small() and e.cover_image_thumbnail():
                # report the URL actually stored; no other source id is known here
                self.stdout.write('fixed %s using %s' % (doab, cover_url))
            else:
                self.stdout.write('bad thumbnails for %s' % doab)
                # stops at the first edition whose renditions fail; any
                # remaining editions are left untouched
                return False
        return True

    # No replacement available: drop the broken cover reference entirely.
    self.stdout.write('removing bad cover for %s' % doab)
    for e in eds:
        e.cover_image = None
        e.save()
    return False
# Placeholder for re-fetching the cover of the given DOAB id.
# No refresh logic exists yet: the method ignores `doab` and always returns
# False, so fix_doab_cover never receives a replacement cover URL from it.
def refresh_cover(self, doab):
return False

View File

@ -6,7 +6,7 @@ class Command(BaseCommand):
help = "load doab books by doab_id via oai"
def add_arguments(self, parser):
    """Register the positional ``doab_ids`` argument.

    One or more values are required; each is kept as a string. The
    argument is registered exactly once, because a second positional with
    the same name would make argparse demand a second group of values.

    Args:
        parser: the argparse parser supplied by the command framework.
    """
    # no default: a positional with nargs='+' is always required, so a
    # default could never take effect
    parser.add_argument('doab_ids', nargs='+', type=str, help="doab ids to add")
def handle(self, doab_ids, **options):
for doab_id in doab_ids:

View File

@ -12,7 +12,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
    """Register the command-line arguments for this command.

    Arguments registered:
        from_date: optional positional, converted with ``timefromiso``;
            None when omitted.
        --from_id: optional string, defaults to ''.
        --max: optional int, defaults to None.

    Each option string is registered exactly once; argparse raises an
    error when the same option string is added twice.

    Args:
        parser: the argparse parser supplied by the command framework.
    """
    parser.add_argument('from_date', nargs='?', type=timefromiso,
                        default=None, help="YYYY-MM-DD to start")
    parser.add_argument('--from_id', nargs='?', type=str, default='',
                        help="handle to start with")
    parser.add_argument('--max', nargs='?', type=int, default=None,
                        help="max desired records")
def handle(self, **options):

View File

@ -0,0 +1,21 @@
import csv
from django.core.management.base import BaseCommand
from regluit.core.models import Identifier
class Command(BaseCommand):
    """Rewrite stored 'doab' identifier values using a JSON translation table.

    Each file named on the command line must contain a JSON object mapping
    an existing identifier value to its replacement. Identifiers of type
    'doab' whose value is a key in the table are updated; identifiers whose
    value is neither a key nor one of the replacement values are deleted.
    """
    help = "translate doab ids to handles"

    def add_arguments(self, parser):
        """Register ``filename``: one or more paths to JSON mapping files."""
        parser.add_argument('filename', nargs='+', help="filename")

    def handle(self, filename, **options):
        """Load the mapping file(s) and apply them to all 'doab' identifiers.

        Args:
            filename: a list of paths (what ``nargs='+'`` delivers) or a
                single path string.
            **options: remaining command options; unused.
        """
        # Imported here because the mapping is parsed as JSON; the module
        # header does not import json.
        import json

        # nargs='+' always yields a list, which open() cannot take directly.
        filenames = [filename] if isinstance(filename, str) else list(filename)

        newdoab = {}
        for name in filenames:
            with open(name, 'r') as jsonfile:
                newdoab.update(json.load(jsonfile))

        # Values that are already translation targets must survive a re-run;
        # without this check a second invocation would delete every
        # identifier the first invocation had just rewritten.
        translated = {value for value in newdoab.values() if isinstance(value, str)}

        for doab in Identifier.objects.filter(type='doab'):
            if doab.value in newdoab:
                doab.value = newdoab[doab.value]
                doab.save()
            elif doab.value not in translated:
                # no translation exists for this value
                doab.delete()
        self.stdout.write("new doab ids loaded!")

View File

@ -27,7 +27,7 @@ TEXT_RELATION_CHOICES = (
ID_CHOICES = (
('http', 'Web Address'),
('isbn', 'ISBN'),
('doab', 'DOABooks ID'),
('doab', 'DOABooks handle'),
('gtbg', 'Project Gutenberg Number'),
('doi', 'Digital Object Identifier'),
('oclc', 'OCLC Number'),
@ -43,7 +43,7 @@ OTHER_ID_CHOICES = (
('edid', 'pragmatic edition ID'),
)
# Identifier type codes grouped under the name WORK_IDENTIFIERS.
# NOTE(review): presumably the types that attach to a work rather than to a
# single edition - confirm against the code that reads this tuple.
WORK_IDENTIFIERS = ('doi', 'olwk', 'glue', 'ltwk', 'http', 'doab')
ID_CHOICES_MAP = dict(ID_CHOICES)

View File

@ -24,8 +24,8 @@ ID_VALIDATION = {
"The Web Address must be a valid http(s) URL."),
'isbn': (u'^([\\dxX \\-–—‐,;]+|delete)$', #includes unicode hyphen, endash and emdash
"The ISBN must be a valid ISBN-13."),
'doab': (r'^(\d{1,6}|delete)$',
"The value must be 1-6 digits."),
'doab': (r'^20.500.12854/(\d{5,8}|delete)$',
"The value must be a handle, starting with 20.500.12854/, followed by 5-8 digits."),
'gtbg': (r'^(\d{1,6}|delete)$',
"The Gutenberg number must be 1-6 digits."),
'doi': (r'^(https?://dx\.doi\.org/|https?://doi\.org/)?(10\.\d+/\S+|delete)$',

View File

@ -5,13 +5,13 @@
These books are included in <a href="https://www.gutenberg.org">Project Gutenberg</a>. They are archived and can be improved at <a href="https://www.gitenberg.org">GITenberg</a>.
{% endif %}
{% if facet.facet_name == 'doab' %}
These books are included in the <a href="http://doabooks.org/">Directory of Open Access Books</a>. This means that they have been peer-reviewed and are of interest to scholars.
These books are included in the <a href="https://doabooks.org/">Directory of Open Access Books</a>. This means that they have been peer-reviewed and are of interest to scholars.
{% endif %}
{% if facet.facet_name == '-gtbg' %}
These books are not included in <a href="https://www.gutenberg.org">Project Gutenberg</a>.
{% endif %}
{% if facet.facet_name == '-doab' %}
These books do not seem to be included in the <a href="http://doabooks.org/">Directory of Open Access Books</a>.
These books do not seem to be included in the <a href="https://doabooks.org/">Directory of Open Access Books</a>.
{% endif %}
</p>
</div>

View File

@ -300,7 +300,7 @@
{% endfor %}
{% if work.doab %}
<p>
This book is included in <a href="http://www.doabooks.org/doab?func=search&amp;query=rid%3A{{ work.doab }}">DOAB</a>.
This book is included in <a href="https://directory.doabooks.org/handle/{{ work.doab }}">DOAB</a>.
</p>
{% endif %}
{% if work.gtbg %}