diff --git a/doab_check/admin.py b/doab_check/admin.py
index e529bb5..6e16513 100644
--- a/doab_check/admin.py
+++ b/doab_check/admin.py
@@ -1,12 +1,62 @@
 from django.contrib import admin
 
 # Register your models here.
 from . import models
 
-admin.site.register(models.Item)
-admin.site.register(models.Link)
-admin.site.register(models.Timestamp)
-admin.site.register(models.Record)
-admin.site.register(models.LinkRel)
-admin.site.register(models.Check)
+
+@admin.register(models.Check)
+class CheckAdmin(admin.ModelAdmin):
+    ''' admin for the results of link checks '''
+    list_display = ('link_url', 'return_code', 'content_type')
+    list_select_related = ('link',)  # link_url reads obj.link for every row
+    date_hierarchy = 'created'
+    search_fields = ['return_code']
+    ordering = ('created', 'return_code', 'content_type')
+    readonly_fields = ('link_url', 'return_code', 'content_type', 'link', 'location')
+
+    def link_url(self, obj):
+        # plain text, so the admin escapes the harvested URL when rendering
+        return obj.link.url
+
+
+@admin.register(models.Item)
+class ItemAdmin(admin.ModelAdmin):
+    ''' admin for Items '''
+    list_display = ('__str__', 'title', 'resource_type')
+    date_hierarchy = 'created'
+    search_fields = ['title']
+
+
+@admin.register(models.Link)
+class LinkAdmin(admin.ModelAdmin):
+    ''' admin for Links; the url is shown read-only via link_display '''
+    list_display = ('url', 'provider')
+    date_hierarchy = 'created'
+    search_fields = ['url']
+    exclude = ['url']
+    readonly_fields = ('link_display', 'provider')
+
+    def link_display(self, obj):
+        return obj.url
+
+
+@admin.register(models.LinkRel)
+class LinkRelAdmin(admin.ModelAdmin):
+    ''' admin for the association between an Item and a Link '''
+    list_display = ('role', 'doab', 'url',)
+    list_select_related = ('item', 'link')
+    readonly_fields = ('doab', 'url')
+    search_fields = ['link__url']
+
+    def doab(self, obj):
+        return str(obj.item)
+
+    def url(self, obj):
+        return obj.link.url
+
+
+@admin.register(models.Record)
+class RecordAdmin(admin.ModelAdmin):
+    ''' admin for harvested Records '''
+    readonly_fields = ['item']
 
diff --git a/doab_check/check.py b/doab_check/check.py
new file mode 100755
index 0000000..b9f17e7
--- /dev/null
+++ b/doab_check/check.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+import logging
+import re
+import time
+from urllib.parse import urlparse
+
+import requests
+
+from django.conf import settings
+
+from .models import Check, Link
+
+HEADERS = {"User-Agent": settings.USER_AGENT}
+
+# minimum seconds between two requests to the same host; '*' is the default
+DELAYS = {
+    'oapen.org': 0.05,
+    '*': 0.5,
+}
+TIMEOUT = 30  # seconds to wait on a host before giving up
+
+logger = logging.getLogger(__name__)
+
+
+class ContentTyper:
+    """ won't make more checks faster than the DELAY set for the host """
+
+    def __init__(self):
+        self.last_call = {}
+
+    def content_type(self, url):
+        """ return the response for url, or None if the request failed """
+        try:
+            r = requests.head(url, allow_redirects=True, headers=HEADERS, timeout=TIMEOUT)
+            if r.status_code == 405:
+                # HEAD refused: fall back to GET, but don't download the body
+                r = requests.get(url, headers=HEADERS, stream=True, timeout=TIMEOUT)
+                r.close()
+            return r
+        except Exception as e:
+            # best effort, but unlike a bare except this lets Ctrl-C through
+            logger.warning('request failed for %s: %s', url, e)
+            return None
+
+    def calc_type(self, url):
+        """ fetch url, first waiting out the delay set for its host """
+        logger.info(url)
+        netloc = urlparse(url).netloc
+        delay = DELAYS.get(netloc, DELAYS['*'])
+
+        # wait if necessary
+        last_call = self.last_call.get(netloc)
+        if last_call is not None:
+            wait = last_call + delay - time.time()
+            if wait > 0:
+                time.sleep(wait)
+
+        self.last_call[netloc] = time.time()
+        return self.content_type(url)
+
+
+contenttyper = ContentTyper()
+
+RE_EBOOK_TYPES = re.compile(r'(epub|pdf|mobi)', flags=re.I)
+
+
+def response_parts(response):
+    ''' return code, content type, content disposition handling any missing data'''
+    if response is None:
+        return 0, '', ''
+    code = response.status_code
+    if code == 404:
+        return 404, '', ''
+    # tolerate a response-like object that has no usable headers
+    headers = getattr(response, 'headers', None) or {}
+    return code, headers.get('content-type', ''), headers.get('content-disposition', '')
+
+
+def type_for_url(url, response=None):
+    ''' check a url to see what's there. the content-disposition header is often needed to
+    determine the type of file i.e. pdf, epub, etc. at the end of the link.
+    returns (return code, type); the code is 0 if there was nothing to fetch '''
+    if not url and response is None:
+        # keep the two-value shape that callers unpack
+        return 0, ''
+
+    # a requests Response is falsy for 4xx/5xx, so compare with None
+    if response is not None:
+        code, ct, disposition = response_parts(response)
+        url = response.url
+    else:
+        code, ct, disposition = response_parts(contenttyper.calc_type(url))
+    if code == 404:
+        return 404, ''
+    binary_type = "octet-stream" in ct or "application/binary" in ct
+    ebook_type_match = RE_EBOOK_TYPES.search(url + disposition)
+    if "pdf" in ct:
+        return code, "pdf"
+    if binary_type and ebook_type_match:
+        return code, ebook_type_match.group(1).lower()
+    for marker, kind in (("text/plain", "text"), ("text/html", "html"),
+                         ("epub", "epub"), ("mobi", "mobi")):
+        if marker in ct:
+            return code, kind
+    # no content-type header?
+    if ebook_type_match:
+        return code, ebook_type_match.group(1).lower()
+    return code, f'other; {ct}'
+
+
+def check_link(link):
+    ''' given a Link object, check its URL, put the result in a Check object '''
+    check = Check(link=link)
+    code, ct = type_for_url(link.url)
+    check.return_code = code
+    check.content_type = ct
+    check.save()
diff --git a/doab_check/management/commands/check_some_links.py b/doab_check/management/commands/check_some_links.py
new file mode 100644
index 0000000..14fd219
--- /dev/null
+++ b/doab_check/management/commands/check_some_links.py
@@ -0,0 +1,21 @@
+from django.core.management.base import BaseCommand
+
+from doab_check.check import check_link
+from doab_check.models import Link
+
+
+class Command(BaseCommand):
+    help = "check links in rando. order"
+
+    def add_arguments(self, parser):
+        parser.add_argument('--max', type=int, default=1000, help="max checks")
+
+    def handle(self, **options):
+        ''' check up to --max randomly ordered links and report how many were checked '''
+        max_checks = options['max']
+        n_checked = 0
+        # slicing lets the database apply the limit to the shuffled links
+        for link in Link.objects.order_by('?')[:max_checks]:
+            check_link(link)
+            n_checked += 1
+        self.stdout.write(f'checked {n_checked} links')
diff --git a/doab_check/models.py b/doab_check/models.py index bc77f50..f204112 100644 --- a/doab_check/models.py +++ b/doab_check/models.py @@ -16,6 +16,9 @@ class Item(models.Model): # titles can change title = models.CharField(max_length=1000) + + def __str__(self): + return self.doab.split('/')[1] if '/' in self.doab else self.doab class Link(models.Model): ''' these are the links we're going to check ''' @@ -25,7 +28,6 @@ class Link(models.Model): # the items reporting this link items = models.ManyToManyField("Item", related_name="links", db_index=True, through="LinkRel") - # so we can set it to dead instead of deleting live = models.BooleanField(default=True) @@ -40,17 +42,24 @@ class Link(models.Model): self.provider = netloc super().save(*args, **kwargs) + class Timestamp(models.Model): ''' timestamp of the record returned by doab. records can have multiple timestamps ''' created = models.DateTimeField(auto_now_add=True) datetime = models.DateTimeField() record = models.ForeignKey("Record", related_name="timestamps", null=False, on_delete=models.CASCADE) + def __str__(self): + return f'Record for {self.record.item} on {self.datetime}' + class Record(models.Model): ''' a harvested record ''' created = models.DateTimeField(auto_now_add=True) item = models.ForeignKey("Item", related_name="records", on_delete=models.CASCADE) + def __str__(self): + return f'Record for {self.item} harvested on {self.created}' + class LinkRel(models.Model): ''' association between an item and a link '''