diff --git a/doab_check/admin.py b/doab_check/admin.py
index e529bb5..6e16513 100644
--- a/doab_check/admin.py
+++ b/doab_check/admin.py
@@ -1,12 +1,49 @@
from django.contrib import admin
+from django.utils.safestring import mark_safe
# Register your models here.
from . import models
-admin.site.register(models.Item)
-admin.site.register(models.Link)
-admin.site.register(models.Timestamp)
-admin.site.register(models.Record)
-admin.site.register(models.LinkRel)
-admin.site.register(models.Check)
+
+@admin.register(models.Check)
+class CheckAdmin(admin.ModelAdmin):  # admin views for Check rows (the stored results of link checks)
+    list_display = ('link_url', 'return_code', 'content_type')
+    date_hierarchy = 'created'
+    search_fields = ['return_code']
+    ordering = ('created', 'return_code', 'content_type')
+    readonly_fields = ('link_url', 'return_code', 'content_type', 'link', 'location')
+    def link_url(self, obj):  # column / read-only field: the URL of the checked Link
+        return str(obj.link.url)  # plain str so the admin escapes it; mark_safe() would emit the stored text as raw HTML
+
+@admin.register(models.Item)
+class ItemAdmin(admin.ModelAdmin):  # admin views for Item
+    search_fields = ('title',)
+    date_hierarchy = 'created'
+    list_display = [str, 'title', 'resource_type']  # str is used as a callable column: it renders str(item)
+
+@admin.register(models.Link)
+class LinkAdmin(admin.ModelAdmin):  # admin views for Link
+    list_display = ('url', 'provider')
+    date_hierarchy = 'created'
+    search_fields = ['url']
+    exclude = ['url']  # no editable url input on the form; link_display shows the value read-only instead
+    readonly_fields = ('link_display', 'provider')
+    def link_display(self, obj):  # read-only field: the link's URL
+        return str(obj.url)  # plain str so the admin escapes it; mark_safe() would emit the stored text as raw HTML
+
+@admin.register(models.LinkRel)
+class LinkAdmin(admin.ModelAdmin):  # NOTE(review): reuses the class name of the Link admin earlier in this module (both still register); consider LinkRelAdmin
+    list_display = ('role', 'doab', 'url',)
+    readonly_fields = ('doab', 'url')
+    search_fields = ['link__url']
+    def doab(self, obj):  # column / read-only field: str() of the related Item
+        return str(obj.item)  # plain str so the admin escapes it; mark_safe() would emit the stored text as raw HTML
+    def url(self, obj):  # column / read-only field: URL of the related Link
+        return str(obj.link.url)  # escaped by the admin, same reason as doab
+
+@admin.register(models.Record)
+class RecordAdmin(admin.ModelAdmin):  # admin views for Record
+    readonly_fields = ('item',)  # item is displayed on the form but cannot be edited there
+
+
diff --git a/doab_check/check.py b/doab_check/check.py
new file mode 100755
index 0000000..b9f17e7
--- /dev/null
+++ b/doab_check/check.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+import logging
+import re
+import time
+from urllib.parse import urlparse
+
+import requests
+
+from django.conf import settings
+
+from .models import Check, Link
+
+HEADERS = {"User-Agent": settings.USER_AGENT}  # sent with every request ContentTyper makes
+
+DELAYS = {  # minimum seconds between two requests to the same netloc (read by ContentTyper.calc_type)
+ 'oapen.org': 0.05,
+ '*': 0.5,  # fallback for any netloc without its own entry
+}
+
+logger = logging.getLogger(__name__)
+
+class ContentTyper(object):
+    """ fetches the response for a url; won't make more checks faster than the delay set in DELAYS for the host """
+    timeout = 30  # seconds per request; requests has no default timeout, so a stalled host would block forever
+
+    def __init__(self):
+        self.last_call = dict()  # netloc -> time.time() of the latest request sent to it
+
+    def content_type(self, url):
+        """ HEAD the url (GET when HEAD is answered with 405); return the response, or None on any failure """
+        try:
+            r = requests.head(url, allow_redirects=True, headers=HEADERS, timeout=self.timeout)
+            if r.status_code == 405:
+                r = requests.get(url, headers=HEADERS, timeout=self.timeout)
+            return r
+        except Exception as e:
+            # best effort: one dead link must not stop a batch, but unlike a bare except this lets Ctrl-C through
+            logger.warning('request for %s failed: %r', url, e)
+            return None
+
+    def calc_type(self, url):
+        """ wait out the per-host delay if one is still running, then return content_type(url) """
+        logger.info(url)
+        # is there a delay associated with the url
+        netloc = urlparse(url).netloc
+        delay = DELAYS.get(netloc, DELAYS.get('*'))
+
+        # wait if necessary
+        last_call = self.last_call.get(netloc)
+        if last_call is not None:
+            wait = last_call + delay - time.time()
+            if wait > 0:
+                time.sleep(wait)
+        self.last_call[netloc] = time.time()
+        return self.content_type(url)
+
+contenttyper = ContentTyper()  # module-level instance used by type_for_url, so its last_call timing persists across calls
+
+RE_EBOOK_TYPES = re.compile(r'(epub|pdf|mobi)', flags=re.I)  # case-insensitive match for an ebook format name
+
+def response_parts(response):
+    ''' return code, content type, content disposition; '' stands in for missing data and None gives (0, '', '') '''
+    if response is None:
+        return 0, '', ''
+    code = getattr(response, 'status_code', 0)  # read once, so the fallback below cannot raise a second time
+    if code == 404:
+        return 404, '', ''  # the headers of a 404 are not reported
+    try:
+        return code, response.headers.get('content-type', ''), response.headers.get('content-disposition', '')
+    except Exception:
+        return code, '', ''  # the response has no usable headers
+
+
+
+def type_for_url(url, response=None):
+    ''' check a url to see what's there. the content-disposition header is often needed to
+    determine the type of file i.e. pdf, epub, etc. at the end of the link. returns (code, type) '''
+    if not url and response is None:
+        return 0, ''  # (code, type) like every other exit, so callers can always unpack two values
+
+    if response is not None:  # identity, not truthiness: a requests response is falsy for 4xx/5xx statuses
+        code, ct, disposition = response_parts(response)
+        url = response.url
+    else:
+        code, ct, disposition = response_parts(contenttyper.calc_type(url))
+    url_disp = url + disposition
+    if code == 404:
+        return 404, ''
+    binary_type = re.search("octet-stream", ct) or re.search("application/binary", ct)
+    ebook_type_match = RE_EBOOK_TYPES.search(url_disp)
+    if re.search("pdf", ct):
+        return code, "pdf"
+    elif binary_type and ebook_type_match:
+        return code, ebook_type_match.group(1).lower()
+    elif re.search("text/plain", ct):
+        return code, "text"
+    elif re.search("text/html", ct):
+        return code, "html"
+    elif re.search("epub", ct):
+        return code, "epub"
+    elif re.search("mobi", ct):
+        return code, "mobi"
+    # no content-type header?
+    elif ebook_type_match:
+        return code, ebook_type_match.group(1).lower()
+
+    return code, f'other; {ct}'
+
+def check_link(link):
+    ''' given a Link object, check its URL and store the outcome as a new Check '''
+    code, ct = type_for_url(link.url)
+    # build the Check with both results already filled in, then save it
+    result = Check(link=link,
+                   return_code=code, content_type=ct)
+    result.save()
+
+
diff --git a/doab_check/management/commands/check_some_links.py b/doab_check/management/commands/check_some_links.py
new file mode 100644
index 0000000..14fd219
--- /dev/null
+++ b/doab_check/management/commands/check_some_links.py
@@ -0,0 +1,21 @@
+import datetime
+from django.core.management.base import BaseCommand
+
+from doab_check.check import check_link
+from doab_check.models import Link
+
+class Command(BaseCommand):  # management command: run check_link on at most --max links taken in random order
+    help = "check links in rando. order"
+
+    def add_arguments(self, parser):
+        parser.add_argument('--max', nargs='?', type=int, default=1000, help="max checks")
+
+    def handle(self, **options):
+        # named 'limit' so the builtin max() stays usable; a bare --max (None), 0 or a negative value checks nothing
+        limit = max(options['max'] or 0, 0)
+        n_checked = 0
+        # the slice is applied as a LIMIT by the database, so only the rows that get checked are fetched
+        for link in Link.objects.all().order_by('?')[:limit]:
+            check_link(link)
+            n_checked += 1
+        self.stdout.write(f'checked {n_checked} links')
diff --git a/doab_check/models.py b/doab_check/models.py
index bc77f50..f204112 100644
--- a/doab_check/models.py
+++ b/doab_check/models.py
@@ -16,6 +16,9 @@ class Item(models.Model):
# titles can change
title = models.CharField(max_length=1000)
+
+ def __str__(self):  # short label: the segment of self.doab that follows its first '/', or self.doab itself when it has no '/'
+ return self.doab.split('/')[1] if '/' in self.doab else self.doab
class Link(models.Model):
''' these are the links we're going to check '''
@@ -25,7 +28,6 @@ class Link(models.Model):
# the items reporting this link
items = models.ManyToManyField("Item", related_name="links", db_index=True, through="LinkRel")
-
# so we can set it to dead instead of deleting
live = models.BooleanField(default=True)
@@ -40,17 +42,24 @@ class Link(models.Model):
self.provider = netloc
super().save(*args, **kwargs)
+
class Timestamp(models.Model):
''' timestamp of the record returned by doab. records can have multiple timestamps '''
created = models.DateTimeField(auto_now_add=True)
datetime = models.DateTimeField()
record = models.ForeignKey("Record", related_name="timestamps", null=False,
on_delete=models.CASCADE)
+ def __str__(self):  # label built from the item of the related record and this row's datetime
+ return f'Record for {self.record.item} on {self.datetime}'
+
class Record(models.Model):
''' a harvested record '''
created = models.DateTimeField(auto_now_add=True)
item = models.ForeignKey("Item", related_name="records", on_delete=models.CASCADE)
+ def __str__(self):  # label built from the record's item and its created timestamp
+ return f'Record for {self.item} harvested on {self.created}'
+
class LinkRel(models.Model):
''' association between an item and a link '''