clean up admin site

main
eric 2023-03-02 13:03:08 -05:00
parent e034013b81
commit 04a1da355c
4 changed files with 193 additions and 7 deletions

View File

@ -1,12 +1,49 @@
from django.contrib import admin
from django.utils.html import format_html
from django.utils.safestring import mark_safe
# Register your models here.
from . import models
# Timestamp needs no customisation, so it uses the default ModelAdmin.
# Item, Link, Record, LinkRel and Check are registered by the
# @admin.register decorators on their ModelAdmin classes below; registering
# a model a second time raises django.contrib.admin.sites.AlreadyRegistered
# when this module is imported.
admin.site.register(models.Timestamp)
@admin.register(models.Check)
class CheckAdmin(admin.ModelAdmin):
    """Admin options for Check, the stored result of checking a link."""
    list_display = ('link_url', 'return_code', 'content_type')
    date_hierarchy = 'created'
    search_fields = ['return_code']
    ordering = ('created', 'return_code', 'content_type')
    # check results are shown read-only on the change page
    readonly_fields = ('link_url', 'return_code', 'content_type', 'link', 'location')

    def link_url(self, obj):
        """Render the checked link's URL as an anchor to that Link's admin page."""
        # format_html escapes every interpolated value.  The URL text is data,
        # not trusted markup, so it must never be passed through mark_safe.
        return format_html(
            '<a href="/admin/doab_check/link/{}/">{}</a>',
            obj.link.id,
            obj.link.url,
        )
@admin.register(models.Item)
class ItemAdmin(admin.ModelAdmin):
    """Admin options for Item."""
    search_fields = ['title']
    date_hierarchy = 'created'
    # the first column is the builtin str called on the item itself
    list_display = (
        str,
        'title',
        'resource_type',
    )
@admin.register(models.Link)
class LinkAdmin(admin.ModelAdmin):
    """Admin options for Link; the URL is shown as a clickable, read-only link."""
    list_display = ('url', 'provider')
    date_hierarchy = 'created'
    search_fields = ['url']
    # the raw url field is left off the form; link_display is shown instead
    exclude = ['url']
    readonly_fields = ('link_display', 'provider')

    def link_display(self, obj):
        """Render the link's URL as an anchor pointing at the URL itself."""
        # format_html escapes the URL in both the href attribute and the
        # anchor text, so a crafted URL cannot inject markup into the page.
        return format_html('<a href="{}">{}</a>', obj.url, obj.url)
@admin.register(models.LinkRel)
class LinkRelAdmin(admin.ModelAdmin):
    """Admin options for LinkRel, the association between an item and a link.

    Named LinkRelAdmin so that it does not shadow the LinkAdmin class that
    is registered for Link.
    """
    list_display = ('role', 'doab', 'url',)
    readonly_fields = ('doab', 'url')
    search_fields = ['link__url']

    def doab(self, obj):
        """Render the related item as an anchor to that Item's admin page."""
        # format_html escapes the item's string form before inserting it
        return format_html(
            '<a href="/admin/doab_check/item/{}/">{}</a>',
            obj.item.id,
            obj.item,
        )

    def url(self, obj):
        """Render the related link's URL as an anchor to that Link's admin page."""
        # format_html escapes the URL text; it is data, not trusted markup
        return format_html(
            '<a href="/admin/doab_check/link/{}/">{}</a>',
            obj.link.id,
            obj.link.url,
        )
@admin.register(models.Record)
class RecordAdmin(admin.ModelAdmin):
    """Admin options for Record; the related item is displayed read-only."""
    readonly_fields = ('item',)

119
doab_check/check.py Executable file
View File

@ -0,0 +1,119 @@
#!/usr/bin/env python
# encoding: utf-8
import logging
import re
import time
from urllib.parse import urlparse
import requests
from django.conf import settings
from .models import Check, Link
# request headers sent with every HEAD/GET made by ContentTyper
HEADERS = {"User-Agent": settings.USER_AGENT}

# minimum number of seconds between two requests to the same host, keyed by
# netloc; the '*' entry is the fallback for hosts that are not listed
DELAYS = {'oapen.org': 0.05, '*': 0.5}

logger = logging.getLogger(__name__)
class ContentTyper(object):
    """Fetch the response for a URL, rate-limited per host.

    Won't make requests to a host faster than the delay set for it in DELAYS.
    """

    # seconds to wait for a server before giving up; without a timeout a
    # single unresponsive host would block the caller indefinitely
    timeout = 30

    def __init__(self):
        # netloc -> time.time() of the most recent request sent to that host
        self.last_call = {}

    def content_type(self, url):
        """Return the response for `url`, or None if the request failed.

        A HEAD request (following redirects) is tried first; if the server
        answers 405, the URL is requested again with GET.
        """
        try:
            r = requests.head(url, allow_redirects=True, headers=HEADERS,
                              timeout=self.timeout)
            if r.status_code == 405:
                r = requests.get(url, headers=HEADERS, timeout=self.timeout)
            return r
        except Exception as e:
            # Best effort: any failure is reported to the caller as None.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through, so a long run can still be stopped.
            logger.warning("request failed for %s: %r", url, e)
            return None

    def calc_type(self, url):
        """Wait out the per-host delay if necessary, then fetch `url`.

        Returns whatever content_type() returns: a response, or None.
        """
        logger.info(url)
        # is there a delay associated with the url's host?
        netloc = urlparse(url).netloc
        delay = DELAYS.get(netloc, DELAYS.get('*'))

        # wait if the previous request to this host was too recent
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            wait = last_call + delay - time.time()
            if wait > 0:
                time.sleep(wait)
        self.last_call[netloc] = time.time()

        return self.content_type(url)
# single module-level instance; it holds the per-host time of the last request
contenttyper = ContentTyper()

# case-insensitive match for an ebook file type name
RE_EBOOK_TYPES = re.compile(r'(epub|pdf|mobi)', re.IGNORECASE)
def response_parts(response):
    """Return (status code, content type, content disposition) for a response.

    Missing data is replaced by defaults and no exception is raised:
    a missing response (None) gives code 0, a missing status code gives 0,
    and missing or unusable headers give empty strings.  A 404 always gives
    empty strings for both header values.
    """
    if response is None:
        return 0, '', ''
    # read the status code once, outside the try, so the fallback below
    # cannot fail on the same attribute a second time
    code = getattr(response, 'status_code', 0)
    if code == 404:
        return 404, '', ''
    try:
        headers = response.headers
        cdisp = headers.get('content-disposition', '')
        return code, headers.get('content-type', ''), cdisp
    except (AttributeError, TypeError):
        # no headers, or headers that are not dict-like: report the code only
        return code, '', ''
def type_for_url(url, response=None):
    """Check a url to see what's there.

    The content-disposition header is often needed to determine the type of
    file, i.e. pdf, epub, etc., at the end of the link.

    If `response` is given it is used as-is and its final url replaces `url`;
    otherwise the url is fetched through the rate-limited `contenttyper`.

    Always returns a `(code, type)` pair.  `code` is the status code (0 when
    there was nothing to check or the request failed) and `type` is one of
    'pdf', 'epub', 'mobi', 'text', 'html', '' (nothing to report) or
    'other; <content-type>'.
    """
    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx status codes, so truth-testing would discard error responses.
    if not url and response is None:
        # same shape as every other return so callers can always unpack
        return 0, ''
    if response is not None:
        code, ct, disposition = response_parts(response)
        url = response.url
    else:
        code, ct, disposition = response_parts(contenttyper.calc_type(url))
    if code == 404:
        return 404, ''

    url_disp = (url or '') + disposition
    # media types are case-insensitive, so match on a lowered copy
    ct_lower = ct.lower()
    is_binary = "octet-stream" in ct_lower or "application/binary" in ct_lower
    ebook_type_match = RE_EBOOK_TYPES.search(url_disp)

    if "pdf" in ct_lower:
        return code, "pdf"
    if is_binary and ebook_type_match:
        # generic binary content type: rely on the url / disposition instead
        return code, ebook_type_match.group(1).lower()
    # order matters: it is the same precedence as the checks always had
    for marker, label in (
        ("text/plain", "text"),
        ("text/html", "html"),
        ("epub", "epub"),
        ("mobi", "mobi"),
    ):
        if marker in ct_lower:
            return code, label
    # no content-type header?
    if ebook_type_match:
        return code, ebook_type_match.group(1).lower()
    return code, f'other; {ct}'
def check_link(link):
    """Given a Link object, check its URL and put the result in a Check object.

    The Check is saved and then returned.
    """
    result = type_for_url(link.url)
    # type_for_url normally gives a (code, content type) pair; fall back to
    # (0, '') on an empty result so that unpacking cannot raise
    code, ct = result if result else (0, '')
    check = Check(link=link, return_code=code, content_type=ct)
    check.save()
    return check

View File

@ -0,0 +1,21 @@
import datetime
from django.core.management.base import BaseCommand
from doab_check.check import check_link
from doab_check.models import Link
class Command(BaseCommand):
    """Management command that checks links, picked in random order."""
    help = "check links in rando. order"

    def add_arguments(self, parser):
        """Add the --max option: the maximum number of links to check."""
        parser.add_argument('--max', nargs='?', type=int, default=1000, help="max checks")

    def handle(self, *args, **options):
        """Check up to --max randomly ordered links and report how many were checked."""
        # not named `max`, which would shadow the builtin used below
        limit = options['max']
        links = Link.objects.all().order_by('?')
        # A bare `--max` (no value) yields None because of nargs='?'; treat
        # that as "no limit".  Otherwise slice the queryset so the limit is
        # applied by the database rather than after fetching every row.
        if limit is not None:
            links = links[:max(limit, 0)]

        n_checked = 0
        for link in links:
            check_link(link)
            n_checked += 1
        self.stdout.write(f'checked {n_checked} links')

View File

@ -16,6 +16,9 @@ class Item(models.Model):
# titles can change
title = models.CharField(max_length=1000)
def __str__(self):
    """Return the second '/'-separated part of self.doab, or all of it if there is no '/'."""
    segments = self.doab.split('/')
    if len(segments) > 1:
        return segments[1]
    return self.doab
class Link(models.Model):
''' these are the links we're going to check '''
@ -25,7 +28,6 @@ class Link(models.Model):
# the items reporting this link
items = models.ManyToManyField("Item", related_name="links", db_index=True, through="LinkRel")
# so we can set it to dead instead of deleting
live = models.BooleanField(default=True)
@ -40,17 +42,24 @@ class Link(models.Model):
self.provider = netloc
super().save(*args, **kwargs)
class Timestamp(models.Model):
    """Timestamp of the record returned by doab. Records can have multiple timestamps."""
    # set once, when the row is first saved
    created = models.DateTimeField(auto_now_add=True)
    datetime = models.DateTimeField()
    record = models.ForeignKey(
        "Record",
        related_name="timestamps",
        null=False,
        on_delete=models.CASCADE,
    )

    def __str__(self):
        """Describe the timestamp by its record's item and its datetime."""
        return f'Record for {self.record.item} on {self.datetime}'
class Record(models.Model):
    """A harvested record."""
    # set once, when the row is first saved
    created = models.DateTimeField(auto_now_add=True)
    item = models.ForeignKey(
        "Item",
        related_name="records",
        on_delete=models.CASCADE,
    )

    def __str__(self):
        """Describe the record by its item and the time it was created."""
        return f'Record for {self.item} harvested on {self.created}'
class LinkRel(models.Model):
''' association between an item and a link '''