clean up admin site
parent
e034013b81
commit
04a1da355c
|
@ -1,12 +1,49 @@
|
|||
from django.contrib import admin
from django.utils.html import format_html
from django.utils.safestring import mark_safe

# Register your models here.

from . import models
|
||||
|
||||
# Item, Link, Record, LinkRel and Check are registered by the @admin.register
# decorators on their ModelAdmin classes further down this module. Registering
# them here as well would raise AlreadyRegistered when the module is imported,
# so only the model without a dedicated ModelAdmin is registered directly.
admin.site.register(models.Timestamp)
|
||||
|
||||
@admin.register(models.Check)
class CheckAdmin(admin.ModelAdmin):
    """Admin options for Check objects; every displayed field is read-only."""
    list_display = ('link_url', 'return_code', 'content_type')
    date_hierarchy = 'created'
    search_fields = ['return_code']
    ordering = ('created', 'return_code', 'content_type')
    readonly_fields = ('link_url', 'return_code', 'content_type', 'link', 'location')

    def link_url(self, obj):
        """Render the checked link's URL as an anchor to that link's admin page."""
        # format_html escapes the interpolated values; the URL is stored data
        # and must not be written into the admin page as raw markup.
        return format_html(
            '<a href="/admin/doab_check/link/{}/">{}</a>',
            obj.link.id,
            obj.link.url,
        )
|
||||
|
||||
@admin.register(models.Item)
class ItemAdmin(admin.ModelAdmin):
    """Admin options for Item: title search and drill-down by creation date."""
    search_fields = ['title']
    date_hierarchy = 'created'
    # the first column is produced by calling str() on the item itself
    list_display = (str, 'title', 'resource_type')
|
||||
|
||||
@admin.register(models.Link)
class LinkAdmin(admin.ModelAdmin):
    """Admin options for Link.

    The raw ``url`` field is excluded from the form; a clickable, read-only
    rendering of it (``link_display``) is shown instead.
    """
    list_display = ('url', 'provider')
    date_hierarchy = 'created'
    search_fields = ['url']
    exclude = ['url']
    readonly_fields = ('link_display', 'provider')

    def link_display(self, obj):
        """Render the link's URL as an anchor pointing at that URL."""
        # format_html escapes the URL in both the attribute and the text, so a
        # stored URL containing quotes or markup cannot break out of the tag.
        return format_html('<a href="{}">{}</a>', obj.url, obj.url)
|
||||
|
||||
# NOTE(review): this class reuses the name LinkAdmin, which is already used for
# the Link admin defined earlier in this module. Both registrations still take
# effect because @admin.register runs at class-definition time, but the module
# attribute LinkAdmin ends up bound to this class. Consider renaming it
# (e.g. LinkRelAdmin) once any external references have been checked.
@admin.register(models.LinkRel)
class LinkAdmin(admin.ModelAdmin):
    """Admin options for LinkRel, the association between an item and a link."""
    list_display = ('role', 'doab', 'url',)
    readonly_fields = ('doab', 'url')
    search_fields = ['link__url']

    def doab(self, obj):
        """Render the related item as an anchor to its admin page."""
        # format_html escapes the interpolated values before marking the
        # result safe, unlike mark_safe applied to an f-string.
        return format_html(
            '<a href="/admin/doab_check/item/{}/">{}</a>',
            obj.item.id,
            obj.item,
        )

    def url(self, obj):
        """Render the related link's URL as an anchor to the link's admin page."""
        return format_html(
            '<a href="/admin/doab_check/link/{}/">{}</a>',
            obj.link.id,
            obj.link.url,
        )
|
||||
|
||||
@admin.register(models.Record)
class RecordAdmin(admin.ModelAdmin):
    """Admin options for Record; the related item is shown but not editable."""

    readonly_fields = ['item']
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .models import Check, Link
|
||||
|
||||
# headers sent with every request made by the checker
HEADERS = {"User-Agent": settings.USER_AGENT}

# minimum number of seconds between two requests to the same host, keyed by
# the URL's netloc; the '*' entry is the fallback for hosts not listed
DELAYS = {
    'oapen.org': 0.05,
    '*': 0.5,
}

logger = logging.getLogger(__name__)
|
||||
|
||||
class ContentTyper(object):
    """ won't make more checks faster than the DELAY set for the host """

    # Seconds to wait on a server before giving up. Without a timeout a single
    # unresponsive host can block the whole run indefinitely.
    TIMEOUT = 30

    def __init__(self):
        # netloc -> time.time() of the most recent request sent to that host
        self.last_call = {}

    def content_type(self, url):
        """Fetch `url` and return the response, or None if the request failed.

        A HEAD request (following redirects) is tried first; if the server
        answers 405, the URL is requested again with GET.
        """
        try:
            r = requests.head(
                url, allow_redirects=True, headers=HEADERS, timeout=self.TIMEOUT
            )
            if r.status_code == 405:
                r = requests.get(url, headers=HEADERS, timeout=self.TIMEOUT)
            return r
        except Exception as err:
            # Deliberately best-effort: any failure is reported as "no
            # response". Catching Exception rather than using a bare except
            # keeps KeyboardInterrupt and SystemExit working during long runs.
            logger.warning("request for %s failed: %r", url, err)
            return None

    def calc_type(self, url):
        """Request `url`, first sleeping if its host was contacted too recently.

        Returns whatever content_type() returns: a response, or None.
        """
        logger.info(url)

        # DELAYS is keyed by the exact netloc; anything else uses the '*' entry
        netloc = urlparse(url).netloc
        delay = DELAYS.get(netloc, DELAYS.get('*'))

        # wait if necessary
        last_call = self.last_call.get(netloc)
        if last_call is not None:
            remaining = last_call + delay - time.time()
            if remaining > 0:
                time.sleep(remaining)

        # recorded before the request is made, so the delay is measured from
        # the start of one request to the start of the next
        self.last_call[netloc] = time.time()

        return self.content_type(url)


# module-level instance shared by the functions in this module
contenttyper = ContentTyper()
|
||||
|
||||
# case-insensitive match for the ebook format names that may appear in a URL
# or a content-disposition header
RE_EBOOK_TYPES = re.compile(r'(epub|pdf|mobi)', re.IGNORECASE)
|
||||
|
||||
def response_parts(response):
    ''' return code, content type, content disposition handling any missing data

    A missing response (None) is reported with code 0. A 404 response is
    reported with empty header values. If the headers cannot be read, the
    status code is still returned, with empty strings for the header values.
    '''
    if response is None:
        return 0, '', ''

    # read the status code once, defensively, so the fallback below can never
    # fail on the same attribute access that caused the problem
    code = getattr(response, 'status_code', 0)
    if code == 404:
        return 404, '', ''

    try:
        headers = response.headers
        ctype = headers.get('content-type', '')
        cdisp = headers.get('content-disposition', '')
    except (AttributeError, TypeError):
        # headers missing or not a mapping
        return code, '', ''
    return code, ctype, cdisp
|
||||
|
||||
|
||||
|
||||
def type_for_url(url, response=None):
    ''' check a url to see what's there. the content-disposition header is often needed to
    determine the type of file i.e. pdf, epub, etc. at the end of the link

    url: the URL to check; ignored when a response is supplied.
    response: an already-fetched response to classify instead of requesting url.

    returns a (status code, type) pair, where type is one of "pdf", "epub",
    "mobi", "text", "html", or "other; <content-type>". The type is '' for a
    404, and the pair is (0, '') when there is nothing to check.
    '''
    # Compare against None explicitly: a requests response is falsy for any
    # 4xx/5xx status, so a truthiness test would ignore a supplied error
    # response and fetch the url again.
    if not url and response is None:
        # same "no response" code that response_parts uses, and a 2-tuple so
        # callers that unpack the result keep working
        return 0, ''

    if response is not None:
        code, ct, disposition = response_parts(response)
        url = response.url
    else:
        code, ct, disposition = response_parts(contenttyper.calc_type(url))

    if code == 404:
        return 404, ''

    # the file name may appear in either the url or the disposition header
    url_disp = (url or '') + disposition
    binary_type = "octet-stream" in ct or "application/binary" in ct
    ebook_type_match = RE_EBOOK_TYPES.search(url_disp)

    if "pdf" in ct:
        return code, "pdf"
    if binary_type and ebook_type_match:
        return code, ebook_type_match.group(1).lower()
    if "text/plain" in ct:
        return code, "text"
    if "text/html" in ct:
        return code, "html"
    if "epub" in ct:
        return code, "epub"
    if "mobi" in ct:
        return code, "mobi"
    # no content-type header?
    if ebook_type_match:
        return code, ebook_type_match.group(1).lower()

    return code, f'other; {ct}'
|
||||
|
||||
def check_link(link):
    ''' given a Link object, check its URL and store the outcome as a new Check object '''
    return_code, content_type = type_for_url(link.url)
    result = Check(link=link, return_code=return_code, content_type=content_type)
    result.save()
|
||||
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
import datetime
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from doab_check.check import check_link
|
||||
from doab_check.models import Link
|
||||
|
||||
class Command(BaseCommand):
    """Management command that checks links picked in random order."""
    help = "check links in rando. order"

    # number of links checked when --max is not given, or given without a value
    DEFAULT_MAX = 1000

    def add_arguments(self, parser):
        """Register the optional --max argument (upper bound on checks)."""
        parser.add_argument(
            '--max',
            nargs='?',
            type=int,
            default=self.DEFAULT_MAX,
            # with nargs='?', a bare "--max" would otherwise yield None
            const=self.DEFAULT_MAX,
            help="max checks",
        )

    def handle(self, **options):
        """Check up to --max randomly ordered links and report how many were checked."""
        limit = options.get('max')
        if limit is None:
            limit = self.DEFAULT_MAX
        # querysets reject negative slice bounds; treat them as "check nothing"
        limit = max(limit, 0)

        n_checked = 0
        # slicing applies the limit in the query itself instead of iterating
        # the whole table and breaking out of the loop
        for link in Link.objects.order_by('?')[:limit]:
            check_link(link)
            n_checked += 1
        self.stdout.write(f'checked {n_checked} links')
|
|
@ -16,6 +16,9 @@ class Item(models.Model):
|
|||
|
||||
# titles can change
|
||||
title = models.CharField(max_length=1000)
|
||||
|
||||
def __str__(self):
    """Return the second '/'-separated part of the doab id, or the whole id if it has no '/'."""
    pieces = self.doab.split('/')
    if len(pieces) > 1:
        return pieces[1]
    return self.doab
|
||||
|
||||
class Link(models.Model):
|
||||
''' these are the links we're going to check '''
|
||||
|
@ -25,7 +28,6 @@ class Link(models.Model):
|
|||
# the items reporting this link
|
||||
items = models.ManyToManyField("Item", related_name="links", db_index=True, through="LinkRel")
|
||||
|
||||
|
||||
# so we can set it to dead instead of deleting
|
||||
live = models.BooleanField(default=True)
|
||||
|
||||
|
@ -40,17 +42,24 @@ class Link(models.Model):
|
|||
self.provider = netloc
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
|
||||
class Timestamp(models.Model):
    """ timestamp of the record returned by doab. records can have multiple timestamps """

    # set automatically when the row is first saved
    created = models.DateTimeField(auto_now_add=True)

    # the timestamp value itself
    datetime = models.DateTimeField()

    # the record this timestamp belongs to; removed together with that record
    record = models.ForeignKey(
        "Record",
        related_name="timestamps",
        null=False,
        on_delete=models.CASCADE,
    )

    def __str__(self):
        return f'Record for {self.record.item} on {self.datetime}'
|
||||
|
||||
|
||||
class Record(models.Model):
    """ a harvested record """

    # set automatically when the row is first saved
    created = models.DateTimeField(auto_now_add=True)

    # the item this record describes; removed together with that item
    item = models.ForeignKey(
        "Item",
        related_name="records",
        on_delete=models.CASCADE,
    )

    def __str__(self):
        return f'Record for {self.item} harvested on {self.created}'
|
||||
|
||||
|
||||
class LinkRel(models.Model):
|
||||
''' association between an item and a link '''
|
||||
|
|
Loading…
Reference in New Issue