import csv
import datetime
import logging
import os

from django.core.management.base import BaseCommand

from doab_check.models import Item

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that dumps link-check results to ``doab_checks.csv``.

    One CSV row is written per live link of every ``Item`` with ``status=1``
    (presumably the "active" items -- confirm against the Item model).
    """

    help = "dump the most recent check of each live link to a CSV file"

    def add_arguments(self, parser):
        """Register the optional ``--outdir`` argument (defaults to '')."""
        parser.add_argument('--outdir', nargs='?', type=str, default='', action="store",
                            help="output directory")

    def check_data(self, item):
        """Yield one CSV row dict per live link of ``item``.

        Each row has the keys ``doab``, ``url``, ``checked``, ``return_code``
        and ``content_type``; the last three are empty strings when the link
        has no recent check.

        A fresh dict is built for every link so that callers which collect
        the rows (e.g. ``list(self.check_data(item))``) do not end up with
        several references to one dict holding only the last link's data.
        """
        for link in item.links.filter(live=True):
            check = link.recent_check
            yield {
                'doab': item.doab,
                'url': link.url,
                'checked': check.created if check else '',
                'return_code': check.return_code if check else '',
                'content_type': check.content_type if check else '',
            }

    def handle(self, outdir, **options):
        """Write ``doab_checks.csv`` into ``outdir`` and report the row count.

        ``outdir`` may be '' or None (``--outdir`` given without a value);
        both mean the current working directory.
        """
        start_time = datetime.datetime.now()
        num = 0
        # nargs='?' yields None when the flag is passed without a value.
        filepath = os.path.join(outdir or '', 'doab_checks.csv')
        # Explicit encoding: URLs may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'doab', 'url', 'checked', 'return_code', 'content_type'])
            writer.writeheader()
            for item in Item.objects.filter(status=1):
                for row in self.check_data(item):
                    writer.writerow(row)
                    # Count rows (link checks), which is what the message reports.
                    num += 1

        elapsed = datetime.datetime.now() - start_time
        logger.info('wrote %s link checks in %s', num, elapsed)
        self.stdout.write(f'wrote {num} link checks in {elapsed}')
+
+
+

+DOAB-Check Data +

+

+Link checking data is available from an API, on a per-book basis, and in bulk via download.

+

+API +

+The API is pretty self-explanatory. Requests are of the form:
+https://doab-check.ebookfoundation.org/api/doab/[the doab id]
+
+DOAB ids look like this: +
+oai:doab-books:20.500.12854/NNNNN
+
+(you can omit the 'oai:doab-books:'). +The response (JSON) looks like this: +
+{
+    "doab": "oai:doab-books:20.500.12854/35337",
+    "status": "found",
+    "links": [
+        {"url": "http://library.oapen.org/handle/20.500.12657/32815",
+         "checked": "2023-10-01T04:05:08.816Z",
+         "return_code": 200,
+         "content_type": "html"},
+        {"url": "https://library.oapen.org/bitstream/20.500.12657/37516/1/604614.pdf",
+         "checked": "2023-10-01T04:05:09.900Z",
+         "return_code": 200,
+         "content_type": "pdf"}
+    ]
+}
+
+

+Bulk download +

+

+A zipped CSV file of all the active URLs and the results of our most recent check is available at:
+https://doab-check.ebookfoundation.org/static/doab_checks.csv.zip.
This file is updated roughly once a day. +

+
+
+
+ + \ No newline at end of file diff --git a/doab_check/templates/index.html b/doab_check/templates/index.html index 15998c7..886ecaf 100644 --- a/doab_check/templates/index.html +++ b/doab_check/templates/index.html @@ -71,6 +71,9 @@ When a link is checked we record the status code and content type returned by th +

+The doab-check data is available for individual books via an API.

@csrf_exempt
def link_api_view(request, doab):
    """Return the link-check status of one DOAB item as JSON.

    ``doab`` is the identifier captured from the URL. It may be given in full
    ('oai:doab-books:20.500.12854/NNNNN') or without the 'oai:doab-books:'
    prefix, in which case the prefix is added before the lookup.

    The response always contains ``doab`` and ``status``:
      * 'invalid'   -- the identifier has neither accepted form
      * 'not found' -- no Item with that identifier exists
      * 'found'     -- ``links`` lists every live link of the item, each with
                       ``url`` and, when a recent check exists, ``checked``,
                       ``return_code`` and ``content_type``.
    """
    data = {'doab': doab}
    if not doab.startswith('oai:doab-books:'):
        if not doab.startswith('20.500.12854'):
            data['status'] = 'invalid'
            return JsonResponse(data)
        doab = 'oai:doab-books:' + doab
        data['doab'] = doab

    # QuerySet.get() raises Item.DoesNotExist instead of returning None, so
    # a missing item would surface as a server error; first() returns None.
    item = Item.objects.filter(doab=doab).first()
    if item is None:
        data['status'] = 'not found'
        return JsonResponse(data)

    data['status'] = 'found'
    links = []
    for link in item.links.filter(live=True):
        link_dict = {'url': link.url}
        check = link.recent_check
        if check:
            link_dict['checked'] = check.created
            link_dict['return_code'] = check.return_code
            link_dict['content_type'] = check.content_type
        links.append(link_dict)
    data['links'] = links
    return JsonResponse(data)