add simple api, data dumper

main
eric 2023-10-20 14:40:31 -04:00
parent 5a0a602fed
commit ff433edebd
7 changed files with 154 additions and 1 deletions

3
.gitignore vendored
View File

@ -154,3 +154,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.zip
*.csv

View File

@ -0,0 +1,48 @@
import csv
import datetime
import logging
import os
from django.core.management.base import BaseCommand
logger = logging.getLogger(__name__)
from doab_check.models import Item
class Command(BaseCommand):
    """Dump the most recent check of every live link to ``doab_checks.csv``.

    One CSV row is written per live link belonging to an item whose
    ``status`` is 1, with the columns ``doab``, ``url``, ``checked``,
    ``return_code`` and ``content_type``.
    """

    help = "dump the most recent check of each live link to doab_checks.csv"

    def add_arguments(self, parser):
        """Register the optional ``--outdir`` argument (defaults to the cwd)."""
        parser.add_argument('--outdir', nargs='?', type=str, default='', action="store",
                            help="output directory")

    def check_data(self, item):
        """Yield one row dict per live link of *item*.

        A fresh dict is produced for every link so that callers may safely
        collect the rows (e.g. ``list(self.check_data(item))``) without all
        of them aliasing the same, last-written, dict.  Links that have never
        been checked get empty strings in the check columns.
        """
        for link in item.links.filter(live=True):
            check = link.recent_check
            yield {
                'doab': item.doab,
                'url': link.url,
                'checked': check.created if check else '',
                'return_code': check.return_code if check else '',
                'content_type': check.content_type if check else '',
            }

    def handle(self, outdir, **options):
        """Write the CSV into *outdir* and report how many rows were written."""
        start_time = datetime.datetime.now()
        num = 0
        # ``--outdir`` given without a value arrives as None (nargs='?');
        # fall back to the current directory instead of crashing in join().
        filepath = os.path.join(outdir or '', 'doab_checks.csv')
        # Explicit encoding: URLs may contain non-ASCII characters and the
        # locale default is not reliable when run from cron.
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'doab', 'url', 'checked', 'return_code', 'content_type'])
            writer.writeheader()
            # NOTE(review): status=1 presumably means "active" -- confirm
            # against the Item model.
            for item in Item.objects.filter(status=1):
                for row in self.check_data(item):
                    writer.writerow(row)
                    # Count rows (link checks), not items, so the summary
                    # message below reports what it says it reports.
                    num += 1
        end_time = datetime.datetime.now()
        message = f'wrote {num} link checks in {end_time - start_time}'
        logger.info(message)
        self.stdout.write(message)

View File

@ -0,0 +1,58 @@
{% load static %}
<html>
<head>
<title>Fixing DOAB Links</title>
{% include "basestyle.html" %}
</head>
<body>
{% include "header.html" %}
<main class="section oapen-topic oapen-main">
<div class="container">
<div class="content">
<h1>
DOAB-Check Data
</h1>
<p>
Link-checking data is available from an API, on a per-book basis, and in bulk via download.
</p>
<h2>
API
</h2>
The API is largely self-explanatory. Requests are of the form:
<pre>
https://doab-check.ebookfoundation.org/api/doab/[the doab id]
</pre>
DOAB ids look like this:
<pre>
oai:doab-books:20.500.12854/NNNNN
</pre>
(you can omit the 'oai:doab-books:').
The response (JSON) looks like this:
<pre>
{
"doab": "oai:doab-books:20.500.12854/35337",
"status": "found",
"links": [
{"url": "http://library.oapen.org/handle/20.500.12657/32815",
"checked": "2023-10-01T04:05:08.816Z",
"return_code": 200,
"content_type": "html"},
{"url": "https://library.oapen.org/bitstream/20.500.12657/37516/1/604614.pdf",
"checked": "2023-10-01T04:05:09.900Z",
"return_code": 200,
"content_type": "pdf"}
]
}
</pre>
<h2>
Bulk download
</h2>
<p>
A zipped CSV file of all the active urls and the results of our most recent check is available at:<br>
<a href="/static/doab_checks.csv.zip">https://doab-check.ebookfoundation.org/static/doab_checks.csv.zip</a>. <br>This file is updated roughly once a day.
</p>
</div>
</div>
</main>
</body>
</html>

View File

@ -71,6 +71,9 @@ When a link is checked we record the status code and content type returned by th
</table>
</div>
</div>
<p>
The doab-check data is available for individual books <a href="{% url 'apihelp' %}">via an api</a>.
</p>
</main>
</body>
</html>

View File

@ -10,6 +10,7 @@ urlpatterns = [
path('', views.HomepageView.as_view(), name='home'),
path('admin/', admin.site.urls),
path('fixing/', TemplateView.as_view(template_name='fixing.html'), name='fixing'),
path('api/help/', TemplateView.as_view(template_name='api.html'), name='apihelp'),
path('problems/publishers/', views.ProblemPublishersView.as_view(), name='probpubs'),
path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'),
path('providers/', views.ProvidersView.as_view(), name='providers'),
@ -17,5 +18,6 @@ urlpatterns = [
path('publishers/', views.PublishersView.as_view(), name='publishers'),
re_path(r'publishers/(?P<publisher>.*)', views.PublisherView.as_view(), name='publisher'),
re_path(r'link/(?P<link_id>\d*)', views.LinkView.as_view(), name='link'),
re_path(r'api/doab/(?P<doab>.*)', views.link_api_view, name='link_api'),
]

View File

@ -2,10 +2,11 @@
"""
from django.db.models import Count, F
from django.http import HttpResponseRedirect
from django.http import HttpResponseRedirect, JsonResponse, Http404
from django.shortcuts import get_object_or_404, render
from django.urls import reverse
from django.views import generic
from django.views.decorators.csrf import csrf_exempt
from .models import Item, Link
@ -133,3 +134,34 @@ class PublisherView(generic.TemplateView):
return {'codes': codes, 'publisher': pub, 'count': link_count}
@csrf_exempt
def link_api_view(request, doab):
    """Return the latest check results for one DOAB item's live links as JSON.

    *doab* is taken from the URL.  It may be a full identifier starting with
    ``oai:doab-books:`` or the bare handle starting with ``20.500.12854``,
    in which case the prefix is added.

    The response always contains ``doab`` and ``status``:

    * ``invalid``   -- the identifier has neither accepted form;
    * ``not found`` -- no item has that identifier;
    * ``found``     -- ``links`` lists every live link with its ``url`` and,
      when the link has been checked, ``checked``, ``return_code`` and
      ``content_type``.
    """
    data = {'doab': doab}
    if not doab.startswith('oai:doab-books:'):
        if not doab.startswith('20.500.12854'):
            data['status'] = 'invalid'
            return JsonResponse(data)
        doab = 'oai:doab-books:' + doab
        data['doab'] = doab

    # QuerySet.get() raises Item.DoesNotExist (and MultipleObjectsReturned on
    # duplicates) instead of returning None, which turned every unknown id
    # into a server error.  filter().first() returns None when nothing
    # matches, so the 'not found' response is actually reachable.
    item = Item.objects.filter(doab=doab).first()
    if item is None:
        data['status'] = 'not found'
        return JsonResponse(data)

    data['status'] = 'found'
    links = []
    for link in item.links.filter(live=True):
        link_dict = {'url': link.url}
        check = link.recent_check
        if check:
            link_dict['checked'] = check.created
            link_dict['return_code'] = check.return_code
            link_dict['content_type'] = check.content_type
        links.append(link_dict)
    data['links'] = links
    return JsonResponse(data)

7
scripts/dump_checks.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
#
# Regenerate the bulk-download archive of link-check results.
#
# Runs the dump_checks management command (which writes doab_checks.csv into
# the project directory), zips the CSV, and publishes the archive as
# static/doab_checks.csv.zip -- the name the API help page links to.

# Stop at the first failure so a broken dump never replaces the published file.
set -e

cd /home/ubuntu/doab-check || exit 1
/home/ubuntu/.local/bin/pipenv run python manage.py dump_checks >> dump_checks.log

# Start from a clean archive; zip would otherwise update a leftover one.
rm -f doab_checks.csv.zip
# gzip would produce doab_checks.csv.gz, not a .zip; build a real zip archive.
zip -q doab_checks.csv.zip doab_checks.csv
mv doab_checks.csv.zip static/doab_checks.csv.zip