add simple api, data dumper
parent
5a0a602fed
commit
ff433edebd
|
@ -154,3 +154,6 @@ cython_debug/
|
||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
*.zip
|
||||||
|
*.csv
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
import csv
|
||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from doab_check.models import Item
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that dumps link-check results to a CSV file.

    For every ``Item`` with ``status=1`` it writes one row per live link,
    holding the link's URL and the data of its most recent check, into
    ``doab_checks.csv`` inside the directory given by ``--outdir``.
    """

    help = "dump the most recent check of every live link to a CSV file"

    # Column order of the generated CSV; also the keys of each row dict.
    fieldnames = ['doab', 'url', 'checked', 'return_code', 'content_type']

    def add_arguments(self, parser):
        """Register the optional ``--outdir`` argument (defaults to '')."""
        parser.add_argument('--outdir', nargs='?', type=str, default='', action="store",
                            help="output directory")

    def check_data(self, item):
        """Yield one CSV row dict for each live link of ``item``.

        A fresh dict is built for every link so that callers may safely
        collect the rows. The check columns are empty strings when the
        link has no recent check.
        """
        for link in item.links.filter(live=True):
            check = link.recent_check
            yield {
                'doab': item.doab,
                'url': link.url,
                'checked': check.created if check else '',
                'return_code': check.return_code if check else '',
                'content_type': check.content_type if check else '',
            }

    def handle(self, outdir, **options):
        """Write the CSV file and report how many link rows were written."""
        start_time = datetime.datetime.now()
        num = 0
        # With nargs='?', a bare ``--outdir`` yields None; treat it like the
        # default so os.path.join does not fail.
        filepath = os.path.join(outdir or '', 'doab_checks.csv')
        # Explicit encoding keeps the output independent of the host locale.
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
            writer.writeheader()
            for item in Item.objects.filter(status=1):
                for row in self.check_data(item):
                    writer.writerow(row)
                    # Count rows, not items: the message reports link checks.
                    num += 1

        elapsed = datetime.datetime.now() - start_time
        logger.info('wrote %s link checks in %s', num, elapsed)
        self.stdout.write(f'wrote {num} link checks in {elapsed}')
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
{% load static %}
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Fixing DOAB Links</title>
|
||||||
|
{% include "basestyle.html" %}
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{% include "header.html" %}
|
||||||
|
<main class="section oapen-topic oapen-main">
|
||||||
|
<div class="container">
|
||||||
|
<div class="content">
|
||||||
|
<h1>
|
||||||
|
DOAB-Check Data
|
||||||
|
</h1>
|
||||||
|
<p>
|
||||||
|
Link checking data is available from an API, on a per-book basis, and in bulk via download.
|
||||||
|
</p>
|
||||||
|
<h2>
|
||||||
|
API
|
||||||
|
</h2>
|
||||||
|
The API is pretty self-explanatory. Requests are of the form:
|
||||||
|
<pre>
|
||||||
|
https://doab-check.ebookfoundation.org/api/doab/[the doab id]
|
||||||
|
</pre>
|
||||||
|
DOAB ids look like this:
|
||||||
|
<pre>
|
||||||
|
oai:doab-books:20.500.12854/NNNNN
|
||||||
|
</pre>
|
||||||
|
(you can omit the 'oai:doab-books:').
|
||||||
|
The response (JSON) looks like this:
|
||||||
|
<pre>
|
||||||
|
{
|
||||||
|
"doab": "oai:doab-books:20.500.12854/35337",
|
||||||
|
"status": "found",
|
||||||
|
"links": [
|
||||||
|
{"url": "http://library.oapen.org/handle/20.500.12657/32815",
|
||||||
|
"checked": "2023-10-01T04:05:08.816Z",
|
||||||
|
"return_code": 200,
|
||||||
|
"content_type": "html"},
|
||||||
|
{"url": "https://library.oapen.org/bitstream/20.500.12657/37516/1/604614.pdf",
|
||||||
|
"checked": "2023-10-01T04:05:09.900Z",
|
||||||
|
"return_code": 200,
|
||||||
|
"content_type": "pdf"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
<h2>
|
||||||
|
Bulk download
|
||||||
|
</h2>
|
||||||
|
<p>
|
||||||
|
A zipped CSV file of all the active urls and the results of our most recent check is available at:<br>
|
||||||
|
<a href="/static/doab_checks.csv.zip">https://doab-check.ebookfoundation.org/static/doab_checks.csv.zip</a>. <br>This file is updated roughly once a day.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -71,6 +71,9 @@ When a link is checked we record the status code and content type returned by th
|
||||||
</table>
|
</table>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<p>
|
||||||
|
The doab-check data is available for individual books <a href="{% url 'apihelp' %}">via an API</a>.
|
||||||
|
</p>
|
||||||
</main>
|
</main>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
|
@ -10,6 +10,7 @@ urlpatterns = [
|
||||||
path('', views.HomepageView.as_view(), name='home'),
|
path('', views.HomepageView.as_view(), name='home'),
|
||||||
path('admin/', admin.site.urls),
|
path('admin/', admin.site.urls),
|
||||||
path('fixing/', TemplateView.as_view(template_name='fixing.html'), name='fixing'),
|
path('fixing/', TemplateView.as_view(template_name='fixing.html'), name='fixing'),
|
||||||
|
path('api/help/', TemplateView.as_view(template_name='api.html'), name='apihelp'),
|
||||||
path('problems/publishers/', views.ProblemPublishersView.as_view(), name='probpubs'),
|
path('problems/publishers/', views.ProblemPublishersView.as_view(), name='probpubs'),
|
||||||
path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'),
|
path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'),
|
||||||
path('providers/', views.ProvidersView.as_view(), name='providers'),
|
path('providers/', views.ProvidersView.as_view(), name='providers'),
|
||||||
|
@ -17,5 +18,6 @@ urlpatterns = [
|
||||||
path('publishers/', views.PublishersView.as_view(), name='publishers'),
|
path('publishers/', views.PublishersView.as_view(), name='publishers'),
|
||||||
re_path(r'publishers/(?P<publisher>.*)', views.PublisherView.as_view(), name='publisher'),
|
re_path(r'publishers/(?P<publisher>.*)', views.PublisherView.as_view(), name='publisher'),
|
||||||
re_path(r'link/(?P<link_id>\d*)', views.LinkView.as_view(), name='link'),
|
re_path(r'link/(?P<link_id>\d*)', views.LinkView.as_view(), name='link'),
|
||||||
|
re_path(r'api/doab/(?P<doab>.*)', views.link_api_view, name='link_api'),
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,10 +2,11 @@
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from django.db.models import Count, F
|
from django.db.models import Count, F
|
||||||
from django.http import HttpResponseRedirect
|
from django.http import HttpResponseRedirect, JsonResponse, Http404
|
||||||
from django.shortcuts import get_object_or_404, render
|
from django.shortcuts import get_object_or_404, render
|
||||||
from django.urls import reverse
|
from django.urls import reverse
|
||||||
from django.views import generic
|
from django.views import generic
|
||||||
|
from django.views.decorators.csrf import csrf_exempt
|
||||||
|
|
||||||
from .models import Item, Link
|
from .models import Item, Link
|
||||||
|
|
||||||
|
@ -133,3 +134,34 @@ class PublisherView(generic.TemplateView):
|
||||||
|
|
||||||
return {'codes': codes, 'publisher': pub, 'count': link_count}
|
return {'codes': codes, 'publisher': pub, 'count': link_count}
|
||||||
|
|
||||||
|
@csrf_exempt
def link_api_view(request, doab):
    """Return the live links of one DOAB item, with their latest check, as JSON.

    ``doab`` is the identifier captured from the URL. It may be given in
    full (``oai:doab-books:20.500.12854/NNNNN``) or without the
    ``oai:doab-books:`` prefix, which is then added.

    The response always contains ``doab`` and ``status``. ``status`` is
    ``'invalid'`` for an identifier of neither form, ``'not found'`` when no
    item has that identifier, and ``'found'`` otherwise. A found item also
    has ``links``: one entry per live link with its ``url`` and, when a
    recent check exists, ``checked``, ``return_code`` and ``content_type``.
    """
    data = {'doab': doab}
    if not doab.startswith('oai:doab-books:'):
        if not doab.startswith('20.500.12854'):
            data['status'] = 'invalid'
            return JsonResponse(data)
        # Short form: normalise to the full identifier before the lookup.
        doab = 'oai:doab-books:' + doab
        data['doab'] = doab

    # QuerySet.get() raises DoesNotExist rather than returning None, so the
    # 'not found' response could never be produced; first() returns None.
    item = Item.objects.filter(doab=doab).first()
    if item is None:
        data['status'] = 'not found'
        return JsonResponse(data)

    data['status'] = 'found'
    links = []
    for link in item.links.filter(live=True):
        link_dict = {'url': link.url}
        check = link.recent_check
        if check:
            link_dict['checked'] = check.created
            link_dict['return_code'] = check.return_code
            link_dict['content_type'] = check.content_type
        links.append(link_dict)
    data['links'] = links
    return JsonResponse(data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/bash
#
# Regenerate the bulk link-check download.
#
# Runs the dump_checks management command, zips the CSV it produces and
# moves the archive into static/ under the name the API help page links to
# (/static/doab_checks.csv.zip).
# NOTE(review): assumes dump_checks writes doab_checks.csv into the current
# directory (its default when --outdir is not given) - confirm.

# Stop if the project directory is missing, so nothing runs elsewhere.
cd /home/ubuntu/doab-check || exit 1

/home/ubuntu/.local/bin/pipenv run python manage.py dump_checks >> dump_checks.log || exit 1

# gzip would create a .gz file; the published download is a .zip archive.
zip -q doab_checks.csv.zip doab_checks.csv || exit 1
mv doab_checks.csv.zip static/doab_checks.csv.zip
|
Loading…
Reference in New Issue