add simple api, data dumper
parent
5a0a602fed
commit
ff433edebd
|
@ -154,3 +154,6 @@ cython_debug/
|
|||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
*.zip
|
||||
*.csv
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
import csv
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from doab_check.models import Item
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Dump the most recent check of every live link to ``doab_checks.csv``.

    One CSV row is written per live link of each ``Item`` whose ``status``
    is 1. Links that have no recent check get empty ``checked``,
    ``return_code`` and ``content_type`` columns.
    """

    help = "dump the most recent check of each live link to doab_checks.csv"

    def add_arguments(self, parser):
        """Register the optional ``--outdir`` argument."""
        parser.add_argument('--outdir', nargs='?', type=str, default='', action="store",
                            help="output directory")

    def check_data(self, item):
        """Yield one CSV row (a dict) for each live link of ``item``.

        A fresh dict is built per link so that a caller which collects the
        rows (e.g. ``list(...)``) does not end up with N references to one
        repeatedly overwritten dict.
        """
        for link in item.links.filter(live=True):
            # Read recent_check once; it may not be a plain attribute.
            check = link.recent_check
            yield {
                'doab': item.doab,
                'url': link.url,
                'checked': check.created if check else '',
                'return_code': check.return_code if check else '',
                'content_type': check.content_type if check else '',
            }

    def handle(self, outdir, **options):
        """Write the CSV into ``outdir`` and report how many rows were written.

        ``outdir`` is ``None`` when ``--outdir`` is given without a value
        (``nargs='?'``); that is treated like the default, the current
        working directory.
        """
        start_time = datetime.datetime.now()
        num = 0
        filepath = os.path.join(outdir or '', 'doab_checks.csv')
        # newline='' is required by the csv module; the encoding is pinned so
        # the output does not depend on the locale of the cron environment.
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'doab', 'url', 'checked', 'return_code', 'content_type'])
            writer.writeheader()
            for item in Item.objects.filter(status=1):
                for row in self.check_data(item):
                    writer.writerow(row)
                    # Count rows (link checks), not items, to match the message.
                    num += 1

        end_time = datetime.datetime.now()
        message = f'wrote {num} link checks in {end_time - start_time}'
        logger.info(message)
        self.stdout.write(message)
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
{% load static %}
|
||||
<html>
|
||||
<head>
|
||||
<title>DOAB-Check Data</title>
|
||||
{% include "basestyle.html" %}
|
||||
</head>
|
||||
<body>
|
||||
{% include "header.html" %}
|
||||
<main class="section oapen-topic oapen-main">
|
||||
<div class="container">
|
||||
<div class="content">
|
||||
<h1>
|
||||
DOAB-Check Data
|
||||
</h1>
|
||||
<p>
|
||||
Link-checking data is available from an API, on a per-book basis, and in bulk via download.
|
||||
</p>
|
||||
<h2>
|
||||
API
|
||||
</h2>
|
||||
The API is pretty self-explanatory. Requests are of the form:
|
||||
<pre>
|
||||
https://doab-check.ebookfoundation.org/api/doab/[the doab id]
|
||||
</pre>
|
||||
DOAB ids look like this:
|
||||
<pre>
|
||||
oai:doab-books:20.500.12854/NNNNN
|
||||
</pre>
|
||||
(You can omit the 'oai:doab-books:' prefix.)
|
||||
The response (JSON) looks like this:
|
||||
<pre>
|
||||
{
|
||||
"doab": "oai:doab-books:20.500.12854/35337",
|
||||
"status": "found",
|
||||
"links": [
|
||||
{"url": "http://library.oapen.org/handle/20.500.12657/32815",
|
||||
"checked": "2023-10-01T04:05:08.816Z",
|
||||
"return_code": 200,
|
||||
"content_type": "html"},
|
||||
{"url": "https://library.oapen.org/bitstream/20.500.12657/37516/1/604614.pdf",
|
||||
"checked": "2023-10-01T04:05:09.900Z",
|
||||
"return_code": 200,
|
||||
"content_type": "pdf"}
|
||||
]
|
||||
}
|
||||
</pre>
|
||||
<h2>
|
||||
Bulk download
|
||||
</h2>
|
||||
<p>
|
||||
A zipped CSV file of all the active urls and the results of our most recent check is available at:<br>
|
||||
<a href="/static/doab_checks.csv.zip">https://doab-check.ebookfoundation.org/static/doab_checks.csv.zip</a>. <br>This file is updated roughly once a day.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
|
@ -71,6 +71,9 @@ When a link is checked we record the status code and content type returned by th
|
|||
</table>
|
||||
</div>
|
||||
</div>
|
||||
<p>
|
||||
The doab-check data is available for individual books <a href="{% url 'apihelp' %}">via an api</a>.
|
||||
</p>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
|
@ -10,6 +10,7 @@ urlpatterns = [
|
|||
path('', views.HomepageView.as_view(), name='home'),
|
||||
path('admin/', admin.site.urls),
|
||||
path('fixing/', TemplateView.as_view(template_name='fixing.html'), name='fixing'),
|
||||
path('api/help/', TemplateView.as_view(template_name='api.html'), name='apihelp'),
|
||||
path('problems/publishers/', views.ProblemPublishersView.as_view(), name='probpubs'),
|
||||
path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'),
|
||||
path('providers/', views.ProvidersView.as_view(), name='providers'),
|
||||
|
@ -17,5 +18,6 @@ urlpatterns = [
|
|||
path('publishers/', views.PublishersView.as_view(), name='publishers'),
|
||||
re_path(r'publishers/(?P<publisher>.*)', views.PublisherView.as_view(), name='publisher'),
|
||||
re_path(r'link/(?P<link_id>\d*)', views.LinkView.as_view(), name='link'),
|
||||
re_path(r'api/doab/(?P<doab>.*)', views.link_api_view, name='link_api'),
|
||||
|
||||
]
|
||||
|
|
|
@ -2,10 +2,11 @@
|
|||
"""
|
||||
|
||||
from django.db.models import Count, F
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.http import HttpResponseRedirect, JsonResponse, Http404
|
||||
from django.shortcuts import get_object_or_404, render
|
||||
from django.urls import reverse
|
||||
from django.views import generic
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
|
||||
from .models import Item, Link
|
||||
|
||||
|
@ -133,3 +134,34 @@ class PublisherView(generic.TemplateView):
|
|||
|
||||
return {'codes': codes, 'publisher': pub, 'count': link_count}
|
||||
|
||||
@csrf_exempt
def link_api_view(request, doab):
    """Return the live links of one DOAB item, with their latest check, as JSON.

    ``doab`` is the identifier captured from the URL. It may be given in full
    (``oai:doab-books:20.500.12854/NNNNN``) or without the
    ``oai:doab-books:`` prefix, which is then added.

    The response always carries ``doab`` and ``status``:

    * ``invalid``   -- the identifier has neither accepted form
    * ``not found`` -- no ``Item`` has that identifier
    * ``found``     -- ``links`` lists every live link; ``checked``,
      ``return_code`` and ``content_type`` are present only for links that
      have a recent check
    """
    data = {'doab': doab}
    if not doab.startswith('oai:doab-books:'):
        if not doab.startswith('20.500.12854'):
            data['status'] = 'invalid'
            return JsonResponse(data)
        doab = 'oai:doab-books:' + doab
        data['doab'] = doab

    # .get() raises Item.DoesNotExist instead of returning None, which made
    # the 'not found' branch unreachable and turned unknown ids into a 500.
    item = Item.objects.filter(doab=doab).first()
    if item is None:
        data['status'] = 'not found'
        return JsonResponse(data)

    data['status'] = 'found'
    links = []
    for link in item.links.filter(live=True):
        link_dict = {'url': link.url}
        # Read recent_check once; it may not be a plain attribute.
        check = link.recent_check
        if check:
            link_dict['checked'] = check.created
            link_dict['return_code'] = check.return_code
            link_dict['content_type'] = check.content_type
        links.append(link_dict)
    data['links'] = links
    return JsonResponse(data)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash
#
# Regenerate the bulk link-check download.
#
# The dump_checks management command writes doab_checks.csv into the current
# directory; the site links to /static/doab_checks.csv.zip, so the CSV is
# zipped under that name and moved into static/.

# Stop at the first failure so a failed cd or dump never publishes a stale or
# partial archive.
set -euo pipefail

cd /home/ubuntu/doab-check
/home/ubuntu/.local/bin/pipenv run python manage.py dump_checks >> dump_checks.log

# Build a real .zip (gzip would produce .gz) using the Python standard library,
# so no extra system package is needed.
/home/ubuntu/.local/bin/pipenv run python -m zipfile -c doab_checks.csv.zip doab_checks.csv

# The archive is built outside static/ and then moved into place, so the
# published file is swapped in one step.
mv doab_checks.csv.zip static/doab_checks.csv.zip
|
Loading…
Reference in New Issue