add problem links by publisher

main
eric 2023-05-09 15:19:06 -04:00
parent acaf8777bf
commit b69aeaeaa8
6 changed files with 78 additions and 4 deletions

View File

@ -21,7 +21,7 @@ We've grouped the links by the publisher's name and by the server url to make it
We have a few ways to view the results. We have a few ways to view the results.
<ul> <ul>
<li> <li>
View the links which appear to have <a href="#codes">problems</a>. View the links which appear to have problems <a href="{% url 'probpubs' %}">by publisher name</a>, or below, by the return code.
</li> </li>
<li> <li>
View <a href="{% url 'providers' %}">the list of servers we've checked</a>. View <a href="{% url 'providers' %}">the list of servers we've checked</a>.
@ -41,7 +41,7 @@ When a link is checked we record the status code and content type returned by th
<li>"404" means the link is broken - the resource is not found. <li>"404" means the link is broken - the resource is not found.
<li>"408" means the website didn't respond in a reasonable time. <li>"408" means the website didn't respond in a reasonable time.
<li>"500" means something has gone wrong at the website server. <li>"500" means something has gone wrong at the website server.
<li>"502" means a gateway error occurred. Some websites use load balancers or content distribution networks; if these gateways have a problem connecting with the sorce website, they send a 502 response. <li>"502" means a gateway error occurred. Some websites use load balancers or content distribution networks; if these gateways have a problem connecting with the source website, they send a 502 response.
<li>"503" means that a website couldn't be reached. This could happen because the server was too busy, under maintenance, or something else. Amazon's robot blocker returns 503 codes, so these must be checked manually. <li>"503" means that a website couldn't be reached. This could happen because the server was too busy, under maintenance, or something else. Amazon's robot blocker returns 503 codes, so these must be checked manually.
<li>"504" indicates that the server, while acting as a gateway or proxy did not get a response in time from an upstream server. <li>"504" indicates that the server, while acting as a gateway or proxy did not get a response in time from an upstream server.
<li>"511" indicates a problem with the security of the connection - most often an incomplete certificate. <li>"511" indicates a problem with the security of the connection - most often an incomplete certificate.

View File

@ -20,7 +20,7 @@ DOAB links with Status code: {{ code | default:'0 or None' }}
<ul> <ul>
{% for link in provider.links %} {% for link in provider.links %}
<li> <li>
<p><a href="{{link.url}}">{{link.url}}</a> ({{link.items.first.publisher_name}}) <p><a href="{{link.url}}">{{link.url}}</a> ({{link.items.first.publisher_name | default:'*** no publisher name ***'}})
<table> <table>
{% for check in link.recent_checks %} {% for check in link.recent_checks %}
<tr> <tr>

View File

@ -0,0 +1,53 @@
<html>
<head>
{# Expects `pubs`: an iterable of entries, each with `pub` (publisher name, may be empty) and `bad_links` (a queryset of links exposing `url` and `recent_checks`). #}
<title>DOAB Link Checking Problems by Publisher</title>
</head>
<body>
<h2>
DOAB Link Checking Problems by Publisher
</h2>
<h3>Problem Link Summary</h3>
{# Summary: one row per publisher, linking to that publisher's section further down the page. #}
<table>
    <tr>
        <th>Publisher</th>
        <th>Number</th>
    </tr>
    {% for pub in pubs %}
    <tr style="color:red">
        <td> <a href="#{{ pub.pub | default:'*** no publisher name ***'| urlencode}}">{{ pub.pub | default:'*** no publisher name ***' }}</a> </td>
        <td> {{ pub.bad_links.count }} </td>
    </tr>
    {% endfor %}
</table>
<h3>Problem links by publisher</h3>
<ul>
    {% for pub in pubs %}
    {# The id must be built exactly like the summary's href fragment so the in-page anchors resolve. #}
    <li id="{{ pub.pub | default:'*** no publisher name ***' | urlencode }}"><h4>{{ pub.pub | default:'*** no publisher name ***'}}</h4>
        {% for link in pub.bad_links.all %}
        {# One table per link; it is opened and closed inside the loop so the tags stay balanced. #}
        <table>
            <tr>
                <th>
                    <a href="{{ link.url }}">{{ link.url }}</a>
                </th>
            </tr>
            <tr>
                <td>
                    <table>
                        {% for check in link.recent_checks %}
                        <tr>
                            <td>{{ check.created }}:</td>
                            <td style="color:red">{{ check.return_code }}</td>
                            <td>{{ check.content_type }}</td>
                        </tr>
                        {% endfor %}
                    </table>
                </td>
            </tr>
        </table>
        {% endfor %}
    </li>
    {% endfor %}
</ul>
</body>
</html>

View File

@ -23,6 +23,8 @@ class PageTests(TestCase):
self.assertEqual(r.status_code, 200) self.assertEqual(r.status_code, 200)
r = self.client.get("/problems/404/") r = self.client.get("/problems/404/")
self.assertEqual(r.status_code, 200) self.assertEqual(r.status_code, 200)
r = self.client.get("/problems/publishers/")
self.assertEqual(r.status_code, 200)
sample_doab = 'oai:doab-books:20.500.12854/25850' sample_doab = 'oai:doab-books:20.500.12854/25850'

View File

@ -9,6 +9,7 @@ from . import views
urlpatterns = [ urlpatterns = [
path('', views.HomepageView.as_view(), name='home'), path('', views.HomepageView.as_view(), name='home'),
path('admin/', admin.site.urls), path('admin/', admin.site.urls),
path('problems/publishers/', views.ProblemPublishersView.as_view(), name='probpubs'),
path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'), path('problems/<str:code>/', views.ProblemsView.as_view(), name='problems'),
path('providers/', views.ProvidersView.as_view(), name='providers'), path('providers/', views.ProvidersView.as_view(), name='providers'),
path('providers/<str:provider>/', views.ProviderView.as_view(), name='provider'), path('providers/<str:provider>/', views.ProviderView.as_view(), name='provider'),

View File

@ -1,7 +1,7 @@
"""doab_check views """doab_check views
""" """
from django.db.models import Count from django.db.models import Count, OuterRef, Subquery
from django.http import HttpResponseRedirect from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404, render from django.shortcuts import get_object_or_404, render
from django.urls import reverse from django.urls import reverse
@ -69,6 +69,7 @@ class ProviderView(generic.TemplateView):
return {'provider': provider, 'links': provider_links, 'codes': codes} return {'provider': provider, 'links': provider_links, 'codes': codes}
class PublishersView(generic.TemplateView): class PublishersView(generic.TemplateView):
template_name = 'publishers.html' template_name = 'publishers.html'
@ -79,6 +80,23 @@ class PublishersView(generic.TemplateView):
publisher_name=publisher['publisher_name'], status=1).count() publisher_name=publisher['publisher_name'], status=1).count()
return {'publisher_list': publishers} return {'publisher_list': publishers}
class ProblemPublishersView(generic.TemplateView):
    """Show links whose recent check did not return 200, grouped by publisher."""
    template_name = 'probpubs.html'

    def get_context_data(self, **kwargs):
        """Build the template context.

        Returns a dict with one key, ``pubs``: a list of dicts ordered by
        publisher name. Each dict has ``pub`` (the annotated publisher name)
        and ``bad_links`` (a queryset of the problem links annotated with
        that name).
        """
        # Correlated subquery yielding a single publisher name per link.
        # NOTE(review): this matches ``items`` against the outer link's pk;
        # confirm against the models that this is the intended correlation.
        onepub = Link.objects.filter(items=OuterRef("pk"))[:1].values('items__publisher_name')

        # Problem links: those with a recent check whose return code is not 200.
        problinks = Link.objects.exclude(
            recent_check__isnull=True).exclude(
            recent_check__return_code__exact=200)
        probpubs = problinks.annotate(pub=onepub).order_by('pub')

        # Materialise the rows so the 'bad_links' entries added below live on
        # the very objects handed to the template, rather than depending on
        # the queryset's internal result cache.
        pubs = list(probpubs.values('pub').distinct())
        for publisher in pubs:
            publisher['bad_links'] = probpubs.filter(pub=publisher['pub'])
        return {'pubs': pubs}
class PublisherView(generic.TemplateView): class PublisherView(generic.TemplateView):
template_name = 'publisher.html' template_name = 'publisher.html'