add loading from sitemap list

pull/91/head
eric 2018-02-20 13:35:00 -05:00
parent ad9523314d
commit b88d678058
2 changed files with 27 additions and 2 deletions

9
bookdata/sitemaps.txt Normal file
View File

@ -0,0 +1,9 @@
https://www.ubiquitypress.com/sitemap.xml
https://www.kriterium.se/sitemap.xml
https://oa.finlit.fi/sitemap.xml
https://www.humanities-map.net/sitemap.xml
https://oa.psupress.org/sitemap.xml
https://www.larcommons.net/sitemap.xml
https://www.uwestminsterpress.co.uk/sitemap.xml
https://www.stockholmuniversitypress.se/sitemap.xml
https://www.luminosoa.org/sitemap.xml

View File

@ -1,9 +1,10 @@
import os
from django.core.management.base import BaseCommand
from regluit.core.loaders import add_by_sitemap
class Command(BaseCommand):
help = "load books based on a website sitemap"
help = "load books based on a website sitemap; use url=all to load from sitemap list"
def add_arguments(self, parser):
# Positional arguments
@ -20,5 +21,20 @@ class Command(BaseCommand):
)
def handle(self, url, max=None, **options):
    """Load books from a single sitemap, or from every listed sitemap.

    url -- a sitemap URL, or the literal string 'all' to walk every
        non-blank line of bookdata/sitemaps.txt (located relative to
        this file).
    max -- forwarded to add_by_sitemap as ``maxnum``.  None means no
        budget is tracked across sitemaps.  With 'all', the budget is
        reduced by the number of items each sitemap call returns and
        the walk stops once it is used up.

    Prints the number of items returned by add_by_sitemap.
    """
    if url == 'all':
        file_name = "../../../bookdata/sitemaps.txt"
        command_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(command_dir, file_name)
        with open(file_path) as f:
            # Skip blank lines (e.g. a trailing newline) so an empty
            # string is never handed to the loader as a URL.
            sitemaps = [line.strip() for line in f if line.strip()]
        books = []
        remaining = max
        for sitemap in sitemaps:
            added = add_by_sitemap(sitemap, maxnum=remaining)
            books.extend(added)
            # Only do budget arithmetic when a limit was supplied;
            # ``None - int`` would raise TypeError.
            if remaining is not None:
                remaining -= len(added)
                # Stop as soon as the budget is spent, rather than
                # calling the loader again with maxnum=0.
                if remaining <= 0:
                    break
    else:
        books = add_by_sitemap(url, maxnum=max)
    # Single-argument parenthesised form prints the same text on
    # Python 2 and Python 3.
    print("loaded {} books".format(len(books)))