From b88d6780589d34c2428ab4ece3514ed54af86e90 Mon Sep 17 00:00:00 2001
From: eric
Date: Tue, 20 Feb 2018 13:35:00 -0500
Subject: [PATCH] add loading from sitemap list

---
 bookdata/sitemaps.txt                         |  9 +++++++++
 .../commands/load_books_from_sitemap.py       | 20 +++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 bookdata/sitemaps.txt

diff --git a/bookdata/sitemaps.txt b/bookdata/sitemaps.txt
new file mode 100644
index 00000000..b91f2814
--- /dev/null
+++ b/bookdata/sitemaps.txt
@@ -0,0 +1,9 @@
+https://www.ubiquitypress.com/sitemap.xml
+https://www.kriterium.se/sitemap.xml
+https://oa.finlit.fi/sitemap.xml
+https://www.humanities-map.net/sitemap.xml
+https://oa.psupress.org/sitemap.xml
+https://www.larcommons.net/sitemap.xml
+https://www.uwestminsterpress.co.uk/sitemap.xml
+https://www.stockholmuniversitypress.se/sitemap.xml
+https://www.luminosoa.org/sitemap.xml
\ No newline at end of file
diff --git a/core/management/commands/load_books_from_sitemap.py b/core/management/commands/load_books_from_sitemap.py
index 80b48f81..fbccd552 100644
--- a/core/management/commands/load_books_from_sitemap.py
+++ b/core/management/commands/load_books_from_sitemap.py
@@ -1,9 +1,10 @@
+import os
 from django.core.management.base import BaseCommand
 
 from regluit.core.loaders import add_by_sitemap
 
 class Command(BaseCommand):
-    help = "load books based on a website sitemap"
+    help = "load books based on a website sitemap; use url=all to load from sitemap list"
 
     def add_arguments(self, parser):
         # Positional arguments
@@ -20,5 +21,20 @@ class Command(BaseCommand):
         )
 
     def handle(self, url, max=None, **options):
-        books = add_by_sitemap(url, maxnum=max)
+        if url == 'all':  # load every sitemap listed in bookdata/sitemaps.txt
+            file_name = "../../../bookdata/sitemaps.txt"
+            command_dir = os.path.dirname(os.path.realpath(__file__))
+            file_path = os.path.join(command_dir, file_name)
+            with open(file_path) as f:
+                content = f.readlines()
+            books = []
+            for sitemap in content:
+                added = add_by_sitemap(sitemap.strip(), maxnum=max)
+                max = None if max is None else max - len(added)  # None = no limit
+                books = books + added
+                if max is not None and max <= 0:  # requested limit reached
+                    break
+        else:
+            books = add_by_sitemap(url, maxnum=max)
+
         print "loaded {} books".format(len(books))