diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py
index 9c4d3a7d..49c8396a 100644
--- a/core/loaders/harvest.py
+++ b/core/loaders/harvest.py
@@ -489,6 +489,9 @@ def harvest_fahce(ebook):
 
 def harvest_cmp(ebook):
     def selector(doc):
+        objs = doc.select('.tab-content a.cmp_download_link[href]')
+        if (len({obj['href'] for obj in objs})) > 1:
+            return []
         return doc.select('a.cmp_download_link[href]')
     def dl(url):
         return url.replace('view', 'download') + '?inline=1'
diff --git a/core/management/commands/fix_cmp.py b/core/management/commands/fix_cmp.py
new file mode 100644
index 00000000..9c8684f5
--- /dev/null
+++ b/core/management/commands/fix_cmp.py
@@ -0,0 +1,35 @@
+from django.core.management.base import BaseCommand
+
+from regluit.core.models import EbookFile, Ebook
+from regluit.core.loaders.utils import get_soup
+
+class Command(BaseCommand):
+    help = "remove files harvested from tab-content links on uniagustiniana pages"
+
+    def handle(self, **options):
+        """
+        For each 'online' ebook of the provider, fetch its page and delete
+        every EbookFile whose source is one of the page's
+        '.tab-content a.cmp_download_link' hrefs, along with its stored
+        file and the ebook it points to.
+        """
+        prov = 'editorial.uniagustiniana.edu.co'
+        for ebook in Ebook.objects.filter(provider=prov, format='online'):
+            self.stdout.write(str(ebook.url))
+            doc = get_soup(ebook.url)
+            if not doc:
+                continue
+            objs = doc.select('.tab-content a.cmp_download_link[href]')
+            # a page may repeat a link; look each distinct href up once
+            for href in {obj['href'] for obj in objs}:
+                for ebf in EbookFile.objects.filter(source=href):
+                    bad_ebook = ebf.ebook
+                    try:
+                        ebf.file.delete()
+                    except Exception as e:
+                        # best effort: a failed file removal should not stop
+                        # the cleanup, but it should be visible
+                        self.stderr.write('file delete failed for %s: %s' % (href, e))
+                    ebf.delete()
+                    if bad_ebook is not None:
+                        bad_ebook.delete()