From fc2204a0fdd9a1bc70ee1bef3261c849497dd951 Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 1 Aug 2020 22:12:25 -0400 Subject: [PATCH 1/5] make sure cmp is not just chapters --- core/loaders/harvest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 9c4d3a7d..2b03326c 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -489,6 +489,9 @@ def harvest_fahce(ebook): def harvest_cmp(ebook): def selector(doc): + objs = doc.select('.tab-content a.cmp_download_link[href]') + if (len({obj['href'] for obj in objs})) > 1: + return [] return doc.select('a.cmp_download_link[href]') def dl(url): return url.replace('view', 'download') + '?inline=1' From 4839dc549a54590ede603037f43088de409dc544 Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 1 Aug 2020 22:12:43 -0400 Subject: [PATCH 2/5] fix cmp --- core/management/commands/fix_cmp.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 core/management/commands/fix_cmp.py diff --git a/core/management/commands/fix_cmp.py b/core/management/commands/fix_cmp.py new file mode 100644 index 00000000..9cbe9969 --- /dev/null +++ b/core/management/commands/fix_cmp.py @@ -0,0 +1,22 @@ +from django.core.management.base import BaseCommand + +from regluit.core.models import EbookFile + +class Command(BaseCommand): + + def handle(self, **options): + prov = 'editorial.uniagustiniana.edu.co' + for ebook in Ebook.objects.filter(provider=prov, format='online'): + print(ebook.url) + doc = get_soup(ebook.url) + if doc: + objs = doc.select('.tab-content a.cmp_download_link[href]') + for obj in objs: + for ebf in EbookFile.objects.filter(source=ebook.url): + bad_ebook = ebf.ebook + try: + ebf.file.delete() + except: + pass + ebf.delete() + bad_ebook.delete() From c5787bdd6e37091e3ce5089278136eef08d86a68 Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 1 Aug 2020 22:22:07 -0400 Subject: [PATCH 3/5] fix import --- core/management/commands/fix_cmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/management/commands/fix_cmp.py b/core/management/commands/fix_cmp.py index 9cbe9969..2eb3c835 100644 --- a/core/management/commands/fix_cmp.py +++ b/core/management/commands/fix_cmp.py @@ -1,6 +1,6 @@ from django.core.management.base import BaseCommand -from regluit.core.models import EbookFile +from regluit.core.models import EbookFile, Ebookfile class Command(BaseCommand): From 7981b08abe5caabfa890daf80a0cc8d20f9b0bff Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 1 Aug 2020 22:43:31 -0400 Subject: [PATCH 4/5] fix --- core/management/commands/fix_cmp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/management/commands/fix_cmp.py b/core/management/commands/fix_cmp.py index 2eb3c835..9c8684f5 100644 --- a/core/management/commands/fix_cmp.py +++ b/core/management/commands/fix_cmp.py @@ -1,6 +1,7 @@ from django.core.management.base import BaseCommand -from regluit.core.models import EbookFile, Ebookfile +from regluit.core.models import EbookFile, Ebook +from regluit.core.loaders.utils import get_soup class Command(BaseCommand): @@ -12,7 +13,7 @@ class Command(BaseCommand): if doc: objs = doc.select('.tab-content a.cmp_download_link[href]') for obj in objs: - for ebf in EbookFile.objects.filter(source=ebook.url): + for ebf in EbookFile.objects.filter(source=obj['href']): bad_ebook = ebf.ebook try: ebf.file.delete() From c63e20c3fe3b37d7fd355b1431cb038e78eb2b2c Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 1 Aug 2020 22:50:55 -0400 Subject: [PATCH 5/5] fix --- core/loaders/harvest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index 2b03326c..49c8396a 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -489,7 +489,7 @@ def harvest_fahce(ebook): def harvest_cmp(ebook): def selector(doc): - objs = doc.select('.tab-content a.cmp_download_link[href]') + objs = doc.select('.tab-content a.cmp_download_link[href]') if (len({obj['href'] for obj in objs})) > 1: return [] return doc.select('a.cmp_download_link[href]')