[#113378215] prevent duplicate ebooks with the exact same metadata from being created by /api/loader/yaml

add dedupe_ebooks_with_same_urls.py command for deleting duplicate ebooks
pull/1/head
Raymond Yee 2016-02-10 11:04:06 -08:00
parent 4792d03920
commit 45cb7d4eac
2 changed files with 39 additions and 1 deletion


@@ -867,7 +867,7 @@ def load_from_yaml(yaml_url):
     if metadata._version and not metadata._version.startswith('0.0.'):
         #there should be an ebook to link to!
         try:
-            ebook= models.Ebook.objects.create(
+            ebook= models.Ebook.objects.get_or_create(
                 url=git_download_from_yaml_url(yaml_url,metadata._version,edition_name=metadata._edition ),
                 provider='Github',
                 rights = metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
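
Note that get_or_create returns an (instance, created) tuple rather than a bare Ebook, so the result will likely need unpacking before ebook can be used as a model instance downstream. A minimal sketch of what the unpacked call might look like, assuming the surrounding code goes on to use ebook directly:

    ebook, created = models.Ebook.objects.get_or_create(
        url=git_download_from_yaml_url(yaml_url, metadata._version, edition_name=metadata._edition),
        provider='Github',
        rights=metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
    )
    # get_or_create only matches rows whose url, provider, and rights are all
    # identical; any difference in these fields still produces a new Ebook row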

core/management/commands/dedupe_ebooks_with_same_urls.py

@@ -0,0 +1,38 @@
from collections import defaultdict

from django.core.management.base import BaseCommand

from regluit.core.models import Ebook


def delete_newest_ebooks(ebooks):
    """
    given a list of ebooks (presumably with the same URL), delete all but the ebook that was created first
    """
    ebooks_oldest_first = sorted(ebooks, key=lambda ebook: ebook.created)
    for ebook in ebooks_oldest_first[1:]:
        print "deleting ebook.id {}, edition.id {} work.id {}".format(ebook.id,
                                                                      ebook.edition.id,
                                                                      ebook.edition.work.id)
        ebook.delete()
    # report the survivor: the first-created ebook, which is not necessarily ebooks[0]
    intact = ebooks_oldest_first[0]
    print "leaving undeleted: ebook.id {}, edition.id {} work.id {}".format(
        intact.id,
        intact.edition.id,
        intact.edition.work.id
    )


class Command(BaseCommand):
    help = "delete redundant ebooks based on having the same URL"

    def handle(self, **options):
        # aggregate ebooks by url; consider only active ebooks in deduping
        ebooks_by_url = defaultdict(list)
        for ebook in Ebook.objects.filter(active=True):
            ebooks_by_url[ebook.url].append(ebook)
        # look through the URLs, locating ones with more than one ebook
        for (url, ebooks) in ebooks_by_url.items():
            if len(ebooks) > 1:
                print (url, len(ebooks))
                delete_newest_ebooks(ebooks)
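
Since this is a standard Django management command (the class derives from BaseCommand), it would be invoked through manage.py; note that the active=True filter means inactive duplicates are never examined or deleted:

    # run from the project root; the command deletes rows as it goes,
    # so trying it against a staging database first is prudent
    python manage.py dedupe_ebooks_with_same_urls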