[#113378215] prevent duplicate ebooks with the exact same metadata from being created by /api/loader/yaml

add dedupe_ebooks_with_same_urls.py command for deleting duplicate ebooks
pull/1/head
Raymond Yee 2016-02-10 11:04:06 -08:00
parent 4792d03920
commit 45cb7d4eac
2 changed files with 39 additions and 1 deletion


@@ -867,7 +867,7 @@ def load_from_yaml(yaml_url):
     if metadata._version and not metadata._version.startswith('0.0.'):
         #there should be an ebook to link to!
         try:
-            ebook= models.Ebook.objects.create(
+            ebook= models.Ebook.objects.get_or_create(
                 url=git_download_from_yaml_url(yaml_url,metadata._version,edition_name=metadata._edition ),
                 provider='Github',
                 rights = metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
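
Note that get_or_create returns an (instance, created) tuple rather than a bare Ebook, so the result will likely need unpacking before ebook can be used as a model instance downstream. A minimal sketch of what the unpacked call might look like, assuming the surrounding code goes on to use ebook directly:

    ebook, created = models.Ebook.objects.get_or_create(
        url=git_download_from_yaml_url(yaml_url, metadata._version, edition_name=metadata._edition),
        provider='Github',
        rights=metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
    )
    # get_or_create only matches rows whose url, provider, and rights are all
    # identical; any difference in these fields still produces a new Ebook row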

core/management/commands/dedupe_ebooks_with_same_urls.py

@@ -0,0 +1,38 @@
from collections import defaultdict

from django.core.management.base import BaseCommand

from regluit.core.models import Ebook


def delete_newest_ebooks(ebooks):
    """
    given a list of ebooks (presumably with the same URL), delete all but the ebook that was created first
    """
    ebooks_oldest_first = sorted(ebooks, key=lambda ebook: ebook.created)
    for ebook in ebooks_oldest_first[1:]:
        print "deleting ebook.id {}, edition.id {} work.id {}".format(ebook.id,
                                                                      ebook.edition.id,
                                                                      ebook.edition.work.id)
        ebook.delete()
    # report the survivor: the first-created ebook, which is not necessarily ebooks[0]
    intact = ebooks_oldest_first[0]
    print "leaving undeleted: ebook.id {}, edition.id {} work.id {}".format(
        intact.id,
        intact.edition.id,
        intact.edition.work.id
    )


class Command(BaseCommand):
    help = "delete redundant ebooks based on having the same URL"

    def handle(self, **options):
        # aggregate ebooks by url; consider only active ebooks in deduping
        ebooks_by_url = defaultdict(list)
        for ebook in Ebook.objects.filter(active=True):
            ebooks_by_url[ebook.url].append(ebook)
        # look through the URLs, locating ones with more than one ebook
        for (url, ebooks) in ebooks_by_url.items():
            if len(ebooks) > 1:
                print (url, len(ebooks))
                delete_newest_ebooks(ebooks)
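
Since this is a standard Django management command (the class derives from BaseCommand), it would be invoked through manage.py; note that the active=True filter means inactive duplicates are never examined or deleted:

    # run from the project root; the command deletes rows as it goes,
    # so trying it against a staging database first is prudent
    python manage.py dedupe_ebooks_with_same_urls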