[#113378215] prevent duplicate ebooks with the exact same metadata from being created by /api/loader/yaml
add dedupe_ebooks_with_same_urls.py command for deleting duplicate ebooks
parent
4792d03920
commit
45cb7d4eac
|
@ -867,7 +867,7 @@ def load_from_yaml(yaml_url):
|
|||
if metadata._version and not metadata._version.startswith('0.0.'):
|
||||
#there should be an ebook to link to!
|
||||
try:
|
||||
ebook= models.Ebook.objects.create(
|
||||
ebook= models.Ebook.objects.get_or_create(
|
||||
url=git_download_from_yaml_url(yaml_url,metadata._version,edition_name=metadata._edition ),
|
||||
provider='Github',
|
||||
rights = metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
from collections import defaultdict
|
||||
from django.core.management.base import BaseCommand
|
||||
from regluit.core.models import Ebook
|
||||
|
||||
def delete_newest_ebooks(ebooks):
    """
    Given a list of ebooks (presumably all sharing the same URL), delete
    every one except the ebook that was created first, printing a line for
    each deletion and a final line naming the survivor.

    Does nothing when the list is empty.
    """
    if not ebooks:
        # nothing to dedupe; avoid IndexError on the survivor lookup below
        return

    # Oldest first: by_age[0] survives, everything after it is a duplicate.
    by_age = sorted(ebooks, key=lambda ebook: ebook.created)

    for ebook in by_age[1:]:
        print("deleting ebook.id {}, edition.id {} work.id {}".format(
            ebook.id,
            ebook.edition.id,
            ebook.edition.work.id))
        ebook.delete()

    # BUG FIX: this previously reported ebooks[0] -- whatever happened to be
    # first in the *input* order -- as the kept ebook, which could name an
    # ebook that was just deleted. The actual survivor is the earliest-created
    # one, by_age[0].
    intact = by_age[0]
    print("leaving undeleted: ebook.id {}, edition.id {} work.id {}".format(
        intact.id,
        intact.edition.id,
        intact.edition.work.id
    ))
|
||||
|
||||
|
||||
class Command(BaseCommand):
    help = "delete redundant ebooks based on having the same URL"

    def handle(self, **options):
        # Bucket the active ebooks by their download URL; inactive ebooks
        # are ignored entirely when deduping.
        grouped = defaultdict(list)
        for ebook in Ebook.objects.filter(active=True):
            grouped[ebook.url].append(ebook)

        # Every URL that collected more than one ebook holds duplicates;
        # report the URL and count, then delete all but the oldest.
        for (url, dupes) in grouped.items():
            if len(dupes) <= 1:
                continue
            print (url, len(dupes))
            delete_newest_ebooks(dupes)
|
Loading…
Reference in New Issue