# regluit/core/management/commands/clean_db_strings.py
#
# Source-viewer metadata at time of capture: 64 lines, 3.0 KiB, Python

from __future__ import print_function

from django.core.management.base import BaseCommand
from django.db import IntegrityError, transaction

from regluit.core import models
from regluit.utils.text import sanitize_line, remove_badxml
class Command(BaseCommand):
    """Clean text fields stored on works, editions, authors and publisher names.

    Titles and names are passed through ``sanitize_line`` and work
    descriptions through ``remove_badxml``; a row is rewritten only when the
    helper returns something different from the stored value.  When a cleaned
    author or publisher name cannot be saved because of an ``IntegrityError``
    (treated as "a row with the cleaned name already exists"), the related
    objects are re-pointed at the existing row and the duplicate is deleted.
    """

    help = "clean work and edition titles, work descriptions, and author and publisher names"

    def handle(self, **options):
        """Run the four cleanup passes, printing a counter after each one."""
        work_titles_fixed, work_descriptions_fixed = self._clean_works()
        print("work_titles_fixed = {}".format(work_titles_fixed))
        print("work_descriptions_fixed = {}".format(work_descriptions_fixed))

        edition_titles_fixed = self._clean_editions()
        print("edition_titles_fixed = {}".format(edition_titles_fixed))

        author_names_fixed = self._clean_authors()
        print("author_names_fixed = {}".format(author_names_fixed))

        publisher_names_fixed = self._clean_publisher_names()
        print("publisher_names_fixed = {}".format(publisher_names_fixed))

    @staticmethod
    def _save_unique(instance):
        """Save ``instance``; return False if the database rejects the row.

        The save runs inside ``transaction.atomic()`` so that a caught
        ``IntegrityError`` is rolled back cleanly and later queries still
        work when this command is itself executed inside a transaction.
        """
        try:
            with transaction.atomic():
                instance.save()
        except IntegrityError:
            return False
        return True

    def _clean_works(self):
        """Clean work titles and descriptions.

        Returns a ``(titles_fixed, descriptions_fixed)`` tuple.
        """
        titles_fixed = descriptions_fixed = 0
        for work in models.Work.objects.all():
            changed = False

            clean_title = sanitize_line(work.title)
            if clean_title != work.title:
                work.title = clean_title
                titles_fixed += 1
                changed = True

            # Empty / missing descriptions are left untouched.
            if work.description:
                clean_description = remove_badxml(work.description)
                if clean_description != work.description:
                    work.description = clean_description
                    descriptions_fixed += 1
                    changed = True

            # One write per work, even when both fields were fixed.
            if changed:
                work.save()
        return titles_fixed, descriptions_fixed

    def _clean_editions(self):
        """Clean edition titles; return the number of editions rewritten."""
        fixed = 0
        for edition in models.Edition.objects.all():
            clean_title = sanitize_line(edition.title)
            if clean_title != edition.title:
                edition.title = clean_title
                edition.save()
                fixed += 1
        return fixed

    def _clean_authors(self):
        """Clean author names, merging authors whose cleaned name already exists.

        Returns the number of authors that were renamed or merged.
        """
        fixed = 0
        for author in models.Author.objects.all():
            clean_name = sanitize_line(author.name)
            if clean_name == author.name:
                continue
            author.name = clean_name
            if not self._save_unique(author):
                # duplicate entry: move the relators to the existing author
                # and drop this one, all-or-nothing.
                correct = models.Author.objects.get(name=clean_name)
                with transaction.atomic():
                    for relator in author.relator_set.all():
                        relator.author = correct
                        relator.save()
                    author.delete()
            fixed += 1
        return fixed

    def _clean_publisher_names(self):
        """Clean publisher names, merging names whose cleaned form already exists.

        Returns the number of publisher names that were renamed or merged.
        """
        fixed = 0
        for publishername in models.PublisherName.objects.all():
            clean_name = sanitize_line(publishername.name)
            if clean_name == publishername.name:
                continue
            publishername.name = clean_name
            if not self._save_unique(publishername):
                # duplicate entry: re-point editions and publishers at the
                # existing name and drop this one, all-or-nothing.
                correct = models.PublisherName.objects.get(name=clean_name)
                with transaction.atomic():
                    for edition in publishername.editions.all():
                        edition.publisher_name = correct
                        edition.save()
                    for publisher in publishername.key_publisher.all():
                        publisher.name = correct
                        publisher.save()
                    publishername.delete()
            fixed += 1
        return fixed