despam descriptions in db

I wasn't really happy with this: https://unglue.it/work/66938/ so I looked
for other spammy descriptions in the db.
pull/1/head
eric 2013-03-07 11:37:40 -05:00
parent 14174ecb17
commit 05d37163ea
2 changed files with 30 additions and 0 deletions

View File

@ -488,6 +488,17 @@ def merge_works(w1, w2, user=None):
w2.delete()
def despam_description(description):
    """Remove known free-book promotional text from a description.

    A lot of descriptions from openlibrary carry free-book promotion text;
    this removes some of it. All matching is case-sensitive.

    Args:
        description: the description text. May be None or empty, in which
            case it is returned unchanged.

    Returns:
        - "" if the text mentions "GeneralBooksClub.com" or
          "AkashaPublishing.Com" (the whole description is treated as spam);
        - otherwise, the text that follows the first occurrence of a known
          promotional lead-in ("1stWorldLibrary.ORG -", then
          "a million books for free."), up to the next occurrence of that
          same lead-in if there is one;
        - otherwise the description unchanged.
    """
    # Nothing to clean. This also keeps a missing description (None) from
    # raising AttributeError on the string operations below.
    if not description:
        return description

    # Descriptions mentioning these sources are discarded entirely.
    for spam_source in ("GeneralBooksClub.com", "AkashaPublishing.Com"):
        if spam_source in description:
            return ""

    # For these, the promotion precedes the marker; keep what follows it.
    # Order matters: the first marker that is present wins.
    for lead_in in ("1stWorldLibrary.ORG -", "a million books for free."):
        pieces = description.split(lead_in)
        if len(pieces) > 1:
            return pieces[1]

    return description
def add_openlibrary(work, hard_refresh = False):
if (not hard_refresh) and work.openlibrary_lookup is not None:
@ -541,6 +552,7 @@ def add_openlibrary(work, hard_refresh = False):
if isinstance(description,dict):
if description.has_key('value'):
description=description['value']
description=despam_description(description)
if not work.description or work.description.startswith('{') or len(description) > len(work.description):
work.description = description
work.save()
@ -710,3 +722,4 @@ def add_missing_isbn_to_editions(max_num=None, confirm=False):
class LookupFailure(Exception):
    """Exception subclass signalling a lookup failure.

    Adds nothing to Exception; the places that raise it are outside this
    excerpt.
    """

View File

@ -0,0 +1,17 @@
from django.core.management.base import BaseCommand
from regluit.core import models, bookloader
class Command(BaseCommand):
    """Management command: find Works whose description contains a known
    spam marker, clean the description, and refresh the Work from
    openlibrary."""

    help = "check description db for free ebook spam"

    def handle(self, **options):
        """Run the cleanup.

        For each spam marker, prints how many Works contain it, then for
        every matching Work replaces its description with the result of
        bookloader.despam_description() and calls
        bookloader.add_openlibrary() with hard_refresh=True.
        """
        markers = (
            "1stWorldLibrary.ORG",
            "GeneralBooksClub.com",
            "million-books.com",
            "AkashaPublishing.Com",
        )
        for marker in markers:
            # icontains is a case-insensitive substring match.
            matching_works = models.Work.objects.filter(description__icontains=marker)
            # Single-argument parenthesised print: same output under the
            # Python 2 print statement this file uses.
            print("Number of Works with %s in description: %s" % (marker, matching_works.count()))

            for work in matching_works:
                cleaned = bookloader.despam_description(work.description)
                work.description = cleaned
                print("updating work %s" % work)
                bookloader.add_openlibrary(work, hard_refresh=True)