From 05d37163eaa1a37539cf3e58ea523286ca2405b3 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 7 Mar 2013 11:37:40 -0500 Subject: [PATCH] despam descriptions in db wasn't real happy with this: https://unglue.it/work/66938/ so I looked for other spammy descriptions in the db --- core/bookloader.py | 13 +++++++++++++ core/management/commands/despam_descriptions.py | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 core/management/commands/despam_descriptions.py diff --git a/core/bookloader.py b/core/bookloader.py index cbccb436..a14e224c 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -488,6 +488,17 @@ def merge_works(w1, w2, user=None): w2.delete() +def despam_description(description): + """ a lot of descriptions from openlibrary have free-book promotion text; this removes some of it.""" + if description.find("GeneralBooksClub.com")>-1 or description.find("AkashaPublishing.Com")>-1: + return "" + pieces=description.split("1stWorldLibrary.ORG -") + if len(pieces)>1: + return pieces[1] + pieces=description.split("a million books for free.") + if len(pieces)>1: + return pieces[1] + return description def add_openlibrary(work, hard_refresh = False): if (not hard_refresh) and work.openlibrary_lookup is not None: @@ -541,6 +552,7 @@ def add_openlibrary(work, hard_refresh = False): if isinstance(description,dict): if description.has_key('value'): description=description['value'] + description=despam_description(description) if not work.description or work.description.startswith('{') or len(description) > len(work.description): work.description = description work.save() @@ -710,3 +722,4 @@ def add_missing_isbn_to_editions(max_num=None, confirm=False): class LookupFailure(Exception): pass + diff --git a/core/management/commands/despam_descriptions.py b/core/management/commands/despam_descriptions.py new file mode 100644 index 00000000..7faa3cc0 --- /dev/null +++ b/core/management/commands/despam_descriptions.py @@ -0,0 +1,17 @@ + +from django.core.management.base import BaseCommand +from regluit.core import models, bookloader + +class Command(BaseCommand): + help = "check description db for free ebook spam" + + def handle(self, **options): + spam_strings=["1stWorldLibrary.ORG", "GeneralBooksClub.com", "million-books.com", "AkashaPublishing.Com"] + for spam_string in spam_strings: + qs=models.Work.objects.filter(description__icontains=spam_string) + print "Number of Works with %s in description: %s" % (spam_string, qs.count()) + + for work in qs: + work.description = bookloader.despam_description(work.description) + print "updating work %s" % work + bookloader.add_openlibrary(work, hard_refresh = True)