diff --git a/api/onix.py b/api/onix.py index 1aba6b76..8b84c1fc 100644 --- a/api/onix.py +++ b/api/onix.py @@ -33,7 +33,9 @@ def onix_feed_for_work(work): feed = etree.fromstring(feed_xml) feed.append(header(work)) for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct(): - feed.append(product(edition)) + edition_prod = product(edition) + if edition_prod: + feed.append(edition_prod) return etree.tostring(feed, pretty_print=True) def header(facet=None): diff --git a/bisac/__init__.py b/bisac/__init__.py index 08e027a4..554feb6d 100644 --- a/bisac/__init__.py +++ b/bisac/__init__.py @@ -23123,5 +23123,11 @@ bisac= { "pref_label": "Political Science / Terrorism", "notation": "POL037000", "alt_label": [] - } + }, + "History / Europe / Greece": { + "related": [], + "pref_label": "History / Europe / Greece", + "notation": "HIS042000", + "alt_label": [] + }, } \ No newline at end of file diff --git a/core/bookloader.py b/core/bookloader.py index f161c6a1..c45fe541 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -38,7 +38,7 @@ from . import cc from . 
import models from .parameters import WORK_IDENTIFIERS from .validation import identifier_cleaner -from .loaders.scrape import BaseScraper +from .loaders.scrape import BaseScraper, scrape_sitemap logger = logging.getLogger(__name__) request_log = logging.getLogger("requests") @@ -413,7 +413,6 @@ def relate_isbn(isbn, cluster_size=1): elif related_edition.work.id != edition.work.id: logger.debug("merge_works path 1 %s %s", edition.work.id, related_edition.work.id ) merge_works(related_edition.work, edition.work) - if related_edition.work.editions.count()>cluster_size: return related_edition.work return edition.work @@ -452,7 +451,7 @@ def add_related(isbn): related_edition.save() elif related_edition.work.id != work.id: logger.debug("merge_works path 1 %s %s", work.id, related_edition.work.id ) - merge_works(work, related_edition.work) + work = merge_works(work, related_edition.work) else: if other_editions.has_key(related_language): other_editions[related_language].append(related_edition) @@ -469,12 +468,15 @@ def add_related(isbn): works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work]) for w in works_to_merge: logger.debug("merge_works path 2 %s %s", lang_edition.work.id, w.id ) - merge_works(lang_edition.work, w) - models.WorkRelation.objects.get_or_create(to_work=lang_edition.work, from_work=work, relation='translation') + merged_work = merge_works(lang_edition.work, w) + models.WorkRelation.objects.get_or_create( + to_work=lang_group[0].work, + from_work=work, + relation='translation' + ) return new_editions - def thingisbn(isbn): """given an ISBN return a list of related edition ISBNs, according to Library Thing. 
(takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns, which come back as isbn_13') @@ -492,7 +494,7 @@ def merge_works(w1, w2, user=None): logger.info("merging work %s into %s", w2.id, w1.id) # don't merge if the works are the same or at least one of the works has no id (for example, when w2 has already been deleted) if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None: - return + return w1 if w2.selected_edition != None and w1.selected_edition == None: #the merge should be reversed temp = w1 @@ -546,9 +548,14 @@ def merge_works(w1, w2, user=None): for subject in w2.subjects.all(): if subject not in w1.subjects.all(): w1.subjects.add(subject) - - + for work_relation in w2.works_related_to.all(): + work_relation.to_work = w1 + work_relation.save() + for work_relation in w2.works_related_from.all(): + work_relation.from_work = w1 + work_relation.save() w2.delete() + return w1 def detach_edition(e): """will detach edition from its work, creating a new stub work. if remerge=true, will see if there's another work to attach to @@ -776,7 +783,7 @@ def load_from_yaml(yaml_url, test_mode=False): return edition.work.id if edition else None def edition_for_ident(id_type, id_value): - print 'returning edition for {}: {}'.format(id_type, id_value) + #print 'returning edition for {}: {}'.format(id_type, id_value) for ident in models.Identifier.objects.filter(type=id_type, value=id_value): return ident.edition if ident.edition else ident.work.editions[0] @@ -844,7 +851,15 @@ class BasePandataLoader(object): value = value[0] if isinstance(value, list) else value try: id = models.Identifier.objects.get(type=id_code, value=value) - work = id.work + if work and id.work and id.work.id != work.id: + # dangerous! 
merge newer into older + if work.id < id.work.id: + merge_works(work, id.work) + else: + merge_works(id.work, work) + work = id.work + else: + work = id.work if id.edition and not edition: edition = id.edition except models.Identifier.DoesNotExist: @@ -1045,5 +1060,21 @@ def add_by_webpage(url, work=None, user=None): loader.load_ebooks(pandata, edition, user=user) return edition if edition else None +def add_by_sitemap(url, maxnum=None): + editions = [] + scraper = BaseScraper(url) + for bookdata in scrape_sitemap(url, maxnum=maxnum): + edition = work = None + loader = BasePandataLoader(bookdata.base) + pandata = Pandata() + pandata.metadata = bookdata.metadata + for metadata in pandata.get_edition_list(): + edition = loader.load_from_pandata(metadata, work) + work = edition.work + loader.load_ebooks(pandata, edition) + if edition: + editions.append(edition) + return editions + diff --git a/core/loaders/doab.py b/core/loaders/doab.py index 6127d0c0..3bf70bfd 100644 --- a/core/loaders/doab.py +++ b/core/loaders/doab.py @@ -130,9 +130,9 @@ def add_all_isbns(isbns, work, language=None, title=None): first_edition = first_edition if first_edition else edition if work and (edition.work.id != work.id): if work.created < edition.work.created: - merge_works(work, edition.work) + work = merge_works(work, edition.work) else: - merge_works(edition.work, work) + work = merge_works(edition.work, work) else: work = edition.work return first_edition diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py index 7c690690..ac757266 100644 --- a/core/loaders/scrape.py +++ b/core/loaders/scrape.py @@ -15,6 +15,9 @@ CONTAINS_COVER = re.compile('cover') CONTAINS_CC = re.compile('creativecommons.org') class BaseScraper(object): + ''' + designed to make at least a decent guess for webpages that embed metadata + ''' def __init__(self, url): self.metadata = {} self.identifiers = {'http': url} @@ -24,6 +27,7 @@ class BaseScraper(object): response = requests.get(url, headers={"User-Agent": 
settings.USER_AGENT}) if response.status_code == 200: self.doc = BeautifulSoup(response.content, 'lxml') + self.get_genre() self.get_title() self.get_language() self.get_description() @@ -41,7 +45,7 @@ class BaseScraper(object): self.set('language', 'en') except requests.exceptions.RequestException as e: logger.error(e) - self.metadata = None + self.metadata = {} self.metadata['identifiers'] = self.identifiers def set(self, name, value): @@ -75,18 +79,28 @@ class BaseScraper(object): return value return value + def get_genre(self): + value = self.check_metas(['DC.Type', 'dc.type', 'og:type']) + if value and value in ('Text.Book', 'book'): + self.set('genre', 'book') + def get_title(self): - value = self.check_metas(['DC.Title','dc.title', 'citation_title', 'title']) + value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'title']) if not value: value = self.fetch_one_el_content('title') self.set('title', value) def get_language(self): - value = self.check_metas(['DC.Language','dc.language','language']) + value = self.check_metas(['DC.Language', 'dc.language', 'language']) self.set('language', value) def get_description(self): - value = self.check_metas(['DC.Description','dc.description','description']) + value = self.check_metas([ + 'DC.Description', + 'dc.description', + 'og:description', + 'description' + ]) self.set('description', value) def get_identifiers(self): @@ -100,7 +114,7 @@ class BaseScraper(object): self.identifiers['doi'] = value isbns = {} label_map = {'epub': 'EPUB', 'mobi': 'Mobi', - 'paper': 'Paperback', 'pdf': 'PDF', 'hard':'Hardback'} + 'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'} for key in label_map.keys(): isbn_key = 'isbn_{}'.format(key) value = self.check_metas(['citation_isbn'], type=label_map[key]) @@ -126,7 +140,7 @@ class BaseScraper(object): if isbn: ed_list.append({ '_edition': isbn, - 'edition_identifiers': {'isbn': isbn} + 'edition_identifiers': {'isbn':isbn} }) if len(ed_list): self.set('edition_list', 
ed_list) @@ -147,7 +161,10 @@ class BaseScraper(object): self.set('publication_date', value) def get_authors(self): - value_list = self.check_metas(['DC.Creator.PersonalName', 'citation_author',], list_mode='list') + value_list = self.check_metas([ + 'DC.Creator.PersonalName', + 'citation_author', + ], list_mode='list') if not value_list: return if len(value_list) == 1: @@ -161,13 +178,17 @@ class BaseScraper(object): self.set('creator', creator) def get_cover(self): - block = self.doc.find(class_=CONTAINS_COVER) - block = block if block else self.doc - img = block.find_all('img', src=CONTAINS_COVER) - if img: - cover_uri = img[0].get('src', None) - if cover_uri: - self.set('covers', [{'image_url': urljoin(self.base, cover_uri)}]) + image_url = self.check_metas(['og:image']) + if not image_url: + block = self.doc.find(class_=CONTAINS_COVER) + block = block if block else self.doc + img = block.find_all('img', src=CONTAINS_COVER) + if img: + cover_uri = img[0].get('src', None) + if cover_uri: + image_url = urljoin(self.base, cover_uri) + if image_url: + self.set('covers', [{'image_url': image_url}]) def get_downloads(self): for dl_type in ['epub', 'mobi', 'pdf']: @@ -181,3 +202,14 @@ class BaseScraper(object): links = self.doc.find_all(href=CONTAINS_CC) for link in links: self.set('rights_url', link['href']) + +def scrape_sitemap(url, maxnum=None): + try: + response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) + doc = BeautifulSoup(response.content, 'lxml') + for page in doc.find_all('loc')[0:maxnum]: + scraper = BaseScraper(page.text) + if scraper.metadata.get('genre', None) == 'book': + yield scraper + except requests.exceptions.RequestException as e: + logger.error(e) diff --git a/core/loaders/utils.py b/core/loaders/utils.py index 68607d5f..1357bbd2 100644 --- a/core/loaders/utils.py +++ b/core/loaders/utils.py @@ -220,9 +220,7 @@ def load_from_books(books): for isbn in isbns: edition = add_by_isbn_from_google(isbn, work=work) if edition and 
edition.work != work: - merge_works(work, edition.work) - work = work if work.pk is not None else edition.work - edition.work=work # because integrity errors if not + work = merge_works(work, edition.work) if not edition: edition= Edition(title=title, work=work) edition.save() diff --git a/core/management/commands/load_books_from_sitemap.py b/core/management/commands/load_books_from_sitemap.py new file mode 100644 index 00000000..31b9b130 --- /dev/null +++ b/core/management/commands/load_books_from_sitemap.py @@ -0,0 +1,24 @@ +from django.core.management.base import BaseCommand + +from regluit.core.bookloader import add_by_sitemap + +class Command(BaseCommand): + help = "load books based on a website sitemap" + + def add_arguments(self, parser): + # Positional arguments + parser.add_argument('url') + + # Named (optional) arguments + parser.add_argument( + '--max', + dest='max', + type=int, + default=None, + nargs='?', + help='set a maximum number of books to load', + ) + + def handle(self, url, max=None, **options): + books = add_by_sitemap(url, maxnum=max) + print "loaded {} books".format(len(books)) diff --git a/core/models/__init__.py b/core/models/__init__.py index 9c04848a..405711a9 100755 --- a/core/models/__init__.py +++ b/core/models/__init__.py @@ -1034,6 +1034,7 @@ class Campaign(models.Model): url=ebf.file.url, version_label=ebf.version['label'], version_iter=ebf.version['iter'], + filesize=ebf.file.size, ) ebf.ebook = ebook ebf.save() @@ -1041,7 +1042,8 @@ class Campaign(models.Model): for old_ebf in self.work.ebookfiles().filter(asking=True).exclude(pk__in=new_ebf_pks): obsolete = Ebook.objects.filter(url=old_ebf.file.url) - old_ebf.ebook.deactivate() + if old_ebf.ebook: + old_ebf.ebook.deactivate() old_ebf.file.delete() old_ebf.delete() diff --git a/core/validation.py b/core/validation.py index 3bba30b1..68ecbf66 100644 --- a/core/validation.py +++ b/core/validation.py @@ -43,7 +43,7 @@ def isbn_cleaner(value): if value == 'delete': return value if 
not value: - raise forms.ValidationError('no identifier value found') + raise ValidationError('no identifier value found') elif value == 'delete': return value isbn=ISBN(value) diff --git a/frontend/templates/_template_map.txt b/frontend/templates/_template_map.txt index faecb921..d1894adc 100644 --- a/frontend/templates/_template_map.txt +++ b/frontend/templates/_template_map.txt @@ -1,3 +1,5 @@ +PAGE TEMPLATES + base.html extra_css(empty) extra_js(empty) extra_head(empty) 404.html 500.html @@ -9,10 +11,12 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) about_unglued_empty.html about_wishlist.html about_wishlist_empty.html + base-questionnaire.html campaign_list.html extra_css extra_head cc_list.html extra_css extra_head comments.html extra_css extra_head download.html extra_js + faceted_list.html extra_css extra_head goodreads_display.html extra_head home.html extra_css extra_js kindle_change_successful.html extra_js @@ -22,43 +26,52 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) librarything.html lockss.html lockss_manifest.html + map_subject.html profiles/create_profile.html profiles/edit_profile.html profiles/profile_detail.html profiles/profile_list.html registration/registration_base.html extra_js extra_head extra_extra_head basedocumentation.html extra_js extra_extra_head - api_help.html about.html about_smashwords.html admins_only.html + api_help.html + ask_rh.html campaign_admin.html extra_extra_head campaign_results.html claim.html comments/base.html comments/preview.html extra_css + edit_edition.html extra_extra_head edition_uploads.html + emailshare.html gift.html extra_extra_head emailshare.html extra_css faq.html feedback.html front_matter.html extra_extra_head + join_library.html languages.html libraries.html extra_css extra_js libraryauth/edit.html extra_extra_head manage_account.html extra_extra_head manage_campaign.html extra_extra_head + manage_ebooks.html + manage_survey.html + marc.html merge.html 
extra_extra_head metrics.html new_edition.html extra_extra_head notification/base.html notification/notice_settings.html extra_css extra_js notification/notices.html extra_css - press.html press_new.html + press_submitterator.html privacy.html rh_tools.html extra_extra_head rights_holders.html extra_extra_head + surveys.html terms.html extra_css thanks.html basepledge.html extra_css extra_js extra_extra_head(empty) @@ -75,8 +88,25 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) pledge_user_error.html extra_extra_head purchase.html extra_extra_head stripe.html extra_extra_head + email_change/base.html + email_change_complete.html + email_change_form.html + email_verification_sent.html + email_verify.html + kindle_change_successful.html + kindle_config.html + marc_config.html + gift_duplicate.html + gift_error.html + gift_login.html + gift_welcome.html + kindle_response_graceful_degradation.html registration/activation_complete.html registration/activate.html + registration/from_pledge.html + registration/from_add.html + registration/from_error.html + registration/from_purchase.html registration/login.html registration/logout.html registration/password_change_done.html @@ -98,18 +128,41 @@ base.html extra_css(empty) extra_js(empty) extra_head(empty) work.html extra_css extra_js work_list.html extra_css extra_head bypub_list.html + recommended.html - +COMPONENT TEMPLATES +about_lightbox_footer.html +book_plain.html book_panel_addbutton.html cardform.html -cardscripts.html +cardscripts.html +claim_terms.html +ebook_list.html ebookfiles.html edition_display.html edition_upload.html explore.html -faqmenu.html +faq_b2u.html +faq_pledge_cancel.html +faq_pledge.html +faq_purchase.html faq_t4u.html +faqmenu.html +kindle_response_message.html learn_more.html +marc_form.html num_wishes.html +press_item.html +refine.html +registration/login_form.html +registration/password_reset_email.html +registration/registration_closed.html 
+registration/test_template_name.html +sidebar_pledge_complete.html +slideshow.html split.html +stripe_stuff.html +subjectbox.html trans_summary.html +work_action.html +workbox.html diff --git a/frontend/templates/edit_edition.html b/frontend/templates/edit_edition.html index 940a18f2..7d4ae3fa 100644 --- a/frontend/templates/edit_edition.html +++ b/frontend/templates/edit_edition.html @@ -204,11 +204,9 @@ ul.fancytree-container {

Cover Image:
- {% if edition.cover_image %} -
- {% else %} - [ no cover specified for this edition ]
- {% endif %} +

+ +
{{ form.cover_image.errors }}{{ form.cover_image }}{{ form.cover_image.help_text }} (Enter a URL for an image, at least 300 px wide. The image will be scaled to the proportions of a 6x9 cover. )
OR...
diff --git a/frontend/templates/edition_display.html b/frontend/templates/edition_display.html index bb5a94cf..c3c4eea4 100644 --- a/frontend/templates/edition_display.html +++ b/frontend/templates/edition_display.html @@ -1,10 +1,8 @@
- {% if edition.googlebooks_id %} -
- edition cover -
- {% endif %} +
+ edition cover +