Merge pull request #551 from Gluejar/multi_formats_in_load_from_yaml

updating load_from_yaml to handle formats other than epub
pull/1/head
Raymond Yee 2016-03-07 13:35:46 -08:00
commit f15cfdcc7f
5 changed files with 117 additions and 25 deletions

View File

@ -3,14 +3,22 @@ from django.db import models
#https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml
#or https://raw.githubusercontent.com/GITenberg/Free-Russia_51117/master/metadata.yaml
def repo_allowed(repo_url):
if not repo_url.startswith('https://github.com/'):
return (False, "repo url must start with 'https://github.com/'")
if repo_url.startswith('https://github.com/'):
try:
(org,repo_name,raw,branch,filename) = repo_url[19:].split('/')
except ValueError:
return (False, "repo url must be well formed, metadata at top repo level")
elif repo_url.startswith('https://raw.githubusercontent.com/'):
try:
(org,repo_name,branch,filename) = repo_url[34:].split('/')
raw = 'raw'
except ValueError:
return (False, "repo url must be well formed, metadata at top repo level")
else:
return (False, "repo url must start with 'https://github.com/' or 'https://raw.githubusercontent.com/'")
if not raw == 'raw':
return (False, "repo url must point at 'raw' file")
if not filename == 'metadata.yaml':

View File

@ -6,9 +6,10 @@ import logging
import requests
from datetime import timedelta
from itertools import izip, islice
from itertools import (izip, islice)
from xml.etree import ElementTree
from urlparse import urljoin
from urlparse import (urljoin, urlparse)
"""
django imports
@ -18,6 +19,9 @@ from django.contrib.comments.models import Comment
from django.db import IntegrityError
from django.db.models import Q
from github3 import (login, GitHub)
from github3.repos.release import Release
from gitenberg.metadata.pandata import Pandata
from ..marc.models import inverse_marc_rels
@ -800,7 +804,10 @@ def unreverse(name):
return '%s %s, %s' % (first.strip(),last.strip(),rest.strip())
def load_from_yaml(yaml_url):
def load_from_yaml(yaml_url, test_mode=False):
"""
if test_mode is True, don't construct list of ebooks from a release -- rather use an epub
"""
all_metadata = Pandata(yaml_url)
for metadata in all_metadata.get_edition_list():
#find a work to associate
@ -863,24 +870,85 @@ def load_from_yaml(yaml_url):
edition.cover_image=urljoin(yaml_url,cover['image_path'])
break
edition.save()
# if there is a version, assume there is an ebook. if not, not.
# create Ebook for any ebook in the corresponding GitHub release
# assuming yaml_url of form (from GitHub, though not necessarily GITenberg)
# https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml
url_path = urlparse(yaml_url).path.split("/")
(repo_owner, repo_name) = (url_path[1], url_path[2])
repo_tag = metadata._version
# allow for there not to be a token in the settings
try:
token = settings.GITHUB_PUBLIC_TOKEN
except:
token = None
if metadata._version and not metadata._version.startswith('0.0.'):
#there should be an ebook to link to!
# use GitHub API to compute the ebooks in release unless we're in test mode
if test_mode:
# not using ebook_name in this code
ebooks_in_release = [('epub', None)]
else:
ebooks_in_release = ebooks_in_github_release(repo_owner, repo_name, repo_tag, token=token)
for (ebook_format, ebook_name) in ebooks_in_release:
(ebook, created)= models.Ebook.objects.get_or_create(
url=git_download_from_yaml_url(yaml_url,metadata._version,edition_name=metadata._edition ),
url=git_download_from_yaml_url(yaml_url,metadata._version,edition_name=metadata._edition,
format_= ebook_format),
provider='Github',
rights = metadata.rights if metadata.rights in cc.LICENSE_LIST_ALL else None,
format = 'epub',
format = ebook_format,
edition = edition,
# version = metadata._version
)
return work.id
def git_download_from_yaml_url(yaml_url, version, edition_name='book', format_='epub'):
    """
    Build the GitHub release download URL for an ebook from its metadata.yaml URL.

    go from https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/raw/master/metadata.yaml
    (or https://raw.githubusercontent.com/GITenberg/Adventures-of-Huckleberry-Finn_76/master/metadata.yaml)
    to https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/releases/download/v0.0.3/Adventures-of-Huckleberry-Finn.epub

    :param str yaml_url: URL of the repository's top-level metadata.yaml on the master branch
    :param str version: release tag, used verbatim as a path segment
    :param str edition_name: base file name of the ebook asset (without extension)
    :param str format_: file extension of the ebook asset, e.g. 'epub', 'mobi', 'pdf'
    :returns: the download URL, or None when yaml_url has neither recognised form
    """
    github_suffix = 'raw/master/metadata.yaml'
    raw_prefix = 'https://raw.githubusercontent.com/'
    raw_suffix = 'master/metadata.yaml'

    if yaml_url.endswith(github_suffix):
        # strip the trailing 'raw/master/metadata.yaml', keeping the final '/'
        repo_url = yaml_url[:-len(github_suffix)]
    elif yaml_url.startswith(raw_prefix) and yaml_url.endswith(raw_suffix):
        # raw.githubusercontent.com serves files only; releases live on github.com
        repo_url = 'https://github.com/' + yaml_url[len(raw_prefix):-len(raw_suffix)]
    else:
        # not a URL we know how to map onto a release download
        return None

    return repo_url + 'releases/download/' + version + '/' + edition_name + '.' + format_
def release_from_tag(repo, tag_name):
    """Look up a release of ``repo`` by the name of its tag.

    The repository object's own release() finds a release by release id;
    this helper finds it by tag instead.

    Adapted from
    https://github.com/sigmavirus24/github3.py/blob/38de787e465bffc63da73d23dc51f50d86dc903d/github3/repos/repo.py#L1781-L1793

    :param repo: repository object exposing ``_build_url``, ``_api``, ``_get``
        and ``_json`` (presumably a github3 repository -- verify against caller)
    :param str tag_name: (required) name of tag
    :returns: :class:`Release <github3.repos.release.Release>`, or None when
        the API response carries no JSON payload
    """
    tag_url = repo._build_url('releases', 'tags', tag_name, base_url=repo._api)
    response = repo._get(tag_url)
    # 200 is the status code the response is expected to carry
    release_json = repo._json(response, 200)
    if not release_json:
        return None
    return Release(release_json, repo)
def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
    """
    returns a list of (book_type, book_name) for a given GitHub release (specified by
    owner, name, tag).  token is a GitHub authorization token -- useful for accessing
    higher rate limit in the GitHub API

    Only assets whose content type appears in settings.CONTENT_TYPES are reported.
    An empty list is returned when the repository or the tagged release cannot be
    found, so callers can iterate over the result without special-casing.
    """
    # map mimetype to file extension
    ebook_formats = {mimetype: extension
                     for (extension, mimetype) in settings.CONTENT_TYPES.items()}

    if token is not None:
        gh = login(token=token)
    else:
        # anonymous access
        gh = GitHub()

    repo = gh.repository(repo_owner, repo_name)
    if repo is None:
        # no repository object came back for this owner/name
        return []

    release = release_from_tag(repo, tag)
    if release is None:
        # no release is published under this tag
        return []

    ebooks = []
    for asset in release.iter_assets():
        # look the content type up once; unknown types are skipped
        ebook_format = ebook_formats.get(asset.content_type)
        if ebook_format is not None:
            ebooks.append((ebook_format, asset.name))
    return ebooks

View File

@ -83,7 +83,7 @@ class BookLoaderTests(TestCase):
noebook_id = bookloader.load_from_yaml(YAML_VERSIONFILE)
noebook = models.Work.objects.get(id=noebook_id)
self.assertEqual( noebook.first_ebook(), None)
huck_id = bookloader.load_from_yaml(YAML_HUCKFILE)
huck_id = bookloader.load_from_yaml(YAML_HUCKFILE, test_mode=True)
huck = models.Work.objects.get(id=huck_id)
self.assertTrue( huck.ebooks().count()>1)
@ -1041,4 +1041,15 @@ class LibTests(TestCase):
tasks.refresh_acqs()
self.assertEqual(reserve_acq.holds.count(),0)
class GitHubTests(TestCase):
    """Tests for the GitHub release helpers in bookloader."""

    def test_ebooks_in_github_release(self):
        """Release 0.0.50 of the Huckleberry Finn repo lists epub, mobi and pdf assets."""
        # NOTE(review): presumably this talks to the real GitHub API -- confirm
        found = bookloader.ebooks_in_github_release(
            'GITenberg',
            'Adventures-of-Huckleberry-Finn_76',
            tag='0.0.50',
            token=settings.GITHUB_PUBLIC_TOKEN,
        )
        wanted = {
            ('epub', u'Adventures-of-Huckleberry-Finn.epub'),
            ('mobi', u'Adventures-of-Huckleberry-Finn.mobi'),
            ('pdf', u'Adventures-of-Huckleberry-Finn.pdf'),
        }
        # order of assets is irrelevant, so compare as sets
        self.assertEqual(set(found), wanted)

View File

@ -42,6 +42,7 @@ feedparser==5.1.2
freebase==1.0.8
#gitenberg.metadata==0.1.6
git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
github3.py==0.9.5
html5lib==1.0b3
httplib2==0.7.5
isodate==0.5.1

View File

@ -451,6 +451,10 @@ FILE_UPLOAD_MAX_MEMORY_SIZE = 20971520 #20MB
DROPBOX_KEY = '4efhwty5aph52bd' #for unglue.it, just.unglue.it
#DROPBOX_KEY = '6uefhocpvp0s1ep' #for localhost
# for reading GITenberg releases
# generated from rdhyee account
# NOTE(review): this access token is committed in plain text. Consider revoking it
# and loading the value from the environment or an untracked settings file instead
# -- confirm with the account owner.
GITHUB_PUBLIC_TOKEN = 'f702409f913d7f9046f93c677710f829e2b599c9'
SOUTH_MIGRATION_MODULES = {
'default': 'social.apps.django_app.default.south_migrations'
}