61 lines
1.8 KiB
Python
Executable File
61 lines
1.8 KiB
Python
Executable File
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from django.conf import settings
|
|
|
|
from gitenberg.metadata.pandata import Pandata
|
|
|
|
from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
|
|
from .scrape import BaseScraper
|
|
from .hathitrust import HathitrustScraper
|
|
from .pressbooks import PressbooksScraper
|
|
from .springer import SpringerScraper
|
|
from .ubiquity import UbiquityScraper
|
|
from .smashwords import SmashwordsScraper
|
|
|
|
def get_scraper(url):
    """Return a scraper instance for ``url`` from the first class that accepts it.

    Candidate classes are asked in order via ``can_scrape(url)``; the first one
    that answers truthily is instantiated with ``url``. ``BaseScraper`` is tried
    last. Returns ``None`` if no candidate accepts the URL.
    """
    candidates = (
        PressbooksScraper,
        SpringerScraper,
        UbiquityScraper,
        SmashwordsScraper,
        HathitrustScraper,
        BaseScraper,
    )
    # next() stops at the first accepting class, so later classes are not asked.
    chosen = next((cls for cls in candidates if cls.can_scrape(url)), None)
    if chosen is None:
        return None
    return chosen(url)
|
|
|
|
def scrape_sitemap(url, maxnum=None):
    """Yield a scraper for each sitemap entry whose metadata genre is 'book'.

    Fetches the sitemap at ``url``, reads its ``<loc>`` elements, and builds a
    scraper for each listed page with ``get_scraper``.

    Args:
        url: address of the sitemap to fetch.
        maxnum: optional cap on how many ``<loc>`` entries are examined;
            ``None`` examines all of them.

    Yields:
        Scraper instances whose ``metadata.get('genre')`` equals ``'book'``.

    A ``requests.exceptions.RequestException`` raised while fetching or
    scraping is logged and ends the iteration; it is not re-raised.
    """
    # Function-scope import: no ``logger`` name is defined in the visible
    # module, so the error handler previously failed with NameError.
    import logging

    log = logging.getLogger(__name__)
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
        doc = BeautifulSoup(response.content, 'lxml')
        for page in doc.find_all('loc')[0:maxnum]:
            scraper = get_scraper(page.text)
            if scraper is None:
                # get_scraper returns None when no scraper accepts the URL.
                continue
            if scraper.metadata.get('genre', None) == 'book':
                yield scraper
    except requests.exceptions.RequestException as e:
        log.error(e)
|
|
|
|
def add_by_webpage(url, work=None, user=None):
    """Scrape ``url`` and load the editions and ebooks described by its metadata.

    Args:
        url: page to scrape; also passed to ``BasePandataLoader``.
        work: optional work passed to the loader for the first edition; each
            loaded edition's ``work`` is used for the next one.
        user: forwarded to ``loader.load_ebooks``.

    Returns:
        The last edition loaded, or ``None`` when there is no (truthy) edition.
    """
    edition = None
    scraper = get_scraper(url)
    loader = BasePandataLoader(url)

    pandata = Pandata()
    pandata.metadata = scraper.metadata

    for edition_metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(edition_metadata, work)
        # Chain later editions onto the work of the edition just loaded.
        work = edition.work

    # NOTE(review): ebooks are loaded once, after the edition loop, using the
    # last edition (possibly None) - confirm against the original layout.
    loader.load_ebooks(pandata, edition, user=user)
    return edition or None
|
|
|
|
|
|
def add_by_sitemap(url, maxnum=None):
    """Pass the scrapers produced from the sitemap at ``url`` to ``add_from_bookdatas``.

    ``maxnum`` is forwarded to ``scrape_sitemap``. Returns whatever
    ``add_from_bookdatas`` returns.
    """
    book_scrapers = scrape_sitemap(url, maxnum=maxnum)
    return add_from_bookdatas(book_scrapers)
|
|
|
|
def scrape_language(url):
    """Return the ``'language'`` entry of the metadata scraped from ``url``.

    The value comes from ``metadata.get('language')``, so a missing key is
    reported through ``get``'s default rather than a ``KeyError``.
    """
    scraped_metadata = get_scraper(url).metadata
    return scraped_metadata.get('language')
|
|
|
|
|