Merge pull request #764 from Gluejar/springer-option

load springer improvements
pull/91/head
eshellman 2018-03-22 16:14:29 -04:00 committed by GitHub
commit 007307f411
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 14 deletions

View File

@@ -110,15 +110,19 @@ class SpringerScraper(BaseScraper):
self.set('publisher', 'Springer') self.set('publisher', 'Springer')
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess' search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
def load_springer(num_pages): def load_springer(startpage=1, endpage=None):
def springer_open_books(num_pages): def springer_open_books(startpage, endpage):
for page in range(1, num_pages+1): endpage = endpage if endpage else startpage + 10
for page in range(startpage, endpage + 1):
url = search_url.format(page) url = search_url.format(page)
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) try:
if response.status_code == 200: response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
base = response.url if response.status_code == 200:
doc = BeautifulSoup(response.content, 'lxml') base = response.url
for link in doc.select('a.title'): doc = BeautifulSoup(response.content, 'lxml')
book_url = urljoin(base, link['href']) for link in doc.select('a.title'):
yield SpringerScraper(book_url) book_url = urljoin(base, link['href'])
return add_from_bookdatas(springer_open_books(num_pages)) yield SpringerScraper(book_url)
except requests.exceptions.ConnectionError:
print 'couldn\'t connect to %s' % url
return add_from_bookdatas(springer_open_books(startpage, endpage))

View File

@@ -4,9 +4,9 @@ from regluit.core.loaders.springer import load_springer
class Command(BaseCommand): class Command(BaseCommand):
help = "load books from springer open" help = "load books from springer open"
args = "<pages>" args = "<startpage> <endpage>"
def handle(self, pages, **options): def handle(self, startpage, endpage=0, **options):
books = load_springer(int(pages)) books = load_springer(int(startpage), int(endpage))
print "loaded {} books".format(len(books)) print "loaded {} books".format(len(books))