commit
007307f411
|
@ -110,15 +110,19 @@ class SpringerScraper(BaseScraper):
|
||||||
self.set('publisher', 'Springer')
|
self.set('publisher', 'Springer')
|
||||||
|
|
||||||
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
|
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
|
||||||
def load_springer(num_pages):
|
def load_springer(startpage=1, endpage=None):
|
||||||
def springer_open_books(num_pages):
|
def springer_open_books(startpage, endpage):
|
||||||
for page in range(1, num_pages+1):
|
endpage = endpage if endpage else startpage + 10
|
||||||
|
for page in range(startpage, endpage + 1):
|
||||||
url = search_url.format(page)
|
url = search_url.format(page)
|
||||||
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
|
try:
|
||||||
if response.status_code == 200:
|
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
|
||||||
base = response.url
|
if response.status_code == 200:
|
||||||
doc = BeautifulSoup(response.content, 'lxml')
|
base = response.url
|
||||||
for link in doc.select('a.title'):
|
doc = BeautifulSoup(response.content, 'lxml')
|
||||||
book_url = urljoin(base, link['href'])
|
for link in doc.select('a.title'):
|
||||||
yield SpringerScraper(book_url)
|
book_url = urljoin(base, link['href'])
|
||||||
return add_from_bookdatas(springer_open_books(num_pages))
|
yield SpringerScraper(book_url)
|
||||||
|
except requests.exceptions.ConnectionError:
|
||||||
|
print 'couldn\'t connect to %s' % url
|
||||||
|
return add_from_bookdatas(springer_open_books(startpage, endpage))
|
||||||
|
|
|
@ -4,9 +4,9 @@ from regluit.core.loaders.springer import load_springer
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
help = "load books from springer open"
|
help = "load books from springer open"
|
||||||
args = "<pages>"
|
args = "<startpage> <endpage>"
|
||||||
|
|
||||||
|
|
||||||
def handle(self, pages, **options):
|
def handle(self, startpage, endpage=0, **options):
|
||||||
books = load_springer(int(pages))
|
books = load_springer(int(startpage), int(endpage))
|
||||||
print "loaded {} books".format(len(books))
|
print "loaded {} books".format(len(books))
|
||||||
|
|
Loading…
Reference in New Issue