regluit/core/loaders/soup.py

34 lines
1.1 KiB
Python
Raw Normal View History

2020-08-16 00:21:56 +00:00
import logging
from bs4 import BeautifulSoup
import requests
from django.conf import settings
2020-09-02 15:42:53 +00:00
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
2020-08-16 00:21:56 +00:00
2021-06-05 00:04:33 +00:00
def get_soup(url, user_agent=settings.USER_AGENT, follow_redirects=False):
    """Fetch ``url`` and return its content parsed as a BeautifulSoup tree.

    If the parsed document has no ``<base>`` element, one pointing at the
    response URL is appended to ``<head>``; when there is no ``<head>``
    either, an error is logged and the soup is returned unchanged.

    Args:
        url: Address to fetch. If requests rejects it with ``MissingSchema``,
            the fetch is retried once with ``http://`` prepended.
        user_agent: Value sent in the ``User-Agent`` request header.
        follow_redirects: Passed as ``allow_redirects`` on the initial
            request.

    Returns:
        The parsed document when the response status is 200. ``None`` when
        a ``requests`` ``ConnectionError`` occurs (on either the initial
        request or the ``http://`` retry) or when any other status code is
        returned; both cases are logged as errors.
    """
    headers = {"User-Agent": user_agent}
    try:
        # The scheme fallback is nested inside the outer try so that a
        # ConnectionError raised by the retry is handled too; an exception
        # raised inside an ``except`` clause is not seen by its siblings.
        try:
            response = requests.get(url, headers=headers,
                                    allow_redirects=follow_redirects)
        except requests.exceptions.MissingSchema:
            # NOTE(review): the retry does not pass allow_redirects, so it
            # uses requests' own default rather than follow_redirects.
            # Kept as in the original -- confirm this is intended.
            response = requests.get('http://%s' % url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        logger.error("Connection refused for %s", url)
        logger.error(e)
        return None

    if response.status_code != 200:
        logger.error('%s returned code %s', url, response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'lxml')
    # make sure document has a base
    if not soup.find('base'):
        head = soup.find('head')
        if head:
            head.append(soup.new_tag("base", href=response.url))
        else:
            logger.error('No head for %s', url)
    return soup