# readthedocs.org/readthedocs/search/utils.py
# -*- coding: utf-8 -*-
"""Utilities related to reading and generating indexable search content."""
from __future__ import absolute_import

import codecs
import fnmatch
import json
import logging
import os
import re

from builtins import next, range

from pyquery import PyQuery

log = logging.getLogger(__name__)


def process_mkdocs_json(version, build_dir=True):
    """Given a version object, return a list of page dicts from disk content."""
    if build_dir:
        full_path = version.project.full_json_path(version.slug)
    else:
        full_path = version.project.get_production_media_path(
            type_='json', version_slug=version.slug, include_file=False)

    html_files = []
    for root, _, files in os.walk(full_path):
        for filename in fnmatch.filter(files, '*.json'):
            html_files.append(os.path.join(root, filename))

    page_list = []
    for filename in html_files:
        if not valid_mkdocs_json(file_path=filename):
            continue
        relative_path = parse_path_from_file(file_path=filename)
        html = parse_content_from_file(file_path=filename)
        headers = parse_headers_from_file(
            documentation_type='mkdocs', file_path=filename)
        sections = parse_sections_from_file(
            documentation_type='mkdocs', file_path=filename)
        try:
            title = sections[0]['title']
        except IndexError:
            title = relative_path
        page_list.append({
            'content': html,
            'path': relative_path,
            'title': title,
            'headers': headers,
            'sections': sections,
        })
    return page_list
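
# Hedged usage sketch (not part of the original module): given a Version
# model instance, ``process_mkdocs_json`` returns page dicts shaped like the
# one appended above, e.g.:
#
#     pages = process_mkdocs_json(version, build_dir=True)
#     sorted(pages[0]) == ['content', 'headers', 'path', 'sections', 'title']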


def recurse_while_none(element):
    """Return the element's text, descending into the first child until found."""
    if element.text is None:
        return recurse_while_none(element.getchildren()[0])
    return element.text
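
# Illustrative example (an assumption about typical input): for markup such
# as ``<h2><a href="#x">Title</a></h2>``, lxml sets ``element.text`` to None,
# so the helper descends into the first child (the ``<a>``) and returns
# ``'Title'``.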


def valid_mkdocs_json(file_path):
    """Return True if the file at ``file_path`` parses as mkdocs search JSON."""
    try:
        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
            content = f.read()
    except IOError:
        log.warning(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return None
    try:
        page_json = json.loads(content)
    except ValueError:
        log.warning(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return None
    for to_check in ['url', 'content']:
        if to_check not in page_json:
            log.warning(
                '(Search Index) Unable to index file: %s error: missing %s',
                file_path,
                to_check,
            )
            return None
    return True
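
# The smallest payload this validator accepts looks like this (illustrative
# values):
#
#     {"url": "about/", "content": "<h1 id=\"about\">About</h1>"}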


def parse_path_from_file(file_path):
    """Retrieve path information from a json-encoded file on disk."""
    try:
        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
            content = f.read()
    except IOError:
        log.warning(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    try:
        page_json = json.loads(content)
    except ValueError:
        log.warning(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    path = page_json['url']

    # The URLs here should be of the form "path/index". So we need to
    # convert:
    #   "path/"           => "path/index"
    #   "path/index.html" => "path/index"
    #   "/path/index"     => "path/index"
    path = re.sub(r'/$', '/index', path)
    path = re.sub(r'\.html$', '', path)
    path = re.sub(r'^/', '', path)
    return path
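
# Worked examples of the rewrites above (illustrative paths):
#
#     "install/"            -> "install/index"
#     "/install/index.html" -> "install/index"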


def parse_content_from_file(file_path):
    """Retrieve content from a json-encoded file on disk."""
    try:
        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
            content = f.read()
    except IOError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    try:
        page_json = json.loads(content)
    except ValueError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    page_content = page_json['content']
    content = parse_content(page_content)

    if not content:
        log.info('(Search Index) Unable to index file: %s, empty file', file_path)
    else:
        log.debug('(Search Index) %s length: %s', file_path, len(content))
    return content
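
# Illustrative composition: this wrapper is ``parse_content`` applied to the
# ``content`` key of the page JSON; for the payload shown earlier it returns
# ``'About'``.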


def parse_content(content):
    """
    Prepare the text of an html document for indexing.

    Return the body text of the document with markup stripped.
    """
    try:
        to_index = PyQuery(content).text()
    except ValueError:
        return ''
    return to_index
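
# For example, ``parse_content('<p>Hello <b>world</b></p>')`` returns
# ``'Hello world'``: PyQuery's ``.text()`` strips markup and collapses
# whitespace.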


def parse_headers_from_file(documentation_type, file_path):
    """Retrieve a list of header strings from a json-encoded file on disk."""
    log.debug('(Search Index) Parsing headers for %s', file_path)
    try:
        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
            content = f.read()
    except IOError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    try:
        page_json = json.loads(content)
    except ValueError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    page_content = page_json['content']
    headers = parse_headers(documentation_type, page_content)
    if not headers:
        log.error('Unable to index file headers for: %s', file_path)
    return headers


def parse_headers(documentation_type, content):
    """Retrieve a list of header strings from a string of html."""
    headers = []
    if documentation_type == 'mkdocs':
        for element in PyQuery(content)('h2'):
            headers.append(recurse_while_none(element))
    return headers
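
# Illustrative: for mkdocs content like
# ``'<h2 id="a">A</h2><p>...</p><h2 id="b">B</h2>'`` this returns
# ``['A', 'B']``; for any other documentation_type it returns ``[]``.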


def parse_sections_from_file(documentation_type, file_path):
    """Retrieve a list of section dicts from a json-encoded file on disk."""
    log.debug('(Search Index) Parsing sections for %s', file_path)
    try:
        with codecs.open(file_path, encoding='utf-8', mode='r') as f:
            content = f.read()
    except IOError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    try:
        page_json = json.loads(content)
    except ValueError:
        log.info(
            '(Search Index) Unable to index file: %s',
            file_path,
            exc_info=True,
        )
        return ''

    page_content = page_json['content']
    sections = parse_sections(documentation_type, page_content)
    if not sections:
        log.error('Unable to index file sections for: %s', file_path)
    return sections


def parse_sphinx_sections(content):
    """Generate a list of sections from sphinx-style html."""
    body = PyQuery(content)
    h1_section = body('.section > h1')
    if h1_section:
        div = h1_section.parent()
        # Strip the permalink marker sphinx appends to headers.
        h1_title = h1_section.text().replace(u'¶', '').strip()
        h1_id = div.attr('id')
        h1_content = ""
        # ``next`` here is ``future.builtins.next`` (imported above), which
        # falls back to PyQuery's jQuery-style ``.next()`` and so returns the
        # next sibling element.
        next_p = next(body('h1'))  # pylint: disable=stop-iteration-return
        while next_p:
            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
                if 'section' in next_p[0].attrib['class']:
                    break
            h1_content += "\n%s\n" % next_p.html()
            next_p = next(next_p)  # pylint: disable=stop-iteration-return
        if h1_content:
            yield {
                'id': h1_id,
                'title': h1_title,
                'content': h1_content,
            }

    # Capture text inside h2's
    section_list = body('.section > h2')
    for num in range(len(section_list)):
        div = section_list.eq(num).parent()
        header = section_list.eq(num)
        title = header.text().replace(u'¶', '').strip()
        section_id = div.attr('id')
        content = div.html()
        yield {
            'id': section_id,
            'title': title,
            'content': content,
        }
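
# A sketch of the sphinx-style markup the selectors above assume
# (illustrative; real builds nest deeper):
#
#     <div class="section" id="installation">
#       <h1>Installation¶</h1>
#       <p>...</p>
#       <div class="section" id="pip"><h2>Pip¶</h2><p>...</p></div>
#     </div>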


def parse_mkdocs_sections(content):
    """
    Generate a list of sections from mkdocs-style html.

    May raise a ValueError.
    """
    body = PyQuery(content)
    try:
        # H1 content
        h1 = body('h1')
        h1_id = h1.attr('id')
        h1_title = h1.text().strip()
        h1_content = ""
        next_p = next(body('h1'))  # pylint: disable=stop-iteration-return
        while next_p:
            if next_p[0].tag == 'h2':
                break
            h1_html = next_p.html()
            if h1_html:
                h1_content += "\n%s\n" % h1_html
            next_p = next(next_p)  # pylint: disable=stop-iteration-return
        if h1_content:
            yield {
                'id': h1_id,
                'title': h1_title,
                'content': h1_content,
            }

        # H2 content
        section_list = body('h2')
        for num in range(len(section_list)):
            h2 = section_list.eq(num)
            h2_title = h2.text().strip()
            section_id = h2.attr('id')
            h2_content = ""
            # Walk the siblings of this specific h2; starting from
            # ``body('h2')`` would always restart at the first one.
            next_p = next(h2)  # pylint: disable=stop-iteration-return
            while next_p:
                if next_p[0].tag == 'h2':
                    break
                h2_html = next_p.html()
                if h2_html:
                    h2_content += "\n%s\n" % h2_html
                next_p = next(next_p)  # pylint: disable=stop-iteration-return
            if h2_content:
                yield {
                    'id': section_id,
                    'title': h2_title,
                    'content': h2_content,
                }
    # we're unsure which exceptions can be raised
    except:  # noqa
        log.exception('Failed indexing')
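
# Illustrative input/output pair (an assumption about typical mkdocs html):
#
#     <h1 id="intro">Intro</h1><p>Hi</p><h2 id="usage">Usage</h2><p>Run it</p>
#
# yields roughly:
#
#     {'id': 'intro', 'title': 'Intro', 'content': '\nHi\n'}
#     {'id': 'usage', 'title': 'Usage', 'content': '\nRun it\n'}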


def parse_sections(documentation_type, content):
    """Retrieve a list of section dicts from a string of html."""
    sections = []
    if 'sphinx' in documentation_type:
        sections.extend(parse_sphinx_sections(content))
    if 'mkdocs' in documentation_type:
        try:
            sections.extend(parse_mkdocs_sections(content))
        except ValueError:
            # Return a list (not '') so callers always get the same type.
            return []
    return sections
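
# Hedged end-to-end sketch: ``parse_sections`` dispatches on substring match,
# so both 'sphinx_htmldir' and 'mkdocs' route correctly:
#
#     for section in parse_sections('mkdocs', html):
#         index_section(section)  # ``index_section`` is hypothetical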