Elasticsearch 1.5.0 Upgrade, including:

- Many thanks to @robhudson, who wrote the original 0.90 -> 1.0+ Elasticsearch query code.
- Updating pip elasticsearch requirement to 1.5.0 for the new server.
- Refactor argument order to match new API.
- Updated to boost relevance in an Elasticsearch >= 1.0 way.
- Removing score script and adding in field_value_factor per @robhudson's insight.
- Adding default project_scale and page_scale values.
- Found the source of the 400 error on _bulk, and noted it.
front-end-standardization
Andrew Kreps 2015-05-19 22:14:02 -07:00
parent c5aababca0
commit 7b78791166
6 changed files with 92 additions and 56 deletions

View File

@ -42,6 +42,6 @@ class Command(BaseCommand):
commit = None
try:
page_list = parse_json.process_all_json_files(version, build_dir=False)
index_search_request(version=version, page_list=page_list, commit=commit)
index_search_request(version=version, page_list=page_list, commit=commit, project_scale=0, page_scale=0)
except Exception:
log.error('Build failed for %s' % version, exc_info=True)

View File

@ -686,7 +686,7 @@ def update_search(version_pk, commit):
log_msg = ' '.join([page['path'] for page in page_list])
log.info("(Search Index) Sending Data: %s [%s]" % (version.project.slug, log_msg))
index_search_request(version=version, page_list=page_list, commit=commit)
index_search_request(version=version, page_list=page_list, commit=commit, project_scale=0, page_scale=0)
@task(queue='web')

View File

@ -86,13 +86,13 @@ def delete_versions(project, version_data):
return set()
def index_search_request(version, page_list, commit):
def index_search_request(version, page_list, commit, project_scale, page_scale):
log_msg = ' '.join([page['path'] for page in page_list])
log.info("(Server Search) Indexing Pages: %s [%s]" % (
version.project.slug, log_msg))
project = version.project
page_obj = PageIndex()
project_scale = 1
section_obj = SectionIndex()
#tags = [tag.name for tag in project.tags.all()]
@ -106,13 +106,13 @@ def index_search_request(version, page_list, commit):
'author': [user.username for user in project.users.all()],
'url': project.get_absolute_url(),
'tags': None,
'_boost': project_scale,
'weight': project_scale,
})
index_list = []
section_index_list = []
for page in page_list:
log.debug("(API Index) %s:%s" % (project.slug, page['path']))
page_scale = 1
page_id = hashlib.md5('%s-%s-%s' % (project.slug, version.slug, page['path'])).hexdigest()
index_list.append({
'id': page_id,
@ -124,27 +124,38 @@ def index_search_request(version, page_list, commit):
'content': page['content'],
'taxonomy': None,
'commit': commit,
'_boost': page_scale + project_scale,
'weight': page_scale + project_scale,
})
for section in page['sections']:
section_index_list.append({
'id': hashlib.md5('%s-%s-%s-%s' % (project.slug, version.slug, page['path'], section['id'])).hexdigest(),
'project': project.slug,
'version': version.slug,
'path': page['path'],
'page_id': section['id'],
'title': section['title'],
'content': section['content'],
'weight': page_scale,
})
section_obj.bulk_index(section_index_list, parent=page_id, routing=project.slug)
page_obj.bulk_index(index_list, parent=project.slug)
log.info("(Server Search) Deleting files not in commit: %s" % commit)
# Figure this out later
# TODO: AK Make sure this works
delete_query = {
# ES .90 doesn't wrap this
#"query": {
"bool": {
"must": [
{"term": {"project": project.slug, }},
{"term": {"version": version.slug, }},
],
"must_not": {
"term": {
"commit": commit
"query": {
"bool": {
"must": [
{"term": {"project": project.slug, }},
{"term": {"version": version.slug, }},
],
"must_not": {
"term": {
"commit": commit
}
}
}
}
#}
}
page_obj.delete_document(body=delete_query)

View File

@ -1,6 +1,6 @@
import logging
from rest_framework import decorators, permissions, viewsets, status
from rest_framework import decorators, permissions, status
from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
from rest_framework.response import Response
import requests
@ -11,8 +11,10 @@ from search.indexes import PageIndex, ProjectIndex, SectionIndex
from projects.models import Project
from restapi import utils
log = logging.getLogger(__name__)
@decorators.api_view(['GET'])
@decorators.permission_classes((permissions.AllowAny,))
@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
@ -43,7 +45,14 @@ def index_search(request):
commit = data.get('commit')
project = Project.objects.get(pk=project_pk)
version = Version.objects.get(pk=version_pk)
utils.index_search_request(version=version, page_list=data['page_list'], commit=commit)
resp = requests.get('https://api.grokthedocs.com/api/v1/index/1/heatmap/', params={'project': project.slug, 'compare': True})
ret_json = resp.json()
project_scale = ret_json.get('scaled_project', {}).get(project.slug)
page_scale = ret_json.get('scaled_page', {}).get(page['path'], 1)
utils.index_search_request(version=version, page_list=data['page_list'], commit=commit, project_scale=project_scale, page_scale=page_scale)
return Response({'indexed': True})
@ -59,12 +68,17 @@ def search(request):
kwargs = {}
body = {
"query": {
"bool": {
"should": [
{"match": {"title": {"query": query, "boost": 10}}},
{"match": {"headers": {"query": query, "boost": 5}}},
{"match": {"content": {"query": query}}},
]
"function_score": {
"field_value_factor": {"field": "weight"},
"query": {
"bool": {
"should": [
{"match": {"title": {"query": query, "boost": 10}}},
{"match": {"headers": {"query": query, "boost": 5}}},
{"match": {"content": {"query": query}}},
]
}
}
}
},
"highlight": {
@ -102,12 +116,17 @@ def project_search(request):
log.debug("(API Project Search) %s" % (query))
body = {
"query": {
"bool": {
"should": [
{"match": {"name": {"query": query, "boost": 10}}},
{"match": {"description": {"query": query}}},
]
},
"function_score": {
"field_value_factor": {"field": "weight"},
"query": {
"bool": {
"should": [
{"match": {"name": {"query": query, "boost": 10}}},
{"match": {"description": {"query": query}}},
]
}
}
}
},
"fields": ["name", "slug", "description", "lang"]
}
@ -115,6 +134,7 @@ def project_search(request):
return Response({'results': results})
@decorators.api_view(['GET'])
@decorators.permission_classes((permissions.AllowAny,))
@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
@ -165,11 +185,16 @@ def section_search(request):
kwargs = {}
body = {
"query": {
"bool": {
"should": [
{"match": {"title": {"query": query, "boost": 10}}},
{"match": {"content": {"query": query}}},
]
"function_score": {
"field_value_factor": {"field": "weight"},
"query": {
"bool": {
"should": [
{"match": {"title": {"query": query, "boost": 10}}},
{"match": {"content": {"query": query}}},
]
}
}
}
},
"facets": {
@ -177,7 +202,7 @@ def section_search(request):
"terms": {"field": "project"},
"facet_filter": {
"term": {"version": version_slug},
}
}
},
},
"highlight": {
@ -197,11 +222,11 @@ def section_search(request):
{"term": {"version": version_slug}},
]
}
body["facets"]['path'] = {
body['facets']['path'] = {
"terms": {"field": "path"},
"facet_filter": {
"term": {"project": project_slug},
}
}
},
# Add routing to optimize search by hitting the right shard.
kwargs['routing'] = project_slug
@ -212,14 +237,13 @@ def section_search(request):
{"term": {"path": path_slug}},
]
}
if path_slug and not project_slug:
# Show facets when we only have a path
body["facets"]['path'] = {
body['facets']['path'] = {
"terms": {"field": "path"}
}
results = SectionIndex().search(body, **kwargs)
return Response({'results': results})

View File

@ -107,7 +107,7 @@ class Index(object):
def put_mapping(self, index=None):
index = index or self._index
self.es.indices.put_mapping(index, self._type, self.get_mapping())
self.es.indices.put_mapping(self._type, self.get_mapping(), index)
def bulk_index(self, data, index=None, chunk_size=500, parent=None,
routing=None):
@ -136,6 +136,7 @@ class Index(object):
doc['_routing'] = routing
docs.append(doc)
# TODO: This doesn't work with the new ES setup.
bulk_index(self.es, docs, chunk_size=chunk_size)
def index_document(self, data, index=None, parent=None, routing=None):
@ -218,8 +219,6 @@ class ProjectIndex(Index):
self._type: {
# Disable _all field to reduce index size.
'_all': {'enabled': False},
# Add a boost field to enhance relevancy of a document.
'_boost': {'name': '_boost', 'null_value': 1.0},
'properties': {
'id': {'type': 'long'},
'name': {'type': 'string', 'analyzer': 'default_icu'},
@ -240,6 +239,8 @@ class ProjectIndex(Index):
},
},
'url': {'type': 'string', 'index': 'not_analyzed'},
# Add a weight field to enhance relevancy scoring.
'weight': {'type': 'float'},
}
}
}
@ -254,7 +255,7 @@ class ProjectIndex(Index):
doc[attr] = data.get(attr, '')
# Add project boost.
doc['_boost'] = data.get('_boost', 1.0)
doc['weight'] = data.get('weight', 1.0)
return doc
@ -269,8 +270,6 @@ class PageIndex(Index):
self._type: {
# Disable _all field to reduce index size.
'_all': {'enabled': False},
# Add a boost field to enhance relevancy of a document.
'_boost': {'name': '_boost', 'null_value': 1.0},
# Associate a page with a project.
'_parent': {'type': self._parent},
'properties': {
@ -285,6 +284,8 @@ class PageIndex(Index):
'title': {'type': 'string', 'analyzer': 'default_icu'},
'headers': {'type': 'string', 'analyzer': 'default_icu'},
'content': {'type': 'string', 'analyzer': 'default_icu'},
# Add a weight field to enhance relevancy scoring.
'weight': {'type': 'float'},
}
}
}
@ -299,7 +300,7 @@ class PageIndex(Index):
doc[attr] = data.get(attr, '')
# Add page boost.
doc['_boost'] = data.get('_boost', 1.0)
doc['weight'] = data.get('weight', 1.0)
return doc
@ -314,8 +315,6 @@ class SectionIndex(Index):
self._type: {
# Disable _all field to reduce index size.
'_all': {'enabled': False},
# Add a boost field to enhance relevancy of a document.
'_boost': {'name': '_boost', 'null_value': 1.0},
# Associate a section with a page.
'_parent': {'type': self._parent},
'suggest': {
@ -338,7 +337,9 @@ class SectionIndex(Index):
'properties': {
'code': {'type': 'string', 'analyzer': 'default_icu'}
}
}
},
# Add a weight field to enhance relevancy scoring.
'weight': {'type': 'float'},
}
}
}
@ -353,6 +354,6 @@ class SectionIndex(Index):
doc[attr] = data.get(attr, '')
# Add page boost.
doc['_boost'] = data.get('_boost', 1.0)
doc['weight'] = data.get('weight', 1.0)
return doc

View File

@ -43,7 +43,7 @@ github2==0.5.2
httplib2==0.7.7
# Search
elasticsearch==0.4.3
elasticsearch==1.5.0
pyelasticsearch==0.7.1
pyquery==1.2.2