Merge remote branch 'origin/master' into search

Conflicts:
	readthedocs/restapi/views.py
Eric Holscher 2013-10-26 19:18:59 -07:00
commit ab871ad889
5 changed files with 318 additions and 20 deletions

@@ -14,6 +14,7 @@ django-profiles==0.2
 django-secure==0.1.2
 django==1.4.8
 docutils==0.8.1
+elasticsearch==0.4.3
 github2==0.5.2
 httplib2==0.7.2
 mercurial==2.6.3

readthedocs/restapi/views.py

@@ -1,3 +1,6 @@
+import json
+import hashlib
+
 from django.shortcuts import get_object_or_404
 from django.template import Template, Context
 from django.conf import settings
@@ -7,15 +10,13 @@ from elasticsearch import Elasticsearch
 from rest_framework import decorators, permissions, viewsets, status
 from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
 from rest_framework.response import Response
-import json
-import hashlib
 import requests
 
 from betterversion.better import version_windows, BetterVersion
 from builds.models import Version
 from djangome import views as djangome
+from search.indexes import Page as PageIndex, Project as ProjectIndex
 from projects.models import Project, EmailHook
-from search.indexes import Page
 from .serializers import ProjectSerializer
 from .permissions import RelatedProjectIsOwner
@@ -181,23 +182,6 @@ def quick_search(request):
         ret_dict[key] = value
     return Response({"results": ret_dict})
 
-@decorators.api_view(['GET'])
-@decorators.permission_classes((permissions.AllowAny,))
-@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
-def search(request):
-    project_slug = request.GET.get('project', None)
-    if not project_slug:
-        return Response({'error': 'project GET argument required'}, status=status.HTTP_400_BAD_REQUEST)
-    version_slug = request.GET.get('version', 'latest')
-    query = request.GET.get('q', None)
-    es = Elasticsearch(settings.ES_HOSTS)
-    ret_dict = {}
-    results = es.query({'project': project_slug, 'version': version_slug, 'query': query})
-    for result in results:
-        #ret_dict[result['key']] = result['url']
-        pass
-    return Response({"results": ret_dict})
-
 @decorators.api_view(['POST'])
 @decorators.permission_classes((permissions.IsAdminUser,))
 @decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
@@ -233,3 +217,45 @@ def index_search(request):
         index_list.append(page)
     page_obj.bulk_index(index_list, parent=project_pk)
     return Response({'indexed': True})
+
+
+@decorators.api_view(['GET'])
+@decorators.permission_classes((permissions.AllowAny,))
+@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
+def search(request):
+    project_id = request.GET.get('project', None)
+    version_slug = request.GET.get('version', 'latest')
+    query = request.GET.get('q', None)
+    if project_id:
+        # This is a search within a project -- do a Page search.
+        body = {
+            # Both terms must match; a plain dict would silently drop a
+            # duplicate 'term' key, so combine them in an 'and' filter.
+            'filter': {
+                'and': [
+                    {'term': {'project': project_id}},
+                    {'term': {'version': version_slug}},
+                ]
+            },
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'title': {'query': query, 'boost': 10}}},
+                        {'match': {'headers': {'query': query, 'boost': 5}}},
+                        {'match': {'content': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = PageIndex().search(body, routing=project_id)
+    else:
+        body = {
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'name': {'query': query, 'boost': 10}}},
+                        {'match': {'description': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = ProjectIndex().search(body)
+    return Response({'results': results})
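
For reference, the reworked endpoint can be exercised over HTTP once a route points at it. A minimal sketch, assuming the view is exposed at /api/v2/search/ on a local dev server (the URL conf, host, and example project id are not part of this diff and are assumptions):

    import requests

    # Search within one project's pages (routed to that project's shard).
    r = requests.get('http://localhost:8000/api/v2/search/',
                     params={'project': '6', 'version': 'latest', 'q': 'install'})
    print r.json()['results']

    # With no project given, the view searches across projects instead.
    r = requests.get('http://localhost:8000/api/v2/search/', params={'q': 'django'})
    print r.json()['results']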

readthedocs/search/indexes.py

@@ -0,0 +1,267 @@
+"""
+Search indexing classes to index into Elasticsearch.
+
+Django settings that should be defined:
+
+`ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g.
+            ['192.168.1.1:9200', '192.168.2.1:9200']
+
+`ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas.
+
+`ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards.
+
+TODO: Handle page removal case in Page.
+"""
+import datetime
+
+from elasticsearch import Elasticsearch, exceptions
+from elasticsearch.helpers import bulk_index
+
+from django.conf import settings
+
+
+class Index(object):
+    """
+    Base class to define some common methods across indexes.
+    """
+    # The _index and _type define the URL path to Elasticsearch, e.g.:
+    #   http://localhost:9200/{_index}/{_type}/_search
+    _index = 'readthedocs'
+    _type = None
+
+    def __init__(self):
+        self.es = Elasticsearch(settings.ES_HOSTS)
+
+    def get_settings(self, settings_override=None):
+        """
+        Returns settings to be passed to ES create_index.
+
+        If `settings_override` is provided, this will use `settings_override`
+        to override the defaults defined here.
+        """
+        default_settings = {
+            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
+            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
+            'refresh_interval': '5s',
+            'store.compress.tv': True,
+            'store.compress.stored': True,
+            'analysis': self.get_analysis(),
+        }
+        if settings_override:
+            default_settings.update(settings_override)
+        return default_settings
+
+    def get_analysis(self):
+        """
+        Returns the analysis dict to be used in settings for create_index.
+
+        For languages that ES supports we define either the minimal or light
+        stemming, which isn't as aggressive as the snowball stemmer. We also
+        define the stopwords for that language.
+
+        For all languages we've customized we're using the ICU plugin.
+        """
+        analyzers = {}
+        filters = {}
+
+        # The default is used for fields that need ICU but are composed of
+        # many languages.
+        analyzers['default_icu'] = {
+            'type': 'custom',
+            'tokenizer': 'icu_tokenizer',
+            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
+        }
+
+        # Customize the word_delimiter filter to set various options.
+        filters['custom_word_delimiter'] = {
+            'type': 'word_delimiter',
+            'preserve_original': True,
+        }
+
+        return {
+            'analyzer': analyzers,
+            'filter': filters,
+        }
+
+    def timestamped_index(self):
+        return '{0}-{1}'.format(
+            self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
+
+    def create_index(self, index=None):
+        """
+        Creates index.
+
+        This uses `get_settings` and `get_mapping` to define the index.
+        """
+        index = index or self._index
+        body = {
+            'settings': self.get_settings(),
+        }
+        self.es.indices.create(index=index, body=body)
+
+    def put_mapping(self, index=None):
+        index = index or self._index
+        self.es.indices.put_mapping(index, self._type, self.get_mapping())
+
+    def bulk_index(self, data, index=None, chunk_size=500, parent=None):
+        """
+        Given a list of documents, uses Elasticsearch bulk indexing.
+
+        For each doc this calls `extract_document`, then indexes.
+
+        `chunk_size` defaults to the elasticsearch lib's default. Override per
+        your document size as needed.
+        """
+        index = index or self._index
+        docs = []
+        for d in data:
+            source = self.extract_document(d)
+            doc = {
+                '_index': index,
+                '_type': self._type,
+                '_id': source['id'],
+                '_source': source,
+            }
+            if parent:
+                doc['_parent'] = parent
+            docs.append(doc)
+        bulk_index(self.es, docs, chunk_size=chunk_size)
+
+    def index_document(self, data, index=None, parent=None):
+        index = index or self._index
+        doc = self.extract_document(data)
+        self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'],
+                      parent=parent)
+
+    def get_mapping(self):
+        """
+        Returns the mapping for this _index and _type.
+        """
+        raise NotImplementedError
+
+    def extract_document(self, data):
+        """
+        Extracts the Elasticsearch document for this object instance.
+        """
+        raise NotImplementedError
+
+    def update_aliases(self, new_index, delete=True):
+        """
+        Points `_index` to `new_index` and deletes `_index` if delete=True.
+
+        The ES `update_aliases` is atomic.
+        """
+        old_index = None
+
+        # Get current alias, if any.
+        try:
+            aliases = self.es.indices.get_alias(name=self._index)
+            if aliases and aliases.keys():
+                old_index = aliases.keys()[0]
+        except exceptions.NotFoundError:
+            pass
+
+        actions = []
+        if old_index:
+            actions.append({'remove': {'index': old_index,
+                                       'alias': self._index}})
+        actions.append({'add': {'index': new_index, 'alias': self._index}})
+        self.es.indices.update_aliases(body={'actions': actions})
+
+        # Delete old index if any and if specified.
+        if delete and old_index:
+            self.es.indices.delete(index=old_index)
+
+    def search(self, body, **kwargs):
+        return self.es.search(index=self._index, doc_type=self._type,
+                              body=body, **kwargs)
+
+
+class Project(Index):
+    _type = 'project'
+
+    def get_mapping(self):
+        mapping = {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                'properties': {
+                    'id': {'type': 'long'},
+                    'name': {'type': 'string', 'analyzer': 'default_icu'},
+                    'slug': {'type': 'string', 'index': 'not_analyzed'},
+                    'description': {'type': 'string',
+                                    'analyzer': 'default_icu'},
+                    'lang': {'type': 'string', 'index': 'not_analyzed'},
+                    'author': {'type': 'string', 'analyzer': 'default_icu'},
+                    'url': {'type': 'string', 'index': 'not_analyzed'},
+                }
+            }
+        }
+        return mapping
+
+    def extract_document(self, data):
+        doc = {}
+
+        attrs = ('id', 'name', 'description', 'author', 'url')
+        for attr in attrs:
+            doc[attr] = data.get(attr, '')
+
+        # Add project boost.
+        doc['_boost'] = data.get('_boost', 1.0)
+
+        return doc
+
+
+class Page(Index):
+    _type = 'page'
+    _parent = 'project'
+
+    def get_mapping(self):
+        mapping = {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                # Associate a page with a project.
+                '_parent': {'type': self._parent},
+                'properties': {
+                    'id': {'type': 'string', 'index': 'not_analyzed'},
+                    'project': {'type': 'long'},
+                    'title': {'type': 'string', 'analyzer': 'default_icu'},
+                    'headers': {'type': 'string', 'analyzer': 'default_icu'},
+                    'version': {'type': 'string', 'index': 'not_analyzed'},
+                    'path': {'type': 'string', 'index': 'not_analyzed'},
+                    'content': {'type': 'string', 'analyzer': 'default_icu'},
+                }
+            }
+        }
+        return mapping
+
+    def extract_document(self, data):
+        doc = {}
+
+        attrs = ('id', 'project', 'title', 'headers', 'version', 'path',
+                 'content')
+        for attr in attrs:
+            doc[attr] = data.get(attr, '')
+
+        # Add page boost.
+        doc['_boost'] = data.get('_boost', 1.0)
+
+        return doc
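
Taken together, the methods above support a zero-downtime reindex: build a fresh timestamped index, load it, then atomically swap the alias. A minimal sketch using only the API defined in this file (the example document and its id are illustrative):

    from search.indexes import Project as ProjectIndex

    project_index = ProjectIndex()
    new_index = project_index.timestamped_index()  # e.g. 'readthedocs-20131026191859'
    project_index.create_index(index=new_index)
    project_index.put_mapping(index=new_index)
    project_index.bulk_index([
        {'id': 6, 'name': 'Pip', 'description': 'The Python package installer'},
    ], index=new_index)
    project_index.update_aliases(new_index)  # atomic alias swap; deletes the old index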

@@ -173,6 +173,10 @@ HAYSTACK_CONNECTIONS = {
     },
 }
 
+# Elasticsearch settings.
+ES_HOSTS = ['127.0.0.1:9200']
+ES_DEFAULT_NUM_REPLICAS = 0
+ES_DEFAULT_NUM_SHARDS = 5
 
 AUTH_PROFILE_MODULE = "core.UserProfile"
 SOUTH_TESTS_MIGRATE = False
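
These defaults suit a single local node (zero replicas would leave data unreplicated on a production cluster). A quick connectivity check against them, assuming an Elasticsearch node is actually listening on 127.0.0.1:9200:

    from django.conf import settings
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ES_HOSTS)
    print es.info()  # cluster name and version if the node is reachable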