Merge remote branch 'origin/master' into search

Conflicts:
	readthedocs/restapi/views.py
Eric Holscher 2013-10-26 19:18:59 -07:00
commit ab871ad889
5 changed files with 318 additions and 20 deletions

@@ -14,6 +14,7 @@ django-profiles==0.2
 django-secure==0.1.2
 django==1.4.8
 docutils==0.8.1
+elasticsearch==0.4.3
 github2==0.5.2
 httplib2==0.7.2
 mercurial==2.6.3

readthedocs/restapi/views.py

@@ -1,3 +1,6 @@
+import json
+import hashlib
+
 from django.shortcuts import get_object_or_404
 from django.template import Template, Context
 from django.conf import settings
@@ -7,15 +10,13 @@ from elasticsearch import Elasticsearch
 from rest_framework import decorators, permissions, viewsets, status
 from rest_framework.renderers import JSONPRenderer, JSONRenderer, BrowsableAPIRenderer
 from rest_framework.response import Response
-import json
-import hashlib
 import requests
 
 from betterversion.better import version_windows, BetterVersion
 from builds.models import Version
 from djangome import views as djangome
+from search.indexes import Page as PageIndex, Project as ProjectIndex
 from projects.models import Project, EmailHook
-from search.indexes import Page
 from .serializers import ProjectSerializer
 from .permissions import RelatedProjectIsOwner
@@ -181,23 +182,6 @@ def quick_search(request):
         ret_dict[key] = value
     return Response({"results": ret_dict})
 
-@decorators.api_view(['GET'])
-@decorators.permission_classes((permissions.AllowAny,))
-@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
-def search(request):
-    project_slug = request.GET.get('project', None)
-    if not project_slug:
-        return Response({'error': 'project GET argument required'}, status=status.HTTP_400_BAD_REQUEST)
-    version_slug = request.GET.get('version', 'latest')
-    query = request.GET.get('q', None)
-    es = Elasticsearch(settings.ES_HOSTS)
-    ret_dict = {}
-    results = es.query({'project': project_slug, 'version': version_slug, 'query': query})
-    for result in results:
-        #ret_dict[result['key']] = result['url']
-        pass
-    return Response({"results": ret_dict})
-
 @decorators.api_view(['POST'])
 @decorators.permission_classes((permissions.IsAdminUser,))
 @decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
@@ -233,3 +217,45 @@ def index_search(request):
         index_list.append(page)
     page_obj.bulk_index(index_list, parent=project_pk)
     return Response({'indexed': True})
+
+
+@decorators.api_view(['GET'])
+@decorators.permission_classes((permissions.AllowAny,))
+@decorators.renderer_classes((JSONRenderer, JSONPRenderer, BrowsableAPIRenderer))
+def search(request):
+    project_id = request.GET.get('project', None)
+    version_slug = request.GET.get('version', 'latest')
+    query = request.GET.get('q', None)
+    if project_id:
+        # This is a search within a project -- do a Page search.
+        body = {
+            # Both terms must match; a plain dict would silently drop a
+            # duplicate 'term' key, so combine them in an 'and' filter.
+            'filter': {
+                'and': [
+                    {'term': {'project': project_id}},
+                    {'term': {'version': version_slug}},
+                ]
+            },
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'title': {'query': query, 'boost': 10}}},
+                        {'match': {'headers': {'query': query, 'boost': 5}}},
+                        {'match': {'content': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = PageIndex().search(body, routing=project_id)
+    else:
+        body = {
+            'query': {
+                'bool': {
+                    'should': [
+                        {'match': {'name': {'query': query, 'boost': 10}}},
+                        {'match': {'description': {'query': query}}},
+                    ]
+                }
+            }
+        }
+        results = ProjectIndex().search(body)
+    return Response({'results': results})
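
For reference, the reworked endpoint can be exercised over HTTP once a route points at it. A minimal sketch, assuming the view is exposed at /api/v2/search/ on a local dev server (the URL conf, host, and example project id are not part of this diff and are assumptions):

    import requests

    # Search within one project's pages (routed to that project's shard).
    r = requests.get('http://localhost:8000/api/v2/search/',
                     params={'project': '6', 'version': 'latest', 'q': 'install'})
    print r.json()['results']

    # With no project given, the view searches across projects instead.
    r = requests.get('http://localhost:8000/api/v2/search/', params={'q': 'django'})
    print r.json()['results']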

readthedocs/search/indexes.py

@@ -0,0 +1,267 @@
+"""
+Search indexing classes to index into Elasticsearch.
+
+Django settings that should be defined:
+
+`ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g.
+            ['192.168.1.1:9200', '192.168.2.1:9200']
+
+`ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas.
+
+`ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards.
+
+TODO: Handle page removal case in Page.
+"""
+import datetime
+
+from elasticsearch import Elasticsearch, exceptions
+from elasticsearch.helpers import bulk_index
+
+from django.conf import settings
+
+
+class Index(object):
+    """
+    Base class to define some common methods across indexes.
+    """
+    # The _index and _type define the URL path to Elasticsearch, e.g.:
+    #   http://localhost:9200/{_index}/{_type}/_search
+    _index = 'readthedocs'
+    _type = None
+
+    def __init__(self):
+        self.es = Elasticsearch(settings.ES_HOSTS)
+
+    def get_settings(self, settings_override=None):
+        """
+        Returns settings to be passed to ES create_index.
+
+        If `settings_override` is provided, this will use `settings_override`
+        to override the defaults defined here.
+        """
+        default_settings = {
+            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
+            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
+            'refresh_interval': '5s',
+            'store.compress.tv': True,
+            'store.compress.stored': True,
+            'analysis': self.get_analysis(),
+        }
+        if settings_override:
+            default_settings.update(settings_override)
+        return default_settings
+
+    def get_analysis(self):
+        """
+        Returns the analysis dict to be used in settings for create_index.
+
+        For languages that ES supports we define either the minimal or light
+        stemming, which isn't as aggressive as the snowball stemmer. We also
+        define the stopwords for that language.
+
+        For all languages we've customized we're using the ICU plugin.
+        """
+        analyzers = {}
+        filters = {}
+
+        # The default is used for fields that need ICU but are composed of
+        # many languages.
+        analyzers['default_icu'] = {
+            'type': 'custom',
+            'tokenizer': 'icu_tokenizer',
+            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
+        }
+
+        # Customize the word_delimiter filter to set various options.
+        filters['custom_word_delimiter'] = {
+            'type': 'word_delimiter',
+            'preserve_original': True,
+        }
+
+        return {
+            'analyzer': analyzers,
+            'filter': filters,
+        }
+
+    def timestamped_index(self):
+        return '{0}-{1}'.format(
+            self._index, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
+
+    def create_index(self, index=None):
+        """
+        Creates index.
+
+        This uses `get_settings` and `get_mapping` to define the index.
+        """
+        index = index or self._index
+        body = {
+            'settings': self.get_settings(),
+        }
+        self.es.indices.create(index=index, body=body)
+
+    def put_mapping(self, index=None):
+        index = index or self._index
+        self.es.indices.put_mapping(index, self._type, self.get_mapping())
+
+    def bulk_index(self, data, index=None, chunk_size=500, parent=None):
+        """
+        Given a list of documents, uses Elasticsearch bulk indexing.
+
+        For each doc this calls `extract_document`, then indexes.
+
+        `chunk_size` defaults to the elasticsearch lib's default. Override per
+        your document size as needed.
+        """
+        index = index or self._index
+        docs = []
+        for d in data:
+            source = self.extract_document(d)
+            doc = {
+                '_index': index,
+                '_type': self._type,
+                '_id': source['id'],
+                '_source': source,
+            }
+            if parent:
+                doc['_parent'] = parent
+            docs.append(doc)
+        bulk_index(self.es, docs, chunk_size=chunk_size)
+
+    def index_document(self, data, index=None, parent=None):
+        index = index or self._index
+        doc = self.extract_document(data)
+        self.es.index(index=index, doc_type=self._type, body=doc, id=doc['id'],
+                      parent=parent)
+
+    def get_mapping(self):
+        """
+        Returns the mapping for this _index and _type.
+        """
+        raise NotImplementedError
+
+    def extract_document(self, data):
+        """
+        Extracts the Elasticsearch document for this object instance.
+        """
+        raise NotImplementedError
+
+    def update_aliases(self, new_index, delete=True):
+        """
+        Points `_index` to `new_index` and deletes `_index` if delete=True.
+
+        The ES `update_aliases` is atomic.
+        """
+        old_index = None
+
+        # Get current alias, if any.
+        try:
+            aliases = self.es.indices.get_alias(name=self._index)
+            if aliases and aliases.keys():
+                old_index = aliases.keys()[0]
+        except exceptions.NotFoundError:
+            pass
+
+        actions = []
+        if old_index:
+            actions.append({'remove': {'index': old_index,
+                                       'alias': self._index}})
+        actions.append({'add': {'index': new_index, 'alias': self._index}})
+        self.es.indices.update_aliases(body={'actions': actions})
+
+        # Delete old index if any and if specified.
+        if delete and old_index:
+            self.es.indices.delete(index=old_index)
+
+    def search(self, body, **kwargs):
+        return self.es.search(index=self._index, doc_type=self._type,
+                              body=body, **kwargs)
+
+
+class Project(Index):
+    _type = 'project'
+
+    def get_mapping(self):
+        mapping = {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                'properties': {
+                    'id': {'type': 'long'},
+                    'name': {'type': 'string', 'analyzer': 'default_icu'},
+                    'slug': {'type': 'string', 'index': 'not_analyzed'},
+                    'description': {'type': 'string',
+                                    'analyzer': 'default_icu'},
+                    'lang': {'type': 'string', 'index': 'not_analyzed'},
+                    'author': {'type': 'string', 'analyzer': 'default_icu'},
+                    'url': {'type': 'string', 'index': 'not_analyzed'},
+                }
+            }
+        }
+        return mapping
+
+    def extract_document(self, data):
+        doc = {}
+
+        attrs = ('id', 'name', 'description', 'author', 'url')
+        for attr in attrs:
+            doc[attr] = data.get(attr, '')
+
+        # Add project boost.
+        doc['_boost'] = data.get('_boost', 1.0)
+
+        return doc
+
+
+class Page(Index):
+    _type = 'page'
+    _parent = 'project'
+
+    def get_mapping(self):
+        mapping = {
+            self._type: {
+                # Disable _all field to reduce index size.
+                '_all': {'enabled': False},
+                # Add a boost field to enhance relevancy of a document.
+                '_boost': {'name': '_boost', 'null_value': 1.0},
+                # Associate a page with a project.
+                '_parent': {'type': self._parent},
+                'properties': {
+                    'id': {'type': 'string', 'index': 'not_analyzed'},
+                    'project': {'type': 'long'},
+                    'title': {'type': 'string', 'analyzer': 'default_icu'},
+                    'headers': {'type': 'string', 'analyzer': 'default_icu'},
+                    'version': {'type': 'string', 'index': 'not_analyzed'},
+                    'path': {'type': 'string', 'index': 'not_analyzed'},
+                    'content': {'type': 'string', 'analyzer': 'default_icu'},
+                }
+            }
+        }
+        return mapping
+
+    def extract_document(self, data):
+        doc = {}
+
+        attrs = ('id', 'project', 'title', 'headers', 'version', 'path',
+                 'content')
+        for attr in attrs:
+            doc[attr] = data.get(attr, '')
+
+        # Add page boost.
+        doc['_boost'] = data.get('_boost', 1.0)
+
+        return doc
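
Taken together, the methods above support a zero-downtime reindex: build a fresh timestamped index, load it, then atomically swap the alias. A minimal sketch using only the API defined in this file (the example document and its id are illustrative):

    from search.indexes import Project as ProjectIndex

    project_index = ProjectIndex()
    new_index = project_index.timestamped_index()  # e.g. 'readthedocs-20131026191859'
    project_index.create_index(index=new_index)
    project_index.put_mapping(index=new_index)
    project_index.bulk_index([
        {'id': 6, 'name': 'Pip', 'description': 'The Python package installer'},
    ], index=new_index)
    project_index.update_aliases(new_index)  # atomic alias swap; deletes the old index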

@@ -173,6 +173,10 @@ HAYSTACK_CONNECTIONS = {
     },
 }
 
+# Elasticsearch settings.
+ES_HOSTS = ['127.0.0.1:9200']
+ES_DEFAULT_NUM_REPLICAS = 0
+ES_DEFAULT_NUM_SHARDS = 5
 
 AUTH_PROFILE_MODULE = "core.UserProfile"
 SOUTH_TESTS_MIGRATE = False
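
These defaults suit a single local node (zero replicas would leave data unreplicated on a production cluster). A quick connectivity check against them, assuming an Elasticsearch node is actually listening on 127.0.0.1:9200:

    from django.conf import settings
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ES_HOSTS)
    print es.info()  # cluster name and version if the node is reachable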