Changeset - 2b7a0e28c4dc
[Not reviewed]
default
0 2 0
Takumi IINO - 10 years ago 2015-10-16 17:12:50
trot.thunder@gmail.com
indexers: introduce list of filenames for indexing

Before this patch, we cannot search files that without file extension (like
Makefile) and dotfiles (like .hgtags).

This patch makes it possible to search for these files by introducing a list
of filenames for indexing. The list is currently empty.
2 files changed with 15 insertions and 7 deletions:
0 comments (0 inline, 0 general)
kallithea/config/conf.py
Show inline comments
 
@@ -11,54 +11,56 @@
 
#
 
# You should have received a copy of the GNU General Public License
 
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
"""
 
kallithea.config.conf
 
~~~~~~~~~~~~~~~~~~~~~
 

	
 
Various config settings for Kallithea
 

	
 
This file was forked by the Kallithea project in July 2014.
 
Original author and date, and relevant copyright and licensing information is below:
 
:created_on: Mar 7, 2012
 
:author: marcink
 
:copyright: (c) 2013 RhodeCode GmbH, and others.
 
:license: GPLv3, see LICENSE.md for more details.
 
"""
 

	
 
from kallithea.lib.utils2 import __get_lem
 

	
 

	
 
# language map is also used by whoosh indexer, which for those specified
 
# extensions will index it's content
 
LANGUAGES_EXTENSIONS_MAP = __get_lem()
 

	
 
#==============================================================================
 
# WHOOSH INDEX EXTENSIONS
 
#==============================================================================
 
# EXTENSIONS WE WANT TO INDEX CONTENT OFF USING WHOOSH
 
# Whoosh index targets
 

	
 
# Extensions we want to index content of using whoosh
 
INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
 

	
 
# Filenames we want to index content of using whoosh
 
INDEX_FILENAMES = []
 

	
 
# list of readme files to search in file tree and display in summary
 
# attached weights defines the search  order lower is first
 
ALL_READMES = [
 
    ('readme', 0), ('README', 0), ('Readme', 0),
 
    ('doc/readme', 1), ('doc/README', 1), ('doc/Readme', 1),
 
    ('Docs/readme', 2), ('Docs/README', 2), ('Docs/Readme', 2),
 
    ('DOCS/readme', 2), ('DOCS/README', 2), ('DOCS/Readme', 2),
 
    ('docs/readme', 2), ('docs/README', 2), ('docs/Readme', 2),
 
]
 

	
 
# extension together with weights to search lower is first
 
RST_EXTS = [
 
    ('', 0), ('.rst', 1), ('.rest', 1),
 
    ('.RST', 2), ('.REST', 2),
 
    ('.txt', 3), ('.TXT', 3)
 
]
 

	
 
MARKDOWN_EXTS = [
 
    ('.md', 1), ('.MD', 1),
 
    ('.mkdn', 2), ('.MKDN', 2),
 
    ('.mdown', 3), ('.MDOWN', 3),
 
    ('.markdown', 4), ('.MARKDOWN', 4)
 
]
 

	
kallithea/lib/indexers/daemon.py
Show inline comments
 
@@ -20,49 +20,49 @@ A daemon will read from task table and r
 
This file was forked by the Kallithea project in July 2014.
 
Original author and date, and relevant copyright and licensing information is below:
 
:created_on: Jan 26, 2010
 
:author: marcink
 
:copyright: (c) 2013 RhodeCode GmbH, and others.
 
:license: GPLv3, see LICENSE.md for more details.
 
"""
 

	
 

	
 
import os
 
import sys
 
import logging
 
import traceback
 

	
 
from shutil import rmtree
 
from time import mktime
 

	
 
from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
# Add location of top level folder to sys.path
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 
from kallithea.config.conf import INDEX_EXTENSIONS
 
from kallithea.config.conf import INDEX_EXTENSIONS, INDEX_FILENAMES
 
from kallithea.model.scm import ScmModel
 
from kallithea.model.db import Repository
 
from kallithea.lib.utils2 import safe_unicode, safe_str
 
from kallithea.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
 
    CHGSET_IDX_NAME
 

	
 
from kallithea.lib.vcs.exceptions import ChangesetError, RepositoryError, \
 
    NodeDoesNotExistError
 

	
 
from whoosh.index import create_in, open_dir, exists_in
 
from whoosh.query import *
 
from whoosh.qparser import QueryParser
 

	
 
log = logging.getLogger('whoosh_indexer')
 

	
 

	
 
class WhooshIndexingDaemon(object):
 
    """
 
    Daemon for atomic indexing jobs
 
    """
 

	
 
    def __init__(self, indexname=IDX_NAME, index_location=None,
 
                 repo_location=None, sa=None, repo_list=None,
 
                 repo_update_list=None):
 
@@ -141,61 +141,67 @@ class WhooshIndexingDaemon(object):
 
        except RepositoryError:
 
            log.debug(traceback.format_exc())
 
            pass
 
        return index_paths_
 

	
 
    def get_node(self, repo, path, index_rev=None):
 
        """
 
        gets a filenode based on given full path. It operates on string for
 
        hg git compatibility.
 

	
 
        :param repo: scm repo instance
 
        :param path: full path including root location
 
        :return: FileNode
 
        """
 
        # FIXME: paths should be normalized ... or even better: don't include repo.path
 
        path = safe_str(path)
 
        repo_path = safe_str(repo.path)
 
        assert path.startswith(repo_path)
 
        assert path[len(repo_path)] in (os.path.sep, os.path.altsep)
 
        node_path = path[len(repo_path) + 1:]
 
        cs = self._get_index_changeset(repo, index_rev=index_rev)
 
        node = cs.get_node(node_path)
 
        return node
 

	
 
    def is_indexable_node(self, node):
 
        """
 
        Just index the content of chosen files, skipping binary files
 
        """
 
        return (node.extension in INDEX_EXTENSIONS or node.name in INDEX_FILENAMES) and \
 
               not node.is_binary
 

	
 
    def get_node_mtime(self, node):
 
        return mktime(node.last_changeset.date.timetuple())
 

	
 
    def add_doc(self, writer, path, repo, repo_name, index_rev=None):
 
        """
 
        Adding doc to writer this function itself fetches data from
 
        the instance of vcs backend
 
        """
 

	
 
        node = self.get_node(repo, path, index_rev)
 
        indexed = indexed_w_content = 0
 
        # we just index the content of chosen files, and skip binary files
 
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
 
        if self.is_indexable_node(node):
 
            u_content = node.content
 
            if not isinstance(u_content, unicode):
 
                log.warning('  >> %s Could not get this content as unicode '
 
                            'replacing with empty content' % path)
 
                u_content = u''
 
            else:
 
                log.debug('    >> %s [WITH CONTENT]', path)
 
                indexed_w_content += 1
 

	
 
        else:
 
            log.debug('    >> %s', path)
 
            # just index file name without it's content
 
            u_content = u''
 
            indexed += 1
 

	
 
        p = safe_unicode(path)
 
        writer.add_document(
 
            fileid=p,
 
            owner=unicode(repo.contact),
 
            repository=safe_unicode(repo_name),
 
            path=p,
 
            content=u_content,
 
            modtime=self.get_node_mtime(node),
 
            extension=node.extension
0 comments (0 inline, 0 general)