kallithea Changeset - 2b7a0e28c4dc

Changeset - 2b7a0e28c4dc

Parent rev.

Child rev.

[Not reviewed]

default

0 2 0

Takumi IINO - 10 years ago 2015-10-16 17:12:50
trot.thunder@gmail.com

indexers: introduce list of filenames for indexing

Before this patch, we could not search for files without a file extension
(like Makefile) or for dotfiles (like .hgtags).

This patch makes it possible to search for these files by introducing a list
of filenames for indexing. The list is currently empty.

2 files changed with 15 insertions and 7 deletions:

kallithea/config/conf.py

kallithea/lib/indexers/daemon.py

0 comments (0 inline, 0 general)

kallithea/config/conf.py

➞

Show inline comments

@@ @@ -11,54 +11,56 @@ @@
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 kallithea.config.conf
 ~~~~~~~~~~~~~~~~~~~~~
 Various config settings for Kallithea
 This file was forked by the Kallithea project in July 2014.
 Original author and date, and relevant copyright and licensing information is below:
 :created_on: Mar 7, 2012
 :author: marcink
 :copyright: (c) 2013 RhodeCode GmbH, and others.
 :license: GPLv3, see LICENSE.md for more details.
 """
 from kallithea.lib.utils2 import __get_lem
 # the language map is also used by the whoosh indexer, which indexes the
 # content of files with the specified extensions
 LANGUAGES_EXTENSIONS_MAP = __get_lem()
 #==============================================================================
 # WHOOSH INDEX EXTENSIONS
 #==============================================================================
 # EXTENSIONS WE WANT TO INDEX CONTENT OFF USING WHOOSH
 # Whoosh index targets
 # Extensions we want to index content of using whoosh
 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
 # Filenames we want to index content of using whoosh
 INDEX_FILENAMES = []
 # list of readme files to search for in the file tree and display in the summary;
 # the attached weights define the search order (lower is first)
 ALL_READMES = [
     ('readme', 0), ('README', 0), ('Readme', 0),
     ('doc/readme', 1), ('doc/README', 1), ('doc/Readme', 1),
     ('Docs/readme', 2), ('Docs/README', 2), ('Docs/Readme', 2),
     ('DOCS/readme', 2), ('DOCS/README', 2), ('DOCS/Readme', 2),
     ('docs/readme', 2), ('docs/README', 2), ('docs/Readme', 2),
+]
 # extensions together with weights defining the search order (lower is first)
 RST_EXTS = [
     ('', 0), ('.rst', 1), ('.rest', 1),
     ('.RST', 2), ('.REST', 2),
     ('.txt', 3), ('.TXT', 3)
+]
 MARKDOWN_EXTS = [
     ('.md', 1), ('.MD', 1),
     ('.mkdn', 2), ('.MKDN', 2),
     ('.mdown', 3), ('.MDOWN', 3),
     ('.markdown', 4), ('.MARKDOWN', 4)
+]

kallithea/lib/indexers/daemon.py

➞

Show inline comments

@@ @@ -20,49 +20,49 @@ A daemon will read from task table and r @@
 This file was forked by the Kallithea project in July 2014.
 Original author and date, and relevant copyright and licensing information is below:
 :created_on: Jan 26, 2010
 :author: marcink
 :copyright: (c) 2013 RhodeCode GmbH, and others.
 :license: GPLv3, see LICENSE.md for more details.
 """
 import os
 import sys
 import logging
 import traceback
 from shutil import rmtree
 from time import mktime
 from os.path import dirname as dn
 from os.path import join as jn
 # Add location of top level folder to sys.path
 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 sys.path.append(project_path)
 from kallithea.config.conf import INDEX_EXTENSIONS
+from kallithea.config.conf import INDEX_EXTENSIONS, INDEX_FILENAMES
 from kallithea.model.scm import ScmModel
 from kallithea.model.db import Repository
 from kallithea.lib.utils2 import safe_unicode, safe_str
 from kallithea.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
     CHGSET_IDX_NAME
 from kallithea.lib.vcs.exceptions import ChangesetError, RepositoryError, \
     NodeDoesNotExistError
 from whoosh.index import create_in, open_dir, exists_in
 from whoosh.query import *
 from whoosh.qparser import QueryParser
 log = logging.getLogger('whoosh_indexer')
 class WhooshIndexingDaemon(object):
     """
     Daemon for atomic indexing jobs
     """
     def __init__(self, indexname=IDX_NAME, index_location=None,
                  repo_location=None, sa=None, repo_list=None,
                  repo_update_list=None):
@@ @@ -141,61 +141,67 @@ class WhooshIndexingDaemon(object): @@
         except RepositoryError:
             log.debug(traceback.format_exc())
             pass
         return index_paths_
     def get_node(self, repo, path, index_rev=None):
         """
         Look up the file node for a full path. Paths are handled as plain
         strings for hg git compatibility.

         :param repo: scm repo instance
         :param path: full path including root location
         :param index_rev: revision handed on to ``_get_index_changeset``
         :return: FileNode
         """
         # FIXME: paths should be normalized ... or even better: don't include repo.path
         full_path = safe_str(path)
         root = safe_str(repo.path)
         root_len = len(root)
         assert full_path.startswith(root)
         # the character right after the repo root must be a path separator
         assert full_path[root_len] in (os.path.sep, os.path.altsep)
         # strip the root and the separator to get the repo-relative path
         relative_path = full_path[root_len + 1:]
         changeset = self._get_index_changeset(repo, index_rev=index_rev)
         return changeset.get_node(relative_path)
     def is_indexable_node(self, node):
         """
         Tell whether the content of ``node`` should be indexed: its extension
         is in INDEX_EXTENSIONS or its name is in INDEX_FILENAMES, and it is
         not a binary file.
         """
         whitelisted = (node.extension in INDEX_EXTENSIONS
                        or node.name in INDEX_FILENAMES)
         if not whitelisted:
             return False
         # is_binary is only consulted for whitelisted nodes
         return not node.is_binary
     def get_node_mtime(self, node):
         """
         Return the date of the node's last changeset as a timestamp, obtained
         by passing the date's time tuple to ``mktime``.
         """
         changed_on = node.last_changeset.date
         return mktime(changed_on.timetuple())
     def add_doc(self, writer, path, repo, repo_name, index_rev=None):
         """
         Adding doc to writer this function itself fetches data from
         the instance of vcs backend
         """
         node = self.get_node(repo, path, index_rev)
         indexed = indexed_w_content = 0
         # we just index the content of chosen files, and skip binary files
         if node.extension in INDEX_EXTENSIONS and not node.is_binary:
         if self.is_indexable_node(node):
             u_content = node.content
             if not isinstance(u_content, unicode):
                 log.warning('  >> %s Could not get this content as unicode '
                             'replacing with empty content' % path)
                 u_content = u''
             else:
                 log.debug('    >> %s [WITH CONTENT]', path)
                 indexed_w_content += 1
         else:
             log.debug('    >> %s', path)
             # just index file name without it's content
             u_content = u''
             indexed += 1
         p = safe_unicode(path)
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
             modtime=self.get_node_mtime(node),
             extension=node.extension

0 comments (0 inline, 0 general)