Changeset - 28f19fa562df
[Not reviewed]
default
0 4 0
Marcin Kuzminski - 15 years ago 2010-08-28 14:53:32
marcin@python-works.com
updated config files,
Implemented content index extensions with whoosh,
fixed analyzer to match more words
4 files changed with 67 insertions and 15 deletions:
0 comments (0 inline, 0 general)
development.ini
Show inline comments
 
@@ -52,6 +52,26 @@ beaker.cache.short_term.expire=60
 
beaker.cache.super_short_term.type=memory
 
beaker.cache.super_short_term.expire=10
 

	
 
####################################
 
###       BEAKER SESSION        ####
 
####################################
 
## Type of storage used for the session, current types are 
 
## “dbm”, “file”, “memcached”, “database”, and “memory”. 
 
## The storage uses the Container API 
 
##that is also used by the cache system.
 
beaker.session.type = file
 

	
 
beaker.session.key = hg-app
 
beaker.session.secret = g654dcno0-9873jhgfreyu
 
beaker.session.timeout = 36000
 

	
 
##auto save the session so there is no need to call .save()
 
beaker.session.auto = False
 

	
 
##true expire at browser close
 
#beaker.session.cookie_expires = 3600
 

	
 
    
 
################################################################################
 
## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
 
## Debug mode will enable the interactive debugging tool, allowing ANYONE to  ##
production.ini
Show inline comments
 
@@ -51,6 +51,26 @@ beaker.cache.short_term.type=memory
 
beaker.cache.short_term.expire=60
 
beaker.cache.super_short_term.type=memory
 
beaker.cache.super_short_term.expire=10
 

	
 
####################################
 
###       BEAKER SESSION        ####
 
####################################
 
## Type of storage used for the session, current types are 
 
## “dbm”, “file”, “memcached”, “database”, and “memory”. 
 
## The storage uses the Container API 
 
##that is also used by the cache system.
 
beaker.session.type = file
 

	
 
beaker.session.key = hg-app
 
beaker.session.secret = g654dcno0-9873jhgfreyu
 
beaker.session.timeout = 36000
 

	
 
##auto save the session so there is no need to call .save()
 
beaker.session.auto = False
 

	
 
##true expire at browser close
 
#beaker.session.cookie_expires = 3600
 

	
 
    
 
################################################################################
 
## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
pylons_app/lib/indexers/__init__.py
Show inline comments
 
@@ -19,18 +19,23 @@ from shutil import rmtree
 
#LOCATION WE KEEP THE INDEX
 
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 

	
 
#EXTENSION TO SKIP READING CONTENT ON
 
EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf',
 
                       'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll']
 
#EXTENSIONS WE WANT TO INDEX CONTENT OF
 
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', 
 
                    'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h', 
 
                    'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', 
 
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', 
 
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', 
 
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', 
 
                    'yaws']
 

	
 
#CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer() | LowercaseFilter()
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(owner=TEXT(),
 
                repository=TEXT(stored=True),
 
                path=ID(stored=True, unique=True),
 
                content=TEXT(stored=True, analyzer=ANALYZER),
 
                modtime=STORED())
 
                modtime=STORED(),extension=TEXT(stored=True))
 

	
 
IDX_NAME = 'HG_INDEX'
 
IDX_NAME = 'HG_INDEX'
 
\ No newline at end of file
pylons_app/lib/indexers/daemon.py
Show inline comments
 
@@ -38,7 +38,7 @@ from pylons_app.config.environment impor
 
from pylons_app.model.hg_model import HgModel
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 
from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \
 
from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
 
SCHEMA, IDX_NAME
 

	
 
import logging
 
@@ -70,8 +70,10 @@ class WhooshIndexingDaemon(object):
 
    def add_doc(self, writer, path, repo):
 
        """Adding doc to writer"""
 
        
 
        #we don't want to read excluded file extensions, just index them
 
        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
 
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
 
        #we just index the content of chosen files
 
        if ext in INDEX_EXTENSIONS:
 
            log.debug('    >> %s [WITH CONTENT]' % path)
 
            fobj = open(path, 'rb')
 
            content = fobj.read()
 
            fobj.close()
 
@@ -81,15 +83,20 @@ class WhooshIndexingDaemon(object):
 
                #incase we have a decode error just represent as byte string
 
                u_content = unicode(str(content).encode('string_escape'))
 
        else:
 
            u_content = u''    
 
            log.debug('    >> %s' % path)
 
            #just index file name without its content
 
            u_content = u''
 
                
 
        writer.add_document(owner=unicode(repo.contact),
 
                            repository=u"%s" % repo.name,
 
                            path=u"%s" % path,
 
                            content=u_content,
 
                            modtime=os.path.getmtime(path)) 
 
                            modtime=os.path.getmtime(path),
 
                            extension=ext) 
 
    
 
    def build_index(self):
 
        if os.path.exists(IDX_LOCATION):
 
            log.debug('removing previos index')
 
            rmtree(IDX_LOCATION)
 
            
 
        if not os.path.exists(IDX_LOCATION):
 
@@ -102,7 +109,6 @@ class WhooshIndexingDaemon(object):
 
            log.debug('building index @ %s' % repo.path)
 
        
 
            for idx_path in self.get_paths(repo.path):
 
                log.debug('    >> %s' % idx_path)
 
                self.add_doc(writer, idx_path, repo)
 
        writer.commit(merge=True)
 
                
 
@@ -170,11 +176,12 @@ class WhooshIndexingDaemon(object):
 
            self.update_index()
 
        
 
if __name__ == "__main__":
 
    repo_location = '/home/marcink/python_workspace_dirty/*'
 
    
 
    repo_location = '/home/marcink/hg_repos/*'
 
    full_index = True # False means looking just for changes
 
    try:
 
        l = DaemonLock()
 
        WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True)
 
        WhooshIndexingDaemon(repo_location=repo_location)\
 
            .run(full_index=full_index)
 
        l.release()
 
    except LockHeld:
 
        sys.exit(1)
0 comments (0 inline, 0 general)