Changeset - 28f19fa562df
[Not reviewed]
default
0 4 0
Marcin Kuzminski - 15 years ago 2010-08-28 14:53:32
marcin@python-works.com
updated config files,
Implemented content index extensions with whoosh,
fixed analyzer to match more words
4 files changed with 67 insertions and 15 deletions:
0 comments (0 inline, 0 general)
development.ini
Show inline comments
 
@@ -43,24 +43,44 @@ cache_dir = %(here)s/data
 
###         BEAKER CACHE        ####
 
####################################
 
beaker.cache.data_dir=/%(here)s/data/cache/data
 
beaker.cache.lock_dir=/%(here)s/data/cache/lock
 
beaker.cache.regions=super_short_term,short_term,long_term
 
beaker.cache.long_term.type=memory
 
beaker.cache.long_term.expire=36000
 
beaker.cache.short_term.type=memory
 
beaker.cache.short_term.expire=60
 
beaker.cache.super_short_term.type=memory
 
beaker.cache.super_short_term.expire=10
 

	
 
####################################
 
###       BEAKER SESSION        ####
 
####################################
 
## Type of storage used for the session, current types are 
 
## “dbm”, “file”, “memcached”, “database”, and “memory”. 
 
## The storage uses the Container API 
 
##that is also used by the cache system.
 
beaker.session.type = file
 

	
 
beaker.session.key = hg-app
 
beaker.session.secret = g654dcno0-9873jhgfreyu
 
beaker.session.timeout = 36000
 

	
 
##auto save the session to not to use .save()
 
beaker.session.auto = False
 

	
 
##true exire at browser close
 
#beaker.session.cookie_expires = 3600
 

	
 
    
 
################################################################################
 
## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
 
## Debug mode will enable the interactive debugging tool, allowing ANYONE to  ##
 
## execute malicious code after an exception is raised.                       ##
 
################################################################################
 
#set debug = false
 

	
 
##################################
 
###       LOGVIEW CONFIG       ###
 
##################################
 
logview.sqlalchemy = #faa
 
logview.pylons.templating = #bfb
production.ini
Show inline comments
 
@@ -42,24 +42,44 @@ cache_dir = %(here)s/data
 
####################################
 
###         BEAKER CACHE        ####
 
####################################
 
beaker.cache.data_dir=/%(here)s/data/cache/data
 
beaker.cache.lock_dir=/%(here)s/data/cache/lock
 
beaker.cache.regions=super_short_term,short_term,long_term
 
beaker.cache.long_term.type=memory
 
beaker.cache.long_term.expire=36000
 
beaker.cache.short_term.type=memory
 
beaker.cache.short_term.expire=60
 
beaker.cache.super_short_term.type=memory
 
beaker.cache.super_short_term.expire=10
 

	
 
####################################
 
###       BEAKER SESSION        ####
 
####################################
 
## Type of storage used for the session, current types are 
 
## “dbm”, “file”, “memcached”, “database”, and “memory”. 
 
## The storage uses the Container API 
 
##that is also used by the cache system.
 
beaker.session.type = file
 

	
 
beaker.session.key = hg-app
 
beaker.session.secret = g654dcno0-9873jhgfreyu
 
beaker.session.timeout = 36000
 

	
 
##auto save the session to not to use .save()
 
beaker.session.auto = False
 

	
 
##true exire at browser close
 
#beaker.session.cookie_expires = 3600
 

	
 
    
 
################################################################################
 
## WARNING: *THE LINE BELOW MUST BE UNCOMMENTED ON A PRODUCTION ENVIRONMENT*  ##
 
## Debug mode will enable the interactive debugging tool, allowing ANYONE to  ##
 
## execute malicious code after an exception is raised.                       ##
 
################################################################################
 
set debug = false
 

	
 
##################################
 
###       LOGVIEW CONFIG       ###
 
##################################
 
logview.sqlalchemy = #faa
pylons_app/lib/indexers/__init__.py
Show inline comments
 
@@ -10,27 +10,32 @@ from os.path import join as jn
 
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 

	
 
from pylons_app.config.environment import load_environment
 
from pylons_app.model.hg_model import HgModel
 
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 
from whoosh.fields import TEXT, ID, STORED, Schema
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 

	
 
#LOCATION WE KEEP THE INDEX
 
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 

	
 
#EXTENSION TO SKIP READING CONTENT ON
 
EXCLUDE_EXTENSIONS = ['pyc', 'mo', 'png', 'jpg', 'jpeg', 'gif', 'swf',
 
                       'dll', 'ttf', 'psd', 'svg', 'pdf', 'bmp', 'dll']
 
#EXTENSIONS WE WANT TO INDEX CONTENT OFF
 
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', 
 
                    'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h', 
 
                    'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', 
 
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', 
 
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', 
 
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', 
 
                    'yaws']
 

	
 
#CUSTOM ANALYZER wordsplit + lowercase filter
 
ANALYZER = RegexTokenizer() | LowercaseFilter()
 
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 

	
 
#INDEX SCHEMA DEFINITION
 
SCHEMA = Schema(owner=TEXT(),
 
                repository=TEXT(stored=True),
 
                path=ID(stored=True, unique=True),
 
                content=TEXT(stored=True, analyzer=ANALYZER),
 
                modtime=STORED())
 
                modtime=STORED(),extension=TEXT(stored=True))
 

	
 
IDX_NAME = 'HG_INDEX'
 
IDX_NAME = 'HG_INDEX'
 
\ No newline at end of file
pylons_app/lib/indexers/daemon.py
Show inline comments
 
@@ -29,25 +29,25 @@ from os.path import dirname as dn
 
from os.path import join as jn
 

	
 
#to get the pylons_app import
 
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 
sys.path.append(project_path)
 

	
 
from pidlock import LockHeld, DaemonLock
 
import traceback
 
from pylons_app.config.environment import load_environment
 
from pylons_app.model.hg_model import HgModel
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 
from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \
 
from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
 
SCHEMA, IDX_NAME
 

	
 
import logging
 
import logging.config
 
logging.config.fileConfig(jn(project_path, 'development.ini'))
 
log = logging.getLogger('whooshIndexer')
 

	
 
def scan_paths(root_location):
 
    return HgModel.repo_scan('/', root_location, None, True)
 

	
 
class WhooshIndexingDaemon(object):
 
    """Deamon for atomic jobs"""
 
@@ -61,57 +61,63 @@ class WhooshIndexingDaemon(object):
 
        excluding files in .hg dir"""
 
        index_paths_ = set()
 
        for path, dirs, files in os.walk(root_dir):
 
            if path.find('.hg') == -1:
 
                for f in files:
 
                    index_paths_.add(jn(path, f))
 
    
 
        return index_paths_
 
    
 
    def add_doc(self, writer, path, repo):
 
        """Adding doc to writer"""
 
        
 
        #we don't won't to read excluded file extensions just index them
 
        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
 
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
 
        #we just index the content of choosen files
 
        if ext in INDEX_EXTENSIONS:
 
            log.debug('    >> %s [WITH CONTENT]' % path)
 
            fobj = open(path, 'rb')
 
            content = fobj.read()
 
            fobj.close()
 
            try:
 
                u_content = unicode(content)
 
            except UnicodeDecodeError:
 
                #incase we have a decode error just represent as byte string
 
                u_content = unicode(str(content).encode('string_escape'))
 
        else:
 
            u_content = u''    
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 
                
 
        writer.add_document(owner=unicode(repo.contact),
 
                            repository=u"%s" % repo.name,
 
                            path=u"%s" % path,
 
                            content=u_content,
 
                            modtime=os.path.getmtime(path)) 
 
                            modtime=os.path.getmtime(path),
 
                            extension=ext) 
 
    
 
    def build_index(self):
 
        if os.path.exists(IDX_LOCATION):
 
            log.debug('removing previos index')
 
            rmtree(IDX_LOCATION)
 
            
 
        if not os.path.exists(IDX_LOCATION):
 
            os.mkdir(IDX_LOCATION)
 
        
 
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
 
        writer = idx.writer()
 
        
 
        for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
 
            log.debug('building index @ %s' % repo.path)
 
        
 
            for idx_path in self.get_paths(repo.path):
 
                log.debug('    >> %s' % idx_path)
 
                self.add_doc(writer, idx_path, repo)
 
        writer.commit(merge=True)
 
                
 
        log.debug('>>> FINISHED BUILDING INDEX <<<')
 
            
 
    
 
    def update_index(self):
 
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')
 
            
 
        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
 
        # The set of all paths in the index
 
        indexed_paths = set()
 
@@ -161,21 +167,22 @@ class WhooshIndexingDaemon(object):
 
        writer.commit(merge=True)
 
        #idx.optimize()
 
        log.debug('>>> FINISHED <<<')
 
        
 
    def run(self, full_index=False):
 
        """Run daemon"""
 
        if full_index:
 
            self.build_index()
 
        else:
 
            self.update_index()
 
        
 
if __name__ == "__main__":
 
    repo_location = '/home/marcink/python_workspace_dirty/*'
 
    
 
    repo_location = '/home/marcink/hg_repos/*'
 
    full_index = True # False means looking just for changes
 
    try:
 
        l = DaemonLock()
 
        WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True)
 
        WhooshIndexingDaemon(repo_location=repo_location)\
 
            .run(full_index=full_index)
 
        l.release()
 
    except LockHeld:
 
        sys.exit(1)
 

	
0 comments (0 inline, 0 general)