Changeset - e5157e2a530e
[Not reviewed]
default
0 2 0
Marcin Kuzminski - 15 years ago 2010-09-01 23:38:03
marcin@python-works.com
added safe unicode function, and implemented it in whoosh indexer
2 files changed with 18 insertions and 5 deletions:
0 comments (0 inline, 0 general)
pylons_app/lib/helpers.py
Show inline comments
 
@@ -333,6 +333,22 @@ def gravatar_url(email_address, size=30)
 
    
 
    # construct the url
 
    gravatar_url = baseurl + hashlib.md5(email_address.lower()).hexdigest() + "?"
 
    gravatar_url += urllib.urlencode({'d':default, 's':str(size)})
 

	
 
    return gravatar_url
 

	
 
def safe_unicode(str):
    """Convert ``str`` to a unicode object without raising decode errors.

    The conversion is attempted in three steps, each one more lenient:

    1. ``unicode(str)`` using the interpreter's default encoding.
    2. Decode as UTF-8 with undecodable bytes replaced (``'replace'``).
    3. Render the value as a byte string and escape it with the
       ``string_escape`` codec, which always yields plain ASCII.

    :param str: value to convert (normally a byte string). The name shadows
        the builtin ``str`` type; it is kept so the signature is unchanged.
    :returns: unicode representation of the given value
    """
    # Alias the argument right away: because the parameter shadows the builtin
    # ``str``, calling ``str(...)`` inside this function would call the
    # argument itself and fail with ``TypeError: 'str' object is not callable``.
    value = str

    try:
        return unicode(value)
    except UnicodeDecodeError:
        pass

    try:
        return unicode(value, 'utf-8', 'replace')
    except (UnicodeDecodeError, TypeError):
        # TypeError is raised when ``value`` is not a byte string/buffer
        # (e.g. an object whose __str__ returns non-ascii bytes), so it
        # cannot be decoded directly. Build its byte-string form via
        # %-formatting, which does not need the shadowed builtin, and
        # represent it as an escaped byte string.
        byte_str = '%s' % (value,)
        return unicode(byte_str.encode('string_escape'))
 
\ No newline at end of file
pylons_app/lib/indexers/daemon.py
Show inline comments
 
@@ -33,12 +33,13 @@ project_path = dn(dn(dn(dn(os.path.realp
 
sys.path.append(project_path)
 

	
 
from pidlock import LockHeld, DaemonLock
 
import traceback
 
from pylons_app.config.environment import load_environment
 
from pylons_app.model.hg_model import HgModel
 
from pylons_app.lib.helpers import safe_unicode
 
from whoosh.index import create_in, open_dir
 
from shutil import rmtree
 
from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
 
SCHEMA, IDX_NAME
 

	
 
import logging
 
@@ -74,17 +75,13 @@ class WhooshIndexingDaemon(object):
 
        #we just index the content of choosen files
 
        if ext in INDEX_EXTENSIONS:
 
            log.debug('    >> %s [WITH CONTENT]' % path)
 
            fobj = open(path, 'rb')
 
            content = fobj.read()
 
            fobj.close()
 
            try:
 
                u_content = unicode(content)
 
            except UnicodeDecodeError:
 
                #incase we have a decode error just represent as byte string
 
                u_content = unicode(str(content).encode('string_escape'))
 
            u_content = safe_unicode(content)
 
        else:
 
            log.debug('    >> %s' % path)
 
            #just index file name without it's content
 
            u_content = u''
 
        
 
        
0 comments (0 inline, 0 general)