Guest User

Spelling suggestions using whoosh

a guest
Jan 18th, 2014
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.64 KB | None | 0 0
  1. import os.path
  2. import urllib
  3. import codecs
  4. import cStringIO as StringIO
  5. import re
  6. import string
  7. from whoosh import index, qparser
  8. from whoosh.fields import Schema, ID, TEXT
  9. from whoosh.index import open_dir
  10.  
  11. def write_index(src_paths, dst_dir):
  12.     """Write search index"""
  13.     schema = Schema(path=ID(unique=True, stored=True),
  14.                     content=TEXT(spelling=True))
  15.     ix = index.create_in(dst_dir, schema=schema)
  16.     writer = ix.writer()
  17.     for src_path in src_paths:
  18.         add_doc(writer, src_path)
  19.     writer.commit()
  20.  
  21. def strip_punctuation(s):
  22.     """strip all the punctuation from a string"""
  23.     return re.sub("[%s]*" % re.escape(string.punctuation), "", s)
  24.  
  25. def add_doc(writer, path):
  26.     """Add utf-8 encoded document to index"""
  27.     fileObj = codecs.open(path, encoding="utf-8")
  28.     content = strip_punctuation(fileObj.read())
  29.     fileObj.close()
  30.     for word in content.split():
  31.         writer.add_document(path=path, content=word)
  32.  
  33. def parse_string(qstring, index):
  34.     """Parse the user query string"""
  35.     parser = qparser.QueryParser("content", index.schema)
  36.     q = parser.parse(qstring)
  37.     with index.searcher() as s:   # Try correcting the query
  38.         corrected = s.correct_query(q, qstring)
  39.         if corrected.query != q:
  40.             print qstring, "--> Did you mean:", corrected.string
  41.  
  42. def extract_comments(fileObj):
  43.     """Extract comments from spss syntax and strip out punctuation"""
  44.     subst = strip_punctuation
  45.     comments = [subst(line).split() for line in fileObj if line[0] == u"*"]
  46.     fileObj.close()
  47.     return reduce(list.__add__, comments)
  48.  
  49. syntax = u"""\
  50. * Id iz jajaja to saj thad names shoult reveal intent. What we want to impress upon you is that
  51. * we are serious about this. Choosing good names takes time but saves more than it takes.
  52. * So take care with your names and change them when you find better ones. Everyone who
  53. * reads your code (including you) will be happier if you do.
  54. * The name of a variable, function, or class, should answer all the big questions. It
  55. * should tell you why it exists, what it does, and how it is used. If a name requires a com-
  56. * ment, then the name does not reveal its intent."""
  57.  
  58.  
  59. if __name__ == "__main__":
  60.     url = "http://www.gutenberg.org/ebooks/97.txt.utf-8"  # Flatland
  61.     dst_dir = u"d:/temp"
  62.     book = os.path.join(dst_dir, u"book.txt")
  63.     if not os.path.exists(book):
  64.         urllib.urlretrieve (url, book)
  65.         write_index([book], dst_dir)
  66.     comments = extract_comments(StringIO.StringIO(syntax))
  67.     index = open_dir(dst_dir)  # open index from file
  68.     for comment in comments:
  69.         parse_string(comment, index)
Advertisement
Add Comment
Please, Sign In to add comment