Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os.path
- import urllib
- import codecs
- import cStringIO as StringIO
- import re
- import string
- from whoosh import index, qparser
- from whoosh.fields import Schema, ID, TEXT
- from whoosh.index import open_dir
def write_index(src_paths, dst_dir):
    """Create a new Whoosh search index in *dst_dir* from text files.

    The schema has two fields: ``path`` (a stored, unique ID) and
    ``content`` (text with ``spelling=True``).  Each path in *src_paths*
    is passed to ``add_doc`` together with the index writer.

    Args:
        src_paths: iterable of paths to UTF-8 encoded text files.
        dst_dir: directory that receives the index files.
    """
    schema = Schema(path=ID(unique=True, stored=True),
                    content=TEXT(spelling=True))
    ix = index.create_in(dst_dir, schema=schema)
    # Used as a context manager the writer commits on a clean exit and
    # cancels if indexing raises, so a failure in add_doc does not leave
    # the index write lock held.
    with ix.writer() as writer:
        for src_path in src_paths:
            add_doc(writer, src_path)
def strip_punctuation(s):
    """Return *s* with every ASCII punctuation character removed.

    "Punctuation" means the characters in ``string.punctuation``; all
    other characters, including whitespace, are kept unchanged.
    """
    # ``+`` rather than ``*``: a ``*`` pattern also matches the empty
    # string at every position, which only produces no-op substitutions.
    pattern = "[%s]+" % re.escape(string.punctuation)
    return re.sub(pattern, "", s)
def add_doc(writer, path):
    """Add the words of a UTF-8 encoded text file to the index.

    The file is read in full, punctuation is removed with
    ``strip_punctuation``, and every whitespace-separated word is added
    through *writer* as its own document (``path`` = *path*,
    ``content`` = the word).

    Args:
        writer: index writer providing ``add_document``.
        path: path of the UTF-8 encoded file to index.
    """
    # ``with`` closes the file even when reading or decoding fails.
    with codecs.open(path, encoding="utf-8") as file_obj:
        content = strip_punctuation(file_obj.read())
    for word in content.split():
        writer.add_document(path=path, content=word)
def parse_string(qstring, index):
    """Parse the user query string and print a spelling suggestion.

    *qstring* is parsed against the ``content`` field of *index*.  The
    searcher is then asked for a corrected query; if it differs from the
    parsed one, a "Did you mean" line is printed.  Nothing is returned.

    Args:
        qstring: the raw query text typed by the user.
        index: an opened index providing ``schema`` and ``searcher()``.
    """
    parser = qparser.QueryParser("content", index.schema)
    query = parser.parse(qstring)
    with index.searcher() as searcher:  # Try correcting the query
        corrected = searcher.correct_query(query, qstring)
        if corrected.query != query:
            # A single pre-formatted argument prints the same text under
            # Python 2 and 3; the bare ``print a, b`` statement is a
            # syntax error on Python 3.
            print("%s --> Did you mean: %s" % (qstring, corrected.string))
def extract_comments(fileObj):
    """Extract the words of all comment lines from SPSS syntax.

    A comment line is one whose first character is ``*``.  Punctuation
    (including that leading ``*``) is stripped with ``strip_punctuation``
    and the remaining text is split on whitespace.

    Args:
        fileObj: open file-like object yielding lines; it is closed
            before this function returns.

    Returns:
        A flat list of words from all comment lines, in file order;
        an empty list when there are no comment lines.
    """
    try:
        # One flat comprehension instead of reduce(list.__add__, ...):
        # that raised TypeError for input without comment lines and
        # copied the growing list once per line.
        return [word
                for line in fileObj
                if line[:1] == u"*"
                for word in strip_punctuation(line).split()]
    finally:
        fileObj.close()
# Sample SPSS syntax: every line is a comment (starts with "*").  The
# first sentence is misspelled, presumably on purpose, so that the
# spelling correction has something to suggest.
syntax = u"""\
* Id iz jajaja to saj thad names shoult reveal intent. What we want to impress upon you is that
* we are serious about this. Choosing good names takes time but saves more than it takes.
* So take care with your names and change them when you find better ones. Everyone who
* reads your code (including you) will be happier if you do.
* The name of a variable, function, or class, should answer all the big questions. It
* should tell you why it exists, what it does, and how it is used. If a name requires a com-
* ment, then the name does not reveal its intent."""


if __name__ == "__main__":
    url = "http://www.gutenberg.org/ebooks/97.txt.utf-8"  # Flatland
    dst_dir = u"d:/temp"
    book = os.path.join(dst_dir, u"book.txt")

    # The download and the index both need the target directory.
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    downloaded = not os.path.exists(book)
    if downloaded:
        urllib.urlretrieve(url, book)
    # Build the index after a fresh download, and also when the book is
    # already present but no index exists yet (e.g. an earlier run was
    # interrupted before indexing finished).
    if downloaded or not index.exists_in(dst_dir):
        write_index([book], dst_dir)

    comments = extract_comments(StringIO.StringIO(syntax))
    # Bound to ``ix`` rather than ``index`` so the module-level
    # ``whoosh.index`` import used by write_index is not overwritten.
    ix = open_dir(dst_dir)  # open index from file
    for comment in comments:
        parse_string(comment, ix)
Advertisement
Add Comment
Please, Sign In to add comment