SHARE
TWEET

spacy segfault

a guest Nov 16th, 2019 74 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.  
  2. def readDoc(path, r):
  3.     report = []
  4.     fn = os.path.join(path, r) if ".txt" in r else os.path.join(path, r + ".txt")
  5.  
  6.     for l in open(fn, encoding="utf-8").readlines():
  7.         if l == "\n":
  8.             report.append(l)
  9.         else:
  10.             report.append(l.strip())
  11.     return " ".join(report).replace("  ", " ")
  12.  
  13.  
  14.  
  15. def getFilesDir(path):
  16.     return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
  17.  
  18.  
  19.  
  20. @plac.annotations(
  21.         start=plac.Annotation("Amount to start", "option", "s", int),
  22.         interval=plac.Annotation("Interval for document increase", "option", "n", int),
  23.         iterations=plac.Annotation("Number of iterations to run", "option", "i", int, metavar="i"), docs=plac.Annotation("Number of documents to test", "option", "d", int),
  24.         reload_after_iteration=plac.Annotation("Reload model after iteration", "option", "r", bool), dont_repeat=plac.Annotation("Dont reuse same batch", "option", "b", bool),
  25.         garbage_collect=plac.Annotation("Run gc after each iteration", "option", "g", bool),
  26.  
  27. )
  28. def main(start=500, interval=250, iterations=10, docs=3000, reload_after_iteration=False, dont_repeat=False, garbage_collect=False):
  29.     print("Reading Files")
  30.     if dont_repeat and start + interval * iterations > docs:
  31.         docs = start + interval + iterations
  32.     documents = [readDoc(DOC_PATH, f) for f in getFilesDir(DOC_PATH)[:docs]]
  33.     print("Finished reading files")
  34.     # dump_path = os.path.join(os.getcwd(),"seg_dumps")
  35.     # if not os.path.isdir(dump_path): os.mkdir(dump_path)
  36.     current_amount = start
  37.     previous = 0
  38.     nlp = spacy.load("en_core_web_sm")
  39.     for i in range(iterations):
  40.  
  41.         print("Iteration {}, {} documents".format(i,current_amount))
  42.         if current_amount > docs:
  43.             print("{} is more than {}".format(current_amount, docs))
  44.             break
  45.         if dont_repeat:
  46.             docs_use = documents[previous:current_amount]
  47.         else:
  48.             docs_use = documents[:current_amount]
  49.         t0 = datetime.utcnow()
  50.         parsed_docs = list(nlp.pipe(docs_use))
  51.         t1 = datetime.utcnow()
  52.         time_passed = t1 - t0
  53.         minutes, seconds = divmod(time_passed.seconds, 60)
  54.         print("Parsed {} in {}m {}s".format(current_amount - previous,minutes, seconds))
  55.         if dont_repeat:
  56.             previous = current_amount
  57.         if reload_after_iteration:
  58.             print("Reloading nlp")
  59.             del nlp
  60.             nlp = spacy.load("en_core_web_sm")
  61.         if garbage_collect:
  62.             gc.collect()
  63.         current_amount += interval
  64.  
  65.  
  66. if __name__ == "__main__":
  67.     print("Starting")
  68.     plac.call(main)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top