SHARE
TWEET

spacy_segfault

a guest Nov 16th, 2019 95 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. def readDoc(path, r):
  2.     report = []
  3.     fn = os.path.join(path, r) if ".txt" in r else os.path.join(path, r + ".txt")
  4.  
  5.     for l in open(fn, encoding="utf-8").readlines():
  6.         if l == "\n":
  7.             report.append(l)
  8.         else:
  9.             report.append(l.strip())
  10.     return " ".join(report).replace("  ", " ")
  11.  
  12.  
  13. def getFilesDir(path):
  14.     return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
  15.  
  16.  
  17. @plac.annotations(start=plac.Annotation("Amount to start", "option", "s", int), interval=plac.Annotation("Interval for document increase", "option", "n", int),
  18.         iterations=plac.Annotation("Number of iterations to run", "option", "i", int, metavar="i"), docs=plac.Annotation("Number of documents to test", "option", "d", int),
  19.         reload_after_iteration=plac.Annotation("Reload model after iteration", "option", "r", bool), dont_repeat=plac.Annotation("Dont reuse same batch", "option", "b", bool),
  20.         garbage_collect=plac.Annotation("Run gc after each iteration", "option", "g", bool),
  21.  
  22. )
  23. def main(start=500, interval=250, iterations=10, docs=3000, reload_after_iteration=False, dont_repeat=False, garbage_collect=False):
  24.     print("Reading Files")
  25.     if dont_repeat and start + interval * iterations > docs:
  26.         docs = start + interval + iterations
  27.     documents = [readDoc(DOC_PATH, f) for f in tqdm(getFilesDir(DOC_PATH)[:docs])]
  28.     print("{:.2f} average character count".format(np.mean([len(r) for r in documents])))
  29.     print("{:.2f} Total characters".format(sum([len(r) for r in documents])))
  30.     print("{:.2f} average line count".format(np.mean([len(r.split("\n")) for r in documents])))
  31.     print("Size of reports = {}".format(size(sum([sys.getsizeof(r) for r in documents]), si)))
  32.     print("Finished reading files")
  33.     # dump_path = os.path.join(os.getcwd(),"seg_dumps")
  34.     # if not os.path.isdir(dump_path): os.mkdir(dump_path)
  35.     current_amount = start
  36.     previous = 0
  37.     nlp = spacy.load("en_core_web_sm")
  38.     for i in range(iterations):
  39.         print("Iteration {}, {} documents".format(i, current_amount))
  40.         if current_amount > docs:
  41.             print("{} is more than {}".format(current_amount, docs))
  42.             break
  43.         if dont_repeat:
  44.             docs_use = documents[previous:current_amount]
  45.         else:
  46.             docs_use = documents[:current_amount]
  47.         t0 = datetime.utcnow()
  48.         parsed_docs = list(nlp.pipe(docs_use))
  49.         t1 = datetime.utcnow()
  50.         time_passed = t1 - t0
  51.         minutes, seconds = divmod(time_passed.seconds, 60)
  52.         print("Parsed {} in {}m {}s".format(current_amount - previous, minutes, seconds))
  53.         if dont_repeat:
  54.             previous = current_amount
  55.         if reload_after_iteration:
  56.             print("Reloading nlp")
  57.             del nlp
  58.             nlp = spacy.load("en_core_web_sm")
  59.         if garbage_collect:
  60.             gc.collect()
  61.         current_amount += interval
  62.  
  63.  
  64. if __name__ == "__main__":
  65.     print("Starting")
  66.     plac.call(main)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top