Advertisement
Guest User

spacy_segfault

a guest
Nov 16th, 2019
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.99 KB | None | 0 0
  def readDoc(path, r):
      """Read one report file and flatten it into a single string.

      Args:
          path: Directory that contains the report.
          r: Report file name. ".txt" is appended when the name does not
              already contain ".txt".

      Returns:
          The file's lines joined with single spaces. A line consisting of
          only a newline character is kept verbatim, so blank lines remain
          visible as newlines in the result; every other line is stripped
          of leading and trailing whitespace first.

      Raises:
          OSError: If the file cannot be opened.
          UnicodeDecodeError: If the file is not valid UTF-8.
      """
      fn = os.path.join(path, r) if ".txt" in r else os.path.join(path, r + ".txt")

      # The context manager closes the handle deterministically; the caller
      # reads thousands of files, so relying on garbage collection to close
      # them would leak descriptors.
      with open(fn, encoding="utf-8") as handle:
          report = [line if line == "\n" else line.strip() for line in handle]

      # NOTE(review): as pasted, both arguments to replace() look like a single
      # plain space, which makes the call a no-op. The original presumably
      # targeted a double or non-breaking space -- confirm against the
      # original script before changing it.
      return " ".join(report).replace(" ", " ")
  11.  
  12.  
  def getFilesDir(path):
      """Return the names of the regular files directly inside *path*.

      Entries for which os.path.isfile() is false (for example
      sub-directories) are left out. Names are returned without the
      directory prefix, in the order os.listdir() yields them.
      """
      file_names = []
      for entry in os.listdir(path):
          if not os.path.isfile(os.path.join(path, entry)):
              continue
          file_names.append(entry)
      return file_names
  15.  
  16.  
  @plac.annotations(
      start=plac.Annotation("Amount to start", "option", "s", int),
      interval=plac.Annotation("Interval for document increase", "option", "n", int),
      iterations=plac.Annotation("Number of iterations to run", "option", "i", int, metavar="i"),
      docs=plac.Annotation("Number of documents to test", "option", "d", int),
      # NOTE(review): the three switches below are "option" annotations with
      # type bool, so the supplied string is converted with bool(); any
      # non-empty value (including "False") should parse as True -- confirm.
      # A "flag" annotation would avoid that but changes the command line,
      # so the declarations are left untouched here.
      reload_after_iteration=plac.Annotation("Reload model after iteration", "option", "r", bool),
      dont_repeat=plac.Annotation("Dont reuse same batch", "option", "b", bool),
      garbage_collect=plac.Annotation("Run gc after each iteration", "option", "g", bool),
  )
  def main(start=500, interval=250, iterations=10, docs=3000, reload_after_iteration=False, dont_repeat=False, garbage_collect=False):
      """Parse progressively larger batches of documents with spaCy and time each batch.

      Documents are read from DOC_PATH, a few size statistics are printed, and
      then up to ``iterations`` batches are run through ``nlp.pipe``. The batch
      boundary starts at ``start`` and grows by ``interval`` per iteration.

      Args:
          start: Upper bound (document count) of the first batch.
          interval: How much the upper bound grows after every iteration.
          iterations: Maximum number of batches to run.
          docs: Maximum number of documents to load from DOC_PATH.
          reload_after_iteration: Reload the spaCy model after every batch.
          dont_repeat: Give each batch only documents that earlier batches did
              not use (slice ``previous:current``) instead of re-parsing from
              the first document (slice ``:current``).
          garbage_collect: Call ``gc.collect()`` after every batch.
      """
      print("Reading Files")
      if dont_repeat and start + interval * iterations > docs:
          # Load enough documents to cover every non-overlapping batch. This
          # must use the same expression as the guard above; the previous
          # "start + interval + iterations" loaded far too few documents.
          docs = start + interval * iterations
      documents = [readDoc(DOC_PATH, f) for f in tqdm(getFilesDir(DOC_PATH)[:docs])]

      # Character counts feed both the mean and the total, so compute them once.
      lengths = [len(r) for r in documents]
      print("{:.2f} average character count".format(np.mean(lengths)))
      print("{:.2f} Total characters".format(sum(lengths)))
      print("{:.2f} average line count".format(np.mean([len(r.split("\n")) for r in documents])))
      print("Size of reports = {}".format(size(sum([sys.getsizeof(r) for r in documents]), si)))
      print("Finished reading files")

      current_amount = start
      previous = 0
      nlp = spacy.load("en_core_web_sm")
      for i in range(iterations):
          print("Iteration {}, {} documents".format(i, current_amount))
          if current_amount > docs:
              print("{} is more than {}".format(current_amount, docs))
              break

          if dont_repeat:
              docs_use = documents[previous:current_amount]
          else:
              docs_use = documents[:current_amount]

          t0 = datetime.utcnow()
          # The parsed Doc objects are deliberately kept referenced until the
          # next iteration rebinds the name, exactly as in the original script.
          parsed_docs = list(nlp.pipe(docs_use))
          t1 = datetime.utcnow()
          time_passed = t1 - t0
          # total_seconds() includes whole days, which timedelta.seconds drops.
          minutes, seconds = divmod(int(time_passed.total_seconds()), 60)
          print("Parsed {} in {}m {}s".format(current_amount - previous, minutes, seconds))

          if dont_repeat:
              previous = current_amount
          if reload_after_iteration:
              print("Reloading nlp")
              # Drop the old pipeline reference before loading a fresh one.
              del nlp
              nlp = spacy.load("en_core_web_sm")
          if garbage_collect:
              gc.collect()
          current_amount += interval
  62.  
  63.  
  if __name__ == "__main__":
      # Script entry point: announce start-up, then let plac parse the
      # command line and dispatch to main().
      print("Starting")
      plac.call(main)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement