Advertisement
Guest User

spacy segfault

a guest
Nov 16th, 2019
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.65 KB | None | 0 0
  1.  
  def readDoc(path, r):
      """Read one text document and flatten it into a single string.

      Args:
          path: Directory that contains the document.
          r: File name of the document. ".txt" is appended when the name
              does not already contain ".txt".

      Returns:
          The document's lines joined by single spaces. Lines that consist
          only of a newline are kept as "\n"; every other line is stripped
          of surrounding whitespace first.
      """
      fn = os.path.join(path, r) if ".txt" in r else os.path.join(path, r + ".txt")

      report = []
      # Use a context manager so the handle is closed deterministically
      # instead of being left for the garbage collector.
      with open(fn, encoding="utf-8") as handle:
          for l in handle:
              # Keep bare newlines as paragraph markers; strip everything else.
              report.append(l if l == "\n" else l.strip())

      # NOTE(review): as written this replaces a space with a space (a no-op);
      # the original search pattern may have been lost in transcription - confirm.
      return " ".join(report).replace(" ", " ")
  12.  
  13.  
  14.  
  def getFilesDir(path):
      """Return the names of the regular files directly inside *path*.

      Sub-directories are skipped; the names are returned in the order that
      ``os.listdir`` yields them.
      """
      regular_files = []
      for entry in os.listdir(path):
          candidate = os.path.join(path, entry)
          if os.path.isfile(candidate):
              regular_files.append(entry)
      return regular_files
  17.  
  18.  
  19.  
  # NOTE(review): the three bool parameters are declared as plac "option"s with
  # type bool, so they take a value on the command line; any non-empty string
  # (including "False") is presumably converted to True - confirm against plac.
  @plac.annotations(
      start=plac.Annotation("Amount to start", "option", "s", int),
      interval=plac.Annotation("Interval for document increase", "option", "n", int),
      iterations=plac.Annotation("Number of iterations to run", "option", "i", int, metavar="i"),
      docs=plac.Annotation("Number of documents to test", "option", "d", int),
      reload_after_iteration=plac.Annotation("Reload model after iteration", "option", "r", bool),
      dont_repeat=plac.Annotation("Dont reuse same batch", "option", "b", bool),
      garbage_collect=plac.Annotation("Run gc after each iteration", "option", "g", bool),
  )
  def main(start=500, interval=250, iterations=10, docs=3000, reload_after_iteration=False, dont_repeat=False, garbage_collect=False):
      """Run ``nlp.pipe`` over growing batches of documents and time each run.

      Args:
          start: Number of documents in the first batch.
          interval: How many documents the batch boundary grows per iteration.
          iterations: Maximum number of iterations to run.
          docs: Number of documents to read from ``DOC_PATH``.
          reload_after_iteration: Reload the spaCy model after every iteration.
          dont_repeat: Process only the documents added since the previous
              iteration instead of re-processing from the first document.
          garbage_collect: Call ``gc.collect()`` after every iteration.
      """
      print("Reading Files")
      needed = start + interval * iterations
      if dont_repeat and needed > docs:
          # Non-repeating batches consume fresh documents every iteration, so
          # load enough for the whole run. The original assigned
          # ``start + interval + iterations`` here, which disagrees with the
          # condition above and loads far too few documents.
          docs = needed
      documents = [readDoc(DOC_PATH, f) for f in getFilesDir(DOC_PATH)[:docs]]
      print("Finished reading files")

      current_amount = start
      previous = 0
      nlp = spacy.load("en_core_web_sm")
      for i in range(iterations):
          print("Iteration {}, {} documents".format(i, current_amount))
          if current_amount > docs:
              print("{} is more than {}".format(current_amount, docs))
              break

          if dont_repeat:
              docs_use = documents[previous:current_amount]
          else:
              docs_use = documents[:current_amount]

          t0 = datetime.utcnow()
          # Materialise the generator so the whole batch is actually parsed
          # and the resulting Doc objects are held until the next iteration.
          parsed_docs = list(nlp.pipe(docs_use))
          t1 = datetime.utcnow()

          # total_seconds() also accounts for the timedelta's ``days`` part,
          # which the ``seconds`` attribute alone would drop.
          minutes, seconds = divmod(int((t1 - t0).total_seconds()), 60)
          print("Parsed {} in {}m {}s".format(current_amount - previous, minutes, seconds))

          if dont_repeat:
              previous = current_amount
          if reload_after_iteration:
              print("Reloading nlp")
              del nlp
              nlp = spacy.load("en_core_web_sm")
          if garbage_collect:
              gc.collect()
          current_amount += interval
  64.  
  65.  
  # Script entry point: let plac parse the command line and dispatch to main().
  if __name__ == "__main__":
      print("Starting")
      plac.call(main)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement