Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def readDoc(path, r):
    """Read a text document and flatten it into a single string.

    Args:
        path: Directory that contains the document.
        r: Document name. ``".txt"`` is appended when the name does not
            already contain ``".txt"``.

    Returns:
        The document's lines joined with single spaces. Lines consisting
        only of a newline are kept as ``"\\n"``; every other line is
        stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
    """
    fn = os.path.join(path, r) if ".txt" in r else os.path.join(path, r + ".txt")
    report = []
    # Use a context manager so the handle is closed even if decoding fails;
    # the previous bare open() leaked one descriptor per document.
    with open(fn, encoding="utf-8") as handle:
        for line in handle:
            # Keep blank lines as an explicit newline marker, strip the rest.
            report.append(line if line == "\n" else line.strip())
    # NOTE(review): as written this replace() swaps a space for a space and is
    # a no-op; presumably it was meant to collapse double spaces (or replace a
    # non-breaking space) -- confirm against the original file.
    return " ".join(report).replace(" ", " ")
def getFilesDir(path):
    """Return the names of the regular files located directly in *path*.

    Sub-directories are skipped; names are returned in the order produced
    by ``os.listdir``.
    """
    file_names = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isfile(candidate):
            file_names.append(entry)
    return file_names
# NOTE(review): the bool parameters are declared with kind "option", so plac
# presumably converts the supplied string with bool(), which makes any
# non-empty value (including "False") truthy -- confirm whether "flag" was
# intended before relying on `-r False` style invocations.
@plac.annotations(
    start=plac.Annotation("Amount to start", "option", "s", int),
    interval=plac.Annotation("Interval for document increase", "option", "n", int),
    iterations=plac.Annotation("Number of iterations to run", "option", "i", int, metavar="i"),
    docs=plac.Annotation("Number of documents to test", "option", "d", int),
    reload_after_iteration=plac.Annotation("Reload model after iteration", "option", "r", bool),
    dont_repeat=plac.Annotation("Dont reuse same batch", "option", "b", bool),
    garbage_collect=plac.Annotation("Run gc after each iteration", "option", "g", bool),
)
def main(start=500, interval=250, iterations=10, docs=3000, reload_after_iteration=False, dont_repeat=False, garbage_collect=False):
    """Time spaCy parsing over progressively larger document batches.

    Args:
        start: Batch size used for the first iteration.
        interval: Amount by which the batch size grows after each iteration.
        iterations: Maximum number of iterations to run.
        docs: Maximum number of documents to read from ``DOC_PATH``.
        reload_after_iteration: Reload the spaCy model after every iteration.
        dont_repeat: Parse only documents not used by earlier iterations
            instead of re-parsing from the first document each time.
        garbage_collect: Call ``gc.collect()`` after every iteration.
    """
    print("Reading Files")
    # Without batch reuse every iteration consumes fresh documents, so make
    # sure enough are read for the whole run. The bound must match the
    # condition (start + interval * iterations); the previous
    # `start + interval + iterations` shrank `docs` and ended the run early.
    required = start + interval * iterations
    if dont_repeat and required > docs:
        docs = required
    documents = [readDoc(DOC_PATH, f) for f in getFilesDir(DOC_PATH)[:docs]]
    print("Finished reading files")
    # dump_path = os.path.join(os.getcwd(),"seg_dumps")
    # if not os.path.isdir(dump_path): os.mkdir(dump_path)
    current_amount = start
    previous = 0
    nlp = spacy.load("en_core_web_sm")
    for i in range(iterations):
        print("Iteration {}, {} documents".format(i,current_amount))
        if current_amount > docs:
            print("{} is more than {}".format(current_amount, docs))
            break
        if dont_repeat:
            docs_use = documents[previous:current_amount]
        else:
            docs_use = documents[:current_amount]
        t0 = datetime.utcnow()
        # Materialise the pipeline output so the whole batch is actually
        # parsed inside the timed section; the list stays referenced until
        # the next iteration rebinds it.
        parsed_docs = list(nlp.pipe(docs_use))
        t1 = datetime.utcnow()
        time_passed = t1 - t0
        # total_seconds() includes whole days, which `.seconds` drops.
        minutes, seconds = divmod(int(time_passed.total_seconds()), 60)
        print("Parsed {} in {}m {}s".format(current_amount - previous,minutes, seconds))
        if dont_repeat:
            previous = current_amount
        if reload_after_iteration:
            print("Reloading nlp")
            del nlp
            nlp = spacy.load("en_core_web_sm")
        if garbage_collect:
            gc.collect()
        current_amount += interval
# Script entry point: announce start-up, then let plac parse the command
# line and dispatch to main().
if __name__ == "__main__":
    print("Starting")
    plac.call(main)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement