Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # X: input; y: output
- import itertools
- from collections import defaultdict
- from itertools import zip_longest
def genX(texts, fss, ypreds, maxlenTest):
    """Build the zero-padded per-token feature tensor for one document.

    Each token of each text becomes one row: the first element of
    ``elmo.predict(gE([token]))`` concatenated with the ``fs`` and ``ypred``
    vectors of the text the token belongs to.  Rows are written in order into
    a zero-initialised array.

    Args:
        texts: Iterable of token sequences, one per text.
        fss: Per-text vectors, aligned with ``texts``.
        ypreds: Per-text vectors, aligned with ``texts``.
        maxlenTest: Number of token rows in the returned array.

    Returns:
        Array of shape ``(1, maxlenTest, 1024 + 4 + 13)``.  Rows after the
        last token stay zero.  Tokens beyond ``maxlenTest`` are dropped,
        because the buffer has no room for them (previously this case raised
        a broadcasting ``ValueError``).
    """
    # Width presumably is ELMo (1024) + fs (4) + ypred (13) -- TODO confirm
    # against the scaler and classifier output sizes.
    X = np.zeros((1, maxlenTest, 1024 + 4 + 13))

    # Lazy generator: islice stops pulling rows once the buffer is full, so
    # no ELMo prediction is computed for a token that cannot be stored.
    token_rows = (
        np.concatenate([elmo.predict(gE([token]))[0], fs, ypred])
        for text, fs, ypred in zip(texts, fss, ypreds)
        for token in text
    )
    rows = list(itertools.islice(token_rows, maxlenTest))

    if rows:
        X[0, :len(rows), :] = rows
    return X
def genTags(words, heat):
    """Group tagged tokens into entity phrases and embed every phrase.

    The tag of a token is the argmax of its row in ``heat``:

    * ``0`` means the token belongs to no entity and is skipped;
    * an odd tag ``2k - 1`` starts a new phrase for the k-th label;
    * an even tag ``2k`` extends the phrase currently open for the k-th
      label (and opens one if none is open).

    Labels are numbered from 1 in the order
    ``SK, LG, NM, AD, PH, MA, CP, WP, PO, IN, FI, EP``.

    Args:
        words: Token strings.
        heat: Per-token score rows, aligned with ``words``.

    Returns:
        A pair ``(d, embDict)`` of ``defaultdict(list)`` objects keyed by
        label.  ``d[label]`` lists the distinct phrases found for that label
        in first-seen order; ``embDict[label]`` holds, in the same order,
        ``list(elmo.predict(gE([phrase])))`` for the phrase with its first
        character upper-cased and the rest lower-cased.

    Raises:
        IndexError: If a tag index refers to a label beyond the twelve above.
    """
    labels = ("SK", "LG", "NM", "AD", "PH", "MA",
              "CP", "WP", "PO", "IN", "FI", "EP")
    finished = {label: [] for label in labels}  # completed phrases
    pending = {label: [] for label in labels}   # tokens of the open phrase

    for word, scores in zip(words, heat):
        tag = int(np.argmax(scores))
        if tag == 0:
            continue
        label = labels[(tag - 1) // 2]
        # An odd tag begins a new phrase: close the one that is still open.
        if tag % 2 == 1 and pending[label]:
            finished[label].append(' '.join(pending[label]))
            pending[label].clear()
        pending[label].append(word)

    # Close whatever phrase is still open for each label.
    for label in labels:
        if pending[label]:
            finished[label].append(' '.join(pending[label]))

    # dict.fromkeys removes repeated (label, phrase) pairs, keeping order.
    unique = dict.fromkeys(
        (label, phrase) for label in labels for phrase in finished[label]
    )

    def nameFormat(word):
        # Slicing (not word[0]) keeps an empty phrase from raising IndexError.
        return word[:1].upper() + word[1:].lower()

    d = defaultdict(list)
    embDict = defaultdict(list)
    for label, phrase in unique:
        d[label].append(phrase)
        embDict[label].append(list(elmo.predict(gE([nameFormat(phrase)]))))
    return d, embDict
def getEntities(fileName, model2):
    """Run the tagging pipeline on one file and print the extracted entities.

    Reads ``ManuallyTagged/<fileName>``, splits it into sentences, classifies
    the sentences with the module-level ``model``, tags the tokens of the
    kept sentences with ``modelEnt`` and groups them via ``genTags``.  The
    grouped phrases are post-processed into an ``entities`` mapping, which is
    printed.

    Args:
        fileName: Name of the file inside the ``ManuallyTagged`` directory.
        model2: Not used by this function.
            NOTE(review): predictions come from the module-level ``model``;
            confirm whether ``model2`` was meant to be used instead.

    Returns:
        None.  The result is only printed to stdout.
    """
    path = "ManuallyTagged" + "/" + fileName
    # Context manager so the file handle is closed even if reading fails.
    with open(path, encoding="utf8") as handle:
        d = handle.read()

    cssRules = getCssRules(d)
    text = remove_html_tags(d, "span")
    text = remove_nl(text)

    # Each element of testSents is unpacked below as a (features, text) pair.
    # Original note: testSents = [x, y, fontSize, CvPage] -- presumably the
    # layout of the feature part; TODO confirm.  The second return value of
    # getSents is not needed here.
    testSents, _ = getSents(text, cssRules)
    test_texts = [t for _, t in testSents]
    test_embs = elmo.predict(gE(test_texts))
    test_fs = scaler.transform([f for f, _ in testSents])  # Positions are here TODO
    test_sents = np.hstack([test_embs, test_fs])

    # Every row of test_docs holds the whole sequence of sentence vectors,
    # zero-padded to 259 positions.  One column write per sentence replaces
    # the former nested loop; more than 259 sentences still raises IndexError.
    test_docs = np.zeros((len(test_sents), 259, 1028), dtype='float32')
    for pos, sentEmb in enumerate(test_sents):
        test_docs[:, pos, :] = sentEmb

    ypred = model.predict([test_docs, test_sents])
    clusts = np.argmax(ypred, axis=1)

    # Keep a sentence when any class listed in KEEP scores above 0.5.
    mask = np.any([ypred[:, vi] > 0.5 for vi in KEEP], axis=0)
    keptTokens = [tokenize(t) for (_, t), m in zip(testSents, mask) if m]
    procCurr = [keptTokens, test_fs[mask], ypred[mask]]

    X = genX(*procCurr, maxlenTest)
    heat = modelEnt.predict(X)
    heat = heat.reshape(len(heat[0]), len(data_out))
    # Flat token list in the same order genX walks the tokens.
    words = [tok for sentTokens in keptTokens for tok in sentTokens]
    relevantClusts = [2, 3, 5, 6, 7, 8, 9]
    skDict, embDict = genTags(words, heat)

    # Postprocessing the data
    NAME = "NM"
    ADDRESS = "AD"
    PHONE = "PH"
    MAIL = "MA"
    SKILLS = "SK"  # Embedding
    COMPANY = "CP"  # Embedding
    WORK_PERIOD = "WP"
    POSITION = "PO"  # Embedding
    LANGUAGES = "LG"  # Embedding
    UNI = "IN"
    FIELD = "FI"  # Embedding
    EDUCATION_PERIOD = "EP"

    def get_sentence_embedding(sentence_string):
        # First element of the ELMo prediction for a one-string batch.
        return elmo.predict(gE([sentence_string]))[0]

    def getEmbeddingsForList(datas):
        # Pair every string with its embedding.
        return [(data, get_sentence_embedding(data)) for data in datas]

    entities = {}
    entities[NAME] = " ".join(skDict[NAME])
    entities[ADDRESS] = " ".join(skDict[ADDRESS])
    entities[PHONE] = " ".join(skDict[PHONE])
    entities[MAIL] = "".join(skDict[MAIL])  # joined without a separator
    entities[SKILLS] = getEmbeddingsForList(skDict[SKILLS])
    entities[LANGUAGES] = getEmbeddingsForList(skDict[LANGUAGES])

    # Rows are aligned by position only; shorter lists are padded with
    # "UNKNOWN".
    companies = getEmbeddingsForList(skDict[COMPANY])
    workPeriods = skDict[WORK_PERIOD]
    positions = getEmbeddingsForList(skDict[POSITION])
    entities["WORK_EXPERIENCE"] = list(
        zip_longest(companies, positions, workPeriods, fillvalue="UNKNOWN"))

    unis, educationPeriods = skDict[UNI], skDict[EDUCATION_PERIOD]
    fields = getEmbeddingsForList(skDict[FIELD])
    entities["EDUCATION"] = list(
        zip_longest(unis, fields, educationPeriods, fillvalue="UNKNOWN"))

    # NOTE(review): this record is built but never returned or stored in the
    # visible code.  It is kept because getTaggedHtml may have side effects --
    # confirm whether it should be returned.
    curr = {
        "Name": fileName,
        "HTML": getTaggedHtml(path, clusts),
        "Texts": test_texts,
        "Embeds": test_embs,
        # Mean sentence vector over all relevant clusters, followed by one
        # mean vector per relevant cluster.
        "EmbedsPos":
            [np.mean(test_sents[np.isin(clusts, relevantClusts)], axis=0)] +
            [np.mean(test_sents[clusts == rc], axis=0) for rc in relevantClusts],
        "Clusts": clusts,
        "Properties": skDict,
        "PropEmbs": embDict
    }
    print(entities)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement