# Untitled

# X: input; y: output
import itertools
from collections import defaultdict
from itertools import zip_longest

def genX(texts, fss, ypreds, maxlenTest):
    """Build the zero-padded per-token input tensor for the entity model.

    For every token of every text, one feature row is formed by
    concatenating ``elmo.predict(gE([token]))[0]`` with that text's ``fs``
    and ``ypred`` vectors. Rows are written, in order, into the single
    sample of a ``(1, maxlenTest, 1024 + 4 + 13)`` array; unused trailing
    rows stay zero.

    At most ``maxlenTest`` rows are used. Tokens beyond that limit are
    dropped (and never embedded) instead of making the final assignment
    fail with a shape-mismatch ``ValueError``.

    Args:
        texts: iterable of token sequences, one per text.
        fss: per-text feature vectors, aligned with ``texts``.
        ypreds: per-text prediction vectors, aligned with ``texts``.
        maxlenTest: number of token rows in the output tensor.

    Returns:
        ``numpy.ndarray`` of shape ``(1, maxlenTest, 1041)``.
    """
    # NOTE(review): presumably 1024 = embedding size, 4 = len(fs),
    # 13 = len(ypred) -- confirm against the trained model.
    feature_width = 1024 + 4 + 13
    X = np.zeros((1, maxlenTest, feature_width))

    # Lazy generator: islice stops pulling rows once the tensor is full,
    # so no embedding is computed for tokens that would not fit.
    token_rows = (
        np.concatenate([elmo.predict(gE([t]))[0], fs, ypred])
        for text, fs, ypred in zip(texts, fss, ypreds)
        for t in text
    )
    rows = list(itertools.islice(token_rows, maxlenTest))

    if rows:
        X[0, :len(rows), :] = rows
    return X

def genTags(words, heat):
    """Group tagged words into entity phrases and embed each unique phrase.

    ``heat`` holds one score vector per word; the index of its maximum is
    the word's tag. Tag 0 means "no entity". For the k-th label
    (1-based, in the order of ``labels`` below) the odd tag ``2k - 1``
    starts a new phrase and the even tag ``2k`` extends the phrase that is
    currently open for that label (or opens one if none is open).

    Args:
        words: sequence of hashable tokens (strings), aligned with ``heat``.
        heat: sequence of per-word score vectors.

    Returns:
        A ``(d, embDict)`` tuple of ``defaultdict(list)``:
        ``d[label]`` lists the distinct phrases found for ``label`` in
        order of first appearance, and ``embDict[label]`` holds, at the
        same positions, ``list(elmo.predict(gE([formatted_phrase])))``.

    Raises:
        IndexError: if a tag index is larger than ``2 * len(labels)``.
    """
    labels = ("SK", "LG", "NM", "AD", "PH", "MA",
              "CP", "WP", "PO", "IN", "FI", "EP")
    finished = {label: [] for label in labels}  # completed phrases
    pending = {label: [] for label in labels}   # words of the open phrase

    for w, h in zip(words, heat):
        v = int(np.argmax(h))
        if v == 0:
            continue
        label = labels[(v - 1) // 2]
        # An odd tag starts a new phrase: close the open one first.
        if v % 2 == 1 and pending[label]:
            finished[label].append(' '.join(pending[label]))
            pending[label].clear()
        pending[label].append(w)

    # Flush phrases that were still open when the input ended.
    for label in labels:
        if pending[label]:
            finished[label].append(' '.join(pending[label]))

    d = defaultdict(list)
    embDict = defaultdict(list)

    def nameFormat(word):
        # Slicing (not word[0]) keeps an empty phrase from raising.
        return word[:1].upper() + word[1:].lower()

    for label in labels:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        for phrase in dict.fromkeys(finished[label]):
            d[label].append(phrase)
            embDict[label].append(
                list(elmo.predict(gE([nameFormat(phrase)]))))
    return d, embDict

def getEntities(fileName, model2):
    """Run the tagging pipeline on one file and print the found entities.

    Reads ``ManuallyTagged/<fileName>``, splits it into sentences, embeds
    and classifies them with the module-level ``elmo``, ``scaler`` and
    ``model``, tags the tokens of the kept sentences with ``modelEnt`` and
    finally assembles and prints an ``entities`` dictionary.

    Args:
        fileName: name of the file inside the ``ManuallyTagged`` directory.
        model2: not used by the code below, which predicts with the
            module-level ``model`` -- NOTE(review): confirm which one is
            intended.
    """
    path = "ManuallyTagged" + "/" + fileName
    # Context manager so the file handle is closed deterministically.
    with open(path, encoding="utf8") as fh:
        d = fh.read()
    cssRules = getCssRules(d)
    text = remove_html_tags(d, "span")
    text = remove_nl(text)
    # Each testSents item is unpacked below as a (features, text) pair.
    # NOTE(review): features presumably hold x, y, fontSize, CvPage -- confirm.
    testSents, ids = getSents(text, cssRules)

    test_embs = elmo.predict(gE([t for _, t in testSents]))
    test_fs = scaler.transform([f for f, _ in testSents])  # Positions are here TODO
    test_sents = np.hstack([test_embs, test_fs])

    # Every document row receives the full sequence of sentence vectors:
    # position t of each row holds sentence t. Assigning through the first
    # axis broadcasts one sentence to all rows at once.
    test_docs = np.zeros((len(test_sents), 259, 1028), dtype='float32')
    for t, sentEmb in enumerate(test_sents):
        test_docs[:, t, :] = sentEmb

    test_texts = [t for _, t in testSents]

    ypred = model.predict([test_docs, test_sents])
    clusts = np.argmax(ypred, axis=1)

    # Keep sentences whose score exceeds 0.5 in any of the KEEP columns.
    mask = np.any([ypred[:, vi] > 0.5 for vi in KEEP], axis=0)

    procCurr = [[tokenize(t) for (f, t), m in zip(testSents, mask) if m],
                test_fs[mask], ypred[mask]]

    X = genX(*procCurr, maxlenTest)

    heat = modelEnt.predict(X)

    # Drop the leading batch axis: one score row per token position.
    heat = heat.reshape(len(heat[0]), len(data_out))
    words = [t for text, fs, ypred in zip(*procCurr) for t in text]

    relevantClusts = [2, 3, 5, 6, 7, 8, 9]
    skDict, embDict = genTags(words, heat)

    # Postprocessing the data
    NAME = "NM"
    ADDRESS = "AD"
    PHONE = "PH"
    MAIL = "MA"
    SKILLS = "SK"  # Embedding
    COMPANY = "CP"  # Embedding
    WORK_PERIOD = "WP"
    POSITION = "PO"  # Embedding
    LANGUAGES = "LG"  # Embedding

    UNI = "IN"
    FIELD = "FI"  # Embedding
    EDUCATION_PERIOD = "EP"

    def get_sentence_embedding(sentence_string):
        """Return the first ``elmo`` output for a single string."""
        return elmo.predict(gE([sentence_string]))[0]

    def getEmbeddingsForList(datas):
        """Pair every item of ``datas`` with its embedding."""
        return [(data, get_sentence_embedding(data)) for data in datas]

    entities = {}
    entities[NAME] = " ".join(skDict[NAME])
    entities[ADDRESS] = " ".join(skDict[ADDRESS])
    entities[PHONE] = " ".join(skDict[PHONE])
    entities[MAIL] = "".join(skDict[MAIL])
    entities[SKILLS] = getEmbeddingsForList(skDict[SKILLS])
    entities[LANGUAGES] = getEmbeddingsForList(skDict[LANGUAGES])

    # Work experience and education are zipped positionally; shorter
    # lists are padded with "UNKNOWN".
    companies = getEmbeddingsForList(skDict[COMPANY])
    workPeriods = skDict[WORK_PERIOD]
    positions = getEmbeddingsForList(skDict[POSITION])
    entities["WORK_EXPERIENCE"] = list(zip_longest(companies, positions, workPeriods, fillvalue="UNKNOWN"))

    unis, educationPeriods = skDict[UNI], skDict[EDUCATION_PERIOD]
    fields = getEmbeddingsForList(skDict[FIELD])
    entities["EDUCATION"] = list(zip_longest(unis, fields, educationPeriods, fillvalue="UNKNOWN"))

    # NOTE(review): `curr` is assembled but not returned or used in the
    # code visible here -- confirm whether a `return curr` is missing.
    curr = {
        "Name": fileName,
        "HTML": getTaggedHtml(path, clusts),
        "Texts": test_texts,
        "Embeds": test_embs,
        # First entry: mean over all relevant clusters; then one mean per
        # relevant cluster.
        "EmbedsPos":
        [np.mean(test_sents[np.isin(clusts, relevantClusts)], axis=0)] +
        [np.mean(test_sents[clusts == rc], axis=0) for rc in relevantClusts],
        "Clusts": clusts,
        "Properties": skDict,
        "PropEmbs": embDict
    }

    print(entities)