Untitled

from collections import Counter
import re
import glob

class ConllEntry:
    def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
        self.id = id
        self.form = form
        self.norm = normalize(form)
        self.cpos = cpos.upper()
        self.pos = pos.upper()
        self.parent_id = parent_id
        self.relation = relation
        # Added so that write_conll accepts these.
        self.pred_parent_id = parent_id
        self.pred_relation = relation

class ParseForest:
    def __init__(self, sentence):
        self.roots = list(sentence)

        for root in self.roots:
            root.children = []
            root.scores = None
            root.parent = None
            root.pred_parent_id = 0 # None
            root.pred_relation = 'rroot' # None
            root.vecs = None
            root.lstms = None

    def __len__(self):
        return len(self.roots)


    def Attach(self, parent_index, child_index):
        parent = self.roots[parent_index]
        child = self.roots[child_index]

        child.pred_parent_id = parent.id
        del self.roots[child_index]


def isProj(sentence):
    forest = ParseForest(sentence)
    unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence}

    for _ in xrange(len(sentence)):
        for i in xrange(len(forest.roots) - 1):
            if forest.roots[i].parent_id == forest.roots[i+1].id and unassigned[forest.roots[i].id] == 0:
                unassigned[forest.roots[i+1].id]-=1
                forest.Attach(i+1, i)
                break
            if forest.roots[i+1].parent_id == forest.roots[i].id and unassigned[forest.roots[i+1].id] == 0:
                unassigned[forest.roots[i].id]-=1
                forest.Attach(i, i+1)
                break

    return len(forest.roots) == 1

def vocab(conll_path):
    wordsCount = Counter()
    posCount = Counter()
    relCount = Counter()

    with open(conll_path, 'r') as conllFP:
        for sentence in read_conll(conllFP, True):
            wordsCount.update([node.norm for node in sentence])
            posCount.update([node.pos for node in sentence])
            relCount.update([node.relation for node in sentence])

    return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())},  posCount.keys(), relCount.keys())

def read_conll(fh, proj):
    root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
    tokens = [root]
    for line in fh:
        tok = line.strip().split()
        if not tok:
            if len(tokens)>1:
                if not proj or isProj(tokens):
                    yield tokens
                else:
                    print 'Non-projective sentence dropped'
            tokens = [root]
            id = 0
        else:
            tokens.append(ConllEntry(int(tok[0]), tok[1], tok[3], tok[4], int(tok[6]) if tok[6] != '_' else -1, tok[7]))
    if len(tokens) > 1:
        yield tokens


def write_conll(fn, conll_gen):
    with open(fn, 'w') as fh:
        for sentence in conll_gen:
            for entry in sentence[1:]:
                fh.write('\t'.join([str(entry.id), entry.form, '_', entry.pos, entry.cpos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
                fh.write('\n')
            fh.write('\n')

# ---------------------------
# ADDED

# CONLL-U format:

# 0    ID: Word index, integer starting at 1 for each new sentence; may be a range for tokens with multiple words.
# 1    FORM: Word form or punctuation symbol.
# 2    LEMMA: Lemma or stem of word form.
# 3    UPOSTAG: Universal part-of-speech tag drawn from our revised version of the Google universal POS tags.
# 4    XPOSTAG: Language-specific part-of-speech tag; underscore if not available.
# 5    FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
# 6    HEAD: Head of the current token, which is either a value of ID or zero (0).
# 7    DEPREL: Universal Stanford dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
# 8    DEPS: List of secondary dependencies (head-deprel pairs).
# 9    MISC: Any other annotation.

def read_conll_u(filename):
    with open(filename) as f:
        root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
        tokens = [root]
        for line in f:
            if line.startswith('#'):
                # This is just metadata.
                continue
            elif line == '\n':
                yield tokens
                tokens = [root]
            else:
                tok = line.strip().split()
                entry = ConllEntry(id = int(tok[0]),
                                   form = tok[1],
                                   pos = tok[3],
                                   cpos = '_',
                                   parent_id = int(tok[6]) if tok[6] != '_' else -1,
                                   relation = tok[7])
                tokens.append(entry)

def convert_conllu_to_conll(filename):
    if filename.endswith('conllu'):
        write_conll(fn = filename[:-1],
                    conll_gen = read_conll_u(filename))
    else:
        raise ValueErrorError('This does not seem to be a .conllu file!')

def convert_all(dirname):
    files = glob.glob(dirname + '/*.conllu')
    print "Converting %d files" % len(files)
    for filename in files:
        convert_conllu_to_conll(filename)

# ----------------------------------------

numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");
def normalize(word):
    return 'NUM' if numberRegex.match(word) else word.lower()

cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB",
             "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV",
             ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET",
             "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS",
             "-LRB-": ".", "-RRB-": "."}