https://gitlab.com/mcoavoux/mtgpy-release-findings-2021.git
Tip revision: c9972219cd75049269d26632d2bb79619d661298 authored by mcoavoux on 20 May 2021, 13:04:44 UTC
up readme
up readme
Tip revision: c997221
corpus_reader.py
from tree import Token, Tree, get_yield
# Column positions inside a 10-field CoNLL row (used as list indices when
# building rows in get_conll).
(
    ID,
    FORM,
    LEMMA,
    CPOS,
    FPOS,
    MORPH,
    HEAD,
    DEPREL,
    PHEAD,
    PDEPREL,
) = range(10)
def is_xml(s):
    """Return True when *s* looks like an XML-style tag.

    A tag is any non-empty string whose first character is "<" and whose
    last character is ">". An empty string is not a tag (it previously
    raised IndexError because of the unguarded ``s[0]`` access).
    """
    # bool(s) guards the indexing below against empty input.
    return bool(s) and s[0] == "<" and s[-1] == ">"
def is_xml_beg(s):
    """Return True when *s* is a tag (see is_xml) that is not a closing tag.

    A closing tag is recognised by a "/" right after the leading "<".
    """
    if not is_xml(s):
        return False
    return s[1] != "/"
def is_xml_end(s):
    """Return True when *s* is a tag (see is_xml) but not an opening tag."""
    if not is_xml(s):
        return False
    return not is_xml_beg(s)
def is_head(s):
    """Return True when *s* is a tag (see is_xml) containing the "^head" marker."""
    if not is_xml(s):
        return False
    return "^head" in s
def get_nt_from_xml(s):
    """Extract the label carried by an opening or closing tag.

    The surrounding "<" / "</" and ">" are removed, and a trailing "^head"
    marker, if present, is dropped from the result.

    Asserts that *s* is either an opening or a closing tag.
    """
    label = s
    if is_xml_beg(s):
        label = s[1:-1]  # drop "<" and ">"
    elif is_xml_end(s):
        label = s[2:-1]  # drop "</" and ">"
    else:
        assert False

    marker = "^head"
    if label.endswith(marker):
        return label[:-len(marker)]
    return label
def parse_token(line):
    """Build a Token from one split token row.

    *line* is a list of column values: column 0 is the token index
    (possibly followed by a "^..." suffix such as a head marker), column 1
    is the token form, and the remaining columns except the last one are
    passed to Token as its features.
    """
    raw_idx = line[0]
    form = line[1]
    # Only the part before "^" is the numeric index.
    position = int(raw_idx.partition("^")[0])
    # The index passed to Token is shifted down by one; the final column is
    # left out of the features.
    return Token(form, position - 1, line[2:-1])
def read_tbk_tree_rec(lines, beg, end, headersize):
    """Recursively build the (sub)tree spanning rows ``lines[beg:end]``.

    *lines* is a list of rows, each row a list of column strings. A row
    with a single column is an opening or closing tag; any other row is a
    token row and must have ``headersize + 1`` columns.

    Returns a Tree for a tag-delimited span, or the result of parse_token
    for a single token row.
    """
    first = lines[beg]

    # Token row: the span must consist of exactly this one row.
    if len(first) != 1:
        assert len(first) == headersize + 1
        assert end == beg + 1
        return parse_token(first)

    # Constituent: the span is delimited by matching opening/closing tags.
    assert is_xml_beg(first[0])
    assert is_xml_end(lines[end - 1][0])
    label = get_nt_from_xml(first[0])
    assert label == get_nt_from_xml(lines[end - 1][0])

    # Record every row index reached at nesting depth 0. These are the
    # starts of the direct children, plus the index of this node's own
    # closing tag, which serves as the end boundary of the last child.
    boundaries = []
    depth = 0
    for pos in range(beg + 1, end):
        if depth == 0:
            boundaries.append(pos)
        cell = lines[pos][0]
        if is_xml_beg(cell):
            depth += 1
        elif is_xml_end(cell):
            depth -= 1

    # Consecutive boundaries give the [start, stop) span of each child.
    children = [
        read_tbk_tree_rec(lines, start, stop, headersize)
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]
    return Tree(label, children)
def read_tbk_tree(string, headersize):
    """Parse the text of one sentence into a tree.

    *string* is split into lines; blank lines are ignored and every
    remaining line is stripped and split on tabs before being handed to
    read_tbk_tree_rec over the full row range.
    """
    rows = []
    for raw in string.split("\n"):
        stripped = raw.strip()
        if stripped:
            rows.append(stripped.split("\t"))
    return read_tbk_tree_rec(rows, 0, len(rows), headersize)
def read_ctbk_corpus(filename):
    """Read a whole corpus file and return one tree per sentence.

    The first line of the file is a whitespace-separated header whose last
    field must be "gdeprel" (checked with an assert). ``Token.header`` is
    set, as a class-level side effect, to the header fields from index 2 up
    to but excluding the last. The rest of the file is split on blank lines
    and every non-empty chunk is parsed with read_tbk_tree.

    The file is opened in a ``with`` block so the handle is always closed,
    including when the header check or the read fails.
    """
    with open(filename, "r", encoding="utf8") as instream:
        header = instream.readline().strip().split()
        assert header[-1] == "gdeprel"
        Token.header = header[2:-1]
        sentences = instream.read().split("\n\n")
    return [read_tbk_tree(s, len(header)) for s in sentences if s.strip()]
def get_conll(tree):
    """Convert the tokens of *tree* into a list of 10-field CoNLL rows.

    Each row is a list of strings initialised to "_". The ID, FORM, CPOS,
    FPOS and MORPH fields are filled from the token; the other fields keep
    the "_" placeholder.
    """
    rows = []
    for tok in get_yield(tree):
        row = ["_"] * 10
        row[ID] = str(tok.i)
        row[FORM] = tok.token
        # The first feature is used for both part-of-speech columns.
        tag = tok.features[0]
        row[CPOS] = tag
        row[FPOS] = tag
        # Remaining features are paired with Token.header names; values
        # equal to "UNDEF" are left out, and the pairs are sorted.
        pairs = [
            f"{attr}={value}"
            for attr, value in zip(Token.header, tok.features[1:])
            if value != "UNDEF"
        ]
        row[MORPH] = "|".join(sorted(pairs))
        rows.append(row)
    return rows
def write_conll(ctree, out):
    """Write CoNLL rows to *out*, one tab-separated line per row.

    *ctree* is an iterable of rows (each an iterable of strings) and *out*
    is any object with a ``write`` method.
    """
    for row in ctree:
        line = "\t".join(row)
        out.write(line + "\n")
def nltk_tree_to_Tree(nltk_tree):
    """Convert an nltk tree into this project's Tree / Token objects.

    A node with exactly one child that is a plain string is treated as a
    leaf: the string must have the form "<index>=<token>" and becomes a
    Token whose features list holds the node's label. Any other node
    becomes a Tree with recursively converted children.
    """
    is_leaf = len(nltk_tree) == 1 and type(nltk_tree[0]) == str
    if not is_leaf:
        converted = [nltk_tree_to_Tree(sub) for sub in nltk_tree]
        return Tree(nltk_tree.label(), converted)

    # Split on the first "=" only, so the token itself may contain "=".
    raw_idx, form = nltk_tree[0].split("=", 1)
    return Token(form, int(raw_idx), [nltk_tree.label()])
def read_discbracket_corpus(filename):
    """Read a discbracket file (one bracketed tree per line) into Trees.

    Each non-blank line is parsed with ``nltk.Tree.fromstring`` and then
    converted with nltk_tree_to_Tree. Blank lines (for instance a trailing
    empty line) are skipped rather than being passed to the parser, which
    rejects empty input.
    """
    from nltk import Tree as nTree

    with open(filename, encoding="utf8") as f:
        stripped_lines = (line.strip() for line in f)
        ctrees = [nTree.fromstring(text) for text in stripped_lines if text]
    return [nltk_tree_to_Tree(t) for t in ctrees]
def get_leaves_rec(nltk_tree, leaves):
    """Append every leaf node of *nltk_tree* to *leaves*, left to right.

    A leaf node is a node with exactly one child that is a plain string.
    The node itself (not the string) is appended, so callers can modify
    the leaf in place. Returns None.
    """
    if len(nltk_tree) != 1 or type(nltk_tree[0]) != str:
        # Internal node: descend into each child in order.
        for subtree in nltk_tree:
            get_leaves_rec(subtree, leaves)
        return
    leaves.append(nltk_tree)
def update_leaves(nltk_tree):
    """Rewrite each leaf string of *nltk_tree* in place as "<i>=<token>".

    Leaves are numbered from 0 in the order get_leaves_rec collects them
    (conversion to the discbracket leaf format).
    """
    collected = []
    get_leaves_rec(nltk_tree, collected)
    position = 0
    for leaf in collected:
        leaf[0] = "{}={}".format(position, leaf[0])
        position += 1
def read_bracket_corpus(filename):
    """Read a bracketed treebank file (one tree per line) into Trees.

    Each non-blank line is parsed with ``nltk.Tree.fromstring``; the leaves
    are then rewritten to the "<i>=<token>" form by update_leaves before
    conversion with nltk_tree_to_Tree. Blank lines (for instance a trailing
    empty line) are skipped rather than being passed to the parser, which
    rejects empty input.
    """
    from nltk import Tree as nTree

    with open(filename, encoding="utf8") as f:
        stripped_lines = (line.strip() for line in f)
        ctrees = [nTree.fromstring(text) for text in stripped_lines if text]
    for t in ctrees:
        update_leaves(t)
    return [nltk_tree_to_Tree(t) for t in ctrees]
if __name__ == "__main__":
    # Ad-hoc manual check: load the same data in several formats, print a
    # few trees, and show every pair of trees whose string forms differ.
    import sys  # only needed for optional CoNLL dumps via write_conll(..., sys.stdout)

    ctbk_path = "/home/mcoavoux/data/multilingual_disco_data/data/dptb/dev.ctbk"
    discbracket_path = "/home/mcoavoux/data/multilingual_disco_data/data/dptb/dev.discbracket"
    bracket_path = "/home/mcoavoux/data/FRENCH/gold/ptb/dev/treebank.mrg"

    treebank_1 = read_ctbk_corpus(ctbk_path)
    for tree in treebank_1[:3]:
        print(tree)

    treebank_2 = read_discbracket_corpus(discbracket_path)
    for tree in treebank_2[:3]:
        print(tree)

    # Report trees that differ between the two readers.
    for t1, t2 in zip(treebank_1, treebank_2):
        if str(t1) == str(t2):
            continue
        print(t1)
        print(t2)
        print()

    treebank_3 = read_bracket_corpus(bracket_path)
    for tree in treebank_3[:3]:
        print(tree)