https://gricad-gitlab.univ-grenoble-alpes.fr/coavouxm/flaubertagger.git
Tip revision: c939ca9fac094ac3c379256ef3d3d4d14a5a4bf1 authored by m on 23 February 2024, 16:44:50 UTC
up
up
Tip revision: c939ca9
conll2txt.py
import argparse


def iter_sentences(lines, column=1, first_lineno=1):
    """Yield each sentence of a CoNLL-style stream as a list of tokens.

    A sentence is a run of consecutive non-blank lines. Any empty or
    whitespace-only line ends the current sentence, so a separator line that
    carries stray spaces still splits sentences instead of merging them.
    Each non-blank line is split on tabs and the field at index ``column``
    is taken as the token.

    Args:
        lines: Iterable of text lines, with or without a trailing newline
            (an open text file works and is consumed lazily).
        column: Index of the tab-separated field to extract.
        first_lineno: Number given to the first item of ``lines``; only used
            to report accurate positions in error messages.

    Yields:
        Non-empty lists of token strings, one list per sentence.

    Raises:
        ValueError: If a non-blank line has no field at index ``column``.
    """
    tokens = []
    for lineno, raw in enumerate(lines, start=first_lineno):
        line = raw.rstrip("\n")
        if not line.strip():
            # Blank line: close the sentence in progress, if any.
            if tokens:
                yield tokens
                tokens = []
            continue
        fields = line.split("\t")
        try:
            tokens.append(fields[column])
        except IndexError as err:
            raise ValueError(
                f"line {lineno}: no column {column} "
                f"(found {len(fields)} tab-separated field(s)): {line!r}"
            ) from err
    # The last sentence is not necessarily followed by a blank line.
    if tokens:
        yield tokens


def main(argv=None):
    """Convert a CoNLL-style file to plain text, one sentence per line.

    Args:
        argv: Command-line arguments without the program name; ``None``
            makes argparse read ``sys.argv``.

    Raises:
        SystemExit: On invalid arguments, or when an input line lacks the
            requested column.
    """
    parser = argparse.ArgumentParser(
        description="Conll 2 text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("input", help="Input conll file")
    parser.add_argument("--output", help="Output text (default = <input>.tokens)")
    parser.add_argument("--ignore-header", action="store_true", help="Ignore header")
    parser.add_argument("-i", type=int, default=1, help="column id")
    args = parser.parse_args(argv)

    output = args.output if args.output is not None else f"{args.input}.tokens"

    with open(args.input, encoding="utf8") as src, open(output, "w", encoding="utf8") as dst:
        first_lineno = 1
        if args.ignore_header:
            # Drop the first physical line; keep reported line numbers true.
            next(src, None)
            first_lineno = 2
        try:
            for tokens in iter_sentences(src, args.i, first_lineno):
                dst.write(" ".join(tokens) + "\n")
        except ValueError as err:
            raise SystemExit(f"{args.input}: {err}") from err


if __name__ == "__main__":
    main()