cqpserver/merge_corpus_files.py
2019-11-04 14:14:22 +01:00

22 lines
727 B
Python

import argparse
import os
import xml.etree.ElementTree as ET
def merge_corpus_files(corpus_dir, output_name='corpus.vrt'):
    """Merge the <text> element of every file in *corpus_dir* into one file.

    Each regular file in *corpus_dir* (except a previously generated
    *output_name*) is parsed as XML, and the first ``<text>`` child of its
    root element is appended to a new ``<corpus>`` root. The result is
    written to ``<corpus_dir>/<output_name>``. Sub-directories are ignored.

    Args:
        corpus_dir: Directory containing the per-text XML files.
        output_name: File name of the merged corpus inside *corpus_dir*.

    Returns:
        The path of the written corpus file.

    Raises:
        ValueError: If an input file has no ``<text>`` child under its root.
        xml.etree.ElementTree.ParseError: If an input file is not
            well-formed XML.
    """
    output_path = os.path.join(corpus_dir, output_name)
    root = ET.Element('corpus')

    # os.listdir() returns entries in arbitrary order; sort them so the
    # merged corpus is reproducible from run to run.
    for file_name in sorted(os.listdir(corpus_dir)):
        file_path = os.path.join(corpus_dir, file_name)
        # Skip the output of an earlier run and any sub-directories.
        if file_name == output_name or os.path.isdir(file_path):
            continue
        text_node = ET.parse(file_path).find('text')
        if text_node is None:
            # Fail with the offending file name instead of an opaque
            # TypeError from inside ElementTree.
            raise ValueError(
                "no <text> element found in '{}'".format(file_path)
            )
        # Append (rather than insert at a fixed index) so the texts keep
        # the order in which the files were processed.
        root.append(text_node)

    # NOTE(review): ElementTree.write() defaults to US-ASCII output, so
    # non-ASCII characters are emitted as numeric character references;
    # confirm that the consumer of corpus.vrt accepts this.
    ET.ElementTree(root).write(output_path)
    return output_path


def main():
    """Parse the command line and merge the files of the given directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args()
    merge_corpus_files(args.corpus_dir)


if __name__ == '__main__':
    main()