mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver.git
synced 2024-12-26 03:14:20 +00:00
22 lines
727 B
Python
22 lines
727 B
Python
import argparse
|
|
import os
|
|
import xml.etree.ElementTree as ET
|
|
|
|
OUTPUT_FILE_NAME = 'corpus.vrt'


def merge_corpus(corpus_dir):
    """Merge the ``<text>`` element of every file in *corpus_dir* into one file.

    Every regular file directly inside *corpus_dir* (sub-directories and a
    previously generated ``corpus.vrt`` are skipped) is parsed as XML. The
    first ``<text>`` child of each file's root element is collected and all
    collected elements are written, wrapped in a single ``<corpus>`` root, to
    ``corpus.vrt`` inside *corpus_dir*.

    Args:
        corpus_dir: Path of the directory holding the input files.

    Returns:
        The path of the written ``corpus.vrt`` file.

    Raises:
        ValueError: If an input file has no ``<text>`` child under its root.
        xml.etree.ElementTree.ParseError: If an input file is not valid XML.
    """
    corpus = ET.Element('corpus')
    # os.listdir() yields entries in arbitrary order; sorting makes the merged
    # file reproducible across runs and platforms.
    for file_name in sorted(os.listdir(corpus_dir)):
        file_path = os.path.join(corpus_dir, file_name)
        if os.path.isdir(file_path):
            continue
        # Never feed an earlier result back into the merge.
        if file_name == OUTPUT_FILE_NAME:
            continue
        text_node = ET.parse(file_path).find('text')
        if text_node is None:
            # Fail with the offending file name instead of an opaque
            # TypeError when the node is later attached to the root.
            raise ValueError(
                'no <text> element found in {!r}'.format(file_path)
            )
        # append() keeps the texts in iteration order; inserting at a fixed
        # index would shuffle every element after the first one.
        corpus.append(text_node)
    output_path = os.path.join(corpus_dir, OUTPUT_FILE_NAME)
    ET.ElementTree(corpus).write(output_path)
    return output_path


def main(argv=None):
    """Parse the command line and merge the given corpus directory.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args(argv)
    merge_corpus(args.corpus_dir)


if __name__ == '__main__':
    main()
|