Add preparation scripts

This commit is contained in:
Patrick Jentsch 2019-11-04 14:14:22 +01:00
parent 08689317ab
commit 2bb169264c
6 changed files with 16951 additions and 0 deletions

16
create_corpus_file.py Normal file
View File

@ -0,0 +1,16 @@
import argparse
import xml.etree.ElementTree as ET
def set_text_attributes(input_file, author, publishing_year, title):
    """Set bibliographic attributes on the ``text`` element of an XML file.

    The file is parsed, the attributes ``author``, ``publishing_year`` and
    ``title`` are set on its ``text`` element, and the document is written
    back to the same path.

    Args:
        input_file: Path of the XML file to update in place.
        author: Value stored in the ``author`` attribute.
        publishing_year: Value stored in the ``publishing_year`` attribute.
        title: Value stored in the ``title`` attribute.

    Raises:
        ValueError: If the document has no ``text`` element.
    """
    element_tree = ET.parse(input_file)
    text_node = element_tree.find('text')
    if text_node is None:
        # find() only inspects children of the root, so a document whose
        # root *is* the <text> element needs this explicit fallback.
        root = element_tree.getroot()
        if root.tag == 'text':
            text_node = root
    if text_node is None:
        raise ValueError(
            'no <text> element found in {!r}'.format(input_file)
        )
    text_node.set('author', author)
    text_node.set('publishing_year', publishing_year)
    text_node.set('title', title)
    # write() defaults to US-ASCII, which turns every non-ASCII character
    # into a numeric character reference; emit real UTF-8 instead.
    element_tree.write(input_file, encoding='utf-8')


def main():
    """Parse the command line and update the given file in place."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='input_file', required=True)
    parser.add_argument('--author', dest='author', required=True)
    parser.add_argument('--publishing_year', dest='publishing_year',
                        required=True)
    parser.add_argument('--title', dest='title', required=True)
    args = parser.parse_args()
    set_text_attributes(args.input_file, args.author, args.publishing_year,
                        args.title)


if __name__ == '__main__':
    main()

17
foo.sh Executable file
View File

@ -0,0 +1,17 @@
#!/bin/sh
# Scratch script: start a cqpserver container and encode an example corpus
# inside it with cwb-encode / cwb-make.
#
# Usage: foo.sh <vrt_file>

# Quote the expansion so a path containing spaces or glob characters is
# printed verbatim.
echo "$0"

# NOTE(review): vrt_file is read but not used yet; the encode step below
# still points at the hard-coded /root/files/example.vrt.
vrt_file="$1"

# NOTE(review): this early exit leaves every step below unreachable. It is
# kept because the steps look unfinished; remove it once they are ready.
exit

# `docker run -d` prints the ID of the container it started; capture it so
# the `docker exec` calls below have a target (the variable was never set).
CONTAINER_NAME="$(docker run \
    -d \
    pjentsch/cqpserver:latest)"

docker exec "$CONTAINER_NAME" mkdir /corpora/data/example

# Column list noted by the author: text, lemma, simple_pos, pos, ner
# (This list used to sit between the continuation lines, where its words
# were passed to cwb-encode as extra arguments and the following `-P ...`
# line was run as a separate command.)
# NOTE(review): `-S text:` ends in a bare colon and looks incomplete;
# confirm the intended attribute declaration.
docker exec "$CONTAINER_NAME" cwb-encode \
    -d /corpora/data/example \
    -f /root/files/example.vrt \
    -R /usr/local/share/cwb/registry/example \
    -P lemma -P simple_pos -P pos -P ner \
    -S s -S text:

docker exec "$CONTAINER_NAME" cwb-make \
    -V EXAMPLE

21
merge_corpus_files.py Normal file
View File

@ -0,0 +1,21 @@
import argparse
import os
import xml.etree.ElementTree as ET
def merge_corpus_files(corpus_dir, output_name='corpus.vrt'):
    """Merge the ``text`` elements of all files in a directory into one file.

    Every regular entry of *corpus_dir* except *output_name* is parsed as
    XML and its first ``text`` element is collected. The collected elements
    are placed, in file-name order, under a new ``corpus`` root that is
    written to ``<corpus_dir>/<output_name>``.

    Args:
        corpus_dir: Directory holding the individual corpus files.
        output_name: File name of the merged result inside *corpus_dir*;
            an existing file of that name is skipped as input.

    Returns:
        The path of the merged file.

    Raises:
        ValueError: If an input file contains no ``text`` element.
    """
    root = ET.Element('corpus')
    # os.listdir() returns entries in arbitrary order; sort them so the
    # merged corpus is reproducible.
    for corpus_file in sorted(os.listdir(corpus_dir)):
        corpus_path = os.path.join(corpus_dir, corpus_file)
        if os.path.isdir(corpus_path):
            continue
        if corpus_file == output_name:
            # Never feed a previous merge result back into the merge.
            continue
        text_node = ET.parse(corpus_path).find('text')
        if text_node is None:
            raise ValueError(
                'no <text> element found in {!r}'.format(corpus_path)
            )
        # append() keeps input order; inserting at a fixed index of 1
        # would push every later text in front of the earlier ones.
        root.append(text_node)
    output_path = os.path.join(corpus_dir, output_name)
    # write() defaults to US-ASCII, which turns every non-ASCII character
    # into a numeric character reference; emit real UTF-8 instead.
    ET.ElementTree(root).write(output_path, encoding='utf-8')
    return output_path


def main():
    """Parse the command line and merge the given corpus directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args()
    merge_corpus_files(args.corpus_dir)


if __name__ == '__main__':
    main()

8447
test_files/corpus.vrt Normal file

File diff suppressed because it is too large Load Diff

4225
test_files/pjentsch.vrt Normal file

File diff suppressed because it is too large Load Diff

4225
test_files/sporada.vrt Normal file

File diff suppressed because it is too large Load Diff