mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver.git
synced 2024-12-26 03:14:20 +00:00
Add preparation scripts
This commit is contained in:
parent
08689317ab
commit
2bb169264c
16
create_corpus_file.py
Normal file
16
create_corpus_file.py
Normal file
@ -0,0 +1,16 @@
|
||||
import argparse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def set_text_attributes(input_file, author, publishing_year, title):
    """Store bibliographic metadata on the <text> element of a corpus file.

    The file is parsed as XML, the first <text> child of the root element
    receives the attributes ``author``, ``publishing_year`` and ``title``,
    and the file is rewritten in place.

    Args:
        input_file: Path of the XML/VRT file to modify in place.
        author: Value for the ``author`` attribute.
        publishing_year: Value for the ``publishing_year`` attribute.
        title: Value for the ``title`` attribute.

    Raises:
        ValueError: If the root element has no <text> child.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    element_tree = ET.parse(input_file)
    text_node = element_tree.find('text')
    if text_node is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            "no <text> element found below the root of '{}'".format(input_file)
        )
    text_node.set('author', author)
    text_node.set('publishing_year', publishing_year)
    text_node.set('title', title)
    # ElementTree.write() defaults to US-ASCII, which would replace every
    # non-ASCII character with a numeric character reference; write UTF-8 so
    # the token text stays literal.
    element_tree.write(input_file, encoding='utf-8')


def main():
    """Parse the command line and annotate the given corpus file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='input_file', required=True)
    parser.add_argument('--author', dest='author', required=True)
    parser.add_argument('--publishing_year', dest='publishing_year',
                        required=True)
    parser.add_argument('--title', dest='title', required=True)
    args = parser.parse_args()
    try:
        set_text_attributes(args.input_file, args.author,
                            args.publishing_year, args.title)
    except ValueError as error:
        # Report the problem as a usage error (exit status 2), no traceback.
        parser.error(str(error))


if __name__ == '__main__':
    main()
|
17
foo.sh
Executable file
17
foo.sh
Executable file
@ -0,0 +1,17 @@
|
||||
# Work-in-progress helper: start a cqpserver container and encode an example
# corpus inside it with cwb-encode / cwb-make.
echo "$0"
# NOTE(review): vrt_file is stored but not used below; the encode step still
# reads the hard-coded /root/files/example.vrt. Confirm the intended wiring.
vrt_file="$1"
# Early exit: nothing below this line runs yet. Presumably a deliberate guard
# while the script is unfinished - remove it once the steps are verified.
exit

# `docker run -d` prints the ID of the new container; capture it so the
# `docker exec` calls below have a defined target (CONTAINER_NAME was
# previously never assigned).
CONTAINER_NAME="$(docker run \
  -d \
  pjentsch/cqpserver:latest)"
docker exec "$CONTAINER_NAME" mkdir /corpora/data/example
# Input columns: text, lemma, simple_pos, pos, ner
# (this note used to sit inside the command, where it terminated the
# backslash continuation and detached the -P/-S options from cwb-encode).
# NOTE(review): the original s-attribute declaration was the incomplete
# `-S text:`; the annotation list below mirrors the attributes that
# create_corpus_file.py sets on <text> - confirm before relying on it.
docker exec "$CONTAINER_NAME" cwb-encode \
  -d /corpora/data/example \
  -f /root/files/example.vrt \
  -R /usr/local/share/cwb/registry/example \
  -P lemma -P simple_pos -P pos -P ner \
  -S s -S text:0+author+publishing_year+title
docker exec "$CONTAINER_NAME" cwb-make \
  -V EXAMPLE
|
21
merge_corpus_files.py
Normal file
21
merge_corpus_files.py
Normal file
@ -0,0 +1,21 @@
|
||||
import argparse
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def merge_corpus_files(corpus_dir):
    """Merge the <text> elements of all corpus files into ``corpus.vrt``.

    Every regular file directly inside *corpus_dir* (except an already
    existing ``corpus.vrt``) is parsed as XML. The first <text> child of each
    file's root element is collected under a new <corpus> root, which is then
    written to ``corpus.vrt`` in the same directory.

    Args:
        corpus_dir: Directory holding the individual corpus files.

    Raises:
        ValueError: If one of the files has no <text> child below its root.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    root = ET.Element('corpus')
    # os.listdir() returns entries in arbitrary order; sort them so the merged
    # corpus is reproducible from run to run.
    for corpus_file in sorted(os.listdir(corpus_dir)):
        corpus_path = os.path.join(corpus_dir, corpus_file)
        # Skip sub-directories and the output of a previous merge.
        if os.path.isdir(corpus_path) or corpus_file == 'corpus.vrt':
            continue
        text_node = ET.parse(corpus_path).find('text')
        if text_node is None:
            raise ValueError(
                "no <text> element found below the root of '{}'".format(
                    corpus_path)
            )
        # Append rather than insert at index 1: repeated insert(1, ...) put
        # each new text right behind the first one and scrambled the order
        # (first, last, ..., second).
        root.append(text_node)
    # ElementTree.write() defaults to US-ASCII, which would replace every
    # non-ASCII character with a numeric character reference; write UTF-8 so
    # the token text stays literal.
    ET.ElementTree(root).write(os.path.join(corpus_dir, 'corpus.vrt'),
                               encoding='utf-8')


def main():
    """Parse the command line and merge the given corpus directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args()
    try:
        merge_corpus_files(args.corpus_dir)
    except ValueError as error:
        # Report the problem as a usage error (exit status 2), no traceback.
        parser.error(str(error))


if __name__ == '__main__':
    main()
|
8447
test_files/corpus.vrt
Normal file
8447
test_files/corpus.vrt
Normal file
File diff suppressed because it is too large
Load Diff
4225
test_files/pjentsch.vrt
Normal file
4225
test_files/pjentsch.vrt
Normal file
File diff suppressed because it is too large
Load Diff
4225
test_files/sporada.vrt
Normal file
4225
test_files/sporada.vrt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user