mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver.git
synced 2026-05-09 08:44:39 +00:00
Add preparation scripts
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
import argparse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def main(argv=None):
    """Stamp bibliographic metadata onto the ``<text>`` element of an XML file.

    Parses the command line, loads the XML file given with ``-i``, sets the
    ``author``, ``publishing_year`` and ``title`` attributes on the first
    ``<text>`` child of the root element, and writes the tree back to the
    same path (the input file is overwritten in place).

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``, which is what the command-line entry uses.

    Raises:
        SystemExit: On invalid arguments, or when the input file has no
            ``<text>`` child under its root element.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='input_file', required=True)
    parser.add_argument('--author', dest='author', required=True)
    parser.add_argument('--publishing_year', dest='publishing_year',
                        required=True)
    parser.add_argument('--title', dest='title', required=True)
    args = parser.parse_args(argv)

    element_tree = ET.parse(args.input_file)
    # find() returns None when the root has no <text> child; report that
    # explicitly instead of failing with an AttributeError on .set().
    text_node = element_tree.find('text')
    if text_node is None:
        parser.error(
            "no <text> element found in '{}'".format(args.input_file)
        )

    for attribute in ('author', 'publishing_year', 'title'):
        text_node.set(attribute, getattr(args, attribute))

    element_tree.write(args.input_file)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,17 @@
|
||||
# Print the script's own invocation name (quoted so a path containing
# spaces or glob characters is echoed verbatim).
echo "$0"
# First positional argument: path of the VRT file.
# NOTE(review): vrt_file is not used further down; the cwb-encode call reads
# the fixed path /root/files/example.vrt inside the container — confirm how
# the file is meant to reach the container.
vrt_file="$1"
# Early exit: everything below this line is currently disabled and never
# runs. The guard is kept deliberately; remove it to enable the steps below.
exit

# Start the CQP server container in the background. `docker run -d` prints
# the new container's ID, which is captured so the following `docker exec`
# calls address exactly this container.
CONTAINER_NAME="$(docker run \
  -d \
  pjentsch/cqpserver:latest)"
# Create the data directory for the example corpus inside the container.
docker exec "$CONTAINER_NAME" mkdir /corpora/data/example
# Encode the corpus.
# Token columns noted by the original author: text, lemma, simple_pos, pos, ner
# (that note previously sat in the middle of the command and cut the
# backslash continuation, so the -P/-S options were never passed).
# NOTE(review): the s-attribute declaration for <text> lists the attributes
# written by the metadata preparation script (author, publishing_year,
# title) — confirm against the cwb-encode manual.
docker exec "$CONTAINER_NAME" cwb-encode \
  -d /corpora/data/example \
  -f /root/files/example.vrt \
  -R /usr/local/share/cwb/registry/example \
  -P lemma -P simple_pos -P pos -P ner \
  -S s -S text:0+author+publishing_year+title
# Build the indexes for the encoded corpus.
docker exec "$CONTAINER_NAME" cwb-make \
  -V EXAMPLE
|
||||
@@ -0,0 +1,21 @@
|
||||
import argparse
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def main(argv=None):
    """Merge the ``<text>`` elements of a directory's files into ``corpus.vrt``.

    Every regular file in the directory given with ``-dir`` (except a
    previously generated ``corpus.vrt``) is parsed as XML. The first
    ``<text>`` child of each file's root is collected under a new
    ``<corpus>`` root, and the result is written to ``corpus.vrt`` inside
    the same directory.

    Files are processed in sorted name order and appended in that order, so
    the output is deterministic and keeps the input sequence.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``, which is what the command-line entry uses.

    Raises:
        SystemExit: On invalid arguments, or when an input file has no
            ``<text>`` child under its root element.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args(argv)

    output_name = 'corpus.vrt'
    root = ET.Element('corpus')

    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for corpus_file in sorted(os.listdir(args.corpus_dir)):
        corpus_path = os.path.join(args.corpus_dir, corpus_file)
        # Skip sub-directories and the merged output of an earlier run.
        if os.path.isdir(corpus_path) or corpus_file == output_name:
            continue
        text_node = ET.parse(corpus_path).find('text')
        if text_node is None:
            # Fail here with the offending file name rather than later,
            # when a None entry would be inserted into the tree.
            parser.error(
                "no <text> element found in '{}'".format(corpus_path)
            )
        # append() keeps input order; insert(1, ...) would place every new
        # node directly after the first one and scramble the sequence.
        root.append(text_node)

    ET.ElementTree(root).write(os.path.join(args.corpus_dir, output_name))


if __name__ == '__main__':
    main()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user