diff --git a/Dockerfile b/Dockerfile
index 799e2c7..ebad0a6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,8 +36,7 @@ RUN echo "yes" | cpan HTML::Entities && \
 
 COPY cqpserver.init /root/
 
-COPY foo.sh /usr/local/bin/
+COPY docker-entrypoint.sh /usr/local/bin/
 
-ENTRYPOINT ["cqpserver"]
-CMD ["-I", "/root/cqpserver.init"]
+ENTRYPOINT ["docker-entrypoint.sh"]
 
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..fcd8f51
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Container entrypoint.
+#
+# Usage:
+#   docker-entrypoint.sh            Start cqpserver with /root/cqpserver.init.
+#   docker-entrypoint.sh prepare    Encode /root/files/corpus.vrt into a CWB corpus.
+#
+# Any other first argument is rejected with a non-zero exit status.
+
+# Abort on the first failing command so that cwb-make does not run after a
+# failed mkdir or cwb-encode.
+set -e
+
+if [ $# -eq 0 ]; then
+    # exec replaces this shell with cqpserver, so the server itself receives
+    # the signals delivered to the container's main process.
+    exec cqpserver -I /root/cqpserver.init
+elif [ "$1" = "prepare" ]; then
+    # Input:
+    # - Source file: "/root/files/corpus.vrt"
+    # Output:
+    # - Corpus name: "CORPUS"
+    # - Corpus data directory: "/corpora/data/corpus"
+    # - Corpus registry file: "/usr/local/share/cwb/registry/corpus"
+    mkdir -p /corpora/data/corpus
+    cwb-encode \
+        -c utf8 \
+        -d /corpora/data/corpus \
+        -f /root/files/corpus.vrt \
+        -R /usr/local/share/cwb/registry/corpus \
+        -P lemma -P simple_pos -P pos -P ner \
+        -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title -S s \
+        -0 corpus \
+        -xsB
+    cwb-make \
+        -V CORPUS
+else
+    # Report on stderr and fail so callers can detect the bad invocation.
+    echo "Unknown command" >&2
+    exit 1
+fi
diff --git a/foo.sh b/foo.sh
deleted file mode 100755
index 5a0678e..0000000
--- a/foo.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-mkdir -p /corpora/data/corpus
-cwb-encode \
-    -c utf8 \
-    -d /corpora/data/corpus \
-    -f /root/files/corpus.vrt \
-    -R /usr/local/share/cwb/registry/corpus \
-    -P lemma -P simple_pos -P pos -P ner \
-    -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title -S s \
-    -0 corpus \
-    -xsB
-cwb-make \
-    -V CORPUS
diff --git a/merge_corpus_files.py b/merge_corpus_files.py
deleted file mode 100644
index cd5a2d1..0000000
--- a/merge_corpus_files.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import argparse
-import os
-import xml.etree.ElementTree as ET
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-dir', dest='corpus_dir', required=True)
-args = parser.parse_args()
-
-text_nodes = []
-for corpus_file in os.listdir(args.corpus_dir):
-    if os.path.isdir(os.path.join(args.corpus_dir, corpus_file)):
-        continue
-    if corpus_file == 'corpus.vrt':
-        continue
-    element_tree = ET.parse(os.path.join(args.corpus_dir, corpus_file))
-    text_nodes.append(element_tree.find('text'))
-element_tree = ET.ElementTree(ET.fromstring(''))
-root = element_tree.getroot()
-for text_node in text_nodes:
-    root.insert(1, text_node)
-element_tree.write(os.path.join(args.corpus_dir, 'corpus.vrt'))