From 27900f391b0979a0126c64bda7059ca374c669ea Mon Sep 17 00:00:00 2001
From: Patrick Jentsch
Date: Wed, 29 Jan 2020 11:48:15 +0100
Subject: [PATCH] Remove foo script and move its functionality into an
entrypoint script.
---
Dockerfile | 5 ++---
docker-entrypoint.sh | 26 ++++++++++++++++++++++++++
foo.sh | 14 --------------
merge_corpus_files.py | 21 ---------------------
4 files changed, 28 insertions(+), 38 deletions(-)
create mode 100755 docker-entrypoint.sh
delete mode 100755 foo.sh
delete mode 100644 merge_corpus_files.py
diff --git a/Dockerfile b/Dockerfile
index 799e2c7..ebad0a6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,8 +36,7 @@ RUN echo "yes" | cpan HTML::Entities && \
COPY cqpserver.init /root/
-COPY foo.sh /usr/local/bin/
+COPY docker-entrypoint.sh /usr/local/bin/
-ENTRYPOINT ["cqpserver"]
-CMD ["-I", "/root/cqpserver.init"]
+ENTRYPOINT ["docker-entrypoint.sh"]
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..fcd8f51
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Container entrypoint: with no arguments start cqpserver; with "prepare" encode and index the corpus.
+set -e  # Abort on the first failing command so cwb-make never runs after a failed mkdir/cwb-encode.
+if [ $# -eq 0 ]; then
+    exec cqpserver -I /root/cqpserver.init  # exec: cqpserver replaces this shell, so signals sent to the container reach it directly.
+elif [ "$1" = "prepare" ]; then
+    # Input: source file "/root/files/corpus.vrt"
+    # Output:
+    # - Corpus name: "CORPUS"
+    # - Corpus data directory: "/corpora/data/corpus"
+    # - Corpus registry file: "/usr/local/share/cwb/registry/corpus"
+    mkdir -p /corpora/data/corpus
+    cwb-encode \
+        -c utf8 \
+        -d /corpora/data/corpus \
+        -f /root/files/corpus.vrt \
+        -R /usr/local/share/cwb/registry/corpus \
+        -P lemma -P simple_pos -P pos -P ner \
+        -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title -S s \
+        -0 corpus \
+        -xsB
+    cwb-make -V CORPUS
+else
+    echo "Unknown command" >&2  # Report on stderr and exit non-zero so callers can detect the misuse.
+    exit 1
+fi
diff --git a/foo.sh b/foo.sh
deleted file mode 100755
index 5a0678e..0000000
--- a/foo.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-mkdir -p /corpora/data/corpus
-cwb-encode \
- -c utf8 \
- -d /corpora/data/corpus \
- -f /root/files/corpus.vrt \
- -R /usr/local/share/cwb/registry/corpus \
- -P lemma -P simple_pos -P pos -P ner \
- -S text:0+address+author+booktitle+chapter+editor+institution+journal+pages+publisher+publishing_year+school+title -S s \
- -0 corpus \
- -xsB
-cwb-make \
- -V CORPUS
diff --git a/merge_corpus_files.py b/merge_corpus_files.py
deleted file mode 100644
index cd5a2d1..0000000
--- a/merge_corpus_files.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import argparse
-import os
-import xml.etree.ElementTree as ET
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-dir', dest='corpus_dir', required=True)
-args = parser.parse_args()
-
-text_nodes = []
-for corpus_file in os.listdir(args.corpus_dir):
- if os.path.isdir(os.path.join(args.corpus_dir, corpus_file)):
- continue
- if corpus_file == 'corpus.vrt':
- continue
- element_tree = ET.parse(os.path.join(args.corpus_dir, corpus_file))
- text_nodes.append(element_tree.find('text'))
-element_tree = ET.ElementTree(ET.fromstring(''))
-root = element_tree.getroot()
-for text_node in text_nodes:
- root.insert(1, text_node)
-element_tree.write(os.path.join(args.corpus_dir, 'corpus.vrt'))