fix pipeline

This commit is contained in:
Patrick Jentsch 2020-06-23 15:19:39 +02:00
parent 5980a995e5
commit 5bd0feda5c
4 changed files with 61 additions and 42 deletions

View File

@ -30,7 +30,7 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
ENV SPACY_VERSION=2.2.4
ENV SPACY_MODELS_VERSION=2.2.5
RUN pip3 install "spacy==${SPACY_VERSION}" \
RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \
&& python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
&& python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \

68
nlp
View File

@ -28,16 +28,21 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
def parse_args():
parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
parser.add_argument('i')
parser.add_argument('o')
parser.add_argument('-i', '--input-directory',
help='Input directory (only txt files get processed)',
required=True)
parser.add_argument('-o', '--output-directory',
help='Output directory',
required=True)
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
required=True)
parser.add_argument('--check-encoding', action='store_true')
parser.add_argument('--log-dir')
parser.add_argument('--n-cores',
default=min(4, multiprocessing.cpu_count()),
help='total number of cores available', type=int)
parser.add_argument('--check-encoding', action='store_true')
parser.add_argument('--log-dir')
parser.add_argument('--zip')
parser.add_argument('--zip',
help='Zips everything into one archive.')
return parser.parse_args()
@ -63,41 +68,47 @@ class NLPPipeline(WorkflowRunner):
'''
' ##################################################
' # mkdir_jobs #
' # setup output directory #
' ##################################################
'''
mkdir_jobs = []
setup_output_directory_jobs = []
for i, job in enumerate(self.jobs):
cmd = 'mkdir'
cmd += ' -p'
cmd += ' "{}"'.format(job.output_dir)
lbl = 'mkdir_job_-_{}'.format(i)
mkdir_jobs.append(self.addTask(command=cmd, label=lbl))
lbl = 'setup_output_directory_-_{}'.format(i)
setup_output_directory_jobs.append(self.addTask(command=cmd,
label=lbl))
'''
' ##################################################
' # spacy_nlp_jobs #
' # nlp #
' ##################################################
'''
spacy_nlp_jobs = []
nlp_jobs = []
n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
for i, job in enumerate(self.jobs):
output_file = os.path.join(job.output_dir,
'{}.vrt'.format(job.name))
cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file)
cmd = 'spacy-nlp'
cmd += ' -i "{}"'.format(job.file)
cmd += ' -l "{}"'.format(self.lang)
cmd += ' --check-encoding' if self.check_encoding else ''
deps = 'mkdir_job_-_{}'.format(i)
lbl = 'spacy_nlp_job_-_{}'.format(i)
spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl, nCores=n_cores))
cmd += ' -o "{}"'.format(output_file)
if self.check_encoding:
cmd += ' --check-encoding'
deps = 'setup_output_directory_-_{}'.format(i)
lbl = 'nlp_-_{}'.format(i)
nlp_jobs.append(self.addTask(command=cmd,
dependencies=deps,
label=lbl,
nCores=n_cores))
'''
' ##################################################
' # zip_jobs #
' # zip creation #
' ##################################################
'''
zip_jobs = []
zip_creation_jobs = []
if self.zip is not None:
# Remove .zip file extension if provided
if self.zip.endswith('.zip'):
@ -112,10 +123,11 @@ class NLPPipeline(WorkflowRunner):
cmd += ' -i "*.vrt"'
cmd += ' && '
cmd += 'cd -'
deps = spacy_nlp_jobs
lbl = 'zip_job'
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
label=lbl))
deps = nlp_jobs
lbl = 'zip_creation'
zip_creation_jobs.append(self.addTask(command=cmd,
dependencies=deps,
label=lbl))
def collect_jobs(input_dir, output_dir):
@ -132,11 +144,13 @@ def collect_jobs(input_dir, output_dir):
def main():
args = parse_args()
jobs = collect_jobs(args.i, args.o)
jobs = collect_jobs(args.input_directory, args.output_directory)
nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
args.n_cores, args.o, args.zip)
retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o),
nCores=args.n_cores)
args.n_cores, args.output_directory, args.zip)
retval = nlp_pipeline.run(
dataDirRoot=(args.log_dir or args.output_directory),
nCores=args.n_cores
)
sys.exit(retval)

View File

@ -22,9 +22,10 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
# Parse the given arguments
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
'as a verticalized text file.'))
parser.add_argument('i', metavar='txt-sourcefile')
parser.add_argument('o', metavar='vrt-destfile')
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
parser.add_argument('-l', '--language',
choices=SPACY_MODELS.keys(),
required=True)
parser.add_argument('--check-encoding', action='store_true')
args = parser.parse_args()
@ -33,7 +34,7 @@ args = parser.parse_args()
# If requested: Check the encoding of the text contents from the input file
# Else: Use utf-8
if args.check_encoding:
with open(args.i, "rb") as input_file:
with open(args.input, "rb") as input_file:
bytes = input_file.read()
encoding = chardet.detect(bytes)['encoding']
else:
@ -41,14 +42,14 @@ else:
# hashing in chunks to avoid filling RAM when processing huge files.
with open(args.i, 'rb') as input_file:
with open(args.input, 'rb') as input_file:
md5_hash = hashlib.md5()
for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
md5_hash.update(chunk)
md5_hash = md5_hash.hexdigest()
# Load the text contents from the input file
with open(args.i, encoding=encoding) as input_file:
with open(args.input, encoding=encoding) as input_file:
text = input_file.read()
# spaCy's NLP is limited to strings of at most 1 million characters at
# once, so we split the text into suitably sized chunks.
@ -64,8 +65,8 @@ nlp = spacy.load(SPACY_MODELS[args.language])
# Create the output file in verticalized text format
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
output_file_original_filename = args.o
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
output_file_original_filename = args.output
output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
+ '<corpus>\n'
+ '<text>\n'

View File

@ -12,17 +12,21 @@ UID = str(os.getuid())
GID = str(os.getgid())
parser = ArgumentParser(add_help=False)
parser.add_argument('-i')
parser.add_argument('-o')
parser.add_argument('-i', '--input-directory')
parser.add_argument('-o', '--output-directory')
args, remaining_args = parser.parse_known_args()
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
if args.o is not None:
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
if args.output_directory is not None:
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
CONTAINER_OUTPUT_DIR)]
remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
if args.i is not None:
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
remaining_args.insert(0, '-o')
if args.input_directory is not None:
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
CONTAINER_INPUT_DIR)]
remaining_args.insert(0, CONTAINER_INPUT_DIR)
remaining_args.insert(0, '-i')
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args