fix pipeline

commit 5bd0feda5c
parent 5980a995e5
Author: Patrick Jentsch
Date: 2020-06-23 15:19:39 +02:00
4 changed files with 61 additions and 42 deletions

Dockerfile

@@ -30,7 +30,7 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
 ENV SPACY_VERSION=2.2.4
 ENV SPACY_MODELS_VERSION=2.2.5
-RUN pip3 install "spacy==${SPACY_VERSION}" \
+RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \
  && python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
  && python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \

nlp

@@ -28,16 +28,21 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
 def parse_args():
     parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
-    parser.add_argument('i')
-    parser.add_argument('o')
+    parser.add_argument('-i', '--input-directory',
+                        help='Input directory (only txt files get processed)',
+                        required=True)
+    parser.add_argument('-o', '--output-directory',
+                        help='Output directory',
+                        required=True)
     parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
                         required=True)
+    parser.add_argument('--check-encoding', action='store_true')
+    parser.add_argument('--log-dir')
     parser.add_argument('--n-cores',
                         default=min(4, multiprocessing.cpu_count()),
                         help='total number of cores available', type=int)
-    parser.add_argument('--check-encoding', action='store_true')
-    parser.add_argument('--log-dir')
-    parser.add_argument('--zip')
+    parser.add_argument('--zip',
+                        help='Zips everything into one archive.')
     return parser.parse_args()
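A detail that makes the rest of the diff work: argparse derives the namespace attribute from the first long option, replacing hyphens with underscores, which is why code below can read args.input_directory. A self-contained sketch:

# argparse turns '--input-directory' into the attribute 'input_directory'
# on the parsed namespace; the short '-i' alias maps to the same attribute.
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-i', '--input-directory', required=True)
parser.add_argument('-o', '--output-directory', required=True)
args = parser.parse_args(['-i', 'files/in', '-o', 'files/out'])
print(args.input_directory)   # files/in
print(args.output_directory)  # files/out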
@@ -63,41 +68,47 @@ class NLPPipeline(WorkflowRunner):
         '''
         ' ##################################################
-        ' # mkdir_jobs                                     #
+        ' # setup output directory                         #
         ' ##################################################
         '''
-        mkdir_jobs = []
+        setup_output_directory_jobs = []
         for i, job in enumerate(self.jobs):
             cmd = 'mkdir'
             cmd += ' -p'
             cmd += ' "{}"'.format(job.output_dir)
-            lbl = 'mkdir_job_-_{}'.format(i)
-            mkdir_jobs.append(self.addTask(command=cmd, label=lbl))
+            lbl = 'setup_output_directory_-_{}'.format(i)
+            setup_output_directory_jobs.append(self.addTask(command=cmd,
+                                                            label=lbl))

         '''
         ' ##################################################
-        ' # spacy_nlp_jobs                                 #
+        ' # nlp                                            #
         ' ##################################################
         '''
-        spacy_nlp_jobs = []
+        nlp_jobs = []
         n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
         for i, job in enumerate(self.jobs):
             output_file = os.path.join(job.output_dir,
                                        '{}.vrt'.format(job.name))
-            cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file)
+            cmd = 'spacy-nlp'
+            cmd += ' -i "{}"'.format(job.file)
             cmd += ' -l "{}"'.format(self.lang)
-            cmd += ' --check-encoding' if self.check_encoding else ''
-            deps = 'mkdir_job_-_{}'.format(i)
-            lbl = 'spacy_nlp_job_-_{}'.format(i)
-            spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                               label=lbl, nCores=n_cores))
+            cmd += ' -o "{}"'.format(output_file)
+            if self.check_encoding:
+                cmd += ' --check-encoding'
+            deps = 'setup_output_directory_-_{}'.format(i)
+            lbl = 'nlp_-_{}'.format(i)
+            nlp_jobs.append(self.addTask(command=cmd,
+                                         dependencies=deps,
+                                         label=lbl,
+                                         nCores=n_cores))

         '''
         ' ##################################################
-        ' # zip_jobs                                       #
+        ' # zip creation                                   #
         ' ##################################################
         '''
-        zip_jobs = []
+        zip_creation_jobs = []
         if self.zip is not None:
             # Remove .zip file extension if provided
             if self.zip.endswith('.zip'):
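The per-task core budget in the nlp block above splits the available cores evenly across jobs, never dropping below one core per task and never exceeding the total. A worked example of the formula under assumed inputs:

# Worked example of the n_cores formula from the hunk above.
def per_task_cores(total_cores, n_jobs):
    return min(total_cores, max(1, int(total_cores / n_jobs)))

print(per_task_cores(4, 1))   # 4 -> a single job may use every core
print(per_task_cores(4, 3))   # 1 -> three jobs run concurrently, one core each
print(per_task_cores(4, 10))  # 1 -> never less than one core per task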
@@ -112,9 +123,10 @@ class NLPPipeline(WorkflowRunner):
             cmd += ' -i "*.vrt"'
             cmd += ' && '
             cmd += 'cd -'
-            deps = spacy_nlp_jobs
-            lbl = 'zip_job'
-            zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
-                                         label=lbl))
+            deps = nlp_jobs
+            lbl = 'zip_creation'
+            zip_creation_jobs.append(self.addTask(command=cmd,
+                                                  dependencies=deps,
+                                                  label=lbl))
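For readers new to pyflow: addTask returns the label it was given, and a task starts only after everything named in dependencies has finished, so handing the zip task the whole nlp_jobs list makes it a barrier over all tagging jobs. A minimal sketch using only the calls that appear in this diff:

# Minimal pyflow dependency sketch; the addTask/run usage mirrors the
# hunks above (WorkflowRunner subclasses implement workflow()).
from pyflow import WorkflowRunner

class Demo(WorkflowRunner):
    def workflow(self):
        a = self.addTask(command='echo a', label='task_a')
        b = self.addTask(command='echo b', label='task_b')
        # Runs only once task_a and task_b have both completed:
        self.addTask(command='echo done', dependencies=[a, b], label='barrier')

if __name__ == '__main__':
    Demo().run()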
@@ -132,11 +144,13 @@ def collect_jobs(input_dir, output_dir):
 def main():
     args = parse_args()
-    jobs = collect_jobs(args.i, args.o)
+    jobs = collect_jobs(args.input_directory, args.output_directory)
     nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
-                               args.n_cores, args.o, args.zip)
-    retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o),
-                              nCores=args.n_cores)
+                               args.n_cores, args.output_directory, args.zip)
+    retval = nlp_pipeline.run(
+        dataDirRoot=(args.log_dir or args.output_directory),
+        nCores=args.n_cores
+    )
     sys.exit(retval)
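With the renamed flags, a full pipeline run might look as follows. A hedged sketch via subprocess, assuming the nlp script is on PATH; the paths are placeholders, not from the repository:

# Hypothetical invocation of the pipeline CLI after this commit.
import subprocess

subprocess.run([
    'nlp',
    '-i', '/data/txt',     # --input-directory: only txt files get processed
    '-o', '/data/vrt',     # --output-directory
    '-l', 'de',            # must be a key of SPACY_MODELS
    '--check-encoding',    # optional: detect the input encoding
    '--zip', 'corpus',     # optional: pack the results into an archive
], check=True)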

spacy-nlp

@@ -22,9 +22,10 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
 # Parse the given arguments
 parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
                                      'as a verticalized text file.'))
-parser.add_argument('i', metavar='txt-sourcefile')
-parser.add_argument('o', metavar='vrt-destfile')
-parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
+parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
+parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
+parser.add_argument('-l', '--language',
+                    choices=SPACY_MODELS.keys(),
                     required=True)
 parser.add_argument('--check-encoding', action='store_true')
 args = parser.parse_args()
@@ -33,7 +34,7 @@ args = parser.parse_args()
 # If requested: Check the encoding of the text contents from the input file
 # Else: Use utf-8
 if args.check_encoding:
-    with open(args.i, "rb") as input_file:
+    with open(args.input, "rb") as input_file:
         bytes = input_file.read()
         encoding = chardet.detect(bytes)['encoding']
 else:
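One caveat in the detection branch: chardet.detect returns None for 'encoding' when it cannot reach a verdict, for example on an empty file, so a fallback keeps the later open() from failing. A hedged sketch of a defensive variant; the diff does not show whether the script already guards against this:

# Defensive variant of the encoding detection above (hypothetical).
import chardet

with open('input.txt', 'rb') as input_file:  # placeholder path
    raw = input_file.read()
encoding = chardet.detect(raw)['encoding'] or 'utf-8'  # fall back to utf-8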
@@ -41,14 +42,14 @@ else:
 # hashing in chunks to avoid full RAM with huge files.
-with open(args.i, 'rb') as input_file:
+with open(args.input, 'rb') as input_file:
     md5_hash = hashlib.md5()
     for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
         md5_hash.update(chunk)
     md5_hash = md5_hash.hexdigest()

 # Load the text contents from the input file
-with open(args.i, encoding=encoding) as input_file:
+with open(args.input, encoding=encoding) as input_file:
     text = input_file.read()

 # spaCys NLP is limited to strings with maximum 1 million characters at
 # once. So we split it into suitable chunks.
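The comment above refers to spaCy's max_length guard, which rejects texts over 1,000,000 characters by default. The diff does not show the script's actual splitting code; as a hedged sketch, one simple approach packs paragraphs into chunks below the limit (it assumes no single paragraph exceeds the limit on its own):

# Hypothetical chunker: pack paragraphs into pieces below spaCy's
# default max_length of 1,000,000 characters.
def chunk_text(text, limit=1_000_000):
    chunks = []
    current = ''
    for paragraph in text.split('\n\n'):
        candidate = current + '\n\n' + paragraph if current else paragraph
        if len(candidate) > limit and current:
            chunks.append(current)
            current = paragraph
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks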
@@ -64,8 +65,8 @@ nlp = spacy.load(SPACY_MODELS[args.language])
 # Create the output file in verticalized text format
 # See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
-output_file_original_filename = args.o
-output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
+output_file_original_filename = args.output
+output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
 common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
               + '<corpus>\n'
               + '<text>\n'

Docker wrapper script

@@ -12,17 +12,21 @@ UID = str(os.getuid())
 GID = str(os.getgid())

 parser = ArgumentParser(add_help=False)
-parser.add_argument('-i')
-parser.add_argument('-o')
+parser.add_argument('-i', '--input-directory')
+parser.add_argument('-o', '--output-directory')
 args, remaining_args = parser.parse_known_args()

 cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
-if args.o is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
+if args.output_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
+                                 CONTAINER_OUTPUT_DIR)]
     remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
-if args.i is not None:
-    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
+    remaining_args.insert(0, '-o')
+if args.input_directory is not None:
+    cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
+                                 CONTAINER_INPUT_DIR)]
     remaining_args.insert(0, CONTAINER_INPUT_DIR)
+    remaining_args.insert(0, '-i')
 cmd.append(CONTAINER_IMAGE)
 cmd += remaining_args
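To make the flag rewriting concrete, here is a hedged dry-run trace of what the wrapper assembles for a sample call. The mount points and image name are placeholder assumptions; the real values come from the constants at the top of the script:

# Dry-run reconstruction of the wrapper's rewriting for: nlp -i ./txt -o ./out -l de
import os

CONTAINER_INPUT_DIR = '/input'     # assumption
CONTAINER_OUTPUT_DIR = '/output'   # assumption
CONTAINER_IMAGE = 'nlp:latest'     # assumption

remaining_args = ['-l', 'de']      # what parse_known_args() leaves over
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(os.getuid(), os.getgid())]
cmd += ['-v', '{}:{}'.format(os.path.abspath('./out'), CONTAINER_OUTPUT_DIR)]
remaining_args = ['-o', CONTAINER_OUTPUT_DIR] + remaining_args
cmd += ['-v', '{}:{}'.format(os.path.abspath('./txt'), CONTAINER_INPUT_DIR)]
remaining_args = ['-i', CONTAINER_INPUT_DIR] + remaining_args
cmd.append(CONTAINER_IMAGE)
cmd += remaining_args
print(' '.join(cmd))
# -> docker run --rm -it -u <uid>:<gid> -v /abs/out:/output \
#      -v /abs/txt:/input nlp:latest -i /input -o /output -l de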