mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2024-12-26 07:54:18 +00:00
fix pipeline
This commit is contained in:
parent
5980a995e5
commit
5bd0feda5c
@ -30,7 +30,7 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
||||
|
||||
ENV SPACY_VERSION=2.2.4
|
||||
ENV SPACY_MODELS_VERSION=2.2.5
|
||||
RUN pip3 install "spacy==${SPACY_VERSION}" \
|
||||
RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \
|
||||
&& python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||
&& python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||
&& python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||
|
66
nlp
66
nlp
@ -28,16 +28,21 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
||||
|
||||
def parse_args():
|
||||
parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
|
||||
parser.add_argument('i')
|
||||
parser.add_argument('o')
|
||||
parser.add_argument('-i', '--input-directory',
|
||||
help='Input directory (only txt files get processed)',
|
||||
required=True)
|
||||
parser.add_argument('-o', '--output-directory',
|
||||
help='Output directory',
|
||||
required=True)
|
||||
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
|
||||
required=True)
|
||||
parser.add_argument('--check-encoding', action='store_true')
|
||||
parser.add_argument('--log-dir')
|
||||
parser.add_argument('--n-cores',
|
||||
default=min(4, multiprocessing.cpu_count()),
|
||||
help='total number of cores available', type=int)
|
||||
parser.add_argument('--check-encoding', action='store_true')
|
||||
parser.add_argument('--log-dir')
|
||||
parser.add_argument('--zip')
|
||||
parser.add_argument('--zip',
|
||||
help='Zips everything into one archive.')
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -63,41 +68,47 @@ class NLPPipeline(WorkflowRunner):
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # mkdir_jobs #
|
||||
' # setup output directory #
|
||||
' ##################################################
|
||||
'''
|
||||
mkdir_jobs = []
|
||||
setup_output_directory_jobs = []
|
||||
for i, job in enumerate(self.jobs):
|
||||
cmd = 'mkdir'
|
||||
cmd += ' -p'
|
||||
cmd += ' "{}"'.format(job.output_dir)
|
||||
lbl = 'mkdir_job_-_{}'.format(i)
|
||||
mkdir_jobs.append(self.addTask(command=cmd, label=lbl))
|
||||
lbl = 'setup_output_directory_-_{}'.format(i)
|
||||
setup_output_directory_jobs.append(self.addTask(command=cmd,
|
||||
label=lbl))
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # spacy_nlp_jobs #
|
||||
' # nlp #
|
||||
' ##################################################
|
||||
'''
|
||||
spacy_nlp_jobs = []
|
||||
nlp_jobs = []
|
||||
n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
|
||||
for i, job in enumerate(self.jobs):
|
||||
output_file = os.path.join(job.output_dir,
|
||||
'{}.vrt'.format(job.name))
|
||||
cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file)
|
||||
cmd = 'spacy-nlp'
|
||||
cmd += ' -i "{}"'.format(job.file)
|
||||
cmd += ' -l "{}"'.format(self.lang)
|
||||
cmd += ' --check-encoding' if self.check_encoding else ''
|
||||
deps = 'mkdir_job_-_{}'.format(i)
|
||||
lbl = 'spacy_nlp_job_-_{}'.format(i)
|
||||
spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
label=lbl, nCores=n_cores))
|
||||
cmd += ' -o "{}"'.format(output_file)
|
||||
if self.check_encoding:
|
||||
cmd += ' --check-encoding'
|
||||
deps = 'setup_output_directory_-_{}'.format(i)
|
||||
lbl = 'nlp_-_{}'.format(i)
|
||||
nlp_jobs.append(self.addTask(command=cmd,
|
||||
dependencies=deps,
|
||||
label=lbl,
|
||||
nCores=n_cores))
|
||||
|
||||
'''
|
||||
' ##################################################
|
||||
' # zip_jobs #
|
||||
' # zip creation #
|
||||
' ##################################################
|
||||
'''
|
||||
zip_jobs = []
|
||||
zip_creation_jobs = []
|
||||
if self.zip is not None:
|
||||
# Remove .zip file extension if provided
|
||||
if self.zip.endswith('.zip'):
|
||||
@ -112,9 +123,10 @@ class NLPPipeline(WorkflowRunner):
|
||||
cmd += ' -i "*.vrt"'
|
||||
cmd += ' && '
|
||||
cmd += 'cd -'
|
||||
deps = spacy_nlp_jobs
|
||||
lbl = 'zip_job'
|
||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
||||
deps = nlp_jobs
|
||||
lbl = 'zip_creation'
|
||||
zip_creation_jobs.append(self.addTask(command=cmd,
|
||||
dependencies=deps,
|
||||
label=lbl))
|
||||
|
||||
|
||||
@ -132,11 +144,13 @@ def collect_jobs(input_dir, output_dir):
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
jobs = collect_jobs(args.i, args.o)
|
||||
jobs = collect_jobs(args.input_directory, args.output_directory)
|
||||
nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
|
||||
args.n_cores, args.o, args.zip)
|
||||
retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o),
|
||||
nCores=args.n_cores)
|
||||
args.n_cores, args.output_directory, args.zip)
|
||||
retval = nlp_pipeline.run(
|
||||
dataDirRoot=(args.log_dir or args.output_directory),
|
||||
nCores=args.n_cores
|
||||
)
|
||||
sys.exit(retval)
|
||||
|
||||
|
||||
|
17
spacy-nlp
17
spacy-nlp
@ -22,9 +22,10 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
||||
# Parse the given arguments
|
||||
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
|
||||
'as a verticalized text file.'))
|
||||
parser.add_argument('i', metavar='txt-sourcefile')
|
||||
parser.add_argument('o', metavar='vrt-destfile')
|
||||
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
|
||||
parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
|
||||
parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
|
||||
parser.add_argument('-l', '--language',
|
||||
choices=SPACY_MODELS.keys(),
|
||||
required=True)
|
||||
parser.add_argument('--check-encoding', action='store_true')
|
||||
args = parser.parse_args()
|
||||
@ -33,7 +34,7 @@ args = parser.parse_args()
|
||||
# If requested: Check the encoding of the text contents from the input file
|
||||
# Else: Use utf-8
|
||||
if args.check_encoding:
|
||||
with open(args.i, "rb") as input_file:
|
||||
with open(args.input, "rb") as input_file:
|
||||
bytes = input_file.read()
|
||||
encoding = chardet.detect(bytes)['encoding']
|
||||
else:
|
||||
@ -41,14 +42,14 @@ else:
|
||||
|
||||
|
||||
# Hash in chunks to avoid filling up RAM with huge files.
|
||||
with open(args.i, 'rb') as input_file:
|
||||
with open(args.input, 'rb') as input_file:
|
||||
md5_hash = hashlib.md5()
|
||||
for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
|
||||
md5_hash.update(chunk)
|
||||
md5_hash = md5_hash.hexdigest()
|
||||
|
||||
# Load the text contents from the input file
|
||||
with open(args.i, encoding=encoding) as input_file:
|
||||
with open(args.input, encoding=encoding) as input_file:
|
||||
text = input_file.read()
|
||||
# spaCy's NLP is limited to strings with a maximum of 1 million characters at
|
||||
# once. So we split it into suitable chunks.
|
||||
@ -64,8 +65,8 @@ nlp = spacy.load(SPACY_MODELS[args.language])
|
||||
|
||||
# Create the output file in verticalized text format
|
||||
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
|
||||
output_file_original_filename = args.o
|
||||
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
|
||||
output_file_original_filename = args.output
|
||||
output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
|
||||
common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
|
||||
+ '<corpus>\n'
|
||||
+ '<text>\n'
|
||||
|
16
wrapper/nlp
16
wrapper/nlp
@ -12,17 +12,21 @@ UID = str(os.getuid())
|
||||
GID = str(os.getgid())
|
||||
|
||||
parser = ArgumentParser(add_help=False)
|
||||
parser.add_argument('-i')
|
||||
parser.add_argument('-o')
|
||||
parser.add_argument('-i', '--input-directory')
|
||||
parser.add_argument('-o', '--output-directory')
|
||||
args, remaining_args = parser.parse_known_args()
|
||||
|
||||
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
|
||||
if args.o is not None:
|
||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
|
||||
if args.output_directory is not None:
|
||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
|
||||
CONTAINER_OUTPUT_DIR)]
|
||||
remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
|
||||
if args.i is not None:
|
||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
|
||||
remaining_args.insert(0, '-o')
|
||||
if args.input_directory is not None:
|
||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
|
||||
CONTAINER_INPUT_DIR)]
|
||||
remaining_args.insert(0, CONTAINER_INPUT_DIR)
|
||||
remaining_args.insert(0, '-i')
|
||||
cmd.append(CONTAINER_IMAGE)
|
||||
cmd += remaining_args
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user