mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2024-11-13 08:05:41 +00:00
fix pipeline
This commit is contained in:
parent
5980a995e5
commit
5bd0feda5c
@ -30,7 +30,7 @@ RUN tar -xzf "pyflow-${PYFLOW_VERSION}.tar.gz" \
|
|||||||
|
|
||||||
ENV SPACY_VERSION=2.2.4
|
ENV SPACY_VERSION=2.2.4
|
||||||
ENV SPACY_MODELS_VERSION=2.2.5
|
ENV SPACY_MODELS_VERSION=2.2.5
|
||||||
RUN pip3 install "spacy==${SPACY_VERSION}" \
|
RUN pip3 install setuptools wheel && pip3 install "spacy==${SPACY_VERSION}" \
|
||||||
&& python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
&& python3 -m spacy download "de_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||||
&& python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
&& python3 -m spacy download "el_core_news_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||||
&& python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \
|
&& python3 -m spacy download "en_core_web_sm-${SPACY_MODELS_VERSION}" --direct \
|
||||||
|
68
nlp
68
nlp
@ -28,16 +28,21 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
|||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
|
parser = ArgumentParser(description='NLP Pipeline utilizing spaCy.')
|
||||||
parser.add_argument('i')
|
parser.add_argument('-i', '--input-directory',
|
||||||
parser.add_argument('o')
|
help='Input directory (only txt files get processed)',
|
||||||
|
required=True)
|
||||||
|
parser.add_argument('-o', '--output-directory',
|
||||||
|
help='Output directory',
|
||||||
|
required=True)
|
||||||
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
|
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
|
||||||
required=True)
|
required=True)
|
||||||
|
parser.add_argument('--check-encoding', action='store_true')
|
||||||
|
parser.add_argument('--log-dir')
|
||||||
parser.add_argument('--n-cores',
|
parser.add_argument('--n-cores',
|
||||||
default=min(4, multiprocessing.cpu_count()),
|
default=min(4, multiprocessing.cpu_count()),
|
||||||
help='total number of cores available', type=int)
|
help='total number of cores available', type=int)
|
||||||
parser.add_argument('--check-encoding', action='store_true')
|
parser.add_argument('--zip',
|
||||||
parser.add_argument('--log-dir')
|
help='Zips everything into one archive.')
|
||||||
parser.add_argument('--zip')
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -63,41 +68,47 @@ class NLPPipeline(WorkflowRunner):
|
|||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # mkdir_jobs #
|
' # setup output directory #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
mkdir_jobs = []
|
setup_output_directory_jobs = []
|
||||||
for i, job in enumerate(self.jobs):
|
for i, job in enumerate(self.jobs):
|
||||||
cmd = 'mkdir'
|
cmd = 'mkdir'
|
||||||
cmd += ' -p'
|
cmd += ' -p'
|
||||||
cmd += ' "{}"'.format(job.output_dir)
|
cmd += ' "{}"'.format(job.output_dir)
|
||||||
lbl = 'mkdir_job_-_{}'.format(i)
|
lbl = 'setup_output_directory_-_{}'.format(i)
|
||||||
mkdir_jobs.append(self.addTask(command=cmd, label=lbl))
|
setup_output_directory_jobs.append(self.addTask(command=cmd,
|
||||||
|
label=lbl))
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # spacy_nlp_jobs #
|
' # nlp #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
spacy_nlp_jobs = []
|
nlp_jobs = []
|
||||||
n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
|
n_cores = min(self.n_cores, max(1, int(self.n_cores / len(self.jobs))))
|
||||||
for i, job in enumerate(self.jobs):
|
for i, job in enumerate(self.jobs):
|
||||||
output_file = os.path.join(job.output_dir,
|
output_file = os.path.join(job.output_dir,
|
||||||
'{}.vrt'.format(job.name))
|
'{}.vrt'.format(job.name))
|
||||||
cmd = 'spacy-nlp "{}" "{}"'.format(job.file, output_file)
|
cmd = 'spacy-nlp'
|
||||||
|
cmd += ' -i "{}"'.format(job.file)
|
||||||
cmd += ' -l "{}"'.format(self.lang)
|
cmd += ' -l "{}"'.format(self.lang)
|
||||||
cmd += ' --check-encoding' if self.check_encoding else ''
|
cmd += ' -o "{}"'.format(output_file)
|
||||||
deps = 'mkdir_job_-_{}'.format(i)
|
if self.check_encoding:
|
||||||
lbl = 'spacy_nlp_job_-_{}'.format(i)
|
cmd += ' --check-encoding'
|
||||||
spacy_nlp_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
deps = 'setup_output_directory_-_{}'.format(i)
|
||||||
label=lbl, nCores=n_cores))
|
lbl = 'nlp_-_{}'.format(i)
|
||||||
|
nlp_jobs.append(self.addTask(command=cmd,
|
||||||
|
dependencies=deps,
|
||||||
|
label=lbl,
|
||||||
|
nCores=n_cores))
|
||||||
|
|
||||||
'''
|
'''
|
||||||
' ##################################################
|
' ##################################################
|
||||||
' # zip_jobs #
|
' # zip creation #
|
||||||
' ##################################################
|
' ##################################################
|
||||||
'''
|
'''
|
||||||
zip_jobs = []
|
zip_creation_jobs = []
|
||||||
if self.zip is not None:
|
if self.zip is not None:
|
||||||
# Remove .zip file extension if provided
|
# Remove .zip file extension if provided
|
||||||
if self.zip.endswith('.zip'):
|
if self.zip.endswith('.zip'):
|
||||||
@ -112,10 +123,11 @@ class NLPPipeline(WorkflowRunner):
|
|||||||
cmd += ' -i "*.vrt"'
|
cmd += ' -i "*.vrt"'
|
||||||
cmd += ' && '
|
cmd += ' && '
|
||||||
cmd += 'cd -'
|
cmd += 'cd -'
|
||||||
deps = spacy_nlp_jobs
|
deps = nlp_jobs
|
||||||
lbl = 'zip_job'
|
lbl = 'zip_creation'
|
||||||
zip_jobs.append(self.addTask(command=cmd, dependencies=deps,
|
zip_creation_jobs.append(self.addTask(command=cmd,
|
||||||
label=lbl))
|
dependencies=deps,
|
||||||
|
label=lbl))
|
||||||
|
|
||||||
|
|
||||||
def collect_jobs(input_dir, output_dir):
|
def collect_jobs(input_dir, output_dir):
|
||||||
@ -132,11 +144,13 @@ def collect_jobs(input_dir, output_dir):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
jobs = collect_jobs(args.i, args.o)
|
jobs = collect_jobs(args.input_directory, args.output_directory)
|
||||||
nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
|
nlp_pipeline = NLPPipeline(args.check_encoding, jobs, args.language,
|
||||||
args.n_cores, args.o, args.zip)
|
args.n_cores, args.output_directory, args.zip)
|
||||||
retval = nlp_pipeline.run(dataDirRoot=(args.log_dir or args.o),
|
retval = nlp_pipeline.run(
|
||||||
nCores=args.n_cores)
|
dataDirRoot=(args.log_dir or args.output_directory),
|
||||||
|
nCores=args.n_cores
|
||||||
|
)
|
||||||
sys.exit(retval)
|
sys.exit(retval)
|
||||||
|
|
||||||
|
|
||||||
|
17
spacy-nlp
17
spacy-nlp
@ -22,9 +22,10 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
|||||||
# Parse the given arguments
|
# Parse the given arguments
|
||||||
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
|
parser = ArgumentParser(description=('Tag a text file with spaCy and save it '
|
||||||
'as a verticalized text file.'))
|
'as a verticalized text file.'))
|
||||||
parser.add_argument('i', metavar='txt-sourcefile')
|
parser.add_argument('-i', '--input', metavar='txt-sourcefile', required=True)
|
||||||
parser.add_argument('o', metavar='vrt-destfile')
|
parser.add_argument('-o', '--output', metavar='vrt-destfile', required=True)
|
||||||
parser.add_argument('-l', '--language', choices=SPACY_MODELS.keys(),
|
parser.add_argument('-l', '--language',
|
||||||
|
choices=SPACY_MODELS.keys(),
|
||||||
required=True)
|
required=True)
|
||||||
parser.add_argument('--check-encoding', action='store_true')
|
parser.add_argument('--check-encoding', action='store_true')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -33,7 +34,7 @@ args = parser.parse_args()
|
|||||||
# If requested: Check the encoding of the text contents from the input file
|
# If requested: Check the encoding of the text contents from the input file
|
||||||
# Else: Use utf-8
|
# Else: Use utf-8
|
||||||
if args.check_encoding:
|
if args.check_encoding:
|
||||||
with open(args.i, "rb") as input_file:
|
with open(args.input, "rb") as input_file:
|
||||||
bytes = input_file.read()
|
bytes = input_file.read()
|
||||||
encoding = chardet.detect(bytes)['encoding']
|
encoding = chardet.detect(bytes)['encoding']
|
||||||
else:
|
else:
|
||||||
@ -41,14 +42,14 @@ else:
|
|||||||
|
|
||||||
|
|
||||||
# hashing in chunks to avoid full RAM with huge files.
|
# hashing in chunks to avoid full RAM with huge files.
|
||||||
with open(args.i, 'rb') as input_file:
|
with open(args.input, 'rb') as input_file:
|
||||||
md5_hash = hashlib.md5()
|
md5_hash = hashlib.md5()
|
||||||
for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
|
for chunk in iter(lambda: input_file.read(128 * md5_hash.block_size), b''):
|
||||||
md5_hash.update(chunk)
|
md5_hash.update(chunk)
|
||||||
md5_hash = md5_hash.hexdigest()
|
md5_hash = md5_hash.hexdigest()
|
||||||
|
|
||||||
# Load the text contents from the input file
|
# Load the text contents from the input file
|
||||||
with open(args.i, encoding=encoding) as input_file:
|
with open(args.input, encoding=encoding) as input_file:
|
||||||
text = input_file.read()
|
text = input_file.read()
|
||||||
# spaCys NLP is limited to strings with maximum 1 million characters at
|
# spaCys NLP is limited to strings with maximum 1 million characters at
|
||||||
# once. So we split it into suitable chunks.
|
# once. So we split it into suitable chunks.
|
||||||
@ -64,8 +65,8 @@ nlp = spacy.load(SPACY_MODELS[args.language])
|
|||||||
|
|
||||||
# Create the output file in verticalized text format
|
# Create the output file in verticalized text format
|
||||||
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
|
# See: http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial/node3.html
|
||||||
output_file_original_filename = args.o
|
output_file_original_filename = args.output
|
||||||
output_file_stand_off_filename = args.o.replace('.vrt', '.stand-off.vrt')
|
output_file_stand_off_filename = args.output.replace('.vrt', '.stand-off.vrt')
|
||||||
common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
|
common_xml = ('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
|
||||||
+ '<corpus>\n'
|
+ '<corpus>\n'
|
||||||
+ '<text>\n'
|
+ '<text>\n'
|
||||||
|
16
wrapper/nlp
16
wrapper/nlp
@ -12,17 +12,21 @@ UID = str(os.getuid())
|
|||||||
GID = str(os.getgid())
|
GID = str(os.getgid())
|
||||||
|
|
||||||
parser = ArgumentParser(add_help=False)
|
parser = ArgumentParser(add_help=False)
|
||||||
parser.add_argument('-i')
|
parser.add_argument('-i', '--input-directory')
|
||||||
parser.add_argument('-o')
|
parser.add_argument('-o', '--output-directory')
|
||||||
args, remaining_args = parser.parse_known_args()
|
args, remaining_args = parser.parse_known_args()
|
||||||
|
|
||||||
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
|
cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)]
|
||||||
if args.o is not None:
|
if args.output_directory is not None:
|
||||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.o), CONTAINER_OUTPUT_DIR)]
|
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.output_directory),
|
||||||
|
CONTAINER_OUTPUT_DIR)]
|
||||||
remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
|
remaining_args.insert(0, CONTAINER_OUTPUT_DIR)
|
||||||
if args.i is not None:
|
remaining_args.insert(0, '-o')
|
||||||
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.i), CONTAINER_INPUT_DIR)]
|
if args.input_directory is not None:
|
||||||
|
cmd += ['-v', '{}:{}'.format(os.path.abspath(args.input_directory),
|
||||||
|
CONTAINER_INPUT_DIR)]
|
||||||
remaining_args.insert(0, CONTAINER_INPUT_DIR)
|
remaining_args.insert(0, CONTAINER_INPUT_DIR)
|
||||||
|
remaining_args.insert(0, '-i')
|
||||||
cmd.append(CONTAINER_IMAGE)
|
cmd.append(CONTAINER_IMAGE)
|
||||||
cmd += remaining_args
|
cmd += remaining_args
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user