Mirror of https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git (synced 2025-10-31 13:02:44 +00:00)
			
		
		
		
	Bump spaCy version, bugfixes, codestyle
Dockerfile (28 lines changed)
							| @@ -9,7 +9,14 @@ ENV LANG=C.UTF-8 | ||||
|  | ||||
| RUN apt-get update \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|      wget | ||||
|       procps \ | ||||
|       python3.7 \ | ||||
|       python3-pip \ | ||||
|       wget \ | ||||
|  && python3 -m pip install \ | ||||
|       chardet \ | ||||
|       setuptools \ | ||||
|       wheel | ||||
|  | ||||
| # Install the NLP pipeline and it's dependencies # | ||||
| ## Install pyFlow ## | ||||
| @@ -21,12 +28,12 @@ RUN wget --no-check-certificate --quiet \ | ||||
|  && apt-get install --no-install-recommends --yes \ | ||||
|       python2.7 \ | ||||
|  && python2.7 setup.py build install \ | ||||
|  && cd .. \ | ||||
|  && cd - > /dev/null \ | ||||
|  && rm -r "pyflow-${PYFLOW_VERSION}" "pyflow-${PYFLOW_VERSION}.tar.gz" | ||||
|  | ||||
|  | ||||
| ## Install spaCy ## | ||||
| ENV SPACY_VERSION=3.0.5 | ||||
| ENV SPACY_VERSION=3.2.1 | ||||
| RUN apt-get install --no-install-recommends --yes \ | ||||
|       python3.7 \ | ||||
|       python3-pip \ | ||||
| @@ -38,23 +45,14 @@ RUN apt-get install --no-install-recommends --yes \ | ||||
|  && pip3 install "spacy==${SPACY_VERSION}" | ||||
|  | ||||
|  | ||||
| # Only models that include the following components are compatibel: | ||||
| # lemmatizer, ner, parser, senter, tagger, | ||||
| ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,nl_core_news_md,pl_core_news_md,zh_core_web_md" | ||||
| ENV SPACY_MODELS_VERSION=3.0.0 | ||||
| ENV SPACY_MODELS="de_core_news_md,en_core_web_md,it_core_news_md,pl_core_news_md,zh_core_web_md" | ||||
| ENV SPACY_MODELS_VERSION=3.2.0 | ||||
| RUN for spacy_model in $(echo ${SPACY_MODELS} | tr "," "\n"); do python3 -m spacy download "${spacy_model}-${SPACY_MODELS_VERSION}" --direct; done | ||||
|  | ||||
|  | ||||
| ## Further dependencies ## | ||||
| RUN apt-get install --no-install-recommends --yes \ | ||||
|       procps \ | ||||
|       zip | ||||
|  | ||||
|  | ||||
| COPY packages . | ||||
| RUN cd stand-off-data-py \ | ||||
|  && python3 setup.py build \ | ||||
|  && python3 setup.py install \ | ||||
|  && python3 -m pip install . \ | ||||
|  && cd - | ||||
|  | ||||
|  | ||||
|   | ||||
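The comment removed from the Dockerfile noted that only models shipping a lemmatizer, ner, parser, senter and tagger component are compatible with the pipeline. As a quick sanity check, the downloaded models can be loaded and their components listed; this is an illustrative sketch, not part of the commit, and the model list simply mirrors the SPACY_MODELS build variable (it assumes the models are installed, e.g. inside the built image).

```python
# Illustrative sketch: list the pipeline components of each model the
# Dockerfile downloads, mirroring the has_pipe() checks used by spacy-nlp.
import spacy

MODELS = [
    'de_core_news_md', 'en_core_web_md', 'it_core_news_md',
    'pl_core_news_md', 'zh_core_web_md',
]

for model_name in MODELS:
    nlp = spacy.load(model_name)
    # pipe_names lists the enabled components, e.g. tagger, parser, ner, ...
    print(f'{model_name}: {", ".join(nlp.pipe_names)}')
```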
							
								
								
									
LICENSE (new file, 21 lines)
							| @@ -0,0 +1,21 @@ | ||||
| MIT License | ||||
|  | ||||
| Copyright (c) 2021 Bielefeld University - CRC 1288 - INF | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
| of this software and associated documentation files (the "Software"), to deal | ||||
| in the Software without restriction, including without limitation the rights | ||||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||
| copies of the Software, and to permit persons to whom the Software is | ||||
| furnished to do so, subject to the following conditions: | ||||
|  | ||||
| The above copyright notice and this permission notice shall be included in all | ||||
| copies or substantial portions of the Software. | ||||
|  | ||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
| SOFTWARE. | ||||
							
								
								
									
README.md (61 lines changed)
							| @@ -1,48 +1,41 @@ | ||||
| # NLP - Natural Language Processing | ||||
|  | ||||
| This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. | ||||
| This software implements a heavily parallelized pipeline for Natural Language Processing of text files. It is used for nopaque's NLP service but you can also use it standalone, for that purpose a convenient wrapper script is provided. The pipeline is designed to run on Linux operating systems, but with some tweaks it should also run on Windows with WSL installed. | ||||
|  | ||||
| ## Software used in this pipeline implementation | ||||
| - Official Debian Docker image (buster-slim) and programs from its free repositories: https://hub.docker.com/_/debian | ||||
|  | ||||
| - Official Debian Docker image (buster-slim): https://hub.docker.com/_/debian | ||||
|   - Software from Debian Buster's free repositories | ||||
| - pyFlow (1.1.20): https://github.com/Illumina/pyflow/releases/tag/v1.1.20 | ||||
| - spaCy (3.0.5): https://github.com/tesseract-ocr/tesseract/releases/tag/4.1.1 | ||||
| - spaCy medium sized models (3.0.0): | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.0.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.0.0 | ||||
| - spaCy (3.2.1): https://github.com/explosion/spaCy/releases/tag/v3.2.1 | ||||
| - spaCy medium sized models (3.2.0): | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/de_core_news_md-3.2.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/en_core_web_md-3.2.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/it_core_news_md-3.2.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/nl_core_news_md-3.2.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/pl_core_news_md-3.2.0 | ||||
|   - https://github.com/explosion/spacy-models/releases/tag/zh_core_web_md-3.2.0 | ||||
|  | ||||
|  | ||||
| ## Use this image | ||||
| ## Installation | ||||
|  | ||||
| 1. Create input and output directories for the pipeline. | ||||
| ``` bash | ||||
| mkdir -p /<my_data_location>/input /<my_data_location>/output | ||||
| ``` | ||||
| 1. Install Docker and Python 3. | ||||
| 2. Clone this repository: `git clone https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git` | ||||
| 3. Build the Docker image: `docker build -t gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0 nlp` | ||||
| 4. Add the wrapper script (`wrapper/nlp` relative to this README file) to your `${PATH}`. | ||||
| 5. Create working directories for the pipeline: `mkdir -p /<my_data_location>/{input,output}`. | ||||
|  | ||||
| 2. Place your text files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||
|  | ||||
| ## Use the Pipeline | ||||
|  | ||||
| 1. Place your plain text files inside `/<my_data_location>/input`. Files should all contain text of the same language. | ||||
| 2. Clear your `/<my_data_location>/output` directory. | ||||
| 3. Start the pipeline process. Check the pipeline help (`nlp --help`) for more details. | ||||
| ``` | ||||
| # Option one: Use the wrapper script | ||||
| ## Install the wrapper script (only on first run). Get it from https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/raw/1.0.0/wrapper/nlp, make it executeable and add it to your ${PATH} | ||||
| ```bash | ||||
| cd /<my_data_location> | ||||
| nlp -i input -l <language_code> -o output <optional_pipeline_arguments> | ||||
|  | ||||
| # Option two: Classic Docker style | ||||
| docker run \ | ||||
|     --rm \ | ||||
|     -it \ | ||||
|     -u $(id -u $USER):$(id -g $USER) \ | ||||
|     -v /<my_data_location>/input:/input \ | ||||
|     -v /<my_data_location>/output:/output \ | ||||
|     gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0 \ | ||||
|         -i /input \ | ||||
|         -l <language_code> | ||||
|         -o /output \ | ||||
|         <optional_pipeline_arguments> | ||||
| nlp \ | ||||
|   --input-dir input \ | ||||
|   --output-dir output \ | ||||
|   -m <model_code> <optional_pipeline_arguments> | ||||
| ``` | ||||
|  | ||||
| 4. Check your results in the `/<my_data_location>/output` directory. | ||||
|   | ||||
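The wrapper call shown above can also be scripted; a minimal sketch, assuming the wrapper is on `${PATH}`, with the README's placeholder path and a hypothetical model code filled in.

```python
# Hypothetical sketch: run the nlp wrapper from Python instead of the shell.
import subprocess

subprocess.run(
    [
        'nlp',
        '--input-dir', 'input',
        '--output-dir', 'output',
        '-m', 'de',  # model code; one of the values accepted by --model
    ],
    cwd='/<my_data_location>',  # placeholder path from the README
    check=True,
)
```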
							
								
								
									
nlp (339 lines changed)
							| @@ -1,73 +1,141 @@ | ||||
| #!/usr/bin/env python2.7 | ||||
| # coding=utf-8 | ||||
|  | ||||
| """A NLP pipeline for text file processing.""" | ||||
|  | ||||
| __author__ = 'Patrick Jentsch <p.jentsch@uni-bielefeld.de>,' \ | ||||
|              'Stephan Porada <porada@posteo.de>' | ||||
| __version__ = '1.0.0' | ||||
| ''' A NLP pipeline for text file processing. ''' | ||||
| __version__ = '0.1.0' | ||||
|  | ||||
| from argparse import ArgumentParser | ||||
| from pyflow import WorkflowRunner | ||||
| import multiprocessing | ||||
| import json | ||||
| import os | ||||
| import sys | ||||
|  | ||||
|  | ||||
| SPACY_MODELS = {'de': 'de_core_news_md', | ||||
| SPACY_MODELS = { | ||||
|     'de': 'de_core_news_md', | ||||
|     'en': 'en_core_web_md', | ||||
|     'it': 'it_core_news_md', | ||||
|     'nl': 'nl_core_news_md', | ||||
|     'pl': 'pl_core_news_md', | ||||
|                 'zh': 'zh_core_web_md'} | ||||
|     'zh': 'zh_core_web_md' | ||||
| } | ||||
|  | ||||
|  | ||||
| class NLPPipelineJob: | ||||
|     """An NLP pipeline job class | ||||
| class PipelineJob: | ||||
|     ''' | ||||
|     NLP pipeline job class. | ||||
|  | ||||
|     Each input file of the pipeline is represented as an NLP pipeline job, | ||||
|     which holds all necessary information for the pipeline to process it. | ||||
|     Each plain text input file of the pipeline is represented as an NLP | ||||
|     pipeline job, which holds all necessary information for the pipeline to | ||||
|     process it. | ||||
|  | ||||
|     Arguments: | ||||
|     file -- Path to the file | ||||
|     output_dir -- Path to a directory, where job results a stored | ||||
|     """ | ||||
|     output_dir -- Path to a directory, where job results are stored | ||||
|     ''' | ||||
|  | ||||
|     def __init__(self, file, output_dir): | ||||
|         self.file = file | ||||
|         self.name = os.path.basename(file).rsplit('.', 1)[0] | ||||
|         self.name = os.path.basename(file)[:-4] | ||||
|         self.output_dir = output_dir | ||||
|         catma_stand_off_data_file = file.rsplit('.', 1)[0] + '.catma-stand-off.json'  # noqa | ||||
|         if os.path.exists(catma_stand_off_data_file): | ||||
|             self.catma_stand_off_data_file = catma_stand_off_data_file | ||||
|         else: | ||||
|             self.catma_stand_off_data_file = None | ||||
|  | ||||
|  | ||||
| class NLPPipeline(WorkflowRunner): | ||||
|     def __init__(self, input_dir, output_dir, check_encoding, lang, zip): | ||||
| class NLPWorkflow(WorkflowRunner): | ||||
|     def __init__(self, job, model, check_encoding=False, id_prefix=''): | ||||
|         self.job = job | ||||
|         self.model = model | ||||
|         self.check_encoding = check_encoding | ||||
|         self.id_prefix = id_prefix | ||||
|  | ||||
|     def workflow(self): | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # spacy                                          # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(1024, self.getMemMb()) | ||||
|         cmd = 'spacy-nlp' | ||||
|         cmd += ' --input-file "{}"'.format(self.job.file) | ||||
|         cmd += ' --output-file "{}"'.format( | ||||
|             os.path.join(self.job.output_dir, '{}.json'.format(self.job.name)) | ||||
|         ) | ||||
|         cmd += ' -m "{}"'.format(self.model) | ||||
|         if self.check_encoding: | ||||
|             cmd += ' --check-encoding' | ||||
|         cmd += ' --id-prefix "{}"'.format(self.id_prefix) | ||||
|         self.addTask( | ||||
|             'spacy', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class CreateVrtWorkflow(WorkflowRunner): | ||||
|     def __init__(self, job): | ||||
|         self.job = job | ||||
|  | ||||
|     def workflow(self): | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # vrt-creator                                    # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         n_cores = 1 | ||||
|         mem_mb = min(256, self.getMemMb()) | ||||
|         cmd = 'vrt-creator' | ||||
|         cmd += ' --stand-off-data-file "{}"'.format( | ||||
|             os.path.join(self.job.output_dir, '{}.json'.format(self.job.name)) | ||||
|         ) | ||||
|         cmd += ' --text-file "{}"'.format(self.job.file) | ||||
|         cmd += ' --output-file "{}"'.format( | ||||
|             os.path.join(self.job.output_dir, '{}.vrt'.format(self.job.name)) | ||||
|         ) | ||||
|         self.addTask( | ||||
|             'vrt_creator', | ||||
|             command=cmd, | ||||
|             memMb=mem_mb, | ||||
|             nCores=n_cores | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class MainWorkflow(WorkflowRunner): | ||||
|     def __init__( | ||||
|         self, | ||||
|         input_dir, | ||||
|         model, | ||||
|         output_dir, | ||||
|         check_encoding=False, | ||||
|         id_prefix='' | ||||
|     ): | ||||
|         self.input_dir = input_dir | ||||
|         self.model = model | ||||
|         self.output_dir = output_dir | ||||
|         self.check_encoding = check_encoding | ||||
|         self.lang = lang | ||||
|         self.zip = zip | ||||
|         self.jobs = collect_jobs(self.input_dir, self.output_dir) | ||||
|         self.id_prefix = id_prefix | ||||
|         self.jobs = [] | ||||
|  | ||||
|     def collect_jobs(self): | ||||
|         self.jobs = [] | ||||
|         for file in os.listdir(self.input_dir): | ||||
|             if os.path.isdir(os.path.join(self.input_dir, file)): | ||||
|                 continue | ||||
|             if not file.lower().endswith('.txt'): | ||||
|                 continue | ||||
|             job = PipelineJob( | ||||
|                 os.path.join(self.input_dir, file), | ||||
|                 os.path.join(self.output_dir, file) | ||||
|             ) | ||||
|             self.jobs.append(job) | ||||
|  | ||||
|     def workflow(self): | ||||
|         if not self.jobs: | ||||
|             return | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # setup output directory                         # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         setup_output_directory_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             cmd = 'mkdir -p "{}"'.format(job.output_dir) | ||||
|             lbl = 'setup_output_directory_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, label=lbl) | ||||
|             setup_output_directory_tasks.append(task) | ||||
|         # Create output and temporary directories | ||||
|         for job in self.jobs: | ||||
|             os.mkdir(job.output_dir) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
| @@ -75,106 +143,116 @@ class NLPPipeline(WorkflowRunner): | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         nlp_tasks = [] | ||||
|         n_cores = max(1, int(self.getNCores() / len(self.jobs))) | ||||
|         mem_mb = min(n_cores * 2048, int(self.getMemMb() / len(self.jobs))) | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa | ||||
|             cmd = 'spacy-nlp' | ||||
|             cmd += ' -l "{}"'.format(self.lang) | ||||
|             cmd += ' --check-encoding' if self.check_encoding else '' | ||||
|             cmd += ' "{}"'.format(job.file) | ||||
|             cmd += ' "{}"'.format(output_file) | ||||
|             deps = 'setup_output_directory_-_{}'.format(i) | ||||
|             lbl = 'nlp_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl, | ||||
|                                 memMb=mem_mb, nCores=n_cores) | ||||
|             task = self.addWorkflowTask( | ||||
|                 'nlp_-_{}'.format(i), | ||||
|                 NLPWorkflow( | ||||
|                     job, | ||||
|                     self.model, | ||||
|                     check_encoding=self.check_encoding, | ||||
|                     id_prefix=self.id_prefix | ||||
|                 ) | ||||
|             ) | ||||
|             nlp_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # vrt creation                                   # | ||||
|         ' # create vrt                                     # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         vrt_creation_tasks = [] | ||||
|         create_vrt_tasks = [] | ||||
|         for i, job in enumerate(self.jobs): | ||||
|             output_file = os.path.join(job.output_dir, '{}.vrt'.format(job.name))  # noqa | ||||
|             nopaque_stand_off_data_file = os.path.join(job.output_dir, '{}.nopaque-stand-off.json'.format(job.name))  # noqa | ||||
|             cmd = 'vrt-creator' | ||||
|             cmd += ' "{}"'.format(job.file) | ||||
|             cmd += ' "{}"'.format(nopaque_stand_off_data_file) | ||||
|             if job.catma_stand_off_data_file is not None: | ||||
|                 cmd += ' --catma-stand-off-data "{}"'.format(job.catma_stand_off_data_file)  # noqa | ||||
|             cmd += ' "{}"'.format(output_file) | ||||
|             deps = 'nlp_-_{}'.format(i) | ||||
|             lbl = 'vrt_creation_-_{}'.format(i) | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             vrt_creation_tasks.append(task) | ||||
|             task = self.addWorkflowTask( | ||||
|                 'create_vrt_-_{}'.format(i), | ||||
|                 CreateVrtWorkflow(job), | ||||
|                 dependencies='nlp_-_{}'.format(i) | ||||
|             ) | ||||
|             create_vrt_tasks.append(task) | ||||
|  | ||||
|         ''' | ||||
|         ' ################################################## | ||||
|         ' # zip creation                                   # | ||||
|         ' ################################################## | ||||
|         ''' | ||||
|         zip_creation_tasks = [] | ||||
|         if self.zip is not None: | ||||
|             cmd = 'cd "{}"'.format(self.output_dir) | ||||
|             cmd += ' && ' | ||||
|             cmd += 'zip' | ||||
|             cmd += ' -r' | ||||
|             cmd += ' "{}.zip" .'.format(self.zip) | ||||
|             cmd += ' -x "pyflow.data*"' | ||||
|             cmd += ' -i "*.vrt" "*.json"' | ||||
|             cmd += ' && ' | ||||
|             cmd += 'cd -' | ||||
|             deps = vrt_creation_tasks | ||||
|             lbl = 'zip_creation' | ||||
|             task = self.addTask(command=cmd, dependencies=deps, label=lbl) | ||||
|             zip_creation_tasks.append(task) | ||||
|  | ||||
|  | ||||
| def collect_jobs(input_dir, output_dir): | ||||
|     jobs = [] | ||||
|     for file in os.listdir(input_dir): | ||||
|         if os.path.isdir(os.path.join(input_dir, file)): | ||||
|             continue | ||||
|         if file.lower().endswith('.txt'): | ||||
|             job = NLPPipelineJob(os.path.join(input_dir, file), | ||||
|                                  os.path.join(output_dir, file)) | ||||
|             jobs.append(job) | ||||
|     return jobs | ||||
|         self.waitForTasks() | ||||
|         outputs = [] | ||||
|         for job in self.jobs: | ||||
|             # Track output files | ||||
|             relative_output_dir = os.path.relpath( | ||||
|                 job.output_dir, | ||||
|                 start=self.output_dir | ||||
|             ) | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'JSON stand off data', | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.json'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'application/json' | ||||
|                 } | ||||
|             ) | ||||
|             outputs.append( | ||||
|                 { | ||||
|                     'description': 'CWB vrt file', | ||||
|                     'file': os.path.join( | ||||
|                         relative_output_dir, | ||||
|                         '{}.vrt'.format(job.name) | ||||
|                     ), | ||||
|                     'mimetype': 'application/vrt+xml' | ||||
|                 } | ||||
|             ) | ||||
|         with open(os.path.join(self.output_dir, 'outputs.json'), 'w') as f: | ||||
|             json.dump(outputs, f, indent=4) | ||||
|  | ||||
|  | ||||
| def parse_args(): | ||||
|     parser = ArgumentParser(description='NLP pipeline for TXT file processing', | ||||
|                             prog='NLP pipeline') | ||||
|     parser.add_argument('-i', '--input-dir', | ||||
|     parser = ArgumentParser( | ||||
|         description='NLP pipeline for plain text file processing' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-i', '--input-dir', | ||||
|         help='Input directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-o', '--output-dir', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-o', '--output-dir', | ||||
|         help='Output directory', | ||||
|                         required=True) | ||||
|     parser.add_argument('-l', '--language', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-m', '--model', | ||||
|         choices=SPACY_MODELS.keys(), | ||||
|                         help='Language of the input (2-character ISO 639-1 language codes)',  # noqa | ||||
|                         required=True) | ||||
|     parser.add_argument('--check-encoding', | ||||
|         help='The model to be used', | ||||
|         required=True | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--check-encoding', | ||||
|         action='store_true', | ||||
|                         help='Check encoding of the input file, UTF-8 is used instead')  # noqa | ||||
|     parser.add_argument('--log-dir', | ||||
|                         help='Logging directory') | ||||
|     parser.add_argument('--mem-mb', | ||||
|                         help='Amount of system memory to be used (Default: min(--n-cores * 2048, available system memory))',  # noqa | ||||
|                         type=int) | ||||
|     parser.add_argument('--n-cores', | ||||
|                         default=min(4, multiprocessing.cpu_count()), | ||||
|                         help='Number of CPU threads to be used (Default: min(4, number of CPUs))',  # noqa | ||||
|                         type=int) | ||||
|     parser.add_argument('--zip', | ||||
|                         help='Create one zip file per filetype') | ||||
|     parser.add_argument('-v', '--version', | ||||
|         help='Check encoding of the input file, UTF-8 is used instead' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--id-prefix', | ||||
|         default='', | ||||
|         help='A prefix for all the ids within the stand off annotations' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--log-dir', | ||||
|         help='Logging directory (Default: --output-dir)' | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--mem-mb', | ||||
|         help='Amount of system memory to be used ' | ||||
|              '(Default: min(--n-cores * 1024, available system memory))', | ||||
|         type=int | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '--n-cores', | ||||
|         default=1, | ||||
|         help='Number of CPU threads to be used', | ||||
|         type=int | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         '-v', '--version', | ||||
|         action='version', | ||||
|         help='Returns the current version of the NLP pipeline', | ||||
|                         version='%(prog)s {}'.format(__version__)) | ||||
|         version='%(prog)s {}'.format(__version__) | ||||
|     ) | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Set some tricky default values and check for insufficient input | ||||
| @@ -184,20 +262,27 @@ def parse_args(): | ||||
|         raise Exception('--n-cores must be greater or equal 1') | ||||
|     if args.mem_mb is None: | ||||
|         max_mem_mb = int(os.popen('free -t -m').readlines()[-1].split()[1:][0]) | ||||
|         args.mem_mb = min(args.n_cores * 2048, max_mem_mb) | ||||
|     if args.mem_mb < 2048: | ||||
|         raise Exception('--mem-mb must be greater or equal 2048') | ||||
|     if args.zip is not None and args.zip.lower().endswith('.zip'): | ||||
|         # Remove .zip file extension if provided | ||||
|         args.zip = args.zip[:-4] | ||||
|         args.zip = args.zip if args.zip else 'output' | ||||
|         args.mem_mb = min(args.n_cores * 1024, max_mem_mb) | ||||
|     if args.mem_mb < 1024: | ||||
|         raise Exception('--mem-mb must be greater or equal 1024') | ||||
|     return args | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     args = parse_args() | ||||
|     nlp_pipeline = NLPPipeline(args.input_dir, args.output_dir, args.check_encoding, args.language, args.zip)  # noqa | ||||
|     retval = nlp_pipeline.run(dataDirRoot=args.log_dir, memMb=args.mem_mb, nCores=args.n_cores)  # noqa | ||||
|     main_workflow = MainWorkflow( | ||||
|         args.input_dir, | ||||
|         args.model, | ||||
|         args.output_dir, | ||||
|         check_encoding=args.check_encoding, | ||||
|         id_prefix=args.id_prefix | ||||
|     ) | ||||
|     main_workflow.collect_jobs() | ||||
|     retval = main_workflow.run( | ||||
|         dataDirRoot=args.log_dir, | ||||
|         memMb=args.mem_mb, | ||||
|         nCores=args.n_cores | ||||
|     ) | ||||
|     sys.exit(retval) | ||||
|  | ||||
|  | ||||
|   | ||||
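The rewritten MainWorkflow now records its results in an outputs.json manifest inside the output directory. A minimal sketch of consuming that manifest (illustrative only; 'output' stands in for the pipeline's --output-dir):

```python
# Illustrative sketch: list the files referenced by the outputs.json manifest
# that MainWorkflow writes once all jobs have finished.
import json
import os

output_dir = 'output'  # placeholder for the pipeline's --output-dir
with open(os.path.join(output_dir, 'outputs.json')) as manifest:
    outputs = json.load(manifest)

for entry in outputs:
    # Each entry holds a description, a path relative to the output directory
    # and a mimetype (application/json or application/vrt+xml).
    print(f"{entry['mimetype']:24} {entry['file']}  ({entry['description']})")
```

The remaining hunks below belong to the bundled stand-off-data package (its setup.py and the stand_off_data module), which the Dockerfile now installs via `python3 -m pip install .`.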
| @@ -1,14 +1,14 @@ | ||||
| import setuptools | ||||
|  | ||||
| setuptools.setup( | ||||
|     name='stand-off-data', | ||||
|     name='Stand off data', | ||||
|     author='Patrick Jentsch', | ||||
|     author_email='p.jentsch@uni-bielefeld.de', | ||||
|     description='A python library to handle stand off data.', | ||||
|     py_modules=['stand_off_data'], | ||||
|     classifiers=[ | ||||
|         'Programming Language :: Python :: 3', | ||||
|         'Operating System :: OS Independent', | ||||
|     ], | ||||
|     packages=setuptools.find_packages(), | ||||
|     python_requires='>=3.5' | ||||
| ) | ||||
|   | ||||
| @@ -7,13 +7,15 @@ class StandOffData: | ||||
|         self.lookup = {} | ||||
|         for x in attrs.get('tags', []): | ||||
|             self.add_tag_definition(x) | ||||
|         self.annotations = [TagAnnotation(x, self.lookup) | ||||
|                             for x in attrs.get('annotations', [])] | ||||
|         self.annotations = [ | ||||
|             TagAnnotation(x, self.lookup) | ||||
|             for x in attrs.get('annotations', []) | ||||
|         ] | ||||
|  | ||||
|     def add_tag_definition(self, attrs): | ||||
|         tag_definition = TagDefinition(attrs) | ||||
|         if tag_definition.id in self.lookup: | ||||
|             raise Exception('Tag id already in use: {}'.format(self.to_dict())) | ||||
|             raise Exception(f'Tag id already in use: {self.to_dict()}') | ||||
|         self.lookup[tag_definition.id] = tag_definition | ||||
|  | ||||
|     def to_dict(self): | ||||
| @@ -42,7 +44,9 @@ class StandOffData: | ||||
|             if ((p_attr.start >= next_p_attr.start) and (p_attr.start < next_p_attr.end)  # noqa | ||||
|                 or (p_attr.end > next_p_attr.start) and (p_attr.end <= next_p_attr.end)):  # noqa | ||||
|                 raise Exception( | ||||
|                     'Positional attribute overlaps another: {}<->{}'.format(p_attr.to_dict(), next_p_attr.to_dict())) | ||||
|                     'Positional attribute overlaps another: ' | ||||
|                     f'{p_attr.to_dict()}<->{next_p_attr.to_dict()}' | ||||
|                 ) | ||||
|         # Check for s_attr<->p_attr overlap | ||||
|         for i, s_attr in enumerate(s_attrs): | ||||
|             for p_attr in p_attrs: | ||||
| @@ -56,8 +60,11 @@ class StandOffData: | ||||
|                     s_attrs[i].end = p_attr.end | ||||
|                 # Check if s_attr starts/ends before/after p_attr | ||||
|                 if p_attr.start >= s_attr.end or p_attr.end <= s_attr.start: | ||||
|                     # No further Checking needed (just because p_attrs are sorted) | ||||
|                     # No further Checking needed (because p_attrs are sorted) | ||||
|                     break | ||||
|         p_attr_buffer = {} | ||||
|         for i, p_attr in enumerate(p_attrs): | ||||
|             p_attr_buffer[p_attr.start] = i | ||||
|         s_attr_start_buffer = {} | ||||
|         s_attr_end_buffer = {} | ||||
|         for i, s_attr in enumerate(s_attrs): | ||||
| @@ -66,34 +73,56 @@ class StandOffData: | ||||
|             else: | ||||
|                 s_attr_start_buffer[s_attr.start] = [i] | ||||
|             if s_attr.end in s_attr_end_buffer: | ||||
|                 s_attr_end_buffer[s_attr.end].append(i) | ||||
|                 s_attr_end_buffer[s_attr.end].insert(0, i) | ||||
|             else: | ||||
|                 s_attr_end_buffer[s_attr.end] = [i] | ||||
|         vrt = '' | ||||
|         vrt += '<text>\n' | ||||
|         for p_attr in p_attrs: | ||||
|             # s_attr_ends | ||||
|             for k in {k: v for k, v in s_attr_end_buffer.items() if k <= p_attr.start}:  # noqa | ||||
|                 s_attr_indexes = s_attr_end_buffer.pop(k) | ||||
|         current_position = 0 | ||||
|         text_len = len(text) | ||||
|         # As long as we have something in our buffers we process it | ||||
|         while current_position <= text_len: | ||||
|             # s_attr endings | ||||
|             # for k in {k: v for k, v in s_attr_end_buffer.items() if k <= current_position}:  # noqa | ||||
|             if current_position in s_attr_end_buffer: | ||||
|                 # s_attr_indexes = s_attr_end_buffer.pop(k) | ||||
|                 s_attr_indexes = s_attr_end_buffer.pop(current_position) | ||||
|                 for s_attr_index in s_attr_indexes: | ||||
|                     s_attr = s_attrs[s_attr_index] | ||||
|                     vrt += '</{}>\n'.format(escape(s_attr.name)) | ||||
|             # s_attr_starts | ||||
|             for k in {k: v for k, v in s_attr_start_buffer.items() if k <= p_attr.start}:  # noqa | ||||
|                 s_attr_indexes = s_attr_start_buffer.pop(k) | ||||
|                     vrt += f'</{escape(s_attr.name)}>\n' | ||||
|             # s_attrs starts | ||||
|             # for k in {k: v for k, v in s_attr_start_buffer.items() if k <= current_position}:  # noqa | ||||
|             if current_position in s_attr_start_buffer: | ||||
|                 # s_attr_indexes = s_attr_start_buffer.pop(k) | ||||
|                 s_attr_indexes = s_attr_start_buffer.pop(current_position) | ||||
|                 for s_attr_index in s_attr_indexes: | ||||
|                     s_attr = s_attrs[s_attr_index] | ||||
|                     foo = '' | ||||
|                     vrt += f'<{escape(s_attr.name)}' | ||||
|                     for property in s_attr.properties: | ||||
|                         foo += ' {}="{}"'.format(escape(property.name), | ||||
|                                                  escape(property.value)) | ||||
|                     vrt += '<{}{}>\n'.format(escape(s_attr.name), foo) | ||||
|             foo = {'lemma': None, 'ner': None, 'pos': None, 'simple_pos': None, 'word': None}  # noqa | ||||
|                         vrt += f' {escape(property.name)}="{escape(str(property.value))}"'  # noqa | ||||
|                     vrt += '>\n' | ||||
|             # p_attrs | ||||
|             if current_position not in p_attr_buffer: | ||||
|                 current_position += 1 | ||||
|                 continue | ||||
|             p_attr_index = p_attr_buffer.pop(current_position) | ||||
|             p_attr = p_attrs[p_attr_index] | ||||
|             if text[p_attr.start:p_attr.end].isspace(): | ||||
|                 current_position = p_attr.end | ||||
|                 continue | ||||
|             _p_attr = { | ||||
|                 'lemma': 'None', | ||||
|                 'pos': 'None', | ||||
|                 'simple_pos': 'None', | ||||
|                 'word': 'None' | ||||
|             } | ||||
|             for property in p_attr.properties: | ||||
|                 foo[property.name] = escape(property.value) | ||||
|             foo['word'] = escape(text[p_attr.start:p_attr.end]) | ||||
|             vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\t{ner}\n'.format( | ||||
|                 **foo) | ||||
|                 if property.name not in _p_attr: | ||||
|                     continue | ||||
|                 _p_attr[property.name] = escape(str(property.value)) | ||||
|             _p_attr['word'] = escape(text[p_attr.start:p_attr.end]) | ||||
|             vrt += '{word}\t{pos}\t{lemma}\t{simple_pos}\n'.format(**_p_attr) | ||||
|             current_position = p_attr.end | ||||
|         vrt += '</text>\n' | ||||
|         return vrt | ||||
|  | ||||
| @@ -110,15 +139,15 @@ class TagAnnotation: | ||||
|         ] | ||||
|         ''' Sanity checks ''' | ||||
|         if self.tag_id not in self.lookup: | ||||
|             raise Exception('Unknown tag: {}'.format(self.to_dict())) | ||||
|             raise Exception(f'Unknown tag: {self.to_dict()}') | ||||
|         if self.end < self.start: | ||||
|             raise Exception('Annotation end less then start: ' | ||||
|                             '{}'.format(self.to_dict())) | ||||
|         property_ids = [x.property_id for x in self.properties] | ||||
|         for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa | ||||
|             if required_property_id not in property_ids: | ||||
|                 raise Exception('Missing required property: ' | ||||
|                                 '{}'.format(required_property.to_dict())) | ||||
|             raise Exception(f'Annotation end less then start: {self.to_dict()}')  # noqa | ||||
|         # property_ids = [x.property_id for x in self.properties] | ||||
|         # for required_property_id, required_property in self.lookup[self.tag_id].required_properties.items():  # noqa | ||||
|         #     if required_property_id not in property_ids: | ||||
|         #         raise Exception( | ||||
|         #             f'Missing required property: {required_property.to_dict()}' | ||||
|         #         ) | ||||
|  | ||||
|     @property | ||||
|     def name(self): | ||||
| @@ -134,33 +163,45 @@ class TagAnnotation: | ||||
|  | ||||
|     def __lt__(self, other): | ||||
|         if self.start == other.start: | ||||
|             return self.name == 'token' and other.name != 'token' | ||||
|             if self.name == 'token' and other.name != 'token': | ||||
|                 return False | ||||
|             elif self.name != 'token' and other.name == 'token': | ||||
|                 return True | ||||
|             else: | ||||
|                 return self.end > other.end | ||||
|         else: | ||||
|             return self.start < other.start | ||||
|  | ||||
|     def __le__(self, other): | ||||
|         if self.start == other.start: | ||||
|             return self.name == 'token' or other.name != 'token' | ||||
|             if self.name == 'token' and other.name != 'token': | ||||
|                 return False | ||||
|             elif self.name != 'token' and other.name == 'token': | ||||
|                 return True | ||||
|             else: | ||||
|             return self.start < other.start | ||||
|                 return self.end >= other.end | ||||
|         else: | ||||
|             return self.start <= other.start | ||||
|  | ||||
|     def __eq__(self, other): | ||||
|         return self.start == other.start and self.name == other.name | ||||
|         if self.start == other.start: | ||||
|             if self.name == 'token' and other.name != 'token': | ||||
|                 return False | ||||
|             elif self.name != 'token' and other.name == 'token': | ||||
|                 return False | ||||
|             else: | ||||
|                 return self.end == other.end | ||||
|         else: | ||||
|             return False | ||||
|  | ||||
|     def __ne__(self, other): | ||||
|         return self.start != other.start and self.name != other.name | ||||
|         return not self == other | ||||
|  | ||||
|     def __gt__(self, other): | ||||
|         if self.start == other.start: | ||||
|             return self.name != 'token' and other.name == 'token' | ||||
|         else: | ||||
|             return self.start > other.start | ||||
|         return not self <= other | ||||
|  | ||||
|     def __ge__(self, other): | ||||
|         if self.start == other.start: | ||||
|             return self.name != 'token' or other.name == 'token' | ||||
|         else: | ||||
|             return self.start > other.start | ||||
|         return not self < other | ||||
|  | ||||
|  | ||||
| class PropertyAnnotation: | ||||
| @@ -171,7 +212,7 @@ class PropertyAnnotation: | ||||
|         # TODO: Process attrs['possibleValues'] as self.labels (no id?) | ||||
|         ''' Sanity checks ''' | ||||
|         if self.property_id not in self.lookup: | ||||
|             raise Exception('Unknown property: {}'.format(self.to_dict())) | ||||
|             raise Exception(f'Unknown property: {self.to_dict()}') | ||||
|  | ||||
|     @property | ||||
|     def name(self): | ||||
| @@ -197,14 +238,14 @@ class TagDefinition: | ||||
|     def add_property_definition(self, attrs): | ||||
|         property_definition = PropertyDefinition(attrs) | ||||
|         if property_definition.id in self.properties: | ||||
|             raise Exception('Property id already in use: ' | ||||
|                             '{}'.format(property_definition.to_dict())) | ||||
|             raise Exception( | ||||
|                 f'Property id already in use: {property_definition.to_dict()}') | ||||
|         self.properties[property_definition.id] = property_definition | ||||
|  | ||||
|     @property | ||||
|     def required_properties(self): | ||||
|         return {property.id: property for property in self.properties.values() | ||||
|                 if property.is_required} | ||||
|     # @property | ||||
|     # def required_properties(self): | ||||
|     #     return {property.id: property for property in self.properties.values() | ||||
|     #             if property.is_required} | ||||
|  | ||||
|     def to_dict(self): | ||||
|         return { | ||||
| @@ -223,9 +264,9 @@ class PropertyDefinition: | ||||
|         self.flags = attrs.get('flags', []) | ||||
|         self.labels = attrs.get('labels', []) | ||||
|  | ||||
|     @property | ||||
|     def is_required(self): | ||||
|         return 'required' in self.flags | ||||
|     # @property | ||||
|     # def is_required(self): | ||||
|     #     return 'required' in self.flags | ||||
|  | ||||
|     @property | ||||
|     def has_multiple_values(self): | ||||
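The rewritten comparison operators change how annotations that share a start offset are ordered: structural tags now sort before token tags, and longer spans before shorter ones. A simplified stand-in class (not the package's TagAnnotation) illustrating that ordering:

```python
# Simplified stand-in (not stand_off_data.TagAnnotation) showing the ordering
# implied by the new __lt__/__eq__: at equal start offsets, non-token tags
# come first and longer spans precede shorter ones; token tags come last.
from functools import total_ordering


@total_ordering
class Span:
    def __init__(self, name, start, end):
        self.name, self.start, self.end = name, start, end

    def _key(self):
        # Token tags sort after everything else at the same start offset;
        # negating end puts longer spans first.
        return (self.start, self.name == 'token', -self.end)

    def __eq__(self, other):
        return self._key() == other._key()

    def __lt__(self, other):
        return self._key() < other._key()


spans = [Span('token', 0, 5), Span('s', 0, 40), Span('ent', 0, 12)]
print([s.name for s in sorted(spans)])  # ['s', 'ent', 'token']
```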
							
								
								
									
spacy-nlp (299 lines changed)
							| @@ -11,46 +11,64 @@ import textwrap | ||||
| import uuid | ||||
|  | ||||
|  | ||||
| def UUIDnopaque(name): | ||||
|     return 'nopaque_{}'.format( | ||||
|         uuid.uuid3(uuid.NAMESPACE_DNS, | ||||
|                    '{}@nopaque.sfb1288.uni-bielefeld.de'.format(name)) | ||||
|     ) | ||||
|  | ||||
|  | ||||
| spacy_models = {spacy.info(pipeline)['lang']: pipeline | ||||
|                 for pipeline in spacy.info()['pipelines']} | ||||
| spacy_models = { | ||||
|     spacy.info(pipeline)['lang']: pipeline | ||||
|     for pipeline in spacy.info()['pipelines'] | ||||
| } | ||||
|  | ||||
|  | ||||
| # Parse the given arguments | ||||
| parser = ArgumentParser(description='Create annotations for a given txt file') | ||||
| parser.add_argument('input', help='Path to txt input file') | ||||
| parser.add_argument('output', help='Path to JSON output file') | ||||
| parser.add_argument('-l', '--language', | ||||
| parser = ArgumentParser( | ||||
|     description='Create annotations for a given plain txt file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-i', '--input-file', | ||||
|     help='Input file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', '--output-file', | ||||
|     help='Output file', | ||||
|     required=True | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-m', '--model', | ||||
|     choices=spacy_models.keys(), | ||||
|                     help='Language of the input (2-character ISO 639-1 language codes)',  # noqa | ||||
|                     required=True) | ||||
| parser.add_argument('-c', '--check-encoding', | ||||
|     help='The model to be used', | ||||
|     required=True | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-c', '--check-encoding', | ||||
|     action='store_true', | ||||
|                     help='Check encoding of the input file, UTF-8 is used instead')  # noqa | ||||
|     help='Check encoding of the input file, UTF-8 is used instead' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '--id-prefix', | ||||
|     default='', | ||||
|     help='A prefix for all the ids within the stand off annotations' | ||||
| ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
| with open(args.input, "rb") as text_file: | ||||
|  | ||||
| def generate_id(name): | ||||
|     return f'{args.id_prefix}{uuid.uuid3(uuid.NAMESPACE_DNS, name)}' | ||||
|  | ||||
|  | ||||
| with open(args.input_file, "rb") as input_file: | ||||
|     if args.check_encoding: | ||||
|         encoding = chardet.detect(text_file.read())['encoding'] | ||||
|         encoding = chardet.detect(input_file.read())['encoding'] | ||||
|     else: | ||||
|         encoding = 'utf-8' | ||||
|     text_file.seek(0) | ||||
|     input_file.seek(0) | ||||
|     text_md5 = hashlib.md5() | ||||
|     for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): | ||||
|     for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''): | ||||
|         text_md5.update(chunk) | ||||
|  | ||||
| # Load the text contents from the input file | ||||
| with open(args.input, encoding=encoding) as text_file: | ||||
| with open(args.input_file, encoding=encoding) as input_file: | ||||
|     # spaCy NLP is limited to strings with a maximum of 1 million characters at | ||||
|     # once. So we split it into suitable chunks. | ||||
|     text_chunks = textwrap.wrap( | ||||
|         text_file.read(), | ||||
|         input_file.read(), | ||||
|         1000000, | ||||
|         break_long_words=False, | ||||
|         break_on_hyphens=False, | ||||
| @@ -59,186 +77,197 @@ with open(args.input, encoding=encoding) as text_file: | ||||
|         replace_whitespace=False | ||||
|     ) | ||||
|  | ||||
| model = spacy_models[args.language] | ||||
| nlp = spacy.load(model) | ||||
| model_name = spacy_models[args.model] | ||||
| nlp = spacy.load(model_name) | ||||
|  | ||||
| meta = { | ||||
|     'generator': { | ||||
|         'name': 'nopaque NLP service', | ||||
|         'version': '1.0.0', | ||||
|         'name': 'nopaque spacy NLP', | ||||
|         'version': '0.1.0', | ||||
|         'arguments': { | ||||
|             'check_encoding': args.check_encoding, | ||||
|             'language': args.language | ||||
|             'model': args.model | ||||
|         } | ||||
|     }, | ||||
|     'file': { | ||||
|         'encoding': encoding, | ||||
|         'md5': text_md5.hexdigest(), | ||||
|         'name': os.path.basename(args.input) | ||||
|         'name': os.path.basename(args.input_file) | ||||
|     } | ||||
| } | ||||
|  | ||||
| tags = [ | ||||
|     { | ||||
|         'id': UUIDnopaque('token'), | ||||
| tags = [] | ||||
| token = { | ||||
|     'id': generate_id('token'), | ||||
|     'name': 'token', | ||||
|         'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', | ||||
|         'properties': [ | ||||
|     'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',  # noqa | ||||
|     'properties': [] | ||||
| } | ||||
| # TODO: Check if all languages support token.sentiment | ||||
| token['properties'].append( | ||||
|     { | ||||
|                 'id': UUIDnopaque('token.lemma'), | ||||
|         'id': generate_id('token.sentiment'), | ||||
|         'name': 'sentiment', | ||||
|         'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa | ||||
|     } | ||||
| ) | ||||
| if nlp.has_pipe('lemmatizer'): | ||||
|     token['properties'].append( | ||||
|         { | ||||
|             'id': generate_id('token.lemma'), | ||||
|             'name': 'lemma', | ||||
|                 'description': 'The base form of the word', | ||||
|                 'flags': ['required'], | ||||
|                 'labels': [] | ||||
|             }, | ||||
|             'description': 'The base form of the word' | ||||
|         } | ||||
|     ) | ||||
| if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'): | ||||
|     token['properties'].append( | ||||
|         { | ||||
|                 'id': UUIDnopaque('token.pos'), | ||||
|                 'name': 'pos', | ||||
|                 'description': 'The detailed part-of-speech tag', | ||||
|                 'flags': ['required'], | ||||
|                 'labels': [ | ||||
|                     { | ||||
|                         'id': UUIDnopaque('token.pos={}'.format(label)), | ||||
|                         'name': label, | ||||
|                         'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['tagger'] | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|                 'id': UUIDnopaque('token.simple_pos'), | ||||
|             'id': generate_id('token.simple_pos'), | ||||
|             'name': 'simple_pos', | ||||
|             'description': 'The simple UPOS part-of-speech tag', | ||||
|                 'flags': ['required'], | ||||
|             'labels': [ | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'ADJ', | ||||
|                     'description': 'adjective' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'ADP', | ||||
|                     'description': 'adposition' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'ADV', | ||||
|                     'description': 'adverb' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'AUX', | ||||
|                     'description': 'auxiliary verb' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'CONJ', | ||||
|                     'description': 'coordinating conjunction' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'DET', | ||||
|                     'description': 'determiner' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'INTJ', | ||||
|                     'description': 'interjection' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'NOUN', | ||||
|                     'description': 'noun' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'NUM', | ||||
|                     'description': 'numeral' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'PART', | ||||
|                     'description': 'particle' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'PRON', | ||||
|                     'description': 'pronoun' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'PROPN', | ||||
|                     'description': 'proper noun' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'PUNCT', | ||||
|                     'description': 'punctuation' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'SCONJ', | ||||
|                     'description': 'subordinating conjunction' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'SYM', | ||||
|                     'description': 'symbol' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'VERB', | ||||
|                     'description': 'verb' | ||||
|                 }, | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.simple_pos=ADJ'), | ||||
|                     'id': generate_id('token.simple_pos=ADJ'), | ||||
|                     'name': 'X', | ||||
|                     'description': 'other' | ||||
|                 } | ||||
|             ] | ||||
|             }, | ||||
|         } | ||||
|     ) | ||||
| if nlp.has_pipe('tagger'): | ||||
|     token['properties'].append( | ||||
|         { | ||||
|                 'id': UUIDnopaque('token.ner'), | ||||
|                 'name': 'ner', | ||||
|                 'description': 'Label indicating the type of the entity', | ||||
|                 'flags': ['required'], | ||||
|             'id': generate_id('token.pos'), | ||||
|             'name': 'pos', | ||||
|             'description': 'The detailed part-of-speech tag', | ||||
|             'labels': [ | ||||
|                 { | ||||
|                         'id': UUIDnopaque('token.ner={}'.format(label)), | ||||
|                     'id': generate_id(f'token.pos={label}'), | ||||
|                     'name': label, | ||||
|                     'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['ner'] | ||||
|                 } for label in spacy.info(model_name)['labels']['tagger'] | ||||
|             ] | ||||
|         } | ||||
|         ] | ||||
|     }, | ||||
|     ) | ||||
| if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'): | ||||
|     tags.append( | ||||
|         { | ||||
|         'id': UUIDnopaque('s'), | ||||
|         'name': 's', | ||||
|         'description': 'Encodes the start and end of a sentence', | ||||
|         'properties': [] | ||||
|     }, | ||||
|     { | ||||
|         'id': UUIDnopaque('ent'), | ||||
|             'id': generate_id('ent'), | ||||
|             'name': 'ent', | ||||
|             'description': 'Encodes the start and end of a named entity', | ||||
|             'properties': [ | ||||
|                 { | ||||
|                 'id': UUIDnopaque('ent.type'), | ||||
|                     'id': generate_id('ent.type'), | ||||
|                     'name': 'type', | ||||
|                     'description': 'Label indicating the type of the entity', | ||||
|                 'flags': ['required'], | ||||
|                     'labels': [ | ||||
|                         { | ||||
|                         'id': UUIDnopaque('ent.type={}'.format(label)), | ||||
|                             'id': generate_id('ent.type={}'.format(label)), | ||||
|                             'name': label, | ||||
|                             'description': spacy.explain(label) or '' | ||||
|                     } for label in spacy.info(model)['labels']['ner'] | ||||
|                         } for label in spacy.info(model_name)['labels']['ner'] | ||||
|                     ] | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
| ] | ||||
|     ) | ||||
| if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):  # noqa | ||||
|     # TODO: Check if all languages support sent.sentiment | ||||
|     tags.append( | ||||
|         { | ||||
|             'id': generate_id('s'), | ||||
|             'name': 's', | ||||
|             'description': 'Encodes the start and end of a sentence', | ||||
|             'properties': [ | ||||
|                 { | ||||
|                     'id': generate_id('s.sentiment'), | ||||
|                     'name': 'sentiment', | ||||
|                     'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
|     ) | ||||
| tags.append(token) | ||||
|  | ||||
| annotations = [] | ||||
|  | ||||
| @@ -246,60 +275,78 @@ chunk_offset = 0 | ||||
| while text_chunks: | ||||
|     text_chunk = text_chunks.pop(0) | ||||
|     doc = nlp(text_chunk) | ||||
|     for token in doc: | ||||
|         if token.is_space: | ||||
|             continue | ||||
|         if token.is_sent_start: | ||||
|             annotation = {'start': token.sent.start_char + chunk_offset, | ||||
|                           'end': token.sent.end_char + chunk_offset, | ||||
|                           'tag_id': UUIDnopaque('s'), | ||||
|                           'properties': []} | ||||
|             annotations.append(annotation) | ||||
|         # Check if the token is the start of an entity | ||||
|         if token.ent_iob == 3: | ||||
|             for ent_candidate in token.sent.ents: | ||||
|                 if ent_candidate.start_char == token.idx: | ||||
|                     ent = ent_candidate | ||||
|     if hasattr(doc, 'ents'): | ||||
|         for ent in doc.ents: | ||||
|             annotation = { | ||||
|                 'start': ent.start_char + chunk_offset, | ||||
|                 'end': ent.end_char + chunk_offset, | ||||
|                         'tag_id': UUIDnopaque('ent'), | ||||
|                 'tag_id': generate_id('ent'), | ||||
|                 'properties': [ | ||||
|                     { | ||||
|                                 'property_id': UUIDnopaque('ent.type'), | ||||
|                                 'value': token.ent_type_ | ||||
|                         'property_id': generate_id('ent.type'), | ||||
|                         'value': ent.label_ | ||||
|                     } | ||||
|                 ] | ||||
|             } | ||||
|             annotations.append(annotation) | ||||
|                     break | ||||
|     if hasattr(doc, 'sents'): | ||||
|         for sent in doc.sents: | ||||
|             annotation = { | ||||
|                 'start': sent.start_char + chunk_offset, | ||||
|                 'end': sent.end_char + chunk_offset, | ||||
|                 'tag_id': generate_id('s'), | ||||
|                 'properties': [] | ||||
|             } | ||||
|             if hasattr(sent, 'sentiment'): | ||||
|                 annotation['properties'].append( | ||||
|                     { | ||||
|                         'property_id': generate_id('s.sentiment'), | ||||
|                         'value': sent.sentiment | ||||
|                     } | ||||
|                 ) | ||||
|             annotations.append(annotation) | ||||
|     for token in doc: | ||||
|         annotation = { | ||||
|             'start': token.idx + chunk_offset, | ||||
|             'end': token.idx + len(token.text) + chunk_offset, | ||||
|             'tag_id': UUIDnopaque('token'), | ||||
|             'properties': [ | ||||
|             'tag_id': generate_id('token'), | ||||
|             'properties': [] | ||||
|         } | ||||
|         if hasattr(token, 'lemma_'): | ||||
|             annotation['properties'].append( | ||||
|                 { | ||||
|                    'property_id': UUIDnopaque('token.pos'), | ||||
|                    'value': token.tag_ | ||||
|                 }, | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.lemma'), | ||||
|                     'property_id': generate_id('token.lemma'), | ||||
|                     'value': token.lemma_ | ||||
|                 }, | ||||
|                 } | ||||
|             ) | ||||
|         if hasattr(token, 'pos_'): | ||||
|             annotation['properties'].append( | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.simple_pos'), | ||||
|                     'property_id': generate_id('token.simple_pos'), | ||||
|                     'value': token.pos_ | ||||
|                 }, | ||||
|                 } | ||||
|             ) | ||||
|         if hasattr(token, 'sentiment'): | ||||
|             annotation['properties'].append( | ||||
|                 { | ||||
|                     'property_id': UUIDnopaque('token.ner'), | ||||
|                     'value': token.ent_type_ if token.ent_type_ else 'None' | ||||
|                     'property_id': generate_id('token.sentiment'), | ||||
|                     'value': token.sentiment | ||||
|                 } | ||||
|             ] | ||||
|             ) | ||||
|         if hasattr(token, 'tag_'): | ||||
|             annotation['properties'].append( | ||||
|                 { | ||||
|                    'property_id': generate_id('token.pos'), | ||||
|                    'value': token.tag_ | ||||
|                 } | ||||
|             ) | ||||
|         annotations.append(annotation) | ||||
|     chunk_offset += len(text_chunk) | ||||
|     text_chunk = None | ||||
|  | ||||
| with open(args.output, 'w') as output_file: | ||||
|     json.dump({'meta': meta, 'tags': tags, 'annotations': annotations}, | ||||
|               output_file, indent=4) | ||||
| with open(args.output_file, 'w') as output_file: | ||||
|     json.dump( | ||||
|         {'meta': meta, 'tags': tags, 'annotations': annotations}, | ||||
|         output_file, | ||||
|         indent=4 | ||||
|     ) | ||||
|   | ||||
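Putting the spacy-nlp changes above together: the script still writes a single JSON document with the three top-level keys meta, tags and annotations. The following is a minimal sketch of that shape, not real output - every id and offset is a placeholder for whatever generate_id() and the processed text actually yield, and the meta layout is inferred from the vrt-creator script further down, which reads meta['file']['md5'] and meta['file']['encoding'].

    # Illustrative sketch of the stand-off JSON written by spacy-nlp.
    # All ids, offsets and values are placeholders, not real output.
    output = {
        'meta': {
            'file': {
                'md5': '<md5 of the input text>',
                'encoding': 'utf-8'
            }
        },
        'tags': [
            {
                'id': '<generate_id("s")>',
                'name': 's',
                'description': 'Encodes the start and end of a sentence',
                'properties': [
                    {
                        'id': '<generate_id("s.sentiment")>',
                        'name': 'sentiment',
                        'description': 'A scalar value indicating the positivity or negativity of the sentence.'
                    }
                ]
            }
            # The 'ent' and 'token' tag definitions follow the same pattern,
            # with nested 'labels' lists for closed label sets.
        ],
        'annotations': [
            {
                'start': 0,        # character offsets, shifted by chunk_offset
                'end': 42,
                'tag_id': '<generate_id("s")>',
                'properties': [
                    {
                        'property_id': '<generate_id("s.sentiment")>',
                        'value': 0.0
                    }
                ]
            }
            # One entry per sentence, entity and token, each carrying the
            # properties appended in the loop above.
        ]
    }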
							
								
								
									
										39
								vrt-creator
							| @@ -6,31 +6,36 @@ from stand_off_data import StandOffData | ||||
| import hashlib | ||||
| import json | ||||
|  | ||||
| parser = ArgumentParser( | ||||
|     description='Convert plain text and JSON stand off to a CWB vrt file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-s', '--stand-off-data-file', | ||||
|     help='JSON stand off data input file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-t', '--text-file', | ||||
|     help='Plain text input file' | ||||
| ) | ||||
| parser.add_argument( | ||||
|     '-o', '--output-file', | ||||
|     help='Output file', | ||||
|     required=True | ||||
| ) | ||||
| args = parser.parse_args() | ||||
|  | ||||
| def main(): | ||||
|     # Parse the given arguments | ||||
|     parser = ArgumentParser(description='Create a vrt from JSON and txt') | ||||
|     parser.add_argument('text', help='Path to txt file') | ||||
|     parser.add_argument('stand_off_data', help='Path to JSON file') | ||||
|     parser.add_argument('output', help='Path to vrt output file') | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     with open(args.stand_off_data) as stand_off_data_file: | ||||
| with open(args.stand_off_data_file) as stand_off_data_file: | ||||
|     stand_off_data = StandOffData(json.load(stand_off_data_file)) | ||||
|  | ||||
|     with open(args.text, "rb") as text_file: | ||||
| with open(args.text_file, "rb") as text_file: | ||||
|     text_md5 = hashlib.md5() | ||||
|         for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):  # noqa | ||||
|     for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''): | ||||
|         text_md5.update(chunk) | ||||
|     if text_md5.hexdigest() != stand_off_data.meta['file']['md5']: | ||||
|         raise Exception('md5 not equal') | ||||
|  | ||||
|     with open(args.text, encoding=stand_off_data.meta['file']['encoding']) as text_file: | ||||
| with open(args.text_file, encoding=stand_off_data.meta['file']['encoding']) as text_file:  # noqa | ||||
|     text = text_file.read() | ||||
|  | ||||
|     with open(args.output, 'w') as vrt_file: | ||||
| with open(args.output_file, 'w') as vrt_file: | ||||
|     vrt_file.write(stand_off_data.to_vrt(text)) | ||||
|  | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
|   | ||||
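For context on what this converter produces: with the reworked argument parser it is invoked with named options, for example vrt-creator -s doc.stand-off.json -t doc.txt -o doc.vrt, and StandOffData.to_vrt emits a CWB-style verticalized text (VRT). The sketch below only illustrates the general VRT shape under assumed attribute names and column order; the actual positional and structural attributes are defined by the stand_off_data package, not by this script.

    # Rough shape of a CWB VRT file: structural attributes as tags on their
    # own lines, one token per line with tab-separated positional attributes.
    # Attribute names and column order here are assumptions for illustration.
    EXAMPLE_VRT = (
        '<text>\n'
        '<s>\n'
        '<ent type="PER">\n'
        'Ada\tPROPN\tNE\tAda\n'
        'Lovelace\tPROPN\tNE\tLovelace\n'
        '</ent>\n'
        'schrieb\tVERB\tVVFIN\tschreiben\n'
        '.\tPUNCT\t$.\t.\n'
        '</s>\n'
        '</text>\n'
    )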
							
								
								
									
										10
								wrapper/nlp
							| @@ -6,7 +6,7 @@ import os | ||||
| import subprocess | ||||
| import sys | ||||
|  | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:1.0.0b' | ||||
| CONTAINER_IMAGE = 'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0' | ||||
| CONTAINER_INPUT_DIR = '/input' | ||||
| CONTAINER_OUTPUT_DIR = '/output' | ||||
| CONTAINER_LOG_DIR = '/logs' | ||||
| @@ -19,17 +19,17 @@ parser.add_argument('-o', '--output-dir') | ||||
| parser.add_argument('--log-dir') | ||||
| args, remaining_args = parser.parse_known_args() | ||||
|  | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', '{}:{}'.format(UID, GID)] | ||||
| cmd = ['docker', 'run', '--rm', '-it', '-u', f'{UID}:{GID}'] | ||||
| if args.input_dir is not None: | ||||
|     mapping = os.path.abspath(args.input_dir) + ':' + CONTAINER_INPUT_DIR | ||||
|     mapping = f'{os.path.abspath(args.input_dir)}:{CONTAINER_INPUT_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-i', CONTAINER_INPUT_DIR] | ||||
| if args.output_dir is not None: | ||||
|     mapping = os.path.abspath(args.output_dir) + ':' + CONTAINER_OUTPUT_DIR | ||||
|     mapping = f'{os.path.abspath(args.output_dir)}:{CONTAINER_OUTPUT_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['-o', CONTAINER_OUTPUT_DIR] | ||||
| if args.log_dir is not None: | ||||
|     mapping = os.path.abspath(args.log_dir) + ':' + CONTAINER_LOG_DIR | ||||
|     mapping = f'{os.path.abspath(args.log_dir)}:{CONTAINER_LOG_DIR}' | ||||
|     cmd += ['-v', mapping] | ||||
|     remaining_args += ['--log-dir', CONTAINER_LOG_DIR] | ||||
| cmd.append(CONTAINER_IMAGE) | ||||
|   | ||||
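To make the wrapper changes concrete: for a hypothetical call like wrapper/nlp -i ./input -o ./output --log-dir ./logs, the f-string mappings above assemble roughly the following docker command. UID, GID and the absolute paths are machine-specific, and the remaining arguments are presumably appended after the image name by code outside the shown hunk.

    # Illustrative value of cmd just before execution, assuming UID=1000,
    # GID=1000 and the three directories from the hypothetical call above.
    cmd = [
        'docker', 'run', '--rm', '-it', '-u', '1000:1000',
        '-v', '/home/user/input:/input',
        '-v', '/home/user/output:/output',
        '-v', '/home/user/logs:/logs',
        'gitlab.ub.uni-bielefeld.de:4567/sfb1288inf/nlp:v0.1.0',
        # '-i', '/input', '-o', '/output', '--log-dir', '/logs' and any
        # further pipeline options follow here once remaining_args is
        # appended (outside the shown hunk).
    ]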