mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2024-12-26 07:24:18 +00:00
165 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3.7
|
|
# coding=utf-8
|
|
|
|
from argparse import ArgumentParser
|
|
import chardet
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import spacy
|
|
import textwrap
|
|
|
|
|
|
# Map language code -> installed spaCy pipeline name (e.g. 'en' ->
# 'en_core_web_sm'), so users can select a model by language below.
spacy_models = {}
for pipeline in spacy.info()['pipelines']:
    spacy_models[spacy.info(pipeline)['lang']] = pipeline
|
|
|
|
|
|
# Command-line interface of this annotation script.
parser = ArgumentParser(description='Create annotations for a given txt file')
parser.add_argument('input', metavar='Path to txt input file')
parser.add_argument('output', metavar='Path to JSON output file')
parser.add_argument('-l', '--language', required=True,
                    choices=spacy_models.keys())
parser.add_argument('-c', '--check-encoding', action='store_true')
args = parser.parse_args()
|
|
|
|
|
|
# Determine the input file's text encoding and fingerprint its raw bytes
# with MD5 (the digest is recorded in the output metadata later).
# If requested (-c/--check-encoding): detect the encoding with chardet.
# Else: assume utf-8.
with open(args.input, "rb") as input_file:
    if args.check_encoding:
        encoding = chardet.detect(input_file.read())['encoding']
        # BUG FIX: chardet.detect() consumed the whole stream, leaving the
        # file position at EOF — without rewinding, the MD5 loop below
        # would hash zero bytes and record the digest of empty input.
        input_file.seek(0)
    else:
        encoding = 'utf-8'
    text_md5 = hashlib.md5()
    # Hash in bounded chunks (128 hash blocks at a time) so arbitrarily
    # large files never have to fit in memory.
    for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)
|
|
|
|
# Read the full text using the encoding determined above.
with open(args.input, encoding=encoding) as txt_file:
    text = txt_file.read()

# spaCy refuses to process strings longer than one million characters at
# once, so split the text into chunks of at most that size.
text_chunks = textwrap.wrap(text, 1000000, break_long_words=False)

# The full text potentially occupies a lot of memory and is no longer
# needed once chunked — release it immediately.
del text
|
|
|
|
# Resolve and load the spaCy pipeline for the requested language.
model = spacy_models[args.language]
nlp = spacy.load(model)

# Provenance metadata stored alongside the annotations: which tool (and
# with which arguments) produced them, and which file they describe.
generator_info = {
    'name': 'nopaque NLP service',
    'version': '1.0.0',
    'arguments': {
        'check_encoding': args.check_encoding,
        'language': args.language
    }
}
file_info = {
    'md5': text_md5.hexdigest(),
    'name': os.path.basename(args.input)
}
meta = {'generator': generator_info, 'file': file_info}
|
|
|
|
|
|
# Tagsets derived from the loaded model's label schemes. The NER tagset
# is needed twice below (token 'ner' property and 'ent' type property),
# so build each mapping once up front.
pos_tagset = {label: spacy.explain(label)
              for label in spacy.info(model)['labels']['tagger']}
ner_tagset = {label: spacy.explain(label)
              for label in spacy.info(model)['labels']['ner']}
simple_pos_tagset = {
    'ADJ': 'adjective',
    'ADP': 'adposition',
    'ADV': 'adverb',
    'AUX': 'auxiliary verb',
    'CONJ': 'coordinating conjunction',
    'DET': 'determiner',
    'INTJ': 'interjection',
    'NOUN': 'noun',
    'NUM': 'numeral',
    'PART': 'particle',
    'PRON': 'pronoun',
    'PROPN': 'proper noun',
    'PUNCT': 'punctuation',
    'SCONJ': 'subordinating conjunction',
    'SYM': 'symbol',
    'VERB': 'verb',
    'X': 'other'
}

# Declaration of every tag and tag property this script can emit.
tags = {
    'token': {
        'description': '',
        'properties': {
            'lemma': {
                'description': 'The base form of the word',
                'flags': ['required'],
                'tagset': None
            },
            'pos': {
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
                'tagset': pos_tagset
            },
            'simple_pos': {
                'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
                'tagset': simple_pos_tagset
            },
            'ner': {
                'description': 'Label indicating the type of the entity',
                'tagset': ner_tagset
            }
        }
    },
    's': {
        'description': 'Encodes the start and end of a sentence',
        'properties': None
    },
    'ent': {
        'description': 'Encodes the start and end of a named entity',
        'properties': {
            'type': {
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
                'tagset': ner_tagset
            }
        }
    }
}
|
|
|
|
annotations = []

# Character offset of the current chunk within the concatenation of all
# chunks; added to every per-chunk offset spaCy reports.
chunk_offset = 0
for text_chunk in text_chunks:
    doc = nlp(text_chunk)
    for token in doc:
        if token.is_space:
            continue
        # A token that opens a sentence yields one 's' annotation
        # spanning the whole sentence.
        if token.is_sent_start:
            annotations.append({'start': token.sent.start_char + chunk_offset,
                                'end': token.sent.end_char + chunk_offset,
                                'tag': 's'})
        # ent_iob == 3 is IOB code "B": this token begins a named entity.
        # Find the entity in the sentence that starts exactly here and
        # emit one 'ent' annotation for it.
        if token.ent_iob == 3:
            for candidate in token.sent.ents:
                if candidate.start_char == token.idx:
                    annotations.append(
                        {'start': candidate.start_char + chunk_offset,
                         'end': candidate.end_char + chunk_offset,
                         'tag': 'ent',
                         'properties': {'type': token.ent_type_}})
                    break
        # Every non-space token gets a 'token' annotation.
        token_annotation = {'start': token.idx + chunk_offset,
                            'end': token.idx + len(token.text) + chunk_offset,
                            'tag': 'token',
                            'properties': {'pos': token.tag_,
                                           'lemma': token.lemma_,
                                           'simple_pos': token.pos_}}
        if token.ent_type_:
            token_annotation['properties']['ner'] = token.ent_type_
        annotations.append(token_annotation)
    chunk_offset += len(text_chunk)
|
|
|
|
# Serialize the metadata, the tag declarations and the annotations into a
# single JSON document at the requested output path.
output_data = {'meta': meta, 'tags': tags, 'annotations': annotations}
with open(args.output, 'w') as output_file:
    json.dump(output_data, output_file, indent=4)
|