#!/usr/bin/env python3.7
# coding=utf-8

from argparse import ArgumentParser
import chardet
import hashlib
import json
import os
import spacy
import textwrap


# Map the language code of every installed spaCy pipeline to its package name,
# e.g. {'en': 'en_core_web_sm'}
spacy_models = {spacy.info(pipeline)['lang']: pipeline
                for pipeline in spacy.info()['pipelines']}


# Parse the given arguments
parser = ArgumentParser(description='Create annotations for a given txt file')
parser.add_argument('input', help='Path to txt input file')
parser.add_argument('output', help='Path to JSON output file')
parser.add_argument('-l', '--language',
                    choices=spacy_models.keys(),
                    help='Language of the input (2-character ISO 639-1 language code)',  # noqa
                    required=True)
parser.add_argument('-c', '--check-encoding',
                    action='store_true',
                    help='Check the encoding of the input file; otherwise UTF-8 is assumed')  # noqa
args = parser.parse_args()
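
# Example invocation (the script file name is assumed here for illustration):
#   ./nlp --language en --check-encoding input.txt output.json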

# Determine the input encoding and compute an MD5 checksum of the raw file
with open(args.input, "rb") as text_file:
    if args.check_encoding:
        encoding = chardet.detect(text_file.read())['encoding']
    else:
        encoding = 'utf-8'
    text_file.seek(0)
    text_md5 = hashlib.md5()
    for chunk in iter(lambda: text_file.read(128 * text_md5.block_size), b''):
        text_md5.update(chunk)

# Load the text contents from the input file
with open(args.input, encoding=encoding) as text_file:
    # spaCy NLP is limited to strings with a maximum of 1 million characters at
    # once. So we split it into suitable chunks.
    text_chunks = textwrap.wrap(
        text_file.read(),
        1000000,
        break_long_words=False,
        break_on_hyphens=False,
        drop_whitespace=False,
        expand_tabs=False,
        replace_whitespace=False
    )

model = spacy_models[args.language]
nlp = spacy.load(model)
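
# Note: spacy_models only contains pipelines that are installed in this
# environment (e.g. via `python -m spacy download <pipeline>`), so the
# pipeline selected through --language is guaranteed to be available here.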

meta = {
    'generator': {
        'name': 'nopaque NLP service',
        'version': '1.0.0',
        'arguments': {
            'check_encoding': args.check_encoding,
            'language': args.language
        }
    },
    'file': {
        'encoding': encoding,
        'md5': text_md5.hexdigest(),
        'name': os.path.basename(args.input)
    }
}


tags = {
    'token': {
        'description': '',
        'properties': {
            'lemma': {
                'description': 'The base form of the word',
                'flags': ['required'],
                'tagset': None
            },
            'pos': {
                'description': 'The detailed part-of-speech tag',
                'flags': ['required'],
                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['tagger']}  # noqa
            },
            'simple_pos': {
                'description': 'The simple UPOS part-of-speech tag',
                'flags': ['required'],
                'tagset': {
                    'ADJ': 'adjective',
                    'ADP': 'adposition',
                    'ADV': 'adverb',
                    'AUX': 'auxiliary verb',
                    'CCONJ': 'coordinating conjunction',
                    'DET': 'determiner',
                    'INTJ': 'interjection',
                    'NOUN': 'noun',
                    'NUM': 'numeral',
                    'PART': 'particle',
                    'PRON': 'pronoun',
                    'PROPN': 'proper noun',
                    'PUNCT': 'punctuation',
                    'SCONJ': 'subordinating conjunction',
                    'SYM': 'symbol',
                    'VERB': 'verb',
                    'X': 'other'
                }
            },
            'ner': {
                'description': 'Label indicating the type of the entity',
                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
            }
        }
    },
    's': {
        'description': 'Encodes the start and end of a sentence',
        'properties': None
    },
    'ent': {
        'description': 'Encodes the start and end of a named entity',
        'properties': {
            'type': {
                'description': 'Label indicating the type of the entity',
                'flags': ['required'],
                'tagset': {label: spacy.explain(label) for label in spacy.info(model)['labels']['ner']}  # noqa
            }
        }
    }
}

annotations = []

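# Shape of the records collected below (for reference, derived from the code):
#   {'start': <int>, 'end': <int>, 'tag': 's' | 'ent' | 'token',
#    'properties': {...}}  # present for 'ent' and 'token' annotations only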
chunk_offset = 0
while text_chunks:
    text_chunk = text_chunks.pop(0)
    doc = nlp(text_chunk)
    for token in doc:
        if token.is_space:
            continue
        # Record the enclosing sentence once, at its first token
        if token.is_sent_start:
            annotation = {'start': token.sent.start_char + chunk_offset,
                          'end': token.sent.end_char + chunk_offset,
                          'tag': 's'}
            annotations.append(annotation)
        # Check if the token is the start of an entity
        if token.ent_iob == 3:
            for ent_candidate in token.sent.ents:
                if ent_candidate.start_char == token.idx:
                    ent = ent_candidate
                    annotation = {'start': ent.start_char + chunk_offset,
                                  'end': ent.end_char + chunk_offset,
                                  'tag': 'ent',
                                  'properties': {'type': token.ent_type_}}
                    annotations.append(annotation)
                    break
        # Record the token itself
        annotation = {'start': token.idx + chunk_offset,
                      'end': token.idx + len(token.text) + chunk_offset,
                      'tag': 'token',
                      'properties': {'pos': token.tag_,
                                     'lemma': token.lemma_,
                                     'simple_pos': token.pos_}}
        if token.ent_type_:
            annotation['properties']['ner'] = token.ent_type_
        annotations.append(annotation)
    chunk_offset += len(text_chunk)
    text_chunk = None

with open(args.output, 'w') as output_file:
    json.dump({'meta': meta, 'tags': tags, 'annotations': annotations},
              output_file, indent=4)