From 88d03d436073f7fb7e41175da6289b727088f03b Mon Sep 17 00:00:00 2001
From: Stephan Porada
Date: Wed, 12 Feb 2020 13:46:43 +0100
Subject: [PATCH] Add function to check the encoding of input text files.

---
 nlp       | 15 +++++++++++++--
 spacy_nlp | 15 ++++++++++++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/nlp b/nlp
index e4156d4..74cd2b2 100755
--- a/nlp
+++ b/nlp
@@ -41,6 +41,15 @@ def parse_arguments():
                         dest='zip',
                         help='package result files in zip bundles',
                         required=False)
+    parser.add_argument('--check-encoding',
+                        action='store_true',
+                        default=False,
+                        dest='check_encoding',
+                        help='''if used the nlp process will know that the encoding
+                        of the input files is unknown and thus != utf-8. The
+                        process will try to determine the encoding of the input
+                        files and use this encoding.'''
+                        )
 
     return parser.parse_args()
 
@@ -51,6 +60,7 @@ class NLPWorkflow(WorkflowRunner):
         self.n_cores = args.n_cores
         self.output_dir = args.output_dir
         self.zip = args.zip
+        self.check_encoding = args.check_encoding
 
     def workflow(self):
         if len(self.jobs) == 0:
@@ -82,10 +92,11 @@ class NLPWorkflow(WorkflowRunner):
                 max(1, int(self.n_cores / len(self.jobs)))
             )
             for index, job in enumerate(self.jobs):
-                cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
+                cmd = 'spacy_nlp -l "{}" "{}" "{}" {}'.format(
                     self.lang,
                     job['path'],
-                    os.path.join(job['output_dir'], job['name'] + '.vrt')
+                    os.path.join(job['output_dir'], job['name'] + '.vrt'),
+                    '--check-encoding' if self.check_encoding else ''
                 )
                 nlp_jobs.append(
                     self.addTask(
diff --git a/spacy_nlp b/spacy_nlp
index 0844ad1..1cb4ac4 100755
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -17,6 +17,11 @@ parser.add_argument('-l',
                     dest='lang',
                     required=True)
 parser.add_argument('o',
                     metavar='vrt-destfile')
+parser.add_argument('--check-encoding',
+                    default=False,
+                    action='store_true',
+                    dest='check_encoding'
+                    )
 args = parser.parse_args()
 SPACY_MODELS = {'de': 'de_core_news_sm',
@@ -31,9 +36,17 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
 
+# Try to determine the encoding of the text in the input file
+if args.check_encoding:
+    import chardet
+    with open(args.i, 'rb') as input_file:
+        encoding = chardet.detect(input_file.read())['encoding']
+else:
+    encoding = 'utf-8'
+
 # Read text from the input file and if neccessary split it into parts with a
 # length of less than 1 million characters.
-with open(args.i) as input_file:
+with open(args.i, encoding=encoding) as input_file:
     text = input_file.read()
     texts = textwrap.wrap(text, 1000000, break_long_words=False)
     text = None