From 88d03d436073f7fb7e41175da6289b727088f03b Mon Sep 17 00:00:00 2001
From: Stephan Porada
Date: Wed, 12 Feb 2020 13:46:43 +0100
Subject: [PATCH] Add function to check the encoding of input text files.

---
 nlp       | 15 +++++++++++++--
 spacy_nlp | 15 ++++++++++++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/nlp b/nlp
index e4156d4..74cd2b2 100755
--- a/nlp
+++ b/nlp
@@ -41,6 +41,15 @@ def parse_arguments():
                         dest='zip',
                         help='package result files in zip bundles',
                         required=False)
+    parser.add_argument('--check-encoding',
+                        action='store_true',
+                        default=False,
+                        dest='check_encoding',
+                        help='''if used the nlp process will know that the encoding
+                        of the input files is unknown and thus != utf-8. The
+                        process will try to determine the encoding of the input
+                        files and use this encoding.'''
+                        )
 
     return parser.parse_args()
 
@@ -51,6 +60,7 @@ class NLPWorkflow(WorkflowRunner):
         self.n_cores = args.n_cores
         self.output_dir = args.output_dir
         self.zip = args.zip
+        self.check_encoding = args.check_encoding
 
     def workflow(self):
         if len(self.jobs) == 0:
@@ -82,10 +92,11 @@ class NLPWorkflow(WorkflowRunner):
                 max(1, int(self.n_cores / len(self.jobs)))
             )
             for index, job in enumerate(self.jobs):
-                cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
+                cmd = 'spacy_nlp -l "{}" "{}" "{}" {}'.format(
                     self.lang,
                     job['path'],
-                    os.path.join(job['output_dir'], job['name'] + '.vrt')
+                    os.path.join(job['output_dir'], job['name'] + '.vrt'),
+                    '--check-encoding' if self.check_encoding else ''
                 )
                 nlp_jobs.append(
                     self.addTask(
diff --git a/spacy_nlp b/spacy_nlp
index 0844ad1..1cb4ac4 100755
--- a/spacy_nlp
+++ b/spacy_nlp
@@ -17,6 +17,11 @@ parser.add_argument('-l',
                     dest='lang',
                     required=True)
 parser.add_argument('o',
                     metavar='vrt-destfile')
+parser.add_argument('--check-encoding',
+                    default=False,
+                    action='store_true',
+                    dest='check_encoding'
+                    )
 args = parser.parse_args()
 SPACY_MODELS = {'de': 'de_core_news_sm',
@@ -31,9 +36,17 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])
 
+# Try to determine the encoding of the text in the input file
+if args.check_encoding:
+    import chardet
+    with open(args.i, 'rb') as input_file:
+        encoding = chardet.detect(input_file.read())['encoding']
+else:
+    encoding = 'utf-8'
+
 # Read text from the input file and if neccessary split it into parts with a
 # length of less than 1 million characters.
-with open(args.i) as input_file:
+with open(args.i, encoding=encoding) as input_file:
     text = input_file.read()
     texts = textwrap.wrap(text, 1000000, break_long_words=False)
     text = None