Add function to check the encoding of input text files.

2026-03-07 11:06:39 +00:00 · 2020-02-12 13:46:43 +01:00
parent 6769be049a
commit 88d03d4360
2 changed files with 26 additions and 3 deletions
--- a/14
+++ b/14
@@ -17,6 +17,11 @@ parser.add_argument('-l',
                    dest='lang',
                    required=True)
 parser.add_argument('o', metavar='vrt-destfile')
+parser.add_argument('--check-encoding',
+                    action='store_true',
+                    dest='check_encoding',
+                    help='detect the encoding of the input file with chardet '
+                         'instead of assuming UTF-8')
 args = parser.parse_args()

 SPACY_MODELS = {'de': 'de_core_news_sm',
@@ -31,9 +36,16 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
 # Set the language model for spacy
 nlp = spacy.load(SPACY_MODELS[args.lang])

+# Try to determine the encoding of the input file (UTF-8 if undetectable)
+if args.check_encoding:
+    with open(args.i, "rb") as input_file:
+        raw_data = input_file.read()
+    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
+else:
+    encoding = 'utf-8'
 # Read text from the input file and, if necessary, split it into parts with a
 # length of less than 1 million characters.
-with open(args.i) as input_file:
+with open(args.i, encoding=encoding) as input_file:
    text = input_file.read()
    texts = textwrap.wrap(text, 1000000, break_long_words=False)
    text = None