Add function to check the encoding of input text files.

This commit is contained in:
Stephan Porada 2020-02-12 13:46:43 +01:00
parent 6769be049a
commit 88d03d4360
2 changed files with 26 additions and 3 deletions

15
nlp
View File

@ -41,6 +41,15 @@ def parse_arguments():
dest='zip',
help='package result files in zip bundles',
required=False)
# Boolean opt-in switch: False unless --check-encoding is given on the
# command line; the parsed value is exposed as args.check_encoding.
parser.add_argument('--check-encoding',
                    action='store_true',
                    default=False,
                    dest='check_encoding',
                    help='if used the nlp process will know that the '
                         'encoding of the input files is unknown and thus '
                         '!= utf-8. The process will try to determine the '
                         'encoding of the input files and use this '
                         'encoding.')
return parser.parse_args()
@ -51,6 +60,7 @@ class NLPWorkflow(WorkflowRunner):
self.n_cores = args.n_cores
self.output_dir = args.output_dir
self.zip = args.zip
# Keep the --check-encoding switch on the instance so the spacy_nlp command
# built for each job can include it. A bare `self.check_encoding` expression
# would raise AttributeError because the attribute was never assigned.
self.check_encoding = args.check_encoding
def workflow(self):
if len(self.jobs) == 0:
@ -82,10 +92,11 @@ class NLPWorkflow(WorkflowRunner):
max(1, int(self.n_cores / len(self.jobs)))
)
for index, job in enumerate(self.jobs):
# Base invocation: language, input path and .vrt destination file.
cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
    self.lang,
    job['path'],
    os.path.join(job['output_dir'], job['name'] + '.vrt')
)
# Append the switch only when requested. Formatting it into a fourth "{}"
# placeholder would hand spacy_nlp an empty positional argument whenever
# the switch is off.
if self.check_encoding:
    cmd += ' --check-encoding'
nlp_jobs.append(
self.addTask(

View File

@ -17,6 +17,11 @@ parser.add_argument('-l',
dest='lang',
required=True)
parser.add_argument('o', metavar='vrt-destfile')
# Optional switch, False unless given. When set, the encoding of the input
# file is detected with chardet instead of being taken as utf-8.
parser.add_argument('--check-encoding',
default=False,
action='store_true',
dest='check_encoding'
)
args = parser.parse_args()
SPACY_MODELS = {'de': 'de_core_news_sm',
@ -31,9 +36,16 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])
# Try to determine the encoding of the text in the input file
# NOTE(review): chardet must be imported at the top of this script; the import
# is not visible in this hunk -- confirm.
if args.check_encoding:
    # Read the raw bytes so chardet can inspect the undecoded content.
    with open(args.i, 'rb') as input_file:
        raw_data = input_file.read()
    # chardet reports None when it cannot identify an encoding; fall back to
    # utf-8 so the text-mode open() that follows never silently uses the
    # locale default.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
else:
    encoding = 'utf-8'
# Read text from the input file and if necessary split it into parts with a
# length of less than 1 million characters.
with open(args.i) as input_file:
with open(args.i, encoding=encoding) as input_file:
text = input_file.read()
texts = textwrap.wrap(text, 1000000, break_long_words=False)
text = None