mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-01-14 00:04:04 +00:00
Add function to check the encoding of input text files.
This commit is contained in:
parent
6769be049a
commit
88d03d4360
15
nlp
15
nlp
@ -41,6 +41,15 @@ def parse_arguments():
|
|||||||
dest='zip',
|
dest='zip',
|
||||||
help='package result files in zip bundles',
|
help='package result files in zip bundles',
|
||||||
required=False)
|
required=False)
|
||||||
|
parser.add_argument('--check-encoding',
|
||||||
|
action='store_true',
|
||||||
|
default=False,
|
||||||
|
dest="check_encoding",
|
||||||
|
help='''if used the nlp process will know hat the encoding of
|
||||||
|
the input files is unkown and thus != utf-8. The process will
|
||||||
|
try to determine the encoding of the input files and use this.
|
||||||
|
encoding.'''
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -51,6 +60,7 @@ class NLPWorkflow(WorkflowRunner):
|
|||||||
self.n_cores = args.n_cores
|
self.n_cores = args.n_cores
|
||||||
self.output_dir = args.output_dir
|
self.output_dir = args.output_dir
|
||||||
self.zip = args.zip
|
self.zip = args.zip
|
||||||
|
self.check_encoding
|
||||||
|
|
||||||
def workflow(self):
|
def workflow(self):
|
||||||
if len(self.jobs) == 0:
|
if len(self.jobs) == 0:
|
||||||
@ -82,10 +92,11 @@ class NLPWorkflow(WorkflowRunner):
|
|||||||
max(1, int(self.n_cores / len(self.jobs)))
|
max(1, int(self.n_cores / len(self.jobs)))
|
||||||
)
|
)
|
||||||
for index, job in enumerate(self.jobs):
|
for index, job in enumerate(self.jobs):
|
||||||
cmd = 'spacy_nlp -l "{}" "{}" "{}"'.format(
|
cmd = 'spacy_nlp -l "{}" "{}" "{}" "{}"'.format(
|
||||||
self.lang,
|
self.lang,
|
||||||
job['path'],
|
job['path'],
|
||||||
os.path.join(job['output_dir'], job['name'] + '.vrt')
|
os.path.join(job['output_dir'], job['name'] + '.vrt',
|
||||||
|
if self.check_encoding "--check-encoding" else "")
|
||||||
)
|
)
|
||||||
nlp_jobs.append(
|
nlp_jobs.append(
|
||||||
self.addTask(
|
self.addTask(
|
||||||
|
14
spacy_nlp
14
spacy_nlp
@ -17,6 +17,11 @@ parser.add_argument('-l',
|
|||||||
dest='lang',
|
dest='lang',
|
||||||
required=True)
|
required=True)
|
||||||
parser.add_argument('o', metavar='vrt-destfile')
|
parser.add_argument('o', metavar='vrt-destfile')
|
||||||
|
parser.add_argument('--check-encoding',
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
dest='check_encoding'
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
SPACY_MODELS = {'de': 'de_core_news_sm',
|
SPACY_MODELS = {'de': 'de_core_news_sm',
|
||||||
@ -31,9 +36,16 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
|||||||
# Set the language model for spacy
|
# Set the language model for spacy
|
||||||
nlp = spacy.load(SPACY_MODELS[args.lang])
|
nlp = spacy.load(SPACY_MODELS[args.lang])
|
||||||
|
|
||||||
|
# Try to determine the encoding of the text in the input file
|
||||||
|
if args.check_encoding:
|
||||||
|
with open(args.i, "rb") as input_file:
|
||||||
|
bytes = input_file.read()
|
||||||
|
encoding = chardet.detect(bytes)['encoding']
|
||||||
|
else:
|
||||||
|
encoding='utf-8'
|
||||||
# Read text from the input file and if necessary split it into parts with a
|
# Read text from the input file and if necessary split it into parts with a
|
||||||
# length of less than 1 million characters.
|
# length of less than 1 million characters.
|
||||||
with open(args.i) as input_file:
|
with open(args.i, encoding=encoding) as input_file:
|
||||||
text = input_file.read()
|
text = input_file.read()
|
||||||
texts = textwrap.wrap(text, 1000000, break_long_words=False)
|
texts = textwrap.wrap(text, 1000000, break_long_words=False)
|
||||||
text = None
|
text = None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user