mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp.git
synced 2025-07-01 12:00:34 +00:00
Add function to check the encoding of input text files.
This commit is contained in:
14
spacy_nlp
14
spacy_nlp
@ -17,6 +17,11 @@ parser.add_argument('-l',
|
||||
dest='lang',
|
||||
required=True)
|
||||
parser.add_argument('o', metavar='vrt-destfile')
|
||||
parser.add_argument('--check-encoding',
|
||||
default=False,
|
||||
action='store_true',
|
||||
dest='check_encoding'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
SPACY_MODELS = {'de': 'de_core_news_sm',
|
||||
@ -31,9 +36,16 @@ SPACY_MODELS = {'de': 'de_core_news_sm',
|
||||
# Set the language model for spacy
|
||||
nlp = spacy.load(SPACY_MODELS[args.lang])
|
||||
|
||||
# Try to determine the encoding of the text in the input file. Detection is
# opt-in via --check-encoding; without it the input is assumed to be UTF-8.
if args.check_encoding:
    # Read the raw bytes so the detector sees the file exactly as stored.
    with open(args.i, "rb") as input_file:
        raw_data = input_file.read()
    # chardet reports None as the encoding when it cannot make a guess (for
    # example on an empty file). Passing None on to open() would silently
    # select the platform's locale encoding, so fall back to the same UTF-8
    # default that is used when detection is disabled.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
else:
    encoding = 'utf-8'
|
||||
# Read text from the input file and, if necessary, split it into parts with a
|
||||
# length of less than 1 million characters.
|
||||
with open(args.i) as input_file:
|
||||
with open(args.i, encoding=encoding) as input_file:
|
||||
text = input_file.read()
|
||||
texts = textwrap.wrap(text, 1000000, break_long_words=False)
|
||||
text = None
|
||||
|
Reference in New Issue
Block a user