mirror of
https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque.git
synced 2025-06-15 10:30:40 +00:00
Add prototype
43
vre_nlp_node/Dockerfile
Normal file
@@ -0,0 +1,43 @@
FROM debian:stretch-slim

MAINTAINER Patrick Jentsch <p.jentsch@uni-bielefeld.de>

ENV LANG=C.UTF-8

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    python2.7 \
    python3 \
    python3-dev \
    python3-pip \
    python3-setuptools \
    wget \
    zip

WORKDIR /root

# Install pyFlow
ENV PYFLOW_VERSION 1.1.20
RUN wget -nv https://github.com/Illumina/pyflow/releases/download/v"$PYFLOW_VERSION"/pyflow-"$PYFLOW_VERSION".tar.gz && \
    tar -xzf pyflow-"$PYFLOW_VERSION".tar.gz && \
    rm pyflow-"$PYFLOW_VERSION".tar.gz && \
    cd pyflow-"$PYFLOW_VERSION" && \
    python2.7 setup.py build install && \
    cd ..

# Install spaCy
RUN pip3 install wheel && pip3 install -U spacy && \
    python3 -m spacy download de && \
    python3 -m spacy download en && \
    python3 -m spacy download es && \
    python3 -m spacy download fr && \
    python3 -m spacy download pt

RUN mkdir files_for_nlp files_from_nlp

COPY nlp /usr/local/bin
COPY spacy_nlp /usr/local/bin

CMD ["/bin/bash"]
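A minimal smoke test for this image, assuming it has been built locally; the tag `vre_nlp_node` is only an example name:

```bash
# Build the image from this directory (tag name is an example)
docker build -t vre_nlp_node .
# Confirm that one of the downloaded spaCy models loads under Python 3
docker run --rm vre_nlp_node python3 -c "import spacy; spacy.load('de_core_news_sm')"
# Confirm that pyFlow is importable under Python 2.7
docker run --rm vre_nlp_node python2.7 -c "from pyflow import WorkflowRunner"
```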
38
vre_nlp_node/README.md
Normal file
@@ -0,0 +1,38 @@
# vre_nlp_node

This repository provides a Dockerfile for building a Docker image for natural language processing (NLP). It takes plain text files as input and produces verticalized text (.vrt) files as output.

## Building the Docker image

The GitLab registry provides an automatically built Docker image that always contains the latest changes. The image can also be built locally with the following command:
```bash
docker build -t gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node .
```

## Usage

### Starting a Docker container
```bash
docker run \
    --name <containername> \
    -dit \
    -v <datalocation>/files_for_nlp:/root/files_for_nlp \
    -v <datalocation>/files_from_nlp:/root/files_from_nlp \
    gitlab.ub.uni-bielefeld.de:4567/pjentsch/vre_nlp_node
```

### Copying the files to be processed into the input directory
```bash
cp <textfile1> <textfile2> ... <textfilen> <datalocation>/files_for_nlp
```

### Starting the NLP processing
```bash
docker exec -it <containername> nlp -i /root/files_for_nlp -o /root/files_from_nlp -l <languagecode>
```
Valid values for `<languagecode>` are:
* de (German)
* en (English)
* es (Spanish)
* fr (French)
* pt (Portuguese)

When the run has finished, the processed data is available in `<datalocation>/files_from_nlp`; a sketch of the resulting .vrt layout follows below.
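For orientation, a minimal sketch of a resulting .vrt file, following the write calls in `spacy_nlp`: each token line carries text, lemma, simple POS, fine-grained POS tag, and named-entity type (`NULL` for tokens outside an entity). The concrete tokens and tag values below are hypothetical:

```
<?xml version="1.0" encoding="UTF-8"?>
<corpus>
<text id="example">
<s>
Das	der	DET	ART	NULL
ist	sein	AUX	VAFIN	NULL
Bielefeld	Bielefeld	PROPN	NE	LOC
</s>
</text>
</corpus>
```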
131
vre_nlp_node/nlp
Executable file
@@ -0,0 +1,131 @@
#!/usr/bin/env python2.7
# coding=utf-8

"""
nlp

Usage: For usage instructions run with option --help
Author: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
"""


import argparse
import multiprocessing
import os
import sys
from pyflow import WorkflowRunner


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Performs NLP of documents utilizing spaCy. Output is .vrt."
    )

    parser.add_argument("-i",
                        dest="inputDir",
                        help="Input directory.",
                        required=True)
    parser.add_argument("-l",
                        dest="lang",
                        help="Language for NLP",
                        required=True)
    parser.add_argument("-o",
                        dest="outputDir",
                        help="Output directory.",
                        required=True)
    parser.add_argument("--nCores",
                        default=multiprocessing.cpu_count(),
                        dest="nCores",
                        help="Total number of cores available.",
                        required=False,
                        type=int)
    return parser.parse_args()


class NLPWorkflow(WorkflowRunner):
    def __init__(self, jobs, lang, nCores):
        self.jobs = jobs
        self.lang = lang
        self.nCores = nCores

    def workflow(self):
        ###
        # Task "mkdir_job": create output directories
        # Dependencies: None
        ###
        mkdir_jobs = []
        mkdir_job_number = 0
        for job in self.jobs:
            mkdir_job_number += 1
            cmd = 'mkdir -p "%s"' % (job["output_dir"])
            mkdir_jobs.append(self.addTask(label="mkdir_job_-_%i" % (mkdir_job_number), command=cmd))

        ###
        # Task "spacy_nlp_job": perform NLP
        # Dependencies: mkdir_jobs
        ###
        self.waitForTasks()
        nlp_jobs = []
        nlp_job_number = 0
        for job in self.jobs:
            nlp_job_number += 1
            cmd = 'spacy_nlp -i "%s" -o "%s" -l "%s"' % (
                job["path"],
                os.path.join(job["output_dir"], os.path.basename(job["path"]).rsplit(".", 1)[0] + ".vrt"),
                self.lang
            )
            nlp_jobs.append(self.addTask(label="nlp_job_-_%i" % (nlp_job_number), command=cmd, dependencies=mkdir_jobs))

        ###
        # Task "zip_job": compress output
        # Dependencies: nlp_jobs
        ###
        zip_jobs = []
        zip_job_number = 0
        for job in self.jobs:
            zip_job_number += 1
            cmd = 'zip -jqr %s %s' % (
                job["output_dir"] + "_-_nlp",
                job["output_dir"]
            )
            zip_jobs.append(self.addTask(label="zip_job_-_%i" % (zip_job_number), command=cmd, dependencies=nlp_jobs))


def analyze_jobs(inputDir, outputDir, level=1):
    jobs = []

    if level > 2:
        return jobs

    for file in os.listdir(inputDir):
        if os.path.isdir(os.path.join(inputDir, file)):
            jobs += analyze_jobs(
                os.path.join(inputDir, file),
                os.path.join(outputDir, file),
                level + 1
            )
        elif file.endswith(".txt"):
            jobs.append({"path": os.path.join(inputDir, file),
                         "output_dir": os.path.join(outputDir, file.rsplit(".", 1)[0])})

    return jobs


def main():
    args = parse_arguments()

    wflow = NLPWorkflow(
        analyze_jobs(args.inputDir, args.outputDir),
        args.lang,
        args.nCores
    )

    retval = wflow.run(nCores=args.nCores)
    sys.exit(retval)


if __name__ == "__main__":
    main()
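A sketch of the input/output layout that `analyze_jobs` implies: only `.txt` files directly inside the input directory or one subdirectory below it are picked up (deeper levels are ignored), and each file gets its own output directory plus a zip archive. The file and directory names here are hypothetical:

```
files_for_nlp/report.txt          ->  files_from_nlp/report/report.vrt  (+ files_from_nlp/report_-_nlp.zip)
files_for_nlp/letters/brief1.txt  ->  files_from_nlp/letters/brief1/brief1.vrt  (+ files_from_nlp/letters/brief1_-_nlp.zip)
```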
59
vre_nlp_node/spacy_nlp
Executable file
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# coding=utf-8


import argparse
import os
import spacy


parser = argparse.ArgumentParser(description="Tag a .txt file with spaCy and \
                                 save it in .vrt format")
parser.add_argument("-i",
                    dest="input",
                    help="Input file.",
                    required=True)
parser.add_argument("-l",
                    choices=["de", "en", "es", "fr", "pt"],
                    dest="lang",
                    help="Language for tagging",
                    required=True)
parser.add_argument("-o",
                    dest="output",
                    help="Output file.",
                    required=True)
args = parser.parse_args()


SPACY_MODELS = {"de": "de_core_news_sm", "en": "en_core_web_sm",
                "es": "es_core_news_sm", "fr": "fr_core_news_sm",
                "pt": "pt_core_news_sm"}


# Set the language model for spacy
nlp = spacy.load(SPACY_MODELS[args.lang])

# Read text from the input file
with open(args.input) as input_file:
    text = input_file.read()

# Run spacy nlp over the text
doc = nlp(text)

# Create and open the output file
output_file = open(args.output, "w+")
output_file.write('<?xml version="1.0" encoding="UTF-8"?>\n<corpus>\n<text id="' + os.path.basename(args.input).rsplit(".", 1)[0] + '">\n')
for sent in doc.sents:
    output_file.write('<s>\n')
    for token in sent:
        # Skip whitespace tokens like "\n" or "\t"
        if token.text.isspace():
            continue
        # Write all information in .vrt style to the output file
        # text, lemma, simple_pos, pos, ner
        output_file.write(token.text + "\t" + token.lemma_ + "\t"
                          + token.pos_ + "\t" + token.tag_ + "\t"
                          + (token.ent_type_ if token.ent_type_ != "" else "NULL") + "\n")
    output_file.write('</s>\n')
output_file.write('</text>\n</corpus>')
output_file.close()
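For a single file, `spacy_nlp` can also be invoked directly inside a running container, bypassing the pyFlow wrapper; the file names below are hypothetical:

```bash
docker exec -it <containername> spacy_nlp -i /root/files_for_nlp/report.txt -o /root/files_from_nlp/report.vrt -l de
```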