mirror of
				https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver.git
				synced 2025-10-31 10:42:46 +00:00 
			
		
		
		
	Add preparation scripts
This commit is contained in:
		
							
								
								
									
										16
									
								
								create_corpus_file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								create_corpus_file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
"""Annotate the ``<text>`` element of a corpus XML/VRT file with metadata.

Usage:
    create_corpus_file.py -i FILE --author A --publishing_year Y --title T

The file given with ``-i`` is parsed as XML, the ``author``,
``publishing_year`` and ``title`` attributes are set on its ``<text>``
element, and the file is rewritten in place.
"""
import argparse
import sys
import xml.etree.ElementTree as ET


def annotate_corpus_file(input_file, author, publishing_year, title):
    """Set the metadata attributes on the ``<text>`` element of *input_file*.

    The file is parsed, modified and written back to the same path.

    Args:
        input_file: Path of the XML file to modify in place.
        author: Value stored in the ``author`` attribute.
        publishing_year: Value stored in the ``publishing_year`` attribute.
        title: Value stored in the ``title`` attribute.

    Raises:
        ValueError: If the file contains no ``<text>`` element.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    element_tree = ET.parse(input_file)
    root = element_tree.getroot()
    # find() only looks at the children of the root element, so a file whose
    # root element itself is <text> needs the explicit fallback.
    text_node = root.find('text')
    if text_node is None and root.tag == 'text':
        text_node = root
    if text_node is None:
        raise ValueError(
            "no <text> element found in '{}'".format(input_file)
        )
    text_node.set('author', author)
    text_node.set('publishing_year', publishing_year)
    text_node.set('title', title)
    # The default encoding of write() is US-ASCII, which replaces every
    # non-ASCII character with a numeric character reference. Writing UTF-8
    # keeps the corpus text readable as plain text.
    element_tree.write(input_file, encoding='utf-8')


def main(argv=None):
    """Parse the command line and annotate the requested file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='input_file', required=True)
    parser.add_argument('--author', dest='author', required=True)
    parser.add_argument('--publishing_year', dest='publishing_year',
                        required=True)
    parser.add_argument('--title', dest='title', required=True)
    args = parser.parse_args(argv)
    try:
        annotate_corpus_file(args.input_file, args.author,
                             args.publishing_year, args.title)
    except ValueError as error:
        sys.exit('error: {}'.format(error))


if __name__ == '__main__':
    main()
							
								
								
									
										17
									
								
								foo.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								foo.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,17 @@ | ||||
#!/bin/sh
# Draft: encode a VRT corpus inside a freshly started cqpserver container.
#
# Usage: foo.sh VRT_FILE
#
# NOTE: the script stops at the `exit` below, so none of the docker steps
# run yet. Remove the `exit` once the steps after it are ready to be used.

echo "$0"
vrt_file="$1"
# TODO: vrt_file is not used yet; the encode step below still reads the
# fixed path /root/files/example.vrt inside the container.
exit

# `docker run -d` prints the ID of the container it starts. Keep it so the
# `docker exec` calls below address that container.
CONTAINER_NAME="$(docker run \
    -d \
    pjentsch/cqpserver:latest)"
docker exec "$CONTAINER_NAME" mkdir /corpora/data/example
# Columns noted in the original draft: text, lemma, simple_pos, pos, ner
# (presumably the first column is the token itself -- confirm).
# The <text> attributes match those set by create_corpus_file.py.
docker exec "$CONTAINER_NAME" cwb-encode \
    -d /corpora/data/example \
    -f /root/files/example.vrt \
    -R /usr/local/share/cwb/registry/example \
    -P lemma -P simple_pos -P pos -P ner \
    -S s -S text:0+author+publishing_year+title
docker exec "$CONTAINER_NAME" cwb-make \
    -V EXAMPLE
							
								
								
									
										21
									
								
								merge_corpus_files.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								merge_corpus_files.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
"""Merge the ``<text>`` elements of all corpus files in a directory.

Usage:
    merge_corpus_files.py -dir CORPUS_DIR

Every regular file in CORPUS_DIR except ``corpus.vrt`` is parsed as XML.
The ``<text>`` element of each file is collected and all of them are
written, wrapped in a single ``<corpus>`` element, to
``CORPUS_DIR/corpus.vrt``.
"""
import argparse
import os
import sys
import xml.etree.ElementTree as ET

# Name of the merged file; it is skipped on input so that re-running the
# script does not merge a previous result into itself.
OUTPUT_FILENAME = 'corpus.vrt'


def merge_corpus_files(corpus_dir):
    """Write ``corpus.vrt`` containing the ``<text>`` of every corpus file.

    Files are processed in sorted name order so that the result is
    reproducible, and the texts appear in the output in that same order.

    Args:
        corpus_dir: Directory holding the individual corpus files.

    Returns:
        The path of the merged file that was written.

    Raises:
        ValueError: If a file is not well-formed XML or has no ``<text>``
            element.
    """
    text_nodes = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for corpus_file in sorted(os.listdir(corpus_dir)):
        path = os.path.join(corpus_dir, corpus_file)
        if os.path.isdir(path) or corpus_file == OUTPUT_FILENAME:
            continue
        try:
            file_root = ET.parse(path).getroot()
        except ET.ParseError as error:
            raise ValueError(
                "'{}' is not well-formed XML: {}".format(path, error)
            ) from error
        # find() only looks at the children of the root element, so a file
        # whose root element itself is <text> needs the explicit fallback.
        text_node = file_root.find('text')
        if text_node is None and file_root.tag == 'text':
            text_node = file_root
        if text_node is None:
            raise ValueError("no <text> element found in '{}'".format(path))
        text_nodes.append(text_node)

    root = ET.Element('corpus')
    # Append in collection order. Inserting every node at index 1 would put
    # each later text directly behind the first one and reverse the rest.
    root.extend(text_nodes)
    output_path = os.path.join(corpus_dir, OUTPUT_FILENAME)
    # The default encoding of write() is US-ASCII, which replaces every
    # non-ASCII character with a numeric character reference. Writing UTF-8
    # keeps the corpus text readable as plain text.
    ET.ElementTree(root).write(output_path, encoding='utf-8')
    return output_path


def main(argv=None):
    """Parse the command line and merge the requested directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dir', dest='corpus_dir', required=True)
    args = parser.parse_args(argv)
    try:
        merge_corpus_files(args.corpus_dir)
    except ValueError as error:
        sys.exit('error: {}'.format(error))


if __name__ == '__main__':
    main()
							
								
								
									
										8447
									
								
								test_files/corpus.vrt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8447
									
								
								test_files/corpus.vrt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										4225
									
								
								test_files/pjentsch.vrt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4225
									
								
								test_files/pjentsch.vrt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										4225
									
								
								test_files/sporada.vrt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4225
									
								
								test_files/sporada.vrt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user