Skip to content

Commit 9198563

Browse files
committed
implemented option to skip TICCL (#21), with explicit textclass handling in frog/ucto pipelines
1 parent f6412c8 commit 9198563

File tree

4 files changed

+35
-17
lines changed

4 files changed

+35
-17
lines changed

frog.nf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ params.inputformat = "text"
1818
params.outputdir = "frog_output"
1919
params.sentenceperline = false
2020
params.inputclass = "current"
21+
params.outputclass = "current"
2122
params.skip = ""
2223

2324
if (params.containsKey('help') || !params.containsKey('inputdir')) {
@@ -34,6 +35,7 @@ if (params.containsKey('help') || !params.containsKey('inputdir')) {
3435
log.info " --sentenceperline Indicates that the input (plain text only) is already in a one sentence per line format, skips sentence detection (default: false)"
3536
log.info " --outputdir DIRECTORY Output directory (FoLiA documents)"
3637
log.info " --inputclass CLASS Set the FoLiA text class to use as input (default: current)"
38+
log.info " --outputclass CLASS Set the FoLiA text class to use as output (default: current)"
3739
log.info " --skip=[mptncla] Skip Tokenizer (t), Lemmatizer (l), Morphological Analyzer (a), Chunker (c), Multi-Word Units (m), Named Entity Recognition (n), or Parser (p)"
3840
exit 2
3941
}
@@ -52,6 +54,7 @@ if (params.inputformat == "folia") {
5254
file inputdocument from inputdocuments
5355
val skip from params.skip
5456
val inputclass from params.inputclass
57+
val outputclass from params.outputclass
5558
val virtualenv from params.virtualenv
5659

5760
output:
@@ -70,7 +73,7 @@ if (params.inputformat == "folia") {
7073
skip="--skip=${skip}"
7174
fi
7275
73-
frog \$opts -X ${inputdocument.baseName}.frog.folia.xml --textclass ${inputclass} --id ${inputdocument.baseName} -x ${inputdocument}
76+
frog \$opts -X ${inputdocument.baseName}.frog.folia.xml --inputclass ${inputclass} --outputclass ${outputclass} --id ${inputdocument.baseName} -x ${inputdocument}
7477
"""
7578
}
7679
} else {
@@ -83,6 +86,7 @@ if (params.inputformat == "folia") {
8386
val sentenceperline from params.sentenceperline
8487
val skip from params.skip
8588
val virtualenv from params.virtualenv
89+
val outputclass from params.outputclass
8690

8791
output:
8892
file "${inputdocument.baseName}.frog.folia.xml" into tokoutput
@@ -103,7 +107,7 @@ if (params.inputformat == "folia") {
103107
skip="--skip=${skip}"
104108
fi
105109
106-
frog \$opts -X ${inputdocument.baseName}.frog.folia.xml --id ${inputdocument.baseName} -t ${inputdocument}
110+
frog \$opts -X ${inputdocument.baseName}.frog.folia.xml --outputclass ${outputclass} --id ${inputdocument.baseName} -t ${inputdocument}
107111
"""
108112
}
109113
}

tokenize.nf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ params.inputformat = "text"
1818
params.outputdir = "tokenized_output"
1919
params.sentenceperline = false
2020
params.inputclass = "current"
21+
params.outputclass = "current"
2122

2223
if (params.containsKey('help') || !params.containsKey('inputdir') || !params.containsKey('language')) {
2324
log.info "Usage:"
@@ -34,6 +35,7 @@ if (params.containsKey('help') || !params.containsKey('inputdir') || !params.con
3435
log.info " --sentenceperline Indicates that the input (plain text only) is already in a one sentence per line format, skips sentence detection (default: false)"
3536
log.info " --outputdir DIRECTORY Output directory (FoLiA documents)"
3637
log.info " --inputclass CLASS Set the FoLiA text class to use as input (default: current)"
38+
log.info " --outputclass CLASS Set the FoLiA text class to use as output (default: current)"
3739
exit 2
3840
}
3941

@@ -51,6 +53,7 @@ if (params.inputformat == "folia") {
5153
file inputdocument from inputdocuments
5254
val language from params.language
5355
val inputclass from params.inputclass
56+
val outputclass from params.outputclass
5457
val virtualenv from params.virtualenv
5558

5659
output:
@@ -65,7 +68,7 @@ if (params.inputformat == "folia") {
6568
set -u
6669
6770
ID="${inputdocument.baseName}"
68-
ucto -L ${language} -X --id \$ID --inputclass ${inputclass} -F ${inputdocument} ${inputdocument.baseName}.tok.folia.xml
71+
ucto -L ${language} -X --id \$ID --inputclass ${inputclass} --outputclass ${outputclass} -F ${inputdocument} ${inputdocument.baseName}.tok.folia.xml
6972
"""
7073
}
7174
} else {
@@ -78,6 +81,7 @@ if (params.inputformat == "folia") {
7881
val language from params.language
7982
val sentenceperline from params.sentenceperline
8083
val virtualenv from params.virtualenv
84+
val outputclass from params.outputclass
8185

8286
output:
8387
file "${inputdocument.baseName}.tok.folia.xml" into tokoutput
@@ -96,7 +100,7 @@ if (params.inputformat == "folia") {
96100
fi
97101
98102
ID="${inputdocument.baseName}"
99-
ucto -L ${language} \$opts -X --id \$ID ${inputdocument} ${inputdocument.baseName}.tok.folia.xml
103+
ucto -L ${language} \$opts -X --id \$ID ${inputdocument} --outputclass ${outputclass} ${inputdocument.baseName}.tok.folia.xml
100104
"""
101105
}
102106
}

webservice/picclservice/picclservice.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,9 @@
437437
ChoiceParameter('lang','Language?',"Specify the language of your input documents", choices=LANGUAGECHOICES), #old ticcl -t
438438
BooleanParameter('reassemble','Reassemble PDF',"Use this option if you have PDF input files, such as chapters or pages, that first need to be merged together prior to processing. Filenames must be named {documentname}-{sequencenumber}.pdf for this to work.")
439439
]),
440+
("OCR post-correction", [
441+
ChoiceParameter('ticcl','Enable TICCL?',"Perform OCR post-correction and normalisation using TICCL?", choices=[('yes','Yes'),('no','No')], default='yes'),
442+
]),
440443
('N-best Ranking', [
441444
ChoiceParameter('rank','How many ranked variants?','Return N best-first ranked variants',choices=[('3','Up to three N-best ranked'),('1','First-best Only'),('2','Up to two N-best ranked'),('5','Up to five N-best ranked'),('10','Up to ten N-best ranked'),('20','Up to twenty N-best ranked')]) #old ticcl -r
442445
]),

webservice/picclservice/picclservice_wrapper.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -166,28 +166,35 @@ def fail():
166166
if 'tok' in clamdata and clamdata['tok']:
167167
print("Tokeniser enabled (" + str(clamdata['tok']) + ")",file=sys.stderr)
168168

169-
clam.common.status.write(statusfile, "Running TICCL Pipeline",50) # status update
170-
if ('frog' in clamdata and clamdata['frog']) or ('tok' in clamdata and clamdata['tok']):
171-
ticcl_outputdir = 'ticcl_out'
172-
else:
173-
ticcl_outputdir = outputdir
174-
if os.system(run_piccl + "ticcl.nf --inputdir " + ticclinputdir + " --inputtype " + ticcl_inputtype + " --outputdir " + shellsafe(ticcl_outputdir,'"') + " --lexicon lexicon.lst --alphabet alphabet.lst --charconfus confusion.lst --clip " + shellsafe(clamdata['rank']) + " --distance " + shellsafe(clamdata['distance']) + " --clip " + shellsafe(clamdata['rank']) + " --pdfhandling " + pdfhandling + " -with-trace >&2" ) != 0:
175-
fail()
169+
if 'ticcl' in clamdata and clamdata['ticcl'] == 'yes':
170+
clam.common.status.write(statusfile, "Running TICCL Pipeline",50) # status update
171+
if ('frog' in clamdata and clamdata['frog']) or ('tok' in clamdata and clamdata['tok']):
172+
ticcl_outputdir = 'ticcl_out'
173+
else:
174+
ticcl_outputdir = outputdir
175+
if os.system(run_piccl + "ticcl.nf --inputdir " + ticclinputdir + " --inputtype " + ticcl_inputtype + " --outputdir " + shellsafe(ticcl_outputdir,'"') + " --lexicon lexicon.lst --alphabet alphabet.lst --charconfus confusion.lst --clip " + shellsafe(clamdata['rank']) + " --distance " + shellsafe(clamdata['distance']) + " --clip " + shellsafe(clamdata['rank']) + " --pdfhandling " + pdfhandling + " -with-trace >&2" ) != 0:
176+
fail()
176177

177-
#Print Nextflow trace information to stderr so it ends up in the CLAM error.log and is available for inspection
178-
print("TICCL pipeline trace summary",file=sys.stderr)
179-
print("-------------------------------",file=sys.stderr)
180-
print(open('trace.txt','r',encoding='utf-8').read(), file=sys.stderr)
178+
#Print Nextflow trace information to stderr so it ends up in the CLAM error.log and is available for inspection
179+
print("TICCL pipeline trace summary",file=sys.stderr)
180+
print("-------------------------------",file=sys.stderr)
181+
print(open('trace.txt','r',encoding='utf-8').read(), file=sys.stderr)
182+
frog_inputdir = ticcl_outputdir
183+
textclass_opts = ""
184+
else:
185+
print("TICCL skipped as requested...",file=sys.stderr)
186+
frog_inputdir = 'ocr_output'
187+
textclass_opts = "--inputclass \"OCR\" --outputclass \"current\"" #extra textclass opts for both frog and/or ucto
181188

182189

183190
if 'frog' in clamdata and clamdata['frog']:
184191
print("Running Frog...",file=sys.stderr)
185192
clam.common.status.write(statusfile, "Running Frog Pipeline (linguistic enrichment)",75) # status update
186-
if os.system(run_piccl + "frog.nf --skip=p --inputdir " + shellsafe(ticcl_outputdir,'"') + " --inputformat folia --extension folia.xml --outputdir " + shellsafe(outputdir,'"') + " -with-trace >&2" ) != 0:
193+
if os.system(run_piccl + "frog.nf " + textclass_opts + " --inputdir " + shellsafe(frog_inputdir,'"') + " --inputformat folia --extension folia.xml --outputdir " + shellsafe(outputdir,'"') + " -with-trace >&2" ) != 0:
187194
fail()
188195
elif 'tok' in clamdata and clamdata['tok']:
189196
clam.common.status.write(statusfile, "Running Tokeniser (ucto)",75) # status update
190-
if os.system(run_piccl + "tokenize.nf -L " + shellsafe(lang,'"') + " --inputformat folia --inputdir " + shellsafe(ticcl_outputdir,'"') + " --extension folia.xml --outputdir " + shellsafe(outputdir,'"') + " -with-trace >&2" ) != 0:
197+
if os.system(run_piccl + "tokenize.nf " + textclass_opts + " -L " + shellsafe(lang,'"') + " --inputformat folia --inputdir " + shellsafe(frog_inputdir,'"') + " --extension folia.xml --outputdir " + shellsafe(outputdir,'"') + " -with-trace >&2" ) != 0:
191198
fail()
192199

193200
#cleanup

0 commit comments

Comments
 (0)