forked from Early-Modern-OCR/RETAS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextPreprocessorUniversal.java
executable file
·92 lines (78 loc) · 3.5 KB
/
TextPreprocessorUniversal.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* Copyright (C) <2013> University of Massachusetts Amherst
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* This class simply merges hyphenated tokens at the end of each line. Ignores
* the characters in the IGNORED_CHARS list.
*
* @author Ismet Zeki Yalniz
*/
import java.util.Locale;
/**
 * Text preprocessor that re-joins words hyphenated across a line break and
 * removes every character contained in {@code IGNORED_CHARS}.
 *
 * <p>{@code locale} and {@code IGNORED_CHARS} are not declared in this class;
 * they are presumably inherited from {@link TextPreprocessor} (confirm there).
 */
public class TextPreprocessorUniversal extends TextPreprocessor {

    /**
     * Creates a preprocessor for the given locale.
     *
     * @param loc locale to store in {@code locale}; when {@code null},
     *            English/US is used instead
     */
    public TextPreprocessorUniversal(Locale loc) {
        // Locale.US is equal to new Locale("en", "US") and avoids the constructor call.
        locale = (loc == null) ? Locale.US : loc;
    }

    /**
     * Merges end-of-line hyphenations and strips ignored characters.
     *
     * <p>A hyphen followed by any number of spaces/tabs and then a line break
     * ({@code "\n"}, {@code "\r"} or {@code "\r\n"}) is removed together with
     * that trailing whitespace and the line break, so the two word halves
     * become adjacent. Every other character is copied to the result unless it
     * occurs in {@code IGNORED_CHARS}. The result is trimmed, because leading
     * whitespace would otherwise yield an empty first token when the text is
     * later split on whitespace (e.g. {@code " a b c ".split("\\s+")} gives
     * {@code [, a, b, c]}). Trimming is done once at the end since removing
     * ignored characters may itself expose new leading/trailing whitespace.
     *
     * @param s the raw text; must not be {@code null}
     * @return the processed, trimmed text
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public String processText(String s) {
        final char[] text = s.toCharArray();
        final StringBuilder kept = new StringBuilder(text.length);

        int pos = 0;
        while (pos < text.length) {
            final char ch = text[pos];

            if (ch == '-') {
                // Pattern: HYPHEN (SPACE|TAB)* (NEWLINE|RETURN)
                final int resumeAt = endOfHyphenatedLineBreak(text, pos + 1);
                if (resumeAt >= 0) {
                    // Drop hyphen, trailing blanks and the line break in one go.
                    pos = resumeAt;
                    continue;
                }
            }

            // NOTE: the lookup is linear in the number of ignored characters;
            // a hashed set would make it O(1) if this ever becomes a hot spot.
            if (IGNORED_CHARS.indexOf(ch) == -1) {
                kept.append(ch);
            }
            pos++;
        }

        return kept.toString().trim();
    }

    /**
     * Checks whether the text starting at {@code from} consists of optional
     * spaces/tabs followed by a line break.
     *
     * @param text the characters being processed
     * @param from index of the first character after a hyphen
     * @return the index of the first character after the line break, or
     *         {@code -1} when no line break follows (including when the end of
     *         the text is reached first)
     */
    private static int endOfHyphenatedLineBreak(char[] text, int from) {
        int j = from;
        // Bounds check is essential: a hyphen may be followed by blanks only.
        while (j < text.length && (text[j] == ' ' || text[j] == '\t')) {
            j++;
        }
        if (j >= text.length) {
            return -1;
        }
        if (text[j] == '\n') {
            return j + 1;
        }
        if (text[j] == '\r') {
            // Treat CRLF as a single line break so the '\n' is not left behind.
            final boolean crlf = (j + 1 < text.length) && text[j + 1] == '\n';
            return crlf ? j + 2 : j + 1;
        }
        return -1;
    }

    /**
     * Tells whether a character is kept by this preprocessor.
     *
     * @param a the character to test
     * @return {@code true} if {@code a} does not occur in {@code IGNORED_CHARS}
     */
    public boolean isValidChar(char a) {
        return IGNORED_CHARS.indexOf(a) == -1;
    }
}