Add more split characters (°, ß, §) to the tokenizer and strip leading "(" / trailing ")" from words (tested)

This commit is contained in:
Daniella / Tove 2022-06-24 18:04:49 +02:00
parent 8075a2f44e
commit d6461db848

View file

@@ -1719,15 +1719,25 @@ public class ISBPL {
word.append('"'); word.append('"');
isInString = true; isInString = true;
} }
else if(c == ' ') { else if(c == ' ' || c == '°' || c == 'ß' || c == '§') {
words.add(word.toString()); String w = word.toString();
while(w.startsWith("(") && w.length() > 1)
w = w.substring(1);
while(w.endsWith(")") && w.length() > 1)
w = w.substring(0, w.length() - 1);
words.add(w);
word = new StringBuilder(); word = new StringBuilder();
} }
else { else {
word.append(c); word.append(c);
} }
} }
words.add(word.toString()); String w = word.toString();
while(w.startsWith("(") && w.length() > 1)
w = w.substring(1);
while(w.endsWith(")") && w.length() > 1)
w = w.substring(0, w.length() - 1);
words.add(w);
ArrayList<String> cleanWords = new ArrayList<>(); ArrayList<String> cleanWords = new ArrayList<>();
for(int i = 0; i < words.size(); i++) { for(int i = 0; i < words.size(); i++) {