| /* GENERATED SOURCE. DO NOT MODIFY. */ |
| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2012-2016, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package android.icu.impl.breakiter; |
| |
| import static android.icu.impl.CharacterIteration.DONE32; |
| import static android.icu.impl.CharacterIteration.current32; |
| import static android.icu.impl.CharacterIteration.next32; |
| import static android.icu.impl.CharacterIteration.previous32; |
| |
| import java.io.IOException; |
| import java.text.CharacterIterator; |
| import java.util.HashSet; |
| |
| import android.icu.impl.Assert; |
| import android.icu.impl.ICUConfig; |
| import android.icu.impl.ICUData; |
| import android.icu.text.Normalizer; |
| import android.icu.text.UnicodeSet; |
| import android.icu.text.UnicodeSetIterator; |
| import android.icu.util.UResourceBundle; |
| import android.icu.util.UResourceBundleIterator; |
| |
| /** |
| * @hide Only a subset of ICU is exposed in Android |
| */ |
| public class CjkBreakEngine extends DictionaryBreakEngine { |
| private UnicodeSet fHangulWordSet; |
| private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; |
| private UnicodeSet fClosePunctuationSet; |
| private DictionaryMatcher fDictionary = null; |
| private HashSet<String> fSkipSet; |
| private MlBreakEngine fMlBreakEngine; |
| private boolean isCj = false; |
| |
| public CjkBreakEngine(boolean korean) throws IOException { |
| fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]"); |
| fHangulWordSet.freeze(); |
| // Digit, open punctuation and Alphabetic characters. |
| fDigitOrOpenPunctuationOrAlphabetSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"); |
| fDigitOrOpenPunctuationOrAlphabetSet.freeze(); |
| |
| fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"); |
| fClosePunctuationSet.freeze(); |
| fSkipSet = new HashSet<String>(); |
| |
| fDictionary = DictionaryData.loadDictionaryFor("Hira"); |
| if (korean) { |
| setCharacters(fHangulWordSet); |
| } else { //Chinese and Japanese |
| isCj = true; |
| UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"); |
| setCharacters(cjSet); |
| if (Boolean.parseBoolean( |
| ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { |
| fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet, |
| fClosePunctuationSet); |
| } else { |
| initializeJapanesePhraseParamater(); |
| } |
| } |
| } |
| |
| private void initializeJapanesePhraseParamater() { |
| loadJapaneseExtensions(); |
| loadHiragana(); |
| } |
| |
| private void loadJapaneseExtensions() { |
| UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja"); |
| final String tag = "extensions"; |
| UResourceBundle bundle = rb.get(tag); |
| UResourceBundleIterator iterator = bundle.getIterator(); |
| while (iterator.hasNext()) { |
| fSkipSet.add(iterator.nextString()); |
| } |
| } |
| |
| private void loadHiragana() { |
| UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]"); |
| hiraganaWordSet.freeze(); |
| UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet); |
| while (iterator.next()) { |
| fSkipSet.add(iterator.getString()); |
| } |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (obj instanceof CjkBreakEngine) { |
| CjkBreakEngine other = (CjkBreakEngine)obj; |
| return this.fSet.equals(other.fSet); |
| } |
| return false; |
| } |
| |
| @Override |
| public int hashCode() { |
| return getClass().hashCode(); |
| } |
| |
| private static final int kMaxKatakanaLength = 8; |
| private static final int kMaxKatakanaGroupLength = 20; |
| private static final int maxSnlp = 255; |
| private static final int kint32max = Integer.MAX_VALUE; |
| private static int getKatakanaCost(int wordlength) { |
| int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; |
| return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; |
| } |
| |
| private static boolean isKatakana(int value) { |
| return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || |
| (value >= 0xFF66 && value <= 0xFF9F); |
| } |
| |
| @Override |
| public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, |
| DequeI foundBreaks, boolean isPhraseBreaking) { |
| if (startPos >= endPos) { |
| return 0; |
| } |
| |
| inText.setIndex(startPos); |
| |
| int inputLength = endPos - startPos; |
| int[] charPositions = new int[inputLength + 1]; |
| StringBuffer s = new StringBuffer(""); |
| inText.setIndex(startPos); |
| while (inText.getIndex() < endPos) { |
| s.append(inText.current()); |
| inText.next(); |
| } |
| String prenormstr = s.toString(); |
| boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || |
| Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); |
| CharacterIterator text; |
| int numCodePts = 0; |
| if (isNormalized) { |
| text = new java.text.StringCharacterIterator(prenormstr); |
| int index = 0; |
| charPositions[0] = 0; |
| while (index < prenormstr.length()) { |
| int codepoint = prenormstr.codePointAt(index); |
| index += Character.charCount(codepoint); |
| numCodePts++; |
| charPositions[numCodePts] = index; |
| } |
| } else { |
| String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); |
| text = new java.text.StringCharacterIterator(normStr); |
| charPositions = new int[normStr.length() + 1]; |
| Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); |
| int index = 0; |
| charPositions[0] = 0; |
| while (index < normalizer.endIndex()) { |
| normalizer.next(); |
| numCodePts++; |
| index = normalizer.getIndex(); |
| charPositions[numCodePts] = index; |
| } |
| } |
| // Use ML phrase breaking |
| if (Boolean.parseBoolean( |
| ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { |
| // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja. |
| if (isPhraseBreaking && isCj) { |
| return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text, |
| numCodePts, charPositions, foundBreaks); |
| } |
| } |
| |
| // From here on out, do the algorithm. Note that our indices |
| // refer to indices within the normalized string. |
| int[] bestSnlp = new int[numCodePts + 1]; |
| bestSnlp[0] = 0; |
| for (int i = 1; i <= numCodePts; i++) { |
| bestSnlp[i] = kint32max; |
| } |
| |
| int[] prev = new int[numCodePts + 1]; |
| for (int i = 0; i <= numCodePts; i++) { |
| prev[i] = -1; |
| } |
| |
| final int maxWordSize = 20; |
| int values[] = new int[numCodePts]; |
| int lengths[] = new int[numCodePts]; |
| // dynamic programming to find the best segmentation |
| |
| // In outer loop, i is the code point index, |
| // ix is the corresponding code unit index. |
| // They differ when the string contains supplementary characters. |
| int ix = 0; |
| text.setIndex(ix); |
| boolean is_prev_katakana = false; |
| for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) { |
| ix = text.getIndex(); |
| if (bestSnlp[i] == kint32max) { |
| continue; |
| } |
| |
| int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i); |
| int[] count_ = new int[1]; |
| fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); |
| int count = count_[0]; |
| |
| // if there are no single character matches found in the dictionary |
| // starting with this character, treat character as a 1-character word |
| // with the highest value possible (i.e. the least likely to occur). |
| // Exclude Korean characters from this treatment, as they should be |
| // left together by default. |
| text.setIndex(ix); // fDictionary.matches() advances the text position; undo that. |
| if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { |
| values[count] = maxSnlp; |
| lengths[count] = 1; |
| count++; |
| } |
| |
| for (int j = 0; j < count; j++) { |
| int newSnlp = bestSnlp[i] + values[j]; |
| if (newSnlp < bestSnlp[lengths[j] + i]) { |
| bestSnlp[lengths[j] + i] = newSnlp; |
| prev[lengths[j] + i] = i; |
| } |
| } |
| |
| // In Japanese, single-character Katakana words are pretty rare. |
| // So we apply the following heuristic to Katakana: any continuous |
| // run of Katakana characters is considered a candidate word with |
| // a default cost specified in the katakanaCost table according |
| // to its length. |
| boolean is_katakana = isKatakana(current32(text)); |
| if (!is_prev_katakana && is_katakana) { |
| int j = i + 1; |
| next32(text); |
| while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { |
| next32(text); |
| ++j; |
| } |
| |
| if ((j - i) < kMaxKatakanaGroupLength) { |
| int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); |
| if (newSnlp < bestSnlp[j]) { |
| bestSnlp[j] = newSnlp; |
| prev[j] = i; |
| } |
| } |
| } |
| is_prev_katakana = is_katakana; |
| } |
| |
| int t_boundary[] = new int[numCodePts + 1]; |
| int numBreaks = 0; |
| if (bestSnlp[numCodePts] == kint32max) { |
| t_boundary[numBreaks] = numCodePts; |
| numBreaks++; |
| } else if (isPhraseBreaking) { |
| t_boundary[numBreaks] = numCodePts; |
| numBreaks++; |
| int prevIdx = numCodePts; |
| int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0; |
| for (int i = prev[numCodePts]; i > 0; i = prev[i]) { |
| codeUnitIdx = prenormstr.offsetByCodePoints(0, i); |
| prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx); |
| length = prevCodeUnitIdx - codeUnitIdx; |
| prevIdx = i; |
| String pattern = getPatternFromText(text, s, codeUnitIdx, length); |
| // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana |
| // characters don't occur. |
| text.setIndex(codeUnitIdx); |
| if (!fSkipSet.contains(pattern) |
| && (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) { |
| t_boundary[numBreaks] = i; |
| numBreaks++; |
| } |
| } |
| } else { |
| for (int i = numCodePts; i > 0; i = prev[i]) { |
| t_boundary[numBreaks] = i; |
| numBreaks++; |
| } |
| Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); |
| } |
| |
| if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { |
| t_boundary[numBreaks++] = 0; |
| } |
| |
| int correctedNumBreaks = 0; |
| int previous = -1; |
| for (int i = numBreaks - 1; i >= 0; i--) { |
| int pos = charPositions[t_boundary[i]] + startPos; |
| // In phrase breaking, there has to be a breakpoint between Cj character and close |
| // punctuation. |
| // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 |
| inText.setIndex(pos); |
| if (pos > previous) { |
| if (pos != startPos |
| || (isPhraseBreaking && pos > 0 |
| && fClosePunctuationSet.contains(previous32(inText)))) { |
| foundBreaks.push(charPositions[t_boundary[i]] + startPos); |
| correctedNumBreaks++; |
| } |
| } |
| previous = pos; |
| } |
| |
| if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { |
| // In phrase breaking, there has to be a breakpoint between Cj character and |
| // the number/open punctuation. |
| // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 |
| // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 |
| // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U |
| if (isPhraseBreaking) { |
| inText.setIndex(endPos); |
| int current = current32(inText); |
| if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) { |
| foundBreaks.pop(); |
| correctedNumBreaks--; |
| } |
| } else { |
| foundBreaks.pop(); |
| correctedNumBreaks--; |
| } |
| } |
| if (!foundBreaks.isEmpty()) |
| inText.setIndex(foundBreaks.peek()); |
| return correctedNumBreaks; |
| } |
| |
| private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start, |
| int length) { |
| sb.setLength(0); |
| if (length > 0) { |
| text.setIndex(start); |
| sb.append(text.current()); |
| for (int i = 1; i < length; i++) { |
| sb.append(text.next()); |
| } |
| } |
| return sb.toString(); |
| } |
| } |