| /* GENERATED SOURCE. DO NOT MODIFY. */ |
| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2009-2015, Google, International Business Machines Corporation |
| * and others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| package android.icu.impl; |
| |
| import java.io.BufferedReader; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.UnsupportedEncodingException; |
| import java.text.ParsePosition; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.regex.Pattern; |
| |
| import android.icu.text.StringTransform; |
| import android.icu.text.SymbolTable; |
| import android.icu.text.UnicodeSet; |
| import android.icu.util.Freezable; |
| |
| /** |
| * Contains utilities to supplement the JDK Regex, since it doesn't handle |
| * Unicode well. |
| * |
| * <p>TODO: Move to android.icu.dev.somewhere. |
| * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools. |
| * |
| * @author markdavis |
| * @hide Only a subset of ICU is exposed in Android |
| */ |
| public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform { |
| private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})"); |
| |
| // Note: we don't currently have any state, but intend to in the future, |
| // particularly for the regex style supported. |
| |
| private SymbolTable symbolTable; |
| |
| /** |
| * Set the symbol table for internal processing |
| * @hide draft / provisional / internal are hidden on Android |
| */ |
| public SymbolTable getSymbolTable() { |
| return symbolTable; |
| } |
| |
| /** |
| * Get the symbol table for internal processing |
| * @hide draft / provisional / internal are hidden on Android |
| */ |
| public UnicodeRegex setSymbolTable(SymbolTable symbolTable) { |
| this.symbolTable = symbolTable; |
| return this; |
| } |
| |
| /** |
| * Adds full Unicode property support, with the latest version of Unicode, |
| * to Java Regex, bringing it up to Level 1 (see |
| * http://www.unicode.org/reports/tr18/). It does this by preprocessing the |
| * regex pattern string and interpreting the character classes (\p{...}, |
| * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With |
| * this utility, Java regex expressions can be updated to work with the |
| * latest version of Unicode, and with all Unicode properties. Note that the |
| * UnicodeSet syntax has not yet, however, been updated to be completely |
| * consistent with Java regex, so be careful of the differences. |
| * <p>Not thread-safe; create a separate copy for different threads. |
| * <p>In the future, we may extend this to support other regex packages. |
| * |
| * @param regex A modified Java regex pattern, as in the input to |
| * Pattern.compile(), except that all "character classes" are |
| * processed as if they were UnicodeSet patterns. Example: |
| * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. |
| * @return A processed Java regex pattern, suitable for input to |
| * Pattern.compile(). |
| */ |
| @Override |
| public String transform(String regex) { |
| StringBuilder result = new StringBuilder(); |
| UnicodeSet temp = new UnicodeSet(); |
| ParsePosition pos = new ParsePosition(0); |
| int state = 0; // 1 = after \ |
| |
| // We add each character unmodified to the output, unless we have a |
| // UnicodeSet. Note that we don't worry about supplementary characters, |
| // since none of the syntax uses them. |
| |
| for (int i = 0; i < regex.length(); ++i) { |
| // look for UnicodeSets, allowing for quoting with \ and \Q |
| char ch = regex.charAt(i); |
| switch (state) { |
| case 0: // we only care about \, and '['. |
| if (ch == '\\') { |
| if (UnicodeSet.resemblesPattern(regex, i)) { |
| // should only happen with \p |
| i = processSet(regex, i, result, temp, pos); |
| continue; |
| } |
| state = 1; |
| } else if (ch == '[') { |
| // if we have what looks like a UnicodeSet |
| if (UnicodeSet.resemblesPattern(regex, i)) { |
| i = processSet(regex, i, result, temp, pos); |
| continue; |
| } |
| } |
| break; |
| |
| case 1: // we are after a \ |
| if (ch == 'Q') { |
| state = 2; |
| } else { |
| state = 0; |
| } |
| break; |
| |
| case 2: // we are in a \Q... |
| if (ch == '\\') { |
| state = 3; |
| } |
| break; |
| |
| case 3: // we are in a \Q...\ |
| if (ch == 'E') { |
| state = 0; |
| } else if (ch != '\\') { |
| state = 2; |
| } |
| break; |
| } |
| result.append(ch); |
| } |
| return result.toString(); |
| } |
| |
| /** |
| * Convenience static function, using standard parameters. |
| * @param regex as in process() |
| * @return processed regex pattern, as in process() |
| */ |
| public static String fix(String regex) { |
| return STANDARD.transform(regex); |
| } |
| |
| /** |
| * Compile a regex string, after processing by fix(...). |
| * |
| * @param regex Raw regex pattern, as in fix(...). |
| * @return Pattern |
| */ |
| public static Pattern compile(String regex) { |
| return Pattern.compile(STANDARD.transform(regex)); |
| } |
| |
| /** |
| * Compile a regex string, after processing by fix(...). |
| * |
| * @param regex Raw regex pattern, as in fix(...). |
| * @return Pattern |
| */ |
| public static Pattern compile(String regex, int options) { |
| return Pattern.compile(STANDARD.transform(regex), options); |
| } |
| |
| /** |
| * Compile a composed string from a set of BNF lines; see the List version for more information. |
| * |
| * @param bnfLines Series of BNF lines. |
| * @return Pattern |
| */ |
| public String compileBnf(String bnfLines) { |
| return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n"))); |
| } |
| |
| /** |
| * Compile a composed string from a set of BNF lines, such as for composing a regex |
| * expression. The lines can be in any order, but there must not be any |
| * cycles. The result can be used as input for fix(). |
| * <p> |
| * Example: |
| * <pre> |
| * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?; |
| * scheme = reserved+; |
| * host = // reserved+; |
| * query = [\\=reserved]+; |
| * fragment = reserved+; |
| * reserved = [[:ascii:][:alphabetic:]]; |
| * </pre> |
| * <p> |
| * Caveats: at this point the parsing is simple; for example, # cannot be |
| * quoted (use \\u0023); you can set it to null to disable. |
| * The equality sign and a few others can be reset with |
| * setBnfX(). |
| * |
| * @param lines Series of lines that represent a BNF expression. The lines contain |
| * a series of statements that of the form x=y;. A statement can take |
| * multiple lines, but there can't be multiple statements on a line. |
| * A hash quotes to the end of the line. |
| * @return Pattern |
| */ |
| public String compileBnf(List<String> lines) { |
| Map<String, String> variables = getVariables(lines); |
| Set<String> unused = new LinkedHashSet<>(variables.keySet()); |
| // brute force replacement; do twice to allow for different order |
| // later on can optimize |
| for (int i = 0; i < 2; ++i) { |
| for (Entry<String, String> entry : variables.entrySet()) { |
| String variable = entry.getKey(), |
| definition = entry.getValue(); |
| |
| for (Entry<String, String> entry2 : variables.entrySet()) { |
| String variable2 = entry2.getKey(), |
| definition2 = entry2.getValue(); |
| if (variable.equals(variable2)) { |
| continue; |
| } |
| String altered2 = definition2.replace(variable, definition); |
| if (!altered2.equals(definition2)) { |
| unused.remove(variable); |
| variables.put(variable2, altered2); |
| // if (log != null) { |
| // try { |
| // log.append(variable2 + "=" + altered2 + ";"); |
| // } catch (IOException e) { |
| // throw (IllegalArgumentException) new IllegalArgumentException().initCause(e); |
| // } |
| // } |
| } |
| } |
| } |
| } |
| if (unused.size() != 1) { |
| throw new IllegalArgumentException("Not a single root: " + unused); |
| } |
| return variables.get(unused.iterator().next()); |
| } |
| |
| public String getBnfCommentString() { |
| return bnfCommentString; |
| } |
| |
| public void setBnfCommentString(String bnfCommentString) { |
| this.bnfCommentString = bnfCommentString; |
| } |
| |
| public String getBnfVariableInfix() { |
| return bnfVariableInfix; |
| } |
| |
| public void setBnfVariableInfix(String bnfVariableInfix) { |
| this.bnfVariableInfix = bnfVariableInfix; |
| } |
| |
| public String getBnfLineSeparator() { |
| return bnfLineSeparator; |
| } |
| |
| public void setBnfLineSeparator(String bnfLineSeparator) { |
| this.bnfLineSeparator = bnfLineSeparator; |
| } |
| |
| /** |
| * Utility for loading lines from a file. |
| * @param result The result of the appended lines. |
| * @param file The file to have an input stream. |
| * @param encoding if null, then UTF-8 |
| * @return filled list |
| * @throws IOException If there were problems opening the file for input stream. |
| */ |
| public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException { |
| InputStream is = new FileInputStream(file); |
| try { |
| return appendLines(result, is, encoding); |
| } finally { |
| is.close(); |
| } |
| } |
| |
| /** |
| * Utility for loading lines from a UTF8 file. |
| * @param result The result of the appended lines. |
| * @param inputStream The input stream. |
| * @param encoding if null, then UTF-8 |
| * @return filled list |
| * @throws IOException If there were problems opening the input stream for reading. |
| */ |
| public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding) |
| throws UnsupportedEncodingException, IOException { |
| BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding)); |
| while (true) { |
| String line = in.readLine(); |
| if (line == null) break; |
| result.add(line); |
| } |
| return result; |
| } |
| |
| |
| |
| /* (non-Javadoc) |
| * @see android.icu.util.Freezable#cloneAsThawed() |
| */ |
| @Override |
| public UnicodeRegex cloneAsThawed() { |
| // TODO Auto-generated method stub |
| try { |
| return (UnicodeRegex)clone(); |
| } catch (CloneNotSupportedException e) { |
| throw new IllegalArgumentException(); // should never happen |
| } |
| } |
| |
| /* (non-Javadoc) |
| * @see android.icu.util.Freezable#freeze() |
| */ |
| @Override |
| public UnicodeRegex freeze() { |
| // no action needed now. |
| return this; |
| } |
| |
| /* (non-Javadoc) |
| * @see android.icu.util.Freezable#isFrozen() |
| */ |
| @Override |
| public boolean isFrozen() { |
| // at this point, always true |
| return true; |
| } |
| |
| // ===== PRIVATES ===== |
| |
| private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { |
| try { |
| pos.setIndex(i); |
| UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); |
| x.complement().complement(); // hack to fix toPattern |
| String pattern = x.toPattern(false); |
| // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex. |
| if (pattern.contains("\\U")) { |
| pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}"); |
| } |
| result.append(pattern); |
| i = pos.getIndex() - 1; // allow for the loop increment |
| return i; |
| } catch (Exception e) { |
| throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e); |
| } |
| } |
| |
| private static final UnicodeRegex STANDARD = new UnicodeRegex(); |
| private String bnfCommentString = "#"; |
| private String bnfVariableInfix = "="; |
| private String bnfLineSeparator = "\n"; |
| // private Appendable log = null; |
| |
| private Comparator<Object> LongestFirst = new Comparator<Object>() { |
| @Override |
| public int compare(Object obj0, Object obj1) { |
| String arg0 = obj0.toString(); |
| String arg1 = obj1.toString(); |
| int len0 = arg0.length(); |
| int len1 = arg1.length(); |
| if (len0 != len1) return len1 - len0; |
| return arg0.compareTo(arg1); |
| } |
| }; |
| |
| private Map<String, String> getVariables(List<String> lines) { |
| Map<String, String> variables = new TreeMap<>(LongestFirst); |
| String variable = null; |
| StringBuffer definition = new StringBuffer(); |
| int count = 0; |
| for (String line : lines) { |
| ++count; |
| // remove initial bom, comments |
| if (line.length() == 0) continue; |
| if (line.charAt(0) == '\uFEFF') line = line.substring(1); |
| |
| if (bnfCommentString != null) { |
| int hashPos = line.indexOf(bnfCommentString); |
| if (hashPos >= 0) line = line.substring(0, hashPos); |
| } |
| String trimline = line.trim(); |
| if (trimline.length() == 0) continue; |
| |
| // String[] lineParts = line.split(";"); |
| String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " "); |
| if (linePart.trim().length() == 0) continue; |
| boolean terminated = trimline.endsWith(";"); |
| if (terminated) { |
| linePart = linePart.substring(0,linePart.lastIndexOf(';')); |
| } |
| int equalsPos = linePart.indexOf(bnfVariableInfix); |
| if (equalsPos >= 0) { |
| if (variable != null) { |
| throw new IllegalArgumentException("Missing ';' before " + count + ") " + line); |
| } |
| variable = linePart.substring(0,equalsPos).trim(); |
| if (variables.containsKey(variable)) { |
| throw new IllegalArgumentException("Duplicate variable definition in " + line); |
| } |
| definition.append(linePart.substring(equalsPos+1).trim()); |
| } else { // no equals, so |
| if (variable == null) { |
| throw new IllegalArgumentException("Missing '=' at " + count + ") " + line); |
| } |
| definition.append(bnfLineSeparator).append(linePart); |
| } |
| // we are terminated if i is not at the end, or the line ends with a ; |
| if (terminated) { |
| variables.put(variable, definition.toString()); |
| variable = null; // signal we have no variable |
| definition.setLength(0); |
| } |
| } |
| if (variable != null) { |
| throw new IllegalArgumentException("Missing ';' at end"); |
| } |
| return variables; |
| } |
| } |