diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Grapheme.java --- a/src/java.base/share/classes/java/util/regex/Grapheme.java Fri Mar 20 17:37:52 2020 -0700 +++ b/src/java.base/share/classes/java/util/regex/Grapheme.java Sat Mar 21 15:51:11 2020 +0100 @@ -30,21 +30,8 @@ final class Grapheme { /** - * Determines if there is an extended grapheme cluster boundary between two - * continuing characters {@code cp1} and {@code cp2}. - *

- * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification - * for the extended grapheme cluster boundary rules - *

- * Note: this method does not take care of stateful breaking. - */ - static boolean isBoundary(int cp1, int cp2) { - return rules[getType(cp1)][getType(cp2)]; - } - - /** - * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes - * the start of the char sequence is a boundary. + * Look for the next extended grapheme cluster boundary in a CharSequence. + * It assumes the start of the char sequence at offset {@code off} is a boundary. *

* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification * for the extended grapheme cluster boundary rules. The following implementation @@ -54,20 +41,19 @@ * @param src the {@code CharSequence} to be scanned * @param off offset to start looking for the next boundary in the src * @param limit limit offset in the src (exclusive) - * @return the next possible boundary + * @return the next grapheme boundary */ static int nextBoundary(CharSequence src, int off, int limit) { Objects.checkFromToIndex(off, limit, src.length()); - int ch0 = Character.codePointAt(src, 0); - int ret = Character.charCount(ch0); - int ch1; + int ch0 = Character.codePointAt(src, off); + int ret = off + Character.charCount(ch0); // indicates whether gb11 or gb12 is underway int t0 = getGraphemeType(ch0); int riCount = t0 == RI ? 1 : 0; boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; while (ret < limit) { - ch1 = Character.codePointAt(src, ret); + int ch1 = Character.codePointAt(src, ret); int t1 = getGraphemeType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { @@ -177,6 +163,7 @@ cp == 0xAA7B || cp == 0xAA7D; } + @SuppressWarnings("fallthrough") private static int getGraphemeType(int cp) { if (cp < 0x007F) { // ASCII if (cp < 32) { // Control characters @@ -188,11 +175,7 @@ } return OTHER; } - return getType(cp); - } - @SuppressWarnings("fallthrough") - private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { return EXTENDED_PICTOGRAPHIC; } diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Pattern.java --- a/src/java.base/share/classes/java/util/regex/Pattern.java Fri Mar 20 17:37:52 2020 -0700 +++ b/src/java.base/share/classes/java/util/regex/Pattern.java Sat Mar 21 15:51:11 2020 +0100 @@ -4035,17 +4035,8 @@ if (i < matcher.to) { int ch0 = Character.codePointAt(seq, i); int n = Character.charCount(ch0); - int j = i + n; - // Fast check if it's necessary to call Normalizer; - // testing Grapheme.isBoundary is enough for this case - while (j < matcher.to) { - int ch1 = Character.codePointAt(seq, j); - if (Grapheme.isBoundary(ch0, ch1)) - break; - ch0 = ch1; - j += Character.charCount(ch1); - } - if (i + n == j) { // single, assume nfc cp + int j = Grapheme.nextBoundary(seq, i, matcher.to); + if (i + n == j) { // single cp grapheme, assume nfc if (predicate.is(ch0)) return next.match(matcher, j, seq); } else { @@ -4109,13 +4100,12 @@ endIndex = matcher.getTextLength(); } if (i == startIndex) { - return next.match(matcher, i, seq); - } - if (i < endIndex) { - if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || - Grapheme.nextBoundary(seq, - i - Character.charCount(Character.codePointBefore(seq, i)), - i + Character.charCount(Character.codePointAt(seq, i))) > i) { + // continue with return below + } else if (i < endIndex) { + if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) { + return false; + } + if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) { return false; } } else { diff -r 84215fa115fc test/jdk/java/util/regex/GraphemeTimePerCharTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/jdk/java/util/regex/GraphemeTimePerCharTest.java Sat Mar 21 15:51:11 2020 +0100 @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.testng.annotations.*; +import static org.testng.Assert.*; + +/** + * @test + * @bug 8241055 + * @run testng GraphemeTimePerCharTest + * @summary Measures {@link java.util.regex.Grapheme#nextBoundary} performance + * depending on input character sequence size. + */ +public class GraphemeTimePerCharTest { + + static final Pattern GRAPHEME_REGEX = Pattern.compile("\\X"); + + void matchPerformance(String value) { + Matcher charMatcher = GRAPHEME_REGEX.matcher(value); + while (charMatcher.find()) charMatcher.group(); + } + + @Test + public void testDependsLinearlyOnInputSize() throws Exception { + matchPerformance("warmup"); + + final int STEPS = 50; // number of measurements + final int LIMIT = 5000000; // maximum string size tested + + double[][] strTimesByNChars = new double[STEPS][]; + double[][] chrTimesByNChars = new double[STEPS][]; + System.out.println( + "number of chars, time string [ms], time per char [ns]"); + for (int i = 0; i < STEPS; i++) { + int chunkSize = (i + 1) * LIMIT / STEPS; + String value = "x".repeat(chunkSize); + long start = System.nanoTime(); + matchPerformance(value); + long end = System.nanoTime(); + long t = end - start; + strTimesByNChars[i] = new double[] { chunkSize, t }; + double timePerChar = (double) t / (double) chunkSize; + chrTimesByNChars[i] = new double[] { chunkSize, timePerChar }; + System.out.printf("%7d %6d %10.3f%n", + chunkSize, (t / 1000000), timePerChar); + } + double cStr = correlate(strTimesByNChars); + double cChr = correlate(chrTimesByNChars); + System.out.println("correlation factor between string size and " + + "total time (expected 1): " + cStr); + System.out.println("correlation factor between string size and " + + "time per character (expected 0 or less): " + cChr); + assertTrue(cChr < .5 && .5 < cStr); + } + + double correlate(double[][] data) { + double sumx = 0, sumy = 0; + for (int i = 0; i < data.length; i++) { + sumx += data[i][0]; + sumy += data[i][1]; + } + double avgx = sumx / data.length; + double avgy = sumy / data.length; + double sxy = 0, sxx = 0, syy = 0; + for (int i = 0; i < data.length; i++) { + sxy += (data[i][0] - avgx) * (data[i][1] - avgy); + sxx += (data[i][0] - avgx) * (data[i][0] - avgx); + syy += (data[i][1] - avgy) * (data[i][1] - avgy); + } + return sxy / Math.sqrt(sxx * syy); + } + +} diff -r 84215fa115fc test/jdk/java/util/regex/RegExTest.java --- a/test/jdk/java/util/regex/RegExTest.java Fri Mar 20 17:37:52 2020 -0700 +++ b/test/jdk/java/util/regex/RegExTest.java Sat Mar 21 15:51:11 2020 +0100 @@ -4796,48 +4796,97 @@ } private static void grapheme() throws Exception { + final int[] lineNumber = new int[1]; Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST), Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt"))) - .filter( ln -> ln.length() != 0 && !ln.startsWith("#") ) .forEach( ln -> { - ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); - // System.out.println(str); - String[] strs = ln.split("\u00f7|\u00d7"); - StringBuilder src = new StringBuilder(); - ArrayList graphemes = new ArrayList<>(); - StringBuilder buf = new StringBuilder(); - int offBk = 0; - for (String str : strs) { - if (str.length() == 0) // first empty str - continue; - int cp = Integer.parseInt(str, 16); - src.appendCodePoint(cp); - buf.appendCodePoint(cp); - offBk += (str.length() + 1); - if (ln.charAt(offBk) == '\u00f7') { // DIV - graphemes.add(buf.toString()); - buf = new StringBuilder(); + lineNumber[0]++; + if (ln.length() == 0 || ln.startsWith("#")) return; + ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", ""); + // System.out.println(str); + String[] strs = ln.split("\u00f7|\u00d7"); + StringBuilder src = new StringBuilder(); + ArrayList graphemes = new ArrayList<>(); + StringBuilder buf = new StringBuilder(); + int offBk = 0; + for (String str : strs) { + if (str.length() == 0) // first empty str + continue; + int cp = Integer.parseInt(str, 16); + src.appendCodePoint(cp); + buf.appendCodePoint(cp); + offBk += (str.length() + 1); + if (ln.charAt(offBk) == '\u00f7') { // DIV + graphemes.add(buf.toString()); + buf = new StringBuilder(); + } } - } - Pattern p = Pattern.compile("\\X"); - Matcher m = p.matcher(src.toString()); - Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); - for (String g : graphemes) { - // System.out.printf(" grapheme:=[%s]%n", g); - // (1) test \\X directly - if (!m.find() || !m.group().equals(g)) { - System.out.println("Failed \\X [" + ln + "] : " + g); + Pattern p = Pattern.compile("\\X"); + // (1) test \X directly + Matcher m = p.matcher(src.toString()); + for (String g : graphemes) { + // System.out.printf(" grapheme:=[%s]%n", g); + String group = null; + if (!m.find() || !(group = m.group()).equals(g)) { + System.out.println("Failed pattern \\X [" + ln + "] : " + + "expected: " + g + " - actual: " + group + + "(line " + lineNumber[0] + ")"); + failCount++; + } + } + if (m.find()) { failCount++; } - // (2) test \\b{g} + \\X via Scanner - boolean hasNext = s.hasNext(p); - // if (!s.hasNext() || !s.next().equals(next)) { - if (!s.hasNext(p) || !s.next(p).equals(g)) { - System.out.println("Failed b{g} [" + ln + "] : " + g); + // test \b{g} without \X via Pattern + Pattern pbg = Pattern.compile("\\b{g}"); + m = pbg.matcher(src.toString()); + m.find(); + int prev = m.end(); + for (String g : graphemes) { + String group = null; + if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) { + System.out.println("Failed pattern \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + group + + "(line " + lineNumber[0] + ")"); + failCount++; + } + if (!"".equals(m.group())) { + failCount++; + } + prev = m.end(); + } + if (m.find()) { failCount++; } - } - }); + // (2) test \b{g} + \X via Scanner + Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next + + " (line " + lineNumber[0] + ")"); + failCount++; + } + } + if (s.hasNext(p)) { + failCount++; + } + // test \b{g} without \X via Scanner + s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext() || !(next = s.next()).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next + + " (line " + lineNumber[0] + ")"); + failCount++; + } + } + if (s.hasNext()) { + failCount++; + } + }); // some sanity checks if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() || !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||