diff -r 563d6852da45 src/java.base/share/classes/java/util/jar/Attributes.java --- a/src/java.base/share/classes/java/util/jar/Attributes.java Mon Mar 09 21:43:01 2020 +0100 +++ b/src/java.base/share/classes/java/util/jar/Attributes.java Tue Mar 10 08:08:19 2020 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -301,7 +301,6 @@ /* * Writes the current attributes to the specified data output stream. - * XXX Need to handle UTF8 values and break up lines longer than 72 bytes */ void write(DataOutputStream out) throws IOException { StringBuilder buffer = new StringBuilder(72); @@ -310,7 +309,7 @@ buffer.append(e.getKey().toString()); buffer.append(": "); buffer.append(e.getValue()); - Manifest.println72(out, buffer.toString()); + Manifest.printLine72(out, buffer.toString()); } Manifest.println(out); // empty line after individual section } @@ -319,8 +318,6 @@ * Writes the current attributes to the specified data output stream, * make sure to write out the MANIFEST_VERSION or SIGNATURE_VERSION * attributes first. 
- * - * XXX Need to handle UTF8 values and break up lines longer than 72 bytes */ void writeMain(DataOutputStream out) throws IOException { StringBuilder buffer = new StringBuilder(72); @@ -350,7 +347,7 @@ buffer.append(name); buffer.append(": "); buffer.append(e.getValue()); - Manifest.println72(out, buffer.toString()); + Manifest.printLine72(out, buffer.toString()); } } diff -r 563d6852da45 src/java.base/share/classes/java/util/jar/Manifest.java --- a/src/java.base/share/classes/java/util/jar/Manifest.java Mon Mar 09 21:43:01 2020 +0100 +++ b/src/java.base/share/classes/java/util/jar/Manifest.java Tue Mar 10 08:08:19 2020 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,6 +32,8 @@ import java.io.OutputStream; import java.util.HashMap; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import sun.nio.cs.UTF_8; import sun.security.util.SecurityProperties; @@ -207,7 +209,7 @@ buffer.setLength(0); buffer.append("Name: "); buffer.append(e.getKey()); - println72(dos, buffer.toString()); + printLine72(dos, buffer.toString()); e.getValue().write(dos); } dos.flush(); @@ -216,7 +218,7 @@ /** * Adds line breaks to enforce a maximum of 72 bytes per line. * - * @deprecation Replaced with {@link #println72}. + * @deprecation Replaced with {@link #printLine72}. */ @Deprecated(since = "13") static void make72Safe(StringBuffer line) { @@ -229,31 +231,90 @@ } } + private static final Pattern CHARACTER_REGEX = Pattern.compile("\\X"); + /** * Writes {@code line} to {@code out} with line breaks and continuation - * spaces within the limits of 72 bytes of contents per line followed - * by a line break. 
+ * spaces so that each line holds at most 72 bytes of content, followed by + * a line break. The UTF-8 bytes of a character are kept together on one + * line, both when a single code point is encoded with more than one byte + * and when a base character is followed by combining diacritical marks. + *

+ * Combining diacritical marks may be separated from the associated base + * character or other combining diacritical marks of that base character + * by a continuation line break ("{@code \r\n }") if the whole sequence of + * base character and all the combining diacritical marks belonging to it + * exceed 71 bytes in their binary form encoded with UTF-8. This limit is + * only 71 bytes rather than 72 because continuation lines start with a + * space that uses the first byte of the 72 bytes each line can hold up to + * and the first line provides even less space for the value because it + * starts with the name ({@link #printChar72}). */ - static void println72(OutputStream out, String line) throws IOException { - if (!line.isEmpty()) { - byte[] lineBytes = line.getBytes(UTF_8.INSTANCE); - int length = lineBytes.length; - // first line can hold one byte more than subsequent lines which - // start with a continuation line break space - out.write(lineBytes[0]); - int pos = 1; - while (length - pos > 71) { - out.write(lineBytes, pos, 71); - pos += 71; - println(out); - out.write(' '); - } - out.write(lineBytes, pos, length - pos); + static void printLine72(OutputStream out, String line) throws IOException { + int linePos = 0; // number of bytes already put out on current line + Matcher charMatcher = CHARACTER_REGEX.matcher(line); + while (charMatcher.find()) { + linePos = printChar72(out, linePos, charMatcher.group()); } println(out); } /** + * Breaks a string at code point boundaries within the limit of 72 bytes + * per line. 
+ */ + private static int printChar72(OutputStream out, int linePos, + String characterString) throws IOException { + byte[] characterBytes = characterString.getBytes(UTF_8.INSTANCE); + int characterLength = characterBytes.length; + int characterPos = 0; // number of bytes of current character + // already put out + + // Put out a break onto a new line if the character or rather combining + // character sequence does not fit on the current line anymore but fits + // on a new line. In other words, only if the current character does not + // fit on one whole line alone, fill the current line first before + // breaking inside of the combining character sequence onto a new line. + if (linePos + characterLength > 72 && characterLength < 72) { + println(out); + out.write(' '); + linePos = 1; + } + + // Break exceptionally large combining character sequences that don't + // fit on one line at code point boundaries. + int nextBreakPos; + while (characterLength - characterPos > (nextBreakPos = 72 - linePos)) { + while (isContinuation(characterBytes[characterPos + nextBreakPos])) { + nextBreakPos--; + } + out.write(characterBytes, characterPos, nextBreakPos); + characterPos += nextBreakPos; + println(out); + out.write(' '); + linePos = 1; + } + + int remainder = characterLength - characterPos; + out.write(characterBytes, characterPos, remainder); + return linePos + remainder; + } + + /** + * Returns {@code true} if the passed byte as parameter {@code b} + * is not the first (or only) byte of a Unicode character encoded in UTF-8 + * and {@code false} otherwise. + * + * @see + * RFC 3629 - UTF-8, a transformation format of ISO 10646 + * @see StringCoding#isNotContinuation(int) + * @see sun.nio.cs.UTF_8.Decoder#isNotContinuation(int) + */ + private static boolean isContinuation(byte b) { + return (b & 0xc0) == 0x80; + } + + /** * Writes a line break to {@code out}. 
*/ static void println(OutputStream out) throws IOException { diff -r 563d6852da45 src/java.base/share/classes/java/util/regex/Grapheme.java --- a/src/java.base/share/classes/java/util/regex/Grapheme.java Mon Mar 09 21:43:01 2020 +0100 +++ b/src/java.base/share/classes/java/util/regex/Grapheme.java Tue Mar 10 08:08:19 2020 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -30,19 +30,6 @@ final class Grapheme { /** - * Determines if there is an extended grapheme cluster boundary between two - * continuing characters {@code cp1} and {@code cp2}. - *

- * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification - * for the extended grapheme cluster boundary rules - *

- * Note: this method does not take care of stateful breaking. - */ - static boolean isBoundary(int cp1, int cp2) { - return rules[getType(cp1)][getType(cp2)]; - } - - /** * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes * the start of the char sequence is a boundary. *

@@ -59,15 +46,14 @@ static int nextBoundary(CharSequence src, int off, int limit) { Objects.checkFromToIndex(off, limit, src.length()); - int ch0 = Character.codePointAt(src, 0); - int ret = Character.charCount(ch0); - int ch1; + int ch0 = Character.codePointAt(src, off); + int ret = off + Character.charCount(ch0); // indicates whether gb11 or gb12 is underway int t0 = getGraphemeType(ch0); int riCount = t0 == RI ? 1 : 0; boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; while (ret < limit) { - ch1 = Character.codePointAt(src, ret); + int ch1 = Character.codePointAt(src, ret); int t1 = getGraphemeType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { @@ -177,7 +163,8 @@ cp == 0xAA7B || cp == 0xAA7D; } - private static int getGraphemeType(int cp) { + @SuppressWarnings("fallthrough") + static int getGraphemeType(int cp) { if (cp < 0x007F) { // ASCII if (cp < 32) { // Control characters if (cp == 0x000D) @@ -188,11 +175,7 @@ } return OTHER; } - return getType(cp); - } - @SuppressWarnings("fallthrough") - private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { return EXTENDED_PICTOGRAPHIC; } diff -r 563d6852da45 src/java.base/share/classes/java/util/regex/Pattern.java --- a/src/java.base/share/classes/java/util/regex/Pattern.java Mon Mar 09 21:43:01 2020 +0100 +++ b/src/java.base/share/classes/java/util/regex/Pattern.java Tue Mar 10 08:08:19 2020 +0100 @@ -4037,17 +4037,8 @@ if (i < matcher.to) { int ch0 = Character.codePointAt(seq, i); int n = Character.charCount(ch0); - int j = i + n; - // Fast check if it's necessary to call Normalizer; - // testing Grapheme.isBoundary is enough for this case - while (j < matcher.to) { - int ch1 = Character.codePointAt(seq, j); - if (Grapheme.isBoundary(ch0, ch1)) - break; - ch0 = ch1; - j += Character.charCount(ch1); - } - if (i + n == j) { // single, assume nfc cp + int j = Grapheme.nextBoundary(seq, i, matcher.to); + if (i + n == j) { // single cp grapheme, assume nfc if (predicate.is(ch0)) return 
next.match(matcher, j, seq); } else { @@ -4111,13 +4102,12 @@ endIndex = matcher.getTextLength(); } if (i == startIndex) { - return next.match(matcher, i, seq); - } - if (i < endIndex) { - if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || - Grapheme.nextBoundary(seq, - i - Character.charCount(Character.codePointBefore(seq, i)), - i + Character.charCount(Character.codePointAt(seq, i))) > i) { + // continue with return statment below + } else if (i < endIndex) { + if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) { + return false; + } + if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) { return false; } } else { diff -r 563d6852da45 test/jdk/java/util/jar/Manifest/PrintChar72.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/jdk/java/util/jar/Manifest/PrintChar72.java Tue Mar 10 08:08:19 2020 +0100 @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.jar.Attributes; +import java.util.jar.Manifest; +import java.util.jar.Attributes.Name; +import java.util.LinkedList; +import java.util.List; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import org.testng.annotations.*; +import static org.testng.Assert.*; + +/** + * @test + * @bug 6443578 6202130 + * @compile ../../../../sun/security/tools/jarsigner/Utils.java + * @run testng PrintChar72 + * @summary Tests {@link Manifest#printChar72} breaking manifest header values + * across lines in conjunction with Unicode characters encoded in UTF-8 with a + * variable number of bytes when reading and writing jar manifests results in + * valid UTF-8. + *

+ * The manifest line length limit (72 bytes) may be reached at a position + * between multiple bytes of a single UTF-8 encoded character. Although + * characters should not be broken across lines according to the + * specification, the previous {@link Manifest} implementation did so. + *

+ * This test makes sure that no Unicode code point character is broken apart + * across a line break when writing manifests and also that manifests are still + * read correctly whether or not characters encoded in UTF-8 with more than one + * byte are interrupted with and continued after a line break for compatibility + * when reading older manifests. + */ +public class PrintChar72 { + + static final int MANIFEST_LINE_CONTENT_WIDTH_BYTES = 72; + + /** + * Character string that has one byte size in its UTF-8 encoded form to + * yield one byte of position offset. + */ + static final String FILL1BYTE = "x"; + static final String MARK_BEFORE = "y"; + static final String MARK_AFTER = "z"; + + /** + * Four byte name. + * By using header names of four characters length the same values can be + * used for testing line breaks in both headers (in main attributes as well + * as named sections) as well as section names because a named section name + * is represented basically like any other header but follows an empty line + * and the key is always "Name". + * Relative to the start of the value, this way the same offset to the + * character to test breaking can be used in all cases. + */ + static final String FOUR_BYTE_NAME = "Name"; + + /** + * Distinguishes main attributes headers, section names, and headers in + * named sections because an implementation might make a difference. + */ + enum PositionInManifest { + /** + * @see Attributes#writeMain + */ + MAIN_ATTRIBUTES, + /** + * @see Attributes#write + */ + SECTION_NAME, + /** + * @see Manifest#write + */ + NAMED_SECTION; + } + + static String numByteUnicodeCharacter(int numBytes) { + String string; + switch (numBytes) { + case 1: string = "i"; break; + case 2: string = "\u00EF"; break; // small letter i with diaresis + case 3: string = "\uFB00"; break; // small double f ligature + case 4: string = Character.toString(0x2070E); break; // ? 
+ default: throw new RuntimeException(); + } + assertEquals(string.getBytes(UTF_8).length, numBytes, + "self-test failed: unexpected UTF-8 encoded character length"); + return string; + } + + /** + * Produces test cases with all combinations of circumstances covered in + * which a character could possibly be attempted to be broken across a line + * break onto a continuation line:

+ * The same set of test parameters is used to write and read manifests + * once without breaking characters apart + * ({@link #testWriteLineBreaksKeepCharactersTogether(int, int, int, int, + * PositionInManifest, String, String)}) and once with doing so + * ({@link #readCharactersBrokenAcrossLines(int, int, int, int, + * PositionInManifest, String, String)}). + * The latter case covers backwards compatibility and involves writing + * manifests like they were written before resolution of bug 6443578. + */ + @DataProvider(name = "lineBreakParameters") + public static Object[][] lineBreakParameters() { + return Stream.of(new Object[] { null }).flatMap(o -> + // b: number of line breaks before character under test + IntStream.rangeClosed(0, 3).mapToObj( + b -> new Object[] { b }) + ).flatMap(o -> + // c: unicode character UTF-8 encoded length in bytes + IntStream.rangeClosed(1, 4).mapToObj( + c -> new Object[] { o[0], c }) + ).flatMap(o -> + // p: potential break position offset in bytes + // p == 0 => before character, + // p == c => after character, and + // 0 < p < c => character potentially broken across line break + // within the character + IntStream.rangeClosed(0, (int) o[1]).mapToObj( + p -> new Object[] { o[0], o[1], p }) + ).flatMap(o -> + // a: no or one character following the one under test + // (a == 0 meaning the character under test is the end of + // the value which is followed by a line break in the + // resulting manifest without continuation line space which + // concludes the value) + IntStream.rangeClosed(0, 1).mapToObj( + a -> new Object[] { o[0], o[1], o[2], a }) + ).flatMap(o -> + Stream.of(PositionInManifest.values()).map( + i -> new Object[] { o[0], o[1], o[2], o[3], i }) + ).map(o -> { + int b = (int) o[0]; + int c = (int) o[1]; + int p = (int) o[2]; + int a = (int) o[3]; + PositionInManifest i = (PositionInManifest) o[4]; + + // offset: so many characters (actually bytes here, + // filled with one byte characters) are needed to place + // 
the next character (the character under test) into a + // position relative to the maximum line width that it + // may or may not have to be broken onto the next line + int offset = + // number of lines; - 1 due to continuation " " + b * (MANIFEST_LINE_CONTENT_WIDTH_BYTES - 1) + // line length minus "Name: ".length() + + MANIFEST_LINE_CONTENT_WIDTH_BYTES - 6 + // position of maximum line width relative to + // beginning of encoded character + - p; + + String value = ""; + value += FILL1BYTE.repeat(offset - 1); + // character before the one to test the break + value += MARK_BEFORE; + String character = numByteUnicodeCharacter(c); + value += character; + // character after the one to test the break + value += MARK_AFTER.repeat(a); + + return new Object[] { b, c, p, a, i, character, value }; + }).toArray(size -> new Object[size][]); + } + + /** + * Checks that unicode characters work well with line breaks and + * continuation lines in jar manifests without breaking a character across + * a line break even when encoded in UTF-8 with more than one byte. + *

+ * For each of the cases provided by {@link #lineBreakParameters()} the + * break position is verified in the binary form of the written manifest, + * and the value is verified to be restored unchanged when read again. + *

+ * As an additional check, the binary manifests are decoded from UTF-8 + * into Strings before re-joining continued lines. + */ + @Test(dataProvider = "lineBreakParameters") + public void testWriteLineBreaksKeepCharactersTogether(int b, int c, int p, + int a, PositionInManifest i, String character, String value) + throws IOException { + byte[] mfBytes = writeManifest(i, FOUR_BYTE_NAME, value); + + // in order to unambiguously establish the position of "character" in + // brokenPart, brokenPart is prepended and appended with what is + // expected before and after it... + String brokenPart = MARK_BEFORE; + + // expect the whole character on the next line unless it fits + // completely on the current line + boolean breakExpected = p < c; + if (breakExpected) { + brokenPart += "\r\n "; + } + brokenPart += character; + // expect a line break before the next character if there is a next + // character and the previous not already broken on next line + if (a > 0) { + if (!breakExpected) { + brokenPart += "\r\n "; + } + brokenPart += MARK_AFTER; + } + brokenPart = brokenPart + "\r\n"; + try { + assertOccurrence(mfBytes, brokenPart.getBytes(UTF_8)); + readManifestAndAssertValue(mfBytes, i, FOUR_BYTE_NAME, value); + decodeManifestFromUtf8AndAssertValue( + mfBytes, FOUR_BYTE_NAME, value, true); + } catch (AssertionError e) { + Utils.echoManifest(mfBytes, "faulty manifest: " + e); + throw e; + } + } + + static byte[] writeManifest(PositionInManifest i, String name, + String value) throws IOException { + Manifest mf = new Manifest(); + mf.getMainAttributes().put(Name.MANIFEST_VERSION, "1.0"); + Attributes attributes = new Attributes(); + + switch (i) { + case MAIN_ATTRIBUTES: + mf.getMainAttributes().put(new Name(name), value); + break; + case SECTION_NAME: + mf.getEntries().put(value, attributes); + break; + case NAMED_SECTION: + mf.getEntries().put(FOUR_BYTE_NAME, attributes); + attributes.put(new Name(name), value); + break; + } + + ByteArrayOutputStream out = new 
ByteArrayOutputStream(); + mf.write(out); + return out.toByteArray(); + } + + /** + * Asserts one and only one occurrence of a sequence of bytes {@code part} + * representing the character and how it is expected to be broken and its + * surrounding bytes in a larger sequence that corresponds to the manifest + * in binary form {@code mf}. + */ + static void assertOccurrence(byte[] mf, byte[] part) { + List matchPos = new LinkedList<>(); + for (int i = 0; i < mf.length; i++) { + for (int j = 0; j < part.length && i + j <= mf.length; j++) { + if (part[j] == 0) { + if (i + j != mf.length) { + break; // expected eof not found + } + } else if (i + j == mf.length) { + break; + } else if (mf[i + j] != part[j]) { + break; + } + if (j == part.length - 1) { + matchPos.add(i); + } + } + } + assertEquals(matchPos.size(), 1, "not " + + (matchPos.size() < 1 ? "found" : "unique") + ": '" + + new String(part, UTF_8) + "'"); + } + + static void readManifestAndAssertValue( + byte[] mfBytes, PositionInManifest i, String name, String value) + throws IOException { + Manifest mf = new Manifest(new ByteArrayInputStream(mfBytes)); + + switch (i) { + case MAIN_ATTRIBUTES: + assertEquals(mf.getMainAttributes().getValue(name), value, + "main attributes header value"); + break; + case SECTION_NAME: + Attributes attributes = mf.getAttributes(value); + assertNotNull(attributes, "named section not found"); + break; + case NAMED_SECTION: + attributes = mf.getAttributes(FOUR_BYTE_NAME); + assertEquals(attributes.getValue(name), value, + "named section attributes header value"); + break; + } + } + + /** + * Decodes a binary manifest {@code mfBytes} into UTF-8 first, before + * joining the continuation lines unlike {@link Manifest} and + * {@link Attributes} which join the continuation lines first, before + * decoding the joined line from UTF-8 into a {@link String}, evaluating + * whether or not the binary manifest is valid UTF-8. 
+ */ + static void decodeManifestFromUtf8AndAssertValue( + byte[] mfBytes, String name, String value, + boolean validUtf8ManifestExpected) throws IOException { + String mf = new String(mfBytes, UTF_8) + .replaceAll("(\\r\\n|(?!\\r)\\n|\\r(?!\\n)) ", ""); + String header = "\r\n" + name + ": " + value + "\r\n"; + int pos = mf.indexOf(header); + if (validUtf8ManifestExpected) { + assertTrue(pos > 0); + pos = mf.indexOf(header, pos + 1); + } + // assert no ocurrence or no other occurrence after one match above + assertTrue(pos == -1); + } + + @Test(dataProvider = "lineBreakParameters") + public void readCharactersBrokenAcrossLines(int b, int c, int p, int a, + PositionInManifest i, String character, String value) + throws IOException { + byte[] mfBytes = writeManifestWithBrokenCharacters( + i, FOUR_BYTE_NAME, value); + + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + buf.write(MARK_BEFORE.getBytes(UTF_8)); + byte[] characterBytes = character.getBytes(UTF_8); + // the portion of the character that fits on the current line before + // a break at 72 bytes, ranges from nothing (p == 0) to the whole + // character (p == c) + for (int j = 0; j < p; j++) { + buf.write(characterBytes, j, 1); + } + // expect a line break at exactly 72 bytes from the beginning of the + // line unless the whole character fits on that line + boolean breakExpected = p < c; + if (breakExpected) { + buf.write("\r\n ".getBytes(UTF_8)); + } + // the remaining portion of the character, if any + for (int j = p; j < c; j++) { + buf.write(characterBytes, j, 1); + } + // expect another line break if the whole character fitted on the same + // line and there is another character + if (a == 1) { + if (c == p) { + buf.write("\r\n ".getBytes(UTF_8)); + } + buf.write(MARK_AFTER.getBytes(UTF_8)); + } + // if no other character followed expect a line break immediately + buf.write("\r\n".getBytes(UTF_8)); + byte[] brokenPart = buf.toByteArray(); + try { + assertOccurrence(mfBytes, brokenPart); + 
readManifestAndAssertValue(mfBytes, i, FOUR_BYTE_NAME, value); + decodeManifestFromUtf8AndAssertValue( + mfBytes, FOUR_BYTE_NAME, value, p == 0 || p == c); + } catch (AssertionError e) { + Utils.echoManifest(mfBytes, "faulty manifest: " + e); + throw e; + } + } + + /** + * From the previous {@link Manifest} implementation reduced to the minimum + * required to demonstrate compatibility. + */ + @SuppressWarnings("deprecation") + static byte[] writeManifestWithBrokenCharacters( + PositionInManifest i, String name, String value) + throws IOException { + byte[] vb = value.getBytes(UTF_8); + value = new String(vb, 0, 0, vb.length); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(out); + dos.writeBytes(Name.MANIFEST_VERSION + ": 0.1\r\n"); + + if (i == PositionInManifest.MAIN_ATTRIBUTES) { + StringBuffer buffer = new StringBuffer(name); + buffer.append(": "); + buffer.append(value); + make72Safe(buffer); + buffer.append("\r\n"); + dos.writeBytes(buffer.toString()); + } + dos.writeBytes("\r\n"); + + if (i == PositionInManifest.SECTION_NAME || + i == PositionInManifest.NAMED_SECTION) { + StringBuffer buffer = new StringBuffer("Name: "); + if (i == PositionInManifest.SECTION_NAME) { + buffer.append(value); + } else { + buffer.append(FOUR_BYTE_NAME); + } + make72Safe(buffer); + buffer.append("\r\n"); + dos.writeBytes(buffer.toString()); + + if (i == PositionInManifest.NAMED_SECTION) { + buffer = new StringBuffer(name); + buffer.append(": "); + buffer.append(value); + make72Safe(buffer); + buffer.append("\r\n"); + dos.writeBytes(buffer.toString()); + } + + dos.writeBytes("\r\n"); + } + + dos.flush(); + return out.toByteArray(); + } + + /** + * Adds line breaks to enforce a maximum 72 bytes per line. + *

+ * Taken from the previous Manifest implementation, which disregards UTF-8 + * character boundaries and therefore also breaks within multi-byte UTF-8 + * encoded characters. + * + * @see Manifest#make72Safe(StringBuffer) + */ + static void make72Safe(StringBuffer line) { + int length = line.length(); + int index = 72; + while (index < length) { + line.insert(index, "\r\n "); + index += 74; // + line width + line break ("\r\n") + length += 3; // + line break ("\r\n") and space + } + } + + @Test + public void testEmptyValue() throws Exception { + for (PositionInManifest i : PositionInManifest.values()) { + byte[] mfBytes = writeManifest(i, FOUR_BYTE_NAME, ""); + readManifestAndAssertValue(mfBytes, i, FOUR_BYTE_NAME, ""); + } + } + +} diff -r 563d6852da45 test/jdk/java/util/jar/Manifest/PrintLine72.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/jdk/java/util/jar/Manifest/PrintLine72.java Tue Mar 10 08:08:19 2020 +0100 @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.jar.Manifest; +import java.util.jar.Attributes.Name; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import org.testng.annotations.*; +import static org.testng.Assert.*; + +/** + * @test + * @compile ../../../../sun/security/tools/jarsigner/Utils.java + * @bug 6443578 6202130 + * @run testng PrintLine72 + * @summary Tests {@link Manifest#printLine72} line breaking with some particular + * border case kind of test cases involving combining character sequences. + *

+ * For another test covering the complete Unicode character set see + * {@link ValueUtf8Coding}; for a test for not breaking Unicode code point + * UTF-8 encoded byte sequences see {@link PrintChar72}. + */ +public class PrintLine72 { + + static final Name TEST_NAME = new Name("test"); + + static final int NAME_SEP_LENGTH = (TEST_NAME + ": ").length(); + + void test(String originalValue, int... breakPositionsBytes) + throws IOException { + String expectedValueWithBreaksInManifest = originalValue; + // iterating backwards because inserting breaks affects original + // positions + for (int i = breakPositionsBytes.length - 1; i >= 0; i--) { + int breakPositionBytes = breakPositionsBytes[i]; + + // Translate breakPositionBytes byte offset into + // breakPositionCharacters (primitive char type) character offset + // for cutting the string with String.substring later on. + // Higher code points may be represented with two UTF-16 surrogate + // pair characters which both count for String.substring. 
+ int bytesSoFar = 0; + int charsSoFar = 0; + while (bytesSoFar < breakPositionBytes) { + String s = expectedValueWithBreaksInManifest + .substring(charsSoFar, ++charsSoFar); + bytesSoFar += s.getBytes(UTF_8).length; + assertTrue(bytesSoFar <= breakPositionBytes, + "break position not aligned with characters"); + } + int breakPositionCharacters = charsSoFar; + + expectedValueWithBreaksInManifest = + expectedValueWithBreaksInManifest + .substring(0, breakPositionCharacters) + + "\r\n " + + expectedValueWithBreaksInManifest + .substring(breakPositionCharacters); + } + + Manifest mf = new Manifest(); + mf.getMainAttributes().put(Name.MANIFEST_VERSION, "1.0"); + mf.getMainAttributes().put(TEST_NAME, originalValue); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + mf.write(out); + byte[] mfBytes = out.toByteArray(); + + byte[] actual = mfBytes; + String expected = + "Manifest-Version: 1.0\r\n" + + TEST_NAME + ": " + expectedValueWithBreaksInManifest + + "\r\n\r\n"; + try { + assertEquals(new String(actual, UTF_8), expected); + assertEquals(actual, expected.getBytes(UTF_8)); + } catch (AssertionError e) { + Utils.echoManifest(mfBytes, "faulty manifest: " + e); + System.out.println("actual = " + byteArrayToIntList(actual)); + System.out.println("expected = " + byteArrayToIntList( + expected.getBytes(UTF_8))); + throw e; + } + } + + static List byteArrayToIntList(byte[] bytes) { + List list = new ArrayList<>(); + for (int i = 0; i < bytes.length; i++) { + list.add((int) bytes[i]); + } + return list; + } + + @Test + public void testEmpty() throws Exception { + test(""); // expect neither a line break nor an exception + } + + static final String COMBINING_DIACRITICAL_MARKS = + IntStream.range(0x300, 0x36F) + .mapToObj(i -> new String(Character.toChars(i))) + .collect(Collectors.joining()); + + static String getCharSeq(int numberOfBytes) { + String seq = (numberOfBytes % 2 == 1 ? 
"e" : "\u00E6") + + COMBINING_DIACRITICAL_MARKS.substring(0, (numberOfBytes - 1) / 2); + assertEquals(seq.getBytes(UTF_8).length, numberOfBytes); + return seq; + } + + @Test + public void testBreakOnFirstLine() throws Exception { + // Combining sequence starts immediately after name and ": " and fits + // the remaining space in the first line. Expect no break. + test(getCharSeq(66)); + + // Combining sequence starts after name and ": " and exceeds the + // remaining space in the first line by one byte. Expect to break on a + // new line because the combining sequence fits on a continuation line + // which does not start with name and ": " and provides enough space. + test(getCharSeq(67), 0); + + // Combining sequence starts after name and ": " and exceeds the + // remaining space in the first line but still just fits exactly on a + // continuation line. Expect the value to break onto a new line. + test(getCharSeq(71), 0); + + // Combining sequence starts after name and ": " and exceeds the + // remaining space in the first line and neither fits on a continuation + // line. Expect that the first line to be filled with as many codepoints + // as fit on it and expect a line break onto a continuation line after + // 66 bytes of the first line value. + test(getCharSeq(72), 72 - NAME_SEP_LENGTH); + + // Combining sequence starts after name and ": x" and exceeds the + // remaining space in the first line and neither fits on a continuation + // line. Expect that the first line to be filled with as many codepoints + // as fit on it and expect a line break onto a continuation line already + // after 65 bytes of the first line because the following character is + // a code point represented with two bytes in UTF-8 which should not + // be interrupted with a line break. 
+ test("x" + getCharSeq(72), 72 - NAME_SEP_LENGTH - 1); + } + + @Test + public void testBreakOnContinuationLine() throws Exception { + // fits on next line by skipping one byte free on current line + test("x".repeat(72 - NAME_SEP_LENGTH + 71 - 1) + getCharSeq(71), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71 - 1); + + // fits on current line exactly + test("x".repeat(72 - NAME_SEP_LENGTH + 71) + getCharSeq(71), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71); + + // fits on next line by inserting a line break after a line that + // contains only one character yet + test("x".repeat(72 - NAME_SEP_LENGTH + 71 + 1) + getCharSeq(71), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71, + 72 - NAME_SEP_LENGTH + 71 + 1); + + // does not fit on the next line and the one byte not yet used on the + // current line does not hold the first code point of the combined + // character sequence which is a code point encoded with two bytes in + // UTF-8. + test("x".repeat(72 - NAME_SEP_LENGTH + 71 - 1) + getCharSeq(72), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71 - 1, + 72 - NAME_SEP_LENGTH + 71 - 1 + 71 - 1); + + // would not fit on the next line alone but fits on the remaining two + // bytes available on the current line and the whole subsequent line. + test("x".repeat(72 - NAME_SEP_LENGTH + 71 - 2) + getCharSeq(72), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71); + + // previous character filled the whole previous line completely + // but the combined character sequence with 72 bytes still does not fit + // on a single line. the last code point is a two byte one so that an + // unused byte is left unused on the second last line. + test("x".repeat(72 - NAME_SEP_LENGTH + 71) + getCharSeq(72), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71, + 72 - NAME_SEP_LENGTH + 71 + 71 - 1); + + // previous character left one byte used on the current line and the + // remaining 70 bytes available. 
the combining sequence can use all of + // these 70 bytes because after 70 bytes a new code point starts + test("x".repeat(72 - NAME_SEP_LENGTH + 71 + 1) + getCharSeq(72), + 72 - NAME_SEP_LENGTH, + 72 - NAME_SEP_LENGTH + 71, + 72 - NAME_SEP_LENGTH + 71 + 71); + } + +} diff -r 563d6852da45 test/jdk/java/util/regex/GraphemeTest.java --- a/test/jdk/java/util/regex/GraphemeTest.java Mon Mar 09 21:43:01 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,384 +0,0 @@ -/* - * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -/* - * @test - * @bug 7071819 8221431 - * @summary tests Unicode Extended Grapheme support - * @library /lib/testlibrary/java/lang - * @run main GraphemeTest - */ - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.ArrayList; -import java.util.Scanner; -import java.util.regex.Pattern; -import java.util.regex.Matcher; - -public class GraphemeTest { - - public static void main(String[] args) throws Throwable { - testProps(UCDFiles.GRAPHEME_BREAK_PROPERTY); - testProps(UCDFiles.EMOJI_DATA); - } - - private static void testProps(Path path) throws IOException { - Files.lines(path) - .map( ln -> ln.replaceFirst("#.*", "") ) - .filter( ln -> ln.length() != 0 ) - .forEach(ln -> { - String[] strs = ln.split("\\s+"); - int off = strs[0].indexOf(".."); - int cp0, cp1; - String expected = strs[2]; - if (off != -1) { - cp0 = Integer.parseInt(strs[0], 0, off, 16); - cp1 = Integer.parseInt(strs[0], off + 2, strs[0].length(), 16); - } else { - cp0 = cp1 = Integer.parseInt(strs[0], 16); - } - for (int cp = cp0; cp <= cp1; cp++) { - // Ignore Emoji* for now (only interested in Extended_Pictographic) - if (expected.startsWith("Emoji")) { - continue; - } - - // NOTE: - // #tr29 "plus a few General_Category = Spacing_Mark needed for - // canonical equivalence." - // For "extended grapheme clusters" support, there is no - // need actually to diff "extend" and "spackmark" given GB9, GB9a. 
- if (!expected.equals(types[getType(cp)])) { - if ("Extend".equals(expected) && - "SpacingMark".equals(types[getType(cp)])) - System.out.printf("[%x] [%s][%d] -> [%s]%n", - cp, expected, Character.getType(cp), types[getType(cp)]); - else - throw new RuntimeException(String.format( - "cp=[%x], expeced:[%s] result:[%s]%n", - cp, expected, types[getType(cp)])); - } - } - }); - } - - private static final String[] types = { - "Other", "CR", "LF", "Control", "Extend", "ZWJ", "Regional_Indicator", - "Prepend", "SpacingMark", - "L", "V", "T", "LV", "LVT", - "Extended_Pictographic" }; - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // from java.util.regex.Grapheme.java - // types - private static final int OTHER = 0; - private static final int CR = 1; - private static final int LF = 2; - private static final int CONTROL = 3; - private static final int EXTEND = 4; - private static final int ZWJ = 5; - private static final int RI = 6; - private static final int PREPEND = 7; - private static final int SPACINGMARK = 8; - private static final int L = 9; - private static final int V = 10; - private static final int T = 11; - private static final int LV = 12; - private static final int LVT = 13; - private static final int EXTENDED_PICTOGRAPHIC = 14; - - private static final int FIRST_TYPE = 0; - private static final int LAST_TYPE = 14; - - private static boolean[][] rules; - static { - rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1]; - // GB 999 Any + Any -> default - for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) - for (int j = FIRST_TYPE; j <= LAST_TYPE; j++) - rules[i][j] = true; - // GB 6 L x (L | V | LV | VT) - rules[L][L] = false; - rules[L][V] = false; - rules[L][LV] = false; - rules[L][LVT] = false; - // GB 7 (LV | V) x (V | T) - rules[LV][V] = false; - rules[LV][T] = false; - rules[V][V] = false; - rules[V][T] = false; - // GB 8 (LVT | T) x T - rules[LVT][T] = false; - rules[T][T] = false; - // GB 9 
x (Extend|ZWJ) - // GB 9a x Spacing Mark - // GB 9b Prepend x - for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) { - rules[i][EXTEND] = false; - rules[i][ZWJ] = false; - rules[i][SPACINGMARK] = false; - rules[PREPEND][i] = false; - } - // GB 4 (Control | CR | LF) + - // GB 5 + (Control | CR | LF) - for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) - for (int j = CR; j <= CONTROL; j++) { - rules[i][j] = true; - rules[j][i] = true; - } - // GB 3 CR x LF - rules[CR][LF] = false; - // GB 11 Exended_Pictographic x (Extend|ZWJ) - rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false; - rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false; - } - - // Hangul syllables - private static final int SYLLABLE_BASE = 0xAC00; - private static final int LCOUNT = 19; - private static final int VCOUNT = 21; - private static final int TCOUNT = 28; - private static final int NCOUNT = VCOUNT * TCOUNT; // 588 - private static final int SCOUNT = LCOUNT * NCOUNT; // 11172 - - // #tr29: SpacingMark exceptions: The following (which have - // General_Category = Spacing_Mark and would otherwise be included) - // are specifically excluded - private static boolean isExcludedSpacingMark(int cp) { - return cp == 0x102B || cp == 0x102C || cp == 0x1038 || - cp >= 0x1062 && cp <= 0x1064 || - cp >= 0x1062 && cp <= 0x106D || - cp == 0x1083 || - cp >= 0x1087 && cp <= 0x108C || - cp == 0x108F || - cp >= 0x109A && cp <= 0x109C || - cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || - cp == 0xAA7B || cp == 0xAA7D; - } - - @SuppressWarnings("fallthrough") - private static int getType(int cp) { - if (isExtendedPictographic(cp)) { - return EXTENDED_PICTOGRAPHIC; - } - - int type = Character.getType(cp); - switch(type) { - case Character.CONTROL: - if (cp == 0x000D) - return CR; - if (cp == 0x000A) - return LF; - return CONTROL; - case Character.UNASSIGNED: - // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control - // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" - // so type it as "Other" to make 
the test happy - if (cp == 0x0378) - return OTHER; - - case Character.LINE_SEPARATOR: - case Character.PARAGRAPH_SEPARATOR: - case Character.SURROGATE: - return CONTROL; - case Character.FORMAT: - if (cp == 0x200C || - cp >= 0xE0020 && cp <= 0xE007F) - return EXTEND; - if (cp == 0x200D) - return ZWJ; - if (cp >= 0x0600 && cp <= 0x0605 || - cp == 0x06DD || cp == 0x070F || cp == 0x08E2 || - cp == 0x110BD || cp == 0x110CD) - return PREPEND; - return CONTROL; - case Character.NON_SPACING_MARK: - case Character.ENCLOSING_MARK: - // NOTE: - // #tr29 "plus a few General_Category = Spacing_Mark needed for - // canonical equivalence." - // but for "extended grapheme clusters" support, there is no - // need actually to diff "extend" and "spackmark" given GB9, GB9a - return EXTEND; - case Character.COMBINING_SPACING_MARK: - if (isExcludedSpacingMark(cp)) - return OTHER; - // NOTE: - // 0x11720 and 0x11721 are mentioned in #tr29 as - // OTHER_LETTER but it appears their category has been updated to - // COMBING_SPACING_MARK already (verified in ver.8) - return SPACINGMARK; - case Character.OTHER_SYMBOL: - if (cp >= 0x1F1E6 && cp <= 0x1F1FF) - return RI; - return OTHER; - case Character.MODIFIER_LETTER: - case Character.MODIFIER_SYMBOL: - // WARNING: - // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt - if (cp == 0xFF9E || cp == 0xFF9F || - cp >= 0x1F3FB && cp <= 0x1F3FF) - return EXTEND; - return OTHER; - case Character.OTHER_LETTER: - if (cp == 0x0E33 || cp == 0x0EB3) - return SPACINGMARK; - // hangul jamo - if (cp >= 0x1100 && cp <= 0x11FF) { - if (cp <= 0x115F) - return L; - if (cp <= 0x11A7) - return V; - return T; - } - // hangul syllables - int sindex = cp - SYLLABLE_BASE; - if (sindex >= 0 && sindex < SCOUNT) { - - if (sindex % TCOUNT == 0) - return LV; - return LVT; - } - // hangul jamo_extended A - if (cp >= 0xA960 && cp <= 0xA97C) - return L; - // hangul jamo_extended B - if (cp >= 0xD7B0 && cp <= 0xD7C6) - return V; - if (cp >= 0xD7CB && cp <= 
0xD7FB) - return T; - - // Prepend - switch (cp) { - case 0x0D4E: - case 0x111C2: - case 0x111C3: - case 0x11A3A: - case 0x11A84: - case 0x11A85: - case 0x11A86: - case 0x11A87: - case 0x11A88: - case 0x11A89: - case 0x11D46: - return PREPEND; - } - } - return OTHER; - } - - // from generated java.util.regex.EmojiData.java - static boolean isExtendedPictographic(int cp) { - return - cp == 0x00A9 || - cp == 0x00AE || - cp == 0x203C || - cp == 0x2049 || - cp == 0x2122 || - cp == 0x2139 || - (cp >= 0x2194 && cp <= 0x2199) || - cp == 0x21A9 || - cp == 0x21AA || - cp == 0x231A || - cp == 0x231B || - cp == 0x2328 || - cp == 0x2388 || - cp == 0x23CF || - (cp >= 0x23E9 && cp <= 0x23F3) || - (cp >= 0x23F8 && cp <= 0x23FA) || - cp == 0x24C2 || - cp == 0x25AA || - cp == 0x25AB || - cp == 0x25B6 || - cp == 0x25C0 || - (cp >= 0x25FB && cp <= 0x25FE) || - (cp >= 0x2600 && cp <= 0x2605) || - (cp >= 0x2607 && cp <= 0x2612) || - (cp >= 0x2614 && cp <= 0x2685) || - (cp >= 0x2690 && cp <= 0x2705) || - (cp >= 0x2708 && cp <= 0x2712) || - cp == 0x2714 || - cp == 0x2716 || - cp == 0x271D || - cp == 0x2721 || - cp == 0x2728 || - cp == 0x2733 || - cp == 0x2734 || - cp == 0x2744 || - cp == 0x2747 || - cp == 0x274C || - cp == 0x274E || - (cp >= 0x2753 && cp <= 0x2755) || - cp == 0x2757 || - (cp >= 0x2763 && cp <= 0x2767) || - (cp >= 0x2795 && cp <= 0x2797) || - cp == 0x27A1 || - cp == 0x27B0 || - cp == 0x27BF || - cp == 0x2934 || - cp == 0x2935 || - (cp >= 0x2B05 && cp <= 0x2B07) || - cp == 0x2B1B || - cp == 0x2B1C || - cp == 0x2B50 || - cp == 0x2B55 || - cp == 0x3030 || - cp == 0x303D || - cp == 0x3297 || - cp == 0x3299 || - (cp >= 0x1F000 && cp <= 0x1F0FF) || - (cp >= 0x1F10D && cp <= 0x1F10F) || - cp == 0x1F12F || - (cp >= 0x1F16C && cp <= 0x1F171) || - cp == 0x1F17E || - cp == 0x1F17F || - cp == 0x1F18E || - (cp >= 0x1F191 && cp <= 0x1F19A) || - (cp >= 0x1F1AD && cp <= 0x1F1E5) || - (cp >= 0x1F201 && cp <= 0x1F20F) || - cp == 0x1F21A || - cp == 0x1F22F || - (cp >= 0x1F232 && cp <= 
0x1F23A) || - (cp >= 0x1F23C && cp <= 0x1F23F) || - (cp >= 0x1F249 && cp <= 0x1F3FA) || - (cp >= 0x1F400 && cp <= 0x1F53D) || - (cp >= 0x1F546 && cp <= 0x1F64F) || - (cp >= 0x1F680 && cp <= 0x1F6FF) || - (cp >= 0x1F774 && cp <= 0x1F77F) || - (cp >= 0x1F7D5 && cp <= 0x1F7FF) || - (cp >= 0x1F80C && cp <= 0x1F80F) || - (cp >= 0x1F848 && cp <= 0x1F84F) || - (cp >= 0x1F85A && cp <= 0x1F85F) || - (cp >= 0x1F888 && cp <= 0x1F88F) || - (cp >= 0x1F8AE && cp <= 0x1F8FF) || - (cp >= 0x1F90C && cp <= 0x1F93A) || - (cp >= 0x1F93C && cp <= 0x1F945) || - (cp >= 0x1F947 && cp <= 0x1FFFD); - - } -} diff -r 563d6852da45 test/jdk/java/util/regex/GraphemeTestRun.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/jdk/java/util/regex/GraphemeTestRun.java Tue Mar 10 08:08:19 2020 +0100 @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/* + * @test + * @bug 7071819 8221431 + * @library /lib/testlibrary/java/lang + * @run main java.base/java.util.regex.GraphemeTest + * @summary tests Unicode Extended Grapheme support + */ + +} diff -r 563d6852da45 test/jdk/java/util/regex/RegExTest.java --- a/test/jdk/java/util/regex/RegExTest.java Mon Mar 09 21:43:01 2020 +0100 +++ b/test/jdk/java/util/regex/RegExTest.java Tue Mar 10 08:08:19 2020 +0100 @@ -4812,24 +4812,71 @@ buf = new StringBuilder(); } } + + // test \X directly Pattern p = Pattern.compile("\\X"); Matcher m = p.matcher(src.toString()); - Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); for (String g : graphemes) { // System.out.printf(" grapheme:=[%s]%n", g); - // (1) test \\X directly - if (!m.find() || !m.group().equals(g)) { - System.out.println("Failed \\X [" + ln + "] : " + g); + String group = null; + if (!m.find() || !(group = m.group()).equals(g)) { + System.out.println("Failed pattern \\X [" + ln + "] : " + + "expected: " + g + " - actual: " + group); + failCount++; + } + } + if (m.find()) { + failCount++; + } + + // test \b{g} (without \X) via Pattern + Pattern pbg = Pattern.compile("\\b{g}"); + m = pbg.matcher(src.toString()); + m.find(); + int prev = m.end(); + for (String g : graphemes) { + String group = null; + if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) { + System.out.println("Failed pattern \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + group); + failCount++; + } + if (!"".equals(m.group())) { failCount++; } - // (2) test \\b{g} + \\X via Scanner - boolean hasNext = s.hasNext(p); - // if (!s.hasNext() || !s.next().equals(next)) { - if (!s.hasNext(p) || !s.next(p).equals(g)) { - System.out.println("Failed b{g} [" + ln + "] : " + g); + prev = m.end(); + } + if (m.find()) { + failCount++; + } + + // test \b{g} + \X via Scanner + Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext(p) || 
!(next = s.next(p)).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next); failCount++; } } + if (s.hasNext(p)) { + failCount++; + } + + // test \b{g} without \X via Scanner + s = new Scanner(src.toString()).useDelimiter("\\b{g}"); + for (String g : graphemes) { + String next = null; + if (!s.hasNext() || !(next = s.next()).equals(g)) { + System.out.println("Failed \\b{g} [" + ln + "] : " + + "expected: " + g + " - actual: " + next); + failCount++; + } + } + if (s.hasNext()) { + failCount++; + } }); // some sanity checks if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() || diff -r 563d6852da45 test/jdk/java/util/regex/java.base/java/util/regex/GraphemeTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/jdk/java/util/regex/java.base/java/util/regex/GraphemeTest.java Tue Mar 10 08:08:19 2020 +0100 @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package java.util.regex; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class GraphemeTest { + + /** from UCDFiles in @library /lib/testlibrary/java/lang which I cannot + * access in the default (with no) package */ + static Path UCD_DIR = Paths.get( + System.getProperty("test.root"), + "..", "..", "make", "data", "unicodedata"); + static Path GRAPHEME_BREAK_PROPERTY = + UCD_DIR.resolve("auxiliary").resolve("GraphemeBreakProperty.txt"); + static Path EMOJI_DATA = + UCD_DIR.resolve("emoji-data.txt"); + + public static void main(String[] args) throws Exception { + testGraphemeType(GRAPHEME_BREAK_PROPERTY); + testGraphemeType(EMOJI_DATA); + } + + private static void testGraphemeType(Path path) throws Exception { + Files.lines(path) + .map( ln -> ln.replaceFirst("#.*", "") ) + .filter( ln -> ln.length() != 0 ) + .forEach(ln -> { + String[] strs = ln.split("\\s+"); + int off = strs[0].indexOf(".."); + int cp0, cp1; + String expected = strs[2]; + if (off != -1) { + cp0 = Integer.parseInt(strs[0], 0, off, 16); + cp1 = Integer.parseInt(strs[0], off + 2, strs[0].length(), 16); + } else { + cp0 = cp1 = Integer.parseInt(strs[0], 16); + } + for (int cp = cp0; cp <= cp1; cp++) { + // Ignore Emoji* for now (only interested in Extended_Pictographic) + if (expected.startsWith("Emoji")) { + continue; + } + + // NOTE: + // #tr29 "plus a few General_Category = Spacing_Mark needed for + // canonical equivalence." + // For "extended grapheme clusters" support, there is no + // need actually to diff "extend" and "spackmark" given GB9, GB9a. 
+ String type = types[Grapheme.getGraphemeType(cp)]; + if (!expected.equalsIgnoreCase(type)) { + if ("Extend".equals(expected) && + "SpacingMark".equalsIgnoreCase(type)) + System.out.printf("[%x] [%s][%d] -> [%s]%n", + cp, expected, Character.getType(cp), type); + else + throw new RuntimeException(String.format( + "cp=[%x], expeced:[%s] result:[%s]%n", + cp, expected, type)); + } + } + }); + } + + private static final String[] types = { + "Other", "CR", "LF", "Control", "Extend", "ZWJ", "Regional_Indicator", + "Prepend", "SpacingMark", + "L", "V", "T", "LV", "LVT", + "Extended_Pictographic" }; + +} diff -r 563d6852da45 test/jdk/sun/security/tools/jarsigner/LineBrokenMultiByteCharacter.java --- a/test/jdk/sun/security/tools/jarsigner/LineBrokenMultiByteCharacter.java Mon Mar 09 21:43:01 2020 +0100 +++ b/test/jdk/sun/security/tools/jarsigner/LineBrokenMultiByteCharacter.java Tue Mar 10 08:08:19 2020 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -29,15 +29,25 @@ * @library /test/lib */ +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.Arrays; +import java.util.stream.Collectors; +import java.util.jar.Attributes.Name; +import java.util.jar.Manifest; import java.util.jar.JarFile; -import java.util.jar.Attributes.Name; import java.util.jar.JarEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipEntry; +import static java.util.jar.JarFile.MANIFEST_NAME; import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardCopyOption.COPY_ATTRIBUTES; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; import jdk.test.lib.SecurityTools; import jdk.test.lib.util.JarUtils; @@ -46,7 +56,7 @@ /** * this name will break across lines in MANIFEST.MF at the - * middle of a two-byte utf-8 encoded character due to its e acute letter + * middle of a two-byte UTF-8 encoded character due to its e acute letter * at its exact position. 
* * because no file with such a name exists {@link JarUtils} will add the @@ -63,53 +73,58 @@ static final String anotherName = "LineBrokenMultiByteCharacterA1234567890B1234567890C123456789D1234567890.class"; - static final String alias = "a"; - static final String keystoreFileName = "test.jks"; - static final String manifestFileName = "MANIFEST.MF"; + static final String ALIAS = "A"; + static final String KEYSTORE_FILENAME = "test.jks"; + static final String SOME_OTHER_SIG_FILE = "META-INF/FAKE_SIG.DSA"; public static void main(String[] args) throws Exception { prepare(); testSignJar("test.jar"); - testSignJarNoManifest("test-no-manifest.jar"); testSignJarUpdate("test-update.jar", "test-updated.jar"); } static void prepare() throws Exception { - SecurityTools.keytool("-keystore", keystoreFileName, "-genkeypair", + SecurityTools.keytool("-keystore", KEYSTORE_FILENAME, "-genkeypair", "-keyalg", "dsa", "-storepass", "changeit", "-keypass", "changeit", "-storetype", - "JKS", "-alias", alias, "-dname", "CN=X", "-validity", "366") + "JKS", "-alias", ALIAS, "-dname", "CN=X", "-validity", "366") .shouldHaveExitValue(0); - Files.write(Paths.get(manifestFileName), (Name. + new File(MANIFEST_NAME).getParentFile().mkdirs(); + Files.write(Paths.get(MANIFEST_NAME), (Name. 
MANIFEST_VERSION.toString() + ": 1.0\r\n").getBytes(UTF_8)); + + // prevent jarsigner from assuming it was safe to rewrite the manifest + // and its line breaks assuming there were no other signatures present + Files.write(Paths.get(SOME_OTHER_SIG_FILE), new byte[] {}); } static void testSignJar(String jarFileName) throws Exception { - JarUtils.createJar(jarFileName, manifestFileName, testClassName); - verifyJarSignature(jarFileName); - } - - static void testSignJarNoManifest(String jarFileName) throws Exception { - JarUtils.createJar(jarFileName, testClassName); + JarUtils.createJar(jarFileName, testClassName, SOME_OTHER_SIG_FILE); + createManifestEntries(jarFileName); + rebreakManifest72bytes(jarFileName); verifyJarSignature(jarFileName); } static void testSignJarUpdate( String initialFileName, String updatedFileName) throws Exception { - JarUtils.createJar(initialFileName, manifestFileName, anotherName); - SecurityTools.jarsigner("-keystore", keystoreFileName, "-storetype", + JarUtils.createJar(initialFileName, testClassName, anotherName, + SOME_OTHER_SIG_FILE); + createManifestEntries(initialFileName); + rebreakManifest72bytes(initialFileName); + removeJarEntry(initialFileName, testClassName); + SecurityTools.jarsigner("-keystore", KEYSTORE_FILENAME, "-storetype", "JKS", "-storepass", "changeit", "-debug", initialFileName, - alias).shouldHaveExitValue(0); + ALIAS).shouldHaveExitValue(0); JarUtils.updateJar(initialFileName, updatedFileName, testClassName); verifyJarSignature(updatedFileName); } static void verifyJarSignature(String jarFileName) throws Exception { // actually sign the jar - SecurityTools.jarsigner("-keystore", keystoreFileName, "-storetype", - "JKS", "-storepass", "changeit", "-debug", jarFileName, alias) + SecurityTools.jarsigner("-keystore", KEYSTORE_FILENAME, "-storetype", + "JKS", "-storepass", "changeit", "-debug", jarFileName, ALIAS) .shouldHaveExitValue(0); try ( @@ -130,7 +145,7 @@ * the signature file does not even contain the desired 
entry at all. * * this relies on {@link java.util.jar.Manifest} breaking lines unaware - * of bytes that belong to the same multi-byte utf characters. + * of bytes that belong to the same multi-byte UTF-8 encoded characters. */ static void verifyClassNameLineBroken(JarFile jar, String className) throws IOException { @@ -142,7 +157,7 @@ throw new AssertionError(className + " not found in manifest"); } - JarEntry manifestEntry = jar.getJarEntry(JarFile.MANIFEST_NAME); + JarEntry manifestEntry = jar.getJarEntry(MANIFEST_NAME); try ( InputStream manifestIs = jar.getInputStream(manifestEntry); ) { @@ -159,7 +174,7 @@ } if (bytesMatched < eAcuteBroken.length) { throw new AssertionError("self-test failed: multi-byte " - + "utf-8 character not broken across lines"); + + "UTF-8 encoded character not broken across lines"); } } } @@ -183,4 +198,108 @@ } } + static void createManifestEntries(String jarFileName) throws Exception { + JarUtils.updateJarFile(Paths.get(jarFileName), + Paths.get("."), Paths.get(MANIFEST_NAME)); + SecurityTools.jarsigner("-keystore", KEYSTORE_FILENAME, + "-storepass", "changeit", "-debug", jarFileName, ALIAS) + .shouldHaveExitValue(0); + // remove the signature files, only manifest is used + removeJarEntry(jarFileName, + "META-INF/" + ALIAS + ".SF", + "META-INF/" + ALIAS + ".DSA"); + } + + @SuppressWarnings("deprecation") + static void removeJarEntry(String jarFileName, String... 
entryNames) + throws IOException { + String aCopy = "swap-" + jarFileName; + JarUtils.updateJar(jarFileName, aCopy, Arrays.asList(entryNames) + .stream().collect(Collectors.toMap(e -> e, e -> false))); + Files.copy(Paths.get(aCopy), Paths.get(jarFileName), + COPY_ATTRIBUTES, REPLACE_EXISTING); + Files.delete(Paths.get(aCopy)); + } + + static void rebreakManifest72bytes(String jarFileName) throws Exception { + byte[] manifest; + try (ZipFile zip = new ZipFile(jarFileName)) { + ZipEntry zipEntry = zip.getEntry(MANIFEST_NAME); + manifest = zip.getInputStream(zipEntry).readAllBytes(); + } + Utils.echoManifest(manifest, MANIFEST_NAME + " before re-break:"); + byte[] manifest72 = rebreak72bytes(manifest); + Utils.echoManifest(manifest72, MANIFEST_NAME + " after re-break:"); + String aCopy = "swap-" + jarFileName; + JarUtils.updateManifest(jarFileName, aCopy, new Manifest() { @Override + public void write(OutputStream out) throws IOException { + out.write(manifest72); + } + }); + Files.copy(Paths.get(aCopy), Paths.get(jarFileName), + COPY_ATTRIBUTES, REPLACE_EXISTING); + Files.delete(Paths.get(aCopy)); + } + + /** + * Simulates a jar manifest as it would have been created by an earlier + * JDK by re-arranging the line break at exactly 72 bytes content thereby + * breaking the multi-byte UTF-8 encoded character under test like before + * resolution of bug 6202130. + *

+ * The returned manifest is accepted as unmodified by + * {@link jdk.security.jarsigner.JarSigner#updateDigests + * (ZipEntry,ZipFile,MessageDigest[],Manifest)} on line 985: + *

+     * if (!mfDigest.equalsIgnoreCase(base64Digests[i])) {
+     * 
+ * and therefore left unchanged when the jar is signed and also signature + * verification will check it. + */ + static byte[] rebreak72bytes(byte[] mf0) { + byte[] mf1 = new byte[mf0.length]; + int c0 = 0, c1 = 0; // bytes since last line start + for (int i0 = 0, i1 = 0; i0 < mf0.length; i0++, i1++) { + switch (mf0[i0]) { + case '\r': + if (i0 + 2 < mf0.length && + mf0[i0 + 1] == '\n' && mf0[i0 + 2] == ' ') { + // skip line break + i0 += 2; + i1 -= 1; + } else { + mf1[i1] = mf0[i0]; + c0 = c1 = 0; + } + break; + case '\n': + if (i0 + 1 < mf0.length && mf0[i0 + 1] == ' ') { + // skip line break + i0 += 1; + i1 -= 1; + } else { + mf1[i1] = mf0[i0]; + c0 = c1 = 0; + } + break; + case ' ': + if (c0 == 0) { + continue; + } + default: + c0++; + if (c1 == 72) { + mf1[i1++] = '\r'; + mf1[i1++] = '\n'; + mf1[i1++] = ' '; + c1 = 1; + } else { + c1++; + } + mf1[i1] = mf0[i0]; + } + } + return mf1; + } + }