java/nio/charset/ModifiedUtf8.java - platform/prebuilts/fullsdk/sources/android-28 - Git at Google

 /*
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License
  */

 package java.nio.charset;

 import java.io.UTFDataFormatException;

 /**
  * Encoding and decoding methods for Modified UTF-8.
  *
  * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
  * 0xc0 0x80. This avoids the presence of zero bytes in the output. Every UTF-16 code unit of
  * the input string is encoded on its own, using one, two or three bytes.
  *
  * @hide
  */
 public class ModifiedUtf8 {

     /**
      * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
      *
      * @param s the string to measure
      * @param shortLength if true, the encoded size is required to fit in an unsigned
      *        16-bit value
      * @return the number of bytes needed to encode {@code s}
      * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size
      *         cannot be represented in an (unsigned) java short
      */
     public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
         long counter = 0;
         int strLen = s.length();
         for (int i = 0; i < strLen; i++) {
             char c = s.charAt(i);
             if (c < '\u0080') {
                 counter++;
                 // U+0000 is written as the two bytes 0xc0 0x80, so it costs one more byte.
                 if (c == '\u0000') {
                     counter++;
                 }
             } else if (c < '\u0800') {
                 counter += 2;
             } else {
                 counter += 3;
             }
         }
         // Allow up to the maximum value of an unsigned short (the value is known to be
         // non-negative).
         if (shortLength && counter > 0xffff) {
             throw new UTFDataFormatException(
                     "Size of the encoded string doesn't fit in two bytes");
         }
         return counter;
     }

     /**
      * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
      *
      * <p>The caller must make sure that {@code dst} has enough room after {@code offset};
      * no bounds check is performed here beyond the regular array access checks.
      *
      * @param dst the destination buffer
      * @param offset the index in {@code dst} of the first byte to write
      * @param s the string to encode
      */
     public static void encode(byte[] dst, int offset, String s) {
         int strLen = s.length();
         for (int i = 0; i < strLen; i++) {
             char c = s.charAt(i);
             if (c < '\u0080') {
                 if (c == 0) {
                     // Two-byte form of U+0000, keeps the output free of zero bytes.
                     dst[offset++] = (byte) 0xc0;
                     dst[offset++] = (byte) 0x80;
                 } else {
                     dst[offset++] = (byte) c;
                 }
             } else if (c < '\u0800') {
                 // 110xxxxx 10xxxxxx
                 dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                 dst[offset++] = (byte) ((c & 0x3f) | 0x80);
             } else {
                 // 1110xxxx 10xxxxxx 10xxxxxx
                 dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                 dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                 dst[offset++] = (byte) ((c & 0x3f) | 0x80);
             }
         }
     }

     /**
      * Encodes {@code s} into a newly allocated buffer with the following format:
      *
      * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
      * (as a big endian unsigned short);
      *
      * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
      * {@code encode(buf, 2, s)}).
      *
      * @param s the string to encode
      * @return the length-prefixed encoded bytes
      * @throws UTFDataFormatException if the encoded size cannot be represented in two bytes
      */
     public static byte[] encode(String s) throws UTFDataFormatException {
         // countBytes(s, true) guarantees size <= 0xffff, so the int cast is lossless.
         long size = countBytes(s, true);
         byte[] output = new byte[(int) size + 2];
         encode(output, 2, s);
         output[0] = (byte) (size >>> 8);
         output[1] = (byte) size;
         return output;
     }

     /**
      * Decodes {@code length} modified UTF-8 bytes from {@code in}, starting at offset
      * {@code offset}, into {@code out}.
      *
      * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
      * {@code out} is assumed to have enough space for the output (a standard
      * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
      *
      * <p>If a '0' byte is encountered, it is converted to U+0000.
      *
      * @param in the buffer holding the encoded bytes
      * @param out scratch buffer receiving the decoded chars
      * @param offset index of the first byte to decode in {@code in}
      * @param length number of bytes to decode
      * @return a string built from the chars written to {@code out}
      * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
      * @throws UTFDataFormatException if the input is truncated in the middle of a sequence,
      *         a continuation byte is malformed, or a byte that cannot start a sequence
      *         (10xxxxxx or 1111xxxx) is found where a lead byte is expected
      */
     public static String decode(byte[] in, char[] out, int offset, int length)
             throws UTFDataFormatException {
         if (offset < 0 || length < 0) {
             throw new IllegalArgumentException("Illegal arguments: offset " + offset
                     + ". Length: " + length);
         }
         int outputIndex = 0;
         int limitIndex = offset + length;
         while (offset < limitIndex) {
             int i = in[offset] & 0xff;
             offset++;
             if (i < 0x80) {
                 // Single byte: 0xxxxxxx.
                 out[outputIndex] = (char) i;
                 outputIndex++;
                 continue;
             }
             if (0xc0 <= i && i < 0xe0) {
                 // Two bytes: 110xxxxx 10xxxxxx. This branch covers the case 0 = 0xc080.

                 // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                 i = (i & 0x1f) << 6;
                 if (offset == limitIndex) {
                     throw new UTFDataFormatException("unexpected end of input");
                 }
                 // Include 6 least-significant bits of the input byte.
                 if ((in[offset] & 0xc0) != 0x80) {
                     throw new UTFDataFormatException("bad second byte at " + offset);
                 }
                 out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                 offset++;
                 outputIndex++;
             } else if (0xe0 <= i && i < 0xf0) {
                 // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. The lower bound matters: without
                 // it a stray continuation byte (0x80..0xbf) would be accepted as a lead byte.

                 // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                 // + 6 l-s of next to next input byte.
                 i = (i & 0x0f) << 12;
                 // Make sure there are at least two bytes left.
                 if (offset + 1 >= limitIndex) {
                     throw new UTFDataFormatException("unexpected end of input");
                 }
                 // Include 6 least-significant bits of the input byte, with 6 bits of room
                 // for the next byte.
                 if ((in[offset] & 0xc0) != 0x80) {
                     throw new UTFDataFormatException("bad second byte at " + offset);
                 }
                 i = i | (in[offset] & 0x3f) << 6;
                 offset++;
                 // Include 6 least-significant bits of the input byte.
                 if ((in[offset] & 0xc0) != 0x80) {
                     throw new UTFDataFormatException("bad third byte at " + offset);
                 }
                 out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                 offset++;
                 outputIndex++;
             } else {
                 // 10xxxxxx (continuation byte in lead position) or 1111xxxx.
                 throw new UTFDataFormatException("Invalid UTF8 byte "
                         + i + " at position " + (offset - 1));
             }
         }
         return String.valueOf(out, 0, outputIndex);
     }
 }
	/*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License
	*/

	package java.nio.charset;

	import java.io.UTFDataFormatException;

	/**
	 * Encoding and decoding methods for Modified UTF-8.
	 *
	 * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
	 * 0xc0 0x80. This avoids the presence of zero bytes in the output. Every UTF-16 code unit of
	 * the input string is encoded on its own, using one, two or three bytes.
	 *
	 * @hide
	 */
	public class ModifiedUtf8 {

	    /**
	     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
	     *
	     * @param s the string to measure
	     * @param shortLength if true, the encoded size is required to fit in an unsigned
	     *        16-bit value
	     * @return the number of bytes needed to encode {@code s}
	     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size
	     *         cannot be represented in an (unsigned) java short
	     */
	    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
	        long counter = 0;
	        int strLen = s.length();
	        for (int i = 0; i < strLen; i++) {
	            char c = s.charAt(i);
	            if (c < '\u0080') {
	                counter++;
	                // U+0000 is written as the two bytes 0xc0 0x80, so it costs one more byte.
	                if (c == '\u0000') {
	                    counter++;
	                }
	            } else if (c < '\u0800') {
	                counter += 2;
	            } else {
	                counter += 3;
	            }
	        }
	        // Allow up to the maximum value of an unsigned short (the value is known to be
	        // non-negative).
	        if (shortLength && counter > 0xffff) {
	            throw new UTFDataFormatException(
	                    "Size of the encoded string doesn't fit in two bytes");
	        }
	        return counter;
	    }

	    /**
	     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
	     *
	     * <p>The caller must make sure that {@code dst} has enough room after {@code offset};
	     * no bounds check is performed here beyond the regular array access checks.
	     *
	     * @param dst the destination buffer
	     * @param offset the index in {@code dst} of the first byte to write
	     * @param s the string to encode
	     */
	    public static void encode(byte[] dst, int offset, String s) {
	        int strLen = s.length();
	        for (int i = 0; i < strLen; i++) {
	            char c = s.charAt(i);
	            if (c < '\u0080') {
	                if (c == 0) {
	                    // Two-byte form of U+0000, keeps the output free of zero bytes.
	                    dst[offset++] = (byte) 0xc0;
	                    dst[offset++] = (byte) 0x80;
	                } else {
	                    dst[offset++] = (byte) c;
	                }
	            } else if (c < '\u0800') {
	                // 110xxxxx 10xxxxxx
	                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
	                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
	            } else {
	                // 1110xxxx 10xxxxxx 10xxxxxx
	                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
	                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
	                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
	            }
	        }
	    }

	    /**
	     * Encodes {@code s} into a newly allocated buffer with the following format:
	     *
	     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
	     * (as a big endian unsigned short);
	     *
	     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
	     * {@code encode(buf, 2, s)}).
	     *
	     * @param s the string to encode
	     * @return the length-prefixed encoded bytes
	     * @throws UTFDataFormatException if the encoded size cannot be represented in two bytes
	     */
	    public static byte[] encode(String s) throws UTFDataFormatException {
	        // countBytes(s, true) guarantees size <= 0xffff, so the int cast is lossless.
	        long size = countBytes(s, true);
	        byte[] output = new byte[(int) size + 2];
	        encode(output, 2, s);
	        output[0] = (byte) (size >>> 8);
	        output[1] = (byte) size;
	        return output;
	    }

	    /**
	     * Decodes {@code length} modified UTF-8 bytes from {@code in}, starting at offset
	     * {@code offset}, into {@code out}.
	     *
	     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
	     * {@code out} is assumed to have enough space for the output (a standard
	     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
	     *
	     * <p>If a '0' byte is encountered, it is converted to U+0000.
	     *
	     * @param in the buffer holding the encoded bytes
	     * @param out scratch buffer receiving the decoded chars
	     * @param offset index of the first byte to decode in {@code in}
	     * @param length number of bytes to decode
	     * @return a string built from the chars written to {@code out}
	     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
	     * @throws UTFDataFormatException if the input is truncated in the middle of a sequence,
	     *         a continuation byte is malformed, or a byte that cannot start a sequence
	     *         (10xxxxxx or 1111xxxx) is found where a lead byte is expected
	     */
	    public static String decode(byte[] in, char[] out, int offset, int length)
	            throws UTFDataFormatException {
	        if (offset < 0 || length < 0) {
	            throw new IllegalArgumentException("Illegal arguments: offset " + offset
	                    + ". Length: " + length);
	        }
	        int outputIndex = 0;
	        int limitIndex = offset + length;
	        while (offset < limitIndex) {
	            int i = in[offset] & 0xff;
	            offset++;
	            if (i < 0x80) {
	                // Single byte: 0xxxxxxx.
	                out[outputIndex] = (char) i;
	                outputIndex++;
	                continue;
	            }
	            if (0xc0 <= i && i < 0xe0) {
	                // Two bytes: 110xxxxx 10xxxxxx. This branch covers the case 0 = 0xc080.

	                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
	                i = (i & 0x1f) << 6;
	                if (offset == limitIndex) {
	                    throw new UTFDataFormatException("unexpected end of input");
	                }
	                // Include 6 least-significant bits of the input byte.
	                if ((in[offset] & 0xc0) != 0x80) {
	                    throw new UTFDataFormatException("bad second byte at " + offset);
	                }
	                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
	                offset++;
	                outputIndex++;
	            } else if (0xe0 <= i && i < 0xf0) {
	                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. The lower bound matters: without
	                // it a stray continuation byte (0x80..0xbf) would be accepted as a lead byte.

	                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
	                // + 6 l-s of next to next input byte.
	                i = (i & 0x0f) << 12;
	                // Make sure there are at least two bytes left.
	                if (offset + 1 >= limitIndex) {
	                    throw new UTFDataFormatException("unexpected end of input");
	                }
	                // Include 6 least-significant bits of the input byte, with 6 bits of room
	                // for the next byte.
	                if ((in[offset] & 0xc0) != 0x80) {
	                    throw new UTFDataFormatException("bad second byte at " + offset);
	                }
	                i = i | (in[offset] & 0x3f) << 6;
	                offset++;
	                // Include 6 least-significant bits of the input byte.
	                if ((in[offset] & 0xc0) != 0x80) {
	                    throw new UTFDataFormatException("bad third byte at " + offset);
	                }
	                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
	                offset++;
	                outputIndex++;
	            } else {
	                // 10xxxxxx (continuation byte in lead position) or 1111xxxx.
	                throw new UTFDataFormatException("Invalid UTF8 byte "
	                        + i + " at position " + (offset - 1));
	            }
	        }
	        return String.valueOf(out, 0, outputIndex);
	    }
	}