Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
  /*
   * Copyright (C) 2013 The Guava Authors
   *
   * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
   * in compliance with the License. You may obtain a copy of the License at
   *
   * http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software distributed under the License
  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
  * or implied. See the License for the specific language governing permissions and limitations under
  * the License.
  */
 
 package com.google.common.base;
 
 import static com.google.common.base.Preconditions.checkPositionIndexes;
 import static java.lang.Character.MAX_SURROGATE;
 import static java.lang.Character.MIN_SURROGATE;
 
 
Low-level, high-performance utility methods related to the UTF-8 character encoding. UTF-8 is defined in section D92 of The Unicode Standard Core Specification, Chapter 3.

The variant of UTF-8 implemented by this class is the restricted definition of UTF-8 introduced in Unicode 3.1. One implication of this is that it rejects "non-shortest form" byte sequences, even though the JDK decoder may accept them.

Author(s):
Martin Buchholz
Clément Roux
Since:
16.0
 
 public final class Utf8 {
  
Returns the number of bytes in the UTF-8-encoded form of sequence. For a string, this method is equivalent to string.getBytes(UTF_8).length, but is more efficient in both time and space.

Throws:
java.lang.IllegalArgumentException if sequence contains ill-formed UTF-16 (unpaired surrogates)
 
   public static int encodedLength(CharSequence sequence) {
     // Warning to maintainers: this implementation is highly optimized.
     int utf16Length = sequence.length();
     int utf8Length = utf16Length;
     int i = 0;
 
     // This loop optimizes for pure ASCII.
     while (i < utf16Length && sequence.charAt(i) < 0x80) {
       i++;
     }
 
     // This loop optimizes for chars less than 0x800.
     for (; i < utf16Lengthi++) {
       char c = sequence.charAt(i);
       if (c < 0x800) {
         utf8Length += ((0x7f - c) >>> 31); // branch free!
       } else {
         utf8Length += encodedLengthGeneral(sequencei);
         break;
       }
     }
 
     if (utf8Length < utf16Length) {
       // Necessary and sufficient condition for overflow because of maximum 3x expansion
       throw new IllegalArgumentException(
           "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
     }
     return utf8Length;
   }
 
   private static int encodedLengthGeneral(CharSequence sequenceint start) {
     int utf16Length = sequence.length();
     int utf8Length = 0;
     for (int i = starti < utf16Lengthi++) {
       char c = sequence.charAt(i);
       if (c < 0x800) {
         utf8Length += (0x7f - c) >>> 31; // branch free!
       } else {
         utf8Length += 2;
         // jdk7+: if (Character.isSurrogate(c)) {
         if ( <= c && c <= ) {
           // Check that we have a well-formed surrogate pair.
           if (Character.codePointAt(sequencei) == c) {
             throw new IllegalArgumentException(unpairedSurrogateMsg(i));
           }
           i++;
         }
      }
    }
    return utf8Length;
  }

  
Returns true if bytes is a well-formed UTF-8 byte sequence according to Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte sequences, but encoding never reproduces these. Such byte sequences are not considered well-formed.

This method returns true if and only if Arrays.equals(bytes, new String(bytes, UTF_8).getBytes(UTF_8)) does, but is more efficient in both time and space.

  public static boolean isWellFormed(byte[] bytes) {
    return isWellFormed(bytes, 0, bytes.length);
  }

  
Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by isWellFormed(byte[]). Note that this can be false even when isWellFormed(bytes) is true.

Parameters:
bytes the input buffer
off the offset in the buffer of the first byte to read
len the number of bytes to read from the buffer
  public static boolean isWellFormed(byte[] bytesint offint len) {
    int end = off + len;
    checkPositionIndexes(offendbytes.length);
    // Look for the first non-ASCII character.
    for (int i = offi < endi++) {
      if (bytes[i] < 0) {
        return isWellFormedSlowPath(bytesiend);
      }
    }
    return true;
  }
  private static boolean isWellFormedSlowPath(byte[] bytesint offint end) {
    int index = off;
    while (true) {
      int byte1;
      // Optimize for interior runs of ASCII bytes.
      do {
        if (index >= end) {
          return true;
        }
      } while ((byte1 = bytes[index++]) >= 0);
      if (byte1 < (byte) 0xE0) {
        // Two-byte form.
        if (index == end) {
          return false;
        }
        // Simultaneously check for illegal trailing-byte in leading position
        // and overlong 2-byte form.
        if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
          return false;
        }
      } else if (byte1 < (byte) 0xF0) {
        // Three-byte form.
        if (index + 1 >= end) {
          return false;
        }
        int byte2 = bytes[index++];
        if (byte2 > (byte) 0xBF
            // Overlong? 5 most significant bits must not all be zero.
            || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
            // Check for illegal surrogate codepoints.
            || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
            // Third byte trailing-byte test.
            || bytes[index++] > (byte) 0xBF) {
          return false;
        }
      } else {
        // Four-byte form.
        if (index + 2 >= end) {
          return false;
        }
        int byte2 = bytes[index++];
        if (byte2 > (byte) 0xBF
            // Check that 1 <= plane <= 16. Tricky optimized form of:
            // if (byte1 > (byte) 0xF4
            //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
            //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
            || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
            // Third byte trailing-byte test
            || bytes[index++] > (byte) 0xBF
            // Fourth byte trailing-byte test
            || bytes[index++] > (byte) 0xBF) {
          return false;
        }
      }
    }
  }
  private static String unpairedSurrogateMsg(int i) {
    return "Unpaired surrogate at index " + i;
  }
  private Utf8() {}
New to GrepCode? Check out our FAQ X