Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
  package com.fasterxml.jackson.core.json;
  
  import java.io.*;
  
 /**
  * This class is used to determine the encoding of a byte stream that is
  * expected to contain JSON content. The rules are fairly simple and are
  * defined in the JSON specification (RFC 4627 or newer), except for BOM
  * handling, which is a property of the underlying streams.
  */
 
 public final class ByteSourceJsonBootstrapper
 {
     // The three bytes of the UTF-8 byte order mark (EF BB BF), in stream order.
     final static byte UTF8_BOM_1 = (byte) 0xEF;
     final static byte UTF8_BOM_2 = (byte) 0xBB;
     final static byte UTF8_BOM_3 = (byte) 0xBF;
     
     /*
     /**********************************************************
     /* Configuration
     /**********************************************************
      */
 
     // I/O context passed to the constructor; the stream-based constructor
     // allocates its read buffer from it via allocReadIOBuffer().
     protected final IOContext _context;
 
     // Stream that content is read from.
     // NOTE(review): appears to be null when the caller supplies a byte array
     // instead of a stream -- confirm against the byte[] constructor.
     protected final InputStream _in;
 
     /*
     /**********************************************************
     /* Input buffering
     /**********************************************************
      */
 
     // Buffer holding the bytes examined during bootstrapping: either
     // allocated from the I/O context or supplied directly by the caller.
     protected final byte[] _inputBuffer;
 
     // NOTE(review): presumably the index of the next unread byte in the buffer -- confirm.
     private int _inputPtr;
 
     // NOTE(review): presumably the index one past the last valid byte in the buffer -- confirm.
     private int _inputEnd;

    
     /**
      * Flag that indicates whether the input buffer above is to be recycled
      * after being used or not.
      */
 
     // NOTE(review): appears to be true only for the stream-based constructor,
     // whose buffer is allocated from the I/O context -- confirm.
     private final boolean _bufferRecyclable;
 
     /*
     /**********************************************************
     /* Input location
     /**********************************************************
      */

    
     /**
      * Current number of input units (bytes or chars) that were processed in
      * previous blocks, before the contents of the current input buffer.
      * <p>
      * Note: includes possible BOMs, if those were part of the input.
      */

 
     // NOTE(review): the byte[] constructor seems to seed this with -inputStart
     // so that reported locations are relative to the caller's offset -- confirm.
     protected int _inputProcessed;
 
     /*
     /**********************************************************
     /* Data gathered
     /**********************************************************
      */
 
     // Byte order of the detected encoding; only meaningful for the
     // multi-byte (2- and 4-byte) encodings. Defaults to big-endian.
     protected boolean _bigEndian = true;
 
     // Width of one character in the detected encoding: detectEncoding()
     // accepts 1, 2 or 4 once an encoding has been found.
     protected int _bytesPerChar = 0; // 0 means "dunno yet"
 
     /*
     /**********************************************************
     /* Life-cycle
     /**********************************************************
      */
 
     public ByteSourceJsonBootstrapper(IOContext ctxtInputStream in)
     {
          = ctxt;
          = in;
          = ctxt.allocReadIOBuffer();
          =  = 0;
          = 0;
          = true;
     }
 
     public ByteSourceJsonBootstrapper(IOContext ctxtbyte[] inputBufferint inputStartint inputLen)
     {
          = ctxt;
          = null;
          = inputBuffer;
          = inputStart;
         = (inputStart + inputLen);
        // Need to offset this for correct location info
         = -inputStart;
         = false;
    }
    /*
    /**********************************************************
    /*  Encoding detection during bootstrapping
    /**********************************************************
     */
    
    
    /**
     * Method that should be called after constructing an instance. It will
     * figure out the encoding that the content uses, to allow for
     * instantiating a proper scanner object.
     */
    public JsonEncoding detectEncoding()
        throws IOExceptionJsonParseException
    {
        boolean foundEncoding = false;
        // First things first: BOM handling
        /* Note: we can require 4 bytes to be read, since no
         * combination of BOM + valid JSON content can have
         * shorter length (shortest valid JSON content is single
         * digit char, but BOMs are chosen such that combination
         * is always at least 4 chars long)
         */
        if (ensureLoaded(4)) {
            int quad =  ([] << 24)
                | (([+1] & 0xFF) << 16)
                | (([+2] & 0xFF) << 8)
                | ([+3] & 0xFF);
            
            if (handleBOM(quad)) {
                foundEncoding = true;
            } else {
                /* If no BOM, need to auto-detect based on first char;
                 * this works since it must be 7-bit ascii (wrt. unicode
                 * compatible encodings, only ones JSON can be transferred
                 * over)
                 */
                // UTF-32?
                if (checkUTF32(quad)) {
                    foundEncoding = true;
                } else if (checkUTF16(quad >>> 16)) {
                    foundEncoding = true;
                }
            }
        } else if (ensureLoaded(2)) {
            int i16 = (([] & 0xFF) << 8)
                | ([+1] & 0xFF);
            if (checkUTF16(i16)) {
                foundEncoding = true;
            }
        }
        JsonEncoding enc;
        /* Not found yet? As per specs, this means it must be UTF-8. */
        if (!foundEncoding) {
            enc = .;
        } else {
            switch () {
            case 1:
                enc = .;
                break;
            case 2:
                enc =  ? . : .;
                break;
            case 4:
                enc =  ? . : .;
                break;
            default:
                throw new RuntimeException("Internal error"); // should never get here
            }
        }
        .setEncoding(enc);
        return enc;
    }
    /*
    /**********************************************************
    /* Constructing a Reader
    /**********************************************************
     */
    
    public Reader constructReader()
        throws IOException
    {
        JsonEncoding enc = .getEncoding();
        switch (enc) { 
        case :
        case :
            return new UTF32Reader(,
                                   .getEncoding().isBigEndian());
        case :
        case :
        case // only in non-common case where we don't want to do direct mapping
            {
                // First: do we have a Stream? If not, need to create one:
                InputStream in = ;
                if (in == null) {
                    in = new ByteArrayInputStream();
                } else {
                    /* Also, if we have any read but unused input (usually true),
                     * need to merge that input in:
                     */
                    if ( < ) {
                        in = new MergedStream(in);
                    }
                }
                return new InputStreamReader(inenc.getJavaName());
            }
        }
        throw new RuntimeException("Internal error"); // should never get here
    }
    public JsonParser constructParser(int parserFeaturesObjectCodec codec,
            BytesToNameCanonicalizer rootByteSymbolsCharsToNameCanonicalizer rootCharSymbols,
            boolean canonicalizeboolean intern)
        throws IOExceptionJsonParseException
    {
        JsonEncoding enc = detectEncoding();
        if (enc == .) {
            /* and without canonicalization, byte-based approach is not performance; just use std UTF-8 reader
             * (which is ok for larger input; not so hot for smaller; but this is not a common case)
             */
            if (canonicalize) {
                BytesToNameCanonicalizer can = rootByteSymbols.makeChild(canonicalizeintern);
                return new UTF8StreamJsonParser(parserFeaturescodeccan);
            }
        }
        return new ReaderBasedJsonParser(parserFeaturesconstructReader(), codec,
                rootCharSymbols.makeChild(canonicalizeintern));
    }
    /*
    /**********************************************************
    /*  Encoding detection for data format auto-detection
    /**********************************************************
     */

    
    /**
     * Current implementation is not as thorough as the other functionality
     * of this class (ByteSourceJsonBootstrapper): it only inspects the
     * content as UTF-8, for example. But it should work for now, and can be
     * improved as necessary.
     */
    public static MatchStrength hasJSONFormat(InputAccessor accthrows IOException
    {
        // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String)
        // in future could also consider accepting non-standard matches?
        
        if (!acc.hasMoreBytes()) {
            return .;
        }
        byte b = acc.nextByte();
        // Very first thing, a UTF-8 BOM?
        if (b == ) { // yes, looks like UTF-8 BOM
            if (!acc.hasMoreBytes()) {
                return .;
            }
            if (acc.nextByte() != ) {
                return .;
            }
            if (!acc.hasMoreBytes()) {
                return .;
            }
            if (acc.nextByte() != ) {
                return .;
            }
            if (!acc.hasMoreBytes()) {
                return .;
            }
            b = acc.nextByte();
        }
        // Then possible leading space
        int ch = skipSpace(accb);
        if (ch < 0) {
            return .;
        }
        // First, let's see if it looks like a structured type:
        if (ch == '{') { // JSON object?
            // Ideally we need to find either double-quote or closing bracket
            ch = skipSpace(acc);
            if (ch < 0) {
                return .;
            }
            if (ch == '"' || ch == '}') {
                return .;
            }
            // ... should we allow non-standard? Let's not yet... can add if need be
            return .;
        }
        MatchStrength strength;
        
        if (ch == '[') {
            ch = skipSpace(acc);
            if (ch < 0) {
                return .;
            }
            // closing brackets is easy; but for now, let's also accept opening...
            if (ch == ']' || ch == '[') {
                return .;
            }
            return .;
        } else {
            // plain old value is not very convincing...
            strength = .;
        }
        if (ch == '"') { // string value
            return strength;
        }
        if (ch <= '9' && ch >= '0') { // number
            return strength;
        }
        if (ch == '-') { // negative number
            ch = skipSpace(acc);
            if (ch < 0) {
                return .;
            }
            return (ch <= '9' && ch >= '0') ? strength : .;
        }
        // or one of literals
        if (ch == 'n') { // null
            return tryMatch(acc"ull"strength);
        }
        if (ch == 't') { // true
            return tryMatch(acc"rue"strength);
        }
        if (ch == 'f') { // false
            return tryMatch(acc"alse"strength);
        }
        return .;
    }
    private static MatchStrength tryMatch(InputAccessor accString matchStrMatchStrength fullMatchStrength)
        throws IOException
    {
        for (int i = 0, len = matchStr.length(); i < len; ++i) {
            if (!acc.hasMoreBytes()) {
                return .;
            }
            if (acc.nextByte() != matchStr.charAt(i)) {
                return .;
            }
        }
        return fullMatchStrength;
    }
    
    private static int skipSpace(InputAccessor accthrows IOException
    {
        if (!acc.hasMoreBytes()) {
            return -1;
        }
        return skipSpace(accacc.nextByte());
    }
    
    private static int skipSpace(InputAccessor accbyte bthrows IOException
    {
        while (true) {
            int ch = (intb & 0xFF;
            if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) {
                return ch;
            }
            if (!acc.hasMoreBytes()) {
                return -1;
            }
            b = acc.nextByte();
            ch = (intb & 0xFF;
        }
    }
    
    /*
    /**********************************************************
    /* Internal methods, parsing
    /**********************************************************
     */

    

    /**
     * @return True if a BOM was successfully found, and encoding thereby recognized.
     */
    private boolean handleBOM(int quad)
        throws IOException
    {
        /* Handling of (usually) optional BOM (required for
         * multi-byte formats); first 32-bit charsets:
         */
        switch (quad) {
        case 0x0000FEFF:
             = true;
             += 4;
             = 4;
            return true;
        case 0xFFFE0000: // UCS-4, LE?
             += 4;
             = 4;
             = false;
            return true;
        case 0x0000FFFE: // UCS-4, in-order...
            reportWeirdUCS4("2143"); // throws exception
        case 0xFEFF0000: // UCS-4, in-order...
            reportWeirdUCS4("3412"); // throws exception
        }
        // Ok, if not, how about 16-bit encoding BOMs?
        int msw = quad >>> 16;
        if (msw == 0xFEFF) { // UTF-16, BE
             += 2;
             = 2;
             = true;
            return true;
        }
        if (msw == 0xFFFE) { // UTF-16, LE
             += 2;
             = 2;
             = false;
            return true;
        }
        // And if not, then UTF-8 BOM?
        if ((quad >>> 8) == 0xEFBBBF) { // UTF-8
             += 3;
             = 1;
             = true// doesn't really matter
            return true;
        }
        return false;
    }
    private boolean checkUTF32(int quad)
        throws IOException
    {
        /* Handling of (usually) optional BOM (required for
         * multi-byte formats); first 32-bit charsets:
         */
        if ((quad >> 8) == 0) { // 0x000000?? -> UTF32-BE
             = true;
        } else if ((quad & 0x00FFFFFF) == 0) { // 0x??000000 -> UTF32-LE
             = false;
        } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order
            reportWeirdUCS4("3412");
        } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order
            reportWeirdUCS4("2143");
        } else {
            // Can not be valid UTF-32 encoded JSON...
            return false;
        }
        // Not BOM (just regular content), nothing to skip past:
        //_inputPtr += 4;
         = 4;
        return true;
    }
    private boolean checkUTF16(int i16)
    {
        if ((i16 & 0xFF00) == 0) { // UTF-16BE
             = true;
        } else if ((i16 & 0x00FF) == 0) { // UTF-16LE
             = false;
        } else { // nope, not  UTF-16
            return false;
        }
        // Not BOM (just regular content), nothing to skip past:
        //_inputPtr += 2;
         = 2;
        return true;
    }
    /*
    /**********************************************************
    /* Internal methods, problem reporting
    /**********************************************************
     */
    private void reportWeirdUCS4(String type)
        throws IOException
    {
        throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
    }
    /*
    /**********************************************************
    /* Internal methods, raw input access
    /**********************************************************
     */
    protected boolean ensureLoaded(int minimum)
        throws IOException
    {
        /* Let's assume here buffer has enough room -- this will always
         * be true for the limited used this method gets
         */
        int gotten = ( - );
        while (gotten < minimum) {
            int count;
            if ( == null) { // block source
                count = -1;
            } else {
                count = .read(. - );
            }
            if (count < 1) {
                return false;
            }
             += count;
            gotten += count;
        }
        return true;
    }
New to GrepCode? Check out our FAQ X