Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
  package it.unimi.dsi.sux4j.mph;
  
  /*		 
   * Sux4J: Succinct data structures for Java
   *
   * Copyright (C) 2008-2014 Sebastiano Vigna 
   *
   *  This library is free software; you can redistribute it and/or modify it
   *  under the terms of the GNU Lesser General Public License as published by the Free
  *  Software Foundation; either version 3 of the License, or (at your option)
  *  any later version.
  *
  *  This library is distributed in the hope that it will be useful, but
  *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU Lesser General Public License
  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  *
  */
 
 
 import java.io.File;
 
 
 import  com.martiansoftware.jsap.FlaggedOption;
 import  com.martiansoftware.jsap.JSAP;
 import  com.martiansoftware.jsap.JSAPException;
 import  com.martiansoftware.jsap.JSAPResult;
 import  com.martiansoftware.jsap.Parameter;
 import  com.martiansoftware.jsap.SimpleJSAP;
 import  com.martiansoftware.jsap.Switch;
 import  com.martiansoftware.jsap.UnflaggedOption;
 import  com.martiansoftware.jsap.stringparsers.FileStringParser;
 import  com.martiansoftware.jsap.stringparsers.ForNameStringParser;

A monotone minimal perfect hash implementation based on fixed-size bucketing that uses longest common prefixes as distributors, and store their lengths using a TwoStepsMWHCFunction.

This implementation should use a few less bits per elements than LcpMonotoneMinimalPerfectHashFunction, but it is a bit slower as one or two additional functions must be queried.

See the package overview for a comparison with other implementations. Similarly to an MWHCFunction, an instance of this class may be signed.

 
 
     public static final long serialVersionUID = 3L;
 	private static final Logger LOGGER = LoggerFactory.getLoggerTwoStepsLcpMonotoneMinimalPerfectHashFunction.class );
 	private static final boolean DEBUG = false;
 	private static final boolean ASSERTS = false;

A builder class for TwoStepsLcpMonotoneMinimalPerfectHashFunction.
 
 	public static class Builder<T> {
 		protected Iterable<? extends T> keys;
 		protected TransformationStrategy<? super T> transform;
 		protected long numKeys = -1;
 		protected int signatureWidth;
 		protected File tempDir;
Whether build() has already been called.
 
 		protected boolean built;

Specifies the keys to hash.

Parameters:
keys the keys to hash.
Returns:
this builder.
 
 		public Builder<T> keysfinal Iterable<? extends T> keys ) {
			this. = keys;
			return this;
		}

Specifies the number of keys.

The argument must be equal to the number of keys returned by an iterator generated by the set of keys. Without this information, a first scan of the key set will be necessary to compute its cardinality, unless the set of keys implements Size64 or Collection.

Parameters:
numKeys the keys to hash.
Returns:
this builder.
		public Builder<T> numKeysfinal long numKeys ) {
			this. = numKeys;
			return this;
		}

Specifies the transformation strategy for the keys to hash.

Parameters:
transform a transformation strategy for the keys to hash.
Returns:
this builder.
		public Builder<T> transformfinal TransformationStrategy<? super T> transform ) {
			this. = transform;
			return this;
		}

Specifies that the resulting LcpMonotoneMinimalPerfectHashFunction should be signed using a given number of bits per key.

Parameters:
signatureWidth a signature width, or 0 for no signature.
Returns:
this builder.
		public Builder<T> signedfinal int signatureWidth ) {
			this. = signatureWidth;
			return this;
		}

Specifies a temporary directory for the ChunkedHashStore.

Parameters:
tempDir a temporary directory for the ChunkedHashStore. files, or null for the standard temporary directory.
Returns:
this builder.
		public Builder<T> tempDirfinal File tempDir ) {
			this. = tempDir;
			return this;
		}

Builds a two-steps LCP monotone minimal perfect hash function.

Returns:
a TwoStepsLcpMonotoneMinimalPerfectHashFunction instance with the specified parameters.
Throws:
IllegalStateException if called more than once.
			if (  ) throw new IllegalStateException"This builder has been already used" );
			 = true;
		}
	}


The number of elements.
	protected final long n;
The size of a bucket.
	protected final int bucketSize;
Fast.ceilLog2(int) of bucketSize.
	protected final int log2BucketSize;
The mask for log2BucketSize bits.
	protected final int bucketSizeMask;
A function mapping each element to the offset inside its bucket.
	protected final MWHCFunction<BitVectoroffsets;
A function mapping each element to the length of the longest common prefix of its bucket.
A function mapping each longest common prefix to its bucket.
	protected final MWHCFunction<BitVectorlcp2Bucket;
The transformation strategy.
	protected final TransformationStrategy<? super T> transform;
The seed returned by the ChunkedHashStore.
	protected final long seed;
The mask to compare signatures, or zero for no signatures.
	protected final long signatureMask;
The signatures.
	protected final LongBigList signatures

Creates a new two-steps LCP monotone minimal perfect hash function for the given keys.

Deprecated:
Please use the new builder.
Parameters:
keys the keys to hash.
transform a transformation strategy for the keys.
	public TwoStepsLcpMonotoneMinimalPerfectHashFunctionfinal Iterable<? extends T> keysfinal TransformationStrategy<? super T> transform ) throws IOException {
		thiskeys, -1, transform );
	}

Creates a new two-steps LCP monotone minimal perfect hash function for the given keys.

Deprecated:
Please use the new builder.
Parameters:
keys the keys to hash.
numKeys the number of keys, or -1 if the number of keys is not known (will be computed).
transform a transformation strategy for the keys.
	public TwoStepsLcpMonotoneMinimalPerfectHashFunctionfinal Iterable<? extends T> keysfinal int numKeysfinal TransformationStrategy<? super T> transform ) throws IOException {
		thiskeysnumKeystransform, 0, null );
	}

Creates a new two-steps LCP monotone minimal perfect hash function for the given keys.

Parameters:
keys the keys to hash.
numKeys the number of keys, or -1 if the number of keys is not known (will be computed).
transform a transformation strategy for the keys.
signatureWidth a signature width, or 0 for no signature.
tempDir a temporary directory for the store files, or null for the standard temporary directory.
	protected TwoStepsLcpMonotoneMinimalPerfectHashFunctionfinal Iterable<? extends T> keysfinal long numKeysfinal TransformationStrategy<? super T> transformfinal int signatureWidthfinal File tempDir ) throws IOException {
		final ProgressLogger pl = new ProgressLogger );
		pl.displayLocalSpeed = true;
		pl.displayFreeMemory = true;
		this. = transform;
		if ( numKeys == -1 ) {
			if ( keys instanceof Size64 )  = ((Size64)keys).size64();
			else if ( keys instanceof Collection )  = ((Collection<?>)keys).size();
			else {
				long c = 0;
				for( T dummykeys ) c++;
				 = c;
			}
		}
		else  = numKeys;
		 = -1; // For the very few cases in which we can decide
		if (  == 0 ) {
			 = null;
			 = null;
			 = null;
			 = null;
			return;
		}
		int t = (int)Math.ceil( 1 + . * Math.log( 2 ) + Math.log ) - Math.log( 1 + Math.log ) ) );
		 = Fast.ceilLog2t );
		.debug"Bucket size: " +  );
		final long numBuckets = (  +  - 1 ) / ;
		LongArrayBitVector prev = LongArrayBitVector.getInstance();
		LongArrayBitVector curr = LongArrayBitVector.getInstance();
		int currLcp = 0;
		@SuppressWarnings("resource")
		final int[][] lcpLengths = IntBigArrays.newBigArray( (  +  - 1 ) /  );
		int maxLcp = 0;
		long maxLength = 0;
		@SuppressWarnings("resource")
		final ChunkedHashStore<BitVectorchunkedHashStore = new ChunkedHashStore<BitVector>( TransformationStrategies.identity(), pl );
		chunkedHashStore.resetr.nextLong() );
		pl.expectedUpdates = ;
		pl.start"Scanning collection..." );
		Iterator<? extends T> iterator = keys.iterator();
		forlong b = 0; b < numBucketsb++ ) {
			prev.replacetransform.toBitVectoriterator.next() ) );
			chunkedHashStore.addprev );
			maxLength = Math.maxmaxLengthprev.length() );
			currLcp = (int)prev.length();
			final int currBucketSize = (int)Math.min - b *  );
			forint i = 0; i < currBucketSize - 1; i++ ) {
				curr.replacetransform.toBitVectoriterator.next() ) );
				chunkedHashStore.addcurr );
				final int prefix = (int)curr.longestCommonPrefixLengthprev );
				if ( prefix == prev.length() && prefix == curr.length()  ) throw new IllegalArgumentException"The input bit vectors are not distinct" );
				if ( prefix == prev.length() || prefix == curr.length() ) throw new IllegalArgumentException"The input bit vectors are not prefix-free" );
				if ( prev.getBooleanprefix ) ) throw new IllegalArgumentException"The input bit vectors are not lexicographically sorted" );
				currLcp = Math.minprefixcurrLcp );
				prev.replacecurr );
				maxLength = Math.maxmaxLengthprev.length() );
			}
			lcps.addprev.subVector( 0, currLcp  ) );
			IntBigArrays.setlcpLengthsbcurrLcp );
			maxLcp = Math.maxmaxLcpcurrLcp );
		}
		pl.done();
		// We must be sure that both functions are built on the same store.
		chunkedHashStore.checkAndRetry( TransformationStrategies.wrapkeystransform ) );
		this. = chunkedHashStore.seed();
		if (  ) {
			forLongArrayBitVector bvlcps ) s.addbv.copy() );
			assert s.size() == lcps.size() : s.size() + " != " + lcps.size(); // No duplicates.
		}
		// Build function assigning each lcp to its bucket.
		 = new MWHCFunction.Builder<BitVector>().keyslcps ).transform( TransformationStrategies.identity() ).build();
		if (  ) {
			int p = 0;
			forBitVector vlcps ) ..printlnv  + " " + v.length() );
			forBitVector vlcps ) {
				final long value = .getLongv );
				if ( p++ != value ) {
					..println"p: " + (p-1) + "  value: " + value + " key:" + v );
					throw new AssertionError();
				}
			}
		}
		lcps.close();
		// Build function assigning the bucket offset to each element.
		 = new MWHCFunction.Builder<BitVector>().storechunkedHashStore ).valuesnew AbstractLongBigList() {
			public long getLonglong index ) {
				return index & 
			}
			public long size64() {
				return ;
			}
		// Build function assigning the lcp length to each element.
		this. = new TwoStepsMWHCFunction.Builder<BitVector>().storechunkedHashStore ).valuesnew AbstractLongBigList() {
			public long getLonglong index ) {
				return IntBigArrays.getlcpLengthsindex >>>  ); 
			}
			public long size64() {
				return ;
			}
		} ).build();
		// Build function assigning the lcp length and the bucketing data to each element.
		final double p = 1.0 / ( this.. + 1 );
		final double s = spthis.. );
		.debug"Forecast best threshold: " + s ); 
		if (  ) {
			int j = 0;
			for( T keykeys ) {
				BitVector bv = transform.toBitVectorkey );
				if ( j++ != .getLongbv.subVector( 0, this..getLongbv ) ) ) *  + .getLongbv ) ) {
					..println"p: " + ( j - 1 ) 
"  Key: " + key 
" bucket size: " +  
" lcp " + transform.toBitVectorkey ).subVector( 0, this..getLongbv ) )
" lcp length: " + this..getLongbv ) 
" bucket " + .getLongtransform.toBitVectorkey ).subVector( 0, this..getLongbv ) ) ) 
" offset: " + .getLongbv ) );
					throw new AssertionError();
				}
			}
		}
		double secondFunctionForecastBitsPerElement = ( s + . + ( Math.pow( 2, s ) - 1 ) * this.. /  + ( this.. + . ) * ( Math.pow( 1 - p, Math.pow( 2, s ) + 1 ) ) );
		.debug"Forecast bit cost per element: " + (  + . + secondFunctionForecastBitsPerElement + ( Fast.log2. ) ) ) ); 
		.info"Actual bit cost per element: " + (double)numBits() /  );
		if ( signatureWidth != 0 ) {
			 = -1L >>> . - signatureWidth;
			chunkedHashStore.filternull ); // TwoStepsMWHCFunction uses filtering.
			 = chunkedHashStore.signaturessignatureWidthpl );
		}
		else {
			 = null;
		}
		chunkedHashStore.close();
	}
	private static double Wdouble x ) {
		return - Math.log( -1/x ) - Math.log( Math.log( -1/x ) );
	}
	private static double sdouble pint r ) {
		return Fast.log2W( 1 / ( Math.log(2) * ( r + . ) * ( p - 1 ) ) ) / Math.log( 1 - p ) );
	}
	public long size64() {
		return ;
	}

Returns the number of bits used by this structure.

Returns:
the number of bits used by this structure.
	public long numBits() {
		if (  == 0 ) return 0;
	}
	@SuppressWarnings("unchecked")
	public long getLongfinal Object o ) {
		if (  == 0 ) return ;
		final BitVector bitVector = .toBitVector( (T)o ).fast();
		final long[] triple = new long[ 3 ];
		Hashes.jenkinsbitVectortriple );
		final long prefix = .getLongByTripletriple ); 
		if ( prefix == -1 || prefix > bitVector.length() ) return ;
		final long result = ( .getLongbitVector.subVector( 0, prefix ) ) <<  ) + .getLongByTripletriple );
		if (  != 0 ) return result < 0 || result >=  || .getLongresult ) != ( triple[ 0 ] &  ) ?  : result;
		// Out-of-set strings can generate bizarre 3-hyperedges.
		return result < 0 || result >=  ?  : result;
	}
	public long getLongByBitVectorAndTriplefinal BitVector bitVectorfinal long[] triple ) {
		if (  == 0 ) return ;
		final long prefix = .getLongByTripletriple ); 
		if ( prefix == -1 || prefix > bitVector.length() ) return ;
		final long result = ( .getLongbitVector.subVector( 0, prefix ) ) <<  ) + .getLongByTripletriple );
		if (  != 0 ) return result < 0 || result >=  || .getLongresult ) != ( triple[ 0 ] &  ) ?  : result;
		// Out-of-set strings can generate bizarre 3-hyperedges.
		return result < 0 || result >=  ?  : result;
	}
	public static void mainfinal String[] arg ) throws NoSuchMethodExceptionIOException, JSAPException {
		final SimpleJSAP jsap = new SimpleJSAP( TwoStepsLcpMonotoneMinimalPerfectHashFunction.class.getName(), "Builds a two-steps LCP-based monotone minimal perfect hash function reading a newline-separated list of strings.",
				new Parameter[] {
			new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e'"encoding""The string file encoding." ),
			new FlaggedOption( "tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T'"temp-dir""A directory for temporary files." ),
			new Switch( "huTucker"'h'"hu-tucker""Use Hu-Tucker coding to reduce string length." ),
			new Switch( "iso"'i'"iso""Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ),
			new Switch( "utf32", JSAP.NO_SHORTFLAG, "utf-32""Use UTF-32 internally (handles surrogate pairs)." ),
			new FlaggedOption( "signatureWidth", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's'"signature-width""If specified, the signature width in bits; if negative, the generated function will be a dictionary." ),
			new Switch( "zipped"'z'"zipped""The string list is compressed in gzip format." ),
			new UnflaggedOption( "function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised monotone minimal perfect hash function." ),
			new UnflaggedOption( "stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory." ),
		});
		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;
		final String functionName = jsapResult.getString( "function" );
		final String stringFile = jsapResult.getString( "stringFile" );
		final Charset encoding = (Charset)jsapResult.getObject( "encoding" );
		final File tempDir = jsapResult.getFile( "tempDir" );
		final boolean zipped = jsapResult.getBoolean( "zipped" );
		final boolean iso = jsapResult.getBoolean( "iso" );
		final boolean utf32 = jsapResult.getBoolean( "utf32" );
		final int signatureWidth = jsapResult.getInt( "signatureWidth", 0 ); 
		final Collection<MutableStringcollection;
		if ( "-".equalsstringFile ) ) {
			final ProgressLogger pl = new ProgressLogger );
			pl.displayLocalSpeed = true;
			pl.displayFreeMemory = true;
			pl.start"Loading strings..." );
			collection = new LineIteratornew FastBufferedReadernew InputStreamReaderzipped ? new GZIPInputStream. ) : .encoding ) ), pl ).allLines();
			pl.done();
		}
		else collection = new FileLinesCollectionstringFileencoding.toString(), zipped );
		final TransformationStrategy<CharSequencetransformationStrategy = iso
				? TransformationStrategies.prefixFreeIso()
utf32 
					? TransformationStrategies.prefixFreeUtf32()
					: TransformationStrategies.prefixFreeUtf16();
		BinIO.storeObjectnew TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence>( collection, -1, transformationStrategysignatureWidthtempDir ), functionName );
		.info"Completed." );
	}
New to GrepCode? Check out our FAQ X