Start line:  
End line:  

Snippet Preview

Snippet HTML Code

Stack Overflow Questions
  package it.unimi.dsi.sux4j.mph;
  
  /*		 
   * Sux4J: Succinct data structures for Java
   *
   * Copyright (C) 2008-2014 Sebastiano Vigna 
   *
   *  This library is free software; you can redistribute it and/or modify it
   *  under the terms of the GNU Lesser General Public License as published by the Free
  *  Software Foundation; either version 3 of the License, or (at your option)
  *  any later version.
  *
  *  This library is distributed in the hope that it will be useful, but
  *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU Lesser General Public License
  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  *
  */
 
 import static it.unimi.dsi.bits.Fast.log2;
 import static it.unimi.dsi.sux4j.mph.HypergraphSorter.GAMMA;
 import static java.lang.Math.E;
 import static java.lang.Math.log;
 
 import java.io.File;
 
 
 import  com.martiansoftware.jsap.FlaggedOption;
 import  com.martiansoftware.jsap.JSAP;
 import  com.martiansoftware.jsap.JSAPException;
 import  com.martiansoftware.jsap.JSAPResult;
 import  com.martiansoftware.jsap.Parameter;
 import  com.martiansoftware.jsap.SimpleJSAP;
 import  com.martiansoftware.jsap.Switch;
 import  com.martiansoftware.jsap.UnflaggedOption;
 import  com.martiansoftware.jsap.stringparsers.FileStringParser;
 import  com.martiansoftware.jsap.stringparsers.ForNameStringParser;

A monotone minimal perfect hash implementation based on fixed-size bucketing that uses a z-fast trie as a distributor.

See the package overview for a comparison with other implementations. Similarly to an MWHCFunction, an instance of this class may be signed.

 
 
     public static final long serialVersionUID = 3L;
 	private static final Logger LOGGER = LoggerFactory.getLoggerZFastTrieDistributorMonotoneMinimalPerfectHashFunction.class );

The number of elements.
 
 	private final long size;
The logarithm of the bucket size.
 
 	private final int log2BucketSize;
The transformation strategy.
 
 	private final TransformationStrategy<? super T> transform;
A hollow trie distributor assigning keys to buckets.
 
The offset of each element into his bucket.
 
 	private final MWHCFunction<BitVectoroffset;
The seed returned by the ChunkedHashStore.
 
 	private long seed
The mask to compare signatures, or zero for no signatures.
 
 	protected final long signatureMask;
The signatures.
 
 	protected final LongBigList signatures;

A builder class for ZFastTrieDistributorMonotoneMinimalPerfectHashFunction.
 
 	public static class Builder<T> {
 		protected Iterable<? extends T> keys;
 		protected TransformationStrategy<? super T> transform;
 		protected long numKeys = -1;
 		protected int signatureWidth;
		protected File tempDir;
Whether build() has already been called.
		protected boolean built;

Specifies the keys to hash.

Parameters:
keys the keys to hash.
Returns:
this builder.
		public Builder<T> keysfinal Iterable<? extends T> keys ) {
			this. = keys;
			return this;
		}

Specifies the transformation strategy for the keys to hash.

Parameters:
transform a transformation strategy for the keys to hash.
Returns:
this builder.
		public Builder<T> transformfinal TransformationStrategy<? super T> transform ) {
			this. = transform;
			return this;
		}

Specifies that the resulting LcpMonotoneMinimalPerfectHashFunction should be signed using a given number of bits per key.

Parameters:
signatureWidth a signature width, or 0 for no signature.
Returns:
this builder.
		public Builder<T> signedfinal int signatureWidth ) {
			this. = signatureWidth;
			return this;
		}

Specifies a temporary directory for the ChunkedHashStore.

Parameters:
tempDir a temporary directory for the ChunkedHashStore. files, or null for the standard temporary directory.
Returns:
this builder.
		public Builder<T> tempDirfinal File tempDir ) {
			this. = tempDir;
			return this;
		}

Builds a monotone minimal perfect hash function based on a z-fast trie distributor.

Returns:
a ZFastTrieDistributorMonotoneMinimalPerfectHashFunction instance with the specified parameters.
Throws:
IllegalStateException if called more than once.
			if (  ) throw new IllegalStateException"This builder has been already used" );
			 = true;
		}
	}


Creates a new monotone minimal perfect hash function based on a z-fast trie distributor using the given keys and transformation strategy.

Deprecated:
Please use the new builder.
Parameters:
keys the keys among which the trie must be able to rank.
transform a transformation strategy that must turn the keys into a list of distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors.
	public ZFastTrieDistributorMonotoneMinimalPerfectHashFunctionfinal Iterable<? extends T> keysfinal TransformationStrategy<? super T> transform ) throws IOException {
		thiskeystransform, -1, 0, null );
	}

Creates a new monotone minimal perfect hash function based on a z-fast trie distributor using the given keys, transformation strategy and bucket size.

Parameters:
keys the keys among which the trie must be able to rank.
transform a transformation strategy that must turn the keys into a list of distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors.
log2BucketSize the logarithm of the bucket size, or -1 for the default value.
signatureWidth a signature width, or 0 for no signature.
tempDir a temporary directory for the store files, or null for the standard temporary directory.
	protected ZFastTrieDistributorMonotoneMinimalPerfectHashFunctionfinal Iterable<? extends T> keysfinal TransformationStrategy<? super T> transformfinal int log2BucketSizefinal int signatureWidthfinal File tempDir ) throws IOException {
		this. = transform;
		 = -1; // For the very few cases in which we can decide
		long maxLength = 0;
		long totalLength = 0;
		final ChunkedHashStore<BitVectorchunkedHashStore = new ChunkedHashStore<BitVector>( TransformationStrategies.identity(), tempDir );
		chunkedHashStore.resetr.nextLong() );
		final Iterable<BitVectorbitVectors = TransformationStrategies.wrapkeystransform );
		final ProgressLogger pl = new ProgressLogger );
		pl.displayLocalSpeed = true;
		pl.displayFreeMemory = true;
		pl.itemsName = "keys";
		pl.start"Scanning collection..." );
		forBitVector bvbitVectors ) {
			maxLength = Math.maxmaxLengthbv.length() );
			totalLength += bv.length();
			chunkedHashStore.addbv );
		}
		pl.done();
		chunkedHashStore.checkAndRetrybitVectors );
		 = chunkedHashStore.size();
		if (  == 0 ) {
			this. = -1;
			 = null;
			 = null;
			 = null;
			chunkedHashStore.close();
			return;
		}
		final long averageLength = ( totalLength +  - 1 ) / ;
		final long forecastBucketSize = (long)Math.ceil( 10.5 + 4.05 * logaverageLength ) + 2.43 * loglog (  ) + 1 ) + 2.43 * loglogaverageLength ) + 1 ) );
		this. = log2BucketSize == -1 ? Fast.mostSignificantBitforecastBucketSize ) : log2BucketSize;
		.debug"Average length: " + averageLength );
		.debug"Max length: " + maxLength );
		.debug"Bucket size: " + ( 1L << this. ) );
		.info"Computing z-fast trie distributor..." );
		 = new ZFastTrieDistributor<BitVector>( bitVectorsthis., TransformationStrategies.identity(), chunkedHashStore );
		.info"Computing offsets..." );
		 = new MWHCFunction.Builder<BitVector>().storechunkedHashStore ).valuesnew AbstractLongBigList() {
			public long getLonglong index ) {
				return index & 
			}
			public long size64() {
				return ;
			}
		}, this. ).indirect().build();
		//System.err.println( "*********" + chunkedHashStore.seed() );
		 = chunkedHashStore.seed();
		double logU = averageLength * log( 2 );
		.info"Forecast bit cost per element: "
				+ 1.0 / forecastBucketSize
				* ( -6 * log2log( 2 ) ) + 5 * log2logU ) + 2 * log2forecastBucketSize ) + 
						log2loglogU ) - loglog( 2 ) ) ) + 6 *  + 3 * log2 (  ) + 3 * log2log( 3.0 *  ) ) 
						+ 3 +  * forecastBucketSize +  * forecastBucketSize * log2forecastBucketSize ) ) ); 
		.info"Actual bit cost per element: " + (double)numBits() /  );
		if ( signatureWidth != 0 ) {
			 = -1L >>> . - signatureWidth;
			 = chunkedHashStore.signaturessignatureWidthpl );
		}
		else {
			 = null;
		}
		chunkedHashStore.close();
	}
	@SuppressWarnings("unchecked")
	public long getLongfinal Object o ) {
		if (  == 0 ) return ;
		final BitVector bv = .toBitVector( (T)o ).fast();
		final long state[][] = Hashes.preprocessJenkinsbv );
		final long[] triple = new long[ 3 ];
		Hashes.jenkinsbvbv.length(), state[ 0 ], state[ 1 ], state[ 2 ], triple );
		final long bucket = .getLongByBitVectorTripleAndStatebvtriplestate );
		final long result = ( bucket <<  ) + .getLongByTripletriple );
		if (  != 0 ) return result < 0 || result >=  || .getLongresult ) != ( triple[ 0 ] &  ) ?  : result;
		// Out-of-set strings can generate bizarre 3-hyperedges.
		return result < 0 || result >=  ?  : result;
	}
	public long size64() {
		return ;
	}
	public long numBits() {
		if (  == 0 ) return 0;
	}
	public static void mainfinal String[] arg ) throws NoSuchMethodExceptionIOException, JSAPException {
		final SimpleJSAP jsap = new SimpleJSAP( ZFastTrieDistributorMonotoneMinimalPerfectHashFunction.class.getName(), "Builds an PaCo trie-based monotone minimal perfect hash function reading a newline-separated list of strings.",
				new Parameter[] {
			new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e'"encoding""The string file encoding." ),
			new FlaggedOption( "tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T'"temp-dir""A directory for temporary files." ),
			new Switch( "huTucker"'h'"hu-tucker""Use Hu-Tucker coding to reduce string length." ),
			new Switch( "iso"'i'"iso""Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ),
			new Switch( "utf32", JSAP.NO_SHORTFLAG, "utf-32""Use UTF-32 internally (handles surrogate pairs)." ),
			new FlaggedOption( "signatureWidth", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's'"signature-width""If specified, the signature width in bits; if negative, the generated function will be a dictionary." ),
			new Switch( "zipped"'z'"zipped""The string list is compressed in gzip format." ),
			new FlaggedOption( "log2bucket", JSAP.INTEGER_PARSER, "-1", JSAP.NOT_REQUIRED, 'b'"log2bucket""The base 2 logarithm of the bucket size (mainly for testing)." ),
			new UnflaggedOption( "function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised monotone minimal perfect hash function." ),
			new UnflaggedOption( "stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory." ),
		});
		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;
		final String functionName = jsapResult.getString( "function" );
		final String stringFile = jsapResult.getString( "stringFile" );
		final int log2BucketSize = jsapResult.getInt( "log2bucket" );
		final Charset encoding = (Charset)jsapResult.getObject( "encoding" );
		final File tempDir = jsapResult.getFile( "tempDir" );
		final boolean zipped = jsapResult.getBoolean( "zipped" );
		final boolean iso = jsapResult.getBoolean( "iso" );
		final boolean utf32 = jsapResult.getBoolean( "utf32" );
		final boolean huTucker = jsapResult.getBoolean( "huTucker" );
		final int signatureWidth = jsapResult.getInt( "signatureWidth", 0 ); 
		final Collection<MutableStringcollection;
		if ( "-".equalsstringFile ) ) {
			final ProgressLogger pl = new ProgressLogger );
			pl.displayLocalSpeed = true;
			pl.displayFreeMemory = true;
			pl.start"Loading strings..." );
			collection = new LineIteratornew FastBufferedReadernew InputStreamReaderzipped ? new GZIPInputStream. ) : .encoding ) ), pl ).allLines();
			pl.done();
		}
		else collection = new FileLinesCollectionstringFileencoding.toString(), zipped );
		final TransformationStrategy<CharSequencetransformationStrategy = huTucker 
new HuTuckerTransformationStrategycollectiontrue )
iso
					? TransformationStrategies.prefixFreeIso() 
utf32
						? TransformationStrategies.prefixFreeUtf32()
						: TransformationStrategies.prefixFreeUtf16();
		BinIO.storeObjectnew ZFastTrieDistributorMonotoneMinimalPerfectHashFunction<CharSequence>( collectiontransformationStrategylog2BucketSizesignatureWidthtempDir ), functionName );
		.info"Completed." );
	}
New to GrepCode? Check out our FAQ X