Re: 64-bit hashing function
On Mon, 21 Apr 2014 12:51:32 +0200, Marcel M?ller
<news.5.maazl@spamgourmet.org> wrote, quoted or indirectly quoted
someone who said :
There is no map needed for that purpose at all
Here is my first cut at the code. I have not run it yet;
it shows what I had in mind.
/*
* [Lazy.java]
*
* Summary: Lets us avoid the work of expanding macros if they were
*          done successfully earlier.
*
* Copyright: (c) 2012-2014 Roedy Green, Canadian Mind Products,
*            http://mindprod.com
*
* Licence: This software may be copied and used freely for any
*          purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.7+
*
* Created with: JetBrains IntelliJ IDEA IDE
*               http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2014-04-21 initial version.
*/
package com.mindprod.htmlmacros.support;
import com.mindprod.common17.ST;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import java.util.zip.Adler32;
import static com.mindprod.htmlmacros.macro.Global.configuration;
import static java.lang.System.err;
/**
* Lets us avoid the work of expanding macros if they were done
* successfully earlier.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2014-04-21 initial version
* @since 2014-04-21
*/
public class Lazy
{
// -------------------------- PUBLIC STATIC METHODS
--------------------------
static final Adler32 digester = new Adler32();
/**
* encoding for UTF-8
*/
private static final Charset UTF8Charset = Charset.forName(
"UTF-8" );
/**
* where the lazy cache is kept are kept.
*/
private static final String CACHE_FILE_NAME =
"embellishment/cache.bin";
/**
* how many entries we are expecting in the lookup file.
*/
private static final int size = 20000;
/**
* root directory of the local websites
*/
private static final File root = new File(
configuration.getLocalWebrootWithSlashes() );
/**
* allow 5 seconds of slop in matching dates
*/
private static final long slop = TimeUnit.SECONDS.toMillis( 5 );
static
{
load();
}
static
{
// arrange for save to be run at shutdown.
Runtime.getRuntime().addShutdownHook( new Thread()
{
public void run()
{
save();
}
} );
}
/**
* look up filename hash-32 to timestamp
*/
private static HashMap<Integer, Long> lookup;
/**
* load the previous contents of the lookup cache from
embellishments/cache.bin
* It is a binary file of pairs hash-32, timestamp-64
*/
private static void load()
{
// load up the HashMap we use to track when files were last
successfully processed.
final File cacheFile = new File( root, CACHE_FILE_NAME );
// allow some padding to avoid collisions
lookup = new HashMap<>( size + size / 4 );
// if no cachefile carry on with empty lookup.
// if cachefile exists, load pair from it.
if ( cacheFile.exists() & cacheFile.canRead() )
{
DataInputStream dis = null;
try
{
try
{
// O P E N
final FileInputStream fis = new FileInputStream(
cacheFile );
final BufferedInputStream bis = new
BufferedInputStream( fis, 512 * 1024 );
dis = new DataInputStream( bis );
while ( true )
{
// R E A D pairs hash-32, timestamp-64
int hash = dis.readInt();
long timestamp = dis.readInt();
safePut( hash, timestamp );
} // end loop
} // end inner try
catch ( EOFException e )
{
// nothing to do
}
finally
{
// C L O S E
if ( dis != null )
{
dis.close();
}
}
} // end outer try
catch ( IOException e )
{
err.println( ">>> Warning. Unable to read cache.bin
file" );
}
} // end if
else
{
err.println( ">>> clearing the cache.bin file" );
} // end load
} // end save
/**
* save the contents of the lookup cache into
embellishments/cache.bin
* It is a binary file of pairs hash-32, timestamp-64
*/
private static void save()
{
final File cacheFile = new File( root, CACHE_FILE_NAME );
if ( cacheFile.canWrite() )
{
try
{
// O P E N
final FileOutputStream fos = new FileOutputStream(
cacheFile, false /* append */ );
final BufferedOutputStream bos = new
BufferedOutputStream( fos, 65536 /* 64K bytes */ );
final DataOutputStream dos = new DataOutputStream( bos
);
for ( Entry<Integer, Long> entry : lookup.entrySet() )
{
// W R I T E
int hash = entry.getKey();
long timestamp = entry.getValue();
dos.writeInt( hash ); // we write int and long,
not Integer and Long.
dos.writeLong( timestamp );
}
// C L O S E
dos.close();
} // end if
catch ( IOException e )
{
err.println( ">>> Warning. Unable to write cache.bin
file" + e.getMessage() );
}
} // end if
else
{
err.println( ">>> Warning. Unable to write cache.bin file"
);
}
} // end save
/**
* has this file already been processed and is unchanged since
that time?
*
* @param file file we are processing.
*
* @return true if the file has already been successfully
processed.
*/
public static boolean isAlreadyDone( File file )
{
// prune filename down to jgloss/jdk
int hash = calcHash( file );
Long timestampL = lookup.get( hash );
// if no entry, it was not registered as done.
if ( timestampL == null )
{
return false;
}
// if all is well ,the last modified date should not have
changed since we recorded the file as
// successfully processed.
if ( file.lastModified() > timestampL + slop )
{
// the file has been modified since we last processed it.
// we will have to reprocess it.
// This cache entry is useless. We might as well get rid
of it to save some space.
lookup.remove( hash );
return false;
}
else
{
// it has not been touched since we last successfully
processed it.
// the cache entry is fine as is.
return true;
}
}
/**
* Mark the status of this file.
*
* @param file file we are processing.
* @param status true= file successfully processed, false=file was
not successfully processed.
*/
public static void markStatus( File file, boolean status )
{
int hash = calcHash( file );
if ( status )
{
// GOOD
// If there already is an entry we must modify it.
// If there is no entry, we must create one
// The easiest way to accomplish this is just to create a
new entry for that key.
// Collisions should be very rare, but when they happen
keep the conservative older date of the pair.
// if one of the pair needs to be reprocessed, both will
be.
long timestamp = file.lastModified();
safePut( hash, timestamp );
}
else
{
// BAD
// processing this file failed.
// get rid of any entry. There might not be one. remove
does not mind.
lookup.remove( hash );
}
}
/**
* but this hash : timestamp into the lookup.
* However if there is an existing entry, do not replace it
* if it is older than this one.
*
* @param hash has-32 of the file name
* @param timestamp timestamp of file when it was last
successfully processed.
*/
private static void safePut( final int hash, final long timestamp
)
{
Long prevTimeStampL = lookup.put( hash, timestamp );
// collisions should be very rare, but when they happen keep
the conservative older date of the pair.
// if one of the pair needs to be reprocessed, both will be.
if ( prevTimeStampL != null && prevTimeStampL < timestamp )
{
// put it back. This should happen very rarely.
lookup.put( hash, prevTimeStampL );
}
}
/**
* calculate a hash-32 of the name of the file, not its contents
*
* @param file file to be processed
*
* @return 32-bit Adlerian hash.
*/
private static int calcHash( final File file )
{
// prune filename E:/mindprod/jgloss/jdk.html down to
jgloss/jdk
String chopped = ST.chopTrailingString(
Tools.webrootRelativeNameWithSlashes( file ), ".html" );
final byte[] theTextToDigestAsBytes = chopped.getBytes(
UTF8Charset );
digester.reset();
digester.update( theTextToDigestAsBytes );
return ( int ) digester.getValue();
}
}
--
Roedy Green Canadian Mind Products http://mindprod.com
"Don't worry about people stealing an idea; if it's original, you'll
have to shove it down their throats."
~ Howard Aiken (born: 1900-03-08 died: 1973-03-14 at age: 73)