2002
-
- This code is released under the GNU General Public License version 2
- or later. Alteratively, you may also use this code under the terms
- of the Perl Artistic license.
-
- If you wish to distribute this code under the terms of a different
- free software license then please ask me. If there is a good reason
- then I will probably say yes.
-
-*/
-
-//package eu.scape_project.bitwiser.utils;
-//https://raw.github.com/openplanets/bitwiser/master/src/main/java/eu/scape_project/bitwiser/utils/SSDeep.java
-package org.codesecure.dependencycheck.utils;
-
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.commons.lang.StringUtils;
-
-/**
- * SSDeep
- *
- *
- * A Java version of the ssdeep algorithm, based on the fuzzy.c source
- * code, taken from version 2.6 of the ssdeep package.
- *
- *
- * Transliteration/port to Java from C by...
- *
- * @author Andrew Jackson
- *
- */
-public class SSDeep {
-
/**
 * Plain holder for the parts of one fuzzy-hash signature.
 */
public class FuzzyHash {
    /** Name of the file the signature belongs to. */
    String filename;
    /** Block size the signature was produced with. */
    int blocksize;
    /** Signature component computed at {@code blocksize}. */
    String hash;
    /** Signature component computed at twice {@code blocksize}. */
    String hash2;
}
-
- /// Length of an individual fuzzy hash signature component
- public static final int SPAMSUM_LENGTH = 64;
-
- /// The longest possible length for a fuzzy hash signature (without the filename)
- public static final int FUZZY_MAX_RESULT = (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20));
-
-
- public static final int MIN_BLOCKSIZE = 3;
- public static final int ROLLING_WINDOW = 7;
-
- public static final int HASH_PRIME = 0x01000193;
- public static final int HASH_INIT = 0x28021967;
-
- // Our input buffer when reading files to hash
- public static final int BUFFER_SIZE = 8192;
-
- static class roll_state_class {
- int[] window = new int[ROLLING_WINDOW];
- int h1, h2, h3;
- int n;
- }
- private static roll_state_class roll_state = new roll_state_class();
-
-
- /*
- a rolling hash, based on the Adler checksum. By using a rolling hash
- we can perform auto resynchronisation after inserts/deletes
-
- internally, h1 is the sum of the bytes in the window and h2
- is the sum of the bytes times the index
-
- h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
- we can cope with large blocksize values
- */
- static int roll_hash(int c)
- {
-
-// System.out.println(""+roll_state.h1+","+roll_state.h2+","+roll_state.h3);
- roll_state.h2 -= roll_state.h1;
- //roll_state.h2 = roll_state.h2 & 0x7fffffff;
- roll_state.h2 += ROLLING_WINDOW * c;
- //roll_state.h2 = roll_state.h2 & 0x7fffffff;
-
- roll_state.h1 += c;
- //roll_state.h1 = roll_state.h1 & 0x7fffffff;
- roll_state.h1 -= roll_state.window[(roll_state.n % ROLLING_WINDOW)];
- //roll_state.h1 = roll_state.h1 & 0x7fffffff;
-
- roll_state.window[roll_state.n % ROLLING_WINDOW] = (char)c;
- roll_state.n = (roll_state.n+1)%ROLLING_WINDOW;
-
- /* The original spamsum AND'ed this value with 0xFFFFFFFF which
- in theory should have no effect. This AND has been removed
- for performance (jk) */
- roll_state.h3 = (roll_state.h3 << 5);// & 0xFFFFFFFF;
- roll_state.h3 ^= c;
- //roll_state.h3 = roll_state.h3 & 0x7FFFFFFF;
- //if( roll_state.h3 > 0xEFFFFFFF ) roll_state.h3 -= 0xEFFFFFFF;
-
- long result = ((roll_state.h1 + roll_state.h2 + roll_state.h3));//&0x7FFFFFFF;
- //System.out.println("Result: "+result);
- //System.out.println("Result2: "+(result&0xFFFFFFFF));
- //System.out.println("Result3: "+(result&0x7FFFFFFF));
-
- return (int) result;//&0xFFFFFFFF;
- }
-
- /*
- reset the state of the rolling hash and return the initial rolling hash value
- */
- static void roll_reset()
- {
- roll_state.h1 = 0;
- roll_state.h2 = 0;
- roll_state.h3 = 0;
- roll_state.n = 0;
- Arrays.fill(roll_state.window,(char)0);
- }
-
- /* a simple non-rolling hash, based on the FNV hash */
- static int sum_hash(int c, int h)
- {
- h *= HASH_PRIME;
- //h = h & 0xFFFFFFFF;
- h ^= c;
- //h = h & 0xFFFFFFFF;
- return h;
- }
-
- class ss_context {
- char[] ret;
- char[] p;
- long total_chars;
- int h, h2, h3;
- int j, n, i, k;
- int block_size;
- char[] ret2 = new char[SPAMSUM_LENGTH/2 + 1];
- }
-
-
- static void ss_destroy(ss_context ctx)
- {
- if (ctx.ret != null)
- ctx.ret = null;
- //free(ctx.ret);
- }
-
-
- static boolean ss_init(ss_context ctx, File handle)
- {
- if ( ctx == null )
- return true;
-
- ctx.ret = new char[FUZZY_MAX_RESULT];
- if (ctx.ret == null)
- return true;
-
- if (handle != null)
- ctx.total_chars = handle.length();
-
- ctx.block_size = MIN_BLOCKSIZE;
- while (ctx.block_size * SPAMSUM_LENGTH < ctx.total_chars) {
- ctx.block_size = ctx.block_size * 2;
- }
-
- System.out.println("bs:"+ctx.block_size);
-
- return false;
- }
-
- static char[] b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
-
- static void ss_engine(ss_context ctx,
- byte[] buffer,
- int buffer_size)
- {
- if (null == ctx || null == buffer)
- return;
-
- for ( int i = 0 ; i < buffer_size ; ++i)
- {
-
- /*
- at each character we update the rolling hash and
- the normal hash. When the rolling hash hits the
- reset value then we emit the normal hash as a
- element of the signature and reset both hashes
- */
-
- System.out.println(""+ctx.h+","+ctx.h2+","+ctx.h3);
- ctx.h = roll_hash(buffer[i]);// & 0x7FFFFFFF;
- ctx.h2 = sum_hash(buffer[i], ctx.h2);// & 0x7FFFFFFF;
- ctx.h3 = sum_hash(buffer[i], ctx.h3);// & 0x7FFFFFFF;
-
- if (((0xFFFFFFFFl & ctx.h) % ctx.block_size) == (ctx.block_size-1)) {
- /* we have hit a reset point. We now emit a
- hash which is based on all chacaters in the
- piece of the message between the last reset
- point and this one */
- ctx.p[ctx.j] = b64[(int)((ctx.h2&0xFFFF) % 64)];
- System.out.println("::"+ctx.j+":"+new String(ctx.p));
-// for( char c : ctx.p ) {
-// System.out.print(c);
-// }
-// System.out.println();
- if (ctx.j < SPAMSUM_LENGTH-1) {
- /* we can have a problem with the tail
- overflowing. The easiest way to
- cope with this is to only reset the
- second hash if we have room for
- more characters in our
- signature. This has the effect of
- combining the last few pieces of
- the message into a single piece */
-
- ctx.h2 = HASH_INIT;
- (ctx.j)++;
- }
- }
-
- /* this produces a second signature with a block size
- of block_size*2. By producing dual signatures in
- this way the effect of small changes in the message
- size near a block size boundary is greatly reduced. */
- if (((0xFFFFFFFFl & ctx.h) % (ctx.block_size*2)) == ((ctx.block_size*2)-1)) {
- ctx.ret2[ctx.k] = b64[(int) (ctx.h3&0xFFFF % 64)];
- if (ctx.k < SPAMSUM_LENGTH/2-1) {
- ctx.h3 = HASH_INIT;
- (ctx.k)++;
- }
- }
- }
- }
-
- static boolean ss_update(ss_context ctx, File handle) throws IOException
- {
- int bytes_read = 0;
- byte[] buffer;
-
- if (null == ctx || null == handle)
- return true;
-
- buffer = new byte[BUFFER_SIZE];
- if (buffer == null)
- return true;
-
- // snprintf(ctx.ret, 12, "%u:", ctx.block_size);
- ctx.ret = (ctx.block_size + ":").toCharArray();
- // ctx.p = ctx.ret + strlen(ctx.ret);
- ctx.p = new char[SPAMSUM_LENGTH];
-
- //memset(ctx.p, 0, SPAMSUM_LENGTH+1);
- Arrays.fill(ctx.p, (char)0 );
- //memset(ctx.ret2, 0, sizeof(ctx.ret2.length));
- Arrays.fill(ctx.ret2, (char)0 );
-
- ctx.k = ctx.j = 0;
- ctx.h3 = ctx.h2 = HASH_INIT;
- ctx.h = 0;
- roll_reset();
-
- System.out.println("Opening file:"+handle);
- FileInputStream in = new FileInputStream(handle);
- // while ((bytes_read = fread(buffer,sizeof(byte),BUFFER_SIZE,handle)) > 0)
- while (in.available() > 0 )
- {
- bytes_read = in.read(buffer);
- ss_engine(ctx,buffer,bytes_read);
- }
-
- if (ctx.h != 0)
- {
- ctx.p[ctx.j] = b64[(int) ((ctx.h2 & 0xFFFF) % 64)];
- ctx.ret2[ctx.k] = b64[(int) ((ctx.h3 &0xFFFF) % 64)];
- }
-
- // strcat(ctx.p+ctx.j, ":");
- // strcat(ctx.p+ctx.j, ctx.ret2);
- ctx.ret = (new String(ctx.ret) + new String(ctx.p) + ":" + new String(ctx.ret2)).toCharArray();
-
- // free(buffer);
- return false;
- }
-
-
- boolean fuzzy_hash_file(File handle) throws IOException
- {
- ss_context ctx;
- int filepos;
- boolean done = false;
-
- if (null == handle)
- return true;
-
- ctx = new ss_context();
- if (ctx == null)
- return true;
-
- // filepos = ftello(handle);
-
- ss_init(ctx, handle);
- System.out.println("bs-pre:"+ctx.block_size);
-
- while (!done)
- {
- // if (fseeko(handle,0,SEEK_SET))
- // return true;
-
- ss_update(ctx,handle);
-
- System.out.println("RESULT:"+new String(ctx.ret));
-
- // our blocksize guess may have been way off - repeat if necessary
- if (ctx.block_size > MIN_BLOCKSIZE && ctx.j < SPAMSUM_LENGTH/2)
- ctx.block_size = ctx.block_size / 2;
- else
- done = true;
- }
-
- System.out.println("bs-post:"+ctx.block_size);
- // strncpy(result,ctx.ret,FUZZY_MAX_RESULT);
-
- System.out.println("RESULT:"+new String(ctx.ret));
-
- ss_destroy(ctx);
- // free(ctx);
-
- // if (fseeko(handle,filepos,SEEK_SET))
- // return true;
-
- return false;
- }
-
-
- public boolean fuzzy_hash_filename(String filename) throws IOException
- {
- boolean status;
-
- if (null == filename)
- return true;
-
- File handle = new File(filename);//,"rb");
- if (null == handle)
- return true;
-
- status = fuzzy_hash_file(handle);
-
- // fclose(handle);
-
- return status;
- }
-
-
- boolean fuzzy_hash_buf(byte[] buf,
- int buf_len,
- char[] result)
- {
- ss_context ctx = new ss_context();
- boolean done = false;
-
- if (buf == null)
- return true;
-
- ctx.total_chars = buf_len;
- ss_init(ctx, null);
-
- System.out.println("total_chars: "+ctx.total_chars);
-
- while (!done)
- {
- // snprintf(ctx.ret, 12, "%u:", ctx.block_size);
- // ctx.p = ctx.ret + strlen(ctx.ret);
- ctx.p = new char[SPAMSUM_LENGTH+1]; // TODO Duplication!
-
- // memset(ctx.p, 0, SPAMSUM_LENGTH+1);
- // memset(ctx.ret2, 0, sizeof(ctx.ret2));
-
- ctx.k = ctx.j = 0;
- ctx.h3 = ctx.h2 = HASH_INIT;
- ctx.h = 0;
- roll_reset();
-
- System.out.println("h:"+ctx.h);
- System.out.println("h2:"+ctx.h2);
-
- ss_engine(ctx,buf,buf_len);
-
- /* our blocksize guess may have been way off - repeat if necessary */
- if (ctx.block_size > MIN_BLOCKSIZE && ctx.j < SPAMSUM_LENGTH/2)
- ctx.block_size = ctx.block_size / 2;
- else
- done = true;
-
- System.out.println("h:"+ctx.h);
- System.out.println("h2:"+ctx.h2);
- System.out.println("h3:"+ctx.h3);
- System.out.println("bs:"+ctx.block_size);
- System.out.println("ret:"+new String(ctx.ret));
- System.out.println("p:"+new String(ctx.p));
- System.out.println("ret2:"+new String(ctx.ret2));
- if (ctx.h != 0)
- {
- ctx.p[ctx.j] = b64[(int) ((ctx.h2&0xFFFF) % 64)];
- ctx.ret2[ctx.k] = b64[(int) ((ctx.h3&0xFFFF) % 64)];
- }
-
- // strcat(ctx.p+ctx.j, ":");
- // strcat(ctx.p+ctx.j, ctx.ret2);
- }
-
-
- // strncpy(result,ctx.ret,FUZZY_MAX_RESULT);
- System.out.println("bs:"+ctx.block_size);
- System.out.println("ret:"+new String(ctx.ret));
- System.out.println("p:"+new String(ctx.p));
- System.out.println("ret2:"+new String(ctx.ret2));
- System.out.println("h3:"+ctx.h3);
- result = ctx.ret;
-
- ss_destroy(ctx);
- // free(ctx);
- return false;
- }
-
-
-
-
- /*
- we only accept a match if we have at least one common substring in
- the signature of length ROLLING_WINDOW. This dramatically drops the
- false positive rate for low score thresholds while having
- negligable affect on the rate of spam detection.
-
- return 1 if the two strings do have a common substring, 0 otherwise
- */
- static int has_common_substring(char[] s1, char[] s2)
- {
- int i, j;
- int num_hashes;
- long[] hashes = new long[SPAMSUM_LENGTH];
-
- /* there are many possible algorithms for common substring
- detection. In this case I am re-using the rolling hash code
- to act as a filter for possible substring matches */
-
- roll_reset();
- // memset(hashes, 0, sizeof(hashes));
-
- /* first compute the windowed rolling hash at each offset in
- the first string */
- for (i=0;s1[i] != 0;i++)
- {
- hashes[i] = roll_hash((char)s1[i]);
- }
- num_hashes = i;
-
- roll_reset();
-
- /* now for each offset in the second string compute the
- rolling hash and compare it to all of the rolling hashes
- for the first string. If one matches then we have a
- candidate substring match. We then confirm that match with
- a direct string comparison */
- for (i=0;s2[i] != 0;i++) {
- long h = roll_hash((char)s2[i]);
- if (i < ROLLING_WINDOW-1) continue;
- for (j=ROLLING_WINDOW-1;j= ROLLING_WINDOW &&
- strncmp(s2+i-(ROLLING_WINDOW-1),
- s1+j-(ROLLING_WINDOW-1),
- ROLLING_WINDOW) == 0)
- {
- return 1;
- }
- */
- }
- }
- }
-
- return 0;
- }
-
-
- // eliminate sequences of longer than 3 identical characters. These
- // sequences contain very little information so they tend to just bias
- // the result unfairly
- static char[] eliminate_sequences(String string)
- {
- char[] str = string.toCharArray();
- StringBuffer ret = new StringBuffer();
-
- // Do not include repeats:
- for (int i=3;i SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
- /* not a real spamsum signature? */
- return 0;
- }
-
- /* the two strings must have a common substring of length
- ROLLING_WINDOW to be candidates */
- if (has_common_substring(s1, s2) == 0) {
- return 0;
- }
-
- /* compute the edit distance between the two strings. The edit distance gives
- us a pretty good idea of how closely related the two strings are */
- score = StringUtils.getLevenshteinDistance(new String(s1), new String(s2));
-
- /* scale the edit distance by the lengths of the two
- strings. This changes the score to be a measure of the
- proportion of the message that has changed rather than an
- absolute quantity. It also copes with the variability of
- the string lengths. */
- score = (score * SPAMSUM_LENGTH) / (len1 + len2);
-
- /* at this stage the score occurs roughly on a 0-64 scale,
- * with 0 being a good match and 64 being a complete
- * mismatch */
-
- /* rescale to a 0-100 scale (friendlier to humans) */
- score = (100 * score) / 64;
-
- /* it is possible to get a score above 100 here, but it is a
- really terrible match */
- if (score >= 100) return 0;
-
- /* now re-scale on a 0-100 scale with 0 being a poor match and
- 100 being a excellent match. */
- score = 100 - score;
-
- // printf ("len1: %"PRIu32" len2: %"PRIu32"\n", len1, len2);
-
- /* when the blocksize is small we don't want to exaggerate the match size */
- if (score > block_size/MIN_BLOCKSIZE * Math.min(len1, len2)) {
- score = block_size/MIN_BLOCKSIZE * Math.min(len1, len2);
- }
- return score;
- }
-
- /*
- given two spamsum strings return a value indicating the degree to which they match.
- */
- int fuzzy_compare(FuzzyHash fh1, FuzzyHash fh2 )
- {
- int score = 0;
- char[] s1_1, s1_2;
- char[] s2_1, s2_2;
-
- // if the blocksizes don't match then we are comparing
- // apples to oranges. This isn't an 'error' per se. We could
- // have two valid signatures, but they can't be compared.
- if (fh1.blocksize != fh2.blocksize &&
- fh1.blocksize != fh2.blocksize*2 &&
- fh2.blocksize != fh1.blocksize*2) {
- return 0;
- }
-
- // there is very little information content is sequences of
- // the same character like 'LLLLL'. Eliminate any sequences
- // longer than 3. This is especially important when combined
- // with the has_common_substring() test below.
- s1_1 = eliminate_sequences(fh1.hash+1);
- s2_1 = eliminate_sequences(fh2.hash+1);
-
- s1_2 = eliminate_sequences(fh1.hash2+1);
- s2_2 = eliminate_sequences(fh1.hash2+1);
-
- // each signature has a string for two block sizes. We now
- // choose how to combine the two block sizes. We checked above
- // that they have at least one block size in common
- if (fh1.blocksize == fh2.blocksize) {
- int score1, score2;
- score1 = score_strings(s1_1, s2_1, fh1.blocksize);
- score2 = score_strings(s1_2, s2_2, fh2.blocksize);
-
- // s.block_size = fh1.blocksize;
-
- score = Math.max(score1, score2);
- } else if (fh1.blocksize == fh2.blocksize*2) {
-
- score = score_strings(s1_1, s2_2, fh1.blocksize);
- // s.block_size = fh1.blocksize;
- } else {
-
- score = score_strings(s1_2, s2_1, fh2.blocksize);
- // s.block_size = fh2.blocksize;
- }
-
- return (int)score;
- }
-
- /**
- * Main class for quick testing.
- * @param args
- * @throws IOException
- */
- public static void main( String[] args ) throws IOException {
- SSDeep ssd = new SSDeep();
- byte[] b2 = "Hello World how are you today...\n".getBytes();
- byte[] b3 = "Helli".getBytes();
- char[] h1 = null;
- boolean t1 = ssd.fuzzy_hash_buf(b2, b2.length, h1);
- System.out.println("Got "+h1);
- ssd.fuzzy_hash_file(new File("test"));
- //ssd.fuzzy_hash_file(new File("pom.xml"));
- }
-}
\ No newline at end of file