Changeset 4312


Ignore:
Timestamp:
08/18/08 10:59:08 (12 years ago)
Author:
melissa
Message:

Added faster LZW codec, courtesy of Mikhail Kovtun.

Location:
trunk/loci/formats/codec
Files:
1 deleted
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/loci/formats/codec/LZWCodec.java

    r4048 r4312  
    2424package loci.formats.codec; 
    2525 
     26import java.util.Arrays; 
    2627import loci.formats.FormatException; 
    2728 
    2829/** 
    29  * Implements basic LZW compression and decompression, as outlined in the 
    30  * TIFF 6.0 Specification at 
    31  * http://partners.adobe.com/asn/developer/pdfs/tn/TIFF6.pdf (page 61). 
     30 * This is an optimized LZW codec. 
     31 * Most of the code is inlined, and specifics of LZW usage 
     32 * (known size of decompressor output; possible lengths of LZW codes; specified 
     33 * values for <code>CLEAR</code> and <code>END_OF_INFORMATION</code> codes) 
     34 * are taken in account. 
     35 * <p> 
     36 * Estimating the worst-case size of compressor output: 
     37 * <ul> 
     38 * <li> The worst case means that there is no compression at all, and every 
     39 *      input byte generates code to output. 
     40 * <li> This means that the LZW table will be full (and reset) after reading 
     41 *      each portion of 4096-256-2-1=3837 bytes of input 
     42 *      (first 256 codes are preallocated, 2 codes are used for CLEAR and 
     43 *      END_OF_IFORMATION, 1 code is lost due to original bug in TIFF library 
     44 *      that now is a feature). 
     45 * <li> Each full portion of 3837 byte will produce in output: 
     46 * <ul> 
     47 * <li> 9 bits for CLEAR code; 
     48 * <li> 9*253 + 10*512 + 11*1024 + 12*2048 = 43237 bits for character codes. 
     49 * </ul> 
     50 * <li> Let n=3837, m=(number of bytes in the last incomplete portion), 
     51 *      N=(number of bytes in compressed complete portion with CLEAR code), 
     52 *      M=(number of bytes in compressed last incomplete portion). 
     53 *      We have inequalities: 
     54 * <ul> 
     55 * <li> N <= 1.41 * n 
     56 * <li> M <= 1.41 * m 
     57 * <li> The last incomplete portion should also include CLEAR and 
     58 *      END_OF_INFORMATION codes; they occupy less than 3 bytes. 
     59 * </ul> 
     60 * Thus, we can claim than the number of bytes in compressed output never 
     61 * exceeds 1.41*(number of input bytes)+3. 
     62 * <p> 
    3263 * 
    3364 * <dl><dt><b>Source code:</b></dt> 
     
    3566 * <a href="https://skyking.microscopy.wisc.edu/svn/java/trunk/loci/formats/codec/LZWCodec.java">SVN</a></dd></dl> 
    3667 * 
    37  * @author Eric Kjellman egkjellman at wisc.edu 
    38  * @author Curtis Rueden ctrueden at wisc.edu 
    39  * @author Wayne Rasband wsr at nih.gov 
     68 * @author Mikhail Kovtun mikhail.kovtun at duke.edu 
    4069 */ 
    41 public class LZWCodec extends BaseCodec implements Codec { 
    42  
    43   // LZW compression codes 
    44   protected static final int CLEAR_CODE = 256; 
    45   protected static final int EOI_CODE = 257; 
     70public class LZWCodec { 
    4671 
    4772  /** 
    48    * Compresses a block of data using LZW compression. If input is null or of 
    49    * 0 length, simply returns input. 
    50    * 
    51    * @param input the data to be compressed 
    52    * @param x ignored for LZW. 
    53    * @param y ignored for LZW. 
    54    * @param dims ignored for LZW. 
    55    * @param options ignored for LZW. 
    56    * @return The compressed data 
     73   * Size of hash table. Must be greater 3837 (the number of possible codes). 
     74   * Bigger size reduces number of rehashing steps -- 
     75   * at expence of initialization time. 
    5776   */ 
    58   public byte[] compress(byte[] input, int x, int y, int[] dims, 
    59     Object options) throws FormatException 
     77  private static final int HASH_SIZE = 7349; 
     78 
     79  /** Rehashing step. HASH_SIZE and HASH_STEP shoulg be coprime. */ 
     80  private static final int HASH_STEP = 257; 
     81 
     82  private static final int CLEAR_CODE = 256; 
     83  private static final int EOI_CODE = 257; 
     84  private static final int FIRST_CODE = 258; 
     85 
     86  /** Masks for writing bits in compressor. */ 
     87  private static final int[] COMPR_MASKS = 
     88    {0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01}; 
     89 
     90  /** Masks for reading bits in decompressor. */ 
     91  private static final int[] DECOMPR_MASKS = 
     92    {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f}; 
     93 
     94  /** 
     95   * Compress sequence of bytes. 
     96   * <p> 
     97   * Compresses the input sequence of bytes according to LZW algorithm. 
     98   * <p> 
     99   * @param input 
     100   *          The sequence of bytes to be compressed. 
     101   * @return  Result of compression. 
     102   */ 
     103  public byte[] compress(byte[] input, int x, int y, int[] dims, Object options) 
     104    throws FormatException 
    60105  { 
    61106    if (input == null || input.length == 0) return input; 
    62107 
    63     // initialize symbol table 
    64     LZWTreeNode symbols = new LZWTreeNode(-1); 
    65     symbols.initialize(); 
    66     int nextCode = 258; 
    67     int numBits = 9; 
    68  
    69     BitWriter out = new BitWriter(); 
    70     out.write(CLEAR_CODE, numBits); 
    71     ByteVector omega = new ByteVector(); 
    72     for (int i=0; i<input.length; i++) { 
    73       byte k = input[i]; 
    74       LZWTreeNode omegaNode = symbols.nodeFromString(omega); 
    75       LZWTreeNode omegaKNode = omegaNode.getChild(k); 
    76       if (omegaKNode != null) { 
    77         // omega+k is in the symbol table 
    78         omega.add(k); 
    79       } 
    80       else { 
    81         out.write(omegaNode.getCode(), numBits); 
    82         omega.add(k); 
    83         symbols.addTableEntry(omega, nextCode++); 
    84         omega.clear(); 
    85         omega.add(k); 
    86         if (nextCode == 512) numBits = 10; 
    87         else if (nextCode == 1024) numBits = 11; 
    88         else if (nextCode == 2048) numBits = 12; 
    89         else if (nextCode == 4096) { 
    90           out.write(CLEAR_CODE, numBits); 
    91           symbols.initialize(); 
    92           nextCode = 258; 
    93           numBits = 9; 
    94         } 
    95       } 
    96     } 
    97     out.write(symbols.codeFromString(omega), numBits); 
    98     out.write(EOI_CODE, numBits); 
    99  
    100     return out.toByteArray(); 
     108    // Output buffer (see class comments for justification of size). 
     109    byte[] output = new byte[(input.length * 141) / 100 + 3]; 
     110 
     111    // Current size of output buffer (and position to write next byte). 
     112    int outSize = 0; 
     113    // The output always starts with CLEAR code 
     114    output[outSize++] = (byte) (CLEAR_CODE >> 1); 
     115    // Last incomplete byte to be written to output (bits shifted to the right). 
     116    // Always contains at least 1 bit, and may contain 8 bits. 
     117    int currOutByte = CLEAR_CODE & 0x01; 
     118    // Number of unused bits in currOutByte (from 0 to 7). 
     119    int freeBits = 7; 
     120 
     121    // Hash table. 
     122    // Keys in the table are pairs (code,byte) and values are codes. 
     123    // Pair (code,byte) is represented as ( (code<<8) | byte ). 
     124    // Unused table entries have key=-1. 
     125    int[] htKeys   = new int[HASH_SIZE]; 
     126    int[] htValues = new int[HASH_SIZE]; 
     127    // Initialize hash table: mark all entries as unused 
     128    Arrays.fill(htKeys, -1); 
     129 
     130    // Next code to be used by compressor. 
     131    int nextCode = FIRST_CODE; 
     132    // Number of bits to be used to output code. Ranges from 9 to 12. 
     133    int currCodeLength = 9; 
     134 
     135    // The first byte of input is handled specially. 
     136    int tiffK = input[0] & 0xff; 
     137    int tiffOmega = tiffK; 
     138 
     139    // Main loop. 
     140    for (int currInPos=1; currInPos<input.length; currInPos++) { 
     141      tiffK = input[currInPos] & 0xff; 
     142      int hashKey = (tiffOmega << 8) | tiffK; 
     143      int hashCode = hashKey % HASH_SIZE; 
     144      do { 
     145        if (htKeys[hashCode] == hashKey) { 
     146          // Omega+K in the table 
     147          tiffOmega = htValues[hashCode]; 
     148          break; 
     149        } 
     150        else if (htKeys[hashCode] < 0) { 
     151          // Omega+K not in the table 
     152          // 1) add new entry to hash table 
     153          htKeys[hashCode] = hashKey; 
     154          htValues[hashCode] = nextCode++; 
     155          // 2) output last code 
     156          int shift = currCodeLength - freeBits; 
     157          output[outSize++] = 
     158            (byte) ((currOutByte << freeBits) | (tiffOmega >> shift)); 
     159          if (shift > 8) { 
     160            output[outSize++] = (byte) (tiffOmega >> (shift - 8)); 
     161            shift -= 8; 
     162          } 
     163          freeBits = 8 - shift; 
     164          currOutByte = tiffOmega & COMPR_MASKS[freeBits]; 
     165          // 3) omega = K 
     166          tiffOmega = tiffK; 
     167          break; 
     168        } 
     169        else { 
     170          // we have to rehash 
     171          hashCode = (hashCode + HASH_STEP) % HASH_SIZE; 
     172        }; 
     173      } while (true); 
     174 
     175      switch (nextCode) { 
     176        case 512: 
     177          currCodeLength = 10; 
     178          break; 
     179        case 1024: 
     180          currCodeLength = 11; 
     181          break; 
     182        case 2048: 
     183          currCodeLength = 12; 
     184          break; 
     185        case 4096:  // write CLEAR code and reinitialize hash table 
     186         int shift = currCodeLength - freeBits; 
     187         output[outSize++] = 
     188           (byte) ((currOutByte << freeBits) | (CLEAR_CODE >> shift)); 
     189         if (shift > 8) { 
     190           output[outSize++] = (byte) (CLEAR_CODE >> (shift - 8)); 
     191           shift -= 8; 
     192         } 
     193         freeBits = 8 - shift; 
     194         currOutByte = CLEAR_CODE & COMPR_MASKS[freeBits]; 
     195         Arrays.fill(htKeys, -1); 
     196         nextCode = FIRST_CODE; 
     197         currCodeLength = 9; 
     198         break; 
     199      } 
     200    } 
     201 
     202    // End of input: 
     203    // 1) write code from tiff_Omega 
     204    { 
     205      int shift = currCodeLength - freeBits; 
     206      output[outSize++] = 
     207        (byte) ((currOutByte << freeBits) | (tiffOmega >> shift)); 
     208      if (shift > 8) { 
     209        output[outSize++] = (byte) (tiffOmega >> (shift - 8)); 
     210        shift -= 8; 
     211      } 
     212      freeBits = 8 - shift; 
     213      currOutByte = tiffOmega & COMPR_MASKS[freeBits]; 
     214    } 
     215    // 2) write END_OF_INFORMATION code 
     216    //    -- we write the last incomplete byte here as well 
     217    // !!! We have to increase length of code if needed !!! 
     218    switch (nextCode) { 
     219      case 511: 
     220        currCodeLength = 10; 
     221        break; 
     222      case 1023: 
     223        currCodeLength = 11; 
     224        break; 
     225      case 2047: 
     226        currCodeLength = 12; 
     227        break; 
     228    } 
     229 
     230    { 
     231      int shift = currCodeLength - freeBits; 
     232      output[outSize++] = 
     233        (byte) ((currOutByte << freeBits) | (EOI_CODE >> shift)); 
     234      if (shift > 8) { 
     235        output[outSize++] = (byte) (EOI_CODE >> (shift - 8)); 
     236        shift -= 8; 
     237      } 
     238      freeBits = 8 - shift; 
     239      currOutByte = EOI_CODE & COMPR_MASKS[freeBits]; 
     240      output[outSize++] = (byte) (currOutByte << freeBits); 
     241    } 
     242 
     243    byte[] result = new byte[outSize]; 
     244    System.arraycopy(output, 0, result, 0, outSize); 
     245    return result; 
    101246  } 
    102247 
    103248  /** 
    104    * Decodes an LZW-compressed data block. 
    105    * 
    106    * @param input the data to be decompressed 
    107    * @return The decompressed data 
    108    * @throws FormatException If input is not an LZW-compressed data block. 
     249   * Decompresses the input sequence of bytes according to LZW algorithm. 
     250   * @param input The sequence of bytes to be decompressed. 
     251   * @param options Decompression options.  In this case, an Integer indicating 
     252   *   the maximum number of bytes to be decompressed. 
     253   * @return Result of decompression. 
     254   * @throws FormatException if input is not LZW-compressed data. 
    109255   */ 
    110256  public byte[] decompress(byte[] input, Object options) throws FormatException 
    111257  { 
    112258    if (input == null || input.length == 0) return input; 
    113  
    114     int maxLength = -1; 
    115     if (options != null) maxLength = ((Integer) options).intValue(); 
    116  
    117     byte[][] symbolTable = new byte[4096][1]; 
    118     int bitsToRead = 9; 
    119     int nextSymbol = 258; 
    120     int code; 
    121     int oldCode = -1; 
    122     ByteVector out = new ByteVector(8192); 
    123     BitBuffer bb = new BitBuffer(input); 
    124     byte[] byteBuffer1 = new byte[16]; 
    125     byte[] byteBuffer2 = new byte[16]; 
    126  
    127     while (true) { 
    128       code = bb.getBits(bitsToRead); 
    129       if (code == EOI_CODE || code == -1) break; 
    130       if (code == CLEAR_CODE) { 
    131         // initialize symbol table 
    132         for (int i = 0; i < 256; i++) symbolTable[i][0] = (byte) i; 
    133         nextSymbol = 258; 
    134         bitsToRead = 9; 
    135         code = bb.getBits(bitsToRead); 
    136         if (code == EOI_CODE || code == -1) break; 
    137         out.add(symbolTable[code]); 
    138         if (out.size() >= maxLength && maxLength != -1) break; 
    139         oldCode = code; 
     259    if (options == null) { 
     260      throw new FormatException("Options must be the maximum number of " + 
     261        "decompressed bytes."); 
     262    } 
     263 
     264    int outSize = ((Integer) options).intValue(); 
     265 
     266    // Output buffer 
     267    byte[] output = new byte[outSize]; 
     268    // Position in output buffer to write next byte to 
     269    int currOutPos = 0; 
     270 
     271    // Table mapping codes to strings. 
     272    // Its structure is based on the fact that a string for a code has form: 
     273    // (string for another code) + (new byte). 
     274    // Thus, at index 'code': first array contains 'another code', second array 
     275    // contains 'new byte', and third array contains length of the string. 
     276    // The length is needed to make retrieving the string faster. 
     277    int[] anotherCodes = new int[4096]; 
     278    byte[] newBytes = new byte[4096]; 
     279    int[] lengths = new int[4096]; 
     280    // We need to initialize only firt 256 entries in the table 
     281    for (int i=0; i<256; i++) { 
     282      newBytes[i] = (byte) i; 
     283      lengths[i] = 1; 
     284    } 
     285 
     286    // Length of the code to be read from input 
     287    int currCodeLength = 9; 
     288    // Next code to be added to the table 
     289    int nextCode = FIRST_CODE; 
     290 
     291    // Variables to handle reading bit stream: 
     292    // Position in 'input' to read next byte from 
     293    int currInPos = 0; 
     294    // Byte from 'input[curr_in_pos-1]' -- only 'bits_read' bits on the right 
     295    // are non-zero 
     296    int currRead = 0; 
     297    // Number of bits in 'curr_read' that were not consumed yet 
     298    int bitsRead = 0; 
     299 
     300    // Current code being processed by decompressor. 
     301    int currCode; 
     302    // Previous code processed by decompressor. 
     303    int oldCode = 0;   // without initializer, Java reports error later 
     304 
     305    do { 
     306      // read next code 
     307      { 
     308        int bitsLeft = currCodeLength - bitsRead; 
     309        if (bitsLeft > 8) { 
     310          currRead = (currRead << 8) | (input[currInPos++] & 0xff); 
     311          bitsLeft -= 8; 
     312        } 
     313        bitsRead = 8 - bitsLeft; 
     314        int nextByte = input[currInPos++] & 0xff; 
     315        currCode = (currRead << bitsLeft) | (nextByte >> bitsRead); 
     316        currRead = nextByte & DECOMPR_MASKS[bitsRead]; 
     317      } 
     318 
     319      if (currCode == EOI_CODE) break; 
     320 
     321      if (currCode == CLEAR_CODE) { 
     322        // initialize table -- nothing to do 
     323        nextCode = FIRST_CODE; 
     324        currCodeLength = 9; 
     325        // read next code 
     326        { 
     327          int bitsLeft = currCodeLength - bitsRead; 
     328          if (bitsLeft > 8) { 
     329            currRead = (currRead << 8) | (input[currInPos++] & 0xff); 
     330            bitsLeft -= 8; 
     331          } 
     332          bitsRead = 8 - bitsLeft; 
     333 
     334          int nextByte = input[currInPos++] & 0xff; 
     335          currCode = (currRead << bitsLeft) | (nextByte >> bitsRead); 
     336          currRead = nextByte & DECOMPR_MASKS[bitsRead]; 
     337        } 
     338        if (currCode == EOI_CODE) break; 
     339          // write string[curr_code] to output 
     340          // -- but here we are sure that string consists of a single byte 
     341          output[currOutPos++] = newBytes[currCode]; 
     342          oldCode = currCode; 
     343      } 
     344      else if (currCode < nextCode) { 
     345        // Code is already in the table 
     346        // 1) Write strin[curr_code] to output 
     347        int outLength = lengths[currCode]; 
     348        int i = currOutPos + outLength; 
     349        int tablePos = currCode; 
     350        while (i > currOutPos) { 
     351          output[--i] = newBytes[tablePos]; 
     352          tablePos = anotherCodes[tablePos]; 
     353        } 
     354        currOutPos += outLength; 
     355        // 2) Add string[old_code]+firstByte(string[curr_code]) to the table 
     356        anotherCodes[nextCode] = oldCode; 
     357        newBytes[nextCode] = output[i]; 
     358        lengths[nextCode] = lengths[oldCode] + 1; 
     359        oldCode = currCode; 
     360        nextCode++; 
    140361      } 
    141362      else { 
    142         if (code < nextSymbol) { 
    143           // code is in table 
    144           out.add(symbolTable[code]); 
    145           if (out.size() >= maxLength && maxLength != -1) break; 
    146           // add string to table 
    147           ByteVector symbol = new ByteVector(byteBuffer1); 
    148           try { 
    149             symbol.add(symbolTable[oldCode]); 
    150           } 
    151           catch (ArrayIndexOutOfBoundsException a) { 
    152             throw new FormatException("Sorry, old LZW codes not supported"); 
    153           } 
    154           symbol.add(symbolTable[code][0]); 
    155           symbolTable[nextSymbol] = symbol.toByteArray(); //** 
    156           oldCode = code; 
    157           nextSymbol++; 
    158         } 
    159         else { 
    160           // out of table 
    161           ByteVector symbol = new ByteVector(byteBuffer2); 
    162           symbol.add(symbolTable[oldCode]); 
    163           symbol.add(symbolTable[oldCode][0]); 
    164           byte[] outString = symbol.toByteArray(); 
    165           out.add(outString); 
    166           if (out.size() >= maxLength && maxLength != -1) break; 
    167           symbolTable[nextSymbol] = outString; //** 
    168           oldCode = code; 
    169           nextSymbol++; 
    170         } 
    171         if (nextSymbol == 511) bitsToRead = 10; 
    172         if (nextSymbol == 1023) bitsToRead = 11; 
    173         if (nextSymbol == 2047) bitsToRead = 12; 
    174       } 
    175     } 
    176     return out.toByteArray(); 
     363        // Special case: code is not in the table 
     364        // 1) Write string[old_code] to output 
     365        int outLength = lengths[oldCode]; 
     366        int i = currOutPos + outLength; 
     367        int tablePos = oldCode; 
     368        while (i > currOutPos) { 
     369          output[--i] = newBytes[tablePos]; 
     370          tablePos = anotherCodes[tablePos]; 
     371        } 
     372        currOutPos += outLength; 
     373        // 2) Write firstByte(string[old_code]) to output 
     374        output[currOutPos++] = output[i]; 
     375        // 3) Add string[old_code]+firstByte(string[old_code]) to the table 
     376        anotherCodes[nextCode] = oldCode; 
     377        newBytes[nextCode] = output[i]; 
     378        lengths[nextCode] = outLength + 1; 
     379        oldCode = currCode; 
     380        nextCode++; 
     381      } 
     382      // Increase length of code if needed 
     383      switch (nextCode) { 
     384        case 511: 
     385          currCodeLength = 10; 
     386          break; 
     387        case 1023: 
     388          currCodeLength = 11; 
     389          break; 
     390        case 2047: 
     391          currCodeLength = 12; 
     392          break; 
     393      } 
     394    } while(true); 
     395    return output; 
    177396  } 
    178  
    179   /** 
    180    * Main testing method. 
    181    * 
    182    * @param args ignored 
    183    * @throws FormatException Can only occur if there is a bug in the 
    184    *                         compress method. 
    185    */ 
    186   public static void main(String[] args) throws FormatException { 
    187     LZWCodec c = new LZWCodec(); 
    188     c.test(); 
    189   } 
    190  
    191397} 
Note: See TracChangeset for help on using the changeset viewer.